aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/cache.c1
-rw-r--r--fs/9p/fid.c3
-rw-r--r--fs/9p/v9fs.c42
-rw-r--r--fs/9p/v9fs.h23
-rw-r--r--fs/9p/v9fs_vfs.h1
-rw-r--r--fs/9p/vfs_dentry.c1
-rw-r--r--fs/9p/vfs_dir.c14
-rw-r--r--fs/9p/vfs_file.c27
-rw-r--r--fs/9p/vfs_inode.c92
-rw-r--r--fs/9p/vfs_super.c4
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/Makefile2
-rw-r--r--fs/adfs/adfs.h2
-rw-r--r--fs/adfs/inode.c5
-rw-r--r--fs/adfs/super.c1
-rw-r--r--fs/affs/affs.h5
-rw-r--r--fs/affs/bitmap.c3
-rw-r--r--fs/affs/inode.c3
-rw-r--r--fs/affs/namei.c7
-rw-r--r--fs/affs/super.c32
-rw-r--r--fs/affs/symlink.c7
-rw-r--r--fs/afs/cache.c1
-rw-r--r--fs/afs/cmservice.c1
-rw-r--r--fs/afs/dir.c1
-rw-r--r--fs/afs/file.c2
-rw-r--r--fs/afs/fsclient.c1
-rw-r--r--fs/afs/inode.c1
-rw-r--r--fs/afs/internal.h1
-rw-r--r--fs/afs/mntpt.c2
-rw-r--r--fs/afs/rxrpc.c1
-rw-r--r--fs/afs/security.c5
-rw-r--r--fs/afs/super.c1
-rw-r--r--fs/afs/vlclient.c1
-rw-r--r--fs/afs/vlocation.c1
-rw-r--r--fs/afs/vnode.c1
-rw-r--r--fs/afs/write.c29
-rw-r--r--fs/aio.c94
-rw-r--r--fs/anon_inodes.c37
-rw-r--r--fs/attr.c13
-rw-r--r--fs/autofs/root.c1
-rw-r--r--fs/autofs4/autofs_i.h31
-rw-r--r--fs/autofs4/dev-ioctl.c12
-rw-r--r--fs/autofs4/expire.c2
-rw-r--r--fs/autofs4/inode.c63
-rw-r--r--fs/autofs4/root.c219
-rw-r--r--fs/befs/datastream.c1
-rw-r--r--fs/befs/linuxvfs.c1
-rw-r--r--fs/bfs/inode.c48
-rw-r--r--fs/binfmt_aout.c68
-rw-r--r--fs/binfmt_elf.c207
-rw-r--r--fs/binfmt_elf_fdpic.c229
-rw-r--r--fs/binfmt_em86.c1
-rw-r--r--fs/binfmt_flat.c9
-rw-r--r--fs/binfmt_script.c1
-rw-r--r--fs/binfmt_som.c3
-rw-r--r--fs/bio-integrity.c4
-rw-r--r--fs/bio.c39
-rw-r--r--fs/block_dev.c19
-rw-r--r--fs/btrfs/acl.c48
-rw-r--r--fs/btrfs/async-thread.c1
-rw-r--r--fs/btrfs/compression.c1
-rw-r--r--fs/btrfs/ctree.c1
-rw-r--r--fs/btrfs/ctree.h3
-rw-r--r--fs/btrfs/delayed-ref.c1
-rw-r--r--fs/btrfs/disk-io.c1
-rw-r--r--fs/btrfs/extent-tree.c1
-rw-r--r--fs/btrfs/extent_io.c1
-rw-r--r--fs/btrfs/extent_map.c3
-rw-r--r--fs/btrfs/file-item.c1
-rw-r--r--fs/btrfs/file.c5
-rw-r--r--fs/btrfs/free-space-cache.c1
-rw-r--r--fs/btrfs/inode.c5
-rw-r--r--fs/btrfs/ioctl.c1
-rw-r--r--fs/btrfs/locking.c1
-rw-r--r--fs/btrfs/ordered-data.c1
-rw-r--r--fs/btrfs/ref-cache.c1
-rw-r--r--fs/btrfs/relocation.c1
-rw-r--r--fs/btrfs/super.c1
-rw-r--r--fs/btrfs/sysfs.c4
-rw-r--r--fs/btrfs/transaction.c1
-rw-r--r--fs/btrfs/tree-log.c1
-rw-r--r--fs/btrfs/volumes.c1
-rw-r--r--fs/buffer.c15
-rw-r--r--fs/cachefiles/bind.c11
-rw-r--r--fs/cachefiles/daemon.c4
-rw-r--r--fs/cachefiles/interface.c1
-rw-r--r--fs/cachefiles/namei.c13
-rw-r--r--fs/cachefiles/rdwr.c3
-rw-r--r--fs/cachefiles/xattr.c1
-rw-r--r--fs/ceph/Kconfig27
-rw-r--r--fs/ceph/Makefile39
-rw-r--r--fs/ceph/README20
-rw-r--r--fs/ceph/addr.c1195
-rw-r--r--fs/ceph/armor.c99
-rw-r--r--fs/ceph/auth.c258
-rw-r--r--fs/ceph/auth.h84
-rw-r--r--fs/ceph/auth_none.c122
-rw-r--r--fs/ceph/auth_none.h28
-rw-r--r--fs/ceph/auth_x.c680
-rw-r--r--fs/ceph/auth_x.h49
-rw-r--r--fs/ceph/auth_x_protocol.h90
-rw-r--r--fs/ceph/buffer.c81
-rw-r--r--fs/ceph/buffer.h39
-rw-r--r--fs/ceph/caps.c2933
-rw-r--r--fs/ceph/ceph_debug.h37
-rw-r--r--fs/ceph/ceph_frag.c21
-rw-r--r--fs/ceph/ceph_frag.h109
-rw-r--r--fs/ceph/ceph_fs.c74
-rw-r--r--fs/ceph/ceph_fs.h650
-rw-r--r--fs/ceph/ceph_hash.c118
-rw-r--r--fs/ceph/ceph_hash.h13
-rw-r--r--fs/ceph/ceph_strings.c176
-rw-r--r--fs/ceph/crush/crush.c151
-rw-r--r--fs/ceph/crush/crush.h180
-rw-r--r--fs/ceph/crush/hash.c149
-rw-r--r--fs/ceph/crush/hash.h17
-rw-r--r--fs/ceph/crush/mapper.c596
-rw-r--r--fs/ceph/crush/mapper.h20
-rw-r--r--fs/ceph/crypto.c409
-rw-r--r--fs/ceph/crypto.h48
-rw-r--r--fs/ceph/debugfs.c484
-rw-r--r--fs/ceph/decode.h194
-rw-r--r--fs/ceph/dir.c1223
-rw-r--r--fs/ceph/export.c224
-rw-r--r--fs/ceph/file.c938
-rw-r--r--fs/ceph/inode.c1766
-rw-r--r--fs/ceph/ioctl.c160
-rw-r--r--fs/ceph/ioctl.h40
-rw-r--r--fs/ceph/mds_client.c3043
-rw-r--r--fs/ceph/mds_client.h335
-rw-r--r--fs/ceph/mdsmap.c174
-rw-r--r--fs/ceph/mdsmap.h54
-rw-r--r--fs/ceph/messenger.c2240
-rw-r--r--fs/ceph/messenger.h255
-rw-r--r--fs/ceph/mon_client.c835
-rw-r--r--fs/ceph/mon_client.h119
-rw-r--r--fs/ceph/msgpool.c186
-rw-r--r--fs/ceph/msgpool.h27
-rw-r--r--fs/ceph/msgr.h158
-rw-r--r--fs/ceph/osd_client.c1550
-rw-r--r--fs/ceph/osd_client.h166
-rw-r--r--fs/ceph/osdmap.c1024
-rw-r--r--fs/ceph/osdmap.h125
-rw-r--r--fs/ceph/pagelist.c55
-rw-r--r--fs/ceph/pagelist.h54
-rw-r--r--fs/ceph/rados.h374
-rw-r--r--fs/ceph/snap.c907
-rw-r--r--fs/ceph/super.c1031
-rw-r--r--fs/ceph/super.h902
-rw-r--r--fs/ceph/types.h29
-rw-r--r--fs/ceph/xattr.c845
-rw-r--r--fs/cifs/CHANGES9
-rw-r--r--fs/cifs/README2
-rw-r--r--fs/cifs/asn1.c2
-rw-r--r--fs/cifs/cifs_dfs_ref.c6
-rw-r--r--fs/cifs/cifs_spnego.c1
-rw-r--r--fs/cifs/cifs_unicode.c1
-rw-r--r--fs/cifs/cifsacl.c1
-rw-r--r--fs/cifs/cifsencrypt.c1
-rw-r--r--fs/cifs/cifsfs.c6
-rw-r--r--fs/cifs/cifsfs.h5
-rw-r--r--fs/cifs/cifsglob.h7
-rw-r--r--fs/cifs/cifspdu.h8
-rw-r--r--fs/cifs/cifsproto.h13
-rw-r--r--fs/cifs/cifssmb.c498
-rw-r--r--fs/cifs/connect.c52
-rw-r--r--fs/cifs/dir.c5
-rw-r--r--fs/cifs/dns_resolve.c1
-rw-r--r--fs/cifs/export.c2
-rw-r--r--fs/cifs/file.c18
-rw-r--r--fs/cifs/inode.c317
-rw-r--r--fs/cifs/link.c1
-rw-r--r--fs/cifs/misc.c2
-rw-r--r--fs/cifs/readdir.c9
-rw-r--r--fs/cifs/sess.c12
-rw-r--r--fs/cifs/smbdes.c2
-rw-r--r--fs/cifs/smbencrypt.c1
-rw-r--r--fs/cifs/transport.c1
-rw-r--r--fs/cifs/xattr.c9
-rw-r--r--fs/coda/dir.c1
-rw-r--r--fs/coda/file.c1
-rw-r--r--fs/coda/inode.c1
-rw-r--r--fs/coda/sysctl.c10
-rw-r--r--fs/coda/upcall.c1
-rw-r--r--fs/compat.c21
-rw-r--r--fs/compat_binfmt_elf.c2
-rw-r--r--fs/compat_ioctl.c1548
-rw-r--r--fs/configfs/inode.c1
-rw-r--r--fs/configfs/mount.c1
-rw-r--r--fs/configfs/symlink.c5
-rw-r--r--fs/dcache.c71
-rw-r--r--fs/debugfs/inode.c75
-rw-r--r--fs/devpts/inode.c17
-rw-r--r--fs/direct-io.c175
-rw-r--r--fs/dlm/ast.c74
-rw-r--r--fs/dlm/ast.h4
-rw-r--r--fs/dlm/config.c25
-rw-r--r--fs/dlm/debug_fs.c5
-rw-r--r--fs/dlm/dir.c7
-rw-r--r--fs/dlm/dlm_internal.h11
-rw-r--r--fs/dlm/lock.c127
-rw-r--r--fs/dlm/lockspace.c31
-rw-r--r--fs/dlm/lowcomms.c7
-rw-r--r--fs/dlm/member.c10
-rw-r--r--fs/dlm/memory.c6
-rw-r--r--fs/dlm/netlink.c3
-rw-r--r--fs/dlm/plock.c9
-rw-r--r--fs/dlm/rcom.c2
-rw-r--r--fs/dlm/requestqueue.c2
-rw-r--r--fs/dlm/user.c23
-rw-r--r--fs/dlm/user.h4
-rw-r--r--fs/ecryptfs/crypto.c5
-rw-r--r--fs/ecryptfs/dentry.c3
-rw-r--r--fs/ecryptfs/file.c18
-rw-r--r--fs/ecryptfs/inode.c165
-rw-r--r--fs/ecryptfs/keystore.c1
-rw-r--r--fs/ecryptfs/kthread.c1
-rw-r--r--fs/ecryptfs/main.c14
-rw-r--r--fs/ecryptfs/messaging.c1
-rw-r--r--fs/ecryptfs/miscdev.c1
-rw-r--r--fs/ecryptfs/mmap.c1
-rw-r--r--fs/ecryptfs/super.c1
-rw-r--r--fs/eventfd.c92
-rw-r--r--fs/eventpoll.c6
-rw-r--r--fs/exec.c155
-rw-r--r--fs/exofs/Kbuild2
-rw-r--r--fs/exofs/common.h112
-rw-r--r--fs/exofs/exofs.h140
-rw-r--r--fs/exofs/inode.c547
-rw-r--r--fs/exofs/ios.c823
-rw-r--r--fs/exofs/osd.c125
-rw-r--r--fs/exofs/pnfs.h45
-rw-r--r--fs/exofs/super.c417
-rw-r--r--fs/exportfs/expfs.c2
-rw-r--r--fs/ext2/acl.c79
-rw-r--r--fs/ext2/balloc.c13
-rw-r--r--fs/ext2/dir.c6
-rw-r--r--fs/ext2/ext2.h5
-rw-r--r--fs/ext2/file.c26
-rw-r--r--fs/ext2/ialloc.c14
-rw-r--r--fs/ext2/inode.c24
-rw-r--r--fs/ext2/namei.c51
-rw-r--r--fs/ext2/super.c208
-rw-r--r--fs/ext2/xattr.c21
-rw-r--r--fs/ext2/xattr_security.c17
-rw-r--r--fs/ext2/xattr_trusted.c16
-rw-r--r--fs/ext2/xattr_user.c25
-rw-r--r--fs/ext2/xip.c5
-rw-r--r--fs/ext3/acl.c74
-rw-r--r--fs/ext3/balloc.c12
-rw-r--r--fs/ext3/file.c7
-rw-r--r--fs/ext3/ialloc.c20
-rw-r--r--fs/ext3/inode.c75
-rw-r--r--fs/ext3/namei.c52
-rw-r--r--fs/ext3/resize.c37
-rw-r--r--fs/ext3/super.c707
-rw-r--r--fs/ext3/xattr.c60
-rw-r--r--fs/ext3/xattr_security.c21
-rw-r--r--fs/ext3/xattr_trusted.c18
-rw-r--r--fs/ext3/xattr_user.c25
-rw-r--r--fs/ext4/Kconfig11
-rw-r--r--fs/ext4/acl.c74
-rw-r--r--fs/ext4/balloc.c81
-rw-r--r--fs/ext4/block_validity.c9
-rw-r--r--fs/ext4/dir.c14
-rw-r--r--fs/ext4/ext4.h148
-rw-r--r--fs/ext4/ext4_extents.h3
-rw-r--r--fs/ext4/ext4_jbd2.c86
-rw-r--r--fs/ext4/ext4_jbd2.h68
-rw-r--r--fs/ext4/extents.c394
-rw-r--r--fs/ext4/file.c13
-rw-r--r--fs/ext4/fsync.c68
-rw-r--r--fs/ext4/ialloc.c52
-rw-r--r--fs/ext4/inode.c965
-rw-r--r--fs/ext4/ioctl.c41
-rw-r--r--fs/ext4/mballoc.c179
-rw-r--r--fs/ext4/mballoc.h10
-rw-r--r--fs/ext4/migrate.c63
-rw-r--r--fs/ext4/move_extent.c313
-rw-r--r--fs/ext4/namei.c124
-rw-r--r--fs/ext4/resize.c104
-rw-r--r--fs/ext4/super.c537
-rw-r--r--fs/ext4/xattr.c112
-rw-r--r--fs/ext4/xattr_security.c21
-rw-r--r--fs/ext4/xattr_trusted.c20
-rw-r--r--fs/ext4/xattr_user.c25
-rw-r--r--fs/fat/cache.c1
-rw-r--r--fs/fat/fat.h3
-rw-r--r--fs/fat/fatent.c25
-rw-r--r--fs/fat/inode.c19
-rw-r--r--fs/fat/misc.c57
-rw-r--r--fs/fat/namei_vfat.c33
-rw-r--r--fs/fcntl.c104
-rw-r--r--fs/fifo.c1
-rw-r--r--fs/file.c4
-rw-r--r--fs/file_table.c53
-rw-r--r--fs/filesystems.c2
-rw-r--r--fs/freevxfs/vxfs_subr.c1
-rw-r--r--fs/fs-writeback.c69
-rw-r--r--fs/fscache/Kconfig1
-rw-r--r--fs/fscache/object-list.c3
-rw-r--r--fs/fscache/object.c6
-rw-r--r--fs/fscache/operation.c5
-rw-r--r--fs/fscache/page.c2
-rw-r--r--fs/fuse/cuse.c1
-rw-r--r--fs/fuse/dev.c30
-rw-r--r--fs/fuse/file.c3
-rw-r--r--fs/fuse/inode.c2
-rw-r--r--fs/generic_acl.c159
-rw-r--r--fs/gfs2/Kconfig1
-rw-r--r--fs/gfs2/acl.c361
-rw-r--r--fs/gfs2/acl.h24
-rw-r--r--fs/gfs2/aops.c24
-rw-r--r--fs/gfs2/bmap.c3
-rw-r--r--fs/gfs2/dentry.c1
-rw-r--r--fs/gfs2/dir.c34
-rw-r--r--fs/gfs2/export.c1
-rw-r--r--fs/gfs2/file.c40
-rw-r--r--fs/gfs2/glock.c110
-rw-r--r--fs/gfs2/glock.h12
-rw-r--r--fs/gfs2/glops.c22
-rw-r--r--fs/gfs2/incore.h14
-rw-r--r--fs/gfs2/inode.c15
-rw-r--r--fs/gfs2/lock_dlm.c17
-rw-r--r--fs/gfs2/log.c5
-rw-r--r--fs/gfs2/lops.c8
-rw-r--r--fs/gfs2/main.c28
-rw-r--r--fs/gfs2/meta_io.c46
-rw-r--r--fs/gfs2/meta_io.h12
-rw-r--r--fs/gfs2/ops_fstype.c174
-rw-r--r--fs/gfs2/ops_inode.c118
-rw-r--r--fs/gfs2/quota.c400
-rw-r--r--fs/gfs2/quota.h5
-rw-r--r--fs/gfs2/recovery.c2
-rw-r--r--fs/gfs2/rgrp.c22
-rw-r--r--fs/gfs2/rgrp.h2
-rw-r--r--fs/gfs2/super.c143
-rw-r--r--fs/gfs2/super.h4
-rw-r--r--fs/gfs2/sys.c37
-rw-r--r--fs/gfs2/util.c2
-rw-r--r--fs/gfs2/util.h1
-rw-r--r--fs/gfs2/xattr.c156
-rw-r--r--fs/gfs2/xattr.h15
-rw-r--r--fs/hfs/bnode.c1
-rw-r--r--fs/hfs/btree.c1
-rw-r--r--fs/hfs/catalog.c4
-rw-r--r--fs/hfs/dir.c11
-rw-r--r--fs/hfs/hfs_fs.h2
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfs/mdb.c1
-rw-r--r--fs/hfs/super.c8
-rw-r--r--fs/hfsplus/options.c1
-rw-r--r--fs/hfsplus/super.c3
-rw-r--r--fs/hostfs/hostfs_kern.c1
-rw-r--r--fs/hpfs/anode.c2
-rw-r--r--fs/hpfs/buffer.c1
-rw-r--r--fs/hpfs/dentry.c14
-rw-r--r--fs/hpfs/dir.c15
-rw-r--r--fs/hpfs/dnode.c21
-rw-r--r--fs/hpfs/ea.c7
-rw-r--r--fs/hpfs/hpfs_fn.h30
-rw-r--r--fs/hpfs/inode.c5
-rw-r--r--fs/hpfs/map.c6
-rw-r--r--fs/hpfs/name.c21
-rw-r--r--fs/hpfs/namei.c75
-rw-r--r--fs/hpfs/super.c18
-rw-r--r--fs/hppfs/hppfs.c20
-rw-r--r--fs/hugetlbfs/inode.c17
-rw-r--r--fs/inode.c40
-rw-r--r--fs/internal.h10
-rw-r--r--fs/ioprio.c1
-rw-r--r--fs/isofs/compress.c533
-rw-r--r--fs/isofs/dir.c1
-rw-r--r--fs/isofs/export.c2
-rw-r--r--fs/isofs/namei.c1
-rw-r--r--fs/isofs/rock.c3
-rw-r--r--fs/jbd/commit.c11
-rw-r--r--fs/jbd/journal.c2
-rw-r--r--fs/jbd/recovery.c1
-rw-r--r--fs/jbd/transaction.c45
-rw-r--r--fs/jbd2/checkpoint.c16
-rw-r--r--fs/jbd2/commit.c38
-rw-r--r--fs/jbd2/journal.c149
-rw-r--r--fs/jbd2/recovery.c1
-rw-r--r--fs/jbd2/transaction.c43
-rw-r--r--fs/jffs2/acl.c65
-rw-r--r--fs/jffs2/compr.c2
-rw-r--r--fs/jffs2/compr_lzo.c1
-rw-r--r--fs/jffs2/compr_zlib.c1
-rw-r--r--fs/jffs2/debug.c1
-rw-r--r--fs/jffs2/file.c1
-rw-r--r--fs/jffs2/gc.c3
-rw-r--r--fs/jffs2/nodelist.c1
-rw-r--r--fs/jffs2/nodemgmt.c1
-rw-r--r--fs/jffs2/readinode.c6
-rw-r--r--fs/jffs2/security.c18
-rw-r--r--fs/jffs2/summary.c2
-rw-r--r--fs/jffs2/symlink.c1
-rw-r--r--fs/jffs2/write.c1
-rw-r--r--fs/jffs2/xattr.c8
-rw-r--r--fs/jffs2/xattr_trusted.c18
-rw-r--r--fs/jffs2/xattr_user.c18
-rw-r--r--fs/jfs/acl.c27
-rw-r--r--fs/jfs/file.c31
-rw-r--r--fs/jfs/inode.c14
-rw-r--r--fs/jfs/jfs_acl.h7
-rw-r--r--fs/jfs/jfs_dmap.c5
-rw-r--r--fs/jfs/jfs_dtree.c29
-rw-r--r--fs/jfs/jfs_extent.c16
-rw-r--r--fs/jfs/jfs_imap.c1
-rw-r--r--fs/jfs/jfs_inode.c8
-rw-r--r--fs/jfs/jfs_inode.h3
-rw-r--r--fs/jfs/jfs_logmgr.c1
-rw-r--r--fs/jfs/jfs_metapage.c1
-rw-r--r--fs/jfs/jfs_txnmgr.c2
-rw-r--r--fs/jfs/jfs_unicode.h1
-rw-r--r--fs/jfs/jfs_xtree.c21
-rw-r--r--fs/jfs/namei.c23
-rw-r--r--fs/jfs/super.c9
-rw-r--r--fs/jfs/xattr.c18
-rw-r--r--fs/libfs.c79
-rw-r--r--fs/lockd/clntlock.c1
-rw-r--r--fs/lockd/clntproc.c1
-rw-r--r--fs/lockd/host.c2
-rw-r--r--fs/lockd/mon.c13
-rw-r--r--fs/lockd/svc.c29
-rw-r--r--fs/lockd/svc4proc.c5
-rw-r--r--fs/lockd/svclock.c1
-rw-r--r--fs/lockd/svcproc.c5
-rw-r--r--fs/lockd/svcsubs.c1
-rw-r--r--fs/locks.c7
-rw-r--r--fs/logfs/Kconfig17
-rw-r--r--fs/logfs/Makefile13
-rw-r--r--fs/logfs/compr.c95
-rw-r--r--fs/logfs/dev_bdev.c333
-rw-r--r--fs/logfs/dev_mtd.c254
-rw-r--r--fs/logfs/dir.c827
-rw-r--r--fs/logfs/file.c263
-rw-r--r--fs/logfs/gc.c731
-rw-r--r--fs/logfs/inode.c418
-rw-r--r--fs/logfs/journal.c891
-rw-r--r--fs/logfs/logfs.h725
-rw-r--r--fs/logfs/logfs_abi.h629
-rw-r--r--fs/logfs/readwrite.c2258
-rw-r--r--fs/logfs/segment.c936
-rw-r--r--fs/logfs/super.c650
-rw-r--r--fs/minix/inode.c8
-rw-r--r--fs/minix/itree_v1.c1
-rw-r--r--fs/mpage.c3
-rw-r--r--fs/namei.c972
-rw-r--r--fs/namespace.c83
-rw-r--r--fs/ncpfs/dir.c1
-rw-r--r--fs/ncpfs/file.c1
-rw-r--r--fs/ncpfs/ioctl.c3
-rw-r--r--fs/ncpfs/mmap.c2
-rw-r--r--fs/ncpfs/sock.c1
-rw-r--r--fs/ncpfs/symlink.c1
-rw-r--r--fs/nfs/Kconfig5
-rw-r--r--fs/nfs/cache_lib.c1
-rw-r--r--fs/nfs/callback.c15
-rw-r--r--fs/nfs/callback.h24
-rw-r--r--fs/nfs/callback_proc.c224
-rw-r--r--fs/nfs/callback_xdr.c137
-rw-r--r--fs/nfs/client.c63
-rw-r--r--fs/nfs/delegation.c78
-rw-r--r--fs/nfs/delegation.h13
-rw-r--r--fs/nfs/dir.c72
-rw-r--r--fs/nfs/direct.c4
-rw-r--r--fs/nfs/dns_resolve.c23
-rw-r--r--fs/nfs/file.c39
-rw-r--r--fs/nfs/fscache.c10
-rw-r--r--fs/nfs/inode.c109
-rw-r--r--fs/nfs/internal.h56
-rw-r--r--fs/nfs/iostat.h28
-rw-r--r--fs/nfs/mount_clnt.c2
-rw-r--r--fs/nfs/namespace.c1
-rw-r--r--fs/nfs/nfs2xdr.c3
-rw-r--r--fs/nfs/nfs3acl.c1
-rw-r--r--fs/nfs/nfs3proc.c10
-rw-r--r--fs/nfs/nfs3xdr.c1
-rw-r--r--fs/nfs/nfs4_fs.h21
-rw-r--r--fs/nfs/nfs4namespace.c1
-rw-r--r--fs/nfs/nfs4proc.c822
-rw-r--r--fs/nfs/nfs4renewd.c24
-rw-r--r--fs/nfs/nfs4state.c317
-rw-r--r--fs/nfs/nfs4xdr.c154
-rw-r--r--fs/nfs/pagelist.c40
-rw-r--r--fs/nfs/proc.c42
-rw-r--r--fs/nfs/read.c12
-rw-r--r--fs/nfs/super.c145
-rw-r--r--fs/nfs/symlink.c3
-rw-r--r--fs/nfs/sysctl.c24
-rw-r--r--fs/nfs/unlink.c2
-rw-r--r--fs/nfs/write.c265
-rw-r--r--fs/nfs_common/nfsacl.c1
-rw-r--r--fs/nfsctl.c7
-rw-r--r--fs/nfsd/auth.c12
-rw-r--r--fs/nfsd/cache.h83
-rw-r--r--fs/nfsd/export.c56
-rw-r--r--fs/nfsd/lockd.c10
-rw-r--r--fs/nfsd/nfs2acl.c28
-rw-r--r--fs/nfsd/nfs3acl.c16
-rw-r--r--fs/nfsd/nfs3proc.c20
-rw-r--r--fs/nfsd/nfs3xdr.c15
-rw-r--r--fs/nfsd/nfs4acl.c11
-rw-r--r--fs/nfsd/nfs4callback.c25
-rw-r--r--fs/nfsd/nfs4idmap.c18
-rw-r--r--fs/nfsd/nfs4proc.c20
-rw-r--r--fs/nfsd/nfs4recover.c21
-rw-r--r--fs/nfsd/nfs4state.c91
-rw-r--r--fs/nfsd/nfs4xdr.c43
-rw-r--r--fs/nfsd/nfscache.c12
-rw-r--r--fs/nfsd/nfsctl.c76
-rw-r--r--fs/nfsd/nfsd.h338
-rw-r--r--fs/nfsd/nfsfh.c102
-rw-r--r--fs/nfsd/nfsfh.h208
-rw-r--r--fs/nfsd/nfsproc.c22
-rw-r--r--fs/nfsd/nfssvc.c22
-rw-r--r--fs/nfsd/nfsxdr.c12
-rw-r--r--fs/nfsd/state.h408
-rw-r--r--fs/nfsd/stats.c11
-rw-r--r--fs/nfsd/vfs.c303
-rw-r--r--fs/nfsd/vfs.h101
-rw-r--r--fs/nfsd/xdr.h173
-rw-r--r--fs/nfsd/xdr3.h344
-rw-r--r--fs/nfsd/xdr4.h562
-rw-r--r--fs/nilfs2/alloc.c109
-rw-r--r--fs/nilfs2/alloc.h23
-rw-r--r--fs/nilfs2/bmap.c12
-rw-r--r--fs/nilfs2/btnode.c77
-rw-r--r--fs/nilfs2/btnode.h6
-rw-r--r--fs/nilfs2/btree.c106
-rw-r--r--fs/nilfs2/btree.h22
-rw-r--r--fs/nilfs2/cpfile.c57
-rw-r--r--fs/nilfs2/cpfile.h3
-rw-r--r--fs/nilfs2/dat.c52
-rw-r--r--fs/nilfs2/dat.h3
-rw-r--r--fs/nilfs2/dir.c40
-rw-r--r--fs/nilfs2/direct.c17
-rw-r--r--fs/nilfs2/gcdat.c3
-rw-r--r--fs/nilfs2/gcinode.c11
-rw-r--r--fs/nilfs2/ifile.c35
-rw-r--r--fs/nilfs2/ifile.h2
-rw-r--r--fs/nilfs2/inode.c8
-rw-r--r--fs/nilfs2/ioctl.c69
-rw-r--r--fs/nilfs2/mdt.c57
-rw-r--r--fs/nilfs2/mdt.h25
-rw-r--r--fs/nilfs2/namei.c96
-rw-r--r--fs/nilfs2/nilfs.h4
-rw-r--r--fs/nilfs2/page.c5
-rw-r--r--fs/nilfs2/recovery.c76
-rw-r--r--fs/nilfs2/segbuf.c208
-rw-r--r--fs/nilfs2/segbuf.h51
-rw-r--r--fs/nilfs2/segment.c501
-rw-r--r--fs/nilfs2/segment.h8
-rw-r--r--fs/nilfs2/sufile.c205
-rw-r--r--fs/nilfs2/sufile.h14
-rw-r--r--fs/nilfs2/super.c110
-rw-r--r--fs/nilfs2/the_nilfs.c195
-rw-r--r--fs/nilfs2/the_nilfs.h14
-rw-r--r--fs/notify/fsnotify.c1
-rw-r--r--fs/notify/inode_mark.c1
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c2
-rw-r--r--fs/notify/inotify/inotify_user.c84
-rw-r--r--fs/ntfs/ChangeLog1702
-rw-r--r--fs/ntfs/aops.c1
-rw-r--r--fs/ntfs/attrib.c1
-rw-r--r--fs/ntfs/compress.c3
-rw-r--r--fs/ntfs/dir.c3
-rw-r--r--fs/ntfs/file.c7
-rw-r--r--fs/ntfs/index.c2
-rw-r--r--fs/ntfs/inode.c8
-rw-r--r--fs/ntfs/inode.h4
-rw-r--r--fs/ntfs/logfile.c2
-rw-r--r--fs/ntfs/mft.c1
-rw-r--r--fs/ntfs/namei.c1
-rw-r--r--fs/ntfs/super.c33
-rw-r--r--fs/ntfs/sysctl.c4
-rw-r--r--fs/ocfs2/Kconfig10
-rw-r--r--fs/ocfs2/Makefile8
-rw-r--r--fs/ocfs2/acl.c169
-rw-r--r--fs/ocfs2/acl.h22
-rw-r--r--fs/ocfs2/alloc.c34
-rw-r--r--fs/ocfs2/alloc.h5
-rw-r--r--fs/ocfs2/aops.c52
-rw-r--r--fs/ocfs2/blockcheck.c2
-rw-r--r--fs/ocfs2/buffer_head_io.c3
-rw-r--r--fs/ocfs2/cluster/heartbeat.c13
-rw-r--r--fs/ocfs2/cluster/masklog.c3
-rw-r--r--fs/ocfs2/cluster/masklog.h7
-rw-r--r--fs/ocfs2/cluster/netdebug.c8
-rw-r--r--fs/ocfs2/cluster/nodemanager.c52
-rw-r--r--fs/ocfs2/cluster/nodemanager.h7
-rw-r--r--fs/ocfs2/cluster/quorum.c17
-rw-r--r--fs/ocfs2/cluster/tcp.c14
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h4
-rw-r--r--fs/ocfs2/dir.c39
-rw-r--r--fs/ocfs2/dlm/Makefile3
-rw-r--r--fs/ocfs2/dlm/dlmapi.h2
-rw-r--r--fs/ocfs2/dlm/dlmast.c3
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c3
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c2
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c2
-rw-r--r--fs/ocfs2/dlm/dlmlock.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c44
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c165
-rw-r--r--fs/ocfs2/dlm/dlmthread.c1
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c9
-rw-r--r--fs/ocfs2/dlmfs/Makefile5
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c (renamed from fs/ocfs2/dlm/dlmfs.c)127
-rw-r--r--fs/ocfs2/dlmfs/dlmfsver.c (renamed from fs/ocfs2/dlm/dlmfsver.c)0
-rw-r--r--fs/ocfs2/dlmfs/dlmfsver.h (renamed from fs/ocfs2/dlm/dlmfsver.h)0
-rw-r--r--fs/ocfs2/dlmfs/userdlm.c (renamed from fs/ocfs2/dlm/userdlm.c)308
-rw-r--r--fs/ocfs2/dlmfs/userdlm.h (renamed from fs/ocfs2/dlm/userdlm.h)16
-rw-r--r--fs/ocfs2/dlmglue.c373
-rw-r--r--fs/ocfs2/export.c2
-rw-r--r--fs/ocfs2/extent_map.c30
-rw-r--r--fs/ocfs2/file.c60
-rw-r--r--fs/ocfs2/heartbeat.c1
-rw-r--r--fs/ocfs2/inode.c26
-rw-r--r--fs/ocfs2/ioctl.c14
-rw-r--r--fs/ocfs2/ioctl.h6
-rw-r--r--fs/ocfs2/journal.c4
-rw-r--r--fs/ocfs2/localalloc.c12
-rw-r--r--fs/ocfs2/locks.c2
-rw-r--r--fs/ocfs2/mmap.c1
-rw-r--r--fs/ocfs2/namei.c86
-rw-r--r--fs/ocfs2/ocfs2.h46
-rw-r--r--fs/ocfs2/ocfs2_fs.h70
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h79
-rw-r--r--fs/ocfs2/ocfs2_lockingver.h2
-rw-r--r--fs/ocfs2/quota.h4
-rw-r--r--fs/ocfs2/quota_global.c8
-rw-r--r--fs/ocfs2/quota_local.c5
-rw-r--r--fs/ocfs2/refcounttree.c174
-rw-r--r--fs/ocfs2/stack_o2cb.c50
-rw-r--r--fs/ocfs2/stack_user.c52
-rw-r--r--fs/ocfs2/stackglue.c113
-rw-r--r--fs/ocfs2/stackglue.h95
-rw-r--r--fs/ocfs2/suballoc.c300
-rw-r--r--fs/ocfs2/suballoc.h6
-rw-r--r--fs/ocfs2/super.c107
-rw-r--r--fs/ocfs2/symlink.c12
-rw-r--r--fs/ocfs2/sysfile.c1
-rw-r--r--fs/ocfs2/uptodate.c4
-rw-r--r--fs/ocfs2/xattr.c2264
-rw-r--r--fs/ocfs2/xattr.h2
-rw-r--r--fs/omfs/bitmap.c2
-rw-r--r--fs/omfs/inode.c11
-rw-r--r--fs/open.c53
-rw-r--r--fs/partitions/check.c16
-rw-r--r--fs/partitions/efi.c31
-rw-r--r--fs/partitions/efi.h8
-rw-r--r--fs/partitions/msdos.c85
-rw-r--r--fs/pipe.c45
-rw-r--r--fs/pnode.c28
-rw-r--r--fs/pnode.h5
-rw-r--r--fs/proc/array.c132
-rw-r--r--fs/proc/base.c119
-rw-r--r--fs/proc/generic.c60
-rw-r--r--fs/proc/inode.c32
-rw-r--r--fs/proc/internal.h10
-rw-r--r--fs/proc/kcore.c3
-rw-r--r--fs/proc/kmsg.c14
-rw-r--r--fs/proc/nommu.c1
-rw-r--r--fs/proc/page.c45
-rw-r--r--fs/proc/proc_devtree.c49
-rw-r--r--fs/proc/proc_net.c1
-rw-r--r--fs/proc/proc_sysctl.c4
-rw-r--r--fs/proc/root.c6
-rw-r--r--fs/proc/stat.c20
-rw-r--r--fs/proc/task_mmu.c149
-rw-r--r--fs/proc/task_nommu.c9
-rw-r--r--fs/proc/vmcore.c1
-rw-r--r--fs/qnx4/bitmap.c26
-rw-r--r--fs/qnx4/dir.c6
-rw-r--r--fs/qnx4/inode.c51
-rw-r--r--fs/qnx4/namei.c6
-rw-r--r--fs/quota/Kconfig15
-rw-r--r--fs/quota/Makefile2
-rw-r--r--fs/quota/compat.c118
-rw-r--r--fs/quota/dquot.c765
-rw-r--r--fs/quota/netlink.c96
-rw-r--r--fs/quota/quota.c642
-rw-r--r--fs/quota/quota_v1.c2
-rw-r--r--fs/quota/quota_v2.c170
-rw-r--r--fs/quota/quotaio_v2.h19
-rw-r--r--fs/ramfs/file-nommu.c29
-rw-r--r--fs/ramfs/inode.c1
-rw-r--r--fs/read_write.c4
-rw-r--r--fs/reiserfs/Makefile6
-rw-r--r--fs/reiserfs/bitmap.c19
-rw-r--r--fs/reiserfs/dir.c11
-rw-r--r--fs/reiserfs/do_balan.c17
-rw-r--r--fs/reiserfs/file.c4
-rw-r--r--fs/reiserfs/fix_node.c22
-rw-r--r--fs/reiserfs/inode.c164
-rw-r--r--fs/reiserfs/ioctl.c80
-rw-r--r--fs/reiserfs/journal.c164
-rw-r--r--fs/reiserfs/lock.c97
-rw-r--r--fs/reiserfs/namei.c51
-rw-r--r--fs/reiserfs/prints.c4
-rw-r--r--fs/reiserfs/procfs.c65
-rw-r--r--fs/reiserfs/resize.c2
-rw-r--r--fs/reiserfs/stree.c73
-rw-r--r--fs/reiserfs/super.c76
-rw-r--r--fs/reiserfs/xattr.c85
-rw-r--r--fs/reiserfs/xattr_acl.c72
-rw-r--r--fs/reiserfs/xattr_security.c24
-rw-r--r--fs/reiserfs/xattr_trusted.c21
-rw-r--r--fs/reiserfs/xattr_user.c21
-rw-r--r--fs/romfs/super.c1
-rw-r--r--fs/select.c19
-rw-r--r--fs/seq_file.c130
-rw-r--r--fs/signalfd.c3
-rw-r--r--fs/smbfs/file.c1
-rw-r--r--fs/smbfs/smbiod.c1
-rw-r--r--fs/smbfs/symlink.c1
-rw-r--r--fs/splice.c25
-rw-r--r--fs/squashfs/Makefile2
-rw-r--r--fs/squashfs/block.c76
-rw-r--r--fs/squashfs/cache.c1
-rw-r--r--fs/squashfs/decompressor.c68
-rw-r--r--fs/squashfs/decompressor.h55
-rw-r--r--fs/squashfs/dir.c1
-rw-r--r--fs/squashfs/export.c1
-rw-r--r--fs/squashfs/file.c1
-rw-r--r--fs/squashfs/fragment.c1
-rw-r--r--fs/squashfs/id.c1
-rw-r--r--fs/squashfs/inode.c1
-rw-r--r--fs/squashfs/namei.c1
-rw-r--r--fs/squashfs/squashfs.h8
-rw-r--r--fs/squashfs/squashfs_fs.h6
-rw-r--r--fs/squashfs/squashfs_fs_sb.h40
-rw-r--r--fs/squashfs/super.c49
-rw-r--r--fs/squashfs/symlink.c2
-rw-r--r--fs/squashfs/zlib_wrapper.c151
-rw-r--r--fs/stack.c71
-rw-r--r--fs/stat.c10
-rw-r--r--fs/super.c24
-rw-r--r--fs/sync.c83
-rw-r--r--fs/sysfs/bin.c56
-rw-r--r--fs/sysfs/dir.c532
-rw-r--r--fs/sysfs/file.c88
-rw-r--r--fs/sysfs/inode.c205
-rw-r--r--fs/sysfs/mount.c5
-rw-r--r--fs/sysfs/symlink.c50
-rw-r--r--fs/sysfs/sysfs.h37
-rw-r--r--fs/sysv/inode.c10
-rw-r--r--fs/sysv/sysv.h2
-rw-r--r--fs/timerfd.c3
-rw-r--r--fs/ubifs/commit.c1
-rw-r--r--fs/ubifs/debug.c12
-rw-r--r--fs/ubifs/dir.c2
-rw-r--r--fs/ubifs/file.c24
-rw-r--r--fs/ubifs/gc.c97
-rw-r--r--fs/ubifs/io.c1
-rw-r--r--fs/ubifs/lpt.c1
-rw-r--r--fs/ubifs/lpt_commit.c1
-rw-r--r--fs/ubifs/recovery.c3
-rw-r--r--fs/ubifs/sb.c1
-rw-r--r--fs/ubifs/super.c29
-rw-r--r--fs/ubifs/tnc.c1
-rw-r--r--fs/ubifs/ubifs.h1
-rw-r--r--fs/ubifs/xattr.c1
-rw-r--r--fs/udf/balloc.c88
-rw-r--r--fs/udf/dir.c4
-rw-r--r--fs/udf/file.c29
-rw-r--r--fs/udf/ialloc.c14
-rw-r--r--fs/udf/inode.c70
-rw-r--r--fs/udf/namei.c75
-rw-r--r--fs/udf/partition.c1
-rw-r--r--fs/udf/super.c32
-rw-r--r--fs/udf/symlink.c11
-rw-r--r--fs/udf/udfdecl.h2
-rw-r--r--fs/udf/unicode.c1
-rw-r--r--fs/ufs/balloc.c24
-rw-r--r--fs/ufs/dir.c14
-rw-r--r--fs/ufs/file.c3
-rw-r--r--fs/ufs/ialloc.c11
-rw-r--r--fs/ufs/inode.c9
-rw-r--r--fs/ufs/namei.c26
-rw-r--r--fs/ufs/super.c61
-rw-r--r--fs/ufs/truncate.c10
-rw-r--r--fs/ufs/ufs.h6
-rw-r--r--fs/ufs/ufs_fs.h15
-rw-r--r--fs/xattr.c28
-rw-r--r--fs/xattr_acl.c4
-rw-r--r--fs/xfs/Makefile11
-rw-r--r--fs/xfs/linux-2.6/kmem.c57
-rw-r--r--fs/xfs/linux-2.6/kmem.h21
-rw-r--r--fs/xfs/linux-2.6/xfs_acl.c73
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c392
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c542
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h95
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c20
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c856
-rw-r--r--fs/xfs/linux-2.6/xfs_fs_subr.c5
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c23
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.h12
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c6
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c24
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c922
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.h77
-rw-r--r--fs/xfs/linux-2.6/xfs_quotaops.c19
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c349
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h7
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c347
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h4
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.c62
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.c59
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h1503
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.h5
-rw-r--r--fs/xfs/linux-2.6/xfs_xattr.c98
-rw-r--r--fs/xfs/quota/xfs_dquot.c157
-rw-r--r--fs/xfs/quota/xfs_dquot.h23
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c99
-rw-r--r--fs/xfs/quota/xfs_dquot_item.h4
-rw-r--r--fs/xfs/quota/xfs_qm.c80
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c10
-rw-r--r--fs/xfs/quota/xfs_trans_dquot.c49
-rw-r--r--fs/xfs/support/debug.h18
-rw-r--r--fs/xfs/support/ktrace.c323
-rw-r--r--fs/xfs/support/ktrace.h85
-rw-r--r--fs/xfs/xfs.h16
-rw-r--r--fs/xfs/xfs_acl.h7
-rw-r--r--fs/xfs/xfs_ag.h30
-rw-r--r--fs/xfs/xfs_alloc.c354
-rw-r--r--fs/xfs/xfs_alloc.h27
-rw-r--r--fs/xfs/xfs_alloc_btree.c10
-rw-r--r--fs/xfs/xfs_attr.c171
-rw-r--r--fs/xfs/xfs_attr.h14
-rw-r--r--fs/xfs/xfs_attr_leaf.c46
-rw-r--r--fs/xfs/xfs_attr_sf.h42
-rw-r--r--fs/xfs/xfs_bmap.c1163
-rw-r--r--fs/xfs/xfs_bmap.h58
-rw-r--r--fs/xfs/xfs_bmap_btree.c11
-rw-r--r--fs/xfs/xfs_bmap_btree.h15
-rw-r--r--fs/xfs/xfs_btree.c9
-rw-r--r--fs/xfs/xfs_btree_trace.h17
-rw-r--r--fs/xfs/xfs_buf_item.c159
-rw-r--r--fs/xfs/xfs_buf_item.h20
-rw-r--r--fs/xfs/xfs_da_btree.c7
-rw-r--r--fs/xfs/xfs_da_btree.h12
-rw-r--r--fs/xfs/xfs_dfrag.c151
-rw-r--r--fs/xfs/xfs_dfrag.h3
-rw-r--r--fs/xfs/xfs_dir2.c16
-rw-r--r--fs/xfs/xfs_dir2.h4
-rw-r--r--fs/xfs/xfs_dir2_block.c29
-rw-r--r--fs/xfs/xfs_dir2_leaf.c23
-rw-r--r--fs/xfs/xfs_dir2_node.c29
-rw-r--r--fs/xfs/xfs_dir2_node.h2
-rw-r--r--fs/xfs/xfs_dir2_sf.c28
-rw-r--r--fs/xfs/xfs_dir2_trace.c216
-rw-r--r--fs/xfs/xfs_dir2_trace.h72
-rw-r--r--fs/xfs/xfs_extfree_item.c4
-rw-r--r--fs/xfs/xfs_filestream.c50
-rw-r--r--fs/xfs/xfs_filestream.h36
-rw-r--r--fs/xfs/xfs_fs.h3
-rw-r--r--fs/xfs/xfs_fsops.c69
-rw-r--r--fs/xfs/xfs_ialloc.c64
-rw-r--r--fs/xfs/xfs_iget.c160
-rw-r--r--fs/xfs/xfs_inode.c278
-rw-r--r--fs/xfs/xfs_inode.h96
-rw-r--r--fs/xfs/xfs_inode_item.c146
-rw-r--r--fs/xfs/xfs_inode_item.h12
-rw-r--r--fs/xfs/xfs_iomap.c94
-rw-r--r--fs/xfs/xfs_iomap.h8
-rw-r--r--fs/xfs/xfs_itable.c14
-rw-r--r--fs/xfs/xfs_log.c670
-rw-r--r--fs/xfs/xfs_log.h35
-rw-r--r--fs/xfs/xfs_log_priv.h25
-rw-r--r--fs/xfs/xfs_log_recover.c271
-rw-r--r--fs/xfs/xfs_log_recover.h23
-rw-r--r--fs/xfs/xfs_mount.c268
-rw-r--r--fs/xfs/xfs_mount.h58
-rw-r--r--fs/xfs/xfs_mru_cache.c2
-rw-r--r--fs/xfs/xfs_mru_cache.h1
-rw-r--r--fs/xfs/xfs_quota.h17
-rw-r--r--fs/xfs/xfs_rename.c1
-rw-r--r--fs/xfs/xfs_rtalloc.c3
-rw-r--r--fs/xfs/xfs_rw.c174
-rw-r--r--fs/xfs/xfs_rw.h33
-rw-r--r--fs/xfs/xfs_trans.c16
-rw-r--r--fs/xfs/xfs_trans.h54
-rw-r--r--fs/xfs/xfs_trans_ail.c34
-rw-r--r--fs/xfs/xfs_trans_buf.c302
-rw-r--r--fs/xfs/xfs_types.h4
-rw-r--r--fs/xfs/xfs_vnodeops.c294
-rw-r--r--fs/xfs/xfs_vnodeops.h26
893 files changed, 65545 insertions, 23671 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index e777961939f3..0dbe0d139ac2 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/file.h> 24#include <linux/file.h>
25#include <linux/slab.h>
25#include <linux/stat.h> 26#include <linux/stat.h>
26#include <linux/sched.h> 27#include <linux/sched.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 14d944204571..82ee460e534d 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -24,6 +24,7 @@
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/errno.h> 25#include <linux/errno.h>
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/slab.h>
27#include <linux/sched.h> 28#include <linux/sched.h>
28#include <linux/idr.h> 29#include <linux/idr.h>
29#include <net/9p/9p.h> 30#include <net/9p/9p.h>
@@ -151,7 +152,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
151 if (access == V9FS_ACCESS_SINGLE) 152 if (access == V9FS_ACCESS_SINGLE)
152 return ERR_PTR(-EPERM); 153 return ERR_PTR(-EPERM);
153 154
154 if (v9fs_extended(v9ses)) 155 if (v9fs_proto_dotu(v9ses))
155 uname = NULL; 156 uname = NULL;
156 else 157 else
157 uname = v9ses->uname; 158 uname = v9ses->uname;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index cf62b05e296a..cb57d3326182 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -29,6 +29,7 @@
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/parser.h> 30#include <linux/parser.h>
31#include <linux/idr.h> 31#include <linux/idr.h>
32#include <linux/slab.h>
32#include <net/9p/9p.h> 33#include <net/9p/9p.h>
33#include <net/9p/client.h> 34#include <net/9p/client.h>
34#include <net/9p/transport.h> 35#include <net/9p/transport.h>
@@ -84,7 +85,7 @@ static const match_table_t tokens = {
84 85
85static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) 86static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
86{ 87{
87 char *options; 88 char *options, *tmp_options;
88 substring_t args[MAX_OPT_ARGS]; 89 substring_t args[MAX_OPT_ARGS];
89 char *p; 90 char *p;
90 int option = 0; 91 int option = 0;
@@ -102,9 +103,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
102 if (!opts) 103 if (!opts)
103 return 0; 104 return 0;
104 105
105 options = kstrdup(opts, GFP_KERNEL); 106 tmp_options = kstrdup(opts, GFP_KERNEL);
106 if (!options) 107 if (!tmp_options) {
108 ret = -ENOMEM;
107 goto fail_option_alloc; 109 goto fail_option_alloc;
110 }
111 options = tmp_options;
108 112
109 while ((p = strsep(&options, ",")) != NULL) { 113 while ((p = strsep(&options, ",")) != NULL) {
110 int token; 114 int token;
@@ -159,8 +163,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
159 break; 163 break;
160 case Opt_cache: 164 case Opt_cache:
161 s = match_strdup(&args[0]); 165 s = match_strdup(&args[0]);
162 if (!s) 166 if (!s) {
163 goto fail_option_alloc; 167 ret = -ENOMEM;
168 P9_DPRINTK(P9_DEBUG_ERROR,
169 "problem allocating copy of cache arg\n");
170 goto free_and_return;
171 }
164 172
165 if (strcmp(s, "loose") == 0) 173 if (strcmp(s, "loose") == 0)
166 v9ses->cache = CACHE_LOOSE; 174 v9ses->cache = CACHE_LOOSE;
@@ -173,8 +181,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
173 181
174 case Opt_access: 182 case Opt_access:
175 s = match_strdup(&args[0]); 183 s = match_strdup(&args[0]);
176 if (!s) 184 if (!s) {
177 goto fail_option_alloc; 185 ret = -ENOMEM;
186 P9_DPRINTK(P9_DEBUG_ERROR,
187 "problem allocating copy of access arg\n");
188 goto free_and_return;
189 }
178 190
179 v9ses->flags &= ~V9FS_ACCESS_MASK; 191 v9ses->flags &= ~V9FS_ACCESS_MASK;
180 if (strcmp(s, "user") == 0) 192 if (strcmp(s, "user") == 0)
@@ -194,13 +206,11 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
194 continue; 206 continue;
195 } 207 }
196 } 208 }
197 kfree(options);
198 return ret;
199 209
210free_and_return:
211 kfree(tmp_options);
200fail_option_alloc: 212fail_option_alloc:
201 P9_DPRINTK(P9_DEBUG_ERROR, 213 return ret;
202 "failed to allocate copy of option argument\n");
203 return -ENOMEM;
204} 214}
205 215
206/** 216/**
@@ -232,7 +242,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
232 list_add(&v9ses->slist, &v9fs_sessionlist); 242 list_add(&v9ses->slist, &v9fs_sessionlist);
233 spin_unlock(&v9fs_sessionlist_lock); 243 spin_unlock(&v9fs_sessionlist_lock);
234 244
235 v9ses->flags = V9FS_EXTENDED | V9FS_ACCESS_USER; 245 v9ses->flags = V9FS_PROTO_2000U | V9FS_ACCESS_USER;
236 strcpy(v9ses->uname, V9FS_DEFUSER); 246 strcpy(v9ses->uname, V9FS_DEFUSER);
237 strcpy(v9ses->aname, V9FS_DEFANAME); 247 strcpy(v9ses->aname, V9FS_DEFANAME);
238 v9ses->uid = ~0; 248 v9ses->uid = ~0;
@@ -253,13 +263,13 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
253 goto error; 263 goto error;
254 } 264 }
255 265
256 if (!v9ses->clnt->dotu) 266 if (!p9_is_proto_dotu(v9ses->clnt))
257 v9ses->flags &= ~V9FS_EXTENDED; 267 v9ses->flags &= ~V9FS_PROTO_2000U;
258 268
259 v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; 269 v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
260 270
261 /* for legacy mode, fall back to V9FS_ACCESS_ANY */ 271 /* for legacy mode, fall back to V9FS_ACCESS_ANY */
262 if (!v9fs_extended(v9ses) && 272 if (!v9fs_proto_dotu(v9ses) &&
263 ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) { 273 ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
264 274
265 v9ses->flags &= ~V9FS_ACCESS_MASK; 275 v9ses->flags &= ~V9FS_ACCESS_MASK;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 019f4ccb70c1..6b801d1ddf4b 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -23,7 +23,8 @@
23 23
24/** 24/**
25 * enum p9_session_flags - option flags for each 9P session 25 * enum p9_session_flags - option flags for each 9P session
26 * @V9FS_EXTENDED: whether or not to use 9P2000.u extensions 26 * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions
27 * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions
27 * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy 28 * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
28 * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) 29 * @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
29 * @V9FS_ACCESS_ANY: use a single attach for all users 30 * @V9FS_ACCESS_ANY: use a single attach for all users
@@ -32,11 +33,12 @@
32 * Session flags reflect options selected by users at mount time 33 * Session flags reflect options selected by users at mount time
33 */ 34 */
34enum p9_session_flags { 35enum p9_session_flags {
35 V9FS_EXTENDED = 0x01, 36 V9FS_PROTO_2000U = 0x01,
36 V9FS_ACCESS_SINGLE = 0x02, 37 V9FS_PROTO_2000L = 0x02,
37 V9FS_ACCESS_USER = 0x04, 38 V9FS_ACCESS_SINGLE = 0x04,
38 V9FS_ACCESS_ANY = 0x06, 39 V9FS_ACCESS_USER = 0x08,
39 V9FS_ACCESS_MASK = 0x06, 40 V9FS_ACCESS_ANY = 0x0C,
41 V9FS_ACCESS_MASK = 0x0C,
40}; 42};
41 43
42/* possible values of ->cache */ 44/* possible values of ->cache */
@@ -121,7 +123,12 @@ static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
121 return (inode->i_sb->s_fs_info); 123 return (inode->i_sb->s_fs_info);
122} 124}
123 125
124static inline int v9fs_extended(struct v9fs_session_info *v9ses) 126static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses)
125{ 127{
126 return v9ses->flags & V9FS_EXTENDED; 128 return v9ses->flags & V9FS_PROTO_2000U;
129}
130
131static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
132{
133 return v9ses->flags & V9FS_PROTO_2000L;
127} 134}
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 3a7560e35865..ed835836e0dc 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -60,3 +60,4 @@ void v9fs_dentry_release(struct dentry *);
60int v9fs_uflags2omode(int uflags, int extended); 60int v9fs_uflags2omode(int uflags, int extended);
61 61
62ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); 62ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
63void v9fs_blank_wstat(struct p9_wstat *wstat);
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index d74325295b1e..cbf4e50f3933 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -34,6 +34,7 @@
34#include <linux/namei.h> 34#include <linux/namei.h>
35#include <linux/idr.h> 35#include <linux/idr.h>
36#include <linux/sched.h> 36#include <linux/sched.h>
37#include <linux/slab.h>
37#include <net/9p/9p.h> 38#include <net/9p/9p.h>
38#include <net/9p/client.h> 39#include <net/9p/client.h>
39 40
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 15cce53bf61e..909711f57c0d 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -32,6 +32,7 @@
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/inet.h> 33#include <linux/inet.h>
34#include <linux/idr.h> 34#include <linux/idr.h>
35#include <linux/slab.h>
35#include <net/9p/9p.h> 36#include <net/9p/9p.h>
36#include <net/9p/client.h> 37#include <net/9p/client.h>
37 38
@@ -76,6 +77,15 @@ static inline int dt_type(struct p9_wstat *mistat)
76 return rettype; 77 return rettype;
77} 78}
78 79
80static void p9stat_init(struct p9_wstat *stbuf)
81{
82 stbuf->name = NULL;
83 stbuf->uid = NULL;
84 stbuf->gid = NULL;
85 stbuf->muid = NULL;
86 stbuf->extension = NULL;
87}
88
79/** 89/**
80 * v9fs_dir_readdir - read a directory 90 * v9fs_dir_readdir - read a directory
81 * @filp: opened file structure 91 * @filp: opened file structure
@@ -131,11 +141,11 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
131 rdir->head = 0; 141 rdir->head = 0;
132 rdir->tail = err; 142 rdir->tail = err;
133 } 143 }
134
135 while (rdir->head < rdir->tail) { 144 while (rdir->head < rdir->tail) {
145 p9stat_init(&st);
136 err = p9stat_read(rdir->buf + rdir->head, 146 err = p9stat_read(rdir->buf + rdir->head,
137 buflen - rdir->head, &st, 147 buflen - rdir->head, &st,
138 fid->clnt->dotu); 148 fid->clnt->proto_version);
139 if (err) { 149 if (err) {
140 P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); 150 P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
141 err = -EIO; 151 err = -EIO;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3902bf43a088..df52d488d2a6 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -61,7 +61,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
61 61
62 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file); 62 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file);
63 v9ses = v9fs_inode2v9ses(inode); 63 v9ses = v9fs_inode2v9ses(inode);
64 omode = v9fs_uflags2omode(file->f_flags, v9fs_extended(v9ses)); 64 omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses));
65 fid = file->private_data; 65 fid = file->private_data;
66 if (!fid) { 66 if (!fid) {
67 fid = v9fs_fid_clone(file->f_path.dentry); 67 fid = v9fs_fid_clone(file->f_path.dentry);
@@ -77,7 +77,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
77 i_size_write(inode, 0); 77 i_size_write(inode, 0);
78 inode->i_blocks = 0; 78 inode->i_blocks = 0;
79 } 79 }
80 if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses))) 80 if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses)))
81 generic_file_llseek(file, 0, SEEK_END); 81 generic_file_llseek(file, 0, SEEK_END);
82 } 82 }
83 83
@@ -114,7 +114,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
114 P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); 114 P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
115 115
116 /* No mandatory locks */ 116 /* No mandatory locks */
117 if (__mandatory_lock(inode)) 117 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
118 return -ENOLCK; 118 return -ENOLCK;
119 119
120 if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { 120 if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
@@ -215,7 +215,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
215 struct p9_fid *fid; 215 struct p9_fid *fid;
216 struct p9_client *clnt; 216 struct p9_client *clnt;
217 struct inode *inode = filp->f_path.dentry->d_inode; 217 struct inode *inode = filp->f_path.dentry->d_inode;
218 int origin = *offset; 218 loff_t origin = *offset;
219 unsigned long pg_start, pg_end; 219 unsigned long pg_start, pg_end;
220 220
221 P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, 221 P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
@@ -257,6 +257,23 @@ v9fs_file_write(struct file *filp, const char __user * data,
257 return total; 257 return total;
258} 258}
259 259
260static int v9fs_file_fsync(struct file *filp, struct dentry *dentry,
261 int datasync)
262{
263 struct p9_fid *fid;
264 struct p9_wstat wstat;
265 int retval;
266
267 P9_DPRINTK(P9_DEBUG_VFS, "filp %p dentry %p datasync %x\n", filp,
268 dentry, datasync);
269
270 fid = filp->private_data;
271 v9fs_blank_wstat(&wstat);
272
273 retval = p9_client_wstat(fid, &wstat);
274 return retval;
275}
276
260static const struct file_operations v9fs_cached_file_operations = { 277static const struct file_operations v9fs_cached_file_operations = {
261 .llseek = generic_file_llseek, 278 .llseek = generic_file_llseek,
262 .read = do_sync_read, 279 .read = do_sync_read,
@@ -266,6 +283,7 @@ static const struct file_operations v9fs_cached_file_operations = {
266 .release = v9fs_dir_release, 283 .release = v9fs_dir_release,
267 .lock = v9fs_file_lock, 284 .lock = v9fs_file_lock,
268 .mmap = generic_file_readonly_mmap, 285 .mmap = generic_file_readonly_mmap,
286 .fsync = v9fs_file_fsync,
269}; 287};
270 288
271const struct file_operations v9fs_file_operations = { 289const struct file_operations v9fs_file_operations = {
@@ -276,4 +294,5 @@ const struct file_operations v9fs_file_operations = {
276 .release = v9fs_dir_release, 294 .release = v9fs_dir_release,
277 .lock = v9fs_file_lock, 295 .lock = v9fs_file_lock,
278 .mmap = generic_file_readonly_mmap, 296 .mmap = generic_file_readonly_mmap,
297 .fsync = v9fs_file_fsync,
279}; 298};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 18f74ec4dce9..63c2b5af268a 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -34,6 +34,7 @@
34#include <linux/namei.h> 34#include <linux/namei.h>
35#include <linux/idr.h> 35#include <linux/idr.h>
36#include <linux/sched.h> 36#include <linux/sched.h>
37#include <linux/slab.h>
37#include <net/9p/9p.h> 38#include <net/9p/9p.h>
38#include <net/9p/client.h> 39#include <net/9p/client.h>
39 40
@@ -60,7 +61,7 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
60 res = mode & 0777; 61 res = mode & 0777;
61 if (S_ISDIR(mode)) 62 if (S_ISDIR(mode))
62 res |= P9_DMDIR; 63 res |= P9_DMDIR;
63 if (v9fs_extended(v9ses)) { 64 if (v9fs_proto_dotu(v9ses)) {
64 if (S_ISLNK(mode)) 65 if (S_ISLNK(mode))
65 res |= P9_DMSYMLINK; 66 res |= P9_DMSYMLINK;
66 if (v9ses->nodev == 0) { 67 if (v9ses->nodev == 0) {
@@ -102,21 +103,21 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
102 103
103 if ((mode & P9_DMDIR) == P9_DMDIR) 104 if ((mode & P9_DMDIR) == P9_DMDIR)
104 res |= S_IFDIR; 105 res |= S_IFDIR;
105 else if ((mode & P9_DMSYMLINK) && (v9fs_extended(v9ses))) 106 else if ((mode & P9_DMSYMLINK) && (v9fs_proto_dotu(v9ses)))
106 res |= S_IFLNK; 107 res |= S_IFLNK;
107 else if ((mode & P9_DMSOCKET) && (v9fs_extended(v9ses)) 108 else if ((mode & P9_DMSOCKET) && (v9fs_proto_dotu(v9ses))
108 && (v9ses->nodev == 0)) 109 && (v9ses->nodev == 0))
109 res |= S_IFSOCK; 110 res |= S_IFSOCK;
110 else if ((mode & P9_DMNAMEDPIPE) && (v9fs_extended(v9ses)) 111 else if ((mode & P9_DMNAMEDPIPE) && (v9fs_proto_dotu(v9ses))
111 && (v9ses->nodev == 0)) 112 && (v9ses->nodev == 0))
112 res |= S_IFIFO; 113 res |= S_IFIFO;
113 else if ((mode & P9_DMDEVICE) && (v9fs_extended(v9ses)) 114 else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses))
114 && (v9ses->nodev == 0)) 115 && (v9ses->nodev == 0))
115 res |= S_IFBLK; 116 res |= S_IFBLK;
116 else 117 else
117 res |= S_IFREG; 118 res |= S_IFREG;
118 119
119 if (v9fs_extended(v9ses)) { 120 if (v9fs_proto_dotu(v9ses)) {
120 if ((mode & P9_DMSETUID) == P9_DMSETUID) 121 if ((mode & P9_DMSETUID) == P9_DMSETUID)
121 res |= S_ISUID; 122 res |= S_ISUID;
122 123
@@ -176,7 +177,7 @@ int v9fs_uflags2omode(int uflags, int extended)
176 * 177 *
177 */ 178 */
178 179
179static void 180void
180v9fs_blank_wstat(struct p9_wstat *wstat) 181v9fs_blank_wstat(struct p9_wstat *wstat)
181{ 182{
182 wstat->type = ~0; 183 wstat->type = ~0;
@@ -265,7 +266,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
265 case S_IFBLK: 266 case S_IFBLK:
266 case S_IFCHR: 267 case S_IFCHR:
267 case S_IFSOCK: 268 case S_IFSOCK:
268 if (!v9fs_extended(v9ses)) { 269 if (!v9fs_proto_dotu(v9ses)) {
269 P9_DPRINTK(P9_DEBUG_ERROR, 270 P9_DPRINTK(P9_DEBUG_ERROR,
270 "special files without extended mode\n"); 271 "special files without extended mode\n");
271 err = -EINVAL; 272 err = -EINVAL;
@@ -278,7 +279,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
278 inode->i_fop = &v9fs_file_operations; 279 inode->i_fop = &v9fs_file_operations;
279 break; 280 break;
280 case S_IFLNK: 281 case S_IFLNK:
281 if (!v9fs_extended(v9ses)) { 282 if (!v9fs_proto_dotu(v9ses)) {
282 P9_DPRINTK(P9_DEBUG_ERROR, 283 P9_DPRINTK(P9_DEBUG_ERROR,
283 "extended modes used w/o 9P2000.u\n"); 284 "extended modes used w/o 9P2000.u\n");
284 err = -EINVAL; 285 err = -EINVAL;
@@ -288,7 +289,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
288 break; 289 break;
289 case S_IFDIR: 290 case S_IFDIR:
290 inc_nlink(inode); 291 inc_nlink(inode);
291 if (v9fs_extended(v9ses)) 292 if (v9fs_proto_dotu(v9ses))
292 inode->i_op = &v9fs_dir_inode_operations_ext; 293 inode->i_op = &v9fs_dir_inode_operations_ext;
293 else 294 else
294 inode->i_op = &v9fs_dir_inode_operations; 295 inode->i_op = &v9fs_dir_inode_operations;
@@ -575,7 +576,8 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
575 flags = O_RDWR; 576 flags = O_RDWR;
576 577
577 fid = v9fs_create(v9ses, dir, dentry, NULL, perm, 578 fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
578 v9fs_uflags2omode(flags, v9fs_extended(v9ses))); 579 v9fs_uflags2omode(flags,
580 v9fs_proto_dotu(v9ses)));
579 if (IS_ERR(fid)) { 581 if (IS_ERR(fid)) {
580 err = PTR_ERR(fid); 582 err = PTR_ERR(fid);
581 fid = NULL; 583 fid = NULL;
@@ -858,7 +860,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
858 if (iattr->ia_valid & ATTR_SIZE) 860 if (iattr->ia_valid & ATTR_SIZE)
859 wstat.length = iattr->ia_size; 861 wstat.length = iattr->ia_size;
860 862
861 if (v9fs_extended(v9ses)) { 863 if (v9fs_proto_dotu(v9ses)) {
862 if (iattr->ia_valid & ATTR_UID) 864 if (iattr->ia_valid & ATTR_UID)
863 wstat.n_uid = iattr->ia_uid; 865 wstat.n_uid = iattr->ia_uid;
864 866
@@ -886,6 +888,8 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
886 struct super_block *sb) 888 struct super_block *sb)
887{ 889{
888 char ext[32]; 890 char ext[32];
891 char tag_name[14];
892 unsigned int i_nlink;
889 struct v9fs_session_info *v9ses = sb->s_fs_info; 893 struct v9fs_session_info *v9ses = sb->s_fs_info;
890 894
891 inode->i_nlink = 1; 895 inode->i_nlink = 1;
@@ -897,11 +901,26 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
897 inode->i_uid = v9ses->dfltuid; 901 inode->i_uid = v9ses->dfltuid;
898 inode->i_gid = v9ses->dfltgid; 902 inode->i_gid = v9ses->dfltgid;
899 903
900 if (v9fs_extended(v9ses)) { 904 if (v9fs_proto_dotu(v9ses)) {
901 inode->i_uid = stat->n_uid; 905 inode->i_uid = stat->n_uid;
902 inode->i_gid = stat->n_gid; 906 inode->i_gid = stat->n_gid;
903 } 907 }
904 908 if ((S_ISREG(inode->i_mode)) || (S_ISDIR(inode->i_mode))) {
909 if (v9fs_proto_dotu(v9ses) && (stat->extension[0] != '\0')) {
910 /*
911 * Hadlink support got added later to
912 * to the .u extension. So there can be
913 * server out there that doesn't support
914 * this even with .u extension. So check
915 * for non NULL stat->extension
916 */
917 strncpy(ext, stat->extension, sizeof(ext));
918 /* HARDLINKCOUNT %u */
919 sscanf(ext, "%13s %u", tag_name, &i_nlink);
920 if (!strncmp(tag_name, "HARDLINKCOUNT", 13))
921 inode->i_nlink = i_nlink;
922 }
923 }
905 inode->i_mode = p9mode2unixmode(v9ses, stat->mode); 924 inode->i_mode = p9mode2unixmode(v9ses, stat->mode);
906 if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) { 925 if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) {
907 char type = 0; 926 char type = 0;
@@ -976,7 +995,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
976 if (IS_ERR(fid)) 995 if (IS_ERR(fid))
977 return PTR_ERR(fid); 996 return PTR_ERR(fid);
978 997
979 if (!v9fs_extended(v9ses)) 998 if (!v9fs_proto_dotu(v9ses))
980 return -EBADF; 999 return -EBADF;
981 1000
982 st = p9_client_stat(fid); 1001 st = p9_client_stat(fid);
@@ -1001,44 +1020,6 @@ done:
1001} 1020}
1002 1021
1003/** 1022/**
1004 * v9fs_vfs_readlink - read a symlink's location
1005 * @dentry: dentry for symlink
1006 * @buffer: buffer to load symlink location into
1007 * @buflen: length of buffer
1008 *
1009 */
1010
1011static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer,
1012 int buflen)
1013{
1014 int retval;
1015 int ret;
1016 char *link = __getname();
1017
1018 if (unlikely(!link))
1019 return -ENOMEM;
1020
1021 if (buflen > PATH_MAX)
1022 buflen = PATH_MAX;
1023
1024 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
1025 dentry);
1026
1027 retval = v9fs_readlink(dentry, link, buflen);
1028
1029 if (retval > 0) {
1030 if ((ret = copy_to_user(buffer, link, retval)) != 0) {
1031 P9_DPRINTK(P9_DEBUG_ERROR,
1032 "problem copying to user: %d\n", ret);
1033 retval = ret;
1034 }
1035 }
1036
1037 __putname(link);
1038 return retval;
1039}
1040
1041/**
1042 * v9fs_vfs_follow_link - follow a symlink path 1023 * v9fs_vfs_follow_link - follow a symlink path
1043 * @dentry: dentry for symlink 1024 * @dentry: dentry for symlink
1044 * @nd: nameidata 1025 * @nd: nameidata
@@ -1104,7 +1085,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
1104 struct p9_fid *fid; 1085 struct p9_fid *fid;
1105 1086
1106 v9ses = v9fs_inode2v9ses(dir); 1087 v9ses = v9fs_inode2v9ses(dir);
1107 if (!v9fs_extended(v9ses)) { 1088 if (!v9fs_proto_dotu(v9ses)) {
1108 P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n"); 1089 P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n");
1109 return -EPERM; 1090 return -EPERM;
1110 } 1091 }
@@ -1230,7 +1211,6 @@ static const struct inode_operations v9fs_dir_inode_operations_ext = {
1230 .rmdir = v9fs_vfs_rmdir, 1211 .rmdir = v9fs_vfs_rmdir,
1231 .mknod = v9fs_vfs_mknod, 1212 .mknod = v9fs_vfs_mknod,
1232 .rename = v9fs_vfs_rename, 1213 .rename = v9fs_vfs_rename,
1233 .readlink = v9fs_vfs_readlink,
1234 .getattr = v9fs_vfs_getattr, 1214 .getattr = v9fs_vfs_getattr,
1235 .setattr = v9fs_vfs_setattr, 1215 .setattr = v9fs_vfs_setattr,
1236}; 1216};
@@ -1253,7 +1233,7 @@ static const struct inode_operations v9fs_file_inode_operations = {
1253}; 1233};
1254 1234
1255static const struct inode_operations v9fs_symlink_inode_operations = { 1235static const struct inode_operations v9fs_symlink_inode_operations = {
1256 .readlink = v9fs_vfs_readlink, 1236 .readlink = generic_readlink,
1257 .follow_link = v9fs_vfs_follow_link, 1237 .follow_link = v9fs_vfs_follow_link,
1258 .put_link = v9fs_vfs_put_link, 1238 .put_link = v9fs_vfs_put_link,
1259 .getattr = v9fs_vfs_getattr, 1239 .getattr = v9fs_vfs_getattr,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 14a86448572c..a271549d9e21 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -37,6 +37,7 @@
37#include <linux/mount.h> 37#include <linux/mount.h>
38#include <linux/idr.h> 38#include <linux/idr.h>
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/slab.h>
40#include <net/9p/9p.h> 41#include <net/9p/9p.h>
41#include <net/9p/client.h> 42#include <net/9p/client.h>
42 43
@@ -188,7 +189,8 @@ static void v9fs_kill_super(struct super_block *s)
188 189
189 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); 190 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
190 191
191 v9fs_dentry_release(s->s_root); /* clunk root */ 192 if (s->s_root)
193 v9fs_dentry_release(s->s_root); /* clunk root */
192 194
193 kill_anon_super(s); 195 kill_anon_super(s);
194 196
diff --git a/fs/Kconfig b/fs/Kconfig
index 64d44efad7a5..5f85b5947613 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -177,6 +177,7 @@ source "fs/efs/Kconfig"
177source "fs/jffs2/Kconfig" 177source "fs/jffs2/Kconfig"
178# UBIFS File system configuration 178# UBIFS File system configuration
179source "fs/ubifs/Kconfig" 179source "fs/ubifs/Kconfig"
180source "fs/logfs/Kconfig"
180source "fs/cramfs/Kconfig" 181source "fs/cramfs/Kconfig"
181source "fs/squashfs/Kconfig" 182source "fs/squashfs/Kconfig"
182source "fs/freevxfs/Kconfig" 183source "fs/freevxfs/Kconfig"
@@ -234,6 +235,7 @@ config NFS_COMMON
234 235
235source "net/sunrpc/Kconfig" 236source "net/sunrpc/Kconfig"
236source "fs/smbfs/Kconfig" 237source "fs/smbfs/Kconfig"
238source "fs/ceph/Kconfig"
237source "fs/cifs/Kconfig" 239source "fs/cifs/Kconfig"
238source "fs/ncpfs/Kconfig" 240source "fs/ncpfs/Kconfig"
239source "fs/coda/Kconfig" 241source "fs/coda/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index af6d04700d9c..97f340f14ba2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -99,6 +99,7 @@ obj-$(CONFIG_NTFS_FS) += ntfs/
99obj-$(CONFIG_UFS_FS) += ufs/ 99obj-$(CONFIG_UFS_FS) += ufs/
100obj-$(CONFIG_EFS_FS) += efs/ 100obj-$(CONFIG_EFS_FS) += efs/
101obj-$(CONFIG_JFFS2_FS) += jffs2/ 101obj-$(CONFIG_JFFS2_FS) += jffs2/
102obj-$(CONFIG_LOGFS) += logfs/
102obj-$(CONFIG_UBIFS_FS) += ubifs/ 103obj-$(CONFIG_UBIFS_FS) += ubifs/
103obj-$(CONFIG_AFFS_FS) += affs/ 104obj-$(CONFIG_AFFS_FS) += affs/
104obj-$(CONFIG_ROMFS_FS) += romfs/ 105obj-$(CONFIG_ROMFS_FS) += romfs/
@@ -124,3 +125,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
124obj-$(CONFIG_BTRFS_FS) += btrfs/ 125obj-$(CONFIG_BTRFS_FS) += btrfs/
125obj-$(CONFIG_GFS2_FS) += gfs2/ 126obj-$(CONFIG_GFS2_FS) += gfs2/
126obj-$(CONFIG_EXOFS_FS) += exofs/ 127obj-$(CONFIG_EXOFS_FS) += exofs/
128obj-$(CONFIG_CEPH_FS) += ceph/
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 9cc18775b832..2ff622f6f547 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -121,7 +121,7 @@ struct adfs_discmap {
121 121
122/* Inode stuff */ 122/* Inode stuff */
123struct inode *adfs_iget(struct super_block *sb, struct object_info *obj); 123struct inode *adfs_iget(struct super_block *sb, struct object_info *obj);
124int adfs_write_inode(struct inode *inode,int unused); 124int adfs_write_inode(struct inode *inode, struct writeback_control *wbc);
125int adfs_notify_change(struct dentry *dentry, struct iattr *attr); 125int adfs_notify_change(struct dentry *dentry, struct iattr *attr);
126 126
127/* map.c */ 127/* map.c */
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 3f57ce4bee5d..0f5e30978135 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -9,6 +9,7 @@
9 */ 9 */
10#include <linux/smp_lock.h> 10#include <linux/smp_lock.h>
11#include <linux/buffer_head.h> 11#include <linux/buffer_head.h>
12#include <linux/writeback.h>
12#include "adfs.h" 13#include "adfs.h"
13 14
14/* 15/*
@@ -360,7 +361,7 @@ out:
360 * The adfs-specific inode data has already been updated by 361 * The adfs-specific inode data has already been updated by
361 * adfs_notify_change() 362 * adfs_notify_change()
362 */ 363 */
363int adfs_write_inode(struct inode *inode, int wait) 364int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
364{ 365{
365 struct super_block *sb = inode->i_sb; 366 struct super_block *sb = inode->i_sb;
366 struct object_info obj; 367 struct object_info obj;
@@ -375,7 +376,7 @@ int adfs_write_inode(struct inode *inode, int wait)
375 obj.attr = ADFS_I(inode)->attr; 376 obj.attr = ADFS_I(inode)->attr;
376 obj.size = inode->i_size; 377 obj.size = inode->i_size;
377 378
378 ret = adfs_dir_update(sb, &obj, wait); 379 ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL);
379 unlock_kernel(); 380 unlock_kernel();
380 return ret; 381 return ret;
381} 382}
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 6910a98bd73c..4a3af7075c1d 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -13,6 +13,7 @@
13#include <linux/parser.h> 13#include <linux/parser.h>
14#include <linux/mount.h> 14#include <linux/mount.h>
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/slab.h>
16#include <linux/smp_lock.h> 17#include <linux/smp_lock.h>
17#include <linux/statfs.h> 18#include <linux/statfs.h>
18#include "adfs.h" 19#include "adfs.h"
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index e511dc621a2e..861dae68ac12 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -106,8 +106,8 @@ struct affs_sb_info {
	u32 s_last_bmap;
	struct buffer_head *s_bmap_bh;
	char *s_prefix;			/* Prefix for volumes and assigns. */
-	int s_prefix_len;		/* Length of prefix. */
	char s_volume[32];		/* Volume prefix for absolute symlinks. */
+	spinlock_t symlink_lock;	/* protects the previous two */
 };
 
 #define SF_INTL		0x0001		/* International filesystem. */
@@ -175,7 +175,8 @@ extern void affs_delete_inode(struct inode *inode);
 extern void affs_clear_inode(struct inode *inode);
 extern struct inode		*affs_iget(struct super_block *sb,
					unsigned long ino);
-extern int			 affs_write_inode(struct inode *inode, int);
+extern int			 affs_write_inode(struct inode *inode,
					struct writeback_control *wbc);
 extern int			 affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type);
 
 /* file.c */
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index dc5ef14bdc1c..3e262711ae06 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -7,6 +7,7 @@
  * block allocation, deallocation, calculation of free space.
  */
 
+#include <linux/slab.h>
 #include "affs.h"
 
 /* This is, of course, shamelessly stolen from fs/minix */
@@ -128,7 +129,7 @@ err_range:
 /*
  * Allocate a block in the given allocation zone.
  * Since we have to byte-swap the bitmap on little-endian
- * machines, this is rather expensive. Therefor we will
+ * machines, this is rather expensive. Therefore we will
  * preallocate up to 16 blocks from the same word, if
  * possible. We are not doing preallocations in the
  * header zone, though.
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 3c4ec7d864c4..f4b2a4ee4f91 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -10,6 +10,7 @@
  * (C) 1991  Linus Torvalds - minix filesystem
  */
 #include <linux/sched.h>
+#include <linux/gfp.h>
 #include "affs.h"
 
 extern const struct inode_operations affs_symlink_inode_operations;
@@ -166,7 +167,7 @@ bad_inode:
 }
 
 int
-affs_write_inode(struct inode *inode, int unused)
+affs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
	struct super_block *sb = inode->i_sb;
	struct buffer_head *bh;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 960d336ec694..d70bbbac6b7b 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -341,10 +341,13 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
	p  = (char *)AFFS_HEAD(bh)->table;
	lc = '/';
	if (*symname == '/') {
+		struct affs_sb_info *sbi = AFFS_SB(sb);
		while (*symname == '/')
			symname++;
-		while (AFFS_SB(sb)->s_volume[i])	/* Cannot overflow */
-			*p++ = AFFS_SB(sb)->s_volume[i++];
+		spin_lock(&sbi->symlink_lock);
+		while (sbi->s_volume[i])	/* Cannot overflow */
+			*p++ = sbi->s_volume[i++];
+		spin_unlock(&sbi->symlink_lock);
	}
	while (i < maxlen && (c = *symname++)) {
		if (c == '.' && lc == '/' && *symname == '.' && symname[1] == '/') {
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 104fdcb3a7fc..16a3e4765f68 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -17,6 +17,7 @@
 #include <linux/magic.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include "affs.h"
 
 extern struct timezone sys_tz;
@@ -203,7 +204,7 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
		switch (token) {
		case Opt_bs:
			if (match_int(&args[0], &n))
-				return -EINVAL;
+				return 0;
			if (n != 512 && n != 1024 && n != 2048
			    && n != 4096) {
				printk ("AFFS: Invalid blocksize (512, 1024, 2048, 4096 allowed)\n");
@@ -213,7 +214,7 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
			break;
		case Opt_mode:
			if (match_octal(&args[0], &option))
-				return 1;
+				return 0;
			*mode = option & 0777;
			*mount_opts |= SF_SETMODE;
			break;
@@ -221,8 +222,6 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
			*mount_opts |= SF_MUFS;
			break;
		case Opt_prefix:
-			/* Free any previous prefix */
-			kfree(*prefix);
			*prefix = match_strdup(&args[0]);
			if (!*prefix)
				return 0;
@@ -233,21 +232,21 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
			break;
		case Opt_reserved:
			if (match_int(&args[0], reserved))
-				return 1;
+				return 0;
			break;
		case Opt_root:
			if (match_int(&args[0], root))
-				return 1;
+				return 0;
			break;
		case Opt_setgid:
			if (match_int(&args[0], &option))
-				return 1;
+				return 0;
			*gid = option;
			*mount_opts |= SF_SETGID;
			break;
		case Opt_setuid:
			if (match_int(&args[0], &option))
-				return -EINVAL;
+				return 0;
			*uid = option;
			*mount_opts |= SF_SETUID;
			break;
@@ -311,11 +310,14 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
		return -ENOMEM;
	sb->s_fs_info = sbi;
	mutex_init(&sbi->s_bmlock);
+	spin_lock_init(&sbi->symlink_lock);
 
	if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block,
				&blocksize,&sbi->s_prefix,
				sbi->s_volume, &mount_flags)) {
		printk(KERN_ERR "AFFS: Error parsing options\n");
+		kfree(sbi->s_prefix);
+		kfree(sbi);
		return -EINVAL;
	}
	/* N.B. after this point s_prefix must be released */
@@ -516,14 +518,18 @@ affs_remount(struct super_block *sb, int *flags, char *data)
	unsigned long		 mount_flags;
	int			 res = 0;
	char			*new_opts = kstrdup(data, GFP_KERNEL);
+	char			 volume[32];
+	char			*prefix = NULL;
 
	pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data);
 
	*flags |= MS_NODIRATIME;
 
+	memcpy(volume, sbi->s_volume, 32);
	if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block,
-			   &blocksize, &sbi->s_prefix, sbi->s_volume,
+			   &blocksize, &prefix, volume,
			   &mount_flags)) {
+		kfree(prefix);
		kfree(new_opts);
		return -EINVAL;
	}
@@ -534,6 +540,14 @@ affs_remount(struct super_block *sb, int *flags, char *data)
	sbi->s_mode  = mode;
	sbi->s_uid   = uid;
	sbi->s_gid   = gid;
+	/* protect against readers */
+	spin_lock(&sbi->symlink_lock);
+	if (prefix) {
+		kfree(sbi->s_prefix);
+		sbi->s_prefix = prefix;
+	}
+	memcpy(sbi->s_volume, volume, 32);
+	spin_unlock(&sbi->symlink_lock);
 
	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
		unlock_kernel();
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index 41782539c907..ee00f08c4f53 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -20,7 +20,6 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
	int			 i, j;
	char			 c;
	char			 lc;
-	char			*pf;
 
	pr_debug("AFFS: follow_link(ino=%lu)\n",inode->i_ino);
 
@@ -32,11 +31,15 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
	j  = 0;
	lf = (struct slink_front *)bh->b_data;
	lc = 0;
-	pf = AFFS_SB(inode->i_sb)->s_prefix ? AFFS_SB(inode->i_sb)->s_prefix : "/";
 
	if (strchr(lf->symname,':')) {	/* Handle assign or volume name */
+		struct affs_sb_info *sbi = AFFS_SB(inode->i_sb);
+		char *pf;
+		spin_lock(&sbi->symlink_lock);
+		pf = sbi->s_prefix ? sbi->s_prefix : "/";
		while (i < 1023 && (c = pf[i]))
			link[i++] = c;
+		spin_unlock(&sbi->symlink_lock);
		while (i < 1023 && lf->symname[j] != ':')
			link[i++] = lf->symname[j++];
		if (i < 1023)
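
The affs changes above form one pattern: affs_remount() stages the new prefix and volume in locals and only publishes them under symlink_lock, while the symlink reader takes the same lock around its copy, so no reader ever sees a half-updated pair. A condensed sketch of that stage-then-publish discipline (illustrative only; the cfg type and names are hypothetical):

	struct cfg {
		spinlock_t lock;	/* protects prefix and volume */
		char *prefix;
		char volume[32];
	};

	static void cfg_publish(struct cfg *c, char *new_prefix, const char *new_volume)
	{
		/* build the new values first, then swap in one short critical section */
		spin_lock(&c->lock);
		if (new_prefix) {
			kfree(c->prefix);
			c->prefix = new_prefix;	/* takes ownership */
		}
		memcpy(c->volume, new_volume, sizeof(c->volume));
		spin_unlock(&c->lock);
	}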
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index e2b1d3f16519..0fb315dd4d2a 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -9,7 +9,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/slab.h>
 #include <linux/sched.h>
 #include "internal.h"
 
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index eb765489164f..a3bcec75c54a 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -11,6 +11,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/ip.h>
 #include "internal.h"
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 88067f36e5e7..adc1cb771b57 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 39b301662f22..0df9bc2b724d 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -12,10 +12,10 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
+#include <linux/gfp.h>
 #include "internal.h"
 
 static int afs_readpage(struct file *file, struct page *page);
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 023b95b0d9d7..4bd0218473a9 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/circ_buf.h>
 #include "internal.h"
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index c048f0658751..d00b312e3110 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -16,7 +16,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/sched.h>
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6ece2a13bf71..c54dad4e6063 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -733,7 +733,6 @@ extern int afs_write_end(struct file *file, struct address_space *mapping,
			struct page *page, void *fsdata);
 extern int afs_writepage(struct page *, struct writeback_control *);
 extern int afs_writepages(struct address_space *, struct writeback_control *);
-extern int afs_write_inode(struct inode *, int);
 extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
 extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
			      unsigned long, loff_t);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 5ffb570cd3a8..5e813a816ce4 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -12,11 +12,11 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/gfp.h>
 #include "internal.h"
 
 
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index bde3f19c0995..67cf810e0fd6 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/slab.h>
 #include <net/sock.h>
 #include <net/af_rxrpc.h>
 #include <rxrpc/packet.h>
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3ef504370034..bb4ed144d0e4 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -189,8 +189,9 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
	if (!permits)
		goto out_unlock;
 
-	memcpy(permits->permits, xpermits->permits,
-	       count * sizeof(struct afs_permit));
+	if (xpermits)
+		memcpy(permits->permits, xpermits->permits,
+		       count * sizeof(struct afs_permit));
 
	_debug("key %x access %x",
	       key_serial(key), vnode->status.caller_access);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index e1ea1c240b6a..14f6431598ad 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -48,7 +48,6 @@ struct file_system_type afs_fs_type = {
 static const struct super_operations afs_super_ops = {
	.statfs		= afs_statfs,
	.alloc_inode	= afs_alloc_inode,
-	.write_inode	= afs_write_inode,
	.destroy_inode	= afs_destroy_inode,
	.clear_inode	= afs_clear_inode,
	.put_super	= afs_put_super,
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 36c1306e09e0..340afd0cd182 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 6e689208def2..9ac260d1361d 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -11,6 +11,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 2f05c4fc2a70..25cf4c3f4ff7 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c63a3c8beb73..3bed54a294d4 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -585,27 +585,6 @@ int afs_writepages(struct address_space *mapping,
 }
 
 /*
- * write an inode back
- */
-int afs_write_inode(struct inode *inode, int sync)
-{
-	struct afs_vnode *vnode = AFS_FS_I(inode);
-	int ret;
-
-	_enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
-
-	ret = 0;
-	if (sync) {
-		ret = filemap_fdatawait(inode->i_mapping);
-		if (ret < 0)
-			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-	}
-
-	_leave(" = %d", ret);
-	return ret;
-}
-
-/*
  * completion of write to server
  */
 void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
@@ -671,7 +650,6 @@ ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,
	struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
	ssize_t result;
	size_t count = iov_length(iov, nr_segs);
-	int ret;
 
	_enter("{%x.%u},{%zu},%lu,",
	       vnode->fid.vid, vnode->fid.vnode, count, nr_segs);
@@ -691,13 +669,6 @@ ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,
		return result;
	}
 
-	/* return error values for O_SYNC and IS_SYNC() */
-	if (IS_SYNC(&vnode->vfs_inode) || iocb->ki_filp->f_flags & O_SYNC) {
-		ret = afs_fsync(iocb->ki_filp, dentry, 1);
-		if (ret < 0)
-			result = ret;
-	}
-
	_leave(" = %zd", result);
	return result;
 }
diff --git a/fs/aio.c b/fs/aio.c
index 02a2c9340573..1cf12b3dd83a 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -15,6 +15,7 @@
 #include <linux/aio_abi.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/backing-dev.h>
 #include <linux/uio.h>
 
 #define DEBUG	0
@@ -32,6 +33,9 @@
 #include <linux/workqueue.h>
 #include <linux/security.h>
 #include <linux/eventfd.h>
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/hash.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -60,6 +64,14 @@ static DECLARE_WORK(fput_work, aio_fput_routine);
 static DEFINE_SPINLOCK(fput_lock);
 static LIST_HEAD(fput_head);
 
+#define AIO_BATCH_HASH_BITS	3 /* allocated on-stack, so don't go crazy */
+#define AIO_BATCH_HASH_SIZE	(1 << AIO_BATCH_HASH_BITS)
+struct aio_batch_entry {
+	struct hlist_node list;
+	struct address_space *mapping;
+};
+mempool_t *abe_pool;
+
 static void aio_kick_handler(struct work_struct *);
 static void aio_queue_work(struct kioctx *);
 
@@ -73,6 +85,8 @@ static int __init aio_setup(void)
	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
	aio_wq = create_workqueue("aio");
+	abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
+	BUG_ON(!abe_pool);
 
	pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
 
@@ -697,10 +711,8 @@ static ssize_t aio_run_iocb(struct kiocb *iocb)
	 */
	ret = retry(iocb);
 
-	if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
-		BUG_ON(!list_empty(&iocb->ki_wait.task_list));
+	if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED)
		aio_complete(iocb, ret, 0);
-	}
 out:
	spin_lock_irq(&ctx->ctx_lock);
 
@@ -852,13 +864,6 @@ static void try_queue_kicked_iocb(struct kiocb *iocb)
	unsigned long flags;
	int run = 0;
 
-	/* We're supposed to be the only path putting the iocb back on the run
-	 * list. If we find that the iocb is *back* on a wait queue already
-	 * than retry has happened before we could queue the iocb. This also
-	 * means that the retry could have completed and freed our iocb, no
-	 * good. */
-	BUG_ON((!list_empty(&iocb->ki_wait.task_list)));
-
	spin_lock_irqsave(&ctx->ctx_lock, flags);
	/* set this inside the lock so that we can't race with aio_run_iocb()
	 * testing it and putting the iocb on the run list under the lock */
@@ -872,7 +877,7 @@ static void try_queue_kicked_iocb(struct kiocb *iocb)
 /*
  * kick_iocb:
  *      Called typically from a wait queue callback context
- *      (aio_wake_function) to trigger a retry of the iocb.
+ *      to trigger a retry of the iocb.
  *      The retry is usually executed by aio workqueue
  *      threads (See aio_kick_handler).
  */
@@ -1506,33 +1511,44 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
	return 0;
 }
 
-/*
- * aio_wake_function:
- *	wait queue callback function for aio notification,
- *	Simply triggers a retry of the operation via kick_iocb.
- *
- *	This callback is specified in the wait queue entry in
- *	a kiocb.
- *
- * Note:
- * This routine is executed with the wait queue lock held.
- * Since kick_iocb acquires iocb->ctx->ctx_lock, it nests
- * the ioctx lock inside the wait queue lock. This is safe
- * because this callback isn't used for wait queues which
- * are nested inside ioctx lock (i.e. ctx->wait)
- */
-static int aio_wake_function(wait_queue_t *wait, unsigned mode,
-			     int sync, void *key)
+static void aio_batch_add(struct address_space *mapping,
+			  struct hlist_head *batch_hash)
+{
+	struct aio_batch_entry *abe;
+	struct hlist_node *pos;
+	unsigned bucket;
+
+	bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
+	hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
+		if (abe->mapping == mapping)
+			return;
+	}
+
+	abe = mempool_alloc(abe_pool, GFP_KERNEL);
+	BUG_ON(!igrab(mapping->host));
+	abe->mapping = mapping;
+	hlist_add_head(&abe->list, &batch_hash[bucket]);
+	return;
+}
+
+static void aio_batch_free(struct hlist_head *batch_hash)
 {
-	struct kiocb *iocb = container_of(wait, struct kiocb, ki_wait);
+	struct aio_batch_entry *abe;
+	struct hlist_node *pos, *n;
+	int i;
 
-	list_del_init(&wait->task_list);
-	kick_iocb(iocb);
-	return 1;
+	for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
+		hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
+			blk_run_address_space(abe->mapping);
+			iput(abe->mapping->host);
+			hlist_del(&abe->list);
+			mempool_free(abe, abe_pool);
+		}
+	}
 }
 
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-			 struct iocb *iocb)
+			 struct iocb *iocb, struct hlist_head *batch_hash)
 {
	struct kiocb *req;
	struct file *file;
@@ -1592,8 +1608,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
	req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf;
	req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
	req->ki_opcode = iocb->aio_lio_opcode;
-	init_waitqueue_func_entry(&req->ki_wait, aio_wake_function);
-	INIT_LIST_HEAD(&req->ki_wait.task_list);
 
	ret = aio_setup_iocb(req);
 
@@ -1608,6 +1622,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
		;
	}
	spin_unlock_irq(&ctx->ctx_lock);
+	if (req->ki_opcode == IOCB_CMD_PREAD ||
+	    req->ki_opcode == IOCB_CMD_PREADV ||
+	    req->ki_opcode == IOCB_CMD_PWRITE ||
+	    req->ki_opcode == IOCB_CMD_PWRITEV)
+		aio_batch_add(file->f_mapping, batch_hash);
+
	aio_put_req(req);	/* drop extra ref to req */
	return 0;
 
@@ -1635,6 +1655,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
	struct kioctx *ctx;
	long ret = 0;
	int i;
+	struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
 
	if (unlikely(nr < 0))
		return -EINVAL;
@@ -1666,10 +1687,11 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
			break;
		}
 
-		ret = io_submit_one(ctx, user_iocb, &tmp);
+		ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
		if (ret)
			break;
	}
+	aio_batch_free(batch_hash);
 
	put_ioctx(ctx);
	return i ? i : ret;
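
The io_submit() change is the visible half of the batching: each queued read/write records its address_space at most once in the on-stack hash, and only after the whole batch has been queued does aio_batch_free() run the request queue once per distinct mapping instead of once per iocb. The resulting call shape, condensed for illustration (not a drop-in implementation):

	struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
	long submitted;

	for (submitted = 0; submitted < nr; submitted++) {
		/* copy in the iocb and queue it; PREAD/PREADV/PWRITE/PWRITEV
		 * requests also call aio_batch_add(file->f_mapping, batch_hash),
		 * which dedupes by pointer via hash_ptr() */
		if (io_submit_one(ctx, user_iocb, &tmp, batch_hash))
			break;
	}
	/* one blk_run_address_space() + iput() per mapping seen above */
	aio_batch_free(batch_hash);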
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 2ca7a7cafdbf..e4b75d6eda83 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -12,7 +12,6 @@
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
@@ -35,14 +34,13 @@ static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags,
			     mnt);
 }
 
-static int anon_inodefs_delete_dentry(struct dentry *dentry)
+/*
+ * anon_inodefs_dname() is called from d_path().
+ */
+static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
 {
-	/*
-	 * We faked vfs to believe the dentry was hashed when we created it.
-	 * Now we restore the flag so that dput() will work correctly.
-	 */
-	dentry->d_flags |= DCACHE_UNHASHED;
-	return 1;
+	return dynamic_dname(dentry, buffer, buflen, "anon_inode:%s",
+				dentry->d_name.name);
 }
 
 static struct file_system_type anon_inode_fs_type = {
@@ -51,7 +49,7 @@ static struct file_system_type anon_inode_fs_type = {
	.kill_sb	= kill_anon_super,
 };
 static const struct dentry_operations anon_inodefs_dentry_operations = {
-	.d_delete	= anon_inodefs_delete_dentry,
+	.d_dname	= anon_inodefs_dname,
 };
 
 /*
@@ -88,7 +86,7 @@ struct file *anon_inode_getfile(const char *name,
				void *priv, int flags)
 {
	struct qstr this;
-	struct dentry *dentry;
+	struct path path;
	struct file *file;
	int error;
 
@@ -106,10 +104,11 @@ struct file *anon_inode_getfile(const char *name,
	this.name = name;
	this.len = strlen(name);
	this.hash = 0;
-	dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
-	if (!dentry)
+	path.dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
+	if (!path.dentry)
		goto err_module;
 
+	path.mnt = mntget(anon_inode_mnt);
	/*
	 * We know the anon_inode inode count is always greater than zero,
	 * so we can avoid doing an igrab() and we can use an open-coded
@@ -117,27 +116,24 @@ struct file *anon_inode_getfile(const char *name,
	 */
	atomic_inc(&anon_inode_inode->i_count);
 
-	dentry->d_op = &anon_inodefs_dentry_operations;
-	/* Do not publish this dentry inside the global dentry hash table */
-	dentry->d_flags &= ~DCACHE_UNHASHED;
-	d_instantiate(dentry, anon_inode_inode);
+	path.dentry->d_op = &anon_inodefs_dentry_operations;
+	d_instantiate(path.dentry, anon_inode_inode);
 
	error = -ENFILE;
-	file = alloc_file(anon_inode_mnt, dentry,
-			  FMODE_READ | FMODE_WRITE, fops);
+	file = alloc_file(&path, OPEN_FMODE(flags), fops);
	if (!file)
		goto err_dput;
	file->f_mapping = anon_inode_inode->i_mapping;
 
	file->f_pos = 0;
-	file->f_flags = O_RDWR | (flags & O_NONBLOCK);
+	file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
	file->f_version = 0;
	file->private_data = priv;
 
	return file;
 
 err_dput:
-	dput(dentry);
+	path_put(&path);
 err_module:
	module_put(fops->owner);
	return ERR_PTR(error);
@@ -212,6 +208,7 @@ static struct inode *anon_inode_mkinode(void)
	inode->i_mode = S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
+	inode->i_flags |= S_PRIVATE;
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
	return inode;
 }
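
anon_inodefs now names its dentries on demand through ->d_dname instead of keeping fake-hashed dentries, the same trick pipefs and sockfs use: d_path() invokes the callback and dynamic_dname() formats the synthetic name into the caller's buffer. A sketch for a similar pseudo filesystem (the foofs names are hypothetical):

	static char *foofs_dname(struct dentry *dentry, char *buffer, int buflen)
	{
		/* dynamic_dname() writes "foofs:[<ino>]" into buffer for d_path() */
		return dynamic_dname(dentry, buffer, buflen, "foofs:[%lu]",
					dentry->d_inode->i_ino);
	}

	static const struct dentry_operations foofs_dentry_operations = {
		.d_dname	= foofs_dname,
	};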
diff --git a/fs/attr.c b/fs/attr.c
index 96d394bdaddf..0815e93bb487 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -12,7 +12,6 @@
 #include <linux/capability.h>
 #include <linux/fsnotify.h>
 #include <linux/fcntl.h>
-#include <linux/quotaops.h>
 #include <linux/security.h>
 
 /* Taken over from the old code... */
@@ -82,7 +81,7 @@ int inode_newsize_ok(const struct inode *inode, loff_t offset)
	if (inode->i_size < offset) {
		unsigned long limit;
 
-		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+		limit = rlimit(RLIMIT_FSIZE);
		if (limit != RLIM_INFINITY && offset > limit)
			goto out_sig;
		if (offset > inode->i_sb->s_maxbytes)
@@ -212,14 +211,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
		error = inode->i_op->setattr(dentry, attr);
	} else {
		error = inode_change_ok(inode, attr);
-		if (!error) {
-			if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
-			    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid))
-				error = vfs_dq_transfer(inode, attr) ?
-					-EDQUOT : 0;
-			if (!error)
-				error = inode_setattr(inode, attr);
-		}
+		if (!error)
+			error = inode_setattr(inode, attr);
	}
 
	if (ia_valid & ATTR_SIZE)
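
rlimit() is a small accessor introduced for exactly this kind of call site; it reads the current task's soft limit. Roughly what it expands to, as an open-coded sketch matching the line it replaces (illustrative; the real helper lives in linux/sched.h):

	static inline unsigned long rlimit_cur_sketch(unsigned int limit)
	{
		/* equivalent of rlimit(limit) for the current task */
		return current->signal->rlim[limit].rlim_cur;
	}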
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 4a1401cea0a1..8713c7cfbc79 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -13,6 +13,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/smp_lock.h>
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 8f7cdde41733..3d283abf67d7 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -75,6 +75,8 @@ struct autofs_info {
	struct completion expire_complete;
 
	struct list_head active;
+	int active_count;
+
	struct list_head expiring;
 
	struct autofs_sb_info *sbi;
@@ -95,6 +97,7 @@ struct autofs_info {
 
 #define AUTOFS_INF_EXPIRING	(1<<0) /* dentry is in the process of expiring */
 #define AUTOFS_INF_MOUNTPOINT	(1<<1) /* mountpoint status for direct expire */
+#define AUTOFS_INF_PENDING	(1<<2) /* dentry pending mount */
 
 struct autofs_wait_queue {
	wait_queue_head_t queue;
@@ -161,7 +164,7 @@ static inline int autofs4_ispending(struct dentry *dentry)
 {
	struct autofs_info *inf = autofs4_dentry_ino(dentry);
 
-	if (dentry->d_flags & DCACHE_AUTOFS_PENDING)
+	if (inf->flags & AUTOFS_INF_PENDING)
		return 1;
 
	if (inf->flags & AUTOFS_INF_EXPIRING)
@@ -264,5 +267,31 @@ out:
	return ret;
 }
 
+static inline void autofs4_add_expiring(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		spin_lock(&sbi->lookup_lock);
+		if (list_empty(&ino->expiring))
+			list_add(&ino->expiring, &sbi->expiring_list);
+		spin_unlock(&sbi->lookup_lock);
+	}
+	return;
+}
+
+static inline void autofs4_del_expiring(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		spin_lock(&sbi->lookup_lock);
+		if (!list_empty(&ino->expiring))
+			list_del_init(&ino->expiring);
+		spin_unlock(&sbi->lookup_lock);
+	}
+	return;
+}
+
 void autofs4_dentry_release(struct dentry *);
 extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 00bf8fcb245f..d29b7f6df862 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -22,6 +22,7 @@
 #include <linux/magic.h>
 #include <linux/dcache.h>
 #include <linux/uaccess.h>
+#include <linux/slab.h>
 
 #include "autofs_i.h"
 
@@ -544,10 +545,9 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
			goto out;
		devid = new_encode_dev(path.mnt->mnt_sb->s_dev);
		err = 0;
-		if (path.dentry->d_inode &&
-		    path.mnt->mnt_root == path.dentry) {
+		if (path.mnt->mnt_root == path.dentry) {
			err = 1;
-			magic = path.dentry->d_inode->i_sb->s_magic;
+			magic = path.mnt->mnt_sb->s_magic;
		}
	} else {
		dev_t dev = sbi->sb->s_dev;
@@ -560,10 +560,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 
		err = have_submounts(path.dentry);
 
-		if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) {
-			if (follow_down(&path))
-				magic = path.mnt->mnt_sb->s_magic;
-		}
+		if (follow_down(&path))
+			magic = path.mnt->mnt_sb->s_magic;
	}
 
	param->ismountpoint.out.devid = devid;
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 3da18d453488..a796c9417fb1 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -27,7 +27,7 @@ static inline int autofs4_can_expire(struct dentry *dentry,
		return 0;
 
	/* No point expiring a pending mount */
-	if (dentry->d_flags & DCACHE_AUTOFS_PENDING)
+	if (ino->flags & AUTOFS_INF_PENDING)
		return 0;
 
	if (!do_now) {
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 69c8142da838..821b2b955dac 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -49,6 +49,7 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
		ino->dentry = NULL;
		ino->size = 0;
		INIT_LIST_HEAD(&ino->active);
+		ino->active_count = 0;
		INIT_LIST_HEAD(&ino->expiring);
		atomic_set(&ino->count, 0);
	}
@@ -95,63 +96,6 @@ void autofs4_free_ino(struct autofs_info *ino)
	kfree(ino);
 }
 
-/*
- * Deal with the infamous "Busy inodes after umount ..." message.
- *
- * Clean up the dentry tree. This happens with autofs if the user
- * space program goes away due to a SIGKILL, SIGSEGV etc.
- */
-static void autofs4_force_release(struct autofs_sb_info *sbi)
-{
-	struct dentry *this_parent = sbi->sb->s_root;
-	struct list_head *next;
-
-	if (!sbi->sb->s_root)
-		return;
-
-	spin_lock(&dcache_lock);
-repeat:
-	next = this_parent->d_subdirs.next;
-resume:
-	while (next != &this_parent->d_subdirs) {
-		struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
-
-		/* Negative dentry - don`t care */
-		if (!simple_positive(dentry)) {
-			next = next->next;
-			continue;
-		}
-
-		if (!list_empty(&dentry->d_subdirs)) {
-			this_parent = dentry;
-			goto repeat;
-		}
-
-		next = next->next;
-		spin_unlock(&dcache_lock);
-
-		DPRINTK("dentry %p %.*s",
-			dentry, (int)dentry->d_name.len, dentry->d_name.name);
-
-		dput(dentry);
-		spin_lock(&dcache_lock);
-	}
-
-	if (this_parent != sbi->sb->s_root) {
-		struct dentry *dentry = this_parent;
-
-		next = this_parent->d_u.d_child.next;
-		this_parent = this_parent->d_parent;
-		spin_unlock(&dcache_lock);
-		DPRINTK("parent dentry %p %.*s",
-			dentry, (int)dentry->d_name.len, dentry->d_name.name);
-		dput(dentry);
-		spin_lock(&dcache_lock);
-		goto resume;
-	}
-	spin_unlock(&dcache_lock);
-}
-
 void autofs4_kill_sb(struct super_block *sb)
 {
	struct autofs_sb_info *sbi = autofs4_sbi(sb);
@@ -168,15 +112,12 @@ void autofs4_kill_sb(struct super_block *sb)
	/* Free wait queues, close pipe */
	autofs4_catatonic_mode(sbi);
 
-	/* Clean up and release dangling references */
-	autofs4_force_release(sbi);
-
	sb->s_fs_info = NULL;
	kfree(sbi);
 
 out_kill_sb:
	DPRINTK("shutting down");
-	kill_anon_super(sb);
+	kill_litter_super(sb);
 }
 
 static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
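
Dropping the hand-rolled autofs4_force_release() works because kill_litter_super() already performs the equivalent dentry-tree teardown through d_genocide(). Its in-tree definition is short (sketch of the fs/super.c helper of this era, for reference):

	void kill_litter_super(struct super_block *sb)
	{
		/* drop the references pinning dentries left behind by the
		 * daemon, then tear the anonymous superblock down */
		if (sb->s_root)
			d_genocide(sb->s_root);
		kill_anon_super(sb);
	}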
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index b96a3c57359d..109a6c606d92 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -15,6 +15,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
 #include "autofs_i.h"
@@ -72,6 +73,46 @@ const struct inode_operations autofs4_dir_inode_operations = {
	.rmdir		= autofs4_dir_rmdir,
 };
 
+static void autofs4_add_active(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		spin_lock(&sbi->lookup_lock);
+		if (!ino->active_count) {
+			if (list_empty(&ino->active))
+				list_add(&ino->active, &sbi->active_list);
+		}
+		ino->active_count++;
+		spin_unlock(&sbi->lookup_lock);
+	}
+	return;
+}
+
+static void autofs4_del_active(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		spin_lock(&sbi->lookup_lock);
+		ino->active_count--;
+		if (!ino->active_count) {
+			if (!list_empty(&ino->active))
+				list_del_init(&ino->active);
+		}
+		spin_unlock(&sbi->lookup_lock);
+	}
+	return;
+}
+
+static unsigned int autofs4_need_mount(unsigned int flags)
+{
+	unsigned int res = 0;
+	if (flags & (TRIGGER_FLAGS | TRIGGER_INTENTS))
+		res = 1;
+	return res;
+}
+
 static int autofs4_dir_open(struct inode *inode, struct file *file)
 {
	struct dentry *dentry = file->f_path.dentry;
@@ -93,7 +134,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
	 * it.
	 */
	spin_lock(&dcache_lock);
-	if (!d_mountpoint(dentry) && __simple_empty(dentry)) {
+	if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
		spin_unlock(&dcache_lock);
		return -ENOENT;
	}
@@ -126,32 +167,32 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
 
	/* Turn this into a real negative dentry? */
	if (status == -ENOENT) {
-		spin_lock(&dentry->d_lock);
-		dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-		spin_unlock(&dentry->d_lock);
+		spin_lock(&sbi->fs_lock);
+		ino->flags &= ~AUTOFS_INF_PENDING;
+		spin_unlock(&sbi->fs_lock);
		return status;
	} else if (status) {
		/* Return a negative dentry, but leave it "pending" */
		return status;
	}
	/* Trigger mount for path component or follow link */
-	} else if (dentry->d_flags & DCACHE_AUTOFS_PENDING ||
-		   flags & (TRIGGER_FLAGS | TRIGGER_INTENTS) ||
+	} else if (ino->flags & AUTOFS_INF_PENDING ||
+		   autofs4_need_mount(flags) ||
		   current->link_count) {
		DPRINTK("waiting for mount name=%.*s",
			dentry->d_name.len, dentry->d_name.name);
 
-		spin_lock(&dentry->d_lock);
-		dentry->d_flags |= DCACHE_AUTOFS_PENDING;
-		spin_unlock(&dentry->d_lock);
+		spin_lock(&sbi->fs_lock);
+		ino->flags |= AUTOFS_INF_PENDING;
+		spin_unlock(&sbi->fs_lock);
		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
 
		DPRINTK("mount done status=%d", status);
 
		if (status) {
-			spin_lock(&dentry->d_lock);
-			dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-			spin_unlock(&dentry->d_lock);
+			spin_lock(&sbi->fs_lock);
+			ino->flags &= ~AUTOFS_INF_PENDING;
+			spin_unlock(&sbi->fs_lock);
			return status;
		}
	}
@@ -160,9 +201,9 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
	if (ino)
		ino->last_used = jiffies;
 
-	spin_lock(&dentry->d_lock);
-	dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-	spin_unlock(&dentry->d_lock);
+	spin_lock(&sbi->fs_lock);
+	ino->flags &= ~AUTOFS_INF_PENDING;
+	spin_unlock(&sbi->fs_lock);
 
	return 0;
 }
@@ -202,19 +243,24 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
	autofs4_expire_wait(dentry);
 
	/* We trigger a mount for almost all flags */
-	lookup_type = nd->flags & (TRIGGER_FLAGS | TRIGGER_INTENTS);
-	if (!(lookup_type || dentry->d_flags & DCACHE_AUTOFS_PENDING))
+	lookup_type = autofs4_need_mount(nd->flags);
+	spin_lock(&sbi->fs_lock);
+	spin_lock(&dcache_lock);
+	if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) {
+		spin_unlock(&dcache_lock);
+		spin_unlock(&sbi->fs_lock);
		goto follow;
+	}
 
	/*
	 * If the dentry contains directories then it is an autofs
	 * multi-mount with no root mount offset. So don't try to
	 * mount it again.
	 */
-	spin_lock(&dcache_lock);
-	if (dentry->d_flags & DCACHE_AUTOFS_PENDING ||
-	    (!d_mountpoint(dentry) && __simple_empty(dentry))) {
+	if (ino->flags & AUTOFS_INF_PENDING ||
+	    (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
		spin_unlock(&dcache_lock);
+		spin_unlock(&sbi->fs_lock);
 
		status = try_to_fill_dentry(dentry, 0);
		if (status)
@@ -223,6 +269,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
		goto follow;
	}
	spin_unlock(&dcache_lock);
+	spin_unlock(&sbi->fs_lock);
 follow:
	/*
	 * If there is no root mount it must be an autofs
@@ -294,8 +341,7 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
	/* Check for a non-mountpoint directory with no contents */
	spin_lock(&dcache_lock);
	if (S_ISDIR(dentry->d_inode->i_mode) &&
-	    !d_mountpoint(dentry) &&
-	    __simple_empty(dentry)) {
+	    !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
		DPRINTK("dentry=%p %.*s, emptydir",
			dentry, dentry->d_name.len, dentry->d_name.name);
		spin_unlock(&dcache_lock);
@@ -359,8 +405,11 @@ static const struct dentry_operations autofs4_dentry_operations = {
	.d_release	= autofs4_dentry_release,
 };
 
-static struct dentry *autofs4_lookup_active(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name)
+static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 {
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct dentry *parent = dentry->d_parent;
+	struct qstr *name = &dentry->d_name;
	unsigned int len = name->len;
	unsigned int hash = name->hash;
	const unsigned char *str = name->name;
@@ -371,23 +420,23 @@ static struct dentry *autofs4_lookup_active(struct autofs_sb_info *sbi, struct d
	head = &sbi->active_list;
	list_for_each(p, head) {
		struct autofs_info *ino;
-		struct dentry *dentry;
+		struct dentry *active;
		struct qstr *qstr;
 
		ino = list_entry(p, struct autofs_info, active);
-		dentry = ino->dentry;
+		active = ino->dentry;
 
-		spin_lock(&dentry->d_lock);
+		spin_lock(&active->d_lock);
 
		/* Already gone? */
-		if (atomic_read(&dentry->d_count) == 0)
+		if (atomic_read(&active->d_count) == 0)
			goto next;
 
-		qstr = &dentry->d_name;
+		qstr = &active->d_name;
 
-		if (dentry->d_name.hash != hash)
+		if (active->d_name.hash != hash)
			goto next;
-		if (dentry->d_parent != parent)
+		if (active->d_parent != parent)
			goto next;
 
		if (qstr->len != len)
@@ -395,15 +444,15 @@ static struct dentry *autofs4_lookup_active(struct autofs_sb_info *sbi, struct d
		if (memcmp(qstr->name, str, len))
			goto next;
 
-		if (d_unhashed(dentry)) {
-			dget(dentry);
-			spin_unlock(&dentry->d_lock);
+		if (d_unhashed(active)) {
+			dget(active);
+			spin_unlock(&active->d_lock);
			spin_unlock(&sbi->lookup_lock);
			spin_unlock(&dcache_lock);
-			return dentry;
+			return active;
		}
 next:
-		spin_unlock(&dentry->d_lock);
+		spin_unlock(&active->d_lock);
	}
	spin_unlock(&sbi->lookup_lock);
	spin_unlock(&dcache_lock);
@@ -411,8 +460,11 @@ next:
	return NULL;
 }
 
-static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name)
+static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 {
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct dentry *parent = dentry->d_parent;
+	struct qstr *name = &dentry->d_name;
	unsigned int len = name->len;
	unsigned int hash = name->hash;
	const unsigned char *str = name->name;
@@ -423,23 +475,23 @@ static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct
	head = &sbi->expiring_list;
	list_for_each(p, head) {
		struct autofs_info *ino;
-		struct dentry *dentry;
+		struct dentry *expiring;
		struct qstr *qstr;
 
		ino = list_entry(p, struct autofs_info, expiring);
-		dentry = ino->dentry;
+		expiring = ino->dentry;
 
-		spin_lock(&dentry->d_lock);
+		spin_lock(&expiring->d_lock);
 
		/* Bad luck, we've already been dentry_iput */
-		if (!dentry->d_inode)
+		if (!expiring->d_inode)
			goto next;
 
-		qstr = &dentry->d_name;
+		qstr = &expiring->d_name;
 
-		if (dentry->d_name.hash != hash)
+		if (expiring->d_name.hash != hash)
			goto next;
-		if (dentry->d_parent != parent)
+		if (expiring->d_parent != parent)
			goto next;
 
		if (qstr->len != len)
@@ -447,15 +499,15 @@ static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct
		if (memcmp(qstr->name, str, len))
			goto next;
 
-		if (d_unhashed(dentry)) {
-			dget(dentry);
-			spin_unlock(&dentry->d_lock);
+		if (d_unhashed(expiring)) {
+			dget(expiring);
+			spin_unlock(&expiring->d_lock);
			spin_unlock(&sbi->lookup_lock);
			spin_unlock(&dcache_lock);
-			return dentry;
+			return expiring;
		}
 next:
-		spin_unlock(&dentry->d_lock);
+		spin_unlock(&expiring->d_lock);
	}
	spin_unlock(&sbi->lookup_lock);
	spin_unlock(&dcache_lock);
@@ -468,7 +520,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 {
	struct autofs_sb_info *sbi;
	struct autofs_info *ino;
-	struct dentry *expiring, *unhashed;
+	struct dentry *expiring, *active;
	int oz_mode;
 
	DPRINTK("name = %.*s",
@@ -484,10 +536,11 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
	DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
		 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
 
-	unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name);
-	if (unhashed)
-		dentry = unhashed;
-	else {
+	active = autofs4_lookup_active(dentry);
+	if (active) {
+		dentry = active;
+		ino = autofs4_dentry_ino(dentry);
+	} else {
		/*
		 * Mark the dentry incomplete but don't hash it. We do this
		 * to serialize our inode creation operations (symlink and
@@ -513,36 +566,28 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
		dentry->d_fsdata = ino;
		ino->dentry = dentry;
 
-		spin_lock(&sbi->lookup_lock);
-		list_add(&ino->active, &sbi->active_list);
-		spin_unlock(&sbi->lookup_lock);
+		autofs4_add_active(dentry);
 
		d_instantiate(dentry, NULL);
	}
 
	if (!oz_mode) {
		mutex_unlock(&dir->i_mutex);
-		expiring = autofs4_lookup_expiring(sbi,
-						   dentry->d_parent,
-						   &dentry->d_name);
+		expiring = autofs4_lookup_expiring(dentry);
		if (expiring) {
			/*
			 * If we are racing with expire the request might not
			 * be quite complete but the directory has been removed
			 * so it must have been successful, so just wait for it.
			 */
-			ino = autofs4_dentry_ino(expiring);
			autofs4_expire_wait(expiring);
-			spin_lock(&sbi->lookup_lock);
-			if (!list_empty(&ino->expiring))
-				list_del_init(&ino->expiring);
-			spin_unlock(&sbi->lookup_lock);
+			autofs4_del_expiring(expiring);
			dput(expiring);
		}
 
-		spin_lock(&dentry->d_lock);
-		dentry->d_flags |= DCACHE_AUTOFS_PENDING;
-		spin_unlock(&dentry->d_lock);
+		spin_lock(&sbi->fs_lock);
+		ino->flags |= AUTOFS_INF_PENDING;
+		spin_unlock(&sbi->fs_lock);
		if (dentry->d_op && dentry->d_op->d_revalidate)
			(dentry->d_op->d_revalidate)(dentry, nd);
		mutex_lock(&dir->i_mutex);
@@ -552,22 +597,22 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
	 * If we are still pending, check if we had to handle
	 * a signal. If so we can force a restart..
	 */
-	if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
+	if (ino->flags & AUTOFS_INF_PENDING) {
		/* See if we were interrupted */
		if (signal_pending(current)) {
			sigset_t *sigset = &current->pending.signal;
			if (sigismember (sigset, SIGKILL) ||
			    sigismember (sigset, SIGQUIT) ||
			    sigismember (sigset, SIGINT)) {
-				if (unhashed)
-					dput(unhashed);
+				if (active)
+					dput(active);
				return ERR_PTR(-ERESTARTNOINTR);
			}
		}
		if (!oz_mode) {
-			spin_lock(&dentry->d_lock);
-			dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-			spin_unlock(&dentry->d_lock);
+			spin_lock(&sbi->fs_lock);
+			ino->flags &= ~AUTOFS_INF_PENDING;
+			spin_unlock(&sbi->fs_lock);
		}
	}
 
@@ -592,14 +637,14 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
		else
			dentry = ERR_PTR(-ENOENT);
 
-		if (unhashed)
-			dput(unhashed);
+		if (active)
+			dput(active);
 
		return dentry;
	}
 
-	if (unhashed)
-		return unhashed;
+	if (active)
+		return active;
 
	return NULL;
 }
@@ -624,10 +669,7 @@ static int autofs4_dir_symlink(struct inode *dir,
624 if (!ino) 669 if (!ino)
625 return -ENOMEM; 670 return -ENOMEM;
626 671
627 spin_lock(&sbi->lookup_lock); 672 autofs4_del_active(dentry);
628 if (!list_empty(&ino->active))
629 list_del_init(&ino->active);
630 spin_unlock(&sbi->lookup_lock);
631 673
632 ino->size = strlen(symname); 674 ino->size = strlen(symname);
633 cp = kmalloc(ino->size + 1, GFP_KERNEL); 675 cp = kmalloc(ino->size + 1, GFP_KERNEL);
@@ -705,10 +747,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
705 dir->i_mtime = CURRENT_TIME; 747 dir->i_mtime = CURRENT_TIME;
706 748
707 spin_lock(&dcache_lock); 749 spin_lock(&dcache_lock);
708 spin_lock(&sbi->lookup_lock); 750 autofs4_add_expiring(dentry);
709 if (list_empty(&ino->expiring))
710 list_add(&ino->expiring, &sbi->expiring_list);
711 spin_unlock(&sbi->lookup_lock);
712 spin_lock(&dentry->d_lock); 751 spin_lock(&dentry->d_lock);
713 __d_drop(dentry); 752 __d_drop(dentry);
714 spin_unlock(&dentry->d_lock); 753 spin_unlock(&dentry->d_lock);
@@ -734,10 +773,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
734 spin_unlock(&dcache_lock); 773 spin_unlock(&dcache_lock);
735 return -ENOTEMPTY; 774 return -ENOTEMPTY;
736 } 775 }
737 spin_lock(&sbi->lookup_lock); 776 autofs4_add_expiring(dentry);
738 if (list_empty(&ino->expiring))
739 list_add(&ino->expiring, &sbi->expiring_list);
740 spin_unlock(&sbi->lookup_lock);
741 spin_lock(&dentry->d_lock); 777 spin_lock(&dentry->d_lock);
742 __d_drop(dentry); 778 __d_drop(dentry);
743 spin_unlock(&dentry->d_lock); 779 spin_unlock(&dentry->d_lock);
@@ -775,10 +811,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
775 if (!ino) 811 if (!ino)
776 return -ENOMEM; 812 return -ENOMEM;
777 813
778 spin_lock(&sbi->lookup_lock); 814 autofs4_del_active(dentry);
779 if (!list_empty(&ino->active))
780 list_del_init(&ino->active);
781 spin_unlock(&sbi->lookup_lock);
782 815
783 inode = autofs4_get_inode(dir->i_sb, ino); 816 inode = autofs4_get_inode(dir->i_sb, ino);
784 if (!inode) { 817 if (!inode) {
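
Reviewer note: the autofs4/root.c hunks above replace four open-coded list manipulations with helpers (autofs4_add_active, autofs4_del_active, autofs4_add_expiring, autofs4_del_expiring) and narrow autofs4_lookup_active()/autofs4_lookup_expiring() to take only the dentry. The helper bodies are not part of this section; below is a minimal sketch of two of them, inferred directly from the sequences they replace. autofs4_sbi() and autofs4_dentry_ino() are the usual autofs_i.h accessors, and the exact placement of these helpers is an assumption.

    /* Sketch only: bodies inferred from the removed open-coded sequences. */
    static void autofs4_add_active(struct dentry *dentry)
    {
            struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
            struct autofs_info *ino = autofs4_dentry_ino(dentry);

            if (ino) {
                    spin_lock(&sbi->lookup_lock);
                    list_add(&ino->active, &sbi->active_list);
                    spin_unlock(&sbi->lookup_lock);
            }
    }

    static void autofs4_del_expiring(struct dentry *dentry)
    {
            struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
            struct autofs_info *ino = autofs4_dentry_ino(dentry);

            if (ino) {
                    spin_lock(&sbi->lookup_lock);
                    if (!list_empty(&ino->expiring))
                            list_del_init(&ino->expiring);
                    spin_unlock(&sbi->lookup_lock);
            }
    }

The add/del pairs presumably keep the list_empty()/!list_empty() guards visible in the removed unlink/rmdir code, so a repeated add or del stays harmless.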
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index e3287d0d1a58..59096b5e0fc7 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -11,7 +11,6 @@
11 */ 11 */
12 12
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/slab.h>
15#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
16#include <linux/string.h> 15#include <linux/string.h>
17 16
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 33baf27fac78..34ddda888e63 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -873,6 +873,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
873 brelse(bh); 873 brelse(bh);
874 874
875 unacquire_priv_sbp: 875 unacquire_priv_sbp:
876 kfree(befs_sb->mount_opts.iocharset);
876 kfree(sb->s_fs_info); 877 kfree(sb->s_fs_info);
877 878
878 unacquire_none: 879 unacquire_none:
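
Reviewer note: the added kfree() closes a memory leak on the befs_fill_super() failure path: mount_opts.iocharset is presumably duplicated during option parsing (the parse site is not in this section) and was never freed when setup failed afterwards. A reduced illustration of the pattern being fixed, with hypothetical names:

    #include <linux/slab.h>
    #include <linux/string.h>

    /* Hypothetical reduction: a string duplicated early must be freed on
     * every later failure path, not only on unmount. */
    static int example_fill(const char *opt)
    {
            char *iocharset = kstrdup(opt, GFP_KERNEL);
            void *priv;

            if (!iocharset)
                    return -ENOMEM;

            priv = kzalloc(64, GFP_KERNEL);
            if (!priv) {
                    kfree(iocharset);  /* the analogue of the added kfree() */
                    return -ENOMEM;
            }

            kfree(priv);
            kfree(iocharset);
            return 0;
    }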
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 6f60336c6628..f22a7d3dc362 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -15,6 +15,7 @@
15#include <linux/smp_lock.h> 15#include <linux/smp_lock.h>
16#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
17#include <linux/vfs.h> 17#include <linux/vfs.h>
18#include <linux/writeback.h>
18#include <asm/uaccess.h> 19#include <asm/uaccess.h>
19#include "bfs.h" 20#include "bfs.h"
20 21
@@ -98,7 +99,7 @@ error:
98 return ERR_PTR(-EIO); 99 return ERR_PTR(-EIO);
99} 100}
100 101
101static int bfs_write_inode(struct inode *inode, int wait) 102static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
102{ 103{
103 struct bfs_sb_info *info = BFS_SB(inode->i_sb); 104 struct bfs_sb_info *info = BFS_SB(inode->i_sb);
104 unsigned int ino = (u16)inode->i_ino; 105 unsigned int ino = (u16)inode->i_ino;
@@ -147,7 +148,7 @@ static int bfs_write_inode(struct inode *inode, int wait)
147 di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); 148 di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
148 149
149 mark_buffer_dirty(bh); 150 mark_buffer_dirty(bh);
150 if (wait) { 151 if (wbc->sync_mode == WB_SYNC_ALL) {
151 sync_dirty_buffer(bh); 152 sync_dirty_buffer(bh);
152 if (buffer_req(bh) && !buffer_uptodate(bh)) 153 if (buffer_req(bh) && !buffer_uptodate(bh))
153 err = -EIO; 154 err = -EIO;
@@ -353,35 +354,35 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
353 struct inode *inode; 354 struct inode *inode;
354 unsigned i, imap_len; 355 unsigned i, imap_len;
355 struct bfs_sb_info *info; 356 struct bfs_sb_info *info;
356 long ret = -EINVAL; 357 int ret = -EINVAL;
357 unsigned long i_sblock, i_eblock, i_eoff, s_size; 358 unsigned long i_sblock, i_eblock, i_eoff, s_size;
358 359
359 info = kzalloc(sizeof(*info), GFP_KERNEL); 360 info = kzalloc(sizeof(*info), GFP_KERNEL);
360 if (!info) 361 if (!info)
361 return -ENOMEM; 362 return -ENOMEM;
363 mutex_init(&info->bfs_lock);
362 s->s_fs_info = info; 364 s->s_fs_info = info;
363 365
364 sb_set_blocksize(s, BFS_BSIZE); 366 sb_set_blocksize(s, BFS_BSIZE);
365 367
366 bh = sb_bread(s, 0); 368 info->si_sbh = sb_bread(s, 0);
367 if(!bh) 369 if (!info->si_sbh)
368 goto out; 370 goto out;
369 bfs_sb = (struct bfs_super_block *)bh->b_data; 371 bfs_sb = (struct bfs_super_block *)info->si_sbh->b_data;
370 if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) { 372 if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) {
371 if (!silent) 373 if (!silent)
372 printf("No BFS filesystem on %s (magic=%08x)\n", 374 printf("No BFS filesystem on %s (magic=%08x)\n",
373 s->s_id, le32_to_cpu(bfs_sb->s_magic)); 375 s->s_id, le32_to_cpu(bfs_sb->s_magic));
374 goto out; 376 goto out1;
375 } 377 }
376 if (BFS_UNCLEAN(bfs_sb, s) && !silent) 378 if (BFS_UNCLEAN(bfs_sb, s) && !silent)
377 printf("%s is unclean, continuing\n", s->s_id); 379 printf("%s is unclean, continuing\n", s->s_id);
378 380
379 s->s_magic = BFS_MAGIC; 381 s->s_magic = BFS_MAGIC;
380 info->si_sbh = bh;
381 382
382 if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) { 383 if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) {
383 printf("Superblock is corrupted\n"); 384 printf("Superblock is corrupted\n");
384 goto out; 385 goto out1;
385 } 386 }
386 387
387 info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / 388 info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
@@ -390,7 +391,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
390 imap_len = (info->si_lasti / 8) + 1; 391 imap_len = (info->si_lasti / 8) + 1;
391 info->si_imap = kzalloc(imap_len, GFP_KERNEL); 392 info->si_imap = kzalloc(imap_len, GFP_KERNEL);
392 if (!info->si_imap) 393 if (!info->si_imap)
393 goto out; 394 goto out1;
394 for (i = 0; i < BFS_ROOT_INO; i++) 395 for (i = 0; i < BFS_ROOT_INO; i++)
395 set_bit(i, info->si_imap); 396 set_bit(i, info->si_imap);
396 397
@@ -398,15 +399,13 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
398 inode = bfs_iget(s, BFS_ROOT_INO); 399 inode = bfs_iget(s, BFS_ROOT_INO);
399 if (IS_ERR(inode)) { 400 if (IS_ERR(inode)) {
400 ret = PTR_ERR(inode); 401 ret = PTR_ERR(inode);
401 kfree(info->si_imap); 402 goto out2;
402 goto out;
403 } 403 }
404 s->s_root = d_alloc_root(inode); 404 s->s_root = d_alloc_root(inode);
405 if (!s->s_root) { 405 if (!s->s_root) {
406 iput(inode); 406 iput(inode);
407 ret = -ENOMEM; 407 ret = -ENOMEM;
408 kfree(info->si_imap); 408 goto out2;
409 goto out;
410 } 409 }
411 410
412 info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS; 411 info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS;
@@ -419,10 +418,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
419 bh = sb_bread(s, info->si_blocks - 1); 418 bh = sb_bread(s, info->si_blocks - 1);
420 if (!bh) { 419 if (!bh) {
421 printf("Last block not available: %lu\n", info->si_blocks - 1); 420 printf("Last block not available: %lu\n", info->si_blocks - 1);
422 iput(inode);
423 ret = -EIO; 421 ret = -EIO;
424 kfree(info->si_imap); 422 goto out3;
425 goto out;
426 } 423 }
427 brelse(bh); 424 brelse(bh);
428 425
@@ -459,11 +456,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
459 printf("Inode 0x%08x corrupted\n", i); 456 printf("Inode 0x%08x corrupted\n", i);
460 457
461 brelse(bh); 458 brelse(bh);
462 s->s_root = NULL; 459 ret = -EIO;
463 kfree(info->si_imap); 460 goto out3;
464 kfree(info);
465 s->s_fs_info = NULL;
466 return -EIO;
467 } 461 }
468 462
469 if (!di->i_ino) { 463 if (!di->i_ino) {
@@ -483,11 +477,17 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
483 s->s_dirt = 1; 477 s->s_dirt = 1;
484 } 478 }
485 dump_imap("read_super", s); 479 dump_imap("read_super", s);
486 mutex_init(&info->bfs_lock);
487 return 0; 480 return 0;
488 481
482out3:
483 dput(s->s_root);
484 s->s_root = NULL;
485out2:
486 kfree(info->si_imap);
487out1:
488 brelse(info->si_sbh);
489out: 489out:
490 brelse(bh); 490 mutex_destroy(&info->bfs_lock);
491 kfree(info); 491 kfree(info);
492 s->s_fs_info = NULL; 492 s->s_fs_info = NULL;
493 return ret; 493 return ret;
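
Reviewer note: the bfs_fill_super() hunks above consolidate the scattered error handling into one unwind ladder, releasing resources strictly in reverse order of acquisition. Summarized from the new code:

    /*
     *  acquisition                        matching unwind
     *  -------------------------------    ---------------------------------
     *  mutex_init(&info->bfs_lock)        out:  mutex_destroy(), kfree(info)
     *  info->si_sbh = sb_bread(s, 0)      out1: brelse(info->si_sbh)
     *  info->si_imap = kzalloc(...)       out2: kfree(info->si_imap)
     *  s->s_root = d_alloc_root(inode)    out3: dput(s->s_root)
     */

Also worth noting: the superblock buffer now goes straight into info->si_sbh instead of a local bh, which is what lets the ladder release it without extra bookkeeping.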
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index b639dcf7c778..f96eff04e11a 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -20,10 +20,11 @@
20#include <linux/fcntl.h> 20#include <linux/fcntl.h>
21#include <linux/ptrace.h> 21#include <linux/ptrace.h>
22#include <linux/user.h> 22#include <linux/user.h>
23#include <linux/slab.h>
24#include <linux/binfmts.h> 23#include <linux/binfmts.h>
25#include <linux/personality.h> 24#include <linux/personality.h>
26#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/coredump.h>
27#include <linux/slab.h>
27 28
28#include <asm/system.h> 29#include <asm/system.h>
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
@@ -32,7 +33,7 @@
32 33
33static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); 34static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
34static int load_aout_library(struct file*); 35static int load_aout_library(struct file*);
35static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); 36static int aout_core_dump(struct coredump_params *cprm);
36 37
37static struct linux_binfmt aout_format = { 38static struct linux_binfmt aout_format = {
38 .module = THIS_MODULE, 39 .module = THIS_MODULE,
@@ -60,26 +61,6 @@ static int set_brk(unsigned long start, unsigned long end)
60} 61}
61 62
62/* 63/*
63 * These are the only things you should do on a core-file: use only these
64 * macros to write out all the necessary info.
65 */
66
67static int dump_write(struct file *file, const void *addr, int nr)
68{
69 return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
70}
71
72#define DUMP_WRITE(addr, nr) \
73 if (!dump_write(file, (void *)(addr), (nr))) \
74 goto end_coredump;
75
76#define DUMP_SEEK(offset) \
77if (file->f_op->llseek) { \
78 if (file->f_op->llseek(file,(offset),0) != (offset)) \
79 goto end_coredump; \
80} else file->f_pos = (offset)
81
82/*
83 * Routine writes a core dump image in the current directory. 64 * Routine writes a core dump image in the current directory.
84 * Currently only a stub-function. 65 * Currently only a stub-function.
85 * 66 *
@@ -89,18 +70,21 @@ if (file->f_op->llseek) { \
89 * dumping of the process results in another error.. 70 * dumping of the process results in another error..
90 */ 71 */
91 72
92static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) 73static int aout_core_dump(struct coredump_params *cprm)
93{ 74{
75 struct file *file = cprm->file;
94 mm_segment_t fs; 76 mm_segment_t fs;
95 int has_dumped = 0; 77 int has_dumped = 0;
96 unsigned long dump_start, dump_size; 78 void __user *dump_start;
79 int dump_size;
97 struct user dump; 80 struct user dump;
98#ifdef __alpha__ 81#ifdef __alpha__
99# define START_DATA(u) (u.start_data) 82# define START_DATA(u) ((void __user *)u.start_data)
100#else 83#else
101# define START_DATA(u) ((u.u_tsize << PAGE_SHIFT) + u.start_code) 84# define START_DATA(u) ((void __user *)((u.u_tsize << PAGE_SHIFT) + \
85 u.start_code))
102#endif 86#endif
103# define START_STACK(u) (u.start_stack) 87# define START_STACK(u) ((void __user *)u.start_stack)
104 88
105 fs = get_fs(); 89 fs = get_fs();
106 set_fs(KERNEL_DS); 90 set_fs(KERNEL_DS);
@@ -108,47 +92,52 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, u
108 current->flags |= PF_DUMPCORE; 92 current->flags |= PF_DUMPCORE;
109 strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); 93 strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
110 dump.u_ar0 = offsetof(struct user, regs); 94 dump.u_ar0 = offsetof(struct user, regs);
111 dump.signal = signr; 95 dump.signal = cprm->signr;
112 aout_dump_thread(regs, &dump); 96 aout_dump_thread(cprm->regs, &dump);
113 97
114/* If the size of the dump file exceeds the rlimit, then see what would happen 98/* If the size of the dump file exceeds the rlimit, then see what would happen
115 if we wrote the stack, but not the data area. */ 99 if we wrote the stack, but not the data area. */
116 if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > limit) 100 if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > cprm->limit)
117 dump.u_dsize = 0; 101 dump.u_dsize = 0;
118 102
119/* Make sure we have enough room to write the stack and data areas. */ 103/* Make sure we have enough room to write the stack and data areas. */
120 if ((dump.u_ssize + 1) * PAGE_SIZE > limit) 104 if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit)
121 dump.u_ssize = 0; 105 dump.u_ssize = 0;
122 106
123/* make sure we actually have a data and stack area to dump */ 107/* make sure we actually have a data and stack area to dump */
124 set_fs(USER_DS); 108 set_fs(USER_DS);
125 if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) 109 if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
126 dump.u_dsize = 0; 110 dump.u_dsize = 0;
127 if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) 111 if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
128 dump.u_ssize = 0; 112 dump.u_ssize = 0;
129 113
130 set_fs(KERNEL_DS); 114 set_fs(KERNEL_DS);
131/* struct user */ 115/* struct user */
132 DUMP_WRITE(&dump,sizeof(dump)); 116 if (!dump_write(file, &dump, sizeof(dump)))
117 goto end_coredump;
133/* Now dump all of the user data. Include malloced stuff as well */ 118/* Now dump all of the user data. Include malloced stuff as well */
134 DUMP_SEEK(PAGE_SIZE); 119 if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(dump)))
120 goto end_coredump;
135/* now we start writing out the user space info */ 121/* now we start writing out the user space info */
136 set_fs(USER_DS); 122 set_fs(USER_DS);
137/* Dump the data area */ 123/* Dump the data area */
138 if (dump.u_dsize != 0) { 124 if (dump.u_dsize != 0) {
139 dump_start = START_DATA(dump); 125 dump_start = START_DATA(dump);
140 dump_size = dump.u_dsize << PAGE_SHIFT; 126 dump_size = dump.u_dsize << PAGE_SHIFT;
141 DUMP_WRITE(dump_start,dump_size); 127 if (!dump_write(file, dump_start, dump_size))
128 goto end_coredump;
142 } 129 }
143/* Now prepare to dump the stack area */ 130/* Now prepare to dump the stack area */
144 if (dump.u_ssize != 0) { 131 if (dump.u_ssize != 0) {
145 dump_start = START_STACK(dump); 132 dump_start = START_STACK(dump);
146 dump_size = dump.u_ssize << PAGE_SHIFT; 133 dump_size = dump.u_ssize << PAGE_SHIFT;
147 DUMP_WRITE(dump_start,dump_size); 134 if (!dump_write(file, dump_start, dump_size))
135 goto end_coredump;
148 } 136 }
 149/* Finally dump the task struct. Not be used by gdb, but could be useful */ 137/* Finally dump the task struct. Not be used by gdb, but could be useful */
 150 set_fs(KERNEL_DS); 138 set_fs(KERNEL_DS);
 151 DUMP_WRITE(current,sizeof(*current)); 139 if (!dump_write(file, current, sizeof(*current)))
140 goto end_coredump;
152end_coredump: 141end_coredump:
153 set_fs(fs); 142 set_fs(fs);
154 return has_dumped; 143 return has_dumped;
@@ -246,7 +235,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
246 * size limits imposed on them by creating programs with large 235 * size limits imposed on them by creating programs with large
247 * arrays in the data or bss. 236 * arrays in the data or bss.
248 */ 237 */
249 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; 238 rlim = rlimit(RLIMIT_DATA);
250 if (rlim >= RLIM_INFINITY) 239 if (rlim >= RLIM_INFINITY)
251 rlim = ~0; 240 rlim = ~0;
252 if (ex.a_data + ex.a_bss > rlim) 241 if (ex.a_data + ex.a_bss > rlim)
@@ -263,6 +252,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
263#else 252#else
264 set_personality(PER_LINUX); 253 set_personality(PER_LINUX);
265#endif 254#endif
255 setup_new_exec(bprm);
266 256
267 current->mm->end_code = ex.a_text + 257 current->mm->end_code = ex.a_text +
268 (current->mm->start_code = N_TXTADDR(ex)); 258 (current->mm->start_code = N_TXTADDR(ex));
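
Reviewer note: every ->core_dump() conversion in this series (a.out here, ELF and ELF-FDPIC below) collapses the old four-argument signature into a single parameter block. The fields referenced in these hunks (signr, regs, file, limit, mm_flags) imply a structure roughly like the sketch below; treat it as inferred, not as the verbatim declaration from linux/binfmts.h:

    /* Inferred shape of the parameter block passed to ->core_dump(). */
    struct coredump_params {
            long signr;             /* signal that triggered the dump    */
            struct pt_regs *regs;   /* register state at dump time       */
            struct file *file;      /* core file being written           */
            unsigned long limit;    /* RLIMIT_CORE ceiling               */
            unsigned long mm_flags; /* snapshot of mm->flags (see below) */
    };

Snapshotting mm_flags once in the caller replaces the per-dumper "mm_flags = current->mm->flags" copies removed in the ELF hunks, preserving the old guarantee that program headers and segment bodies are generated against the same flags.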
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b9b3bb51b1e4..535e763ab1a6 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -31,6 +31,7 @@
31#include <linux/random.h> 31#include <linux/random.h>
32#include <linux/elf.h> 32#include <linux/elf.h>
33#include <linux/utsname.h> 33#include <linux/utsname.h>
34#include <linux/coredump.h>
34#include <asm/uaccess.h> 35#include <asm/uaccess.h>
35#include <asm/param.h> 36#include <asm/param.h>
36#include <asm/page.h> 37#include <asm/page.h>
@@ -44,8 +45,8 @@ static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
44 * If we don't support core dumping, then supply a NULL so we 45 * If we don't support core dumping, then supply a NULL so we
45 * don't even try. 46 * don't even try.
46 */ 47 */
47#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) 48#ifdef CONFIG_ELF_CORE
48static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); 49static int elf_core_dump(struct coredump_params *cprm);
49#else 50#else
50#define elf_core_dump NULL 51#define elf_core_dump NULL
51#endif 52#endif
@@ -662,27 +663,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
662 if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') 663 if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
663 goto out_free_interp; 664 goto out_free_interp;
664 665
665 /*
666 * The early SET_PERSONALITY here is so that the lookup
667 * for the interpreter happens in the namespace of the
668 * to-be-execed image. SET_PERSONALITY can select an
669 * alternate root.
670 *
671 * However, SET_PERSONALITY is NOT allowed to switch
672 * this task into the new images's memory mapping
673 * policy - that is, TASK_SIZE must still evaluate to
674 * that which is appropriate to the execing application.
675 * This is because exit_mmap() needs to have TASK_SIZE
676 * evaluate to the size of the old image.
677 *
678 * So if (say) a 64-bit application is execing a 32-bit
679 * application it is the architecture's responsibility
680 * to defer changing the value of TASK_SIZE until the
681 * switch really is going to happen - do this in
682 * flush_thread(). - akpm
683 */
684 SET_PERSONALITY(loc->elf_ex);
685
686 interpreter = open_exec(elf_interpreter); 666 interpreter = open_exec(elf_interpreter);
687 retval = PTR_ERR(interpreter); 667 retval = PTR_ERR(interpreter);
688 if (IS_ERR(interpreter)) 668 if (IS_ERR(interpreter))
@@ -730,9 +710,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
730 /* Verify the interpreter has a valid arch */ 710 /* Verify the interpreter has a valid arch */
731 if (!elf_check_arch(&loc->interp_elf_ex)) 711 if (!elf_check_arch(&loc->interp_elf_ex))
732 goto out_free_dentry; 712 goto out_free_dentry;
733 } else {
734 /* Executables without an interpreter also need a personality */
735 SET_PERSONALITY(loc->elf_ex);
736 } 713 }
737 714
738 /* Flush all traces of the currently running executable */ 715 /* Flush all traces of the currently running executable */
@@ -752,7 +729,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
752 729
753 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) 730 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
754 current->flags |= PF_RANDOMIZE; 731 current->flags |= PF_RANDOMIZE;
755 arch_pick_mmap_layout(current->mm); 732
733 setup_new_exec(bprm);
756 734
757 /* Do this so that we can load the interpreter, if need be. We will 735 /* Do this so that we can load the interpreter, if need be. We will
758 change some of these later */ 736 change some of these later */
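
Reviewer note: arch_pick_mmap_layout() disappears here because setup_new_exec() takes it over; the large comment block and both early SET_PERSONALITY() calls removed above rely on the flush_old_exec()/setup_new_exec() split done elsewhere in this release. The resulting loader ordering, with the personality hunk itself not shown in this section, is assumed to be:

    /*
     * Assumed ordering after this change:
     *
     *     flush_old_exec(bprm);          point of no return
     *     SET_PERSONALITY(loc->elf_ex);  once, for both interpreter cases
     *     setup_new_exec(bprm);          absorbs arch_pick_mmap_layout()
     */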
@@ -767,7 +745,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
767 745
768 current->mm->start_stack = bprm->p; 746 current->mm->start_stack = bprm->p;
769 747
770 /* Now we do a little grungy work by mmaping the ELF image into 748 /* Now we do a little grungy work by mmapping the ELF image into
771 the correct location in memory. */ 749 the correct location in memory. */
772 for(i = 0, elf_ppnt = elf_phdata; 750 for(i = 0, elf_ppnt = elf_phdata;
773 i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { 751 i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
@@ -1101,48 +1079,13 @@ out:
1101 return error; 1079 return error;
1102} 1080}
1103 1081
1104/* 1082#ifdef CONFIG_ELF_CORE
1105 * Note that some platforms still use traditional core dumps and not
1106 * the ELF core dump. Each platform can select it as appropriate.
1107 */
1108#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
1109
1110/* 1083/*
1111 * ELF core dumper 1084 * ELF core dumper
1112 * 1085 *
1113 * Modelled on fs/exec.c:aout_core_dump() 1086 * Modelled on fs/exec.c:aout_core_dump()
1114 * Jeremy Fitzhardinge <jeremy@sw.oz.au> 1087 * Jeremy Fitzhardinge <jeremy@sw.oz.au>
1115 */ 1088 */
1116/*
1117 * These are the only things you should do on a core-file: use only these
1118 * functions to write out all the necessary info.
1119 */
1120static int dump_write(struct file *file, const void *addr, int nr)
1121{
1122 return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
1123}
1124
1125static int dump_seek(struct file *file, loff_t off)
1126{
1127 if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
1128 if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
1129 return 0;
1130 } else {
1131 char *buf = (char *)get_zeroed_page(GFP_KERNEL);
1132 if (!buf)
1133 return 0;
1134 while (off > 0) {
1135 unsigned long n = off;
1136 if (n > PAGE_SIZE)
1137 n = PAGE_SIZE;
1138 if (!dump_write(file, buf, n))
1139 return 0;
1140 off -= n;
1141 }
1142 free_page((unsigned long)buf);
1143 }
1144 return 1;
1145}
1146 1089
1147/* 1090/*
1148 * Decide what to dump of a segment, part, all or none. 1091 * Decide what to dump of a segment, part, all or none.
@@ -1277,10 +1220,6 @@ static int writenote(struct memelfnote *men, struct file *file,
1277} 1220}
1278#undef DUMP_WRITE 1221#undef DUMP_WRITE
1279 1222
1280#define DUMP_WRITE(addr, nr) \
1281 if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
1282 goto end_coredump;
1283
1284static void fill_elf_header(struct elfhdr *elf, int segs, 1223static void fill_elf_header(struct elfhdr *elf, int segs,
1285 u16 machine, u32 flags, u8 osabi) 1224 u16 machine, u32 flags, u8 osabi)
1286{ 1225{
@@ -1899,6 +1838,34 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
1899 return gate_vma; 1838 return gate_vma;
1900} 1839}
1901 1840
1841static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
1842 elf_addr_t e_shoff, int segs)
1843{
1844 elf->e_shoff = e_shoff;
1845 elf->e_shentsize = sizeof(*shdr4extnum);
1846 elf->e_shnum = 1;
1847 elf->e_shstrndx = SHN_UNDEF;
1848
1849 memset(shdr4extnum, 0, sizeof(*shdr4extnum));
1850
1851 shdr4extnum->sh_type = SHT_NULL;
1852 shdr4extnum->sh_size = elf->e_shnum;
1853 shdr4extnum->sh_link = elf->e_shstrndx;
1854 shdr4extnum->sh_info = segs;
1855}
1856
1857static size_t elf_core_vma_data_size(struct vm_area_struct *gate_vma,
1858 unsigned long mm_flags)
1859{
1860 struct vm_area_struct *vma;
1861 size_t size = 0;
1862
1863 for (vma = first_vma(current, gate_vma); vma != NULL;
1864 vma = next_vma(vma, gate_vma))
1865 size += vma_dump_size(vma, mm_flags);
1866 return size;
1867}
1868
1902/* 1869/*
1903 * Actual dumper 1870 * Actual dumper
1904 * 1871 *
@@ -1906,7 +1873,7 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
1906 * and then they are actually written out. If we run out of core limit 1873 * and then they are actually written out. If we run out of core limit
1907 * we just truncate. 1874 * we just truncate.
1908 */ 1875 */
1909static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) 1876static int elf_core_dump(struct coredump_params *cprm)
1910{ 1877{
1911 int has_dumped = 0; 1878 int has_dumped = 0;
1912 mm_segment_t fs; 1879 mm_segment_t fs;
@@ -1915,8 +1882,11 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
1915 struct vm_area_struct *vma, *gate_vma; 1882 struct vm_area_struct *vma, *gate_vma;
1916 struct elfhdr *elf = NULL; 1883 struct elfhdr *elf = NULL;
1917 loff_t offset = 0, dataoff, foffset; 1884 loff_t offset = 0, dataoff, foffset;
1918 unsigned long mm_flags;
1919 struct elf_note_info info; 1885 struct elf_note_info info;
1886 struct elf_phdr *phdr4note = NULL;
1887 struct elf_shdr *shdr4extnum = NULL;
1888 Elf_Half e_phnum;
1889 elf_addr_t e_shoff;
1920 1890
1921 /* 1891 /*
1922 * We no longer stop all VM operations. 1892 * We no longer stop all VM operations.
@@ -1939,20 +1909,25 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
1939 * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. 1909 * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
1940 */ 1910 */
1941 segs = current->mm->map_count; 1911 segs = current->mm->map_count;
1942#ifdef ELF_CORE_EXTRA_PHDRS 1912 segs += elf_core_extra_phdrs();
1943 segs += ELF_CORE_EXTRA_PHDRS;
1944#endif
1945 1913
1946 gate_vma = get_gate_vma(current); 1914 gate_vma = get_gate_vma(current);
1947 if (gate_vma != NULL) 1915 if (gate_vma != NULL)
1948 segs++; 1916 segs++;
1949 1917
1918 /* for notes section */
1919 segs++;
1920
1921 /* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid
1922 * this, kernel supports extended numbering. Have a look at
1923 * include/linux/elf.h for further information. */
1924 e_phnum = segs > PN_XNUM ? PN_XNUM : segs;
1925
1950 /* 1926 /*
1951 * Collect all the non-memory information about the process for the 1927 * Collect all the non-memory information about the process for the
1952 * notes. This also sets up the file header. 1928 * notes. This also sets up the file header.
1953 */ 1929 */
1954 if (!fill_note_info(elf, segs + 1, /* including notes section */ 1930 if (!fill_note_info(elf, e_phnum, &info, cprm->signr, cprm->regs))
1955 &info, signr, regs))
1956 goto cleanup; 1931 goto cleanup;
1957 1932
1958 has_dumped = 1; 1933 has_dumped = 1;
@@ -1961,31 +1936,47 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
1961 fs = get_fs(); 1936 fs = get_fs();
1962 set_fs(KERNEL_DS); 1937 set_fs(KERNEL_DS);
1963 1938
1964 DUMP_WRITE(elf, sizeof(*elf));
1965 offset += sizeof(*elf); /* Elf header */ 1939 offset += sizeof(*elf); /* Elf header */
1966 offset += (segs + 1) * sizeof(struct elf_phdr); /* Program headers */ 1940 offset += segs * sizeof(struct elf_phdr); /* Program headers */
1967 foffset = offset; 1941 foffset = offset;
1968 1942
1969 /* Write notes phdr entry */ 1943 /* Write notes phdr entry */
1970 { 1944 {
1971 struct elf_phdr phdr;
1972 size_t sz = get_note_info_size(&info); 1945 size_t sz = get_note_info_size(&info);
1973 1946
1974 sz += elf_coredump_extra_notes_size(); 1947 sz += elf_coredump_extra_notes_size();
1975 1948
1976 fill_elf_note_phdr(&phdr, sz, offset); 1949 phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
1950 if (!phdr4note)
1951 goto end_coredump;
1952
1953 fill_elf_note_phdr(phdr4note, sz, offset);
1977 offset += sz; 1954 offset += sz;
1978 DUMP_WRITE(&phdr, sizeof(phdr));
1979 } 1955 }
1980 1956
1981 dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); 1957 dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
1982 1958
1983 /* 1959 offset += elf_core_vma_data_size(gate_vma, cprm->mm_flags);
1984 * We must use the same mm->flags while dumping core to avoid 1960 offset += elf_core_extra_data_size();
1985 * inconsistency between the program headers and bodies, otherwise an 1961 e_shoff = offset;
1986 * unusable core file can be generated. 1962
1987 */ 1963 if (e_phnum == PN_XNUM) {
1988 mm_flags = current->mm->flags; 1964 shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL);
1965 if (!shdr4extnum)
1966 goto end_coredump;
1967 fill_extnum_info(elf, shdr4extnum, e_shoff, segs);
1968 }
1969
1970 offset = dataoff;
1971
1972 size += sizeof(*elf);
1973 if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf)))
1974 goto end_coredump;
1975
1976 size += sizeof(*phdr4note);
1977 if (size > cprm->limit
1978 || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note)))
1979 goto end_coredump;
1989 1980
1990 /* Write program headers for segments dump */ 1981 /* Write program headers for segments dump */
1991 for (vma = first_vma(current, gate_vma); vma != NULL; 1982 for (vma = first_vma(current, gate_vma); vma != NULL;
@@ -1996,7 +1987,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
1996 phdr.p_offset = offset; 1987 phdr.p_offset = offset;
1997 phdr.p_vaddr = vma->vm_start; 1988 phdr.p_vaddr = vma->vm_start;
1998 phdr.p_paddr = 0; 1989 phdr.p_paddr = 0;
1999 phdr.p_filesz = vma_dump_size(vma, mm_flags); 1990 phdr.p_filesz = vma_dump_size(vma, cprm->mm_flags);
2000 phdr.p_memsz = vma->vm_end - vma->vm_start; 1991 phdr.p_memsz = vma->vm_end - vma->vm_start;
2001 offset += phdr.p_filesz; 1992 offset += phdr.p_filesz;
2002 phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; 1993 phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
@@ -2006,22 +1997,24 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
2006 phdr.p_flags |= PF_X; 1997 phdr.p_flags |= PF_X;
2007 phdr.p_align = ELF_EXEC_PAGESIZE; 1998 phdr.p_align = ELF_EXEC_PAGESIZE;
2008 1999
2009 DUMP_WRITE(&phdr, sizeof(phdr)); 2000 size += sizeof(phdr);
2001 if (size > cprm->limit
2002 || !dump_write(cprm->file, &phdr, sizeof(phdr)))
2003 goto end_coredump;
2010 } 2004 }
2011 2005
2012#ifdef ELF_CORE_WRITE_EXTRA_PHDRS 2006 if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit))
2013 ELF_CORE_WRITE_EXTRA_PHDRS; 2007 goto end_coredump;
2014#endif
2015 2008
2016 /* write out the notes section */ 2009 /* write out the notes section */
2017 if (!write_note_info(&info, file, &foffset)) 2010 if (!write_note_info(&info, cprm->file, &foffset))
2018 goto end_coredump; 2011 goto end_coredump;
2019 2012
2020 if (elf_coredump_extra_notes_write(file, &foffset)) 2013 if (elf_coredump_extra_notes_write(cprm->file, &foffset))
2021 goto end_coredump; 2014 goto end_coredump;
2022 2015
2023 /* Align to page */ 2016 /* Align to page */
2024 if (!dump_seek(file, dataoff - foffset)) 2017 if (!dump_seek(cprm->file, dataoff - foffset))
2025 goto end_coredump; 2018 goto end_coredump;
2026 2019
2027 for (vma = first_vma(current, gate_vma); vma != NULL; 2020 for (vma = first_vma(current, gate_vma); vma != NULL;
@@ -2029,7 +2022,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
2029 unsigned long addr; 2022 unsigned long addr;
2030 unsigned long end; 2023 unsigned long end;
2031 2024
2032 end = vma->vm_start + vma_dump_size(vma, mm_flags); 2025 end = vma->vm_start + vma_dump_size(vma, cprm->mm_flags);
2033 2026
2034 for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { 2027 for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
2035 struct page *page; 2028 struct page *page;
@@ -2038,32 +2031,42 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
2038 page = get_dump_page(addr); 2031 page = get_dump_page(addr);
2039 if (page) { 2032 if (page) {
2040 void *kaddr = kmap(page); 2033 void *kaddr = kmap(page);
2041 stop = ((size += PAGE_SIZE) > limit) || 2034 stop = ((size += PAGE_SIZE) > cprm->limit) ||
2042 !dump_write(file, kaddr, PAGE_SIZE); 2035 !dump_write(cprm->file, kaddr,
2036 PAGE_SIZE);
2043 kunmap(page); 2037 kunmap(page);
2044 page_cache_release(page); 2038 page_cache_release(page);
2045 } else 2039 } else
2046 stop = !dump_seek(file, PAGE_SIZE); 2040 stop = !dump_seek(cprm->file, PAGE_SIZE);
2047 if (stop) 2041 if (stop)
2048 goto end_coredump; 2042 goto end_coredump;
2049 } 2043 }
2050 } 2044 }
2051 2045
2052#ifdef ELF_CORE_WRITE_EXTRA_DATA 2046 if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit))
2053 ELF_CORE_WRITE_EXTRA_DATA; 2047 goto end_coredump;
2054#endif 2048
2049 if (e_phnum == PN_XNUM) {
2050 size += sizeof(*shdr4extnum);
2051 if (size > cprm->limit
2052 || !dump_write(cprm->file, shdr4extnum,
2053 sizeof(*shdr4extnum)))
2054 goto end_coredump;
2055 }
2055 2056
2056end_coredump: 2057end_coredump:
2057 set_fs(fs); 2058 set_fs(fs);
2058 2059
2059cleanup: 2060cleanup:
2060 free_note_info(&info); 2061 free_note_info(&info);
2062 kfree(shdr4extnum);
2063 kfree(phdr4note);
2061 kfree(elf); 2064 kfree(elf);
2062out: 2065out:
2063 return has_dumped; 2066 return has_dumped;
2064} 2067}
2065 2068
2066#endif /* USE_ELF_CORE_DUMP */ 2069#endif /* CONFIG_ELF_CORE */
2067 2070
2068static int __init init_elf_binfmt(void) 2071static int __init init_elf_binfmt(void)
2069{ 2072{
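
Reviewer note: the e_phnum logic above implements ELF extended program-header numbering: e_phnum is a 16-bit field, so when the real segment count would exceed PN_XNUM (0xffff) the header stores PN_XNUM and the true count is parked in sh_info of a single all-null section header, built by fill_extnum_info() and written at e_shoff. A consumer-side sketch of the convention, with an invented helper name:

    #include <linux/elf.h>

    /* Hypothetical reader helper: recover the real phdr count from a core. */
    static unsigned int elf_real_phnum(const struct elfhdr *elf,
                                       const struct elf_shdr *shdr0)
    {
            if (elf->e_phnum != PN_XNUM)
                    return elf->e_phnum;
            /* fill_extnum_info() stored the overflowed count here: */
            return shdr0->sh_info;
    }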
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 38502c67987c..7ab23e006e4c 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -34,6 +34,7 @@
34#include <linux/elf.h> 34#include <linux/elf.h>
35#include <linux/elf-fdpic.h> 35#include <linux/elf-fdpic.h>
36#include <linux/elfcore.h> 36#include <linux/elfcore.h>
37#include <linux/coredump.h>
37 38
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
39#include <asm/param.h> 40#include <asm/param.h>
@@ -75,14 +76,14 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *,
75static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *, 76static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *,
76 struct file *, struct mm_struct *); 77 struct file *, struct mm_struct *);
77 78
78#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) 79#ifdef CONFIG_ELF_CORE
79static int elf_fdpic_core_dump(long, struct pt_regs *, struct file *, unsigned long limit); 80static int elf_fdpic_core_dump(struct coredump_params *cprm);
80#endif 81#endif
81 82
82static struct linux_binfmt elf_fdpic_format = { 83static struct linux_binfmt elf_fdpic_format = {
83 .module = THIS_MODULE, 84 .module = THIS_MODULE,
84 .load_binary = load_elf_fdpic_binary, 85 .load_binary = load_elf_fdpic_binary,
85#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) 86#ifdef CONFIG_ELF_CORE
86 .core_dump = elf_fdpic_core_dump, 87 .core_dump = elf_fdpic_core_dump,
87#endif 88#endif
88 .min_coredump = ELF_EXEC_PAGESIZE, 89 .min_coredump = ELF_EXEC_PAGESIZE,
@@ -171,6 +172,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
171#ifdef ELF_FDPIC_PLAT_INIT 172#ifdef ELF_FDPIC_PLAT_INIT
172 unsigned long dynaddr; 173 unsigned long dynaddr;
173#endif 174#endif
175#ifndef CONFIG_MMU
176 unsigned long stack_prot;
177#endif
174 struct file *interpreter = NULL; /* to shut gcc up */ 178 struct file *interpreter = NULL; /* to shut gcc up */
175 char *interpreter_name = NULL; 179 char *interpreter_name = NULL;
176 int executable_stack; 180 int executable_stack;
@@ -316,6 +320,11 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
316 * defunct, deceased, etc. after this point we have to exit via 320 * defunct, deceased, etc. after this point we have to exit via
317 * error_kill */ 321 * error_kill */
318 set_personality(PER_LINUX_FDPIC); 322 set_personality(PER_LINUX_FDPIC);
323 if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
324 current->personality |= READ_IMPLIES_EXEC;
325
326 setup_new_exec(bprm);
327
319 set_binfmt(&elf_fdpic_format); 328 set_binfmt(&elf_fdpic_format);
320 329
321 current->mm->start_code = 0; 330 current->mm->start_code = 0;
@@ -377,10 +386,15 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
377 if (stack_size < PAGE_SIZE * 2) 386 if (stack_size < PAGE_SIZE * 2)
378 stack_size = PAGE_SIZE * 2; 387 stack_size = PAGE_SIZE * 2;
379 388
389 stack_prot = PROT_READ | PROT_WRITE;
390 if (executable_stack == EXSTACK_ENABLE_X ||
391 (executable_stack == EXSTACK_DEFAULT && VM_STACK_FLAGS & VM_EXEC))
392 stack_prot |= PROT_EXEC;
393
380 down_write(&current->mm->mmap_sem); 394 down_write(&current->mm->mmap_sem);
381 current->mm->start_brk = do_mmap(NULL, 0, stack_size, 395 current->mm->start_brk = do_mmap(NULL, 0, stack_size, stack_prot,
382 PROT_READ | PROT_WRITE | PROT_EXEC, 396 MAP_PRIVATE | MAP_ANONYMOUS |
383 MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN, 397 MAP_UNINITIALIZED | MAP_GROWSDOWN,
384 0); 398 0);
385 399
386 if (IS_ERR_VALUE(current->mm->start_brk)) { 400 if (IS_ERR_VALUE(current->mm->start_brk)) {
@@ -1200,27 +1214,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1200 * 1214 *
1201 * Modelled on fs/binfmt_elf.c core dumper 1215 * Modelled on fs/binfmt_elf.c core dumper
1202 */ 1216 */
1203#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) 1217#ifdef CONFIG_ELF_CORE
1204
1205/*
1206 * These are the only things you should do on a core-file: use only these
1207 * functions to write out all the necessary info.
1208 */
1209static int dump_write(struct file *file, const void *addr, int nr)
1210{
1211 return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
1212}
1213
1214static int dump_seek(struct file *file, loff_t off)
1215{
1216 if (file->f_op->llseek) {
1217 if (file->f_op->llseek(file, off, SEEK_SET) != off)
1218 return 0;
1219 } else {
1220 file->f_pos = off;
1221 }
1222 return 1;
1223}
1224 1218
1225/* 1219/*
1226 * Decide whether a segment is worth dumping; default is yes to be 1220 * Decide whether a segment is worth dumping; default is yes to be
@@ -1300,34 +1294,35 @@ static int notesize(struct memelfnote *en)
1300 1294
1301/* #define DEBUG */ 1295/* #define DEBUG */
1302 1296
1303#define DUMP_WRITE(addr, nr) \ 1297#define DUMP_WRITE(addr, nr, foffset) \
1304 do { if (!dump_write(file, (addr), (nr))) return 0; } while(0) 1298 do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0)
1305#define DUMP_SEEK(off) \
1306 do { if (!dump_seek(file, (off))) return 0; } while(0)
1307 1299
1308static int writenote(struct memelfnote *men, struct file *file) 1300static int alignfile(struct file *file, loff_t *foffset)
1309{ 1301{
1310 struct elf_note en; 1302 static const char buf[4] = { 0, };
1303 DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset);
1304 return 1;
1305}
1311 1306
1307static int writenote(struct memelfnote *men, struct file *file,
1308 loff_t *foffset)
1309{
1310 struct elf_note en;
1312 en.n_namesz = strlen(men->name) + 1; 1311 en.n_namesz = strlen(men->name) + 1;
1313 en.n_descsz = men->datasz; 1312 en.n_descsz = men->datasz;
1314 en.n_type = men->type; 1313 en.n_type = men->type;
1315 1314
1316 DUMP_WRITE(&en, sizeof(en)); 1315 DUMP_WRITE(&en, sizeof(en), foffset);
1317 DUMP_WRITE(men->name, en.n_namesz); 1316 DUMP_WRITE(men->name, en.n_namesz, foffset);
1318 /* XXX - cast from long long to long to avoid need for libgcc.a */ 1317 if (!alignfile(file, foffset))
1319 DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); /* XXX */ 1318 return 0;
1320 DUMP_WRITE(men->data, men->datasz); 1319 DUMP_WRITE(men->data, men->datasz, foffset);
1321 DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); /* XXX */ 1320 if (!alignfile(file, foffset))
1321 return 0;
1322 1322
1323 return 1; 1323 return 1;
1324} 1324}
1325#undef DUMP_WRITE 1325#undef DUMP_WRITE
1326#undef DUMP_SEEK
1327
1328#define DUMP_WRITE(addr, nr) \
1329 if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
1330 goto end_coredump;
1331 1326
1332static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) 1327static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs)
1333{ 1328{
@@ -1379,7 +1374,7 @@ static inline void fill_note(struct memelfnote *note, const char *name, int type
1379 1374
1380/* 1375/*
1381 * fill up all the fields in prstatus from the given task struct, except 1376 * fill up all the fields in prstatus from the given task struct, except
1382 * registers which need to be filled up seperately. 1377 * registers which need to be filled up separately.
1383 */ 1378 */
1384static void fill_prstatus(struct elf_prstatus *prstatus, 1379static void fill_prstatus(struct elf_prstatus *prstatus,
1385 struct task_struct *p, long signr) 1380 struct task_struct *p, long signr)
@@ -1510,6 +1505,22 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
1510 return sz; 1505 return sz;
1511} 1506}
1512 1507
1508static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
1509 elf_addr_t e_shoff, int segs)
1510{
1511 elf->e_shoff = e_shoff;
1512 elf->e_shentsize = sizeof(*shdr4extnum);
1513 elf->e_shnum = 1;
1514 elf->e_shstrndx = SHN_UNDEF;
1515
1516 memset(shdr4extnum, 0, sizeof(*shdr4extnum));
1517
1518 shdr4extnum->sh_type = SHT_NULL;
1519 shdr4extnum->sh_size = elf->e_shnum;
1520 shdr4extnum->sh_link = elf->e_shstrndx;
1521 shdr4extnum->sh_info = segs;
1522}
1523
1513/* 1524/*
1514 * dump the segments for an MMU process 1525 * dump the segments for an MMU process
1515 */ 1526 */
@@ -1538,7 +1549,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
1538 err = -EIO; 1549 err = -EIO;
1539 kunmap(page); 1550 kunmap(page);
1540 page_cache_release(page); 1551 page_cache_release(page);
1541 } else if (!dump_seek(file, file->f_pos + PAGE_SIZE)) 1552 } else if (!dump_seek(file, PAGE_SIZE))
1542 err = -EFBIG; 1553 err = -EFBIG;
1543 if (err) 1554 if (err)
1544 goto out; 1555 goto out;
@@ -1574,6 +1585,17 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
1574} 1585}
1575#endif 1586#endif
1576 1587
1588static size_t elf_core_vma_data_size(unsigned long mm_flags)
1589{
1590 struct vm_area_struct *vma;
1591 size_t size = 0;
1592
1593 for (vma = current->mm->mmap; vma; vma = vma->vm_next)
1594 if (maydump(vma, mm_flags))
1595 size += vma->vm_end - vma->vm_start;
1596 return size;
1597}
1598
1577/* 1599/*
1578 * Actual dumper 1600 * Actual dumper
1579 * 1601 *
@@ -1581,8 +1603,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
1581 * and then they are actually written out. If we run out of core limit 1603 * and then they are actually written out. If we run out of core limit
1582 * we just truncate. 1604 * we just truncate.
1583 */ 1605 */
1584static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, 1606static int elf_fdpic_core_dump(struct coredump_params *cprm)
1585 struct file *file, unsigned long limit)
1586{ 1607{
1587#define NUM_NOTES 6 1608#define NUM_NOTES 6
1588 int has_dumped = 0; 1609 int has_dumped = 0;
@@ -1592,7 +1613,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1592 int i; 1613 int i;
1593 struct vm_area_struct *vma; 1614 struct vm_area_struct *vma;
1594 struct elfhdr *elf = NULL; 1615 struct elfhdr *elf = NULL;
1595 loff_t offset = 0, dataoff; 1616 loff_t offset = 0, dataoff, foffset;
1596 int numnote; 1617 int numnote;
1597 struct memelfnote *notes = NULL; 1618 struct memelfnote *notes = NULL;
1598 struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ 1619 struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */
@@ -1605,7 +1626,10 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1605#endif 1626#endif
1606 int thread_status_size = 0; 1627 int thread_status_size = 0;
1607 elf_addr_t *auxv; 1628 elf_addr_t *auxv;
1608 unsigned long mm_flags; 1629 struct elf_phdr *phdr4note = NULL;
1630 struct elf_shdr *shdr4extnum = NULL;
1631 Elf_Half e_phnum;
1632 elf_addr_t e_shoff;
1609 1633
1610 /* 1634 /*
1611 * We no longer stop all VM operations. 1635 * We no longer stop all VM operations.
@@ -1641,7 +1665,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1641 goto cleanup; 1665 goto cleanup;
1642#endif 1666#endif
1643 1667
1644 if (signr) { 1668 if (cprm->signr) {
1645 struct core_thread *ct; 1669 struct core_thread *ct;
1646 struct elf_thread_status *tmp; 1670 struct elf_thread_status *tmp;
1647 1671
@@ -1660,22 +1684,28 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1660 int sz; 1684 int sz;
1661 1685
1662 tmp = list_entry(t, struct elf_thread_status, list); 1686 tmp = list_entry(t, struct elf_thread_status, list);
1663 sz = elf_dump_thread_status(signr, tmp); 1687 sz = elf_dump_thread_status(cprm->signr, tmp);
1664 thread_status_size += sz; 1688 thread_status_size += sz;
1665 } 1689 }
1666 } 1690 }
1667 1691
1668 /* now collect the dump for the current */ 1692 /* now collect the dump for the current */
1669 fill_prstatus(prstatus, current, signr); 1693 fill_prstatus(prstatus, current, cprm->signr);
1670 elf_core_copy_regs(&prstatus->pr_reg, regs); 1694 elf_core_copy_regs(&prstatus->pr_reg, cprm->regs);
1671 1695
1672 segs = current->mm->map_count; 1696 segs = current->mm->map_count;
1673#ifdef ELF_CORE_EXTRA_PHDRS 1697 segs += elf_core_extra_phdrs();
1674 segs += ELF_CORE_EXTRA_PHDRS; 1698
1675#endif 1699 /* for notes section */
1700 segs++;
1701
1702 /* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid
1703 * this, kernel supports extended numbering. Have a look at
1704 * include/linux/elf.h for further information. */
1705 e_phnum = segs > PN_XNUM ? PN_XNUM : segs;
1676 1706
1677 /* Set up header */ 1707 /* Set up header */
1678 fill_elf_fdpic_header(elf, segs + 1); /* including notes section */ 1708 fill_elf_fdpic_header(elf, e_phnum);
1679 1709
1680 has_dumped = 1; 1710 has_dumped = 1;
1681 current->flags |= PF_DUMPCORE; 1711 current->flags |= PF_DUMPCORE;
@@ -1702,7 +1732,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1702 1732
1703 /* Try to dump the FPU. */ 1733 /* Try to dump the FPU. */
1704 if ((prstatus->pr_fpvalid = 1734 if ((prstatus->pr_fpvalid =
1705 elf_core_copy_task_fpregs(current, regs, fpu))) 1735 elf_core_copy_task_fpregs(current, cprm->regs, fpu)))
1706 fill_note(notes + numnote++, 1736 fill_note(notes + numnote++,
1707 "CORE", NT_PRFPREG, sizeof(*fpu), fpu); 1737 "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
1708#ifdef ELF_CORE_COPY_XFPREGS 1738#ifdef ELF_CORE_COPY_XFPREGS
@@ -1714,13 +1744,12 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1714 fs = get_fs(); 1744 fs = get_fs();
1715 set_fs(KERNEL_DS); 1745 set_fs(KERNEL_DS);
1716 1746
1717 DUMP_WRITE(elf, sizeof(*elf));
1718 offset += sizeof(*elf); /* Elf header */ 1747 offset += sizeof(*elf); /* Elf header */
1719 offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */ 1748 offset += segs * sizeof(struct elf_phdr); /* Program headers */
1749 foffset = offset;
1720 1750
1721 /* Write notes phdr entry */ 1751 /* Write notes phdr entry */
1722 { 1752 {
1723 struct elf_phdr phdr;
1724 int sz = 0; 1753 int sz = 0;
1725 1754
1726 for (i = 0; i < numnote; i++) 1755 for (i = 0; i < numnote; i++)
@@ -1728,20 +1757,38 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1728 1757
1729 sz += thread_status_size; 1758 sz += thread_status_size;
1730 1759
1731 fill_elf_note_phdr(&phdr, sz, offset); 1760 phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
1761 if (!phdr4note)
1762 goto end_coredump;
1763
1764 fill_elf_note_phdr(phdr4note, sz, offset);
1732 offset += sz; 1765 offset += sz;
1733 DUMP_WRITE(&phdr, sizeof(phdr));
1734 } 1766 }
1735 1767
1736 /* Page-align dumped data */ 1768 /* Page-align dumped data */
1737 dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); 1769 dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
1738 1770
1739 /* 1771 offset += elf_core_vma_data_size(cprm->mm_flags);
1740 * We must use the same mm->flags while dumping core to avoid 1772 offset += elf_core_extra_data_size();
1741 * inconsistency between the program headers and bodies, otherwise an 1773 e_shoff = offset;
1742 * unusable core file can be generated. 1774
1743 */ 1775 if (e_phnum == PN_XNUM) {
1744 mm_flags = current->mm->flags; 1776 shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL);
1777 if (!shdr4extnum)
1778 goto end_coredump;
1779 fill_extnum_info(elf, shdr4extnum, e_shoff, segs);
1780 }
1781
1782 offset = dataoff;
1783
1784 size += sizeof(*elf);
1785 if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf)))
1786 goto end_coredump;
1787
1788 size += sizeof(*phdr4note);
1789 if (size > cprm->limit
1790 || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note)))
1791 goto end_coredump;
1745 1792
1746 /* write program headers for segments dump */ 1793 /* write program headers for segments dump */
1747 for (vma = current->mm->mmap; vma; vma = vma->vm_next) { 1794 for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
@@ -1754,7 +1801,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1754 phdr.p_offset = offset; 1801 phdr.p_offset = offset;
1755 phdr.p_vaddr = vma->vm_start; 1802 phdr.p_vaddr = vma->vm_start;
1756 phdr.p_paddr = 0; 1803 phdr.p_paddr = 0;
1757 phdr.p_filesz = maydump(vma, mm_flags) ? sz : 0; 1804 phdr.p_filesz = maydump(vma, cprm->mm_flags) ? sz : 0;
1758 phdr.p_memsz = sz; 1805 phdr.p_memsz = sz;
1759 offset += phdr.p_filesz; 1806 offset += phdr.p_filesz;
1760 phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; 1807 phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
@@ -1764,16 +1811,18 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1764 phdr.p_flags |= PF_X; 1811 phdr.p_flags |= PF_X;
1765 phdr.p_align = ELF_EXEC_PAGESIZE; 1812 phdr.p_align = ELF_EXEC_PAGESIZE;
1766 1813
1767 DUMP_WRITE(&phdr, sizeof(phdr)); 1814 size += sizeof(phdr);
1815 if (size > cprm->limit
1816 || !dump_write(cprm->file, &phdr, sizeof(phdr)))
1817 goto end_coredump;
1768 } 1818 }
1769 1819
1770#ifdef ELF_CORE_WRITE_EXTRA_PHDRS 1820 if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit))
1771 ELF_CORE_WRITE_EXTRA_PHDRS; 1821 goto end_coredump;
1772#endif
1773 1822
1774 /* write out the notes section */ 1823 /* write out the notes section */
1775 for (i = 0; i < numnote; i++) 1824 for (i = 0; i < numnote; i++)
1776 if (!writenote(notes + i, file)) 1825 if (!writenote(notes + i, cprm->file, &foffset))
1777 goto end_coredump; 1826 goto end_coredump;
1778 1827
1779 /* write out the thread status notes section */ 1828 /* write out the thread status notes section */
@@ -1782,25 +1831,33 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1782 list_entry(t, struct elf_thread_status, list); 1831 list_entry(t, struct elf_thread_status, list);
1783 1832
1784 for (i = 0; i < tmp->num_notes; i++) 1833 for (i = 0; i < tmp->num_notes; i++)
1785 if (!writenote(&tmp->notes[i], file)) 1834 if (!writenote(&tmp->notes[i], cprm->file, &foffset))
1786 goto end_coredump; 1835 goto end_coredump;
1787 } 1836 }
1788 1837
1789 if (!dump_seek(file, dataoff)) 1838 if (!dump_seek(cprm->file, dataoff - foffset))
1790 goto end_coredump; 1839 goto end_coredump;
1791 1840
1792 if (elf_fdpic_dump_segments(file, &size, &limit, mm_flags) < 0) 1841 if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit,
1842 cprm->mm_flags) < 0)
1793 goto end_coredump; 1843 goto end_coredump;
1794 1844
1795#ifdef ELF_CORE_WRITE_EXTRA_DATA 1845 if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit))
1796 ELF_CORE_WRITE_EXTRA_DATA; 1846 goto end_coredump;
1797#endif
1798 1847
1799 if (file->f_pos != offset) { 1848 if (e_phnum == PN_XNUM) {
1849 size += sizeof(*shdr4extnum);
1850 if (size > cprm->limit
1851 || !dump_write(cprm->file, shdr4extnum,
1852 sizeof(*shdr4extnum)))
1853 goto end_coredump;
1854 }
1855
1856 if (cprm->file->f_pos != offset) {
1800 /* Sanity check */ 1857 /* Sanity check */
1801 printk(KERN_WARNING 1858 printk(KERN_WARNING
1802 "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n", 1859 "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n",
1803 file->f_pos, offset); 1860 cprm->file->f_pos, offset);
1804 } 1861 }
1805 1862
1806end_coredump: 1863end_coredump:
@@ -1812,7 +1869,7 @@ cleanup:
1812 list_del(tmp); 1869 list_del(tmp);
1813 kfree(list_entry(tmp, struct elf_thread_status, list)); 1870 kfree(list_entry(tmp, struct elf_thread_status, list));
1814 } 1871 }
1815 1872 kfree(phdr4note);
1816 kfree(elf); 1873 kfree(elf);
1817 kfree(prstatus); 1874 kfree(prstatus);
1818 kfree(psinfo); 1875 kfree(psinfo);
@@ -1825,4 +1882,4 @@ cleanup:
1825#undef NUM_NOTES 1882#undef NUM_NOTES
1826} 1883}
1827 1884
1828#endif /* USE_ELF_CORE_DUMP */ 1885#endif /* CONFIG_ELF_CORE */
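
Reviewer note: two bookkeeping changes run through the FDPIC dumper above. First, dump_seek() is now called with a relative count (see the elf_fdpic_dump_segments hunk), so writenote()/alignfile() thread an explicit logical position *foffset instead of trusting file->f_pos; plausibly this is what keeps the dump correct when the target cannot seek, though the patch does not say so. Second, the whole file layout is computed before anything is written, so the PN_XNUM section header has a known home at e_shoff. A schematic of that arithmetic, using the names from the hunks:

    #include <linux/elf.h>
    #include <linux/kernel.h>

    /* Sketch: up-front layout accounting; notes_size stands in for the
     * summed note sizes, vma_data/extra_data for the segment payloads. */
    static loff_t example_core_layout(int segs, size_t notes_size,
                                      size_t vma_data, size_t extra_data,
                                      loff_t *dataoff, loff_t *e_shoff)
    {
            loff_t offset = sizeof(struct elfhdr);      /* ELF header      */

            offset += segs * sizeof(struct elf_phdr);   /* program headers */
            offset += notes_size;                       /* PT_NOTE payload */
            *dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
            offset += vma_data + extra_data;            /* segment bodies  */
            *e_shoff = offset;                          /* extnum shdr     */
            return offset;
    }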
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 32fb00b52cd0..b8e8b0acf9bd 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -11,7 +11,6 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/stat.h> 13#include <linux/stat.h>
14#include <linux/slab.h>
15#include <linux/binfmts.h> 14#include <linux/binfmts.h>
16#include <linux/elf.h> 15#include <linux/elf.h>
17#include <linux/init.h> 16#include <linux/init.h>
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index a2796651e756..e0e769bdca59 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -87,7 +87,7 @@ static int load_flat_shared_library(int id, struct lib_info *p);
87#endif 87#endif
88 88
89static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs); 89static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs);
90static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); 90static int flat_core_dump(struct coredump_params *cprm);
91 91
92static struct linux_binfmt flat_format = { 92static struct linux_binfmt flat_format = {
93 .module = THIS_MODULE, 93 .module = THIS_MODULE,
@@ -102,10 +102,10 @@ static struct linux_binfmt flat_format = {
102 * Currently only a stub-function. 102 * Currently only a stub-function.
103 */ 103 */
104 104
105static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) 105static int flat_core_dump(struct coredump_params *cprm)
106{ 106{
107 printk("Process %s:%d received signr %d and should have core dumped\n", 107 printk("Process %s:%d received signr %d and should have core dumped\n",
108 current->comm, current->pid, (int) signr); 108 current->comm, current->pid, (int) cprm->signr);
109 return(1); 109 return(1);
110} 110}
111 111
@@ -501,7 +501,7 @@ static int load_flat_file(struct linux_binprm * bprm,
501 * size limits imposed on them by creating programs with large 501 * size limits imposed on them by creating programs with large
502 * arrays in the data or bss. 502 * arrays in the data or bss.
503 */ 503 */
504 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; 504 rlim = rlimit(RLIMIT_DATA);
505 if (rlim >= RLIM_INFINITY) 505 if (rlim >= RLIM_INFINITY)
506 rlim = ~0; 506 rlim = ~0;
507 if (data_len + bss_len > rlim) { 507 if (data_len + bss_len > rlim) {
@@ -519,6 +519,7 @@ static int load_flat_file(struct linux_binprm * bprm,
519 519
520 /* OK, This is the point of no return */ 520 /* OK, This is the point of no return */
521 set_personality(PER_LINUX_32BIT); 521 set_personality(PER_LINUX_32BIT);
522 setup_new_exec(bprm);
522 } 523 }
523 524
524 /* 525 /*
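
The rlimit() accessor used in the hunk above replaces open-coded dereferences of current->signal->rlim[...].rlim_cur. A minimal sketch of the resulting bounds check, assuming the stock rlimit() helper from <linux/sched.h>; check_data_size() itself is illustrative, not a function in this file:

        static int check_data_size(unsigned long data_len, unsigned long bss_len)
        {
                unsigned long rlim = rlimit(RLIMIT_DATA);  /* current task's soft limit */

                if (rlim >= RLIM_INFINITY)
                        rlim = ~0;                         /* "unlimited" caps at ULONG_MAX */
                if (data_len + bss_len > rlim)
                        return -ENOMEM;                    /* refuse oversized data+bss */
                return 0;
        }
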
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 08343505e184..aca9d55afb22 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -8,7 +8,6 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/string.h> 9#include <linux/string.h>
10#include <linux/stat.h> 10#include <linux/stat.h>
11#include <linux/slab.h>
12#include <linux/binfmts.h> 11#include <linux/binfmts.h>
13#include <linux/init.h> 12#include <linux/init.h>
14#include <linux/file.h> 13#include <linux/file.h>
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index eff74b9c9e77..cc8560f6c9b0 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -43,7 +43,7 @@ static int load_som_library(struct file *);
43 * don't even try. 43 * don't even try.
44 */ 44 */
45#if 0 45#if 0
46static int som_core_dump(long signr, struct pt_regs *regs, unsigned long limit); 46static int som_core_dump(struct coredump_params *cprm);
47#else 47#else
48#define som_core_dump NULL 48#define som_core_dump NULL
49#endif 49#endif
@@ -227,6 +227,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
227 /* OK, This is the point of no return */ 227 /* OK, This is the point of no return */
228 current->flags &= ~PF_FORKNOEXEC; 228 current->flags &= ~PF_FORKNOEXEC;
229 current->personality = PER_HPUX; 229 current->personality = PER_HPUX;
230 setup_new_exec(bprm);
230 231
231 /* Set the task size for HP-UX processes such that 232 /* Set the task size for HP-UX processes such that
232 * the gateway page is outside the address space. 233 * the gateway page is outside the address space.
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 49a34e7f7306..612a5c38d3c1 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -24,6 +24,7 @@
24#include <linux/mempool.h> 24#include <linux/mempool.h>
25#include <linux/bio.h> 25#include <linux/bio.h>
26#include <linux/workqueue.h> 26#include <linux/workqueue.h>
27#include <linux/slab.h>
27 28
28struct integrity_slab { 29struct integrity_slab {
29 struct kmem_cache *slab; 30 struct kmem_cache *slab;
@@ -61,7 +62,7 @@ static inline unsigned int vecs_to_idx(unsigned int nr)
61 62
62static inline int use_bip_pool(unsigned int idx) 63static inline int use_bip_pool(unsigned int idx)
63{ 64{
64 if (idx == BIOVEC_NR_POOLS) 65 if (idx == BIOVEC_MAX_IDX)
65 return 1; 66 return 1;
66 67
67 return 0; 68 return 0;
@@ -95,6 +96,7 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
95 96
96 /* Use mempool if lower order alloc failed or max vecs were requested */ 97 /* Use mempool if lower order alloc failed or max vecs were requested */
97 if (bip == NULL) { 98 if (bip == NULL) {
99 idx = BIOVEC_MAX_IDX; /* so we free the payload properly later */
98 bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); 100 bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
99 101
100 if (unlikely(bip == NULL)) { 102 if (unlikely(bip == NULL)) {
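
The one-line fix above matters because the free path keys off the slab index recorded at allocation time. A sketch of that pairing, assuming the file's integrity_slab table is named bip_slab and the payload records its index in bip->bip_slab (both as upstream); the helper itself is illustrative:

        static void bip_free_sketch(struct bio_integrity_payload *bip,
                                    struct bio_set *bs)
        {
                /* BIOVEC_MAX_IDX marks "came from the mempool"; anything else
                 * is a kmem_cache index.  Without forcing idx on the fallback
                 * path, a mempool object would be handed to kmem_cache_free(). */
                if (use_bip_pool(bip->bip_slab))
                        mempool_free(bip, bs->bio_integrity_pool);
                else
                        kmem_cache_free(bip_slab[bip->bip_slab].slab, bip);
        }
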
diff --git a/fs/bio.c b/fs/bio.c
index 12da5db8682c..e1f922184b45 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -78,7 +78,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
78 78
79 i = 0; 79 i = 0;
80 while (i < bio_slab_nr) { 80 while (i < bio_slab_nr) {
81 struct bio_slab *bslab = &bio_slabs[i]; 81 bslab = &bio_slabs[i];
82 82
83 if (!bslab->slab && entry == -1) 83 if (!bslab->slab && entry == -1)
84 entry = i; 84 entry = i;
@@ -264,15 +264,14 @@ EXPORT_SYMBOL(bio_init);
264 * bio_alloc_bioset - allocate a bio for I/O 264 * bio_alloc_bioset - allocate a bio for I/O
265 * @gfp_mask: the GFP_ mask given to the slab allocator 265 * @gfp_mask: the GFP_ mask given to the slab allocator
266 * @nr_iovecs: number of iovecs to pre-allocate 266 * @nr_iovecs: number of iovecs to pre-allocate
267 * @bs: the bio_set to allocate from. If %NULL, just use kmalloc 267 * @bs: the bio_set to allocate from.
268 * 268 *
269 * Description: 269 * Description:
270 * bio_alloc_bioset will first try its own mempool to satisfy the allocation. 270 * bio_alloc_bioset will try its own mempool to satisfy the allocation.
271 * If %__GFP_WAIT is set then we will block on the internal pool waiting 271 * If %__GFP_WAIT is set then we will block on the internal pool waiting
272 * for a &struct bio to become free. If a %NULL @bs is passed in, we will 272 * for a &struct bio to become free.
273 * fall back to just using @kmalloc to allocate the required memory.
274 * 273 *
275 * Note that the caller must set ->bi_destructor on succesful return 274 * Note that the caller must set ->bi_destructor on successful return
276 * of a bio, to do the appropriate freeing of the bio once the reference 275 * of a bio, to do the appropriate freeing of the bio once the reference
277 * count drops to zero. 276 * count drops to zero.
278 **/ 277 **/
@@ -507,10 +506,8 @@ int bio_get_nr_vecs(struct block_device *bdev)
507 int nr_pages; 506 int nr_pages;
508 507
509 nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; 508 nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
510 if (nr_pages > queue_max_phys_segments(q)) 509 if (nr_pages > queue_max_segments(q))
511 nr_pages = queue_max_phys_segments(q); 510 nr_pages = queue_max_segments(q);
512 if (nr_pages > queue_max_hw_segments(q))
513 nr_pages = queue_max_hw_segments(q);
514 511
515 return nr_pages; 512 return nr_pages;
516} 513}
@@ -542,13 +539,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
542 539
543 if (page == prev->bv_page && 540 if (page == prev->bv_page &&
544 offset == prev->bv_offset + prev->bv_len) { 541 offset == prev->bv_offset + prev->bv_len) {
542 unsigned int prev_bv_len = prev->bv_len;
545 prev->bv_len += len; 543 prev->bv_len += len;
546 544
547 if (q->merge_bvec_fn) { 545 if (q->merge_bvec_fn) {
548 struct bvec_merge_data bvm = { 546 struct bvec_merge_data bvm = {
547 /* prev_bvec is already charged in
548 bi_size, discharge it in order to
549 simulate merging updated prev_bvec
550 as new bvec. */
549 .bi_bdev = bio->bi_bdev, 551 .bi_bdev = bio->bi_bdev,
550 .bi_sector = bio->bi_sector, 552 .bi_sector = bio->bi_sector,
551 .bi_size = bio->bi_size, 553 .bi_size = bio->bi_size - prev_bv_len,
552 .bi_rw = bio->bi_rw, 554 .bi_rw = bio->bi_rw,
553 }; 555 };
554 556
@@ -570,8 +572,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
570 * make this too complex. 572 * make this too complex.
571 */ 573 */
572 574
573 while (bio->bi_phys_segments >= queue_max_phys_segments(q) 575 while (bio->bi_phys_segments >= queue_max_segments(q)) {
574 || bio->bi_phys_segments >= queue_max_hw_segments(q)) {
575 576
576 if (retried_segments) 577 if (retried_segments)
577 return 0; 578 return 0;
@@ -1393,6 +1394,18 @@ void bio_check_pages_dirty(struct bio *bio)
1393 } 1394 }
1394} 1395}
1395 1396
1397#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
1398void bio_flush_dcache_pages(struct bio *bi)
1399{
1400 int i;
1401 struct bio_vec *bvec;
1402
1403 bio_for_each_segment(bvec, bi, i)
1404 flush_dcache_page(bvec->bv_page);
1405}
1406EXPORT_SYMBOL(bio_flush_dcache_pages);
1407#endif
1408
1396/** 1409/**
1397 * bio_endio - end I/O on a bio 1410 * bio_endio - end I/O on a bio
1398 * @bio: bio 1411 * @bio: bio
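
The prev_bv_len bookkeeping above exists because prev->bv_len has already been grown (and charged to bio->bi_size) by the time ->merge_bvec_fn runs. A sketch of the corrected probe, using the stock bvec_merge_data layout; the wrapper function is illustrative:

        static int probe_merge_sketch(struct request_queue *q, struct bio *bio,
                                      struct bio_vec *prev, unsigned int prev_bv_len)
        {
                struct bvec_merge_data bvm = {
                        .bi_bdev   = bio->bi_bdev,
                        .bi_sector = bio->bi_sector,
                        /* discharge the old bvec so the driver is asked about
                         * the merged bvec as if it were newly added */
                        .bi_size   = bio->bi_size - prev_bv_len,
                        .bi_rw     = bio->bi_rw,
                };

                /* the driver must accept the whole merged bvec */
                return q->merge_bvec_fn(q, &bvm, prev) >= prev->bv_len;
        }
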
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8bed0557d88c..d11d0289f3d2 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -246,7 +246,8 @@ struct super_block *freeze_bdev(struct block_device *bdev)
246 if (!sb) 246 if (!sb)
247 goto out; 247 goto out;
248 if (sb->s_flags & MS_RDONLY) { 248 if (sb->s_flags & MS_RDONLY) {
249 deactivate_locked_super(sb); 249 sb->s_frozen = SB_FREEZE_TRANS;
250 up_write(&sb->s_umount);
250 mutex_unlock(&bdev->bd_fsfreeze_mutex); 251 mutex_unlock(&bdev->bd_fsfreeze_mutex);
251 return sb; 252 return sb;
252 } 253 }
@@ -307,7 +308,7 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
307 BUG_ON(sb->s_bdev != bdev); 308 BUG_ON(sb->s_bdev != bdev);
308 down_write(&sb->s_umount); 309 down_write(&sb->s_umount);
309 if (sb->s_flags & MS_RDONLY) 310 if (sb->s_flags & MS_RDONLY)
310 goto out_deactivate; 311 goto out_unfrozen;
311 312
312 if (sb->s_op->unfreeze_fs) { 313 if (sb->s_op->unfreeze_fs) {
313 error = sb->s_op->unfreeze_fs(sb); 314 error = sb->s_op->unfreeze_fs(sb);
@@ -321,11 +322,11 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
321 } 322 }
322 } 323 }
323 324
325out_unfrozen:
324 sb->s_frozen = SB_UNFROZEN; 326 sb->s_frozen = SB_UNFROZEN;
325 smp_wmb(); 327 smp_wmb();
326 wake_up(&sb->s_wait_unfrozen); 328 wake_up(&sb->s_wait_unfrozen);
327 329
328out_deactivate:
329 if (sb) 330 if (sb)
330 deactivate_locked_super(sb); 331 deactivate_locked_super(sb);
331out_unlock: 332out_unlock:
@@ -405,7 +406,17 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
405 406
406static int block_fsync(struct file *filp, struct dentry *dentry, int datasync) 407static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
407{ 408{
408 return sync_blockdev(I_BDEV(filp->f_mapping->host)); 409 struct block_device *bdev = I_BDEV(filp->f_mapping->host);
410 int error;
411
412 error = sync_blockdev(bdev);
413 if (error)
414 return error;
415
416 error = blkdev_issue_flush(bdev, NULL);
417 if (error == -EOPNOTSUPP)
418 error = 0;
419 return error;
409} 420}
410 421
411/* 422/*
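
block_fsync() now follows sync_blockdev() with an explicit cache flush so that fsync(2) on a raw block device actually reaches stable storage. -EOPNOTSUPP is squashed because a device without a volatile write cache may legitimately not implement flush; that is not an fsync failure. A sketch of just that squashing, assuming blkdev_issue_flush() as used in the hunk:

        static int flush_if_supported(struct block_device *bdev)
        {
                int err = blkdev_issue_flush(bdev, NULL);

                /* no flush support usually means no volatile cache: success */
                return (err == -EOPNOTSUPP) ? 0 : err;
        }
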
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index da3133c69830..6ef7b26724ec 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -22,6 +22,7 @@
22#include <linux/posix_acl_xattr.h> 22#include <linux/posix_acl_xattr.h>
23#include <linux/posix_acl.h> 23#include <linux/posix_acl.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/slab.h>
25 26
26#include "ctree.h" 27#include "ctree.h"
27#include "btrfs_inode.h" 28#include "btrfs_inode.h"
@@ -73,13 +74,13 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
73 return acl; 74 return acl;
74} 75}
75 76
76static int btrfs_xattr_get_acl(struct inode *inode, int type, 77static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
77 void *value, size_t size) 78 void *value, size_t size, int type)
78{ 79{
79 struct posix_acl *acl; 80 struct posix_acl *acl;
80 int ret = 0; 81 int ret = 0;
81 82
82 acl = btrfs_get_acl(inode, type); 83 acl = btrfs_get_acl(dentry->d_inode, type);
83 84
84 if (IS_ERR(acl)) 85 if (IS_ERR(acl))
85 return PTR_ERR(acl); 86 return PTR_ERR(acl);
@@ -153,8 +154,8 @@ out:
153 return ret; 154 return ret;
154} 155}
155 156
156static int btrfs_xattr_set_acl(struct inode *inode, int type, 157static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
157 const void *value, size_t size) 158 const void *value, size_t size, int flags, int type)
158{ 159{
159 int ret; 160 int ret;
160 struct posix_acl *acl = NULL; 161 struct posix_acl *acl = NULL;
@@ -169,38 +170,13 @@ static int btrfs_xattr_set_acl(struct inode *inode, int type,
169 } 170 }
170 } 171 }
171 172
172 ret = btrfs_set_acl(NULL, inode, acl, type); 173 ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type);
173 174
174 posix_acl_release(acl); 175 posix_acl_release(acl);
175 176
176 return ret; 177 return ret;
177} 178}
178 179
179
180static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name,
181 void *value, size_t size)
182{
183 return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size);
184}
185
186static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name,
187 const void *value, size_t size, int flags)
188{
189 return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
190}
191
192static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
193 void *value, size_t size)
194{
195 return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size);
196}
197
198static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
199 const void *value, size_t size, int flags)
200{
201 return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
202}
203
204int btrfs_check_acl(struct inode *inode, int mask) 180int btrfs_check_acl(struct inode *inode, int mask)
205{ 181{
206 struct posix_acl *acl; 182 struct posix_acl *acl;
@@ -308,14 +284,16 @@ int btrfs_acl_chmod(struct inode *inode)
308 284
309struct xattr_handler btrfs_xattr_acl_default_handler = { 285struct xattr_handler btrfs_xattr_acl_default_handler = {
310 .prefix = POSIX_ACL_XATTR_DEFAULT, 286 .prefix = POSIX_ACL_XATTR_DEFAULT,
311 .get = btrfs_xattr_acl_default_get, 287 .flags = ACL_TYPE_DEFAULT,
312 .set = btrfs_xattr_acl_default_set, 288 .get = btrfs_xattr_acl_get,
289 .set = btrfs_xattr_acl_set,
313}; 290};
314 291
315struct xattr_handler btrfs_xattr_acl_access_handler = { 292struct xattr_handler btrfs_xattr_acl_access_handler = {
316 .prefix = POSIX_ACL_XATTR_ACCESS, 293 .prefix = POSIX_ACL_XATTR_ACCESS,
317 .get = btrfs_xattr_acl_access_get, 294 .flags = ACL_TYPE_ACCESS,
318 .set = btrfs_xattr_acl_access_set, 295 .get = btrfs_xattr_acl_get,
296 .set = btrfs_xattr_acl_set,
319}; 297};
320 298
321#else /* CONFIG_BTRFS_FS_POSIX_ACL */ 299#else /* CONFIG_BTRFS_FS_POSIX_ACL */
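
The four per-type callbacks collapse into one get/set pair because the generic xattr code now passes handler->flags through as the 'type' argument, and the handlers receive a dentry rather than an inode. A sketch of the resulting shape, with my_get_acl() standing in as a hypothetical filesystem ACL lookup (illustrative, not btrfs's exact body):

        static int acl_get_sketch(struct dentry *dentry, const char *name,
                                  void *value, size_t size, int type)
        {
                /* 'type' arrives from handler->flags: ACL_TYPE_ACCESS or
                 * ACL_TYPE_DEFAULT, depending on which xattr was read */
                struct posix_acl *acl = my_get_acl(dentry->d_inode, type);
                int ret;

                if (IS_ERR(acl))
                        return PTR_ERR(acl);
                if (acl == NULL)
                        return -ENODATA;
                ret = posix_acl_to_xattr(acl, value, size);
                posix_acl_release(acl);
                return ret;
        }
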
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c0861e781cdb..462859a30141 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/kthread.h> 19#include <linux/kthread.h>
20#include <linux/slab.h>
20#include <linux/list.h> 21#include <linux/list.h>
21#include <linux/spinlock.h> 22#include <linux/spinlock.h>
22#include <linux/freezer.h> 23#include <linux/freezer.h>
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1d54c5308df5..396039b3a8a2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -31,6 +31,7 @@
31#include <linux/swap.h> 31#include <linux/swap.h>
32#include <linux/writeback.h> 32#include <linux/writeback.h>
33#include <linux/bit_spinlock.h> 33#include <linux/bit_spinlock.h>
34#include <linux/slab.h>
34#include "compat.h" 35#include "compat.h"
35#include "ctree.h" 36#include "ctree.h"
36#include "disk-io.h" 37#include "disk-io.h"
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index babf7fbaec84..6795a713b205 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "disk-io.h" 22#include "disk-io.h"
22#include "transaction.h" 23#include "transaction.h"
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ae8c40922c54..746a7248678e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -26,6 +26,7 @@
26#include <linux/completion.h> 26#include <linux/completion.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/wait.h> 28#include <linux/wait.h>
29#include <linux/slab.h>
29#include <asm/kmap_types.h> 30#include <asm/kmap_types.h>
30#include "extent_io.h" 31#include "extent_io.h"
31#include "extent_map.h" 32#include "extent_map.h"
@@ -2327,7 +2328,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2327int btrfs_readpage(struct file *file, struct page *page); 2328int btrfs_readpage(struct file *file, struct page *page);
2328void btrfs_delete_inode(struct inode *inode); 2329void btrfs_delete_inode(struct inode *inode);
2329void btrfs_put_inode(struct inode *inode); 2330void btrfs_put_inode(struct inode *inode);
2330int btrfs_write_inode(struct inode *inode, int wait); 2331int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
2331void btrfs_dirty_inode(struct inode *inode); 2332void btrfs_dirty_inode(struct inode *inode);
2332struct inode *btrfs_alloc_inode(struct super_block *sb); 2333struct inode *btrfs_alloc_inode(struct super_block *sb);
2333void btrfs_destroy_inode(struct inode *inode); 2334void btrfs_destroy_inode(struct inode *inode);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 84e6781413b1..902ce507c4e3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include <linux/sort.h> 21#include <linux/sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "delayed-ref.h" 23#include "delayed-ref.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6632e5c4c8bb..e7b8f2c89ccb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -27,6 +27,7 @@
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/crc32c.h> 29#include <linux/crc32c.h>
30#include <linux/slab.h>
30#include "compat.h" 31#include "compat.h"
31#include "ctree.h" 32#include "ctree.h"
32#include "disk-io.h" 33#include "disk-io.h"
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 101041d4d2b2..9e23ffea7f54 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -22,6 +22,7 @@
22#include <linux/sort.h> 22#include <linux/sort.h>
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h>
25#include "compat.h" 26#include "compat.h"
26#include "hash.h" 27#include "hash.h"
27#include "ctree.h" 28#include "ctree.h"
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fc742e59815e..d2d03684fab2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2,7 +2,6 @@
2#include <linux/slab.h> 2#include <linux/slab.h>
3#include <linux/bio.h> 3#include <linux/bio.h>
4#include <linux/mm.h> 4#include <linux/mm.h>
5#include <linux/gfp.h>
6#include <linux/pagemap.h> 5#include <linux/pagemap.h>
7#include <linux/page-flags.h> 6#include <linux/page-flags.h>
8#include <linux/module.h> 7#include <linux/module.h>
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 5a01f35507dd..454ca52d6451 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,5 +1,4 @@
1#include <linux/err.h> 1#include <linux/err.h>
2#include <linux/gfp.h>
3#include <linux/slab.h> 2#include <linux/slab.h>
4#include <linux/module.h> 3#include <linux/module.h>
5#include <linux/spinlock.h> 4#include <linux/spinlock.h>
@@ -242,7 +241,7 @@ out:
242 * Insert @em into @tree or perform a simple forward/backward merge with 241 * Insert @em into @tree or perform a simple forward/backward merge with
243 * existing mappings. The extent_map struct passed in will be inserted 242 * existing mappings. The extent_map struct passed in will be inserted
244 * into the tree directly, with an additional reference taken, or a 243 * into the tree directly, with an additional reference taken, or a
 245 * reference dropped if the merge attempt was sucessfull. 244 * reference dropped if the merge attempt was successful.
246 */ 245 */
247int add_extent_mapping(struct extent_map_tree *tree, 246int add_extent_mapping(struct extent_map_tree *tree,
248 struct extent_map *em) 247 struct extent_map *em)
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 9b99886562d0..54a255065aa3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/bio.h> 19#include <linux/bio.h>
20#include <linux/slab.h>
20#include <linux/pagemap.h> 21#include <linux/pagemap.h>
21#include <linux/highmem.h> 22#include <linux/highmem.h>
22#include "ctree.h" 23#include "ctree.h"
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d146dde7efb6..29ff749ff4ca 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -28,6 +28,7 @@
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/statfs.h> 29#include <linux/statfs.h>
30#include <linux/compat.h> 30#include <linux/compat.h>
31#include <linux/slab.h>
31#include "ctree.h" 32#include "ctree.h"
32#include "disk-io.h" 33#include "disk-io.h"
33#include "transaction.h" 34#include "transaction.h"
@@ -839,7 +840,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
839 unsigned long last_index; 840 unsigned long last_index;
840 int will_write; 841 int will_write;
841 842
842 will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) || 843 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
843 (file->f_flags & O_DIRECT)); 844 (file->f_flags & O_DIRECT));
844 845
845 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, 846 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
@@ -1006,7 +1007,7 @@ out_nolock:
1006 if (err) 1007 if (err)
1007 num_written = err; 1008 num_written = err;
1008 1009
1009 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { 1010 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1010 trans = btrfs_start_transaction(root, 1); 1011 trans = btrfs_start_transaction(root, 1);
1011 ret = btrfs_log_dentry_safe(trans, root, 1012 ret = btrfs_log_dentry_safe(trans, root,
1012 file->f_dentry); 1013 file->f_dentry);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index dd831ed31eea..f488fac04d99 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h>
21#include <linux/math64.h> 22#include <linux/math64.h>
22#include "ctree.h" 23#include "ctree.h"
23#include "free-space-cache.h" 24#include "free-space-cache.h"
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a85b90c86cb0..2bfdc641d4e3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -36,6 +36,7 @@
36#include <linux/xattr.h> 36#include <linux/xattr.h>
37#include <linux/posix_acl.h> 37#include <linux/posix_acl.h>
38#include <linux/falloc.h> 38#include <linux/falloc.h>
39#include <linux/slab.h>
39#include "compat.h" 40#include "compat.h"
40#include "ctree.h" 41#include "ctree.h"
41#include "disk-io.h" 42#include "disk-io.h"
@@ -3943,7 +3944,7 @@ err:
3943 return ret; 3944 return ret;
3944} 3945}
3945 3946
3946int btrfs_write_inode(struct inode *inode, int wait) 3947int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
3947{ 3948{
3948 struct btrfs_root *root = BTRFS_I(inode)->root; 3949 struct btrfs_root *root = BTRFS_I(inode)->root;
3949 struct btrfs_trans_handle *trans; 3950 struct btrfs_trans_handle *trans;
@@ -3952,7 +3953,7 @@ int btrfs_write_inode(struct inode *inode, int wait)
3952 if (root->fs_info->btree_inode == inode) 3953 if (root->fs_info->btree_inode == inode)
3953 return 0; 3954 return 0;
3954 3955
3955 if (wait) { 3956 if (wbc->sync_mode == WB_SYNC_ALL) {
3956 trans = btrfs_join_transaction(root, 1); 3957 trans = btrfs_join_transaction(root, 1);
3957 btrfs_set_trans_block_group(trans, inode); 3958 btrfs_set_trans_block_group(trans, inode);
3958 ret = btrfs_commit_transaction(trans, root); 3959 ret = btrfs_commit_transaction(trans, root);
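
->write_inode loses its 'int wait' flag here; integrity writeback is now signalled through wbc->sync_mode. A skeletal sketch of the new contract, with the two helpers being hypothetical placeholders rather than btrfs functions:

        static int write_inode_sketch(struct inode *inode,
                                      struct writeback_control *wbc)
        {
                if (wbc->sync_mode == WB_SYNC_ALL)
                        return commit_inode_now(inode);   /* hypothetical: sync commit */

                queue_inode_for_writeback(inode);         /* hypothetical: async path */
                return 0;
        }
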
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2b7dd88fc54f..e84ef60ffe35 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -39,6 +39,7 @@
39#include <linux/security.h> 39#include <linux/security.h>
40#include <linux/xattr.h> 40#include <linux/xattr.h>
41#include <linux/vmalloc.h> 41#include <linux/vmalloc.h>
42#include <linux/slab.h>
42#include "compat.h" 43#include "compat.h"
43#include "ctree.h" 44#include "ctree.h"
44#include "disk-io.h" 45#include "disk-io.h"
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1c36e5cd8f55..6151f2ea38bb 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/gfp.h>
20#include <linux/pagemap.h> 19#include <linux/pagemap.h>
21#include <linux/spinlock.h> 20#include <linux/spinlock.h>
22#include <linux/page-flags.h> 21#include <linux/page-flags.h>
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5c99882b9763..a127c0ebb2dc 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/gfp.h>
20#include <linux/slab.h> 19#include <linux/slab.h>
21#include <linux/blkdev.h> 20#include <linux/blkdev.h>
22#include <linux/writeback.h> 21#include <linux/writeback.h>
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index d0cc62bccb94..a97314cf6bd6 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include <linux/sort.h> 21#include <linux/sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "ref-cache.h" 23#include "ref-cache.h"
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0b23942cbc0d..e558dd941ded 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -21,6 +21,7 @@
21#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/rbtree.h> 23#include <linux/rbtree.h>
24#include <linux/slab.h>
24#include "ctree.h" 25#include "ctree.h"
25#include "disk-io.h" 26#include "disk-io.h"
26#include "transaction.h" 27#include "transaction.h"
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d11b12fc086b..1866dff0538e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -38,6 +38,7 @@
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/miscdevice.h> 39#include <linux/miscdevice.h>
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h>
41#include "compat.h" 42#include "compat.h"
42#include "ctree.h" 43#include "ctree.h"
43#include "disk-io.h" 44#include "disk-io.h"
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index a240b6fa81df..4ce16ef702a3 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -164,12 +164,12 @@ static void btrfs_root_release(struct kobject *kobj)
164 complete(&root->kobj_unregister); 164 complete(&root->kobj_unregister);
165} 165}
166 166
167static struct sysfs_ops btrfs_super_attr_ops = { 167static const struct sysfs_ops btrfs_super_attr_ops = {
168 .show = btrfs_super_attr_show, 168 .show = btrfs_super_attr_show,
169 .store = btrfs_super_attr_store, 169 .store = btrfs_super_attr_store,
170}; 170};
171 171
172static struct sysfs_ops btrfs_root_attr_ops = { 172static const struct sysfs_ops btrfs_root_attr_ops = {
173 .show = btrfs_root_attr_show, 173 .show = btrfs_root_attr_show,
174 .store = btrfs_root_attr_store, 174 .store = btrfs_root_attr_store,
175}; 175};
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 01cebd661997..2cb116099b90 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/slab.h>
20#include <linux/sched.h> 21#include <linux/sched.h>
21#include <linux/writeback.h> 22#include <linux/writeback.h>
22#include <linux/pagemap.h> 23#include <linux/pagemap.h>
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1255fcc8ade5..af57dd2b43d4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "transaction.h" 22#include "transaction.h"
22#include "disk-io.h" 23#include "disk-io.h"
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9bf1f581b872..aa7dc36dac78 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/bio.h> 19#include <linux/bio.h>
20#include <linux/slab.h>
20#include <linux/buffer_head.h> 21#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 22#include <linux/blkdev.h>
22#include <linux/random.h> 23#include <linux/random.h>
diff --git a/fs/buffer.c b/fs/buffer.c
index 6fa530256bfd..c9c266db0624 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2893,7 +2893,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2893 2893
2894 /* 2894 /*
2895 * The page straddles i_size. It must be zeroed out on each and every 2895 * The page straddles i_size. It must be zeroed out on each and every
2896 * writepage invokation because it may be mmapped. "A file is mapped 2896 * writepage invocation because it may be mmapped. "A file is mapped
2897 * in multiples of the page size. For a file that is not a multiple of 2897 * in multiples of the page size. For a file that is not a multiple of
2898 * the page size, the remaining memory is zeroed when mapped, and 2898 * the page size, the remaining memory is zeroed when mapped, and
2899 * writes to that region are not written out to the file." 2899 * writes to that region are not written out to the file."
@@ -3265,7 +3265,7 @@ static void recalc_bh_state(void)
3265 3265
3266struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) 3266struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3267{ 3267{
3268 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); 3268 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3269 if (ret) { 3269 if (ret) {
3270 INIT_LIST_HEAD(&ret->b_assoc_buffers); 3270 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3271 get_cpu_var(bh_accounting).nr++; 3271 get_cpu_var(bh_accounting).nr++;
@@ -3352,15 +3352,6 @@ int bh_submit_read(struct buffer_head *bh)
3352} 3352}
3353EXPORT_SYMBOL(bh_submit_read); 3353EXPORT_SYMBOL(bh_submit_read);
3354 3354
3355static void
3356init_buffer_head(void *data)
3357{
3358 struct buffer_head *bh = data;
3359
3360 memset(bh, 0, sizeof(*bh));
3361 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3362}
3363
3364void __init buffer_init(void) 3355void __init buffer_init(void)
3365{ 3356{
3366 int nrpages; 3357 int nrpages;
@@ -3369,7 +3360,7 @@ void __init buffer_init(void)
3369 sizeof(struct buffer_head), 0, 3360 sizeof(struct buffer_head), 0,
3370 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| 3361 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3371 SLAB_MEM_SPREAD), 3362 SLAB_MEM_SPREAD),
3372 init_buffer_head); 3363 NULL);
3373 3364
3374 /* 3365 /*
3375 * Limit the bh occupancy to 10% of ZONE_NORMAL 3366 * Limit the bh occupancy to 10% of ZONE_NORMAL
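
Dropping init_buffer_head() trades a slab constructor (which requires objects to be returned to the cache fully "constructed") for per-allocation zeroing. A sketch of the allocation side after the change, assuming a cache created with a NULL ctor as in buffer_init(); the wrapper name is illustrative:

        struct buffer_head *alloc_bh_sketch(struct kmem_cache *cachep, gfp_t gfp)
        {
                struct buffer_head *bh = kmem_cache_zalloc(cachep, gfp);

                if (bh)
                        INIT_LIST_HEAD(&bh->b_assoc_buffers); /* per-alloc init */
                return bh;
        }
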
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index 3797e0077b35..2906077ac798 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -84,7 +84,7 @@ int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
84static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) 84static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
85{ 85{
86 struct cachefiles_object *fsdef; 86 struct cachefiles_object *fsdef;
87 struct nameidata nd; 87 struct path path;
88 struct kstatfs stats; 88 struct kstatfs stats;
89 struct dentry *graveyard, *cachedir, *root; 89 struct dentry *graveyard, *cachedir, *root;
90 const struct cred *saved_cred; 90 const struct cred *saved_cred;
@@ -114,15 +114,12 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
114 _debug("- fsdef %p", fsdef); 114 _debug("- fsdef %p", fsdef);
115 115
116 /* look up the directory at the root of the cache */ 116 /* look up the directory at the root of the cache */
117 memset(&nd, 0, sizeof(nd)); 117 ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path);
118
119 ret = path_lookup(cache->rootdirname, LOOKUP_DIRECTORY, &nd);
120 if (ret < 0) 118 if (ret < 0)
121 goto error_open_root; 119 goto error_open_root;
122 120
123 cache->mnt = mntget(nd.path.mnt); 121 cache->mnt = path.mnt;
124 root = dget(nd.path.dentry); 122 root = path.dentry;
125 path_put(&nd.path);
126 123
127 /* check parameters */ 124 /* check parameters */
128 ret = -EOPNOTSUPP; 125 ret = -EOPNOTSUPP;
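
kern_path() hands back a struct path whose vfsmount and dentry references the caller owns outright, so the nameidata memset and the mntget/dget/path_put shuffle disappear. A sketch of the idiom under that assumption; the helper is illustrative:

        static int open_cache_root_sketch(const char *rootdirname,
                                          struct vfsmount **mntp,
                                          struct dentry **rootp)
        {
                struct path path;
                int ret = kern_path(rootdirname, LOOKUP_DIRECTORY, &path);

                if (ret < 0)
                        return ret;
                *mntp  = path.mnt;     /* keep the references kern_path took... */
                *rootp = path.dentry;  /* ...no extra get/put round trip needed */
                return 0;
        }
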
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 4618516dd994..c2413561ea75 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -21,6 +21,7 @@
21#include <linux/mount.h> 21#include <linux/mount.h>
22#include <linux/statfs.h> 22#include <linux/statfs.h>
23#include <linux/ctype.h> 23#include <linux/ctype.h>
24#include <linux/string.h>
24#include <linux/fs_struct.h> 25#include <linux/fs_struct.h>
25#include "internal.h" 26#include "internal.h"
26 27
@@ -257,8 +258,7 @@ static ssize_t cachefiles_daemon_write(struct file *file,
257 if (args == data) 258 if (args == data)
258 goto error; 259 goto error;
259 *args = '\0'; 260 *args = '\0';
260 for (args++; isspace(*args); args++) 261 args = skip_spaces(++args);
261 continue;
262 } 262 }
263 263
264 /* run the appropriate command handler */ 264 /* run the appropriate command handler */
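
skip_spaces() (from <linux/string.h>) returns a pointer past any leading whitespace, replacing the open-coded isspace() loop. A sketch of the keyword/argument split as it reads after the cleanup; the helper is illustrative:

        static char *split_args_sketch(char *data)
        {
                char *args = strchr(data, ' ');

                if (!args)
                        return NULL;       /* keyword only, no arguments */
                *args = '\0';              /* terminate the keyword */
                return skip_spaces(++args);
        }
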
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 27089311fbea..37fe101a4e0d 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -9,6 +9,7 @@
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/slab.h>
12#include <linux/mount.h> 13#include <linux/mount.h>
13#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
14#include "internal.h" 15#include "internal.h"
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 14ac4806e291..d5db84a1ee0d 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -19,6 +19,7 @@
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/namei.h> 20#include <linux/namei.h>
21#include <linux/security.h> 21#include <linux/security.h>
22#include <linux/slab.h>
22#include "internal.h" 23#include "internal.h"
23 24
24#define CACHEFILES_KEYBUF_SIZE 512 25#define CACHEFILES_KEYBUF_SIZE 512
@@ -348,7 +349,17 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
348 dir = dget_parent(object->dentry); 349 dir = dget_parent(object->dentry);
349 350
350 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 351 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
351 ret = cachefiles_bury_object(cache, dir, object->dentry); 352
353 /* we need to check that our parent is _still_ our parent - it may have
354 * been renamed */
355 if (dir == object->dentry->d_parent) {
356 ret = cachefiles_bury_object(cache, dir, object->dentry);
357 } else {
358 /* it got moved, presumably by cachefilesd culling it, so it's
359 * no longer in the key path and we can ignore it */
360 mutex_unlock(&dir->d_inode->i_mutex);
361 ret = 0;
362 }
352 363
353 dput(dir); 364 dput(dir);
354 _leave(" = %d", ret); 365 _leave(" = %d", ret);
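
The new parent re-check closes a race: between lookup and taking the parent's i_mutex, cachefilesd may cull the object, renaming the dentry into the graveyard, so the directory returned by dget_parent() can stop being the dentry's parent. A sketch of the pattern; my_bury_object() is a hypothetical stand-in that, like cachefiles_bury_object(), drops the lock itself:

        static int delete_under_parent_sketch(struct dentry *victim)
        {
                struct dentry *dir = dget_parent(victim);
                int ret = 0;

                mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
                if (dir == victim->d_parent)
                        ret = my_bury_object(dir, victim);    /* drops i_mutex */
                else
                        mutex_unlock(&dir->d_inode->i_mutex); /* culled; no-op */
                dput(dir);
                return ret;
        }
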
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index a6c8c6fe8df9..0f0d41fbb03f 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -10,8 +10,8 @@
10 */ 10 */
11 11
12#include <linux/mount.h> 12#include <linux/mount.h>
13#include <linux/slab.h>
13#include <linux/file.h> 14#include <linux/file.h>
14#include <linux/ima.h>
15#include "internal.h" 15#include "internal.h"
16 16
17/* 17/*
@@ -923,7 +923,6 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
923 if (IS_ERR(file)) { 923 if (IS_ERR(file)) {
924 ret = PTR_ERR(file); 924 ret = PTR_ERR(file);
925 } else { 925 } else {
926 ima_counts_get(file);
927 ret = -EIO; 926 ret = -EIO;
928 if (file->f_op->write) { 927 if (file->f_op->write) {
929 pos = (loff_t) page->index << PAGE_SHIFT; 928 pos = (loff_t) page->index << PAGE_SHIFT;
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index f3e7a0bf068b..e18b183b47e1 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -16,6 +16,7 @@
16#include <linux/fsnotify.h> 16#include <linux/fsnotify.h>
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/xattr.h> 18#include <linux/xattr.h>
19#include <linux/slab.h>
19#include "internal.h" 20#include "internal.h"
20 21
21static const char cachefiles_xattr_cache[] = 22static const char cachefiles_xattr_cache[] =
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
new file mode 100644
index 000000000000..04b8280582a9
--- /dev/null
+++ b/fs/ceph/Kconfig
@@ -0,0 +1,27 @@
1config CEPH_FS
2 tristate "Ceph distributed file system (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL
4 select LIBCRC32C
 5 select CRYPTO_AES
6 help
7 Choose Y or M here to include support for mounting the
8 experimental Ceph distributed file system. Ceph is an extremely
9 scalable file system designed to provide high performance,
10 reliable access to petabytes of storage.
11
12 More information at http://ceph.newdream.net/.
13
14 If unsure, say N.
15
16config CEPH_FS_PRETTYDEBUG
17 bool "Include file:line in ceph debug output"
18 depends on CEPH_FS
19 default n
20 help
21 If you say Y here, debug output will include a filename and
 22 line to aid debugging. This increases kernel size and slows
23 execution slightly when debug call sites are enabled (e.g.,
24 via CONFIG_DYNAMIC_DEBUG).
25
26 If unsure, say N.
27
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
new file mode 100644
index 000000000000..6a660e610be8
--- /dev/null
+++ b/fs/ceph/Makefile
@@ -0,0 +1,39 @@
1#
2# Makefile for CEPH filesystem.
3#
4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_FS) += ceph.o
8
9ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \
12 mds_client.o mdsmap.o \
13 mon_client.o \
14 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
15 debugfs.o \
16 auth.o auth_none.o \
17 crypto.o armor.o \
18 auth_x.o \
19 ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
20
21else
22#Otherwise we were called directly from the command
23# line; invoke the kernel build system.
24
25KERNELDIR ?= /lib/modules/$(shell uname -r)/build
26PWD := $(shell pwd)
27
28default: all
29
30all:
31 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
32
33modules_install:
34 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
35
36clean:
37 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
38
39endif
diff --git a/fs/ceph/README b/fs/ceph/README
new file mode 100644
index 000000000000..18352fab37c0
--- /dev/null
+++ b/fs/ceph/README
@@ -0,0 +1,20 @@
1#
2# The following files are shared by (and manually synchronized
3# between) the Ceph userland and kernel client.
4#
5# userland kernel
6src/include/ceph_fs.h fs/ceph/ceph_fs.h
7src/include/ceph_fs.cc fs/ceph/ceph_fs.c
8src/include/msgr.h fs/ceph/msgr.h
9src/include/rados.h fs/ceph/rados.h
10src/include/ceph_strings.cc fs/ceph/ceph_strings.c
11src/include/ceph_frag.h fs/ceph/ceph_frag.h
12src/include/ceph_frag.cc fs/ceph/ceph_frag.c
13src/include/ceph_hash.h fs/ceph/ceph_hash.h
14src/include/ceph_hash.cc fs/ceph/ceph_hash.c
15src/crush/crush.c fs/ceph/crush/crush.c
16src/crush/crush.h fs/ceph/crush/crush.h
17src/crush/mapper.c fs/ceph/crush/mapper.c
18src/crush/mapper.h fs/ceph/crush/mapper.h
19src/crush/hash.h fs/ceph/crush/hash.h
20src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
new file mode 100644
index 000000000000..aa3cd7cc3e40
--- /dev/null
+++ b/fs/ceph/addr.c
@@ -0,0 +1,1195 @@
1#include "ceph_debug.h"
2
3#include <linux/backing-dev.h>
4#include <linux/fs.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/writeback.h> /* generic_writepages */
8#include <linux/slab.h>
9#include <linux/pagevec.h>
10#include <linux/task_io_accounting_ops.h>
11
12#include "super.h"
13#include "osd_client.h"
14
15/*
16 * Ceph address space ops.
17 *
18 * There are a few funny things going on here.
19 *
20 * The page->private field is used to reference a struct
21 * ceph_snap_context for _every_ dirty page. This indicates which
22 * snapshot the page was logically dirtied in, and thus which snap
23 * context needs to be associated with the osd write during writeback.
24 *
25 * Similarly, struct ceph_inode_info maintains a set of counters to
 26 * count dirty pages on the inode. In the absence of snapshots,
27 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
28 *
29 * When a snapshot is taken (that is, when the client receives
30 * notification that a snapshot was taken), each inode with caps and
31 * with dirty pages (dirty pages implies there is a cap) gets a new
32 * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
33 * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
34 * moved to capsnap->dirty. (Unless a sync write is currently in
35 * progress. In that case, the capsnap is said to be "pending", new
36 * writes cannot start, and the capsnap isn't "finalized" until the
37 * write completes (or fails) and a final size/mtime for the inode for
38 * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
39 *
40 * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
41 * we look for the first capsnap in i_cap_snaps and write out pages in
42 * that snap context _only_. Then we move on to the next capsnap,
43 * eventually reaching the "live" or "head" context (i.e., pages that
44 * are not yet snapped) and are writing the most recently dirtied
45 * pages.
46 *
47 * Invalidate and so forth must take care to ensure the dirty page
48 * accounting is preserved.
49 */
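        /*
         * A miniature of the accounting described above (illustrative
         * summary, in sketch form; names mirror struct ceph_inode_info):
         *
         *   dirty a page:   ci->i_wrbuffer_ref++; ci->i_wrbuffer_ref_head++;
         *   take snapshot:  capsnap->dirty = ci->i_wrbuffer_ref_head;
         *                   ci->i_wrbuffer_ref_head = 0;
         *   writeback:      flush pages of the oldest capsnap with
         *                   dirty_pages != 0 first, then the head context.
         */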
50
51#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
52#define CONGESTION_OFF_THRESH(congestion_kb) \
53 (CONGESTION_ON_THRESH(congestion_kb) - \
54 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
55
56
57
58/*
59 * Dirty a page. Optimistically adjust accounting, on the assumption
60 * that we won't race with invalidate. If we do, readjust.
61 */
62static int ceph_set_page_dirty(struct page *page)
63{
64 struct address_space *mapping = page->mapping;
65 struct inode *inode;
66 struct ceph_inode_info *ci;
67 int undo = 0;
68 struct ceph_snap_context *snapc;
69
70 if (unlikely(!mapping))
71 return !TestSetPageDirty(page);
72
73 if (TestSetPageDirty(page)) {
74 dout("%p set_page_dirty %p idx %lu -- already dirty\n",
75 mapping->host, page, page->index);
76 return 0;
77 }
78
79 inode = mapping->host;
80 ci = ceph_inode(inode);
81
82 /*
83 * Note that we're grabbing a snapc ref here without holding
84 * any locks!
85 */
86 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
87
88 /* dirty the head */
89 spin_lock(&inode->i_lock);
90 if (ci->i_wrbuffer_ref_head == 0)
91 ci->i_head_snapc = ceph_get_snap_context(snapc);
92 ++ci->i_wrbuffer_ref_head;
93 if (ci->i_wrbuffer_ref == 0)
94 igrab(inode);
95 ++ci->i_wrbuffer_ref;
96 dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
97 "snapc %p seq %lld (%d snaps)\n",
98 mapping->host, page, page->index,
99 ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
100 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
101 snapc, snapc->seq, snapc->num_snaps);
102 spin_unlock(&inode->i_lock);
103
104 /* now adjust page */
105 spin_lock_irq(&mapping->tree_lock);
106 if (page->mapping) { /* Race with truncate? */
107 WARN_ON_ONCE(!PageUptodate(page));
108
109 if (mapping_cap_account_dirty(mapping)) {
110 __inc_zone_page_state(page, NR_FILE_DIRTY);
111 __inc_bdi_stat(mapping->backing_dev_info,
112 BDI_RECLAIMABLE);
113 task_io_account_write(PAGE_CACHE_SIZE);
114 }
115 radix_tree_tag_set(&mapping->page_tree,
116 page_index(page), PAGECACHE_TAG_DIRTY);
117
118 /*
119 * Reference snap context in page->private. Also set
120 * PagePrivate so that we get invalidatepage callback.
121 */
122 page->private = (unsigned long)snapc;
123 SetPagePrivate(page);
124 } else {
125 dout("ANON set_page_dirty %p (raced truncate?)\n", page);
126 undo = 1;
127 }
128
129 spin_unlock_irq(&mapping->tree_lock);
130
131 if (undo)
132 /* whoops, we failed to dirty the page */
133 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
134
135 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
136
137 BUG_ON(!PageDirty(page));
138 return 1;
139}
140
141/*
142 * If we are truncating the full page (i.e. offset == 0), adjust the
143 * dirty page counters appropriately. Only called if there is private
144 * data on the page.
145 */
146static void ceph_invalidatepage(struct page *page, unsigned long offset)
147{
148 struct inode *inode;
149 struct ceph_inode_info *ci;
150 struct ceph_snap_context *snapc = (void *)page->private;
151
152 BUG_ON(!PageLocked(page));
153 BUG_ON(!page->private);
154 BUG_ON(!PagePrivate(page));
155 BUG_ON(!page->mapping);
156
157 inode = page->mapping->host;
158
159 /*
160 * We can get non-dirty pages here due to races between
161 * set_page_dirty and truncate_complete_page; just spit out a
162 * warning, in case we end up with accounting problems later.
163 */
164 if (!PageDirty(page))
165 pr_err("%p invalidatepage %p page not dirty\n", inode, page);
166
167 if (offset == 0)
168 ClearPageChecked(page);
169
170 ci = ceph_inode(inode);
171 if (offset == 0) {
172 dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
173 inode, page, page->index, offset);
174 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
175 ceph_put_snap_context(snapc);
176 page->private = 0;
177 ClearPagePrivate(page);
178 } else {
179 dout("%p invalidatepage %p idx %lu partial dirty page\n",
180 inode, page, page->index);
181 }
182}
183
184/* just a sanity check */
185static int ceph_releasepage(struct page *page, gfp_t g)
186{
187 struct inode *inode = page->mapping ? page->mapping->host : NULL;
188 dout("%p releasepage %p idx %lu\n", inode, page, page->index);
189 WARN_ON(PageDirty(page));
190 WARN_ON(page->private);
191 WARN_ON(PagePrivate(page));
192 return 0;
193}
194
195/*
196 * read a single page, without unlocking it.
197 */
198static int readpage_nounlock(struct file *filp, struct page *page)
199{
200 struct inode *inode = filp->f_dentry->d_inode;
201 struct ceph_inode_info *ci = ceph_inode(inode);
202 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
203 int err = 0;
204 u64 len = PAGE_CACHE_SIZE;
205
206 dout("readpage inode %p file %p page %p index %lu\n",
207 inode, filp, page, page->index);
208 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
209 page->index << PAGE_CACHE_SHIFT, &len,
210 ci->i_truncate_seq, ci->i_truncate_size,
211 &page, 1);
212 if (err == -ENOENT)
213 err = 0;
214 if (err < 0) {
215 SetPageError(page);
216 goto out;
217 } else if (err < PAGE_CACHE_SIZE) {
218 /* zero fill remainder of page */
219 zero_user_segment(page, err, PAGE_CACHE_SIZE);
220 }
221 SetPageUptodate(page);
222
223out:
224 return err < 0 ? err : 0;
225}
226
227static int ceph_readpage(struct file *filp, struct page *page)
228{
229 int r = readpage_nounlock(filp, page);
230 unlock_page(page);
231 return r;
232}
233
234/*
235 * Build a vector of contiguous pages from the provided page list.
236 */
237static struct page **page_vector_from_list(struct list_head *page_list,
238 unsigned *nr_pages)
239{
240 struct page **pages;
241 struct page *page;
242 int next_index, contig_pages = 0;
243
244 /* build page vector */
245 pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
246 if (!pages)
247 return ERR_PTR(-ENOMEM);
248
249 BUG_ON(list_empty(page_list));
250 next_index = list_entry(page_list->prev, struct page, lru)->index;
251 list_for_each_entry_reverse(page, page_list, lru) {
252 if (page->index == next_index) {
253 dout("readpages page %d %p\n", contig_pages, page);
254 pages[contig_pages] = page;
255 contig_pages++;
256 next_index++;
257 } else {
258 break;
259 }
260 }
261 *nr_pages = contig_pages;
262 return pages;
263}
264
265/*
266 * Read multiple pages. Leave pages we don't read + unlock in page_list;
267 * the caller (VM) cleans them up.
268 */
269static int ceph_readpages(struct file *file, struct address_space *mapping,
270 struct list_head *page_list, unsigned nr_pages)
271{
272 struct inode *inode = file->f_dentry->d_inode;
273 struct ceph_inode_info *ci = ceph_inode(inode);
274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
275 int rc = 0;
276 struct page **pages;
277 struct pagevec pvec;
278 loff_t offset;
279 u64 len;
280
281 dout("readpages %p file %p nr_pages %d\n",
282 inode, file, nr_pages);
283
284 pages = page_vector_from_list(page_list, &nr_pages);
285 if (IS_ERR(pages))
286 return PTR_ERR(pages);
287
288 /* guess read extent */
289 offset = pages[0]->index << PAGE_CACHE_SHIFT;
290 len = nr_pages << PAGE_CACHE_SHIFT;
291 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
292 offset, &len,
293 ci->i_truncate_seq, ci->i_truncate_size,
294 pages, nr_pages);
295 if (rc == -ENOENT)
296 rc = 0;
297 if (rc < 0)
298 goto out;
299
300 /* set uptodate and add to lru in pagevec-sized chunks */
301 pagevec_init(&pvec, 0);
302 for (; !list_empty(page_list) && len > 0;
303 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
304 struct page *page =
305 list_entry(page_list->prev, struct page, lru);
306
307 list_del(&page->lru);
308
309 if (rc < (int)PAGE_CACHE_SIZE) {
310 /* zero (remainder of) page */
311 int s = rc < 0 ? 0 : rc;
312 zero_user_segment(page, s, PAGE_CACHE_SIZE);
313 }
314
315 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
316 page_cache_release(page);
317 dout("readpages %p add_to_page_cache failed %p\n",
318 inode, page);
319 continue;
320 }
321 dout("readpages %p adding %p idx %lu\n", inode, page,
322 page->index);
323 flush_dcache_page(page);
324 SetPageUptodate(page);
325 unlock_page(page);
326 if (pagevec_add(&pvec, page) == 0)
327 pagevec_lru_add_file(&pvec); /* add to lru */
328 }
329 pagevec_lru_add_file(&pvec);
330 rc = 0;
331
332out:
333 kfree(pages);
334 return rc;
335}
336
337/*
338 * Get ref for the oldest snapc for an inode with dirty data... that is, the
339 * only snap context we are allowed to write back.
340 *
341 * Caller holds i_lock.
342 */
343static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
344 u64 *snap_size)
345{
346 struct ceph_inode_info *ci = ceph_inode(inode);
347 struct ceph_snap_context *snapc = NULL;
348 struct ceph_cap_snap *capsnap = NULL;
349
350 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
351 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
352 capsnap->context, capsnap->dirty_pages);
353 if (capsnap->dirty_pages) {
354 snapc = ceph_get_snap_context(capsnap->context);
355 if (snap_size)
356 *snap_size = capsnap->size;
357 break;
358 }
359 }
360 if (!snapc && ci->i_snap_realm) {
361 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
362 dout(" head snapc %p has %d dirty pages\n",
363 snapc, ci->i_wrbuffer_ref_head);
364 }
365 return snapc;
366}
367
368static struct ceph_snap_context *get_oldest_context(struct inode *inode,
369 u64 *snap_size)
370{
371 struct ceph_snap_context *snapc = NULL;
372
373 spin_lock(&inode->i_lock);
374 snapc = __get_oldest_context(inode, snap_size);
375 spin_unlock(&inode->i_lock);
376 return snapc;
377}
378
379/*
380 * Write a single page, but leave the page locked.
381 *
382 * If we get a write error, set the page error bit, but still adjust the
383 * dirty page accounting (i.e., page is no longer dirty).
384 */
385static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
386{
387 struct inode *inode;
388 struct ceph_inode_info *ci;
389 struct ceph_client *client;
390 struct ceph_osd_client *osdc;
391 loff_t page_off = page->index << PAGE_CACHE_SHIFT;
392 int len = PAGE_CACHE_SIZE;
393 loff_t i_size;
394 int err = 0;
395 struct ceph_snap_context *snapc;
396 u64 snap_size = 0;
397 long writeback_stat;
398
399 dout("writepage %p idx %lu\n", page, page->index);
400
401 if (!page->mapping || !page->mapping->host) {
402 dout("writepage %p - no mapping\n", page);
403 return -EFAULT;
404 }
405 inode = page->mapping->host;
406 ci = ceph_inode(inode);
407 client = ceph_inode_to_client(inode);
408 osdc = &client->osdc;
409
410 /* verify this is a writeable snap context */
411 snapc = (void *)page->private;
412 if (snapc == NULL) {
413 dout("writepage %p page %p not dirty?\n", inode, page);
414 goto out;
415 }
416 if (snapc != get_oldest_context(inode, &snap_size)) {
417 dout("writepage %p page %p snapc %p not writeable - noop\n",
418 inode, page, (void *)page->private);
419 /* we should only noop if called by kswapd */
420 WARN_ON((current->flags & PF_MEMALLOC) == 0);
421 goto out;
422 }
423
424 /* is this a partial page at end of file? */
425 if (snap_size)
426 i_size = snap_size;
427 else
428 i_size = i_size_read(inode);
429 if (i_size < page_off + len)
430 len = i_size - page_off;
431
432 dout("writepage %p page %p index %lu on %llu~%u\n",
433 inode, page, page->index, page_off, len);
434
435 writeback_stat = atomic_long_inc_return(&client->writeback_count);
436 if (writeback_stat >
437 CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
438 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
439
440 set_page_writeback(page);
441 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
442 &ci->i_layout, snapc,
443 page_off, len,
444 ci->i_truncate_seq, ci->i_truncate_size,
445 &inode->i_mtime,
446 &page, 1, 0, 0, true);
447 if (err < 0) {
448 dout("writepage setting page/mapping error %d %p\n", err, page);
449 SetPageError(page);
450 mapping_set_error(&inode->i_data, err);
451 if (wbc)
452 wbc->pages_skipped++;
453 } else {
454 dout("writepage cleaned page %p\n", page);
455 err = 0; /* vfs expects us to return 0 */
456 }
457 page->private = 0;
458 ClearPagePrivate(page);
459 end_page_writeback(page);
460 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
461 ceph_put_snap_context(snapc);
462out:
463 return err;
464}
465
466static int ceph_writepage(struct page *page, struct writeback_control *wbc)
467{
468 int err;
469 struct inode *inode = page->mapping->host;
470 BUG_ON(!inode);
471 igrab(inode);
472 err = writepage_nounlock(page, wbc);
473 unlock_page(page);
474 iput(inode);
475 return err;
476}
477
478
479/*
480 * lame release_pages helper. release_pages() isn't exported to
481 * modules.
482 */
483static void ceph_release_pages(struct page **pages, int num)
484{
485 struct pagevec pvec;
486 int i;
487
488 pagevec_init(&pvec, 0);
489 for (i = 0; i < num; i++) {
490 if (pagevec_add(&pvec, pages[i]) == 0)
491 pagevec_release(&pvec);
492 }
493 pagevec_release(&pvec);
494}
495
496
497/*
498 * async writeback completion handler.
499 *
500 * If we get an error, set the mapping error bit, but not the individual
501 * page error bits.
502 */
503static void writepages_finish(struct ceph_osd_request *req,
504 struct ceph_msg *msg)
505{
506 struct inode *inode = req->r_inode;
507 struct ceph_osd_reply_head *replyhead;
508 struct ceph_osd_op *op;
509 struct ceph_inode_info *ci = ceph_inode(inode);
510 unsigned wrote;
511 struct page *page;
512 int i;
513 struct ceph_snap_context *snapc = req->r_snapc;
514 struct address_space *mapping = inode->i_mapping;
515 struct writeback_control *wbc = req->r_wbc;
516 __s32 rc = -EIO;
517 u64 bytes = 0;
518 struct ceph_client *client = ceph_inode_to_client(inode);
519 long writeback_stat;
520 unsigned issued = __ceph_caps_issued(ci, NULL);
521
522 /* parse reply */
523 replyhead = msg->front.iov_base;
524 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
525 op = (void *)(replyhead + 1);
526 rc = le32_to_cpu(replyhead->result);
527 bytes = le64_to_cpu(op->extent.length);
528
529 if (rc >= 0) {
530 /*
531 * Assume we wrote the pages we originally sent. The
532 * osd might reply with fewer pages if our writeback
533 * raced with a truncation and was adjusted at the osd,
534 * so don't believe the reply.
535 */
536 wrote = req->r_num_pages;
537 } else {
538 wrote = 0;
539 mapping_set_error(mapping, rc);
540 }
541 dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
542 inode, rc, bytes, wrote);
543
544 /* clean all pages */
545 for (i = 0; i < req->r_num_pages; i++) {
546 page = req->r_pages[i];
547 BUG_ON(!page);
548 WARN_ON(!PageUptodate(page));
549
550 writeback_stat =
551 atomic_long_dec_return(&client->writeback_count);
552 if (writeback_stat <
553 CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
554 clear_bdi_congested(&client->backing_dev_info,
555 BLK_RW_ASYNC);
556
557 if (i >= wrote) {
558 dout("inode %p skipping page %p\n", inode, page);
559 wbc->pages_skipped++;
560 }
561 page->private = 0;
562 ClearPagePrivate(page);
563 ceph_put_snap_context(snapc);
564 dout("unlocking %d %p\n", i, page);
565 end_page_writeback(page);
566
567 /*
568 * We lost the cache cap, need to truncate the page before
569 * it is unlocked, otherwise we'd truncate it later in the
570 * page truncation thread, possibly losing some data that
571 * raced its way in
572 */
573 if ((issued & CEPH_CAP_FILE_CACHE) == 0)
574 generic_error_remove_page(inode->i_mapping, page);
575
576 unlock_page(page);
577 }
578 dout("%p wrote+cleaned %d pages\n", inode, wrote);
579 ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
580
581 ceph_release_pages(req->r_pages, req->r_num_pages);
582 if (req->r_pages_from_pool)
583 mempool_free(req->r_pages,
584 ceph_client(inode->i_sb)->wb_pagevec_pool);
585 else
586 kfree(req->r_pages);
587 ceph_osdc_put_request(req);
588}
589
590/*
591 * allocate a page vec, either directly or, if necessary, via the
592 * mempool. we avoid the mempool when we can because req->r_num_pages
593 * may be less than the maximum write size.
594 */
595static void alloc_page_vec(struct ceph_client *client,
596 struct ceph_osd_request *req)
597{
598 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
599 GFP_NOFS);
600 if (!req->r_pages) {
601 req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
602 req->r_pages_from_pool = 1;
603 WARN_ON(!req->r_pages);
604 }
605}
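/*
 * The pool alloc_page_vec() falls back on must be sized for the
 * largest possible request.  A plausible mount-time setup, shown only
 * as a sketch (the real call lives in super.c, and the reserve count
 * of 10 is an assumption):
 */
#if 0
client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
	(client->mount_args->wsize >> PAGE_CACHE_SHIFT) *
	sizeof(struct page *));
#endif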
606
607/*
608 * initiate async writeback
609 */
610static int ceph_writepages_start(struct address_space *mapping,
611 struct writeback_control *wbc)
612{
613 struct inode *inode = mapping->host;
614 struct backing_dev_info *bdi = mapping->backing_dev_info;
615 struct ceph_inode_info *ci = ceph_inode(inode);
616 struct ceph_client *client;
617 pgoff_t index, start, end;
618 int range_whole = 0;
619 int should_loop = 1;
620 pgoff_t max_pages = 0, max_pages_ever = 0;
621 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL;
622 struct pagevec pvec;
623 int done = 0;
624 int rc = 0;
625 unsigned wsize = 1 << inode->i_blkbits;
626 struct ceph_osd_request *req = NULL;
627 int do_sync;
628 u64 snap_size = 0;
629
630 /*
631 * Include a 'sync' in the OSD request if this is a data
632 * integrity write (e.g., O_SYNC write or fsync()), or if our
633 * cap is being revoked.
634 */
635 do_sync = wbc->sync_mode == WB_SYNC_ALL;
636 if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
637 do_sync = 1;
638 dout("writepages_start %p dosync=%d (mode=%s)\n",
639 inode, do_sync,
640 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
641 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
642
643 client = ceph_inode_to_client(inode);
644 if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
645 pr_warning("writepage_start %p on forced umount\n", inode);
646 return -EIO; /* we're in a forced umount, don't write! */
647 }
648 if (client->mount_args->wsize && client->mount_args->wsize < wsize)
649 wsize = client->mount_args->wsize;
650 if (wsize < PAGE_CACHE_SIZE)
651 wsize = PAGE_CACHE_SIZE;
652 max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
653
654 pagevec_init(&pvec, 0);
655
656 /* bail early if the bdi is congested and the caller can't block */
657 if (wbc->nonblocking && bdi_write_congested(bdi)) {
658 dout(" writepages congested\n");
659 wbc->encountered_congestion = 1;
660 goto out_final;
661 }
662
663 /* where to start/end? */
664 if (wbc->range_cyclic) {
665 start = mapping->writeback_index; /* Start from prev offset */
666 end = -1;
667 dout(" cyclic, start at %lu\n", start);
668 } else {
669 start = wbc->range_start >> PAGE_CACHE_SHIFT;
670 end = wbc->range_end >> PAGE_CACHE_SHIFT;
671 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
672 range_whole = 1;
673 should_loop = 0;
674 dout(" not cyclic, %lu to %lu\n", start, end);
675 }
676 index = start;
677
678retry:
679 /* find oldest snap context with dirty data */
680 ceph_put_snap_context(snapc);
681 snapc = get_oldest_context(inode, &snap_size);
682 if (!snapc) {
683 /* hmm, why does writepages get called when there
684 is no dirty data? */
685 dout(" no snap context with dirty data?\n");
686 goto out;
687 }
688 dout(" oldest snapc is %p seq %lld (%d snaps)\n",
689 snapc, snapc->seq, snapc->num_snaps);
690 if (last_snapc && snapc != last_snapc) {
691 /* if we switched to a newer snapc, restart our scan at the
692 * start of the original file range. */
693 dout(" snapc differs from last pass, restarting at %lu\n",
694 index);
695 index = start;
696 }
697 last_snapc = snapc;
698
699 while (!done && index <= end) {
700 unsigned i;
701 int first;
702 pgoff_t next;
703 int pvec_pages, locked_pages;
704 struct page *page;
705 int want;
706 u64 offset, len;
707 struct ceph_osd_request_head *reqhead;
708 struct ceph_osd_op *op;
709 long writeback_stat;
710
711 next = 0;
712 locked_pages = 0;
713 max_pages = max_pages_ever;
714
715get_more_pages:
716 first = -1;
717 want = min(end - index,
718 min((pgoff_t)PAGEVEC_SIZE,
719 max_pages - (pgoff_t)locked_pages) - 1)
720 + 1;
721 pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
722 PAGECACHE_TAG_DIRTY,
723 want);
724 dout("pagevec_lookup_tag got %d\n", pvec_pages);
725 if (!pvec_pages && !locked_pages)
726 break;
727 for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
728 page = pvec.pages[i];
729 dout("? %p idx %lu\n", page, page->index);
730 if (locked_pages == 0)
731 lock_page(page); /* first page */
732 else if (!trylock_page(page))
733 break;
734
735 /* only dirty pages, or our accounting breaks */
736 if (unlikely(!PageDirty(page)) ||
737 unlikely(page->mapping != mapping)) {
738 dout("!dirty or !mapping %p\n", page);
739 unlock_page(page);
740 break;
741 }
742 if (!wbc->range_cyclic && page->index > end) {
743 dout("end of range %p\n", page);
744 done = 1;
745 unlock_page(page);
746 break;
747 }
748 if (next && (page->index != next)) {
749 dout("not consecutive %p\n", page);
750 unlock_page(page);
751 break;
752 }
753 if (wbc->sync_mode != WB_SYNC_NONE) {
754 dout("waiting on writeback %p\n", page);
755 wait_on_page_writeback(page);
756 }
757 if ((snap_size && page_offset(page) > snap_size) ||
758 (!snap_size &&
759 page_offset(page) > i_size_read(inode))) {
760 dout("%p page eof %llu\n", page, snap_size ?
761 snap_size : i_size_read(inode));
762 done = 1;
763 unlock_page(page);
764 break;
765 }
766 if (PageWriteback(page)) {
767 dout("%p under writeback\n", page);
768 unlock_page(page);
769 break;
770 }
771
772 /* only if matching snap context */
773 if (snapc != (void *)page->private) {
774 dout("page snapc %p != oldest %p\n",
775 (void *)page->private, snapc);
776 unlock_page(page);
777 if (!locked_pages)
778 continue; /* keep looking for snap */
779 break;
780 }
781
782 if (!clear_page_dirty_for_io(page)) {
783 dout("%p !clear_page_dirty_for_io\n", page);
784 unlock_page(page);
785 break;
786 }
787
788 /* ok */
789 if (locked_pages == 0) {
790 /* prepare async write request */
791 offset = (u64)page->index << PAGE_CACHE_SHIFT;
792 len = wsize;
793 req = ceph_osdc_new_request(&client->osdc,
794 &ci->i_layout,
795 ceph_vino(inode),
796 offset, &len,
797 CEPH_OSD_OP_WRITE,
798 CEPH_OSD_FLAG_WRITE |
799 CEPH_OSD_FLAG_ONDISK,
800 snapc, do_sync,
801 ci->i_truncate_seq,
802 ci->i_truncate_size,
803 &inode->i_mtime, true, 1);
804 max_pages = req->r_num_pages;
805
806 alloc_page_vec(client, req);
807 req->r_callback = writepages_finish;
808 req->r_inode = inode;
809 req->r_wbc = wbc;
810 }
811
812 /* note position of first page in pvec */
813 if (first < 0)
814 first = i;
815 dout("%p will write page %p idx %lu\n",
816 inode, page, page->index);
817
818 writeback_stat = atomic_long_inc_return(&client->writeback_count);
819 if (writeback_stat >
820 CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
821 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
822
823 set_page_writeback(page);
824 req->r_pages[locked_pages] = page;
825 locked_pages++;
826 next = page->index + 1;
827 }
828
829 /* did we get anything? */
830 if (!locked_pages)
831 goto release_pvec_pages;
832 if (i) {
833 int j;
834 BUG_ON(!locked_pages || first < 0);
835
836 if (pvec_pages && i == pvec_pages &&
837 locked_pages < max_pages) {
838 dout("reached end pvec, trying for more\n");
839 pagevec_reinit(&pvec);
840 goto get_more_pages;
841 }
842
843 /* shift unused pages over in the pvec... we
844 * will need to release them below. */
845 for (j = i; j < pvec_pages; j++) {
846 dout(" pvec leftover page %p\n",
847 pvec.pages[j]);
848 pvec.pages[j-i+first] = pvec.pages[j];
849 }
850 pvec.nr -= i-first;
851 }
852
853 /* submit the write */
854 offset = (u64)req->r_pages[0]->index << PAGE_CACHE_SHIFT;
855 len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
856 (u64)locked_pages << PAGE_CACHE_SHIFT);
857 dout("writepages got %d pages at %llu~%llu\n",
858 locked_pages, offset, len);
859
860 /* revise final length, page count */
861 req->r_num_pages = locked_pages;
862 reqhead = req->r_request->front.iov_base;
863 op = (void *)(reqhead + 1);
864 op->extent.length = cpu_to_le64(len);
865 op->payload_len = cpu_to_le32(len);
866 req->r_request->hdr.data_len = cpu_to_le32(len);
867
868 ceph_osdc_start_request(&client->osdc, req, true);
869 req = NULL;
870
871 /* continue? */
872 index = next;
873 wbc->nr_to_write -= locked_pages;
874 if (wbc->nr_to_write <= 0)
875 done = 1;
876
877release_pvec_pages:
878 dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
879 pvec.nr ? pvec.pages[0] : NULL);
880 pagevec_release(&pvec);
881
882 if (locked_pages && !done)
883 goto retry;
884 }
885
886 if (should_loop && !done) {
887 /* more to do; loop back to beginning of file */
888 dout("writepages looping back to beginning of file\n");
889 should_loop = 0;
890 index = 0;
891 goto retry;
892 }
893
894 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
895 mapping->writeback_index = index;
896
897out:
898 if (req)
899 ceph_osdc_put_request(req);
900 if (rc > 0)
901 rc = 0; /* vfs expects us to return 0 */
902 ceph_put_snap_context(snapc);
903 dout("writepages done, rc = %d\n", rc);
904out_final:
905 return rc;
906}
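/*
 * Worked example of the sizing logic above, assuming the default file
 * layout of 4 MB objects (i_blkbits = 22) and 4 KB pages:
 *
 *   wsize          = 1 << 22                   = 4194304 bytes
 *   max_pages_ever = wsize >> PAGE_CACHE_SHIFT = 1024 pages
 *
 * so each async OSD write gathers at most 1024 consecutive dirty
 * pages; a smaller mount-time wsize only shrinks that bound, and it
 * is never allowed to drop below a single page.
 */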
907
908
909
910/*
911 * See if a given @snapc is either writeable, or already written.
912 */
913static int context_is_writeable_or_written(struct inode *inode,
914 struct ceph_snap_context *snapc)
915{
916 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
917 int ret = !oldest || snapc->seq <= oldest->seq;
918 ceph_put_snap_context(oldest); /* get_oldest_context took a ref */
919 return ret;
920}
919
920/*
921 * We are only allowed to write into/dirty the page if the page is
922 * clean, or already dirty within the same snap context.
923 *
924 * called with page locked.
925 * return success with page locked,
926 * or any failure (incl -EAGAIN) with page unlocked.
927 */
928static int ceph_update_writeable_page(struct file *file,
929 loff_t pos, unsigned len,
930 struct page *page)
931{
932 struct inode *inode = file->f_dentry->d_inode;
933 struct ceph_inode_info *ci = ceph_inode(inode);
934 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
935 loff_t page_off = pos & PAGE_CACHE_MASK;
936 int pos_in_page = pos & ~PAGE_CACHE_MASK;
937 int end_in_page = pos_in_page + len;
938 loff_t i_size;
939 struct ceph_snap_context *snapc;
940 int r;
941
942retry_locked:
943 /* wait for writeback (writepages holds the page lock now, but that may change) */
944 wait_on_page_writeback(page);
945
946 /* check snap context */
947 BUG_ON(!ci->i_snap_realm);
948 down_read(&mdsc->snap_rwsem);
949 BUG_ON(!ci->i_snap_realm->cached_context);
950 if (page->private &&
951 (void *)page->private != ci->i_snap_realm->cached_context) {
952 /*
953 * this page is already dirty in another (older) snap
954 * context! is it writeable now?
955 */
956 snapc = get_oldest_context(inode, NULL);
957 up_read(&mdsc->snap_rwsem);
958
959 if (snapc != (void *)page->private) {
960 dout(" page %p snapc %p not current or oldest\n",
961 page, (void *)page->private);
962 /*
963 * queue for writeback, and wait for snapc to
964 * be writeable or written
965 */
966 snapc = ceph_get_snap_context((void *)page->private);
967 unlock_page(page);
968 ceph_queue_writeback(inode);
969 r = wait_event_interruptible(ci->i_cap_wq,
970 context_is_writeable_or_written(inode, snapc));
971 ceph_put_snap_context(snapc);
972 if (r == -ERESTARTSYS)
973 return r;
974 return -EAGAIN;
975 }
976
977 /* yay, writeable, do it now (without dropping page lock) */
978 dout(" page %p snapc %p not current, but oldest\n",
979 page, snapc);
980 if (!clear_page_dirty_for_io(page))
981 goto retry_locked;
982 r = writepage_nounlock(page, NULL);
983 if (r < 0)
984 goto fail_nosnap;
985 goto retry_locked;
986 }
987
988 if (PageUptodate(page)) {
989 dout(" page %p already uptodate\n", page);
990 return 0;
991 }
992
993 /* full page? */
994 if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
995 return 0;
996
997 /* past end of file? */
998 i_size = inode->i_size; /* caller holds i_mutex */
999
1000 if (i_size + len > inode->i_sb->s_maxbytes) {
1001 /* file is too big */
1002 r = -EINVAL;
1003 goto fail;
1004 }
1005
1006 if (page_off >= i_size ||
1007 (pos_in_page == 0 && (pos+len) >= i_size &&
1008 end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
1009 dout(" zeroing %p 0 - %d and %d - %d\n",
1010 page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
1011 zero_user_segments(page,
1012 0, pos_in_page,
1013 end_in_page, PAGE_CACHE_SIZE);
1014 return 0;
1015 }
1016
1017 /* we need to read it. */
1018 up_read(&mdsc->snap_rwsem);
1019 r = readpage_nounlock(file, page);
1020 if (r < 0)
1021 goto fail_nosnap;
1022 goto retry_locked;
1023
1024fail:
1025 up_read(&mdsc->snap_rwsem);
1026fail_nosnap:
1027 unlock_page(page);
1028 return r;
1029}
1030
1031/*
1032 * We are only allowed to write into/dirty the page if the page is
1033 * clean, or already dirty within the same snap context.
1034 */
1035static int ceph_write_begin(struct file *file, struct address_space *mapping,
1036 loff_t pos, unsigned len, unsigned flags,
1037 struct page **pagep, void **fsdata)
1038{
1039 struct inode *inode = file->f_dentry->d_inode;
1040 struct page *page;
1041 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1042 int r;
1043
1044 do {
1045 /* get a page */
1046 page = grab_cache_page_write_begin(mapping, index, 0);
1047 if (!page)
1048 return -ENOMEM;
1049 *pagep = page;
1050
1051 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1052 inode, page, (int)pos, (int)len);
1053
1054 r = ceph_update_writeable_page(file, pos, len, page);
1055 } while (r == -EAGAIN);
1056
1057 return r;
1058}
1059
1060/*
1061 * we don't do anything in here that simple_write_end doesn't do
1062 * except adjust dirty page accounting and drop read lock on
1063 * mdsc->snap_rwsem.
1064 */
1065static int ceph_write_end(struct file *file, struct address_space *mapping,
1066 loff_t pos, unsigned len, unsigned copied,
1067 struct page *page, void *fsdata)
1068{
1069 struct inode *inode = file->f_dentry->d_inode;
1070 struct ceph_client *client = ceph_inode_to_client(inode);
1071 struct ceph_mds_client *mdsc = &client->mdsc;
1072 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1073 int check_cap = 0;
1074
1075 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
1076 inode, page, (int)pos, (int)copied, (int)len);
1077
1078 /* zero the stale part of the page if we did a short copy */
1079 if (copied < len)
1080 zero_user_segment(page, from+copied, from+len);
1081
1082 /* did file size increase? */
1083 /* no need for i_size_read(); the caller holds i_mutex */
1084 if (pos+copied > inode->i_size)
1085 check_cap = ceph_inode_set_size(inode, pos+copied);
1086
1087 if (!PageUptodate(page))
1088 SetPageUptodate(page);
1089
1090 set_page_dirty(page);
1091
1092 unlock_page(page);
1093 up_read(&mdsc->snap_rwsem);
1094 page_cache_release(page);
1095
1096 if (check_cap)
1097 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
1098
1099 return copied;
1100}
1101
1102/*
1103 * we set .direct_IO to indicate direct io is supported, but since we
1104 * intercept O_DIRECT reads and writes early, this function should
1105 * never get called.
1106 */
1107static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
1108 const struct iovec *iov,
1109 loff_t pos, unsigned long nr_segs)
1110{
1111 WARN_ON(1);
1112 return -EINVAL;
1113}
1114
1115const struct address_space_operations ceph_aops = {
1116 .readpage = ceph_readpage,
1117 .readpages = ceph_readpages,
1118 .writepage = ceph_writepage,
1119 .writepages = ceph_writepages_start,
1120 .write_begin = ceph_write_begin,
1121 .write_end = ceph_write_end,
1122 .set_page_dirty = ceph_set_page_dirty,
1123 .invalidatepage = ceph_invalidatepage,
1124 .releasepage = ceph_releasepage,
1125 .direct_IO = ceph_direct_io,
1126};
1127
1128
1129/*
1130 * vm ops
1131 */
1132
1133/*
1134 * Reuse the write_begin path (ceph_update_writeable_page) for simplicity.
1135 */
1136static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1137{
1138 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1139 struct page *page = vmf->page;
1140 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1141 loff_t off = (loff_t)page->index << PAGE_CACHE_SHIFT;
1142 loff_t size, len;
1143 int ret;
1144
1145 size = i_size_read(inode);
1146 if (off + PAGE_CACHE_SIZE <= size)
1147 len = PAGE_CACHE_SIZE;
1148 else
1149 len = size & ~PAGE_CACHE_MASK;
1150
1151 dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
1152 off, len, page, page->index);
1153
1154 lock_page(page);
1155
1156 ret = VM_FAULT_NOPAGE;
1157 if ((off > size) ||
1158 (page->mapping != inode->i_mapping))
1159 goto out;
1160
1161 ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
1162 if (ret == 0) {
1163 /* success. we'll keep the page locked. */
1164 set_page_dirty(page);
1165 up_read(&mdsc->snap_rwsem);
1166 ret = VM_FAULT_LOCKED;
1167 } else {
1168 if (ret == -ENOMEM)
1169 ret = VM_FAULT_OOM;
1170 else
1171 ret = VM_FAULT_SIGBUS;
1172 }
1173out:
1174 dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
1175 if (ret != VM_FAULT_LOCKED)
1176 unlock_page(page);
1177 return ret;
1178}
1179
1180static const struct vm_operations_struct ceph_vmops = {
1181 .fault = filemap_fault,
1182 .page_mkwrite = ceph_page_mkwrite,
1183};
1184
1185int ceph_mmap(struct file *file, struct vm_area_struct *vma)
1186{
1187 struct address_space *mapping = file->f_mapping;
1188
1189 if (!mapping->a_ops->readpage)
1190 return -ENOEXEC;
1191 file_accessed(file);
1192 vma->vm_ops = &ceph_vmops;
1193 vma->vm_flags |= VM_CAN_NONLINEAR;
1194 return 0;
1195}
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
new file mode 100644
index 000000000000..67b2c030924b
--- /dev/null
+++ b/fs/ceph/armor.c
@@ -0,0 +1,99 @@
1
2#include <linux/errno.h>
3
4/*
5 * base64 encode/decode.
6 */
7
8static const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
9
10static int encode_bits(int c)
11{
12 return pem_key[c];
13}
14
15static int decode_bits(char c)
16{
17 if (c >= 'A' && c <= 'Z')
18 return c - 'A';
19 if (c >= 'a' && c <= 'z')
20 return c - 'a' + 26;
21 if (c >= '0' && c <= '9')
22 return c - '0' + 52;
23 if (c == '+')
24 return 62;
25 if (c == '/')
26 return 63;
27 if (c == '=')
28 return 0; /* just non-negative, please */
29 return -EINVAL;
30}
31
32int ceph_armor(char *dst, const char *src, const char *end)
33{
34 int olen = 0;
35 int line = 0;
36
37 while (src < end) {
38 unsigned char a, b, c;
39
40 a = *src++;
41 *dst++ = encode_bits(a >> 2);
42 if (src < end) {
43 b = *src++;
44 *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
45 if (src < end) {
46 c = *src++;
47 *dst++ = encode_bits(((b & 15) << 2) |
48 (c >> 6));
49 *dst++ = encode_bits(c & 63);
50 } else {
51 *dst++ = encode_bits((b & 15) << 2);
52 *dst++ = '=';
53 }
54 } else {
55 *dst++ = encode_bits(((a & 3) << 4));
56 *dst++ = '=';
57 *dst++ = '=';
58 }
59 olen += 4;
60 line += 4;
61 if (line == 64) {
62 line = 0;
63 *(dst++) = '\n';
64 olen++;
65 }
66 }
67 return olen;
68}
69
70int ceph_unarmor(char *dst, const char *src, const char *end)
71{
72 int olen = 0;
73
74 while (src < end) {
75 int a, b, c, d;
76
77 if (src < end && src[0] == '\n')
78 src++;
79 if (src + 4 > end)
80 return -EINVAL;
81 a = decode_bits(src[0]);
82 b = decode_bits(src[1]);
83 c = decode_bits(src[2]);
84 d = decode_bits(src[3]);
85 if (a < 0 || b < 0 || c < 0 || d < 0)
86 return -EINVAL;
87
88 *dst++ = (a << 2) | (b >> 4);
89 if (src[2] == '=')
90 return olen + 1;
91 *dst++ = ((b & 15) << 4) | (c >> 2);
92 if (src[3] == '=')
93 return olen + 2;
94 *dst++ = ((c & 3) << 6) | d;
95 olen += 3;
96 src += 4;
97 }
98 return olen;
99}
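/*
 * Example round trip through the two helpers above, as a sketch with a
 * hypothetical caller (not part of this file).  ceph_armor() returns
 * the encoded length and inserts a '\n' after every 64 output bytes.
 */
#if 0
static void armor_example(void)
{
	const char src[] = "Cat";	/* 0x43 0x61 0x74 */
	char enc[8], dec[4];
	int elen, dlen;

	elen = ceph_armor(enc, src, src + 3);
	/* elen == 4; enc now holds "Q2F0" */
	dlen = ceph_unarmor(dec, enc, enc + elen);
	/* dlen == 3; dec now holds "Cat" again */
}
#endif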
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
new file mode 100644
index 000000000000..f6394b94b866
--- /dev/null
+++ b/fs/ceph/auth.c
@@ -0,0 +1,258 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/slab.h>
5#include <linux/err.h>
6
7#include "types.h"
8#include "auth_none.h"
9#include "auth_x.h"
10#include "decode.h"
11#include "super.h"
12
13#include "messenger.h"
14
15/*
16 * get protocol handler
17 */
18static u32 supported_protocols[] = {
19 CEPH_AUTH_NONE,
20 CEPH_AUTH_CEPHX
21};
22
23int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
24{
25 switch (protocol) {
26 case CEPH_AUTH_NONE:
27 return ceph_auth_none_init(ac);
28 case CEPH_AUTH_CEPHX:
29 return ceph_x_init(ac);
30 default:
31 return -ENOENT;
32 }
33}
34
35/*
36 * setup, teardown.
37 */
38struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
39{
40 struct ceph_auth_client *ac;
41 int ret;
42
43 dout("auth_init name '%s' secret '%s'\n", name, secret);
44
45 ret = -ENOMEM;
46 ac = kzalloc(sizeof(*ac), GFP_NOFS);
47 if (!ac)
48 goto out;
49
50 ac->negotiating = true;
51 if (name)
52 ac->name = name;
53 else
54 ac->name = CEPH_AUTH_NAME_DEFAULT;
55 dout("auth_init name %s secret %s\n", ac->name, secret);
56 ac->secret = secret;
57 return ac;
58
59out:
60 return ERR_PTR(ret);
61}
62
63void ceph_auth_destroy(struct ceph_auth_client *ac)
64{
65 dout("auth_destroy %p\n", ac);
66 if (ac->ops)
67 ac->ops->destroy(ac);
68 kfree(ac);
69}
70
71/*
72 * Reset occurs when reconnecting to the monitor.
73 */
74void ceph_auth_reset(struct ceph_auth_client *ac)
75{
76 dout("auth_reset %p\n", ac);
77 if (ac->ops && !ac->negotiating)
78 ac->ops->reset(ac);
79 ac->negotiating = true;
80}
81
82int ceph_entity_name_encode(const char *name, void **p, void *end)
83{
84 int len = strlen(name);
85
86 if (*p + 2*sizeof(u32) + len > end)
87 return -ERANGE;
88 ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
89 ceph_encode_32(p, len);
90 ceph_encode_copy(p, name, len);
91 return 0;
92}
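/*
 * The resulting wire format, e.g. for the name "admin" (illustrative):
 *
 *   le32 CEPH_ENTITY_TYPE_CLIENT | le32 5 | 'a' 'd' 'm' 'i' 'n'
 *
 * i.e. exactly the 2*sizeof(u32) + strlen(name) bytes checked against
 * the buffer end before encoding.
 */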
93
94/*
95 * Initiate protocol negotiation with monitor. Include entity name
96 * and list supported protocols.
97 */
98int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
99{
100 struct ceph_mon_request_header *monhdr = buf;
101 void *p = monhdr + 1, *end = buf + len, *lenp;
102 int i, num;
103 int ret;
104
105 dout("auth_build_hello\n");
106 monhdr->have_version = 0;
107 monhdr->session_mon = cpu_to_le16(-1);
108 monhdr->session_mon_tid = 0;
109
110 ceph_encode_32(&p, 0); /* no protocol, yet */
111
112 lenp = p;
113 p += sizeof(u32);
114
115 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
116 ceph_encode_8(&p, 1);
117 num = ARRAY_SIZE(supported_protocols);
118 ceph_encode_32(&p, num);
119 ceph_decode_need(&p, end, num * sizeof(u32), bad);
120 for (i = 0; i < num; i++)
121 ceph_encode_32(&p, supported_protocols[i]);
122
123 ret = ceph_entity_name_encode(ac->name, &p, end);
124 if (ret < 0)
125 return ret;
126 ceph_decode_need(&p, end, sizeof(u64), bad);
127 ceph_encode_64(&p, ac->global_id);
128
129 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
130 return p - buf;
131
132bad:
133 return -ERANGE;
134}
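/*
 * Sketch of the hello payload built above, following the monitor
 * request header (layout reconstructed from the encoding calls):
 *
 *   le32  0                  protocol: none chosen yet
 *   le32  len                covers everything below
 *   u8    1                  struct version
 *   le32  num                number of supported protocols
 *   le32  proto[num]         CEPH_AUTH_NONE, CEPH_AUTH_CEPHX
 *   ...                      entity name (see ceph_entity_name_encode)
 *   le64  global_id          0 until the monitor assigns one
 */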
135
136int ceph_build_auth_request(struct ceph_auth_client *ac,
137 void *msg_buf, size_t msg_len)
138{
139 struct ceph_mon_request_header *monhdr = msg_buf;
140 void *p = monhdr + 1;
141 void *end = msg_buf + msg_len;
142 int ret;
143
144 monhdr->have_version = 0;
145 monhdr->session_mon = cpu_to_le16(-1);
146 monhdr->session_mon_tid = 0;
147
148 ceph_encode_32(&p, ac->protocol);
149
150 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
151 if (ret < 0) {
152 pr_err("error %d building request\n", ret);
153 return ret;
154 }
155 dout(" built request %d bytes\n", ret);
156 ceph_encode_32(&p, ret);
157 return p + ret - msg_buf;
158}
159
160/*
161 * Handle auth message from monitor.
162 */
163int ceph_handle_auth_reply(struct ceph_auth_client *ac,
164 void *buf, size_t len,
165 void *reply_buf, size_t reply_len)
166{
167 void *p = buf;
168 void *end = buf + len;
169 int protocol;
170 s32 result;
171 u64 global_id;
172 void *payload, *payload_end;
173 int payload_len;
174 char *result_msg;
175 int result_msg_len;
176 int ret = -EINVAL;
177
178 dout("handle_auth_reply %p %p\n", p, end);
179 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
180 protocol = ceph_decode_32(&p);
181 result = ceph_decode_32(&p);
182 global_id = ceph_decode_64(&p);
183 payload_len = ceph_decode_32(&p);
184 payload = p;
185 p += payload_len;
186 ceph_decode_need(&p, end, sizeof(u32), bad);
187 result_msg_len = ceph_decode_32(&p);
188 result_msg = p;
189 p += result_msg_len;
190 if (p != end)
191 goto bad;
192
193 dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
194 result_msg, global_id, payload_len);
195
196 payload_end = payload + payload_len;
197
198 if (global_id && ac->global_id != global_id) {
199 dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
200 ac->global_id = global_id;
201 }
202
203 if (ac->negotiating) {
204 /* server does not support our protocols? */
205 if (!protocol && result < 0) {
206 ret = result;
207 goto out;
208 }
209 /* set up (new) protocol handler? */
210 if (ac->protocol && ac->protocol != protocol) {
211 ac->ops->destroy(ac);
212 ac->protocol = 0;
213 ac->ops = NULL;
214 }
215 if (ac->protocol != protocol) {
216 ret = ceph_auth_init_protocol(ac, protocol);
217 if (ret) {
218 pr_err("error %d on auth protocol %d init\n",
219 ret, protocol);
220 goto out;
221 }
222 }
223
224 ac->negotiating = false;
225 }
226
227 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
228 if (ret == -EAGAIN) {
229 return ceph_build_auth_request(ac, reply_buf, reply_len);
230 } else if (ret) {
231 pr_err("authentication error %d\n", ret);
232 return ret;
233 }
234 return 0;
235
236bad:
237 pr_err("failed to decode auth msg\n");
238out:
239 return ret;
240}
241
242int ceph_build_auth(struct ceph_auth_client *ac,
243 void *msg_buf, size_t msg_len)
244{
245 if (!ac->protocol)
246 return ceph_auth_build_hello(ac, msg_buf, msg_len);
247 BUG_ON(!ac->ops);
248 if (!ac->ops->is_authenticated(ac))
249 return ceph_build_auth_request(ac, msg_buf, msg_len);
250 return 0;
251}
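/*
 * Putting the pieces together, a caller drives the monitor handshake
 * roughly like this (pseudo-flow; send/recv stand in for the
 * messenger layer):
 *
 *   len = ceph_build_auth(ac, buf, sizeof(buf));  // hello first time
 *   send(buf, len);
 *   recv(reply, &rlen);
 *   len = ceph_handle_auth_reply(ac, reply, rlen, buf, sizeof(buf));
 *   if (len > 0)
 *           send(buf, len);   // protocol-specific request; repeat
 *   ...until ceph_auth_is_authenticated(ac) returns true.
 */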
252
253int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
254{
255 if (!ac->ops)
256 return 0;
257 return ac->ops->is_authenticated(ac);
258}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
new file mode 100644
index 000000000000..ca4f57cfb267
--- /dev/null
+++ b/fs/ceph/auth.h
@@ -0,0 +1,84 @@
1#ifndef _FS_CEPH_AUTH_H
2#define _FS_CEPH_AUTH_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * Abstract interface for communicating with the authentication module.
9 * There is some handshake that takes place between us and the monitor
10 * to acquire the necessary keys. These are used to generate an
11 * 'authorizer' that we use when connecting to a service (mds, osd).
12 */
13
14struct ceph_auth_client;
15struct ceph_authorizer;
16
17struct ceph_auth_client_ops {
18 /*
19 * true if we are authenticated and can connect to
20 * services.
21 */
22 int (*is_authenticated)(struct ceph_auth_client *ac);
23
24 /*
25 * build requests and process replies during monitor
26 * handshake. if handle_reply returns -EAGAIN, we build
27 * another request.
28 */
29 int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
30 int (*handle_reply)(struct ceph_auth_client *ac, int result,
31 void *buf, void *end);
32
33 /*
34 * Create authorizer for connecting to a service, and verify
35 * the response to authenticate the service.
36 */
37 int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
38 struct ceph_authorizer **a,
39 void **buf, size_t *len,
40 void **reply_buf, size_t *reply_len);
41 int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
42 struct ceph_authorizer *a, size_t len);
43 void (*destroy_authorizer)(struct ceph_auth_client *ac,
44 struct ceph_authorizer *a);
45 void (*invalidate_authorizer)(struct ceph_auth_client *ac,
46 int peer_type);
47
48 /* reset when we (re)connect to a monitor */
49 void (*reset)(struct ceph_auth_client *ac);
50
51 void (*destroy)(struct ceph_auth_client *ac);
52};
53
54struct ceph_auth_client {
55 u32 protocol; /* CEPH_AUTH_* */
56 void *private; /* for use by protocol implementation */
57 const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
58
59 bool negotiating; /* true if negotiating protocol */
60 const char *name; /* entity name */
61 u64 global_id; /* our unique id in system */
62 const char *secret; /* our secret key */
63 unsigned want_keys; /* which services we want */
64};
65
66extern struct ceph_auth_client *ceph_auth_init(const char *name,
67 const char *secret);
68extern void ceph_auth_destroy(struct ceph_auth_client *ac);
69
70extern void ceph_auth_reset(struct ceph_auth_client *ac);
71
72extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
73 void *buf, size_t len);
74extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
75 void *buf, size_t len,
76 void *reply_buf, size_t reply_len);
77extern int ceph_entity_name_encode(const char *name, void **p, void *end);
78
79extern int ceph_build_auth(struct ceph_auth_client *ac,
80 void *msg_buf, size_t msg_len);
81
82extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
83
84#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
new file mode 100644
index 000000000000..8cd9e3af07f7
--- /dev/null
+++ b/fs/ceph/auth_none.c
@@ -0,0 +1,122 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_none.h"
10#include "auth.h"
11#include "decode.h"
12
13static void reset(struct ceph_auth_client *ac)
14{
15 struct ceph_auth_none_info *xi = ac->private;
16
17 xi->starting = true;
18 xi->built_authorizer = false;
19}
20
21static void destroy(struct ceph_auth_client *ac)
22{
23 kfree(ac->private);
24 ac->private = NULL;
25}
26
27static int is_authenticated(struct ceph_auth_client *ac)
28{
29 struct ceph_auth_none_info *xi = ac->private;
30
31 return !xi->starting;
32}
33
34/*
35 * the generic auth code decodes the global_id, and we carry no actual
36 * authentication state, so nothing happens here.
37 */
38static int handle_reply(struct ceph_auth_client *ac, int result,
39 void *buf, void *end)
40{
41 struct ceph_auth_none_info *xi = ac->private;
42
43 xi->starting = false;
44 return result;
45}
46
47/*
48 * build an 'authorizer' with our entity_name and global_id. we can
49 * reuse a single static copy since it is identical for all services
50 * we connect to.
51 */
52static int ceph_auth_none_create_authorizer(
53 struct ceph_auth_client *ac, int peer_type,
54 struct ceph_authorizer **a,
55 void **buf, size_t *len,
56 void **reply_buf, size_t *reply_len)
57{
58 struct ceph_auth_none_info *ai = ac->private;
59 struct ceph_none_authorizer *au = &ai->au;
60 void *p, *end;
61 int ret;
62
63 if (!ai->built_authorizer) {
64 p = au->buf;
65 end = p + sizeof(au->buf);
66 ceph_encode_8(&p, 1);
67 ret = ceph_entity_name_encode(ac->name, &p, end - 8);
68 if (ret < 0)
69 goto bad;
70 ceph_decode_need(&p, end, sizeof(u64), bad2);
71 ceph_encode_64(&p, ac->global_id);
72 au->buf_len = p - (void *)au->buf;
73 ai->built_authorizer = true;
74 dout("built authorizer len %d\n", au->buf_len);
75 }
76
77 *a = (struct ceph_authorizer *)au;
78 *buf = au->buf;
79 *len = au->buf_len;
80 *reply_buf = au->reply_buf;
81 *reply_len = sizeof(au->reply_buf);
82 return 0;
83
84bad2:
85 ret = -ERANGE;
86bad:
87 return ret;
88}
89
90static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
91 struct ceph_authorizer *a)
92{
93 /* nothing to do */
94}
95
96static const struct ceph_auth_client_ops ceph_auth_none_ops = {
97 .reset = reset,
98 .destroy = destroy,
99 .is_authenticated = is_authenticated,
100 .handle_reply = handle_reply,
101 .create_authorizer = ceph_auth_none_create_authorizer,
102 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
103};
104
105int ceph_auth_none_init(struct ceph_auth_client *ac)
106{
107 struct ceph_auth_none_info *xi;
108
109 dout("ceph_auth_none_init %p\n", ac);
110 xi = kzalloc(sizeof(*xi), GFP_NOFS);
111 if (!xi)
112 return -ENOMEM;
113
114 xi->starting = true;
115 xi->built_authorizer = false;
116
117 ac->protocol = CEPH_AUTH_NONE;
118 ac->private = xi;
119 ac->ops = &ceph_auth_none_ops;
120 return 0;
121}
122
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
new file mode 100644
index 000000000000..56c05533a31c
--- /dev/null
+++ b/fs/ceph/auth_none.h
@@ -0,0 +1,28 @@
1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H
3
4#include "auth.h"
5
6/*
7 * null security mode.
8 *
9 * we use a single static authorizer that simply encodes our entity name
10 * and global id.
11 */
12
13struct ceph_none_authorizer {
14 char buf[128];
15 int buf_len;
16 char reply_buf[0];
17};
18
19struct ceph_auth_none_info {
20 bool starting;
21 bool built_authorizer;
22 struct ceph_none_authorizer au; /* we only need one; it's static */
23};
24
25extern int ceph_auth_none_init(struct ceph_auth_client *ac);
26
27#endif
28
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
new file mode 100644
index 000000000000..d9001a4dc8cc
--- /dev/null
+++ b/fs/ceph/auth_x.c
@@ -0,0 +1,680 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_x.h"
10#include "auth_x_protocol.h"
11#include "crypto.h"
12#include "auth.h"
13#include "decode.h"
14
15static struct kmem_cache *ceph_x_ticketbuf_cachep;
16
17#define TEMP_TICKET_BUF_LEN 256
18
19static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
20
21static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
22{
23 struct ceph_x_info *xi = ac->private;
24 int need;
25
26 ceph_x_validate_tickets(ac, &need);
27 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
28 ac->want_keys, need, xi->have_keys);
29 return (ac->want_keys & xi->have_keys) == ac->want_keys;
30}
31
32static int ceph_x_encrypt_buflen(int ilen)
33{
34 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
35 sizeof(u32);
36}
37
38static int ceph_x_encrypt(struct ceph_crypto_key *secret,
39 void *ibuf, int ilen, void *obuf, size_t olen)
40{
41 struct ceph_x_encrypt_header head = {
42 .struct_v = 1,
43 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
44 };
45 size_t len = olen - sizeof(u32);
46 int ret;
47
48 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
49 &head, sizeof(head), ibuf, ilen);
50 if (ret)
51 return ret;
52 ceph_encode_32(&obuf, len);
53 return len + sizeof(u32);
54}
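/*
 * ceph_x_encrypt_buflen() above is a conservative bound on what
 * ceph_x_encrypt() produces: the u32 length prefix, the encrypt
 * header, the payload, and up to one cipher block of padding.
 * Assuming the 16-byte AES block used by the cephx crypto code, the
 * 16-byte struct ceph_x_challenge_blob encrypted in
 * ceph_x_build_request() below comes out at
 *
 *   4 + round_up(9 + 16, 16) = 36 bytes
 *
 * comfortably inside the caller's 40-byte tmp_enc buffer (the generic
 * bound, 9 + 16 + 16 + 4 = 45, is looser than this payload needs).
 */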
55
56static int ceph_x_decrypt(struct ceph_crypto_key *secret,
57 void **p, void *end, void *obuf, size_t olen)
58{
59 struct ceph_x_encrypt_header head;
60 size_t head_len = sizeof(head);
61 int len, ret;
62
63 len = ceph_decode_32(p);
64 if (*p + len > end)
65 return -EINVAL;
66
67 dout("ceph_x_decrypt len %d\n", len);
68 ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
69 *p, len);
70 if (ret)
71 return ret;
72 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
73 return -EPERM;
74 *p += len;
75 return olen;
76}
77
78/*
79 * get existing (or insert new) ticket handler
80 */
81static struct ceph_x_ticket_handler *get_ticket_handler(
82 struct ceph_auth_client *ac, int service)
83{
84 struct ceph_x_ticket_handler *th;
85 struct ceph_x_info *xi = ac->private;
86 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
87
88 while (*p) {
89 parent = *p;
90 th = rb_entry(parent, struct ceph_x_ticket_handler, node);
91 if (service < th->service)
92 p = &(*p)->rb_left;
93 else if (service > th->service)
94 p = &(*p)->rb_right;
95 else
96 return th;
97 }
98
99 /* add it */
100 th = kzalloc(sizeof(*th), GFP_NOFS);
101 if (!th)
102 return ERR_PTR(-ENOMEM);
103 th->service = service;
104 rb_link_node(&th->node, parent, p);
105 rb_insert_color(&th->node, &xi->ticket_handlers);
106 return th;
107}
108
109static void remove_ticket_handler(struct ceph_auth_client *ac,
110 struct ceph_x_ticket_handler *th)
111{
112 struct ceph_x_info *xi = ac->private;
113
114 dout("remove_ticket_handler %p %d\n", th, th->service);
115 rb_erase(&th->node, &xi->ticket_handlers);
116 ceph_crypto_key_destroy(&th->session_key);
117 if (th->ticket_blob)
118 ceph_buffer_put(th->ticket_blob);
119 kfree(th);
120}
121
122static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
123 struct ceph_crypto_key *secret,
124 void *buf, void *end)
125{
126 struct ceph_x_info *xi = ac->private;
127 int num;
128 void *p = buf;
129 int ret;
130 char *dbuf;
131 char *ticket_buf;
132 u8 struct_v;
133
134 dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS);
135 if (!dbuf)
136 return -ENOMEM;
137
138 ret = -ENOMEM;
139 ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep,
140 GFP_NOFS);
141 if (!ticket_buf)
142 goto out_dbuf;
143
144 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
145 struct_v = ceph_decode_8(&p);
146 if (struct_v != 1)
147 goto bad;
148 num = ceph_decode_32(&p);
149 dout("%d tickets\n", num);
150 while (num--) {
151 int type;
152 u8 struct_v;
153 struct ceph_x_ticket_handler *th;
154 void *dp, *dend;
155 int dlen;
156 char is_enc;
157 struct timespec validity;
158 struct ceph_crypto_key old_key;
159 void *tp, *tpend;
160 struct ceph_timespec new_validity;
161 struct ceph_crypto_key new_session_key;
162 struct ceph_buffer *new_ticket_blob;
163 unsigned long new_expires, new_renew_after;
164 u64 new_secret_id;
165
166 ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
167
168 type = ceph_decode_32(&p);
169 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
170
171 struct_v = ceph_decode_8(&p);
172 if (struct_v != 1)
173 goto bad;
174
175 th = get_ticket_handler(ac, type);
176 if (IS_ERR(th)) {
177 ret = PTR_ERR(th);
178 goto out;
179 }
180
181 /* blob for me */
182 dlen = ceph_x_decrypt(secret, &p, end, dbuf,
183 TEMP_TICKET_BUF_LEN);
184 if (dlen <= 0) {
185 ret = dlen;
186 goto out;
187 }
188 dout(" decrypted %d bytes\n", dlen);
189 dend = dbuf + dlen;
190 dp = dbuf;
191
192 struct_v = ceph_decode_8(&dp);
193 if (struct_v != 1)
194 goto bad;
195
196 memcpy(&old_key, &th->session_key, sizeof(old_key));
197 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
198 if (ret)
199 goto out;
200
201 ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
202 ceph_decode_timespec(&validity, &new_validity);
203 new_expires = get_seconds() + validity.tv_sec;
204 new_renew_after = new_expires - (validity.tv_sec / 4);
205 dout(" expires=%lu renew_after=%lu\n", new_expires,
206 new_renew_after);
207
208 /* ticket blob for service */
209 ceph_decode_8_safe(&p, end, is_enc, bad);
210 tp = ticket_buf;
211 if (is_enc) {
212 /* encrypted */
213 dout(" encrypted ticket\n");
214 dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
215 TEMP_TICKET_BUF_LEN);
216 if (dlen < 0) {
217 ret = dlen;
218 goto out;
219 }
220 dlen = ceph_decode_32(&tp);
221 } else {
222 /* unencrypted */
223 ceph_decode_32_safe(&p, end, dlen, bad);
224 ceph_decode_need(&p, end, dlen, bad);
225 ceph_decode_copy(&p, ticket_buf, dlen);
226 }
227 tpend = tp + dlen;
228 dout(" ticket blob is %d bytes\n", dlen);
229 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
230 struct_v = ceph_decode_8(&tp);
231 new_secret_id = ceph_decode_64(&tp);
232 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
233 if (ret)
234 goto out;
235
236 /* all is well, update our ticket */
237 ceph_crypto_key_destroy(&th->session_key);
238 if (th->ticket_blob)
239 ceph_buffer_put(th->ticket_blob);
240 th->session_key = new_session_key;
241 th->ticket_blob = new_ticket_blob;
242 th->validity = new_validity;
243 th->secret_id = new_secret_id;
244 th->expires = new_expires;
245 th->renew_after = new_renew_after;
246 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
247 type, ceph_entity_type_name(type), th->secret_id,
248 (int)th->ticket_blob->vec.iov_len);
249 xi->have_keys |= th->service;
250 }
251
252 ret = 0;
253out:
254 kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf);
255out_dbuf:
256 kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf);
257 return ret;
258
259bad:
260 ret = -EINVAL;
261 goto out;
262}
263
264static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
265 struct ceph_x_ticket_handler *th,
266 struct ceph_x_authorizer *au)
267{
268 int maxlen;
269 struct ceph_x_authorize_a *msg_a;
270 struct ceph_x_authorize_b msg_b;
271 void *p, *end;
272 int ret;
273 int ticket_blob_len =
274 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
275
276 dout("build_authorizer for %s %p\n",
277 ceph_entity_type_name(th->service), au);
278
279 maxlen = sizeof(*msg_a) + sizeof(msg_b) +
280 ceph_x_encrypt_buflen(ticket_blob_len);
281 dout(" need len %d\n", maxlen);
282 if (au->buf && au->buf->alloc_len < maxlen) {
283 ceph_buffer_put(au->buf);
284 au->buf = NULL;
285 }
286 if (!au->buf) {
287 au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
288 if (!au->buf)
289 return -ENOMEM;
290 }
291 au->service = th->service;
292
293 msg_a = au->buf->vec.iov_base;
294 msg_a->struct_v = 1;
295 msg_a->global_id = cpu_to_le64(ac->global_id);
296 msg_a->service_id = cpu_to_le32(th->service);
297 msg_a->ticket_blob.struct_v = 1;
298 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
299 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
300 if (ticket_blob_len) {
301 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
302 th->ticket_blob->vec.iov_len);
303 }
304 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
305 le64_to_cpu(msg_a->ticket_blob.secret_id));
306
307 p = msg_a + 1;
308 p += ticket_blob_len;
309 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
310
311 get_random_bytes(&au->nonce, sizeof(au->nonce));
312 msg_b.struct_v = 1;
313 msg_b.nonce = cpu_to_le64(au->nonce);
314 ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
315 p, end - p);
316 if (ret < 0)
317 goto out_buf;
318 p += ret;
319 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
320 dout(" built authorizer nonce %llx len %d\n", au->nonce,
321 (int)au->buf->vec.iov_len);
322 BUG_ON(au->buf->vec.iov_len > maxlen);
323 return 0;
324
325out_buf:
326 ceph_buffer_put(au->buf);
327 au->buf = NULL;
328 return ret;
329}
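/*
 * The finished authorizer buffer is therefore, in order (see
 * auth_x_protocol.h for the struct layouts):
 *
 *   struct ceph_x_authorize_a   global_id, service_id, ticket blob hdr
 *   <ticket blob bytes>         opaque to us; decrypted by the service
 *   <encrypted authorize_b>     random nonce under the session key
 *
 * The service proves it holds the session key by replying with
 * nonce + 1, which ceph_x_verify_authorizer_reply() checks below.
 */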
330
331static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
332 void **p, void *end)
333{
334 ceph_decode_need(p, end, 1 + sizeof(u64), bad);
335 ceph_encode_8(p, 1);
336 ceph_encode_64(p, th->secret_id);
337 if (th->ticket_blob) {
338 const char *buf = th->ticket_blob->vec.iov_base;
339 u32 len = th->ticket_blob->vec.iov_len;
340
341 ceph_encode_32_safe(p, end, len, bad);
342 ceph_encode_copy_safe(p, end, buf, len, bad);
343 } else {
344 ceph_encode_32_safe(p, end, 0, bad);
345 }
346
347 return 0;
348bad:
349 return -ERANGE;
350}
351
352static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
353{
354 int want = ac->want_keys;
355 struct ceph_x_info *xi = ac->private;
356 int service;
357
358 *pneed = ac->want_keys & ~(xi->have_keys);
359
360 for (service = 1; service <= want; service <<= 1) {
361 struct ceph_x_ticket_handler *th;
362
363 if (!(ac->want_keys & service))
364 continue;
365
366 if (*pneed & service)
367 continue;
368
369 th = get_ticket_handler(ac, service);
370
371 if (!th) {
372 *pneed |= service;
373 continue;
374 }
375
376 if (get_seconds() >= th->renew_after)
377 *pneed |= service;
378 if (get_seconds() >= th->expires)
379 xi->have_keys &= ~service;
380 }
381}
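/*
 * want_keys/have_keys are bitmasks of CEPH_ENTITY_TYPE_* service bits,
 * which is why the loop above steps through one power of two at a
 * time.  Illustrative example (bit assignments per ceph_fs.h): with
 * want = MON|MDS|OSD and only the MDS ticket past renew_after, *pneed
 * comes back with just the MDS bit set, so the next
 * CEPHX_GET_PRINCIPAL_SESSION_KEY request refreshes only that ticket.
 */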
382
383
384static int ceph_x_build_request(struct ceph_auth_client *ac,
385 void *buf, void *end)
386{
387 struct ceph_x_info *xi = ac->private;
388 int need;
389 struct ceph_x_request_header *head = buf;
390 int ret;
391 struct ceph_x_ticket_handler *th =
392 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
393
394 ceph_x_validate_tickets(ac, &need);
395
396 dout("build_request want %x have %x need %x\n",
397 ac->want_keys, xi->have_keys, need);
398
399 if (need & CEPH_ENTITY_TYPE_AUTH) {
400 struct ceph_x_authenticate *auth = (void *)(head + 1);
401 void *p = auth + 1;
402 struct ceph_x_challenge_blob tmp;
403 char tmp_enc[40];
404 u64 *u;
405
406 if (p > end)
407 return -ERANGE;
408
409 dout(" get_auth_session_key\n");
410 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
411
412 /* encrypt and hash */
413 get_random_bytes(&auth->client_challenge, sizeof(u64));
414 tmp.client_challenge = auth->client_challenge;
415 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
416 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
417 tmp_enc, sizeof(tmp_enc));
418 if (ret < 0)
419 return ret;
420
421 auth->struct_v = 1;
422 auth->key = 0;
423 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
424 auth->key ^= *u;
425 dout(" server_challenge %llx client_challenge %llx key %llx\n",
426 xi->server_challenge, le64_to_cpu(auth->client_challenge),
427 le64_to_cpu(auth->key));
428
429 /* now encode the old ticket, if it exists */
430 ret = ceph_x_encode_ticket(th, &p, end);
431 if (ret < 0)
432 return ret;
433
434 return p - buf;
435 }
436
437 if (need) {
438 void *p = head + 1;
439 struct ceph_x_service_ticket_request *req;
440
441 if (p > end)
442 return -ERANGE;
443 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
444
445 BUG_ON(!th);
446 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
447 if (ret)
448 return ret;
449 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
450 xi->auth_authorizer.buf->vec.iov_len);
451
452 req = p;
453 req->keys = cpu_to_le32(need);
454 p += sizeof(*req);
455 return p - buf;
456 }
457
458 return 0;
459}
460
461static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
462 void *buf, void *end)
463{
464 struct ceph_x_info *xi = ac->private;
465 struct ceph_x_reply_header *head = buf;
466 struct ceph_x_ticket_handler *th;
467 int len = end - buf;
468 int op;
469 int ret;
470
471 if (result)
472 return result; /* XXX hmm? */
473
474 if (xi->starting) {
475 /* it's a hello */
476 struct ceph_x_server_challenge *sc = buf;
477
478 if (len != sizeof(*sc))
479 return -EINVAL;
480 xi->server_challenge = le64_to_cpu(sc->server_challenge);
481 dout("handle_reply got server challenge %llx\n",
482 xi->server_challenge);
483 xi->starting = false;
484 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
485 return -EAGAIN;
486 }
487
488 op = le32_to_cpu(head->op);
489 result = le32_to_cpu(head->result);
490 dout("handle_reply op %d result %d\n", op, result);
491 switch (op) {
492 case CEPHX_GET_AUTH_SESSION_KEY:
493 /* verify auth key */
494 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
495 buf + sizeof(*head), end);
496 break;
497
498 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
499 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
500 BUG_ON(!th);
501 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
502 buf + sizeof(*head), end);
503 break;
504
505 default:
506 return -EINVAL;
507 }
508 if (ret)
509 return ret;
510 if (ac->want_keys == xi->have_keys)
511 return 0;
512 return -EAGAIN;
513}
514
515static int ceph_x_create_authorizer(
516 struct ceph_auth_client *ac, int peer_type,
517 struct ceph_authorizer **a,
518 void **buf, size_t *len,
519 void **reply_buf, size_t *reply_len)
520{
521 struct ceph_x_authorizer *au;
522 struct ceph_x_ticket_handler *th;
523 int ret;
524
525 th = get_ticket_handler(ac, peer_type);
526 if (IS_ERR(th))
527 return PTR_ERR(th);
528
529 au = kzalloc(sizeof(*au), GFP_NOFS);
530 if (!au)
531 return -ENOMEM;
532
533 ret = ceph_x_build_authorizer(ac, th, au);
534 if (ret) {
535 kfree(au);
536 return ret;
537 }
538
539 *a = (struct ceph_authorizer *)au;
540 *buf = au->buf->vec.iov_base;
541 *len = au->buf->vec.iov_len;
542 *reply_buf = au->reply_buf;
543 *reply_len = sizeof(au->reply_buf);
544 return 0;
545}
546
547static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
548 struct ceph_authorizer *a, size_t len)
549{
550 struct ceph_x_authorizer *au = (void *)a;
551 struct ceph_x_ticket_handler *th;
552 int ret = 0;
553 struct ceph_x_authorize_reply reply;
554 void *p = au->reply_buf;
555 void *end = p + sizeof(au->reply_buf);
556
557 th = get_ticket_handler(ac, au->service);
558 if (!th)
559 return -EIO; /* hrm! */
560 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
561 if (ret < 0)
562 return ret;
563 if (ret != sizeof(reply))
564 return -EPERM;
565
566 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
567 ret = -EPERM;
568 else
569 ret = 0;
570 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
571 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
572 return ret;
573}
574
575static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
576 struct ceph_authorizer *a)
577{
578 struct ceph_x_authorizer *au = (void *)a;
579
580 ceph_buffer_put(au->buf);
581 kfree(au);
582}
583
584
585static void ceph_x_reset(struct ceph_auth_client *ac)
586{
587 struct ceph_x_info *xi = ac->private;
588
589 dout("reset\n");
590 xi->starting = true;
591 xi->server_challenge = 0;
592}
593
594static void ceph_x_destroy(struct ceph_auth_client *ac)
595{
596 struct ceph_x_info *xi = ac->private;
597 struct rb_node *p;
598
599 dout("ceph_x_destroy %p\n", ac);
600 ceph_crypto_key_destroy(&xi->secret);
601
602 while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
603 struct ceph_x_ticket_handler *th =
604 rb_entry(p, struct ceph_x_ticket_handler, node);
605 remove_ticket_handler(ac, th);
606 }
607
608 kmem_cache_destroy(ceph_x_ticketbuf_cachep);
609
610 kfree(ac->private);
611 ac->private = NULL;
612}
613
614static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
615 int peer_type)
616{
617 struct ceph_x_ticket_handler *th;
618
619 th = get_ticket_handler(ac, peer_type);
620 if (th && !IS_ERR(th))
621 remove_ticket_handler(ac, th);
622}
623
624
625static const struct ceph_auth_client_ops ceph_x_ops = {
626 .is_authenticated = ceph_x_is_authenticated,
627 .build_request = ceph_x_build_request,
628 .handle_reply = ceph_x_handle_reply,
629 .create_authorizer = ceph_x_create_authorizer,
630 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
631 .destroy_authorizer = ceph_x_destroy_authorizer,
632 .invalidate_authorizer = ceph_x_invalidate_authorizer,
633 .reset = ceph_x_reset,
634 .destroy = ceph_x_destroy,
635};
636
637
638int ceph_x_init(struct ceph_auth_client *ac)
639{
640 struct ceph_x_info *xi;
641 int ret;
642
643 dout("ceph_x_init %p\n", ac);
644 xi = kzalloc(sizeof(*xi), GFP_NOFS);
645 if (!xi)
646 return -ENOMEM;
647
648 ret = -ENOMEM;
649 ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf",
650 TEMP_TICKET_BUF_LEN, 8,
651 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
652 NULL);
653 if (!ceph_x_ticketbuf_cachep)
654 goto done_nomem;
655 ret = -EINVAL;
656 if (!ac->secret) {
657 pr_err("no secret set (for auth_x protocol)\n");
658 goto done_nomem;
659 }
660
661 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
662 if (ret)
663 goto done_nomem;
664
665 xi->starting = true;
666 xi->ticket_handlers = RB_ROOT;
667
668 ac->protocol = CEPH_AUTH_CEPHX;
669 ac->private = xi;
670 ac->ops = &ceph_x_ops;
671 return 0;
672
673done_nomem:
674 kfree(xi);
675 if (ceph_x_ticketbuf_cachep)
676 kmem_cache_destroy(ceph_x_ticketbuf_cachep);
677 return ret;
678}
679
680
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
new file mode 100644
index 000000000000..ff6f8180e681
--- /dev/null
+++ b/fs/ceph/auth_x.h
@@ -0,0 +1,49 @@
1#ifndef _FS_CEPH_AUTH_X_H
2#define _FS_CEPH_AUTH_X_H
3
4#include <linux/rbtree.h>
5
6#include "crypto.h"
7#include "auth.h"
8#include "auth_x_protocol.h"
9
10/*
11 * Handle ticket for a single service.
12 */
13struct ceph_x_ticket_handler {
14 struct rb_node node;
15 unsigned service;
16
17 struct ceph_crypto_key session_key;
18 struct ceph_timespec validity;
19
20 u64 secret_id;
21 struct ceph_buffer *ticket_blob;
22
23 unsigned long renew_after, expires;
24};
25
26
27struct ceph_x_authorizer {
28 struct ceph_buffer *buf;
29 unsigned service;
30 u64 nonce;
31 char reply_buf[128]; /* big enough for encrypted blob */
32};
33
34struct ceph_x_info {
35 struct ceph_crypto_key secret;
36
37 bool starting;
38 u64 server_challenge;
39
40 unsigned have_keys;
41 struct rb_root ticket_handlers;
42
43 struct ceph_x_authorizer auth_authorizer;
44};
45
46extern int ceph_x_init(struct ceph_auth_client *ac);
47
48#endif
49
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
new file mode 100644
index 000000000000..671d30576c4f
--- /dev/null
+++ b/fs/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
1#ifndef __FS_CEPH_AUTH_X_PROTOCOL
2#define __FS_CEPH_AUTH_X_PROTOCOL
3
4#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
5#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
6#define CEPHX_GET_ROTATING_KEY 0x0400
7
8/* common bits */
9struct ceph_x_ticket_blob {
10 __u8 struct_v;
11 __le64 secret_id;
12 __le32 blob_len;
13 char blob[];
14} __attribute__ ((packed));
15
16
17/* common request/reply headers */
18struct ceph_x_request_header {
19 __le16 op;
20} __attribute__ ((packed));
21
22struct ceph_x_reply_header {
23 __le16 op;
24 __le32 result;
25} __attribute__ ((packed));
26
27
28/* authenticate handshake */
29
30/* initial hello (no reply header) */
31struct ceph_x_server_challenge {
32 __u8 struct_v;
33 __le64 server_challenge;
34} __attribute__ ((packed));
35
36struct ceph_x_authenticate {
37 __u8 struct_v;
38 __le64 client_challenge;
39 __le64 key;
40 /* ticket blob */
41} __attribute__ ((packed));
42
43struct ceph_x_service_ticket_request {
44 __u8 struct_v;
45 __le32 keys;
46} __attribute__ ((packed));
47
48struct ceph_x_challenge_blob {
49 __le64 server_challenge;
50 __le64 client_challenge;
51} __attribute__ ((packed));
52
53
54
55/* authorize handshake */
56
57/*
58 * The authorizer consists of two pieces:
59 * a - service id, ticket blob
60 * b - encrypted with session key
61 */
62struct ceph_x_authorize_a {
63 __u8 struct_v;
64 __le64 global_id;
65 __le32 service_id;
66 struct ceph_x_ticket_blob ticket_blob;
67} __attribute__ ((packed));
68
69struct ceph_x_authorize_b {
70 __u8 struct_v;
71 __le64 nonce;
72} __attribute__ ((packed));
73
74struct ceph_x_authorize_reply {
75 __u8 struct_v;
76 __le64 nonce_plus_one;
77} __attribute__ ((packed));
78
79
80/*
81 * encryption bundle
82 */
83#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
84
85struct ceph_x_encrypt_header {
86 __u8 struct_v;
87 __le64 magic;
88} __attribute__ ((packed));
89
90#endif
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
new file mode 100644
index 000000000000..c67535d70aa6
--- /dev/null
+++ b/fs/ceph/buffer.c
@@ -0,0 +1,81 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5
6#include "buffer.h"
7#include "decode.h"
8
9struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
10{
11 struct ceph_buffer *b;
12
13 b = kmalloc(sizeof(*b), gfp);
14 if (!b)
15 return NULL;
16
17 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
18 if (b->vec.iov_base) {
19 b->is_vmalloc = false;
20 } else {
21 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
22 if (!b->vec.iov_base) {
23 kfree(b);
24 return NULL;
25 }
26 b->is_vmalloc = true;
27 }
28
29 kref_init(&b->kref);
30 b->alloc_len = len;
31 b->vec.iov_len = len;
32 dout("buffer_new %p\n", b);
33 return b;
34}
35
36void ceph_buffer_release(struct kref *kref)
37{
38 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
39
40 dout("buffer_release %p\n", b);
41 if (b->vec.iov_base) {
42 if (b->is_vmalloc)
43 vfree(b->vec.iov_base);
44 else
45 kfree(b->vec.iov_base);
46 }
47 kfree(b);
48}
49
50int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
51{
52 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
53 if (b->vec.iov_base) {
54 b->is_vmalloc = false;
55 } else {
56 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
57 b->is_vmalloc = true;
58 }
59 if (!b->vec.iov_base)
60 return -ENOMEM;
61 b->alloc_len = len;
62 b->vec.iov_len = len;
63 return 0;
64}
65
66int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
67{
68 size_t len;
69
70 ceph_decode_need(p, end, sizeof(u32), bad);
71 len = ceph_decode_32(p);
72 dout("decode_buffer len %d\n", (int)len);
73 ceph_decode_need(p, end, len, bad);
74 *b = ceph_buffer_new(len, GFP_NOFS);
75 if (!*b)
76 return -ENOMEM;
77 ceph_decode_copy(p, (*b)->vec.iov_base, len);
78 return 0;
79bad:
80 return -EINVAL;
81}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
new file mode 100644
index 000000000000..58d19014068f
--- /dev/null
+++ b/fs/ceph/buffer.h
@@ -0,0 +1,39 @@
1#ifndef __FS_CEPH_BUFFER_H
2#define __FS_CEPH_BUFFER_H
3
4#include <linux/kref.h>
5#include <linux/mm.h>
6#include <linux/vmalloc.h>
7#include <linux/types.h>
8#include <linux/uio.h>
9
10/*
11 * a simple reference counted buffer.
12 *
13 * use kmalloc for small sizes (<= one page), vmalloc for larger
14 * sizes.
15 */
16struct ceph_buffer {
17 struct kref kref;
18 struct kvec vec;
19 size_t alloc_len;
20 bool is_vmalloc;
21};
22
23extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
24extern void ceph_buffer_release(struct kref *kref);
25
26static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
27{
28 kref_get(&b->kref);
29 return b;
30}
31
32static inline void ceph_buffer_put(struct ceph_buffer *b)
33{
34 kref_put(&b->kref, ceph_buffer_release);
35}
36
37extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
38
39#endif
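/*
 * Illustrative sketch (not part of this patch): typical lifecycle of the
 * refcounted buffer above.  buffer_demo() is a hypothetical caller; the
 * final ceph_buffer_put() drops the last kref and frees the memory via
 * ceph_buffer_release().
 */
static int buffer_demo(const void *data, size_t len)
{
	struct ceph_buffer *b, *extra;

	b = ceph_buffer_new(len, GFP_NOFS);	/* kref starts at 1 */
	if (!b)
		return -ENOMEM;
	memcpy(b->vec.iov_base, data, len);

	extra = ceph_buffer_get(b);	/* e.g. handed to an outgoing msg */
	ceph_buffer_put(b);		/* original owner is done */
	ceph_buffer_put(extra);		/* final put frees the buffer */
	return 0;
}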
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
new file mode 100644
index 000000000000..3710e077a857
--- /dev/null
+++ b/fs/ceph/caps.c
@@ -0,0 +1,2933 @@
1#include "ceph_debug.h"
2
3#include <linux/fs.h>
4#include <linux/kernel.h>
5#include <linux/sched.h>
6#include <linux/slab.h>
7#include <linux/vmalloc.h>
8#include <linux/wait.h>
9#include <linux/writeback.h>
10
11#include "super.h"
12#include "decode.h"
13#include "messenger.h"
14
15/*
16 * Capability management
17 *
18 * The Ceph metadata servers control client access to inode metadata
19 * and file data by issuing capabilities, granting clients permission
20 * to read and/or write both inode field and file data to OSDs
21 * (storage nodes). Each capability consists of a set of bits
22 * indicating which operations are allowed.
23 *
24 * If the client holds a *_SHARED cap, the client has a coherent value
25 * that can be safely read from the cached inode.
26 *
27 * In the case of *_EXCL (exclusive) or FILE_WR capabilities, the
28 * client is allowed to change inode attributes (e.g., file size,
29 * mtime), note its dirty state in the ceph_cap, and asynchronously
30 * flush that metadata change to the MDS.
31 *
32 * In the event of a conflicting operation (perhaps by another
33 * client), the MDS will revoke the conflicting client capabilities.
34 *
35 * In order for a client to cache an inode, it must hold a capability
36 * with at least one MDS server. When inodes are released, release
37 * notifications are batched and periodically sent en masse to the MDS
38 * cluster to release server state.
39 */
40
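/*
 * Illustrative sketch (not part of this patch): the *_SHARED rule above
 * in practice -- before trusting a cached inode field, a caller checks
 * that the matching SHARED bit is currently issued, here via
 * __ceph_caps_issued_mask() (defined later in this file).  The helpers
 * read_xattrs_cached() and fetch_xattrs_from_mds() are hypothetical.
 */
static int xattr_demo(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int cached;

	spin_lock(&inode->i_lock);
	cached = __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1);
	spin_unlock(&inode->i_lock);

	if (cached)
		return read_xattrs_cached(ci);		/* hypothetical */
	return fetch_xattrs_from_mds(inode);		/* hypothetical */
}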
41
42/*
43 * Generate readable cap strings for debugging output.
44 */
45#define MAX_CAP_STR 20
46static char cap_str[MAX_CAP_STR][40];
47static DEFINE_SPINLOCK(cap_str_lock);
48static int last_cap_str;
49
50static char *gcap_string(char *s, int c)
51{
52 if (c & CEPH_CAP_GSHARED)
53 *s++ = 's';
54 if (c & CEPH_CAP_GEXCL)
55 *s++ = 'x';
56 if (c & CEPH_CAP_GCACHE)
57 *s++ = 'c';
58 if (c & CEPH_CAP_GRD)
59 *s++ = 'r';
60 if (c & CEPH_CAP_GWR)
61 *s++ = 'w';
62 if (c & CEPH_CAP_GBUFFER)
63 *s++ = 'b';
64 if (c & CEPH_CAP_GLAZYIO)
65 *s++ = 'l';
66 return s;
67}
68
69const char *ceph_cap_string(int caps)
70{
71 int i;
72 char *s;
73 int c;
74
75 spin_lock(&cap_str_lock);
76 i = last_cap_str++;
77 if (last_cap_str == MAX_CAP_STR)
78 last_cap_str = 0;
79 spin_unlock(&cap_str_lock);
80
81 s = cap_str[i];
82
83 if (caps & CEPH_CAP_PIN)
84 *s++ = 'p';
85
86 c = (caps >> CEPH_CAP_SAUTH) & 3;
87 if (c) {
88 *s++ = 'A';
89 s = gcap_string(s, c);
90 }
91
92 c = (caps >> CEPH_CAP_SLINK) & 3;
93 if (c) {
94 *s++ = 'L';
95 s = gcap_string(s, c);
96 }
97
98 c = (caps >> CEPH_CAP_SXATTR) & 3;
99 if (c) {
100 *s++ = 'X';
101 s = gcap_string(s, c);
102 }
103
104 c = caps >> CEPH_CAP_SFILE;
105 if (c) {
106 *s++ = 'F';
107 s = gcap_string(s, c);
108 }
109
110 if (s == cap_str[i])
111 *s++ = '-';
112 *s = 0;
113 return cap_str[i];
114}
115
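/*
 * For example, caps == CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
 * CEPH_CAP_FILE_RD renders as "pFsr": 'p' for the pin, 'F' introducing
 * the FILE group, then 's' and 'r' from gcap_string() for the group's
 * SHARED and RD bits.
 */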
116/*
117 * Cap reservations
118 *
119 * Maintain a global pool of preallocated struct ceph_cap objects,
120 * referenced by struct ceph_cap_reservation contexts. This ensures that we preallocate
121 * memory needed to successfully process an MDS response. (If an MDS
122 * sends us cap information and we fail to process it, we will have
123 * problems due to the client and MDS being out of sync.)
124 *
125 * Reservations are 'owned' by a ceph_cap_reservation context; see the usage sketch after ceph_reservation_status() below.
126 */
127static spinlock_t caps_list_lock;
128static struct list_head caps_list; /* unused (reserved or unreserved) */
129static int caps_total_count; /* total caps allocated */
130static int caps_use_count; /* in use */
131static int caps_reserve_count; /* unused, reserved */
132static int caps_avail_count; /* unused, unreserved */
133static int caps_min_count; /* keep at least this many (unreserved) */
134
135void __init ceph_caps_init(void)
136{
137 INIT_LIST_HEAD(&caps_list);
138 spin_lock_init(&caps_list_lock);
139}
140
141void ceph_caps_finalize(void)
142{
143 struct ceph_cap *cap;
144
145 spin_lock(&caps_list_lock);
146 while (!list_empty(&caps_list)) {
147 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
148 list_del(&cap->caps_item);
149 kmem_cache_free(ceph_cap_cachep, cap);
150 }
151 caps_total_count = 0;
152 caps_avail_count = 0;
153 caps_use_count = 0;
154 caps_reserve_count = 0;
155 caps_min_count = 0;
156 spin_unlock(&caps_list_lock);
157}
158
159void ceph_adjust_min_caps(int delta)
160{
161 spin_lock(&caps_list_lock);
162 caps_min_count += delta;
163 BUG_ON(caps_min_count < 0);
164 spin_unlock(&caps_list_lock);
165}
166
167int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
168{
169 int i;
170 struct ceph_cap *cap;
171 int have;
172 int alloc = 0;
173 LIST_HEAD(newcaps);
174 int ret = 0;
175
176 dout("reserve caps ctx=%p need=%d\n", ctx, need);
177
178 /* first reserve any caps that are already allocated */
179 spin_lock(&caps_list_lock);
180 if (caps_avail_count >= need)
181 have = need;
182 else
183 have = caps_avail_count;
184 caps_avail_count -= have;
185 caps_reserve_count += have;
186 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
187 caps_avail_count);
188 spin_unlock(&caps_list_lock);
189
190 for (i = have; i < need; i++) {
191 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
192 if (!cap) {
193 ret = -ENOMEM;
194 goto out_alloc_count;
195 }
196 list_add(&cap->caps_item, &newcaps);
197 alloc++;
198 }
199 BUG_ON(have + alloc != need);
200
201 spin_lock(&caps_list_lock);
202 caps_total_count += alloc;
203 caps_reserve_count += alloc;
204 list_splice(&newcaps, &caps_list);
205
206 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
207 caps_avail_count);
208 spin_unlock(&caps_list_lock);
209
210 ctx->count = need;
211 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
212 ctx, caps_total_count, caps_use_count, caps_reserve_count,
213 caps_avail_count);
214 return 0;
215
216out_alloc_count:
217 /* we didn't manage to reserve as much as we needed */
218 pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
219 ctx, need, have);
220 return ret;
221}
222
223int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
224{
225 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
226 if (ctx->count) {
227 spin_lock(&caps_list_lock);
228 BUG_ON(caps_reserve_count < ctx->count);
229 caps_reserve_count -= ctx->count;
230 caps_avail_count += ctx->count;
231 ctx->count = 0;
232 dout("unreserve caps %d = %d used + %d resv + %d avail\n",
233 caps_total_count, caps_use_count, caps_reserve_count,
234 caps_avail_count);
235 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
236 caps_avail_count);
237 spin_unlock(&caps_list_lock);
238 }
239 return 0;
240}
241
242static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
243{
244 struct ceph_cap *cap = NULL;
245
246 /* temporary, until we do something about cap import/export */
247 if (!ctx)
248 return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
249
250 spin_lock(&caps_list_lock);
251 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
252 ctx, ctx->count, caps_total_count, caps_use_count,
253 caps_reserve_count, caps_avail_count);
254 BUG_ON(!ctx->count);
255 BUG_ON(ctx->count > caps_reserve_count);
256 BUG_ON(list_empty(&caps_list));
257
258 ctx->count--;
259 caps_reserve_count--;
260 caps_use_count++;
261
262 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
263 list_del(&cap->caps_item);
264
265 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
266 caps_avail_count);
267 spin_unlock(&caps_list_lock);
268 return cap;
269}
270
271void ceph_put_cap(struct ceph_cap *cap)
272{
273 spin_lock(&caps_list_lock);
274 dout("put_cap %p %d = %d used + %d resv + %d avail\n",
275 cap, caps_total_count, caps_use_count,
276 caps_reserve_count, caps_avail_count);
277 caps_use_count--;
278 /*
279 * Keep some preallocated caps around (ceph_min_count), to
280 * avoid lots of free/alloc churn.
281 */
282 if (caps_avail_count >= caps_reserve_count + caps_min_count) {
283 caps_total_count--;
284 kmem_cache_free(ceph_cap_cachep, cap);
285 } else {
286 caps_avail_count++;
287 list_add(&cap->caps_item, &caps_list);
288 }
289
290 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
291 caps_avail_count);
292 spin_unlock(&caps_list_lock);
293}
294
295void ceph_reservation_status(struct ceph_client *client,
296 int *total, int *avail, int *used, int *reserved,
297 int *min)
298{
299 if (total)
300 *total = caps_total_count;
301 if (avail)
302 *avail = caps_avail_count;
303 if (used)
304 *used = caps_use_count;
305 if (reserved)
306 *reserved = caps_reserve_count;
307 if (min)
308 *min = caps_min_count;
309}
310
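/*
 * Illustrative sketch (not part of this patch): the expected reservation
 * call pattern around an MDS reply.  reservation_demo() is hypothetical;
 * 'need' is however many caps the reply might install, and error
 * handling is trimmed.
 */
static int reservation_demo(int need)
{
	struct ceph_cap_reservation ctx = { 0 };
	struct ceph_cap *cap;

	if (ceph_reserve_caps(&ctx, need) < 0)
		return -ENOMEM;		/* nothing was reserved */

	cap = get_cap(&ctx);	/* cannot fail: memory was preallocated */
	/* ... install cap on an inode; ceph_put_cap(cap) returns it ... */

	ceph_unreserve_caps(&ctx);	/* hand back unused reservations */
	return 0;
}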
311/*
312 * Find ceph_cap for given mds, if any.
313 *
314 * Called with i_lock held.
315 */
316static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
317{
318 struct ceph_cap *cap;
319 struct rb_node *n = ci->i_caps.rb_node;
320
321 while (n) {
322 cap = rb_entry(n, struct ceph_cap, ci_node);
323 if (mds < cap->mds)
324 n = n->rb_left;
325 else if (mds > cap->mds)
326 n = n->rb_right;
327 else
328 return cap;
329 }
330 return NULL;
331}
332
333/*
334 * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
335 * -1.
336 */
337static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
338{
339 struct ceph_cap *cap;
340 int mds = -1;
341 struct rb_node *p;
342
343 /* prefer mds with WR|WRBUFFER|EXCL caps */
344 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
345 cap = rb_entry(p, struct ceph_cap, ci_node);
346 mds = cap->mds;
347 if (mseq)
348 *mseq = cap->mseq;
349 if (cap->issued & (CEPH_CAP_FILE_WR |
350 CEPH_CAP_FILE_BUFFER |
351 CEPH_CAP_FILE_EXCL))
352 break;
353 }
354 return mds;
355}
356
357int ceph_get_cap_mds(struct inode *inode)
358{
359 int mds;
360 spin_lock(&inode->i_lock);
361 mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
362 spin_unlock(&inode->i_lock);
363 return mds;
364}
365
366/*
367 * Called under i_lock.
368 */
369static void __insert_cap_node(struct ceph_inode_info *ci,
370 struct ceph_cap *new)
371{
372 struct rb_node **p = &ci->i_caps.rb_node;
373 struct rb_node *parent = NULL;
374 struct ceph_cap *cap = NULL;
375
376 while (*p) {
377 parent = *p;
378 cap = rb_entry(parent, struct ceph_cap, ci_node);
379 if (new->mds < cap->mds)
380 p = &(*p)->rb_left;
381 else if (new->mds > cap->mds)
382 p = &(*p)->rb_right;
383 else
384 BUG();
385 }
386
387 rb_link_node(&new->ci_node, parent, p);
388 rb_insert_color(&new->ci_node, &ci->i_caps);
389}
390
391/*
392 * (re)set cap hold timeouts, which control the delayed release
393 * of unused caps back to the MDS. Should be called on cap use.
394 */
395static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
396 struct ceph_inode_info *ci)
397{
398 struct ceph_mount_args *ma = mdsc->client->mount_args;
399
400 ci->i_hold_caps_min = round_jiffies(jiffies +
401 ma->caps_wanted_delay_min * HZ);
402 ci->i_hold_caps_max = round_jiffies(jiffies +
403 ma->caps_wanted_delay_max * HZ);
404 dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
405 ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
406}
407
408/*
409 * (Re)queue cap at the end of the delayed cap release list.
410 *
411 * If I_FLUSH is set, leave the inode at the front of the list.
412 *
413 * Caller holds i_lock
414 * -> we take mdsc->cap_delay_lock
415 */
416static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
417 struct ceph_inode_info *ci)
418{
419 __cap_set_timeouts(mdsc, ci);
420 dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
421 ci->i_ceph_flags, ci->i_hold_caps_max);
422 if (!mdsc->stopping) {
423 spin_lock(&mdsc->cap_delay_lock);
424 if (!list_empty(&ci->i_cap_delay_list)) {
425 if (ci->i_ceph_flags & CEPH_I_FLUSH)
426 goto no_change;
427 list_del_init(&ci->i_cap_delay_list);
428 }
429 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
430no_change:
431 spin_unlock(&mdsc->cap_delay_lock);
432 }
433}
434
435/*
436 * Queue an inode for immediate writeback. Mark inode with I_FLUSH,
437 * indicating we should send a cap message to flush dirty metadata
438 * asap, and move to the front of the delayed cap list.
439 */
440static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
441 struct ceph_inode_info *ci)
442{
443 dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
444 spin_lock(&mdsc->cap_delay_lock);
445 ci->i_ceph_flags |= CEPH_I_FLUSH;
446 if (!list_empty(&ci->i_cap_delay_list))
447 list_del_init(&ci->i_cap_delay_list);
448 list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
449 spin_unlock(&mdsc->cap_delay_lock);
450}
451
452/*
453 * Cancel delayed work on cap.
454 *
455 * Caller must hold i_lock.
456 */
457static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
458 struct ceph_inode_info *ci)
459{
460 dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
461 if (list_empty(&ci->i_cap_delay_list))
462 return;
463 spin_lock(&mdsc->cap_delay_lock);
464 list_del_init(&ci->i_cap_delay_list);
465 spin_unlock(&mdsc->cap_delay_lock);
466}
467
468/*
469 * Common issue checks for add_cap, handle_cap_grant.
470 */
471static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
472 unsigned issued)
473{
474 unsigned had = __ceph_caps_issued(ci, NULL);
475
476 /*
477 * Each time we receive FILE_CACHE anew, we increment
478 * i_rdcache_gen.
479 */
480 if ((issued & CEPH_CAP_FILE_CACHE) &&
481 (had & CEPH_CAP_FILE_CACHE) == 0)
482 ci->i_rdcache_gen++;
483
484 /*
485 * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
486 * don't know what happened to this directory while we didn't
487 * have the cap.
488 */
489 if ((issued & CEPH_CAP_FILE_SHARED) &&
490 (had & CEPH_CAP_FILE_SHARED) == 0) {
491 ci->i_shared_gen++;
492 if (S_ISDIR(ci->vfs_inode.i_mode)) {
493 dout(" marking %p NOT complete\n", &ci->vfs_inode);
494 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
495 }
496 }
497}
498
499/*
500 * Add a capability under the given MDS session.
501 *
502 * Caller should hold session snap_rwsem (read) and s_mutex.
503 *
504 * @fmode is the open file mode, if we are opening a file, otherwise
505 * it is < 0. (This is so we can atomically add the cap and add an
506 * open file reference to it.)
507 */
508int ceph_add_cap(struct inode *inode,
509 struct ceph_mds_session *session, u64 cap_id,
510 int fmode, unsigned issued, unsigned wanted,
511 unsigned seq, unsigned mseq, u64 realmino, int flags,
512 struct ceph_cap_reservation *caps_reservation)
513{
514 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
515 struct ceph_inode_info *ci = ceph_inode(inode);
516 struct ceph_cap *new_cap = NULL;
517 struct ceph_cap *cap;
518 int mds = session->s_mds;
519 int actual_wanted;
520
521 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
522 session->s_mds, cap_id, ceph_cap_string(issued), seq);
523
524 /*
525 * If we are opening the file, include file mode wanted bits
526 * in wanted.
527 */
528 if (fmode >= 0)
529 wanted |= ceph_caps_for_mode(fmode);
530
531retry:
532 spin_lock(&inode->i_lock);
533 cap = __get_cap_for_mds(ci, mds);
534 if (!cap) {
535 if (new_cap) {
536 cap = new_cap;
537 new_cap = NULL;
538 } else {
539 spin_unlock(&inode->i_lock);
540 new_cap = get_cap(caps_reservation);
541 if (new_cap == NULL)
542 return -ENOMEM;
543 goto retry;
544 }
545
546 cap->issued = 0;
547 cap->implemented = 0;
548 cap->mds = mds;
549 cap->mds_wanted = 0;
550
551 cap->ci = ci;
552 __insert_cap_node(ci, cap);
553
554 /* clear out old exporting info? (i.e. on cap import) */
555 if (ci->i_cap_exporting_mds == mds) {
556 ci->i_cap_exporting_issued = 0;
557 ci->i_cap_exporting_mseq = 0;
558 ci->i_cap_exporting_mds = -1;
559 }
560
561 /* add to session cap list */
562 cap->session = session;
563 spin_lock(&session->s_cap_lock);
564 list_add_tail(&cap->session_caps, &session->s_caps);
565 session->s_nr_caps++;
566 spin_unlock(&session->s_cap_lock);
567 }
568
569 if (!ci->i_snap_realm) {
570 /*
571 * add this inode to the appropriate snap realm
572 */
573 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
574 realmino);
575 if (realm) {
576 ceph_get_snap_realm(mdsc, realm);
577 spin_lock(&realm->inodes_with_caps_lock);
578 ci->i_snap_realm = realm;
579 list_add(&ci->i_snap_realm_item,
580 &realm->inodes_with_caps);
581 spin_unlock(&realm->inodes_with_caps_lock);
582 } else {
583 pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
584 realmino);
585 }
586 }
587
588 __check_cap_issue(ci, cap, issued);
589
590 /*
591 * If we are issued caps we don't want, or the mds' wanted
592 * value appears to be off, queue a check so we'll release
593 * later and/or update the mds wanted value.
594 */
595 actual_wanted = __ceph_caps_wanted(ci);
596 if ((wanted & ~actual_wanted) ||
597 (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
598 dout(" issued %s, mds wanted %s, actual %s, queueing\n",
599 ceph_cap_string(issued), ceph_cap_string(wanted),
600 ceph_cap_string(actual_wanted));
601 __cap_delay_requeue(mdsc, ci);
602 }
603
604 if (flags & CEPH_CAP_FLAG_AUTH)
605 ci->i_auth_cap = cap;
606 else if (ci->i_auth_cap == cap)
607 ci->i_auth_cap = NULL;
608
609 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
610 inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
611 ceph_cap_string(issued|cap->issued), seq, mds);
612 cap->cap_id = cap_id;
613 cap->issued = issued;
614 cap->implemented |= issued;
615 cap->mds_wanted |= wanted;
616 cap->seq = seq;
617 cap->issue_seq = seq;
618 cap->mseq = mseq;
619 cap->cap_gen = session->s_cap_gen;
620
621 if (fmode >= 0)
622 __ceph_get_fmode(ci, fmode);
623 spin_unlock(&inode->i_lock);
624 wake_up(&ci->i_cap_wq);
625 return 0;
626}
627
628/*
629 * Return true if cap has not timed out and belongs to the current
630 * generation of the MDS session (i.e. has not gone 'stale' due to
631 * us losing touch with the mds).
632 */
633static int __cap_is_valid(struct ceph_cap *cap)
634{
635 unsigned long ttl;
636 u32 gen;
637
638 spin_lock(&cap->session->s_cap_lock);
639 gen = cap->session->s_cap_gen;
640 ttl = cap->session->s_cap_ttl;
641 spin_unlock(&cap->session->s_cap_lock);
642
643 if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
644 dout("__cap_is_valid %p cap %p issued %s "
645 "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
646 cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
647 return 0;
648 }
649
650 return 1;
651}
652
653/*
654 * Return set of valid cap bits issued to us. Note that caps time
655 * out, and may be invalidated in bulk if the client session times out
656 * and session->s_cap_gen is bumped.
657 */
658int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
659{
660 int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
661 struct ceph_cap *cap;
662 struct rb_node *p;
663
664 if (implemented)
665 *implemented = 0;
666 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
667 cap = rb_entry(p, struct ceph_cap, ci_node);
668 if (!__cap_is_valid(cap))
669 continue;
670 dout("__ceph_caps_issued %p cap %p issued %s\n",
671 &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
672 have |= cap->issued;
673 if (implemented)
674 *implemented |= cap->implemented;
675 }
676 return have;
677}
678
679/*
680 * Get cap bits issued by caps other than @ocap
681 */
682int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
683{
684 int have = ci->i_snap_caps;
685 struct ceph_cap *cap;
686 struct rb_node *p;
687
688 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
689 cap = rb_entry(p, struct ceph_cap, ci_node);
690 if (cap == ocap)
691 continue;
692 if (!__cap_is_valid(cap))
693 continue;
694 have |= cap->issued;
695 }
696 return have;
697}
698
699/*
700 * Move a cap to the end of the LRU (oldest caps at list head, newest
701 * at list tail).
702 */
703static void __touch_cap(struct ceph_cap *cap)
704{
705 struct ceph_mds_session *s = cap->session;
706
707 spin_lock(&s->s_cap_lock);
708 if (s->s_cap_iterator == NULL) {
709 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
710 s->s_mds);
711 list_move_tail(&cap->session_caps, &s->s_caps);
712 } else {
713 dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
714 &cap->ci->vfs_inode, cap, s->s_mds);
715 }
716 spin_unlock(&s->s_cap_lock);
717}
718
719/*
720 * Check if we hold the given mask. If so, move the cap(s) to the
721 * most-recently-used end of their respective LRUs. (This is the preferred way for
722 * callers to check for caps they want.)
723 */
724int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
725{
726 struct ceph_cap *cap;
727 struct rb_node *p;
728 int have = ci->i_snap_caps;
729
730 if ((have & mask) == mask) {
731 dout("__ceph_caps_issued_mask %p snap issued %s"
732 " (mask %s)\n", &ci->vfs_inode,
733 ceph_cap_string(have),
734 ceph_cap_string(mask));
735 return 1;
736 }
737
738 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
739 cap = rb_entry(p, struct ceph_cap, ci_node);
740 if (!__cap_is_valid(cap))
741 continue;
742 if ((cap->issued & mask) == mask) {
743 dout("__ceph_caps_issued_mask %p cap %p issued %s"
744 " (mask %s)\n", &ci->vfs_inode, cap,
745 ceph_cap_string(cap->issued),
746 ceph_cap_string(mask));
747 if (touch)
748 __touch_cap(cap);
749 return 1;
750 }
751
752 /* does a combination of caps satisfy mask? */
753 have |= cap->issued;
754 if ((have & mask) == mask) {
755 dout("__ceph_caps_issued_mask %p combo issued %s"
756 " (mask %s)\n", &ci->vfs_inode,
757 ceph_cap_string(cap->issued),
758 ceph_cap_string(mask));
759 if (touch) {
760 struct rb_node *q;
761
762			/* touch this + preceding caps */
763 __touch_cap(cap);
764 for (q = rb_first(&ci->i_caps); q != p;
765 q = rb_next(q)) {
766 cap = rb_entry(q, struct ceph_cap,
767 ci_node);
768 if (!__cap_is_valid(cap))
769 continue;
770 __touch_cap(cap);
771 }
772 }
773 return 1;
774 }
775 }
776
777 return 0;
778}
779
780/*
781 * Return true if mask caps are currently being revoked by an MDS.
782 */
783int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
784{
785 struct inode *inode = &ci->vfs_inode;
786 struct ceph_cap *cap;
787 struct rb_node *p;
788 int ret = 0;
789
790 spin_lock(&inode->i_lock);
791 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
792 cap = rb_entry(p, struct ceph_cap, ci_node);
793 if (__cap_is_valid(cap) &&
794 (cap->implemented & ~cap->issued & mask)) {
795 ret = 1;
796 break;
797 }
798 }
799 spin_unlock(&inode->i_lock);
800 dout("ceph_caps_revoking %p %s = %d\n", inode,
801 ceph_cap_string(mask), ret);
802 return ret;
803}
804
805int __ceph_caps_used(struct ceph_inode_info *ci)
806{
807 int used = 0;
808 if (ci->i_pin_ref)
809 used |= CEPH_CAP_PIN;
810 if (ci->i_rd_ref)
811 used |= CEPH_CAP_FILE_RD;
812 if (ci->i_rdcache_ref || ci->i_rdcache_gen)
813 used |= CEPH_CAP_FILE_CACHE;
814 if (ci->i_wr_ref)
815 used |= CEPH_CAP_FILE_WR;
816 if (ci->i_wrbuffer_ref)
817 used |= CEPH_CAP_FILE_BUFFER;
818 return used;
819}
820
821/*
822 * Caps wanted by virtue of open file modes.
823 */
824int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
825{
826 int want = 0;
827 int mode;
828 for (mode = 0; mode < 4; mode++)
829 if (ci->i_nr_by_mode[mode])
830 want |= ceph_caps_for_mode(mode);
831 return want;
832}
833
834/*
835 * Return caps we have registered with the MDS(s) as 'wanted'.
836 */
837int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
838{
839 struct ceph_cap *cap;
840 struct rb_node *p;
841 int mds_wanted = 0;
842
843 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
844 cap = rb_entry(p, struct ceph_cap, ci_node);
845 if (!__cap_is_valid(cap))
846 continue;
847 mds_wanted |= cap->mds_wanted;
848 }
849 return mds_wanted;
850}
851
852/*
853 * called under i_lock
854 */
855static int __ceph_is_any_caps(struct ceph_inode_info *ci)
856{
857 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
858}
859
860/*
861 * caller should hold i_lock.
862 * caller will not hold session s_mutex if called from destroy_inode.
863 */
864void __ceph_remove_cap(struct ceph_cap *cap)
865{
866 struct ceph_mds_session *session = cap->session;
867 struct ceph_inode_info *ci = cap->ci;
868 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
869
870 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
871
872 /* remove from inode list */
873 rb_erase(&cap->ci_node, &ci->i_caps);
874 cap->ci = NULL;
875 if (ci->i_auth_cap == cap)
876 ci->i_auth_cap = NULL;
877
878 /* remove from session list */
879 spin_lock(&session->s_cap_lock);
880 if (session->s_cap_iterator == cap) {
881 /* not yet, we are iterating over this very cap */
882 dout("__ceph_remove_cap delaying %p removal from session %p\n",
883 cap, cap->session);
884 } else {
885 list_del_init(&cap->session_caps);
886 session->s_nr_caps--;
887 cap->session = NULL;
888 }
889 spin_unlock(&session->s_cap_lock);
890
891 if (cap->session == NULL)
892 ceph_put_cap(cap);
893
894 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
895 struct ceph_snap_realm *realm = ci->i_snap_realm;
896 spin_lock(&realm->inodes_with_caps_lock);
897 list_del_init(&ci->i_snap_realm_item);
898 ci->i_snap_realm_counter++;
899 ci->i_snap_realm = NULL;
900 spin_unlock(&realm->inodes_with_caps_lock);
901 ceph_put_snap_realm(mdsc, realm);
902 }
903 if (!__ceph_is_any_real_caps(ci))
904 __cap_delay_cancel(mdsc, ci);
905}
906
907/*
908 * Build and send a cap message to the given MDS.
909 *
910 * Caller should be holding s_mutex.
911 */
912static int send_cap_msg(struct ceph_mds_session *session,
913 u64 ino, u64 cid, int op,
914 int caps, int wanted, int dirty,
915 u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
916 u64 size, u64 max_size,
917 struct timespec *mtime, struct timespec *atime,
918 u64 time_warp_seq,
919 uid_t uid, gid_t gid, mode_t mode,
920 u64 xattr_version,
921 struct ceph_buffer *xattrs_buf,
922 u64 follows)
923{
924 struct ceph_mds_caps *fc;
925 struct ceph_msg *msg;
926
927 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
928 " seq %u/%u mseq %u follows %lld size %llu/%llu"
929 " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
930 cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
931 ceph_cap_string(dirty),
932 seq, issue_seq, mseq, follows, size, max_size,
933 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
934
935 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
936 if (IS_ERR(msg))
937 return PTR_ERR(msg);
938
939 msg->hdr.tid = cpu_to_le64(flush_tid);
940
941 fc = msg->front.iov_base;
942 memset(fc, 0, sizeof(*fc));
943
944 fc->cap_id = cpu_to_le64(cid);
945 fc->op = cpu_to_le32(op);
946 fc->seq = cpu_to_le32(seq);
947 fc->issue_seq = cpu_to_le32(issue_seq);
948 fc->migrate_seq = cpu_to_le32(mseq);
949 fc->caps = cpu_to_le32(caps);
950 fc->wanted = cpu_to_le32(wanted);
951 fc->dirty = cpu_to_le32(dirty);
952 fc->ino = cpu_to_le64(ino);
953 fc->snap_follows = cpu_to_le64(follows);
954
955 fc->size = cpu_to_le64(size);
956 fc->max_size = cpu_to_le64(max_size);
957 if (mtime)
958 ceph_encode_timespec(&fc->mtime, mtime);
959 if (atime)
960 ceph_encode_timespec(&fc->atime, atime);
961 fc->time_warp_seq = cpu_to_le32(time_warp_seq);
962
963 fc->uid = cpu_to_le32(uid);
964 fc->gid = cpu_to_le32(gid);
965 fc->mode = cpu_to_le32(mode);
966
967 fc->xattr_version = cpu_to_le64(xattr_version);
968 if (xattrs_buf) {
969 msg->middle = ceph_buffer_get(xattrs_buf);
970 fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
971 msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
972 }
973
974 ceph_con_send(&session->s_con, msg);
975 return 0;
976}
977
978/*
979 * Queue cap releases when an inode is dropped from our cache. Since
980 * the inode is about to be destroyed, there is no need for i_lock.
981 */
982void ceph_queue_caps_release(struct inode *inode)
983{
984 struct ceph_inode_info *ci = ceph_inode(inode);
985 struct rb_node *p;
986
987 p = rb_first(&ci->i_caps);
988 while (p) {
989 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
990 struct ceph_mds_session *session = cap->session;
991 struct ceph_msg *msg;
992 struct ceph_mds_cap_release *head;
993 struct ceph_mds_cap_item *item;
994
995 spin_lock(&session->s_cap_lock);
996 BUG_ON(!session->s_num_cap_releases);
997 msg = list_first_entry(&session->s_cap_releases,
998 struct ceph_msg, list_head);
999
1000 dout(" adding %p release to mds%d msg %p (%d left)\n",
1001 inode, session->s_mds, msg, session->s_num_cap_releases);
1002
1003 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1004 head = msg->front.iov_base;
1005 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1006 item = msg->front.iov_base + msg->front.iov_len;
1007 item->ino = cpu_to_le64(ceph_ino(inode));
1008 item->cap_id = cpu_to_le64(cap->cap_id);
1009 item->migrate_seq = cpu_to_le32(cap->mseq);
1010 item->seq = cpu_to_le32(cap->issue_seq);
1011
1012 session->s_num_cap_releases--;
1013
1014 msg->front.iov_len += sizeof(*item);
1015 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1016 dout(" release msg %p full\n", msg);
1017 list_move_tail(&msg->list_head,
1018 &session->s_cap_releases_done);
1019 } else {
1020 dout(" release msg %p at %d/%d (%d)\n", msg,
1021 (int)le32_to_cpu(head->num),
1022 (int)CEPH_CAPS_PER_RELEASE,
1023 (int)msg->front.iov_len);
1024 }
1025 spin_unlock(&session->s_cap_lock);
1026 p = rb_next(p);
1027 __ceph_remove_cap(cap);
1028 }
1029}
1030
1031/*
1032 * Send a cap msg on the given inode. Update our caps state, then
1033 * drop i_lock and send the message.
1034 *
1035 * Make note of max_size reported/requested from mds, revoked caps
1036 * that have now been implemented.
1037 *
1038 * Make a half-hearted attempt to invalidate the page cache if we are
1039 * dropping RDCACHE. Note that this will leave behind locked pages
1040 * that we'll then need to deal with elsewhere.
1041 *
1042 * Return non-zero if delayed release, or we experienced an error
1043 * such that the caller should requeue + retry later.
1044 *
1045 * called with i_lock, then drops it.
1046 * caller should hold snap_rwsem (read), s_mutex.
1047 */
1048static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1049 int op, int used, int want, int retain, int flushing,
1050 unsigned *pflush_tid)
1051 __releases(cap->ci->vfs_inode->i_lock)
1052{
1053 struct ceph_inode_info *ci = cap->ci;
1054 struct inode *inode = &ci->vfs_inode;
1055 u64 cap_id = cap->cap_id;
1056 int held, revoking, dropping, keep;
1057 u64 seq, issue_seq, mseq, time_warp_seq, follows;
1058 u64 size, max_size;
1059 struct timespec mtime, atime;
1060 int wake = 0;
1061 mode_t mode;
1062 uid_t uid;
1063 gid_t gid;
1064 struct ceph_mds_session *session;
1065 u64 xattr_version = 0;
1066 int delayed = 0;
1067 u64 flush_tid = 0;
1068 int i;
1069 int ret;
1070
1071 held = cap->issued | cap->implemented;
1072 revoking = cap->implemented & ~cap->issued;
1073 retain &= ~revoking;
1074 dropping = cap->issued & ~retain;
1075
1076 dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
1077 inode, cap, cap->session,
1078 ceph_cap_string(held), ceph_cap_string(held & retain),
1079 ceph_cap_string(revoking));
1080 BUG_ON((retain & CEPH_CAP_PIN) == 0);
1081
1082 session = cap->session;
1083
1084 /* don't release wanted unless we've waited a bit. */
1085 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1086 time_before(jiffies, ci->i_hold_caps_min)) {
1087 dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
1088 ceph_cap_string(cap->issued),
1089 ceph_cap_string(cap->issued & retain),
1090 ceph_cap_string(cap->mds_wanted),
1091 ceph_cap_string(want));
1092 want |= cap->mds_wanted;
1093 retain |= cap->issued;
1094 delayed = 1;
1095 }
1096 ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
1097
1098 cap->issued &= retain; /* drop bits we don't want */
1099 if (cap->implemented & ~cap->issued) {
1100 /*
1101 * Wake up any waiters on wanted -> needed transition.
1102 * This is due to the weird transition from buffered
1103 * to sync IO... we need to flush dirty pages _before_
1104 * allowing sync writes to avoid reordering.
1105 */
1106 wake = 1;
1107 }
1108 cap->implemented &= cap->issued | used;
1109 cap->mds_wanted = want;
1110
1111 if (flushing) {
1112 /*
1113 * assign a tid for flush operations so we can avoid
1114 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
1115 * clean type races. track latest tid for every bit
1116 * so we can handle flush AxFw, flush Fw, and have the
1117 * first ack clean Ax.
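		 * (e.g. flush Ax+Fw as tid 5, then flush Fw again as
		 * tid 6: the ack for tid 5 may clean Ax, but not Fw,
		 * whose latest tid is now 6.)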
1118 */
1119 flush_tid = ++ci->i_cap_flush_last_tid;
1120 if (pflush_tid)
1121 *pflush_tid = flush_tid;
1122 dout(" cap_flush_tid %d\n", (int)flush_tid);
1123 for (i = 0; i < CEPH_CAP_BITS; i++)
1124 if (flushing & (1 << i))
1125 ci->i_cap_flush_tid[i] = flush_tid;
1126 }
1127
1128 keep = cap->implemented;
1129 seq = cap->seq;
1130 issue_seq = cap->issue_seq;
1131 mseq = cap->mseq;
1132 size = inode->i_size;
1133 ci->i_reported_size = size;
1134 max_size = ci->i_wanted_max_size;
1135 ci->i_requested_max_size = max_size;
1136 mtime = inode->i_mtime;
1137 atime = inode->i_atime;
1138 time_warp_seq = ci->i_time_warp_seq;
1139 follows = ci->i_snap_realm->cached_context->seq;
1140 uid = inode->i_uid;
1141 gid = inode->i_gid;
1142 mode = inode->i_mode;
1143
1144 if (dropping & CEPH_CAP_XATTR_EXCL) {
1145 __ceph_build_xattrs_blob(ci);
1146 xattr_version = ci->i_xattrs.version + 1;
1147 }
1148
1149 spin_unlock(&inode->i_lock);
1150
1151 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1152 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
1153 size, max_size, &mtime, &atime, time_warp_seq,
1154 uid, gid, mode,
1155 xattr_version,
1156 (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
1157 follows);
1158 if (ret < 0) {
1159 dout("error sending cap msg, must requeue %p\n", inode);
1160 delayed = 1;
1161 }
1162
1163 if (wake)
1164 wake_up(&ci->i_cap_wq);
1165
1166 return delayed;
1167}
1168
1169/*
1170 * When a snapshot is taken, clients accumulate dirty metadata on
1171 * inodes with capabilities in ceph_cap_snaps to describe the file
1172 * state at the time the snapshot was taken. This must be flushed
1173 * asynchronously back to the MDS once sync writes complete and dirty
1174 * data is written out.
1175 *
1176 * Called under i_lock. Takes s_mutex as needed.
1177 */
1178void __ceph_flush_snaps(struct ceph_inode_info *ci,
1179 struct ceph_mds_session **psession)
1180{
1181 struct inode *inode = &ci->vfs_inode;
1182 int mds;
1183 struct ceph_cap_snap *capsnap;
1184 u32 mseq;
1185 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1186 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
1187 session->s_mutex */
1188 u64 next_follows = 0; /* keep track of how far we've gotten through the
1189 i_cap_snaps list, and skip these entries next time
1190 around to avoid an infinite loop */
1191
1192 if (psession)
1193 session = *psession;
1194
1195 dout("__flush_snaps %p\n", inode);
1196retry:
1197 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
1198		/* avoid an infinite loop after retry */
1199 if (capsnap->follows < next_follows)
1200 continue;
1201 /*
1202 * we need to wait for sync writes to complete and for dirty
1203 * pages to be written out.
1204 */
1205 if (capsnap->dirty_pages || capsnap->writing)
1206 continue;
1207
1208 /* pick mds, take s_mutex */
1209 mds = __ceph_get_cap_mds(ci, &mseq);
1210 if (session && session->s_mds != mds) {
1211 dout("oops, wrong session %p mutex\n", session);
1212 mutex_unlock(&session->s_mutex);
1213 ceph_put_mds_session(session);
1214 session = NULL;
1215 }
1216 if (!session) {
1217 spin_unlock(&inode->i_lock);
1218 mutex_lock(&mdsc->mutex);
1219 session = __ceph_lookup_mds_session(mdsc, mds);
1220 mutex_unlock(&mdsc->mutex);
1221 if (session) {
1222 dout("inverting session/ino locks on %p\n",
1223 session);
1224 mutex_lock(&session->s_mutex);
1225 }
1226 /*
1227 * if session == NULL, we raced against a cap
1228 * deletion. retry, and we'll get a better
1229 * @mds value next time.
1230 */
1231 spin_lock(&inode->i_lock);
1232 goto retry;
1233 }
1234
1235 capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
1236 atomic_inc(&capsnap->nref);
1237 if (!list_empty(&capsnap->flushing_item))
1238 list_del_init(&capsnap->flushing_item);
1239 list_add_tail(&capsnap->flushing_item,
1240 &session->s_cap_snaps_flushing);
1241 spin_unlock(&inode->i_lock);
1242
1243 dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
1244 inode, capsnap, next_follows, capsnap->size);
1245 send_cap_msg(session, ceph_vino(inode).ino, 0,
1246 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
1247 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
1248 capsnap->size, 0,
1249 &capsnap->mtime, &capsnap->atime,
1250 capsnap->time_warp_seq,
1251 capsnap->uid, capsnap->gid, capsnap->mode,
1252 0, NULL,
1253 capsnap->follows);
1254
1255 next_follows = capsnap->follows + 1;
1256 ceph_put_cap_snap(capsnap);
1257
1258 spin_lock(&inode->i_lock);
1259 goto retry;
1260 }
1261
1262 /* we flushed them all; remove this inode from the queue */
1263 spin_lock(&mdsc->snap_flush_lock);
1264 list_del_init(&ci->i_snap_flush_item);
1265 spin_unlock(&mdsc->snap_flush_lock);
1266
1267 if (psession)
1268 *psession = session;
1269 else if (session) {
1270 mutex_unlock(&session->s_mutex);
1271 ceph_put_mds_session(session);
1272 }
1273}
1274
1275static void ceph_flush_snaps(struct ceph_inode_info *ci)
1276{
1277 struct inode *inode = &ci->vfs_inode;
1278
1279 spin_lock(&inode->i_lock);
1280 __ceph_flush_snaps(ci, NULL);
1281 spin_unlock(&inode->i_lock);
1282}
1283
1284/*
1285 * Mark caps dirty. If inode is newly dirty, add to the global dirty
1286 * list.
1287 */
1288void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1289{
1290 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
1291 struct inode *inode = &ci->vfs_inode;
1292 int was = ci->i_dirty_caps;
1293 int dirty = 0;
1294
1295 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
1296 ceph_cap_string(mask), ceph_cap_string(was),
1297 ceph_cap_string(was | mask));
1298 ci->i_dirty_caps |= mask;
1299 if (was == 0) {
1300 dout(" inode %p now dirty\n", &ci->vfs_inode);
1301 BUG_ON(!list_empty(&ci->i_dirty_item));
1302 spin_lock(&mdsc->cap_dirty_lock);
1303 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1304 spin_unlock(&mdsc->cap_dirty_lock);
1305 if (ci->i_flushing_caps == 0) {
1306 igrab(inode);
1307 dirty |= I_DIRTY_SYNC;
1308 }
1309 }
1310 BUG_ON(list_empty(&ci->i_dirty_item));
1311 if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
1312 (mask & CEPH_CAP_FILE_BUFFER))
1313 dirty |= I_DIRTY_DATASYNC;
1314 if (dirty)
1315 __mark_inode_dirty(inode, dirty);
1316 __cap_delay_requeue(mdsc, ci);
1317}
1318
1319/*
1320 * Add dirty inode to the flushing list. Assign it a seq number so we
1321 * can wait for caps to flush without starving.
1322 *
1323 * Called under i_lock.
1324 */
1325static int __mark_caps_flushing(struct inode *inode,
1326 struct ceph_mds_session *session)
1327{
1328 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1329 struct ceph_inode_info *ci = ceph_inode(inode);
1330 int flushing;
1331
1332 BUG_ON(ci->i_dirty_caps == 0);
1333 BUG_ON(list_empty(&ci->i_dirty_item));
1334
1335 flushing = ci->i_dirty_caps;
1336 dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
1337 ceph_cap_string(flushing),
1338 ceph_cap_string(ci->i_flushing_caps),
1339 ceph_cap_string(ci->i_flushing_caps | flushing));
1340 ci->i_flushing_caps |= flushing;
1341 ci->i_dirty_caps = 0;
1342 dout(" inode %p now !dirty\n", inode);
1343
1344 spin_lock(&mdsc->cap_dirty_lock);
1345 list_del_init(&ci->i_dirty_item);
1346
1347 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
1348 if (list_empty(&ci->i_flushing_item)) {
1349 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1350 mdsc->num_cap_flushing++;
1351 dout(" inode %p now flushing seq %lld\n", inode,
1352 ci->i_cap_flush_seq);
1353 } else {
1354 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1355 dout(" inode %p now flushing (more) seq %lld\n", inode,
1356 ci->i_cap_flush_seq);
1357 }
1358 spin_unlock(&mdsc->cap_dirty_lock);
1359
1360 return flushing;
1361}
1362
1363/*
1364 * try to invalidate mapping pages without blocking.
1365 */
1366static int mapping_is_empty(struct address_space *mapping)
1367{
1368 struct page *page = find_get_page(mapping, 0);
1369
1370 if (!page)
1371 return 1;
1372
1373 put_page(page);
1374 return 0;
1375}
1376
1377static int try_nonblocking_invalidate(struct inode *inode)
1378{
1379 struct ceph_inode_info *ci = ceph_inode(inode);
1380 u32 invalidating_gen = ci->i_rdcache_gen;
1381
1382 spin_unlock(&inode->i_lock);
1383 invalidate_mapping_pages(&inode->i_data, 0, -1);
1384 spin_lock(&inode->i_lock);
1385
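	/*
	 * The invalidate only counts as a success if the mapping is
	 * empty AND i_rdcache_gen was not bumped while i_lock was
	 * dropped; a bumped gen means FILE_CACHE was re-issued and
	 * pages may have been re-added behind our back.
	 */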
1386 if (mapping_is_empty(&inode->i_data) &&
1387 invalidating_gen == ci->i_rdcache_gen) {
1388 /* success. */
1389 dout("try_nonblocking_invalidate %p success\n", inode);
1390 ci->i_rdcache_gen = 0;
1391 ci->i_rdcache_revoking = 0;
1392 return 0;
1393 }
1394 dout("try_nonblocking_invalidate %p failed\n", inode);
1395 return -1;
1396}
1397
1398/*
1399 * Swiss army knife function to examine currently used and wanted
1400 * versus held caps. Release, flush, ack revoked caps to mds as
1401 * appropriate.
1402 *
1403 * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
1404 * cap release further.
1405 * CHECK_CAPS_AUTHONLY - we should only check the auth cap
1406 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
1407 * further delay.
1408 */
1409void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1410 struct ceph_mds_session *session)
1411 __releases(session->s_mutex)
1412{
1413 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
1414 struct ceph_mds_client *mdsc = &client->mdsc;
1415 struct inode *inode = &ci->vfs_inode;
1416 struct ceph_cap *cap;
1417 int file_wanted, used;
1418 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
1419 int issued, implemented, want, retain, revoking, flushing = 0;
1420 int mds = -1; /* keep track of how far we've gone through i_caps list
1421 to avoid an infinite loop on retry */
1422 struct rb_node *p;
1423 int tried_invalidate = 0;
1424 int delayed = 0, sent = 0, force_requeue = 0, num;
1425 int queue_invalidate = 0;
1426 int is_delayed = flags & CHECK_CAPS_NODELAY;
1427
1428 /* if we are unmounting, flush any unused caps immediately. */
1429 if (mdsc->stopping)
1430 is_delayed = 1;
1431
1432 spin_lock(&inode->i_lock);
1433
1434 if (ci->i_ceph_flags & CEPH_I_FLUSH)
1435 flags |= CHECK_CAPS_FLUSH;
1436
1437 /* flush snaps first time around only */
1438 if (!list_empty(&ci->i_cap_snaps))
1439 __ceph_flush_snaps(ci, &session);
1440 goto retry_locked;
1441retry:
1442 spin_lock(&inode->i_lock);
1443retry_locked:
1444 file_wanted = __ceph_caps_file_wanted(ci);
1445 used = __ceph_caps_used(ci);
1446 want = file_wanted | used;
1447 issued = __ceph_caps_issued(ci, &implemented);
1448 revoking = implemented & ~issued;
1449
1450 retain = want | CEPH_CAP_PIN;
1451 if (!mdsc->stopping && inode->i_nlink > 0) {
1452 if (want) {
1453 retain |= CEPH_CAP_ANY; /* be greedy */
1454 } else {
1455 retain |= CEPH_CAP_ANY_SHARED;
1456 /*
1457 * keep RD only if we didn't have the file open RW,
1458 * because then the mds would revoke it anyway to
1459 * journal max_size=0.
1460 */
1461 if (ci->i_max_size == 0)
1462 retain |= CEPH_CAP_ANY_RD;
1463 }
1464 }
1465
1466 dout("check_caps %p file_want %s used %s dirty %s flushing %s"
1467 " issued %s revoking %s retain %s %s%s%s\n", inode,
1468 ceph_cap_string(file_wanted),
1469 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
1470 ceph_cap_string(ci->i_flushing_caps),
1471 ceph_cap_string(issued), ceph_cap_string(revoking),
1472 ceph_cap_string(retain),
1473 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
1474 (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
1475 (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
1476
1477 /*
1478	 * If we no longer need to hold on to our old caps, and we may
1479 * have cached pages, but don't want them, then try to invalidate.
1480 * If we fail, it's because pages are locked.... try again later.
1481 */
1482 if ((!is_delayed || mdsc->stopping) &&
1483 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1484 ci->i_rdcache_gen && /* may have cached pages */
1485 (file_wanted == 0 || /* no open files */
1486 (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */
1487 !tried_invalidate) {
1488 dout("check_caps trying to invalidate on %p\n", inode);
1489 if (try_nonblocking_invalidate(inode) < 0) {
1490 if (revoking & CEPH_CAP_FILE_CACHE) {
1491 dout("check_caps queuing invalidate\n");
1492 queue_invalidate = 1;
1493 ci->i_rdcache_revoking = ci->i_rdcache_gen;
1494 } else {
1495 dout("check_caps failed to invalidate pages\n");
1496 /* we failed to invalidate pages. check these
1497 caps again later. */
1498 force_requeue = 1;
1499 __cap_set_timeouts(mdsc, ci);
1500 }
1501 }
1502 tried_invalidate = 1;
1503 goto retry_locked;
1504 }
1505
1506 num = 0;
1507 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
1508 cap = rb_entry(p, struct ceph_cap, ci_node);
1509 num++;
1510
1511 /* avoid looping forever */
1512 if (mds >= cap->mds ||
1513 ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
1514 continue;
1515
1516 /* NOTE: no side-effects allowed, until we take s_mutex */
1517
1518 revoking = cap->implemented & ~cap->issued;
1519 if (revoking)
1520 dout(" mds%d revoking %s\n", cap->mds,
1521 ceph_cap_string(revoking));
1522
1523 if (cap == ci->i_auth_cap &&
1524 (cap->issued & CEPH_CAP_FILE_WR)) {
1525 /* request larger max_size from MDS? */
1526 if (ci->i_wanted_max_size > ci->i_max_size &&
1527 ci->i_wanted_max_size > ci->i_requested_max_size) {
1528 dout("requesting new max_size\n");
1529 goto ack;
1530 }
1531
1532 /* approaching file_max? */
1533 if ((inode->i_size << 1) >= ci->i_max_size &&
1534 (ci->i_reported_size << 1) < ci->i_max_size) {
1535 dout("i_size approaching max_size\n");
1536 goto ack;
1537 }
1538 }
1539 /* flush anything dirty? */
1540 if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
1541 ci->i_dirty_caps) {
1542 dout("flushing dirty caps\n");
1543 goto ack;
1544 }
1545
1546 /* completed revocation? going down and there are no caps? */
1547 if (revoking && (revoking & used) == 0) {
1548 dout("completed revocation of %s\n",
1549 ceph_cap_string(cap->implemented & ~cap->issued));
1550 goto ack;
1551 }
1552
1553 /* want more caps from mds? */
1554 if (want & ~(cap->mds_wanted | cap->issued))
1555 goto ack;
1556
1557 /* things we might delay */
1558 if ((cap->issued & ~retain) == 0 &&
1559 cap->mds_wanted == want)
1560 continue; /* nope, all good */
1561
1562 if (is_delayed)
1563 goto ack;
1564
1565 /* delay? */
1566 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1567 time_before(jiffies, ci->i_hold_caps_max)) {
1568 dout(" delaying issued %s -> %s, wanted %s -> %s\n",
1569 ceph_cap_string(cap->issued),
1570 ceph_cap_string(cap->issued & retain),
1571 ceph_cap_string(cap->mds_wanted),
1572 ceph_cap_string(want));
1573 delayed++;
1574 continue;
1575 }
1576
1577ack:
1578 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1579 dout(" skipping %p I_NOFLUSH set\n", inode);
1580 continue;
1581 }
1582
1583 if (session && session != cap->session) {
1584 dout("oops, wrong session %p mutex\n", session);
1585 mutex_unlock(&session->s_mutex);
1586 session = NULL;
1587 }
1588 if (!session) {
1589 session = cap->session;
1590 if (mutex_trylock(&session->s_mutex) == 0) {
1591 dout("inverting session/ino locks on %p\n",
1592 session);
1593 spin_unlock(&inode->i_lock);
1594 if (took_snap_rwsem) {
1595 up_read(&mdsc->snap_rwsem);
1596 took_snap_rwsem = 0;
1597 }
1598 mutex_lock(&session->s_mutex);
1599 goto retry;
1600 }
1601 }
1602 /* take snap_rwsem after session mutex */
1603 if (!took_snap_rwsem) {
1604 if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
1605 dout("inverting snap/in locks on %p\n",
1606 inode);
1607 spin_unlock(&inode->i_lock);
1608 down_read(&mdsc->snap_rwsem);
1609 took_snap_rwsem = 1;
1610 goto retry;
1611 }
1612 took_snap_rwsem = 1;
1613 }
1614
1615 if (cap == ci->i_auth_cap && ci->i_dirty_caps)
1616 flushing = __mark_caps_flushing(inode, session);
1617
1618 mds = cap->mds; /* remember mds, so we don't repeat */
1619 sent++;
1620
1621 /* __send_cap drops i_lock */
1622 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
1623 retain, flushing, NULL);
1624 goto retry; /* retake i_lock and restart our cap scan. */
1625 }
1626
1627 /*
1628 * Reschedule delayed caps release if we delayed anything,
1629 * otherwise cancel.
1630 */
1631 if (delayed && is_delayed)
1632 force_requeue = 1; /* __send_cap delayed release; requeue */
1633 if (!delayed && !is_delayed)
1634 __cap_delay_cancel(mdsc, ci);
1635 else if (!is_delayed || force_requeue)
1636 __cap_delay_requeue(mdsc, ci);
1637
1638 spin_unlock(&inode->i_lock);
1639
1640 if (queue_invalidate)
1641 ceph_queue_invalidate(inode);
1642
1643 if (session)
1644 mutex_unlock(&session->s_mutex);
1645 if (took_snap_rwsem)
1646 up_read(&mdsc->snap_rwsem);
1647}
1648
1649/*
1650 * Try to flush dirty caps back to the auth mds.
1651 */
1652static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1653 unsigned *flush_tid)
1654{
1655 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1656 struct ceph_inode_info *ci = ceph_inode(inode);
1657 int unlock_session = session ? 0 : 1;
1658 int flushing = 0;
1659
1660retry:
1661 spin_lock(&inode->i_lock);
1662 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1663 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
1664 goto out;
1665 }
1666 if (ci->i_dirty_caps && ci->i_auth_cap) {
1667 struct ceph_cap *cap = ci->i_auth_cap;
1668 int used = __ceph_caps_used(ci);
1669 int want = __ceph_caps_wanted(ci);
1670 int delayed;
1671
1672 if (!session) {
1673 spin_unlock(&inode->i_lock);
1674 session = cap->session;
1675 mutex_lock(&session->s_mutex);
1676 goto retry;
1677 }
1678 BUG_ON(session != cap->session);
1679 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
1680 goto out;
1681
1682 flushing = __mark_caps_flushing(inode, session);
1683
1684 /* __send_cap drops i_lock */
1685 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
1686 cap->issued | cap->implemented, flushing,
1687 flush_tid);
1688 if (!delayed)
1689 goto out_unlocked;
1690
1691 spin_lock(&inode->i_lock);
1692 __cap_delay_requeue(mdsc, ci);
1693 }
1694out:
1695 spin_unlock(&inode->i_lock);
1696out_unlocked:
1697 if (session && unlock_session)
1698 mutex_unlock(&session->s_mutex);
1699 return flushing;
1700}
1701
1702/*
1703 * Return true if we've flushed caps through the given flush_tid.
1704 */
1705static int caps_are_flushed(struct inode *inode, unsigned tid)
1706{
1707 struct ceph_inode_info *ci = ceph_inode(inode);
1708 int dirty, i, ret = 1;
1709
1710 spin_lock(&inode->i_lock);
1711 dirty = __ceph_caps_dirty(ci);
1712 for (i = 0; i < CEPH_CAP_BITS; i++)
1713 if ((ci->i_flushing_caps & (1 << i)) &&
1714 ci->i_cap_flush_tid[i] <= tid) {
1715 /* still flushing this bit */
1716 ret = 0;
1717 break;
1718 }
1719 spin_unlock(&inode->i_lock);
1720 return ret;
1721}
1722
1723/*
1724 * Wait on any unsafe replies for the given inode. First wait on the
1725 * newest request, and make that the upper bound. Then, if there are
1726 * more requests, keep waiting on the oldest as long as it is still older
1727 * than the original request.
1728 */
1729static void sync_write_wait(struct inode *inode)
1730{
1731 struct ceph_inode_info *ci = ceph_inode(inode);
1732 struct list_head *head = &ci->i_unsafe_writes;
1733 struct ceph_osd_request *req;
1734 u64 last_tid;
1735
1736 spin_lock(&ci->i_unsafe_lock);
1737 if (list_empty(head))
1738 goto out;
1739
1740 /* set upper bound as _last_ entry in chain */
1741 req = list_entry(head->prev, struct ceph_osd_request,
1742 r_unsafe_item);
1743 last_tid = req->r_tid;
1744
1745 do {
1746 ceph_osdc_get_request(req);
1747 spin_unlock(&ci->i_unsafe_lock);
1748 dout("sync_write_wait on tid %llu (until %llu)\n",
1749 req->r_tid, last_tid);
1750 wait_for_completion(&req->r_safe_completion);
1751 spin_lock(&ci->i_unsafe_lock);
1752 ceph_osdc_put_request(req);
1753
1754 /*
1755 * from here on look at first entry in chain, since we
1756 * only want to wait for anything older than last_tid
1757 */
1758 if (list_empty(head))
1759 break;
1760 req = list_entry(head->next, struct ceph_osd_request,
1761 r_unsafe_item);
1762 } while (req->r_tid < last_tid);
1763out:
1764 spin_unlock(&ci->i_unsafe_lock);
1765}
1766
1767int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
1768{
1769 struct inode *inode = dentry->d_inode;
1770 struct ceph_inode_info *ci = ceph_inode(inode);
1771 unsigned flush_tid;
1772 int ret;
1773 int dirty;
1774
1775 dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
1776 sync_write_wait(inode);
1777
1778 ret = filemap_write_and_wait(inode->i_mapping);
1779 if (ret < 0)
1780 return ret;
1781
1782 dirty = try_flush_caps(inode, NULL, &flush_tid);
1783 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
1784
1785 /*
1786 * only wait on non-file metadata writeback (the mds
1787 * can recover size and mtime, so we don't need to
1788 * wait for that)
1789 */
1790 if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
1791 dout("fsync waiting for flush_tid %u\n", flush_tid);
1792 ret = wait_event_interruptible(ci->i_cap_wq,
1793 caps_are_flushed(inode, flush_tid));
1794 }
1795
1796 dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
1797 return ret;
1798}
1799
1800/*
1801 * Flush any dirty caps back to the mds. If we aren't asked to wait,
1802 * queue inode for flush but don't do so immediately, because we can
1803 * get by with fewer MDS messages if we wait for data writeback to
1804 * complete first.
1805 */
1806int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1807{
1808 struct ceph_inode_info *ci = ceph_inode(inode);
1809 unsigned flush_tid;
1810 int err = 0;
1811 int dirty;
1812 int wait = wbc->sync_mode == WB_SYNC_ALL;
1813
1814 dout("write_inode %p wait=%d\n", inode, wait);
1815 if (wait) {
1816 dirty = try_flush_caps(inode, NULL, &flush_tid);
1817 if (dirty)
1818 err = wait_event_interruptible(ci->i_cap_wq,
1819 caps_are_flushed(inode, flush_tid));
1820 } else {
1821 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1822
1823 spin_lock(&inode->i_lock);
1824 if (__ceph_caps_dirty(ci))
1825 __cap_delay_requeue_front(mdsc, ci);
1826 spin_unlock(&inode->i_lock);
1827 }
1828 return err;
1829}
1830
1831/*
1832 * After a recovering MDS goes active, we need to resend any caps
1833 * we were flushing.
1834 *
1835 * Caller holds session->s_mutex.
1836 */
1837static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
1838 struct ceph_mds_session *session)
1839{
1840 struct ceph_cap_snap *capsnap;
1841
1842 dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
1843 list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
1844 flushing_item) {
1845 struct ceph_inode_info *ci = capsnap->ci;
1846 struct inode *inode = &ci->vfs_inode;
1847 struct ceph_cap *cap;
1848
1849 spin_lock(&inode->i_lock);
1850 cap = ci->i_auth_cap;
1851 if (cap && cap->session == session) {
1852 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
1853 cap, capsnap);
1854 __ceph_flush_snaps(ci, &session);
1855 } else {
1856 pr_err("%p auth cap %p not mds%d ???\n", inode,
1857 cap, session->s_mds);
1858 spin_unlock(&inode->i_lock);
1859 }
1860 }
1861}
1862
1863void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1864 struct ceph_mds_session *session)
1865{
1866 struct ceph_inode_info *ci;
1867
1868 kick_flushing_capsnaps(mdsc, session);
1869
1870 dout("kick_flushing_caps mds%d\n", session->s_mds);
1871 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
1872 struct inode *inode = &ci->vfs_inode;
1873 struct ceph_cap *cap;
1874 int delayed = 0;
1875
1876 spin_lock(&inode->i_lock);
1877 cap = ci->i_auth_cap;
1878 if (cap && cap->session == session) {
1879 dout("kick_flushing_caps %p cap %p %s\n", inode,
1880 cap, ceph_cap_string(ci->i_flushing_caps));
1881 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1882 __ceph_caps_used(ci),
1883 __ceph_caps_wanted(ci),
1884 cap->issued | cap->implemented,
1885 ci->i_flushing_caps, NULL);
1886 if (delayed) {
1887 spin_lock(&inode->i_lock);
1888 __cap_delay_requeue(mdsc, ci);
1889 spin_unlock(&inode->i_lock);
1890 }
1891 } else {
1892 pr_err("%p auth cap %p not mds%d ???\n", inode,
1893 cap, session->s_mds);
1894 spin_unlock(&inode->i_lock);
1895 }
1896 }
1897}
1898
1899
1900/*
1901 * Take references to capabilities we hold, so that we don't release
1902 * them to the MDS prematurely.
1903 *
1904 * Protected by i_lock.
1905 */
1906static void __take_cap_refs(struct ceph_inode_info *ci, int got)
1907{
1908 if (got & CEPH_CAP_PIN)
1909 ci->i_pin_ref++;
1910 if (got & CEPH_CAP_FILE_RD)
1911 ci->i_rd_ref++;
1912 if (got & CEPH_CAP_FILE_CACHE)
1913 ci->i_rdcache_ref++;
1914 if (got & CEPH_CAP_FILE_WR)
1915 ci->i_wr_ref++;
1916 if (got & CEPH_CAP_FILE_BUFFER) {
1917 if (ci->i_wrbuffer_ref == 0)
1918 igrab(&ci->vfs_inode);
1919 ci->i_wrbuffer_ref++;
1920 dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
1921 &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
1922 }
1923}
1924
1925/*
1926 * Try to grab cap references. Specify those refs we @want, and the
1927 * minimal set we @need. Also include the larger offset we are writing
1928 * to (when applicable), and check against max_size here as well.
1929 * Note that caller is responsible for ensuring max_size increases are
1930 * requested from the MDS.
1931 */
1932static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
1933 int *got, loff_t endoff, int *check_max, int *err)
1934{
1935 struct inode *inode = &ci->vfs_inode;
1936 int ret = 0;
1937 int have, implemented;
1938 int file_wanted;
1939
1940 dout("get_cap_refs %p need %s want %s\n", inode,
1941 ceph_cap_string(need), ceph_cap_string(want));
1942 spin_lock(&inode->i_lock);
1943
1944 /* make sure file is actually open */
1945 file_wanted = __ceph_caps_file_wanted(ci);
1946 if ((file_wanted & need) == 0) {
1947 dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
1948 ceph_cap_string(need), ceph_cap_string(file_wanted));
1949 *err = -EBADF;
1950 ret = 1;
1951 goto out;
1952 }
1953
1954 if (need & CEPH_CAP_FILE_WR) {
1955 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
1956 dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
1957 inode, endoff, ci->i_max_size);
1958 if (endoff > ci->i_wanted_max_size) {
1959 *check_max = 1;
1960 ret = 1;
1961 }
1962 goto out;
1963 }
1964 /*
1965 * If a sync write is in progress, we must wait, so that we
1966 * can get a final snapshot value for size+mtime.
1967 */
1968 if (__ceph_have_pending_cap_snap(ci)) {
1969 dout("get_cap_refs %p cap_snap_pending\n", inode);
1970 goto out;
1971 }
1972 }
1973 have = __ceph_caps_issued(ci, &implemented);
1974
1975 /*
1976 * disallow writes while a truncate is pending
1977 */
1978 if (ci->i_truncate_pending)
1979 have &= ~CEPH_CAP_FILE_WR;
1980
1981 if ((have & need) == need) {
1982 /*
1983 * Look at (implemented & ~have & not) so that we keep waiting
1984 * on transition from wanted -> needed caps. This is needed
1985 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
1986 * going before a prior buffered writeback happens.
1987 */
1988 int not = want & ~(have & need);
1989 int revoking = implemented & ~have;
1990 dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
1991 inode, ceph_cap_string(have), ceph_cap_string(not),
1992 ceph_cap_string(revoking));
1993 if ((revoking & not) == 0) {
1994 *got = need | (have & want);
1995 __take_cap_refs(ci, *got);
1996 ret = 1;
1997 }
1998 } else {
1999 dout("get_cap_refs %p have %s needed %s\n", inode,
2000 ceph_cap_string(have), ceph_cap_string(need));
2001 }
2002out:
2003 spin_unlock(&inode->i_lock);
2004 dout("get_cap_refs %p ret %d got %s\n", inode,
2005 ret, ceph_cap_string(*got));
2006 return ret;
2007}
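The grant test above reduces to a pure function over bitmasks: take references only when every needed bit is issued and none of the extra wanted bits is mid-revocation (implemented but no longer issued). A hedged standalone restatement, using toy cap values rather than the real CEPH_CAP_* encoding:

#include <stdio.h>

/* toy cap bits, standing in for the CEPH_CAP_FILE_* values */
#define RD    1
#define CACHE 2

static int can_take_refs(int have, int implemented, int need, int want,
			 int *got)
{
	int not, revoking;

	if ((have & need) != need)
		return 0;                       /* wait: needed cap missing */
	not = want & ~(have & need);            /* extra bits we would take */
	revoking = implemented & ~have;         /* bits mid-revocation */
	if (revoking & not)
		return 0;                       /* wait out the revoke */
	*got = need | (have & want);
	return 1;
}

int main(void)
{
	int got = 0;

	/* CACHE is implemented but no longer issued: a read that only
	 * needs RD still succeeds, one that also wants CACHE must wait */
	printf("%d\n", can_take_refs(RD, RD | CACHE, RD, RD, &got));         /* 1 */
	printf("%d\n", can_take_refs(RD, RD | CACHE, RD, RD | CACHE, &got)); /* 0 */
	return 0;
}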
2008
2009/*
2010 * Check the offset we are writing up to against our current
2011 * max_size. If necessary, tell the MDS we want to write to
2012 * a larger offset.
2013 */
2014static void check_max_size(struct inode *inode, loff_t endoff)
2015{
2016 struct ceph_inode_info *ci = ceph_inode(inode);
2017 int check = 0;
2018
2019 /* do we need to explicitly request a larger max_size? */
2020 spin_lock(&inode->i_lock);
2021 if ((endoff >= ci->i_max_size ||
2022 endoff > (inode->i_size << 1)) &&
2023 endoff > ci->i_wanted_max_size) {
2024 dout("write %p at large endoff %llu, req max_size\n",
2025 inode, endoff);
2026 ci->i_wanted_max_size = endoff;
2027 check = 1;
2028 }
2029 spin_unlock(&inode->i_lock);
2030 if (check)
2031 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2032}
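Stripped of the locking, the predicate is: ask the MDS when the write endpoint reaches max_size or more than doubles i_size, provided we have not already requested at least that much. A standalone sketch (identifiers ending in _demo are invented for illustration):

#include <stdio.h>

typedef long long loff_t_demo;   /* stand-in for loff_t */

/* should we ask the MDS for a larger max_size? mirrors check_max_size() */
static int need_max_size_request(loff_t_demo endoff, loff_t_demo max_size,
				 loff_t_demo i_size, loff_t_demo wanted_max)
{
	return (endoff >= max_size || endoff > (i_size << 1)) &&
		endoff > wanted_max;
}

int main(void)
{
	/* writing at 3 MB with a 4 MB max_size and a 1 MB file: the
	 * offset more than doubles i_size, so request early */
	printf("%d\n", need_max_size_request(3 << 20, 4 << 20,
					     1 << 20, 0));   /* 1 */
	/* small append well under max_size: no request needed */
	printf("%d\n", need_max_size_request(1 << 10, 4 << 20,
					     1 << 20, 0));   /* 0 */
	return 0;
}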
2033
2034/*
2035 * Wait for caps, and take cap references. If we can't get a WR cap
2036 * due to a small max_size, make sure we check_max_size (and possibly
2037 * ask the mds) so we don't get hung up indefinitely.
2038 */
2039int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
2040 loff_t endoff)
2041{
2042 int check_max, ret, err;
2043
2044retry:
2045 if (endoff > 0)
2046 check_max_size(&ci->vfs_inode, endoff);
2047 check_max = 0;
2048 err = 0;
2049 ret = wait_event_interruptible(ci->i_cap_wq,
2050 try_get_cap_refs(ci, need, want,
2051 got, endoff,
2052 &check_max, &err));
2053 if (err)
2054 ret = err;
2055 if (check_max)
2056 goto retry;
2057 return ret;
2058}
2059
2060/*
2061 * Take cap refs. Caller must already know we hold at least one ref
2062 * on the caps in question or we don't know this is safe.
2063 */
2064void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
2065{
2066 spin_lock(&ci->vfs_inode.i_lock);
2067 __take_cap_refs(ci, caps);
2068 spin_unlock(&ci->vfs_inode.i_lock);
2069}
2070
2071/*
2072 * Release cap refs.
2073 *
2074 * If we released the last ref on any given cap, call ceph_check_caps
2075 * to release (or schedule a release).
2076 *
2077 * If we are releasing a WR cap (from a sync write), finalize any affected
2078 * cap_snap, and wake up any waiters.
2079 */
2080void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2081{
2082 struct inode *inode = &ci->vfs_inode;
2083 int last = 0, put = 0, flushsnaps = 0, wake = 0;
2084 struct ceph_cap_snap *capsnap;
2085
2086 spin_lock(&inode->i_lock);
2087 if (had & CEPH_CAP_PIN)
2088 --ci->i_pin_ref;
2089 if (had & CEPH_CAP_FILE_RD)
2090 if (--ci->i_rd_ref == 0)
2091 last++;
2092 if (had & CEPH_CAP_FILE_CACHE)
2093 if (--ci->i_rdcache_ref == 0)
2094 last++;
2095 if (had & CEPH_CAP_FILE_BUFFER) {
2096 if (--ci->i_wrbuffer_ref == 0) {
2097 last++;
2098 put++;
2099 }
2100 dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
2101 inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
2102 }
2103 if (had & CEPH_CAP_FILE_WR)
2104 if (--ci->i_wr_ref == 0) {
2105 last++;
2106 if (!list_empty(&ci->i_cap_snaps)) {
2107 capsnap = list_first_entry(&ci->i_cap_snaps,
2108 struct ceph_cap_snap,
2109 ci_item);
2110 if (capsnap->writing) {
2111 capsnap->writing = 0;
2112 flushsnaps =
2113 __ceph_finish_cap_snap(ci,
2114 capsnap);
2115 wake = 1;
2116 }
2117 }
2118 }
2119 spin_unlock(&inode->i_lock);
2120
2121 dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
2122 last ? "last" : "");
2123
2124 if (last && !flushsnaps)
2125 ceph_check_caps(ci, 0, NULL);
2126 else if (flushsnaps)
2127 ceph_flush_snaps(ci);
2128 if (wake)
2129 wake_up(&ci->i_cap_wq);
2130 if (put)
2131 iput(inode);
2132}
2133
2134/*
2135 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
2136 * context. Adjust per-snap dirty page accounting as appropriate.
2137 * Once all dirty data for a cap_snap is flushed, flush snapped file
2138 * metadata back to the MDS. If we dropped the last ref, call
2139 * ceph_check_caps.
2140 */
2141void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2142 struct ceph_snap_context *snapc)
2143{
2144 struct inode *inode = &ci->vfs_inode;
2145 int last = 0;
2146 int last_snap = 0;
2147 int found = 0;
2148 struct ceph_cap_snap *capsnap = NULL;
2149
2150 spin_lock(&inode->i_lock);
2151 ci->i_wrbuffer_ref -= nr;
2152 last = !ci->i_wrbuffer_ref;
2153
2154 if (ci->i_head_snapc == snapc) {
2155 ci->i_wrbuffer_ref_head -= nr;
2156 if (!ci->i_wrbuffer_ref_head) {
2157 ceph_put_snap_context(ci->i_head_snapc);
2158 ci->i_head_snapc = NULL;
2159 }
2160 dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
2161 inode,
2162 ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
2163 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
2164 last ? " LAST" : "");
2165 } else {
2166 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2167 if (capsnap->context == snapc) {
2168 found = 1;
2169 capsnap->dirty_pages -= nr;
2170 last_snap = !capsnap->dirty_pages;
2171 break;
2172 }
2173 }
2174 BUG_ON(!found);
2175 dout("put_wrbuffer_cap_refs on %p cap_snap %p "
2176 " snap %lld %d/%d -> %d/%d %s%s\n",
2177 inode, capsnap, capsnap->context->seq,
2178 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
2179 ci->i_wrbuffer_ref, capsnap->dirty_pages,
2180 last ? " (wrbuffer last)" : "",
2181 last_snap ? " (capsnap last)" : "");
2182 }
2183
2184 spin_unlock(&inode->i_lock);
2185
2186 if (last) {
2187 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2188 iput(inode);
2189 } else if (last_snap) {
2190 ceph_flush_snaps(ci);
2191 wake_up(&ci->i_cap_wq);
2192 }
2193}
2194
2195/*
2196 * Handle a cap GRANT message from the MDS. (Note that a GRANT may
2197 * actually be a revocation if it specifies a smaller cap set.)
2198 *
2199 * caller holds s_mutex and i_lock, we drop both.
2200 *
2201 * check_caps outcome (the function itself returns void):
2202 * 0 - ok, nothing further to do
2203 * 1 - check_caps on auth cap only (writeback)
2204 * 2 - check_caps (ack revoke)
2205 */
2206static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2207 struct ceph_mds_session *session,
2208 struct ceph_cap *cap,
2209 struct ceph_buffer *xattr_buf)
2210 __releases(inode->i_lock)
2211 __releases(session->s_mutex)
2212{
2213 struct ceph_inode_info *ci = ceph_inode(inode);
2214 int mds = session->s_mds;
2215 int seq = le32_to_cpu(grant->seq);
2216 int newcaps = le32_to_cpu(grant->caps);
2217 int issued, implemented, used, wanted, dirty;
2218 u64 size = le64_to_cpu(grant->size);
2219 u64 max_size = le64_to_cpu(grant->max_size);
2220 struct timespec mtime, atime, ctime;
2221 int check_caps = 0;
2222 int wake = 0;
2223 int writeback = 0;
2224 int revoked_rdcache = 0;
2225 int queue_invalidate = 0;
2226
2227 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2228 inode, cap, mds, seq, ceph_cap_string(newcaps));
2229 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2230 inode->i_size);
2231
2232 /*
2233 * If CACHE is being revoked, and we have no dirty buffers,
2234 * try to invalidate (once). (If there are dirty buffers, we
2235 * will invalidate _after_ writeback.)
2236 */
2237 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
2238 !ci->i_wrbuffer_ref) {
2239 if (try_nonblocking_invalidate(inode) == 0) {
2240 revoked_rdcache = 1;
2241 } else {
2242 /* there were locked pages.. invalidate later
2243 in a separate thread. */
2244 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
2245 queue_invalidate = 1;
2246 ci->i_rdcache_revoking = ci->i_rdcache_gen;
2247 }
2248 }
2249 }
2250
2251 /* side effects now are allowed */
2252
2253 issued = __ceph_caps_issued(ci, &implemented);
2254 issued |= implemented | __ceph_caps_dirty(ci);
2255
2256 cap->cap_gen = session->s_cap_gen;
2257
2258 __check_cap_issue(ci, cap, newcaps);
2259
2260 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
2261 inode->i_mode = le32_to_cpu(grant->mode);
2262 inode->i_uid = le32_to_cpu(grant->uid);
2263 inode->i_gid = le32_to_cpu(grant->gid);
2264 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
2265 inode->i_uid, inode->i_gid);
2266 }
2267
2268 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
2269 inode->i_nlink = le32_to_cpu(grant->nlink);
2270
2271 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
2272 int len = le32_to_cpu(grant->xattr_len);
2273 u64 version = le64_to_cpu(grant->xattr_version);
2274
2275 if (version > ci->i_xattrs.version) {
2276 dout(" got new xattrs v%llu on %p len %d\n",
2277 version, inode, len);
2278 if (ci->i_xattrs.blob)
2279 ceph_buffer_put(ci->i_xattrs.blob);
2280 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
2281 ci->i_xattrs.version = version;
2282 }
2283 }
2284
2285 /* size/ctime/mtime/atime? */
2286 ceph_fill_file_size(inode, issued,
2287 le32_to_cpu(grant->truncate_seq),
2288 le64_to_cpu(grant->truncate_size), size);
2289 ceph_decode_timespec(&mtime, &grant->mtime);
2290 ceph_decode_timespec(&atime, &grant->atime);
2291 ceph_decode_timespec(&ctime, &grant->ctime);
2292 ceph_fill_file_time(inode, issued,
2293 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
2294 &atime);
2295
2296 /* max size increase? */
2297 if (max_size != ci->i_max_size) {
2298 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
2299 ci->i_max_size = max_size;
2300 if (max_size >= ci->i_wanted_max_size) {
2301 ci->i_wanted_max_size = 0; /* reset */
2302 ci->i_requested_max_size = 0;
2303 }
2304 wake = 1;
2305 }
2306
2307 /* check cap bits */
2308 wanted = __ceph_caps_wanted(ci);
2309 used = __ceph_caps_used(ci);
2310 dirty = __ceph_caps_dirty(ci);
2311 dout(" my wanted = %s, used = %s, dirty %s\n",
2312 ceph_cap_string(wanted),
2313 ceph_cap_string(used),
2314 ceph_cap_string(dirty));
2315 if (wanted != le32_to_cpu(grant->wanted)) {
2316 dout("mds wanted %s -> %s\n",
2317 ceph_cap_string(le32_to_cpu(grant->wanted)),
2318 ceph_cap_string(wanted));
2319 grant->wanted = cpu_to_le32(wanted);
2320 }
2321
2322 cap->seq = seq;
2323
2324 /* file layout may have changed */
2325 ci->i_layout = grant->layout;
2326
2327 /* revocation, grant, or no-op? */
2328 if (cap->issued & ~newcaps) {
2329 dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
2330 ceph_cap_string(newcaps));
2331 if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
2332 writeback = 1; /* will delay ack */
2333 else if (dirty & ~newcaps)
2334 check_caps = 1; /* initiate writeback in check_caps */
2335 else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
2336 revoked_rdcache)
2337 check_caps = 2; /* send revoke ack in check_caps */
2338 cap->issued = newcaps;
2339 cap->implemented |= newcaps;
2340 } else if (cap->issued == newcaps) {
2341 dout("caps unchanged: %s -> %s\n",
2342 ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
2343 } else {
2344 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
2345 ceph_cap_string(newcaps));
2346 cap->issued = newcaps;
2347 cap->implemented |= newcaps; /* add bits only, to
2348 * avoid stepping on a
2349 * pending revocation */
2350 wake = 1;
2351 }
2352 BUG_ON(cap->issued & ~cap->implemented);
2353
2354 spin_unlock(&inode->i_lock);
2355 if (writeback)
2356 /*
2357 * queue inode for writeback: we can't actually call
2358 * filemap_write_and_wait, etc. from message handler
2359 * context.
2360 */
2361 ceph_queue_writeback(inode);
2362 if (queue_invalidate)
2363 ceph_queue_invalidate(inode);
2364 if (wake)
2365 wake_up(&ci->i_cap_wq);
2366
2367 if (check_caps == 1)
2368 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
2369 session);
2370 else if (check_caps == 2)
2371 ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
2372 else
2373 mutex_unlock(&session->s_mutex);
2374}
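Condensed, the revocation branch picks one of three follow-ups: defer behind data writeback, trigger a metadata flush via check_caps, or ack immediately once nothing cached blocks it. A hypothetical userspace reduction (the enum, helper, and the two cap constants are illustrative toy values, not the kernel's encodings):

#include <stdio.h>

enum ack { ACK_NONE, ACK_WRITEBACK, ACK_CHECK_FLUSH, ACK_CHECK_REVOKE };

#define FILE_BUFFER 0x20   /* toy stand-in for CEPH_CAP_FILE_BUFFER */
#define FILE_CACHE  0x04   /* toy stand-in for CEPH_CAP_FILE_CACHE */

static enum ack classify_revoke(int issued, int newcaps, int used,
				int dirty, int revoked_rdcache)
{
	if (!(issued & ~newcaps))
		return ACK_NONE;                /* grant or no-op, not a revoke */
	if ((used & ~newcaps) & FILE_BUFFER)
		return ACK_WRITEBACK;           /* flush buffered data first */
	if (dirty & ~newcaps)
		return ACK_CHECK_FLUSH;         /* write back dirty metadata */
	if (!((used & ~newcaps) & FILE_CACHE) || revoked_rdcache)
		return ACK_CHECK_REVOKE;        /* can ack the revoke now */
	return ACK_NONE;                        /* wait for cache to drop */
}

int main(void)
{
	/* BUFFER is being revoked while in use: defer behind writeback */
	printf("%d\n", classify_revoke(FILE_BUFFER, 0, FILE_BUFFER, 0, 0)); /* 1 */
	return 0;
}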
2375
2376/*
2377 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
2378 * MDS has been safely committed.
2379 */
2380static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2381 struct ceph_mds_caps *m,
2382 struct ceph_mds_session *session,
2383 struct ceph_cap *cap)
2384 __releases(inode->i_lock)
2385{
2386 struct ceph_inode_info *ci = ceph_inode(inode);
2387 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
2388 unsigned seq = le32_to_cpu(m->seq);
2389 int dirty = le32_to_cpu(m->dirty);
2390 int cleaned = 0;
2391 int drop = 0;
2392 int i;
2393
2394 for (i = 0; i < CEPH_CAP_BITS; i++)
2395 if ((dirty & (1 << i)) &&
2396 flush_tid == ci->i_cap_flush_tid[i])
2397 cleaned |= 1 << i;
2398
2399 dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
2400 " flushing %s -> %s\n",
2401 inode, session->s_mds, seq, ceph_cap_string(dirty),
2402 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
2403 ceph_cap_string(ci->i_flushing_caps & ~cleaned));
2404
2405 if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
2406 goto out;
2407
2408 ci->i_flushing_caps &= ~cleaned;
2409
2410 spin_lock(&mdsc->cap_dirty_lock);
2411 if (ci->i_flushing_caps == 0) {
2412 list_del_init(&ci->i_flushing_item);
2413 if (!list_empty(&session->s_cap_flushing))
2414 dout(" mds%d still flushing cap on %p\n",
2415 session->s_mds,
2416 &list_entry(session->s_cap_flushing.next,
2417 struct ceph_inode_info,
2418 i_flushing_item)->vfs_inode);
2419 mdsc->num_cap_flushing--;
2420 wake_up(&mdsc->cap_flushing_wq);
2421 dout(" inode %p now !flushing\n", inode);
2422
2423 if (ci->i_dirty_caps == 0) {
2424 dout(" inode %p now clean\n", inode);
2425 BUG_ON(!list_empty(&ci->i_dirty_item));
2426 drop = 1;
2427 } else {
2428 BUG_ON(list_empty(&ci->i_dirty_item));
2429 }
2430 }
2431 spin_unlock(&mdsc->cap_dirty_lock);
2432 wake_up(&ci->i_cap_wq);
2433
2434out:
2435 spin_unlock(&inode->i_lock);
2436 if (drop)
2437 iput(inode);
2438}
2439
2440/*
2441 * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
2442 * throw away our cap_snap.
2443 *
2444 * Caller holds s_mutex.
2445 */
2446static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
2447 struct ceph_mds_caps *m,
2448 struct ceph_mds_session *session)
2449{
2450 struct ceph_inode_info *ci = ceph_inode(inode);
2451 u64 follows = le64_to_cpu(m->snap_follows);
2452 struct ceph_cap_snap *capsnap;
2453 int drop = 0;
2454
2455 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
2456 inode, ci, session->s_mds, follows);
2457
2458 spin_lock(&inode->i_lock);
2459 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2460 if (capsnap->follows == follows) {
2461 if (capsnap->flush_tid != flush_tid) {
2462 dout(" cap_snap %p follows %lld tid %lld !="
2463 " %lld\n", capsnap, follows,
2464 flush_tid, capsnap->flush_tid);
2465 break;
2466 }
2467 WARN_ON(capsnap->dirty_pages || capsnap->writing);
2468 dout(" removing cap_snap %p follows %lld\n",
2469 capsnap, follows);
2470 ceph_put_snap_context(capsnap->context);
2471 list_del(&capsnap->ci_item);
2472 list_del(&capsnap->flushing_item);
2473 ceph_put_cap_snap(capsnap);
2474 drop = 1;
2475 break;
2476 } else {
2477 dout(" skipping cap_snap %p follows %lld\n",
2478 capsnap, capsnap->follows);
2479 }
2480 }
2481 spin_unlock(&inode->i_lock);
2482 if (drop)
2483 iput(inode);
2484}
2485
2486/*
2487 * Handle TRUNC from MDS, indicating file truncation.
2488 *
2489 * caller holds s_mutex.
2490 */
2491static void handle_cap_trunc(struct inode *inode,
2492 struct ceph_mds_caps *trunc,
2493 struct ceph_mds_session *session)
2494 __releases(inode->i_lock)
2495{
2496 struct ceph_inode_info *ci = ceph_inode(inode);
2497 int mds = session->s_mds;
2498 int seq = le32_to_cpu(trunc->seq);
2499 u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
2500 u64 truncate_size = le64_to_cpu(trunc->truncate_size);
2501 u64 size = le64_to_cpu(trunc->size);
2502 int implemented = 0;
2503 int dirty = __ceph_caps_dirty(ci);
2504 int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
2505 int queue_trunc = 0;
2506
2507 issued |= implemented | dirty;
2508
2509 dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
2510 inode, mds, seq, truncate_size, truncate_seq);
2511 queue_trunc = ceph_fill_file_size(inode, issued,
2512 truncate_seq, truncate_size, size);
2513 spin_unlock(&inode->i_lock);
2514
2515 if (queue_trunc)
2516 ceph_queue_vmtruncate(inode);
2517}
2518
2519/*
2520 * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
2521 * different one. If this is the most recent migration we've seen (as
2522 * indicated by mseq), make note of the migrating cap bits for the
2523 * duration (until we see the corresponding IMPORT).
2524 *
2525 * caller holds s_mutex
2526 */
2527static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2528 struct ceph_mds_session *session)
2529{
2530 struct ceph_inode_info *ci = ceph_inode(inode);
2531 int mds = session->s_mds;
2532 unsigned mseq = le32_to_cpu(ex->migrate_seq);
2533 struct ceph_cap *cap = NULL, *t;
2534 struct rb_node *p;
2535 int remember = 1;
2536
2537 dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
2538 inode, ci, mds, mseq);
2539
2540 spin_lock(&inode->i_lock);
2541
2542 /* make sure we haven't seen a higher mseq */
2543 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
2544 t = rb_entry(p, struct ceph_cap, ci_node);
2545 if (ceph_seq_cmp(t->mseq, mseq) > 0) {
2546 dout(" higher mseq on cap from mds%d\n",
2547 t->session->s_mds);
2548 remember = 0;
2549 }
2550 if (t->session->s_mds == mds)
2551 cap = t;
2552 }
2553
2554 if (cap) {
2555 if (remember) {
2556 /* make note */
2557 ci->i_cap_exporting_mds = mds;
2558 ci->i_cap_exporting_mseq = mseq;
2559 ci->i_cap_exporting_issued = cap->issued;
2560 }
2561 __ceph_remove_cap(cap);
2562 }
2563 /* else, we already released it */
2564
2565 spin_unlock(&inode->i_lock);
2566}
2567
2568/*
2569 * Handle cap IMPORT. If there are temp bits from an older EXPORT,
2570 * clean them up.
2571 *
2572 * caller holds s_mutex.
2573 */
2574static void handle_cap_import(struct ceph_mds_client *mdsc,
2575 struct inode *inode, struct ceph_mds_caps *im,
2576 struct ceph_mds_session *session,
2577 void *snaptrace, int snaptrace_len)
2578{
2579 struct ceph_inode_info *ci = ceph_inode(inode);
2580 int mds = session->s_mds;
2581 unsigned issued = le32_to_cpu(im->caps);
2582 unsigned wanted = le32_to_cpu(im->wanted);
2583 unsigned seq = le32_to_cpu(im->seq);
2584 unsigned mseq = le32_to_cpu(im->migrate_seq);
2585 u64 realmino = le64_to_cpu(im->realm);
2586 u64 cap_id = le64_to_cpu(im->cap_id);
2587
2588 if (ci->i_cap_exporting_mds >= 0 &&
2589 ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
2590 dout("handle_cap_import inode %p ci %p mds%d mseq %d"
2591 " - cleared exporting from mds%d\n",
2592 inode, ci, mds, mseq,
2593 ci->i_cap_exporting_mds);
2594 ci->i_cap_exporting_issued = 0;
2595 ci->i_cap_exporting_mseq = 0;
2596 ci->i_cap_exporting_mds = -1;
2597 } else {
2598 dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
2599 inode, ci, mds, mseq);
2600 }
2601
2602 down_write(&mdsc->snap_rwsem);
2603 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
2604 false);
2605 downgrade_write(&mdsc->snap_rwsem);
2606 ceph_add_cap(inode, session, cap_id, -1,
2607 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2608 NULL /* no caps context */);
2609 try_flush_caps(inode, session, NULL);
2610 up_read(&mdsc->snap_rwsem);
2611}
2612
2613/*
2614 * Handle a caps message from the MDS.
2615 *
2616 * Identify the appropriate session, inode, and call the right handler
2617 * based on the cap op.
2618 */
2619void ceph_handle_caps(struct ceph_mds_session *session,
2620 struct ceph_msg *msg)
2621{
2622 struct ceph_mds_client *mdsc = session->s_mdsc;
2623 struct super_block *sb = mdsc->client->sb;
2624 struct inode *inode;
2625 struct ceph_cap *cap;
2626 struct ceph_mds_caps *h;
2627 int mds = session->s_mds;
2628 int op;
2629 u32 seq;
2630 struct ceph_vino vino;
2631 u64 cap_id;
2632 u64 size, max_size;
2633 u64 tid;
2634 void *snaptrace;
2635
2636 dout("handle_caps from mds%d\n", mds);
2637
2638 /* decode */
2639 tid = le64_to_cpu(msg->hdr.tid);
2640 if (msg->front.iov_len < sizeof(*h))
2641 goto bad;
2642 h = msg->front.iov_base;
2643 snaptrace = h + 1;
2644 op = le32_to_cpu(h->op);
2645 vino.ino = le64_to_cpu(h->ino);
2646 vino.snap = CEPH_NOSNAP;
2647 cap_id = le64_to_cpu(h->cap_id);
2648 seq = le32_to_cpu(h->seq);
2649 size = le64_to_cpu(h->size);
2650 max_size = le64_to_cpu(h->max_size);
2651
2652 mutex_lock(&session->s_mutex);
2653 session->s_seq++;
2654 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
2655 (unsigned)seq);
2656
2657 /* lookup ino */
2658 inode = ceph_find_inode(sb, vino);
2659 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
2660 vino.snap, inode);
2661 if (!inode) {
2662 dout(" i don't have ino %llx\n", vino.ino);
2663 goto done;
2664 }
2665
2666 /* these will work even if we don't have a cap yet */
2667 switch (op) {
2668 case CEPH_CAP_OP_FLUSHSNAP_ACK:
2669 handle_cap_flushsnap_ack(inode, tid, h, session);
2670 goto done;
2671
2672 case CEPH_CAP_OP_EXPORT:
2673 handle_cap_export(inode, h, session);
2674 goto done;
2675
2676 case CEPH_CAP_OP_IMPORT:
2677 handle_cap_import(mdsc, inode, h, session,
2678 snaptrace, le32_to_cpu(h->snap_trace_len));
2679 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
2680 session);
2681 goto done_unlocked;
2682 }
2683
2684 /* the rest require a cap */
2685 spin_lock(&inode->i_lock);
2686 cap = __get_cap_for_mds(ceph_inode(inode), mds);
2687 if (!cap) {
2688 dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
2689 inode, ceph_ino(inode), ceph_snap(inode), mds);
2690 spin_unlock(&inode->i_lock);
2691 goto done;
2692 }
2693
2694 /* note that each of these drops i_lock for us */
2695 switch (op) {
2696 case CEPH_CAP_OP_REVOKE:
2697 case CEPH_CAP_OP_GRANT:
2698 handle_cap_grant(inode, h, session, cap, msg->middle);
2699 goto done_unlocked;
2700
2701 case CEPH_CAP_OP_FLUSH_ACK:
2702 handle_cap_flush_ack(inode, tid, h, session, cap);
2703 break;
2704
2705 case CEPH_CAP_OP_TRUNC:
2706 handle_cap_trunc(inode, h, session);
2707 break;
2708
2709 default:
2710 spin_unlock(&inode->i_lock);
2711 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
2712 ceph_cap_op_name(op));
2713 }
2714
2715done:
2716 mutex_unlock(&session->s_mutex);
2717done_unlocked:
2718 if (inode)
2719 iput(inode);
2720 return;
2721
2722bad:
2723 pr_err("ceph_handle_caps: corrupt message\n");
2724 ceph_msg_dump(msg);
2725 return;
2726}
2727
2728/*
2729 * Delayed work handler to process end of delayed cap release LRU list.
2730 */
2731void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
2732{
2733 struct ceph_inode_info *ci;
2734 int flags = CHECK_CAPS_NODELAY;
2735
2736 dout("check_delayed_caps\n");
2737 while (1) {
2738 spin_lock(&mdsc->cap_delay_lock);
2739 if (list_empty(&mdsc->cap_delay_list))
2740 break;
2741 ci = list_first_entry(&mdsc->cap_delay_list,
2742 struct ceph_inode_info,
2743 i_cap_delay_list);
2744 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
2745 time_before(jiffies, ci->i_hold_caps_max))
2746 break;
2747 list_del_init(&ci->i_cap_delay_list);
2748 spin_unlock(&mdsc->cap_delay_lock);
2749 dout("check_delayed_caps on %p\n", &ci->vfs_inode);
2750 ceph_check_caps(ci, flags, NULL);
2751 }
2752 spin_unlock(&mdsc->cap_delay_lock);
2753}
2754
2755/*
2756 * Flush all dirty caps to the mds
2757 */
2758void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
2759{
2760 struct ceph_inode_info *ci, *nci = NULL;
2761 struct inode *inode, *ninode = NULL;
2762 struct list_head *p, *n;
2763
2764 dout("flush_dirty_caps\n");
2765 spin_lock(&mdsc->cap_dirty_lock);
2766 list_for_each_safe(p, n, &mdsc->cap_dirty) {
2767 if (nci) {
2768 ci = nci;
2769 inode = ninode;
2770 ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
2771 dout("flush_dirty_caps inode %p (was next inode)\n",
2772 inode);
2773 } else {
2774 ci = list_entry(p, struct ceph_inode_info,
2775 i_dirty_item);
2776 inode = igrab(&ci->vfs_inode);
2777 BUG_ON(!inode);
2778 dout("flush_dirty_caps inode %p\n", inode);
2779 }
2780 if (n != &mdsc->cap_dirty) {
2781 nci = list_entry(n, struct ceph_inode_info,
2782 i_dirty_item);
2783 ninode = igrab(&nci->vfs_inode);
2784 BUG_ON(!ninode);
2785 nci->i_ceph_flags |= CEPH_I_NOFLUSH;
2786 dout("flush_dirty_caps next inode %p, noflush\n",
2787 ninode);
2788 } else {
2789 nci = NULL;
2790 ninode = NULL;
2791 }
2792 spin_unlock(&mdsc->cap_dirty_lock);
2793 if (inode) {
2794 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
2795 NULL);
2796 iput(inode);
2797 }
2798 spin_lock(&mdsc->cap_dirty_lock);
2799 }
2800 spin_unlock(&mdsc->cap_dirty_lock);
2801}
2802
2803/*
2804 * Drop open file reference. If we were the last open file,
2805 * we may need to release capabilities to the MDS (or schedule
2806 * their delayed release).
2807 */
2808void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
2809{
2810 struct inode *inode = &ci->vfs_inode;
2811 int last = 0;
2812
2813 spin_lock(&inode->i_lock);
2814 dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
2815 ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
2816 BUG_ON(ci->i_nr_by_mode[fmode] == 0);
2817 if (--ci->i_nr_by_mode[fmode] == 0)
2818 last++;
2819 spin_unlock(&inode->i_lock);
2820
2821 if (last && ci->i_vino.snap == CEPH_NOSNAP)
2822 ceph_check_caps(ci, 0, NULL);
2823}
2824
2825/*
2826 * Helpers for embedding cap and dentry lease releases into mds
2827 * requests.
2828 *
2829 * @force is used by dentry_release (below) to force inclusion of a
2830 * record for the directory inode, even when there aren't any caps to
2831 * drop.
2832 */
2833int ceph_encode_inode_release(void **p, struct inode *inode,
2834 int mds, int drop, int unless, int force)
2835{
2836 struct ceph_inode_info *ci = ceph_inode(inode);
2837 struct ceph_cap *cap;
2838 struct ceph_mds_request_release *rel = *p;
2839 int ret = 0;
2840 int used = 0;
2841
2842 spin_lock(&inode->i_lock);
2843 used = __ceph_caps_used(ci);
2844
2845 dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
2846 mds, ceph_cap_string(used), ceph_cap_string(drop),
2847 ceph_cap_string(unless));
2848
2849 /* only drop unused caps */
2850 drop &= ~used;
2851
2852 cap = __get_cap_for_mds(ci, mds);
2853 if (cap && __cap_is_valid(cap)) {
2854 if (force ||
2855 ((cap->issued & drop) &&
2856 (cap->issued & unless) == 0)) {
2857 if ((cap->issued & drop) &&
2858 (cap->issued & unless) == 0) {
2859 dout("encode_inode_release %p cap %p %s -> "
2860 "%s\n", inode, cap,
2861 ceph_cap_string(cap->issued),
2862 ceph_cap_string(cap->issued & ~drop));
2863 cap->issued &= ~drop;
2864 cap->implemented &= ~drop;
2865 if (ci->i_ceph_flags & CEPH_I_NODELAY) {
2866 int wanted = __ceph_caps_wanted(ci);
2867 dout(" wanted %s -> %s (act %s)\n",
2868 ceph_cap_string(cap->mds_wanted),
2869 ceph_cap_string(cap->mds_wanted &
2870 ~wanted),
2871 ceph_cap_string(wanted));
2872 cap->mds_wanted &= wanted;
2873 }
2874 } else {
2875 dout("encode_inode_release %p cap %p %s"
2876 " (force)\n", inode, cap,
2877 ceph_cap_string(cap->issued));
2878 }
2879
2880 rel->ino = cpu_to_le64(ceph_ino(inode));
2881 rel->cap_id = cpu_to_le64(cap->cap_id);
2882 rel->seq = cpu_to_le32(cap->seq);
2883 rel->issue_seq = cpu_to_le32(cap->issue_seq);
2884 rel->mseq = cpu_to_le32(cap->mseq);
2885 rel->caps = cpu_to_le32(cap->issued);
2886 rel->wanted = cpu_to_le32(cap->mds_wanted);
2887 rel->dname_len = 0;
2888 rel->dname_seq = 0;
2889 *p += sizeof(*rel);
2890 ret = 1;
2891 } else {
2892 dout("encode_inode_release %p cap %p %s\n",
2893 inode, cap, ceph_cap_string(cap->issued));
2894 }
2895 }
2896 spin_unlock(&inode->i_lock);
2897 return ret;
2898}
2899
2900int ceph_encode_dentry_release(void **p, struct dentry *dentry,
2901 int mds, int drop, int unless)
2902{
2903 struct inode *dir = dentry->d_parent->d_inode;
2904 struct ceph_mds_request_release *rel = *p;
2905 struct ceph_dentry_info *di = ceph_dentry(dentry);
2906 int force = 0;
2907 int ret;
2908
2909 /*
2910 * force a record for the directory caps if we have a dentry lease.
2911 * this is racy (can't take i_lock and d_lock together), but it
2912 * doesn't have to be perfect; the mds will revoke anything we don't
2913 * release.
2914 */
2915 spin_lock(&dentry->d_lock);
2916 if (di->lease_session && di->lease_session->s_mds == mds)
2917 force = 1;
2918 spin_unlock(&dentry->d_lock);
2919
2920 ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
2921
2922 spin_lock(&dentry->d_lock);
2923 if (ret && di->lease_session && di->lease_session->s_mds == mds) {
2924 dout("encode_dentry_release %p mds%d seq %d\n",
2925 dentry, mds, (int)di->lease_seq);
2926 rel->dname_len = cpu_to_le32(dentry->d_name.len);
2927 memcpy(*p, dentry->d_name.name, dentry->d_name.len);
2928 *p += dentry->d_name.len;
2929 rel->dname_seq = cpu_to_le32(di->lease_seq);
2930 }
2931 spin_unlock(&dentry->d_lock);
2932 return ret;
2933}
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
new file mode 100644
index 000000000000..1818c2305610
--- /dev/null
+++ b/fs/ceph/ceph_debug.h
@@ -0,0 +1,37 @@
1#ifndef _FS_CEPH_DEBUG_H
2#define _FS_CEPH_DEBUG_H
3
4#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5
6#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
7
8/*
9 * wrap pr_debug to include a filename:lineno prefix on each line.
10 * this incurs some overhead (kernel size and execution time) due to
11 * the extra function call at each call site.
12 */
13
14# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
15extern const char *ceph_file_part(const char *s, int len);
16# define dout(fmt, ...) \
17 pr_debug(" %12.12s:%-4d : " fmt, \
18 ceph_file_part(__FILE__, sizeof(__FILE__)), \
19 __LINE__, ##__VA_ARGS__)
20# else
21/* faux printk call just to see any compiler warnings. */
22# define dout(fmt, ...) do { \
23 if (0) \
24 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
25 } while (0)
26# endif
27
28#else
29
30/*
31 * or, just wrap pr_debug
32 */
33# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
34
35#endif
36
37#endif
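As a concrete illustration: with CONFIG_CEPH_FS_PRETTYDEBUG enabled (plus DEBUG or CONFIG_DYNAMIC_DEBUG), the fsync trace at caps.c line 1775 above,

	dout("fsync %p%s\n", inode, datasync ? " datasync" : "");

compiles to a pr_debug() whose output carries a right-aligned filename:lineno prefix along the lines of "      caps.c:1775 : fsync ...". Without CONFIG_CEPH_FS_PRETTYDEBUG it is simply pr_debug() with a leading space; with the option but neither DEBUG nor CONFIG_DYNAMIC_DEBUG it collapses to the never-executed printk, kept only so the compiler still type-checks the format string.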
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
new file mode 100644
index 000000000000..ab6cf35c4091
--- /dev/null
+++ b/fs/ceph/ceph_frag.c
@@ -0,0 +1,21 @@
1/*
2 * Ceph 'frag' type
3 */
4#include "types.h"
5
6int ceph_frag_compare(__u32 a, __u32 b)
7{
8 unsigned va = ceph_frag_value(a);
9 unsigned vb = ceph_frag_value(b);
10 if (va < vb)
11 return -1;
12 if (va > vb)
13 return 1;
14 va = ceph_frag_bits(a);
15 vb = ceph_frag_bits(b);
16 if (va < vb)
17 return -1;
18 if (va > vb)
19 return 1;
20 return 0;
21}
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
new file mode 100644
index 000000000000..793f50cb7c22
--- /dev/null
+++ b/fs/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
1#ifndef _FS_CEPH_FRAG_H
2#define _FS_CEPH_FRAG_H
3
4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space,
6 * using a mask and a value to match against that mask. Any given frag
7 * (subset of the number space) can be partitioned into 2^n sub-frags.
8 *
9 * Frags are encoded into a 32-bit word:
10 * 8 upper bits = "bits"
11 * 24 lower bits = "value"
12 * (We could go to 5+27 bits, but who cares.)
13 *
14 * We use the _most_ significant bits of the 24 bit value. This makes
15 * values logically sort.
16 *
17 * Unfortunately, because the "bits" field is still in the high bits, we
18 * can't sort encoded frags numerically. However, it does allow you
19 * to feed encoded frags as values into frag_contains_value.
20 */
21static inline __u32 ceph_frag_make(__u32 b, __u32 v)
22{
23 return (b << 24) |
24 (v & (0xffffffu << (24-b)) & 0xffffffu);
25}
26static inline __u32 ceph_frag_bits(__u32 f)
27{
28 return f >> 24;
29}
30static inline __u32 ceph_frag_value(__u32 f)
31{
32 return f & 0xffffffu;
33}
34static inline __u32 ceph_frag_mask(__u32 f)
35{
36 return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
37}
38static inline __u32 ceph_frag_mask_shift(__u32 f)
39{
40 return 24 - ceph_frag_bits(f);
41}
42
43static inline int ceph_frag_contains_value(__u32 f, __u32 v)
44{
45 return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
46}
47static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
48{
49 /* is sub as specific as us, and contained by us? */
50 return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
51 (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
52}
53
54static inline __u32 ceph_frag_parent(__u32 f)
55{
56 return ceph_frag_make(ceph_frag_bits(f) - 1,
57 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
58}
59static inline int ceph_frag_is_left_child(__u32 f)
60{
61 return ceph_frag_bits(f) > 0 &&
62 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
63}
64static inline int ceph_frag_is_right_child(__u32 f)
65{
66 return ceph_frag_bits(f) > 0 &&
67 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
68}
69static inline __u32 ceph_frag_sibling(__u32 f)
70{
71 return ceph_frag_make(ceph_frag_bits(f),
72 ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
73}
74static inline __u32 ceph_frag_left_child(__u32 f)
75{
76 return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
77}
78static inline __u32 ceph_frag_right_child(__u32 f)
79{
80 return ceph_frag_make(ceph_frag_bits(f)+1,
81 ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
82}
83static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
84{
85 int newbits = ceph_frag_bits(f) + by;
86 return ceph_frag_make(newbits,
87 ceph_frag_value(f) | (i << (24 - newbits)));
88}
89static inline int ceph_frag_is_leftmost(__u32 f)
90{
91 return ceph_frag_value(f) == 0;
92}
93static inline int ceph_frag_is_rightmost(__u32 f)
94{
95 return ceph_frag_value(f) == ceph_frag_mask(f);
96}
97static inline __u32 ceph_frag_next(__u32 f)
98{
99 return ceph_frag_make(ceph_frag_bits(f),
100 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
101}
102
103/*
104 * comparator to sort frags logically, as when traversing the
105 * number space in ascending order...
106 */
107int ceph_frag_compare(__u32 a, __u32 b);
108
109#endif
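A quick way to sanity-check the encoding described at the top of this header is a userspace program mirroring the make/bits/value/mask helpers (re-stated locally so it builds outside the kernel tree; the behavior, not the names, is what is borrowed):

#include <stdio.h>

typedef unsigned int u32;

static u32 frag_make(u32 b, u32 v)
{
	return (b << 24) | (v & (0xffffffu << (24 - b)) & 0xffffffu);
}

static u32 frag_bits(u32 f)  { return f >> 24; }
static u32 frag_value(u32 f) { return f & 0xffffffu; }
static u32 frag_mask(u32 f)
{
	return (0xffffffu << (24 - frag_bits(f))) & 0xffffffu;
}

int main(void)
{
	/* a one-bit split: frag 1/0 covers the lower half of the
	 * number space, frag 1/1 the upper half */
	u32 left  = frag_make(1, 0x000000);
	u32 right = frag_make(1, 0x800000);

	printf("left  = %08x (bits=%u value=%06x)\n",
	       left, frag_bits(left), frag_value(left));
	printf("right = %08x (bits=%u value=%06x)\n",
	       right, frag_bits(right), frag_value(right));
	/* 0x400000 has its top value bit clear, so it falls in 'left' */
	printf("contains: %d\n",
	       (0x400000u & frag_mask(left)) == frag_value(left));
	return 0;
}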
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
new file mode 100644
index 000000000000..79d76bc4303f
--- /dev/null
+++ b/fs/ceph/ceph_fs.c
@@ -0,0 +1,74 @@
1/*
2 * Some non-inline ceph helpers
3 */
4#include "types.h"
5
6/*
7 * return true if @layout appears to be valid
8 */
9int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
10{
11 __u32 su = le32_to_cpu(layout->fl_stripe_unit);
12 __u32 sc = le32_to_cpu(layout->fl_stripe_count);
13 __u32 os = le32_to_cpu(layout->fl_object_size);
14
15 /* stripe unit, object size must be non-zero, 64k increment */
16 if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
17 return 0;
18 if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
19 return 0;
20 /* object size must be a multiple of stripe unit */
21 if (os < su || os % su)
22 return 0;
23 /* stripe count must be non-zero */
24 if (!sc)
25 return 0;
26 return 1;
27}
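Worked example of these checks: su = 65536, sc = 4, os = 4194304 is valid (both sizes are 64k multiples, 4194304 is exactly 64 stripe units, and the count is nonzero), whereas su = 98304 (96k) fails the increment test, since 98304 & (65536 - 1) leaves a remainder of 32768.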
28
29
30int ceph_flags_to_mode(int flags)
31{
32#ifdef O_DIRECTORY /* fixme */
33 if ((flags & O_DIRECTORY) == O_DIRECTORY)
34 return CEPH_FILE_MODE_PIN;
35#endif
36#ifdef O_LAZY
37 if (flags & O_LAZY)
38 return CEPH_FILE_MODE_LAZY;
39#endif
40 if ((flags & O_APPEND) == O_APPEND)
41 flags |= O_WRONLY;
42
43 flags &= O_ACCMODE;
44 if ((flags & O_RDWR) == O_RDWR)
45 return CEPH_FILE_MODE_RDWR;
46 if ((flags & O_WRONLY) == O_WRONLY)
47 return CEPH_FILE_MODE_WR;
48 return CEPH_FILE_MODE_RD;
49}
50
51int ceph_caps_for_mode(int mode)
52{
53 switch (mode) {
54 case CEPH_FILE_MODE_PIN:
55 return CEPH_CAP_PIN;
56 case CEPH_FILE_MODE_RD:
57 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
58 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
59 case CEPH_FILE_MODE_RDWR:
60 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
61 CEPH_CAP_FILE_EXCL |
62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
63 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
64 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
65 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
66 case CEPH_FILE_MODE_WR:
67 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
68 CEPH_CAP_FILE_EXCL |
69 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
70 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
71 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
72 }
73 return 0;
74}
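Tracing the two helpers together: open flags O_WRONLY|O_APPEND keep O_WRONLY through the append check, mask down to O_WRONLY under O_ACCMODE, and resolve to CEPH_FILE_MODE_WR; plain O_RDONLY (0) falls through every test to CEPH_FILE_MODE_RD, for which ceph_caps_for_mode() asks only for PIN plus FILE_SHARED, FILE_RD and FILE_CACHE.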
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
new file mode 100644
index 000000000000..0c2241ef3653
--- /dev/null
+++ b/fs/ceph/ceph_fs.h
@@ -0,0 +1,650 @@
1/*
2 * ceph_fs.h - Ceph constants and data types to share between kernel and
3 * user space.
4 *
5 * Most types in this file are defined as little-endian, and are
6 * primarily intended to describe data structures that pass over the
7 * wire or that are stored on disk.
8 *
9 * LGPL2
10 */
11
12#ifndef _FS_CEPH_CEPH_FS_H
13#define _FS_CEPH_CEPH_FS_H
14
15#include "msgr.h"
16#include "rados.h"
17
18/*
19 * Ceph release version
20 */
21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 19
23#define CEPH_VERSION_PATCH 0
24
25#define _CEPH_STRINGIFY(x) #x
26#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
27#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
28 "." CEPH_STRINGIFY(z)
29#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
30 CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
31
32/*
33 * subprotocol versions. when specific message types or high-level
34 * protocols change, bump the affected components. we rev
35 * internal cluster protocols separately from the public,
36 * client-facing protocol.
37 */
38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 9 /* cluster internal */
40#define CEPH_MON_PROTOCOL 5 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 24 /* server/client */
42#define CEPH_MDSC_PROTOCOL 32 /* server/client */
43#define CEPH_MONC_PROTOCOL 15 /* server/client */
44
45
46#define CEPH_INO_ROOT 1
47#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
48
49/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
50#define CEPH_MAX_MON 31
51
52
53/*
54 * feature bits
55 */
56#define CEPH_FEATURE_SUPPORTED 0
57#define CEPH_FEATURE_REQUIRED 0
58
59
60/*
61 * ceph_file_layout - describe data layout for a file/inode
62 */
63struct ceph_file_layout {
64 /* file -> object mapping */
65 __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
66 of page size. */
67 __le32 fl_stripe_count; /* over this many objects */
68 __le32 fl_object_size; /* until objects are this big, then move to
69 new objects */
70 __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
71
72 /* pg -> disk layout */
73 __le32 fl_object_stripe_unit; /* for per-object parity, if any */
74
75 /* object -> pg layout */
76 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
77 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
78} __attribute__ ((packed));
79
80#define CEPH_MIN_STRIPE_UNIT 65536
81
82int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
83
84
85/* crypto algorithms */
86#define CEPH_CRYPTO_NONE 0x0
87#define CEPH_CRYPTO_AES 0x1
88
89/* security/authentication protocols */
90#define CEPH_AUTH_UNKNOWN 0x0
91#define CEPH_AUTH_NONE 0x1
92#define CEPH_AUTH_CEPHX 0x2
93
94
95/*********************************************
96 * message layer
97 */
98
99/*
100 * message types
101 */
102
103/* misc */
104#define CEPH_MSG_SHUTDOWN 1
105#define CEPH_MSG_PING 2
106
107/* client <-> monitor */
108#define CEPH_MSG_MON_MAP 4
109#define CEPH_MSG_MON_GET_MAP 5
110#define CEPH_MSG_STATFS 13
111#define CEPH_MSG_STATFS_REPLY 14
112#define CEPH_MSG_MON_SUBSCRIBE 15
113#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
114#define CEPH_MSG_AUTH 17
115#define CEPH_MSG_AUTH_REPLY 18
116
117/* client <-> mds */
118#define CEPH_MSG_MDS_MAP 21
119
120#define CEPH_MSG_CLIENT_SESSION 22
121#define CEPH_MSG_CLIENT_RECONNECT 23
122
123#define CEPH_MSG_CLIENT_REQUEST 24
124#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
125#define CEPH_MSG_CLIENT_REPLY 26
126#define CEPH_MSG_CLIENT_CAPS 0x310
127#define CEPH_MSG_CLIENT_LEASE 0x311
128#define CEPH_MSG_CLIENT_SNAP 0x312
129#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
130
131/* osd */
132#define CEPH_MSG_OSD_MAP 41
133#define CEPH_MSG_OSD_OP 42
134#define CEPH_MSG_OSD_OPREPLY 43
135
136struct ceph_mon_request_header {
137 __le64 have_version;
138 __le16 session_mon;
139 __le64 session_mon_tid;
140} __attribute__ ((packed));
141
142struct ceph_mon_statfs {
143 struct ceph_mon_request_header monhdr;
144 struct ceph_fsid fsid;
145} __attribute__ ((packed));
146
147struct ceph_statfs {
148 __le64 kb, kb_used, kb_avail;
149 __le64 num_objects;
150} __attribute__ ((packed));
151
152struct ceph_mon_statfs_reply {
153 struct ceph_fsid fsid;
154 __le64 version;
155 struct ceph_statfs st;
156} __attribute__ ((packed));
157
158struct ceph_osd_getmap {
159 struct ceph_mon_request_header monhdr;
160 struct ceph_fsid fsid;
161 __le32 start;
162} __attribute__ ((packed));
163
164struct ceph_mds_getmap {
165 struct ceph_mon_request_header monhdr;
166 struct ceph_fsid fsid;
167} __attribute__ ((packed));
168
169struct ceph_client_mount {
170 struct ceph_mon_request_header monhdr;
171} __attribute__ ((packed));
172
173struct ceph_mon_subscribe_item {
174 __le64 have_version; __le64 have;
175 __u8 onetime;
176} __attribute__ ((packed));
177
178struct ceph_mon_subscribe_ack {
179 __le32 duration; /* seconds */
180 struct ceph_fsid fsid;
181} __attribute__ ((packed));
182
183/*
184 * mds states
185 * > 0 -> in
186 * <= 0 -> out
187 */
188#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
189#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
190 empty log. */
191#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
192#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
193#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
194#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
195#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
196
197#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
198#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
199 operations (import, rename, etc.) */
200#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
201#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
202#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
203#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
204#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
205
206extern const char *ceph_mds_state_name(int s);
207
208
209/*
210 * metadata lock types.
211 * - these are bitmasks.. we can compose them
212 * - they also define the lock ordering by the MDS
213 * - a few of these are internal to the mds
214 */
215#define CEPH_LOCK_DN 1
216#define CEPH_LOCK_ISNAP 2
217#define CEPH_LOCK_IVERSION 4 /* mds internal */
218#define CEPH_LOCK_IFILE 8 /* mds internal */
219#define CEPH_LOCK_IAUTH 32
220#define CEPH_LOCK_ILINK 64
221#define CEPH_LOCK_IDFT 128 /* dir frag tree */
222#define CEPH_LOCK_INEST 256 /* mds internal */
223#define CEPH_LOCK_IXATTR 512
224#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */
225
226/* client_session ops */
227enum {
228 CEPH_SESSION_REQUEST_OPEN,
229 CEPH_SESSION_OPEN,
230 CEPH_SESSION_REQUEST_CLOSE,
231 CEPH_SESSION_CLOSE,
232 CEPH_SESSION_REQUEST_RENEWCAPS,
233 CEPH_SESSION_RENEWCAPS,
234 CEPH_SESSION_STALE,
235 CEPH_SESSION_RECALL_STATE,
236};
237
238extern const char *ceph_session_op_name(int op);
239
240struct ceph_mds_session_head {
241 __le32 op;
242 __le64 seq;
243 struct ceph_timespec stamp;
244 __le32 max_caps, max_leases;
245} __attribute__ ((packed));
246
247/* client_request */
248/*
249 * metadata ops.
250 * & 0x001000 -> write op
251 * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
252 * & 0x100000 -> use weird ino/path trace
253 */
254#define CEPH_MDS_OP_WRITE 0x001000
255enum {
256 CEPH_MDS_OP_LOOKUP = 0x00100,
257 CEPH_MDS_OP_GETATTR = 0x00101,
258 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
259 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
260
261 CEPH_MDS_OP_SETXATTR = 0x01105,
262 CEPH_MDS_OP_RMXATTR = 0x01106,
263 CEPH_MDS_OP_SETLAYOUT = 0x01107,
264 CEPH_MDS_OP_SETATTR = 0x01108,
265
266 CEPH_MDS_OP_MKNOD = 0x01201,
267 CEPH_MDS_OP_LINK = 0x01202,
268 CEPH_MDS_OP_UNLINK = 0x01203,
269 CEPH_MDS_OP_RENAME = 0x01204,
270 CEPH_MDS_OP_MKDIR = 0x01220,
271 CEPH_MDS_OP_RMDIR = 0x01221,
272 CEPH_MDS_OP_SYMLINK = 0x01222,
273
274 CEPH_MDS_OP_CREATE = 0x01301,
275 CEPH_MDS_OP_OPEN = 0x00302,
276 CEPH_MDS_OP_READDIR = 0x00305,
277
278 CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
279 CEPH_MDS_OP_MKSNAP = 0x01400,
280 CEPH_MDS_OP_RMSNAP = 0x01401,
281 CEPH_MDS_OP_LSSNAP = 0x00402,
282};
283
284extern const char *ceph_mds_op_name(int op);
285
286
287#define CEPH_SETATTR_MODE 1
288#define CEPH_SETATTR_UID 2
289#define CEPH_SETATTR_GID 4
290#define CEPH_SETATTR_MTIME 8
291#define CEPH_SETATTR_ATIME 16
292#define CEPH_SETATTR_SIZE 32
293#define CEPH_SETATTR_CTIME 64
294
295union ceph_mds_request_args {
296 struct {
297 __le32 mask; /* CEPH_CAP_* */
298 } __attribute__ ((packed)) getattr;
299 struct {
300 __le32 mode;
301 __le32 uid;
302 __le32 gid;
303 struct ceph_timespec mtime;
304 struct ceph_timespec atime;
305 __le64 size, old_size; /* old_size needed by truncate */
306 __le32 mask; /* CEPH_SETATTR_* */
307 } __attribute__ ((packed)) setattr;
308 struct {
309 __le32 frag; /* which dir fragment */
310 __le32 max_entries; /* how many dentries to grab */
311 } __attribute__ ((packed)) readdir;
312 struct {
313 __le32 mode;
314 __le32 rdev;
315 } __attribute__ ((packed)) mknod;
316 struct {
317 __le32 mode;
318 } __attribute__ ((packed)) mkdir;
319 struct {
320 __le32 flags;
321 __le32 mode;
322 __le32 stripe_unit; /* layout for newly created file */
323 __le32 stripe_count; /* ... */
324 __le32 object_size;
325 __le32 file_replication;
326 __le32 preferred;
327 } __attribute__ ((packed)) open;
328 struct {
329 __le32 flags;
330 } __attribute__ ((packed)) setxattr;
331 struct {
332 struct ceph_file_layout layout;
333 } __attribute__ ((packed)) setlayout;
334} __attribute__ ((packed));
335
336#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
337#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
338
339struct ceph_mds_request_head {
340 __le64 oldest_client_tid;
341 __le32 mdsmap_epoch; /* on client */
342 __le32 flags; /* CEPH_MDS_FLAG_* */
343 __u8 num_retry, num_fwd; /* count retry, fwd attempts */
344 __le16 num_releases; /* # include cap/lease release records */
345 __le32 op; /* mds op code */
346 __le32 caller_uid, caller_gid;
347 __le64 ino; /* use this ino for openc, mkdir, mknod,
348 etc. (if replaying) */
349 union ceph_mds_request_args args;
350} __attribute__ ((packed));
351
352/* cap/lease release record */
353struct ceph_mds_request_release {
354 __le64 ino, cap_id; /* ino and unique cap id */
355 __le32 caps, wanted; /* new issued, wanted */
356 __le32 seq, issue_seq, mseq;
357 __le32 dname_seq; /* if releasing a dentry lease, a */
358 __le32 dname_len; /* string follows. */
359} __attribute__ ((packed));
360
361/* client reply */
362struct ceph_mds_reply_head {
363 __le32 op;
364 __le32 result;
365 __le32 mdsmap_epoch;
366 __u8 safe; /* true if committed to disk */
367 __u8 is_dentry, is_target; /* true if dentry, target inode records
368 are included with reply */
369} __attribute__ ((packed));
370
371/* one for each node split */
372struct ceph_frag_tree_split {
373 __le32 frag; /* this frag splits... */
374 __le32 by; /* ...by this many bits */
375} __attribute__ ((packed));
376
377struct ceph_frag_tree_head {
378 __le32 nsplits; /* num ceph_frag_tree_split records */
379 struct ceph_frag_tree_split splits[];
380} __attribute__ ((packed));
381
382/* capability issue, for bundling with mds reply */
383struct ceph_mds_reply_cap {
384 __le32 caps, wanted; /* caps issued, wanted */
385 __le64 cap_id;
386 __le32 seq, mseq;
387 __le64 realm; /* snap realm */
388 __u8 flags; /* CEPH_CAP_FLAG_* */
389} __attribute__ ((packed));
390
391#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
392
393/* inode record, for bundling with mds reply */
394struct ceph_mds_reply_inode {
395 __le64 ino;
396 __le64 snapid;
397 __le32 rdev;
398 __le64 version; /* inode version */
399 __le64 xattr_version; /* version for xattr blob */
400 struct ceph_mds_reply_cap cap; /* caps issued for this inode */
401 struct ceph_file_layout layout;
402 struct ceph_timespec ctime, mtime, atime;
403 __le32 time_warp_seq;
404 __le64 size, max_size, truncate_size;
405 __le32 truncate_seq;
406 __le32 mode, uid, gid;
407 __le32 nlink;
408 __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
409 struct ceph_timespec rctime;
410 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
411} __attribute__ ((packed));
412/* followed by frag array, then symlink string, then xattr blob */
413
414/* reply_lease follows dname, and reply_inode */
415struct ceph_mds_reply_lease {
416 __le16 mask; /* lease type(s) */
417 __le32 duration_ms; /* lease duration */
418 __le32 seq;
419} __attribute__ ((packed));
420
421struct ceph_mds_reply_dirfrag {
422 __le32 frag; /* fragment */
423 __le32 auth; /* auth mds, if this is a delegation point */
424 __le32 ndist; /* number of mds' this is replicated on */
425 __le32 dist[];
426} __attribute__ ((packed));
427
428/* file access modes */
429#define CEPH_FILE_MODE_PIN 0
430#define CEPH_FILE_MODE_RD 1
431#define CEPH_FILE_MODE_WR 2
432#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
433#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
434#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
435
436int ceph_flags_to_mode(int flags);
437
438
439/* capability bits */
440#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
441
442/* generic cap bits */
443#define CEPH_CAP_GSHARED 1 /* client can read */
444#define CEPH_CAP_GEXCL 2 /* client can read and update */
445#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
446#define CEPH_CAP_GRD 8 /* (file) client can read */
447#define CEPH_CAP_GWR 16 /* (file) client can write */
448#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
449#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
450#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
451
452/* per-lock shift */
453#define CEPH_CAP_SAUTH 2
454#define CEPH_CAP_SLINK 4
455#define CEPH_CAP_SXATTR 6
456#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */
457
458#define CEPH_CAP_BITS 16
459
460/* composed values */
461#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
462#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
463#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
464#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
465#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
466#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
467#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
468#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
469#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
470#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
471#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
472#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
473#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
474#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
475#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
476
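The composed values are just the generic bits shifted into each lock's field: CEPH_CAP_AUTH_EXCL is GEXCL (2) << SAUTH (2) = 0x8, and CEPH_CAP_FILE_WR is GWR (16) << SFILE (8) = 0x1000. A standalone sketch that prints a few of them (defines copied from this header):

#include <stdio.h>

#define CEPH_CAP_GEXCL   2
#define CEPH_CAP_GRD     8
#define CEPH_CAP_GWR     16
#define CEPH_CAP_GBUFFER 32

#define CEPH_CAP_SAUTH 2
#define CEPH_CAP_SFILE 8

int main(void)
{
	printf("AUTH_EXCL   = 0x%x\n", CEPH_CAP_GEXCL << CEPH_CAP_SAUTH);   /* 0x8 */
	printf("FILE_RD     = 0x%x\n", CEPH_CAP_GRD << CEPH_CAP_SFILE);     /* 0x800 */
	printf("FILE_WR     = 0x%x\n", CEPH_CAP_GWR << CEPH_CAP_SFILE);     /* 0x1000 */
	printf("FILE_BUFFER = 0x%x\n", CEPH_CAP_GBUFFER << CEPH_CAP_SFILE); /* 0x2000 */
	return 0;
}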
477/* cap masks (for getattr) */
478#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
479#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
480#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
481#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
482#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
483#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
484#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
485#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
486#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
487#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
488#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
489#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
490#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
491 CEPH_CAP_AUTH_SHARED | \
492 CEPH_CAP_LINK_SHARED | \
493 CEPH_CAP_FILE_SHARED | \
494 CEPH_CAP_XATTR_SHARED)
495
496#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
497 CEPH_CAP_LINK_SHARED | \
498 CEPH_CAP_XATTR_SHARED | \
499 CEPH_CAP_FILE_SHARED)
500#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
501 CEPH_CAP_FILE_CACHE)
502
503#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
504 CEPH_CAP_LINK_EXCL | \
505 CEPH_CAP_XATTR_EXCL | \
506 CEPH_CAP_FILE_EXCL)
507#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
508 CEPH_CAP_FILE_EXCL)
509#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
510#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
511 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
512
513#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
514 CEPH_LOCK_IXATTR)
515
516int ceph_caps_for_mode(int mode);
517
518enum {
519 CEPH_CAP_OP_GRANT, /* mds->client grant */
520 CEPH_CAP_OP_REVOKE, /* mds->client revoke */
521 CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
522 CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
523 CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
524 CEPH_CAP_OP_UPDATE, /* client->mds update */
525 CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
526 CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
527 CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
528 CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
529 CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
530 CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
531 CEPH_CAP_OP_RENEW, /* client->mds renewal request */
532};
533
534extern const char *ceph_cap_op_name(int op);
535
536/*
537 * caps message, used for capability callbacks, acks, requests, etc.
538 */
539struct ceph_mds_caps {
540 __le32 op; /* CEPH_CAP_OP_* */
541 __le64 ino, realm;
542 __le64 cap_id;
543 __le32 seq, issue_seq;
544 __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
545 __le32 migrate_seq;
546 __le64 snap_follows;
547 __le32 snap_trace_len;
548
549 /* authlock */
550 __le32 uid, gid, mode;
551
552 /* linklock */
553 __le32 nlink;
554
555 /* xattrlock */
556 __le32 xattr_len;
557 __le64 xattr_version;
558
559 /* filelock */
560 __le64 size, max_size, truncate_size;
561 __le32 truncate_seq;
562 struct ceph_timespec mtime, atime, ctime;
563 struct ceph_file_layout layout;
564 __le32 time_warp_seq;
565} __attribute__ ((packed));
566
567/* cap release msg head */
568struct ceph_mds_cap_release {
569 __le32 num; /* number of cap_items that follow */
570} __attribute__ ((packed));
571
572struct ceph_mds_cap_item {
573 __le64 ino;
574 __le64 cap_id;
575 __le32 migrate_seq, seq;
576} __attribute__ ((packed));
577
578#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
579#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
580#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
581#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
582
583extern const char *ceph_lease_op_name(int o);
584
585/* lease msg header */
586struct ceph_mds_lease {
587 __u8 action; /* CEPH_MDS_LEASE_* */
588 __le16 mask; /* which lease */
589 __le64 ino;
590 __le64 first, last; /* snap range */
591 __le32 seq;
592 __le32 duration_ms; /* duration of renewal */
593} __attribute__ ((packed));
594/* followed by a __le32+string for dname */
595
596/* client reconnect */
597struct ceph_mds_cap_reconnect {
598 __le64 cap_id;
599 __le32 wanted;
600 __le32 issued;
601 __le64 size;
602 struct ceph_timespec mtime, atime;
603 __le64 snaprealm;
604 __le64 pathbase; /* base ino for our path to this ino */
605} __attribute__ ((packed));
606/* followed by encoded string */
607
608struct ceph_mds_snaprealm_reconnect {
609 __le64 ino; /* snap realm base */
610 __le64 seq; /* snap seq for this snap realm */
611 __le64 parent; /* parent realm */
612} __attribute__ ((packed));
613
614/*
615 * snaps
616 */
617enum {
618 CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
619 CEPH_SNAP_OP_CREATE,
620 CEPH_SNAP_OP_DESTROY,
621 CEPH_SNAP_OP_SPLIT,
622};
623
624extern const char *ceph_snap_op_name(int o);
625
626/* snap msg header */
627struct ceph_mds_snap_head {
628 __le32 op; /* CEPH_SNAP_OP_* */
629 __le64 split; /* ino to split off, if any */
630 __le32 num_split_inos; /* # inos belonging to new child realm */
631 __le32 num_split_realms; /* # child realms under new child realm */
632 __le32 trace_len; /* size of snap trace blob */
633} __attribute__ ((packed));
634/* followed by split ino list, then split realms, then the trace blob */
635
636/*
637 * encode info about a snaprealm, as viewed by a client
638 */
639struct ceph_mds_snap_realm {
640 __le64 ino; /* ino */
641 __le64 created; /* snap: when created */
642 __le64 parent; /* ino: parent realm */
643 __le64 parent_since; /* snap: same parent since */
644 __le64 seq; /* snap: version */
645 __le32 num_snaps;
646 __le32 num_prior_parent_snaps;
647} __attribute__ ((packed));
648/* followed by my snap list, then prior parent snap list */
649
650#endif
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
new file mode 100644
index 000000000000..bd570015d147
--- /dev/null
+++ b/fs/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
1
2#include "types.h"
3
4/*
5 * Robert Jenkins' hash function.
6 * http://burtleburtle.net/bob/hash/evahash.html
7 * This is in the public domain.
8 */
9#define mix(a, b, c) \
10 do { \
11 a = a - b; a = a - c; a = a ^ (c >> 13); \
12 b = b - c; b = b - a; b = b ^ (a << 8); \
13 c = c - a; c = c - b; c = c ^ (b >> 13); \
14 a = a - b; a = a - c; a = a ^ (c >> 12); \
15 b = b - c; b = b - a; b = b ^ (a << 16); \
16 c = c - a; c = c - b; c = c ^ (b >> 5); \
17 a = a - b; a = a - c; a = a ^ (c >> 3); \
18 b = b - c; b = b - a; b = b ^ (a << 10); \
19 c = c - a; c = c - b; c = c ^ (b >> 15); \
20 } while (0)
21
22unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
23{
24 const unsigned char *k = (const unsigned char *)str;
25 __u32 a, b, c; /* the internal state */
26 __u32 len; /* how many key bytes still need mixing */
27
28 /* Set up the internal state */
29 len = length;
30 a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
31 b = a;
32 c = 0; /* variable initialization of internal state */
33
34 /* handle most of the key */
35 while (len >= 12) {
36 a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
37 ((__u32)k[3] << 24));
38 b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
39 ((__u32)k[7] << 24));
40 c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
41 ((__u32)k[11] << 24));
42 mix(a, b, c);
43 k = k + 12;
44 len = len - 12;
45 }
46
47 /* handle the last 11 bytes */
48 c = c + length;
49 switch (len) { /* all the case statements fall through */
50 case 11:
51 c = c + ((__u32)k[10] << 24);
52 case 10:
53 c = c + ((__u32)k[9] << 16);
54 case 9:
55 c = c + ((__u32)k[8] << 8);
56 /* the first byte of c is reserved for the length */
57 case 8:
58 b = b + ((__u32)k[7] << 24);
59 case 7:
60 b = b + ((__u32)k[6] << 16);
61 case 6:
62 b = b + ((__u32)k[5] << 8);
63 case 5:
64 b = b + k[4];
65 case 4:
66 a = a + ((__u32)k[3] << 24);
67 case 3:
68 a = a + ((__u32)k[2] << 16);
69 case 2:
70 a = a + ((__u32)k[1] << 8);
71 case 1:
72 a = a + k[0];
73 /* case 0: nothing left to add */
74 }
75 mix(a, b, c);
76
77 return c;
78}
79
80/*
81 * linux dcache hash
82 */
83unsigned ceph_str_hash_linux(const char *str, unsigned length)
84{
85 unsigned long hash = 0;
86 unsigned char c;
87
88 while (length--) {
89 c = *str++;
90 hash = (hash + (c << 4) + (c >> 4)) * 11;
91 }
92 return hash;
93}
94
95
96unsigned ceph_str_hash(int type, const char *s, unsigned len)
97{
98 switch (type) {
99 case CEPH_STR_HASH_LINUX:
100 return ceph_str_hash_linux(s, len);
101 case CEPH_STR_HASH_RJENKINS:
102 return ceph_str_hash_rjenkins(s, len);
103 default:
104 return -1;
105 }
106}
107
108const char *ceph_str_hash_name(int type)
109{
110 switch (type) {
111 case CEPH_STR_HASH_LINUX:
112 return "linux";
113 case CEPH_STR_HASH_RJENKINS:
114 return "rjenkins";
115 default:
116 return "unknown";
117 }
118}
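A small usage sketch of the dispatcher above (editor's illustration; the caller is hypothetical):

/* editor's sketch: hashing a dentry name with both supported algorithms */
static void example_str_hash(void)
{
	const char *name = "somefile";
	unsigned l = ceph_str_hash(CEPH_STR_HASH_LINUX, name, 8);
	unsigned r = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, 8);

	/* an unrecognized type yields (unsigned)-1 */
	(void)l;
	(void)r;
}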
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
new file mode 100644
index 000000000000..5ac470c433c9
--- /dev/null
+++ b/fs/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
1#ifndef _FS_CEPH_HASH_H
2#define _FS_CEPH_HASH_H
3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
6
7extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
8extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
9
10extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
11extern const char *ceph_str_hash_name(int type);
12
13#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
new file mode 100644
index 000000000000..8e4be6a80c62
--- /dev/null
+++ b/fs/ceph/ceph_strings.c
@@ -0,0 +1,176 @@
1/*
2 * Ceph string constants
3 */
4#include "types.h"
5
6const char *ceph_entity_type_name(int type)
7{
8 switch (type) {
9 case CEPH_ENTITY_TYPE_MDS: return "mds";
10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown";
16 }
17}
18
19const char *ceph_osd_op_name(int op)
20{
21 switch (op) {
22 case CEPH_OSD_OP_READ: return "read";
23 case CEPH_OSD_OP_STAT: return "stat";
24
25 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
26
27 case CEPH_OSD_OP_WRITE: return "write";
28 case CEPH_OSD_OP_DELETE: return "delete";
29 case CEPH_OSD_OP_TRUNCATE: return "truncate";
30 case CEPH_OSD_OP_ZERO: return "zero";
31 case CEPH_OSD_OP_WRITEFULL: return "writefull";
32
33 case CEPH_OSD_OP_APPEND: return "append";
34 case CEPH_OSD_OP_STARTSYNC: return "startsync";
35 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
36 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
37
38 case CEPH_OSD_OP_TMAPUP: return "tmapup";
39 case CEPH_OSD_OP_TMAPGET: return "tmapget";
40 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
41
42 case CEPH_OSD_OP_GETXATTR: return "getxattr";
43 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
44 case CEPH_OSD_OP_SETXATTR: return "setxattr";
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
48
49 case CEPH_OSD_OP_PULL: return "pull";
50 case CEPH_OSD_OP_PUSH: return "push";
51 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
52 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
53 case CEPH_OSD_OP_SCRUB: return "scrub";
54
55 case CEPH_OSD_OP_WRLOCK: return "wrlock";
56 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
57 case CEPH_OSD_OP_RDLOCK: return "rdlock";
58 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
59 case CEPH_OSD_OP_UPLOCK: return "uplock";
60 case CEPH_OSD_OP_DNLOCK: return "dnlock";
61
62 case CEPH_OSD_OP_CALL: return "call";
63
64 case CEPH_OSD_OP_PGLS: return "pgls";
65 }
66 return "???";
67}
68
69const char *ceph_mds_state_name(int s)
70{
71 switch (s) {
72 /* down and out */
73 case CEPH_MDS_STATE_DNE: return "down:dne";
74 case CEPH_MDS_STATE_STOPPED: return "down:stopped";
75 /* up and out */
76 case CEPH_MDS_STATE_BOOT: return "up:boot";
77 case CEPH_MDS_STATE_STANDBY: return "up:standby";
78 case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
79 case CEPH_MDS_STATE_CREATING: return "up:creating";
80 case CEPH_MDS_STATE_STARTING: return "up:starting";
81 /* up and in */
82 case CEPH_MDS_STATE_REPLAY: return "up:replay";
83 case CEPH_MDS_STATE_RESOLVE: return "up:resolve";
84 case CEPH_MDS_STATE_RECONNECT: return "up:reconnect";
85 case CEPH_MDS_STATE_REJOIN: return "up:rejoin";
86 case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
87 case CEPH_MDS_STATE_ACTIVE: return "up:active";
88 case CEPH_MDS_STATE_STOPPING: return "up:stopping";
89 }
90 return "???";
91}
92
93const char *ceph_session_op_name(int op)
94{
95 switch (op) {
96 case CEPH_SESSION_REQUEST_OPEN: return "request_open";
97 case CEPH_SESSION_OPEN: return "open";
98 case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
99 case CEPH_SESSION_CLOSE: return "close";
100 case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
101 case CEPH_SESSION_RENEWCAPS: return "renewcaps";
102 case CEPH_SESSION_STALE: return "stale";
103 case CEPH_SESSION_RECALL_STATE: return "recall_state";
104 }
105 return "???";
106}
107
108const char *ceph_mds_op_name(int op)
109{
110 switch (op) {
111 case CEPH_MDS_OP_LOOKUP: return "lookup";
112 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
113 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
114 case CEPH_MDS_OP_GETATTR: return "getattr";
115 case CEPH_MDS_OP_SETXATTR: return "setxattr";
116 case CEPH_MDS_OP_SETATTR: return "setattr";
117 case CEPH_MDS_OP_RMXATTR: return "rmxattr";
118 case CEPH_MDS_OP_READDIR: return "readdir";
119 case CEPH_MDS_OP_MKNOD: return "mknod";
120 case CEPH_MDS_OP_LINK: return "link";
121 case CEPH_MDS_OP_UNLINK: return "unlink";
122 case CEPH_MDS_OP_RENAME: return "rename";
123 case CEPH_MDS_OP_MKDIR: return "mkdir";
124 case CEPH_MDS_OP_RMDIR: return "rmdir";
125 case CEPH_MDS_OP_SYMLINK: return "symlink";
126 case CEPH_MDS_OP_CREATE: return "create";
127 case CEPH_MDS_OP_OPEN: return "open";
128 case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
129 case CEPH_MDS_OP_LSSNAP: return "lssnap";
130 case CEPH_MDS_OP_MKSNAP: return "mksnap";
131 case CEPH_MDS_OP_RMSNAP: return "rmsnap";
132 }
133 return "???";
134}
135
136const char *ceph_cap_op_name(int op)
137{
138 switch (op) {
139 case CEPH_CAP_OP_GRANT: return "grant";
140 case CEPH_CAP_OP_REVOKE: return "revoke";
141 case CEPH_CAP_OP_TRUNC: return "trunc";
142 case CEPH_CAP_OP_EXPORT: return "export";
143 case CEPH_CAP_OP_IMPORT: return "import";
144 case CEPH_CAP_OP_UPDATE: return "update";
145 case CEPH_CAP_OP_DROP: return "drop";
146 case CEPH_CAP_OP_FLUSH: return "flush";
147 case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
148 case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
149 case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
150 case CEPH_CAP_OP_RELEASE: return "release";
151 case CEPH_CAP_OP_RENEW: return "renew";
152 }
153 return "???";
154}
155
156const char *ceph_lease_op_name(int o)
157{
158 switch (o) {
159 case CEPH_MDS_LEASE_REVOKE: return "revoke";
160 case CEPH_MDS_LEASE_RELEASE: return "release";
161 case CEPH_MDS_LEASE_RENEW: return "renew";
162 case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
163 }
164 return "???";
165}
166
167const char *ceph_snap_op_name(int o)
168{
169 switch (o) {
170 case CEPH_SNAP_OP_UPDATE: return "update";
171 case CEPH_SNAP_OP_CREATE: return "create";
172 case CEPH_SNAP_OP_DESTROY: return "destroy";
173 case CEPH_SNAP_OP_SPLIT: return "split";
174 }
175 return "???";
176}
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
new file mode 100644
index 000000000000..fabd302e5779
--- /dev/null
+++ b/fs/ceph/crush/crush.c
@@ -0,0 +1,151 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include "crush.h"
12
13const char *crush_bucket_alg_name(int alg)
14{
15 switch (alg) {
16 case CRUSH_BUCKET_UNIFORM: return "uniform";
17 case CRUSH_BUCKET_LIST: return "list";
18 case CRUSH_BUCKET_TREE: return "tree";
19 case CRUSH_BUCKET_STRAW: return "straw";
20 default: return "unknown";
21 }
22}
23
24/**
25 * crush_get_bucket_item_weight - Get weight of an item in given bucket
26 * @b: bucket pointer
27 * @p: item index in bucket
28 */
29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{
31 if (p >= b->size)
32 return 0;
33
34 switch (b->alg) {
35 case CRUSH_BUCKET_UNIFORM:
36 return ((struct crush_bucket_uniform *)b)->item_weight;
37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE:
40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
43 case CRUSH_BUCKET_STRAW:
44 return ((struct crush_bucket_straw *)b)->item_weights[p];
45 }
46 return 0;
47}
48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
71
72void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
73{
74 kfree(b->h.perm);
75 kfree(b->h.items);
76 kfree(b);
77}
78
79void crush_destroy_bucket_list(struct crush_bucket_list *b)
80{
81 kfree(b->item_weights);
82 kfree(b->sum_weights);
83 kfree(b->h.perm);
84 kfree(b->h.items);
85 kfree(b);
86}
87
88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
89{
90 kfree(b->node_weights);
91 kfree(b);
92}
93
94void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
95{
96 kfree(b->straws);
97 kfree(b->item_weights);
98 kfree(b->h.perm);
99 kfree(b->h.items);
100 kfree(b);
101}
102
103void crush_destroy_bucket(struct crush_bucket *b)
104{
105 switch (b->alg) {
106 case CRUSH_BUCKET_UNIFORM:
107 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
108 break;
109 case CRUSH_BUCKET_LIST:
110 crush_destroy_bucket_list((struct crush_bucket_list *)b);
111 break;
112 case CRUSH_BUCKET_TREE:
113 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
114 break;
115 case CRUSH_BUCKET_STRAW:
116 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
117 break;
118 }
119}
120
121/**
122 * crush_destroy - Destroy a crush_map
123 * @map: crush_map pointer
124 */
125void crush_destroy(struct crush_map *map)
126{
127 int b;
128
129 /* buckets */
130 if (map->buckets) {
131 for (b = 0; b < map->max_buckets; b++) {
132 if (map->buckets[b] == NULL)
133 continue;
134 crush_destroy_bucket(map->buckets[b]);
135 }
136 kfree(map->buckets);
137 }
138
139 /* rules */
140 if (map->rules) {
141 for (b = 0; b < map->max_rules; b++)
142 kfree(map->rules[b]);
143 kfree(map->rules);
144 }
145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
148 kfree(map);
149}
150
151
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
new file mode 100644
index 000000000000..dcd7e7523700
--- /dev/null
+++ b/fs/ceph/crush/crush.h
@@ -0,0 +1,180 @@
1#ifndef _CRUSH_CRUSH_H
2#define _CRUSH_CRUSH_H
3
4#include <linux/types.h>
5
6/*
7 * CRUSH is a pseudo-random data distribution algorithm that
8 * efficiently distributes input values (typically, data objects)
9 * across a heterogeneous, structured storage cluster.
10 *
11 * The algorithm was originally described in detail in this paper
12 * (although the algorithm has evolved somewhat since then):
13 *
14 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
15 *
16 * LGPL2
17 */
18
19
20#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
21
22
23#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
24#define CRUSH_MAX_SET 10 /* max size of a mapping result */
25
26
27/*
28 * CRUSH uses user-defined "rules" to describe how inputs should be
29 * mapped to devices. A rule consists of sequence of steps to perform
30 * to generate the set of output devices.
31 */
32struct crush_rule_step {
33 __u32 op;
34 __s32 arg1;
35 __s32 arg2;
36};
37
38/* step op codes */
39enum {
40 CRUSH_RULE_NOOP = 0,
41 CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
42 CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
43 /* arg2 = type */
44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
45 CRUSH_RULE_EMIT = 4, /* no args */
46 CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
47 CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
48};
49
50/*
51 * for specifying choose num (arg1) relative to the max parameter
52 * passed to do_rule
53 */
54#define CRUSH_CHOOSE_N 0
55#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
56
57/*
58 * The rule mask is used to describe what the rule is intended for.
59 * Given a ruleset and size of output set, we search through the
60 * rule list for a matching rule_mask.
61 */
62struct crush_rule_mask {
63 __u8 ruleset;
64 __u8 type;
65 __u8 min_size;
66 __u8 max_size;
67};
68
69struct crush_rule {
70 __u32 len;
71 struct crush_rule_mask mask;
72 struct crush_rule_step steps[0];
73};
74
75#define crush_rule_size(len) (sizeof(struct crush_rule) + \
76 (len)*sizeof(struct crush_rule_step))
77
78
79
80/*
81 * A bucket is a named container of other items (either devices or
82 * other buckets). Items within a bucket are chosen using one of a
83 * few different algorithms. The table summarizes how the speed of
84 * each option measures up against mapping stability when items are
85 * added or removed.
86 *
87 * Bucket Alg Speed Additions Removals
88 * ------------------------------------------------
89 * uniform O(1) poor poor
90 * list O(n) optimal poor
91 * tree O(log n) good good
92 * straw O(n) optimal optimal
93 */
94enum {
95 CRUSH_BUCKET_UNIFORM = 1,
96 CRUSH_BUCKET_LIST = 2,
97 CRUSH_BUCKET_TREE = 3,
98 CRUSH_BUCKET_STRAW = 4
99};
100extern const char *crush_bucket_alg_name(int alg);
101
102struct crush_bucket {
103 __s32 id; /* this'll be negative */
104 __u16 type; /* non-zero; type=0 is reserved for devices */
105 __u8 alg; /* one of CRUSH_BUCKET_* */
106 __u8 hash; /* which hash function to use, CRUSH_HASH_* */
107 __u32 weight; /* 16-bit fixed point */
108 __u32 size; /* num items */
109 __s32 *items;
110
111 /*
112 * cached random permutation: used for uniform bucket and for
113 * the linear search fallback for the other bucket types.
114 */
115 __u32 perm_x; /* @x for which *perm is defined */
116 __u32 perm_n; /* num elements of *perm that are permuted/defined */
117 __u32 *perm;
118};
119
120struct crush_bucket_uniform {
121 struct crush_bucket h;
122 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
123};
124
125struct crush_bucket_list {
126 struct crush_bucket h;
127 __u32 *item_weights; /* 16-bit fixed point */
128 __u32 *sum_weights; /* 16-bit fixed point. element i is sum
129 of weights 0..i, inclusive */
130};
131
132struct crush_bucket_tree {
133 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
134 actual items */
135 __u8 num_nodes;
136 __u32 *node_weights;
137};
138
139struct crush_bucket_straw {
140 struct crush_bucket h;
141 __u32 *item_weights; /* 16-bit fixed point */
142 __u32 *straws; /* 16-bit fixed point */
143};
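All the weight fields above are 16.16 fixed point ("16-bit fixed point" meaning 16 fractional bits), so 0x10000 represents a weight of 1.0. A hedged pair of conversion helpers, with names invented by the editor:

/* editor's sketch: 16.16 fixed-point weight conversions */
static inline __u32 example_weight_from_milli(__u32 milli)
{
	return (__u32)(((__u64)milli << 16) / 1000);	/* 1000 -> 0x10000 */
}

static inline __u32 example_weight_to_percent(__u32 w)
{
	return (w * 100) >> 16;	/* same scaling the debugfs osdmap dump uses */
}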
144
145
146
147/*
148 * CRUSH map includes all buckets, rules, etc.
149 */
150struct crush_map {
151 struct crush_bucket **buckets;
152 struct crush_rule **rules;
153
154 /*
155 * Parent pointers that identify the parent bucket of each
156 * device or bucket in the hierarchy. If an item appears more
157 * than once, this is the _last_ time it appeared (where buckets
158 * are processed in bucket id order, from -1 on down to
159 * -max_buckets).
160 */
161 __u32 *bucket_parents;
162 __u32 *device_parents;
163
164 __s32 max_buckets;
165 __u32 max_rules;
166 __s32 max_devices;
167};
168
169
170/* crush.c */
171extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
172extern void crush_calc_parents(struct crush_map *map);
173extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
174extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
175extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
176extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
177extern void crush_destroy_bucket(struct crush_bucket *b);
178extern void crush_destroy(struct crush_map *map);
179
180#endif
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
new file mode 100644
index 000000000000..5873aed694bf
--- /dev/null
+++ b/fs/ceph/crush/hash.c
@@ -0,0 +1,149 @@
1
2#include <linux/types.h>
3#include "hash.h"
4
5/*
6 * Robert Jenkins' function for mixing 32-bit values
7 * http://burtleburtle.net/bob/hash/evahash.html
8 * a, b = random bits, c = input and output
9 */
10#define crush_hashmix(a, b, c) do { \
11 a = a-b; a = a-c; a = a^(c>>13); \
12 b = b-c; b = b-a; b = b^(a<<8); \
13 c = c-a; c = c-b; c = c^(b>>13); \
14 a = a-b; a = a-c; a = a^(c>>12); \
15 b = b-c; b = b-a; b = b^(a<<16); \
16 c = c-a; c = c-b; c = c^(b>>5); \
17 a = a-b; a = a-c; a = a^(c>>3); \
18 b = b-c; b = b-a; b = b^(a<<10); \
19 c = c-a; c = c-b; c = c^(b>>15); \
20 } while (0)
21
22#define crush_hash_seed 1315423911
23
24static __u32 crush_hash32_rjenkins1(__u32 a)
25{
26 __u32 hash = crush_hash_seed ^ a;
27 __u32 b = a;
28 __u32 x = 231232;
29 __u32 y = 1232;
30 crush_hashmix(b, x, hash);
31 crush_hashmix(y, a, hash);
32 return hash;
33}
34
35static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
36{
37 __u32 hash = crush_hash_seed ^ a ^ b;
38 __u32 x = 231232;
39 __u32 y = 1232;
40 crush_hashmix(a, b, hash);
41 crush_hashmix(x, a, hash);
42 crush_hashmix(b, y, hash);
43 return hash;
44}
45
46static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
47{
48 __u32 hash = crush_hash_seed ^ a ^ b ^ c;
49 __u32 x = 231232;
50 __u32 y = 1232;
51 crush_hashmix(a, b, hash);
52 crush_hashmix(c, x, hash);
53 crush_hashmix(y, a, hash);
54 crush_hashmix(b, x, hash);
55 crush_hashmix(y, c, hash);
56 return hash;
57}
58
59static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
60{
61 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
62 __u32 x = 231232;
63 __u32 y = 1232;
64 crush_hashmix(a, b, hash);
65 crush_hashmix(c, d, hash);
66 crush_hashmix(a, x, hash);
67 crush_hashmix(y, b, hash);
68 crush_hashmix(c, x, hash);
69 crush_hashmix(y, d, hash);
70 return hash;
71}
72
73static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
74 __u32 e)
75{
76 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
77 __u32 x = 231232;
78 __u32 y = 1232;
79 crush_hashmix(a, b, hash);
80 crush_hashmix(c, d, hash);
81 crush_hashmix(e, x, hash);
82 crush_hashmix(y, a, hash);
83 crush_hashmix(b, x, hash);
84 crush_hashmix(y, c, hash);
85 crush_hashmix(d, x, hash);
86 crush_hashmix(y, e, hash);
87 return hash;
88}
89
90
91__u32 crush_hash32(int type, __u32 a)
92{
93 switch (type) {
94 case CRUSH_HASH_RJENKINS1:
95 return crush_hash32_rjenkins1(a);
96 default:
97 return 0;
98 }
99}
100
101__u32 crush_hash32_2(int type, __u32 a, __u32 b)
102{
103 switch (type) {
104 case CRUSH_HASH_RJENKINS1:
105 return crush_hash32_rjenkins1_2(a, b);
106 default:
107 return 0;
108 }
109}
110
111__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
112{
113 switch (type) {
114 case CRUSH_HASH_RJENKINS1:
115 return crush_hash32_rjenkins1_3(a, b, c);
116 default:
117 return 0;
118 }
119}
120
121__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
122{
123 switch (type) {
124 case CRUSH_HASH_RJENKINS1:
125 return crush_hash32_rjenkins1_4(a, b, c, d);
126 default:
127 return 0;
128 }
129}
130
131__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
132{
133 switch (type) {
134 case CRUSH_HASH_RJENKINS1:
135 return crush_hash32_rjenkins1_5(a, b, c, d, e);
136 default:
137 return 0;
138 }
139}
140
141const char *crush_hash_name(int type)
142{
143 switch (type) {
144 case CRUSH_HASH_RJENKINS1:
145 return "rjenkins1";
146 default:
147 return "unknown";
148 }
149}
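A usage sketch (editor's illustration) showing how the mapper consumes these hashes as 16-bit uniform draws:

/* editor's sketch: a 16-bit uniform draw, as bucket_list_choose() takes */
static __u32 example_draw16(__u32 x, __u32 item, __u32 r)
{
	return crush_hash32_3(CRUSH_HASH_RJENKINS1, x, item, r) & 0xffff;
}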
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
new file mode 100644
index 000000000000..ff48e110e4bb
--- /dev/null
+++ b/fs/ceph/crush/hash.h
@@ -0,0 +1,17 @@
1#ifndef _CRUSH_HASH_H
2#define _CRUSH_HASH_H
3
4#define CRUSH_HASH_RJENKINS1 0
5
6#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
7
8extern const char *crush_hash_name(int type);
9
10extern __u32 crush_hash32(int type, __u32 a);
11extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
12extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
13extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
14extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
15 __u32 e);
16
17#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
new file mode 100644
index 000000000000..9ba54efb6543
--- /dev/null
+++ b/fs/ceph/crush/mapper.c
@@ -0,0 +1,596 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include "crush.h"
22#include "hash.h"
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
126
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
141 __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
142 r, bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG_ON(1);
156 return 0;
157}
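In words: w is a 16-bit uniform draw scaled by the cumulative weight sum_weights[i], so item i (walking from the tail) wins with probability item_weights[i] / sum_weights[i]. The acceptance test in isolation, as an editor's sketch:

/* editor's sketch: the scaled-draw acceptance test used above */
static int example_wins_draw(__u32 draw16, __u32 item_w, __u32 sum_w)
{
	/* draw16 is uniform in [0, 0xffff]; scale into [0, sum_w) */
	return (((__u64)draw16 * sum_w) >> 16) < item_w;
}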
158
159
160/* (binary) tree */
161static int height(int n)
162{
163 int h = 0;
164 while ((n & 1) == 0) {
165 h++;
166 n = n >> 1;
167 }
168 return h;
169}
170
171static int left(int x)
172{
173 int h = height(x);
174 return x - (1 << (h-1));
175}
176
177static int right(int x)
178{
179 int h = height(x);
180 return x + (1 << (h-1));
181}
182
183static int terminal(int x)
184{
185 return x & 1;
186}
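These helpers walk an implicit binary tree stored in a flat array: leaves live at odd indices and map to items via n >> 1, and an interior node n at height h has children n -/+ (1 << (h-1)). A worked example (editor's reading, consistent with bucket_tree_choose() below):

/* editor's sketch: a 4-item tree with num_nodes = 8 */
static void example_tree_layout(void)
{
	int n = 8 >> 1;		/* root = 4 */
	int l = left(n);	/* 4 - (1 << 1) = 2 */
	int r = right(n);	/* 4 + (1 << 1) = 6 */

	/* terminal (odd) nodes 1,3,5,7 map to items 0..3 via n >> 1 */
	(void)l;
	(void)r;
}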
187
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
202 bucket->h.id) * (__u64)w;
203 t = t >> 32;
204
205 /* descend to the left or right? */
206 l = left(n);
207 if (t < bucket->node_weights[l])
208 n = l;
209 else
210 n = right(n);
211 }
212
213 return bucket->h.items[n >> 1];
214}
215
216
217/* straw */
218
219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
220 int x, int r)
221{
222 int i;
223 int high = 0;
224 __u64 high_draw = 0;
225 __u64 draw;
226
227 for (i = 0; i < bucket->h.size; i++) {
228 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
229 draw &= 0xffff;
230 draw *= bucket->straws[i];
231 if (i == 0 || draw > high_draw) {
232 high = i;
233 high_draw = draw;
234 }
235 }
236 return bucket->h.items[high];
237}
238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{
241 dprintk("choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
245 x, r);
246 case CRUSH_BUCKET_LIST:
247 return bucket_list_choose((struct crush_bucket_list *)in,
248 x, r);
249 case CRUSH_BUCKET_TREE:
250 return bucket_tree_choose((struct crush_bucket_tree *)in,
251 x, r);
252 case CRUSH_BUCKET_STRAW:
253 return bucket_straw_choose((struct crush_bucket_straw *)in,
254 x, r);
255 default:
256 BUG_ON(1);
257 return in->items[0];
258 }
259}
260
261/*
262 * true if device is marked "out" (failed, fully offloaded)
263 * of the cluster
264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{
267 if (weight[item] >= 0x10000) /* full weight: 1.0 in 16.16 fixed point */
268 return 0;
269 if (weight[item] == 0)
270 return 1;
271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
272 < weight[item])
273 return 0;
274 return 1;
275}
276
277/**
278 * crush_choose - choose numrep distinct items of given type
279 * @map: the crush_map
280 * @bucket: the bucket we are choosing an item from
281 * @x: crush input value
282 * @numrep: the number of items to choose
283 * @type: the type of item to choose
284 * @out: pointer to output vector
285 * @outpos: our position in that vector
286 * @firstn: true if choosing "first n" items, false if choosing "indep"
287 * @recurse_to_leaf: true if we want one device under each item of given type
288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
289 */
290static int crush_choose(struct crush_map *map,
291 struct crush_bucket *bucket,
292 __u32 *weight,
293 int x, int numrep, int type,
294 int *out, int outpos,
295 int firstn, int recurse_to_leaf,
296 int *out2)
297{
298 int rep;
299 int ftotal, flocal;
300 int retry_descent, retry_bucket, skip_rep;
301 struct crush_bucket *in = bucket;
302 int r;
303 int i;
304 int item = 0;
305 int itemtype;
306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */
308 dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
309
310 for (rep = outpos; rep < numrep; rep++) {
311 /* keep trying until we get a non-out, non-colliding item */
312 ftotal = 0;
313 skip_rep = 0;
314 do {
315 retry_descent = 0;
316 in = bucket; /* initial bucket */
317
318 /* choose through intervening buckets */
319 flocal = 0;
320 do {
321 collide = 0;
322 retry_bucket = 0;
323 r = rep;
324 if (in->alg == CRUSH_BUCKET_UNIFORM) {
325 /* be careful */
326 if (firstn || numrep >= in->size)
327 /* r' = r + f_total */
328 r += ftotal;
329 else if (in->size % numrep == 0)
330 /* r'=r+(n+1)*f_local */
331 r += (numrep+1) *
332 (flocal+ftotal);
333 else
334 /* r' = r + n*f_local */
335 r += numrep * (flocal+ftotal);
336 } else {
337 if (firstn)
338 /* r' = r + f_total */
339 r += ftotal;
340 else
341 /* r' = r + n*f_local */
342 r += numrep * (flocal+ftotal);
343 }
344
345 /* bucket choose */
346 if (in->size == 0) {
347 reject = 1;
348 goto reject;
349 }
350 if (flocal >= (in->size>>1) &&
351 flocal > orig_tries)
352 item = bucket_perm_choose(in, x, r);
353 else
354 item = crush_bucket_choose(in, x, r);
355 BUG_ON(item >= map->max_devices);
356
357 /* desired type? */
358 if (item < 0)
359 itemtype = map->buckets[-1-item]->type;
360 else
361 itemtype = 0;
362 dprintk(" item %d type %d\n", item, itemtype);
363
364 /* keep going? */
365 if (itemtype != type) {
366 BUG_ON(item >= 0 ||
367 (-1-item) >= map->max_buckets);
368 in = map->buckets[-1-item];
369 retry_bucket = 1; continue; /* descend into the sub-bucket */
370 }
371
372 /* collision? */
373 for (i = 0; i < outpos; i++) {
374 if (out[i] == item) {
375 collide = 1;
376 break;
377 }
378 }
379
380 if (recurse_to_leaf &&
381 item < 0 &&
382 crush_choose(map, map->buckets[-1-item],
383 weight,
384 x, outpos+1, 0,
385 out2, outpos,
386 firstn, 0, NULL) <= outpos) {
387 reject = 1;
388 } else {
389 /* out? */
390 if (itemtype == 0)
391 reject = is_out(map, weight,
392 item, x);
393 else
394 reject = 0;
395 }
396
397reject:
398 if (reject || collide) {
399 ftotal++;
400 flocal++;
401
402 if (collide && flocal < 3)
403 /* retry locally a few times */
404 retry_bucket = 1;
405 else if (flocal < in->size + orig_tries)
406 /* exhaustive bucket search */
407 retry_bucket = 1;
408 else if (ftotal < 20)
409 /* then retry descent */
410 retry_descent = 1;
411 else
412 /* else give up */
413 skip_rep = 1;
414 dprintk(" reject %d collide %d "
415 "ftotal %d flocal %d\n",
416 reject, collide, ftotal,
417 flocal);
418 }
419 } while (retry_bucket);
420 } while (retry_descent);
421
422 if (skip_rep) {
423 dprintk("skip rep\n");
424 continue;
425 }
426
427 dprintk("choose got %d\n", item);
428 out[outpos] = item;
429 outpos++;
430 }
431
432 dprintk("choose returns %d\n", outpos);
433 return outpos;
434}
435
436
437/**
438 * crush_do_rule - calculate a mapping with the given input and rule
439 * @map: the crush_map
440 * @ruleno: the rule id
441 * @x: hash input
442 * @result: pointer to result vector
443 * @result_max: maximum result size
444 * @force: force initial replica choice; -1 for none
445 */
446int crush_do_rule(struct crush_map *map,
447 int ruleno, int x, int *result, int result_max,
448 int force, __u32 *weight)
449{
450 int result_len;
451 int force_context[CRUSH_MAX_DEPTH];
452 int force_pos = -1;
453 int a[CRUSH_MAX_SET];
454 int b[CRUSH_MAX_SET];
455 int c[CRUSH_MAX_SET];
456 int recurse_to_leaf;
457 int *w;
458 int wsize = 0;
459 int *o;
460 int osize;
461 int *tmp;
462 struct crush_rule *rule;
463 int step;
464 int i, j;
465 int numrep;
466 int firstn;
467 int rc = -1;
468
469 BUG_ON(ruleno >= map->max_rules);
470
471 rule = map->rules[ruleno];
472 result_len = 0;
473 w = a;
474 o = b;
475
476 /*
477 * determine hierarchical context of force, if any. note
478 * that this may or may not correspond to the specific types
479 * referenced by the crush rule.
480 */
481 if (force >= 0) {
482 if (force >= map->max_devices ||
483 map->device_parents[force] == 0) {
484 /*dprintk("CRUSH: forcefed device dne\n");*/
485 rc = -1; /* force fed device dne */
486 goto out;
487 }
488 if (!is_out(map, weight, force, x)) {
489 while (1) {
490 force_context[++force_pos] = force;
491 if (force >= 0)
492 force = map->device_parents[force];
493 else
494 force = map->bucket_parents[-1-force];
495 if (force == 0)
496 break;
497 }
498 }
499 }
500
501 for (step = 0; step < rule->len; step++) {
502 firstn = 0;
503 switch (rule->steps[step].op) {
504 case CRUSH_RULE_TAKE:
505 w[0] = rule->steps[step].arg1;
506 if (force_pos >= 0) {
507 BUG_ON(force_context[force_pos] != w[0]);
508 force_pos--;
509 }
510 wsize = 1;
511 break;
512
513 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
514 case CRUSH_RULE_CHOOSE_FIRSTN:
515 firstn = 1;
516 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
517 case CRUSH_RULE_CHOOSE_INDEP:
518 BUG_ON(wsize == 0);
519
520 recurse_to_leaf =
521 rule->steps[step].op ==
522 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
523 rule->steps[step].op ==
524 CRUSH_RULE_CHOOSE_LEAF_INDEP;
525
526 /* reset output */
527 osize = 0;
528
529 for (i = 0; i < wsize; i++) {
530 /*
531 * see the CRUSH_CHOOSE_N, CRUSH_CHOOSE_N_MINUS macros.
532 * basically, numrep <= 0 means relative to
533 * the provided result_max
534 */
535 numrep = rule->steps[step].arg1;
536 if (numrep <= 0) {
537 numrep += result_max;
538 if (numrep <= 0)
539 continue;
540 }
541 j = 0;
542 if (osize == 0 && force_pos >= 0) {
543 /* skip any intermediate types */
544 while (force_pos &&
545 force_context[force_pos] < 0 &&
546 rule->steps[step].arg2 !=
547 map->buckets[-1 -
548 force_context[force_pos]]->type)
549 force_pos--;
550 o[osize] = force_context[force_pos];
551 if (recurse_to_leaf)
552 c[osize] = force_context[0];
553 j++;
554 force_pos--;
555 }
556 osize += crush_choose(map,
557 map->buckets[-1-w[i]],
558 weight,
559 x, numrep,
560 rule->steps[step].arg2,
561 o+osize, j,
562 firstn,
563 recurse_to_leaf, c+osize);
564 }
565
566 if (recurse_to_leaf)
567 /* copy final _leaf_ values to output set */
568 memcpy(o, c, osize*sizeof(*o));
569
570 /* swap o and w arrays */
571 tmp = o;
572 o = w;
573 w = tmp;
574 wsize = osize;
575 break;
576
577
578 case CRUSH_RULE_EMIT:
579 for (i = 0; i < wsize && result_len < result_max; i++) {
580 result[result_len] = w[i];
581 result_len++;
582 }
583 wsize = 0;
584 break;
585
586 default:
587 BUG_ON(1);
588 }
589 }
590 rc = result_len;
591
592out:
593 return rc;
594}
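A hedged end-to-end sketch of how a caller ties crush_find_rule() and crush_do_rule() together; the ruleset and type values are placeholders invented for illustration:

/* editor's sketch: map input x to up to two devices */
static int example_map(struct crush_map *map, int x, __u32 *weights)
{
	int result[CRUSH_MAX_SET];
	int ruleno = crush_find_rule(map, 0 /* ruleset */, 1 /* type */, 2);

	if (ruleno < 0)
		return -1;
	return crush_do_rule(map, ruleno, x, result, 2,
			     -1 /* no forced device */, weights);
}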
595
596
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
new file mode 100644
index 000000000000..98e90046fd9f
--- /dev/null
+++ b/fs/ceph/crush/mapper.h
@@ -0,0 +1,20 @@
1#ifndef _CRUSH_MAPPER_H
2#define _CRUSH_MAPPER_H
3
4/*
5 * CRUSH functions for finding a rule and then mapping an input to
6 * an output set.
7 *
8 * LGPL2
9 */
10
11#include "crush.h"
12
13extern int crush_find_rule(struct crush_map *map, int ruleset, int type, int size);
14extern int crush_do_rule(struct crush_map *map,
15 int ruleno,
16 int x, int *result, int result_max,
17 int forcefeed, /* -1 for none */
18 __u32 *weights);
19
20#endif
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
new file mode 100644
index 000000000000..f704b3b62424
--- /dev/null
+++ b/fs/ceph/crypto.c
@@ -0,0 +1,409 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/scatterlist.h>
6#include <linux/slab.h>
7#include <crypto/hash.h>
8
9#include "crypto.h"
10#include "decode.h"
11
12int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
13{
14 if (*p + sizeof(u16) + sizeof(key->created) +
15 sizeof(u16) + key->len > end)
16 return -ERANGE;
17 ceph_encode_16(p, key->type);
18 ceph_encode_copy(p, &key->created, sizeof(key->created));
19 ceph_encode_16(p, key->len);
20 ceph_encode_copy(p, key->key, key->len);
21 return 0;
22}
23
24int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
25{
26 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
27 key->type = ceph_decode_16(p);
28 ceph_decode_copy(p, &key->created, sizeof(key->created));
29 key->len = ceph_decode_16(p);
30 ceph_decode_need(p, end, key->len, bad);
31 key->key = kmalloc(key->len, GFP_NOFS);
32 if (!key->key)
33 return -ENOMEM;
34 ceph_decode_copy(p, key->key, key->len);
35 return 0;
36
37bad:
38 dout("failed to decode crypto key\n");
39 return -EINVAL;
40}
41
42int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
43{
44 int inlen = strlen(inkey);
45 int blen = inlen * 3 / 4;
46 void *buf, *p;
47 int ret;
48
49 dout("crypto_key_unarmor %s\n", inkey);
50 buf = kmalloc(blen, GFP_NOFS);
51 if (!buf)
52 return -ENOMEM;
53 blen = ceph_unarmor(buf, inkey, inkey+inlen);
54 if (blen < 0) {
55 kfree(buf);
56 return blen;
57 }
58
59 p = buf;
60 ret = ceph_crypto_key_decode(key, &p, p + blen);
61 kfree(buf);
62 if (ret)
63 return ret;
64 dout("crypto_key_unarmor key %p type %d len %d\n", key,
65 key->type, key->len);
66 return 0;
67}
68
69
70
71#define AES_KEY_SIZE 16
72
73static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
74{
75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
76}
77
78const u8 *aes_iv = "cephsageyudagreg";
79
80int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
81 const void *src, size_t src_len)
82{
83 struct scatterlist sg_in[2], sg_out[1];
84 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
85 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
86 int ret;
87 void *iv;
88 int ivsize;
89 size_t zero_padding = (0x10 - (src_len & 0x0f));
90 char pad[16];
91
92 if (IS_ERR(tfm))
93 return PTR_ERR(tfm);
94
95 memset(pad, zero_padding, zero_padding);
96
97 *dst_len = src_len + zero_padding;
98
99 crypto_blkcipher_setkey((void *)tfm, key, key_len);
100 sg_init_table(sg_in, 2);
101 sg_set_buf(&sg_in[0], src, src_len);
102 sg_set_buf(&sg_in[1], pad, zero_padding);
103 sg_init_table(sg_out, 1);
104 sg_set_buf(sg_out, dst, *dst_len);
105 iv = crypto_blkcipher_crt(tfm)->iv;
106 ivsize = crypto_blkcipher_ivsize(tfm);
107
108 memcpy(iv, aes_iv, ivsize);
109 /*
110 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
111 key, key_len, 1);
112 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
113 src, src_len, 1);
114 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
115 pad, zero_padding, 1);
116 */
117 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
118 src_len + zero_padding);
119 crypto_free_blkcipher(tfm);
120 if (ret < 0)
121 pr_err("ceph_aes_crypt failed %d\n", ret);
122 /*
123 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
124 dst, *dst_len, 1);
125 */
126 return ret; /* 0 on success, else error from the crypto layer */
127}
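The cipher runs over src plus a PKCS#7-style pad: the pad length is 16 - (src_len mod 16), always 1..16, and every pad byte holds that length, which is how the decrypt paths below recover it. Stated standalone (editor's sketch):

/* editor's sketch: the padding rule shared by the encrypt/decrypt paths */
static size_t example_pad_len(size_t src_len)
{
	return 0x10 - (src_len & 0x0f);		/* always 1..16 */
}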
128
129int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
130 const void *src1, size_t src1_len,
131 const void *src2, size_t src2_len)
132{
133 struct scatterlist sg_in[3], sg_out[1];
134 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
135 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
136 int ret;
137 void *iv;
138 int ivsize;
139 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
140 char pad[16];
141
142 if (IS_ERR(tfm))
143 return PTR_ERR(tfm);
144
145 memset(pad, zero_padding, zero_padding);
146
147 *dst_len = src1_len + src2_len + zero_padding;
148
149 crypto_blkcipher_setkey((void *)tfm, key, key_len);
150 sg_init_table(sg_in, 3);
151 sg_set_buf(&sg_in[0], src1, src1_len);
152 sg_set_buf(&sg_in[1], src2, src2_len);
153 sg_set_buf(&sg_in[2], pad, zero_padding);
154 sg_init_table(sg_out, 1);
155 sg_set_buf(sg_out, dst, *dst_len);
156 iv = crypto_blkcipher_crt(tfm)->iv;
157 ivsize = crypto_blkcipher_ivsize(tfm);
158
159 memcpy(iv, aes_iv, ivsize);
160 /*
161 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
162 key, key_len, 1);
163 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
164 src1, src1_len, 1);
165 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
166 src2, src2_len, 1);
167 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
168 pad, zero_padding, 1);
169 */
170 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
171 src1_len + src2_len + zero_padding);
172 crypto_free_blkcipher(tfm);
173 if (ret < 0)
174 pr_err("ceph_aes_encrypt2 failed %d\n", ret);
175 /*
176 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
177 dst, *dst_len, 1);
178 */
179 return ret; /* 0 on success, else error from the crypto layer */
180}
181
182int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
183 const void *src, size_t src_len)
184{
185 struct scatterlist sg_in[1], sg_out[2];
186 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
187 struct blkcipher_desc desc = { .tfm = tfm };
188 char pad[16];
189 void *iv;
190 int ivsize;
191 int ret;
192 int last_byte;
193
194 if (IS_ERR(tfm))
195 return PTR_ERR(tfm);
196
197 crypto_blkcipher_setkey((void *)tfm, key, key_len);
198 sg_init_table(sg_in, 1);
199 sg_init_table(sg_out, 2);
200 sg_set_buf(sg_in, src, src_len);
201 sg_set_buf(&sg_out[0], dst, *dst_len);
202 sg_set_buf(&sg_out[1], pad, sizeof(pad));
203
204 iv = crypto_blkcipher_crt(tfm)->iv;
205 ivsize = crypto_blkcipher_ivsize(tfm);
206
207 memcpy(iv, aes_iv, ivsize);
208
209 /*
210 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
211 key, key_len, 1);
212 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
213 src, src_len, 1);
214 */
215
216 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
217 crypto_free_blkcipher(tfm);
218 if (ret < 0) {
219 pr_err("ceph_aes_decrypt failed %d\n", ret);
220 return ret;
221 }
222
223 if (src_len <= *dst_len)
224 last_byte = ((char *)dst)[src_len - 1];
225 else
226 last_byte = pad[src_len - *dst_len - 1];
227 if (last_byte > 0 && last_byte <= 16 && src_len >= last_byte) {
228 *dst_len = src_len - last_byte;
229 } else {
230 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
231 last_byte, (int)src_len);
232 return -EPERM; /* bad padding */
233 }
234 /*
235 print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
236 dst, *dst_len, 1);
237 */
238 return 0;
239}
240
241int ceph_aes_decrypt2(const void *key, int key_len,
242 void *dst1, size_t *dst1_len,
243 void *dst2, size_t *dst2_len,
244 const void *src, size_t src_len)
245{
246 struct scatterlist sg_in[1], sg_out[3];
247 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
248 struct blkcipher_desc desc = { .tfm = tfm };
249 char pad[16];
250 void *iv;
251 int ivsize;
252 int ret;
253 int last_byte;
254
255 if (IS_ERR(tfm))
256 return PTR_ERR(tfm);
257
258 sg_init_table(sg_in, 1);
259 sg_set_buf(sg_in, src, src_len);
260 sg_init_table(sg_out, 3);
261 sg_set_buf(&sg_out[0], dst1, *dst1_len);
262 sg_set_buf(&sg_out[1], dst2, *dst2_len);
263 sg_set_buf(&sg_out[2], pad, sizeof(pad));
264
265 crypto_blkcipher_setkey((void *)tfm, key, key_len);
266 iv = crypto_blkcipher_crt(tfm)->iv;
267 ivsize = crypto_blkcipher_ivsize(tfm);
268
269 memcpy(iv, aes_iv, ivsize);
270
271 /*
272 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
273 key, key_len, 1);
274 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
275 src, src_len, 1);
276 */
277
278 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
279 crypto_free_blkcipher(tfm);
280 if (ret < 0) {
281 pr_err("ceph_aes_decrypt2 failed %d\n", ret);
282 return ret;
283 }
284
285 if (src_len <= *dst1_len)
286 last_byte = ((char *)dst1)[src_len - 1];
287 else if (src_len <= *dst1_len + *dst2_len)
288 last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
289 else
290 last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
291 if (last_byte > 0 && last_byte <= 16 && src_len >= last_byte) {
292 src_len -= last_byte;
293 } else {
294 pr_err("ceph_aes_decrypt2 got bad padding %d on src len %d\n",
295 last_byte, (int)src_len);
296 return -EPERM; /* bad padding */
297 }
298
299 if (src_len < *dst1_len) {
300 *dst1_len = src_len;
301 *dst2_len = 0;
302 } else {
303 *dst2_len = src_len - *dst1_len;
304 }
305 /*
306 print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
307 dst1, *dst1_len, 1);
308 print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
309 dst2, *dst2_len, 1);
310 */
311
312 return 0;
313}
314
315
316int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
317 const void *src, size_t src_len)
318{
319 switch (secret->type) {
320 case CEPH_CRYPTO_NONE:
321 if (*dst_len < src_len)
322 return -ERANGE;
323 memcpy(dst, src, src_len);
324 *dst_len = src_len;
325 return 0;
326
327 case CEPH_CRYPTO_AES:
328 return ceph_aes_decrypt(secret->key, secret->len, dst,
329 dst_len, src, src_len);
330
331 default:
332 return -EINVAL;
333 }
334}
335
336int ceph_decrypt2(struct ceph_crypto_key *secret,
337 void *dst1, size_t *dst1_len,
338 void *dst2, size_t *dst2_len,
339 const void *src, size_t src_len)
340{
341 size_t t;
342
343 switch (secret->type) {
344 case CEPH_CRYPTO_NONE:
345 if (*dst1_len + *dst2_len < src_len)
346 return -ERANGE;
347 t = min(*dst1_len, src_len);
348 memcpy(dst1, src, t);
349 *dst1_len = t;
350 src += t;
351 src_len -= t;
352 if (src_len) {
353 t = min(*dst2_len, src_len);
354 memcpy(dst2, src, t);
355 *dst2_len = t;
356 }
357 return 0;
358
359 case CEPH_CRYPTO_AES:
360 return ceph_aes_decrypt2(secret->key, secret->len,
361 dst1, dst1_len, dst2, dst2_len,
362 src, src_len);
363
364 default:
365 return -EINVAL;
366 }
367}
368
369int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
370 const void *src, size_t src_len)
371{
372 switch (secret->type) {
373 case CEPH_CRYPTO_NONE:
374 if (*dst_len < src_len)
375 return -ERANGE;
376 memcpy(dst, src, src_len);
377 *dst_len = src_len;
378 return 0;
379
380 case CEPH_CRYPTO_AES:
381 return ceph_aes_encrypt(secret->key, secret->len, dst,
382 dst_len, src, src_len);
383
384 default:
385 return -EINVAL;
386 }
387}
388
389int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
390 const void *src1, size_t src1_len,
391 const void *src2, size_t src2_len)
392{
393 switch (secret->type) {
394 case CEPH_CRYPTO_NONE:
395 if (*dst_len < src1_len + src2_len)
396 return -ERANGE;
397 memcpy(dst, src1, src1_len);
398 memcpy(dst + src1_len, src2, src2_len);
399 *dst_len = src1_len + src2_len;
400 return 0;
401
402 case CEPH_CRYPTO_AES:
403 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
404 src1, src1_len, src2, src2_len);
405
406 default:
407 return -EINVAL;
408 }
409}
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
new file mode 100644
index 000000000000..40b502e6bd89
--- /dev/null
+++ b/fs/ceph/crypto.h
@@ -0,0 +1,48 @@
1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * cryptographic secret
9 */
10struct ceph_crypto_key {
11 int type;
12 struct ceph_timespec created;
13 int len;
14 void *key;
15};
16
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{
19 kfree(key->key);
20}
21
22extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
23 void **p, void *end);
24extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
25 void **p, void *end);
26extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
27
28/* crypto.c */
29extern int ceph_decrypt(struct ceph_crypto_key *secret,
30 void *dst, size_t *dst_len,
31 const void *src, size_t src_len);
32extern int ceph_encrypt(struct ceph_crypto_key *secret,
33 void *dst, size_t *dst_len,
34 const void *src, size_t src_len);
35extern int ceph_decrypt2(struct ceph_crypto_key *secret,
36 void *dst1, size_t *dst1_len,
37 void *dst2, size_t *dst2_len,
38 const void *src, size_t src_len);
39extern int ceph_encrypt2(struct ceph_crypto_key *secret,
40 void *dst, size_t *dst_len,
41 const void *src1, size_t src1_len,
42 const void *src2, size_t src2_len);
43
44/* armor.c */
45extern int ceph_armor(char *dst, const void *src, const void *end);
46extern int ceph_unarmor(void *dst, const char *src, const char *end);
47
48#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
new file mode 100644
index 000000000000..f7048da92acc
--- /dev/null
+++ b/fs/ceph/debugfs.c
@@ -0,0 +1,484 @@
1#include "ceph_debug.h"
2
3#include <linux/device.h>
4#include <linux/slab.h>
5#include <linux/module.h>
6#include <linux/ctype.h>
7#include <linux/debugfs.h>
8#include <linux/seq_file.h>
9
10#include "super.h"
11#include "mds_client.h"
12#include "mon_client.h"
13#include "auth.h"
14
15#ifdef CONFIG_DEBUG_FS
16
17/*
18 * Implement /sys/kernel/debug/ceph fun
19 *
20 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
21 * .../osdmap - current osdmap
22 * .../mdsmap - current mdsmap
23 * .../monmap - current monmap
24 * .../osdc - active osd requests
25 * .../mdsc - active mds requests
26 * .../monc - mon client state
27 * .../dentry_lru - dump contents of dentry lru
28 * .../caps - expose cap (reservation) stats
29 * .../bdi - symlink to ../../bdi/something
30 */
31
32static struct dentry *ceph_debugfs_dir;
33
34static int monmap_show(struct seq_file *s, void *p)
35{
36 int i;
37 struct ceph_client *client = s->private;
38
39 if (client->monc.monmap == NULL)
40 return 0;
41
42 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
43 for (i = 0; i < client->monc.monmap->num_mon; i++) {
44 struct ceph_entity_inst *inst =
45 &client->monc.monmap->mon_inst[i];
46
47 seq_printf(s, "\t%s%lld\t%s\n",
48 ENTITY_NAME(inst->name),
49 pr_addr(&inst->addr.in_addr));
50 }
51 return 0;
52}
53
54static int mdsmap_show(struct seq_file *s, void *p)
55{
56 int i;
57 struct ceph_client *client = s->private;
58
59 if (client->mdsc.mdsmap == NULL)
60 return 0;
61 seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
62 seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
63 seq_printf(s, "session_timeout %d\n",
64 client->mdsc.mdsmap->m_session_timeout);
65 seq_printf(s, "session_autoclose %d\n",
66 client->mdsc.mdsmap->m_session_autoclose);
67 for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
68 struct ceph_entity_addr *addr =
69 &client->mdsc.mdsmap->m_info[i].addr;
70 int state = client->mdsc.mdsmap->m_info[i].state;
71
72 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
73 ceph_mds_state_name(state));
74 }
75 return 0;
76}
77
78static int osdmap_show(struct seq_file *s, void *p)
79{
80 int i;
81 struct ceph_client *client = s->private;
82 struct rb_node *n;
83
84 if (client->osdc.osdmap == NULL)
85 return 0;
86 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
87 seq_printf(s, "flags%s%s\n",
88 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
89 " NEARFULL" : "",
90 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
91 " FULL" : "");
92 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
93 struct ceph_pg_pool_info *pool =
94 rb_entry(n, struct ceph_pg_pool_info, node);
95 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
96 pool->id, pool->v.pg_num, pool->pg_num_mask,
97 pool->v.lpg_num, pool->lpg_num_mask);
98 }
99 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
100 struct ceph_entity_addr *addr =
101 &client->osdc.osdmap->osd_addr[i];
102 int state = client->osdc.osdmap->osd_state[i];
103 char sb[64];
104
105 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
106 i, pr_addr(&addr->in_addr),
107 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
108 ceph_osdmap_state_str(sb, sizeof(sb), state));
109 }
110 return 0;
111}
112
113static int monc_show(struct seq_file *s, void *p)
114{
115 struct ceph_client *client = s->private;
116 struct ceph_mon_statfs_request *req;
117 struct ceph_mon_client *monc = &client->monc;
118 struct rb_node *rp;
119
120 mutex_lock(&monc->mutex);
121
122 if (monc->have_mdsmap)
123 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
124 if (monc->have_osdmap)
125 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
126 if (monc->want_next_osdmap)
127 seq_printf(s, "want next osdmap\n");
128
129 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) {
130 req = rb_entry(rp, struct ceph_mon_statfs_request, node);
131 seq_printf(s, "%lld statfs\n", req->tid);
132 }
133
134 mutex_unlock(&monc->mutex);
135 return 0;
136}
137
138static int mdsc_show(struct seq_file *s, void *p)
139{
140 struct ceph_client *client = s->private;
141 struct ceph_mds_client *mdsc = &client->mdsc;
142 struct ceph_mds_request *req;
143 struct rb_node *rp;
144 int pathlen;
145 u64 pathbase;
146 char *path;
147
148 mutex_lock(&mdsc->mutex);
149 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
150 req = rb_entry(rp, struct ceph_mds_request, r_node);
151
152 if (req->r_request)
153 seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
154 else
155 seq_printf(s, "%lld\t(no request)\t", req->r_tid);
156
157 seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
158
159 if (req->r_got_unsafe)
160 seq_printf(s, "\t(unsafe)");
161 else
162 seq_printf(s, "\t");
163
164 if (req->r_inode) {
165 seq_printf(s, " #%llx", ceph_ino(req->r_inode));
166 } else if (req->r_dentry) {
167 path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
168 &pathbase, 0);
169 spin_lock(&req->r_dentry->d_lock);
170 seq_printf(s, " #%llx/%.*s (%s)",
171 ceph_ino(req->r_dentry->d_parent->d_inode),
172 req->r_dentry->d_name.len,
173 req->r_dentry->d_name.name,
174 path ? path : "");
175 spin_unlock(&req->r_dentry->d_lock);
176 kfree(path);
177 } else if (req->r_path1) {
178 seq_printf(s, " #%llx/%s", req->r_ino1.ino,
179 req->r_path1);
180 }
181
182 if (req->r_old_dentry) {
183 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
184 &pathbase, 0);
185 spin_lock(&req->r_old_dentry->d_lock);
186 seq_printf(s, " #%llx/%.*s (%s)",
187 ceph_ino(req->r_old_dentry->d_parent->d_inode),
188 req->r_old_dentry->d_name.len,
189 req->r_old_dentry->d_name.name,
190 path ? path : "");
191 spin_unlock(&req->r_old_dentry->d_lock);
192 kfree(path);
193 } else if (req->r_path2) {
194 if (req->r_ino2.ino)
195 seq_printf(s, " #%llx/%s", req->r_ino2.ino,
196 req->r_path2);
197 else
198 seq_printf(s, " %s", req->r_path2);
199 }
200
201 seq_printf(s, "\n");
202 }
203 mutex_unlock(&mdsc->mutex);
204
205 return 0;
206}
207
208static int osdc_show(struct seq_file *s, void *pp)
209{
210 struct ceph_client *client = s->private;
211 struct ceph_osd_client *osdc = &client->osdc;
212 struct rb_node *p;
213
214 mutex_lock(&osdc->request_mutex);
215 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
216 struct ceph_osd_request *req;
217 struct ceph_osd_request_head *head;
218 struct ceph_osd_op *op;
219 int num_ops;
220 int opcode, olen;
221 int i;
222
223 req = rb_entry(p, struct ceph_osd_request, r_node);
224
225 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
226 req->r_osd ? req->r_osd->o_osd : -1,
227 le32_to_cpu(req->r_pgid.pool),
228 le16_to_cpu(req->r_pgid.ps));
229
230 head = req->r_request->front.iov_base;
231 op = (void *)(head + 1);
232
233 num_ops = le16_to_cpu(head->num_ops);
234 olen = le32_to_cpu(head->object_len);
235 seq_printf(s, "%.*s", olen,
236 (const char *)(head->ops + num_ops));
237
238 if (req->r_reassert_version.epoch)
239 seq_printf(s, "\t%u'%llu",
240 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
241 le64_to_cpu(req->r_reassert_version.version));
242 else
243 seq_printf(s, "\t");
244
245 for (i = 0; i < num_ops; i++) {
246 opcode = le16_to_cpu(op->op);
247 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
248 op++;
249 }
250
251 seq_printf(s, "\n");
252 }
253 mutex_unlock(&osdc->request_mutex);
254 return 0;
255}
256
257static int caps_show(struct seq_file *s, void *p)
258{
259	struct ceph_client *client = s->private;
260 int total, avail, used, reserved, min;
261
262 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
263 seq_printf(s, "total\t\t%d\n"
264 "avail\t\t%d\n"
265 "used\t\t%d\n"
266 "reserved\t%d\n"
267		   "min\t\t%d\n",
268 total, avail, used, reserved, min);
269 return 0;
270}
271
272static int dentry_lru_show(struct seq_file *s, void *ptr)
273{
274 struct ceph_client *client = s->private;
275 struct ceph_mds_client *mdsc = &client->mdsc;
276 struct ceph_dentry_info *di;
277
278 spin_lock(&mdsc->dentry_lru_lock);
279 list_for_each_entry(di, &mdsc->dentry_lru, lru) {
280 struct dentry *dentry = di->dentry;
281 seq_printf(s, "%p %p\t%.*s\n",
282 di, dentry, dentry->d_name.len, dentry->d_name.name);
283 }
284 spin_unlock(&mdsc->dentry_lru_lock);
285
286 return 0;
287}
288
289#define DEFINE_SHOW_FUNC(name) \
290static int name##_open(struct inode *inode, struct file *file) \
291{ \
292 struct seq_file *sf; \
293 int ret; \
294 \
295 ret = single_open(file, name, NULL); \
296 sf = file->private_data; \
297 sf->private = inode->i_private; \
298 return ret; \
299} \
300 \
301static const struct file_operations name##_fops = { \
302 .open = name##_open, \
303 .read = seq_read, \
304 .llseek = seq_lseek, \
305 .release = single_release, \
306};
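For reference, DEFINE_SHOW_FUNC(monmap_show) expands to roughly the following: a single_open() wrapper that copies the inode's i_private (the ceph_client passed to debugfs_create_file() below) into the seq_file, plus the matching file_operations:

	static int monmap_show_open(struct inode *inode, struct file *file)
	{
		struct seq_file *sf;
		int ret;

		ret = single_open(file, monmap_show, NULL);
		sf = file->private_data;
		sf->private = inode->i_private;	/* the struct ceph_client */
		return ret;
	}

	static const struct file_operations monmap_show_fops = {
		.open		= monmap_show_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= single_release,
	};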
307
308DEFINE_SHOW_FUNC(monmap_show)
309DEFINE_SHOW_FUNC(mdsmap_show)
310DEFINE_SHOW_FUNC(osdmap_show)
311DEFINE_SHOW_FUNC(monc_show)
312DEFINE_SHOW_FUNC(mdsc_show)
313DEFINE_SHOW_FUNC(osdc_show)
314DEFINE_SHOW_FUNC(dentry_lru_show)
315DEFINE_SHOW_FUNC(caps_show)
316
317static int congestion_kb_set(void *data, u64 val)
318{
319 struct ceph_client *client = (struct ceph_client *)data;
320
321 if (client)
322 client->mount_args->congestion_kb = (int)val;
323
324 return 0;
325}
326
327static int congestion_kb_get(void *data, u64 *val)
328{
329 struct ceph_client *client = (struct ceph_client *)data;
330
331 if (client)
332 *val = (u64)client->mount_args->congestion_kb;
333
334 return 0;
335}
336
337
338DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
339 congestion_kb_set, "%llu\n");
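DEFINE_SIMPLE_ATTRIBUTE is the stock libfs helper for exactly this single-u64 get/set pattern; per include/linux/fs.h of this era it expands to roughly the following (sketch, slightly simplified), with inode->i_private -- the ceph_client -- handed to the callbacks by simple_attr_open():

	static int congestion_kb_fops_open(struct inode *inode, struct file *file)
	{
		return simple_attr_open(inode, file, congestion_kb_get,
					congestion_kb_set, "%llu\n");
	}

	static const struct file_operations congestion_kb_fops = {
		.owner	 = THIS_MODULE,
		.open	 = congestion_kb_fops_open,
		.release = simple_attr_release,
		.read	 = simple_attr_read,
		.write	 = simple_attr_write,
	};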
340
341int __init ceph_debugfs_init(void)
342{
343 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
344 if (!ceph_debugfs_dir)
345 return -ENOMEM;
346 return 0;
347}
348
349void ceph_debugfs_cleanup(void)
350{
351 debugfs_remove(ceph_debugfs_dir);
352}
353
354int ceph_debugfs_client_init(struct ceph_client *client)
355{
356 int ret = 0;
357 char name[80];
358
359 snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
360 PR_FSID(&client->fsid), client->monc.auth->global_id);
361
362 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
363 if (!client->debugfs_dir)
364 goto out;
365
366 client->monc.debugfs_file = debugfs_create_file("monc",
367 0600,
368 client->debugfs_dir,
369 client,
370 &monc_show_fops);
371 if (!client->monc.debugfs_file)
372 goto out;
373
374 client->mdsc.debugfs_file = debugfs_create_file("mdsc",
375 0600,
376 client->debugfs_dir,
377 client,
378 &mdsc_show_fops);
379 if (!client->mdsc.debugfs_file)
380 goto out;
381
382 client->osdc.debugfs_file = debugfs_create_file("osdc",
383 0600,
384 client->debugfs_dir,
385 client,
386 &osdc_show_fops);
387 if (!client->osdc.debugfs_file)
388 goto out;
389
390 client->debugfs_monmap = debugfs_create_file("monmap",
391 0600,
392 client->debugfs_dir,
393 client,
394 &monmap_show_fops);
395 if (!client->debugfs_monmap)
396 goto out;
397
398 client->debugfs_mdsmap = debugfs_create_file("mdsmap",
399 0600,
400 client->debugfs_dir,
401 client,
402 &mdsmap_show_fops);
403 if (!client->debugfs_mdsmap)
404 goto out;
405
406 client->debugfs_osdmap = debugfs_create_file("osdmap",
407 0600,
408 client->debugfs_dir,
409 client,
410 &osdmap_show_fops);
411 if (!client->debugfs_osdmap)
412 goto out;
413
414 client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
415 0600,
416 client->debugfs_dir,
417 client,
418 &dentry_lru_show_fops);
419 if (!client->debugfs_dentry_lru)
420 goto out;
421
422 client->debugfs_caps = debugfs_create_file("caps",
423 0400,
424 client->debugfs_dir,
425 client,
426 &caps_show_fops);
427 if (!client->debugfs_caps)
428 goto out;
429
430 client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb",
431 0600,
432 client->debugfs_dir,
433 client,
434 &congestion_kb_fops);
435 if (!client->debugfs_congestion_kb)
436 goto out;
437
438 sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
439 client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
440 name);
441
442 return 0;
443
444out:
445 ceph_debugfs_client_cleanup(client);
446 return ret;
447}
448
449void ceph_debugfs_client_cleanup(struct ceph_client *client)
450{
451 debugfs_remove(client->debugfs_bdi);
452 debugfs_remove(client->debugfs_caps);
453 debugfs_remove(client->debugfs_dentry_lru);
454 debugfs_remove(client->debugfs_osdmap);
455 debugfs_remove(client->debugfs_mdsmap);
456 debugfs_remove(client->debugfs_monmap);
457 debugfs_remove(client->osdc.debugfs_file);
458 debugfs_remove(client->mdsc.debugfs_file);
459 debugfs_remove(client->monc.debugfs_file);
460 debugfs_remove(client->debugfs_congestion_kb);
461 debugfs_remove(client->debugfs_dir);
462}
463
464#else /* CONFIG_DEBUG_FS */
465
466int __init ceph_debugfs_init(void)
467{
468 return 0;
469}
470
471void ceph_debugfs_cleanup(void)
472{
473}
474
475int ceph_debugfs_client_init(struct ceph_client *client)
476{
477 return 0;
478}
479
480void ceph_debugfs_client_cleanup(struct ceph_client *client)
481{
482}
483
484#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
new file mode 100644
index 000000000000..65b3e022eaf5
--- /dev/null
+++ b/fs/ceph/decode.h
@@ -0,0 +1,194 @@
1#ifndef __CEPH_DECODE_H
2#define __CEPH_DECODE_H
3
4#include <asm/unaligned.h>
5#include <linux/time.h>
6
7#include "types.h"
8
9/*
10 * in all cases,
11 * void **p pointer to position pointer
12 * void *end pointer to end of buffer (last byte + 1)
13 */
14
15static inline u64 ceph_decode_64(void **p)
16{
17 u64 v = get_unaligned_le64(*p);
18 *p += sizeof(u64);
19 return v;
20}
21static inline u32 ceph_decode_32(void **p)
22{
23 u32 v = get_unaligned_le32(*p);
24 *p += sizeof(u32);
25 return v;
26}
27static inline u16 ceph_decode_16(void **p)
28{
29 u16 v = get_unaligned_le16(*p);
30 *p += sizeof(u16);
31 return v;
32}
33static inline u8 ceph_decode_8(void **p)
34{
35 u8 v = *(u8 *)*p;
36 (*p)++;
37 return v;
38}
39static inline void ceph_decode_copy(void **p, void *pv, size_t n)
40{
41 memcpy(pv, *p, n);
42 *p += n;
43}
44
45/*
46 * bounds check input.
47 */
48#define ceph_decode_need(p, end, n, bad) \
49 do { \
50 if (unlikely(*(p) + (n) > (end))) \
51 goto bad; \
52 } while (0)
53
54#define ceph_decode_64_safe(p, end, v, bad) \
55 do { \
56 ceph_decode_need(p, end, sizeof(u64), bad); \
57 v = ceph_decode_64(p); \
58 } while (0)
59#define ceph_decode_32_safe(p, end, v, bad) \
60 do { \
61 ceph_decode_need(p, end, sizeof(u32), bad); \
62 v = ceph_decode_32(p); \
63 } while (0)
64#define ceph_decode_16_safe(p, end, v, bad) \
65 do { \
66 ceph_decode_need(p, end, sizeof(u16), bad); \
67 v = ceph_decode_16(p); \
68 } while (0)
69#define ceph_decode_8_safe(p, end, v, bad) \
70 do { \
71 ceph_decode_need(p, end, sizeof(u8), bad); \
72 v = ceph_decode_8(p); \
73 } while (0)
74
75#define ceph_decode_copy_safe(p, end, pv, n, bad) \
76 do { \
77 ceph_decode_need(p, end, n, bad); \
78 ceph_decode_copy(p, pv, n); \
79 } while (0)
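A hypothetical consumer, to show the intended usage: each *_safe macro bounds-checks against end and jumps to the caller-supplied label on truncated input, so a decoder reads top to bottom with a single error path.

	/* hypothetical: decode a { u32 len; u8 payload[len]; } blob into buf */
	static int decode_blob(void **p, void *end, void *buf, size_t buflen)
	{
		u32 len;

		ceph_decode_32_safe(p, end, len, bad);
		if (len > buflen)
			return -ERANGE;
		ceph_decode_copy_safe(p, end, buf, len, bad);
		return len;

	bad:
		return -EINVAL;		/* input truncated */
	}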
80
81/*
82 * struct ceph_timespec <-> struct timespec
83 */
84static inline void ceph_decode_timespec(struct timespec *ts,
85 const struct ceph_timespec *tv)
86{
87 ts->tv_sec = le32_to_cpu(tv->tv_sec);
88 ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
89}
90static inline void ceph_encode_timespec(struct ceph_timespec *tv,
91 const struct timespec *ts)
92{
93 tv->tv_sec = cpu_to_le32(ts->tv_sec);
94 tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
95}
96
97/*
98 * sockaddr_storage <-> ceph_sockaddr
99 */
100static inline void ceph_encode_addr(struct ceph_entity_addr *a)
101{
102 a->in_addr.ss_family = htons(a->in_addr.ss_family);
103}
104static inline void ceph_decode_addr(struct ceph_entity_addr *a)
105{
106 a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
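	/* 512 is AF_INET (2) byte-swapped: catch a family that was
	 * swapped once too many (or never swapped on the wire) */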
107 WARN_ON(a->in_addr.ss_family == 512);
108}
109
110/*
111 * encoders
112 */
113static inline void ceph_encode_64(void **p, u64 v)
114{
115 put_unaligned_le64(v, (__le64 *)*p);
116 *p += sizeof(u64);
117}
118static inline void ceph_encode_32(void **p, u32 v)
119{
120 put_unaligned_le32(v, (__le32 *)*p);
121 *p += sizeof(u32);
122}
123static inline void ceph_encode_16(void **p, u16 v)
124{
125 put_unaligned_le16(v, (__le16 *)*p);
126 *p += sizeof(u16);
127}
128static inline void ceph_encode_8(void **p, u8 v)
129{
130 *(u8 *)*p = v;
131 (*p)++;
132}
133static inline void ceph_encode_copy(void **p, const void *s, int len)
134{
135 memcpy(*p, s, len);
136 *p += len;
137}
138
139/*
140 * filepath, string encoders
141 */
142static inline void ceph_encode_filepath(void **p, void *end,
143 u64 ino, const char *path)
144{
145 u32 len = path ? strlen(path) : 0;
146 BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
147 ceph_encode_8(p, 1);
148 ceph_encode_64(p, ino);
149 ceph_encode_32(p, len);
150 if (len)
151 memcpy(*p, path, len);
152 *p += len;
153}
154
155static inline void ceph_encode_string(void **p, void *end,
156 const char *s, u32 len)
157{
158 BUG_ON(*p + sizeof(len) + len > end);
159 ceph_encode_32(p, len);
160 if (len)
161 memcpy(*p, s, len);
162 *p += len;
163}
164
165#define ceph_encode_need(p, end, n, bad) \
166 do { \
167 if (unlikely(*(p) + (n) > (end))) \
168 goto bad; \
169 } while (0)
170
171#define ceph_encode_64_safe(p, end, v, bad) \
172 do { \
173 ceph_encode_need(p, end, sizeof(u64), bad); \
174 ceph_encode_64(p, v); \
175 } while (0)
176#define ceph_encode_32_safe(p, end, v, bad) \
177 do { \
178 ceph_encode_need(p, end, sizeof(u32), bad); \
179 ceph_encode_32(p, v); \
180 } while (0)
181#define ceph_encode_16_safe(p, end, v, bad) \
182 do { \
183 ceph_encode_need(p, end, sizeof(u16), bad); \
184 ceph_encode_16(p, v); \
185 } while (0)
186
187#define ceph_encode_copy_safe(p, end, pv, n, bad) \
188 do { \
189 ceph_encode_need(p, end, n, bad); \
190 ceph_encode_copy(p, pv, n); \
191 } while (0)
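And the mirror-image producer, again hypothetical, using the bounds-checked encode macros:

	/* hypothetical: encode a { u32 len; u8 payload[len]; } blob */
	static int encode_blob(void **p, void *end, const void *buf, u32 len)
	{
		ceph_encode_32_safe(p, end, len, bad);
		ceph_encode_copy_safe(p, end, buf, len, bad);
		return 0;

	bad:
		return -ERANGE;		/* destination buffer too small */
	}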
192
193
194#endif
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
new file mode 100644
index 000000000000..7261dc6c2ead
--- /dev/null
+++ b/fs/ceph/dir.c
@@ -0,0 +1,1223 @@
1#include "ceph_debug.h"
2
3#include <linux/spinlock.h>
4#include <linux/fs_struct.h>
5#include <linux/namei.h>
6#include <linux/slab.h>
7#include <linux/sched.h>
8
9#include "super.h"
10
11/*
12 * Directory operations: readdir, lookup, create, link, unlink,
13 * rename, etc.
14 */
15
16/*
17 * Ceph MDS operations are specified in terms of a base ino and
18 * relative path. Thus, the client can specify an operation on a
19 * specific inode (e.g., a getattr due to fstat(2)), or as a path
20 * relative to, say, the root directory.
21 *
22 * Normally, we limit ourselves to strict inode ops (no path component)
23 * or dentry operations (a single path component relative to an ino). The
24 * exception to this is open_root_dentry(), which will open the mount
25 * point by name.
26 */
27
28const struct inode_operations ceph_dir_iops;
29const struct file_operations ceph_dir_fops;
30struct dentry_operations ceph_dentry_ops;
31
32/*
33 * Initialize ceph dentry state.
34 */
35int ceph_init_dentry(struct dentry *dentry)
36{
37 struct ceph_dentry_info *di;
38
39 if (dentry->d_fsdata)
40 return 0;
41
42 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
43 dentry->d_op = &ceph_dentry_ops;
44 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
45 dentry->d_op = &ceph_snapdir_dentry_ops;
46 else
47 dentry->d_op = &ceph_snap_dentry_ops;
48
49 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
50 if (!di)
51 return -ENOMEM; /* oh well */
52
53 spin_lock(&dentry->d_lock);
54 if (dentry->d_fsdata) /* lost a race */
55 goto out_unlock;
56 di->dentry = dentry;
57 di->lease_session = NULL;
58 dentry->d_fsdata = di;
59 dentry->d_time = jiffies;
60 ceph_dentry_lru_add(dentry);
61out_unlock:
62 spin_unlock(&dentry->d_lock);
63 return 0;
64}
65
66
67
68/*
69 * for readdir, we encode the directory frag and offset within that
70 * frag into f_pos.
71 */
72static unsigned fpos_frag(loff_t p)
73{
74 return p >> 32;
75}
76static unsigned fpos_off(loff_t p)
77{
78 return p & 0xffffffff;
79}
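The packing counterpart, ceph_make_fpos(), used throughout readdir below, is defined elsewhere in this patch (super.h); as a sketch, it is just the inverse of the two helpers above:

	/* sketch: frag in the high 32 bits, offset within the frag in the low 32 */
	static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
	{
		return ((loff_t)frag << 32) | (loff_t)off;
	}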
80
81/*
82 * When possible, we try to satisfy a readdir by peeking at the
83 * dcache. We make this work by carefully ordering dentries on
84 * d_u.d_child when we initially get results back from the MDS, and
85 * falling back to a "normal" sync readdir if any dentries in the dir
86 * are dropped.
87 *
88 * I_COMPLETE indicates we have all dentries in the dir. It is
89 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
90 * the MDS if/when the directory is modified).
91 */
92static int __dcache_readdir(struct file *filp,
93 void *dirent, filldir_t filldir)
94{
95 struct inode *inode = filp->f_dentry->d_inode;
96 struct ceph_file_info *fi = filp->private_data;
97 struct dentry *parent = filp->f_dentry;
98 struct inode *dir = parent->d_inode;
99 struct list_head *p;
100 struct dentry *dentry, *last;
101 struct ceph_dentry_info *di;
102 int err = 0;
103
104 /* claim ref on last dentry we returned */
105 last = fi->dentry;
106 fi->dentry = NULL;
107
108 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
109 last);
110
111 spin_lock(&dcache_lock);
112
113 /* start at beginning? */
114 if (filp->f_pos == 2 || (last &&
115 filp->f_pos < ceph_dentry(last)->offset)) {
116 if (list_empty(&parent->d_subdirs))
117 goto out_unlock;
118 p = parent->d_subdirs.prev;
119 dout(" initial p %p/%p\n", p->prev, p->next);
120 } else {
121 p = last->d_u.d_child.prev;
122 }
123
124more:
125 dentry = list_entry(p, struct dentry, d_u.d_child);
126 di = ceph_dentry(dentry);
127 while (1) {
128 dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
129 parent->d_subdirs.prev, parent->d_subdirs.next);
130 if (p == &parent->d_subdirs) {
131 fi->at_end = 1;
132 goto out_unlock;
133 }
134 if (!d_unhashed(dentry) && dentry->d_inode &&
135 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
136 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
137 filp->f_pos <= di->offset)
138 break;
139 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
140 dentry->d_name.len, dentry->d_name.name, di->offset,
141 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
142 !dentry->d_inode ? " null" : "");
143 p = p->prev;
144 dentry = list_entry(p, struct dentry, d_u.d_child);
145 di = ceph_dentry(dentry);
146 }
147
148 atomic_inc(&dentry->d_count);
149 spin_unlock(&dcache_lock);
150 spin_unlock(&inode->i_lock);
151
152 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
153 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
154 filp->f_pos = di->offset;
155 err = filldir(dirent, dentry->d_name.name,
156 dentry->d_name.len, di->offset,
157 dentry->d_inode->i_ino,
158 dentry->d_inode->i_mode >> 12);
159
160 if (last) {
161 if (err < 0) {
162 /* remember our position */
163 fi->dentry = last;
164 fi->next_offset = di->offset;
165 } else {
166 dput(last);
167 }
168 last = NULL;
169 }
170
171 spin_lock(&inode->i_lock);
172 spin_lock(&dcache_lock);
173
174 if (err < 0)
175 goto out_unlock;
176
177 last = dentry;
178
179 p = p->prev;
180 filp->f_pos++;
181
182 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
183 if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
184 goto more;
185 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
186 err = -EAGAIN;
187
188out_unlock:
189 spin_unlock(&dcache_lock);
190
191 if (last) {
192 spin_unlock(&inode->i_lock);
193 dput(last);
194 spin_lock(&inode->i_lock);
195 }
196
197 return err;
198}
199
200/*
201 * make note of the last dentry we read, so we can
202 * continue at the same lexicographical point,
203 * regardless of what dir changes take place on the
204 * server.
205 */
206static int note_last_dentry(struct ceph_file_info *fi, const char *name,
207 int len)
208{
209 kfree(fi->last_name);
210 fi->last_name = kmalloc(len+1, GFP_NOFS);
211 if (!fi->last_name)
212 return -ENOMEM;
213 memcpy(fi->last_name, name, len);
214 fi->last_name[len] = 0;
215 dout("note_last_dentry '%s'\n", fi->last_name);
216 return 0;
217}
218
219static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
220{
221 struct ceph_file_info *fi = filp->private_data;
222 struct inode *inode = filp->f_dentry->d_inode;
223 struct ceph_inode_info *ci = ceph_inode(inode);
224 struct ceph_client *client = ceph_inode_to_client(inode);
225 struct ceph_mds_client *mdsc = &client->mdsc;
226 unsigned frag = fpos_frag(filp->f_pos);
227 int off = fpos_off(filp->f_pos);
228 int err;
229 u32 ftype;
230 struct ceph_mds_reply_info_parsed *rinfo;
231 const int max_entries = client->mount_args->max_readdir;
232
233 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
234 if (fi->at_end)
235 return 0;
236
237 /* always start with . and .. */
238 if (filp->f_pos == 0) {
239 /* note dir version at start of readdir so we can tell
240 * if any dentries get dropped */
241 fi->dir_release_count = ci->i_release_count;
242
243 dout("readdir off 0 -> '.'\n");
244 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
245 inode->i_ino, inode->i_mode >> 12) < 0)
246 return 0;
247 filp->f_pos = 1;
248 off = 1;
249 }
250 if (filp->f_pos == 1) {
251 dout("readdir off 1 -> '..'\n");
252 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
253 filp->f_dentry->d_parent->d_inode->i_ino,
254 inode->i_mode >> 12) < 0)
255 return 0;
256 filp->f_pos = 2;
257 off = 2;
258 }
259
260 /* can we use the dcache? */
261 spin_lock(&inode->i_lock);
262 if ((filp->f_pos == 2 || fi->dentry) &&
263 !ceph_test_opt(client, NOASYNCREADDIR) &&
264 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
265 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
266 err = __dcache_readdir(filp, dirent, filldir);
267 if (err != -EAGAIN) {
268 spin_unlock(&inode->i_lock);
269 return err;
270 }
271 }
272 spin_unlock(&inode->i_lock);
273 if (fi->dentry) {
274 err = note_last_dentry(fi, fi->dentry->d_name.name,
275 fi->dentry->d_name.len);
276 if (err)
277 return err;
278 dput(fi->dentry);
279 fi->dentry = NULL;
280 }
281
282 /* proceed with a normal readdir */
283
284more:
285 /* do we have the correct frag content buffered? */
286 if (fi->frag != frag || fi->last_readdir == NULL) {
287 struct ceph_mds_request *req;
288 int op = ceph_snap(inode) == CEPH_SNAPDIR ?
289 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
290
291 /* discard old result, if any */
292 if (fi->last_readdir) {
293 ceph_mdsc_put_request(fi->last_readdir);
294 fi->last_readdir = NULL;
295 }
296
297 /* requery frag tree, as the frag topology may have changed */
298 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
299
300 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
301 ceph_vinop(inode), frag, fi->last_name);
302 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
303 if (IS_ERR(req))
304 return PTR_ERR(req);
305 req->r_inode = igrab(inode);
306 req->r_dentry = dget(filp->f_dentry);
307 /* hints to request -> mds selection code */
308 req->r_direct_mode = USE_AUTH_MDS;
309 req->r_direct_hash = ceph_frag_value(frag);
310 req->r_direct_is_hash = true;
311 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
312 req->r_readdir_offset = fi->next_offset;
313 req->r_args.readdir.frag = cpu_to_le32(frag);
314 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
315 req->r_num_caps = max_entries;
316 err = ceph_mdsc_do_request(mdsc, NULL, req);
317 if (err < 0) {
318 ceph_mdsc_put_request(req);
319 return err;
320 }
321 dout("readdir got and parsed readdir result=%d"
322 " on frag %x, end=%d, complete=%d\n", err, frag,
323 (int)req->r_reply_info.dir_end,
324 (int)req->r_reply_info.dir_complete);
325
326 if (!req->r_did_prepopulate) {
327			dout("readdir !did_prepopulate\n");
328 fi->dir_release_count--; /* preclude I_COMPLETE */
329 }
330
331 /* note next offset and last dentry name */
332 fi->offset = fi->next_offset;
333 fi->last_readdir = req;
334
335 if (req->r_reply_info.dir_end) {
336 kfree(fi->last_name);
337 fi->last_name = NULL;
338 fi->next_offset = 0;
339 } else {
340 rinfo = &req->r_reply_info;
341 err = note_last_dentry(fi,
342 rinfo->dir_dname[rinfo->dir_nr-1],
343 rinfo->dir_dname_len[rinfo->dir_nr-1]);
344 if (err)
345 return err;
346 fi->next_offset += rinfo->dir_nr;
347 }
348 }
349
350 rinfo = &fi->last_readdir->r_reply_info;
351 dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
352 rinfo->dir_nr, off, fi->offset);
353 while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
354 u64 pos = ceph_make_fpos(frag, off);
355 struct ceph_mds_reply_inode *in =
356 rinfo->dir_in[off - fi->offset].in;
357 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
358 off, off - fi->offset, rinfo->dir_nr, pos,
359 rinfo->dir_dname_len[off - fi->offset],
360 rinfo->dir_dname[off - fi->offset], in);
361 BUG_ON(!in);
362 ftype = le32_to_cpu(in->mode) >> 12;
363 if (filldir(dirent,
364 rinfo->dir_dname[off - fi->offset],
365 rinfo->dir_dname_len[off - fi->offset],
366 pos,
367 le64_to_cpu(in->ino),
368 ftype) < 0) {
369 dout("filldir stopping us...\n");
370 return 0;
371 }
372 off++;
373 filp->f_pos = pos + 1;
374 }
375
376 if (fi->last_name) {
377 ceph_mdsc_put_request(fi->last_readdir);
378 fi->last_readdir = NULL;
379 goto more;
380 }
381
382 /* more frags? */
383 if (!ceph_frag_is_rightmost(frag)) {
384 frag = ceph_frag_next(frag);
385 off = 0;
386 filp->f_pos = ceph_make_fpos(frag, off);
387 dout("readdir next frag is %x\n", frag);
388 goto more;
389 }
390 fi->at_end = 1;
391
392 /*
393 * if dir_release_count still matches the dir, no dentries
394 * were released during the whole readdir, and we should have
395 * the complete dir contents in our cache.
396 */
397 spin_lock(&inode->i_lock);
398 if (ci->i_release_count == fi->dir_release_count) {
399 dout(" marking %p complete\n", inode);
400 ci->i_ceph_flags |= CEPH_I_COMPLETE;
401 ci->i_max_offset = filp->f_pos;
402 }
403 spin_unlock(&inode->i_lock);
404
405 dout("readdir %p filp %p done.\n", inode, filp);
406 return 0;
407}
408
409static void reset_readdir(struct ceph_file_info *fi)
410{
411 if (fi->last_readdir) {
412 ceph_mdsc_put_request(fi->last_readdir);
413 fi->last_readdir = NULL;
414 }
415 kfree(fi->last_name);
416 fi->next_offset = 2; /* compensate for . and .. */
417 if (fi->dentry) {
418 dput(fi->dentry);
419 fi->dentry = NULL;
420 }
421 fi->at_end = 0;
422}
423
424static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
425{
426 struct ceph_file_info *fi = file->private_data;
427 struct inode *inode = file->f_mapping->host;
428 loff_t old_offset = offset;
429 loff_t retval;
430
431 mutex_lock(&inode->i_mutex);
432 switch (origin) {
433 case SEEK_END:
434 offset += inode->i_size + 2; /* FIXME */
435 break;
436 case SEEK_CUR:
437 offset += file->f_pos;
438 }
439 retval = -EINVAL;
440 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
441 if (offset != file->f_pos) {
442 file->f_pos = offset;
443 file->f_version = 0;
444 fi->at_end = 0;
445 }
446 retval = offset;
447
448 /*
449 * discard buffered readdir content on seekdir(0), or
450 * seek to new frag, or seek prior to current chunk.
451 */
452 if (offset == 0 ||
453 fpos_frag(offset) != fpos_frag(old_offset) ||
454 fpos_off(offset) < fi->offset) {
455 dout("dir_llseek dropping %p content\n", file);
456 reset_readdir(fi);
457 }
458
459		/* knock dir_release_count out of sync on a forward seek
460		 * (precludes marking the dir I_COMPLETE) */
460 if (offset > old_offset)
461 fi->dir_release_count--;
462 }
463 mutex_unlock(&inode->i_mutex);
464 return retval;
465}
466
467/*
468 * Process result of a lookup/open request.
469 *
470 * Mainly, make sure we return the final req->r_dentry (if it already
471 * existed) in place of the original VFS-provided dentry when they
472 * differ.
473 *
474 * Gracefully handle the case where the MDS replies with -ENOENT and
475 * no trace (which it may do, at its discretion, e.g., if it doesn't
476 * care to issue a lease on the negative dentry).
477 */
478struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
479 struct dentry *dentry, int err)
480{
481 struct ceph_client *client = ceph_client(dentry->d_sb);
482 struct inode *parent = dentry->d_parent->d_inode;
483
484 /* .snap dir? */
485 if (err == -ENOENT &&
486 ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
487 strcmp(dentry->d_name.name,
488 client->mount_args->snapdir_name) == 0) {
489 struct inode *inode = ceph_get_snapdir(parent);
490 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
491 dentry, dentry->d_name.len, dentry->d_name.name, inode);
492 d_add(dentry, inode);
493 err = 0;
494 }
495
496 if (err == -ENOENT) {
497 /* no trace? */
498 err = 0;
499 if (!req->r_reply_info.head->is_dentry) {
500 dout("ENOENT and no trace, dentry %p inode %p\n",
501 dentry, dentry->d_inode);
502 if (dentry->d_inode) {
503 d_drop(dentry);
504 err = -ENOENT;
505 } else {
506 d_add(dentry, NULL);
507 }
508 }
509 }
510 if (err)
511 dentry = ERR_PTR(err);
512 else if (dentry != req->r_dentry)
513 dentry = dget(req->r_dentry); /* we got spliced */
514 else
515 dentry = NULL;
516 return dentry;
517}
518
519static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
520{
521 return ceph_ino(inode) == CEPH_INO_ROOT &&
522 strncmp(dentry->d_name.name, ".ceph", 5) == 0;
523}
524
525/*
526 * Look up a single dir entry. If there is a lookup intent, inform
527 * the MDS so that it gets our 'caps wanted' value in a single op.
528 */
529static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
530 struct nameidata *nd)
531{
532 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
533 struct ceph_mds_client *mdsc = &client->mdsc;
534 struct ceph_mds_request *req;
535 int op;
536 int err;
537
538 dout("lookup %p dentry %p '%.*s'\n",
539 dir, dentry, dentry->d_name.len, dentry->d_name.name);
540
541 if (dentry->d_name.len > NAME_MAX)
542 return ERR_PTR(-ENAMETOOLONG);
543
544 err = ceph_init_dentry(dentry);
545 if (err < 0)
546 return ERR_PTR(err);
547
548 /* open (but not create!) intent? */
549 if (nd &&
550 (nd->flags & LOOKUP_OPEN) &&
551 (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */
552 !(nd->intent.open.flags & O_CREAT)) {
553 int mode = nd->intent.open.create_mode & ~current->fs->umask;
554 return ceph_lookup_open(dir, dentry, nd, mode, 1);
555 }
556
557 /* can we conclude ENOENT locally? */
558 if (dentry->d_inode == NULL) {
559 struct ceph_inode_info *ci = ceph_inode(dir);
560 struct ceph_dentry_info *di = ceph_dentry(dentry);
561
562 spin_lock(&dir->i_lock);
563 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
564 if (strncmp(dentry->d_name.name,
565 client->mount_args->snapdir_name,
566 dentry->d_name.len) &&
567 !is_root_ceph_dentry(dir, dentry) &&
568 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
569 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
570 di->offset = ci->i_max_offset++;
571 spin_unlock(&dir->i_lock);
572 dout(" dir %p complete, -ENOENT\n", dir);
573 d_add(dentry, NULL);
574 di->lease_shared_gen = ci->i_shared_gen;
575 return NULL;
576 }
577 spin_unlock(&dir->i_lock);
578 }
579
580 op = ceph_snap(dir) == CEPH_SNAPDIR ?
581 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
582 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
583 if (IS_ERR(req))
584 return ERR_PTR(PTR_ERR(req));
585 req->r_dentry = dget(dentry);
586 req->r_num_caps = 2;
587 /* we only need inode linkage */
588 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
589 req->r_locked_dir = dir;
590 err = ceph_mdsc_do_request(mdsc, NULL, req);
591 dentry = ceph_finish_lookup(req, dentry, err);
592 ceph_mdsc_put_request(req); /* will dput(dentry) */
593 dout("lookup result=%p\n", dentry);
594 return dentry;
595}
596
597/*
598 * If we do a create but get no trace back from the MDS, follow up with
599 * a lookup (the VFS expects us to link up the provided dentry).
600 */
601int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
602{
603 struct dentry *result = ceph_lookup(dir, dentry, NULL);
604
605 if (result && !IS_ERR(result)) {
606 /*
607 * We created the item, then did a lookup, and found
608 * it was already linked to another inode we already
609 * had in our cache (and thus got spliced). Link our
610 * dentry to that inode, but don't hash it, just in
611 * case the VFS wants to dereference it.
612 */
613 BUG_ON(!result->d_inode);
614 d_instantiate(dentry, result->d_inode);
615 return 0;
616 }
617 return PTR_ERR(result);
618}
619
620static int ceph_mknod(struct inode *dir, struct dentry *dentry,
621 int mode, dev_t rdev)
622{
623 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
624 struct ceph_mds_client *mdsc = &client->mdsc;
625 struct ceph_mds_request *req;
626 int err;
627
628 if (ceph_snap(dir) != CEPH_NOSNAP)
629 return -EROFS;
630
631 dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
632 dir, dentry, mode, rdev);
633 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
634 if (IS_ERR(req)) {
635 d_drop(dentry);
636 return PTR_ERR(req);
637 }
638 req->r_dentry = dget(dentry);
639 req->r_num_caps = 2;
640 req->r_locked_dir = dir;
641 req->r_args.mknod.mode = cpu_to_le32(mode);
642 req->r_args.mknod.rdev = cpu_to_le32(rdev);
643 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
644 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
645 err = ceph_mdsc_do_request(mdsc, dir, req);
646 if (!err && !req->r_reply_info.head->is_dentry)
647 err = ceph_handle_notrace_create(dir, dentry);
648 ceph_mdsc_put_request(req);
649 if (err)
650 d_drop(dentry);
651 return err;
652}
653
654static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
655 struct nameidata *nd)
656{
657 dout("create in dir %p dentry %p name '%.*s'\n",
658 dir, dentry, dentry->d_name.len, dentry->d_name.name);
659
660 if (ceph_snap(dir) != CEPH_NOSNAP)
661 return -EROFS;
662
663 if (nd) {
664 BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
665 dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
666 /* hrm, what should i do here if we get aliased? */
667 if (IS_ERR(dentry))
668 return PTR_ERR(dentry);
669 return 0;
670 }
671
672 /* fall back to mknod */
673 return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
674}
675
676static int ceph_symlink(struct inode *dir, struct dentry *dentry,
677 const char *dest)
678{
679 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
680 struct ceph_mds_client *mdsc = &client->mdsc;
681 struct ceph_mds_request *req;
682 int err;
683
684 if (ceph_snap(dir) != CEPH_NOSNAP)
685 return -EROFS;
686
687 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
688 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
689 if (IS_ERR(req)) {
690 d_drop(dentry);
691 return PTR_ERR(req);
692 }
693 req->r_dentry = dget(dentry);
694 req->r_num_caps = 2;
695 req->r_path2 = kstrdup(dest, GFP_NOFS);
696 req->r_locked_dir = dir;
697 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
698 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
699 err = ceph_mdsc_do_request(mdsc, dir, req);
700 if (!err && !req->r_reply_info.head->is_dentry)
701 err = ceph_handle_notrace_create(dir, dentry);
702 ceph_mdsc_put_request(req);
703 if (err)
704 d_drop(dentry);
705 return err;
706}
707
708static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
709{
710 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
711 struct ceph_mds_client *mdsc = &client->mdsc;
712 struct ceph_mds_request *req;
713 int err = -EROFS;
714 int op;
715
716 if (ceph_snap(dir) == CEPH_SNAPDIR) {
717 /* mkdir .snap/foo is a MKSNAP */
718 op = CEPH_MDS_OP_MKSNAP;
719 dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
720 dentry->d_name.len, dentry->d_name.name, dentry);
721 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
722 dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
723 op = CEPH_MDS_OP_MKDIR;
724 } else {
725 goto out;
726 }
727 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
728 if (IS_ERR(req)) {
729 err = PTR_ERR(req);
730 goto out;
731 }
732
733 req->r_dentry = dget(dentry);
734 req->r_num_caps = 2;
735 req->r_locked_dir = dir;
736 req->r_args.mkdir.mode = cpu_to_le32(mode);
737 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
738 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
739 err = ceph_mdsc_do_request(mdsc, dir, req);
740 if (!err && !req->r_reply_info.head->is_dentry)
741 err = ceph_handle_notrace_create(dir, dentry);
742 ceph_mdsc_put_request(req);
743out:
744 if (err < 0)
745 d_drop(dentry);
746 return err;
747}
748
749static int ceph_link(struct dentry *old_dentry, struct inode *dir,
750 struct dentry *dentry)
751{
752 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
753 struct ceph_mds_client *mdsc = &client->mdsc;
754 struct ceph_mds_request *req;
755 int err;
756
757 if (ceph_snap(dir) != CEPH_NOSNAP)
758 return -EROFS;
759
760 dout("link in dir %p old_dentry %p dentry %p\n", dir,
761 old_dentry, dentry);
762 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
763 if (IS_ERR(req)) {
764 d_drop(dentry);
765 return PTR_ERR(req);
766 }
767 req->r_dentry = dget(dentry);
768 req->r_num_caps = 2;
769 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
770 req->r_locked_dir = dir;
771 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
772 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
773 err = ceph_mdsc_do_request(mdsc, dir, req);
774 if (err)
775 d_drop(dentry);
776 else if (!req->r_reply_info.head->is_dentry)
777 d_instantiate(dentry, igrab(old_dentry->d_inode));
778 ceph_mdsc_put_request(req);
779 return err;
780}
781
782/*
783 * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
784 * looks like the link count will hit 0, drop any other caps (other
785 * than PIN) we don't specifically want (due to the file still being
786 * open).
787 */
788static int drop_caps_for_unlink(struct inode *inode)
789{
790 struct ceph_inode_info *ci = ceph_inode(inode);
791 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
792
793 spin_lock(&inode->i_lock);
794 if (inode->i_nlink == 1) {
795 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
796 ci->i_ceph_flags |= CEPH_I_NODELAY;
797 }
798 spin_unlock(&inode->i_lock);
799 return drop;
800}
801
802/*
803 * rmdir and unlink differ only by the metadata op code
804 */
805static int ceph_unlink(struct inode *dir, struct dentry *dentry)
806{
807 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
808 struct ceph_mds_client *mdsc = &client->mdsc;
809 struct inode *inode = dentry->d_inode;
810 struct ceph_mds_request *req;
811 int err = -EROFS;
812 int op;
813
814 if (ceph_snap(dir) == CEPH_SNAPDIR) {
815 /* rmdir .snap/foo is RMSNAP */
816 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
817 dentry->d_name.name, dentry);
818 op = CEPH_MDS_OP_RMSNAP;
819 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
820 dout("unlink/rmdir dir %p dn %p inode %p\n",
821 dir, dentry, inode);
822 op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
823 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
824 } else
825 goto out;
826 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
827 if (IS_ERR(req)) {
828 err = PTR_ERR(req);
829 goto out;
830 }
831 req->r_dentry = dget(dentry);
832 req->r_num_caps = 2;
833 req->r_locked_dir = dir;
834 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
835 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
836 req->r_inode_drop = drop_caps_for_unlink(inode);
837 err = ceph_mdsc_do_request(mdsc, dir, req);
838 if (!err && !req->r_reply_info.head->is_dentry)
839 d_delete(dentry);
840 ceph_mdsc_put_request(req);
841out:
842 return err;
843}
844
845static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
846 struct inode *new_dir, struct dentry *new_dentry)
847{
848 struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
849 struct ceph_mds_client *mdsc = &client->mdsc;
850 struct ceph_mds_request *req;
851 int err;
852
853 if (ceph_snap(old_dir) != ceph_snap(new_dir))
854 return -EXDEV;
855 if (ceph_snap(old_dir) != CEPH_NOSNAP ||
856 ceph_snap(new_dir) != CEPH_NOSNAP)
857 return -EROFS;
858 dout("rename dir %p dentry %p to dir %p dentry %p\n",
859 old_dir, old_dentry, new_dir, new_dentry);
860 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
861 if (IS_ERR(req))
862 return PTR_ERR(req);
863 req->r_dentry = dget(new_dentry);
864 req->r_num_caps = 2;
865 req->r_old_dentry = dget(old_dentry);
866 req->r_locked_dir = new_dir;
867 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
868 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
869 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
870 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
871 /* release LINK_RDCACHE on source inode (mds will lock it) */
872 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
873 if (new_dentry->d_inode)
874 req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
875 err = ceph_mdsc_do_request(mdsc, old_dir, req);
876 if (!err && !req->r_reply_info.head->is_dentry) {
877 /*
878 * Normally d_move() is done by fill_trace (called by
879 * do_request, above). If there is no trace, we need
880 * to do it here.
881 */
882 d_move(old_dentry, new_dentry);
883 }
884 ceph_mdsc_put_request(req);
885 return err;
886}
887
888
889/*
890 * Check if dentry lease is valid. If not, delete the lease. Try to
891 * renew if the lease is more than half up.
892 */
893static int dentry_lease_is_valid(struct dentry *dentry)
894{
895 struct ceph_dentry_info *di;
896 struct ceph_mds_session *s;
897 int valid = 0;
898 u32 gen;
899 unsigned long ttl;
900 struct ceph_mds_session *session = NULL;
901 struct inode *dir = NULL;
902 u32 seq = 0;
903
904 spin_lock(&dentry->d_lock);
905 di = ceph_dentry(dentry);
906 if (di && di->lease_session) {
907 s = di->lease_session;
908 spin_lock(&s->s_cap_lock);
909 gen = s->s_cap_gen;
910 ttl = s->s_cap_ttl;
911 spin_unlock(&s->s_cap_lock);
912
913 if (di->lease_gen == gen &&
914 time_before(jiffies, dentry->d_time) &&
915 time_before(jiffies, ttl)) {
916 valid = 1;
917 if (di->lease_renew_after &&
918 time_after(jiffies, di->lease_renew_after)) {
919 /* we should renew */
920 dir = dentry->d_parent->d_inode;
921 session = ceph_get_mds_session(s);
922 seq = di->lease_seq;
923 di->lease_renew_after = 0;
924 di->lease_renew_from = jiffies;
925 }
926 }
927 }
928 spin_unlock(&dentry->d_lock);
929
930 if (session) {
931 ceph_mdsc_lease_send_msg(session, dir, dentry,
932 CEPH_MDS_LEASE_RENEW, seq);
933 ceph_put_mds_session(session);
934 }
935 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
936 return valid;
937}
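The renew-at-half-life threshold checked above (lease_renew_after) is armed by the MDS client's lease handling, which is not part of this file; a hypothetical sketch of that arithmetic, just to make the check concrete:

	/* hypothetical: arm renewal at the lease's halfway point */
	static void arm_lease_renewal(struct ceph_dentry_info *di,
				      unsigned long duration_jiffies)
	{
		di->lease_renew_after = jiffies + duration_jiffies / 2;
		di->lease_renew_from = 0;
	}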
938
939/*
940 * Check if directory-wide content lease/cap is valid.
941 */
942static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
943{
944 struct ceph_inode_info *ci = ceph_inode(dir);
945 struct ceph_dentry_info *di = ceph_dentry(dentry);
946 int valid = 0;
947
948 spin_lock(&dir->i_lock);
949 if (ci->i_shared_gen == di->lease_shared_gen)
950 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
951 spin_unlock(&dir->i_lock);
952 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
953 dir, (unsigned)ci->i_shared_gen, dentry,
954 (unsigned)di->lease_shared_gen, valid);
955 return valid;
956}
957
958/*
959 * Check if cached dentry can be trusted.
960 */
961static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
962{
963 struct inode *dir = dentry->d_parent->d_inode;
964
965 dout("d_revalidate %p '%.*s' inode %p\n", dentry,
966 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
967
968 /* always trust cached snapped dentries, snapdir dentry */
969 if (ceph_snap(dir) != CEPH_NOSNAP) {
970 dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
971 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
972 goto out_touch;
973 }
974 if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
975 goto out_touch;
976
977 if (dentry_lease_is_valid(dentry) ||
978 dir_lease_is_valid(dir, dentry))
979 goto out_touch;
980
981 dout("d_revalidate %p invalid\n", dentry);
982 d_drop(dentry);
983 return 0;
984out_touch:
985 ceph_dentry_lru_touch(dentry);
986 return 1;
987}
988
989/*
990 * When a dentry is released, clear the dir I_COMPLETE if it was part
991 * of the current dir gen.
992 */
993static void ceph_dentry_release(struct dentry *dentry)
994{
995 struct ceph_dentry_info *di = ceph_dentry(dentry);
996 struct inode *parent_inode = dentry->d_parent->d_inode;
997
998 if (parent_inode) {
999 struct ceph_inode_info *ci = ceph_inode(parent_inode);
1000
1001 spin_lock(&parent_inode->i_lock);
1002 if (ci->i_shared_gen == di->lease_shared_gen) {
1003 dout(" clearing %p complete (d_release)\n",
1004 parent_inode);
1005 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1006 ci->i_release_count++;
1007 }
1008 spin_unlock(&parent_inode->i_lock);
1009 }
1010 if (di) {
1011 ceph_dentry_lru_del(dentry);
1012 if (di->lease_session)
1013 ceph_put_mds_session(di->lease_session);
1014 kmem_cache_free(ceph_dentry_cachep, di);
1015 dentry->d_fsdata = NULL;
1016 }
1017}
1018
1019static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1020 struct nameidata *nd)
1021{
1022 /*
1023 * Eventually, we'll want to revalidate snapped metadata
1024 * too... probably...
1025 */
1026 return 1;
1027}
1028
1029
1030
1031/*
1032 * read() on a dir. This weird interface hack only works if mounted
1033 * with '-o dirstat'.
1034 */
1035static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1036 loff_t *ppos)
1037{
1038 struct ceph_file_info *cf = file->private_data;
1039 struct inode *inode = file->f_dentry->d_inode;
1040 struct ceph_inode_info *ci = ceph_inode(inode);
1041 int left;
1042
1043 if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
1044 return -EISDIR;
1045
1046 if (!cf->dir_info) {
1047 cf->dir_info = kmalloc(1024, GFP_NOFS);
1048 if (!cf->dir_info)
1049 return -ENOMEM;
1050 cf->dir_info_len =
1051 sprintf(cf->dir_info,
1052 "entries: %20lld\n"
1053 " files: %20lld\n"
1054 " subdirs: %20lld\n"
1055 "rentries: %20lld\n"
1056 " rfiles: %20lld\n"
1057 " rsubdirs: %20lld\n"
1058 "rbytes: %20lld\n"
1059 "rctime: %10ld.%09ld\n",
1060 ci->i_files + ci->i_subdirs,
1061 ci->i_files,
1062 ci->i_subdirs,
1063 ci->i_rfiles + ci->i_rsubdirs,
1064 ci->i_rfiles,
1065 ci->i_rsubdirs,
1066 ci->i_rbytes,
1067 (long)ci->i_rctime.tv_sec,
1068 (long)ci->i_rctime.tv_nsec);
1069 }
1070
1071 if (*ppos >= cf->dir_info_len)
1072 return 0;
1073 size = min_t(unsigned, size, cf->dir_info_len-*ppos);
1074 left = copy_to_user(buf, cf->dir_info + *ppos, size);
1075 if (left == size)
1076 return -EFAULT;
1077 *ppos += (size - left);
1078 return size - left;
1079}
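For illustration, the buffer built above renders roughly as follows (numbers hypothetical; the r-prefixed stats are recursive totals over the whole subtree):

	entries:                   42
	 files:                    30
	 subdirs:                  12
	rentries:                1205
	 rfiles:                 1100
	 rsubdirs:                105
	rbytes:              73400320
	rctime: 1268000000.000000000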
1080
1081/*
1082 * an fsync() on a dir will wait for any uncommitted directory
1083 * operations to commit.
1084 */
1085static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
1086 int datasync)
1087{
1088 struct inode *inode = dentry->d_inode;
1089 struct ceph_inode_info *ci = ceph_inode(inode);
1090 struct list_head *head = &ci->i_unsafe_dirops;
1091 struct ceph_mds_request *req;
1092 u64 last_tid;
1093 int ret = 0;
1094
1095 dout("dir_fsync %p\n", inode);
1096 spin_lock(&ci->i_unsafe_lock);
1097 if (list_empty(head))
1098 goto out;
1099
1100 req = list_entry(head->prev,
1101 struct ceph_mds_request, r_unsafe_dir_item);
1102 last_tid = req->r_tid;
1103
1104 do {
1105 ceph_mdsc_get_request(req);
1106 spin_unlock(&ci->i_unsafe_lock);
1107 dout("dir_fsync %p wait on tid %llu (until %llu)\n",
1108 inode, req->r_tid, last_tid);
1109 if (req->r_timeout) {
1110 ret = wait_for_completion_timeout(
1111 &req->r_safe_completion, req->r_timeout);
1112 if (ret > 0)
1113 ret = 0;
1114 else if (ret == 0)
1115 ret = -EIO; /* timed out */
1116 } else {
1117 wait_for_completion(&req->r_safe_completion);
1118 }
1119 spin_lock(&ci->i_unsafe_lock);
1120 ceph_mdsc_put_request(req);
1121
1122 if (ret || list_empty(head))
1123 break;
1124 req = list_entry(head->next,
1125 struct ceph_mds_request, r_unsafe_dir_item);
1126 } while (req->r_tid < last_tid);
1127out:
1128 spin_unlock(&ci->i_unsafe_lock);
1129 return ret;
1130}
1131
1132/*
1133 * We maintain a private dentry LRU.
1134 *
1135 * FIXME: this needs to be changed to a per-mds lru to be useful.
1136 */
1137void ceph_dentry_lru_add(struct dentry *dn)
1138{
1139 struct ceph_dentry_info *di = ceph_dentry(dn);
1140 struct ceph_mds_client *mdsc;
1141
1142 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1143 dn->d_name.len, dn->d_name.name);
1144 if (di) {
1145 mdsc = &ceph_client(dn->d_sb)->mdsc;
1146 spin_lock(&mdsc->dentry_lru_lock);
1147 list_add_tail(&di->lru, &mdsc->dentry_lru);
1148 mdsc->num_dentry++;
1149 spin_unlock(&mdsc->dentry_lru_lock);
1150 }
1151}
1152
1153void ceph_dentry_lru_touch(struct dentry *dn)
1154{
1155 struct ceph_dentry_info *di = ceph_dentry(dn);
1156 struct ceph_mds_client *mdsc;
1157
1158 dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
1159 dn->d_name.len, dn->d_name.name);
1160 if (di) {
1161 mdsc = &ceph_client(dn->d_sb)->mdsc;
1162 spin_lock(&mdsc->dentry_lru_lock);
1163 list_move_tail(&di->lru, &mdsc->dentry_lru);
1164 spin_unlock(&mdsc->dentry_lru_lock);
1165 }
1166}
1167
1168void ceph_dentry_lru_del(struct dentry *dn)
1169{
1170 struct ceph_dentry_info *di = ceph_dentry(dn);
1171 struct ceph_mds_client *mdsc;
1172
1173 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1174 dn->d_name.len, dn->d_name.name);
1175 if (di) {
1176 mdsc = &ceph_client(dn->d_sb)->mdsc;
1177 spin_lock(&mdsc->dentry_lru_lock);
1178 list_del_init(&di->lru);
1179 mdsc->num_dentry--;
1180 spin_unlock(&mdsc->dentry_lru_lock);
1181 }
1182}
1183
1184const struct file_operations ceph_dir_fops = {
1185 .read = ceph_read_dir,
1186 .readdir = ceph_readdir,
1187 .llseek = ceph_dir_llseek,
1188 .open = ceph_open,
1189 .release = ceph_release,
1190 .unlocked_ioctl = ceph_ioctl,
1191 .fsync = ceph_dir_fsync,
1192};
1193
1194const struct inode_operations ceph_dir_iops = {
1195 .lookup = ceph_lookup,
1196 .permission = ceph_permission,
1197 .getattr = ceph_getattr,
1198 .setattr = ceph_setattr,
1199 .setxattr = ceph_setxattr,
1200 .getxattr = ceph_getxattr,
1201 .listxattr = ceph_listxattr,
1202 .removexattr = ceph_removexattr,
1203 .mknod = ceph_mknod,
1204 .symlink = ceph_symlink,
1205 .mkdir = ceph_mkdir,
1206 .link = ceph_link,
1207 .unlink = ceph_unlink,
1208 .rmdir = ceph_unlink,
1209 .rename = ceph_rename,
1210 .create = ceph_create,
1211};
1212
1213struct dentry_operations ceph_dentry_ops = {
1214 .d_revalidate = ceph_d_revalidate,
1215 .d_release = ceph_dentry_release,
1216};
1217
1218struct dentry_operations ceph_snapdir_dentry_ops = {
1219 .d_revalidate = ceph_snapdir_d_revalidate,
1220};
1221
1222struct dentry_operations ceph_snap_dentry_ops = {
1223};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
new file mode 100644
index 000000000000..9d67572fb328
--- /dev/null
+++ b/fs/ceph/export.c
@@ -0,0 +1,224 @@
1#include "ceph_debug.h"
2
3#include <linux/exportfs.h>
4#include <linux/slab.h>
5#include <asm/unaligned.h>
6
7#include "super.h"
8
9/*
10 * NFS export support
11 *
12 * NFS re-export of a ceph mount is, at present, only semireliable.
13 * The basic issue is that the Ceph architecture doesn't lend itself
14 * well to generating filehandles that will remain valid forever.
15 *
16 * So, we do our best. If you're lucky, your inode will be in the
17 * client's cache. If it's not, and you have a connectable fh, then
18 * the MDS server may be able to find it for you. Otherwise, you get
19 * ESTALE.
20 *
21 * There are ways to make this more reliable, but in the non-connectable fh
22 * case, we won't ever work perfectly, and in the connectable case,
23 * some changes are needed on the MDS side to work better.
24 */
25
26/*
27 * Basic fh
28 */
29struct ceph_nfs_fh {
30 u64 ino;
31} __attribute__ ((packed));
32
33/*
34 * Larger 'connectable' fh that includes parent ino and name hash.
35 * Use this whenever possible, as it works more reliably.
36 */
37struct ceph_nfs_confh {
38 u64 ino, parent_ino;
39 u32 parent_name_hash;
40} __attribute__ ((packed));
41
42static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
43 int connectable)
44{
45 struct ceph_nfs_fh *fh = (void *)rawfh;
46 struct ceph_nfs_confh *cfh = (void *)rawfh;
47 struct dentry *parent = dentry->d_parent;
48 struct inode *inode = dentry->d_inode;
49 int type;
50
51 /* don't re-export snaps */
52 if (ceph_snap(inode) != CEPH_NOSNAP)
53 return -EINVAL;
54
55 if (*max_len >= sizeof(*cfh)) {
56 dout("encode_fh %p connectable\n", dentry);
57 cfh->ino = ceph_ino(dentry->d_inode);
58 cfh->parent_ino = ceph_ino(parent->d_inode);
59 cfh->parent_name_hash = parent->d_name.hash;
60 *max_len = sizeof(*cfh);
61 type = 2;
62	} else if (*max_len >= sizeof(*fh)) {
63 if (connectable)
64 return -ENOSPC;
65 dout("encode_fh %p\n", dentry);
66 fh->ino = ceph_ino(dentry->d_inode);
67 *max_len = sizeof(*fh);
68 type = 1;
69 } else {
70 return -ENOSPC;
71 }
72 return type;
73}
74
75/*
76 * convert regular fh to dentry
77 *
78 * FIXME: we should try harder by querying the mds for the ino.
79 */
80static struct dentry *__fh_to_dentry(struct super_block *sb,
81 struct ceph_nfs_fh *fh)
82{
83 struct inode *inode;
84 struct dentry *dentry;
85 struct ceph_vino vino;
86 int err;
87
88 dout("__fh_to_dentry %llx\n", fh->ino);
89 vino.ino = fh->ino;
90 vino.snap = CEPH_NOSNAP;
91 inode = ceph_find_inode(sb, vino);
92 if (!inode)
93 return ERR_PTR(-ESTALE);
94
95 dentry = d_obtain_alias(inode);
96 if (!dentry) {
97 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
98 fh->ino, inode);
99 iput(inode);
100 return ERR_PTR(-ENOMEM);
101 }
102 err = ceph_init_dentry(dentry);
103
104 if (err < 0) {
105 iput(inode);
106 return ERR_PTR(err);
107 }
108 dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
109 return dentry;
110}
111
112/*
113 * convert connectable fh to dentry
114 */
115static struct dentry *__cfh_to_dentry(struct super_block *sb,
116 struct ceph_nfs_confh *cfh)
117{
118 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
119 struct inode *inode;
120 struct dentry *dentry;
121 struct ceph_vino vino;
122 int err;
123
124 dout("__cfh_to_dentry %llx (%llx/%x)\n",
125 cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
126
127 vino.ino = cfh->ino;
128 vino.snap = CEPH_NOSNAP;
129 inode = ceph_find_inode(sb, vino);
130 if (!inode) {
131 struct ceph_mds_request *req;
132
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
134 USE_ANY_MDS);
135 if (IS_ERR(req))
136 return ERR_PTR(PTR_ERR(req));
137
138 req->r_ino1 = vino;
139 req->r_ino2.ino = cfh->parent_ino;
140 req->r_ino2.snap = CEPH_NOSNAP;
141 req->r_path2 = kmalloc(16, GFP_NOFS);
142 snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
143 req->r_num_caps = 1;
144 err = ceph_mdsc_do_request(mdsc, NULL, req);
145 ceph_mdsc_put_request(req);
146 inode = ceph_find_inode(sb, vino);
147 if (!inode)
148 return ERR_PTR(err ? err : -ESTALE);
149 }
150
151 dentry = d_obtain_alias(inode);
152 if (!dentry) {
153 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
154 cfh->ino, inode);
155 iput(inode);
156 return ERR_PTR(-ENOMEM);
157 }
158 err = ceph_init_dentry(dentry);
159 if (err < 0) {
160 iput(inode);
161 return ERR_PTR(err);
162 }
163 dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
164 return dentry;
165}
166
167static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
168 int fh_len, int fh_type)
169{
170 if (fh_type == 1)
171 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw);
172 else
173 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw);
174}
175
176/*
177 * get parent, if possible.
178 *
179 * FIXME: we could do better by querying the mds to discover the
180 * parent.
181 */
182static struct dentry *ceph_fh_to_parent(struct super_block *sb,
183 struct fid *fid,
184 int fh_len, int fh_type)
185{
186 struct ceph_nfs_confh *cfh = (void *)fid->raw;
187 struct ceph_vino vino;
188 struct inode *inode;
189 struct dentry *dentry;
190 int err;
191
192 if (fh_type == 1)
193 return ERR_PTR(-ESTALE);
194
195 pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
196 cfh->parent_name_hash);
197
198 vino.ino = cfh->ino;
199 vino.snap = CEPH_NOSNAP;
200 inode = ceph_find_inode(sb, vino);
201 if (!inode)
202 return ERR_PTR(-ESTALE);
203
204 dentry = d_obtain_alias(inode);
205 if (!dentry) {
206 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
207 cfh->ino, inode);
208 iput(inode);
209 return ERR_PTR(-ENOMEM);
210 }
211 err = ceph_init_dentry(dentry);
212 if (err < 0) {
213 iput(inode);
214 return ERR_PTR(err);
215 }
216 dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry);
217 return dentry;
218}
219
220const struct export_operations ceph_export_ops = {
221 .encode_fh = ceph_encode_fh,
222 .fh_to_dentry = ceph_fh_to_dentry,
223 .fh_to_parent = ceph_fh_to_parent,
224};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
new file mode 100644
index 000000000000..4add3d5da2c1
--- /dev/null
+++ b/fs/ceph/file.c
@@ -0,0 +1,938 @@
1#include "ceph_debug.h"
2
3#include <linux/sched.h>
4#include <linux/slab.h>
5#include <linux/file.h>
6#include <linux/namei.h>
7#include <linux/writeback.h>
8
9#include "super.h"
10#include "mds_client.h"
11
12/*
13 * Ceph file operations
14 *
15 * Implement basic open/close functionality, and implement
16 * read/write.
17 *
18 * We implement three modes of file I/O:
19 * - buffered uses the generic_file_aio_{read,write} helpers
20 *
21 * - synchronous is used when there is multi-client read/write
22 * sharing, avoids the page cache, and synchronously waits for an
23 * ack from the OSD.
24 *
25 * - direct io takes the variant of the sync path that references
26 * user pages directly.
27 *
28 * fsync() flushes and waits on dirty pages, but just queues metadata
29 * for writeback: since the MDS can recover size and mtime there is no
30 * need to wait for MDS acknowledgement.
31 */
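/*
 * Editorial sketch (an assumption, not in the original commit): the
 * buffered-vs-synchronous decision made in ceph_aio_read() and
 * ceph_aio_write() below reduces to a predicate like this, where
 * 'cache_cap' is CEPH_CAP_FILE_CACHE for reads or
 * CEPH_CAP_FILE_BUFFER for writes.
 */
static inline bool ceph_use_sync_io_sketch(int got, int cache_cap,
					   unsigned int f_flags,
					   unsigned long s_flags)
{
	return (got & cache_cap) == 0 ||	/* MDS didn't grant caching */
	       (f_flags & O_DIRECT) ||		/* caller asked for direct io */
	       (s_flags & MS_SYNCHRONOUS);	/* filesystem mounted sync */
}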
32
33
34/*
35 * Prepare an open request. Preallocate ceph_cap to avoid an
36 * inopportune ENOMEM later.
37 */
38static struct ceph_mds_request *
39prepare_open_request(struct super_block *sb, int flags, int create_mode)
40{
41 struct ceph_client *client = ceph_sb_to_client(sb);
42 struct ceph_mds_client *mdsc = &client->mdsc;
43 struct ceph_mds_request *req;
44 int want_auth = USE_ANY_MDS;
45 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
46
47 if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
48 want_auth = USE_AUTH_MDS;
49
50 req = ceph_mdsc_create_request(mdsc, op, want_auth);
51 if (IS_ERR(req))
52 goto out;
53 req->r_fmode = ceph_flags_to_mode(flags);
54 req->r_args.open.flags = cpu_to_le32(flags);
55 req->r_args.open.mode = cpu_to_le32(create_mode);
56 req->r_args.open.preferred = cpu_to_le32(-1);
57out:
58 return req;
59}
60
61/*
62 * initialize private struct file data.
63 * if we fail, clean up by dropping fmode reference on the ceph_inode
64 */
65static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
66{
67 struct ceph_file_info *cf;
68 int ret = 0;
69
70 switch (inode->i_mode & S_IFMT) {
71 case S_IFREG:
72 case S_IFDIR:
73 dout("init_file %p %p 0%o (regular)\n", inode, file,
74 inode->i_mode);
75 cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
76 if (cf == NULL) {
77 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
78 return -ENOMEM;
79 }
80 cf->fmode = fmode;
81 cf->next_offset = 2;
82 file->private_data = cf;
83 BUG_ON(inode->i_fop->release != ceph_release);
84 break;
85
86 case S_IFLNK:
87 dout("init_file %p %p 0%o (symlink)\n", inode, file,
88 inode->i_mode);
89 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
90 break;
91
92 default:
93 dout("init_file %p %p 0%o (special)\n", inode, file,
94 inode->i_mode);
95 /*
96 * we need to drop the open ref now, since we don't
97 * have .release set to ceph_release.
98 */
99 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
100 BUG_ON(inode->i_fop->release == ceph_release);
101
102 /* call the proper open fop */
103 ret = inode->i_fop->open(inode, file);
104 }
105 return ret;
106}
107
108/*
109 * If the filp already has private_data, that means the file was
110 * already opened by intent during lookup, and we do nothing.
111 *
112 * If we already have the requisite capabilities, we can satisfy
113 * the open request locally (no need to request new caps from the
114 * MDS). We do, however, need to inform the MDS (asynchronously)
115 * if our wanted caps set expands.
116 */
117int ceph_open(struct inode *inode, struct file *file)
118{
119 struct ceph_inode_info *ci = ceph_inode(inode);
120 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
121 struct ceph_mds_client *mdsc = &client->mdsc;
122 struct ceph_mds_request *req;
123 struct ceph_file_info *cf = file->private_data;
124 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
125 int err;
126 int flags, fmode, wanted;
127
128 if (cf) {
129 dout("open file %p is already opened\n", file);
130 return 0;
131 }
132
133 /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */
134 flags = file->f_flags & ~(O_CREAT|O_EXCL);
135 if (S_ISDIR(inode->i_mode))
136 flags = O_DIRECTORY; /* mds likes to know */
137
138 dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
139 ceph_vinop(inode), file, flags, file->f_flags);
140 fmode = ceph_flags_to_mode(flags);
141 wanted = ceph_caps_for_mode(fmode);
142
143 /* snapped files are read-only */
144 if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
145 return -EROFS;
146
147 /* trivially open snapdir */
148 if (ceph_snap(inode) == CEPH_SNAPDIR) {
149 spin_lock(&inode->i_lock);
150 __ceph_get_fmode(ci, fmode);
151 spin_unlock(&inode->i_lock);
152 return ceph_init_file(inode, file, fmode);
153 }
154
155 /*
156 * No need to block if we have any caps. Update wanted set
157 * asynchronously.
158 */
159 spin_lock(&inode->i_lock);
160 if (__ceph_is_any_real_caps(ci)) {
161 int mds_wanted = __ceph_caps_mds_wanted(ci);
162 int issued = __ceph_caps_issued(ci, NULL);
163
164 dout("open %p fmode %d want %s issued %s using existing\n",
165 inode, fmode, ceph_cap_string(wanted),
166 ceph_cap_string(issued));
167 __ceph_get_fmode(ci, fmode);
168 spin_unlock(&inode->i_lock);
169
170 /* adjust wanted? */
171 if ((issued & wanted) != wanted &&
172 (mds_wanted & wanted) != wanted &&
173 ceph_snap(inode) != CEPH_SNAPDIR)
174 ceph_check_caps(ci, 0, NULL);
175
176 return ceph_init_file(inode, file, fmode);
177 } else if (ceph_snap(inode) != CEPH_NOSNAP &&
178 (ci->i_snap_caps & wanted) == wanted) {
179 __ceph_get_fmode(ci, fmode);
180 spin_unlock(&inode->i_lock);
181 return ceph_init_file(inode, file, fmode);
182 }
183 spin_unlock(&inode->i_lock);
184
185 dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
186 req = prepare_open_request(inode->i_sb, flags, 0);
187 if (IS_ERR(req)) {
188 err = PTR_ERR(req);
189 goto out;
190 }
191 req->r_inode = igrab(inode);
192 req->r_num_caps = 1;
193 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
194 if (!err)
195 err = ceph_init_file(inode, file, req->r_fmode);
196 ceph_mdsc_put_request(req);
197 dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
198out:
199 return err;
200}
201
202
203/*
204 * Do a lookup + open with a single request.
205 *
206 * If this succeeds, but some subsequent check in the vfs
207 * may_open() fails, the struct file gets cleaned up (i.e.
208 * ceph_release gets called). So fear not!
209 */
210/*
211 * flags
212 * path_lookup_open -> LOOKUP_OPEN
213 * path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE
214 */
215struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
216 struct nameidata *nd, int mode,
217 int locked_dir)
218{
219 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
220 struct ceph_mds_client *mdsc = &client->mdsc;
221 struct file *file = nd->intent.open.file;
222 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
223 struct ceph_mds_request *req;
224 int err;
225 int flags = nd->intent.open.flags - 1; /* silly vfs! */
226
227 dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
228 dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);
229
230 /* do the open */
231 req = prepare_open_request(dir->i_sb, flags, mode);
232 if (IS_ERR(req))
233 return ERR_PTR(PTR_ERR(req));
234 req->r_dentry = dget(dentry);
235 req->r_num_caps = 2;
236 if (flags & O_CREAT) {
237 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
238 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
239 }
240 req->r_locked_dir = dir; /* caller holds dir->i_mutex */
241 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
242 dentry = ceph_finish_lookup(req, dentry, err);
243 if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
244 err = ceph_handle_notrace_create(dir, dentry);
245 if (!err)
246 err = ceph_init_file(req->r_dentry->d_inode, file,
247 req->r_fmode);
248 ceph_mdsc_put_request(req);
249 dout("ceph_lookup_open result=%p\n", dentry);
250 return dentry;
251}
252
253int ceph_release(struct inode *inode, struct file *file)
254{
255 struct ceph_inode_info *ci = ceph_inode(inode);
256 struct ceph_file_info *cf = file->private_data;
257
258 dout("release inode %p file %p\n", inode, file);
259 ceph_put_fmode(ci, cf->fmode);
260 if (cf->last_readdir)
261 ceph_mdsc_put_request(cf->last_readdir);
262 kfree(cf->last_name);
263 kfree(cf->dir_info);
264 dput(cf->dentry);
265 kmem_cache_free(ceph_file_cachep, cf);
266
267 /* wake up anyone waiting for caps on this inode */
268 wake_up(&ci->i_cap_wq);
269 return 0;
270}
271
272/*
273 * build a vector of user pages
274 */
275static struct page **get_direct_page_vector(const char __user *data,
276 int num_pages,
277 loff_t off, size_t len)
278{
279 struct page **pages;
280 int rc;
281
282 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
283 if (!pages)
284 return ERR_PTR(-ENOMEM);
285
286 down_read(&current->mm->mmap_sem);
287 rc = get_user_pages(current, current->mm, (unsigned long)data,
288 num_pages, 0, 0, pages, NULL);
289 up_read(&current->mm->mmap_sem);
290 if (rc < 0)
291 goto fail;
292 return pages;
293
294fail:
295 kfree(pages);
296 return ERR_PTR(rc);
297}
298
299static void put_page_vector(struct page **pages, int num_pages)
300{
301 int i;
302
303 for (i = 0; i < num_pages; i++)
304 put_page(pages[i]);
305 kfree(pages);
306}
307
308void ceph_release_page_vector(struct page **pages, int num_pages)
309{
310 int i;
311
312 for (i = 0; i < num_pages; i++)
313 __free_pages(pages[i], 0);
314 kfree(pages);
315}
316
317/*
318 * allocate a vector of new pages
319 */
320static struct page **alloc_page_vector(int num_pages)
321{
322 struct page **pages;
323 int i;
324
325 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
326 if (!pages)
327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) {
329 pages[i] = alloc_page(GFP_NOFS);
330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM);
333 }
334 }
335 return pages;
336}
337
338/*
339 * copy user data into a page vector
340 */
341static int copy_user_to_page_vector(struct page **pages,
342 const char __user *data,
343 loff_t off, size_t len)
344{
345 int i = 0;
346 int po = off & ~PAGE_CACHE_MASK;
347 int left = len;
348 int l, bad;
349
350 while (left > 0) {
351 l = min_t(int, PAGE_CACHE_SIZE-po, left);
352 bad = copy_from_user(page_address(pages[i]) + po, data, l);
353 if (bad == l)
354 return -EFAULT;
355 data += l - bad;
356 left -= l - bad;
357 po += l - bad;
358 if (po == PAGE_CACHE_SIZE) {
359 po = 0;
360 i++;
361 }
362 }
363 return len;
364}
365
366/*
367 * copy data from a page vector into a user pointer
368 */
369static int copy_page_vector_to_user(struct page **pages, char __user *data,
370 loff_t off, size_t len)
371{
372 int i = 0;
373 int po = off & ~PAGE_CACHE_MASK;
374 int left = len;
375 int l, bad;
376
377 while (left > 0) {
378 l = min_t(int, left, PAGE_CACHE_SIZE-po);
379 bad = copy_to_user(data, page_address(pages[i]) + po, l);
380 if (bad == l)
381 return -EFAULT;
382 data += l - bad;
383 left -= l - bad;
384 if (po) {
385 po += l - bad;
386 if (po == PAGE_CACHE_SIZE)
387 po = 0;
388 }
389 i++;
390 }
391 return len;
392}
393
394/*
395 * Zero an extent within a page vector. Offset is relative to the
396 * start of the first page.
397 */
398static void zero_page_vector_range(int off, int len, struct page **pages)
399{
400 int i = off >> PAGE_CACHE_SHIFT;
401
402 off &= ~PAGE_CACHE_MASK;
403
404 dout("zero_page_vector_page %u~%u\n", off, len);
405
406 /* leading partial page? */
407 if (off) {
408 int end = min((int)PAGE_CACHE_SIZE, off + len);
409 dout("zeroing %d %p head from %d\n", i, pages[i],
410 (int)off);
411 zero_user_segment(pages[i], off, end);
412 len -= (end - off);
413 i++;
414 }
415 while (len >= PAGE_CACHE_SIZE) {
416 dout("zeroing %d %p len=%d\n", i, pages[i], len);
417 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
418 len -= PAGE_CACHE_SIZE;
419 i++;
420 }
421 /* trailing partial page? */
422 if (len) {
423 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
424 zero_user_segment(pages[i], 0, len);
425 }
426}
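/*
 * Worked example (editorial): with 4 KB pages, zeroing off=1000,
 * len=10000 touches three pages: the head of page 0 from byte 1000
 * to 4096, all of page 1, and the tail of page 2 up to byte 2808
 * (1000 + 10000 - 2 * 4096).
 */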
427
428
429/*
430 * Read a range of bytes striped over one or more objects. Iterate over
431 * objects we stripe over. (That's not atomic, but good enough for now.)
432 *
433 * If we get a short result from the OSD, check against i_size; we need to
434 * only return a short read to the caller if we hit EOF.
435 */
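/*
 * Worked example (editorial, assuming the default 4 MB object size):
 * a 6 MB read at offset 3 MB crosses an object boundary, so the first
 * ceph_osdc_readpages() call is clipped to 1 MB (hit_stripe) and we
 * loop back ('goto more') for the remaining 5 MB.  A short OSD reply
 * that still lies within i_size is zero-filled; a short reply at the
 * end of the file sets *checkeof so the caller can re-verify i_size
 * with the MDS.
 */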
436static int striped_read(struct inode *inode,
437 u64 off, u64 len,
438 struct page **pages, int num_pages,
439 int *checkeof)
440{
441 struct ceph_client *client = ceph_inode_to_client(inode);
442 struct ceph_inode_info *ci = ceph_inode(inode);
443 u64 pos, this_len;
444 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
445 int left, pages_left;
446 int read;
447 struct page **page_pos;
448 int ret;
449 bool hit_stripe, was_short;
450
451 /*
452 * we may need to do multiple reads. not atomic, unfortunately.
453 */
454 pos = off;
455 left = len;
456 page_pos = pages;
457 pages_left = num_pages;
458 read = 0;
459
460more:
461 this_len = left;
462 ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
463 &ci->i_layout, pos, &this_len,
464 ci->i_truncate_seq,
465 ci->i_truncate_size,
466 page_pos, pages_left);
467 hit_stripe = this_len < left;
468 was_short = ret >= 0 && ret < this_len;
469 if (ret == -ENOENT)
470 ret = 0;
471 dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
472 ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
473
474 if (ret > 0) {
475 int didpages =
476 ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT;
477
478 if (read < pos - off) {
479 dout(" zero gap %llu to %llu\n", off + read, pos);
480 zero_page_vector_range(page_off + read,
481 pos - off - read, pages);
482 }
483 pos += ret;
484 read = pos - off;
485 left -= ret;
486 page_pos += didpages;
487 pages_left -= didpages;
488
489 /* hit stripe? */
490 if (left && hit_stripe)
491 goto more;
492 }
493
494 if (was_short) {
495 /* was original extent fully inside i_size? */
496 if (pos + left <= inode->i_size) {
497 dout("zero tail\n");
498 zero_page_vector_range(page_off + read, len - read,
499 pages);
500 read = len;
501 goto out;
502 }
503
504 /* check i_size */
505 *checkeof = 1;
506 }
507
508out:
509 if (ret >= 0)
510 ret = read;
511 dout("striped_read returns %d\n", ret);
512 return ret;
513}
514
515/*
516 * Completely synchronous read and write methods. Direct from __user
517 * buffer to osd, or directly to user pages (if O_DIRECT).
518 *
519 * If the read spans object boundary, just do multiple reads.
520 */
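/*
 * Editorial sketch: calc_pages_for() (from the osd client) is assumed
 * to count the pages an extent touches after rounding it out to page
 * boundaries, conceptually:
 */
static inline int calc_pages_for_sketch(u64 off, u64 len)
{
	return ((off + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
		(off >> PAGE_CACHE_SHIFT);
}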
521static ssize_t ceph_sync_read(struct file *file, char __user *data,
522 unsigned len, loff_t *poff, int *checkeof)
523{
524 struct inode *inode = file->f_dentry->d_inode;
525 struct page **pages;
526 u64 off = *poff;
527 int num_pages = calc_pages_for(off, len);
528 int ret;
529
530 dout("sync_read on file %p %llu~%u %s\n", file, off, len,
531 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
532
533 if (file->f_flags & O_DIRECT) {
534 pages = get_direct_page_vector(data, num_pages, off, len);
535
536 /*
537 * flush any page cache pages in this range. this
538 * will make concurrent normal and O_DIRECT io slow,
539 * but it will at least behave sensibly when they are
540 * in sequence.
541 */
542 } else {
543 pages = alloc_page_vector(num_pages);
544 }
545 if (IS_ERR(pages))
546 return PTR_ERR(pages);
547
548 ret = filemap_write_and_wait(inode->i_mapping);
549 if (ret < 0)
550 goto done;
551
552 ret = striped_read(inode, off, len, pages, num_pages, checkeof);
553
554 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
555 ret = copy_page_vector_to_user(pages, data, off, ret);
556 if (ret >= 0)
557 *poff = off + ret;
558
559done:
560 if (file->f_flags & O_DIRECT)
561 put_page_vector(pages, num_pages);
562 else
563 ceph_release_page_vector(pages, num_pages);
564 dout("sync_read result %d\n", ret);
565 return ret;
566}
567
568/*
569 * Write commit callback, called if we requested both an ACK and
570 * ONDISK commit reply from the OSD.
571 */
572static void sync_write_commit(struct ceph_osd_request *req,
573 struct ceph_msg *msg)
574{
575 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
576
577 dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
578 spin_lock(&ci->i_unsafe_lock);
579 list_del_init(&req->r_unsafe_item);
580 spin_unlock(&ci->i_unsafe_lock);
581 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
582}
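/*
 * Editorial note: for buffered (non-O_SYNC, non-O_DIRECT) writers,
 * ceph_sync_write() below requests both an ACK and an ONDISK commit
 * from the OSD; ceph_osdc_wait_request() is then assumed to return
 * once the ACK arrives, while the sync_write_commit() callback above
 * drops the unsafe-write tracking when the ONDISK reply lands.
 */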
583
584/*
585 * Synchronous write, straight from __user pointer or user pages (if
586 * O_DIRECT).
587 *
588 * If write spans object boundary, just do multiple writes. (For a
589 * correct atomic write, we should e.g. take write locks on all
590 * objects, rollback on failure, etc.)
591 */
592static ssize_t ceph_sync_write(struct file *file, const char __user *data,
593 size_t left, loff_t *offset)
594{
595 struct inode *inode = file->f_dentry->d_inode;
596 struct ceph_inode_info *ci = ceph_inode(inode);
597 struct ceph_client *client = ceph_inode_to_client(inode);
598 struct ceph_osd_request *req;
599 struct page **pages;
600 int num_pages;
601 long long unsigned pos;
602 u64 len;
603 int written = 0;
604 int flags;
605 int do_sync = 0;
606 int check_caps = 0;
607 int ret;
608 struct timespec mtime = CURRENT_TIME;
609
610 if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
611 return -EROFS;
612
613 dout("sync_write on file %p %lld~%u %s\n", file, *offset,
614 (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
615
616 if (file->f_flags & O_APPEND)
617 pos = i_size_read(inode);
618 else
619 pos = *offset;
620
621 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
622 if (ret < 0)
623 return ret;
624
625 ret = invalidate_inode_pages2_range(inode->i_mapping,
626 pos >> PAGE_CACHE_SHIFT,
627 (pos + left) >> PAGE_CACHE_SHIFT);
628 if (ret < 0)
629 dout("invalidate_inode_pages2_range returned %d\n", ret);
630
631 flags = CEPH_OSD_FLAG_ORDERSNAP |
632 CEPH_OSD_FLAG_ONDISK |
633 CEPH_OSD_FLAG_WRITE;
634 if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
635 flags |= CEPH_OSD_FLAG_ACK;
636 else
637 do_sync = 1;
638
639 /*
640 * we may need to do multiple writes here if we span an object
641 * boundary. this isn't atomic, unfortunately. :(
642 */
643more:
644 len = left;
645 req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
646 ceph_vino(inode), pos, &len,
647 CEPH_OSD_OP_WRITE, flags,
648 ci->i_snap_realm->cached_context,
649 do_sync,
650 ci->i_truncate_seq, ci->i_truncate_size,
651 &mtime, false, 2);
652 if (IS_ERR(req))
653 return PTR_ERR(req);
654
655 num_pages = calc_pages_for(pos, len);
656
657 if (file->f_flags & O_DIRECT) {
658 pages = get_direct_page_vector(data, num_pages, pos, len);
659 if (IS_ERR(pages)) {
660 ret = PTR_ERR(pages);
661 goto out;
662 }
663
664 /*
665 * throw out any page cache pages in this range. this
666 * may block.
667 */
668 truncate_inode_pages_range(inode->i_mapping, pos, pos+len);
669 } else {
670 pages = alloc_page_vector(num_pages);
671 if (IS_ERR(pages)) {
672 ret = PTR_ERR(pages);
673 goto out;
674 }
675 ret = copy_user_to_page_vector(pages, data, pos, len);
676 if (ret < 0) {
677 ceph_release_page_vector(pages, num_pages);
678 goto out;
679 }
680
681 if ((file->f_flags & O_SYNC) == 0) {
682 /* get a second commit callback */
683 req->r_safe_callback = sync_write_commit;
684 req->r_own_pages = 1;
685 }
686 }
687 req->r_pages = pages;
688 req->r_num_pages = num_pages;
689 req->r_inode = inode;
690
691 ret = ceph_osdc_start_request(&client->osdc, req, false);
692 if (!ret) {
693 if (req->r_safe_callback) {
694 /*
695 * Add to inode unsafe list only after we
696 * start_request so that a tid has been assigned.
697 */
698 spin_lock(&ci->i_unsafe_lock);
699 list_add(&req->r_unsafe_item, &ci->i_unsafe_writes);
700 spin_unlock(&ci->i_unsafe_lock);
701 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
702 }
703 ret = ceph_osdc_wait_request(&client->osdc, req);
704 }
705
706 if (file->f_flags & O_DIRECT)
707 put_page_vector(pages, num_pages);
708 else if (file->f_flags & O_SYNC)
709 ceph_release_page_vector(pages, num_pages);
710
711out:
712 ceph_osdc_put_request(req);
713 if (ret == 0) {
714 pos += len;
715 written += len;
716 left -= len;
717 if (left)
718 goto more;
719
720 ret = written;
721 *offset = pos;
722 if (pos > i_size_read(inode))
723 check_caps = ceph_inode_set_size(inode, pos);
724 if (check_caps)
725 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
726 NULL);
727 }
728 return ret;
729}
730
731/*
732 * Wrap generic_file_aio_read with checks for cap bits on the inode.
733 * Atomically grab references, so that those bits are not released
734 * back to the MDS mid-read.
735 *
736 * Hmm, the sync read case isn't actually async... should it be?
737 */
738static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
739 unsigned long nr_segs, loff_t pos)
740{
741 struct file *filp = iocb->ki_filp;
742 loff_t *ppos = &iocb->ki_pos;
743 size_t len = iov->iov_len;
744 struct inode *inode = filp->f_dentry->d_inode;
745 struct ceph_inode_info *ci = ceph_inode(inode);
746 void *base = iov->iov_base;
747 ssize_t ret;
748 int got = 0;
749 int checkeof = 0, read = 0;
750
751 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
752 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
753again:
754 __ceph_do_pending_vmtruncate(inode);
755 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
756 &got, -1);
757 if (ret < 0)
758 goto out;
759 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
760 inode, ceph_vinop(inode), pos, (unsigned)len,
761 ceph_cap_string(got));
762
763 if ((got & CEPH_CAP_FILE_CACHE) == 0 ||
764 (iocb->ki_filp->f_flags & O_DIRECT) ||
765 (inode->i_sb->s_flags & MS_SYNCHRONOUS))
766 /* hmm, this isn't really async... */
767 ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
768 else
769 ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
770
771out:
772 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
773 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
774 ceph_put_cap_refs(ci, got);
775
776 if (checkeof && ret >= 0) {
777 int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
778
779 /* hit EOF or hole? */
780 if (statret == 0 && *ppos < inode->i_size) {
781 dout("aio_read sync_read hit hole, reading more\n");
782 read += ret;
783 base += ret;
784 len -= ret;
785 checkeof = 0;
786 goto again;
787 }
788 }
789 if (ret >= 0)
790 ret += read;
791
792 return ret;
793}
794
795/*
796 * Take cap references to avoid releasing caps to MDS mid-write.
797 *
798 * If we are synchronous, and write with an old snap context, the OSD
799 * may return EOLDSNAPC. In that case, retry the write, _after_
800 * dropping our cap refs and allowing the pending snap to logically
801 * complete _before_ this write occurs.
802 *
803 * If we are near ENOSPC, write synchronously.
804 */
805static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
806 unsigned long nr_segs, loff_t pos)
807{
808 struct file *file = iocb->ki_filp;
809 struct inode *inode = file->f_dentry->d_inode;
810 struct ceph_inode_info *ci = ceph_inode(inode);
811 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
812 loff_t endoff = pos + iov->iov_len;
813 int got = 0;
814 int ret, err;
815
816 if (ceph_snap(inode) != CEPH_NOSNAP)
817 return -EROFS;
818
819retry_snap:
820 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
821 return -ENOSPC;
822 __ceph_do_pending_vmtruncate(inode);
823 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
824 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
825 inode->i_size);
826 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
827 &got, endoff);
828 if (ret < 0)
829 goto out;
830
831 dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
832 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
833 ceph_cap_string(got));
834
835 if ((got & CEPH_CAP_FILE_BUFFER) == 0 ||
836 (iocb->ki_filp->f_flags & O_DIRECT) ||
837 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
838 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
839 &iocb->ki_pos);
840 } else {
841 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
842
843 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
844 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
845 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
846 err = vfs_fsync_range(file, file->f_path.dentry,
847 pos, pos + ret - 1, 1);
848 if (err < 0)
849 ret = err;
850 }
851 }
852 if (ret >= 0) {
853 spin_lock(&inode->i_lock);
854 __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
855 spin_unlock(&inode->i_lock);
856 }
857
858out:
859 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
860 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
861 ceph_cap_string(got));
862 ceph_put_cap_refs(ci, got);
863
864 if (ret == -EOLDSNAPC) {
865 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
866 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
867 goto retry_snap;
868 }
869
870 return ret;
871}
872
873/*
874 * llseek. be sure to verify file size on SEEK_END.
875 */
876static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
877{
878 struct inode *inode = file->f_mapping->host;
879 int ret;
880
881 mutex_lock(&inode->i_mutex);
882 __ceph_do_pending_vmtruncate(inode);
883 switch (origin) {
884 case SEEK_END:
885 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
886 if (ret < 0) {
887 offset = ret;
888 goto out;
889 }
890 offset += inode->i_size;
891 break;
892 case SEEK_CUR:
893 /*
894 * Here we special-case the lseek(fd, 0, SEEK_CUR)
895 * position-querying operation. Avoid rewriting the "same"
896 * f_pos value back to the file because a concurrent read(),
897 * write() or lseek() might have altered it.
898 */
899 if (offset == 0) {
900 offset = file->f_pos;
901 goto out;
902 }
903 offset += file->f_pos;
904 break;
905 }
906
907 if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
908 offset = -EINVAL;
909 goto out;
910 }
911
912 /* Special lock needed here? */
913 if (offset != file->f_pos) {
914 file->f_pos = offset;
915 file->f_version = 0;
916 }
917
918out:
919 mutex_unlock(&inode->i_mutex);
920 return offset;
921}
922
923const struct file_operations ceph_file_fops = {
924 .open = ceph_open,
925 .release = ceph_release,
926 .llseek = ceph_llseek,
927 .read = do_sync_read,
928 .write = do_sync_write,
929 .aio_read = ceph_aio_read,
930 .aio_write = ceph_aio_write,
931 .mmap = ceph_mmap,
932 .fsync = ceph_fsync,
933 .splice_read = generic_file_splice_read,
934 .splice_write = generic_file_splice_write,
935 .unlocked_ioctl = ceph_ioctl,
936 .compat_ioctl = ceph_ioctl,
937};
938
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
new file mode 100644
index 000000000000..aca82d55cc53
--- /dev/null
+++ b/fs/ceph/inode.c
@@ -0,0 +1,1766 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/fs.h>
5#include <linux/smp_lock.h>
6#include <linux/slab.h>
7#include <linux/string.h>
8#include <linux/uaccess.h>
9#include <linux/kernel.h>
10#include <linux/namei.h>
11#include <linux/writeback.h>
12#include <linux/vmalloc.h>
13#include <linux/pagevec.h>
14
15#include "super.h"
16#include "decode.h"
17
18/*
19 * Ceph inode operations
20 *
21 * Implement basic inode helpers (get, alloc) and inode ops (getattr,
22 * setattr, etc.), xattr helpers, and helpers for assimilating
23 * metadata returned by the MDS into our cache.
24 *
25 * Also define helpers for doing asynchronous writeback, invalidation,
26 * and truncation for the benefit of those who can't afford to block
27 * (typically because they are in the message handler path).
28 */
29
30static const struct inode_operations ceph_symlink_iops;
31
32static void ceph_invalidate_work(struct work_struct *work);
33static void ceph_writeback_work(struct work_struct *work);
34static void ceph_vmtruncate_work(struct work_struct *work);
35
36/*
37 * find or create an inode, given the ceph ino number
38 */
39struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
40{
41 struct inode *inode;
42 ino_t t = ceph_vino_to_ino(vino);
43
44 inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
45 if (inode == NULL)
46 return ERR_PTR(-ENOMEM);
47 if (inode->i_state & I_NEW) {
48 dout("get_inode created new inode %p %llx.%llx ino %llx\n",
49 inode, ceph_vinop(inode), (u64)inode->i_ino);
50 unlock_new_inode(inode);
51 }
52
53 dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
54 vino.snap, inode);
55 return inode;
56}
57
58/*
59 * get/construct snapdir inode for a given directory
60 */
61struct inode *ceph_get_snapdir(struct inode *parent)
62{
63 struct ceph_vino vino = {
64 .ino = ceph_ino(parent),
65 .snap = CEPH_SNAPDIR,
66 };
67 struct inode *inode = ceph_get_inode(parent->i_sb, vino);
68 struct ceph_inode_info *ci = ceph_inode(inode);
69
70 BUG_ON(!S_ISDIR(parent->i_mode));
71 if (IS_ERR(inode))
72 return ERR_PTR(PTR_ERR(inode));
73 inode->i_mode = parent->i_mode;
74 inode->i_uid = parent->i_uid;
75 inode->i_gid = parent->i_gid;
76 inode->i_op = &ceph_dir_iops;
77 inode->i_fop = &ceph_dir_fops;
78 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
79 ci->i_rbytes = 0;
80 return inode;
81}
82
83const struct inode_operations ceph_file_iops = {
84 .permission = ceph_permission,
85 .setattr = ceph_setattr,
86 .getattr = ceph_getattr,
87 .setxattr = ceph_setxattr,
88 .getxattr = ceph_getxattr,
89 .listxattr = ceph_listxattr,
90 .removexattr = ceph_removexattr,
91};
92
93
94/*
95 * We use a 'frag tree' to keep track of the MDS's directory fragments
96 * for a given inode (usually there is just a single fragment). We
97 * need to know when a child frag is delegated to a new MDS, or when
98 * it is flagged as replicated, so we can direct our requests
99 * accordingly.
100 */
101
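/*
 * Editorial sketch (simplified, with no locking or delegation
 * copy-out): how a lookup walks the recorded splits from the root
 * frag down to the leaf containing a hash value 'v', as
 * ceph_choose_frag() below does.  Assumes the ceph_frag_* helpers
 * from ceph_frag.h.
 */
static u32 ceph_frag_descend_sketch(struct ceph_inode_info *ci, u32 v)
{
	u32 t = ceph_frag_make(0, 0);	/* root frag covers everything */
	struct ceph_inode_frag *frag;

	while ((frag = __ceph_find_frag(ci, t)) != NULL && frag->split_by) {
		unsigned i, nway = 1 << frag->split_by;

		for (i = 0; i < nway; i++) {
			u32 n = ceph_frag_make_child(t, frag->split_by, i);

			if (ceph_frag_contains_value(n, v)) {
				t = n;	/* descend into matching child */
				break;
			}
		}
	}
	return t;			/* leaf frag containing v */
}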
102/*
103 * find/create a frag in the tree
104 */
105static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
106 u32 f)
107{
108 struct rb_node **p;
109 struct rb_node *parent = NULL;
110 struct ceph_inode_frag *frag;
111 int c;
112
113 p = &ci->i_fragtree.rb_node;
114 while (*p) {
115 parent = *p;
116 frag = rb_entry(parent, struct ceph_inode_frag, node);
117 c = ceph_frag_compare(f, frag->frag);
118 if (c < 0)
119 p = &(*p)->rb_left;
120 else if (c > 0)
121 p = &(*p)->rb_right;
122 else
123 return frag;
124 }
125
126 frag = kmalloc(sizeof(*frag), GFP_NOFS);
127 if (!frag) {
128 pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
129 "frag %x\n", &ci->vfs_inode,
130 ceph_vinop(&ci->vfs_inode), f);
131 return ERR_PTR(-ENOMEM);
132 }
133 frag->frag = f;
134 frag->split_by = 0;
135 frag->mds = -1;
136 frag->ndist = 0;
137
138 rb_link_node(&frag->node, parent, p);
139 rb_insert_color(&frag->node, &ci->i_fragtree);
140
141 dout("get_or_create_frag added %llx.%llx frag %x\n",
142 ceph_vinop(&ci->vfs_inode), f);
143 return frag;
144}
145
146/*
147 * find a specific frag @f
148 */
149struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
150{
151 struct rb_node *n = ci->i_fragtree.rb_node;
152
153 while (n) {
154 struct ceph_inode_frag *frag =
155 rb_entry(n, struct ceph_inode_frag, node);
156 int c = ceph_frag_compare(f, frag->frag);
157 if (c < 0)
158 n = n->rb_left;
159 else if (c > 0)
160 n = n->rb_right;
161 else
162 return frag;
163 }
164 return NULL;
165}
166
167/*
168 * Choose frag containing the given value @v. If @pfrag is
169 * specified, copy the frag delegation info to the caller if
170 * it is present.
171 */
172u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
173 struct ceph_inode_frag *pfrag,
174 int *found)
175{
176 u32 t = ceph_frag_make(0, 0);
177 struct ceph_inode_frag *frag;
178 unsigned nway, i;
179 u32 n;
180
181 if (found)
182 *found = 0;
183
184 mutex_lock(&ci->i_fragtree_mutex);
185 while (1) {
186 WARN_ON(!ceph_frag_contains_value(t, v));
187 frag = __ceph_find_frag(ci, t);
188 if (!frag)
189 break; /* t is a leaf */
190 if (frag->split_by == 0) {
191 if (pfrag)
192 memcpy(pfrag, frag, sizeof(*pfrag));
193 if (found)
194 *found = 1;
195 break;
196 }
197
198 /* choose child */
199 nway = 1 << frag->split_by;
200 dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
201 frag->split_by, nway);
202 for (i = 0; i < nway; i++) {
203 n = ceph_frag_make_child(t, frag->split_by, i);
204 if (ceph_frag_contains_value(n, v)) {
205 t = n;
206 break;
207 }
208 }
209 BUG_ON(i == nway);
210 }
211 dout("choose_frag(%x) = %x\n", v, t);
212
213 mutex_unlock(&ci->i_fragtree_mutex);
214 return t;
215}
216
217/*
218 * Process dirfrag (delegation) info from the mds. Include leaf
219 * fragment in tree ONLY if ndist > 0. Otherwise, only
220 * branches/splits are included in the i_fragtree.
221 */
222static int ceph_fill_dirfrag(struct inode *inode,
223 struct ceph_mds_reply_dirfrag *dirinfo)
224{
225 struct ceph_inode_info *ci = ceph_inode(inode);
226 struct ceph_inode_frag *frag;
227 u32 id = le32_to_cpu(dirinfo->frag);
228 int mds = le32_to_cpu(dirinfo->auth);
229 int ndist = le32_to_cpu(dirinfo->ndist);
230 int i;
231 int err = 0;
232
233 mutex_lock(&ci->i_fragtree_mutex);
234 if (ndist == 0) {
235 /* no delegation info needed. */
236 frag = __ceph_find_frag(ci, id);
237 if (!frag)
238 goto out;
239 if (frag->split_by == 0) {
240 /* tree leaf, remove */
241 dout("fill_dirfrag removed %llx.%llx frag %x"
242 " (no ref)\n", ceph_vinop(inode), id);
243 rb_erase(&frag->node, &ci->i_fragtree);
244 kfree(frag);
245 } else {
246 /* tree branch, keep and clear */
247 dout("fill_dirfrag cleared %llx.%llx frag %x"
248 " referral\n", ceph_vinop(inode), id);
249 frag->mds = -1;
250 frag->ndist = 0;
251 }
252 goto out;
253 }
254
255
256 /* find/add this frag to store mds delegation info */
257 frag = __get_or_create_frag(ci, id);
258 if (IS_ERR(frag)) {
259 /* this is not the end of the world; we can continue
260 with bad/inaccurate delegation info */
261 pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
262 ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
263 err = -ENOMEM;
264 goto out;
265 }
266
267 frag->mds = mds;
268 frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
269 for (i = 0; i < frag->ndist; i++)
270 frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
271 dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
272 ceph_vinop(inode), frag->frag, frag->ndist);
273
274out:
275 mutex_unlock(&ci->i_fragtree_mutex);
276 return err;
277}
278
279
280/*
281 * initialize a newly allocated inode.
282 */
283struct inode *ceph_alloc_inode(struct super_block *sb)
284{
285 struct ceph_inode_info *ci;
286 int i;
287
288 ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
289 if (!ci)
290 return NULL;
291
292 dout("alloc_inode %p\n", &ci->vfs_inode);
293
294 ci->i_version = 0;
295 ci->i_time_warp_seq = 0;
296 ci->i_ceph_flags = 0;
297 ci->i_release_count = 0;
298 ci->i_symlink = NULL;
299
300 ci->i_fragtree = RB_ROOT;
301 mutex_init(&ci->i_fragtree_mutex);
302
303 ci->i_xattrs.blob = NULL;
304 ci->i_xattrs.prealloc_blob = NULL;
305 ci->i_xattrs.dirty = false;
306 ci->i_xattrs.index = RB_ROOT;
307 ci->i_xattrs.count = 0;
308 ci->i_xattrs.names_size = 0;
309 ci->i_xattrs.vals_size = 0;
310 ci->i_xattrs.version = 0;
311 ci->i_xattrs.index_version = 0;
312
313 ci->i_caps = RB_ROOT;
314 ci->i_auth_cap = NULL;
315 ci->i_dirty_caps = 0;
316 ci->i_flushing_caps = 0;
317 INIT_LIST_HEAD(&ci->i_dirty_item);
318 INIT_LIST_HEAD(&ci->i_flushing_item);
319 ci->i_cap_flush_seq = 0;
320 ci->i_cap_flush_last_tid = 0;
321 memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
322 init_waitqueue_head(&ci->i_cap_wq);
323 ci->i_hold_caps_min = 0;
324 ci->i_hold_caps_max = 0;
325 INIT_LIST_HEAD(&ci->i_cap_delay_list);
326 ci->i_cap_exporting_mds = 0;
327 ci->i_cap_exporting_mseq = 0;
328 ci->i_cap_exporting_issued = 0;
329 INIT_LIST_HEAD(&ci->i_cap_snaps);
330 ci->i_head_snapc = NULL;
331 ci->i_snap_caps = 0;
332
333 for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
334 ci->i_nr_by_mode[i] = 0;
335
336 ci->i_truncate_seq = 0;
337 ci->i_truncate_size = 0;
338 ci->i_truncate_pending = 0;
339
340 ci->i_max_size = 0;
341 ci->i_reported_size = 0;
342 ci->i_wanted_max_size = 0;
343 ci->i_requested_max_size = 0;
344
345 ci->i_pin_ref = 0;
346 ci->i_rd_ref = 0;
347 ci->i_rdcache_ref = 0;
348 ci->i_wr_ref = 0;
349 ci->i_wrbuffer_ref = 0;
350 ci->i_wrbuffer_ref_head = 0;
351 ci->i_shared_gen = 0;
352 ci->i_rdcache_gen = 0;
353 ci->i_rdcache_revoking = 0;
354
355 INIT_LIST_HEAD(&ci->i_unsafe_writes);
356 INIT_LIST_HEAD(&ci->i_unsafe_dirops);
357 spin_lock_init(&ci->i_unsafe_lock);
358
359 ci->i_snap_realm = NULL;
360 INIT_LIST_HEAD(&ci->i_snap_realm_item);
361 INIT_LIST_HEAD(&ci->i_snap_flush_item);
362
363 INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
364 INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
365
366 INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
367
368 return &ci->vfs_inode;
369}
370
371void ceph_destroy_inode(struct inode *inode)
372{
373 struct ceph_inode_info *ci = ceph_inode(inode);
374 struct ceph_inode_frag *frag;
375 struct rb_node *n;
376
377 dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
378
379 ceph_queue_caps_release(inode);
380
381 /*
382 * we may still have a snap_realm reference if there are stray
383 * caps in i_cap_exporting_issued or i_snap_caps.
384 */
385 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc =
387 &ceph_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm;
389
390 dout(" dropping residual ref to snap realm %p\n", realm);
391 spin_lock(&realm->inodes_with_caps_lock);
392 list_del_init(&ci->i_snap_realm_item);
393 spin_unlock(&realm->inodes_with_caps_lock);
394 ceph_put_snap_realm(mdsc, realm);
395 }
396
397 kfree(ci->i_symlink);
398 while ((n = rb_first(&ci->i_fragtree)) != NULL) {
399 frag = rb_entry(n, struct ceph_inode_frag, node);
400 rb_erase(n, &ci->i_fragtree);
401 kfree(frag);
402 }
403
404 __ceph_destroy_xattrs(ci);
405 if (ci->i_xattrs.blob)
406 ceph_buffer_put(ci->i_xattrs.blob);
407 if (ci->i_xattrs.prealloc_blob)
408 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
409
410 kmem_cache_free(ceph_inode_cachep, ci);
411}
412
413
414/*
415 * Helpers to fill in size, ctime, mtime, and atime. We have to be
416 * careful because either the client or MDS may have more up to date
417 * info, depending on which capabilities are held, and whether
418 * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
419 * and size are monotonically increasing, except when utimes() or
420 * truncate() increments the corresponding _seq values.)
421 */
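/*
 * Editorial note: ceph_seq_cmp() (from super.h) is assumed to be a
 * wraparound-safe sequence comparison, conceptually
 * (__s32)a - (__s32)b, so "newer" still wins when a 32-bit seq wraps.
 */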
422int ceph_fill_file_size(struct inode *inode, int issued,
423 u32 truncate_seq, u64 truncate_size, u64 size)
424{
425 struct ceph_inode_info *ci = ceph_inode(inode);
426 int queue_trunc = 0;
427
428 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
429 (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
430 dout("size %lld -> %llu\n", inode->i_size, size);
431 inode->i_size = size;
432 inode->i_blocks = (size + (1<<9) - 1) >> 9;
433 ci->i_reported_size = size;
434 if (truncate_seq != ci->i_truncate_seq) {
435 dout("truncate_seq %u -> %u\n",
436 ci->i_truncate_seq, truncate_seq);
437 ci->i_truncate_seq = truncate_seq;
438 /*
439 * If we hold relevant caps, or in the case where we're
440 * not the only client referencing this file and we
441 * don't hold those caps, then we need to check whether
442 * the file is either opened or mmaped
443 */
444 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
445 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
446 CEPH_CAP_FILE_EXCL)) ||
447 mapping_mapped(inode->i_mapping) ||
448 __ceph_caps_file_wanted(ci)) {
449 ci->i_truncate_pending++;
450 queue_trunc = 1;
451 }
452 }
453 }
454 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
455 ci->i_truncate_size != truncate_size) {
456 dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
457 truncate_size);
458 ci->i_truncate_size = truncate_size;
459 }
460 return queue_trunc;
461}
462
463void ceph_fill_file_time(struct inode *inode, int issued,
464 u64 time_warp_seq, struct timespec *ctime,
465 struct timespec *mtime, struct timespec *atime)
466{
467 struct ceph_inode_info *ci = ceph_inode(inode);
468 int warn = 0;
469
470 if (issued & (CEPH_CAP_FILE_EXCL|
471 CEPH_CAP_FILE_WR|
472 CEPH_CAP_FILE_BUFFER)) {
473 if (timespec_compare(ctime, &inode->i_ctime) > 0) {
474 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
475 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
476 ctime->tv_sec, ctime->tv_nsec);
477 inode->i_ctime = *ctime;
478 }
479 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
480 /* the MDS did a utimes() */
481 dout("mtime %ld.%09ld -> %ld.%09ld "
482 "tw %d -> %d\n",
483 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
484 mtime->tv_sec, mtime->tv_nsec,
485 ci->i_time_warp_seq, (int)time_warp_seq);
486
487 inode->i_mtime = *mtime;
488 inode->i_atime = *atime;
489 ci->i_time_warp_seq = time_warp_seq;
490 } else if (time_warp_seq == ci->i_time_warp_seq) {
491 /* nobody did utimes(); take the max */
492 if (timespec_compare(mtime, &inode->i_mtime) > 0) {
493 dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
494 inode->i_mtime.tv_sec,
495 inode->i_mtime.tv_nsec,
496 mtime->tv_sec, mtime->tv_nsec);
497 inode->i_mtime = *mtime;
498 }
499 if (timespec_compare(atime, &inode->i_atime) > 0) {
500 dout("atime %ld.%09ld -> %ld.%09ld inc\n",
501 inode->i_atime.tv_sec,
502 inode->i_atime.tv_nsec,
503 atime->tv_sec, atime->tv_nsec);
504 inode->i_atime = *atime;
505 }
506 } else if (issued & CEPH_CAP_FILE_EXCL) {
507 /* we did a utimes(); ignore mds values */
508 } else {
509 warn = 1;
510 }
511 } else {
512 /* we have no write caps; whatever the MDS says is true */
513 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
514 inode->i_ctime = *ctime;
515 inode->i_mtime = *mtime;
516 inode->i_atime = *atime;
517 ci->i_time_warp_seq = time_warp_seq;
518 } else {
519 warn = 1;
520 }
521 }
522 if (warn) /* time_warp_seq shouldn't go backwards */
523 dout("%p mds time_warp_seq %llu < %u\n",
524 inode, time_warp_seq, ci->i_time_warp_seq);
525}
526
527/*
528 * Populate an inode based on info from mds. May be called on new or
529 * existing inodes.
530 */
531static int fill_inode(struct inode *inode,
532 struct ceph_mds_reply_info_in *iinfo,
533 struct ceph_mds_reply_dirfrag *dirinfo,
534 struct ceph_mds_session *session,
535 unsigned long ttl_from, int cap_fmode,
536 struct ceph_cap_reservation *caps_reservation)
537{
538 struct ceph_mds_reply_inode *info = iinfo->in;
539 struct ceph_inode_info *ci = ceph_inode(inode);
540 int i;
541 int issued, implemented;
542 struct timespec mtime, atime, ctime;
543 u32 nsplits;
544 struct ceph_buffer *xattr_blob = NULL;
545 int err = 0;
546 int queue_trunc = 0;
547
548 dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
549 inode, ceph_vinop(inode), le64_to_cpu(info->version),
550 ci->i_version);
551
552 /*
553 * prealloc xattr data, if it looks like we'll need it. only
554 * if len > 4 (meaning there are actually xattrs; the first 4
555 * bytes are the xattr count).
556 */
557 if (iinfo->xattr_len > 4) {
558 xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
559 if (!xattr_blob)
560 pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
561 iinfo->xattr_len);
562 }
563
564 spin_lock(&inode->i_lock);
565
566 /*
567 * provided version will be odd if inode value is projected,
568 * even if stable. skip the update if we have a newer info
569 * (e.g., due to inode info racing from multiple MDSs), or if
570 * we are getting projected (unstable) inode info.
571 */
572 if (le64_to_cpu(info->version) > 0 &&
573 (ci->i_version & ~1) > le64_to_cpu(info->version))
574 goto no_change;
575
576 issued = __ceph_caps_issued(ci, &implemented);
577 issued |= implemented | __ceph_caps_dirty(ci);
578
579 /* update inode */
580 ci->i_version = le64_to_cpu(info->version);
581 inode->i_version++;
582 inode->i_rdev = le32_to_cpu(info->rdev);
583
584 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
585 inode->i_mode = le32_to_cpu(info->mode);
586 inode->i_uid = le32_to_cpu(info->uid);
587 inode->i_gid = le32_to_cpu(info->gid);
588 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
589 inode->i_uid, inode->i_gid);
590 }
591
592 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
593 inode->i_nlink = le32_to_cpu(info->nlink);
594
595 /* be careful with mtime, atime, size */
596 ceph_decode_timespec(&atime, &info->atime);
597 ceph_decode_timespec(&mtime, &info->mtime);
598 ceph_decode_timespec(&ctime, &info->ctime);
599 queue_trunc = ceph_fill_file_size(inode, issued,
600 le32_to_cpu(info->truncate_seq),
601 le64_to_cpu(info->truncate_size),
602 le64_to_cpu(info->size));
603 ceph_fill_file_time(inode, issued,
604 le32_to_cpu(info->time_warp_seq),
605 &ctime, &mtime, &atime);
606
607 ci->i_max_size = le64_to_cpu(info->max_size);
608 ci->i_layout = info->layout;
609 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
610
611 /* xattrs */
612 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
613 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
614 le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
615 if (ci->i_xattrs.blob)
616 ceph_buffer_put(ci->i_xattrs.blob);
617 ci->i_xattrs.blob = xattr_blob;
618 if (xattr_blob)
619 memcpy(ci->i_xattrs.blob->vec.iov_base,
620 iinfo->xattr_data, iinfo->xattr_len);
621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
622 }
623
624 inode->i_mapping->a_ops = &ceph_aops;
625 inode->i_mapping->backing_dev_info =
626 &ceph_client(inode->i_sb)->backing_dev_info;
627
628 switch (inode->i_mode & S_IFMT) {
629 case S_IFIFO:
630 case S_IFBLK:
631 case S_IFCHR:
632 case S_IFSOCK:
633 init_special_inode(inode, inode->i_mode, inode->i_rdev);
634 inode->i_op = &ceph_file_iops;
635 break;
636 case S_IFREG:
637 inode->i_op = &ceph_file_iops;
638 inode->i_fop = &ceph_file_fops;
639 break;
640 case S_IFLNK:
641 inode->i_op = &ceph_symlink_iops;
642 if (!ci->i_symlink) {
643 int symlen = iinfo->symlink_len;
644 char *sym;
645
646 BUG_ON(symlen != inode->i_size);
647 spin_unlock(&inode->i_lock);
648
649 err = -ENOMEM;
650 sym = kmalloc(symlen+1, GFP_NOFS);
651 if (!sym)
652 goto out;
653 memcpy(sym, iinfo->symlink, symlen);
654 sym[symlen] = 0;
655
656 spin_lock(&inode->i_lock);
657 if (!ci->i_symlink)
658 ci->i_symlink = sym;
659 else
660 kfree(sym); /* lost a race */
661 }
662 break;
663 case S_IFDIR:
664 inode->i_op = &ceph_dir_iops;
665 inode->i_fop = &ceph_dir_fops;
666
667 ci->i_files = le64_to_cpu(info->files);
668 ci->i_subdirs = le64_to_cpu(info->subdirs);
669 ci->i_rbytes = le64_to_cpu(info->rbytes);
670 ci->i_rfiles = le64_to_cpu(info->rfiles);
671 ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
672 ceph_decode_timespec(&ci->i_rctime, &info->rctime);
673
674 /* set dir completion flag? */
675 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
676 ceph_snap(inode) == CEPH_NOSNAP &&
677 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
678 dout(" marking %p complete (empty)\n", inode);
679 ci->i_ceph_flags |= CEPH_I_COMPLETE;
680 ci->i_max_offset = 2;
681 }
682
683 /* it may be better to set st_size in getattr instead? */
684 if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
685 inode->i_size = ci->i_rbytes;
686 break;
687 default:
688 pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
689 ceph_vinop(inode), inode->i_mode);
690 }
691
692no_change:
693 spin_unlock(&inode->i_lock);
694
695 /* queue truncate if we saw i_size decrease */
696 if (queue_trunc)
697 ceph_queue_vmtruncate(inode);
698
699 /* populate frag tree */
700 /* FIXME: move me up, if/when version reflects fragtree changes */
701 nsplits = le32_to_cpu(info->fragtree.nsplits);
702 mutex_lock(&ci->i_fragtree_mutex);
703 for (i = 0; i < nsplits; i++) {
704 u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
705 struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
706
707 if (IS_ERR(frag))
708 continue;
709 frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
710 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
711 }
712 mutex_unlock(&ci->i_fragtree_mutex);
713
714 /* were we issued a capability? */
715 if (info->cap.caps) {
716 if (ceph_snap(inode) == CEPH_NOSNAP) {
717 ceph_add_cap(inode, session,
718 le64_to_cpu(info->cap.cap_id),
719 cap_fmode,
720 le32_to_cpu(info->cap.caps),
721 le32_to_cpu(info->cap.wanted),
722 le32_to_cpu(info->cap.seq),
723 le32_to_cpu(info->cap.mseq),
724 le64_to_cpu(info->cap.realm),
725 info->cap.flags,
726 caps_reservation);
727 } else {
728 spin_lock(&inode->i_lock);
729 dout(" %p got snap_caps %s\n", inode,
730 ceph_cap_string(le32_to_cpu(info->cap.caps)));
731 ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
732 if (cap_fmode >= 0)
733 __ceph_get_fmode(ci, cap_fmode);
734 spin_unlock(&inode->i_lock);
735 }
736 }
737
738 /* update delegation info? */
739 if (dirinfo)
740 ceph_fill_dirfrag(inode, dirinfo);
741
742 err = 0;
743
744out:
745 if (xattr_blob)
746 ceph_buffer_put(xattr_blob);
747 return err;
748}
749
750/*
751 * caller should hold session s_mutex.
752 */
753static void update_dentry_lease(struct dentry *dentry,
754 struct ceph_mds_reply_lease *lease,
755 struct ceph_mds_session *session,
756 unsigned long from_time)
757{
758 struct ceph_dentry_info *di = ceph_dentry(dentry);
759 long unsigned duration = le32_to_cpu(lease->duration_ms);
760 long unsigned ttl = from_time + (duration * HZ) / 1000;
761 long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
762 struct inode *dir;
763
764 /* only track leases on regular dentries */
765 if (dentry->d_op != &ceph_dentry_ops)
766 return;
767
768 spin_lock(&dentry->d_lock);
769 dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
770 dentry, le16_to_cpu(lease->mask), duration, ttl);
771
772 /* make lease_rdcache_gen match directory */
773 dir = dentry->d_parent->d_inode;
774 di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
775
776 if (lease->mask == 0)
777 goto out_unlock;
778
779 if (di->lease_gen == session->s_cap_gen &&
780 time_before(ttl, dentry->d_time))
781 goto out_unlock; /* we already have a newer lease. */
782
783 if (di->lease_session && di->lease_session != session)
784 goto out_unlock;
785
786 ceph_dentry_lru_touch(dentry);
787
788 if (!di->lease_session)
789 di->lease_session = ceph_get_mds_session(session);
790 di->lease_gen = session->s_cap_gen;
791 di->lease_seq = le32_to_cpu(lease->seq);
792 di->lease_renew_after = half_ttl;
793 di->lease_renew_from = 0;
794 dentry->d_time = ttl;
795out_unlock:
796 spin_unlock(&dentry->d_lock);
797 return;
798}
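/*
 * Worked example (editorial): with HZ=1000 and a 30000 ms lease, the
 * ttl computed above lands 30000 jiffies past from_time, and
 * lease_renew_after sits at the half-way point, 15000 jiffies past
 * from_time.
 */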
799
800/*
801 * splice a dentry to an inode.
802 * caller must hold directory i_mutex for this to be safe.
803 *
804 * we will only rehash the resulting dentry if @prehash is
805 * true; @prehash will be set to false (for the benefit of
806 * the caller) if we fail.
807 */
808static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
809 bool *prehash)
810{
811 struct dentry *realdn;
812
813 /* dn must be unhashed */
814 if (!d_unhashed(dn))
815 d_drop(dn);
816 realdn = d_materialise_unique(dn, in);
817 if (IS_ERR(realdn)) {
818 pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
819 dn, in, ceph_vinop(in));
820 if (prehash)
821 *prehash = false; /* don't rehash on error */
822 dn = realdn; /* note realdn contains the error */
823 goto out;
824 } else if (realdn) {
825 dout("dn %p (%d) spliced with %p (%d) "
826 "inode %p ino %llx.%llx\n",
827 dn, atomic_read(&dn->d_count),
828 realdn, atomic_read(&realdn->d_count),
829 realdn->d_inode, ceph_vinop(realdn->d_inode));
830 dput(dn);
831 dn = realdn;
832 } else {
833 BUG_ON(!ceph_dentry(dn));
834
835 dout("dn %p attached to %p ino %llx.%llx\n",
836 dn, dn->d_inode, ceph_vinop(dn->d_inode));
837 }
838 if ((!prehash || *prehash) && d_unhashed(dn))
839 d_rehash(dn);
840out:
841 return dn;
842}
843
844/*
845 * Set dentry's directory position based on the current dir's max, and
846 * order it in d_subdirs, so that dcache_readdir behaves.
847 */
848static void ceph_set_dentry_offset(struct dentry *dn)
849{
850 struct dentry *dir = dn->d_parent;
851 struct inode *inode = dn->d_parent->d_inode;
852 struct ceph_dentry_info *di;
853
854 BUG_ON(!inode);
855
856 di = ceph_dentry(dn);
857
858 spin_lock(&inode->i_lock);
859 di->offset = ceph_inode(inode)->i_max_offset++;
860 spin_unlock(&inode->i_lock);
861
862 spin_lock(&dcache_lock);
863 spin_lock(&dn->d_lock);
864 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
865 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
866 dn->d_u.d_child.prev, dn->d_u.d_child.next);
867 spin_unlock(&dn->d_lock);
868 spin_unlock(&dcache_lock);
869}
870
871/*
872 * Incorporate results into the local cache. This is either just
873 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
874 * after a lookup).
875 *
876 * A reply may contain
877 * a directory inode along with a dentry.
878 * and/or a target inode
879 *
880 * Called with snap_rwsem (read).
881 */
882int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
883 struct ceph_mds_session *session)
884{
885 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
886 struct inode *in = NULL;
887 struct ceph_mds_reply_inode *ininfo;
888 struct ceph_vino vino;
889 int i = 0;
890 int err = 0;
891
892 dout("fill_trace %p is_dentry %d is_target %d\n", req,
893 rinfo->head->is_dentry, rinfo->head->is_target);
894
895#if 0
896 /*
897 * Debugging hook:
898 *
899 * If we resend completed ops to a recovering mds, we get no
900 * trace. Since that is very rare, pretend this is the case
901 * to ensure the 'no trace' handlers in the callers behave.
902 *
903 * Fill in inodes unconditionally to avoid breaking cap
904 * invariants.
905 */
906 if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
907 pr_info("fill_trace faking empty trace on %lld %s\n",
908 req->r_tid, ceph_mds_op_name(rinfo->head->op));
909 if (rinfo->head->is_dentry) {
910 rinfo->head->is_dentry = 0;
911 err = fill_inode(req->r_locked_dir,
912 &rinfo->diri, rinfo->dirfrag,
913 session, req->r_request_started, -1);
914 }
915 if (rinfo->head->is_target) {
916 rinfo->head->is_target = 0;
917 ininfo = rinfo->targeti.in;
918 vino.ino = le64_to_cpu(ininfo->ino);
919 vino.snap = le64_to_cpu(ininfo->snapid);
920 in = ceph_get_inode(sb, vino);
921 err = fill_inode(in, &rinfo->targeti, NULL,
922 session, req->r_request_started,
923 req->r_fmode);
924 iput(in);
925 }
926 }
927#endif
928
929 if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
930 dout("fill_trace reply is empty!\n");
931 if (rinfo->head->result == 0 && req->r_locked_dir) {
932 struct ceph_inode_info *ci =
933 ceph_inode(req->r_locked_dir);
934 dout(" clearing %p complete (empty trace)\n",
935 req->r_locked_dir);
936 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
937 ci->i_release_count++;
938 }
939 return 0;
940 }
941
942 if (rinfo->head->is_dentry) {
943 struct inode *dir = req->r_locked_dir;
944
945 err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
946 session, req->r_request_started, -1,
947 &req->r_caps_reservation);
948 if (err < 0)
949 return err;
950 }
951
952 if (rinfo->head->is_dentry && !req->r_aborted) {
953 /*
954 * lookup link rename : null -> possibly existing inode
955 * mknod symlink mkdir : null -> new inode
956 * unlink : linked -> null
957 */
958 struct inode *dir = req->r_locked_dir;
959 struct dentry *dn = req->r_dentry;
960 bool have_dir_cap, have_lease;
961
962 BUG_ON(!dn);
963 BUG_ON(!dir);
964 BUG_ON(dn->d_parent->d_inode != dir);
965 BUG_ON(ceph_ino(dir) !=
966 le64_to_cpu(rinfo->diri.in->ino));
967 BUG_ON(ceph_snap(dir) !=
968 le64_to_cpu(rinfo->diri.in->snapid));
969
970 /* do we have a lease on the whole dir? */
971 have_dir_cap =
972 (le32_to_cpu(rinfo->diri.in->cap.caps) &
973 CEPH_CAP_FILE_SHARED);
974
975 /* do we have a dn lease? */
976 have_lease = have_dir_cap ||
977 (le16_to_cpu(rinfo->dlease->mask) &
978 CEPH_LOCK_DN);
979
980 if (!have_lease)
981 dout("fill_trace no dentry lease or dir cap\n");
982
983 /* rename? */
984 if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
985 dout(" src %p '%.*s' dst %p '%.*s'\n",
986 req->r_old_dentry,
987 req->r_old_dentry->d_name.len,
988 req->r_old_dentry->d_name.name,
989 dn, dn->d_name.len, dn->d_name.name);
990 dout("fill_trace doing d_move %p -> %p\n",
991 req->r_old_dentry, dn);
992 d_move(req->r_old_dentry, dn);
993 dout(" src %p '%.*s' dst %p '%.*s'\n",
994 req->r_old_dentry,
995 req->r_old_dentry->d_name.len,
996 req->r_old_dentry->d_name.name,
997 dn, dn->d_name.len, dn->d_name.name);
998 /* ensure target dentry is invalidated, despite
999 rehashing bug in vfs_rename_dir */
1000 dn->d_time = jiffies;
1001 ceph_dentry(dn)->lease_shared_gen = 0;
1002 /* take overwritten dentry's readdir offset */
1003 ceph_dentry(req->r_old_dentry)->offset =
1004 ceph_dentry(dn)->offset;
1005 dn = req->r_old_dentry; /* use old_dentry */
1006 in = dn->d_inode;
1007 }
1008
1009 /* null dentry? */
1010 if (!rinfo->head->is_target) {
1011 dout("fill_trace null dentry\n");
1012 if (dn->d_inode) {
1013 dout("d_delete %p\n", dn);
1014 d_delete(dn);
1015 } else {
1016 dout("d_instantiate %p NULL\n", dn);
1017 d_instantiate(dn, NULL);
1018 if (have_lease && d_unhashed(dn))
1019 d_rehash(dn);
1020 update_dentry_lease(dn, rinfo->dlease,
1021 session,
1022 req->r_request_started);
1023 }
1024 goto done;
1025 }
1026
1027 /* attach proper inode */
1028 ininfo = rinfo->targeti.in;
1029 vino.ino = le64_to_cpu(ininfo->ino);
1030 vino.snap = le64_to_cpu(ininfo->snapid);
1031 if (!dn->d_inode) {
1032 in = ceph_get_inode(sb, vino);
1033 if (IS_ERR(in)) {
1034 pr_err("fill_trace bad get_inode "
1035 "%llx.%llx\n", vino.ino, vino.snap);
1036 err = PTR_ERR(in);
1037 d_delete(dn);
1038 goto done;
1039 }
1040 dn = splice_dentry(dn, in, &have_lease);
1041 if (IS_ERR(dn)) {
1042 err = PTR_ERR(dn);
1043 goto done;
1044 }
1045 req->r_dentry = dn; /* may have spliced */
1046 ceph_set_dentry_offset(dn);
1047 igrab(in);
1048 } else if (ceph_ino(in) == vino.ino &&
1049 ceph_snap(in) == vino.snap) {
1050 igrab(in);
1051 } else {
1052 dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
1053 dn, in, ceph_ino(in), ceph_snap(in),
1054 vino.ino, vino.snap);
1055 have_lease = false;
1056 in = NULL;
1057 }
1058
1059 if (have_lease)
1060 update_dentry_lease(dn, rinfo->dlease, session,
1061 req->r_request_started);
1062 dout(" final dn %p\n", dn);
1063 i++;
1064 } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
1065 req->r_op == CEPH_MDS_OP_MKSNAP) {
1066 struct dentry *dn = req->r_dentry;
1067
1068 /* fill out a snapdir LOOKUPSNAP dentry */
1069 BUG_ON(!dn);
1070 BUG_ON(!req->r_locked_dir);
1071 BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
1072 ininfo = rinfo->targeti.in;
1073 vino.ino = le64_to_cpu(ininfo->ino);
1074 vino.snap = le64_to_cpu(ininfo->snapid);
1075 in = ceph_get_inode(sb, vino);
1076 if (IS_ERR(in)) {
1077			pr_err("fill_trace get_inode badness %llx.%llx\n",
1078 vino.ino, vino.snap);
1079 err = PTR_ERR(in);
1080 d_delete(dn);
1081 goto done;
1082 }
1083 dout(" linking snapped dir %p to dn %p\n", in, dn);
1084 dn = splice_dentry(dn, in, NULL);
1085 if (IS_ERR(dn)) {
1086 err = PTR_ERR(dn);
1087 goto done;
1088 }
1089 ceph_set_dentry_offset(dn);
1090 req->r_dentry = dn; /* may have spliced */
1091 igrab(in);
1092 rinfo->head->is_dentry = 1; /* fool notrace handlers */
1093 }
1094
1095 if (rinfo->head->is_target) {
1096 vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1097 vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1098
1099 if (in == NULL || ceph_ino(in) != vino.ino ||
1100 ceph_snap(in) != vino.snap) {
1101 in = ceph_get_inode(sb, vino);
1102 if (IS_ERR(in)) {
1103 err = PTR_ERR(in);
1104 goto done;
1105 }
1106 }
1107 req->r_target_inode = in;
1108
1109 err = fill_inode(in,
1110 &rinfo->targeti, NULL,
1111 session, req->r_request_started,
1112 (le32_to_cpu(rinfo->head->result) == 0) ?
1113 req->r_fmode : -1,
1114 &req->r_caps_reservation);
1115 if (err < 0) {
1116 pr_err("fill_inode badness %p %llx.%llx\n",
1117 in, ceph_vinop(in));
1118 goto done;
1119 }
1120 }
1121
1122done:
1123 dout("fill_trace done err=%d\n", err);
1124 return err;
1125}
1126
1127/*
1128 * Prepopulate our cache with readdir results, leases, etc.
1129 */
1130int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1131 struct ceph_mds_session *session)
1132{
1133 struct dentry *parent = req->r_dentry;
1134 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1135 struct qstr dname;
1136 struct dentry *dn;
1137 struct inode *in;
1138 int err = 0, i;
1139 struct inode *snapdir = NULL;
1140 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1141 u64 frag = le32_to_cpu(rhead->args.readdir.frag);
1142 struct ceph_dentry_info *di;
1143
1144 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
1145 snapdir = ceph_get_snapdir(parent->d_inode);
1146 parent = d_find_alias(snapdir);
1147 dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
1148 rinfo->dir_nr, parent);
1149 } else {
1150 dout("readdir_prepopulate %d items under dn %p\n",
1151 rinfo->dir_nr, parent);
1152 if (rinfo->dir_dir)
1153 ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
1154 }
1155
1156 for (i = 0; i < rinfo->dir_nr; i++) {
1157 struct ceph_vino vino;
1158
1159 dname.name = rinfo->dir_dname[i];
1160 dname.len = rinfo->dir_dname_len[i];
1161 dname.hash = full_name_hash(dname.name, dname.len);
1162
1163 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
1164 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
1165
1166retry_lookup:
1167 dn = d_lookup(parent, &dname);
1168 dout("d_lookup on parent=%p name=%.*s got %p\n",
1169 parent, dname.len, dname.name, dn);
1170
1171 if (!dn) {
1172 dn = d_alloc(parent, &dname);
1173 dout("d_alloc %p '%.*s' = %p\n", parent,
1174 dname.len, dname.name, dn);
1175 if (dn == NULL) {
1176 dout("d_alloc badness\n");
1177 err = -ENOMEM;
1178 goto out;
1179 }
1180 err = ceph_init_dentry(dn);
1181 if (err < 0)
1182 goto out;
1183 } else if (dn->d_inode &&
1184 (ceph_ino(dn->d_inode) != vino.ino ||
1185 ceph_snap(dn->d_inode) != vino.snap)) {
1186 dout(" dn %p points to wrong inode %p\n",
1187 dn, dn->d_inode);
1188 d_delete(dn);
1189 dput(dn);
1190 goto retry_lookup;
1191 } else {
1192 /* reorder parent's d_subdirs */
1193 spin_lock(&dcache_lock);
1194 spin_lock(&dn->d_lock);
1195 list_move(&dn->d_u.d_child, &parent->d_subdirs);
1196 spin_unlock(&dn->d_lock);
1197 spin_unlock(&dcache_lock);
1198 }
1199
1200 di = dn->d_fsdata;
1201 di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
1202
1203 /* inode */
1204 if (dn->d_inode) {
1205 in = dn->d_inode;
1206 } else {
1207 in = ceph_get_inode(parent->d_sb, vino);
1208			if (IS_ERR(in)) {
1209				dout("new_inode badness\n");
1210				d_delete(dn);
1211				dput(dn);
1212				err = PTR_ERR(in);
1213 goto out;
1214 }
1215 dn = splice_dentry(dn, in, NULL);
1216 }
1217
1218 if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
1219 req->r_request_started, -1,
1220 &req->r_caps_reservation) < 0) {
1221 pr_err("fill_inode badness on %p\n", in);
1222 dput(dn);
1223 continue;
1224 }
1225 update_dentry_lease(dn, rinfo->dir_dlease[i],
1226 req->r_session, req->r_request_started);
1227 dput(dn);
1228 }
1229 req->r_did_prepopulate = true;
1230
1231out:
1232 if (snapdir) {
1233 iput(snapdir);
1234 dput(parent);
1235 }
1236 dout("readdir_prepopulate done\n");
1237 return err;
1238}
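
/*
 * Note on di->offset above: ceph_make_fpos() (see super.h) packs the
 * readdir position into a single loff_t, roughly
 * ((loff_t)frag << 32) | off, so positions within one frag stay
 * ordered and distinct frags never collide.
 */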
1239
1240int ceph_inode_set_size(struct inode *inode, loff_t size)
1241{
1242 struct ceph_inode_info *ci = ceph_inode(inode);
1243 int ret = 0;
1244
1245 spin_lock(&inode->i_lock);
1246 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
1247 inode->i_size = size;
1248 inode->i_blocks = (size + (1 << 9) - 1) >> 9;
1249
1250 /* tell the MDS if we are approaching max_size */
1251 if ((size << 1) >= ci->i_max_size &&
1252 (ci->i_reported_size << 1) < ci->i_max_size)
1253 ret = 1;
1254
1255 spin_unlock(&inode->i_lock);
1256 return ret;
1257}
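
/*
 * Worked example of the check above: with i_max_size = 4MB, the
 * (size << 1) >= i_max_size test fires once size reaches 2MB, but
 * only while the last reported size is still below that halfway
 * mark -- so we ask the MDS about max_size exactly once per crossing.
 */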
1258
1259/*
1260 * Write back inode data in a worker thread. (This can't be done
1261 * in the message handler context.)
1262 */
1263void ceph_queue_writeback(struct inode *inode)
1264{
1265 if (queue_work(ceph_inode_to_client(inode)->wb_wq,
1266 &ceph_inode(inode)->i_wb_work)) {
1267 dout("ceph_queue_writeback %p\n", inode);
1268 igrab(inode);
1269 } else {
1270 dout("ceph_queue_writeback %p failed\n", inode);
1271 }
1272}
1273
1274static void ceph_writeback_work(struct work_struct *work)
1275{
1276 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1277 i_wb_work);
1278 struct inode *inode = &ci->vfs_inode;
1279
1280 dout("writeback %p\n", inode);
1281 filemap_fdatawrite(&inode->i_data);
1282 iput(inode);
1283}
1284
1285/*
1286 * queue an async invalidation
1287 */
1288void ceph_queue_invalidate(struct inode *inode)
1289{
1290 if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
1291 &ceph_inode(inode)->i_pg_inv_work)) {
1292 dout("ceph_queue_invalidate %p\n", inode);
1293 igrab(inode);
1294 } else {
1295 dout("ceph_queue_invalidate %p failed\n", inode);
1296 }
1297}
1298
1299/*
1300 * invalidate any pages that are not dirty or under writeback. this
1301 * includes pages that are clean and mapped.
1302 */
1303static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
1304{
1305 struct pagevec pvec;
1306 pgoff_t next = 0;
1307 int i;
1308
1309 pagevec_init(&pvec, 0);
1310 while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
1311 for (i = 0; i < pagevec_count(&pvec); i++) {
1312 struct page *page = pvec.pages[i];
1313 pgoff_t index;
1314 int skip_page =
1315 (PageDirty(page) || PageWriteback(page));
1316
1317 if (!skip_page)
1318 skip_page = !trylock_page(page);
1319
1320 /*
1321 * We really shouldn't be looking at the ->index of an
1322 * unlocked page. But we're not allowed to lock these
1323 * pages. So we rely upon nobody altering the ->index
1324 * of this (pinned-by-us) page.
1325 */
1326 index = page->index;
1327 if (index > next)
1328 next = index;
1329 next++;
1330
1331 if (skip_page)
1332 continue;
1333
1334 generic_error_remove_page(mapping, page);
1335 unlock_page(page);
1336 }
1337 pagevec_release(&pvec);
1338 cond_resched();
1339 }
1340}
1341
1342/*
1343 * Invalidate inode pages in a worker thread. (This can't be done
1344 * in the message handler context.)
1345 */
1346static void ceph_invalidate_work(struct work_struct *work)
1347{
1348 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1349 i_pg_inv_work);
1350 struct inode *inode = &ci->vfs_inode;
1351 u32 orig_gen;
1352 int check = 0;
1353
1354 spin_lock(&inode->i_lock);
1355 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1356 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1357 if (ci->i_rdcache_gen == 0 ||
1358 ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1359 BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
1360 /* nevermind! */
1361 ci->i_rdcache_revoking = 0;
1362 spin_unlock(&inode->i_lock);
1363 goto out;
1364 }
1365 orig_gen = ci->i_rdcache_gen;
1366 spin_unlock(&inode->i_lock);
1367
1368 ceph_invalidate_nondirty_pages(inode->i_mapping);
1369
1370 spin_lock(&inode->i_lock);
1371 if (orig_gen == ci->i_rdcache_gen) {
1372 dout("invalidate_pages %p gen %d successful\n", inode,
1373 ci->i_rdcache_gen);
1374 ci->i_rdcache_gen = 0;
1375 ci->i_rdcache_revoking = 0;
1376 check = 1;
1377 } else {
1378 dout("invalidate_pages %p gen %d raced, gen now %d\n",
1379 inode, orig_gen, ci->i_rdcache_gen);
1380 }
1381 spin_unlock(&inode->i_lock);
1382
1383 if (check)
1384 ceph_check_caps(ci, 0, NULL);
1385out:
1386 iput(inode);
1387}
1388
1389
1390/*
1391 * called by trunc_wq; take i_mutex ourselves
1392 *
1393 * We also truncate in a separate thread.
1394 */
1395static void ceph_vmtruncate_work(struct work_struct *work)
1396{
1397 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1398 i_vmtruncate_work);
1399 struct inode *inode = &ci->vfs_inode;
1400
1401 dout("vmtruncate_work %p\n", inode);
1402 mutex_lock(&inode->i_mutex);
1403 __ceph_do_pending_vmtruncate(inode);
1404 mutex_unlock(&inode->i_mutex);
1405 iput(inode);
1406}
1407
1408/*
1409 * Queue an async vmtruncate. If we fail to queue work, we will handle
1410 * the truncation the next time we call __ceph_do_pending_vmtruncate.
1411 */
1412void ceph_queue_vmtruncate(struct inode *inode)
1413{
1414 struct ceph_inode_info *ci = ceph_inode(inode);
1415
1416 if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
1417 &ci->i_vmtruncate_work)) {
1418 dout("ceph_queue_vmtruncate %p\n", inode);
1419 igrab(inode);
1420 } else {
1421 dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
1422 inode, ci->i_truncate_pending);
1423 }
1424}
1425
1426/*
1427 * called with i_mutex held.
1428 *
1429 * Make sure any pending truncation is applied before doing anything
1430 * that may depend on it.
1431 */
1432void __ceph_do_pending_vmtruncate(struct inode *inode)
1433{
1434 struct ceph_inode_info *ci = ceph_inode(inode);
1435 u64 to;
1436 int wrbuffer_refs, wake = 0;
1437
1438retry:
1439 spin_lock(&inode->i_lock);
1440 if (ci->i_truncate_pending == 0) {
1441 dout("__do_pending_vmtruncate %p none pending\n", inode);
1442 spin_unlock(&inode->i_lock);
1443 return;
1444 }
1445
1446 /*
1447 * make sure any dirty snapped pages are flushed before we
1448 * possibly truncate them.. so write AND block!
1449 */
1450 if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
1451 dout("__do_pending_vmtruncate %p flushing snaps first\n",
1452 inode);
1453 spin_unlock(&inode->i_lock);
1454 filemap_write_and_wait_range(&inode->i_data, 0,
1455 inode->i_sb->s_maxbytes);
1456 goto retry;
1457 }
1458
1459 to = ci->i_truncate_size;
1460 wrbuffer_refs = ci->i_wrbuffer_ref;
1461 dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
1462 ci->i_truncate_pending, to);
1463 spin_unlock(&inode->i_lock);
1464
1465 truncate_inode_pages(inode->i_mapping, to);
1466
1467 spin_lock(&inode->i_lock);
1468 ci->i_truncate_pending--;
1469 if (ci->i_truncate_pending == 0)
1470 wake = 1;
1471 spin_unlock(&inode->i_lock);
1472
1473 if (wrbuffer_refs == 0)
1474 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
1475 if (wake)
1476 wake_up(&ci->i_cap_wq);
1477}
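
/*
 * Usage sketch: the caller must already hold i_mutex, as the VFS does
 * when it invokes ceph_setattr (below), which flushes any pending
 * truncation on entry and again after talking to the MDS:
 *
 *	mutex_lock(&inode->i_mutex);
 *	__ceph_do_pending_vmtruncate(inode);
 *	mutex_unlock(&inode->i_mutex);
 */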
1478
1479
1480/*
1481 * symlinks
1482 */
1483static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
1484{
1485 struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
1486 nd_set_link(nd, ci->i_symlink);
1487 return NULL;
1488}
1489
1490static const struct inode_operations ceph_symlink_iops = {
1491 .readlink = generic_readlink,
1492 .follow_link = ceph_sym_follow_link,
1493};
1494
1495/*
1496 * setattr
1497 */
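/*
 * For each attribute below: if we hold the relevant EXCL cap, apply
 * the change locally and mark that cap dirty; otherwise encode the
 * change into a SETATTR request for the MDS and release the
 * corresponding SHARED caps.
 */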
1498int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1499{
1500 struct inode *inode = dentry->d_inode;
1501 struct ceph_inode_info *ci = ceph_inode(inode);
1502 struct inode *parent_inode = dentry->d_parent->d_inode;
1503 const unsigned int ia_valid = attr->ia_valid;
1504 struct ceph_mds_request *req;
1505 struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
1506 int issued;
1507 int release = 0, dirtied = 0;
1508 int mask = 0;
1509 int err = 0;
1510
1511 if (ceph_snap(inode) != CEPH_NOSNAP)
1512 return -EROFS;
1513
1514 __ceph_do_pending_vmtruncate(inode);
1515
1516 err = inode_change_ok(inode, attr);
1517 if (err != 0)
1518 return err;
1519
1520 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
1521 USE_AUTH_MDS);
1522 if (IS_ERR(req))
1523 return PTR_ERR(req);
1524
1525 spin_lock(&inode->i_lock);
1526 issued = __ceph_caps_issued(ci, NULL);
1527 dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
1528
1529 if (ia_valid & ATTR_UID) {
1530 dout("setattr %p uid %d -> %d\n", inode,
1531 inode->i_uid, attr->ia_uid);
1532 if (issued & CEPH_CAP_AUTH_EXCL) {
1533 inode->i_uid = attr->ia_uid;
1534 dirtied |= CEPH_CAP_AUTH_EXCL;
1535 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1536 attr->ia_uid != inode->i_uid) {
1537 req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
1538 mask |= CEPH_SETATTR_UID;
1539 release |= CEPH_CAP_AUTH_SHARED;
1540 }
1541 }
1542 if (ia_valid & ATTR_GID) {
1543 dout("setattr %p gid %d -> %d\n", inode,
1544 inode->i_gid, attr->ia_gid);
1545 if (issued & CEPH_CAP_AUTH_EXCL) {
1546 inode->i_gid = attr->ia_gid;
1547 dirtied |= CEPH_CAP_AUTH_EXCL;
1548 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1549 attr->ia_gid != inode->i_gid) {
1550 req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
1551 mask |= CEPH_SETATTR_GID;
1552 release |= CEPH_CAP_AUTH_SHARED;
1553 }
1554 }
1555 if (ia_valid & ATTR_MODE) {
1556 dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
1557 attr->ia_mode);
1558 if (issued & CEPH_CAP_AUTH_EXCL) {
1559 inode->i_mode = attr->ia_mode;
1560 dirtied |= CEPH_CAP_AUTH_EXCL;
1561 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1562 attr->ia_mode != inode->i_mode) {
1563 req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
1564 mask |= CEPH_SETATTR_MODE;
1565 release |= CEPH_CAP_AUTH_SHARED;
1566 }
1567 }
1568
1569 if (ia_valid & ATTR_ATIME) {
1570 dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
1571 inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
1572 attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
1573 if (issued & CEPH_CAP_FILE_EXCL) {
1574 ci->i_time_warp_seq++;
1575 inode->i_atime = attr->ia_atime;
1576 dirtied |= CEPH_CAP_FILE_EXCL;
1577 } else if ((issued & CEPH_CAP_FILE_WR) &&
1578 timespec_compare(&inode->i_atime,
1579 &attr->ia_atime) < 0) {
1580 inode->i_atime = attr->ia_atime;
1581 dirtied |= CEPH_CAP_FILE_WR;
1582 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1583 !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
1584 ceph_encode_timespec(&req->r_args.setattr.atime,
1585 &attr->ia_atime);
1586 mask |= CEPH_SETATTR_ATIME;
1587 release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
1588 CEPH_CAP_FILE_WR;
1589 }
1590 }
1591 if (ia_valid & ATTR_MTIME) {
1592 dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
1593 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
1594 attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
1595 if (issued & CEPH_CAP_FILE_EXCL) {
1596 ci->i_time_warp_seq++;
1597 inode->i_mtime = attr->ia_mtime;
1598 dirtied |= CEPH_CAP_FILE_EXCL;
1599 } else if ((issued & CEPH_CAP_FILE_WR) &&
1600 timespec_compare(&inode->i_mtime,
1601 &attr->ia_mtime) < 0) {
1602 inode->i_mtime = attr->ia_mtime;
1603 dirtied |= CEPH_CAP_FILE_WR;
1604 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1605 !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
1606 ceph_encode_timespec(&req->r_args.setattr.mtime,
1607 &attr->ia_mtime);
1608 mask |= CEPH_SETATTR_MTIME;
1609 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1610 CEPH_CAP_FILE_WR;
1611 }
1612 }
1613 if (ia_valid & ATTR_SIZE) {
1614 dout("setattr %p size %lld -> %lld\n", inode,
1615 inode->i_size, attr->ia_size);
1616 if (attr->ia_size > inode->i_sb->s_maxbytes) {
1617 err = -EINVAL;
1618 goto out;
1619 }
1620 if ((issued & CEPH_CAP_FILE_EXCL) &&
1621 attr->ia_size > inode->i_size) {
1622 inode->i_size = attr->ia_size;
1623 inode->i_blocks =
1624 (attr->ia_size + (1 << 9) - 1) >> 9;
1625 inode->i_ctime = attr->ia_ctime;
1626 ci->i_reported_size = attr->ia_size;
1627 dirtied |= CEPH_CAP_FILE_EXCL;
1628 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1629 attr->ia_size != inode->i_size) {
1630 req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
1631 req->r_args.setattr.old_size =
1632 cpu_to_le64(inode->i_size);
1633 mask |= CEPH_SETATTR_SIZE;
1634 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1635 CEPH_CAP_FILE_WR;
1636 }
1637 }
1638
1639 /* these do nothing */
1640 if (ia_valid & ATTR_CTIME) {
1641 bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
1642 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
1643 dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
1644 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
1645 attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
1646 only ? "ctime only" : "ignored");
1647 inode->i_ctime = attr->ia_ctime;
1648 if (only) {
1649 /*
1650			 * if the kernel wants to dirty ctime but nothing else,
1651			 * we need to choose a cap to dirty under, or do
1652			 * an almost-no-op setattr
1653 */
1654 if (issued & CEPH_CAP_AUTH_EXCL)
1655 dirtied |= CEPH_CAP_AUTH_EXCL;
1656 else if (issued & CEPH_CAP_FILE_EXCL)
1657 dirtied |= CEPH_CAP_FILE_EXCL;
1658 else if (issued & CEPH_CAP_XATTR_EXCL)
1659 dirtied |= CEPH_CAP_XATTR_EXCL;
1660 else
1661 mask |= CEPH_SETATTR_CTIME;
1662 }
1663 }
1664 if (ia_valid & ATTR_FILE)
1665 dout("setattr %p ATTR_FILE ... hrm!\n", inode);
1666
1667 if (dirtied) {
1668 __ceph_mark_dirty_caps(ci, dirtied);
1669 inode->i_ctime = CURRENT_TIME;
1670 }
1671
1672 release &= issued;
1673 spin_unlock(&inode->i_lock);
1674
1675 if (mask) {
1676 req->r_inode = igrab(inode);
1677 req->r_inode_drop = release;
1678 req->r_args.setattr.mask = cpu_to_le32(mask);
1679 req->r_num_caps = 1;
1680 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
1681 }
1682 dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
1683 ceph_cap_string(dirtied), mask);
1684
1685 ceph_mdsc_put_request(req);
1686 __ceph_do_pending_vmtruncate(inode);
1687 return err;
1688out:
1689 spin_unlock(&inode->i_lock);
1690 ceph_mdsc_put_request(req);
1691 return err;
1692}
1693
1694/*
1695 * Verify that we have a lease on the given mask. If not,
1696 * do a getattr against an mds.
1697 */
1698int ceph_do_getattr(struct inode *inode, int mask)
1699{
1700 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
1701 struct ceph_mds_client *mdsc = &client->mdsc;
1702 struct ceph_mds_request *req;
1703 int err;
1704
1705 if (ceph_snap(inode) == CEPH_SNAPDIR) {
1706 dout("do_getattr inode %p SNAPDIR\n", inode);
1707 return 0;
1708 }
1709
1710 dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
1711 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
1712 return 0;
1713
1714 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
1715 if (IS_ERR(req))
1716 return PTR_ERR(req);
1717 req->r_inode = igrab(inode);
1718 req->r_num_caps = 1;
1719 req->r_args.getattr.mask = cpu_to_le32(mask);
1720 err = ceph_mdsc_do_request(mdsc, NULL, req);
1721 ceph_mdsc_put_request(req);
1722 dout("do_getattr result=%d\n", err);
1723 return err;
1724}
1725
1726
1727/*
1728 * Check inode permissions. We verify we have a valid value for
1729 * the AUTH cap, then call the generic handler.
1730 */
1731int ceph_permission(struct inode *inode, int mask)
1732{
1733 int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
1734
1735 if (!err)
1736 err = generic_permission(inode, mask, NULL);
1737 return err;
1738}
1739
1740/*
1741 * Get all attributes. Hopefully someday we'll have a statlite()
1742 * and can limit the fields we require to be accurate.
1743 */
1744int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1745 struct kstat *stat)
1746{
1747 struct inode *inode = dentry->d_inode;
1748 struct ceph_inode_info *ci = ceph_inode(inode);
1749 int err;
1750
1751 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
1752 if (!err) {
1753 generic_fillattr(inode, stat);
1754 stat->ino = inode->i_ino;
1755 if (ceph_snap(inode) != CEPH_NOSNAP)
1756 stat->dev = ceph_snap(inode);
1757 else
1758 stat->dev = 0;
1759 if (S_ISDIR(inode->i_mode)) {
1760 stat->size = ci->i_rbytes;
1761 stat->blocks = 0;
1762 stat->blksize = 65536;
1763 }
1764 }
1765 return err;
1766}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
new file mode 100644
index 000000000000..8a5bcae62846
--- /dev/null
+++ b/fs/ceph/ioctl.c
@@ -0,0 +1,160 @@
1#include <linux/in.h>
2
3#include "ioctl.h"
4#include "super.h"
5#include "ceph_debug.h"
6
7
8/*
9 * ioctls
10 */
11
12/*
13 * get and set the file layout
14 */
15static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
16{
17 struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
18 struct ceph_ioctl_layout l;
19 int err;
20
21 err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
22 if (!err) {
23 l.stripe_unit = ceph_file_layout_su(ci->i_layout);
24 l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
25 l.object_size = ceph_file_layout_object_size(ci->i_layout);
26 l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
27 l.preferred_osd =
28 (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
29 if (copy_to_user(arg, &l, sizeof(l)))
30 return -EFAULT;
31 }
32
33 return err;
34}
35
36static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
37{
38 struct inode *inode = file->f_dentry->d_inode;
39 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
40 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
41 struct ceph_mds_request *req;
42 struct ceph_ioctl_layout l;
43 int err, i;
44
45 /* copy and validate */
46 if (copy_from_user(&l, arg, sizeof(l)))
47 return -EFAULT;
48
49 if ((l.object_size & ~PAGE_MASK) ||
50 (l.stripe_unit & ~PAGE_MASK) ||
51 !l.stripe_unit ||
52 (l.object_size &&
53 (unsigned)l.object_size % (unsigned)l.stripe_unit))
54 return -EINVAL;
55
56 /* make sure it's a valid data pool */
57 if (l.data_pool > 0) {
58 mutex_lock(&mdsc->mutex);
59 err = -EINVAL;
60 for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
61 if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
62 err = 0;
63 break;
64 }
65 mutex_unlock(&mdsc->mutex);
66 if (err)
67 return err;
68 }
69
70 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
71 USE_AUTH_MDS);
72 if (IS_ERR(req))
73 return PTR_ERR(req);
74 req->r_inode = igrab(inode);
75 req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
76
77 req->r_args.setlayout.layout.fl_stripe_unit =
78 cpu_to_le32(l.stripe_unit);
79 req->r_args.setlayout.layout.fl_stripe_count =
80 cpu_to_le32(l.stripe_count);
81 req->r_args.setlayout.layout.fl_object_size =
82 cpu_to_le32(l.object_size);
83 req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
84 req->r_args.setlayout.layout.fl_pg_preferred =
85 cpu_to_le32(l.preferred_osd);
86
87 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
88 ceph_mdsc_put_request(req);
89 return err;
90}
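
/*
 * Example of the validation above (assuming 4K pages): object_size
 * and stripe_unit must be page multiples, stripe_unit nonzero, and a
 * nonzero object_size a multiple of stripe_unit -- so object_size=4MB
 * with stripe_unit=1MB is accepted, stripe_unit=1MB+512 is not.
 */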
91
92/*
93 * Return object name, size/offset information, and location (OSD
94 * number, network address) for a given file offset.
95 */
96static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
97{
98 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
102 u64 len = 1, olen;
103 u64 tmp;
104 struct ceph_object_layout ol;
105 struct ceph_pg pgid;
106
107 /* copy and validate */
108 if (copy_from_user(&dl, arg, sizeof(dl)))
109 return -EFAULT;
110
111 down_read(&osdc->map_sem);
112 ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
113 &dl.object_no, &dl.object_offset, &olen);
114 dl.file_offset -= dl.object_offset;
115 dl.object_size = ceph_file_layout_object_size(ci->i_layout);
116 dl.block_size = ceph_file_layout_su(ci->i_layout);
117
118 /* block_offset = object_offset % block_size */
119 tmp = dl.object_offset;
120 dl.block_offset = do_div(tmp, dl.block_size);
121
122 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
123 ceph_ino(inode), dl.object_no);
124 ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
125 osdc->osdmap);
126
127 pgid = ol.ol_pgid;
128 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
129 if (dl.osd >= 0) {
130 struct ceph_entity_addr *a =
131 ceph_osd_addr(osdc->osdmap, dl.osd);
132 if (a)
133 memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
134 } else {
135 memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
136 }
137 up_read(&osdc->map_sem);
138
139 /* send result back to user */
140 if (copy_to_user(arg, &dl, sizeof(dl)))
141 return -EFAULT;
142
143 return 0;
144}
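
/*
 * Worked example (assuming the simple default layout, stripe_count=1,
 * stripe_unit == object_size == 4MB): file_offset 5MB maps to
 * object_no 1 at object_offset 1MB; the adjustment above then sets
 * dl.file_offset to 4MB (the start of that object) and
 * dl.block_offset to 1MB % 4MB = 1MB.
 */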
145
146long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
147{
148 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
149 switch (cmd) {
150 case CEPH_IOC_GET_LAYOUT:
151 return ceph_ioctl_get_layout(file, (void __user *)arg);
152
153 case CEPH_IOC_SET_LAYOUT:
154 return ceph_ioctl_set_layout(file, (void __user *)arg);
155
156 case CEPH_IOC_GET_DATALOC:
157 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
158 }
159 return -ENOTTY;
160}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
new file mode 100644
index 000000000000..25e4f1a9d059
--- /dev/null
+++ b/fs/ceph/ioctl.h
@@ -0,0 +1,40 @@
1#ifndef FS_CEPH_IOCTL_H
2#define FS_CEPH_IOCTL_H
3
4#include <linux/ioctl.h>
5#include <linux/types.h>
6
7#define CEPH_IOCTL_MAGIC 0x97
8
9/* just use u64 to align sanely on all archs */
10struct ceph_ioctl_layout {
11 __u64 stripe_unit, stripe_count, object_size;
12 __u64 data_pool;
13 __s64 preferred_osd;
14};
15
16#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \
17 struct ceph_ioctl_layout)
18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
19 struct ceph_ioctl_layout)
20
21/*
22 * Extract the object name, and the identity and address of the OSD,
23 * for a given file offset.
24 */
25struct ceph_ioctl_dataloc {
26 __u64 file_offset; /* in+out: file offset */
27 __u64 object_offset; /* out: offset in object */
28 __u64 object_no; /* out: object # */
29 __u64 object_size; /* out: object size */
30 char object_name[64]; /* out: object name */
31 __u64 block_offset; /* out: offset in block */
32 __u64 block_size; /* out: block length */
33 __s64 osd; /* out: osd # */
34 struct sockaddr_storage osd_addr; /* out: osd address */
35};
36
37#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
38 struct ceph_ioctl_dataloc)
39
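/*
 * Hypothetical userspace sketch (not part of this interface): fetch
 * the layout of an open file on a mounted ceph filesystem.
 *
 *	struct ceph_ioctl_layout l;
 *	if (ioctl(fd, CEPH_IOC_GET_LAYOUT, &l) == 0)
 *		printf("stripe_unit %llu object_size %llu\n",
 *		       (unsigned long long)l.stripe_unit,
 *		       (unsigned long long)l.object_size);
 */
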
40#endif
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
new file mode 100644
index 000000000000..60a9a4ae47be
--- /dev/null
+++ b/fs/ceph/mds_client.c
@@ -0,0 +1,3043 @@
1#include "ceph_debug.h"
2
3#include <linux/wait.h>
4#include <linux/slab.h>
5#include <linux/sched.h>
6
7#include "mds_client.h"
8#include "mon_client.h"
9#include "super.h"
10#include "messenger.h"
11#include "decode.h"
12#include "auth.h"
13#include "pagelist.h"
14
15/*
16 * A cluster of MDS (metadata server) daemons is responsible for
17 * managing the file system namespace (the directory hierarchy and
18 * inodes) and for coordinating shared access to storage. Metadata is
19 * partitioned hierarchically across a number of servers, and that
20 * partition varies over time as the cluster adjusts the distribution
21 * in order to balance load.
22 *
23 * The MDS client is primarily responsible for managing synchronous
24 * metadata requests for operations like open, unlink, and so forth.
25 * If there is an MDS failure, we find out about it when we (possibly
26 * request and) receive a new MDS map, and can resubmit affected
27 * requests.
28 *
29 * For the most part, though, we take advantage of a lossless
30 * communications channel to the MDS, and do not need to worry about
31 * timing out or resubmitting requests.
32 *
33 * We maintain a stateful "session" with each MDS we interact with.
34 * Within each session, we send periodic heartbeat messages to ensure
35 * any capabilities or leases we have been issued remain valid. If
36 * the session times out and goes stale, our leases and capabilities
37 * are no longer valid.
38 */
39
40static void __wake_requests(struct ceph_mds_client *mdsc,
41 struct list_head *head);
42
43static const struct ceph_connection_operations mds_con_ops;
44
45
46/*
47 * mds reply parsing
48 */
49
50/*
51 * parse individual inode info
52 */
53static int parse_reply_info_in(void **p, void *end,
54 struct ceph_mds_reply_info_in *info)
55{
56 int err = -EIO;
57
58 info->in = *p;
59 *p += sizeof(struct ceph_mds_reply_inode) +
60 sizeof(*info->in->fragtree.splits) *
61 le32_to_cpu(info->in->fragtree.nsplits);
62
63 ceph_decode_32_safe(p, end, info->symlink_len, bad);
64 ceph_decode_need(p, end, info->symlink_len, bad);
65 info->symlink = *p;
66 *p += info->symlink_len;
67
68 ceph_decode_32_safe(p, end, info->xattr_len, bad);
69 ceph_decode_need(p, end, info->xattr_len, bad);
70 info->xattr_data = *p;
71 *p += info->xattr_len;
72 return 0;
73bad:
74 return err;
75}
76
77/*
78 * parse a normal reply, which may contain a (dir+)dentry and/or a
79 * target inode.
80 */
81static int parse_reply_info_trace(void **p, void *end,
82 struct ceph_mds_reply_info_parsed *info)
83{
84 int err;
85
86 if (info->head->is_dentry) {
87 err = parse_reply_info_in(p, end, &info->diri);
88 if (err < 0)
89 goto out_bad;
90
91 if (unlikely(*p + sizeof(*info->dirfrag) > end))
92 goto bad;
93 info->dirfrag = *p;
94 *p += sizeof(*info->dirfrag) +
95 sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
96 if (unlikely(*p > end))
97 goto bad;
98
99 ceph_decode_32_safe(p, end, info->dname_len, bad);
100 ceph_decode_need(p, end, info->dname_len, bad);
101 info->dname = *p;
102 *p += info->dname_len;
103 info->dlease = *p;
104 *p += sizeof(*info->dlease);
105 }
106
107 if (info->head->is_target) {
108 err = parse_reply_info_in(p, end, &info->targeti);
109 if (err < 0)
110 goto out_bad;
111 }
112
113 if (unlikely(*p != end))
114 goto bad;
115 return 0;
116
117bad:
118 err = -EIO;
119out_bad:
120 pr_err("problem parsing mds trace %d\n", err);
121 return err;
122}
123
124/*
125 * parse readdir results
126 */
127static int parse_reply_info_dir(void **p, void *end,
128 struct ceph_mds_reply_info_parsed *info)
129{
130 u32 num, i = 0;
131 int err;
132
133 info->dir_dir = *p;
134 if (*p + sizeof(*info->dir_dir) > end)
135 goto bad;
136 *p += sizeof(*info->dir_dir) +
137 sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
138 if (*p > end)
139 goto bad;
140
141 ceph_decode_need(p, end, sizeof(num) + 2, bad);
142 num = ceph_decode_32(p);
143 info->dir_end = ceph_decode_8(p);
144 info->dir_complete = ceph_decode_8(p);
145 if (num == 0)
146 goto done;
147
148 /* alloc large array */
149 info->dir_nr = num;
150 info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
151 sizeof(*info->dir_dname) +
152 sizeof(*info->dir_dname_len) +
153 sizeof(*info->dir_dlease),
154 GFP_NOFS);
155 if (info->dir_in == NULL) {
156 err = -ENOMEM;
157 goto out_bad;
158 }
159 info->dir_dname = (void *)(info->dir_in + num);
160 info->dir_dname_len = (void *)(info->dir_dname + num);
161 info->dir_dlease = (void *)(info->dir_dname_len + num);
162
163 while (num) {
164 /* dentry */
165 ceph_decode_need(p, end, sizeof(u32)*2, bad);
166 info->dir_dname_len[i] = ceph_decode_32(p);
167 ceph_decode_need(p, end, info->dir_dname_len[i], bad);
168 info->dir_dname[i] = *p;
169 *p += info->dir_dname_len[i];
170 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
171 info->dir_dname[i]);
172 info->dir_dlease[i] = *p;
173 *p += sizeof(struct ceph_mds_reply_lease);
174
175 /* inode */
176 err = parse_reply_info_in(p, end, &info->dir_in[i]);
177 if (err < 0)
178 goto out_bad;
179 i++;
180 num--;
181 }
182
183done:
184 if (*p != end)
185 goto bad;
186 return 0;
187
188bad:
189 err = -EIO;
190out_bad:
191 pr_err("problem parsing dir contents %d\n", err);
192 return err;
193}
194
195/*
196 * parse entire mds reply
197 */
198static int parse_reply_info(struct ceph_msg *msg,
199 struct ceph_mds_reply_info_parsed *info)
200{
201 void *p, *end;
202 u32 len;
203 int err;
204
205 info->head = msg->front.iov_base;
206 p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
207 end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
208
209 /* trace */
210 ceph_decode_32_safe(&p, end, len, bad);
211 if (len > 0) {
212 err = parse_reply_info_trace(&p, p+len, info);
213 if (err < 0)
214 goto out_bad;
215 }
216
217 /* dir content */
218 ceph_decode_32_safe(&p, end, len, bad);
219 if (len > 0) {
220 err = parse_reply_info_dir(&p, p+len, info);
221 if (err < 0)
222 goto out_bad;
223 }
224
225 /* snap blob */
226 ceph_decode_32_safe(&p, end, len, bad);
227 info->snapblob_len = len;
228 info->snapblob = p;
229 p += len;
230
231 if (p != end)
232 goto bad;
233 return 0;
234
235bad:
236 err = -EIO;
237out_bad:
238 pr_err("mds parse_reply err %d\n", err);
239 return err;
240}
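
/*
 * For reference, the reply layout decoded above is:
 *
 *	struct ceph_mds_reply_head
 *	u32 trace_len,    trace_len bytes    (parse_reply_info_trace)
 *	u32 dir_len,      dir_len bytes      (parse_reply_info_dir)
 *	u32 snapblob_len, snapblob_len bytes (kept as an opaque blob)
 */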
241
242static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
243{
244 kfree(info->dir_in);
245}
246
247
248/*
249 * sessions
250 */
251static const char *session_state_name(int s)
252{
253 switch (s) {
254 case CEPH_MDS_SESSION_NEW: return "new";
255 case CEPH_MDS_SESSION_OPENING: return "opening";
256 case CEPH_MDS_SESSION_OPEN: return "open";
257 case CEPH_MDS_SESSION_HUNG: return "hung";
258 case CEPH_MDS_SESSION_CLOSING: return "closing";
259 case CEPH_MDS_SESSION_RESTARTING: return "restarting";
260 case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
261 default: return "???";
262 }
263}
264
265static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
266{
267 if (atomic_inc_not_zero(&s->s_ref)) {
268 dout("mdsc get_session %p %d -> %d\n", s,
269 atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
270 return s;
271 } else {
272		dout("mdsc get_session %p 0 -- FAIL\n", s);
273 return NULL;
274 }
275}
276
277void ceph_put_mds_session(struct ceph_mds_session *s)
278{
279 dout("mdsc put_session %p %d -> %d\n", s,
280 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
281 if (atomic_dec_and_test(&s->s_ref)) {
282 if (s->s_authorizer)
283 s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
284 s->s_mdsc->client->monc.auth, s->s_authorizer);
285 kfree(s);
286 }
287}
288
289/*
290 * called under mdsc->mutex
291 */
292struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
293 int mds)
294{
295 struct ceph_mds_session *session;
296
297 if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
298 return NULL;
299 session = mdsc->sessions[mds];
300 dout("lookup_mds_session %p %d\n", session,
301 atomic_read(&session->s_ref));
302 get_session(session);
303 return session;
304}
305
306static bool __have_session(struct ceph_mds_client *mdsc, int mds)
307{
308 if (mds >= mdsc->max_sessions)
309 return false;
310 return mdsc->sessions[mds];
311}
312
313static int __verify_registered_session(struct ceph_mds_client *mdsc,
314 struct ceph_mds_session *s)
315{
316 if (s->s_mds >= mdsc->max_sessions ||
317 mdsc->sessions[s->s_mds] != s)
318 return -ENOENT;
319 return 0;
320}
321
322/*
323 * create+register a new session for given mds.
324 * called under mdsc->mutex.
325 */
326static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
327 int mds)
328{
329 struct ceph_mds_session *s;
330
331 s = kzalloc(sizeof(*s), GFP_NOFS);
332 if (!s)
333 return ERR_PTR(-ENOMEM);
334 s->s_mdsc = mdsc;
335 s->s_mds = mds;
336 s->s_state = CEPH_MDS_SESSION_NEW;
337 s->s_ttl = 0;
338 s->s_seq = 0;
339 mutex_init(&s->s_mutex);
340
341 ceph_con_init(mdsc->client->msgr, &s->s_con);
342 s->s_con.private = s;
343 s->s_con.ops = &mds_con_ops;
344 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
345 s->s_con.peer_name.num = cpu_to_le64(mds);
346
347 spin_lock_init(&s->s_cap_lock);
348 s->s_cap_gen = 0;
349 s->s_cap_ttl = 0;
350 s->s_renew_requested = 0;
351 s->s_renew_seq = 0;
352 INIT_LIST_HEAD(&s->s_caps);
353 s->s_nr_caps = 0;
354 s->s_trim_caps = 0;
355 atomic_set(&s->s_ref, 1);
356 INIT_LIST_HEAD(&s->s_waiting);
357 INIT_LIST_HEAD(&s->s_unsafe);
358 s->s_num_cap_releases = 0;
359 s->s_cap_iterator = NULL;
360 INIT_LIST_HEAD(&s->s_cap_releases);
361 INIT_LIST_HEAD(&s->s_cap_releases_done);
362 INIT_LIST_HEAD(&s->s_cap_flushing);
363 INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
364
365 dout("register_session mds%d\n", mds);
366 if (mds >= mdsc->max_sessions) {
367 int newmax = 1 << get_count_order(mds+1);
368 struct ceph_mds_session **sa;
369
370 dout("register_session realloc to %d\n", newmax);
371 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
372 if (sa == NULL)
373 goto fail_realloc;
374 if (mdsc->sessions) {
375 memcpy(sa, mdsc->sessions,
376 mdsc->max_sessions * sizeof(void *));
377 kfree(mdsc->sessions);
378 }
379 mdsc->sessions = sa;
380 mdsc->max_sessions = newmax;
381 }
382 mdsc->sessions[mds] = s;
383 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
384
385 ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
386
387 return s;
388
389fail_realloc:
390 kfree(s);
391 return ERR_PTR(-ENOMEM);
392}
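
/*
 * Example of the growth above: registering mds5 when max_sessions is
 * 4 reallocates to 1 << get_count_order(6) == 8 slots, so the table
 * always grows in powers of two.
 */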
393
394/*
395 * called under mdsc->mutex
396 */
397static void __unregister_session(struct ceph_mds_client *mdsc,
398 struct ceph_mds_session *s)
399{
400 dout("__unregister_session mds%d %p\n", s->s_mds, s);
401 BUG_ON(mdsc->sessions[s->s_mds] != s);
402 mdsc->sessions[s->s_mds] = NULL;
403 ceph_con_close(&s->s_con);
404 ceph_put_mds_session(s);
405}
406
407/*
408 * drop session refs in request.
409 *
410 * should be last request ref, or hold mdsc->mutex
411 */
412static void put_request_session(struct ceph_mds_request *req)
413{
414 if (req->r_session) {
415 ceph_put_mds_session(req->r_session);
416 req->r_session = NULL;
417 }
418}
419
420void ceph_mdsc_release_request(struct kref *kref)
421{
422 struct ceph_mds_request *req = container_of(kref,
423 struct ceph_mds_request,
424 r_kref);
425 if (req->r_request)
426 ceph_msg_put(req->r_request);
427 if (req->r_reply) {
428 ceph_msg_put(req->r_reply);
429 destroy_reply_info(&req->r_reply_info);
430 }
431 if (req->r_inode) {
432 ceph_put_cap_refs(ceph_inode(req->r_inode),
433 CEPH_CAP_PIN);
434 iput(req->r_inode);
435 }
436 if (req->r_locked_dir)
437 ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
438 CEPH_CAP_PIN);
439 if (req->r_target_inode)
440 iput(req->r_target_inode);
441 if (req->r_dentry)
442 dput(req->r_dentry);
443 if (req->r_old_dentry) {
444 ceph_put_cap_refs(
445 ceph_inode(req->r_old_dentry->d_parent->d_inode),
446 CEPH_CAP_PIN);
447 dput(req->r_old_dentry);
448 }
449 kfree(req->r_path1);
450 kfree(req->r_path2);
451 put_request_session(req);
452 ceph_unreserve_caps(&req->r_caps_reservation);
453 kfree(req);
454}
455
456/*
457 * lookup request, bump ref if found.
458 *
459 * called under mdsc->mutex.
460 */
461static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
462 u64 tid)
463{
464 struct ceph_mds_request *req;
465 struct rb_node *n = mdsc->request_tree.rb_node;
466
467 while (n) {
468 req = rb_entry(n, struct ceph_mds_request, r_node);
469 if (tid < req->r_tid)
470 n = n->rb_left;
471 else if (tid > req->r_tid)
472 n = n->rb_right;
473 else {
474 ceph_mdsc_get_request(req);
475 return req;
476 }
477 }
478 return NULL;
479}
480
481static void __insert_request(struct ceph_mds_client *mdsc,
482 struct ceph_mds_request *new)
483{
484 struct rb_node **p = &mdsc->request_tree.rb_node;
485 struct rb_node *parent = NULL;
486 struct ceph_mds_request *req = NULL;
487
488 while (*p) {
489 parent = *p;
490 req = rb_entry(parent, struct ceph_mds_request, r_node);
491 if (new->r_tid < req->r_tid)
492 p = &(*p)->rb_left;
493 else if (new->r_tid > req->r_tid)
494 p = &(*p)->rb_right;
495 else
496 BUG();
497 }
498
499 rb_link_node(&new->r_node, parent, p);
500 rb_insert_color(&new->r_node, &mdsc->request_tree);
501}
502
503/*
504 * Register an in-flight request, and assign a tid. Link to the
505 * directory we are modifying (if any).
506 *
507 * Called under mdsc->mutex.
508 */
509static void __register_request(struct ceph_mds_client *mdsc,
510 struct ceph_mds_request *req,
511 struct inode *dir)
512{
513 req->r_tid = ++mdsc->last_tid;
514 if (req->r_num_caps)
515 ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
516 dout("__register_request %p tid %lld\n", req, req->r_tid);
517 ceph_mdsc_get_request(req);
518 __insert_request(mdsc, req);
519
520 if (dir) {
521 struct ceph_inode_info *ci = ceph_inode(dir);
522
523 spin_lock(&ci->i_unsafe_lock);
524 req->r_unsafe_dir = dir;
525 list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
526 spin_unlock(&ci->i_unsafe_lock);
527 }
528}
529
530static void __unregister_request(struct ceph_mds_client *mdsc,
531 struct ceph_mds_request *req)
532{
533 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
534 rb_erase(&req->r_node, &mdsc->request_tree);
535 RB_CLEAR_NODE(&req->r_node);
536
537 if (req->r_unsafe_dir) {
538 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
539
540 spin_lock(&ci->i_unsafe_lock);
541 list_del_init(&req->r_unsafe_dir_item);
542 spin_unlock(&ci->i_unsafe_lock);
543 }
544
545 ceph_mdsc_put_request(req);
546}
547
548/*
549 * Choose mds to send request to next. If there is a hint set in the
550 * request (e.g., due to a prior forward hint from the mds), use that.
551 * Otherwise, consult frag tree and/or caps to identify the
552 * appropriate mds. If all else fails, choose randomly.
553 *
554 * Called under mdsc->mutex.
555 */
556static int __choose_mds(struct ceph_mds_client *mdsc,
557 struct ceph_mds_request *req)
558{
559 struct inode *inode;
560 struct ceph_inode_info *ci;
561 struct ceph_cap *cap;
562 int mode = req->r_direct_mode;
563 int mds = -1;
564 u32 hash = req->r_direct_hash;
565 bool is_hash = req->r_direct_is_hash;
566
567 /*
568 * is there a specific mds we should try? ignore hint if we have
569 * no session and the mds is not up (active or recovering).
570 */
571 if (req->r_resend_mds >= 0 &&
572 (__have_session(mdsc, req->r_resend_mds) ||
573 ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
574 dout("choose_mds using resend_mds mds%d\n",
575 req->r_resend_mds);
576 return req->r_resend_mds;
577 }
578
579 if (mode == USE_RANDOM_MDS)
580 goto random;
581
582 inode = NULL;
583 if (req->r_inode) {
584 inode = req->r_inode;
585 } else if (req->r_dentry) {
586 if (req->r_dentry->d_inode) {
587 inode = req->r_dentry->d_inode;
588 } else {
589 inode = req->r_dentry->d_parent->d_inode;
590 hash = req->r_dentry->d_name.hash;
591 is_hash = true;
592 }
593 }
594 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
595 (int)hash, mode);
596 if (!inode)
597 goto random;
598 ci = ceph_inode(inode);
599
600 if (is_hash && S_ISDIR(inode->i_mode)) {
601 struct ceph_inode_frag frag;
602 int found;
603
604 ceph_choose_frag(ci, hash, &frag, &found);
605 if (found) {
606 if (mode == USE_ANY_MDS && frag.ndist > 0) {
607 u8 r;
608
609 /* choose a random replica */
610 get_random_bytes(&r, 1);
611 r %= frag.ndist;
612 mds = frag.dist[r];
613 dout("choose_mds %p %llx.%llx "
614 "frag %u mds%d (%d/%d)\n",
615 inode, ceph_vinop(inode),
616 frag.frag, frag.mds,
617 (int)r, frag.ndist);
618 return mds;
619 }
620
621 /* since this file/dir wasn't known to be
622			 * replicated, we want to look for the
623 * authoritative mds. */
624 mode = USE_AUTH_MDS;
625 if (frag.mds >= 0) {
626 /* choose auth mds */
627 mds = frag.mds;
628 dout("choose_mds %p %llx.%llx "
629 "frag %u mds%d (auth)\n",
630 inode, ceph_vinop(inode), frag.frag, mds);
631 return mds;
632 }
633 }
634 }
635
636 spin_lock(&inode->i_lock);
637 cap = NULL;
638 if (mode == USE_AUTH_MDS)
639 cap = ci->i_auth_cap;
640 if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
641 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
642 if (!cap) {
643 spin_unlock(&inode->i_lock);
644 goto random;
645 }
646 mds = cap->session->s_mds;
647 dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
648 inode, ceph_vinop(inode), mds,
649 cap == ci->i_auth_cap ? "auth " : "", cap);
650 spin_unlock(&inode->i_lock);
651 return mds;
652
653random:
654 mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
655 dout("choose_mds chose random mds%d\n", mds);
656 return mds;
657}
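
/*
 * Summary of the selection order implemented above:
 *
 *	1. explicit resend hint (r_resend_mds), if usable
 *	2. dir frag tree: a random replica (USE_ANY_MDS) or the auth mds
 *	3. the session holding our auth cap (or any cap) on the inode
 *	4. a random mds from the mdsmap
 */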
658
659
660/*
661 * session messages
662 */
663static struct ceph_msg *create_session_msg(u32 op, u64 seq)
664{
665 struct ceph_msg *msg;
666 struct ceph_mds_session_head *h;
667
668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
669 if (IS_ERR(msg)) {
670 pr_err("create_session_msg ENOMEM creating msg\n");
671 return ERR_PTR(PTR_ERR(msg));
672 }
673 h = msg->front.iov_base;
674 h->op = cpu_to_le32(op);
675 h->seq = cpu_to_le64(seq);
676 return msg;
677}
678
679/*
680 * send session open request.
681 *
682 * called under mdsc->mutex
683 */
684static int __open_session(struct ceph_mds_client *mdsc,
685 struct ceph_mds_session *session)
686{
687 struct ceph_msg *msg;
688 int mstate;
689 int mds = session->s_mds;
690 int err = 0;
691
692 /* wait for mds to go active? */
693 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
694 dout("open_session to mds%d (%s)\n", mds,
695 ceph_mds_state_name(mstate));
696 session->s_state = CEPH_MDS_SESSION_OPENING;
697 session->s_renew_requested = jiffies;
698
699 /* send connect message */
700 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
701 if (IS_ERR(msg)) {
702 err = PTR_ERR(msg);
703 goto out;
704 }
705 ceph_con_send(&session->s_con, msg);
706
707out:
708	return err;
709}
710
711/*
712 * session caps
713 */
714
715/*
716 * Free preallocated cap messages assigned to this session
717 */
718static void cleanup_cap_releases(struct ceph_mds_session *session)
719{
720 struct ceph_msg *msg;
721
722 spin_lock(&session->s_cap_lock);
723 while (!list_empty(&session->s_cap_releases)) {
724 msg = list_first_entry(&session->s_cap_releases,
725 struct ceph_msg, list_head);
726 list_del_init(&msg->list_head);
727 ceph_msg_put(msg);
728 }
729 while (!list_empty(&session->s_cap_releases_done)) {
730 msg = list_first_entry(&session->s_cap_releases_done,
731 struct ceph_msg, list_head);
732 list_del_init(&msg->list_head);
733 ceph_msg_put(msg);
734 }
735 spin_unlock(&session->s_cap_lock);
736}
737
738/*
739 * Helper to safely iterate over all caps associated with a session.
740 *
741 * caller must hold session s_mutex
742 */
743static int iterate_session_caps(struct ceph_mds_session *session,
744 int (*cb)(struct inode *, struct ceph_cap *,
745 void *), void *arg)
746{
747 struct list_head *p;
748 struct ceph_cap *cap;
749 struct inode *inode, *last_inode = NULL;
750 struct ceph_cap *old_cap = NULL;
751 int ret;
752
753 dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
754 spin_lock(&session->s_cap_lock);
755 p = session->s_caps.next;
756 while (p != &session->s_caps) {
757 cap = list_entry(p, struct ceph_cap, session_caps);
758 inode = igrab(&cap->ci->vfs_inode);
759 if (!inode) {
760 p = p->next;
761 continue;
762 }
763 session->s_cap_iterator = cap;
764 spin_unlock(&session->s_cap_lock);
765
766 if (last_inode) {
767 iput(last_inode);
768 last_inode = NULL;
769 }
770 if (old_cap) {
771 ceph_put_cap(old_cap);
772 old_cap = NULL;
773 }
774
775 ret = cb(inode, cap, arg);
776 last_inode = inode;
777
778 spin_lock(&session->s_cap_lock);
779 p = p->next;
780 if (cap->ci == NULL) {
781 dout("iterate_session_caps finishing cap %p removal\n",
782 cap);
783 BUG_ON(cap->session != session);
784 list_del_init(&cap->session_caps);
785 session->s_nr_caps--;
786 cap->session = NULL;
787 old_cap = cap; /* put_cap it w/o locks held */
788 }
789 if (ret < 0)
790 goto out;
791 }
792 ret = 0;
793out:
794 session->s_cap_iterator = NULL;
795 spin_unlock(&session->s_cap_lock);
796
797 if (last_inode)
798 iput(last_inode);
799 if (old_cap)
800 ceph_put_cap(old_cap);
801
802 return ret;
803}
804
805static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
806 void *arg)
807{
808 struct ceph_inode_info *ci = ceph_inode(inode);
809 dout("removing cap %p, ci is %p, inode is %p\n",
810 cap, ci, &ci->vfs_inode);
811 ceph_remove_cap(cap);
812 return 0;
813}
814
815/*
816 * caller must hold session s_mutex
817 */
818static void remove_session_caps(struct ceph_mds_session *session)
819{
820 dout("remove_session_caps on %p\n", session);
821 iterate_session_caps(session, remove_session_caps_cb, NULL);
822 BUG_ON(session->s_nr_caps > 0);
823 cleanup_cap_releases(session);
824}
825
826/*
827 * wake up any threads waiting on this session's caps. if the cap is
828 * old (didn't get renewed on the client reconnect), remove it now.
829 *
830 * caller must hold s_mutex.
831 */
832static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
833 void *arg)
834{
835 struct ceph_inode_info *ci = ceph_inode(inode);
836
837 wake_up(&ci->i_cap_wq);
838 if (arg) {
839 spin_lock(&inode->i_lock);
840 ci->i_wanted_max_size = 0;
841 ci->i_requested_max_size = 0;
842 spin_unlock(&inode->i_lock);
843 }
844 return 0;
845}
846
847static void wake_up_session_caps(struct ceph_mds_session *session,
848 int reconnect)
849{
850 dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
851 iterate_session_caps(session, wake_up_session_cb,
852 (void *)(unsigned long)reconnect);
853}
854
855/*
856 * Send periodic message to MDS renewing all currently held caps. The
857 * ack will reset the expiration for all caps from this session.
858 *
859 * caller holds s_mutex
860 */
861static int send_renew_caps(struct ceph_mds_client *mdsc,
862 struct ceph_mds_session *session)
863{
864 struct ceph_msg *msg;
865 int state;
866
867 if (time_after_eq(jiffies, session->s_cap_ttl) &&
868 time_after_eq(session->s_cap_ttl, session->s_renew_requested))
869 pr_info("mds%d caps stale\n", session->s_mds);
870 session->s_renew_requested = jiffies;
871
872 /* do not try to renew caps until a recovering mds has reconnected
873 * with its clients. */
874 state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
875 if (state < CEPH_MDS_STATE_RECONNECT) {
876 dout("send_renew_caps ignoring mds%d (%s)\n",
877 session->s_mds, ceph_mds_state_name(state));
878 return 0;
879 }
880
881 dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
882 ceph_mds_state_name(state));
883 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
884 ++session->s_renew_seq);
885 if (IS_ERR(msg))
886 return PTR_ERR(msg);
887 ceph_con_send(&session->s_con, msg);
888 return 0;
889}
890
891/*
892 * Note new cap ttl, and any transition from stale -> not stale (fresh?).
893 *
894 * Called under session->s_mutex
895 */
896static void renewed_caps(struct ceph_mds_client *mdsc,
897 struct ceph_mds_session *session, int is_renew)
898{
899 int was_stale;
900 int wake = 0;
901
902 spin_lock(&session->s_cap_lock);
903 was_stale = is_renew && (session->s_cap_ttl == 0 ||
904 time_after_eq(jiffies, session->s_cap_ttl));
905
906 session->s_cap_ttl = session->s_renew_requested +
907 mdsc->mdsmap->m_session_timeout*HZ;
908
909 if (was_stale) {
910 if (time_before(jiffies, session->s_cap_ttl)) {
911 pr_info("mds%d caps renewed\n", session->s_mds);
912 wake = 1;
913 } else {
914 pr_info("mds%d caps still stale\n", session->s_mds);
915 }
916 }
917 dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
918 session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
919 time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
920 spin_unlock(&session->s_cap_lock);
921
922 if (wake)
923 wake_up_session_caps(session, 0);
924}
925
926/*
927 * send a session close request
928 */
929static int request_close_session(struct ceph_mds_client *mdsc,
930 struct ceph_mds_session *session)
931{
932 struct ceph_msg *msg;
933 int err = 0;
934
935 dout("request_close_session mds%d state %s seq %lld\n",
936 session->s_mds, session_state_name(session->s_state),
937 session->s_seq);
938 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
939 if (IS_ERR(msg))
940 err = PTR_ERR(msg);
941 else
942 ceph_con_send(&session->s_con, msg);
943 return err;
944}
945
946/*
947 * Called with s_mutex held.
948 */
949static int __close_session(struct ceph_mds_client *mdsc,
950 struct ceph_mds_session *session)
951{
952 if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
953 return 0;
954 session->s_state = CEPH_MDS_SESSION_CLOSING;
955 return request_close_session(mdsc, session);
956}
957
958/*
959 * Trim old(er) caps.
960 *
961 * Because we can't cache an inode without one or more caps, we do
962 * this indirectly: if a cap is unused, we prune its aliases, at which
963 * point the inode will hopefully get dropped too.
964 *
965 * Yes, this is a bit sloppy. Our only real goal here is to respond to
966 * memory pressure from the MDS, though, so it needn't be perfect.
967 */
968static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
969{
970 struct ceph_mds_session *session = arg;
971 struct ceph_inode_info *ci = ceph_inode(inode);
972 int used, oissued, mine;
973
974 if (session->s_trim_caps <= 0)
975 return -1;
976
977 spin_lock(&inode->i_lock);
978 mine = cap->issued | cap->implemented;
979 used = __ceph_caps_used(ci);
980 oissued = __ceph_caps_issued_other(ci, cap);
981
982 dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
983 inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
984 ceph_cap_string(used));
985 if (ci->i_dirty_caps)
986 goto out; /* dirty caps */
987 if ((used & ~oissued) & mine)
988 goto out; /* we need these caps */
989
990 session->s_trim_caps--;
991 if (oissued) {
992 /* we aren't the only cap.. just remove us */
993 __ceph_remove_cap(cap);
994 } else {
995 /* try to drop referring dentries */
996 spin_unlock(&inode->i_lock);
997 d_prune_aliases(inode);
998 dout("trim_caps_cb %p cap %p pruned, count now %d\n",
999 inode, cap, atomic_read(&inode->i_count));
1000 return 0;
1001 }
1002
1003out:
1004 spin_unlock(&inode->i_lock);
1005 return 0;
1006}
1007
1008/*
1009 * Trim session cap count down to some max number.
1010 */
1011static int trim_caps(struct ceph_mds_client *mdsc,
1012 struct ceph_mds_session *session,
1013 int max_caps)
1014{
1015 int trim_caps = session->s_nr_caps - max_caps;
1016
1017 dout("trim_caps mds%d start: %d / %d, trim %d\n",
1018 session->s_mds, session->s_nr_caps, max_caps, trim_caps);
1019 if (trim_caps > 0) {
1020 session->s_trim_caps = trim_caps;
1021 iterate_session_caps(session, trim_caps_cb, session);
1022 dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
1023 session->s_mds, session->s_nr_caps, max_caps,
1024 trim_caps - session->s_trim_caps);
1025 session->s_trim_caps = 0;
1026 }
1027 return 0;
1028}
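/*
 * End-to-end sketch of the trim path (hypothetical numbers): an MDS
 * under memory pressure sends CEPH_SESSION_RECALL_STATE with, say,
 * max_caps = 5000.  handle_session() below then calls
 *
 *	trim_caps(mdsc, session, 5000);
 *
 * and if s_nr_caps is 5800, trim_caps_cb() walks the session's caps
 * until roughly 800 have been dropped.  Dirty or in-use caps are
 * skipped, so the target is best effort by design.
 */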
1029
1030/*
1031 * Allocate cap_release messages. If there is a partially full message
1032 * in the queue, try to allocate enough to cover its remainder, so that
1033 * we can send it immediately.
1034 *
1035 * Called under s_mutex.
1036 */
1037static int add_cap_releases(struct ceph_mds_client *mdsc,
1038 struct ceph_mds_session *session,
1039 int extra)
1040{
1041 struct ceph_msg *msg;
1042 struct ceph_mds_cap_release *head;
1043 int err = -ENOMEM;
1044
1045 if (extra < 0)
1046 extra = mdsc->client->mount_args->cap_release_safety;
1047
1048 spin_lock(&session->s_cap_lock);
1049
1050 if (!list_empty(&session->s_cap_releases)) {
1051 msg = list_first_entry(&session->s_cap_releases,
1052 struct ceph_msg,
1053 list_head);
1054 head = msg->front.iov_base;
1055 extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1056 }
1057
1058 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1059 spin_unlock(&session->s_cap_lock);
1060 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1061 0, 0, NULL);
1062 if (!msg)
1063 goto out_unlocked;
1064 dout("add_cap_releases %p msg %p now %d\n", session, msg,
1065 (int)msg->front.iov_len);
1066 head = msg->front.iov_base;
1067 head->num = cpu_to_le32(0);
1068 msg->front.iov_len = sizeof(*head);
1069 spin_lock(&session->s_cap_lock);
1070 list_add(&msg->list_head, &session->s_cap_releases);
1071 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
1072 }
1073
1074 if (!list_empty(&session->s_cap_releases)) {
1075 msg = list_first_entry(&session->s_cap_releases,
1076 struct ceph_msg,
1077 list_head);
1078 head = msg->front.iov_base;
1079 if (head->num) {
1080 dout(" queueing non-full %p (%d)\n", msg,
1081 le32_to_cpu(head->num));
1082 list_move_tail(&msg->list_head,
1083 &session->s_cap_releases_done);
1084 session->s_num_cap_releases -=
1085 CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1086 }
1087 }
1088 err = 0;
1089 spin_unlock(&session->s_cap_lock);
1090out_unlocked:
1091 return err;
1092}
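/*
 * Sizing sketch for the preallocation above: each CAPRELEASE message
 * is a single PAGE_CACHE_SIZE front holding one ceph_mds_cap_release
 * header plus up to CEPH_CAPS_PER_RELEASE cap items, which (assuming
 * the usual definition of that constant) is roughly
 *
 *	(PAGE_CACHE_SIZE - sizeof(struct ceph_mds_cap_release)) /
 *		sizeof(struct ceph_mds_cap_item)
 *
 * items per message.  The loop keeps allocating whole messages until
 * s_num_cap_releases can absorb every held cap plus @extra.
 */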
1093
1094/*
1095 * flush all dirty inode data to disk.
1096 *
1097 * returns true if we've flushed through want_flush_seq
1098 */
1099static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1100{
1101 int mds, ret = 1;
1102
1103 dout("check_cap_flush want %lld\n", want_flush_seq);
1104 mutex_lock(&mdsc->mutex);
1105 for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
1106 struct ceph_mds_session *session = mdsc->sessions[mds];
1107
1108 if (!session)
1109 continue;
1110 get_session(session);
1111 mutex_unlock(&mdsc->mutex);
1112
1113 mutex_lock(&session->s_mutex);
1114 if (!list_empty(&session->s_cap_flushing)) {
1115 struct ceph_inode_info *ci =
1116 list_entry(session->s_cap_flushing.next,
1117 struct ceph_inode_info,
1118 i_flushing_item);
1119 struct inode *inode = &ci->vfs_inode;
1120
1121 spin_lock(&inode->i_lock);
1122 if (ci->i_cap_flush_seq <= want_flush_seq) {
1123 dout("check_cap_flush still flushing %p "
1124 "seq %lld <= %lld to mds%d\n", inode,
1125 ci->i_cap_flush_seq, want_flush_seq,
1126 session->s_mds);
1127 ret = 0;
1128 }
1129 spin_unlock(&inode->i_lock);
1130 }
1131 mutex_unlock(&session->s_mutex);
1132 ceph_put_mds_session(session);
1133
1134 if (!ret)
1135 return ret;
1136 mutex_lock(&mdsc->mutex);
1137 }
1138
1139 mutex_unlock(&mdsc->mutex);
1140 dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
1141 return ret;
1142}
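/*
 * This is the wait_event() condition used by ceph_mdsc_sync() below:
 * the sync path sleeps on cap_flushing_wq until no session has an
 * in-flight cap flush with seq <= want_flush_seq.
 */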
1143
1144/*
1145 * called under s_mutex
1146 */
1147static void send_cap_releases(struct ceph_mds_client *mdsc,
1148 struct ceph_mds_session *session)
1149{
1150 struct ceph_msg *msg;
1151
1152 dout("send_cap_releases mds%d\n", session->s_mds);
1153 while (1) {
1154 spin_lock(&session->s_cap_lock);
1155 if (list_empty(&session->s_cap_releases_done))
1156 break;
1157 msg = list_first_entry(&session->s_cap_releases_done,
1158 struct ceph_msg, list_head);
1159 list_del_init(&msg->list_head);
1160 spin_unlock(&session->s_cap_lock);
1161 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1162 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1163 ceph_con_send(&session->s_con, msg);
1164 }
1165 spin_unlock(&session->s_cap_lock);
1166}
1167
1168/*
1169 * requests
1170 */
1171
1172/*
1173 * Create an mds request.
1174 */
1175struct ceph_mds_request *
1176ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1177{
1178 struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
1179
1180 if (!req)
1181 return ERR_PTR(-ENOMEM);
1182
1183 req->r_started = jiffies;
1184 req->r_resend_mds = -1;
1185 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
1186 req->r_fmode = -1;
1187 kref_init(&req->r_kref);
1188 INIT_LIST_HEAD(&req->r_wait);
1189 init_completion(&req->r_completion);
1190 init_completion(&req->r_safe_completion);
1191 INIT_LIST_HEAD(&req->r_unsafe_item);
1192
1193 req->r_op = op;
1194 req->r_direct_mode = mode;
1195 return req;
1196}
1197
1198/*
1199 * Return the oldest (lowest tid) request in the request tree, or NULL
1200 * if none.  __get_oldest_tid() below returns its tid, or 0 if none.
1201 * called under mdsc->mutex.
1202 */
1203static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
1204{
1205 if (RB_EMPTY_ROOT(&mdsc->request_tree))
1206 return NULL;
1207 return rb_entry(rb_first(&mdsc->request_tree),
1208 struct ceph_mds_request, r_node);
1209}
1210
1211static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
1212{
1213 struct ceph_mds_request *req = __get_oldest_req(mdsc);
1214
1215 if (req)
1216 return req->r_tid;
1217 return 0;
1218}
1219
1220/*
1221 * Build a dentry's path. Allocate on heap; caller must kfree. Based
1222 * on build_path_from_dentry in fs/cifs/dir.c.
1223 *
1224 * If @stop_on_nosnap, generate path relative to the first non-snapped
1225 * inode.
1226 *
1227 * Encode hidden .snap dirs as a double /, i.e.
1228 * foo/.snap/bar -> foo//bar
1229 */
1230char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
1231 int stop_on_nosnap)
1232{
1233 struct dentry *temp;
1234 char *path;
1235 int len, pos;
1236
1237 if (dentry == NULL)
1238 return ERR_PTR(-EINVAL);
1239
1240retry:
1241 len = 0;
1242 for (temp = dentry; !IS_ROOT(temp);) {
1243 struct inode *inode = temp->d_inode;
1244 if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
1245 len++; /* slash only */
1246 else if (stop_on_nosnap && inode &&
1247 ceph_snap(inode) == CEPH_NOSNAP)
1248 break;
1249 else
1250 len += 1 + temp->d_name.len;
1251 temp = temp->d_parent;
1252 if (temp == NULL) {
1253 pr_err("build_path_dentry corrupt dentry %p\n", dentry);
1254 return ERR_PTR(-EINVAL);
1255 }
1256 }
1257 if (len)
1258 len--; /* no leading '/' */
1259
1260 path = kmalloc(len+1, GFP_NOFS);
1261 if (path == NULL)
1262 return ERR_PTR(-ENOMEM);
1263 pos = len;
1264 path[pos] = 0; /* trailing null */
1265 for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
1266 struct inode *inode = temp->d_inode;
1267
1268 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1269 dout("build_path_dentry path+%d: %p SNAPDIR\n",
1270 pos, temp);
1271 } else if (stop_on_nosnap && inode &&
1272 ceph_snap(inode) == CEPH_NOSNAP) {
1273 break;
1274 } else {
1275 pos -= temp->d_name.len;
1276 if (pos < 0)
1277 break;
1278 strncpy(path + pos, temp->d_name.name,
1279 temp->d_name.len);
1280 dout("build_path_dentry path+%d: %p '%.*s'\n",
1281 pos, temp, temp->d_name.len, path + pos);
1282 }
1283 if (pos)
1284 path[--pos] = '/';
1285 temp = temp->d_parent;
1286 if (temp == NULL) {
1287 pr_err("build_path_dentry corrupt dentry\n");
1288 kfree(path);
1289 return ERR_PTR(-EINVAL);
1290 }
1291 }
1292 if (pos != 0) {
1293 pr_err("build_path_dentry did not end path lookup where "
1294 "expected, namelen is %d, pos is %d\n", len, pos);
1295 /* presumably this is only possible if racing with a
1296 rename of one of the parent directories (we can not
1297 lock the dentries above us to prevent this, but
1298 retrying should be harmless) */
1299 kfree(path);
1300 goto retry;
1301 }
1302
1303 *base = ceph_ino(temp->d_inode);
1304 *plen = len;
1305 dout("build_path_dentry on %p %d built %llx '%.*s'\n",
1306 dentry, atomic_read(&dentry->d_count), *base, len, path);
1307 return path;
1308}
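/*
 * Worked example (hypothetical tree): for a dentry at /a/.snap/b/c
 * with stop_on_nosnap = 0, the first pass computes len = 2 ("/c") +
 * 2 ("/b") + 1 (snapdir slash) + 2 ("/a") - 1 = 6, and the backward
 * fill then yields "a//b/c", with *base set to the root inode's ino.
 * The double slash is the encoded .snap component described above.
 */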
1309
1310static int build_dentry_path(struct dentry *dentry,
1311 const char **ppath, int *ppathlen, u64 *pino,
1312 int *pfreepath)
1313{
1314 char *path;
1315
1316 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
1317 *pino = ceph_ino(dentry->d_parent->d_inode);
1318 *ppath = dentry->d_name.name;
1319 *ppathlen = dentry->d_name.len;
1320 return 0;
1321 }
1322 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1323 if (IS_ERR(path))
1324 return PTR_ERR(path);
1325 *ppath = path;
1326 *pfreepath = 1;
1327 return 0;
1328}
1329
1330static int build_inode_path(struct inode *inode,
1331 const char **ppath, int *ppathlen, u64 *pino,
1332 int *pfreepath)
1333{
1334 struct dentry *dentry;
1335 char *path;
1336
1337 if (ceph_snap(inode) == CEPH_NOSNAP) {
1338 *pino = ceph_ino(inode);
1339 *ppathlen = 0;
1340 return 0;
1341 }
1342 dentry = d_find_alias(inode);
1343 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1344 dput(dentry);
1345 if (IS_ERR(path))
1346 return PTR_ERR(path);
1347 *ppath = path;
1348 *pfreepath = 1;
1349 return 0;
1350}
1351
1352/*
1353 * request arguments may be specified via an inode *, a dentry *, or
1354 * an explicit ino+path.
1355 */
1356static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1357 const char *rpath, u64 rino,
1358 const char **ppath, int *pathlen,
1359 u64 *ino, int *freepath)
1360{
1361 int r = 0;
1362
1363 if (rinode) {
1364 r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
1365 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
1366 ceph_snap(rinode));
1367 } else if (rdentry) {
1368 r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
1369 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
1370 *ppath);
1371 } else if (rpath) {
1372 *ino = rino;
1373 *ppath = rpath;
1374 *pathlen = strlen(rpath);
1375 dout(" path %.*s\n", *pathlen, rpath);
1376 }
1377
1378 return r;
1379}
1380
1381/*
1382 * called under mdsc->mutex
1383 */
1384static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1385 struct ceph_mds_request *req,
1386 int mds)
1387{
1388 struct ceph_msg *msg;
1389 struct ceph_mds_request_head *head;
1390 const char *path1 = NULL;
1391 const char *path2 = NULL;
1392 u64 ino1 = 0, ino2 = 0;
1393 int pathlen1 = 0, pathlen2 = 0;
1394 int freepath1 = 0, freepath2 = 0;
1395 int len;
1396 u16 releases;
1397 void *p, *end;
1398 int ret;
1399
1400 ret = set_request_path_attr(req->r_inode, req->r_dentry,
1401 req->r_path1, req->r_ino1.ino,
1402 &path1, &pathlen1, &ino1, &freepath1);
1403 if (ret < 0) {
1404 msg = ERR_PTR(ret);
1405 goto out;
1406 }
1407
1408 ret = set_request_path_attr(NULL, req->r_old_dentry,
1409 req->r_path2, req->r_ino2.ino,
1410 &path2, &pathlen2, &ino2, &freepath2);
1411 if (ret < 0) {
1412 msg = ERR_PTR(ret);
1413 goto out_free1;
1414 }
1415
1416 len = sizeof(*head) +
1417 pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
1418
1419 /* calculate (max) length for cap releases */
1420 len += sizeof(struct ceph_mds_request_release) *
1421 (!!req->r_inode_drop + !!req->r_dentry_drop +
1422 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
1423 if (req->r_dentry_drop)
1424 len += req->r_dentry->d_name.len;
1425 if (req->r_old_dentry_drop)
1426 len += req->r_old_dentry->d_name.len;
1427
1428 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
1429 if (IS_ERR(msg))
1430 goto out_free2;
1431
1432 msg->hdr.tid = cpu_to_le64(req->r_tid);
1433
1434 head = msg->front.iov_base;
1435 p = msg->front.iov_base + sizeof(*head);
1436 end = msg->front.iov_base + msg->front.iov_len;
1437
1438 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
1439 head->op = cpu_to_le32(req->r_op);
1440 head->caller_uid = cpu_to_le32(current_fsuid());
1441 head->caller_gid = cpu_to_le32(current_fsgid());
1442 head->args = req->r_args;
1443
1444 ceph_encode_filepath(&p, end, ino1, path1);
1445 ceph_encode_filepath(&p, end, ino2, path2);
1446
1447 /* cap releases */
1448 releases = 0;
1449 if (req->r_inode_drop)
1450 releases += ceph_encode_inode_release(&p,
1451 req->r_inode ? req->r_inode : req->r_dentry->d_inode,
1452 mds, req->r_inode_drop, req->r_inode_unless, 0);
1453 if (req->r_dentry_drop)
1454 releases += ceph_encode_dentry_release(&p, req->r_dentry,
1455 mds, req->r_dentry_drop, req->r_dentry_unless);
1456 if (req->r_old_dentry_drop)
1457 releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
1458 mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
1459 if (req->r_old_inode_drop)
1460 releases += ceph_encode_inode_release(&p,
1461 req->r_old_dentry->d_inode,
1462 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
1463 head->num_releases = cpu_to_le16(releases);
1464
1465 BUG_ON(p > end);
1466 msg->front.iov_len = p - msg->front.iov_base;
1467 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1468
1469 msg->pages = req->r_pages;
1470 msg->nr_pages = req->r_num_pages;
1471 msg->hdr.data_len = cpu_to_le32(req->r_data_len);
1472 msg->hdr.data_off = cpu_to_le16(0);
1473
1474out_free2:
1475 if (freepath2)
1476 kfree((char *)path2);
1477out_free1:
1478 if (freepath1)
1479 kfree((char *)path1);
1480out:
1481 return msg;
1482}
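/*
 * Resulting front layout (a sketch inferred from the encoding order
 * and length calculation above, not a formal wire spec):
 *
 *	struct ceph_mds_request_head
 *	filepath1:  u8 version, le64 ino1, le32 len, path1 bytes
 *	filepath2:  u8 version, le64 ino2, le32 len, path2 bytes
 *	0-4 cap release records (inode, dentry, old inode, old dentry)
 *
 * head->num_releases records how many release records follow.
 */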
1483
1484/*
1485 * Called under mdsc->mutex on the error path, and with no mutex held
1486 * on success.
1487 */
1488static void complete_request(struct ceph_mds_client *mdsc,
1489 struct ceph_mds_request *req)
1490{
1491 if (req->r_callback)
1492 req->r_callback(mdsc, req);
1493 else
1494 complete(&req->r_completion);
1495}
1496
1497/*
1498 * called under mdsc->mutex
1499 */
1500static int __prepare_send_request(struct ceph_mds_client *mdsc,
1501 struct ceph_mds_request *req,
1502 int mds)
1503{
1504 struct ceph_mds_request_head *rhead;
1505 struct ceph_msg *msg;
1506 int flags = 0;
1507
1508 req->r_mds = mds;
1509 req->r_attempts++;
1510 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
1511 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1512
1513 if (req->r_request) {
1514 ceph_msg_put(req->r_request);
1515 req->r_request = NULL;
1516 }
1517 msg = create_request_message(mdsc, req, mds);
1518 if (IS_ERR(msg)) {
1519 req->r_reply = ERR_PTR(PTR_ERR(msg));
1520 complete_request(mdsc, req);
1521 return -PTR_ERR(msg);
1522 }
1523 req->r_request = msg;
1524
1525 rhead = msg->front.iov_base;
1526 rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
1527 if (req->r_got_unsafe)
1528 flags |= CEPH_MDS_FLAG_REPLAY;
1529 if (req->r_locked_dir)
1530 flags |= CEPH_MDS_FLAG_WANT_DENTRY;
1531 rhead->flags = cpu_to_le32(flags);
1532 rhead->num_fwd = req->r_num_fwd;
1533 rhead->num_retry = req->r_attempts - 1;
1534
1535 dout(" r_locked_dir = %p\n", req->r_locked_dir);
1536
1537 if (req->r_target_inode && req->r_got_unsafe)
1538 rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
1539 else
1540 rhead->ino = 0;
1541 return 0;
1542}
1543
1544/*
1545 * send request, or put it on the appropriate wait list.
1546 */
1547static int __do_request(struct ceph_mds_client *mdsc,
1548 struct ceph_mds_request *req)
1549{
1550 struct ceph_mds_session *session = NULL;
1551 int mds = -1;
1552 int err = -EAGAIN;
1553
1554 if (req->r_reply)
1555 goto out;
1556
1557 if (req->r_timeout &&
1558 time_after_eq(jiffies, req->r_started + req->r_timeout)) {
1559 dout("do_request timed out\n");
1560 err = -EIO;
1561 goto finish;
1562 }
1563
1564 mds = __choose_mds(mdsc, req);
1565 if (mds < 0 ||
1566 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
1567 dout("do_request no mds or not active, waiting for map\n");
1568 list_add(&req->r_wait, &mdsc->waiting_for_map);
1569 goto out;
1570 }
1571
1572 /* get, open session */
1573 session = __ceph_lookup_mds_session(mdsc, mds);
1574 if (!session) {
1575 session = register_session(mdsc, mds);
1576 if (IS_ERR(session)) {
1577 err = PTR_ERR(session);
1578 goto finish;
1579 }
1580 }
1581 dout("do_request mds%d session %p state %s\n", mds, session,
1582 session_state_name(session->s_state));
1583 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
1584 session->s_state != CEPH_MDS_SESSION_HUNG) {
1585 if (session->s_state == CEPH_MDS_SESSION_NEW ||
1586 session->s_state == CEPH_MDS_SESSION_CLOSING)
1587 __open_session(mdsc, session);
1588 list_add(&req->r_wait, &session->s_waiting);
1589 goto out_session;
1590 }
1591
1592 /* send request */
1593 req->r_session = get_session(session);
1594 req->r_resend_mds = -1; /* forget any previous mds hint */
1595
1596 if (req->r_request_started == 0) /* note request start time */
1597 req->r_request_started = jiffies;
1598
1599 err = __prepare_send_request(mdsc, req, mds);
1600 if (!err) {
1601 ceph_msg_get(req->r_request);
1602 ceph_con_send(&session->s_con, req->r_request);
1603 }
1604
1605out_session:
1606 ceph_put_mds_session(session);
1607out:
1608 return err;
1609
1610finish:
1611 req->r_reply = ERR_PTR(err);
1612 complete_request(mdsc, req);
1613 goto out;
1614}
1615
1616/*
1617 * called under mdsc->mutex
1618 */
1619static void __wake_requests(struct ceph_mds_client *mdsc,
1620 struct list_head *head)
1621{
1622 struct ceph_mds_request *req, *nreq;
1623
1624 list_for_each_entry_safe(req, nreq, head, r_wait) {
1625 list_del_init(&req->r_wait);
1626 __do_request(mdsc, req);
1627 }
1628}
1629
1630/*
1631 * Wake up threads with requests pending for @mds, so that they can
1632 * resubmit their requests to a possibly different mds. If @all is set,
1633 * wake them up if their requests have been forwarded to @mds, too.
1634 */
1635static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
1636{
1637 struct ceph_mds_request *req;
1638 struct rb_node *p;
1639
1640 dout("kick_requests mds%d\n", mds);
1641 for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
1642 req = rb_entry(p, struct ceph_mds_request, r_node);
1643 if (req->r_got_unsafe)
1644 continue;
1645 if (req->r_session &&
1646 req->r_session->s_mds == mds) {
1647 dout(" kicking tid %llu\n", req->r_tid);
1648 put_request_session(req);
1649 __do_request(mdsc, req);
1650 }
1651 }
1652}
1653
1654void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
1655 struct ceph_mds_request *req)
1656{
1657 dout("submit_request on %p\n", req);
1658 mutex_lock(&mdsc->mutex);
1659 __register_request(mdsc, req, NULL);
1660 __do_request(mdsc, req);
1661 mutex_unlock(&mdsc->mutex);
1662}
1663
1664/*
1665 * Synchronously perform an mds request.  Take care of all of the
1666 * session setup, forwarding, retry details.
1667 */
1668int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1669 struct inode *dir,
1670 struct ceph_mds_request *req)
1671{
1672 int err;
1673
1674 dout("do_request on %p\n", req);
1675
1676 /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
1677 if (req->r_inode)
1678 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
1679 if (req->r_locked_dir)
1680 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
1681 if (req->r_old_dentry)
1682 ceph_get_cap_refs(
1683 ceph_inode(req->r_old_dentry->d_parent->d_inode),
1684 CEPH_CAP_PIN);
1685
1686 /* issue */
1687 mutex_lock(&mdsc->mutex);
1688 __register_request(mdsc, req, dir);
1689 __do_request(mdsc, req);
1690
1691 /* wait */
1692 if (!req->r_reply) {
1693 mutex_unlock(&mdsc->mutex);
1694 if (req->r_timeout) {
1695 err = (long)wait_for_completion_interruptible_timeout(
1696 &req->r_completion, req->r_timeout);
1697 if (err == 0)
1698 req->r_reply = ERR_PTR(-EIO);
1699 else if (err < 0)
1700 req->r_reply = ERR_PTR(err);
1701 } else {
1702 err = wait_for_completion_interruptible(
1703 &req->r_completion);
1704 if (err)
1705 req->r_reply = ERR_PTR(err);
1706 }
1707 mutex_lock(&mdsc->mutex);
1708 }
1709
1710 if (IS_ERR(req->r_reply)) {
1711 err = PTR_ERR(req->r_reply);
1712 req->r_reply = NULL;
1713
1714 if (err == -ERESTARTSYS) {
1715 /* aborted */
1716 req->r_aborted = true;
1717
1718 if (req->r_locked_dir &&
1719 (req->r_op & CEPH_MDS_OP_WRITE)) {
1720 struct ceph_inode_info *ci =
1721 ceph_inode(req->r_locked_dir);
1722
1723 dout("aborted, clearing I_COMPLETE on %p\n",
1724 req->r_locked_dir);
1725 spin_lock(&req->r_locked_dir->i_lock);
1726 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1727 ci->i_release_count++;
1728 spin_unlock(&req->r_locked_dir->i_lock);
1729 }
1730 } else {
1731 /* clean up this request */
1732 __unregister_request(mdsc, req);
1733 if (!list_empty(&req->r_unsafe_item))
1734 list_del_init(&req->r_unsafe_item);
1735 complete(&req->r_safe_completion);
1736 }
1737 } else if (req->r_err) {
1738 err = req->r_err;
1739 } else {
1740 err = le32_to_cpu(req->r_reply_info.head->result);
1741 }
1742 mutex_unlock(&mdsc->mutex);
1743
1744 dout("do_request %p done, result %d\n", req, err);
1745 return err;
1746}
1747
1748/*
1749 * Handle mds reply.
1750 *
1751 * We take the session mutex and parse and process the reply immediately.
1752 * This preserves the logical ordering of replies, capabilities, etc., sent
1753 * by the MDS as they are applied to our local cache.
1754 */
1755static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1756{
1757 struct ceph_mds_client *mdsc = session->s_mdsc;
1758 struct ceph_mds_request *req;
1759 struct ceph_mds_reply_head *head = msg->front.iov_base;
1760 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
1761 u64 tid;
1762 int err, result;
1763 int mds = session->s_mds;
1764
1765 if (msg->front.iov_len < sizeof(*head)) {
1766 pr_err("mdsc_handle_reply got corrupt (short) reply\n");
1767 ceph_msg_dump(msg);
1768 return;
1769 }
1770
1771 /* get request, session */
1772 tid = le64_to_cpu(msg->hdr.tid);
1773 mutex_lock(&mdsc->mutex);
1774 req = __lookup_request(mdsc, tid);
1775 if (!req) {
1776 dout("handle_reply on unknown tid %llu\n", tid);
1777 mutex_unlock(&mdsc->mutex);
1778 return;
1779 }
1780 dout("handle_reply %p\n", req);
1781
1782 /* correct session? */
1783 if (req->r_session != session) {
1784 pr_err("mdsc_handle_reply got %llu on session mds%d"
1785 " not mds%d\n", tid, session->s_mds,
1786 req->r_session ? req->r_session->s_mds : -1);
1787 mutex_unlock(&mdsc->mutex);
1788 goto out;
1789 }
1790
1791 /* dup? */
1792 if ((req->r_got_unsafe && !head->safe) ||
1793 (req->r_got_safe && head->safe)) {
1794 pr_warning("got a dup %s reply on %llu from mds%d\n",
1795 head->safe ? "safe" : "unsafe", tid, mds);
1796 mutex_unlock(&mdsc->mutex);
1797 goto out;
1798 }
1799
1800 result = le32_to_cpu(head->result);
1801
1802 /*
1803 * Tolerate 2 consecutive ESTALEs from the same mds.
1804 * FIXME: we should be looking at the cap migrate_seq.
1805 */
1806 if (result == -ESTALE) {
1807 req->r_direct_mode = USE_AUTH_MDS;
1808 req->r_num_stale++;
1809 if (req->r_num_stale <= 2) {
1810 __do_request(mdsc, req);
1811 mutex_unlock(&mdsc->mutex);
1812 goto out;
1813 }
1814 } else {
1815 req->r_num_stale = 0;
1816 }
1817
1818 if (head->safe) {
1819 req->r_got_safe = true;
1820 __unregister_request(mdsc, req);
1821 complete(&req->r_safe_completion);
1822
1823 if (req->r_got_unsafe) {
1824 /*
1825 * We already handled the unsafe response, now do the
1826 * cleanup. No need to examine the response; the MDS
1827 * doesn't include any result info in the safe
1828 * response. And even if it did, there is nothing
1829 * useful we could do with a revised return value.
1830 */
1831 dout("got safe reply %llu, mds%d\n", tid, mds);
1832 list_del_init(&req->r_unsafe_item);
1833
1834 /* last unsafe request during umount? */
1835 if (mdsc->stopping && !__get_oldest_req(mdsc))
1836 complete(&mdsc->safe_umount_waiters);
1837 mutex_unlock(&mdsc->mutex);
1838 goto out;
1839 }
1840 }
1841
1842 BUG_ON(req->r_reply);
1843
1844 if (!head->safe) {
1845 req->r_got_unsafe = true;
1846 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1847 }
1848
1849 dout("handle_reply tid %lld result %d\n", tid, result);
1850 rinfo = &req->r_reply_info;
1851 err = parse_reply_info(msg, rinfo);
1852 mutex_unlock(&mdsc->mutex);
1853
1854 mutex_lock(&session->s_mutex);
1855 if (err < 0) {
1856 pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
1857 ceph_msg_dump(msg);
1858 goto out_err;
1859 }
1860
1861 /* snap trace */
1862 if (rinfo->snapblob_len) {
1863 down_write(&mdsc->snap_rwsem);
1864 ceph_update_snap_trace(mdsc, rinfo->snapblob,
1865 rinfo->snapblob + rinfo->snapblob_len,
1866 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
1867 downgrade_write(&mdsc->snap_rwsem);
1868 } else {
1869 down_read(&mdsc->snap_rwsem);
1870 }
1871
1872 /* insert trace into our cache */
1873 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1874 if (err == 0) {
1875 if (result == 0 && rinfo->dir_nr)
1876 ceph_readdir_prepopulate(req, req->r_session);
1877 ceph_unreserve_caps(&req->r_caps_reservation);
1878 }
1879
1880 up_read(&mdsc->snap_rwsem);
1881out_err:
1882 if (err) {
1883 req->r_err = err;
1884 } else {
1885 req->r_reply = msg;
1886 ceph_msg_get(msg);
1887 }
1888
1889 add_cap_releases(mdsc, req->r_session, -1);
1890 mutex_unlock(&session->s_mutex);
1891
1892 /* kick calling process */
1893 complete_request(mdsc, req);
1894out:
1895 ceph_mdsc_put_request(req);
1896 return;
1897}
1898
1899
1900
1901/*
1902 * handle mds notification that our request has been forwarded.
1903 */
1904static void handle_forward(struct ceph_mds_client *mdsc,
1905 struct ceph_mds_session *session,
1906 struct ceph_msg *msg)
1907{
1908 struct ceph_mds_request *req;
1909 u64 tid = le64_to_cpu(msg->hdr.tid);
1910 u32 next_mds;
1911 u32 fwd_seq;
1912 int err = -EINVAL;
1913 void *p = msg->front.iov_base;
1914 void *end = p + msg->front.iov_len;
1915
1916 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1917 next_mds = ceph_decode_32(&p);
1918 fwd_seq = ceph_decode_32(&p);
1919
1920 mutex_lock(&mdsc->mutex);
1921 req = __lookup_request(mdsc, tid);
1922 if (!req) {
1923 dout("forward %llu to mds%d - req dne\n", tid, next_mds);
1924 goto out; /* dup reply? */
1925 }
1926
1927 if (fwd_seq <= req->r_num_fwd) {
1928 dout("forward %llu to mds%d - old seq %d <= %d\n",
1929 tid, next_mds, req->r_num_fwd, fwd_seq);
1930 } else {
1931 /* resend. forward race not possible; mds would drop */
1932 dout("forward %llu to mds%d (we resend)\n", tid, next_mds);
1933 req->r_num_fwd = fwd_seq;
1934 req->r_resend_mds = next_mds;
1935 put_request_session(req);
1936 __do_request(mdsc, req);
1937 }
1938 ceph_mdsc_put_request(req);
1939out:
1940 mutex_unlock(&mdsc->mutex);
1941 return;
1942
1943bad:
1944 pr_err("mdsc_handle_forward decode error err=%d\n", err);
1945}
1946
1947/*
1948 * handle a mds session control message
1949 */
1950static void handle_session(struct ceph_mds_session *session,
1951 struct ceph_msg *msg)
1952{
1953 struct ceph_mds_client *mdsc = session->s_mdsc;
1954 u32 op;
1955 u64 seq;
1956 int mds = session->s_mds;
1957 struct ceph_mds_session_head *h = msg->front.iov_base;
1958 int wake = 0;
1959
1960 /* decode */
1961 if (msg->front.iov_len != sizeof(*h))
1962 goto bad;
1963 op = le32_to_cpu(h->op);
1964 seq = le64_to_cpu(h->seq);
1965
1966 mutex_lock(&mdsc->mutex);
1967 if (op == CEPH_SESSION_CLOSE)
1968 __unregister_session(mdsc, session);
1969 /* FIXME: this ttl calculation is generous */
1970 session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
1971 mutex_unlock(&mdsc->mutex);
1972
1973 mutex_lock(&session->s_mutex);
1974
1975 dout("handle_session mds%d %s %p state %s seq %llu\n",
1976 mds, ceph_session_op_name(op), session,
1977 session_state_name(session->s_state), seq);
1978
1979 if (session->s_state == CEPH_MDS_SESSION_HUNG) {
1980 session->s_state = CEPH_MDS_SESSION_OPEN;
1981 pr_info("mds%d came back\n", session->s_mds);
1982 }
1983
1984 switch (op) {
1985 case CEPH_SESSION_OPEN:
1986 session->s_state = CEPH_MDS_SESSION_OPEN;
1987 renewed_caps(mdsc, session, 0);
1988 wake = 1;
1989 if (mdsc->stopping)
1990 __close_session(mdsc, session);
1991 break;
1992
1993 case CEPH_SESSION_RENEWCAPS:
1994 if (session->s_renew_seq == seq)
1995 renewed_caps(mdsc, session, 1);
1996 break;
1997
1998 case CEPH_SESSION_CLOSE:
1999 remove_session_caps(session);
2000 wake = 1; /* for good measure */
2001 complete(&mdsc->session_close_waiters);
2002 kick_requests(mdsc, mds, 0); /* cur only */
2003 break;
2004
2005 case CEPH_SESSION_STALE:
2006 pr_info("mds%d caps went stale, renewing\n",
2007 session->s_mds);
2008 spin_lock(&session->s_cap_lock);
2009 session->s_cap_gen++;
2010 session->s_cap_ttl = 0;
2011 spin_unlock(&session->s_cap_lock);
2012 send_renew_caps(mdsc, session);
2013 break;
2014
2015 case CEPH_SESSION_RECALL_STATE:
2016 trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
2017 break;
2018
2019 default:
2020 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
2021 WARN_ON(1);
2022 }
2023
2024 mutex_unlock(&session->s_mutex);
2025 if (wake) {
2026 mutex_lock(&mdsc->mutex);
2027 __wake_requests(mdsc, &session->s_waiting);
2028 mutex_unlock(&mdsc->mutex);
2029 }
2030 return;
2031
2032bad:
2033 pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
2034 (int)msg->front.iov_len);
2035 ceph_msg_dump(msg);
2036 return;
2037}
2038
2039
2040/*
2041 * called under session->mutex.
2042 */
2043static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2044 struct ceph_mds_session *session)
2045{
2046 struct ceph_mds_request *req, *nreq;
2047 int err;
2048
2049 dout("replay_unsafe_requests mds%d\n", session->s_mds);
2050
2051 mutex_lock(&mdsc->mutex);
2052 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
2053 err = __prepare_send_request(mdsc, req, session->s_mds);
2054 if (!err) {
2055 ceph_msg_get(req->r_request);
2056 ceph_con_send(&session->s_con, req->r_request);
2057 }
2058 }
2059 mutex_unlock(&mdsc->mutex);
2060}
2061
2062/*
2063 * Encode information about a cap for a reconnect with the MDS.
2064 */
2065static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2066 void *arg)
2067{
2068 struct ceph_mds_cap_reconnect rec;
2069 struct ceph_inode_info *ci;
2070 struct ceph_pagelist *pagelist = arg;
2071 char *path;
2072 int pathlen, err;
2073 u64 pathbase;
2074 struct dentry *dentry;
2075
2076 ci = cap->ci;
2077
2078 dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
2079 inode, ceph_vinop(inode), cap, cap->cap_id,
2080 ceph_cap_string(cap->issued));
2081 err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
2082 if (err)
2083 return err;
2084
2085 dentry = d_find_alias(inode);
2086 if (dentry) {
2087 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2088 if (IS_ERR(path)) {
2089 err = PTR_ERR(path);
2090 BUG_ON(err);
2091 }
2092 } else {
2093 path = NULL;
2094 pathlen = 0;
2095 }
2096 err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2097 if (err)
2098 goto out;
2099
2100 spin_lock(&inode->i_lock);
2101 cap->seq = 0; /* reset cap seq */
2102 cap->issue_seq = 0; /* and issue_seq */
2103 rec.cap_id = cpu_to_le64(cap->cap_id);
2104 rec.pathbase = cpu_to_le64(pathbase);
2105 rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2106 rec.issued = cpu_to_le32(cap->issued);
2107 rec.size = cpu_to_le64(inode->i_size);
2108 ceph_encode_timespec(&rec.mtime, &inode->i_mtime);
2109 ceph_encode_timespec(&rec.atime, &inode->i_atime);
2110 rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2111 spin_unlock(&inode->i_lock);
2112
2113 err = ceph_pagelist_append(pagelist, &rec, sizeof(rec));
2114
2115out:
2116 kfree(path);
2117 dput(dentry);
2118 return err;
2119}
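/*
 * Per-cap reconnect record appended above (sketch): le64 ino, a
 * length-prefixed path string, then the ceph_mds_cap_reconnect struct
 * with cap_id, wanted/issued masks, size, mtime/atime, snaprealm and
 * pathbase.  One record is emitted for every cap the client holds, so
 * the pagelist can span many pages for a busy client.
 */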
2120
2121
2122/*
2123 * If an MDS fails and recovers, clients need to reconnect in order to
2124 * reestablish shared state. This includes all caps issued through
2125 * this session _and_ the snap_realm hierarchy. Because it's not
2126 * clear which snap realms the mds cares about, we send everything we
2127 * know about.. that ensures we'll then get any new info the
2128 * recovering MDS might have.
2129 *
2130 * This is a relatively heavyweight operation, but it's rare.
2131 *
2132 * called with mdsc->mutex held.
2133 */
2134static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2135{
2136 struct ceph_mds_session *session = NULL;
2137 struct ceph_msg *reply;
2138 struct rb_node *p;
2139 int err;
2140 struct ceph_pagelist *pagelist;
2141
2142 pr_info("reconnect to recovering mds%d\n", mds);
2143
2144 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2145 if (!pagelist)
2146 goto fail_nopagelist;
2147 ceph_pagelist_init(pagelist);
2148
2149 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL);
2150 if (IS_ERR(reply)) {
2151 err = PTR_ERR(reply);
2152 goto fail_nomsg;
2153 }
2154
2155 /* find session */
2156 session = __ceph_lookup_mds_session(mdsc, mds);
2157 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2158
2159 if (session) {
2160 mutex_lock(&session->s_mutex);
2161
2162 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2163 session->s_seq = 0;
2164
2165 ceph_con_open(&session->s_con,
2166 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2167
2168 /* replay unsafe requests */
2169 replay_unsafe_requests(mdsc, session);
2170 } else {
2171 dout("no session for mds%d, will send short reconnect\n",
2172 mds);
2173 }
2174
2175 down_read(&mdsc->snap_rwsem);
2176
2177 if (!session)
2178 goto send;
2179 dout("session %p state %s\n", session,
2180 session_state_name(session->s_state));
2181
2182 /* traverse this session's caps */
2183 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2184 if (err)
2185 goto fail;
2186 err = iterate_session_caps(session, encode_caps_cb, pagelist);
2187 if (err < 0)
2188 goto out;
2189
2190 /*
2191 * snaprealms. we provide mds with the ino, seq (version), and
2192 * parent for all of our realms. If the mds has any newer info,
2193 * it will tell us.
2194 */
2195 for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
2196 struct ceph_snap_realm *realm =
2197 rb_entry(p, struct ceph_snap_realm, node);
2198 struct ceph_mds_snaprealm_reconnect sr_rec;
2199
2200 dout(" adding snap realm %llx seq %lld parent %llx\n",
2201 realm->ino, realm->seq, realm->parent_ino);
2202 sr_rec.ino = cpu_to_le64(realm->ino);
2203 sr_rec.seq = cpu_to_le64(realm->seq);
2204 sr_rec.parent = cpu_to_le64(realm->parent_ino);
2205 err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
2206 if (err)
2207 goto fail;
2208 }
2209
2210send:
2211 reply->pagelist = pagelist;
2212 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2213 reply->nr_pages = calc_pages_for(0, pagelist->length);
2214 ceph_con_send(&session->s_con, reply);
2215
2216 if (session) {
2217 session->s_state = CEPH_MDS_SESSION_OPEN;
2218 __wake_requests(mdsc, &session->s_waiting);
2219 }
2220
2221out:
2222 up_read(&mdsc->snap_rwsem);
2223 if (session) {
2224 mutex_unlock(&session->s_mutex);
2225 ceph_put_mds_session(session);
2226 }
2227 mutex_lock(&mdsc->mutex);
2228 return;
2229
2230fail:
2231 ceph_msg_put(reply);
2232fail_nomsg:
2233 ceph_pagelist_release(pagelist);
2234 kfree(pagelist);
2235fail_nopagelist:
2236 pr_err("ENOMEM preparing reconnect for mds%d\n", mds);
2237 goto out;
2238}
2239
2240
2241/*
2242 * compare old and new mdsmaps, kicking requests
2243 * and closing out old connections as necessary
2244 *
2245 * called under mdsc->mutex.
2246 */
2247static void check_new_map(struct ceph_mds_client *mdsc,
2248 struct ceph_mdsmap *newmap,
2249 struct ceph_mdsmap *oldmap)
2250{
2251 int i;
2252 int oldstate, newstate;
2253 struct ceph_mds_session *s;
2254
2255 dout("check_new_map new %u old %u\n",
2256 newmap->m_epoch, oldmap->m_epoch);
2257
2258 for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
2259 if (mdsc->sessions[i] == NULL)
2260 continue;
2261 s = mdsc->sessions[i];
2262 oldstate = ceph_mdsmap_get_state(oldmap, i);
2263 newstate = ceph_mdsmap_get_state(newmap, i);
2264
2265 dout("check_new_map mds%d state %s -> %s (session %s)\n",
2266 i, ceph_mds_state_name(oldstate),
2267 ceph_mds_state_name(newstate),
2268 session_state_name(s->s_state));
2269
2270 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
2271 ceph_mdsmap_get_addr(newmap, i),
2272 sizeof(struct ceph_entity_addr))) {
2273 if (s->s_state == CEPH_MDS_SESSION_OPENING) {
2274 /* the session never opened, just close it
2275 * out now */
2276 __wake_requests(mdsc, &s->s_waiting);
2277 __unregister_session(mdsc, s);
2278 } else {
2279 /* just close it */
2280 mutex_unlock(&mdsc->mutex);
2281 mutex_lock(&s->s_mutex);
2282 mutex_lock(&mdsc->mutex);
2283 ceph_con_close(&s->s_con);
2284 mutex_unlock(&s->s_mutex);
2285 s->s_state = CEPH_MDS_SESSION_RESTARTING;
2286 }
2287
2288 /* kick any requests waiting on the recovering mds */
2289 kick_requests(mdsc, i, 1);
2290 } else if (oldstate == newstate) {
2291 continue; /* nothing new with this mds */
2292 }
2293
2294 /*
2295 * send reconnect?
2296 */
2297 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2298 newstate >= CEPH_MDS_STATE_RECONNECT)
2299 send_mds_reconnect(mdsc, i);
2300
2301 /*
2302 * kick requests on any mds that has gone active.
2303 *
2304 * kick requests on cur or forwarder: we may have sent
2305 * the request to mds1, mds1 told us it forwarded it
2306 * to mds2, but then we learn mds1 failed and can't be
2307 * sure it successfully forwarded our request before
2308 * it died.
2309 */
2310 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2311 newstate >= CEPH_MDS_STATE_ACTIVE) {
2312 pr_info("mds%d reconnect completed\n", s->s_mds);
2313 kick_requests(mdsc, i, 1);
2314 ceph_kick_flushing_caps(mdsc, s);
2315 wake_up_session_caps(s, 1);
2316 }
2317 }
2318}
2319
2320
2321
2322/*
2323 * leases
2324 */
2325
2326/*
2327 * caller must hold session s_mutex, dentry->d_lock
2328 */
2329void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
2330{
2331 struct ceph_dentry_info *di = ceph_dentry(dentry);
2332
2333 ceph_put_mds_session(di->lease_session);
2334 di->lease_session = NULL;
2335}
2336
2337static void handle_lease(struct ceph_mds_client *mdsc,
2338 struct ceph_mds_session *session,
2339 struct ceph_msg *msg)
2340{
2341 struct super_block *sb = mdsc->client->sb;
2342 struct inode *inode;
2343 struct ceph_inode_info *ci;
2344 struct dentry *parent, *dentry;
2345 struct ceph_dentry_info *di;
2346 int mds = session->s_mds;
2347 struct ceph_mds_lease *h = msg->front.iov_base;
2348 struct ceph_vino vino;
2349 int mask;
2350 struct qstr dname;
2351 int release = 0;
2352
2353 dout("handle_lease from mds%d\n", mds);
2354
2355 /* decode */
2356 if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
2357 goto bad;
2358 vino.ino = le64_to_cpu(h->ino);
2359 vino.snap = CEPH_NOSNAP;
2360 mask = le16_to_cpu(h->mask);
2361 dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2362 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
2363 if (dname.len != get_unaligned_le32(h+1))
2364 goto bad;
2365
2366 mutex_lock(&session->s_mutex);
2367 session->s_seq++;
2368
2369 /* lookup inode */
2370 inode = ceph_find_inode(sb, vino);
2371 dout("handle_lease '%s', mask %d, ino %llx %p\n",
2372 ceph_lease_op_name(h->action), mask, vino.ino, inode);
2373 if (inode == NULL) {
2374 dout("handle_lease no inode %llx\n", vino.ino);
2375 goto release;
2376 }
2377 ci = ceph_inode(inode);
2378
2379 /* dentry */
2380 parent = d_find_alias(inode);
2381 if (!parent) {
2382 dout("no parent dentry on inode %p\n", inode);
2383 WARN_ON(1);
2384 goto release; /* hrm... */
2385 }
2386 dname.hash = full_name_hash(dname.name, dname.len);
2387 dentry = d_lookup(parent, &dname);
2388 dput(parent);
2389 if (!dentry)
2390 goto release;
2391
2392 spin_lock(&dentry->d_lock);
2393 di = ceph_dentry(dentry);
2394 switch (h->action) {
2395 case CEPH_MDS_LEASE_REVOKE:
2396 if (di && di->lease_session == session) {
2397 h->seq = cpu_to_le32(di->lease_seq);
2398 __ceph_mdsc_drop_dentry_lease(dentry);
2399 }
2400 release = 1;
2401 break;
2402
2403 case CEPH_MDS_LEASE_RENEW:
2404 if (di && di->lease_session == session &&
2405 di->lease_gen == session->s_cap_gen &&
2406 di->lease_renew_from &&
2407 di->lease_renew_after == 0) {
2408 unsigned long duration =
2409 le32_to_cpu(h->duration_ms) * HZ / 1000;
2410
2411 di->lease_seq = le32_to_cpu(h->seq);
2412 dentry->d_time = di->lease_renew_from + duration;
2413 di->lease_renew_after = di->lease_renew_from +
2414 (duration >> 1);
2415 di->lease_renew_from = 0;
2416 }
2417 break;
2418 }
2419 spin_unlock(&dentry->d_lock);
2420 dput(dentry);
2421
2422 if (!release)
2423 goto out;
2424
2425release:
2426 /* let's just reuse the same message */
2427 h->action = CEPH_MDS_LEASE_REVOKE_ACK;
2428 ceph_msg_get(msg);
2429 ceph_con_send(&session->s_con, msg);
2430
2431out:
2432 iput(inode);
2433 mutex_unlock(&session->s_mutex);
2434 return;
2435
2436bad:
2437 pr_err("corrupt lease message\n");
2438 ceph_msg_dump(msg);
2439}
2440
2441void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2442 struct inode *inode,
2443 struct dentry *dentry, char action,
2444 u32 seq)
2445{
2446 struct ceph_msg *msg;
2447 struct ceph_mds_lease *lease;
2448 int len = sizeof(*lease) + sizeof(u32);
2449 int dnamelen = 0;
2450
2451 dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
2452 inode, dentry, ceph_lease_op_name(action), session->s_mds);
2453 dnamelen = dentry->d_name.len;
2454 len += dnamelen;
2455
2456 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
2457 if (IS_ERR(msg))
2458 return;
2459 lease = msg->front.iov_base;
2460 lease->action = action;
2461 lease->mask = cpu_to_le16(CEPH_LOCK_DN);
2462 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2463 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2464 lease->seq = cpu_to_le32(seq);
2465 put_unaligned_le32(dnamelen, lease + 1);
2466 memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
2467
2468 /*
2469 * if this is a preemptive lease RELEASE, no need to
2470 * flush request stream, since the actual request will
2471 * soon follow.
2472 */
2473 msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
2474
2475 ceph_con_send(&session->s_con, msg);
2476}
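/*
 * Message layout, as assembled above: struct ceph_mds_lease followed
 * by a le32 name length and the dentry name bytes, for a total front
 * of sizeof(*lease) + 4 + d_name.len.
 */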
2477
2478/*
2479 * Preemptively release a lease we expect to invalidate anyway.
2480 * Both @inode and @dentry are required; @mask must be CEPH_LOCK_DN.
2481 */
2482void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2483 struct dentry *dentry, int mask)
2484{
2485 struct ceph_dentry_info *di;
2486 struct ceph_mds_session *session;
2487 u32 seq;
2488
2489 BUG_ON(inode == NULL);
2490 BUG_ON(dentry == NULL);
2491 BUG_ON(mask != CEPH_LOCK_DN);
2492
2493 /* is dentry lease valid? */
2494 spin_lock(&dentry->d_lock);
2495 di = ceph_dentry(dentry);
2496 if (!di || !di->lease_session ||
2497 di->lease_session->s_mds < 0 ||
2498 di->lease_gen != di->lease_session->s_cap_gen ||
2499 !time_before(jiffies, dentry->d_time)) {
2500 dout("lease_release inode %p dentry %p -- "
2501 "no lease on %d\n",
2502 inode, dentry, mask);
2503 spin_unlock(&dentry->d_lock);
2504 return;
2505 }
2506
2507 /* we do have a lease on this dentry; note mds and seq */
2508 session = ceph_get_mds_session(di->lease_session);
2509 seq = di->lease_seq;
2510 __ceph_mdsc_drop_dentry_lease(dentry);
2511 spin_unlock(&dentry->d_lock);
2512
2513 dout("lease_release inode %p dentry %p mask %d to mds%d\n",
2514 inode, dentry, mask, session->s_mds);
2515 ceph_mdsc_lease_send_msg(session, inode, dentry,
2516 CEPH_MDS_LEASE_RELEASE, seq);
2517 ceph_put_mds_session(session);
2518}
2519
2520/*
2521 * drop all leases (and dentry refs) in preparation for umount
2522 */
2523static void drop_leases(struct ceph_mds_client *mdsc)
2524{
2525 int i;
2526
2527 dout("drop_leases\n");
2528 mutex_lock(&mdsc->mutex);
2529 for (i = 0; i < mdsc->max_sessions; i++) {
2530 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2531 if (!s)
2532 continue;
2533 mutex_unlock(&mdsc->mutex);
2534 mutex_lock(&s->s_mutex);
2535 mutex_unlock(&s->s_mutex);
2536 ceph_put_mds_session(s);
2537 mutex_lock(&mdsc->mutex);
2538 }
2539 mutex_unlock(&mdsc->mutex);
2540}
2541
2542
2543
2544/*
2545 * delayed work -- periodically trim expired leases, renew caps with mds
2546 */
2547static void schedule_delayed(struct ceph_mds_client *mdsc)
2548{
2549 int delay = 5;
2550 unsigned hz = round_jiffies_relative(HZ * delay);
2551 schedule_delayed_work(&mdsc->delayed_work, hz);
2552}
2553
2554static void delayed_work(struct work_struct *work)
2555{
2556 int i;
2557 struct ceph_mds_client *mdsc =
2558 container_of(work, struct ceph_mds_client, delayed_work.work);
2559 int renew_interval;
2560 int renew_caps;
2561
2562 dout("mdsc delayed_work\n");
2563 ceph_check_delayed_caps(mdsc);
2564
2565 mutex_lock(&mdsc->mutex);
2566 renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
2567 renew_caps = time_after_eq(jiffies, HZ*renew_interval +
2568 mdsc->last_renew_caps);
2569 if (renew_caps)
2570 mdsc->last_renew_caps = jiffies;
2571
2572 for (i = 0; i < mdsc->max_sessions; i++) {
2573 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2574 if (s == NULL)
2575 continue;
2576 if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
2577 dout("resending session close request for mds%d\n",
2578 s->s_mds);
2579 request_close_session(mdsc, s);
2580 ceph_put_mds_session(s);
2581 continue;
2582 }
2583 if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
2584 if (s->s_state == CEPH_MDS_SESSION_OPEN) {
2585 s->s_state = CEPH_MDS_SESSION_HUNG;
2586 pr_info("mds%d hung\n", s->s_mds);
2587 }
2588 }
2589 if (s->s_state < CEPH_MDS_SESSION_OPEN) {
2590 /* this mds is failed or recovering, just wait */
2591 ceph_put_mds_session(s);
2592 continue;
2593 }
2594 mutex_unlock(&mdsc->mutex);
2595
2596 mutex_lock(&s->s_mutex);
2597 if (renew_caps)
2598 send_renew_caps(mdsc, s);
2599 else
2600 ceph_con_keepalive(&s->s_con);
2601 add_cap_releases(mdsc, s, -1);
2602 send_cap_releases(mdsc, s);
2603 mutex_unlock(&s->s_mutex);
2604 ceph_put_mds_session(s);
2605
2606 mutex_lock(&mdsc->mutex);
2607 }
2608 mutex_unlock(&mdsc->mutex);
2609
2610 schedule_delayed(mdsc);
2611}
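/*
 * Timing sketch (illustrative): with the 5 second tick from
 * schedule_delayed() and a 60 second m_session_timeout, renew_interval
 * is 15 seconds, so roughly every third run sends RENEWCAPS while the
 * remaining runs just issue keepalives and push out queued cap
 * releases.
 */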
2612
2613
2614int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2615{
2616 mdsc->client = client;
2617 mutex_init(&mdsc->mutex);
2618 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2619 init_completion(&mdsc->safe_umount_waiters);
2620 init_completion(&mdsc->session_close_waiters);
2621 INIT_LIST_HEAD(&mdsc->waiting_for_map);
2622 mdsc->sessions = NULL;
2623 mdsc->max_sessions = 0;
2624 mdsc->stopping = 0;
2625 init_rwsem(&mdsc->snap_rwsem);
2626 mdsc->snap_realms = RB_ROOT;
2627 INIT_LIST_HEAD(&mdsc->snap_empty);
2628 spin_lock_init(&mdsc->snap_empty_lock);
2629 mdsc->last_tid = 0;
2630 mdsc->request_tree = RB_ROOT;
2631 INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
2632 mdsc->last_renew_caps = jiffies;
2633 INIT_LIST_HEAD(&mdsc->cap_delay_list);
2634 spin_lock_init(&mdsc->cap_delay_lock);
2635 INIT_LIST_HEAD(&mdsc->snap_flush_list);
2636 spin_lock_init(&mdsc->snap_flush_lock);
2637 mdsc->cap_flush_seq = 0;
2638 INIT_LIST_HEAD(&mdsc->cap_dirty);
2639 mdsc->num_cap_flushing = 0;
2640 spin_lock_init(&mdsc->cap_dirty_lock);
2641 init_waitqueue_head(&mdsc->cap_flushing_wq);
2642 spin_lock_init(&mdsc->dentry_lru_lock);
2643 INIT_LIST_HEAD(&mdsc->dentry_lru);
2644 return 0;
2645}
2646
2647/*
2648 * Wait for safe replies on open mds requests. If we time out, drop
2649 * all requests from the tree to avoid dangling dentry refs.
2650 */
2651static void wait_requests(struct ceph_mds_client *mdsc)
2652{
2653 struct ceph_mds_request *req;
2654 struct ceph_client *client = mdsc->client;
2655
2656 mutex_lock(&mdsc->mutex);
2657 if (__get_oldest_req(mdsc)) {
2658 mutex_unlock(&mdsc->mutex);
2659
2660 dout("wait_requests waiting for requests\n");
2661 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
2662 client->mount_args->mount_timeout * HZ);
2663
2664 /* tear down remaining requests */
2665 mutex_lock(&mdsc->mutex);
2666 while ((req = __get_oldest_req(mdsc))) {
2667 dout("wait_requests timed out on tid %llu\n",
2668 req->r_tid);
2669 __unregister_request(mdsc, req);
2670 }
2671 }
2672 mutex_unlock(&mdsc->mutex);
2673 dout("wait_requests done\n");
2674}
2675
2676/*
2677 * called before mount is ro, and before dentries are torn down.
2678 * (hmm, does this still race with new lookups?)
2679 */
2680void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
2681{
2682 dout("pre_umount\n");
2683 mdsc->stopping = 1;
2684
2685 drop_leases(mdsc);
2686 ceph_flush_dirty_caps(mdsc);
2687 wait_requests(mdsc);
2688}
2689
2690/*
2691 * wait for all write mds requests to flush.
2692 */
2693static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
2694{
2695 struct ceph_mds_request *req = NULL, *nextreq;
2696 struct rb_node *n;
2697
2698 mutex_lock(&mdsc->mutex);
2699 dout("wait_unsafe_requests want %lld\n", want_tid);
2700restart:
2701 req = __get_oldest_req(mdsc);
2702 while (req && req->r_tid <= want_tid) {
2703 /* find next request */
2704 n = rb_next(&req->r_node);
2705 if (n)
2706 nextreq = rb_entry(n, struct ceph_mds_request, r_node);
2707 else
2708 nextreq = NULL;
2709 if ((req->r_op & CEPH_MDS_OP_WRITE)) {
2710 /* write op */
2711 ceph_mdsc_get_request(req);
2712 if (nextreq)
2713 ceph_mdsc_get_request(nextreq);
2714 mutex_unlock(&mdsc->mutex);
2715 dout("wait_unsafe_requests wait on %llu (want %llu)\n",
2716 req->r_tid, want_tid);
2717 wait_for_completion(&req->r_safe_completion);
2718 mutex_lock(&mdsc->mutex);
2719 ceph_mdsc_put_request(req);
2720 if (!nextreq)
2721 break; /* next dne before, so we're done! */
2722 if (RB_EMPTY_NODE(&nextreq->r_node)) {
2723 /* next request was removed from tree */
2724 ceph_mdsc_put_request(nextreq);
2725 goto restart;
2726 }
2727 ceph_mdsc_put_request(nextreq); /* won't go away */
2728 }
2729 req = nextreq;
2730 }
2731 mutex_unlock(&mdsc->mutex);
2732 dout("wait_unsafe_requests done\n");
2733}
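/*
 * Note on the get/put dance above: holding a ref on nextreq keeps its
 * rb_node valid across the unlocked wait; if the node was unlinked in
 * the meantime (RB_EMPTY_NODE), the scan restarts from the oldest
 * request instead of following a stale pointer.
 */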
2734
2735void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2736{
2737 u64 want_tid, want_flush;
2738
2739 dout("sync\n");
2740 mutex_lock(&mdsc->mutex);
2741 want_tid = mdsc->last_tid;
2742 want_flush = mdsc->cap_flush_seq;
2743 mutex_unlock(&mdsc->mutex);
2744 dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
2745
2746 ceph_flush_dirty_caps(mdsc);
2747
2748 wait_unsafe_requests(mdsc, want_tid);
2749 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
2750}
2751
2752
2753/*
2754 * called after sb is ro.
2755 */
2756void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
2757{
2758 struct ceph_mds_session *session;
2759 int i;
2760 int n;
2761 struct ceph_client *client = mdsc->client;
2762 unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
2763
2764 dout("close_sessions\n");
2765
2766 mutex_lock(&mdsc->mutex);
2767
2768 /* close sessions */
2769 started = jiffies;
2770 while (time_before(jiffies, started + timeout)) {
2771 dout("closing sessions\n");
2772 n = 0;
2773 for (i = 0; i < mdsc->max_sessions; i++) {
2774 session = __ceph_lookup_mds_session(mdsc, i);
2775 if (!session)
2776 continue;
2777 mutex_unlock(&mdsc->mutex);
2778 mutex_lock(&session->s_mutex);
2779 __close_session(mdsc, session);
2780 mutex_unlock(&session->s_mutex);
2781 ceph_put_mds_session(session);
2782 mutex_lock(&mdsc->mutex);
2783 n++;
2784 }
2785 if (n == 0)
2786 break;
2787
2788 if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
2789 break;
2790
2791 dout("waiting for sessions to close\n");
2792 mutex_unlock(&mdsc->mutex);
2793 wait_for_completion_timeout(&mdsc->session_close_waiters,
2794 timeout);
2795 mutex_lock(&mdsc->mutex);
2796 }
2797
2798 /* tear down remaining sessions */
2799 for (i = 0; i < mdsc->max_sessions; i++) {
2800 if (mdsc->sessions[i]) {
2801 session = get_session(mdsc->sessions[i]);
2802 __unregister_session(mdsc, session);
2803 mutex_unlock(&mdsc->mutex);
2804 mutex_lock(&session->s_mutex);
2805 remove_session_caps(session);
2806 mutex_unlock(&session->s_mutex);
2807 ceph_put_mds_session(session);
2808 mutex_lock(&mdsc->mutex);
2809 }
2810 }
2811
2812 WARN_ON(!list_empty(&mdsc->cap_delay_list));
2813
2814 mutex_unlock(&mdsc->mutex);
2815
2816 ceph_cleanup_empty_realms(mdsc);
2817
2818 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2819
2820 dout("stopped\n");
2821}
2822
2823void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
2824{
2825 dout("stop\n");
2826 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2827 if (mdsc->mdsmap)
2828 ceph_mdsmap_destroy(mdsc->mdsmap);
2829 kfree(mdsc->sessions);
2830}
2831
2832
2833/*
2834 * handle mds map update.
2835 */
2836void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
2837{
2838 u32 epoch;
2839 u32 maplen;
2840 void *p = msg->front.iov_base;
2841 void *end = p + msg->front.iov_len;
2842 struct ceph_mdsmap *newmap, *oldmap;
2843 struct ceph_fsid fsid;
2844 int err = -EINVAL;
2845
2846 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
2847 ceph_decode_copy(&p, &fsid, sizeof(fsid));
2848 if (ceph_check_fsid(mdsc->client, &fsid) < 0)
2849 return;
2850 epoch = ceph_decode_32(&p);
2851 maplen = ceph_decode_32(&p);
2852 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
2853
2854 /* do we need it? */
2855 ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
2856 mutex_lock(&mdsc->mutex);
2857 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
2858 dout("handle_map epoch %u <= our %u\n",
2859 epoch, mdsc->mdsmap->m_epoch);
2860 mutex_unlock(&mdsc->mutex);
2861 return;
2862 }
2863
2864 newmap = ceph_mdsmap_decode(&p, end);
2865 if (IS_ERR(newmap)) {
2866 err = PTR_ERR(newmap);
2867 goto bad_unlock;
2868 }
2869
2870 /* swap into place */
2871 if (mdsc->mdsmap) {
2872 oldmap = mdsc->mdsmap;
2873 mdsc->mdsmap = newmap;
2874 check_new_map(mdsc, newmap, oldmap);
2875 ceph_mdsmap_destroy(oldmap);
2876 } else {
2877 mdsc->mdsmap = newmap; /* first mds map */
2878 }
2879 mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
2880
2881 __wake_requests(mdsc, &mdsc->waiting_for_map);
2882
2883 mutex_unlock(&mdsc->mutex);
2884 schedule_delayed(mdsc);
2885 return;
2886
2887bad_unlock:
2888 mutex_unlock(&mdsc->mutex);
2889bad:
2890 pr_err("error decoding mdsmap %d\n", err);
2891 return;
2892}
2893
2894static struct ceph_connection *con_get(struct ceph_connection *con)
2895{
2896 struct ceph_mds_session *s = con->private;
2897
2898 if (get_session(s)) {
2899 dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
2900 return con;
2901 }
2902 dout("mdsc con_get %p FAIL\n", s);
2903 return NULL;
2904}
2905
2906static void con_put(struct ceph_connection *con)
2907{
2908 struct ceph_mds_session *s = con->private;
2909
2910 	dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1);
2911 	ceph_put_mds_session(s); /* may free s; don't touch it after this */
2912}
2913
2914/*
2915 * if the client is unresponsive for long enough, the mds will kill
2916 * the session entirely.
2917 */
2918static void peer_reset(struct ceph_connection *con)
2919{
2920 struct ceph_mds_session *s = con->private;
2921
2922 pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n",
2923 s->s_mds);
2924}
2925
2926static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
2927{
2928 struct ceph_mds_session *s = con->private;
2929 struct ceph_mds_client *mdsc = s->s_mdsc;
2930 int type = le16_to_cpu(msg->hdr.type);
2931
2932 mutex_lock(&mdsc->mutex);
2933 if (__verify_registered_session(mdsc, s) < 0) {
2934 mutex_unlock(&mdsc->mutex);
2935 goto out;
2936 }
2937 mutex_unlock(&mdsc->mutex);
2938
2939 switch (type) {
2940 case CEPH_MSG_MDS_MAP:
2941 ceph_mdsc_handle_map(mdsc, msg);
2942 break;
2943 case CEPH_MSG_CLIENT_SESSION:
2944 handle_session(s, msg);
2945 break;
2946 case CEPH_MSG_CLIENT_REPLY:
2947 handle_reply(s, msg);
2948 break;
2949 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2950 handle_forward(mdsc, s, msg);
2951 break;
2952 case CEPH_MSG_CLIENT_CAPS:
2953 ceph_handle_caps(s, msg);
2954 break;
2955 case CEPH_MSG_CLIENT_SNAP:
2956 ceph_handle_snap(mdsc, s, msg);
2957 break;
2958 case CEPH_MSG_CLIENT_LEASE:
2959 handle_lease(mdsc, s, msg);
2960 break;
2961
2962 default:
2963 pr_err("received unknown message type %d %s\n", type,
2964 ceph_msg_type_name(type));
2965 }
2966out:
2967 ceph_msg_put(msg);
2968}
2969
2970/*
2971 * authentication
2972 */
2973static int get_authorizer(struct ceph_connection *con,
2974 void **buf, int *len, int *proto,
2975 void **reply_buf, int *reply_len, int force_new)
2976{
2977 struct ceph_mds_session *s = con->private;
2978 struct ceph_mds_client *mdsc = s->s_mdsc;
2979 struct ceph_auth_client *ac = mdsc->client->monc.auth;
2980 int ret = 0;
2981
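2982 	/* on an auth retry the messenger sets force_new: throw away the
2983 	 * rejected authorizer so a fresh one is built below */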
2982 if (force_new && s->s_authorizer) {
2983 ac->ops->destroy_authorizer(ac, s->s_authorizer);
2984 s->s_authorizer = NULL;
2985 }
2986 if (s->s_authorizer == NULL) {
2987 if (ac->ops->create_authorizer) {
2988 ret = ac->ops->create_authorizer(
2989 ac, CEPH_ENTITY_TYPE_MDS,
2990 &s->s_authorizer,
2991 &s->s_authorizer_buf,
2992 &s->s_authorizer_buf_len,
2993 &s->s_authorizer_reply_buf,
2994 &s->s_authorizer_reply_buf_len);
2995 if (ret)
2996 return ret;
2997 }
2998 }
2999
3000 *proto = ac->protocol;
3001 *buf = s->s_authorizer_buf;
3002 *len = s->s_authorizer_buf_len;
3003 *reply_buf = s->s_authorizer_reply_buf;
3004 *reply_len = s->s_authorizer_reply_buf_len;
3005 return 0;
3006}
3007
3008
3009static int verify_authorizer_reply(struct ceph_connection *con, int len)
3010{
3011 struct ceph_mds_session *s = con->private;
3012 struct ceph_mds_client *mdsc = s->s_mdsc;
3013 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3014
3015 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
3016}
3017
3018static int invalidate_authorizer(struct ceph_connection *con)
3019{
3020 struct ceph_mds_session *s = con->private;
3021 struct ceph_mds_client *mdsc = s->s_mdsc;
3022 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3023
3024 if (ac->ops->invalidate_authorizer)
3025 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3026
3027 return ceph_monc_validate_auth(&mdsc->client->monc);
3028}
3029
3030static const struct ceph_connection_operations mds_con_ops = {
3031 .get = con_get,
3032 .put = con_put,
3033 .dispatch = dispatch,
3034 .get_authorizer = get_authorizer,
3035 .verify_authorizer_reply = verify_authorizer_reply,
3036 .invalidate_authorizer = invalidate_authorizer,
3037 .peer_reset = peer_reset,
3038};
3039
3040
3041
3042
3043/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
new file mode 100644
index 000000000000..961cc6f65878
--- /dev/null
+++ b/fs/ceph/mds_client.h
@@ -0,0 +1,335 @@
1#ifndef _FS_CEPH_MDS_CLIENT_H
2#define _FS_CEPH_MDS_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/list.h>
7#include <linux/mutex.h>
8#include <linux/rbtree.h>
9#include <linux/spinlock.h>
10
11#include "types.h"
12#include "messenger.h"
13#include "mdsmap.h"
14
15/*
16 * Some lock dependencies:
17 *
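18 * (an indented lock is acquired while already holding the lock it
19 *  is listed under)
20 *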
18 * session->s_mutex
19 * mdsc->mutex
20 *
21 * mdsc->snap_rwsem
22 *
23 * inode->i_lock
24 * mdsc->snap_flush_lock
25 * mdsc->cap_delay_lock
26 *
27 */
28
29struct ceph_client;
30struct ceph_cap;
31
32/*
33 * parsed info about a single inode. pointers are into the encoded
34 * on-wire structures within the mds reply message payload.
35 */
36struct ceph_mds_reply_info_in {
37 struct ceph_mds_reply_inode *in;
38 u32 symlink_len;
39 char *symlink;
40 u32 xattr_len;
41 char *xattr_data;
42};
43
44/*
45 * parsed info about an mds reply, including information about the
46 * target inode and/or its parent directory and dentry, and directory
47 * contents (for readdir results).
48 */
49struct ceph_mds_reply_info_parsed {
50 struct ceph_mds_reply_head *head;
51
52 struct ceph_mds_reply_info_in diri, targeti;
53 struct ceph_mds_reply_dirfrag *dirfrag;
54 char *dname;
55 u32 dname_len;
56 struct ceph_mds_reply_lease *dlease;
57
58 struct ceph_mds_reply_dirfrag *dir_dir;
59 int dir_nr;
60 char **dir_dname;
61 u32 *dir_dname_len;
62 struct ceph_mds_reply_lease **dir_dlease;
63 struct ceph_mds_reply_info_in *dir_in;
64 u8 dir_complete, dir_end;
65
66 /* encoded blob describing snapshot contexts for certain
67 operations (e.g., open) */
68 void *snapblob;
69 int snapblob_len;
70};
71
72
73/*
74 * cap releases are batched and sent to the MDS en masse.
75 */
76#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
77 sizeof(struct ceph_mds_cap_release)) / \
78 sizeof(struct ceph_mds_cap_item))
79
80
81/*
82 * state associated with each MDS<->client session
83 */
84enum {
85 CEPH_MDS_SESSION_NEW = 1,
86 CEPH_MDS_SESSION_OPENING = 2,
87 CEPH_MDS_SESSION_OPEN = 3,
88 CEPH_MDS_SESSION_HUNG = 4,
89 CEPH_MDS_SESSION_CLOSING = 5,
90 CEPH_MDS_SESSION_RESTARTING = 6,
91 CEPH_MDS_SESSION_RECONNECTING = 7,
92};
93
94struct ceph_mds_session {
95 struct ceph_mds_client *s_mdsc;
96 int s_mds;
97 int s_state;
98 unsigned long s_ttl; /* time until mds kills us */
99 u64 s_seq; /* incoming msg seq # */
100 struct mutex s_mutex; /* serialize session messages */
101
102 struct ceph_connection s_con;
103
104 struct ceph_authorizer *s_authorizer;
105 void *s_authorizer_buf, *s_authorizer_reply_buf;
106 size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
107
108 /* protected by s_cap_lock */
109 spinlock_t s_cap_lock;
110 u32 s_cap_gen; /* inc each time we get mds stale msg */
111 unsigned long s_cap_ttl; /* when session caps expire */
112 struct list_head s_caps; /* all caps issued by this session */
113 int s_nr_caps, s_trim_caps;
114 int s_num_cap_releases;
115 struct list_head s_cap_releases; /* waiting cap_release messages */
116 struct list_head s_cap_releases_done; /* ready to send */
117 struct ceph_cap *s_cap_iterator;
118
119 /* protected by mutex */
120 struct list_head s_cap_flushing; /* inodes w/ flushing caps */
121 struct list_head s_cap_snaps_flushing;
122 unsigned long s_renew_requested; /* last time we sent a renew req */
123 u64 s_renew_seq;
124
125 atomic_t s_ref;
126 struct list_head s_waiting; /* waiting requests */
127 struct list_head s_unsafe; /* unsafe requests */
128};
129
130/*
131 * modes of choosing which MDS to send a request to
132 */
133enum {
134 USE_ANY_MDS,
135 USE_RANDOM_MDS,
136 USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */
137};
138
139struct ceph_mds_request;
140struct ceph_mds_client;
141
142/*
143 * request completion callback
144 */
145typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
146 struct ceph_mds_request *req);
147
148/*
149 * an in-flight mds request
150 */
151struct ceph_mds_request {
152 u64 r_tid; /* transaction id */
153 struct rb_node r_node;
154
155 int r_op; /* mds op code */
156 int r_mds;
157
158 /* operation on what? */
159 struct inode *r_inode; /* arg1 */
160 struct dentry *r_dentry; /* arg1 */
161 struct dentry *r_old_dentry; /* arg2: rename from or link from */
162 char *r_path1, *r_path2;
163 struct ceph_vino r_ino1, r_ino2;
164
165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
166 struct inode *r_target_inode; /* resulting inode */
167
168 union ceph_mds_request_args r_args;
169 int r_fmode; /* file mode, if expecting cap */
170
171 /* for choosing which mds to send this request to */
172 int r_direct_mode;
173 u32 r_direct_hash; /* choose dir frag based on this dentry hash */
174 bool r_direct_is_hash; /* true if r_direct_hash is valid */
175
176 /* data payload is used for xattr ops */
177 struct page **r_pages;
178 int r_num_pages;
179 int r_data_len;
180
181 /* what caps shall we drop? */
182 int r_inode_drop, r_inode_unless;
183 int r_dentry_drop, r_dentry_unless;
184 int r_old_dentry_drop, r_old_dentry_unless;
185 struct inode *r_old_inode;
186 int r_old_inode_drop, r_old_inode_unless;
187
188 struct ceph_msg *r_request; /* original request */
189 struct ceph_msg *r_reply;
190 struct ceph_mds_reply_info_parsed r_reply_info;
191 int r_err;
192 bool r_aborted;
193
194 unsigned long r_timeout; /* optional. jiffies */
195 unsigned long r_started; /* start time to measure timeout against */
196 unsigned long r_request_started; /* start time for mds request only,
197 used to measure lease durations */
198
199 /* link unsafe requests to parent directory, for fsync */
200 struct inode *r_unsafe_dir;
201 struct list_head r_unsafe_dir_item;
202
203 struct ceph_mds_session *r_session;
204
205 int r_attempts; /* resend attempts */
206 int r_num_fwd; /* number of forward attempts */
207 int r_num_stale;
208 int r_resend_mds; /* mds to resend to next, if any*/
209
210 struct kref r_kref;
211 struct list_head r_wait;
212 struct completion r_completion;
213 struct completion r_safe_completion;
214 ceph_mds_request_callback_t r_callback;
215 struct list_head r_unsafe_item; /* per-session unsafe list item */
216 bool r_got_unsafe, r_got_safe;
217
218 bool r_did_prepopulate;
219 u32 r_readdir_offset;
220
221 struct ceph_cap_reservation r_caps_reservation;
222 int r_num_caps;
223};
224
225/*
226 * mds client state
227 */
228struct ceph_mds_client {
229 struct ceph_client *client;
230 struct mutex mutex; /* all nested structures */
231
232 struct ceph_mdsmap *mdsmap;
233 struct completion safe_umount_waiters, session_close_waiters;
234 struct list_head waiting_for_map;
235
236 	struct ceph_mds_session **sessions; /* NULL if no session with that mds */
237 	int max_sessions; /* len of sessions array */
238 int stopping; /* true if shutting down */
239
240 /*
241 * snap_rwsem will cover cap linkage into snaprealms, and
242 	 * realm snap contexts.  (Later, we can do per-realm snap
243 	 * context locks.)  The empty list contains realms with no
244 * references (implying they contain no inodes with caps) that
245 * should be destroyed.
246 */
247 struct rw_semaphore snap_rwsem;
248 struct rb_root snap_realms;
249 struct list_head snap_empty;
250 spinlock_t snap_empty_lock; /* protect snap_empty */
251
252 u64 last_tid; /* most recent mds request */
253 struct rb_root request_tree; /* pending mds requests */
254 struct delayed_work delayed_work; /* delayed work */
255 unsigned long last_renew_caps; /* last time we renewed our caps */
256 struct list_head cap_delay_list; /* caps with delayed release */
257 spinlock_t cap_delay_lock; /* protects cap_delay_list */
258 struct list_head snap_flush_list; /* cap_snaps ready to flush */
259 spinlock_t snap_flush_lock;
260
261 u64 cap_flush_seq;
262 struct list_head cap_dirty; /* inodes with dirty caps */
263 int num_cap_flushing; /* # caps we are flushing */
264 spinlock_t cap_dirty_lock; /* protects above items */
265 wait_queue_head_t cap_flushing_wq;
266
267#ifdef CONFIG_DEBUG_FS
268 struct dentry *debugfs_file;
269#endif
270
271 spinlock_t dentry_lru_lock;
272 struct list_head dentry_lru;
273 int num_dentry;
274};
275
276extern const char *ceph_mds_op_name(int op);
277
278extern struct ceph_mds_session *
279__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
280
281static inline struct ceph_mds_session *
282ceph_get_mds_session(struct ceph_mds_session *s)
283{
284 atomic_inc(&s->s_ref);
285 return s;
286}
287
288extern void ceph_put_mds_session(struct ceph_mds_session *s);
289
290extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
291 struct ceph_msg *msg, int mds);
292
293extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
294 struct ceph_client *client);
295extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
296extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
297
298extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
299
300extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode,
302 struct dentry *dn, int mask);
303
304extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
307 struct ceph_mds_request *req);
308extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
309 struct inode *dir,
310 struct ceph_mds_request *req);
311static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
312{
313 kref_get(&req->r_kref);
314}
315extern void ceph_mdsc_release_request(struct kref *kref);
316static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
317{
318 kref_put(&req->r_kref, ceph_mdsc_release_request);
319}
320
321extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
322
323extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
324 int stop_on_nosnap);
325
326extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
327extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
328 struct inode *inode,
329 struct dentry *dentry, char action,
330 u32 seq);
331
332extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
333 struct ceph_msg *msg);
334
335#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
new file mode 100644
index 000000000000..c4c498e6dfef
--- /dev/null
+++ b/fs/ceph/mdsmap.c
@@ -0,0 +1,174 @@
1#include "ceph_debug.h"
2
3#include <linux/bug.h>
4#include <linux/err.h>
5#include <linux/random.h>
6#include <linux/slab.h>
7#include <linux/types.h>
8
9#include "mdsmap.h"
10#include "messenger.h"
11#include "decode.h"
12
13#include "super.h"
14
15
16/*
17 * choose a random mds that is "up" (i.e. has a state > 0), or -1.
18 */
19int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
20{
21 int n = 0;
22 int i;
23 	u8 r;  /* must be unsigned: a negative value breaks the modulo below */
24
25 /* count */
26 for (i = 0; i < m->m_max_mds; i++)
27 if (m->m_info[i].state > 0)
28 n++;
29 if (n == 0)
30 return -1;
31
32 /* pick */
33 get_random_bytes(&r, 1);
34 n = r % n;
35 	/* walk to the n'th up mds, skipping over any that are down */
36 	for (i = 0; n > 0 || m->m_info[i].state <= 0; i++)
37 		if (m->m_info[i].state > 0)
38 			n--;
39
40 return i;
41}
42
43/*
44 * Decode an MDS map
45 *
46 * Ignore any fields we don't care about (there are quite a few of
47 * them).
48 */
49struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
50{
51 struct ceph_mdsmap *m;
52 const void *start = *p;
53 int i, j, n;
54 int err = -EINVAL;
55 u16 version;
56
57 m = kzalloc(sizeof(*m), GFP_NOFS);
58 if (m == NULL)
59 return ERR_PTR(-ENOMEM);
60
61 ceph_decode_16_safe(p, end, version, bad);
62
63 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
64 m->m_epoch = ceph_decode_32(p);
65 m->m_client_epoch = ceph_decode_32(p);
66 m->m_last_failure = ceph_decode_32(p);
67 m->m_root = ceph_decode_32(p);
68 m->m_session_timeout = ceph_decode_32(p);
69 m->m_session_autoclose = ceph_decode_32(p);
70 m->m_max_file_size = ceph_decode_64(p);
71 m->m_max_mds = ceph_decode_32(p);
72
73 m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
74 if (m->m_info == NULL)
75 goto badmem;
76
77 /* pick out active nodes from mds_info (state > 0) */
78 n = ceph_decode_32(p);
79 for (i = 0; i < n; i++) {
80 u64 global_id;
81 u32 namelen;
82 s32 mds, inc, state;
83 u64 state_seq;
84 u8 infoversion;
85 struct ceph_entity_addr addr;
86 u32 num_export_targets;
87 void *pexport_targets = NULL;
88
89 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
90 global_id = ceph_decode_64(p);
91 infoversion = ceph_decode_8(p);
92 *p += sizeof(u64);
93 namelen = ceph_decode_32(p); /* skip mds name */
94 *p += namelen;
95
96 ceph_decode_need(p, end,
97 4*sizeof(u32) + sizeof(u64) +
98 sizeof(addr) + sizeof(struct ceph_timespec),
99 bad);
100 mds = ceph_decode_32(p);
101 inc = ceph_decode_32(p);
102 state = ceph_decode_32(p);
103 state_seq = ceph_decode_64(p);
104 ceph_decode_copy(p, &addr, sizeof(addr));
105 ceph_decode_addr(&addr);
106 *p += sizeof(struct ceph_timespec);
107 *p += sizeof(u32);
108 ceph_decode_32_safe(p, end, namelen, bad);
109 *p += namelen;
110 if (infoversion >= 2) {
111 ceph_decode_32_safe(p, end, num_export_targets, bad);
112 pexport_targets = *p;
113 *p += num_export_targets * sizeof(u32);
114 } else {
115 num_export_targets = 0;
116 }
117
118 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
119 i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
120 ceph_mds_state_name(state));
121 if (mds >= 0 && mds < m->m_max_mds && state > 0) {
122 m->m_info[mds].global_id = global_id;
123 m->m_info[mds].state = state;
124 m->m_info[mds].addr = addr;
125 m->m_info[mds].num_export_targets = num_export_targets;
126 			if (num_export_targets) {
127 				m->m_info[mds].export_targets =
128 					kcalloc(num_export_targets, sizeof(u32),
129 						GFP_NOFS);
130 				if (m->m_info[mds].export_targets == NULL)
131 					goto badmem;
132 				for (j = 0; j < num_export_targets; j++)
133 					m->m_info[mds].export_targets[j] =
134 						ceph_decode_32(&pexport_targets);
133 } else {
134 m->m_info[mds].export_targets = NULL;
135 }
136 }
137 }
138
139 /* pg_pools */
140 ceph_decode_32_safe(p, end, n, bad);
141 m->m_num_data_pg_pools = n;
142 m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
143 if (!m->m_data_pg_pools)
144 goto badmem;
145 ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
146 for (i = 0; i < n; i++)
147 m->m_data_pg_pools[i] = ceph_decode_32(p);
148 m->m_cas_pg_pool = ceph_decode_32(p);
149
150 /* ok, we don't care about the rest. */
151 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
152 return m;
153
154badmem:
155 err = -ENOMEM;
156bad:
157 pr_err("corrupt mdsmap\n");
158 print_hex_dump(KERN_DEBUG, "mdsmap: ",
159 DUMP_PREFIX_OFFSET, 16, 1,
160 start, end - start, true);
161 ceph_mdsmap_destroy(m);
162 return ERR_PTR(-EINVAL);
163}
164
165void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
166{
167 int i;
168
169 for (i = 0; i < m->m_max_mds; i++)
170 kfree(m->m_info[i].export_targets);
171 kfree(m->m_info);
172 kfree(m->m_data_pg_pools);
173 kfree(m);
174}
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..eacc131aa5cb
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,54 @@
1#ifndef _FS_CEPH_MDSMAP_H
2#define _FS_CEPH_MDSMAP_H
3
4#include "types.h"
5
6/*
7 * mds map - describe servers in the mds cluster.
8 *
9 * we limit fields to those the client actually cares about
10 */
11struct ceph_mds_info {
12 u64 global_id;
13 struct ceph_entity_addr addr;
14 s32 state;
15 int num_export_targets;
16 u32 *export_targets;
17};
18
19struct ceph_mdsmap {
20 u32 m_epoch, m_client_epoch, m_last_failure;
21 u32 m_root;
22 u32 m_session_timeout; /* seconds */
23 u32 m_session_autoclose; /* seconds */
24 u64 m_max_file_size;
25 	u32 m_max_mds; /* size of m_info array */
26 struct ceph_mds_info *m_info;
27
28 /* which object pools file data can be stored in */
29 int m_num_data_pg_pools;
30 u32 *m_data_pg_pools;
31 u32 m_cas_pg_pool;
32};
33
34static inline struct ceph_entity_addr *
35ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
36{
37 if (w >= m->m_max_mds)
38 return NULL;
39 return &m->m_info[w].addr;
40}
41
42static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
43{
44 BUG_ON(w < 0);
45 if (w >= m->m_max_mds)
46 return CEPH_MDS_STATE_DNE;
47 return m->m_info[w].state;
48}
49
50extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
51extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
52extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
53
54#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
new file mode 100644
index 000000000000..8f1715ffbe4b
--- /dev/null
+++ b/fs/ceph/messenger.c
@@ -0,0 +1,2240 @@
1#include "ceph_debug.h"
2
3#include <linux/crc32c.h>
4#include <linux/ctype.h>
5#include <linux/highmem.h>
6#include <linux/inet.h>
7#include <linux/kthread.h>
8#include <linux/net.h>
9#include <linux/slab.h>
10#include <linux/socket.h>
11#include <linux/string.h>
12#include <net/tcp.h>
13
14#include "super.h"
15#include "messenger.h"
16#include "decode.h"
17#include "pagelist.h"
18
19/*
20 * Ceph uses the messenger to exchange ceph_msg messages with other
21 * hosts in the system. The messenger provides ordered and reliable
22 * delivery. We tolerate TCP disconnects by reconnecting (with
23 * exponential backoff) in the case of a fault (disconnection, bad
24 * crc, protocol error). Acks allow sent messages to be discarded by
25 * the sender.
26 */
27
28/* static tag bytes (protocol control messages) */
29static char tag_msg = CEPH_MSGR_TAG_MSG;
30static char tag_ack = CEPH_MSGR_TAG_ACK;
31static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
32
33
34static void queue_con(struct ceph_connection *con);
35static void con_work(struct work_struct *);
36static void ceph_fault(struct ceph_connection *con);
37
38const char *ceph_name_type_str(int t)
39{
40 switch (t) {
41 case CEPH_ENTITY_TYPE_MON: return "mon";
42 case CEPH_ENTITY_TYPE_MDS: return "mds";
43 case CEPH_ENTITY_TYPE_OSD: return "osd";
44 case CEPH_ENTITY_TYPE_CLIENT: return "client";
45 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
46 default: return "???";
47 }
48}
49
50/*
51 * nicely render a sockaddr as a string.
52 */
53#define MAX_ADDR_STR 20
54static char addr_str[MAX_ADDR_STR][60]; /* fits a full IPv6 addr + port */
55static DEFINE_SPINLOCK(addr_str_lock);
56static int last_addr_str;
57
58const char *pr_addr(const struct sockaddr_storage *ss)
59{
60 int i;
61 char *s;
62 struct sockaddr_in *in4 = (void *)ss;
63 unsigned char *quad = (void *)&in4->sin_addr.s_addr;
64 struct sockaddr_in6 *in6 = (void *)ss;
65
66 spin_lock(&addr_str_lock);
67 i = last_addr_str++;
68 if (last_addr_str == MAX_ADDR_STR)
69 last_addr_str = 0;
70 spin_unlock(&addr_str_lock);
71 s = addr_str[i];
72
73 switch (ss->ss_family) {
74 case AF_INET:
75 sprintf(s, "%u.%u.%u.%u:%u",
76 (unsigned int)quad[0],
77 (unsigned int)quad[1],
78 (unsigned int)quad[2],
79 (unsigned int)quad[3],
80 (unsigned int)ntohs(in4->sin_port));
81 break;
82
83 case AF_INET6:
84 		sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u",
85 			ntohs(in6->sin6_addr.s6_addr16[0]),
86 			ntohs(in6->sin6_addr.s6_addr16[1]),
87 			ntohs(in6->sin6_addr.s6_addr16[2]),
88 			ntohs(in6->sin6_addr.s6_addr16[3]),
89 			ntohs(in6->sin6_addr.s6_addr16[4]),
90 			ntohs(in6->sin6_addr.s6_addr16[5]),
91 			ntohs(in6->sin6_addr.s6_addr16[6]),
92 			ntohs(in6->sin6_addr.s6_addr16[7]),
93 			(unsigned int)ntohs(in6->sin6_port));
94 break;
95
96 default:
97 sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
98 }
99
100 return s;
101}
102
103static void encode_my_addr(struct ceph_messenger *msgr)
104{
105 memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
106 ceph_encode_addr(&msgr->my_enc_addr);
107}
108
109/*
110 * work queue for all reading and writing to/from the socket.
111 */
112struct workqueue_struct *ceph_msgr_wq;
113
114int __init ceph_msgr_init(void)
115{
116 	/* create_workqueue() returns NULL on failure, not ERR_PTR */
117 	ceph_msgr_wq = create_workqueue("ceph-msgr");
118 	if (!ceph_msgr_wq) {
119 		pr_err("msgr_init failed to create workqueue\n");
120 		return -ENOMEM;
121 	}
122 	return 0;
123}
125
126void ceph_msgr_exit(void)
127{
128 destroy_workqueue(ceph_msgr_wq);
129}
130
131/*
132 * socket callback functions
133 */
134
135/* data available on socket, or listen socket received a connect */
136static void ceph_data_ready(struct sock *sk, int count_unused)
137{
138 struct ceph_connection *con =
139 (struct ceph_connection *)sk->sk_user_data;
140 if (sk->sk_state != TCP_CLOSE_WAIT) {
141 dout("ceph_data_ready on %p state = %lu, queueing work\n",
142 con, con->state);
143 queue_con(con);
144 }
145}
146
147/* socket has buffer space for writing */
148static void ceph_write_space(struct sock *sk)
149{
150 struct ceph_connection *con =
151 (struct ceph_connection *)sk->sk_user_data;
152
153 /* only queue to workqueue if there is data we want to write. */
154 if (test_bit(WRITE_PENDING, &con->state)) {
155 dout("ceph_write_space %p queueing write work\n", con);
156 queue_con(con);
157 } else {
158 dout("ceph_write_space %p nothing to write\n", con);
159 }
160
161 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
162 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
163}
164
165/* socket's state has changed */
166static void ceph_state_change(struct sock *sk)
167{
168 struct ceph_connection *con =
169 (struct ceph_connection *)sk->sk_user_data;
170
171 dout("ceph_state_change %p state = %lu sk_state = %u\n",
172 con, con->state, sk->sk_state);
173
174 if (test_bit(CLOSED, &con->state))
175 return;
176
177 switch (sk->sk_state) {
178 case TCP_CLOSE:
179 dout("ceph_state_change TCP_CLOSE\n");
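180 		/* fall through */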
180 case TCP_CLOSE_WAIT:
181 dout("ceph_state_change TCP_CLOSE_WAIT\n");
182 if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
183 if (test_bit(CONNECTING, &con->state))
184 con->error_msg = "connection failed";
185 else
186 con->error_msg = "socket closed";
187 queue_con(con);
188 }
189 break;
190 case TCP_ESTABLISHED:
191 dout("ceph_state_change TCP_ESTABLISHED\n");
192 queue_con(con);
193 break;
194 }
195}
196
197/*
198 * set up socket callbacks
199 */
200static void set_sock_callbacks(struct socket *sock,
201 struct ceph_connection *con)
202{
203 struct sock *sk = sock->sk;
204 sk->sk_user_data = (void *)con;
205 sk->sk_data_ready = ceph_data_ready;
206 sk->sk_write_space = ceph_write_space;
207 sk->sk_state_change = ceph_state_change;
208}
209
210
211/*
212 * socket helpers
213 */
214
215/*
216 * initiate connection to a remote socket.
217 */
218static struct socket *ceph_tcp_connect(struct ceph_connection *con)
219{
220 struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
221 struct socket *sock;
222 int ret;
223
224 BUG_ON(con->sock);
225 	ret = sock_create_kern(paddr->sa_family, SOCK_STREAM, IPPROTO_TCP,
226 			       &sock);
226 if (ret)
227 return ERR_PTR(ret);
228 con->sock = sock;
229 sock->sk->sk_allocation = GFP_NOFS;
230
231 set_sock_callbacks(sock, con);
232
233 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
234
235 	ret = sock->ops->connect(sock, paddr, sizeof(con->peer_addr.in_addr),
236 				 O_NONBLOCK);
236 if (ret == -EINPROGRESS) {
237 dout("connect %s EINPROGRESS sk_state = %u\n",
238 pr_addr(&con->peer_addr.in_addr),
239 sock->sk->sk_state);
240 ret = 0;
241 }
242 if (ret < 0) {
243 pr_err("connect %s error %d\n",
244 pr_addr(&con->peer_addr.in_addr), ret);
245 sock_release(sock);
246 con->sock = NULL;
247 con->error_msg = "connect error";
248 }
249
250 if (ret < 0)
251 return ERR_PTR(ret);
252 return sock;
253}
254
255static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
256{
257 struct kvec iov = {buf, len};
258 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
259
260 return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
261}
262
263/*
264 * write something. @more is true if caller will be sending more data
265 * shortly.
266 */
267static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
268 size_t kvlen, size_t len, int more)
269{
270 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
271
272 if (more)
273 msg.msg_flags |= MSG_MORE;
274 else
275 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
276
277 return kernel_sendmsg(sock, &msg, iov, kvlen, len);
278}
279
280
281/*
282 * Shutdown/close the socket for the given connection.
283 */
284static int con_close_socket(struct ceph_connection *con)
285{
286 int rc;
287
288 dout("con_close_socket on %p sock %p\n", con, con->sock);
289 if (!con->sock)
290 return 0;
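291 	/* mark SOCK_CLOSED so the state_change callback, which may fire
292 	 * during shutdown, knows this close is deliberate */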
291 set_bit(SOCK_CLOSED, &con->state);
292 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
293 sock_release(con->sock);
294 con->sock = NULL;
295 clear_bit(SOCK_CLOSED, &con->state);
296 return rc;
297}
298
299/*
300 * Reset a connection. Discard all incoming and outgoing messages
301 * and clear *_seq state.
302 */
303static void ceph_msg_remove(struct ceph_msg *msg)
304{
305 list_del_init(&msg->list_head);
306 ceph_msg_put(msg);
307}
308static void ceph_msg_remove_list(struct list_head *head)
309{
310 while (!list_empty(head)) {
311 struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
312 list_head);
313 ceph_msg_remove(msg);
314 }
315}
316
317static void reset_connection(struct ceph_connection *con)
318{
319 /* reset connection, out_queue, msg_ and connect_seq */
320 /* discard existing out_queue and msg_seq */
321 ceph_msg_remove_list(&con->out_queue);
322 ceph_msg_remove_list(&con->out_sent);
323
324 if (con->in_msg) {
325 ceph_msg_put(con->in_msg);
326 con->in_msg = NULL;
327 }
328
329 con->connect_seq = 0;
330 con->out_seq = 0;
331 if (con->out_msg) {
332 ceph_msg_put(con->out_msg);
333 con->out_msg = NULL;
334 }
335 con->in_seq = 0;
336}
337
338/*
339 * mark a peer down. drop any open connections.
340 */
341void ceph_con_close(struct ceph_connection *con)
342{
343 dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
344 set_bit(CLOSED, &con->state); /* in case there's queued work */
345 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
346 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
347 clear_bit(KEEPALIVE_PENDING, &con->state);
348 clear_bit(WRITE_PENDING, &con->state);
349 mutex_lock(&con->mutex);
350 reset_connection(con);
351 cancel_delayed_work(&con->work);
352 mutex_unlock(&con->mutex);
353 queue_con(con);
354}
355
356/*
357 * Reopen a closed connection, with a new peer address.
358 */
359void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
360{
361 dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
362 set_bit(OPENING, &con->state);
363 clear_bit(CLOSED, &con->state);
364 memcpy(&con->peer_addr, addr, sizeof(*addr));
365 con->delay = 0; /* reset backoff memory */
366 queue_con(con);
367}
368
369/*
370 * return true if this connection ever successfully opened
371 */
372bool ceph_con_opened(struct ceph_connection *con)
373{
374 return con->connect_seq > 0;
375}
376
377/*
378 * generic get/put
379 */
380struct ceph_connection *ceph_con_get(struct ceph_connection *con)
381{
382 dout("con_get %p nref = %d -> %d\n", con,
383 atomic_read(&con->nref), atomic_read(&con->nref) + 1);
384 if (atomic_inc_not_zero(&con->nref))
385 return con;
386 return NULL;
387}
388
389void ceph_con_put(struct ceph_connection *con)
390{
391 dout("con_put %p nref = %d -> %d\n", con,
392 atomic_read(&con->nref), atomic_read(&con->nref) - 1);
393 BUG_ON(atomic_read(&con->nref) == 0);
394 if (atomic_dec_and_test(&con->nref)) {
395 BUG_ON(con->sock);
396 kfree(con);
397 }
398}
399
400/*
401 * initialize a new connection.
402 */
403void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
404{
405 dout("con_init %p\n", con);
406 memset(con, 0, sizeof(*con));
407 atomic_set(&con->nref, 1);
408 con->msgr = msgr;
409 mutex_init(&con->mutex);
410 INIT_LIST_HEAD(&con->out_queue);
411 INIT_LIST_HEAD(&con->out_sent);
412 INIT_DELAYED_WORK(&con->work, con_work);
413}
414
415
416/*
417 * We maintain a global counter to order connection attempts. Get
418 * a unique seq greater than @gt.
419 */
420static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
421{
422 u32 ret;
423
424 spin_lock(&msgr->global_seq_lock);
425 if (msgr->global_seq < gt)
426 msgr->global_seq = gt;
427 ret = ++msgr->global_seq;
428 spin_unlock(&msgr->global_seq_lock);
429 return ret;
430}
431
432
433/*
434 * Prepare footer for currently outgoing message, and finish things
435 * off.  Assumes out_kvec* are already valid; we just add on to the end.
436 */
437static void prepare_write_message_footer(struct ceph_connection *con, int v)
438{
439 struct ceph_msg *m = con->out_msg;
440
441 dout("prepare_write_message_footer %p\n", con);
442 con->out_kvec_is_msg = true;
443 con->out_kvec[v].iov_base = &m->footer;
444 con->out_kvec[v].iov_len = sizeof(m->footer);
445 con->out_kvec_bytes += sizeof(m->footer);
446 con->out_kvec_left++;
447 con->out_more = m->more_to_follow;
448 con->out_msg_done = true;
449}
450
451/*
452 * Prepare headers for the next outgoing message.
453 */
454static void prepare_write_message(struct ceph_connection *con)
455{
456 struct ceph_msg *m;
457 int v = 0;
458
459 con->out_kvec_bytes = 0;
460 con->out_kvec_is_msg = true;
461 con->out_msg_done = false;
462
463 /* Sneak an ack in there first? If we can get it into the same
464 * TCP packet that's a good thing. */
465 if (con->in_seq > con->in_seq_acked) {
466 con->in_seq_acked = con->in_seq;
467 con->out_kvec[v].iov_base = &tag_ack;
468 con->out_kvec[v++].iov_len = 1;
469 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
470 con->out_kvec[v].iov_base = &con->out_temp_ack;
471 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
472 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
473 }
474
475 m = list_first_entry(&con->out_queue,
476 struct ceph_msg, list_head);
477 con->out_msg = m;
478 if (test_bit(LOSSYTX, &con->state)) {
479 list_del_init(&m->list_head);
480 } else {
481 /* put message on sent list */
482 ceph_msg_get(m);
483 list_move_tail(&m->list_head, &con->out_sent);
484 }
485
486 m->hdr.seq = cpu_to_le64(++con->out_seq);
487
488 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
489 m, con->out_seq, le16_to_cpu(m->hdr.type),
490 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
491 le32_to_cpu(m->hdr.data_len),
492 m->nr_pages);
493 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
494
495 /* tag + hdr + front + middle */
496 con->out_kvec[v].iov_base = &tag_msg;
497 con->out_kvec[v++].iov_len = 1;
498 con->out_kvec[v].iov_base = &m->hdr;
499 con->out_kvec[v++].iov_len = sizeof(m->hdr);
500 con->out_kvec[v++] = m->front;
501 if (m->middle)
502 con->out_kvec[v++] = m->middle->vec;
503 con->out_kvec_left = v;
504 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
505 (m->middle ? m->middle->vec.iov_len : 0);
506 con->out_kvec_cur = con->out_kvec;
507
508 /* fill in crc (except data pages), footer */
509 con->out_msg->hdr.crc =
510 cpu_to_le32(crc32c(0, (void *)&m->hdr,
511 sizeof(m->hdr) - sizeof(m->hdr.crc)));
512 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
513 con->out_msg->footer.front_crc =
514 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
515 if (m->middle)
516 con->out_msg->footer.middle_crc =
517 cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
518 m->middle->vec.iov_len));
519 else
520 con->out_msg->footer.middle_crc = 0;
521 con->out_msg->footer.data_crc = 0;
522 dout("prepare_write_message front_crc %u data_crc %u\n",
523 le32_to_cpu(con->out_msg->footer.front_crc),
524 le32_to_cpu(con->out_msg->footer.middle_crc));
525
526 /* is there a data payload? */
527 if (le32_to_cpu(m->hdr.data_len) > 0) {
528 /* initialize page iterator */
529 con->out_msg_pos.page = 0;
530 con->out_msg_pos.page_pos =
531 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
532 con->out_msg_pos.data_pos = 0;
533 con->out_msg_pos.did_page_crc = 0;
534 con->out_more = 1; /* data + footer will follow */
535 } else {
536 /* no, queue up footer too and be done */
537 prepare_write_message_footer(con, v);
538 }
539
540 set_bit(WRITE_PENDING, &con->state);
541}
542
543/*
544 * Prepare an ack.
545 */
546static void prepare_write_ack(struct ceph_connection *con)
547{
548 dout("prepare_write_ack %p %llu -> %llu\n", con,
549 con->in_seq_acked, con->in_seq);
550 con->in_seq_acked = con->in_seq;
551
552 con->out_kvec[0].iov_base = &tag_ack;
553 con->out_kvec[0].iov_len = 1;
554 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
555 con->out_kvec[1].iov_base = &con->out_temp_ack;
556 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
557 con->out_kvec_left = 2;
558 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
559 con->out_kvec_cur = con->out_kvec;
560 con->out_more = 1; /* more will follow.. eventually.. */
561 set_bit(WRITE_PENDING, &con->state);
562}
563
564/*
565 * Prepare to write keepalive byte.
566 */
567static void prepare_write_keepalive(struct ceph_connection *con)
568{
569 dout("prepare_write_keepalive %p\n", con);
570 con->out_kvec[0].iov_base = &tag_keepalive;
571 con->out_kvec[0].iov_len = 1;
572 con->out_kvec_left = 1;
573 con->out_kvec_bytes = 1;
574 con->out_kvec_cur = con->out_kvec;
575 set_bit(WRITE_PENDING, &con->state);
576}
577
578/*
579 * Connection negotiation.
580 */
581
582static void prepare_connect_authorizer(struct ceph_connection *con)
583{
584 void *auth_buf;
585 int auth_len = 0;
586 int auth_protocol = 0;
587
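588 	/* drop con->mutex across the callback: building an authorizer may
589 	 * block or take the auth client's own locks */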
588 mutex_unlock(&con->mutex);
589 if (con->ops->get_authorizer)
590 con->ops->get_authorizer(con, &auth_buf, &auth_len,
591 &auth_protocol, &con->auth_reply_buf,
592 &con->auth_reply_buf_len,
593 con->auth_retry);
594 mutex_lock(&con->mutex);
595
596 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
597 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
598
599 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
600 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
601 con->out_kvec_left++;
602 con->out_kvec_bytes += auth_len;
603}
604
605/*
606 * We connected to a peer and are saying hello.
607 */
608static void prepare_write_banner(struct ceph_messenger *msgr,
609 struct ceph_connection *con)
610{
611 int len = strlen(CEPH_BANNER);
612
613 con->out_kvec[0].iov_base = CEPH_BANNER;
614 con->out_kvec[0].iov_len = len;
615 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
616 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
617 con->out_kvec_left = 2;
618 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
619 con->out_kvec_cur = con->out_kvec;
620 con->out_more = 0;
621 set_bit(WRITE_PENDING, &con->state);
622}
623
624static void prepare_write_connect(struct ceph_messenger *msgr,
625 struct ceph_connection *con,
626 int after_banner)
627{
628 unsigned global_seq = get_global_seq(con->msgr, 0);
629 int proto;
630
631 switch (con->peer_name.type) {
632 case CEPH_ENTITY_TYPE_MON:
633 proto = CEPH_MONC_PROTOCOL;
634 break;
635 case CEPH_ENTITY_TYPE_OSD:
636 proto = CEPH_OSDC_PROTOCOL;
637 break;
638 case CEPH_ENTITY_TYPE_MDS:
639 proto = CEPH_MDSC_PROTOCOL;
640 break;
641 default:
642 BUG();
643 }
644
645 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
646 con->connect_seq, global_seq, proto);
647
648 	con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
649 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
650 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
651 con->out_connect.global_seq = cpu_to_le32(global_seq);
652 con->out_connect.protocol_version = cpu_to_le32(proto);
653 con->out_connect.flags = 0;
654
655 if (!after_banner) {
656 con->out_kvec_left = 0;
657 con->out_kvec_bytes = 0;
658 }
659 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
660 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
661 con->out_kvec_left++;
662 con->out_kvec_bytes += sizeof(con->out_connect);
663 con->out_kvec_cur = con->out_kvec;
664 con->out_more = 0;
665 set_bit(WRITE_PENDING, &con->state);
666
667 prepare_connect_authorizer(con);
668}
669
670
671/*
672 * write as much of pending kvecs to the socket as we can.
673 * 1 -> done
674 * 0 -> socket full, but more to do
675 * <0 -> error
676 */
677static int write_partial_kvec(struct ceph_connection *con)
678{
679 int ret;
680
681 dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
682 while (con->out_kvec_bytes > 0) {
683 ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
684 con->out_kvec_left, con->out_kvec_bytes,
685 con->out_more);
686 if (ret <= 0)
687 goto out;
688 con->out_kvec_bytes -= ret;
689 if (con->out_kvec_bytes == 0)
690 break; /* done */
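691 		/* advance the kvec cursor past whatever was fully sent,
692 		 * trimming any partially sent entry */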
691 while (ret > 0) {
692 if (ret >= con->out_kvec_cur->iov_len) {
693 ret -= con->out_kvec_cur->iov_len;
694 con->out_kvec_cur++;
695 con->out_kvec_left--;
696 } else {
697 con->out_kvec_cur->iov_len -= ret;
698 con->out_kvec_cur->iov_base += ret;
699 ret = 0;
700 break;
701 }
702 }
703 }
704 con->out_kvec_left = 0;
705 con->out_kvec_is_msg = false;
706 ret = 1;
707out:
708 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
709 con->out_kvec_bytes, con->out_kvec_left, ret);
710 return ret; /* done! */
711}
712
713/*
714 * Write as much message data payload as we can. If we finish, queue
715 * up the footer.
716 * 1 -> done, footer is now queued in out_kvec[].
717 * 0 -> socket full, but more to do
718 * <0 -> error
719 */
720static int write_partial_msg_pages(struct ceph_connection *con)
721{
722 struct ceph_msg *msg = con->out_msg;
723 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
724 size_t len;
725 	int crc = !con->msgr->nocrc; /* compute data crc unless 'nocrc' set */
726 int ret;
727
728 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
729 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
730 con->out_msg_pos.page_pos);
731
732 while (con->out_msg_pos.page < con->out_msg->nr_pages) {
733 struct page *page = NULL;
734 void *kaddr = NULL;
735
736 /*
737 * if we are calculating the data crc (the default), we need
738 * to map the page. if our pages[] has been revoked, use the
739 * zero page.
740 */
741 if (msg->pages) {
742 page = msg->pages[con->out_msg_pos.page];
743 if (crc)
744 kaddr = kmap(page);
745 } else if (msg->pagelist) {
746 page = list_first_entry(&msg->pagelist->head,
747 struct page, lru);
748 if (crc)
749 kaddr = kmap(page);
750 } else {
751 page = con->msgr->zero_page;
752 if (crc)
753 kaddr = page_address(con->msgr->zero_page);
754 }
755 len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
756 (int)(data_len - con->out_msg_pos.data_pos));
757 if (crc && !con->out_msg_pos.did_page_crc) {
758 void *base = kaddr + con->out_msg_pos.page_pos;
759 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
760
761 BUG_ON(kaddr == NULL);
762 con->out_msg->footer.data_crc =
763 cpu_to_le32(crc32c(tmpcrc, base, len));
764 con->out_msg_pos.did_page_crc = 1;
765 }
766
767 ret = kernel_sendpage(con->sock, page,
768 con->out_msg_pos.page_pos, len,
769 MSG_DONTWAIT | MSG_NOSIGNAL |
770 MSG_MORE);
771
772 if (crc && (msg->pages || msg->pagelist))
773 kunmap(page);
774
775 if (ret <= 0)
776 goto out;
777
778 con->out_msg_pos.data_pos += ret;
779 con->out_msg_pos.page_pos += ret;
780 if (ret == len) {
781 con->out_msg_pos.page_pos = 0;
782 con->out_msg_pos.page++;
783 con->out_msg_pos.did_page_crc = 0;
784 if (msg->pagelist)
785 list_move_tail(&page->lru,
786 &msg->pagelist->head);
787 }
788 }
789
790 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
791
792 /* prepare and queue up footer, too */
793 if (!crc)
794 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
795 con->out_kvec_bytes = 0;
796 con->out_kvec_left = 0;
797 con->out_kvec_cur = con->out_kvec;
798 prepare_write_message_footer(con, 0);
799 ret = 1;
800out:
801 return ret;
802}
803
804/*
805 * write some zeros
806 */
807static int write_partial_skip(struct ceph_connection *con)
808{
809 int ret;
810
811 while (con->out_skip > 0) {
812 struct kvec iov = {
813 .iov_base = page_address(con->msgr->zero_page),
814 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
815 };
816
817 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
818 if (ret <= 0)
819 goto out;
820 con->out_skip -= ret;
821 }
822 ret = 1;
823out:
824 return ret;
825}
826
827/*
828 * Prepare to read connection handshake, or an ack.
829 */
830static void prepare_read_banner(struct ceph_connection *con)
831{
832 dout("prepare_read_banner %p\n", con);
833 con->in_base_pos = 0;
834}
835
836static void prepare_read_connect(struct ceph_connection *con)
837{
838 dout("prepare_read_connect %p\n", con);
839 con->in_base_pos = 0;
840}
841
842static void prepare_read_ack(struct ceph_connection *con)
843{
844 dout("prepare_read_ack %p\n", con);
845 con->in_base_pos = 0;
846}
847
848static void prepare_read_tag(struct ceph_connection *con)
849{
850 dout("prepare_read_tag %p\n", con);
851 con->in_base_pos = 0;
852 con->in_tag = CEPH_MSGR_TAG_READY;
853}
854
855/*
856 * Prepare to read a message.
857 */
858static int prepare_read_message(struct ceph_connection *con)
859{
860 dout("prepare_read_message %p\n", con);
861 BUG_ON(con->in_msg != NULL);
862 con->in_base_pos = 0;
863 con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
864 return 0;
865}
866
867
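868/*
869 * Read up to @size bytes of @object from the socket.  @*to is advanced
870 * by @size, so a sequence of read_partial() calls can share
871 * con->in_base_pos as a single progress cursor and resume cleanly after
872 * a short read.  Returns 1 when @object is complete, else the (<= 0)
873 * result of the last recvmsg.
874 */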
868static int read_partial(struct ceph_connection *con,
869 int *to, int size, void *object)
870{
871 *to += size;
872 while (con->in_base_pos < *to) {
873 int left = *to - con->in_base_pos;
874 int have = size - left;
875 int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
876 if (ret <= 0)
877 return ret;
878 con->in_base_pos += ret;
879 }
880 return 1;
881}
882
883
884/*
885 * Read all or part of the connect-side handshake on a new connection
886 */
887static int read_partial_banner(struct ceph_connection *con)
888{
889 int ret, to = 0;
890
891 dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
892
893 /* peer's banner */
894 ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
895 if (ret <= 0)
896 goto out;
897 ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
898 &con->actual_peer_addr);
899 if (ret <= 0)
900 goto out;
901 ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
902 &con->peer_addr_for_me);
903 if (ret <= 0)
904 goto out;
905out:
906 return ret;
907}
908
909static int read_partial_connect(struct ceph_connection *con)
910{
911 int ret, to = 0;
912
913 dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
914
915 ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
916 if (ret <= 0)
917 goto out;
918 ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
919 con->auth_reply_buf);
920 if (ret <= 0)
921 goto out;
922
923 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
924 con, (int)con->in_reply.tag,
925 le32_to_cpu(con->in_reply.connect_seq),
926 le32_to_cpu(con->in_reply.global_seq));
927out:
928 return ret;
929}
931
932/*
933 * Verify the hello banner looks okay.
934 */
935static int verify_hello(struct ceph_connection *con)
936{
937 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
938 pr_err("connect to %s got bad banner\n",
939 pr_addr(&con->peer_addr.in_addr));
940 con->error_msg = "protocol error, bad banner";
941 return -1;
942 }
943 return 0;
944}
945
946static bool addr_is_blank(struct sockaddr_storage *ss)
947{
948 switch (ss->ss_family) {
949 case AF_INET:
950 return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
951 case AF_INET6:
952 return
953 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
954 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
955 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
956 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
957 }
958 return false;
959}
960
961static int addr_port(struct sockaddr_storage *ss)
962{
963 switch (ss->ss_family) {
964 case AF_INET:
965 return ntohs(((struct sockaddr_in *)ss)->sin_port);
966 case AF_INET6:
967 return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
968 }
969 return 0;
970}
971
972static void addr_set_port(struct sockaddr_storage *ss, int p)
973{
974 switch (ss->ss_family) {
975 	case AF_INET:
976 		((struct sockaddr_in *)ss)->sin_port = htons(p);
977 		break;
978 	case AF_INET6:
979 		((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
980 		break;
981 	}
980}
981
982/*
983 * Parse an ip[:port] list into an addr array. Use the default
984 * monitor port if a port isn't specified.
985 */
986int ceph_parse_ips(const char *c, const char *end,
987 struct ceph_entity_addr *addr,
988 int max_count, int *count)
989{
990 int i;
991 const char *p = c;
992
993 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
994 for (i = 0; i < max_count; i++) {
995 const char *ipend;
996 struct sockaddr_storage *ss = &addr[i].in_addr;
997 struct sockaddr_in *in4 = (void *)ss;
998 struct sockaddr_in6 *in6 = (void *)ss;
999 int port;
1000
1001 memset(ss, 0, sizeof(*ss));
1002 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1003 ',', &ipend)) {
1004 ss->ss_family = AF_INET;
1005 } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1006 ',', &ipend)) {
1007 ss->ss_family = AF_INET6;
1008 } else {
1009 goto bad;
1010 }
1011 p = ipend;
1012
1013 /* port? */
1014 if (p < end && *p == ':') {
1015 port = 0;
1016 p++;
1017 while (p < end && *p >= '0' && *p <= '9') {
1018 port = (port * 10) + (*p - '0');
1019 p++;
1020 }
1021 if (port > 65535 || port == 0)
1022 goto bad;
1023 } else {
1024 port = CEPH_MON_PORT;
1025 }
1026
1027 addr_set_port(ss, port);
1028
1029 dout("parse_ips got %s\n", pr_addr(ss));
1030
1031 if (p == end)
1032 break;
1033 if (*p != ',')
1034 goto bad;
1035 p++;
1036 }
1037
1038 if (p != end)
1039 goto bad;
1040
1041 if (count)
1042 *count = i + 1;
1043 return 0;
1044
1045bad:
1046 	pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
1047 return -EINVAL;
1048}
1049
1050static int process_banner(struct ceph_connection *con)
1051{
1052 dout("process_banner on %p\n", con);
1053
1054 if (verify_hello(con) < 0)
1055 return -1;
1056
1057 ceph_decode_addr(&con->actual_peer_addr);
1058 ceph_decode_addr(&con->peer_addr_for_me);
1059
1060 /*
1061 	 * Make sure the other end is who we wanted.  Note that the other
1062 	 * end may not yet know its IP address, so if it's 0.0.0.0, give
1063 	 * it the benefit of the doubt.
1064 */
1065 if (memcmp(&con->peer_addr, &con->actual_peer_addr,
1066 sizeof(con->peer_addr)) != 0 &&
1067 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1068 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1069 pr_warning("wrong peer, want %s/%lld, got %s/%lld\n",
1070 pr_addr(&con->peer_addr.in_addr),
1071 le64_to_cpu(con->peer_addr.nonce),
1072 pr_addr(&con->actual_peer_addr.in_addr),
1073 le64_to_cpu(con->actual_peer_addr.nonce));
1074 con->error_msg = "wrong peer at address";
1075 return -1;
1076 }
1077
1078 /*
1079 * did we learn our address?
1080 */
1081 if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
1082 int port = addr_port(&con->msgr->inst.addr.in_addr);
1083
1084 memcpy(&con->msgr->inst.addr.in_addr,
1085 &con->peer_addr_for_me.in_addr,
1086 sizeof(con->peer_addr_for_me.in_addr));
1087 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1088 encode_my_addr(con->msgr);
1089 dout("process_banner learned my addr is %s\n",
1090 pr_addr(&con->msgr->inst.addr.in_addr));
1091 }
1092
1093 set_bit(NEGOTIATING, &con->state);
1094 prepare_read_connect(con);
1095 return 0;
1096}
1097
1098static void fail_protocol(struct ceph_connection *con)
1099{
1100 reset_connection(con);
1101 set_bit(CLOSED, &con->state); /* in case there's queued work */
1102
1103 mutex_unlock(&con->mutex);
1104 if (con->ops->bad_proto)
1105 con->ops->bad_proto(con);
1106 mutex_lock(&con->mutex);
1107}
1108
1109static int process_connect(struct ceph_connection *con)
1110{
1111 u64 sup_feat = CEPH_FEATURE_SUPPORTED;
1112 u64 req_feat = CEPH_FEATURE_REQUIRED;
1113 u64 server_feat = le64_to_cpu(con->in_reply.features);
1114
1115 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
1116
1117 switch (con->in_reply.tag) {
1118 case CEPH_MSGR_TAG_FEATURES:
1119 pr_err("%s%lld %s feature set mismatch,"
1120 " my %llx < server's %llx, missing %llx\n",
1121 ENTITY_NAME(con->peer_name),
1122 pr_addr(&con->peer_addr.in_addr),
1123 sup_feat, server_feat, server_feat & ~sup_feat);
1124 con->error_msg = "missing required protocol features";
1125 fail_protocol(con);
1126 return -1;
1127
1128 case CEPH_MSGR_TAG_BADPROTOVER:
1129 pr_err("%s%lld %s protocol version mismatch,"
1130 " my %d != server's %d\n",
1131 ENTITY_NAME(con->peer_name),
1132 pr_addr(&con->peer_addr.in_addr),
1133 le32_to_cpu(con->out_connect.protocol_version),
1134 le32_to_cpu(con->in_reply.protocol_version));
1135 con->error_msg = "protocol version mismatch";
1136 fail_protocol(con);
1137 return -1;
1138
1139 case CEPH_MSGR_TAG_BADAUTHORIZER:
1140 con->auth_retry++;
1141 dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
1142 con->auth_retry);
1143 if (con->auth_retry == 2) {
1144 con->error_msg = "connect authorization failure";
1145 reset_connection(con);
1146 set_bit(CLOSED, &con->state);
1147 return -1;
1148 }
1149 con->auth_retry = 1;
1150 prepare_write_connect(con->msgr, con, 0);
1151 prepare_read_connect(con);
1152 break;
1153
1154 case CEPH_MSGR_TAG_RESETSESSION:
1155 /*
1156 * If we connected with a large connect_seq but the peer
1157 * has no record of a session with us (no connection, or
1158 * connect_seq == 0), they will send RESETSESION to indicate
1159 * that they must have reset their session, and may have
1160 * dropped messages.
1161 */
1162 dout("process_connect got RESET peer seq %u\n",
1163 le32_to_cpu(con->in_connect.connect_seq));
1164 pr_err("%s%lld %s connection reset\n",
1165 ENTITY_NAME(con->peer_name),
1166 pr_addr(&con->peer_addr.in_addr));
1167 reset_connection(con);
1168 prepare_write_connect(con->msgr, con, 0);
1169 prepare_read_connect(con);
1170
1171 /* Tell ceph about it. */
1172 mutex_unlock(&con->mutex);
1173 pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
1174 if (con->ops->peer_reset)
1175 con->ops->peer_reset(con);
1176 mutex_lock(&con->mutex);
1177 break;
1178
1179 case CEPH_MSGR_TAG_RETRY_SESSION:
1180 /*
1181 * If we sent a smaller connect_seq than the peer has, try
1182 * again with a larger value.
1183 */
1184 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
1185 le32_to_cpu(con->out_connect.connect_seq),
1186 le32_to_cpu(con->in_connect.connect_seq));
1187 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
1188 prepare_write_connect(con->msgr, con, 0);
1189 prepare_read_connect(con);
1190 break;
1191
1192 case CEPH_MSGR_TAG_RETRY_GLOBAL:
1193 /*
1194 * If we sent a smaller global_seq than the peer has, try
1195 * again with a larger value.
1196 */
1197 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1198 con->peer_global_seq,
1199 le32_to_cpu(con->in_connect.global_seq));
1200 get_global_seq(con->msgr,
1201 le32_to_cpu(con->in_connect.global_seq));
1202 prepare_write_connect(con->msgr, con, 0);
1203 prepare_read_connect(con);
1204 break;
1205
1206 case CEPH_MSGR_TAG_READY:
1207 if (req_feat & ~server_feat) {
1208 pr_err("%s%lld %s protocol feature mismatch,"
1209 " my required %llx > server's %llx, need %llx\n",
1210 ENTITY_NAME(con->peer_name),
1211 pr_addr(&con->peer_addr.in_addr),
1212 req_feat, server_feat, req_feat & ~server_feat);
1213 con->error_msg = "missing required protocol features";
1214 fail_protocol(con);
1215 return -1;
1216 }
1217 clear_bit(CONNECTING, &con->state);
1218 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1219 con->connect_seq++;
1220 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1221 con->peer_global_seq,
1222 le32_to_cpu(con->in_reply.connect_seq),
1223 con->connect_seq);
1224 WARN_ON(con->connect_seq !=
1225 le32_to_cpu(con->in_reply.connect_seq));
1226
1227 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1228 set_bit(LOSSYTX, &con->state);
1229
1230 prepare_read_tag(con);
1231 break;
1232
1233 case CEPH_MSGR_TAG_WAIT:
1234 /*
1235 * If there is a connection race (we are opening
1236 * connections to each other), one of us may just have
1237 * to WAIT. This shouldn't happen if we are the
1238 * client.
1239 */
1240 pr_err("process_connect peer connecting WAIT\n");
1241
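1242 		/* fall through: treat WAIT as a connect error and retry */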
1242 default:
1243 pr_err("connect protocol error, will retry\n");
1244 con->error_msg = "protocol error, garbage tag during connect";
1245 return -1;
1246 }
1247 return 0;
1248}
1249
1250
1251/*
1252 * read (part of) an ack
1253 */
1254static int read_partial_ack(struct ceph_connection *con)
1255{
1256 int to = 0;
1257
1258 return read_partial(con, &to, sizeof(con->in_temp_ack),
1259 &con->in_temp_ack);
1260}
1261
1262
1263/*
1264 * We can finally discard anything that's been acked.
1265 */
1266static void process_ack(struct ceph_connection *con)
1267{
1268 struct ceph_msg *m;
1269 u64 ack = le64_to_cpu(con->in_temp_ack);
1270 u64 seq;
1271
1272 while (!list_empty(&con->out_sent)) {
1273 m = list_first_entry(&con->out_sent, struct ceph_msg,
1274 list_head);
1275 seq = le64_to_cpu(m->hdr.seq);
1276 if (seq > ack)
1277 break;
1278 dout("got ack for seq %llu type %d at %p\n", seq,
1279 le16_to_cpu(m->hdr.type), m);
1280 ceph_msg_remove(m);
1281 }
1282 prepare_read_tag(con);
1283}
1284
1285
1286
1287
1288static int read_partial_message_section(struct ceph_connection *con,
1289 struct kvec *section, unsigned int sec_len,
1290 u32 *crc)
1291{
1292 int left;
1293 int ret;
1294
1295 BUG_ON(!section);
1296
1297 while (section->iov_len < sec_len) {
1298 BUG_ON(section->iov_base == NULL);
1299 left = sec_len - section->iov_len;
1300 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
1301 section->iov_len, left);
1302 if (ret <= 0)
1303 return ret;
1304 section->iov_len += ret;
1305 if (section->iov_len == sec_len)
1306 *crc = crc32c(0, section->iov_base,
1307 section->iov_len);
1308 }
1309
1310 return 1;
1311}
1312
1313static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1314 struct ceph_msg_header *hdr,
1315 int *skip);
1316/*
1317 * read (part of) a message.
1318 */
1319static int read_partial_message(struct ceph_connection *con)
1320{
1321 struct ceph_msg *m = con->in_msg;
1322 void *p;
1323 int ret;
1324 int to, left;
1325 unsigned front_len, middle_len, data_len, data_off;
1326 int datacrc = con->msgr->nocrc;
1327 int skip;
1328
1329 dout("read_partial_message con %p msg %p\n", con, m);
1330
1331 /* header */
1332 while (con->in_base_pos < sizeof(con->in_hdr)) {
1333 left = sizeof(con->in_hdr) - con->in_base_pos;
1334 ret = ceph_tcp_recvmsg(con->sock,
1335 (char *)&con->in_hdr + con->in_base_pos,
1336 left);
1337 if (ret <= 0)
1338 return ret;
1339 con->in_base_pos += ret;
1340 if (con->in_base_pos == sizeof(con->in_hdr)) {
1341 u32 crc = crc32c(0, (void *)&con->in_hdr,
1342 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1343 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1344 pr_err("read_partial_message bad hdr crc %u"
1345 " != expected %u\n",
1346 crc, le32_to_cpu(con->in_hdr.crc));
1347 return -EBADMSG;
1348 }
1349 }
1350 }
1351 front_len = le32_to_cpu(con->in_hdr.front_len);
1352 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1353 return -EIO;
1354 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1355 if (middle_len > CEPH_MSG_MAX_DATA_LEN)
1356 return -EIO;
1357 data_len = le32_to_cpu(con->in_hdr.data_len);
1358 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1359 return -EIO;
1360 data_off = le16_to_cpu(con->in_hdr.data_off);
1361
1362 /* allocate message? */
1363 if (!con->in_msg) {
1364 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1365 con->in_hdr.front_len, con->in_hdr.data_len);
1366 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1367 if (skip) {
1368 /* skip this message */
1369 dout("alloc_msg returned NULL, skipping message\n");
1370 con->in_base_pos = -front_len - middle_len - data_len -
1371 sizeof(m->footer);
1372 con->in_tag = CEPH_MSGR_TAG_READY;
1373 return 0;
1374 }
1375 if (IS_ERR(con->in_msg)) {
1376 ret = PTR_ERR(con->in_msg);
1377 con->in_msg = NULL;
1378 con->error_msg =
1379 "error allocating memory for incoming message";
1380 return ret;
1381 }
1382 m = con->in_msg;
1383 m->front.iov_len = 0; /* haven't read it yet */
1384 if (m->middle)
1385 m->middle->vec.iov_len = 0;
1386
1387 con->in_msg_pos.page = 0;
1388 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1389 con->in_msg_pos.data_pos = 0;
1390 }
1391
1392 /* front */
1393 ret = read_partial_message_section(con, &m->front, front_len,
1394 &con->in_front_crc);
1395 if (ret <= 0)
1396 return ret;
1397
1398 /* middle */
1399 if (m->middle) {
1400 ret = read_partial_message_section(con, &m->middle->vec, middle_len,
1401 &con->in_middle_crc);
1402 if (ret <= 0)
1403 return ret;
1404 }
1405
1406 /* (page) data */
1407 while (con->in_msg_pos.data_pos < data_len) {
1408 left = min((int)(data_len - con->in_msg_pos.data_pos),
1409 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1410 BUG_ON(m->pages == NULL);
1411 p = kmap(m->pages[con->in_msg_pos.page]);
1412 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1413 left);
1414 if (ret > 0 && datacrc)
1415 con->in_data_crc =
1416 crc32c(con->in_data_crc,
1417 p + con->in_msg_pos.page_pos, ret);
1418 kunmap(m->pages[con->in_msg_pos.page]);
1419 if (ret <= 0)
1420 return ret;
1421 con->in_msg_pos.data_pos += ret;
1422 con->in_msg_pos.page_pos += ret;
1423 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1424 con->in_msg_pos.page_pos = 0;
1425 con->in_msg_pos.page++;
1426 }
1427 }
1428
1429 /* footer */
1430 to = sizeof(m->hdr) + sizeof(m->footer);
1431 while (con->in_base_pos < to) {
1432 left = to - con->in_base_pos;
1433 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1434 (con->in_base_pos - sizeof(m->hdr)),
1435 left);
1436 if (ret <= 0)
1437 return ret;
1438 con->in_base_pos += ret;
1439 }
1440 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1441 m, front_len, m->footer.front_crc, middle_len,
1442 m->footer.middle_crc, data_len, m->footer.data_crc);
1443
1444 /* crc ok? */
1445 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
1446 pr_err("read_partial_message %p front crc %u != exp. %u\n",
1447 m, con->in_front_crc, le32_to_cpu(m->footer.front_crc));
1448 return -EBADMSG;
1449 }
1450 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
1451 pr_err("read_partial_message %p middle crc %u != exp %u\n",
1452 m, con->in_middle_crc, le32_to_cpu(m->footer.middle_crc));
1453 return -EBADMSG;
1454 }
1455 if (datacrc &&
1456 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1457 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1458 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1459 con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1460 return -EBADMSG;
1461 }
1462
1463 return 1; /* done! */
1464}
1465
1466/*
1467 * Process message. This happens in the worker thread. The callback should
1468 * be careful not to do anything that waits on other incoming messages or it
1469 * may deadlock.
1470 */
1471static void process_message(struct ceph_connection *con)
1472{
1473 struct ceph_msg *msg;
1474
1475 msg = con->in_msg;
1476 con->in_msg = NULL;
1477
1478 /* if first message, set peer_name */
1479 if (con->peer_name.type == 0)
1480 con->peer_name = msg->hdr.src.name;
1481
1482 con->in_seq++;
1483 mutex_unlock(&con->mutex);
1484
1485 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1486 msg, le64_to_cpu(msg->hdr.seq),
1487 ENTITY_NAME(msg->hdr.src.name),
1488 le16_to_cpu(msg->hdr.type),
1489 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1490 le32_to_cpu(msg->hdr.front_len),
1491 le32_to_cpu(msg->hdr.data_len),
1492 con->in_front_crc, con->in_middle_crc, con->in_data_crc);
1493 con->ops->dispatch(con, msg);
1494
1495 mutex_lock(&con->mutex);
1496 prepare_read_tag(con);
1497}
1498
1499
1500/*
1501 * Write something to the socket. Called in a worker thread when the
1502 * socket appears to be writeable and we have something ready to send.
1503 */
1504static int try_write(struct ceph_connection *con)
1505{
1506 struct ceph_messenger *msgr = con->msgr;
1507 int ret = 1;
1508
1509 dout("try_write start %p state %lu nref %d\n", con, con->state,
1510 atomic_read(&con->nref));
1511
1512 mutex_lock(&con->mutex);
1513more:
1514 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1515
1516 /* open the socket first? */
1517 if (con->sock == NULL) {
1518 /*
1519 * if we were STANDBY and are reconnecting _this_
1520 * connection, bump connect_seq now. Always bump
1521 * global_seq.
1522 */
1523 if (test_and_clear_bit(STANDBY, &con->state))
1524 con->connect_seq++;
1525
1526 prepare_write_banner(msgr, con);
1527 prepare_write_connect(msgr, con, 1);
1528 prepare_read_banner(con);
1529 set_bit(CONNECTING, &con->state);
1530 clear_bit(NEGOTIATING, &con->state);
1531
1532 BUG_ON(con->in_msg);
1533 con->in_tag = CEPH_MSGR_TAG_READY;
1534 dout("try_write initiating connect on %p new state %lu\n",
1535 con, con->state);
1536 con->sock = ceph_tcp_connect(con);
1537 if (IS_ERR(con->sock)) {
1538 con->sock = NULL;
1539 con->error_msg = "connect error";
1540 ret = -1;
1541 goto out;
1542 }
1543 }
1544
1545more_kvec:
1546 /* kvec data queued? */
1547 if (con->out_skip) {
1548 ret = write_partial_skip(con);
1549 if (ret <= 0)
1550 goto done;
1551 if (ret < 0) {
1552 dout("try_write write_partial_skip err %d\n", ret);
1553 goto done;
1554 }
1555 }
1556 if (con->out_kvec_left) {
1557 ret = write_partial_kvec(con);
1558 if (ret <= 0)
1559 goto done;
1560 }
1561
1562 /* msg pages? */
1563 if (con->out_msg) {
1564 if (con->out_msg_done) {
1565 ceph_msg_put(con->out_msg);
1566 con->out_msg = NULL; /* we're done with this one */
1567 goto do_next;
1568 }
1569
1570 ret = write_partial_msg_pages(con);
1571 if (ret == 1)
1572 goto more_kvec; /* we need to send the footer, too! */
1573 if (ret == 0)
1574 goto done;
1575 if (ret < 0) {
1576 dout("try_write write_partial_msg_pages err %d\n",
1577 ret);
1578 goto done;
1579 }
1580 }
1581
1582do_next:
1583 if (!test_bit(CONNECTING, &con->state)) {
1584 /* is anything else pending? */
1585 if (!list_empty(&con->out_queue)) {
1586 prepare_write_message(con);
1587 goto more;
1588 }
1589 if (con->in_seq > con->in_seq_acked) {
1590 prepare_write_ack(con);
1591 goto more;
1592 }
1593 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
1594 prepare_write_keepalive(con);
1595 goto more;
1596 }
1597 }
1598
1599 /* Nothing to do! */
1600 clear_bit(WRITE_PENDING, &con->state);
1601 dout("try_write nothing else to write.\n");
1602done:
1603 ret = 0;
1604out:
1605 mutex_unlock(&con->mutex);
1606 dout("try_write done on %p\n", con);
1607 return ret;
1608}
1609
1610
1611
1612/*
1613 * Read what we can from the socket.
1614 */
1615static int try_read(struct ceph_connection *con)
1616{
1617 struct ceph_messenger *msgr;
1618 int ret = -1;
1619
1620 if (!con->sock)
1621 return 0;
1622
1623 if (test_bit(STANDBY, &con->state))
1624 return 0;
1625
1626 dout("try_read start on %p\n", con);
1627 msgr = con->msgr;
1628
1629 mutex_lock(&con->mutex);
1630
1631more:
1632 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
1633 con->in_base_pos);
1634 if (test_bit(CONNECTING, &con->state)) {
1635 if (!test_bit(NEGOTIATING, &con->state)) {
1636 dout("try_read connecting\n");
1637 ret = read_partial_banner(con);
1638 if (ret <= 0)
1639 goto done;
1640 if (process_banner(con) < 0) {
1641 ret = -1;
1642 goto out;
1643 }
1644 }
1645 ret = read_partial_connect(con);
1646 if (ret <= 0)
1647 goto done;
1648 if (process_connect(con) < 0) {
1649 ret = -1;
1650 goto out;
1651 }
1652 goto more;
1653 }
1654
1655 if (con->in_base_pos < 0) {
1656 /*
1657 * skipping + discarding content.
1658 *
1659 * FIXME: there must be a better way to do this!
1660 */
1661 static char buf[1024];
1662 int skip = min(1024, -con->in_base_pos);
1663 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
1664 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
1665 if (ret <= 0)
1666 goto done;
1667 con->in_base_pos += ret;
1668 if (con->in_base_pos)
1669 goto more;
1670 }
1671 if (con->in_tag == CEPH_MSGR_TAG_READY) {
1672 /*
1673 * what's next?
1674 */
1675 ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
1676 if (ret <= 0)
1677 goto done;
1678 dout("try_read got tag %d\n", (int)con->in_tag);
1679 switch (con->in_tag) {
1680 case CEPH_MSGR_TAG_MSG:
1681 prepare_read_message(con);
1682 break;
1683 case CEPH_MSGR_TAG_ACK:
1684 prepare_read_ack(con);
1685 break;
1686 case CEPH_MSGR_TAG_CLOSE:
1687 set_bit(CLOSED, &con->state); /* fixme */
1688 goto done;
1689 default:
1690 goto bad_tag;
1691 }
1692 }
1693 if (con->in_tag == CEPH_MSGR_TAG_MSG) {
1694 ret = read_partial_message(con);
1695 if (ret <= 0) {
1696 switch (ret) {
1697 case -EBADMSG:
1698 con->error_msg = "bad crc";
1699 ret = -EIO;
1700 goto out;
1701 case -EIO:
1702 con->error_msg = "io error";
1703 goto out;
1704 default:
1705 goto done;
1706 }
1707 }
1708 if (con->in_tag == CEPH_MSGR_TAG_READY)
1709 goto more;
1710 process_message(con);
1711 goto more;
1712 }
1713 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
1714 ret = read_partial_ack(con);
1715 if (ret <= 0)
1716 goto done;
1717 process_ack(con);
1718 goto more;
1719 }
1720
1721done:
1722 ret = 0;
1723out:
1724 mutex_unlock(&con->mutex);
1725 dout("try_read done on %p\n", con);
1726 return ret;
1727
1728bad_tag:
1729 pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
1730 con->error_msg = "protocol error, garbage tag";
1731 ret = -1;
1732 goto out;
1733}
1734
1735
1736/*
1737 * Atomically queue work on a connection. Bump @con reference to
1738 * avoid races with connection teardown.
1739 *
1740 * There is some trickery going on with QUEUED and BUSY because we
1741 * only want a _single_ thread operating on each connection at any
1742 * point in time, but we want to use all available CPUs.
1743 *
1744 * The worker thread only proceeds if it can atomically set BUSY. It
1745 * clears QUEUED and does its thing. When it thinks it's done, it
1746 * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
1747 * (tries again to set BUSY).
1748 *
1749 * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
1750 * try to queue work. If that fails (the work is already queued, or
1751 * BUSY is set because the worker is already running), we give up but
1752 * leave QUEUED set so that the worker thread will loop if necessary.
1753 */
1754static void queue_con(struct ceph_connection *con)
1755{
1756 if (test_bit(DEAD, &con->state)) {
1757 dout("queue_con %p ignoring: DEAD\n",
1758 con);
1759 return;
1760 }
1761
1762 if (!con->ops->get(con)) {
1763 dout("queue_con %p ref count 0\n", con);
1764 return;
1765 }
1766
1767 set_bit(QUEUED, &con->state);
1768 if (test_bit(BUSY, &con->state)) {
1769 dout("queue_con %p - already BUSY\n", con);
1770 con->ops->put(con);
1771 } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
1772 dout("queue_con %p - already queued\n", con);
1773 con->ops->put(con);
1774 } else {
1775 dout("queue_con %p\n", con);
1776 }
1777}
1778
1779/*
1780 * Do some work on a connection. Drop a connection ref when we're done.
1781 */
1782static void con_work(struct work_struct *work)
1783{
1784 struct ceph_connection *con = container_of(work, struct ceph_connection,
1785 work.work);
1786 int backoff = 0;
1787
1788more:
1789 if (test_and_set_bit(BUSY, &con->state) != 0) {
1790 dout("con_work %p BUSY already set\n", con);
1791 goto out;
1792 }
1793 dout("con_work %p start, clearing QUEUED\n", con);
1794 clear_bit(QUEUED, &con->state);
1795
1796 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1797 dout("con_work CLOSED\n");
1798 con_close_socket(con);
1799 goto done;
1800 }
1801 if (test_and_clear_bit(OPENING, &con->state)) {
1802 /* reopen w/ new peer */
1803 dout("con_work OPENING\n");
1804 con_close_socket(con);
1805 }
1806
1807 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1808 try_read(con) < 0 ||
1809 try_write(con) < 0) {
1810 backoff = 1;
1811 ceph_fault(con); /* error/fault path */
1812 }
1813
1814done:
1815 clear_bit(BUSY, &con->state);
1816 dout("con->state=%lu\n", con->state);
1817 if (test_bit(QUEUED, &con->state)) {
1818 if (!backoff || test_bit(OPENING, &con->state)) {
1819 dout("con_work %p QUEUED reset, looping\n", con);
1820 goto more;
1821 }
1822 dout("con_work %p QUEUED reset, but just faulted\n", con);
1823 clear_bit(QUEUED, &con->state);
1824 }
1825 dout("con_work %p done\n", con);
1826
1827out:
1828 con->ops->put(con);
1829}
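/*
 * Sketch of the QUEUED/BUSY handshake implemented by queue_con() and
 * con_work() above, reduced to the bitops involved. The example_*
 * helpers are illustrative only (not part of this patch); the function
 * pointers stand in for the workqueue and for the read/write work.
 */
static void example_queue(unsigned long *state, void (*schedule_fn)(void))
{
	set_bit(QUEUED, state);			/* record the request first */
	if (!test_bit(BUSY, state))
		schedule_fn();			/* kick a worker */
	/* else: the running worker will notice QUEUED and loop */
}

static void example_worker(unsigned long *state, void (*work_fn)(void))
{
	while (test_and_set_bit(BUSY, state) == 0) {
		clear_bit(QUEUED, state);	/* consume this request */
		work_fn();			/* do the actual I/O */
		clear_bit(BUSY, state);
		if (!test_bit(QUEUED, state))
			return;			/* nothing re-queued; done */
		/* QUEUED reappeared; try to retake BUSY and go again */
	}
}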
1830
1831
1832/*
1833 * Generic error/fault handler. A retry mechanism is used with
1834 * exponential backoff
1835 */
1836static void ceph_fault(struct ceph_connection *con)
1837{
1838 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
1839 pr_addr(&con->peer_addr.in_addr), con->error_msg);
1840 dout("fault %p state %lu to peer %s\n",
1841 con, con->state, pr_addr(&con->peer_addr.in_addr));
1842
1843 if (test_bit(LOSSYTX, &con->state)) {
1844 dout("fault on LOSSYTX channel\n");
1845 goto out;
1846 }
1847
1848 mutex_lock(&con->mutex);
1849 if (test_bit(CLOSED, &con->state))
1850 goto out_unlock;
1851
1852 con_close_socket(con);
1853
1854 if (con->in_msg) {
1855 ceph_msg_put(con->in_msg);
1856 con->in_msg = NULL;
1857 }
1858
1859 /* Requeue anything that hasn't been acked */
1860 list_splice_init(&con->out_sent, &con->out_queue);
1861
1862 /* If there are no messages in the queue, place the connection
1863 * in a STANDBY state (i.e., don't try to reconnect just yet). */
1864 if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
1865 dout("fault setting STANDBY\n");
1866 set_bit(STANDBY, &con->state);
1867 } else {
1868 /* retry after a delay. */
1869 if (con->delay == 0)
1870 con->delay = BASE_DELAY_INTERVAL;
1871 else if (con->delay < MAX_DELAY_INTERVAL)
1872 con->delay *= 2;
1873 dout("fault queueing %p delay %lu\n", con, con->delay);
1874 con->ops->get(con);
1875 if (queue_delayed_work(ceph_msgr_wq, &con->work,
1876 round_jiffies_relative(con->delay)) == 0)
1877 con->ops->put(con);
1878 }
1879
1880out_unlock:
1881 mutex_unlock(&con->mutex);
1882out:
1883 /*
1884 * in case we faulted due to authentication, invalidate our
1885 * current tickets so that we can get new ones.
1886 */
1887 if (con->auth_retry && con->ops->invalidate_authorizer) {
1888 dout("calling invalidate_authorizer()\n");
1889 con->ops->invalidate_authorizer(con);
1890 }
1891
1892 if (con->ops->fault)
1893 con->ops->fault(con);
1894}
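/*
 * Backoff schedule used above (illustrative): BASE_DELAY_INTERVAL is
 * HZ/2 and MAX_DELAY_INTERVAL is 5*60*HZ, so consecutive faults wait
 * roughly 0.5s, 1s, 2s, 4s, ... A compressed, roughly equivalent form
 * (this one caps exactly at the maximum, whereas the open-coded logic
 * above may overshoot it once before settling):
 *
 *	delay = delay ? min(delay * 2, (unsigned long)MAX_DELAY_INTERVAL)
 *		      : BASE_DELAY_INTERVAL;
 */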
1895
1896
1897
1898/*
1899 * create a new messenger instance
1900 */
1901struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1902{
1903 struct ceph_messenger *msgr;
1904
1905 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
1906 if (msgr == NULL)
1907 return ERR_PTR(-ENOMEM);
1908
1909 spin_lock_init(&msgr->global_seq_lock);
1910
1911 /* the zero page is needed if a request is "canceled" while the message
1912 * is being written over the socket */
1913 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1914 if (!msgr->zero_page) {
1915 kfree(msgr);
1916 return ERR_PTR(-ENOMEM);
1917 }
1918 kmap(msgr->zero_page);
1919
1920 if (myaddr)
1921 msgr->inst.addr = *myaddr;
1922
1923 /* select a random nonce */
1924 msgr->inst.addr.type = 0;
1925 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
1926 encode_my_addr(msgr);
1927
1928 dout("messenger_create %p\n", msgr);
1929 return msgr;
1930}
1931
1932void ceph_messenger_destroy(struct ceph_messenger *msgr)
1933{
1934 dout("destroy %p\n", msgr);
1935 kunmap(msgr->zero_page);
1936 __free_page(msgr->zero_page);
1937 kfree(msgr);
1938 dout("destroyed messenger %p\n", msgr);
1939}
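/*
 * Example usage (sketch; the surrounding client code is hypothetical,
 * not part of this patch):
 *
 *	struct ceph_messenger *msgr = ceph_messenger_create(NULL);
 *
 *	if (IS_ERR(msgr))
 *		return PTR_ERR(msgr);
 *	...attach connections with ceph_con_init()/ceph_con_open()...
 *	ceph_messenger_destroy(msgr);
 *
 * Passing NULL for myaddr leaves the local address unset apart from
 * the random nonce; a caller may instead supply a specific address.
 */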
1940
1941/*
1942 * Queue up an outgoing message on the given connection.
1943 */
1944void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1945{
1946 if (test_bit(CLOSED, &con->state)) {
1947 dout("con_send %p closed, dropping %p\n", con, msg);
1948 ceph_msg_put(msg);
1949 return;
1950 }
1951
1952 /* set src+dst */
1953 msg->hdr.src.name = con->msgr->inst.name;
1954 msg->hdr.src.addr = con->msgr->my_enc_addr;
1955 msg->hdr.orig_src = msg->hdr.src;
1956
1957 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1958
1959 /* queue */
1960 mutex_lock(&con->mutex);
1961 BUG_ON(!list_empty(&msg->list_head));
1962 list_add_tail(&msg->list_head, &con->out_queue);
1963 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
1964 ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
1965 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1966 le32_to_cpu(msg->hdr.front_len),
1967 le32_to_cpu(msg->hdr.middle_len),
1968 le32_to_cpu(msg->hdr.data_len));
1969 mutex_unlock(&con->mutex);
1970
1971 /* if there wasn't anything waiting to send before, queue
1972 * new work */
1973 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
1974 queue_con(con);
1975}
1976
1977/*
1978 * Revoke a message that was previously queued for send
1979 */
1980void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
1981{
1982 mutex_lock(&con->mutex);
1983 if (!list_empty(&msg->list_head)) {
1984 dout("con_revoke %p msg %p\n", con, msg);
1985 list_del_init(&msg->list_head);
1986 ceph_msg_put(msg);
1987 msg->hdr.seq = 0;
1988 if (con->out_msg == msg) {
1989 ceph_msg_put(con->out_msg);
1990 con->out_msg = NULL;
1991 }
1992 if (con->out_kvec_is_msg) {
1993 con->out_skip = con->out_kvec_bytes;
1994 con->out_kvec_is_msg = false;
1995 }
1996 } else {
1997 dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
1998 }
1999 mutex_unlock(&con->mutex);
2000}
2001
2002/*
2003 * Revoke a message that we may be reading data into
2004 */
2005void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2006{
2007 mutex_lock(&con->mutex);
2008 if (con->in_msg && con->in_msg == msg) {
2009 unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
2010 unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
2011 unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
2012
2013 /* skip rest of message */
2014 dout("con_revoke_message %p msg %p revoked\n", con, msg);
2015 con->in_base_pos = con->in_base_pos -
2016 sizeof(struct ceph_msg_header) -
2017 front_len -
2018 middle_len -
2019 data_len -
2020 sizeof(struct ceph_msg_footer);
2021 ceph_msg_put(con->in_msg);
2022 con->in_msg = NULL;
2023 con->in_tag = CEPH_MSGR_TAG_READY;
2024 } else {
2025 dout("con_revoke_message %p msg %p pages %p no-op\n",
2026 con, con->in_msg, msg);
2027 }
2028 mutex_unlock(&con->mutex);
2029}
2030
2031/*
2032 * Queue a keepalive byte to ensure the tcp connection is alive.
2033 */
2034void ceph_con_keepalive(struct ceph_connection *con)
2035{
2036 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2037 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2038 queue_con(con);
2039}
2040
2041
2042/*
2043 * construct a new message with given type, size
2044 * the new msg has a ref count of 1.
2045 */
2046struct ceph_msg *ceph_msg_new(int type, int front_len,
2047 int page_len, int page_off, struct page **pages)
2048{
2049 struct ceph_msg *m;
2050
2051 m = kmalloc(sizeof(*m), GFP_NOFS);
2052 if (m == NULL)
2053 goto out;
2054 kref_init(&m->kref);
2055 INIT_LIST_HEAD(&m->list_head);
2056
2057 m->hdr.type = cpu_to_le16(type);
2058 m->hdr.front_len = cpu_to_le32(front_len);
2059 m->hdr.middle_len = 0;
2060 m->hdr.data_len = cpu_to_le32(page_len);
2061 m->hdr.data_off = cpu_to_le16(page_off);
2062 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2063 m->footer.front_crc = 0;
2064 m->footer.middle_crc = 0;
2065 m->footer.data_crc = 0;
2066 m->front_max = front_len;
2067 m->front_is_vmalloc = false;
2068 m->more_to_follow = false;
2069 m->pool = NULL;
2070
2071 /* front */
2072 if (front_len) {
2073 if (front_len > PAGE_CACHE_SIZE) {
2074 m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
2075 PAGE_KERNEL);
2076 m->front_is_vmalloc = true;
2077 } else {
2078 m->front.iov_base = kmalloc(front_len, GFP_NOFS);
2079 }
2080 if (m->front.iov_base == NULL) {
2081 pr_err("msg_new can't allocate %d bytes\n",
2082 front_len);
2083 goto out2;
2084 }
2085 } else {
2086 m->front.iov_base = NULL;
2087 }
2088 m->front.iov_len = front_len;
2089
2090 /* middle */
2091 m->middle = NULL;
2092
2093 /* data */
2094 m->nr_pages = calc_pages_for(page_off, page_len);
2095 m->pages = pages;
2096 m->pagelist = NULL;
2097
2098 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
2099 m->nr_pages);
2100 return m;
2101
2102out2:
2103 ceph_msg_put(m);
2104out:
2105 pr_err("msg_new can't create type %d len %d\n", type, front_len);
2106 return ERR_PTR(-ENOMEM);
2107}
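/*
 * Example (sketch, not from this patch): allocate a message with a
 * 128-byte front and no data pages, then queue it on an established
 * connection. The connection drops the queued reference once the
 * message has been sent (see try_write()).
 *
 *	struct ceph_msg *m;
 *
 *	m = ceph_msg_new(CEPH_MSG_STATFS, 128, 0, 0, NULL);
 *	if (IS_ERR(m))
 *		return PTR_ERR(m);
 *	(fill m->front.iov_base with the request payload)
 *	ceph_con_send(con, m);
 */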
2108
2109/*
2110 * Allocate "middle" portion of a message, if it is needed and wasn't
2111 * allocated by alloc_msg. This allows us to read a small fixed-size
2112 * per-type header in the front and then gracefully fail (i.e.,
2113 * propagate the error to the caller based on info in the front) when
2114 * the middle is too large.
2115 */
2116static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2117{
2118 int type = le16_to_cpu(msg->hdr.type);
2119 int middle_len = le32_to_cpu(msg->hdr.middle_len);
2120
2121 dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
2122 ceph_msg_type_name(type), middle_len);
2123 BUG_ON(!middle_len);
2124 BUG_ON(msg->middle);
2125
2126 msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
2127 if (!msg->middle)
2128 return -ENOMEM;
2129 return 0;
2130}
2131
2132/*
2133 * Generic message allocator, for incoming messages.
2134 */
2135static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2136 struct ceph_msg_header *hdr,
2137 int *skip)
2138{
2139 int type = le16_to_cpu(hdr->type);
2140 int front_len = le32_to_cpu(hdr->front_len);
2141 int middle_len = le32_to_cpu(hdr->middle_len);
2142 struct ceph_msg *msg = NULL;
2143 int ret;
2144
2145 if (con->ops->alloc_msg) {
2146 mutex_unlock(&con->mutex);
2147 msg = con->ops->alloc_msg(con, hdr, skip);
2148 mutex_lock(&con->mutex);
2149 if (IS_ERR(msg))
2150 return msg;
2151
2152 if (*skip)
2153 return NULL;
2154 }
2155 if (!msg) {
2156 *skip = 0;
2157 msg = ceph_msg_new(type, front_len, 0, 0, NULL);
2158 if (IS_ERR(msg)) {
2159 pr_err("unable to allocate msg type %d len %d\n",
2160 type, front_len);
2161 return ERR_PTR(-ENOMEM);
2162 }
2163 }
2164 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2165
2166 if (middle_len) {
2167 ret = ceph_alloc_middle(con, msg);
2168
2169 if (ret < 0) {
2170 ceph_msg_put(msg);
2171 return ERR_PTR(ret);
2172 }
2173 }
2174
2175 return msg;
2176}
2177
2178
2179/*
2180 * Free a generically kmalloc'd message.
2181 */
2182void ceph_msg_kfree(struct ceph_msg *m)
2183{
2184 dout("msg_kfree %p\n", m);
2185 if (m->front_is_vmalloc)
2186 vfree(m->front.iov_base);
2187 else
2188 kfree(m->front.iov_base);
2189 kfree(m);
2190}
2191
2192/*
2193 * Drop a msg ref. Destroy as needed.
2194 */
2195void ceph_msg_last_put(struct kref *kref)
2196{
2197 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
2198
2199 dout("ceph_msg_put last one on %p\n", m);
2200 WARN_ON(!list_empty(&m->list_head));
2201
2202 /* drop middle, data, if any */
2203 if (m->middle) {
2204 ceph_buffer_put(m->middle);
2205 m->middle = NULL;
2206 }
2207 m->nr_pages = 0;
2208 m->pages = NULL;
2209
2210 if (m->pagelist) {
2211 ceph_pagelist_release(m->pagelist);
2212 kfree(m->pagelist);
2213 m->pagelist = NULL;
2214 }
2215
2216 if (m->pool)
2217 ceph_msgpool_put(m->pool, m);
2218 else
2219 ceph_msg_kfree(m);
2220}
2221
2222void ceph_msg_dump(struct ceph_msg *msg)
2223{
2224 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
2225 msg->front_max, msg->nr_pages);
2226 print_hex_dump(KERN_DEBUG, "header: ",
2227 DUMP_PREFIX_OFFSET, 16, 1,
2228 &msg->hdr, sizeof(msg->hdr), true);
2229 print_hex_dump(KERN_DEBUG, " front: ",
2230 DUMP_PREFIX_OFFSET, 16, 1,
2231 msg->front.iov_base, msg->front.iov_len, true);
2232 if (msg->middle)
2233 print_hex_dump(KERN_DEBUG, "middle: ",
2234 DUMP_PREFIX_OFFSET, 16, 1,
2235 msg->middle->vec.iov_base,
2236 msg->middle->vec.iov_len, true);
2237 print_hex_dump(KERN_DEBUG, "footer: ",
2238 DUMP_PREFIX_OFFSET, 16, 1,
2239 &msg->footer, sizeof(msg->footer), true);
2240}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
new file mode 100644
index 000000000000..a343dae73cdc
--- /dev/null
+++ b/fs/ceph/messenger.h
@@ -0,0 +1,255 @@
1#ifndef __FS_CEPH_MESSENGER_H
2#define __FS_CEPH_MESSENGER_H
3
4#include <linux/kref.h>
5#include <linux/mutex.h>
6#include <linux/net.h>
7#include <linux/radix-tree.h>
8#include <linux/uio.h>
9#include <linux/version.h>
10#include <linux/workqueue.h>
11
12#include "types.h"
13#include "buffer.h"
14
15struct ceph_msg;
16struct ceph_connection;
17
18extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
19
20/*
21 * Ceph defines these callbacks for handling connection events.
22 */
23struct ceph_connection_operations {
24 struct ceph_connection *(*get)(struct ceph_connection *);
25 void (*put)(struct ceph_connection *);
26
27 /* handle an incoming message. */
28 void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
29
30 /* authorize an outgoing connection */
31 int (*get_authorizer) (struct ceph_connection *con,
32 void **buf, int *len, int *proto,
33 void **reply_buf, int *reply_len, int force_new);
34 int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
35 int (*invalidate_authorizer)(struct ceph_connection *con);
36
37 /* protocol version mismatch */
38 void (*bad_proto) (struct ceph_connection *con);
39
40 /* there was some error on the socket (disconnect, whatever) */
41 void (*fault) (struct ceph_connection *con);
42
43 /* a remote host has terminated a message exchange session, and messages
44 * we sent (or they tried to send us) may be lost. */
45 void (*peer_reset) (struct ceph_connection *con);
46
47 struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
48 struct ceph_msg_header *hdr,
49 int *skip);
50};
51
52extern const char *ceph_name_type_str(int t);
53
54/* use format string %s%lld */
55#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
56
57struct ceph_messenger {
58 struct ceph_entity_inst inst; /* my name+address */
59 struct ceph_entity_addr my_enc_addr;
60 struct page *zero_page; /* used in certain error cases */
61
62 bool nocrc;
63
64 /*
65 * the global_seq counts connections I (attempt to) initiate
66 * in order to disambiguate certain connect race conditions.
67 */
68 u32 global_seq;
69 spinlock_t global_seq_lock;
70};
71
72/*
73 * a single message. it contains a header (src, dest, message type, etc.),
74 * footer (crc values, mainly), a "front" message body, and possibly a
75 * data payload (stored in some number of pages).
76 */
77struct ceph_msg {
78 struct ceph_msg_header hdr; /* header */
79 struct ceph_msg_footer footer; /* footer */
80 struct kvec front; /* unaligned blobs of message */
81 struct ceph_buffer *middle;
82 struct page **pages; /* data payload. NOT OWNER. */
83 unsigned nr_pages; /* size of page array */
84 struct ceph_pagelist *pagelist; /* instead of pages */
85 struct list_head list_head;
86 struct kref kref;
87 bool front_is_vmalloc;
88 bool more_to_follow;
89 int front_max;
90
91 struct ceph_msgpool *pool;
92};
93
94struct ceph_msg_pos {
95 int page, page_pos; /* which page; offset in page */
96 int data_pos; /* offset in data payload */
97 int did_page_crc; /* true if we've calculated crc for current page */
98};
99
100/* ceph connection fault delay defaults, for exponential backoff */
101#define BASE_DELAY_INTERVAL (HZ/2)
102#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
103
104/*
105 * ceph_connection state bit flags
106 *
107 * QUEUED and BUSY are used together to ensure that only a single
108 * thread is currently opening, reading or writing data to the socket.
109 */
110#define LOSSYTX 0 /* we can close channel or drop messages on errors */
111#define CONNECTING 1
112#define NEGOTIATING 2
113#define KEEPALIVE_PENDING 3
114#define WRITE_PENDING 4 /* we have data ready to send */
115#define QUEUED 5 /* there is work queued on this connection */
116#define BUSY 6 /* work is being done */
117#define STANDBY 8 /* no outgoing messages, socket closed. we keep
118 * the ceph_connection around to maintain shared
119 * state with the peer. */
120#define CLOSED 10 /* we've closed the connection */
121#define SOCK_CLOSED 11 /* socket state changed to closed */
122#define OPENING 13 /* open connection w/ (possibly new) peer */
123#define DEAD 14 /* dead, about to kfree */
124
125/*
126 * A single connection with another host.
127 *
128 * We maintain a queue of outgoing messages, and some session state to
129 * ensure that we can preserve the lossless, ordered delivery of
130 * messages in the case of a TCP disconnect.
131 */
132struct ceph_connection {
133 void *private;
134 atomic_t nref;
135
136 const struct ceph_connection_operations *ops;
137
138 struct ceph_messenger *msgr;
139 struct socket *sock;
140 unsigned long state; /* connection state (see flags above) */
141 const char *error_msg; /* error message, if any */
142
143 struct ceph_entity_addr peer_addr; /* peer address */
144 struct ceph_entity_name peer_name; /* peer name */
145 struct ceph_entity_addr peer_addr_for_me;
146 u32 connect_seq; /* identify the most recent connection
147 attempt for this connection (client-side) */
148 u32 peer_global_seq; /* peer's global seq for this connection */
149
150 int auth_retry; /* true if we need a newer authorizer */
151 void *auth_reply_buf; /* where to put the authorizer reply */
152 int auth_reply_buf_len;
153
154 struct mutex mutex;
155
156 /* out queue */
157 struct list_head out_queue;
158 struct list_head out_sent; /* sending or sent but unacked */
159 u64 out_seq; /* last message queued for send */
160 u64 out_seq_sent; /* last message sent */
161 bool out_keepalive_pending;
162
163 u64 in_seq, in_seq_acked; /* last message received, acked */
164
165 /* connection negotiation temps */
166 char in_banner[CEPH_BANNER_MAX_LEN];
167 union {
168 struct { /* outgoing connection */
169 struct ceph_msg_connect out_connect;
170 struct ceph_msg_connect_reply in_reply;
171 };
172 struct { /* incoming */
173 struct ceph_msg_connect in_connect;
174 struct ceph_msg_connect_reply out_reply;
175 };
176 };
177 struct ceph_entity_addr actual_peer_addr;
178
179 /* message out temps */
180 struct ceph_msg *out_msg; /* sending message (== tail of
181 out_sent) */
182 bool out_msg_done;
183 struct ceph_msg_pos out_msg_pos;
184
185 struct kvec out_kvec[8], /* sending header/footer data */
186 *out_kvec_cur;
187 int out_kvec_left; /* kvec's left in out_kvec */
188 int out_skip; /* skip this many bytes */
189 int out_kvec_bytes; /* total bytes left */
190 bool out_kvec_is_msg; /* kvec refers to out_msg */
191 int out_more; /* there is more data after the kvecs */
192 __le64 out_temp_ack; /* for writing an ack */
193
194 /* message in temps */
195 struct ceph_msg_header in_hdr;
196 struct ceph_msg *in_msg;
197 struct ceph_msg_pos in_msg_pos;
198 u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
199
200 char in_tag; /* protocol control byte */
201 int in_base_pos; /* bytes read */
202 __le64 in_temp_ack; /* for reading an ack */
203
204 struct delayed_work work; /* send|recv work */
205 unsigned long delay; /* current delay interval */
206};
207
208
209extern const char *pr_addr(const struct sockaddr_storage *ss);
210extern int ceph_parse_ips(const char *c, const char *end,
211 struct ceph_entity_addr *addr,
212 int max_count, int *count);
213
214
215extern int ceph_msgr_init(void);
216extern void ceph_msgr_exit(void);
217
218extern struct ceph_messenger *ceph_messenger_create(
219 struct ceph_entity_addr *myaddr);
220extern void ceph_messenger_destroy(struct ceph_messenger *);
221
222extern void ceph_con_init(struct ceph_messenger *msgr,
223 struct ceph_connection *con);
224extern void ceph_con_open(struct ceph_connection *con,
225 struct ceph_entity_addr *addr);
226extern bool ceph_con_opened(struct ceph_connection *con);
227extern void ceph_con_close(struct ceph_connection *con);
228extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
229extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
230extern void ceph_con_revoke_message(struct ceph_connection *con,
231 struct ceph_msg *msg);
232extern void ceph_con_keepalive(struct ceph_connection *con);
233extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
234extern void ceph_con_put(struct ceph_connection *con);
235
236extern struct ceph_msg *ceph_msg_new(int type, int front_len,
237 int page_len, int page_off,
238 struct page **pages);
239extern void ceph_msg_kfree(struct ceph_msg *m);
240
241
242static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
243{
244 kref_get(&msg->kref);
245 return msg;
246}
247extern void ceph_msg_last_put(struct kref *kref);
248static inline void ceph_msg_put(struct ceph_msg *msg)
249{
250 kref_put(&msg->kref, ceph_msg_last_put);
251}
252
253extern void ceph_msg_dump(struct ceph_msg *msg);
254
255#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
new file mode 100644
index 000000000000..8fdc011ca956
--- /dev/null
+++ b/fs/ceph/mon_client.c
@@ -0,0 +1,835 @@
1#include "ceph_debug.h"
2
3#include <linux/types.h>
4#include <linux/slab.h>
5#include <linux/random.h>
6#include <linux/sched.h>
7
8#include "mon_client.h"
9#include "super.h"
10#include "auth.h"
11#include "decode.h"
12
13/*
14 * Interact with Ceph monitor cluster. Handle requests for new map
15 * versions, and periodically resend as needed. Also implement
16 * statfs() and umount().
17 *
18 * A small cluster of Ceph "monitors" is responsible for managing critical
19 * cluster configuration and state information. An odd number (e.g., 3, 5)
20 * of cmon daemons use a modified version of the Paxos part-time parliament
21 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
22 * list of clients who have mounted the file system.
23 *
24 * We maintain an open, active session with a monitor at all times in order to
25 * receive timely MDSMap updates. We periodically send a keepalive byte on the
26 * TCP socket to ensure we detect a failure. If the connection does break, we
27 * randomly hunt for a new monitor. Once the connection is reestablished, we
28 * resend any outstanding requests.
29 */
30
31static const struct ceph_connection_operations mon_con_ops;
32
33static int __validate_auth(struct ceph_mon_client *monc);
34
35/*
36 * Decode a monmap blob (e.g., during mount).
37 */
38struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
39{
40 struct ceph_monmap *m = NULL;
41 int i, err = -EINVAL;
42 struct ceph_fsid fsid;
43 u32 epoch, num_mon;
44 u16 version;
45 u32 len;
46
47 ceph_decode_32_safe(&p, end, len, bad);
48 ceph_decode_need(&p, end, len, bad);
49
50 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
51
52 ceph_decode_16_safe(&p, end, version, bad);
53
54 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
55 ceph_decode_copy(&p, &fsid, sizeof(fsid));
56 epoch = ceph_decode_32(&p);
57
58 num_mon = ceph_decode_32(&p);
59 if (num_mon >= CEPH_MAX_MON)
60 goto bad;
61 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
62
63 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
64 if (m == NULL)
65 return ERR_PTR(-ENOMEM);
66 m->fsid = fsid;
67 m->epoch = epoch;
68 m->num_mon = num_mon;
69 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
70 for (i = 0; i < num_mon; i++)
71 ceph_decode_addr(&m->mon_inst[i].addr);
72
73 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
74 m->num_mon);
75 for (i = 0; i < m->num_mon; i++)
76 dout("monmap_decode mon%d is %s\n", i,
77 pr_addr(&m->mon_inst[i].addr.in_addr));
78 return m;
79
80bad:
81 dout("monmap_decode failed with %d\n", err);
82 kfree(m);
83 return ERR_PTR(err);
84}
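/*
 * Monmap wire format decoded above (sketch):
 *
 *	u32 len | u16 version | fsid | u32 epoch | u32 num_mon
 *	| num_mon * struct ceph_entity_inst
 *
 * All integers are little-endian; the addresses are fixed up with
 * ceph_decode_addr() after the bulk copy.
 */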
85
86/*
87 * return true if *addr is included in the monmap.
88 */
89int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
90{
91 int i;
92
93 for (i = 0; i < m->num_mon; i++)
94 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
95 return 1;
96 return 0;
97}
98
99/*
100 * Send an auth request.
101 */
102static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
103{
104 monc->pending_auth = 1;
105 monc->m_auth->front.iov_len = len;
106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_msg_get(monc->m_auth); /* keep our ref */
108 ceph_con_send(monc->con, monc->m_auth);
109}
110
111/*
112 * Close monitor session, if any.
113 */
114static void __close_session(struct ceph_mon_client *monc)
115{
116 if (monc->con) {
117 dout("__close_session closing mon%d\n", monc->cur_mon);
118 ceph_con_revoke(monc->con, monc->m_auth);
119 ceph_con_close(monc->con);
120 monc->cur_mon = -1;
121 monc->pending_auth = 0;
122 ceph_auth_reset(monc->auth);
123 }
124}
125
126/*
127 * Open a session with a (new) monitor.
128 */
129static int __open_session(struct ceph_mon_client *monc)
130{
131 u8 r;
132 int ret;
133
134 if (monc->cur_mon < 0) {
135 get_random_bytes(&r, 1);
136 monc->cur_mon = r % monc->monmap->num_mon;
137 dout("open_session num=%d r=%d -> mon%d\n",
138 monc->monmap->num_mon, r, monc->cur_mon);
139 monc->sub_sent = 0;
140 monc->sub_renew_after = jiffies; /* i.e., expired */
141 monc->want_next_osdmap = !!monc->want_next_osdmap;
142
143 dout("open_session mon%d opening\n", monc->cur_mon);
144 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
145 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
146 ceph_con_open(monc->con,
147 &monc->monmap->mon_inst[monc->cur_mon].addr);
148
149 /* initiate authentication handshake */
150 ret = ceph_auth_build_hello(monc->auth,
151 monc->m_auth->front.iov_base,
152 monc->m_auth->front_max);
153 __send_prepared_auth_request(monc, ret);
154 } else {
155 dout("open_session mon%d already open\n", monc->cur_mon);
156 }
157 return 0;
158}
159
160static bool __sub_expired(struct ceph_mon_client *monc)
161{
162 return time_after_eq(jiffies, monc->sub_renew_after);
163}
164
165/*
166 * Reschedule delayed work timer.
167 */
168static void __schedule_delayed(struct ceph_mon_client *monc)
169{
170 unsigned delay;
171
172 if (monc->cur_mon < 0 || __sub_expired(monc))
173 delay = 10 * HZ;
174 else
175 delay = 20 * HZ;
176 dout("__schedule_delayed after %u\n", delay);
177 schedule_delayed_work(&monc->delayed_work, delay);
178}
179
180/*
181 * Send subscribe request for mdsmap and/or osdmap.
182 */
183static void __send_subscribe(struct ceph_mon_client *monc)
184{
185 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
186 (unsigned)monc->sub_sent, __sub_expired(monc),
187 monc->want_next_osdmap);
188 if ((__sub_expired(monc) && !monc->sub_sent) ||
189 monc->want_next_osdmap == 1) {
190 struct ceph_msg *msg;
191 struct ceph_mon_subscribe_item *i;
192 void *p, *end;
193
194 msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
195 if (!msg)
196 return;
197
198 p = msg->front.iov_base;
199 end = p + msg->front.iov_len;
200
201 dout("__send_subscribe to 'mdsmap' %u+\n",
202 (unsigned)monc->have_mdsmap);
203 if (monc->want_next_osdmap) {
204 dout("__send_subscribe to 'osdmap' %u\n",
205 (unsigned)monc->have_osdmap);
206 ceph_encode_32(&p, 3);
207 ceph_encode_string(&p, end, "osdmap", 6);
208 i = p;
209 i->have = cpu_to_le64(monc->have_osdmap);
210 i->onetime = 1;
211 p += sizeof(*i);
212 monc->want_next_osdmap = 2; /* requested */
213 } else {
214 ceph_encode_32(&p, 2);
215 }
216 ceph_encode_string(&p, end, "mdsmap", 6);
217 i = p;
218 i->have = cpu_to_le64(monc->have_mdsmap);
219 i->onetime = 0;
220 p += sizeof(*i);
221 ceph_encode_string(&p, end, "monmap", 6);
222 i = p;
223 i->have = 0;
224 i->onetime = 0;
225 p += sizeof(*i);
226
227 msg->front.iov_len = p - msg->front.iov_base;
228 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
229 ceph_con_send(monc->con, msg);
230
231 monc->sub_sent = jiffies | 1; /* never 0 */
232 }
233}
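/*
 * Subscribe payload built above (sketch): a u32 item count followed by
 * (length-prefixed map name, ceph_mon_subscribe_item) pairs, e.g. when
 * a new osdmap is wanted:
 *
 *	u32 3 | "osdmap" {have_osdmap, onetime=1}
 *	      | "mdsmap" {have_mdsmap, onetime=0}
 *	      | "monmap" {0, onetime=0}
 */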
234
235static void handle_subscribe_ack(struct ceph_mon_client *monc,
236 struct ceph_msg *msg)
237{
238 unsigned seconds;
239 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
240
241 if (msg->front.iov_len < sizeof(*h))
242 goto bad;
243 seconds = le32_to_cpu(h->duration);
244
245 mutex_lock(&monc->mutex);
246 if (monc->hunting) {
247 pr_info("mon%d %s session established\n",
248 monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
249 monc->hunting = false;
250 }
251 dout("handle_subscribe_ack after %d seconds\n", seconds);
252 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
253 monc->sub_sent = 0;
254 mutex_unlock(&monc->mutex);
255 return;
256bad:
257 pr_err("got corrupt subscribe-ack msg\n");
258 ceph_msg_dump(msg);
259}
260
261/*
262 * Keep track of which maps we have
263 */
264int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
265{
266 mutex_lock(&monc->mutex);
267 monc->have_mdsmap = got;
268 mutex_unlock(&monc->mutex);
269 return 0;
270}
271
272int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
273{
274 mutex_lock(&monc->mutex);
275 monc->have_osdmap = got;
276 monc->want_next_osdmap = 0;
277 mutex_unlock(&monc->mutex);
278 return 0;
279}
280
281/*
282 * Register interest in the next osdmap
283 */
284void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
285{
286 dout("request_next_osdmap have %u\n", monc->have_osdmap);
287 mutex_lock(&monc->mutex);
288 if (!monc->want_next_osdmap)
289 monc->want_next_osdmap = 1;
290 if (monc->want_next_osdmap < 2)
291 __send_subscribe(monc);
292 mutex_unlock(&monc->mutex);
293}
294
295/*
296 * Ensure we have an open session with a monitor.
297 */
298int ceph_monc_open_session(struct ceph_mon_client *monc)
299{
300 if (!monc->con) {
301 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
302 if (!monc->con)
303 return -ENOMEM;
304 ceph_con_init(monc->client->msgr, monc->con);
305 monc->con->private = monc;
306 monc->con->ops = &mon_con_ops;
307 }
308
309 mutex_lock(&monc->mutex);
310 __open_session(monc);
311 __schedule_delayed(monc);
312 mutex_unlock(&monc->mutex);
313 return 0;
314}
315
316/*
317 * The monitor responds with a mount ack to indicate mount success. The
318 * included client ticket allows the client to talk to MDSs and OSDs.
319 */
320static void ceph_monc_handle_map(struct ceph_mon_client *monc,
321 struct ceph_msg *msg)
322{
323 struct ceph_client *client = monc->client;
324 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
325 void *p, *end;
326
327 mutex_lock(&monc->mutex);
328
329 dout("handle_monmap\n");
330 p = msg->front.iov_base;
331 end = p + msg->front.iov_len;
332
333 monmap = ceph_monmap_decode(p, end);
334 if (IS_ERR(monmap)) {
335 pr_err("problem decoding monmap, %d\n",
336 (int)PTR_ERR(monmap));
337 goto out;
338 }
339
340 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
341 kfree(monmap);
342 goto out;
343 }
344
345 client->monc.monmap = monmap;
346 kfree(old);
347
348out:
349 mutex_unlock(&monc->mutex);
350 wake_up(&client->auth_wq);
351}
352
353/*
354 * statfs
355 */
356static struct ceph_mon_statfs_request *__lookup_statfs(
357 struct ceph_mon_client *monc, u64 tid)
358{
359 struct ceph_mon_statfs_request *req;
360 struct rb_node *n = monc->statfs_request_tree.rb_node;
361
362 while (n) {
363 req = rb_entry(n, struct ceph_mon_statfs_request, node);
364 if (tid < req->tid)
365 n = n->rb_left;
366 else if (tid > req->tid)
367 n = n->rb_right;
368 else
369 return req;
370 }
371 return NULL;
372}
373
374static void __insert_statfs(struct ceph_mon_client *monc,
375 struct ceph_mon_statfs_request *new)
376{
377 struct rb_node **p = &monc->statfs_request_tree.rb_node;
378 struct rb_node *parent = NULL;
379 struct ceph_mon_statfs_request *req = NULL;
380
381 while (*p) {
382 parent = *p;
383 req = rb_entry(parent, struct ceph_mon_statfs_request, node);
384 if (new->tid < req->tid)
385 p = &(*p)->rb_left;
386 else if (new->tid > req->tid)
387 p = &(*p)->rb_right;
388 else
389 BUG();
390 }
391
392 rb_link_node(&new->node, parent, p);
393 rb_insert_color(&new->node, &monc->statfs_request_tree);
394}
395
396static void handle_statfs_reply(struct ceph_mon_client *monc,
397 struct ceph_msg *msg)
398{
399 struct ceph_mon_statfs_request *req;
400 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
401 u64 tid;
402
403 if (msg->front.iov_len != sizeof(*reply))
404 goto bad;
405 tid = le64_to_cpu(msg->hdr.tid);
406 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
407
408 mutex_lock(&monc->mutex);
409 req = __lookup_statfs(monc, tid);
410 if (req) {
411 *req->buf = reply->st;
412 req->result = 0;
413 }
414 mutex_unlock(&monc->mutex);
415 if (req)
416 complete(&req->completion);
417 return;
418
419bad:
420 pr_err("corrupt statfs reply, no tid\n");
421 ceph_msg_dump(msg);
422}
423
424/*
425 * (re)send a statfs request
426 */
427static int send_statfs(struct ceph_mon_client *monc,
428 struct ceph_mon_statfs_request *req)
429{
430 struct ceph_msg *msg;
431 struct ceph_mon_statfs *h;
432
433 dout("send_statfs tid %llu\n", req->tid);
434 msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
435 if (IS_ERR(msg))
436 return PTR_ERR(msg);
437 req->request = msg;
438 msg->hdr.tid = cpu_to_le64(req->tid);
439 h = msg->front.iov_base;
440 h->monhdr.have_version = 0;
441 h->monhdr.session_mon = cpu_to_le16(-1);
442 h->monhdr.session_mon_tid = 0;
443 h->fsid = monc->monmap->fsid;
444 ceph_con_send(monc->con, msg);
445 return 0;
446}
447
448/*
449 * Do a synchronous statfs().
450 */
451int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
452{
453 struct ceph_mon_statfs_request req;
454 int err;
455
456 req.buf = buf;
457 init_completion(&req.completion);
458
459 /* allocate memory for reply */
460 err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
461 if (err)
462 return err;
463
464 /* register request */
465 mutex_lock(&monc->mutex);
466 req.tid = ++monc->last_tid;
467 req.last_attempt = jiffies;
468 req.delay = BASE_DELAY_INTERVAL;
469 __insert_statfs(monc, &req);
470 monc->num_statfs_requests++;
471 mutex_unlock(&monc->mutex);
472
473 /* send request and wait */
474 err = send_statfs(monc, &req);
475 if (!err)
476 err = wait_for_completion_interruptible(&req.completion);
477
478 mutex_lock(&monc->mutex);
479 rb_erase(&req.node, &monc->statfs_request_tree);
480 monc->num_statfs_requests--;
481 ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
482 mutex_unlock(&monc->mutex);
483
484 if (!err)
485 err = req.result;
486 return err;
487}
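/*
 * Example caller (sketch, shaped like a filesystem ->statfs hook; the
 * client/kstatfs glue here is hypothetical, not part of this file):
 *
 *	struct ceph_statfs st;
 *	int err = ceph_monc_do_statfs(&client->monc, &st);
 *
 *	if (err < 0)
 *		return err;
 *	(translate the __le64 fields of st into the caller's kstatfs)
 */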
488
489/*
490 * Resend pending statfs requests.
491 */
492static void __resend_statfs(struct ceph_mon_client *monc)
493{
494 struct ceph_mon_statfs_request *req;
495 struct rb_node *p;
496
497 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
498 req = rb_entry(p, struct ceph_mon_statfs_request, node);
499 send_statfs(monc, req);
500 }
501}
502
503/*
504 * Delayed work. If we haven't mounted yet, retry. Otherwise,
505 * renew/retry subscription as needed (in case it is timing out, or we
506 * got an ENOMEM). And keep the monitor connection alive.
507 */
508static void delayed_work(struct work_struct *work)
509{
510 struct ceph_mon_client *monc =
511 container_of(work, struct ceph_mon_client, delayed_work.work);
512
513 dout("monc delayed_work\n");
514 mutex_lock(&monc->mutex);
515 if (monc->hunting) {
516 __close_session(monc);
517 __open_session(monc); /* continue hunting */
518 } else {
519 ceph_con_keepalive(monc->con);
520
521 __validate_auth(monc);
522
523 if (monc->auth->ops->is_authenticated(monc->auth))
524 __send_subscribe(monc);
525 }
526 __schedule_delayed(monc);
527 mutex_unlock(&monc->mutex);
528}
529
530/*
531 * On startup, we build a temporary monmap populated with the IPs
532 * provided by mount(2).
533 */
534static int build_initial_monmap(struct ceph_mon_client *monc)
535{
536 struct ceph_mount_args *args = monc->client->mount_args;
537 struct ceph_entity_addr *mon_addr = args->mon_addr;
538 int num_mon = args->num_mon;
539 int i;
540
541 /* build initial monmap */
542 monc->monmap = kzalloc(sizeof(*monc->monmap) +
543 num_mon*sizeof(monc->monmap->mon_inst[0]),
544 GFP_KERNEL);
545 if (!monc->monmap)
546 return -ENOMEM;
547 for (i = 0; i < num_mon; i++) {
548 monc->monmap->mon_inst[i].addr = mon_addr[i];
549 monc->monmap->mon_inst[i].addr.nonce = 0;
550 monc->monmap->mon_inst[i].name.type =
551 CEPH_ENTITY_TYPE_MON;
552 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
553 }
554 monc->monmap->num_mon = num_mon;
555 monc->have_fsid = false;
556
557 /* release addr memory */
558 kfree(args->mon_addr);
559 args->mon_addr = NULL;
560 args->num_mon = 0;
561 return 0;
562}
563
564int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
565{
566 int err = 0;
567
568 dout("init\n");
569 memset(monc, 0, sizeof(*monc));
570 monc->client = cl;
571 monc->monmap = NULL;
572 mutex_init(&monc->mutex);
573
574 err = build_initial_monmap(monc);
575 if (err)
576 goto out;
577
578 monc->con = NULL;
579
580 /* authentication */
581 monc->auth = ceph_auth_init(cl->mount_args->name,
582 cl->mount_args->secret);
583 if (IS_ERR(monc->auth))
584 return PTR_ERR(monc->auth);
585 monc->auth->want_keys =
586 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
587 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
588
589 /* msg pools */
590 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
591 sizeof(struct ceph_mon_subscribe_ack), 1, false);
592 if (err < 0)
593 goto out_monmap;
594 err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
595 sizeof(struct ceph_mon_statfs_reply), 0, false);
596 if (err < 0)
597 goto out_pool1;
598 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
599 if (err < 0)
600 goto out_pool2;
601
602 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
603 monc->pending_auth = 0;
604 if (IS_ERR(monc->m_auth)) {
605 err = PTR_ERR(monc->m_auth);
606 monc->m_auth = NULL;
607 goto out_pool3;
608 }
609
610 monc->cur_mon = -1;
611 monc->hunting = true;
612 monc->sub_renew_after = jiffies;
613 monc->sub_sent = 0;
614
615 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
616 monc->statfs_request_tree = RB_ROOT;
617 monc->num_statfs_requests = 0;
618 monc->last_tid = 0;
619
620 monc->have_mdsmap = 0;
621 monc->have_osdmap = 0;
622 monc->want_next_osdmap = 1;
623 return 0;
624
625out_pool3:
626 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
627out_pool2:
628 ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
629out_pool1:
630 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
631out_monmap:
632 kfree(monc->monmap);
633out:
634 return err;
635}
636
637void ceph_monc_stop(struct ceph_mon_client *monc)
638{
639 dout("stop\n");
640 cancel_delayed_work_sync(&monc->delayed_work);
641
642 mutex_lock(&monc->mutex);
643 __close_session(monc);
644 if (monc->con) {
645 monc->con->private = NULL;
646 monc->con->ops->put(monc->con);
647 monc->con = NULL;
648 }
649 mutex_unlock(&monc->mutex);
650
651 ceph_auth_destroy(monc->auth);
652
653 ceph_msg_put(monc->m_auth);
654 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
655 ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
656 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
657
658 kfree(monc->monmap);
659}
660
661static void handle_auth_reply(struct ceph_mon_client *monc,
662 struct ceph_msg *msg)
663{
664 int ret;
665
666 mutex_lock(&monc->mutex);
667 monc->pending_auth = 0;
668 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
669 msg->front.iov_len,
670 monc->m_auth->front.iov_base,
671 monc->m_auth->front_max);
672 if (ret < 0) {
673 monc->client->auth_err = ret;
674 wake_up(&monc->client->auth_wq);
675 } else if (ret > 0) {
676 __send_prepared_auth_request(monc, ret);
677 } else if (monc->auth->ops->is_authenticated(monc->auth)) {
678 dout("authenticated, starting session\n");
679
680 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
681 monc->client->msgr->inst.name.num = cpu_to_le64(monc->auth->global_id);
682
683 __send_subscribe(monc);
684 __resend_statfs(monc);
685 }
686 mutex_unlock(&monc->mutex);
687}
688
689static int __validate_auth(struct ceph_mon_client *monc)
690{
691 int ret;
692
693 if (monc->pending_auth)
694 return 0;
695
696 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
697 monc->m_auth->front_max);
698 if (ret <= 0)
699 return ret; /* either an error, or no need to authenticate */
700 __send_prepared_auth_request(monc, ret);
701 return 0;
702}
703
704int ceph_monc_validate_auth(struct ceph_mon_client *monc)
705{
706 int ret;
707
708 mutex_lock(&monc->mutex);
709 ret = __validate_auth(monc);
710 mutex_unlock(&monc->mutex);
711 return ret;
712}
713
714/*
715 * handle incoming message
716 */
717static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
718{
719 struct ceph_mon_client *monc = con->private;
720 int type = le16_to_cpu(msg->hdr.type);
721
722 if (!monc)
723 return;
724
725 switch (type) {
726 case CEPH_MSG_AUTH_REPLY:
727 handle_auth_reply(monc, msg);
728 break;
729
730 case CEPH_MSG_MON_SUBSCRIBE_ACK:
731 handle_subscribe_ack(monc, msg);
732 break;
733
734 case CEPH_MSG_STATFS_REPLY:
735 handle_statfs_reply(monc, msg);
736 break;
737
738 case CEPH_MSG_MON_MAP:
739 ceph_monc_handle_map(monc, msg);
740 break;
741
742 case CEPH_MSG_MDS_MAP:
743 ceph_mdsc_handle_map(&monc->client->mdsc, msg);
744 break;
745
746 case CEPH_MSG_OSD_MAP:
747 ceph_osdc_handle_map(&monc->client->osdc, msg);
748 break;
749
750 default:
751 pr_err("received unknown message type %d %s\n", type,
752 ceph_msg_type_name(type));
753 }
754 ceph_msg_put(msg);
755}
756
757/*
758 * Allocate memory for incoming message
759 */
760static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
761 struct ceph_msg_header *hdr,
762 int *skip)
763{
764 struct ceph_mon_client *monc = con->private;
765 int type = le16_to_cpu(hdr->type);
766 int front_len = le32_to_cpu(hdr->front_len);
767 struct ceph_msg *m = NULL;
768
769 *skip = 0;
770
771 switch (type) {
772 case CEPH_MSG_MON_SUBSCRIBE_ACK:
773 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
774 break;
775 case CEPH_MSG_STATFS_REPLY:
776 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len);
777 break;
778 case CEPH_MSG_AUTH_REPLY:
779 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
780 break;
781 case CEPH_MSG_MON_MAP:
782 case CEPH_MSG_MDS_MAP:
783 case CEPH_MSG_OSD_MAP:
784 m = ceph_msg_new(type, front_len, 0, 0, NULL);
785 break;
786 }
787
788 if (!m) {
789 pr_info("alloc_msg unknown type %d\n", type);
790 *skip = 1;
791 }
792 return m;
793}
794
795/*
796 * If the monitor connection resets, pick a new monitor and resubmit
797 * any pending requests.
798 */
799static void mon_fault(struct ceph_connection *con)
800{
801 struct ceph_mon_client *monc = con->private;
802
803 if (!monc)
804 return;
805
806 dout("mon_fault\n");
807 mutex_lock(&monc->mutex);
808 if (!con->private)
809 goto out;
810
811 if (monc->con && !monc->hunting)
812 pr_info("mon%d %s session lost, "
813 "hunting for new mon\n", monc->cur_mon,
814 pr_addr(&monc->con->peer_addr.in_addr));
815
816 __close_session(monc);
817 if (!monc->hunting) {
818 /* start hunting */
819 monc->hunting = true;
820 __open_session(monc);
821 } else {
822 /* already hunting, let's wait a bit */
823 __schedule_delayed(monc);
824 }
825out:
826 mutex_unlock(&monc->mutex);
827}
828
829static const struct ceph_connection_operations mon_con_ops = {
830 .get = ceph_con_get,
831 .put = ceph_con_put,
832 .dispatch = dispatch,
833 .fault = mon_fault,
834 .alloc_msg = mon_alloc_msg,
835};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
new file mode 100644
index 000000000000..b958ad5afa06
--- /dev/null
+++ b/fs/ceph/mon_client.h
@@ -0,0 +1,119 @@
1#ifndef _FS_CEPH_MON_CLIENT_H
2#define _FS_CEPH_MON_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/rbtree.h>
6
7#include "messenger.h"
8#include "msgpool.h"
9
10struct ceph_client;
11struct ceph_mount_args;
12struct ceph_auth_client;
13
14/*
15 * The monitor map enumerates the set of all monitors.
16 */
17struct ceph_monmap {
18 struct ceph_fsid fsid;
19 u32 epoch;
20 u32 num_mon;
21 struct ceph_entity_inst mon_inst[0];
22};
23
24struct ceph_mon_client;
25struct ceph_mon_statfs_request;
26
27
28/*
29 * Generic mechanism for resending monitor requests.
30 */
31typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
32 int newmon);
33
34/* a pending monitor request */
35struct ceph_mon_request {
36 struct ceph_mon_client *monc;
37 struct delayed_work delayed_work;
38 unsigned long delay;
39 ceph_monc_request_func_t do_request;
40};
41
42/*
43 * statfs() is done a bit differently because we need to get data back
44 * to the caller
45 */
46struct ceph_mon_statfs_request {
47 u64 tid;
48 struct rb_node node;
49 int result;
50 struct ceph_statfs *buf;
51 struct completion completion;
52 unsigned long last_attempt, delay; /* jiffies */
53 struct ceph_msg *request; /* original request */
54};
55
56struct ceph_mon_client {
57 struct ceph_client *client;
58 struct ceph_monmap *monmap;
59
60 struct mutex mutex;
61 struct delayed_work delayed_work;
62
63 struct ceph_auth_client *auth;
64 struct ceph_msg *m_auth;
65 int pending_auth;
66
67 bool hunting;
68 int cur_mon; /* last monitor i contacted */
69 unsigned long sub_sent, sub_renew_after;
70 struct ceph_connection *con;
71 bool have_fsid;
72
73 /* msg pools */
74 struct ceph_msgpool msgpool_subscribe_ack;
75 struct ceph_msgpool msgpool_statfs_reply;
76 struct ceph_msgpool msgpool_auth_reply;
77
78 /* pending statfs requests */
79 struct rb_root statfs_request_tree;
80 int num_statfs_requests;
81 u64 last_tid;
82
83 /* mds/osd map */
84 int want_next_osdmap; /* 1 = want, 2 = want+asked */
85 u32 have_osdmap, have_mdsmap;
86
87#ifdef CONFIG_DEBUG_FS
88 struct dentry *debugfs_file;
89#endif
90};
91
92extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
93extern int ceph_monmap_contains(struct ceph_monmap *m,
94 struct ceph_entity_addr *addr);
95
96extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
97extern void ceph_monc_stop(struct ceph_mon_client *monc);
98
99/*
100 * The model here is to indicate that we need a new map of at least
101 * epoch @want, and also call in when we receive a map. We will
102 * periodically rerequest the map from the monitor cluster until we
103 * get what we want.
104 */
105extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
106extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
107
108extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
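/*
 * Illustrative pairing of the two calls above (a sketch, not part of
 * the original header):
 *
 *	ceph_monc_request_next_osdmap(monc);
 *	...
 *	(later, after receiving and decoding a map of epoch e)
 *	ceph_monc_got_osdmap(monc, e);
 */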
109
110extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
111 struct ceph_statfs *buf);
112
113extern int ceph_monc_open_session(struct ceph_mon_client *monc);
114
115extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
116
117
118
119#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
new file mode 100644
index 000000000000..ca3b44a89f2d
--- /dev/null
+++ b/fs/ceph/msgpool.c
@@ -0,0 +1,186 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/sched.h>
5#include <linux/types.h>
6#include <linux/vmalloc.h>
7
8#include "msgpool.h"
9
10/*
11 * We use msg pools to preallocate memory for messages we expect to
12 * receive over the wire, to avoid getting ourselves into OOM
 13 * conditions at unexpected times. We use a few different
14 * strategies:
15 *
16 * - for request/response type interactions, we preallocate the
17 * memory needed for the response when we generate the request.
18 *
19 * - for messages we can receive at any time from the MDS, we preallocate
20 * a pool of messages we can re-use.
21 *
22 * - for writeback, we preallocate some number of messages to use for
23 * requests and their replies, so that we always make forward
24 * progress.
25 *
26 * The msgpool behaves like a mempool_t, but keeps preallocated
27 * ceph_msgs strung together on a list_head instead of using a pointer
28 * vector. This avoids vector reallocation when we adjust the number
29 * of preallocated items (which happens frequently).
30 */
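/*
 * Minimal usage sketch (illustrative only; the sizes and the blocking
 * flag here are made up, not taken from any caller in this patch):
 *
 *	struct ceph_msgpool pool;
 *	struct ceph_msg *msg;
 *
 *	if (ceph_msgpool_init(&pool, 512, 4, true) < 0)  (four 512-byte msgs)
 *		return -ENOMEM;
 *	msg = ceph_msgpool_get(&pool, 0);        (0 means pool->front_len)
 *	...
 *	ceph_msgpool_put(&pool, msg);            (back to the pool, not freed)
 *	ceph_msgpool_destroy(&pool);
 */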
31
32
33/*
34 * Allocate or release as necessary to meet our target pool size.
35 */
36static int __fill_msgpool(struct ceph_msgpool *pool)
37{
38 struct ceph_msg *msg;
39
40 while (pool->num < pool->min) {
41 dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
42 pool->min);
43 spin_unlock(&pool->lock);
44 msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
45 spin_lock(&pool->lock);
46 if (IS_ERR(msg))
47 return PTR_ERR(msg);
48 msg->pool = pool;
49 list_add(&msg->list_head, &pool->msgs);
50 pool->num++;
51 }
52 while (pool->num > pool->min) {
53 msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
54 dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
55 pool->min, msg);
56 list_del_init(&msg->list_head);
57 pool->num--;
58 ceph_msg_kfree(msg);
59 }
60 return 0;
61}
62
63int ceph_msgpool_init(struct ceph_msgpool *pool,
64 int front_len, int min, bool blocking)
65{
66 int ret;
67
68 dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
69 spin_lock_init(&pool->lock);
70 pool->front_len = front_len;
71 INIT_LIST_HEAD(&pool->msgs);
72 pool->num = 0;
73 pool->min = min;
74 pool->blocking = blocking;
75 init_waitqueue_head(&pool->wait);
76
77 spin_lock(&pool->lock);
78 ret = __fill_msgpool(pool);
79 spin_unlock(&pool->lock);
80 return ret;
81}
82
83void ceph_msgpool_destroy(struct ceph_msgpool *pool)
84{
85 dout("msgpool_destroy %p\n", pool);
86 spin_lock(&pool->lock);
87 pool->min = 0;
88 __fill_msgpool(pool);
89 spin_unlock(&pool->lock);
90}
91
92int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
93{
94 int ret;
95
96 spin_lock(&pool->lock);
97 dout("msgpool_resv %p delta %d\n", pool, delta);
98 pool->min += delta;
99 ret = __fill_msgpool(pool);
100 spin_unlock(&pool->lock);
101 return ret;
102}
103
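/*
 * Get a message from the pool. A blocking pool first tries a fresh
 * allocation (mempool_t behavior) and otherwise sleeps until a message
 * comes back via ceph_msgpool_put(); a non-blocking pool falls back to
 * a fresh allocation and may fail with ERR_PTR(-ENOMEM).
 */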
104struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
105{
106 wait_queue_t wait;
107 struct ceph_msg *msg;
108
109 if (front_len && front_len > pool->front_len) {
110 pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
111 pool, front_len, pool->front_len);
112 WARN_ON(1);
113
114 /* try to alloc a fresh message */
115 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
116 if (!IS_ERR(msg))
117 return msg;
118 }
119
120 if (!front_len)
121 front_len = pool->front_len;
122
123 if (pool->blocking) {
124 /* mempool_t behavior; first try to alloc */
125 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
126 if (!IS_ERR(msg))
127 return msg;
128 }
129
130 while (1) {
131 spin_lock(&pool->lock);
132 if (likely(pool->num)) {
133 msg = list_entry(pool->msgs.next, struct ceph_msg,
134 list_head);
135 list_del_init(&msg->list_head);
136 pool->num--;
137 dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
138 pool->num, pool->min);
139 spin_unlock(&pool->lock);
140 return msg;
141 }
142 pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
143 pool->min, pool->blocking ? "waiting" : "may fail");
144 spin_unlock(&pool->lock);
145
146 if (!pool->blocking) {
147 WARN_ON(1);
148
149 /* maybe we can allocate it now? */
150 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
151 if (!IS_ERR(msg))
152 return msg;
153
154 pr_err("msgpool_get %p empty + alloc failed\n", pool);
155 return ERR_PTR(-ENOMEM);
156 }
157
158 init_wait(&wait);
159 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
160 schedule();
161 finish_wait(&pool->wait, &wait);
162 }
163}
164
165void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
166{
167 spin_lock(&pool->lock);
168 if (pool->num < pool->min) {
169 /* reset msg front_len; user may have changed it */
170 msg->front.iov_len = pool->front_len;
171 msg->hdr.front_len = cpu_to_le32(pool->front_len);
172
173 kref_set(&msg->kref, 1); /* retake a single ref */
174 list_add(&msg->list_head, &pool->msgs);
175 pool->num++;
176 dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
177 pool->num, pool->min);
178 spin_unlock(&pool->lock);
179 wake_up(&pool->wait);
180 } else {
181 dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
182 pool->num, pool->min);
183 spin_unlock(&pool->lock);
184 ceph_msg_kfree(msg);
185 }
186}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
new file mode 100644
index 000000000000..bc834bfcd720
--- /dev/null
+++ b/fs/ceph/msgpool.h
@@ -0,0 +1,27 @@
1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL
3
4#include "messenger.h"
5
6/*
7 * we use memory pools for preallocating messages we may receive, to
8 * avoid unexpected OOM conditions.
9 */
10struct ceph_msgpool {
11 spinlock_t lock;
12 int front_len; /* preallocated payload size */
13 struct list_head msgs; /* msgs in the pool; each has 1 ref */
14 int num, min; /* cur, min # msgs in the pool */
15 bool blocking;
16 wait_queue_head_t wait;
17};
18
19extern int ceph_msgpool_init(struct ceph_msgpool *pool,
20 int front_len, int size, bool blocking);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
26
27#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
new file mode 100644
index 000000000000..8aaab414f3f8
--- /dev/null
+++ b/fs/ceph/msgr.h
@@ -0,0 +1,158 @@
1#ifndef __MSGR_H
2#define __MSGR_H
3
4/*
5 * Data types for message passing layer used by Ceph.
6 */
7
8#define CEPH_MON_PORT 6789 /* default monitor port */
9
10/*
11 * client-side processes will try to bind to ports in this
12 * range, simply for the benefit of tools like nmap or wireshark
13 * that would like to identify the protocol.
14 */
15#define CEPH_PORT_FIRST 6789
16#define CEPH_PORT_START 6800 /* non-monitors start here */
17#define CEPH_PORT_LAST 6900
18
19/*
20 * tcp connection banner. include a protocol version. and adjust
21 * whenever the wire protocol changes. try to keep this string length
22 * constant.
23 */
24#define CEPH_BANNER "ceph v027"
25#define CEPH_BANNER_MAX_LEN 30
26
27
28/*
29 * Rollover-safe type and comparator for 32-bit sequence numbers.
30 * Comparator returns -1, 0, or 1.
31 */
32typedef __u32 ceph_seq_t;
33
34static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
35{
36 return (__s32)a - (__s32)b;
37}
38
39
40/*
41 * entity_name -- logical name for a process participating in the
42 * network, e.g. 'mds0' or 'osd3'.
43 */
44struct ceph_entity_name {
45 __u8 type; /* CEPH_ENTITY_TYPE_* */
46 __le64 num;
47} __attribute__ ((packed));
48
49#define CEPH_ENTITY_TYPE_MON 0x01
50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_ADMIN 0x10
54#define CEPH_ENTITY_TYPE_AUTH 0x20
55
56#define CEPH_ENTITY_TYPE_ANY 0xFF
57
58extern const char *ceph_entity_type_name(int type);
59
60/*
61 * entity_addr -- network address
62 */
63struct ceph_entity_addr {
64 __le32 type;
65 __le32 nonce; /* unique id for process (e.g. pid) */
66 struct sockaddr_storage in_addr;
67} __attribute__ ((packed));
68
69struct ceph_entity_inst {
70 struct ceph_entity_name name;
71 struct ceph_entity_addr addr;
72} __attribute__ ((packed));
73
74
75/* used by message exchange protocol */
76#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
77#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
78#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
79 incoming connection */
80#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
81 with higher cseq */
82#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
83 with higher gseq */
84#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
85#define CEPH_MSGR_TAG_MSG 7 /* message */
86#define CEPH_MSGR_TAG_ACK 8 /* message ack */
87#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
88#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
89#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
90#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
91
92
93/*
94 * connection negotiation
95 */
96struct ceph_msg_connect {
97 __le64 features; /* supported feature bits */
98 __le32 host_type; /* CEPH_ENTITY_TYPE_* */
99 __le32 global_seq; /* count connections initiated by this host */
100 __le32 connect_seq; /* count connections initiated in this session */
101 __le32 protocol_version;
102 __le32 authorizer_protocol;
103 __le32 authorizer_len;
104 __u8 flags; /* CEPH_MSG_CONNECT_* */
105} __attribute__ ((packed));
106
107struct ceph_msg_connect_reply {
108 __u8 tag;
109 __le64 features; /* feature bits for this session */
110 __le32 global_seq;
111 __le32 connect_seq;
112 __le32 protocol_version;
113 __le32 authorizer_len;
114 __u8 flags;
115} __attribute__ ((packed));
116
117#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
118
119
120/*
121 * message header
122 */
123struct ceph_msg_header {
124 __le64 seq; /* message seq# for this session */
125 __le64 tid; /* transaction id */
126 __le16 type; /* message type */
127 __le16 priority; /* priority. higher value == higher priority */
128 __le16 version; /* version of message encoding */
129
130 __le32 front_len; /* bytes in main payload */
131 __le32 middle_len;/* bytes in middle payload */
132 __le32 data_len; /* bytes of data payload */
133 __le16 data_off; /* sender: include full offset;
134 receiver: mask against ~PAGE_MASK */
135
136 struct ceph_entity_inst src, orig_src;
137 __le32 reserved;
138 __le32 crc; /* header crc32c */
139} __attribute__ ((packed));
140
141#define CEPH_MSG_PRIO_LOW 64
142#define CEPH_MSG_PRIO_DEFAULT 127
143#define CEPH_MSG_PRIO_HIGH 196
144#define CEPH_MSG_PRIO_HIGHEST 255
145
146/*
147 * follows data payload
148 */
149struct ceph_msg_footer {
150 __le32 front_crc, middle_crc, data_crc;
151 __u8 flags;
152} __attribute__ ((packed));
153
154#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
155#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
156
157
158#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
new file mode 100644
index 000000000000..c7b4dedaace6
--- /dev/null
+++ b/fs/ceph/osd_client.c
@@ -0,0 +1,1550 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/highmem.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/slab.h>
8#include <linux/uaccess.h>
9
10#include "super.h"
11#include "osd_client.h"
12#include "messenger.h"
13#include "decode.h"
14#include "auth.h"
15
16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512
18
19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd);
22
23static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
24
25/*
26 * Implement client access to distributed object storage cluster.
27 *
28 * All data objects are stored within a cluster/cloud of OSDs, or
29 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
30 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
31 * remote daemons serving up and coordinating consistent and safe
32 * access to storage.
33 *
34 * Cluster membership and the mapping of data objects onto storage devices
35 * are described by the osd map.
36 *
37 * We keep track of pending OSD requests (read, write), resubmit
38 * requests to different OSDs when the cluster topology/data layout
39 * change, or retry the affected requests when the communications
40 * channel with an OSD is reset.
41 */
42
43/*
44 * calculate the mapping of a file extent onto an object, and fill out the
45 * request accordingly. shorten extent as necessary if it crosses an
46 * object boundary.
47 *
48 * fill osd op in request message.
49 */
50static void calc_layout(struct ceph_osd_client *osdc,
51 struct ceph_vino vino, struct ceph_file_layout *layout,
52 u64 off, u64 *plen,
53 struct ceph_osd_request *req)
54{
55 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
56 struct ceph_osd_op *op = (void *)(reqhead + 1);
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59 u64 bno;
60
61 reqhead->snapid = cpu_to_le64(vino.snap);
62
63 /* object extent? */
64 ceph_calc_file_object_mapping(layout, off, plen, &bno,
65 &objoff, &objlen);
66 if (*plen < orig_len)
67 dout(" skipping last %llu, final file extent %llu~%llu\n",
68 orig_len - *plen, off, *plen);
69
70 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
71 req->r_oid_len = strlen(req->r_oid);
72
73 op->extent.offset = cpu_to_le64(objoff);
74 op->extent.length = cpu_to_le64(objlen);
75 req->r_num_pages = calc_pages_for(off, *plen);
76
77 dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
78 req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
79}
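/*
 * Worked example (illustrative; assumes a trivial layout of 4 MB
 * objects with stripe unit == object size and stripe count 1): a
 * request at off=6 MB with *plen=4 MB maps to bno=1, objoff=2 MB,
 * and *plen is shortened to 2 MB at the object boundary. The caller
 * is expected to resubmit the remaining 2 MB against the next object.
 */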
80
81/*
82 * requests
83 */
84void ceph_osdc_release_request(struct kref *kref)
85{
86 struct ceph_osd_request *req = container_of(kref,
87 struct ceph_osd_request,
88 r_kref);
89
90 if (req->r_request)
91 ceph_msg_put(req->r_request);
92 if (req->r_reply)
93 ceph_msg_put(req->r_reply);
94 if (req->r_con_filling_msg) {
95 dout("release_request revoking pages %p from con %p\n",
96 req->r_pages, req->r_con_filling_msg);
97 ceph_con_revoke_message(req->r_con_filling_msg,
98 req->r_reply);
99 ceph_con_put(req->r_con_filling_msg);
100 }
101 if (req->r_own_pages)
102 ceph_release_page_vector(req->r_pages,
103 req->r_num_pages);
104 ceph_put_snap_context(req->r_snapc);
105 if (req->r_mempool)
106 mempool_free(req, req->r_osdc->req_mempool);
107 else
108 kfree(req);
109}
110
111/*
112 * build new request AND message, calculate layout, and adjust file
113 * extent as needed.
114 *
115 * if the file was recently truncated, we include information about its
116 * old and new size so that the object can be updated appropriately. (we
117 * avoid synchronously deleting truncated objects because it's slow.)
118 *
119 * if @do_sync, include a 'startsync' command so that the osd will flush
120 * data quickly.
121 */
122struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 off, u64 *plen,
126 int opcode, int flags,
127 struct ceph_snap_context *snapc,
128 int do_sync,
129 u32 truncate_seq,
130 u64 truncate_size,
131 struct timespec *mtime,
132 bool use_mempool, int num_reply)
133{
134 struct ceph_osd_request *req;
135 struct ceph_msg *msg;
136 struct ceph_osd_request_head *head;
137 struct ceph_osd_op *op;
138 void *p;
139 int num_op = 1 + do_sync;
140 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
141 int i;
142
143 if (use_mempool) {
144 req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
145 memset(req, 0, sizeof(*req));
146 } else {
147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 }
149 if (req == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool;
154 kref_init(&req->r_kref);
155 init_completion(&req->r_completion);
156 init_completion(&req->r_safe_completion);
157 INIT_LIST_HEAD(&req->r_unsafe_item);
158 req->r_flags = flags;
159
160 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
161
162 /* create reply message */
163 if (use_mempool)
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
168 if (IS_ERR(msg)) {
169 ceph_osdc_put_request(req);
170 return ERR_PTR(PTR_ERR(msg));
171 }
172 req->r_reply = msg;
173
174 /* create request message; allow space for oid */
175 msg_size += 40;
176 if (snapc)
177 msg_size += sizeof(u64) * snapc->num_snaps;
178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
182 if (IS_ERR(msg)) {
183 ceph_osdc_put_request(req);
184 return ERR_PTR(PTR_ERR(msg));
185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len);
188 head = msg->front.iov_base;
189 op = (void *)(head + 1);
190 p = (void *)(op + num_op);
191
192 req->r_request = msg;
193 req->r_snapc = ceph_get_snap_context(snapc);
194
195 head->client_inc = cpu_to_le32(1); /* always, for now. */
196 head->flags = cpu_to_le32(flags);
197 if (flags & CEPH_OSD_FLAG_WRITE)
198 ceph_encode_timespec(&head->mtime, mtime);
199 head->num_ops = cpu_to_le16(num_op);
200 op->op = cpu_to_le16(opcode);
201
202 /* calculate max write size */
203 calc_layout(osdc, vino, layout, off, plen, req);
204 req->r_file_layout = *layout; /* keep a copy */
205
206 if (flags & CEPH_OSD_FLAG_WRITE) {
207 req->r_request->hdr.data_off = cpu_to_le16(off);
208 req->r_request->hdr.data_len = cpu_to_le32(*plen);
209 op->payload_len = cpu_to_le32(*plen);
210 }
211 op->extent.truncate_size = cpu_to_le64(truncate_size);
212 op->extent.truncate_seq = cpu_to_le32(truncate_seq);
213
214 /* fill in oid */
215 head->object_len = cpu_to_le32(req->r_oid_len);
216 memcpy(p, req->r_oid, req->r_oid_len);
217 p += req->r_oid_len;
218
219 if (do_sync) {
220 op++;
221 op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
222 }
223 if (snapc) {
224 head->snap_seq = cpu_to_le64(snapc->seq);
225 head->num_snaps = cpu_to_le32(snapc->num_snaps);
226 for (i = 0; i < snapc->num_snaps; i++) {
227 put_unaligned_le64(snapc->snaps[i], p);
228 p += sizeof(u64);
229 }
230 }
231
232 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
233 msg_size = p - msg->front.iov_base;
234 msg->front.iov_len = msg_size;
235 msg->hdr.front_len = cpu_to_le32(msg_size);
236 return req;
237}
238
239/*
240 * We keep osd requests in an rbtree, sorted by ->r_tid.
241 */
242static void __insert_request(struct ceph_osd_client *osdc,
243 struct ceph_osd_request *new)
244{
245 struct rb_node **p = &osdc->requests.rb_node;
246 struct rb_node *parent = NULL;
247 struct ceph_osd_request *req = NULL;
248
249 while (*p) {
250 parent = *p;
251 req = rb_entry(parent, struct ceph_osd_request, r_node);
252 if (new->r_tid < req->r_tid)
253 p = &(*p)->rb_left;
254 else if (new->r_tid > req->r_tid)
255 p = &(*p)->rb_right;
256 else
257 BUG();
258 }
259
260 rb_link_node(&new->r_node, parent, p);
261 rb_insert_color(&new->r_node, &osdc->requests);
262}
263
264static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
265 u64 tid)
266{
267 struct ceph_osd_request *req;
268 struct rb_node *n = osdc->requests.rb_node;
269
270 while (n) {
271 req = rb_entry(n, struct ceph_osd_request, r_node);
272 if (tid < req->r_tid)
273 n = n->rb_left;
274 else if (tid > req->r_tid)
275 n = n->rb_right;
276 else
277 return req;
278 }
279 return NULL;
280}
281
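/*
 * Find the registered request with the smallest tid >= @tid, or NULL
 * if there is none.
 */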
282static struct ceph_osd_request *
283__lookup_request_ge(struct ceph_osd_client *osdc,
284 u64 tid)
285{
286 struct ceph_osd_request *req;
287 struct rb_node *n = osdc->requests.rb_node;
288
289 while (n) {
290 req = rb_entry(n, struct ceph_osd_request, r_node);
291 if (tid < req->r_tid) {
292 if (!n->rb_left)
293 return req;
294 n = n->rb_left;
295 } else if (tid > req->r_tid) {
296 n = n->rb_right;
297 } else {
298 return req;
299 }
300 }
301 return NULL;
302}
303
304
305/*
306 * If the osd connection drops, we need to resubmit all requests.
307 */
308static void osd_reset(struct ceph_connection *con)
309{
310 struct ceph_osd *osd = con->private;
311 struct ceph_osd_client *osdc;
312
313 if (!osd)
314 return;
315 dout("osd_reset osd%d\n", osd->o_osd);
316 osdc = osd->o_osdc;
317 down_read(&osdc->map_sem);
318 kick_requests(osdc, osd);
319 up_read(&osdc->map_sem);
320}
321
322/*
323 * Track open sessions with osds.
324 */
325static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
326{
327 struct ceph_osd *osd;
328
329 osd = kzalloc(sizeof(*osd), GFP_NOFS);
330 if (!osd)
331 return NULL;
332
333 atomic_set(&osd->o_ref, 1);
334 osd->o_osdc = osdc;
335 INIT_LIST_HEAD(&osd->o_requests);
336 INIT_LIST_HEAD(&osd->o_osd_lru);
337 osd->o_incarnation = 1;
338
339 ceph_con_init(osdc->client->msgr, &osd->o_con);
340 osd->o_con.private = osd;
341 osd->o_con.ops = &osd_con_ops;
342 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
343
344 INIT_LIST_HEAD(&osd->o_keepalive_item);
345 return osd;
346}
347
348static struct ceph_osd *get_osd(struct ceph_osd *osd)
349{
350 if (atomic_inc_not_zero(&osd->o_ref)) {
351 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
352 atomic_read(&osd->o_ref));
353 return osd;
354 } else {
355 dout("get_osd %p FAIL\n", osd);
356 return NULL;
357 }
358}
359
360static void put_osd(struct ceph_osd *osd)
361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref))
365 kfree(osd);
366}
367
368/*
369 * remove an osd from our map
370 */
371static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
372{
373 dout("__remove_osd %p\n", osd);
374 BUG_ON(!list_empty(&osd->o_requests));
375 rb_erase(&osd->o_node, &osdc->osds);
376 list_del_init(&osd->o_osd_lru);
377 ceph_con_close(&osd->o_con);
378 put_osd(osd);
379}
380
381static void __move_osd_to_lru(struct ceph_osd_client *osdc,
382 struct ceph_osd *osd)
383{
384 dout("__move_osd_to_lru %p\n", osd);
385 BUG_ON(!list_empty(&osd->o_osd_lru));
386 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
387 osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
388}
389
390static void __remove_osd_from_lru(struct ceph_osd *osd)
391{
392 dout("__remove_osd_from_lru %p\n", osd);
393 if (!list_empty(&osd->o_osd_lru))
394 list_del_init(&osd->o_osd_lru);
395}
396
397static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
398{
399 struct ceph_osd *osd, *nosd;
400
401 dout("__remove_old_osds %p\n", osdc);
402 mutex_lock(&osdc->request_mutex);
403 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
404 if (!remove_all && time_before(jiffies, osd->lru_ttl))
405 break;
406 __remove_osd(osdc, osd);
407 }
408 mutex_unlock(&osdc->request_mutex);
409}
410
411/*
412 * reset osd connect
413 */
414static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
415{
416 struct ceph_osd_request *req;
417 int ret = 0;
418
419 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
420 if (list_empty(&osd->o_requests)) {
421 __remove_osd(osdc, osd);
422 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
423 &osd->o_con.peer_addr,
424 sizeof(osd->o_con.peer_addr)) == 0 &&
425 !ceph_con_opened(&osd->o_con)) {
426 dout(" osd addr hasn't changed and connection never opened,"
427 " letting msgr retry");
 428 /* touch each r_stamp for handle_timeout()'s benefit */
429 list_for_each_entry(req, &osd->o_requests, r_osd_item)
430 req->r_stamp = jiffies;
431 ret = -EAGAIN;
432 } else {
433 ceph_con_close(&osd->o_con);
434 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
435 osd->o_incarnation++;
436 }
437 return ret;
438}
439
440static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
441{
442 struct rb_node **p = &osdc->osds.rb_node;
443 struct rb_node *parent = NULL;
444 struct ceph_osd *osd = NULL;
445
446 while (*p) {
447 parent = *p;
448 osd = rb_entry(parent, struct ceph_osd, o_node);
449 if (new->o_osd < osd->o_osd)
450 p = &(*p)->rb_left;
451 else if (new->o_osd > osd->o_osd)
452 p = &(*p)->rb_right;
453 else
454 BUG();
455 }
456
457 rb_link_node(&new->o_node, parent, p);
458 rb_insert_color(&new->o_node, &osdc->osds);
459}
460
461static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
462{
463 struct ceph_osd *osd;
464 struct rb_node *n = osdc->osds.rb_node;
465
466 while (n) {
467 osd = rb_entry(n, struct ceph_osd, o_node);
468 if (o < osd->o_osd)
469 n = n->rb_left;
470 else if (o > osd->o_osd)
471 n = n->rb_right;
472 else
473 return osd;
474 }
475 return NULL;
476}
477
478static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
479{
480 schedule_delayed_work(&osdc->timeout_work,
481 osdc->client->mount_args->osd_keepalive_timeout * HZ);
482}
483
484static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
485{
486 cancel_delayed_work(&osdc->timeout_work);
487}
488
489/*
490 * Register request, assign tid. If this is the first request, set up
491 * the timeout event.
492 */
493static void register_request(struct ceph_osd_client *osdc,
494 struct ceph_osd_request *req)
495{
496 mutex_lock(&osdc->request_mutex);
497 req->r_tid = ++osdc->last_tid;
498 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
499 INIT_LIST_HEAD(&req->r_req_lru_item);
500
501 dout("register_request %p tid %lld\n", req, req->r_tid);
502 __insert_request(osdc, req);
503 ceph_osdc_get_request(req);
504 osdc->num_requests++;
505
506 if (osdc->num_requests == 1) {
507 dout(" first request, scheduling timeout\n");
508 __schedule_osd_timeout(osdc);
509 }
510 mutex_unlock(&osdc->request_mutex);
511}
512
513/*
514 * called under osdc->request_mutex
515 */
516static void __unregister_request(struct ceph_osd_client *osdc,
517 struct ceph_osd_request *req)
518{
519 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
520 rb_erase(&req->r_node, &osdc->requests);
521 osdc->num_requests--;
522
523 if (req->r_osd) {
524 /* make sure the original request isn't in flight. */
525 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
526
527 list_del_init(&req->r_osd_item);
528 if (list_empty(&req->r_osd->o_requests))
529 __move_osd_to_lru(osdc, req->r_osd);
530 req->r_osd = NULL;
531 }
532
533 ceph_osdc_put_request(req);
534
535 list_del_init(&req->r_req_lru_item);
536 if (osdc->num_requests == 0) {
537 dout(" no requests, canceling timeout\n");
538 __cancel_osd_timeout(osdc);
539 }
540}
541
542/*
543 * Cancel a previously queued request message
544 */
545static void __cancel_request(struct ceph_osd_request *req)
546{
547 if (req->r_sent) {
548 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
549 req->r_sent = 0;
550 }
551 list_del_init(&req->r_req_lru_item);
552}
553
554/*
555 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
556 * (as needed), and set the request r_osd appropriately. If there is
557 * no up osd, set r_osd to NULL.
558 *
559 * Return 0 if unchanged, 1 if changed, or negative on error.
560 *
561 * Caller should hold map_sem for read and request_mutex.
562 */
563static int __map_osds(struct ceph_osd_client *osdc,
564 struct ceph_osd_request *req)
565{
566 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
567 struct ceph_pg pgid;
568 int o = -1;
569 int err;
570
571 dout("map_osds %p tid %lld\n", req, req->r_tid);
572 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
573 &req->r_file_layout, osdc->osdmap);
574 if (err)
575 return err;
576 pgid = reqhead->layout.ol_pgid;
577 req->r_pgid = pgid;
578
579 o = ceph_calc_pg_primary(osdc->osdmap, pgid);
580
581 if ((req->r_osd && req->r_osd->o_osd == o &&
582 req->r_sent >= req->r_osd->o_incarnation) ||
583 (req->r_osd == NULL && o == -1))
584 return 0; /* no change */
585
586 dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
587 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
588 req->r_osd ? req->r_osd->o_osd : -1);
589
590 if (req->r_osd) {
591 __cancel_request(req);
592 list_del_init(&req->r_osd_item);
593 req->r_osd = NULL;
594 }
595
596 req->r_osd = __lookup_osd(osdc, o);
597 if (!req->r_osd && o >= 0) {
598 err = -ENOMEM;
599 req->r_osd = create_osd(osdc);
600 if (!req->r_osd)
601 goto out;
602
603 dout("map_osds osd %p is osd%d\n", req->r_osd, o);
604 req->r_osd->o_osd = o;
605 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
606 __insert_osd(osdc, req->r_osd);
607
608 ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
609 }
610
611 if (req->r_osd) {
612 __remove_osd_from_lru(req->r_osd);
613 list_add(&req->r_osd_item, &req->r_osd->o_requests);
614 }
615 err = 1; /* osd changed */
616
617out:
618 return err;
619}
620
621/*
622 * caller should hold map_sem (for read) and request_mutex
623 */
624static int __send_request(struct ceph_osd_client *osdc,
625 struct ceph_osd_request *req)
626{
627 struct ceph_osd_request_head *reqhead;
628 int err;
629
630 err = __map_osds(osdc, req);
631 if (err < 0)
632 return err;
633 if (req->r_osd == NULL) {
634 dout("send_request %p no up osds in pg\n", req);
635 ceph_monc_request_next_osdmap(&osdc->client->monc);
636 return 0;
637 }
638
639 dout("send_request %p tid %llu to osd%d flags %d\n",
640 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
641
642 reqhead = req->r_request->front.iov_base;
643 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
644 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
645 reqhead->reassert_version = req->r_reassert_version;
646
647 req->r_stamp = jiffies;
648 list_move_tail(&osdc->req_lru, &req->r_req_lru_item);
649
650 ceph_msg_get(req->r_request); /* send consumes a ref */
651 ceph_con_send(&req->r_osd->o_con, req->r_request);
652 req->r_sent = req->r_osd->o_incarnation;
653 return 0;
654}
655
656/*
 657 * Timeout callback, called every N seconds when 1 or more osd
 658 * requests have been active for more than N seconds. When this
 659 * happens, we ping all osds whose requests have timed out to
 660 * ensure any communications channel reset is detected. Reset the
 661 * request timeouts another N seconds in the future as we go.
 662 * Reschedule the timeout event another N seconds in the future
 663 * (unless there are no open requests).
664 */
665static void handle_timeout(struct work_struct *work)
666{
667 struct ceph_osd_client *osdc =
668 container_of(work, struct ceph_osd_client, timeout_work.work);
669 struct ceph_osd_request *req, *last_req = NULL;
670 struct ceph_osd *osd;
671 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
672 unsigned long keepalive =
673 osdc->client->mount_args->osd_keepalive_timeout * HZ;
674 unsigned long last_stamp = 0;
675 struct rb_node *p;
676 struct list_head slow_osds;
677
678 dout("timeout\n");
679 down_read(&osdc->map_sem);
680
681 ceph_monc_request_next_osdmap(&osdc->client->monc);
682
683 mutex_lock(&osdc->request_mutex);
684 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
685 req = rb_entry(p, struct ceph_osd_request, r_node);
686
687 if (req->r_resend) {
688 int err;
689
690 dout("osdc resending prev failed %lld\n", req->r_tid);
691 err = __send_request(osdc, req);
692 if (err)
693 dout("osdc failed again on %lld\n", req->r_tid);
694 else
695 req->r_resend = false;
696 continue;
697 }
698 }
699
700 /*
701 * reset osds that appear to be _really_ unresponsive. this
702 * is a failsafe measure.. we really shouldn't be getting to
703 * this point if the system is working properly. the monitors
704 * should mark the osd as failed and we should find out about
705 * it from an updated osd map.
706 */
707 while (!list_empty(&osdc->req_lru)) {
708 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
709 r_req_lru_item);
710
711 if (time_before(jiffies, req->r_stamp + timeout))
712 break;
713
714 BUG_ON(req == last_req && req->r_stamp == last_stamp);
715 last_req = req;
716 last_stamp = req->r_stamp;
717
718 osd = req->r_osd;
719 BUG_ON(!osd);
720 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
721 req->r_tid, osd->o_osd);
722 __kick_requests(osdc, osd);
723 }
724
725 /*
726 * ping osds that are a bit slow. this ensures that if there
727 * is a break in the TCP connection we will notice, and reopen
728 * a connection with that osd (from the fault callback).
729 */
730 INIT_LIST_HEAD(&slow_osds);
731 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
732 if (time_before(jiffies, req->r_stamp + keepalive))
733 break;
734
735 osd = req->r_osd;
736 BUG_ON(!osd);
737 dout(" tid %llu is slow, will send keepalive on osd%d\n",
738 req->r_tid, osd->o_osd);
739 list_move_tail(&osd->o_keepalive_item, &slow_osds);
740 }
741 while (!list_empty(&slow_osds)) {
742 osd = list_entry(slow_osds.next, struct ceph_osd,
743 o_keepalive_item);
744 list_del_init(&osd->o_keepalive_item);
745 ceph_con_keepalive(&osd->o_con);
746 }
747
748 __schedule_osd_timeout(osdc);
749 mutex_unlock(&osdc->request_mutex);
750
751 up_read(&osdc->map_sem);
752}
753
754static void handle_osds_timeout(struct work_struct *work)
755{
756 struct ceph_osd_client *osdc =
757 container_of(work, struct ceph_osd_client,
758 osds_timeout_work.work);
759 unsigned long delay =
760 osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
761
762 dout("osds timeout\n");
763 down_read(&osdc->map_sem);
764 remove_old_osds(osdc, 0);
765 up_read(&osdc->map_sem);
766
767 schedule_delayed_work(&osdc->osds_timeout_work,
768 round_jiffies_relative(delay));
769}
770
771/*
772 * handle osd op reply. either call the callback if it is specified,
773 * or do the completion to wake up the waiting thread.
774 */
775static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
776 struct ceph_connection *con)
777{
778 struct ceph_osd_reply_head *rhead = msg->front.iov_base;
779 struct ceph_osd_request *req;
780 u64 tid;
781 int numops, object_len, flags;
782
783 tid = le64_to_cpu(msg->hdr.tid);
784 if (msg->front.iov_len < sizeof(*rhead))
785 goto bad;
786 numops = le32_to_cpu(rhead->num_ops);
787 object_len = le32_to_cpu(rhead->object_len);
788 if (msg->front.iov_len != sizeof(*rhead) + object_len +
789 numops * sizeof(struct ceph_osd_op))
790 goto bad;
791 dout("handle_reply %p tid %llu\n", msg, tid);
792
793 /* lookup */
794 mutex_lock(&osdc->request_mutex);
795 req = __lookup_request(osdc, tid);
796 if (req == NULL) {
797 dout("handle_reply tid %llu dne\n", tid);
798 mutex_unlock(&osdc->request_mutex);
799 return;
800 }
801 ceph_osdc_get_request(req);
802 flags = le32_to_cpu(rhead->flags);
803
804 /*
805 * if this connection filled our message, drop our reference now, to
806 * avoid a (safe but slower) revoke later.
807 */
808 if (req->r_con_filling_msg == con && req->r_reply == msg) {
809 dout(" dropping con_filling_msg ref %p\n", con);
810 req->r_con_filling_msg = NULL;
811 ceph_con_put(con);
812 }
813
814 if (!req->r_got_reply) {
815 unsigned bytes;
816
817 req->r_result = le32_to_cpu(rhead->result);
818 bytes = le32_to_cpu(msg->hdr.data_len);
819 dout("handle_reply result %d bytes %d\n", req->r_result,
820 bytes);
821 if (req->r_result == 0)
822 req->r_result = bytes;
823
824 /* in case this is a write and we need to replay, */
825 req->r_reassert_version = rhead->reassert_version;
826
827 req->r_got_reply = 1;
828 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
829 dout("handle_reply tid %llu dup ack\n", tid);
830 mutex_unlock(&osdc->request_mutex);
831 goto done;
832 }
833
834 dout("handle_reply tid %llu flags %d\n", tid, flags);
835
836 /* either this is a read, or we got the safe response */
837 if ((flags & CEPH_OSD_FLAG_ONDISK) ||
838 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
839 __unregister_request(osdc, req);
840
841 mutex_unlock(&osdc->request_mutex);
842
843 if (req->r_callback)
844 req->r_callback(req, msg);
845 else
846 complete(&req->r_completion);
847
848 if (flags & CEPH_OSD_FLAG_ONDISK) {
849 if (req->r_safe_callback)
850 req->r_safe_callback(req, msg);
851 complete(&req->r_safe_completion); /* fsync waiter */
852 }
853
854done:
855 ceph_osdc_put_request(req);
856 return;
857
858bad:
859 pr_err("corrupt osd_op_reply got %d %d expected %d\n",
860 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
861 (int)sizeof(*rhead));
862 ceph_msg_dump(msg);
863}
864
865
866static int __kick_requests(struct ceph_osd_client *osdc,
867 struct ceph_osd *kickosd)
868{
869 struct ceph_osd_request *req;
870 struct rb_node *p, *n;
871 int needmap = 0;
872 int err;
873
874 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
875 if (kickosd) {
876 err = __reset_osd(osdc, kickosd);
877 if (err == -EAGAIN)
878 return 1;
879 } else {
880 for (p = rb_first(&osdc->osds); p; p = n) {
881 struct ceph_osd *osd =
882 rb_entry(p, struct ceph_osd, o_node);
883
884 n = rb_next(p);
885 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
886 memcmp(&osd->o_con.peer_addr,
887 ceph_osd_addr(osdc->osdmap,
888 osd->o_osd),
889 sizeof(struct ceph_entity_addr)) != 0)
890 __reset_osd(osdc, osd);
891 }
892 }
893
894 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
895 req = rb_entry(p, struct ceph_osd_request, r_node);
896
897 if (req->r_resend) {
898 dout(" r_resend set on tid %llu\n", req->r_tid);
899 __cancel_request(req);
900 goto kick;
901 }
902 if (req->r_osd && kickosd == req->r_osd) {
903 __cancel_request(req);
904 goto kick;
905 }
906
907 err = __map_osds(osdc, req);
908 if (err == 0)
909 continue; /* no change */
910 if (err < 0) {
911 /*
912 * FIXME: really, we should set the request
913 * error and fail if this isn't a 'nofail'
914 * request, but that's a fair bit more
915 * complicated to do. So retry!
916 */
917 dout(" setting r_resend on %llu\n", req->r_tid);
918 req->r_resend = true;
919 continue;
920 }
921 if (req->r_osd == NULL) {
922 dout("tid %llu maps to no valid osd\n", req->r_tid);
923 needmap++; /* request a newer map */
924 continue;
925 }
926
927kick:
928 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
929 req->r_osd ? req->r_osd->o_osd : -1);
930 req->r_flags |= CEPH_OSD_FLAG_RETRY;
931 err = __send_request(osdc, req);
932 if (err) {
933 dout(" setting r_resend on %llu\n", req->r_tid);
934 req->r_resend = true;
935 }
936 }
937
938 return needmap;
939}
940
941/*
942 * Resubmit osd requests whose osd or osd address has changed. Request
943 * a new osd map if osds are down, or we are otherwise unable to determine
944 * how to direct a request.
945 *
946 * Close connections to down osds.
947 *
948 * If @who is specified, resubmit requests for that specific osd.
949 *
950 * Caller should hold map_sem for read and request_mutex.
951 */
952static void kick_requests(struct ceph_osd_client *osdc,
953 struct ceph_osd *kickosd)
954{
955 int needmap;
956
957 mutex_lock(&osdc->request_mutex);
958 needmap = __kick_requests(osdc, kickosd);
959 mutex_unlock(&osdc->request_mutex);
960
961 if (needmap) {
962 dout("%d requests for down osds, need new map\n", needmap);
963 ceph_monc_request_next_osdmap(&osdc->client->monc);
964 }
965
966}
967/*
968 * Process updated osd map.
969 *
970 * The message contains any number of incremental and full maps, normally
971 * indicating some sort of topology change in the cluster. Kick requests
972 * off to different OSDs as needed.
973 */
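/*
 * Front payload layout, as implied by the decoding below:
 *
 *	ceph_fsid  fsid
 *	u32        n_inc,  then n_inc  x { u32 epoch; u32 len; u8 data[len]; }
 *	u32        n_full, then n_full x { u32 epoch; u32 len; u8 data[len]; }
 */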
974void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
975{
976 void *p, *end, *next;
977 u32 nr_maps, maplen;
978 u32 epoch;
979 struct ceph_osdmap *newmap = NULL, *oldmap;
980 int err;
981 struct ceph_fsid fsid;
982
983 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
984 p = msg->front.iov_base;
985 end = p + msg->front.iov_len;
986
987 /* verify fsid */
988 ceph_decode_need(&p, end, sizeof(fsid), bad);
989 ceph_decode_copy(&p, &fsid, sizeof(fsid));
990 if (ceph_check_fsid(osdc->client, &fsid) < 0)
991 return;
992
993 down_write(&osdc->map_sem);
994
995 /* incremental maps */
996 ceph_decode_32_safe(&p, end, nr_maps, bad);
997 dout(" %d inc maps\n", nr_maps);
998 while (nr_maps > 0) {
999 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1000 epoch = ceph_decode_32(&p);
1001 maplen = ceph_decode_32(&p);
1002 ceph_decode_need(&p, end, maplen, bad);
1003 next = p + maplen;
1004 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
1005 dout("applying incremental map %u len %d\n",
1006 epoch, maplen);
1007 newmap = osdmap_apply_incremental(&p, next,
1008 osdc->osdmap,
1009 osdc->client->msgr);
1010 if (IS_ERR(newmap)) {
1011 err = PTR_ERR(newmap);
1012 goto bad;
1013 }
1014 BUG_ON(!newmap);
1015 if (newmap != osdc->osdmap) {
1016 ceph_osdmap_destroy(osdc->osdmap);
1017 osdc->osdmap = newmap;
1018 }
1019 } else {
1020 dout("ignoring incremental map %u len %d\n",
1021 epoch, maplen);
1022 }
1023 p = next;
1024 nr_maps--;
1025 }
1026 if (newmap)
1027 goto done;
1028
1029 /* full maps */
1030 ceph_decode_32_safe(&p, end, nr_maps, bad);
1031 dout(" %d full maps\n", nr_maps);
1032 while (nr_maps) {
1033 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1034 epoch = ceph_decode_32(&p);
1035 maplen = ceph_decode_32(&p);
1036 ceph_decode_need(&p, end, maplen, bad);
1037 if (nr_maps > 1) {
1038 dout("skipping non-latest full map %u len %d\n",
1039 epoch, maplen);
1040 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
1041 dout("skipping full map %u len %d, "
1042 "older than our %u\n", epoch, maplen,
1043 osdc->osdmap->epoch);
1044 } else {
1045 dout("taking full map %u len %d\n", epoch, maplen);
1046 newmap = osdmap_decode(&p, p+maplen);
1047 if (IS_ERR(newmap)) {
1048 err = PTR_ERR(newmap);
1049 goto bad;
1050 }
1051 BUG_ON(!newmap);
1052 oldmap = osdc->osdmap;
1053 osdc->osdmap = newmap;
1054 if (oldmap)
1055 ceph_osdmap_destroy(oldmap);
1056 }
1057 p += maplen;
1058 nr_maps--;
1059 }
1060
1061done:
1062 downgrade_write(&osdc->map_sem);
1063 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1064 if (newmap)
1065 kick_requests(osdc, NULL);
1066 up_read(&osdc->map_sem);
1067 return;
1068
1069bad:
1070 pr_err("osdc handle_map corrupt msg\n");
1071 ceph_msg_dump(msg);
1072 up_write(&osdc->map_sem);
1073 return;
1074}
1075
1076
1077/*
1078 * A read request prepares specific pages that data is to be read into.
1079 * When a message is being read off the wire, we call prepare_pages to
1080 * find those pages.
1081 * 0 = success, -1 failure.
1082 */
1083static int __prepare_pages(struct ceph_connection *con,
1084 struct ceph_msg_header *hdr,
1085 struct ceph_osd_request *req,
1086 u64 tid,
1087 struct ceph_msg *m)
1088{
1089 struct ceph_osd *osd = con->private;
1090 struct ceph_osd_client *osdc;
1091 int ret = -1;
1092 int data_len = le32_to_cpu(hdr->data_len);
1093 unsigned data_off = le16_to_cpu(hdr->data_off);
1094
1095 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1096
1097 if (!osd)
1098 return -1;
1099
1100 osdc = osd->o_osdc;
1101
1102 dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
1103 tid, req->r_num_pages, want);
1104 if (unlikely(req->r_num_pages < want))
1105 goto out;
1106 m->pages = req->r_pages;
1107 m->nr_pages = req->r_num_pages;
1108 ret = 0; /* success */
1109out:
1110 BUG_ON(ret < 0 || m->nr_pages < want);
1111
1112 return ret;
1113}
1114
1115/*
1116 * Register request, send initial attempt.
1117 */
1118int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1119 struct ceph_osd_request *req,
1120 bool nofail)
1121{
1122 int rc = 0;
1123
1124 req->r_request->pages = req->r_pages;
1125 req->r_request->nr_pages = req->r_num_pages;
1126
1127 register_request(osdc, req);
1128
1129 down_read(&osdc->map_sem);
1130 mutex_lock(&osdc->request_mutex);
1131 /*
1132 * a racing kick_requests() may have sent the message for us
1133 * while we dropped request_mutex above, so only send now if
 1134 * the request still hasn't been touched yet.
1135 */
1136 if (req->r_sent == 0) {
1137 rc = __send_request(osdc, req);
1138 if (rc) {
1139 if (nofail) {
1140 dout("osdc_start_request failed send, "
1141 " marking %lld\n", req->r_tid);
1142 req->r_resend = true;
1143 rc = 0;
1144 } else {
1145 __unregister_request(osdc, req);
1146 }
1147 }
1148 }
1149 mutex_unlock(&osdc->request_mutex);
1150 up_read(&osdc->map_sem);
1151 return rc;
1152}
1153
1154/*
1155 * wait for a request to complete
1156 */
1157int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1158 struct ceph_osd_request *req)
1159{
1160 int rc;
1161
1162 rc = wait_for_completion_interruptible(&req->r_completion);
1163 if (rc < 0) {
1164 mutex_lock(&osdc->request_mutex);
1165 __cancel_request(req);
1166 __unregister_request(osdc, req);
1167 mutex_unlock(&osdc->request_mutex);
1168 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1169 return rc;
1170 }
1171
1172 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1173 return req->r_result;
1174}
1175
1176/*
1177 * sync - wait for all in-flight requests to flush. avoid starvation.
1178 */
1179void ceph_osdc_sync(struct ceph_osd_client *osdc)
1180{
1181 struct ceph_osd_request *req;
1182 u64 last_tid, next_tid = 0;
1183
1184 mutex_lock(&osdc->request_mutex);
1185 last_tid = osdc->last_tid;
1186 while (1) {
1187 req = __lookup_request_ge(osdc, next_tid);
1188 if (!req)
1189 break;
1190 if (req->r_tid > last_tid)
1191 break;
1192
1193 next_tid = req->r_tid + 1;
1194 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
1195 continue;
1196
1197 ceph_osdc_get_request(req);
1198 mutex_unlock(&osdc->request_mutex);
1199 dout("sync waiting on tid %llu (last is %llu)\n",
1200 req->r_tid, last_tid);
1201 wait_for_completion(&req->r_safe_completion);
1202 mutex_lock(&osdc->request_mutex);
1203 ceph_osdc_put_request(req);
1204 }
1205 mutex_unlock(&osdc->request_mutex);
1206 dout("sync done (thru tid %llu)\n", last_tid);
1207}
1208
1209/*
1210 * init, shutdown
1211 */
1212int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1213{
1214 int err;
1215
1216 dout("init\n");
1217 osdc->client = client;
1218 osdc->osdmap = NULL;
1219 init_rwsem(&osdc->map_sem);
1220 init_completion(&osdc->map_waiters);
1221 osdc->last_requested_map = 0;
1222 mutex_init(&osdc->request_mutex);
1223 osdc->last_tid = 0;
1224 osdc->osds = RB_ROOT;
1225 INIT_LIST_HEAD(&osdc->osd_lru);
1226 osdc->requests = RB_ROOT;
1227 INIT_LIST_HEAD(&osdc->req_lru);
1228 osdc->num_requests = 0;
1229 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1230 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1231
1232 schedule_delayed_work(&osdc->osds_timeout_work,
1233 round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
1234
1235 err = -ENOMEM;
1236 osdc->req_mempool = mempool_create_kmalloc_pool(10,
1237 sizeof(struct ceph_osd_request));
1238 if (!osdc->req_mempool)
1239 goto out;
1240
1241 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true);
1242 if (err < 0)
1243 goto out_mempool;
1244 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1245 OSD_OPREPLY_FRONT_LEN, 10, true);
1246 if (err < 0)
1247 goto out_msgpool;
1248 return 0;
1249
1250out_msgpool:
1251 ceph_msgpool_destroy(&osdc->msgpool_op);
1252out_mempool:
1253 mempool_destroy(osdc->req_mempool);
1254out:
1255 return err;
1256}
1257
1258void ceph_osdc_stop(struct ceph_osd_client *osdc)
1259{
1260 cancel_delayed_work_sync(&osdc->timeout_work);
1261 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1262 if (osdc->osdmap) {
1263 ceph_osdmap_destroy(osdc->osdmap);
1264 osdc->osdmap = NULL;
1265 }
1266 remove_old_osds(osdc, 1);
1267 mempool_destroy(osdc->req_mempool);
1268 ceph_msgpool_destroy(&osdc->msgpool_op);
1269 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1270}
1271
1272/*
1273 * Read some contiguous pages. If we cross a stripe boundary, shorten
1274 * *plen. Return number of bytes read, or error.
1275 */
1276int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1277 struct ceph_vino vino, struct ceph_file_layout *layout,
1278 u64 off, u64 *plen,
1279 u32 truncate_seq, u64 truncate_size,
1280 struct page **pages, int num_pages)
1281{
1282 struct ceph_osd_request *req;
1283 int rc = 0;
1284
1285 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1286 vino.snap, off, *plen);
1287 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1288 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1289 NULL, 0, truncate_seq, truncate_size, NULL,
1290 false, 1);
1291 if (IS_ERR(req))
1292 return PTR_ERR(req);
1293
1294 /* it may be a short read due to an object boundary */
1295 req->r_pages = pages;
1296 num_pages = calc_pages_for(off, *plen);
1297 req->r_num_pages = num_pages;
1298
1299 dout("readpages final extent is %llu~%llu (%d pages)\n",
1300 off, *plen, req->r_num_pages);
1301
1302 rc = ceph_osdc_start_request(osdc, req, false);
1303 if (!rc)
1304 rc = ceph_osdc_wait_request(osdc, req);
1305
1306 ceph_osdc_put_request(req);
1307 dout("readpages result %d\n", rc);
1308 return rc;
1309}
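/*
 * Illustrative caller sketch (assumes a valid osdc, a vino/layout
 * taken from an inode, and a caller-owned page vector; not code from
 * this patch):
 *
 *	u64 len = want;
 *	int rc = ceph_osdc_readpages(osdc, vino, layout, off, &len,
 *				     truncate_seq, truncate_size,
 *				     pages, nr_pages);
 *	if (rc >= 0)
 *		rc bytes were read (possibly fewer than want if the
 *		extent crossed an object boundary)
 */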
1310
1311/*
1312 * do a synchronous write on N pages
1313 */
1314int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1315 struct ceph_file_layout *layout,
1316 struct ceph_snap_context *snapc,
1317 u64 off, u64 len,
1318 u32 truncate_seq, u64 truncate_size,
1319 struct timespec *mtime,
1320 struct page **pages, int num_pages,
1321 int flags, int do_sync, bool nofail)
1322{
1323 struct ceph_osd_request *req;
1324 int rc = 0;
1325
1326 BUG_ON(vino.snap != CEPH_NOSNAP);
1327 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1328 CEPH_OSD_OP_WRITE,
1329 flags | CEPH_OSD_FLAG_ONDISK |
1330 CEPH_OSD_FLAG_WRITE,
1331 snapc, do_sync,
1332 truncate_seq, truncate_size, mtime,
1333 nofail, 1);
1334 if (IS_ERR(req))
1335 return PTR_ERR(req);
1336
1337 /* it may be a short write due to an object boundary */
1338 req->r_pages = pages;
1339 req->r_num_pages = calc_pages_for(off, len);
1340 dout("writepages %llu~%llu (%d pages)\n", off, len,
1341 req->r_num_pages);
1342
1343 rc = ceph_osdc_start_request(osdc, req, nofail);
1344 if (!rc)
1345 rc = ceph_osdc_wait_request(osdc, req);
1346
1347 ceph_osdc_put_request(req);
1348 if (rc == 0)
1349 rc = len;
1350 dout("writepages result %d\n", rc);
1351 return rc;
1352}
1353
1354/*
1355 * handle incoming message
1356 */
1357static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1358{
1359 struct ceph_osd *osd = con->private;
1360 struct ceph_osd_client *osdc;
1361 int type = le16_to_cpu(msg->hdr.type);
1362
1363 if (!osd)
1364 return;
1365 osdc = osd->o_osdc;
1366
1367 switch (type) {
1368 case CEPH_MSG_OSD_MAP:
1369 ceph_osdc_handle_map(osdc, msg);
1370 break;
1371 case CEPH_MSG_OSD_OPREPLY:
1372 handle_reply(osdc, msg, con);
1373 break;
1374
1375 default:
1376 pr_err("received unknown message type %d %s\n", type,
1377 ceph_msg_type_name(type));
1378 }
1379 ceph_msg_put(msg);
1380}
1381
1382/*
1383 * lookup and return message for incoming reply
1384 */
1385static struct ceph_msg *get_reply(struct ceph_connection *con,
1386 struct ceph_msg_header *hdr,
1387 int *skip)
1388{
1389 struct ceph_osd *osd = con->private;
1390 struct ceph_osd_client *osdc = osd->o_osdc;
1391 struct ceph_msg *m;
1392 struct ceph_osd_request *req;
1393 int front = le32_to_cpu(hdr->front_len);
1394 int data_len = le32_to_cpu(hdr->data_len);
1395 u64 tid;
1396 int err;
1397
1398 tid = le64_to_cpu(hdr->tid);
1399 mutex_lock(&osdc->request_mutex);
1400 req = __lookup_request(osdc, tid);
1401 if (!req) {
1402 *skip = 1;
1403 m = NULL;
1404 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
1405 osd->o_osd);
1406 goto out;
1407 }
1408
1409 if (req->r_con_filling_msg) {
1410 dout("get_reply revoking msg %p from old con %p\n",
1411 req->r_reply, req->r_con_filling_msg);
1412 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1413 ceph_con_put(req->r_con_filling_msg);
1414 }
1415
1416 if (front > req->r_reply->front.iov_len) {
1417 pr_warning("get_reply front %d > preallocated %d\n",
1418 front, (int)req->r_reply->front.iov_len);
1419 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
1420 if (IS_ERR(m))
1421 goto out;
1422 ceph_msg_put(req->r_reply);
1423 req->r_reply = m;
1424 }
1425 m = ceph_msg_get(req->r_reply);
1426
1427 *skip = 0;
1428 if (data_len > 0) {
1429 err = __prepare_pages(con, hdr, req, tid, m);
1430 if (err < 0) {
1431 *skip = 1; /* preserve the error skip */
1432 ceph_msg_put(m);
1433 m = ERR_PTR(err);
1434 }
1435 }
1436 req->r_con_filling_msg = ceph_con_get(con);
1437 dout("get_reply tid %lld %p\n", tid, m);
1438
1439out:
1440 mutex_unlock(&osdc->request_mutex);
1441 return m;
1442
1443}
1444
1445static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1446 struct ceph_msg_header *hdr,
1447 int *skip)
1448{
1449 struct ceph_osd *osd = con->private;
1450 int type = le16_to_cpu(hdr->type);
1451 int front = le32_to_cpu(hdr->front_len);
1452
1453 switch (type) {
1454 case CEPH_MSG_OSD_MAP:
1455 return ceph_msg_new(type, front, 0, 0, NULL);
1456 case CEPH_MSG_OSD_OPREPLY:
1457 return get_reply(con, hdr, skip);
1458 default:
1459 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
1460 osd->o_osd);
1461 *skip = 1;
1462 return NULL;
1463 }
1464}
1465
1466/*
1467 * Wrappers to refcount containing ceph_osd struct
1468 */
1469static struct ceph_connection *get_osd_con(struct ceph_connection *con)
1470{
1471 struct ceph_osd *osd = con->private;
1472 if (get_osd(osd))
1473 return con;
1474 return NULL;
1475}
1476
1477static void put_osd_con(struct ceph_connection *con)
1478{
1479 struct ceph_osd *osd = con->private;
1480 put_osd(osd);
1481}
1482
1483/*
1484 * authentication
1485 */
1486static int get_authorizer(struct ceph_connection *con,
1487 void **buf, int *len, int *proto,
1488 void **reply_buf, int *reply_len, int force_new)
1489{
1490 struct ceph_osd *o = con->private;
1491 struct ceph_osd_client *osdc = o->o_osdc;
1492 struct ceph_auth_client *ac = osdc->client->monc.auth;
1493 int ret = 0;
1494
1495 if (force_new && o->o_authorizer) {
1496 ac->ops->destroy_authorizer(ac, o->o_authorizer);
1497 o->o_authorizer = NULL;
1498 }
1499 if (o->o_authorizer == NULL) {
1500 ret = ac->ops->create_authorizer(
1501 ac, CEPH_ENTITY_TYPE_OSD,
1502 &o->o_authorizer,
1503 &o->o_authorizer_buf,
1504 &o->o_authorizer_buf_len,
1505 &o->o_authorizer_reply_buf,
1506 &o->o_authorizer_reply_buf_len);
1507 if (ret)
1508 return ret;
1509 }
1510
1511 *proto = ac->protocol;
1512 *buf = o->o_authorizer_buf;
1513 *len = o->o_authorizer_buf_len;
1514 *reply_buf = o->o_authorizer_reply_buf;
1515 *reply_len = o->o_authorizer_reply_buf_len;
1516 return 0;
1517}
1518
1519
1520static int verify_authorizer_reply(struct ceph_connection *con, int len)
1521{
1522 struct ceph_osd *o = con->private;
1523 struct ceph_osd_client *osdc = o->o_osdc;
1524 struct ceph_auth_client *ac = osdc->client->monc.auth;
1525
1526 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
1527}
1528
1529static int invalidate_authorizer(struct ceph_connection *con)
1530{
1531 struct ceph_osd *o = con->private;
1532 struct ceph_osd_client *osdc = o->o_osdc;
1533 struct ceph_auth_client *ac = osdc->client->monc.auth;
1534
1535 if (ac->ops->invalidate_authorizer)
1536 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
1537
1538 return ceph_monc_validate_auth(&osdc->client->monc);
1539}
1540
1541static const struct ceph_connection_operations osd_con_ops = {
1542 .get = get_osd_con,
1543 .put = put_osd_con,
1544 .dispatch = dispatch,
1545 .get_authorizer = get_authorizer,
1546 .verify_authorizer_reply = verify_authorizer_reply,
1547 .invalidate_authorizer = invalidate_authorizer,
1548 .alloc_msg = alloc_msg,
1549 .fault = osd_reset,
1550};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
new file mode 100644
index 000000000000..b0759911e7c3
--- /dev/null
+++ b/fs/ceph/osd_client.h
@@ -0,0 +1,166 @@
1#ifndef _FS_CEPH_OSD_CLIENT_H
2#define _FS_CEPH_OSD_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/mempool.h>
7#include <linux/rbtree.h>
8
9#include "types.h"
10#include "osdmap.h"
11#include "messenger.h"
12
13struct ceph_msg;
14struct ceph_snap_context;
15struct ceph_osd_request;
16struct ceph_osd_client;
17struct ceph_authorizer;
18
19/*
20 * completion callback for async writepages
21 */
22typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
23 struct ceph_msg *);
24
25/* a given osd we're communicating with */
26struct ceph_osd {
27 atomic_t o_ref;
28 struct ceph_osd_client *o_osdc;
29 int o_osd;
30 int o_incarnation;
31 struct rb_node o_node;
32 struct ceph_connection o_con;
33 struct list_head o_requests;
34 struct list_head o_osd_lru;
35 struct ceph_authorizer *o_authorizer;
36 void *o_authorizer_buf, *o_authorizer_reply_buf;
37 size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
38 unsigned long lru_ttl;
39 int o_marked_for_keepalive;
40 struct list_head o_keepalive_item;
41};
42
43/* an in-flight request */
44struct ceph_osd_request {
45 u64 r_tid; /* unique for this client */
46 struct rb_node r_node;
47 struct list_head r_req_lru_item;
48 struct list_head r_osd_item;
49 struct ceph_osd *r_osd;
50 struct ceph_pg r_pgid;
51
52 struct ceph_connection *r_con_filling_msg;
53
54 struct ceph_msg *r_request, *r_reply;
55 int r_result;
56 int r_flags; /* any additional flags for the osd */
57 u32 r_sent; /* >0 if r_request is sending/sent */
58 int r_got_reply;
59
60 struct ceph_osd_client *r_osdc;
61 struct kref r_kref;
62 bool r_mempool;
63 struct completion r_completion, r_safe_completion;
64 ceph_osdc_callback_t r_callback, r_safe_callback;
65 struct ceph_eversion r_reassert_version;
66 struct list_head r_unsafe_item;
67
68 struct inode *r_inode; /* for use by callbacks */
69 struct writeback_control *r_wbc; /* ditto */
70
71 char r_oid[40]; /* object name */
72 int r_oid_len;
73 unsigned long r_stamp; /* send OR check time */
74 bool r_resend; /* msg send failed, needs retry */
75
76 struct ceph_file_layout r_file_layout;
77 struct ceph_snap_context *r_snapc; /* snap context for writes */
78 unsigned r_num_pages; /* size of page array (follows) */
79 struct page **r_pages; /* pages for data payload */
80 int r_pages_from_pool;
81 int r_own_pages; /* if true, i own page list */
82};
83
84struct ceph_osd_client {
85 struct ceph_client *client;
86
87 struct ceph_osdmap *osdmap; /* current map */
88 struct rw_semaphore map_sem;
89 struct completion map_waiters;
90 u64 last_requested_map;
91
92 struct mutex request_mutex;
93 struct rb_root osds; /* osds */
94 struct list_head osd_lru; /* idle osds */
95 u64 timeout_tid; /* tid of timeout triggering rq */
96 u64 last_tid; /* tid of last request */
97 struct rb_root requests; /* pending requests */
98 struct list_head req_lru; /* pending requests lru */
99 int num_requests;
100 struct delayed_work timeout_work;
101 struct delayed_work osds_timeout_work;
102#ifdef CONFIG_DEBUG_FS
103 struct dentry *debugfs_file;
104#endif
105
106 mempool_t *req_mempool;
107
108 struct ceph_msgpool msgpool_op;
109 struct ceph_msgpool msgpool_op_reply;
110};
111
112extern int ceph_osdc_init(struct ceph_osd_client *osdc,
113 struct ceph_client *client);
114extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
115
116extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
117 struct ceph_msg *msg);
118extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
119 struct ceph_msg *msg);
120
121extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
122 struct ceph_file_layout *layout,
123 struct ceph_vino vino,
124 u64 offset, u64 *len, int op, int flags,
125 struct ceph_snap_context *snapc,
126 int do_sync, u32 truncate_seq,
127 u64 truncate_size,
128 struct timespec *mtime,
129 bool use_mempool, int num_reply);
130
131static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
132{
133 kref_get(&req->r_kref);
134}
135extern void ceph_osdc_release_request(struct kref *kref);
136static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
137{
138 kref_put(&req->r_kref, ceph_osdc_release_request);
139}
140
141extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
142 struct ceph_osd_request *req,
143 bool nofail);
144extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
145 struct ceph_osd_request *req);
146extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
147
148extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
149 struct ceph_vino vino,
150 struct ceph_file_layout *layout,
151 u64 off, u64 *plen,
152 u32 truncate_seq, u64 truncate_size,
153 struct page **pages, int nr_pages);
154
155extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
156 struct ceph_vino vino,
157 struct ceph_file_layout *layout,
158 struct ceph_snap_context *sc,
159 u64 off, u64 len,
160 u32 truncate_seq, u64 truncate_size,
161 struct timespec *mtime,
162 struct page **pages, int nr_pages,
163 int flags, int do_sync, bool nofail);
164
165#endif
166
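For orientation, the synchronous request lifecycle implied by the declarations above (and used by ceph_osdc_readpages/writepages in osd_client.c) looks roughly like the sketch below. The wrapper name is hypothetical and error handling is abbreviated; the caller is assumed to have built req with ceph_osdc_new_request().

	static int example_sync_request(struct ceph_osd_client *osdc,
					struct ceph_osd_request *req)
	{
		int rc;

		rc = ceph_osdc_start_request(osdc, req, false);	/* submit */
		if (!rc)
			rc = ceph_osdc_wait_request(osdc, req);	/* block for reply */
		ceph_osdc_put_request(req);			/* drop our ref */
		return rc;
	}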
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
new file mode 100644
index 000000000000..21c6623c4b07
--- /dev/null
+++ b/fs/ceph/osdmap.c
@@ -0,0 +1,1024 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5#include <asm/div64.h>
6
7#include "super.h"
8#include "osdmap.h"
9#include "crush/hash.h"
10#include "crush/mapper.h"
11#include "decode.h"
12
13char *ceph_osdmap_state_str(char *str, int len, int state)
14{
15 int flag = 0;
16
17 if (!len)
18 goto done;
19
20 *str = '\0';
21 if (state) {
22 if (state & CEPH_OSD_EXISTS) {
23 snprintf(str, len, "exists");
24 flag = 1;
25 }
26 if (state & CEPH_OSD_UP) {
27 int n = strlen(str);
28 snprintf(str + n, len - n, "%sup", flag ? ", " : "");
29 flag = 1;
30 }
31 } else {
32 snprintf(str, len, "doesn't exist");
33 }
34done:
35 return str;
36}
37
38/* maps */
39
40static int calc_bits_of(unsigned t)
41{
42 int b = 0;
43 while (t) {
44 t = t >> 1;
45 b++;
46 }
47 return b;
48}
49
50/*
51 * the foo_mask is the smallest value 2^n-1 that is >= foo.
52 */
53static void calc_pg_masks(struct ceph_pg_pool_info *pi)
54{
55 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
56 pi->pgp_num_mask =
57 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
58 pi->lpg_num_mask =
59 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
60 pi->lpgp_num_mask =
61 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
62}
63
64/*
65 * decode crush map
66 */
67static int crush_decode_uniform_bucket(void **p, void *end,
68 struct crush_bucket_uniform *b)
69{
70 dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
71 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
72 b->item_weight = ceph_decode_32(p);
73 return 0;
74bad:
75 return -EINVAL;
76}
77
78static int crush_decode_list_bucket(void **p, void *end,
79 struct crush_bucket_list *b)
80{
81 int j;
82 dout("crush_decode_list_bucket %p to %p\n", *p, end);
83 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
84 if (b->item_weights == NULL)
85 return -ENOMEM;
86 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
87 if (b->sum_weights == NULL)
88 return -ENOMEM;
89 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
90 for (j = 0; j < b->h.size; j++) {
91 b->item_weights[j] = ceph_decode_32(p);
92 b->sum_weights[j] = ceph_decode_32(p);
93 }
94 return 0;
95bad:
96 return -EINVAL;
97}
98
99static int crush_decode_tree_bucket(void **p, void *end,
100 struct crush_bucket_tree *b)
101{
102 int j;
103 dout("crush_decode_tree_bucket %p to %p\n", *p, end);
104 ceph_decode_32_safe(p, end, b->num_nodes, bad);
105 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
106 if (b->node_weights == NULL)
107 return -ENOMEM;
108 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
109 for (j = 0; j < b->num_nodes; j++)
110 b->node_weights[j] = ceph_decode_32(p);
111 return 0;
112bad:
113 return -EINVAL;
114}
115
116static int crush_decode_straw_bucket(void **p, void *end,
117 struct crush_bucket_straw *b)
118{
119 int j;
120 dout("crush_decode_straw_bucket %p to %p\n", *p, end);
121 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
122 if (b->item_weights == NULL)
123 return -ENOMEM;
124 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
125 if (b->straws == NULL)
126 return -ENOMEM;
127 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
128 for (j = 0; j < b->h.size; j++) {
129 b->item_weights[j] = ceph_decode_32(p);
130 b->straws[j] = ceph_decode_32(p);
131 }
132 return 0;
133bad:
134 return -EINVAL;
135}
136
137static struct crush_map *crush_decode(void *pbyval, void *end)
138{
139 struct crush_map *c;
140 int err = -EINVAL;
141 int i, j;
142 void **p = &pbyval;
143 void *start = pbyval;
144 u32 magic;
145
146 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
147
148 c = kzalloc(sizeof(*c), GFP_NOFS);
149 if (c == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 ceph_decode_need(p, end, 4*sizeof(u32), bad);
153 magic = ceph_decode_32(p);
154 if (magic != CRUSH_MAGIC) {
155 pr_err("crush_decode magic %x != current %x\n",
156 (unsigned)magic, (unsigned)CRUSH_MAGIC);
157 goto bad;
158 }
159 c->max_buckets = ceph_decode_32(p);
160 c->max_rules = ceph_decode_32(p);
161 c->max_devices = ceph_decode_32(p);
162
163 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
164 if (c->device_parents == NULL)
165 goto badmem;
166 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
167 if (c->bucket_parents == NULL)
168 goto badmem;
169
170 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
171 if (c->buckets == NULL)
172 goto badmem;
173 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
174 if (c->rules == NULL)
175 goto badmem;
176
177 /* buckets */
178 for (i = 0; i < c->max_buckets; i++) {
179 int size = 0;
180 u32 alg;
181 struct crush_bucket *b;
182
183 ceph_decode_32_safe(p, end, alg, bad);
184 if (alg == 0) {
185 c->buckets[i] = NULL;
186 continue;
187 }
188 dout("crush_decode bucket %d off %x %p to %p\n",
189 i, (int)(*p-start), *p, end);
190
191 switch (alg) {
192 case CRUSH_BUCKET_UNIFORM:
193 size = sizeof(struct crush_bucket_uniform);
194 break;
195 case CRUSH_BUCKET_LIST:
196 size = sizeof(struct crush_bucket_list);
197 break;
198 case CRUSH_BUCKET_TREE:
199 size = sizeof(struct crush_bucket_tree);
200 break;
201 case CRUSH_BUCKET_STRAW:
202 size = sizeof(struct crush_bucket_straw);
203 break;
204 default:
205 err = -EINVAL;
206 goto bad;
207 }
208 BUG_ON(size == 0);
209 b = c->buckets[i] = kzalloc(size, GFP_NOFS);
210 if (b == NULL)
211 goto badmem;
212
213 ceph_decode_need(p, end, 4*sizeof(u32), bad);
214 b->id = ceph_decode_32(p);
215 b->type = ceph_decode_16(p);
216 b->alg = ceph_decode_8(p);
217 b->hash = ceph_decode_8(p);
218 b->weight = ceph_decode_32(p);
219 b->size = ceph_decode_32(p);
220
221 dout("crush_decode bucket size %d off %x %p to %p\n",
222 b->size, (int)(*p-start), *p, end);
223
224 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
225 if (b->items == NULL)
226 goto badmem;
227 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
228 if (b->perm == NULL)
229 goto badmem;
230 b->perm_n = 0;
231
232 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
233 for (j = 0; j < b->size; j++)
234 b->items[j] = ceph_decode_32(p);
235
236 switch (b->alg) {
237 case CRUSH_BUCKET_UNIFORM:
238 err = crush_decode_uniform_bucket(p, end,
239 (struct crush_bucket_uniform *)b);
240 if (err < 0)
241 goto bad;
242 break;
243 case CRUSH_BUCKET_LIST:
244 err = crush_decode_list_bucket(p, end,
245 (struct crush_bucket_list *)b);
246 if (err < 0)
247 goto bad;
248 break;
249 case CRUSH_BUCKET_TREE:
250 err = crush_decode_tree_bucket(p, end,
251 (struct crush_bucket_tree *)b);
252 if (err < 0)
253 goto bad;
254 break;
255 case CRUSH_BUCKET_STRAW:
256 err = crush_decode_straw_bucket(p, end,
257 (struct crush_bucket_straw *)b);
258 if (err < 0)
259 goto bad;
260 break;
261 }
262 }
263
264 /* rules */
265 dout("rule vec is %p\n", c->rules);
266 for (i = 0; i < c->max_rules; i++) {
267 u32 yes;
268 struct crush_rule *r;
269
270 ceph_decode_32_safe(p, end, yes, bad);
271 if (!yes) {
272 dout("crush_decode NO rule %d off %x %p to %p\n",
273 i, (int)(*p-start), *p, end);
274 c->rules[i] = NULL;
275 continue;
276 }
277
278 dout("crush_decode rule %d off %x %p to %p\n",
279 i, (int)(*p-start), *p, end);
280
281 /* len */
282 ceph_decode_32_safe(p, end, yes, bad);
283#if BITS_PER_LONG == 32
284 err = -EINVAL;
285 if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
286 goto bad;
287#endif
288 r = c->rules[i] = kmalloc(sizeof(*r) +
289 yes*sizeof(struct crush_rule_step),
290 GFP_NOFS);
291 if (r == NULL)
292 goto badmem;
293 dout(" rule %d is at %p\n", i, r);
294 r->len = yes;
295 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
296 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
297 for (j = 0; j < r->len; j++) {
298 r->steps[j].op = ceph_decode_32(p);
299 r->steps[j].arg1 = ceph_decode_32(p);
300 r->steps[j].arg2 = ceph_decode_32(p);
301 }
302 }
303
304 /* ignore trailing name maps. */
305
306 dout("crush_decode success\n");
307 return c;
308
309badmem:
310 err = -ENOMEM;
311bad:
312 dout("crush_decode fail %d\n", err);
313 crush_destroy(c);
314 return ERR_PTR(err);
315}
316
317
318/*
319 * osd map
320 */
321void ceph_osdmap_destroy(struct ceph_osdmap *map)
322{
323 dout("osdmap_destroy %p\n", map);
324 if (map->crush)
325 crush_destroy(map->crush);
326 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
327 struct ceph_pg_mapping *pg =
328 rb_entry(rb_first(&map->pg_temp),
329 struct ceph_pg_mapping, node);
330 rb_erase(&pg->node, &map->pg_temp);
331 kfree(pg);
332 }
333 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
334 struct ceph_pg_pool_info *pi =
335 rb_entry(rb_first(&map->pg_pools),
336 struct ceph_pg_pool_info, node);
337 rb_erase(&pi->node, &map->pg_pools);
338 kfree(pi);
339 }
340 kfree(map->osd_state);
341 kfree(map->osd_weight);
342 kfree(map->osd_addr);
343 kfree(map);
344}
345
346/*
347 * adjust max osd value. reallocate arrays.
348 */
349static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
350{
351 u8 *state;
352 struct ceph_entity_addr *addr;
353 u32 *weight;
354
355 state = kcalloc(max, sizeof(*state), GFP_NOFS);
356 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
357 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
358 if (state == NULL || addr == NULL || weight == NULL) {
359 kfree(state);
360 kfree(addr);
361 kfree(weight);
362 return -ENOMEM;
363 }
364
365 /* copy old? */
366 if (map->osd_state) {
367 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
368 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
369 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
370 kfree(map->osd_state);
371 kfree(map->osd_addr);
372 kfree(map->osd_weight);
373 }
374
375 map->osd_state = state;
376 map->osd_weight = weight;
377 map->osd_addr = addr;
378 map->max_osd = max;
379 return 0;
380}
381
382/*
383 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
384 * to a set of osds)
385 */
386static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
387{
388 u64 a = *(u64 *)&l;
389 u64 b = *(u64 *)&r;
390
391 if (a < b)
392 return -1;
393 if (a > b)
394 return 1;
395 return 0;
396}
397
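Note that pgid_cmp() reinterprets struct ceph_pg as a raw u64, which is only sound because the struct (defined in rados.h below) packs __le16 + __le16 + __le32 into exactly eight bytes. A compile-time assertion would document that assumption; a sketch, with hypothetical placement inside some function in this file:

	/* struct ceph_pg must fit in one u64 for pgid_cmp() to be valid */
	BUILD_BUG_ON(sizeof(struct ceph_pg) != sizeof(u64));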
398static int __insert_pg_mapping(struct ceph_pg_mapping *new,
399 struct rb_root *root)
400{
401 struct rb_node **p = &root->rb_node;
402 struct rb_node *parent = NULL;
403 struct ceph_pg_mapping *pg = NULL;
404 int c;
405
406 while (*p) {
407 parent = *p;
408 pg = rb_entry(parent, struct ceph_pg_mapping, node);
409 c = pgid_cmp(new->pgid, pg->pgid);
410 if (c < 0)
411 p = &(*p)->rb_left;
412 else if (c > 0)
413 p = &(*p)->rb_right;
414 else
415 return -EEXIST;
416 }
417
418 rb_link_node(&new->node, parent, p);
419 rb_insert_color(&new->node, root);
420 return 0;
421}
422
423static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
424 struct ceph_pg pgid)
425{
426 struct rb_node *n = root->rb_node;
427 struct ceph_pg_mapping *pg;
428 int c;
429
430 while (n) {
431 pg = rb_entry(n, struct ceph_pg_mapping, node);
432 c = pgid_cmp(pgid, pg->pgid);
433 if (c < 0)
434 n = n->rb_left;
435 else if (c > 0)
436 n = n->rb_right;
437 else
438 return pg;
439 }
440 return NULL;
441}
442
443/*
444 * rbtree of pg pool info
445 */
446static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
447{
448 struct rb_node **p = &root->rb_node;
449 struct rb_node *parent = NULL;
450 struct ceph_pg_pool_info *pi = NULL;
451
452 while (*p) {
453 parent = *p;
454 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
455 if (new->id < pi->id)
456 p = &(*p)->rb_left;
457 else if (new->id > pi->id)
458 p = &(*p)->rb_right;
459 else
460 return -EEXIST;
461 }
462
463 rb_link_node(&new->node, parent, p);
464 rb_insert_color(&new->node, root);
465 return 0;
466}
467
468static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
469{
470 struct ceph_pg_pool_info *pi;
471 struct rb_node *n = root->rb_node;
472
473 while (n) {
474 pi = rb_entry(n, struct ceph_pg_pool_info, node);
475 if (id < pi->id)
476 n = n->rb_left;
477 else if (id > pi->id)
478 n = n->rb_right;
479 else
480 return pi;
481 }
482 return NULL;
483}
484
485static void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
486{
487 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
488 calc_pg_masks(pi);
489 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
490 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
491}
492
493/*
494 * decode a full map.
495 */
496struct ceph_osdmap *osdmap_decode(void **p, void *end)
497{
498 struct ceph_osdmap *map;
499 u16 version;
500 u32 len, max, i;
501 u8 ev;
502 int err = -EINVAL;
503 void *start = *p;
504 struct ceph_pg_pool_info *pi;
505
506 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
507
508 map = kzalloc(sizeof(*map), GFP_NOFS);
509 if (map == NULL)
510 return ERR_PTR(-ENOMEM);
511 map->pg_temp = RB_ROOT;
512
513 ceph_decode_16_safe(p, end, version, bad);
514 if (version > CEPH_OSDMAP_VERSION) {
515 pr_warning("got unknown v %d > %d of osdmap\n", version,
516 CEPH_OSDMAP_VERSION);
517 goto bad;
518 }
519
520 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
521 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
522 map->epoch = ceph_decode_32(p);
523 ceph_decode_copy(p, &map->created, sizeof(map->created));
524 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
525
526 ceph_decode_32_safe(p, end, max, bad);
527 while (max--) {
528 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
529 pi = kmalloc(sizeof(*pi), GFP_NOFS);
530 if (!pi)
531 goto bad;
532 pi->id = ceph_decode_32(p);
533 ev = ceph_decode_8(p); /* encoding version */
534 if (ev > CEPH_PG_POOL_VERSION) {
535 pr_warning("got unknown v %d > %d of ceph_pg_pool\n", ev, CEPH_PG_POOL_VERSION);
536 kfree(pi); /* not yet in the pg_pools tree */
537 goto bad;
538 }
539 __decode_pool(p, pi);
540 __insert_pg_pool(&map->pg_pools, pi);
541 }
542 ceph_decode_32_safe(p, end, map->pool_max, bad);
543
544 ceph_decode_32_safe(p, end, map->flags, bad);
545
546 max = ceph_decode_32(p);
547
548 /* (re)alloc osd arrays */
549 err = osdmap_set_max_osd(map, max);
550 if (err < 0)
551 goto bad;
552 dout("osdmap_decode max_osd = %d\n", map->max_osd);
553
554 /* osds */
555 err = -EINVAL;
556 ceph_decode_need(p, end, 3*sizeof(u32) +
557 map->max_osd*(1 + sizeof(*map->osd_weight) +
558 sizeof(*map->osd_addr)), bad);
559 *p += 4; /* skip length field (should match max) */
560 ceph_decode_copy(p, map->osd_state, map->max_osd);
561
562 *p += 4; /* skip length field (should match max) */
563 for (i = 0; i < map->max_osd; i++)
564 map->osd_weight[i] = ceph_decode_32(p);
565
566 *p += 4; /* skip length field (should match max) */
567 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
568 for (i = 0; i < map->max_osd; i++)
569 ceph_decode_addr(&map->osd_addr[i]);
570
571 /* pg_temp */
572 ceph_decode_32_safe(p, end, len, bad);
573 for (i = 0; i < len; i++) {
574 int n, j;
575 struct ceph_pg pgid;
576 struct ceph_pg_mapping *pg;
577
578 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
579 ceph_decode_copy(p, &pgid, sizeof(pgid));
580 n = ceph_decode_32(p);
581 ceph_decode_need(p, end, n * sizeof(u32), bad);
582 err = -ENOMEM;
583 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
584 if (!pg)
585 goto bad;
586 pg->pgid = pgid;
587 pg->len = n;
588 for (j = 0; j < n; j++)
589 pg->osds[j] = ceph_decode_32(p);
590
591 err = __insert_pg_mapping(pg, &map->pg_temp);
592 if (err)
593 goto bad;
594 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
595 }
596
597 /* crush */
598 ceph_decode_32_safe(p, end, len, bad);
599 dout("osdmap_decode crush len %d from off 0x%x\n", len,
600 (int)(*p - start));
601 ceph_decode_need(p, end, len, bad);
602 map->crush = crush_decode(*p, end);
603 *p += len;
604 if (IS_ERR(map->crush)) {
605 err = PTR_ERR(map->crush);
606 map->crush = NULL;
607 goto bad;
608 }
609
610 /* ignore the rest of the map */
611 *p = end;
612
613 dout("osdmap_decode done %p %p\n", *p, end);
614 return map;
615
616bad:
617 dout("osdmap_decode fail\n");
618 ceph_osdmap_destroy(map);
619 return ERR_PTR(err);
620}
621
622/*
623 * decode and apply an incremental map update.
624 */
625struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
626 struct ceph_osdmap *map,
627 struct ceph_messenger *msgr)
628{
629 struct crush_map *newcrush = NULL;
630 struct ceph_fsid fsid;
631 u32 epoch = 0;
632 struct ceph_timespec modified;
633 u32 len, pool;
634 __s32 new_pool_max, new_flags, max;
635 void *start = *p;
636 int err = -EINVAL;
637 u16 version;
638 struct rb_node *rbp;
639
640 ceph_decode_16_safe(p, end, version, bad);
641 if (version > CEPH_OSDMAP_INC_VERSION) {
642 pr_warning("got unknown v %d > %d of inc osdmap\n", version,
643 CEPH_OSDMAP_INC_VERSION);
644 goto bad;
645 }
646
647 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
648 bad);
649 ceph_decode_copy(p, &fsid, sizeof(fsid));
650 epoch = ceph_decode_32(p);
651 BUG_ON(epoch != map->epoch+1);
652 ceph_decode_copy(p, &modified, sizeof(modified));
653 new_pool_max = ceph_decode_32(p);
654 new_flags = ceph_decode_32(p);
655
656 /* full map? */
657 ceph_decode_32_safe(p, end, len, bad);
658 if (len > 0) {
659 dout("apply_incremental full map len %d, %p to %p\n",
660 len, *p, end);
661 return osdmap_decode(p, min(*p+len, end));
662 }
663
664 /* new crush? */
665 ceph_decode_32_safe(p, end, len, bad);
666 if (len > 0) {
667 dout("apply_incremental new crush map len %d, %p to %p\n",
668 len, *p, end);
669 newcrush = crush_decode(*p, min(*p+len, end));
670 if (IS_ERR(newcrush))
671 return ERR_CAST(newcrush);
672 }
673
674 /* new flags? */
675 if (new_flags >= 0)
676 map->flags = new_flags;
677 if (new_pool_max >= 0)
678 map->pool_max = new_pool_max;
679
680 ceph_decode_need(p, end, 5*sizeof(u32), bad);
681
682 /* new max? */
683 max = ceph_decode_32(p);
684 if (max >= 0) {
685 err = osdmap_set_max_osd(map, max);
686 if (err < 0)
687 goto bad;
688 }
689
690 map->epoch++;
691 map->modified = modified;
692 if (newcrush) {
693 if (map->crush)
694 crush_destroy(map->crush);
695 map->crush = newcrush;
696 newcrush = NULL;
697 }
698
699 /* new_pool */
700 ceph_decode_32_safe(p, end, len, bad);
701 while (len--) {
702 __u8 ev;
703 struct ceph_pg_pool_info *pi;
704
705 ceph_decode_32_safe(p, end, pool, bad);
706 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
707 ev = ceph_decode_8(p); /* encoding version */
708 if (ev > CEPH_PG_POOL_VERSION) {
709 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
710 ev, CEPH_PG_POOL_VERSION);
711 goto bad;
712 }
713 pi = __lookup_pg_pool(&map->pg_pools, pool);
714 if (!pi) {
715 pi = kmalloc(sizeof(*pi), GFP_NOFS);
716 if (!pi) {
717 err = -ENOMEM;
718 goto bad;
719 }
720 pi->id = pool;
721 __insert_pg_pool(&map->pg_pools, pi);
722 }
723 __decode_pool(p, pi);
724 }
725
726 /* old_pool */
727 ceph_decode_32_safe(p, end, len, bad);
728 while (len--) {
729 struct ceph_pg_pool_info *pi;
730
731 ceph_decode_32_safe(p, end, pool, bad);
732 pi = __lookup_pg_pool(&map->pg_pools, pool);
733 if (pi) {
734 rb_erase(&pi->node, &map->pg_pools);
735 kfree(pi);
736 }
737 }
738
739 /* new_up */
740 err = -EINVAL;
741 ceph_decode_32_safe(p, end, len, bad);
742 while (len--) {
743 u32 osd;
744 struct ceph_entity_addr addr;
745 ceph_decode_32_safe(p, end, osd, bad);
746 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
747 ceph_decode_addr(&addr);
748 pr_info("osd%d up\n", osd);
749 BUG_ON(osd >= map->max_osd);
750 map->osd_state[osd] |= CEPH_OSD_UP;
751 map->osd_addr[osd] = addr;
752 }
753
754 /* new_down */
755 ceph_decode_32_safe(p, end, len, bad);
756 while (len--) {
757 u32 osd;
758 ceph_decode_32_safe(p, end, osd, bad);
759 (*p)++; /* clean flag */
760 pr_info("osd%d down\n", osd);
761 if (osd < map->max_osd)
762 map->osd_state[osd] &= ~CEPH_OSD_UP;
763 }
764
765 /* new_weight */
766 ceph_decode_32_safe(p, end, len, bad);
767 while (len--) {
768 u32 osd, off;
769 ceph_decode_need(p, end, sizeof(u32)*2, bad);
770 osd = ceph_decode_32(p);
771 off = ceph_decode_32(p);
772 pr_info("osd%d weight 0x%x %s\n", osd, off,
773 off == CEPH_OSD_IN ? "(in)" :
774 (off == CEPH_OSD_OUT ? "(out)" : ""));
775 if (osd < map->max_osd)
776 map->osd_weight[osd] = off;
777 }
778
779 /* new_pg_temp */
780 rbp = rb_first(&map->pg_temp);
781 ceph_decode_32_safe(p, end, len, bad);
782 while (len--) {
783 struct ceph_pg_mapping *pg;
784 int j;
785 struct ceph_pg pgid;
786 u32 pglen;
787 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
788 ceph_decode_copy(p, &pgid, sizeof(pgid));
789 pglen = ceph_decode_32(p);
790
791 /* remove stale existing entries up to this pgid (both sorted) */
792 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
793 node)->pgid, pgid) <= 0) {
794 struct rb_node *cur = rbp;
795 rbp = rb_next(rbp);
796 dout(" removed pg_temp %llx\n",
797 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
798 node)->pgid);
799 rb_erase(cur, &map->pg_temp);
800 }
801
802 if (pglen) {
803 /* insert */
804 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
805 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
806 if (!pg) {
807 err = -ENOMEM;
808 goto bad;
809 }
810 pg->pgid = pgid;
811 pg->len = pglen;
812 for (j = 0; j < pglen; j++)
813 pg->osds[j] = ceph_decode_32(p);
814 err = __insert_pg_mapping(pg, &map->pg_temp);
815 if (err)
816 goto bad;
817 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
818 pglen);
819 }
820 }
821 while (rbp) {
822 struct rb_node *cur = rbp;
823 rbp = rb_next(rbp);
824 dout(" removed pg_temp %llx\n",
825 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
826 node)->pgid);
827 rb_erase(cur, &map->pg_temp);
828 }
829
830 /* ignore the rest */
831 *p = end;
832 return map;
833
834bad:
835 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
836 epoch, (int)(*p - start), *p, start, end);
837 print_hex_dump(KERN_DEBUG, "osdmap: ",
838 DUMP_PREFIX_OFFSET, 16, 1,
839 start, end - start, true);
840 if (newcrush)
841 crush_destroy(newcrush);
842 return ERR_PTR(err);
843}
844
845
846
847
848/*
849 * calculate file layout from given offset, length.
850 * fill in correct oid, logical length, and object extent
851 * offset, length.
852 *
853 * for now, we write only a single su, until we can
854 * pass a stride back to the caller.
855 */
856void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
857 u64 off, u64 *plen,
858 u64 *ono,
859 u64 *oxoff, u64 *oxlen)
860{
861 u32 osize = le32_to_cpu(layout->fl_object_size);
862 u32 su = le32_to_cpu(layout->fl_stripe_unit);
863 u32 sc = le32_to_cpu(layout->fl_stripe_count);
864 u32 bl, stripeno, stripepos, objsetno;
865 u32 su_per_object;
866 u64 t, su_offset;
867
868 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
869 osize, su);
870 su_per_object = osize / su;
871 dout("osize %u / su %u = su_per_object %u\n", osize, su,
872 su_per_object);
873
874 BUG_ON((su & ~PAGE_MASK) != 0);
875 /* bl = *off / su; */
876 t = off;
877 do_div(t, su);
878 bl = t;
879 dout("off %llu / su %u = bl %u\n", off, su, bl);
880
881 stripeno = bl / sc;
882 stripepos = bl % sc;
883 objsetno = stripeno / su_per_object;
884
885 *ono = objsetno * sc + stripepos;
886 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
887
888 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
889 t = off;
890 su_offset = do_div(t, su);
891 *oxoff = su_offset + (stripeno % su_per_object) * su;
892
893 /*
894 * Calculate the length of the extent being written to the selected
895 * object. This is the minimum of the full length requested (plen) or
896 * the remainder of the current stripe being written to.
897 */
898 *oxlen = min_t(u64, *plen, su - su_offset);
899 *plen = *oxlen;
900
901 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
902}
903
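To make the striping arithmetic above concrete, here is a user-space re-derivation with hypothetical layout values (64 KB stripe unit, stripe count 2, 256 KB objects, so su_per_object = 4); the variable names mirror the function above.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long su = 65536, sc = 2, osize = 262144;
		unsigned long long su_per_object = osize / su;		/* 4 */
		unsigned long long off = 3 * su + 1000;			/* byte 197608 */
		unsigned long long bl = off / su;			/* stripe block 3 */
		unsigned long long stripeno = bl / sc;			/* 1 */
		unsigned long long stripepos = bl % sc;			/* 1 */
		unsigned long long objsetno = stripeno / su_per_object;	/* 0 */
		unsigned long long ono = objsetno * sc + stripepos;	/* object 1 */
		unsigned long long oxoff = off % su +
			(stripeno % su_per_object) * su;		/* 66536 */

		printf("ono=%llu oxoff=%llu\n", ono, oxoff);
		return 0;
	}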
904/*
905 * calculate an object layout (i.e. pgid) from an oid,
906 * file_layout, and osdmap
907 */
908int ceph_calc_object_layout(struct ceph_object_layout *ol,
909 const char *oid,
910 struct ceph_file_layout *fl,
911 struct ceph_osdmap *osdmap)
912{
913 unsigned num, num_mask;
914 struct ceph_pg pgid;
915 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
916 int poolid = le32_to_cpu(fl->fl_pg_pool);
917 struct ceph_pg_pool_info *pool;
918 unsigned ps;
919
920 BUG_ON(!osdmap);
921
922 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
923 if (!pool)
924 return -EIO;
925 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
926 if (preferred >= 0) {
927 ps += preferred;
928 num = le32_to_cpu(pool->v.lpg_num);
929 num_mask = pool->lpg_num_mask;
930 } else {
931 num = le32_to_cpu(pool->v.pg_num);
932 num_mask = pool->pg_num_mask;
933 }
934
935 pgid.ps = cpu_to_le16(ps);
936 pgid.preferred = cpu_to_le16(preferred);
937 pgid.pool = fl->fl_pg_pool;
938 if (preferred >= 0)
939 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
940 (int)preferred);
941 else
942 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
943
944 ol->ol_pgid = pgid;
945 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
946 return 0;
947}
948
949/*
950 * Calculate raw osd vector for the given pgid. Return pointer to osd
951 * array, or NULL on failure.
952 */
953static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
954 int *osds, int *num)
955{
956 struct ceph_pg_mapping *pg;
957 struct ceph_pg_pool_info *pool;
958 int ruleno;
959 unsigned poolid, ps, pps;
960 int preferred;
961
962 /* pg_temp? */
963 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
964 if (pg) {
965 *num = pg->len;
966 return pg->osds;
967 }
968
969 /* crush */
970 poolid = le32_to_cpu(pgid.pool);
971 ps = le16_to_cpu(pgid.ps);
972 preferred = (s16)le16_to_cpu(pgid.preferred);
973
974 /* don't forcefeed bad device ids to crush */
975 if (preferred >= osdmap->max_osd ||
976 preferred >= osdmap->crush->max_devices)
977 preferred = -1;
978
979 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
980 if (!pool)
981 return NULL;
982 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
983 pool->v.type, pool->v.size);
984 if (ruleno < 0) {
985 pr_err("no crush rule pool %d type %d size %d\n",
986 poolid, pool->v.type, pool->v.size);
987 return NULL;
988 }
989
990 if (preferred >= 0)
991 pps = ceph_stable_mod(ps,
992 le32_to_cpu(pool->v.lpgp_num),
993 pool->lpgp_num_mask);
994 else
995 pps = ceph_stable_mod(ps,
996 le32_to_cpu(pool->v.pgp_num),
997 pool->pgp_num_mask);
998 pps += poolid;
999 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1000 min_t(int, pool->v.size, *num),
1001 preferred, osdmap->osd_weight);
1002 return osds;
1003}
1004
1005/*
1006 * Return primary osd for given pgid, or -1 if none.
1007 */
1008int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1009{
1010 int rawosds[10], *osds;
1011 int i, num = ARRAY_SIZE(rawosds);
1012
1013 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1014 if (!osds)
1015 return -1;
1016
1017 /* primary is first up osd */
1018 for (i = 0; i < num; i++)
1019 if (ceph_osd_is_up(osdmap, osds[i]))
1020 return osds[i];
1021 
1022 /* no osd in the result set is up */
1023 return -1;
1024}
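Putting the last two helpers together, a caller that wants the primary osd for a named object would first compute the object's placement group, then ask for that pg's first up osd. A minimal sketch; the wrapper name is hypothetical:

	static int example_primary_for_object(struct ceph_osdmap *map,
					      const char *oid,
					      struct ceph_file_layout *fl)
	{
		struct ceph_object_layout ol;
		int err;

		err = ceph_calc_object_layout(&ol, oid, fl, map);
		if (err)
			return err;
		return ceph_calc_pg_primary(map, ol.ol_pgid);	/* -1 if none up */
	}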
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
new file mode 100644
index 000000000000..1fb55afb2642
--- /dev/null
+++ b/fs/ceph/osdmap.h
@@ -0,0 +1,125 @@
1#ifndef _FS_CEPH_OSDMAP_H
2#define _FS_CEPH_OSDMAP_H
3
4#include <linux/rbtree.h>
5#include "types.h"
6#include "ceph_fs.h"
7#include "crush/crush.h"
8
9/*
10 * The osd map describes the current membership of the osd cluster and
11 * specifies the mapping of objects to placement groups and placement
12 * groups to (sets of) osds. That is, it completely specifies the
13 * (desired) distribution of all data objects in the system at some
14 * point in time.
15 *
16 * Each map version is identified by an epoch, which increases monotonically.
17 *
18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map.
20 */
21struct ceph_pg_pool_info {
22 struct rb_node node;
23 int id;
24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26};
27
28struct ceph_pg_mapping {
29 struct rb_node node;
30 struct ceph_pg pgid;
31 int len;
32 int osds[];
33};
34
35struct ceph_osdmap {
36 struct ceph_fsid fsid;
37 u32 epoch;
38 u32 mkfs_epoch;
39 struct ceph_timespec created, modified;
40
41 u32 flags; /* CEPH_OSDMAP_* */
42
43 u32 max_osd; /* size of osd_state, _offload, _addr arrays */
44 u8 *osd_state; /* CEPH_OSD_* */
45 u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
46 struct ceph_entity_addr *osd_addr;
47
48 struct rb_root pg_temp;
49 struct rb_root pg_pools;
50 u32 pool_max;
51
52 /* the CRUSH map specifies the mapping of placement groups to
53 * the list of osds that store+replicate them. */
54 struct crush_map *crush;
55};
56
57/*
58 * file layout helpers
59 */
60#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
61#define ceph_file_layout_stripe_count(l) \
62 ((__s32)le32_to_cpu((l).fl_stripe_count))
63#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
64#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
65#define ceph_file_layout_object_su(l) \
66 ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
67#define ceph_file_layout_pg_preferred(l) \
68 ((__s32)le32_to_cpu((l).fl_pg_preferred))
69#define ceph_file_layout_pg_pool(l) \
70 ((__s32)le32_to_cpu((l).fl_pg_pool))
71
72static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
73{
74 return le32_to_cpu(l->fl_stripe_unit) *
75 le32_to_cpu(l->fl_stripe_count);
76}
77
78/* "period" == bytes before i start on a new set of objects */
79static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
80{
81 return le32_to_cpu(l->fl_object_size) *
82 le32_to_cpu(l->fl_stripe_count);
83}
84
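For the hypothetical layout used earlier (64 KB stripe unit, stripe count 2, 256 KB objects): stripe_width = 65536 * 2 = 128 KB of file data per stripe row, and period = 262144 * 2 = 512 KB of file data before the striping pattern starts over on a fresh set of 2 objects.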
85
86static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
87{
88 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
89}
90
91static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
92{
93 return map && (map->flags & flag);
94}
95
96extern char *ceph_osdmap_state_str(char *str, int len, int state);
97
98static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
99 int osd)
100{
101 if (osd >= map->max_osd)
102 return NULL;
103 return &map->osd_addr[osd];
104}
105
106extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
107extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
108 struct ceph_osdmap *map,
109 struct ceph_messenger *msgr);
110extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
111
112/* calculate mapping of a file extent to an object */
113extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
114 u64 off, u64 *plen,
115 u64 *ono, u64 *oxoff, u64 *oxlen);
116
117/* calculate mapping of object to a placement group */
118extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
119 const char *oid,
120 struct ceph_file_layout *fl,
121 struct ceph_osdmap *osdmap);
122extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
123 struct ceph_pg pgid);
124
125#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
new file mode 100644
index 000000000000..5f8dbf7c745a
--- /dev/null
+++ b/fs/ceph/pagelist.c
@@ -0,0 +1,55 @@
1
2#include <linux/gfp.h>
3#include <linux/pagemap.h>
4#include <linux/highmem.h>
5
6#include "pagelist.h"
7
8int ceph_pagelist_release(struct ceph_pagelist *pl)
9{
10 if (pl->mapped_tail)
11 kunmap(pl->mapped_tail);
12 while (!list_empty(&pl->head)) {
13 struct page *page = list_first_entry(&pl->head, struct page,
14 lru);
15 list_del(&page->lru);
16 __free_page(page);
17 }
18 return 0;
19}
20
21static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
22{
23 struct page *page = alloc_page(GFP_NOFS);
24 if (!page)
25 return -ENOMEM;
26 pl->room += PAGE_SIZE;
27 list_add_tail(&page->lru, &pl->head);
28 if (pl->mapped_tail)
29 kunmap(pl->mapped_tail);
30 pl->mapped_tail = kmap(page);
31 return 0;
32}
33
34int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
35{
36 while (pl->room < len) {
37 size_t bit = pl->room;
38 int ret;
39
40 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
41 buf, bit);
42 pl->length += bit;
43 pl->room -= bit;
44 buf += bit;
45 len -= bit;
46 ret = ceph_pagelist_addpage(pl);
47 if (ret)
48 return ret;
49 }
50
51 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
52 pl->length += len;
53 pl->room -= len;
54 return 0;
55}
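A usage sketch of the pagelist API, assuming a kernel-context caller; the function name is hypothetical, and the encode helpers come from pagelist.h below:

	#include "pagelist.h"

	static int example_build_pagelist(void)
	{
		struct ceph_pagelist pl;
		int err;

		ceph_pagelist_init(&pl);			/* empty, no pages */
		err = ceph_pagelist_encode_32(&pl, 42);		/* appends a __le32 */
		if (!err)
			err = ceph_pagelist_encode_string(&pl, "foo", 3);
		ceph_pagelist_release(&pl);			/* unmaps tail, frees pages */
		return err;
	}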
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
new file mode 100644
index 000000000000..e8a4187e1087
--- /dev/null
+++ b/fs/ceph/pagelist.h
@@ -0,0 +1,54 @@
1#ifndef __FS_CEPH_PAGELIST_H
2#define __FS_CEPH_PAGELIST_H
3
4#include <linux/list.h>
5
6struct ceph_pagelist {
7 struct list_head head;
8 void *mapped_tail;
9 size_t length;
10 size_t room;
11};
12
13static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
14{
15 INIT_LIST_HEAD(&pl->head);
16 pl->mapped_tail = NULL;
17 pl->length = 0;
18 pl->room = 0;
19}
20extern int ceph_pagelist_release(struct ceph_pagelist *pl);
21
22extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
23
24static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
25{
26 __le64 ev = cpu_to_le64(v);
27 return ceph_pagelist_append(pl, &ev, sizeof(ev));
28}
29static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
30{
31 __le32 ev = cpu_to_le32(v);
32 return ceph_pagelist_append(pl, &ev, sizeof(ev));
33}
34static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
35{
36 __le16 ev = cpu_to_le16(v);
37 return ceph_pagelist_append(pl, &ev, sizeof(ev));
38}
39static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
40{
41 return ceph_pagelist_append(pl, &v, 1);
42}
43static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
44 char *s, size_t len)
45{
46 int ret = ceph_pagelist_encode_32(pl, len);
47 if (ret)
48 return ret;
49 if (len)
50 return ceph_pagelist_append(pl, s, len);
51 return 0;
52}
53
54#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
new file mode 100644
index 000000000000..26ac8b89a676
--- /dev/null
+++ b/fs/ceph/rados.h
@@ -0,0 +1,374 @@
1#ifndef __RADOS_H
2#define __RADOS_H
3
4/*
5 * Data types for the Ceph distributed object storage layer RADOS
6 * (Reliable Autonomic Distributed Object Store).
7 */
8
9#include "msgr.h"
10
11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 4
15#define CEPH_OSDMAP_VERSION 4
16
17/*
18 * fs id
19 */
20struct ceph_fsid {
21 unsigned char fsid[16];
22};
23
24static inline int ceph_fsid_compare(const struct ceph_fsid *a,
25 const struct ceph_fsid *b)
26{
27 return memcmp(a, b, sizeof(*a));
28}
29
30/*
31 * ino, object, etc.
32 */
33typedef __le64 ceph_snapid_t;
34#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
35#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
36#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
37
38struct ceph_timespec {
39 __le32 tv_sec;
40 __le32 tv_nsec;
41} __attribute__ ((packed));
42
43
44/*
45 * object layout - how objects are mapped into PGs
46 */
47#define CEPH_OBJECT_LAYOUT_HASH 1
48#define CEPH_OBJECT_LAYOUT_LINEAR 2
49#define CEPH_OBJECT_LAYOUT_HASHINO 3
50
51/*
52 * pg layout -- how PGs are mapped onto (sets of) OSDs
53 */
54#define CEPH_PG_LAYOUT_CRUSH 0
55#define CEPH_PG_LAYOUT_HASH 1
56#define CEPH_PG_LAYOUT_LINEAR 2
57#define CEPH_PG_LAYOUT_HYBRID 3
58
59
60/*
61 * placement group.
62 * we encode this into one __le64.
63 */
64struct ceph_pg {
65 __le16 preferred; /* preferred primary osd */
66 __le16 ps; /* placement seed */
67 __le32 pool; /* object pool */
68} __attribute__ ((packed));
69
70/*
71 * pg_pool is a set of pgs storing a pool of objects
72 *
73 * pg_num -- base number of pseudorandomly placed pgs
74 *
75 * pgp_num -- effective number when calculating pg placement. this
76 * is used for pg_num increases. new pgs result in data being "split"
77 * into new pgs. for this to proceed smoothly, new pgs are initially
78 * colocated with their parents; that is, pgp_num doesn't increase
79 * until the new pgs have successfully split. only _then_ are the new
80 * pgs placed independently.
81 *
82 * lpg_num -- localized pg count (per device). replicas are randomly
83 * selected.
84 *
85 * lpgp_num -- as above.
86 */
87#define CEPH_PG_TYPE_REP 1
88#define CEPH_PG_TYPE_RAID4 2
89#define CEPH_PG_POOL_VERSION 2
90struct ceph_pg_pool {
91 __u8 type; /* CEPH_PG_TYPE_* */
92 __u8 size; /* number of osds in each pg */
93 __u8 crush_ruleset; /* crush placement rule */
94 __u8 object_hash; /* hash mapping object name to ps */
95 __le32 pg_num, pgp_num; /* number of pg's */
96 __le32 lpg_num, lpgp_num; /* number of localized pg's */
97 __le32 last_change; /* most recent epoch changed */
98 __le64 snap_seq; /* seq for per-pool snapshot */
99 __le32 snap_epoch; /* epoch of last snap */
100 __le32 num_snaps;
101 __le32 num_removed_snap_intervals;
102 __le64 uid;
103} __attribute__ ((packed));
104
105/*
106 * stable_mod func is used to control number of placement groups.
107 * similar to straight-up modulo, but produces a stable mapping as b
108 * increases over time. b is the number of bins, and bmask is the
109 * containing power of 2 minus 1.
110 *
111 * b <= bmask and bmask=(2**n)-1
112 * e.g., b=12 -> bmask=15, b=123 -> bmask=127
113 */
114static inline int ceph_stable_mod(int x, int b, int bmask)
115{
116 if ((x & bmask) < b)
117 return x & bmask;
118 else
119 return x & (bmask >> 1);
120}
121
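A quick user-space check of the mapping for b = 12 bins (bmask = 15): values whose low four bits fall below 12 map directly, and the rest fold into the lower half, so bins that already existed at a smaller b keep their contents.

	#include <assert.h>

	static int stable_mod(int x, int b, int bmask)
	{
		return ((x & bmask) < b) ? (x & bmask) : (x & (bmask >> 1));
	}

	int main(void)
	{
		assert(stable_mod(5, 12, 15) == 5);	/* 5 & 15 = 5 < 12 */
		assert(stable_mod(13, 12, 15) == 5);	/* 13 & 15 = 13 >= 12 -> 13 & 7 */
		assert(stable_mod(9, 12, 15) == 9);	/* lands in a newly added bin */
		return 0;
	}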
122/*
123 * object layout - how a given object should be stored.
124 */
125struct ceph_object_layout {
126 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
127 __le32 ol_stripe_unit; /* for per-object parity, if any */
128} __attribute__ ((packed));
129
130/*
131 * compound epoch+version, used by storage layer to serialize mutations
132 */
133struct ceph_eversion {
134 __le32 epoch;
135 __le64 version;
136} __attribute__ ((packed));
137
138/*
139 * osd map bits
140 */
141
142/* status bits */
143#define CEPH_OSD_EXISTS 1
144#define CEPH_OSD_UP 2
145
146/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
147#define CEPH_OSD_IN 0x10000
148#define CEPH_OSD_OUT 0
149
150
151/*
152 * osd map flag bits
153 */
154#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
155#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
156#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
157#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
158#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
159
160/*
161 * osd ops
162 */
163#define CEPH_OSD_OP_MODE 0xf000
164#define CEPH_OSD_OP_MODE_RD 0x1000
165#define CEPH_OSD_OP_MODE_WR 0x2000
166#define CEPH_OSD_OP_MODE_RMW 0x3000
167#define CEPH_OSD_OP_MODE_SUB 0x4000
168
169#define CEPH_OSD_OP_TYPE 0x0f00
170#define CEPH_OSD_OP_TYPE_LOCK 0x0100
171#define CEPH_OSD_OP_TYPE_DATA 0x0200
172#define CEPH_OSD_OP_TYPE_ATTR 0x0300
173#define CEPH_OSD_OP_TYPE_EXEC 0x0400
174#define CEPH_OSD_OP_TYPE_PG 0x0500
175
176enum {
177 /** data **/
178 /* read */
179 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
180 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
181
182 /* fancy read */
183 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
184
185 /* write */
186 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
187 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
188 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
189 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
190 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
191
192 /* fancy write */
193 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
194 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
195 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
196 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
197
198 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
199 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
200 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
201
202 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
203
204 /** attrs **/
205 /* read */
206 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
207 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
208
209 /* write */
210 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
211 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
212 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
213 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
214
215 /** subop **/
216 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
217 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
218 CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
219 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
220 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
221
222 /** lock **/
223 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
224 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
225 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
226 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
227 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
228 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
229
230 /** exec **/
231 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
232
233 /** pg **/
234 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
235};
236
237static inline int ceph_osd_op_type_lock(int op)
238{
239 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
240}
241static inline int ceph_osd_op_type_data(int op)
242{
243 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
244}
245static inline int ceph_osd_op_type_attr(int op)
246{
247 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
248}
249static inline int ceph_osd_op_type_exec(int op)
250{
251 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
252}
253static inline int ceph_osd_op_type_pg(int op)
254{
255 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
256}
257
258static inline int ceph_osd_op_mode_subop(int op)
259{
260 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
261}
262static inline int ceph_osd_op_mode_read(int op)
263{
264 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
265}
266static inline int ceph_osd_op_mode_modify(int op)
267{
268 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
269}
270
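A compile-and-run check of how the mask helpers above pick an opcode apart, with the relevant constant values copied from the definitions earlier in this file:

	#include <assert.h>

	int main(void)
	{
		enum {
			MODE = 0xf000, MODE_WR = 0x2000,
			TYPE = 0x0f00, TYPE_DATA = 0x0200,
		};
		int op = MODE_WR | TYPE_DATA | 1;	/* CEPH_OSD_OP_WRITE */

		assert((op & MODE) == MODE_WR);		/* ceph_osd_op_mode_modify() */
		assert((op & TYPE) == TYPE_DATA);	/* ceph_osd_op_type_data() */
		return 0;
	}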
271#define CEPH_OSD_TMAP_HDR 'h'
272#define CEPH_OSD_TMAP_SET 's'
273#define CEPH_OSD_TMAP_RM 'r'
274
275extern const char *ceph_osd_op_name(int op);
276
277
278/*
279 * osd op flags
280 *
281 * An op may be READ, WRITE, or READ|WRITE.
282 */
283enum {
284 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
285 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
286 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
287 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
288 CEPH_OSD_FLAG_READ = 16, /* op may read */
289 CEPH_OSD_FLAG_WRITE = 32, /* op may write */
290 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
291 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
292 CEPH_OSD_FLAG_BALANCE_READS = 256,
293 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
294 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
295 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
296};
297
298enum {
299 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
300};
301
302#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
303#define EBLACKLISTED ESHUTDOWN /* blacklisted */
304
305/*
306 * an individual object operation. each may be accompanied by some data
307 * payload
308 */
309struct ceph_osd_op {
310 __le16 op; /* CEPH_OSD_OP_* */
311 __le32 flags; /* CEPH_OSD_FLAG_* */
312 union {
313 struct {
314 __le64 offset, length;
315 __le64 truncate_size;
316 __le32 truncate_seq;
317 } __attribute__ ((packed)) extent;
318 struct {
319 __le32 name_len;
320 __le32 value_len;
321 } __attribute__ ((packed)) xattr;
322 struct {
323 __u8 class_len;
324 __u8 method_len;
325 __u8 argc;
326 __le32 indata_len;
327 } __attribute__ ((packed)) cls;
328 struct {
329 __le64 cookie, count;
330 } __attribute__ ((packed)) pgls;
331 };
332 __le32 payload_len;
333} __attribute__ ((packed));
334
335/*
336 * osd request message header. each request may include multiple
337 * ceph_osd_op object operations.
338 */
339struct ceph_osd_request_head {
340 __le32 client_inc; /* client incarnation */
341 struct ceph_object_layout layout; /* pgid */
342 __le32 osdmap_epoch; /* client's osdmap epoch */
343
344 __le32 flags;
345
346 struct ceph_timespec mtime; /* for mutations only */
347 struct ceph_eversion reassert_version; /* if we are replaying op */
348
349 __le32 object_len; /* length of object name */
350
351 __le64 snapid; /* snapid to read */
352 __le64 snap_seq; /* writer's snap context */
353 __le32 num_snaps;
354
355 __le16 num_ops;
356 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
357} __attribute__ ((packed));
358
359struct ceph_osd_reply_head {
360 __le32 client_inc; /* client incarnation */
361 __le32 flags;
362 struct ceph_object_layout layout;
363 __le32 osdmap_epoch;
364 struct ceph_eversion reassert_version; /* for replaying uncommitted */
365
366 __le32 result; /* result code */
367
368 __le32 object_len; /* length of object name */
369 __le32 num_ops;
370 struct ceph_osd_op ops[]; /* ops[], object */
371} __attribute__ ((packed));
372
373
374#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
new file mode 100644
index 000000000000..e6f9bc57d472
--- /dev/null
+++ b/fs/ceph/snap.c
@@ -0,0 +1,907 @@
1#include "ceph_debug.h"
2
3#include <linux/sort.h>
4#include <linux/slab.h>
5
6#include "super.h"
7#include "decode.h"
8
9/*
10 * Snapshots in ceph are driven in large part by cooperation from the
11 * client. In contrast to local file systems or file servers that
12 * implement snapshots at a single point in the system, ceph's
13 * distributed access to storage requires clients to help decide
14 * whether a write logically occurs before or after a recently created
15 * snapshot.
16 *
17 * This provides a perfect instantaneous client-wide snapshot. Between
18 * clients, however, snapshots may appear to be applied at slightly
19 * different points in time, depending on delays in delivering the
20 * snapshot notification.
21 *
22 * Snapshots are _not_ file system-wide. Instead, each snapshot
23 * applies to the subtree nested beneath some directory. This
24 * effectively divides the hierarchy into multiple "realms," where all
25 * of the files contained by each realm share the same set of
26 * snapshots. An individual realm's snap set contains snapshots
27 * explicitly created on that realm, as well as any snaps in its
28 * parent's snap set _after_ the point at which the parent became its
29 * parent (due to, say, a rename). Similarly, snaps taken while a prior
30 * parent was the parent are included.
31 *
32 * The client is spared most of this detail, fortunately... it need only
33 * maintain a hierarchy of realms reflecting the current parent/child
34 * realm relationship, and for each realm an explicit list of snaps
35 * inherited from prior parents.
36 *
37 * A snap_realm struct is maintained for realms containing every inode
38 * with an open cap in the system. (The needed snap realm information is
39 * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
40 * version number is used to ensure that as realm parameters change (new
41 * snapshot, new parent, etc.) the client's realm hierarchy is updated.
42 *
43 * The realm hierarchy drives the generation of a 'snap context' for each
44 * realm, which simply lists the resulting set of snaps for the realm. This
45 * is attached to any writes sent to OSDs.
46 */
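/*
 * An illustrative example (snap ids and paths are hypothetical):
 * suppose realm /a has snaps {9, 5}, and realm /a/b was attached to it
 * with parent_since = 8 and then had snap 12 taken directly on it.
 * The snap context for /a/b contains its own snap plus any parent
 * snaps at or after parent_since, newest first:
 *
 *	/a    -> seq  9, snaps {9, 5}
 *	/a/b  -> seq 12, snaps {12, 9}    (snap 5 predates parent_since)
 *
 * This is exactly what build_snap_context() below computes.
 */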
47/*
48 * Unfortunately error handling is a bit mixed here. If we get a snap
49 * update, but don't have enough memory to update our realm hierarchy,
50 * it's not clear what we can do about it (besides complaining to the
51 * console).
52 */
53
54
55/*
56 * increase ref count for the realm
57 *
58 * caller must hold snap_rwsem for write.
59 */
60void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
61 struct ceph_snap_realm *realm)
62{
63 dout("get_realm %p %d -> %d\n", realm,
64 atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
65 /*
66 * since we _only_ increment realm refs or empty the empty
67 * list with snap_rwsem held, adjusting the empty list here is
68 * safe. we do need to protect against concurrent empty list
69 * additions, however.
70 */
71 if (atomic_read(&realm->nref) == 0) {
72 spin_lock(&mdsc->snap_empty_lock);
73 list_del_init(&realm->empty_item);
74 spin_unlock(&mdsc->snap_empty_lock);
75 }
76
77 atomic_inc(&realm->nref);
78}
79
80static void __insert_snap_realm(struct rb_root *root,
81 struct ceph_snap_realm *new)
82{
83 struct rb_node **p = &root->rb_node;
84 struct rb_node *parent = NULL;
85 struct ceph_snap_realm *r = NULL;
86
87 while (*p) {
88 parent = *p;
89 r = rb_entry(parent, struct ceph_snap_realm, node);
90 if (new->ino < r->ino)
91 p = &(*p)->rb_left;
92 else if (new->ino > r->ino)
93 p = &(*p)->rb_right;
94 else
95 BUG();
96 }
97
98 rb_link_node(&new->node, parent, p);
99 rb_insert_color(&new->node, root);
100}
101
102/*
103 * create and get the realm rooted at @ino and bump its ref count.
104 *
105 * caller must hold snap_rwsem for write.
106 */
107static struct ceph_snap_realm *ceph_create_snap_realm(
108 struct ceph_mds_client *mdsc,
109 u64 ino)
110{
111 struct ceph_snap_realm *realm;
112
113 realm = kzalloc(sizeof(*realm), GFP_NOFS);
114 if (!realm)
115 return ERR_PTR(-ENOMEM);
116
117 atomic_set(&realm->nref, 0); /* tree does not take a ref */
118 realm->ino = ino;
119 INIT_LIST_HEAD(&realm->children);
120 INIT_LIST_HEAD(&realm->child_item);
121 INIT_LIST_HEAD(&realm->empty_item);
122 INIT_LIST_HEAD(&realm->inodes_with_caps);
123 spin_lock_init(&realm->inodes_with_caps_lock);
124 __insert_snap_realm(&mdsc->snap_realms, realm);
125 dout("create_snap_realm %llx %p\n", realm->ino, realm);
126 return realm;
127}
128
129/*
130 * lookup the realm rooted at @ino.
131 *
132 * caller must hold snap_rwsem for write.
133 */
134struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
135 u64 ino)
136{
137 struct rb_node *n = mdsc->snap_realms.rb_node;
138 struct ceph_snap_realm *r;
139
140 while (n) {
141 r = rb_entry(n, struct ceph_snap_realm, node);
142 if (ino < r->ino)
143 n = n->rb_left;
144 else if (ino > r->ino)
145 n = n->rb_right;
146 else {
147 dout("lookup_snap_realm %llx %p\n", r->ino, r);
148 return r;
149 }
150 }
151 return NULL;
152}
153
154static void __put_snap_realm(struct ceph_mds_client *mdsc,
155 struct ceph_snap_realm *realm);
156
157/*
158 * called with snap_rwsem (write)
159 */
160static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
161 struct ceph_snap_realm *realm)
162{
163 dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
164
165 rb_erase(&realm->node, &mdsc->snap_realms);
166
167 if (realm->parent) {
168 list_del_init(&realm->child_item);
169 __put_snap_realm(mdsc, realm->parent);
170 }
171
172 kfree(realm->prior_parent_snaps);
173 kfree(realm->snaps);
174 ceph_put_snap_context(realm->cached_context);
175 kfree(realm);
176}
177
178/*
179 * caller holds snap_rwsem (write)
180 */
181static void __put_snap_realm(struct ceph_mds_client *mdsc,
182 struct ceph_snap_realm *realm)
183{
184 dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
185 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
186 if (atomic_dec_and_test(&realm->nref))
187 __destroy_snap_realm(mdsc, realm);
188}
189
190/*
191 * caller needn't hold any locks
192 */
193void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
194 struct ceph_snap_realm *realm)
195{
196 dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
197 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
198 if (!atomic_dec_and_test(&realm->nref))
199 return;
200
201 if (down_write_trylock(&mdsc->snap_rwsem)) {
202 __destroy_snap_realm(mdsc, realm);
203 up_write(&mdsc->snap_rwsem);
204 } else {
205 spin_lock(&mdsc->snap_empty_lock);
206		list_add(&realm->empty_item, &mdsc->snap_empty);
207 spin_unlock(&mdsc->snap_empty_lock);
208 }
209}
210
211/*
212 * Clean up any realms whose ref counts have dropped to zero. Note
213 * that this does not include realms who were created but not yet
214 * used.
215 *
216 * Called under snap_rwsem (write)
217 */
218static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
219{
220 struct ceph_snap_realm *realm;
221
222 spin_lock(&mdsc->snap_empty_lock);
223 while (!list_empty(&mdsc->snap_empty)) {
224 realm = list_first_entry(&mdsc->snap_empty,
225 struct ceph_snap_realm, empty_item);
226 list_del(&realm->empty_item);
227 spin_unlock(&mdsc->snap_empty_lock);
228 __destroy_snap_realm(mdsc, realm);
229 spin_lock(&mdsc->snap_empty_lock);
230 }
231 spin_unlock(&mdsc->snap_empty_lock);
232}
233
234void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
235{
236 down_write(&mdsc->snap_rwsem);
237 __cleanup_empty_realms(mdsc);
238 up_write(&mdsc->snap_rwsem);
239}
240
241/*
242 * adjust the parent realm of a given @realm. adjust child list, and parent
243 * pointers, and ref counts appropriately.
244 *
245 * return 1 if parent was changed, 0 if unchanged, <0 on error.
246 *
247 * caller must hold snap_rwsem for write.
248 */
249static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
250 struct ceph_snap_realm *realm,
251 u64 parentino)
252{
253 struct ceph_snap_realm *parent;
254
255 if (realm->parent_ino == parentino)
256 return 0;
257
258 parent = ceph_lookup_snap_realm(mdsc, parentino);
259 if (!parent) {
260 parent = ceph_create_snap_realm(mdsc, parentino);
261 if (IS_ERR(parent))
262 return PTR_ERR(parent);
263 }
264 dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
265 realm->ino, realm, realm->parent_ino, realm->parent,
266 parentino, parent);
267 if (realm->parent) {
268 list_del_init(&realm->child_item);
269 ceph_put_snap_realm(mdsc, realm->parent);
270 }
271 realm->parent_ino = parentino;
272 realm->parent = parent;
273 ceph_get_snap_realm(mdsc, parent);
274 list_add(&realm->child_item, &parent->children);
275 return 1;
276}
277
278
279static int cmpu64_rev(const void *a, const void *b)
280{
281 if (*(u64 *)a < *(u64 *)b)
282 return 1;
283 if (*(u64 *)a > *(u64 *)b)
284 return -1;
285 return 0;
286}
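/*
 * Note that cmpu64_rev sorts _descending_; e.g., sort() with this
 * comparator turns {3, 12, 7} into {12, 7, 3}.  Snap vectors are kept
 * newest-first throughout.
 */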
287
288/*
289 * build the snap context for a given realm.
290 */
291static int build_snap_context(struct ceph_snap_realm *realm)
292{
293 struct ceph_snap_realm *parent = realm->parent;
294 struct ceph_snap_context *snapc;
295 int err = 0;
296 int i;
297 int num = realm->num_prior_parent_snaps + realm->num_snaps;
298
299 /*
300 * build parent context, if it hasn't been built.
301 * conservatively estimate that all parent snaps might be
302 * included by us.
303 */
304 if (parent) {
305 if (!parent->cached_context) {
306 err = build_snap_context(parent);
307 if (err)
308 goto fail;
309 }
310 num += parent->cached_context->num_snaps;
311 }
312
313	/* do i actually need to update?  not if my context seq
314	   matches the realm seq, and my parent's does too.  (this works
315	   because rebuild_snap_realms() works _downward_ in the
316	   hierarchy after each update.) */
317 if (realm->cached_context &&
318 realm->cached_context->seq == realm->seq &&
319 (!parent ||
320 realm->cached_context->seq >= parent->cached_context->seq)) {
321 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
322 " (unchanged)\n",
323 realm->ino, realm, realm->cached_context,
324 realm->cached_context->seq,
325 realm->cached_context->num_snaps);
326 return 0;
327 }
328
329 /* alloc new snap context */
330 err = -ENOMEM;
331 if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
332 goto fail;
333 snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
334 if (!snapc)
335 goto fail;
336 atomic_set(&snapc->nref, 1);
337
338 /* build (reverse sorted) snap vector */
339 num = 0;
340 snapc->seq = realm->seq;
341 if (parent) {
342		/* include any of parent's snaps occurring _after_ my
343 parent became my parent */
344 for (i = 0; i < parent->cached_context->num_snaps; i++)
345 if (parent->cached_context->snaps[i] >=
346 realm->parent_since)
347 snapc->snaps[num++] =
348 parent->cached_context->snaps[i];
349 if (parent->cached_context->seq > snapc->seq)
350 snapc->seq = parent->cached_context->seq;
351 }
352 memcpy(snapc->snaps + num, realm->snaps,
353 sizeof(u64)*realm->num_snaps);
354 num += realm->num_snaps;
355 memcpy(snapc->snaps + num, realm->prior_parent_snaps,
356 sizeof(u64)*realm->num_prior_parent_snaps);
357 num += realm->num_prior_parent_snaps;
358
359 sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
360 snapc->num_snaps = num;
361 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
362 realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
363
364 if (realm->cached_context)
365 ceph_put_snap_context(realm->cached_context);
366 realm->cached_context = snapc;
367 return 0;
368
369fail:
370 /*
371 * if we fail, clear old (incorrect) cached_context... hopefully
372 * we'll have better luck building it later
373 */
374 if (realm->cached_context) {
375 ceph_put_snap_context(realm->cached_context);
376 realm->cached_context = NULL;
377 }
378 pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
379 realm, err);
380 return err;
381}
382
383/*
384 * rebuild snap context for the given realm and all of its children.
385 */
386static void rebuild_snap_realms(struct ceph_snap_realm *realm)
387{
388 struct ceph_snap_realm *child;
389
390 dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
391 build_snap_context(realm);
392
393 list_for_each_entry(child, &realm->children, child_item)
394 rebuild_snap_realms(child);
395}
396
397
398/*
399 * helper to allocate and decode an array of snapids. free prior
400 * instance, if any.
401 */
402static int dup_array(u64 **dst, __le64 *src, int num)
403{
404 int i;
405
406 kfree(*dst);
407 if (num) {
408 *dst = kcalloc(num, sizeof(u64), GFP_NOFS);
409 if (!*dst)
410 return -ENOMEM;
411 for (i = 0; i < num; i++)
412 (*dst)[i] = get_unaligned_le64(src + i);
413 } else {
414 *dst = NULL;
415 }
416 return 0;
417}
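/*
 * Example use, as in ceph_update_snap_trace() below, copying a decoded
 * (possibly unaligned) little-endian snap vector into the realm:
 *
 *	err = dup_array(&realm->snaps, snaps, le32_to_cpu(ri->num_snaps));
 *	if (err < 0)
 *		goto fail;
 */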
418
419
420/*
421 * When a snapshot is applied, the size/mtime inode metadata is queued
422 * in a ceph_cap_snap (one for each snapshot) until writeback
423 * completes and the metadata can be flushed back to the MDS.
424 *
425 * However, if a (sync) write is currently in-progress when we apply
426 * the snapshot, we have to wait until the write succeeds or fails
427 * (and a final size/mtime is known). In this case the
428 * cap_snap->writing = 1, and is said to be "pending." When the write
429 * finishes, we __ceph_finish_cap_snap().
430 *
431 * Caller must hold snap_rwsem for read (i.e., the realm topology won't
432 * change).
433 */
434void ceph_queue_cap_snap(struct ceph_inode_info *ci,
435 struct ceph_snap_context *snapc)
436{
437 struct inode *inode = &ci->vfs_inode;
438 struct ceph_cap_snap *capsnap;
439 int used;
440
441 capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
442 if (!capsnap) {
443 pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
444 return;
445 }
446
447 spin_lock(&inode->i_lock);
448 used = __ceph_caps_used(ci);
449 if (__ceph_have_pending_cap_snap(ci)) {
450 /* there is no point in queuing multiple "pending" cap_snaps,
451 as no new writes are allowed to start when pending, so any
452 writes in progress now were started before the previous
453 cap_snap. lucky us. */
454 dout("queue_cap_snap %p snapc %p seq %llu used %d"
455 " already pending\n", inode, snapc, snapc->seq, used);
456 kfree(capsnap);
457 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
458 igrab(inode);
459
460 atomic_set(&capsnap->nref, 1);
461 capsnap->ci = ci;
462 INIT_LIST_HEAD(&capsnap->ci_item);
463 INIT_LIST_HEAD(&capsnap->flushing_item);
464
465 capsnap->follows = snapc->seq - 1;
466 capsnap->context = ceph_get_snap_context(snapc);
467 capsnap->issued = __ceph_caps_issued(ci, NULL);
468 capsnap->dirty = __ceph_caps_dirty(ci);
469
470 capsnap->mode = inode->i_mode;
471 capsnap->uid = inode->i_uid;
472 capsnap->gid = inode->i_gid;
473
474 /* fixme? */
475 capsnap->xattr_blob = NULL;
476 capsnap->xattr_len = 0;
477
478		/* dirty page count moved from _head to this cap_snap;
479		   all subsequent page dirties occur _after_ this
480		   snapshot. */
481 capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
482 ci->i_wrbuffer_ref_head = 0;
483 ceph_put_snap_context(ci->i_head_snapc);
484 ci->i_head_snapc = NULL;
485 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
486
487 if (used & CEPH_CAP_FILE_WR) {
488 dout("queue_cap_snap %p cap_snap %p snapc %p"
489 " seq %llu used WR, now pending\n", inode,
490 capsnap, snapc, snapc->seq);
491 capsnap->writing = 1;
492 } else {
493 /* note mtime, size NOW. */
494 __ceph_finish_cap_snap(ci, capsnap);
495 }
496 } else {
497 dout("queue_cap_snap %p nothing dirty|writing\n", inode);
498 kfree(capsnap);
499 }
500
501 spin_unlock(&inode->i_lock);
502}
503
504/*
505 * Finalize the size, mtime for a cap_snap... that is, settle on final values
506 * to be used for the snapshot, to be flushed back to the mds.
507 *
508 * If capsnap can now be flushed, add to snap_flush list, and return 1.
509 *
510 * Caller must hold i_lock.
511 */
512int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
513 struct ceph_cap_snap *capsnap)
514{
515 struct inode *inode = &ci->vfs_inode;
516 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
517
518 BUG_ON(capsnap->writing);
519 capsnap->size = inode->i_size;
520 capsnap->mtime = inode->i_mtime;
521 capsnap->atime = inode->i_atime;
522 capsnap->ctime = inode->i_ctime;
523 capsnap->time_warp_seq = ci->i_time_warp_seq;
524 if (capsnap->dirty_pages) {
525 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu "
526 "still has %d dirty pages\n", inode, capsnap,
527 capsnap->context, capsnap->context->seq,
528 capsnap->size, capsnap->dirty_pages);
529 return 0;
530 }
531 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n",
532 inode, capsnap, capsnap->context,
533 capsnap->context->seq, capsnap->size);
534
535 spin_lock(&mdsc->snap_flush_lock);
536 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
537 spin_unlock(&mdsc->snap_flush_lock);
538 return 1; /* caller may want to ceph_flush_snaps */
539}
540
541
542/*
543 * Parse and apply a snapblob "snap trace" from the MDS. This specifies
544 * the snap realm parameters from a given realm and all of its ancestors,
545 * up to the root.
546 *
547 * Caller must hold snap_rwsem for write.
548 */
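/*
 * Sketch of the trace layout this parses, as inferred from the decode
 * logic below: one record per realm, ordered from the given realm up
 * to the root, each encoded as
 *
 *	struct ceph_mds_snap_realm ri;
 *	__le64 snaps[ri.num_snaps];
 *	__le64 prior_parent_snaps[ri.num_prior_parent_snaps];
 *
 * We loop (via "more:") until p reaches e, the end of the payload.
 */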
549int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
550 void *p, void *e, bool deletion)
551{
552 struct ceph_mds_snap_realm *ri; /* encoded */
553 __le64 *snaps; /* encoded */
554 __le64 *prior_parent_snaps; /* encoded */
555 struct ceph_snap_realm *realm;
556 int invalidate = 0;
557 int err = -ENOMEM;
558
559 dout("update_snap_trace deletion=%d\n", deletion);
560more:
561 ceph_decode_need(&p, e, sizeof(*ri), bad);
562 ri = p;
563 p += sizeof(*ri);
564 ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
565 le32_to_cpu(ri->num_prior_parent_snaps)), bad);
566 snaps = p;
567 p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
568 prior_parent_snaps = p;
569 p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
570
571 realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
572 if (!realm) {
573 realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
574 if (IS_ERR(realm)) {
575 err = PTR_ERR(realm);
576 goto fail;
577 }
578 }
579
580 if (le64_to_cpu(ri->seq) > realm->seq) {
581 dout("update_snap_trace updating %llx %p %lld -> %lld\n",
582 realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
583 /*
584 * if the realm seq has changed, queue a cap_snap for every
585 * inode with open caps. we do this _before_ we update
586 * the realm info so that we prepare for writeback under the
587 * _previous_ snap context.
588 *
589 * ...unless it's a snap deletion!
590 */
591 if (!deletion) {
592 struct ceph_inode_info *ci;
593 struct inode *lastinode = NULL;
594
595 spin_lock(&realm->inodes_with_caps_lock);
596 list_for_each_entry(ci, &realm->inodes_with_caps,
597 i_snap_realm_item) {
598 struct inode *inode = igrab(&ci->vfs_inode);
599 if (!inode)
600 continue;
601 spin_unlock(&realm->inodes_with_caps_lock);
602 if (lastinode)
603 iput(lastinode);
604 lastinode = inode;
605 ceph_queue_cap_snap(ci, realm->cached_context);
606 spin_lock(&realm->inodes_with_caps_lock);
607 }
608 spin_unlock(&realm->inodes_with_caps_lock);
609 if (lastinode)
610 iput(lastinode);
611 dout("update_snap_trace cap_snaps queued\n");
612 }
613
614 } else {
615 dout("update_snap_trace %llx %p seq %lld unchanged\n",
616 realm->ino, realm, realm->seq);
617 }
618
619 /* ensure the parent is correct */
620 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
621 if (err < 0)
622 goto fail;
623 invalidate += err;
624
625 if (le64_to_cpu(ri->seq) > realm->seq) {
626 /* update realm parameters, snap lists */
627 realm->seq = le64_to_cpu(ri->seq);
628 realm->created = le64_to_cpu(ri->created);
629 realm->parent_since = le64_to_cpu(ri->parent_since);
630
631 realm->num_snaps = le32_to_cpu(ri->num_snaps);
632 err = dup_array(&realm->snaps, snaps, realm->num_snaps);
633 if (err < 0)
634 goto fail;
635
636 realm->num_prior_parent_snaps =
637 le32_to_cpu(ri->num_prior_parent_snaps);
638 err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
639 realm->num_prior_parent_snaps);
640 if (err < 0)
641 goto fail;
642
643 invalidate = 1;
644 } else if (!realm->cached_context) {
645 invalidate = 1;
646 }
647
648 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
649 realm, invalidate, p, e);
650
651 if (p < e)
652 goto more;
653
654 /* invalidate when we reach the _end_ (root) of the trace */
655 if (invalidate)
656 rebuild_snap_realms(realm);
657
658 __cleanup_empty_realms(mdsc);
659 return 0;
660
661bad:
662 err = -EINVAL;
663fail:
664 pr_err("update_snap_trace error %d\n", err);
665 return err;
666}
667
668
669/*
670 * Send any cap_snaps that are queued for flush. Try to carry
671 * s_mutex across multiple snap flushes to avoid locking overhead.
672 *
673 * Caller holds no locks.
674 */
675static void flush_snaps(struct ceph_mds_client *mdsc)
676{
677 struct ceph_inode_info *ci;
678 struct inode *inode;
679 struct ceph_mds_session *session = NULL;
680
681 dout("flush_snaps\n");
682 spin_lock(&mdsc->snap_flush_lock);
683 while (!list_empty(&mdsc->snap_flush_list)) {
684 ci = list_first_entry(&mdsc->snap_flush_list,
685 struct ceph_inode_info, i_snap_flush_item);
686 inode = &ci->vfs_inode;
687 igrab(inode);
688 spin_unlock(&mdsc->snap_flush_lock);
689 spin_lock(&inode->i_lock);
690 __ceph_flush_snaps(ci, &session);
691 spin_unlock(&inode->i_lock);
692 iput(inode);
693 spin_lock(&mdsc->snap_flush_lock);
694 }
695 spin_unlock(&mdsc->snap_flush_lock);
696
697 if (session) {
698 mutex_unlock(&session->s_mutex);
699 ceph_put_mds_session(session);
700 }
701 dout("flush_snaps done\n");
702}
703
704
705/*
706 * Handle a snap notification from the MDS.
707 *
708 * This can take two basic forms: the simplest is just a snap creation
709 * or deletion notification on an existing realm. This should update the
710 * realm and its children.
711 *
712 * The more difficult case is realm creation, due to snap creation at a
713 * new point in the file hierarchy, or due to a rename that moves a file or
714 * directory into another realm.
715 */
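/*
 * Sketch of the message layout this handler decodes, as inferred from
 * the code below:
 *
 *	struct ceph_mds_snap_head h;
 *	__le64 split_inos[h.num_split_inos];
 *	__le64 split_realms[h.num_split_realms];
 *	<snap trace>  (handled by ceph_update_snap_trace())
 *
 * The split arrays are only meaningful for CEPH_SNAP_OP_SPLIT.
 */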
716void ceph_handle_snap(struct ceph_mds_client *mdsc,
717 struct ceph_mds_session *session,
718 struct ceph_msg *msg)
719{
720 struct super_block *sb = mdsc->client->sb;
721 int mds = session->s_mds;
722 u64 split;
723 int op;
724 int trace_len;
725 struct ceph_snap_realm *realm = NULL;
726 void *p = msg->front.iov_base;
727 void *e = p + msg->front.iov_len;
728 struct ceph_mds_snap_head *h;
729 int num_split_inos, num_split_realms;
730 __le64 *split_inos = NULL, *split_realms = NULL;
731 int i;
732 int locked_rwsem = 0;
733
734 /* decode */
735 if (msg->front.iov_len < sizeof(*h))
736 goto bad;
737 h = p;
738 op = le32_to_cpu(h->op);
739 split = le64_to_cpu(h->split); /* non-zero if we are splitting an
740 * existing realm */
741 num_split_inos = le32_to_cpu(h->num_split_inos);
742 num_split_realms = le32_to_cpu(h->num_split_realms);
743 trace_len = le32_to_cpu(h->trace_len);
744 p += sizeof(*h);
745
746 dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
747 ceph_snap_op_name(op), split, trace_len);
748
749 mutex_lock(&session->s_mutex);
750 session->s_seq++;
751 mutex_unlock(&session->s_mutex);
752
753 down_write(&mdsc->snap_rwsem);
754 locked_rwsem = 1;
755
756 if (op == CEPH_SNAP_OP_SPLIT) {
757 struct ceph_mds_snap_realm *ri;
758
759 /*
760 * A "split" breaks part of an existing realm off into
761 * a new realm. The MDS provides a list of inodes
762 * (with caps) and child realms that belong to the new
763 * child.
764 */
765 split_inos = p;
766 p += sizeof(u64) * num_split_inos;
767 split_realms = p;
768 p += sizeof(u64) * num_split_realms;
769 ceph_decode_need(&p, e, sizeof(*ri), bad);
770 /* we will peek at realm info here, but will _not_
771 * advance p, as the realm update will occur below in
772 * ceph_update_snap_trace. */
773 ri = p;
774
775 realm = ceph_lookup_snap_realm(mdsc, split);
776 if (!realm) {
777 realm = ceph_create_snap_realm(mdsc, split);
778 if (IS_ERR(realm))
779 goto out;
780 }
781 ceph_get_snap_realm(mdsc, realm);
782
783 dout("splitting snap_realm %llx %p\n", realm->ino, realm);
784 for (i = 0; i < num_split_inos; i++) {
785 struct ceph_vino vino = {
786 .ino = le64_to_cpu(split_inos[i]),
787 .snap = CEPH_NOSNAP,
788 };
789 struct inode *inode = ceph_find_inode(sb, vino);
790 struct ceph_inode_info *ci;
791
792 if (!inode)
793 continue;
794 ci = ceph_inode(inode);
795
796 spin_lock(&inode->i_lock);
797 if (!ci->i_snap_realm)
798 goto skip_inode;
799 /*
800 * If this inode belongs to a realm that was
801 * created after our new realm, we experienced
802			 * a race (due to another split notification
803 * arriving from a different MDS). So skip
804 * this inode.
805 */
806 if (ci->i_snap_realm->created >
807 le64_to_cpu(ri->created)) {
808 dout(" leaving %p in newer realm %llx %p\n",
809 inode, ci->i_snap_realm->ino,
810 ci->i_snap_realm);
811 goto skip_inode;
812 }
813 dout(" will move %p to split realm %llx %p\n",
814 inode, realm->ino, realm);
815 /*
816 * Remove the inode from the realm's inode
817 * list, but don't add it to the new realm
818 * yet. We don't want the cap_snap to be
819 * queued (again) by ceph_update_snap_trace()
820 * below. Queue it _now_, under the old context.
821 */
822 spin_lock(&realm->inodes_with_caps_lock);
823 list_del_init(&ci->i_snap_realm_item);
824 spin_unlock(&realm->inodes_with_caps_lock);
825 spin_unlock(&inode->i_lock);
826
827 ceph_queue_cap_snap(ci,
828 ci->i_snap_realm->cached_context);
829
830 iput(inode);
831 continue;
832
833skip_inode:
834 spin_unlock(&inode->i_lock);
835 iput(inode);
836 }
837
838 /* we may have taken some of the old realm's children. */
839 for (i = 0; i < num_split_realms; i++) {
840 struct ceph_snap_realm *child =
841 ceph_lookup_snap_realm(mdsc,
842 le64_to_cpu(split_realms[i]));
843 if (!child)
844 continue;
845 adjust_snap_realm_parent(mdsc, child, realm->ino);
846 }
847 }
848
849 /*
850 * update using the provided snap trace. if we are deleting a
851 * snap, we can avoid queueing cap_snaps.
852 */
853 ceph_update_snap_trace(mdsc, p, e,
854 op == CEPH_SNAP_OP_DESTROY);
855
856 if (op == CEPH_SNAP_OP_SPLIT) {
857 /*
858 * ok, _now_ add the inodes into the new realm.
859 */
860 for (i = 0; i < num_split_inos; i++) {
861 struct ceph_vino vino = {
862 .ino = le64_to_cpu(split_inos[i]),
863 .snap = CEPH_NOSNAP,
864 };
865 struct inode *inode = ceph_find_inode(sb, vino);
866 struct ceph_inode_info *ci;
867
868 if (!inode)
869 continue;
870 ci = ceph_inode(inode);
871 spin_lock(&inode->i_lock);
872 if (!ci->i_snap_realm)
873 goto split_skip_inode;
874 ceph_put_snap_realm(mdsc, ci->i_snap_realm);
875 spin_lock(&realm->inodes_with_caps_lock);
876 list_add(&ci->i_snap_realm_item,
877 &realm->inodes_with_caps);
878 ci->i_snap_realm = realm;
879 spin_unlock(&realm->inodes_with_caps_lock);
880 ceph_get_snap_realm(mdsc, realm);
881split_skip_inode:
882 spin_unlock(&inode->i_lock);
883 iput(inode);
884 }
885
886 /* we took a reference when we created the realm, above */
887 ceph_put_snap_realm(mdsc, realm);
888 }
889
890 __cleanup_empty_realms(mdsc);
891
892 up_write(&mdsc->snap_rwsem);
893
894 flush_snaps(mdsc);
895 return;
896
897bad:
898 pr_err("corrupt snap message from mds%d\n", mds);
899 ceph_msg_dump(msg);
900out:
901 if (locked_rwsem)
902 up_write(&mdsc->snap_rwsem);
903 return;
904}
905
906
907
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
new file mode 100644
index 000000000000..75d02eaa1279
--- /dev/null
+++ b/fs/ceph/super.c
@@ -0,0 +1,1031 @@
1
2#include "ceph_debug.h"
3
4#include <linux/backing-dev.h>
5#include <linux/fs.h>
6#include <linux/inet.h>
7#include <linux/in6.h>
8#include <linux/module.h>
9#include <linux/mount.h>
10#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h>
13#include <linux/seq_file.h>
14#include <linux/slab.h>
15#include <linux/statfs.h>
16#include <linux/string.h>
17#include <linux/version.h>
18#include <linux/vmalloc.h>
19
20#include "decode.h"
21#include "super.h"
22#include "mon_client.h"
23#include "auth.h"
24
25/*
26 * Ceph superblock operations
27 *
28 * Handle the basics of mounting, unmounting.
29 */
30
31
32/*
33 * find filename portion of a path (/foo/bar/baz -> baz)
34 */
35const char *ceph_file_part(const char *s, int len)
36{
37 const char *e = s + len;
38
39 while (e != s && *(e-1) != '/')
40 e--;
41 return e;
42}
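/*
 * For example, ceph_file_part("/foo/bar/baz", 12) points at "baz"; a
 * string containing no '/' is returned unchanged.
 */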
43
44
45/*
46 * super ops
47 */
48static void ceph_put_super(struct super_block *s)
49{
50 struct ceph_client *cl = ceph_client(s);
51
52 dout("put_super\n");
53 ceph_mdsc_close_sessions(&cl->mdsc);
54 return;
55}
56
57static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
58{
59 struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
60 struct ceph_monmap *monmap = client->monc.monmap;
61 struct ceph_statfs st;
62 u64 fsid;
63 int err;
64
65 dout("statfs\n");
66 err = ceph_monc_do_statfs(&client->monc, &st);
67 if (err < 0)
68 return err;
69
70 /* fill in kstatfs */
71 buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
72
73 /*
74 * express utilization in terms of large blocks to avoid
75 * overflow on 32-bit machines.
76 */
77 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
78 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
79 buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
80 (CEPH_BLOCK_SHIFT-10);
81 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
82
83 buf->f_files = le64_to_cpu(st.num_objects);
84 buf->f_ffree = -1;
85 buf->f_namelen = PATH_MAX;
86 buf->f_frsize = PAGE_CACHE_SIZE;
87
88 /* leave fsid little-endian, regardless of host endianness */
89 fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
90 buf->f_fsid.val[0] = fsid & 0xffffffff;
91 buf->f_fsid.val[1] = fsid >> 32;
92
93 return 0;
94}
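/*
 * Worked example of the conversion above (numbers are illustrative):
 * with CEPH_BLOCK_SHIFT = 20, f_bsize is 1 MB and st.kb is shifted
 * right by CEPH_BLOCK_SHIFT - 10 = 10, so a cluster reporting
 * st.kb = 1048576 (1 GB) yields f_blocks = 1024 one-megabyte blocks.
 * Even petabyte-scale volumes then fit comfortably in 32 bits.
 */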
95
96
97static int ceph_syncfs(struct super_block *sb, int wait)
98{
99 dout("sync_fs %d\n", wait);
100 ceph_osdc_sync(&ceph_client(sb)->osdc);
101 ceph_mdsc_sync(&ceph_client(sb)->mdsc);
102 dout("sync_fs %d done\n", wait);
103 return 0;
104}
105
106
107/**
108 * ceph_show_options - Show mount options in /proc/mounts
109 * @m: seq_file to write to
110 * @mnt: mount descriptor
111 */
112static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
113{
114 struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
115 struct ceph_mount_args *args = client->mount_args;
116
117 if (args->flags & CEPH_OPT_FSID)
118		seq_printf(m, ",fsidmajor=%llu,fsidminor=%llu",
119 le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
120 le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
121 if (args->flags & CEPH_OPT_NOSHARE)
122 seq_puts(m, ",noshare");
123 if (args->flags & CEPH_OPT_DIRSTAT)
124 seq_puts(m, ",dirstat");
125 if ((args->flags & CEPH_OPT_RBYTES) == 0)
126 seq_puts(m, ",norbytes");
127 if (args->flags & CEPH_OPT_NOCRC)
128 seq_puts(m, ",nocrc");
129 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
130 seq_puts(m, ",noasyncreaddir");
131 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
132 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
133 if (args->name)
134 seq_printf(m, ",name=%s", args->name);
135 if (args->secret)
136 seq_puts(m, ",secret=<hidden>");
137 return 0;
138}
139
140/*
141 * caches
142 */
143struct kmem_cache *ceph_inode_cachep;
144struct kmem_cache *ceph_cap_cachep;
145struct kmem_cache *ceph_dentry_cachep;
146struct kmem_cache *ceph_file_cachep;
147
148static void ceph_inode_init_once(void *foo)
149{
150 struct ceph_inode_info *ci = foo;
151 inode_init_once(&ci->vfs_inode);
152}
153
154static int default_congestion_kb(void)
155{
156 int congestion_kb;
157
158 /*
159 * Copied from NFS
160 *
161 * congestion size, scale with available memory.
162 *
163 * 64MB: 8192k
164 * 128MB: 11585k
165 * 256MB: 16384k
166 * 512MB: 23170k
167 * 1GB: 32768k
168 * 2GB: 46340k
169 * 4GB: 65536k
170 * 8GB: 92681k
171 * 16GB: 131072k
172 *
173 * This allows larger machines to have larger/more transfers.
174 * Limit the default to 256M
175 */
176 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
177 if (congestion_kb > 256*1024)
178 congestion_kb = 256*1024;
179
180 return congestion_kb;
181}
182
183static int __init init_caches(void)
184{
185 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
186 sizeof(struct ceph_inode_info),
187 __alignof__(struct ceph_inode_info),
188 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
189 ceph_inode_init_once);
190 if (ceph_inode_cachep == NULL)
191 return -ENOMEM;
192
193 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
194 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
195 if (ceph_cap_cachep == NULL)
196 goto bad_cap;
197
198 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
199 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
200 if (ceph_dentry_cachep == NULL)
201 goto bad_dentry;
202
203 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
204 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
205 if (ceph_file_cachep == NULL)
206 goto bad_file;
207
208 return 0;
209
210bad_file:
211 kmem_cache_destroy(ceph_dentry_cachep);
212bad_dentry:
213 kmem_cache_destroy(ceph_cap_cachep);
214bad_cap:
215 kmem_cache_destroy(ceph_inode_cachep);
216 return -ENOMEM;
217}
218
219static void destroy_caches(void)
220{
221 kmem_cache_destroy(ceph_inode_cachep);
222 kmem_cache_destroy(ceph_cap_cachep);
223 kmem_cache_destroy(ceph_dentry_cachep);
224 kmem_cache_destroy(ceph_file_cachep);
225}
226
227
228/*
229 * ceph_umount_begin - initiate forced umount.  Tear down the
230 * mount, skipping steps that may hang while waiting for server(s).
231 */
232static void ceph_umount_begin(struct super_block *sb)
233{
234 struct ceph_client *client = ceph_sb_to_client(sb);
235
236 dout("ceph_umount_begin - starting forced umount\n");
237 if (!client)
238 return;
239 client->mount_state = CEPH_MOUNT_SHUTDOWN;
240 return;
241}
242
243static const struct super_operations ceph_super_ops = {
244 .alloc_inode = ceph_alloc_inode,
245 .destroy_inode = ceph_destroy_inode,
246 .write_inode = ceph_write_inode,
247 .sync_fs = ceph_syncfs,
248 .put_super = ceph_put_super,
249 .show_options = ceph_show_options,
250 .statfs = ceph_statfs,
251 .umount_begin = ceph_umount_begin,
252};
253
254
255const char *ceph_msg_type_name(int type)
256{
257 switch (type) {
258 case CEPH_MSG_SHUTDOWN: return "shutdown";
259 case CEPH_MSG_PING: return "ping";
260 case CEPH_MSG_AUTH: return "auth";
261 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
262 case CEPH_MSG_MON_MAP: return "mon_map";
263 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
264 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
265 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
266 case CEPH_MSG_STATFS: return "statfs";
267 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
268 case CEPH_MSG_MDS_MAP: return "mds_map";
269 case CEPH_MSG_CLIENT_SESSION: return "client_session";
270 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
271 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
272 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
273 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
274 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
275 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
276 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
277 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
278 case CEPH_MSG_OSD_MAP: return "osd_map";
279 case CEPH_MSG_OSD_OP: return "osd_op";
280 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
281 default: return "unknown";
282 }
283}
284
285
286/*
287 * mount options
288 */
289enum {
290 Opt_fsidmajor,
291 Opt_fsidminor,
292 Opt_monport,
293 Opt_wsize,
294 Opt_rsize,
295 Opt_osdtimeout,
296 Opt_osdkeepalivetimeout,
297 Opt_mount_timeout,
298 Opt_osd_idle_ttl,
299 Opt_caps_wanted_delay_min,
300 Opt_caps_wanted_delay_max,
301 Opt_readdir_max_entries,
302 Opt_congestion_kb,
303 Opt_last_int,
304 /* int args above */
305 Opt_snapdirname,
306 Opt_name,
307 Opt_secret,
308 Opt_last_string,
309 /* string args above */
310 Opt_ip,
311 Opt_noshare,
312 Opt_dirstat,
313 Opt_nodirstat,
314 Opt_rbytes,
315 Opt_norbytes,
316 Opt_nocrc,
317 Opt_noasyncreaddir,
318};
319
320static match_table_t arg_tokens = {
321 {Opt_fsidmajor, "fsidmajor=%ld"},
322 {Opt_fsidminor, "fsidminor=%ld"},
323 {Opt_monport, "monport=%d"},
324 {Opt_wsize, "wsize=%d"},
325 {Opt_rsize, "rsize=%d"},
326 {Opt_osdtimeout, "osdtimeout=%d"},
327 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
328 {Opt_mount_timeout, "mount_timeout=%d"},
329 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
330 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
331 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
332 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
333 {Opt_congestion_kb, "write_congestion_kb=%d"},
334 /* int args above */
335 {Opt_snapdirname, "snapdirname=%s"},
336 {Opt_name, "name=%s"},
337 {Opt_secret, "secret=%s"},
338 /* string args above */
339 {Opt_ip, "ip=%s"},
340 {Opt_noshare, "noshare"},
341 {Opt_dirstat, "dirstat"},
342 {Opt_nodirstat, "nodirstat"},
343 {Opt_rbytes, "rbytes"},
344 {Opt_norbytes, "norbytes"},
345 {Opt_nocrc, "nocrc"},
346 {Opt_noasyncreaddir, "noasyncreaddir"},
347 {-1, NULL}
348};
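/*
 * An illustrative mount invocation exercising these tokens (addresses
 * and values are made up):
 *
 *	mount -t ceph 1.2.3.4:6789:/ /mnt/ceph \
 *		-o rsize=524288,snapdirname=.snapshot,noasyncreaddir
 *
 * The device string carries the monitor address(es) and server path;
 * the -o string is what parse_mount_args() below receives as @options.
 */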
349
350
351static struct ceph_mount_args *parse_mount_args(int flags, char *options,
352 const char *dev_name,
353 const char **path)
354{
355 struct ceph_mount_args *args;
356 const char *c;
357 int err = -ENOMEM;
358 substring_t argstr[MAX_OPT_ARGS];
359
360 args = kzalloc(sizeof(*args), GFP_KERNEL);
361 if (!args)
362 return ERR_PTR(-ENOMEM);
363 args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
364 GFP_KERNEL);
365 if (!args->mon_addr)
366 goto out;
367
368 dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
369
370 /* start with defaults */
371 args->sb_flags = flags;
372 args->flags = CEPH_OPT_DEFAULT;
373 args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
374 args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
375 args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
376 args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
377 args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
378 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
379 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
380 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
381 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
382 args->max_readdir = 1024;
383 args->congestion_kb = default_congestion_kb();
384
385 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
386 err = -EINVAL;
387 if (!dev_name)
388 goto out;
389 *path = strstr(dev_name, ":/");
390 if (*path == NULL) {
391 pr_err("device name is missing path (no :/ in %s)\n",
392 dev_name);
393 goto out;
394 }
395
396 /* get mon ip(s) */
397 err = ceph_parse_ips(dev_name, *path, args->mon_addr,
398 CEPH_MAX_MON, &args->num_mon);
399 if (err < 0)
400 goto out;
401
402 /* path on server */
403 *path += 2;
404 dout("server path '%s'\n", *path);
405
406 /* parse mount options */
407 while ((c = strsep(&options, ",")) != NULL) {
408 int token, intval, ret;
409 if (!*c)
410 continue;
411 err = -EINVAL;
412 token = match_token((char *)c, arg_tokens, argstr);
413 if (token < 0) {
414 pr_err("bad mount option at '%s'\n", c);
415 goto out;
416 }
417 if (token < Opt_last_int) {
418 ret = match_int(&argstr[0], &intval);
419 if (ret < 0) {
420 pr_err("bad mount option arg (not int) "
421 "at '%s'\n", c);
422 continue;
423 }
424 dout("got int token %d val %d\n", token, intval);
425 } else if (token > Opt_last_int && token < Opt_last_string) {
426 dout("got string token %d val %s\n", token,
427 argstr[0].from);
428 } else {
429 dout("got token %d\n", token);
430 }
431 switch (token) {
432 case Opt_fsidmajor:
433 *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
434 break;
435 case Opt_fsidminor:
436 *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
437 break;
438 case Opt_ip:
439 err = ceph_parse_ips(argstr[0].from,
440 argstr[0].to,
441 &args->my_addr,
442 1, NULL);
443 if (err < 0)
444 goto out;
445 args->flags |= CEPH_OPT_MYIP;
446 break;
447
448 case Opt_snapdirname:
449 kfree(args->snapdir_name);
450 args->snapdir_name = kstrndup(argstr[0].from,
451 argstr[0].to-argstr[0].from,
452 GFP_KERNEL);
453 break;
454 case Opt_name:
455 args->name = kstrndup(argstr[0].from,
456 argstr[0].to-argstr[0].from,
457 GFP_KERNEL);
458 break;
459 case Opt_secret:
460 args->secret = kstrndup(argstr[0].from,
461 argstr[0].to-argstr[0].from,
462 GFP_KERNEL);
463 break;
464
465 /* misc */
466 case Opt_wsize:
467 args->wsize = intval;
468 break;
469 case Opt_rsize:
470 args->rsize = intval;
471 break;
472 case Opt_osdtimeout:
473 args->osd_timeout = intval;
474 break;
475 case Opt_osdkeepalivetimeout:
476 args->osd_keepalive_timeout = intval;
477 break;
478 case Opt_mount_timeout:
479 args->mount_timeout = intval;
480 break;
481 case Opt_caps_wanted_delay_min:
482 args->caps_wanted_delay_min = intval;
483 break;
484 case Opt_caps_wanted_delay_max:
485 args->caps_wanted_delay_max = intval;
486 break;
487 case Opt_readdir_max_entries:
488 args->max_readdir = intval;
489 break;
490 case Opt_congestion_kb:
491 args->congestion_kb = intval;
492 break;
493
494 case Opt_noshare:
495 args->flags |= CEPH_OPT_NOSHARE;
496 break;
497
498 case Opt_dirstat:
499 args->flags |= CEPH_OPT_DIRSTAT;
500 break;
501 case Opt_nodirstat:
502 args->flags &= ~CEPH_OPT_DIRSTAT;
503 break;
504 case Opt_rbytes:
505 args->flags |= CEPH_OPT_RBYTES;
506 break;
507 case Opt_norbytes:
508 args->flags &= ~CEPH_OPT_RBYTES;
509 break;
510 case Opt_nocrc:
511 args->flags |= CEPH_OPT_NOCRC;
512 break;
513 case Opt_noasyncreaddir:
514 args->flags |= CEPH_OPT_NOASYNCREADDIR;
515 break;
516
517 default:
518 BUG_ON(token);
519 }
520 }
521 return args;
522
523out:
524 kfree(args->mon_addr);
525 kfree(args);
526 return ERR_PTR(err);
527}
528
529static void destroy_mount_args(struct ceph_mount_args *args)
530{
531 dout("destroy_mount_args %p\n", args);
532 kfree(args->snapdir_name);
533 args->snapdir_name = NULL;
534 kfree(args->name);
535 args->name = NULL;
536 kfree(args->secret);
537 args->secret = NULL;
538 kfree(args);
539}
540
541/*
542 * create a fresh client instance
543 */
544static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
545{
546 struct ceph_client *client;
547 int err = -ENOMEM;
548
549 client = kzalloc(sizeof(*client), GFP_KERNEL);
550 if (client == NULL)
551 return ERR_PTR(-ENOMEM);
552
553 mutex_init(&client->mount_mutex);
554
555 init_waitqueue_head(&client->auth_wq);
556
557 client->sb = NULL;
558 client->mount_state = CEPH_MOUNT_MOUNTING;
559 client->mount_args = args;
560
561 client->msgr = NULL;
562
563 client->auth_err = 0;
564 atomic_long_set(&client->writeback_count, 0);
565
566 err = bdi_init(&client->backing_dev_info);
567 if (err < 0)
568 goto fail;
569
570 err = -ENOMEM;
571 client->wb_wq = create_workqueue("ceph-writeback");
572 if (client->wb_wq == NULL)
573 goto fail_bdi;
574 client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
575 if (client->pg_inv_wq == NULL)
576 goto fail_wb_wq;
577 client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
578 if (client->trunc_wq == NULL)
579 goto fail_pg_inv_wq;
580
581 /* set up mempools */
582 err = -ENOMEM;
583 client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
584 client->mount_args->wsize >> PAGE_CACHE_SHIFT);
585 if (!client->wb_pagevec_pool)
586 goto fail_trunc_wq;
587
588 /* caps */
589 client->min_caps = args->max_readdir;
590 ceph_adjust_min_caps(client->min_caps);
591
592 /* subsystems */
593 err = ceph_monc_init(&client->monc, client);
594 if (err < 0)
595 goto fail_mempool;
596 err = ceph_osdc_init(&client->osdc, client);
597 if (err < 0)
598 goto fail_monc;
599 err = ceph_mdsc_init(&client->mdsc, client);
600 if (err < 0)
601 goto fail_osdc;
602 return client;
603
604fail_osdc:
605 ceph_osdc_stop(&client->osdc);
606fail_monc:
607 ceph_monc_stop(&client->monc);
608fail_mempool:
609 mempool_destroy(client->wb_pagevec_pool);
610fail_trunc_wq:
611 destroy_workqueue(client->trunc_wq);
612fail_pg_inv_wq:
613 destroy_workqueue(client->pg_inv_wq);
614fail_wb_wq:
615 destroy_workqueue(client->wb_wq);
616fail_bdi:
617 bdi_destroy(&client->backing_dev_info);
618fail:
619 kfree(client);
620 return ERR_PTR(err);
621}
622
623static void ceph_destroy_client(struct ceph_client *client)
624{
625 dout("destroy_client %p\n", client);
626
627 /* unmount */
628 ceph_mdsc_stop(&client->mdsc);
629 ceph_monc_stop(&client->monc);
630 ceph_osdc_stop(&client->osdc);
631
632 ceph_adjust_min_caps(-client->min_caps);
633
634 ceph_debugfs_client_cleanup(client);
635 destroy_workqueue(client->wb_wq);
636 destroy_workqueue(client->pg_inv_wq);
637 destroy_workqueue(client->trunc_wq);
638
639 if (client->msgr)
640 ceph_messenger_destroy(client->msgr);
641 mempool_destroy(client->wb_pagevec_pool);
642
643 destroy_mount_args(client->mount_args);
644
645 kfree(client);
646 dout("destroy_client %p done\n", client);
647}
648
649/*
650 * Initially learn our fsid, or verify an fsid matches.
651 */
652int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
653{
654 if (client->have_fsid) {
655 if (ceph_fsid_compare(&client->fsid, fsid)) {
656			pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT "\n",
657 PR_FSID(&client->fsid), PR_FSID(fsid));
658 return -1;
659 }
660 } else {
661 pr_info("client%lld fsid " FSID_FORMAT "\n",
662 client->monc.auth->global_id, PR_FSID(fsid));
663 memcpy(&client->fsid, fsid, sizeof(*fsid));
664 ceph_debugfs_client_init(client);
665 client->have_fsid = true;
666 }
667 return 0;
668}
669
670/*
671 * true if we have the mon map (and have thus joined the cluster)
672 */
673static int have_mon_map(struct ceph_client *client)
674{
675 return client->monc.monmap && client->monc.monmap->epoch;
676}
677
678/*
679 * Bootstrap mount by opening the root directory. Note the mount
680 * @started time from caller, and time out if this takes too long.
681 */
682static struct dentry *open_root_dentry(struct ceph_client *client,
683 const char *path,
684 unsigned long started)
685{
686 struct ceph_mds_client *mdsc = &client->mdsc;
687 struct ceph_mds_request *req = NULL;
688 int err;
689 struct dentry *root;
690
691 /* open dir */
692 dout("open_root_inode opening '%s'\n", path);
693 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
694 if (IS_ERR(req))
695		return ERR_CAST(req);
696 req->r_path1 = kstrdup(path, GFP_NOFS);
697 req->r_ino1.ino = CEPH_INO_ROOT;
698 req->r_ino1.snap = CEPH_NOSNAP;
699 req->r_started = started;
700 req->r_timeout = client->mount_args->mount_timeout * HZ;
701 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
702 req->r_num_caps = 2;
703 err = ceph_mdsc_do_request(mdsc, NULL, req);
704 if (err == 0) {
705 dout("open_root_inode success\n");
706 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
707 client->sb->s_root == NULL)
708 root = d_alloc_root(req->r_target_inode);
709 else
710 root = d_obtain_alias(req->r_target_inode);
711 req->r_target_inode = NULL;
712 dout("open_root_inode success, root dentry is %p\n", root);
713 } else {
714 root = ERR_PTR(err);
715 }
716 ceph_mdsc_put_request(req);
717 return root;
718}
719
720/*
721 * mount: join the ceph cluster, and open root directory.
722 */
723static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
724 const char *path)
725{
726 struct ceph_entity_addr *myaddr = NULL;
727 int err;
728 unsigned long timeout = client->mount_args->mount_timeout * HZ;
729 unsigned long started = jiffies; /* note the start time */
730 struct dentry *root;
731
732 dout("mount start\n");
733 mutex_lock(&client->mount_mutex);
734
735 /* initialize the messenger */
736 if (client->msgr == NULL) {
737 if (ceph_test_opt(client, MYIP))
738 myaddr = &client->mount_args->my_addr;
739 client->msgr = ceph_messenger_create(myaddr);
740 if (IS_ERR(client->msgr)) {
741 err = PTR_ERR(client->msgr);
742 client->msgr = NULL;
743 goto out;
744 }
745 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
746 }
747
748 /* open session, and wait for mon, mds, and osd maps */
749 err = ceph_monc_open_session(&client->monc);
750 if (err < 0)
751 goto out;
752
753 while (!have_mon_map(client)) {
754 err = -EIO;
755 if (timeout && time_after_eq(jiffies, started + timeout))
756 goto out;
757
758 /* wait */
759 dout("mount waiting for mon_map\n");
760 err = wait_event_interruptible_timeout(client->auth_wq,
761 have_mon_map(client) || (client->auth_err < 0),
762 timeout);
763 if (err == -EINTR || err == -ERESTARTSYS)
764 goto out;
765 if (client->auth_err < 0) {
766 err = client->auth_err;
767 goto out;
768 }
769 }
770
771 dout("mount opening root\n");
772 root = open_root_dentry(client, "", started);
773 if (IS_ERR(root)) {
774 err = PTR_ERR(root);
775 goto out;
776 }
777 if (client->sb->s_root)
778 dput(root);
779 else
780 client->sb->s_root = root;
781
782 if (path[0] == 0) {
783 dget(root);
784 } else {
785 dout("mount opening base mountpoint\n");
786 root = open_root_dentry(client, path, started);
787 if (IS_ERR(root)) {
788 err = PTR_ERR(root);
789 dput(client->sb->s_root);
790 client->sb->s_root = NULL;
791 goto out;
792 }
793 }
794
795 mnt->mnt_root = root;
796 mnt->mnt_sb = client->sb;
797
798 client->mount_state = CEPH_MOUNT_MOUNTED;
799 dout("mount success\n");
800 err = 0;
801
802out:
803 mutex_unlock(&client->mount_mutex);
804 return err;
805}
806
807static int ceph_set_super(struct super_block *s, void *data)
808{
809 struct ceph_client *client = data;
810 int ret;
811
812 dout("set_super %p data %p\n", s, data);
813
814 s->s_flags = client->mount_args->sb_flags;
815 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
816
817 s->s_fs_info = client;
818 client->sb = s;
819
820 s->s_op = &ceph_super_ops;
821 s->s_export_op = &ceph_export_ops;
822
823 s->s_time_gran = 1000; /* 1000 ns == 1 us */
824
825	ret = set_anon_super(s, NULL);  /* data arg is unused by set_anon_super */
826 if (ret != 0)
827 goto fail;
828
829 return ret;
830
831fail:
832 s->s_fs_info = NULL;
833 client->sb = NULL;
834 return ret;
835}
836
837/*
838 * share superblock if same fs AND options
839 */
840static int ceph_compare_super(struct super_block *sb, void *data)
841{
842 struct ceph_client *new = data;
843 struct ceph_mount_args *args = new->mount_args;
844 struct ceph_client *other = ceph_sb_to_client(sb);
845 int i;
846
847 dout("ceph_compare_super %p\n", sb);
848 if (args->flags & CEPH_OPT_FSID) {
849 if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
850 dout("fsid doesn't match\n");
851 return 0;
852 }
853 } else {
854 /* do we share (a) monitor? */
855 for (i = 0; i < new->monc.monmap->num_mon; i++)
856 if (ceph_monmap_contains(other->monc.monmap,
857 &new->monc.monmap->mon_inst[i].addr))
858 break;
859 if (i == new->monc.monmap->num_mon) {
860 dout("mon ip not part of monmap\n");
861 return 0;
862 }
863 dout("mon ip matches existing sb %p\n", sb);
864 }
865 if (args->sb_flags != other->mount_args->sb_flags) {
866 dout("flags differ\n");
867 return 0;
868 }
869 return 1;
870}
871
872/*
873 * construct our own bdi so we can control readahead, etc.
874 */
875static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
876{
877 int err;
878
879 sb->s_bdi = &client->backing_dev_info;
880
881 /* set ra_pages based on rsize mount option? */
882 if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
883 client->backing_dev_info.ra_pages =
884 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
885			>> PAGE_CACHE_SHIFT;
886 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
887 return err;
888}
889
890static int ceph_get_sb(struct file_system_type *fs_type,
891 int flags, const char *dev_name, void *data,
892 struct vfsmount *mnt)
893{
894 struct super_block *sb;
895 struct ceph_client *client;
896 int err;
897 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
898 const char *path = NULL;
899 struct ceph_mount_args *args;
900
901 dout("ceph_get_sb\n");
902 args = parse_mount_args(flags, data, dev_name, &path);
903 if (IS_ERR(args)) {
904 err = PTR_ERR(args);
905 goto out_final;
906 }
907
908 /* create client (which we may/may not use) */
909 client = ceph_create_client(args);
910 if (IS_ERR(client)) {
911 err = PTR_ERR(client);
912 goto out_final;
913 }
914
915 if (client->mount_args->flags & CEPH_OPT_NOSHARE)
916 compare_super = NULL;
917 sb = sget(fs_type, compare_super, ceph_set_super, client);
918 if (IS_ERR(sb)) {
919 err = PTR_ERR(sb);
920 goto out;
921 }
922
923 if (ceph_client(sb) != client) {
924 ceph_destroy_client(client);
925 client = ceph_client(sb);
926 dout("get_sb got existing client %p\n", client);
927 } else {
928 dout("get_sb using new client %p\n", client);
929 err = ceph_register_bdi(sb, client);
930 if (err < 0)
931 goto out_splat;
932 }
933
934 err = ceph_mount(client, mnt, path);
935 if (err < 0)
936 goto out_splat;
937 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
938 mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
939 return 0;
940
941out_splat:
942 ceph_mdsc_close_sessions(&client->mdsc);
943 up_write(&sb->s_umount);
944 deactivate_super(sb);
945 goto out_final;
946
947out:
948 ceph_destroy_client(client);
949out_final:
950 dout("ceph_get_sb fail %d\n", err);
951 return err;
952}
953
954static void ceph_kill_sb(struct super_block *s)
955{
956 struct ceph_client *client = ceph_sb_to_client(s);
957 dout("kill_sb %p\n", s);
958 ceph_mdsc_pre_umount(&client->mdsc);
959 kill_anon_super(s); /* will call put_super after sb is r/o */
960 if (s->s_bdi == &client->backing_dev_info)
961 bdi_unregister(&client->backing_dev_info);
962 bdi_destroy(&client->backing_dev_info);
963 ceph_destroy_client(client);
964}
965
966static struct file_system_type ceph_fs_type = {
967 .owner = THIS_MODULE,
968 .name = "ceph",
969 .get_sb = ceph_get_sb,
970 .kill_sb = ceph_kill_sb,
971 .fs_flags = FS_RENAME_DOES_D_MOVE,
972};
973
974#define _STRINGIFY(x) #x
975#define STRINGIFY(x) _STRINGIFY(x)
976
977static int __init init_ceph(void)
978{
979 int ret = 0;
980
981 ret = ceph_debugfs_init();
982 if (ret < 0)
983 goto out;
984
985 ret = ceph_msgr_init();
986 if (ret < 0)
987 goto out_debugfs;
988
989 ret = init_caches();
990 if (ret)
991 goto out_msgr;
992
993 ceph_caps_init();
994
995 ret = register_filesystem(&ceph_fs_type);
996 if (ret)
997 goto out_icache;
998
999 pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n",
1000 CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH,
1001 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL);
1002 return 0;
1003
1004out_icache:
1005 destroy_caches();
1006out_msgr:
1007 ceph_msgr_exit();
1008out_debugfs:
1009 ceph_debugfs_cleanup();
1010out:
1011 return ret;
1012}
1013
1014static void __exit exit_ceph(void)
1015{
1016 dout("exit_ceph\n");
1017 unregister_filesystem(&ceph_fs_type);
1018 ceph_caps_finalize();
1019 destroy_caches();
1020 ceph_msgr_exit();
1021 ceph_debugfs_cleanup();
1022}
1023
1024module_init(init_ceph);
1025module_exit(exit_ceph);
1026
1027MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
1028MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
1029MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
1030MODULE_DESCRIPTION("Ceph filesystem for Linux");
1031MODULE_LICENSE("GPL");
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
new file mode 100644
index 000000000000..ca702c67bc66
--- /dev/null
+++ b/fs/ceph/super.h
@@ -0,0 +1,902 @@
1#ifndef _FS_CEPH_SUPER_H
2#define _FS_CEPH_SUPER_H
3
4#include "ceph_debug.h"
5
6#include <asm/unaligned.h>
7#include <linux/backing-dev.h>
8#include <linux/completion.h>
9#include <linux/exportfs.h>
10#include <linux/fs.h>
11#include <linux/mempool.h>
12#include <linux/pagemap.h>
13#include <linux/wait.h>
14#include <linux/writeback.h>
15#include <linux/slab.h>
16
17#include "types.h"
18#include "messenger.h"
19#include "msgpool.h"
20#include "mon_client.h"
21#include "mds_client.h"
22#include "osd_client.h"
23#include "ceph_fs.h"
24
25/* f_type in struct statfs */
26#define CEPH_SUPER_MAGIC 0x00c36400
27
28/* large granularity for statfs utilization stats to facilitate
29 * large volume sizes on 32-bit machines. */
30#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
31#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
32
33/*
34 * mount options
35 */
36#define CEPH_OPT_FSID (1<<0)
37#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
38#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
39#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
40#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
41#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
42#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
43
44#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES)
45
46#define ceph_set_opt(client, opt) \
47 (client)->mount_args->flags |= CEPH_OPT_##opt;
48#define ceph_test_opt(client, opt) \
49 (!!((client)->mount_args->flags & CEPH_OPT_##opt))
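/*
 * Example (as used in ceph_mount() in super.c):
 *
 *	client->msgr->nocrc = ceph_test_opt(client, NOCRC);
 */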
50
51
52struct ceph_mount_args {
53 int sb_flags;
54 int num_mon;
55 struct ceph_entity_addr *mon_addr;
56 int flags;
57 int mount_timeout;
58 int osd_idle_ttl;
59 int caps_wanted_delay_min, caps_wanted_delay_max;
60 struct ceph_fsid fsid;
61 struct ceph_entity_addr my_addr;
62 int wsize;
63 int rsize; /* max readahead */
64 int max_readdir; /* max readdir size */
65	int congestion_kb;      /* writeback congestion threshold (kb) */
66 int osd_timeout;
67 int osd_keepalive_timeout;
68 char *snapdir_name; /* default ".snap" */
69 char *name;
70 char *secret;
71 int cap_release_safety;
72};
73
74/*
75 * defaults
76 */
77#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
78#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
79#define CEPH_OSD_KEEPALIVE_DEFAULT 5
80#define CEPH_OSD_IDLE_TTL_DEFAULT 60
81#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
82
83#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
84#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
85
86#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
87#define CEPH_AUTH_NAME_DEFAULT "guest"
88
89/*
90 * Delay telling the MDS we no longer want caps, in case we reopen
91 * the file. Delay a minimum amount of time, even if we send a cap
92 * message for some other reason.  Otherwise, take the opportunity to
93 * update the mds to avoid sending another message later.
94 */
95#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
96#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
97
98
99/* mount state */
100enum {
101 CEPH_MOUNT_MOUNTING,
102 CEPH_MOUNT_MOUNTED,
103 CEPH_MOUNT_UNMOUNTING,
104 CEPH_MOUNT_UNMOUNTED,
105 CEPH_MOUNT_SHUTDOWN,
106};
107
108/*
109 * subtract jiffies
110 */
111static inline unsigned long time_sub(unsigned long a, unsigned long b)
112{
113 BUG_ON(time_after(b, a));
114 return (long)a - (long)b;
115}
116
117/*
118 * per-filesystem client state
119 *
120 * possibly shared by multiple mount points, if they are
121 * mounting the same ceph filesystem/cluster.
122 */
123struct ceph_client {
124 struct ceph_fsid fsid;
125 bool have_fsid;
126
127 struct mutex mount_mutex; /* serialize mount attempts */
128 struct ceph_mount_args *mount_args;
129
130 struct super_block *sb;
131
132 unsigned long mount_state;
133 wait_queue_head_t auth_wq;
134
135 int auth_err;
136
137	int min_caps;                  /* min caps I added */
138
139 struct ceph_messenger *msgr; /* messenger instance */
140 struct ceph_mon_client monc;
141 struct ceph_mds_client mdsc;
142 struct ceph_osd_client osdc;
143
144 /* writeback */
145 mempool_t *wb_pagevec_pool;
146 struct workqueue_struct *wb_wq;
147 struct workqueue_struct *pg_inv_wq;
148 struct workqueue_struct *trunc_wq;
149 atomic_long_t writeback_count;
150
151 struct backing_dev_info backing_dev_info;
152
153#ifdef CONFIG_DEBUG_FS
154 struct dentry *debugfs_monmap;
155 struct dentry *debugfs_mdsmap, *debugfs_osdmap;
156 struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
157 struct dentry *debugfs_congestion_kb;
158 struct dentry *debugfs_bdi;
159#endif
160};
161
162static inline struct ceph_client *ceph_client(struct super_block *sb)
163{
164 return sb->s_fs_info;
165}
166
167
168/*
169 * File i/o capability. This tracks shared state with the metadata
170 * server that allows us to cache or writeback attributes or to read
171 * and write data. For any given inode, we should have one or more
172 * capabilities, one issued by each metadata server, and our
173 * cumulative access is the OR of all issued capabilities.
174 *
175 * Each cap is referenced by the inode's i_caps rbtree and by per-mds
176 * session capability lists.
177 */
178struct ceph_cap {
179 struct ceph_inode_info *ci;
180 struct rb_node ci_node; /* per-ci cap tree */
181 struct ceph_mds_session *session;
182 struct list_head session_caps; /* per-session caplist */
183 int mds;
184 u64 cap_id; /* unique cap id (mds provided) */
185 int issued; /* latest, from the mds */
186 int implemented; /* implemented superset of issued (for revocation) */
187 int mds_wanted;
188 u32 seq, issue_seq, mseq;
189 u32 cap_gen; /* active/stale cycle */
190 unsigned long last_used;
191 struct list_head caps_item;
192};
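To make the OR rule above concrete, a minimal sketch of an issued-caps walk (locking is elided, and struct ceph_inode_info is only defined further down; the authoritative version is __ceph_caps_issued(), declared later in this header):

static int example_caps_issued(struct ceph_inode_info *ci)
{
	struct rb_node *p;
	int have = 0;

	/* cumulative access = OR of what every mds has issued us */
	for (p = rb_first(&ci->i_caps); p; p = rb_next(p))
		have |= rb_entry(p, struct ceph_cap, ci_node)->issued;
	return have;
}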
193
194#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
195#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
196#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
197
198/*
199 * Snapped cap state that is pending flush to mds. When a snapshot occurs,
200 * we first complete any in-process sync writes and writeback any dirty
201 * data before flushing the snapped state (tracked here) back to the MDS.
202 */
203struct ceph_cap_snap {
204 atomic_t nref;
205 struct ceph_inode_info *ci;
206 struct list_head ci_item, flushing_item;
207
208 u64 follows, flush_tid;
209 int issued, dirty;
210 struct ceph_snap_context *context;
211
212 mode_t mode;
213 uid_t uid;
214 gid_t gid;
215
216 void *xattr_blob;
217 int xattr_len;
218 u64 xattr_version;
219
220 u64 size;
221 struct timespec mtime, atime, ctime;
222 u64 time_warp_seq;
223 int writing; /* a sync write is still in progress */
224 int dirty_pages; /* dirty pages awaiting writeback */
225};
226
227static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
228{
229 if (atomic_dec_and_test(&capsnap->nref))
230 kfree(capsnap);
231}
232
233/*
234 * The frag tree describes how a directory is fragmented, potentially across
235 * multiple metadata servers. It is also used to indicate points where
236 * metadata authority is delegated, and whether/where metadata is replicated.
237 *
238 * A _leaf_ frag will be present in the i_fragtree IFF there is
239 * delegation info. That is, if mds >= 0 || ndist > 0.
240 */
241#define CEPH_MAX_DIRFRAG_REP 4
242
243struct ceph_inode_frag {
244 struct rb_node node;
245
246 /* fragtree state */
247 u32 frag;
248 int split_by; /* i.e. 2^(split_by) children */
249
250 /* delegation and replication info */
251 int mds; /* -1 if same authority as parent */
252 int ndist; /* >0 if replicated */
253 int dist[CEPH_MAX_DIRFRAG_REP];
254};
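For illustration, a simplified fragtree descent (locking elided; __ceph_find_frag() is declared further down, and ceph_frag_make(), ceph_frag_make_child() and ceph_frag_contains_value() are assumed to come from ceph_frag.h). The authoritative walk is ceph_choose_frag() in inode.c:

static u32 example_choose_frag(struct ceph_inode_info *ci, u32 v)
{
	u32 t = ceph_frag_make(0, 0);	/* root frag covers all values */
	struct ceph_inode_frag *frag;
	unsigned nway, i;

	while ((frag = __ceph_find_frag(ci, t)) && frag->split_by > 0) {
		nway = 1 << frag->split_by;
		/* children partition the parent, so exactly one matches */
		for (i = 0; i < nway; i++) {
			u32 n = ceph_frag_make_child(t, frag->split_by, i);
			if (ceph_frag_contains_value(n, v)) {
				t = n;
				break;
			}
		}
	}
	return t;	/* deepest frag known to contain v */
}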
255
256/*
257 * We cache inode xattrs as an encoded blob until they are first used,
258 * at which point we parse them into an rbtree.
259 */
260struct ceph_inode_xattr {
261 struct rb_node node;
262
263 const char *name;
264 int name_len;
265 const char *val;
266 int val_len;
267 int dirty;
268
269 int should_free_name;
270 int should_free_val;
271};
272
273struct ceph_inode_xattrs_info {
274 /*
275 * (still encoded) xattr blob. we avoid the overhead of parsing
276 * this until someone actually calls getxattr, etc.
277 *
278 * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
279 * NULL means we don't know.
280 */
281 struct ceph_buffer *blob, *prealloc_blob;
282
283 struct rb_root index;
284 bool dirty;
285 int count;
286 int names_size;
287 int vals_size;
288 u64 version, index_version;
289};
290
291/*
292 * Ceph inode.
293 */
294#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
295#define CEPH_I_NODELAY 4 /* do not delay cap release */
296#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
297#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
298
299struct ceph_inode_info {
300 struct ceph_vino i_vino; /* ceph ino + snap */
301
302 u64 i_version;
303 u32 i_time_warp_seq;
304
305 unsigned i_ceph_flags;
306 unsigned long i_release_count;
307
308 struct ceph_file_layout i_layout;
309 char *i_symlink;
310
311 /* for dirs */
312 struct timespec i_rctime;
313 u64 i_rbytes, i_rfiles, i_rsubdirs;
314 u64 i_files, i_subdirs;
315 u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */
316
317 struct rb_root i_fragtree;
318 struct mutex i_fragtree_mutex;
319
320 struct ceph_inode_xattrs_info i_xattrs;
321
322 /* capabilities. protected _both_ by i_lock and cap->session's
323 * s_mutex. */
324 struct rb_root i_caps; /* cap list */
325 struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
326 unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
327 struct list_head i_dirty_item, i_flushing_item;
328 u64 i_cap_flush_seq;
329 /* we need to track cap writeback on a per-cap-bit basis, to allow
330 * overlapping, pipelined cap flushes to the mds. we can probably
331 * reduce the tid to 8 bits if we're concerned about inode size. */
332 u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
333 wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
334 unsigned long i_hold_caps_min; /* jiffies */
335 unsigned long i_hold_caps_max; /* jiffies */
336 struct list_head i_cap_delay_list; /* for delayed cap release to mds */
337 int i_cap_exporting_mds; /* to handle cap migration between */
338 unsigned i_cap_exporting_mseq; /* mds's. */
339 unsigned i_cap_exporting_issued;
340 struct ceph_cap_reservation i_cap_migration_resv;
341 struct list_head i_cap_snaps; /* snapped state pending flush to mds */
342 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */
343 unsigned i_snap_caps; /* cap bits for snapped files */
344
345 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
346
347 u32 i_truncate_seq; /* last truncate to smaller size */
348 u64 i_truncate_size; /* and the size we last truncated down to */
349 int i_truncate_pending; /* still need to call vmtruncate */
350
351 u64 i_max_size; /* max file size authorized by mds */
352 u64 i_reported_size; /* (max_)size reported to or requested of mds */
353	u64 i_wanted_max_size;     /* offset we'd like to write to */
354 u64 i_requested_max_size; /* max_size we've requested */
355
356 /* held references to caps */
357 int i_pin_ref;
358 int i_rd_ref, i_rdcache_ref, i_wr_ref;
359 int i_wrbuffer_ref, i_wrbuffer_ref_head;
360 u32 i_shared_gen; /* increment each time we get FILE_SHARED */
361 u32 i_rdcache_gen; /* we increment this each time we get
362 FILE_CACHE. If it's non-zero, we
363 _may_ have cached pages. */
364 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
365
366 struct list_head i_unsafe_writes; /* uncommitted sync writes */
367 struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
368 spinlock_t i_unsafe_lock;
369
370 struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
371 int i_snap_realm_counter; /* snap realm (if caps) */
372 struct list_head i_snap_realm_item;
373 struct list_head i_snap_flush_item;
374
375 struct work_struct i_wb_work; /* writeback work */
376 struct work_struct i_pg_inv_work; /* page invalidation work */
377
378 struct work_struct i_vmtruncate_work;
379
380 struct inode vfs_inode; /* at end */
381};
382
383static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
384{
385 return container_of(inode, struct ceph_inode_info, vfs_inode);
386}
387
388static inline void ceph_i_clear(struct inode *inode, unsigned mask)
389{
390 struct ceph_inode_info *ci = ceph_inode(inode);
391
392 spin_lock(&inode->i_lock);
393 ci->i_ceph_flags &= ~mask;
394 spin_unlock(&inode->i_lock);
395}
396
397static inline void ceph_i_set(struct inode *inode, unsigned mask)
398{
399 struct ceph_inode_info *ci = ceph_inode(inode);
400
401 spin_lock(&inode->i_lock);
402 ci->i_ceph_flags |= mask;
403 spin_unlock(&inode->i_lock);
404}
405
406static inline bool ceph_i_test(struct inode *inode, unsigned mask)
407{
408 struct ceph_inode_info *ci = ceph_inode(inode);
409 bool r;
410
411 smp_mb();
412 r = (ci->i_ceph_flags & mask) == mask;
413 return r;
414}
415
416
417/* find a specific frag @f */
418extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
419 u32 f);
420
421/*
422 * choose fragment for value @v. copy frag content to pfrag, if leaf
423 * exists
424 */
425extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
426 struct ceph_inode_frag *pfrag,
427 int *found);
428
429/*
430 * Ceph dentry state
431 */
432struct ceph_dentry_info {
433 struct ceph_mds_session *lease_session;
434 u32 lease_gen, lease_shared_gen;
435 u32 lease_seq;
436 unsigned long lease_renew_after, lease_renew_from;
437 struct list_head lru;
438 struct dentry *dentry;
439 u64 time;
440 u64 offset;
441};
442
443static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
444{
445 return (struct ceph_dentry_info *)dentry->d_fsdata;
446}
447
448static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
449{
450 return ((loff_t)frag << 32) | (loff_t)off;
451}
452
453/*
454 * ino_t is <64 bits on many architectures, blech.
455 *
456 * don't include snap in ino hash, at least for now.
457 */
458static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
459{
460 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
461#if BITS_PER_LONG == 32
462 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
463 if (!ino)
464 ino = 1;
465#endif
466 return ino;
467}
468
469static inline int ceph_set_ino_cb(struct inode *inode, void *data)
470{
471 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
472 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
473 return 0;
474}
475
476static inline struct ceph_vino ceph_vino(struct inode *inode)
477{
478 return ceph_inode(inode)->i_vino;
479}
480
481/* for printf-style formatting */
482#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
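Illustrative use, matching the %llx.%llx convention the dout() messages in this series rely on:

	dout("inode %p is %llx.%llx\n", inode, ceph_vinop(inode));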
483
484static inline u64 ceph_ino(struct inode *inode)
485{
486 return ceph_inode(inode)->i_vino.ino;
487}
488static inline u64 ceph_snap(struct inode *inode)
489{
490 return ceph_inode(inode)->i_vino.snap;
491}
492
493static inline int ceph_ino_compare(struct inode *inode, void *data)
494{
495 struct ceph_vino *pvino = (struct ceph_vino *)data;
496 struct ceph_inode_info *ci = ceph_inode(inode);
497 return ci->i_vino.ino == pvino->ino &&
498 ci->i_vino.snap == pvino->snap;
499}
500
501static inline struct inode *ceph_find_inode(struct super_block *sb,
502 struct ceph_vino vino)
503{
504 ino_t t = ceph_vino_to_ino(vino);
505 return ilookup5(sb, t, ceph_ino_compare, &vino);
506}
507
508
509/*
510 * caps helpers
511 */
512static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
513{
514 return !RB_EMPTY_ROOT(&ci->i_caps);
515}
516
517extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
518extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
519extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
520 struct ceph_cap *cap);
521
522static inline int ceph_caps_issued(struct ceph_inode_info *ci)
523{
524 int issued;
525 spin_lock(&ci->vfs_inode.i_lock);
526 issued = __ceph_caps_issued(ci, NULL);
527 spin_unlock(&ci->vfs_inode.i_lock);
528 return issued;
529}
530
531static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
532 int touch)
533{
534 int r;
535 spin_lock(&ci->vfs_inode.i_lock);
536 r = __ceph_caps_issued_mask(ci, mask, touch);
537 spin_unlock(&ci->vfs_inode.i_lock);
538 return r;
539}
540
541static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
542{
543 return ci->i_dirty_caps | ci->i_flushing_caps;
544}
545extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
546
547extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
548extern int __ceph_caps_used(struct ceph_inode_info *ci);
549
550extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
551
552/*
553 * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
554 */
555static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
556{
557 int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
558 if (w & CEPH_CAP_FILE_BUFFER)
559 w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
560 return w;
561}
562
563/* what the mds thinks we want */
564extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
565
566extern void ceph_caps_init(void);
567extern void ceph_caps_finalize(void);
568extern void ceph_adjust_min_caps(int delta);
569extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
570extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
571extern void ceph_reservation_status(struct ceph_client *client,
572 int *total, int *avail, int *used,
573 int *reserved, int *min);
574
575static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
576{
577 return (struct ceph_client *)inode->i_sb->s_fs_info;
578}
579
580static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
581{
582 return (struct ceph_client *)sb->s_fs_info;
583}
584
585
586/*
587 * we keep buffered readdir results attached to file->private_data
588 */
589struct ceph_file_info {
590 int fmode; /* initialized on open */
591
592 /* readdir: position within the dir */
593 u32 frag;
594 struct ceph_mds_request *last_readdir;
595 int at_end;
596
597 /* readdir: position within a frag */
598 unsigned offset; /* offset of last chunk, adjusted for . and .. */
599 u64 next_offset; /* offset of next chunk (last_name's + 1) */
600 char *last_name; /* last entry in previous chunk */
601 struct dentry *dentry; /* next dentry (for dcache readdir) */
602 unsigned long dir_release_count;
603
604 /* used for -o dirstat read() on directory thing */
605 char *dir_info;
606 int dir_info_len;
607};
608
609
610
611/*
612 * snapshots
613 */
614
615/*
616 * A "snap context" is the set of existing snapshots when we
617 * write data. It is used by the OSD to guide its COW behavior.
618 *
619 * The ceph_snap_context is refcounted, and attached to each dirty
620 * page, indicating which context the dirty data belonged when it was
621 * page, indicating which context the dirty data belonged to when it was
622 */
623struct ceph_snap_context {
624 atomic_t nref;
625 u64 seq;
626 int num_snaps;
627 u64 snaps[];
628};
629
630static inline struct ceph_snap_context *
631ceph_get_snap_context(struct ceph_snap_context *sc)
632{
633 /*
634 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
635 atomic_read(&sc->nref)+1);
636 */
637 if (sc)
638 atomic_inc(&sc->nref);
639 return sc;
640}
641
642static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
643{
644 if (!sc)
645 return;
646 /*
647 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
648 atomic_read(&sc->nref)-1);
649 */
650 if (atomic_dec_and_test(&sc->nref)) {
651 /*printk(" deleting snap_context %p\n", sc);*/
652 kfree(sc);
653 }
654}
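A hedged sketch of constructing one of these, showing how the flexible snaps[] array shares the allocation (the helper name is made up; the authoritative construction is in snap.c):

static struct ceph_snap_context *example_alloc_snapc(int num, gfp_t gfp)
{
	struct ceph_snap_context *sc;

	/* one allocation covers the header plus the snaps[] array */
	sc = kzalloc(sizeof(*sc) + num * sizeof(u64), gfp);
	if (!sc)
		return NULL;
	atomic_set(&sc->nref, 1);
	sc->num_snaps = num;
	return sc;
}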
655
656/*
657 * A "snap realm" describes a subset of the file hierarchy sharing
658 * the same set of snapshots that apply to it. The realms themselves
659 * are organized into a hierarchy, such that children inherit (some of)
660 * the snapshots of their parents.
661 *
662 * All inodes within the realm that have capabilities are linked into a
663 * per-realm list.
664 */
665struct ceph_snap_realm {
666 u64 ino;
667 atomic_t nref;
668 struct rb_node node;
669
670 u64 created, seq;
671 u64 parent_ino;
672 u64 parent_since; /* snapid when our current parent became so */
673
674 u64 *prior_parent_snaps; /* snaps inherited from any parents we */
675 int num_prior_parent_snaps; /* had prior to parent_since */
676 u64 *snaps; /* snaps specific to this realm */
677 int num_snaps;
678
679 struct ceph_snap_realm *parent;
680 struct list_head children; /* list of child realms */
681 struct list_head child_item;
682
683 struct list_head empty_item; /* if i have ref==0 */
684
685 /* the current set of snaps for this realm */
686 struct ceph_snap_context *cached_context;
687
688 struct list_head inodes_with_caps;
689 spinlock_t inodes_with_caps_lock;
690};
691
692
693
694/*
695 * calculate the number of pages a given length and offset map onto,
696 * if we align the data.
697 */
698static inline int calc_pages_for(u64 off, u64 len)
699{
700 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
701 (off >> PAGE_CACHE_SHIFT);
702}
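Two worked examples, assuming 4 KB pages (PAGE_CACHE_SHIFT == 12):

/*
 * calc_pages_for(100, 5000):  bytes 100..5099 touch pages 0 and 1:
 *	((100 + 5000 + 4095) >> 12) - (100 >> 12) = 2 - 0 = 2
 * calc_pages_for(4096, 4096): exactly page 1:
 *	((4096 + 4096 + 4095) >> 12) - (4096 >> 12) = 2 - 1 = 1
 */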
703
704
705
706/* snap.c */
707struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
708 u64 ino);
709extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
710 struct ceph_snap_realm *realm);
711extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
712 struct ceph_snap_realm *realm);
713extern int ceph_update_snap_trace(struct ceph_mds_client *m,
714 void *p, void *e, bool deletion);
715extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
716 struct ceph_mds_session *session,
717 struct ceph_msg *msg);
718extern void ceph_queue_cap_snap(struct ceph_inode_info *ci,
719 struct ceph_snap_context *snapc);
720extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
721 struct ceph_cap_snap *capsnap);
722extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
723
724/*
725 * a cap_snap is "pending" if it is still awaiting an in-progress
726 * sync write (that may/may not still update size, mtime, etc.).
727 */
728static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
729{
730 return !list_empty(&ci->i_cap_snaps) &&
731 list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
732 ci_item)->writing;
733}
734
735
736/* super.c */
737extern struct kmem_cache *ceph_inode_cachep;
738extern struct kmem_cache *ceph_cap_cachep;
739extern struct kmem_cache *ceph_dentry_cachep;
740extern struct kmem_cache *ceph_file_cachep;
741
742extern const char *ceph_msg_type_name(int type);
743extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
744
745#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
746 "%02x%02x%02x%02x%02x%02x"
747#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
748 (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \
749 (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
750 (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
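Illustrative use (any struct ceph_fsid pointer works, e.g. &client->fsid):

	pr_info("ceph: mounted fsid " FSID_FORMAT "\n",
		PR_FSID(&client->fsid));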
751
752/* inode.c */
753extern const struct inode_operations ceph_file_iops;
754
755extern struct inode *ceph_alloc_inode(struct super_block *sb);
756extern void ceph_destroy_inode(struct inode *inode);
757
758extern struct inode *ceph_get_inode(struct super_block *sb,
759 struct ceph_vino vino);
760extern struct inode *ceph_get_snapdir(struct inode *parent);
761extern int ceph_fill_file_size(struct inode *inode, int issued,
762 u32 truncate_seq, u64 truncate_size, u64 size);
763extern void ceph_fill_file_time(struct inode *inode, int issued,
764 u64 time_warp_seq, struct timespec *ctime,
765 struct timespec *mtime, struct timespec *atime);
766extern int ceph_fill_trace(struct super_block *sb,
767 struct ceph_mds_request *req,
768 struct ceph_mds_session *session);
769extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
770 struct ceph_mds_session *session);
771
772extern int ceph_inode_holds_cap(struct inode *inode, int mask);
773
774extern int ceph_inode_set_size(struct inode *inode, loff_t size);
775extern void __ceph_do_pending_vmtruncate(struct inode *inode);
776extern void ceph_queue_vmtruncate(struct inode *inode);
777
778extern void ceph_queue_invalidate(struct inode *inode);
779extern void ceph_queue_writeback(struct inode *inode);
780
781extern int ceph_do_getattr(struct inode *inode, int mask);
782extern int ceph_permission(struct inode *inode, int mask);
783extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
784extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
785 struct kstat *stat);
786
787/* xattr.c */
788extern int ceph_setxattr(struct dentry *, const char *, const void *,
789 size_t, int);
790extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
791extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
792extern int ceph_removexattr(struct dentry *, const char *);
793extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
794extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
795
796/* caps.c */
797extern const char *ceph_cap_string(int c);
798extern void ceph_handle_caps(struct ceph_mds_session *session,
799 struct ceph_msg *msg);
800extern int ceph_add_cap(struct inode *inode,
801 struct ceph_mds_session *session, u64 cap_id,
802 int fmode, unsigned issued, unsigned wanted,
803 unsigned cap, unsigned seq, u64 realmino, int flags,
804 struct ceph_cap_reservation *caps_reservation);
805extern void __ceph_remove_cap(struct ceph_cap *cap);
806static inline void ceph_remove_cap(struct ceph_cap *cap)
807{
808 struct inode *inode = &cap->ci->vfs_inode;
809 spin_lock(&inode->i_lock);
810 __ceph_remove_cap(cap);
811 spin_unlock(&inode->i_lock);
812}
813extern void ceph_put_cap(struct ceph_cap *cap);
814
815extern void ceph_queue_caps_release(struct inode *inode);
816extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
817extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
818extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
819 struct ceph_mds_session *session);
820extern int ceph_get_cap_mds(struct inode *inode);
821extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
822extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
823extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
824 struct ceph_snap_context *snapc);
825extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
826 struct ceph_mds_session **psession);
827extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
828 struct ceph_mds_session *session);
829extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
830extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
831
832extern int ceph_encode_inode_release(void **p, struct inode *inode,
833 int mds, int drop, int unless, int force);
834extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
835 int mds, int drop, int unless);
836
837extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
838 int *got, loff_t endoff);
839
840/* for counting open files by mode */
841static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
842{
843 ci->i_nr_by_mode[mode]++;
844}
845extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
846
847/* addr.c */
848extern const struct address_space_operations ceph_aops;
849extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
850
851/* file.c */
852extern const struct file_operations ceph_file_fops;
853extern const struct address_space_operations ceph_aops;
854extern int ceph_open(struct inode *inode, struct file *file);
855extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
856 struct nameidata *nd, int mode,
857 int locked_dir);
858extern int ceph_release(struct inode *inode, struct file *filp);
859extern void ceph_release_page_vector(struct page **pages, int num_pages);
860
861/* dir.c */
862extern const struct file_operations ceph_dir_fops;
863extern const struct inode_operations ceph_dir_iops;
864extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
865 ceph_snapdir_dentry_ops;
866
867extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
868extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
869 struct dentry *dentry, int err);
870
871extern void ceph_dentry_lru_add(struct dentry *dn);
872extern void ceph_dentry_lru_touch(struct dentry *dn);
873extern void ceph_dentry_lru_del(struct dentry *dn);
874
875/*
876 * our d_ops vary depending on whether the inode is live,
877 * snapshotted (read-only), or a virtual ".snap" directory.
878 */
879int ceph_init_dentry(struct dentry *dentry);
880
881
882/* ioctl.c */
883extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
884
885/* export.c */
886extern const struct export_operations ceph_export_ops;
887
888/* debugfs.c */
889extern int ceph_debugfs_init(void);
890extern void ceph_debugfs_cleanup(void);
891extern int ceph_debugfs_client_init(struct ceph_client *client);
892extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
893
894static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
895{
896 if (dentry && dentry->d_parent)
897 return dentry->d_parent->d_inode;
898
899 return NULL;
900}
901
902#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
new file mode 100644
index 000000000000..28b35a005ec2
--- /dev/null
+++ b/fs/ceph/types.h
@@ -0,0 +1,29 @@
1#ifndef _FS_CEPH_TYPES_H
2#define _FS_CEPH_TYPES_H
3
4/* needed before including ceph_fs.h */
5#include <linux/in.h>
6#include <linux/types.h>
7#include <linux/fcntl.h>
8#include <linux/string.h>
9
10#include "ceph_fs.h"
11#include "ceph_frag.h"
12#include "ceph_hash.h"
13
14/*
15 * Identify inodes by both their ino AND snapshot id (a u64).
16 */
17struct ceph_vino {
18 u64 ino;
19 u64 snap;
20};
21
22
23/* context for the caps reservation mechanism */
24struct ceph_cap_reservation {
25 int count;
26};
27
28
29#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
new file mode 100644
index 000000000000..2845422907fc
--- /dev/null
+++ b/fs/ceph/xattr.c
@@ -0,0 +1,845 @@
1#include "ceph_debug.h"
2#include "super.h"
3#include "decode.h"
4
5#include <linux/xattr.h>
6#include <linux/slab.h>
7
8static bool ceph_is_valid_xattr(const char *name)
9{
10 return !strncmp(name, XATTR_SECURITY_PREFIX,
11 XATTR_SECURITY_PREFIX_LEN) ||
12 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
13 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
14}
15
16/*
17 * These define virtual xattrs exposing the recursive directory
18 * statistics and layout metadata.
19 */
20struct ceph_vxattr_cb {
21 bool readonly;
22 char *name;
23 size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
24 size_t size);
25};
26
27/* directories */
28
29static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
30 size_t size)
31{
32 return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
33}
34
35static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
36 size_t size)
37{
38 return snprintf(val, size, "%lld", ci->i_files);
39}
40
41static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
42 size_t size)
43{
44 return snprintf(val, size, "%lld", ci->i_subdirs);
45}
46
47static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
48 size_t size)
49{
50 return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
51}
52
53static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
54 size_t size)
55{
56 return snprintf(val, size, "%lld", ci->i_rfiles);
57}
58
59static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
60 size_t size)
61{
62 return snprintf(val, size, "%lld", ci->i_rsubdirs);
63}
64
65static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
66 size_t size)
67{
68 return snprintf(val, size, "%lld", ci->i_rbytes);
69}
70
71static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
72 size_t size)
73{
74 return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
75 (long)ci->i_rctime.tv_nsec);
76}
77
78static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
79 { true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
80 { true, "user.ceph.dir.files", ceph_vxattrcb_files},
81 { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
82 { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
83 { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
84 { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
85 { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
86 { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
87 { true, NULL, NULL }
88};
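From userspace these appear as ordinary extended attributes; a hedged sketch using the standard getxattr(2) call (the mount path is made up):

#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(void)
{
	char buf[64];
	ssize_t n = getxattr("/mnt/ceph/some/dir", "user.ceph.dir.rbytes",
			     buf, sizeof(buf) - 1);

	if (n >= 0) {
		buf[n] = '\0';	/* the kernel returns the raw digits */
		printf("recursive bytes: %s\n", buf);
	}
	return 0;
}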
89
90/* files */
91
92static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
93 size_t size)
94{
95 int ret;
96
97 ret = snprintf(val, size,
98		"chunk_bytes=%llu\nstripe_count=%llu\nobject_size=%llu\n",
99 (unsigned long long)ceph_file_layout_su(ci->i_layout),
100 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
101 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
102 if (ceph_file_layout_pg_preferred(ci->i_layout))
103		ret += snprintf(val + ret, size - ret, "preferred_osd=%llu\n",
104 (unsigned long long)ceph_file_layout_pg_preferred(
105 ci->i_layout));
106 return ret;
107}
108
109static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
110 { true, "user.ceph.layout", ceph_vxattrcb_layout},
111	{ true, NULL, NULL }
112};
113
114static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
115{
116 if (S_ISDIR(inode->i_mode))
117 return ceph_dir_vxattrs;
118 else if (S_ISREG(inode->i_mode))
119 return ceph_file_vxattrs;
120 return NULL;
121}
122
123static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
124 const char *name)
125{
126 do {
127 if (strcmp(vxattr->name, name) == 0)
128 return vxattr;
129 vxattr++;
130 } while (vxattr->name);
131 return NULL;
132}
133
134static int __set_xattr(struct ceph_inode_info *ci,
135 const char *name, int name_len,
136 const char *val, int val_len,
137 int dirty,
138 int should_free_name, int should_free_val,
139 struct ceph_inode_xattr **newxattr)
140{
141 struct rb_node **p;
142 struct rb_node *parent = NULL;
143 struct ceph_inode_xattr *xattr = NULL;
144 int c;
145 int new = 0;
146
147 p = &ci->i_xattrs.index.rb_node;
148 while (*p) {
149 parent = *p;
150 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
151 c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
152 if (c < 0)
153 p = &(*p)->rb_left;
154 else if (c > 0)
155 p = &(*p)->rb_right;
156 else {
157 if (name_len == xattr->name_len)
158 break;
159 else if (name_len < xattr->name_len)
160 p = &(*p)->rb_left;
161 else
162 p = &(*p)->rb_right;
163 }
164 xattr = NULL;
165 }
166
167 if (!xattr) {
168 new = 1;
169 xattr = *newxattr;
170 xattr->name = name;
171 xattr->name_len = name_len;
172 xattr->should_free_name = should_free_name;
173
174 ci->i_xattrs.count++;
175 dout("__set_xattr count=%d\n", ci->i_xattrs.count);
176 } else {
177 kfree(*newxattr);
178 *newxattr = NULL;
179 if (xattr->should_free_val)
180 kfree((void *)xattr->val);
181
182 if (should_free_name) {
183 kfree((void *)name);
184 name = xattr->name;
185 }
186 ci->i_xattrs.names_size -= xattr->name_len;
187 ci->i_xattrs.vals_size -= xattr->val_len;
188 }
189 if (!xattr) {
190		pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s\n",
191		       &ci->vfs_inode, ceph_vinop(&ci->vfs_inode),
192		       name);
193 return -ENOMEM;
194 }
195 ci->i_xattrs.names_size += name_len;
196 ci->i_xattrs.vals_size += val_len;
197 if (val)
198 xattr->val = val;
199 else
200 xattr->val = "";
201
202 xattr->val_len = val_len;
203 xattr->dirty = dirty;
204 xattr->should_free_val = (val && should_free_val);
205
206 if (new) {
207 rb_link_node(&xattr->node, parent, p);
208 rb_insert_color(&xattr->node, &ci->i_xattrs.index);
209 dout("__set_xattr_val p=%p\n", p);
210 }
211
212 dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
213 ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
214
215 return 0;
216}
217
218static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
219 const char *name)
220{
221 struct rb_node **p;
222 struct rb_node *parent = NULL;
223 struct ceph_inode_xattr *xattr = NULL;
224 int c;
225
226 p = &ci->i_xattrs.index.rb_node;
227 while (*p) {
228 parent = *p;
229 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
230 c = strncmp(name, xattr->name, xattr->name_len);
231 if (c < 0)
232 p = &(*p)->rb_left;
233 else if (c > 0)
234 p = &(*p)->rb_right;
235 else {
236 dout("__get_xattr %s: found %.*s\n", name,
237 xattr->val_len, xattr->val);
238 return xattr;
239 }
240 }
241
242 dout("__get_xattr %s: not found\n", name);
243
244 return NULL;
245}
246
247static void __free_xattr(struct ceph_inode_xattr *xattr)
248{
249 BUG_ON(!xattr);
250
251 if (xattr->should_free_name)
252 kfree((void *)xattr->name);
253 if (xattr->should_free_val)
254 kfree((void *)xattr->val);
255
256 kfree(xattr);
257}
258
259static int __remove_xattr(struct ceph_inode_info *ci,
260 struct ceph_inode_xattr *xattr)
261{
262 if (!xattr)
263 return -EOPNOTSUPP;
264
265 rb_erase(&xattr->node, &ci->i_xattrs.index);
266
267 if (xattr->should_free_name)
268 kfree((void *)xattr->name);
269 if (xattr->should_free_val)
270 kfree((void *)xattr->val);
271
272 ci->i_xattrs.names_size -= xattr->name_len;
273 ci->i_xattrs.vals_size -= xattr->val_len;
274 ci->i_xattrs.count--;
275 kfree(xattr);
276
277 return 0;
278}
279
280static int __remove_xattr_by_name(struct ceph_inode_info *ci,
281 const char *name)
282{
283	struct ceph_inode_xattr *xattr;
284	int err;
285
286	xattr = __get_xattr(ci, name);
289 err = __remove_xattr(ci, xattr);
290 return err;
291}
292
293static char *__copy_xattr_names(struct ceph_inode_info *ci,
294 char *dest)
295{
296 struct rb_node *p;
297 struct ceph_inode_xattr *xattr = NULL;
298
299 p = rb_first(&ci->i_xattrs.index);
300 dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
301
302 while (p) {
303 xattr = rb_entry(p, struct ceph_inode_xattr, node);
304 memcpy(dest, xattr->name, xattr->name_len);
305 dest[xattr->name_len] = '\0';
306
307 dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
308 xattr->name_len, ci->i_xattrs.names_size);
309
310 dest += xattr->name_len + 1;
311 p = rb_next(p);
312 }
313
314 return dest;
315}
316
317void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
318{
319 struct rb_node *p, *tmp;
320 struct ceph_inode_xattr *xattr = NULL;
321
322 p = rb_first(&ci->i_xattrs.index);
323
324 dout("__ceph_destroy_xattrs p=%p\n", p);
325
326 while (p) {
327 xattr = rb_entry(p, struct ceph_inode_xattr, node);
328 tmp = p;
329 p = rb_next(tmp);
330 dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
331 xattr->name_len, xattr->name);
332 rb_erase(tmp, &ci->i_xattrs.index);
333
334 __free_xattr(xattr);
335 }
336
337 ci->i_xattrs.names_size = 0;
338 ci->i_xattrs.vals_size = 0;
339 ci->i_xattrs.index_version = 0;
340 ci->i_xattrs.count = 0;
341 ci->i_xattrs.index = RB_ROOT;
342}
343
344static int __build_xattrs(struct inode *inode)
345{
346 u32 namelen;
347 u32 numattr = 0;
348 void *p, *end;
349 u32 len;
350 const char *name, *val;
351 struct ceph_inode_info *ci = ceph_inode(inode);
352 int xattr_version;
353 struct ceph_inode_xattr **xattrs = NULL;
354 int err = 0;
355 int i;
356
357 dout("__build_xattrs() len=%d\n",
358 ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
359
360 if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
361 return 0; /* already built */
362
363 __ceph_destroy_xattrs(ci);
364
365start:
366	/* update the internal xattr rb tree */
367 if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
368 p = ci->i_xattrs.blob->vec.iov_base;
369 end = p + ci->i_xattrs.blob->vec.iov_len;
370 ceph_decode_32_safe(&p, end, numattr, bad);
371 xattr_version = ci->i_xattrs.version;
372 spin_unlock(&inode->i_lock);
373
374		xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *),
375				 GFP_NOFS);
376		err = -ENOMEM;
377		if (!xattrs)
378			goto bad_lock;
380 for (i = 0; i < numattr; i++) {
381 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
382 GFP_NOFS);
383 if (!xattrs[i])
384 goto bad_lock;
385 }
386
387 spin_lock(&inode->i_lock);
388 if (ci->i_xattrs.version != xattr_version) {
389 /* lost a race, retry */
390 for (i = 0; i < numattr; i++)
391 kfree(xattrs[i]);
392 kfree(xattrs);
393 goto start;
394 }
395 err = -EIO;
396 while (numattr--) {
397 ceph_decode_32_safe(&p, end, len, bad);
398 namelen = len;
399 name = p;
400 p += len;
401 ceph_decode_32_safe(&p, end, len, bad);
402 val = p;
403 p += len;
404
405 err = __set_xattr(ci, name, namelen, val, len,
406 0, 0, 0, &xattrs[numattr]);
407
408 if (err < 0)
409 goto bad;
410 }
411 kfree(xattrs);
412 }
413 ci->i_xattrs.index_version = ci->i_xattrs.version;
414 ci->i_xattrs.dirty = false;
415
416 return err;
417bad_lock:
418 spin_lock(&inode->i_lock);
419bad:
420 if (xattrs) {
421 for (i = 0; i < numattr; i++)
422 kfree(xattrs[i]);
423 kfree(xattrs);
424 }
425 ci->i_xattrs.names_size = 0;
426 return err;
427}
428
429static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
430 int val_size)
431{
432 /*
433	 * 4 bytes for the count, plus 4 length bytes for each xattr name
434	 * and 4 for each value
435 */
436 int size = 4 + ci->i_xattrs.count*(4 + 4) +
437 ci->i_xattrs.names_size +
438 ci->i_xattrs.vals_size;
439 dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
440 ci->i_xattrs.count, ci->i_xattrs.names_size,
441 ci->i_xattrs.vals_size);
442
443 if (name_size)
444 size += 4 + 4 + name_size + val_size;
445
446 return size;
447}
448
449/*
450 * If there are dirty xattrs, reencode xattrs into the prealloc_blob
451 * and swap into place.
452 */
453void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
454{
455 struct rb_node *p;
456 struct ceph_inode_xattr *xattr = NULL;
457 void *dest;
458
459 dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
460 if (ci->i_xattrs.dirty) {
461 int need = __get_required_blob_size(ci, 0, 0);
462
463 BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
464
465 p = rb_first(&ci->i_xattrs.index);
466 dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
467
468 ceph_encode_32(&dest, ci->i_xattrs.count);
469 while (p) {
470 xattr = rb_entry(p, struct ceph_inode_xattr, node);
471
472 ceph_encode_32(&dest, xattr->name_len);
473 memcpy(dest, xattr->name, xattr->name_len);
474 dest += xattr->name_len;
475 ceph_encode_32(&dest, xattr->val_len);
476 memcpy(dest, xattr->val, xattr->val_len);
477 dest += xattr->val_len;
478
479 p = rb_next(p);
480 }
481
482 /* adjust buffer len; it may be larger than we need */
483 ci->i_xattrs.prealloc_blob->vec.iov_len =
484 dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
485
486 if (ci->i_xattrs.blob)
487 ceph_buffer_put(ci->i_xattrs.blob);
488 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
489 ci->i_xattrs.prealloc_blob = NULL;
490 ci->i_xattrs.dirty = false;
491 }
492}
493
494ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
495 size_t size)
496{
497 struct inode *inode = dentry->d_inode;
498 struct ceph_inode_info *ci = ceph_inode(inode);
499 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
500 int err;
501 struct ceph_inode_xattr *xattr;
502 struct ceph_vxattr_cb *vxattr = NULL;
503
504 if (!ceph_is_valid_xattr(name))
505 return -ENODATA;
506
507 /* let's see if a virtual xattr was requested */
508 if (vxattrs)
509 vxattr = ceph_match_vxattr(vxattrs, name);
510
511 spin_lock(&inode->i_lock);
512 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
513 ci->i_xattrs.version, ci->i_xattrs.index_version);
514
515 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
516 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
517 goto get_xattr;
518 } else {
519 spin_unlock(&inode->i_lock);
520 /* get xattrs from mds (if we don't already have them) */
521 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
522 if (err)
523 return err;
524 }
525
526 spin_lock(&inode->i_lock);
527
528 if (vxattr && vxattr->readonly) {
529 err = vxattr->getxattr_cb(ci, value, size);
530 goto out;
531 }
532
533 err = __build_xattrs(inode);
534 if (err < 0)
535 goto out;
536
537get_xattr:
538 err = -ENODATA; /* == ENOATTR */
539 xattr = __get_xattr(ci, name);
540 if (!xattr) {
541 if (vxattr)
542 err = vxattr->getxattr_cb(ci, value, size);
543 goto out;
544 }
545
546 err = -ERANGE;
547 if (size && size < xattr->val_len)
548 goto out;
549
550 err = xattr->val_len;
551 if (size == 0)
552 goto out;
553
554 memcpy(value, xattr->val, xattr->val_len);
555
556out:
557 spin_unlock(&inode->i_lock);
558 return err;
559}
560
561ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
562{
563 struct inode *inode = dentry->d_inode;
564 struct ceph_inode_info *ci = ceph_inode(inode);
565 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
566 u32 vir_namelen = 0;
567 u32 namelen;
568 int err;
569 u32 len;
570 int i;
571
572 spin_lock(&inode->i_lock);
573 dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
574 ci->i_xattrs.version, ci->i_xattrs.index_version);
575
576 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
577	    (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
578 goto list_xattr;
579 } else {
580 spin_unlock(&inode->i_lock);
581 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
582 if (err)
583 return err;
584 }
585
586 spin_lock(&inode->i_lock);
587
588 err = __build_xattrs(inode);
589 if (err < 0)
590 goto out;
591
592list_xattr:
593 vir_namelen = 0;
594 /* include virtual dir xattrs */
595 if (vxattrs)
596 for (i = 0; vxattrs[i].name; i++)
597 vir_namelen += strlen(vxattrs[i].name) + 1;
598	/* plus 1 byte for each name's null terminator */
599 namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
600 err = -ERANGE;
601 if (size && namelen > size)
602 goto out;
603
604 err = namelen;
605 if (size == 0)
606 goto out;
607
608 names = __copy_xattr_names(ci, names);
609
610 /* virtual xattr names, too */
611 if (vxattrs)
612 for (i = 0; vxattrs[i].name; i++) {
613 len = sprintf(names, "%s", vxattrs[i].name);
614 names += len + 1;
615 }
616
617out:
618 spin_unlock(&inode->i_lock);
619 return err;
620}
621
622static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
623 const char *value, size_t size, int flags)
624{
625 struct ceph_client *client = ceph_client(dentry->d_sb);
626 struct inode *inode = dentry->d_inode;
627 struct ceph_inode_info *ci = ceph_inode(inode);
628 struct inode *parent_inode = dentry->d_parent->d_inode;
629 struct ceph_mds_request *req;
630 struct ceph_mds_client *mdsc = &client->mdsc;
631 int err;
632 int i, nr_pages;
633 struct page **pages = NULL;
634 void *kaddr;
635
636 /* copy value into some pages */
637 nr_pages = calc_pages_for(0, size);
638 if (nr_pages) {
639 pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
640 if (!pages)
641 return -ENOMEM;
642 err = -ENOMEM;
643 for (i = 0; i < nr_pages; i++) {
644 pages[i] = alloc_page(GFP_NOFS);
645 if (!pages[i]) {
646 nr_pages = i;
647 goto out;
648 }
649			kaddr = kmap(pages[i]);
650			memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
651			       min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
652			kunmap(pages[i]);
652 }
653 }
654
655 dout("setxattr value=%.*s\n", (int)size, value);
656
657 /* do request */
658 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
659 USE_AUTH_MDS);
660 if (IS_ERR(req)) {
661 err = PTR_ERR(req);
662 goto out;
663 }
664 req->r_inode = igrab(inode);
665 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
666 req->r_num_caps = 1;
667 req->r_args.setxattr.flags = cpu_to_le32(flags);
668 req->r_path2 = kstrdup(name, GFP_NOFS);
669
670 req->r_pages = pages;
671 req->r_num_pages = nr_pages;
672 req->r_data_len = size;
673
674 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
675 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
676 ceph_mdsc_put_request(req);
677 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
678
679out:
680 if (pages) {
681 for (i = 0; i < nr_pages; i++)
682 __free_page(pages[i]);
683 kfree(pages);
684 }
685 return err;
686}
687
688int ceph_setxattr(struct dentry *dentry, const char *name,
689 const void *value, size_t size, int flags)
690{
691 struct inode *inode = dentry->d_inode;
692 struct ceph_inode_info *ci = ceph_inode(inode);
693 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
694 int err;
695 int name_len = strlen(name);
696 int val_len = size;
697 char *newname = NULL;
698 char *newval = NULL;
699 struct ceph_inode_xattr *xattr = NULL;
700 int issued;
701 int required_blob_size;
702
703 if (ceph_snap(inode) != CEPH_NOSNAP)
704 return -EROFS;
705
706 if (!ceph_is_valid_xattr(name))
707 return -EOPNOTSUPP;
708
709 if (vxattrs) {
710 struct ceph_vxattr_cb *vxattr =
711 ceph_match_vxattr(vxattrs, name);
712 if (vxattr && vxattr->readonly)
713 return -EOPNOTSUPP;
714 }
715
716 /* preallocate memory for xattr name, value, index node */
717 err = -ENOMEM;
718 newname = kmalloc(name_len + 1, GFP_NOFS);
719 if (!newname)
720 goto out;
721 memcpy(newname, name, name_len + 1);
722
723 if (val_len) {
724 newval = kmalloc(val_len + 1, GFP_NOFS);
725 if (!newval)
726 goto out;
727 memcpy(newval, value, val_len);
728 newval[val_len] = '\0';
729 }
730
731 xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
732 if (!xattr)
733 goto out;
734
735 spin_lock(&inode->i_lock);
736retry:
737 issued = __ceph_caps_issued(ci, NULL);
738 if (!(issued & CEPH_CAP_XATTR_EXCL))
739 goto do_sync;
740 __build_xattrs(inode);
741
742 required_blob_size = __get_required_blob_size(ci, name_len, val_len);
743
744 if (!ci->i_xattrs.prealloc_blob ||
745 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
746 struct ceph_buffer *blob = NULL;
747
748 spin_unlock(&inode->i_lock);
749		dout(" preallocating new blob size=%d\n", required_blob_size);
750 blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
751 if (!blob)
752 goto out;
753 spin_lock(&inode->i_lock);
754 if (ci->i_xattrs.prealloc_blob)
755 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
756 ci->i_xattrs.prealloc_blob = blob;
757 goto retry;
758 }
759
760 dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
761 err = __set_xattr(ci, newname, name_len, newval,
762 val_len, 1, 1, 1, &xattr);
763 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
764 ci->i_xattrs.dirty = true;
765 inode->i_ctime = CURRENT_TIME;
766 spin_unlock(&inode->i_lock);
767
768 return err;
769
770do_sync:
771 spin_unlock(&inode->i_lock);
772 err = ceph_sync_setxattr(dentry, name, value, size, flags);
773out:
774 kfree(newname);
775 kfree(newval);
776 kfree(xattr);
777 return err;
778}
779
780static int ceph_send_removexattr(struct dentry *dentry, const char *name)
781{
782 struct ceph_client *client = ceph_client(dentry->d_sb);
783 struct ceph_mds_client *mdsc = &client->mdsc;
784 struct inode *inode = dentry->d_inode;
785 struct inode *parent_inode = dentry->d_parent->d_inode;
786 struct ceph_mds_request *req;
787 int err;
788
789 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
790 USE_AUTH_MDS);
791 if (IS_ERR(req))
792 return PTR_ERR(req);
793 req->r_inode = igrab(inode);
794 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
795 req->r_num_caps = 1;
796 req->r_path2 = kstrdup(name, GFP_NOFS);
797
798 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
799 ceph_mdsc_put_request(req);
800 return err;
801}
802
803int ceph_removexattr(struct dentry *dentry, const char *name)
804{
805 struct inode *inode = dentry->d_inode;
806 struct ceph_inode_info *ci = ceph_inode(inode);
807 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
808 int issued;
809 int err;
810
811 if (ceph_snap(inode) != CEPH_NOSNAP)
812 return -EROFS;
813
814 if (!ceph_is_valid_xattr(name))
815 return -EOPNOTSUPP;
816
817 if (vxattrs) {
818 struct ceph_vxattr_cb *vxattr =
819 ceph_match_vxattr(vxattrs, name);
820 if (vxattr && vxattr->readonly)
821 return -EOPNOTSUPP;
822 }
823
824 spin_lock(&inode->i_lock);
825 __build_xattrs(inode);
826 issued = __ceph_caps_issued(ci, NULL);
827 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
828
829 if (!(issued & CEPH_CAP_XATTR_EXCL))
830 goto do_sync;
831
832 err = __remove_xattr_by_name(ceph_inode(inode), name);
833 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
834 ci->i_xattrs.dirty = true;
835 inode->i_ctime = CURRENT_TIME;
836
837 spin_unlock(&inode->i_lock);
838
839 return err;
840do_sync:
841 spin_unlock(&inode->i_lock);
842 err = ceph_send_removexattr(dentry, name);
843 return err;
844}
845
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 094ea65afc85..bc0025cdd1c9 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,8 @@
1Version 1.62
2------------
3Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened
4to more strictly handle corrupt frames.
5
1Version 1.61 6Version 1.61
2------------ 7------------
3Fix append problem to Samba servers (files opened with O_APPEND could 8Fix append problem to Samba servers (files opened with O_APPEND could
@@ -5,7 +10,9 @@ have duplicated data). Fix oops in cifs_lookup. Workaround problem
5mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session. 10mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session.
6Disable use of server inode numbers when server only 11Disable use of server inode numbers when server only
7partially supports them (e.g. for one server querying inode numbers on 12partially supports them (e.g. for one server querying inode numbers on
8FindFirst fails but QPathInfo queries works). 13FindFirst fails but QPathInfo queries works). Fix oops with dfs in
14cifs_put_smb_ses. Fix mmap to work on directio mounts (needed
15for OpenOffice when on forcedirectio mount e.g.)
9 16
10Version 1.60 17Version 1.60
11------------- 18-------------
diff --git a/fs/cifs/README b/fs/cifs/README
index 79c1a93400be..a727b7cb075f 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -423,7 +423,7 @@ A partial list of the supported mount options follows:
423 source name to use to represent the client netbios machine 423 source name to use to represent the client netbios machine
424 name when doing the RFC1001 netbios session initialize. 424 name when doing the RFC1001 netbios session initialize.
425 direct Do not do inode data caching on files opened on this mount. 425 direct Do not do inode data caching on files opened on this mount.
426 This precludes mmaping files on this mount. In some cases 426 This precludes mmapping files on this mount. In some cases
427 with fast networks and little or no caching benefits on the 427 with fast networks and little or no caching benefits on the
428 client (e.g. when the application is doing large sequential 428 client (e.g. when the application is doing large sequential
429 reads bigger than page size without rereading the same data) 429 reads bigger than page size without rereading the same data)
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 20692fbfdb24..a20bea598933 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -136,7 +136,7 @@ asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val)
136 return 0; 136 return 0;
137 } 137 }
138 138
139 ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to lenght octet */ 139 ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to length octet */
140 if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */ 140 if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */
141 *val = *(++(ctx->pointer)); /* value has enum value */ 141 *val = *(++(ctx->pointer)); /* value has enum value */
142 else 142 else
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index fea9e898c4ba..78e4d2a3a68b 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -15,6 +15,7 @@
15#include <linux/dcache.h> 15#include <linux/dcache.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
18#include <linux/vfs.h> 19#include <linux/vfs.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
20#include "cifsglob.h" 21#include "cifsglob.h"
@@ -54,7 +55,7 @@ void cifs_dfs_release_automount_timer(void)
54 * Extracts sharename form full UNC. 55 * Extracts sharename form full UNC.
55 * i.e. strips from UNC trailing path that is not part of share 56 * i.e. strips from UNC trailing path that is not part of share
56 * name and fixup missing '\' in the begining of DFS node refferal 57 * name and fixup missing '\' in the begining of DFS node refferal
57 * if neccessary. 58 * if necessary.
58 * Returns pointer to share name on success or ERR_PTR on error. 59 * Returns pointer to share name on success or ERR_PTR on error.
59 * Caller is responsible for freeing returned string. 60 * Caller is responsible for freeing returned string.
60 */ 61 */
@@ -269,7 +270,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
269 int err; 270 int err;
270 271
271 mntget(newmnt); 272 mntget(newmnt);
272 err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags, mntlist); 273 err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags | MNT_SHRINKABLE, mntlist);
273 switch (err) { 274 switch (err) {
274 case 0: 275 case 0:
275 path_put(&nd->path); 276 path_put(&nd->path);
@@ -371,7 +372,6 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
371 if (IS_ERR(mnt)) 372 if (IS_ERR(mnt))
372 goto out_err; 373 goto out_err;
373 374
374 nd->path.mnt->mnt_flags |= MNT_SHRINKABLE;
375 rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list); 375 rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list);
376 376
377out: 377out:
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 8ec7736ce954..310d12f69a92 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/slab.h>
23#include <linux/string.h> 24#include <linux/string.h>
24#include <keys/user-type.h> 25#include <keys/user-type.h>
25#include <linux/key-type.h> 26#include <linux/key-type.h>
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 714a542cbafc..d07676bd76d2 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -19,6 +19,7 @@
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/slab.h>
22#include "cifs_unicode.h" 23#include "cifs_unicode.h"
23#include "cifs_uniupr.h" 24#include "cifs_uniupr.h"
24#include "cifspdu.h" 25#include "cifspdu.h"
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 7dfe0842a6f6..9b716d044bbd 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include <linux/fs.h> 24#include <linux/fs.h>
25#include <linux/slab.h>
25#include "cifspdu.h" 26#include "cifspdu.h"
26#include "cifsglob.h" 27#include "cifsglob.h"
27#include "cifsacl.h" 28#include "cifsacl.h"
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7efe1745494d..fbe986430d0c 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/slab.h>
23#include "cifspdu.h" 24#include "cifspdu.h"
24#include "cifsglob.h" 25#include "cifsglob.h"
25#include "cifs_debug.h" 26#include "cifs_debug.h"
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 29f1da761bbf..5183bc2a1916 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -312,6 +312,7 @@ cifs_alloc_inode(struct super_block *sb)
312 cifs_inode->clientCanCacheRead = false; 312 cifs_inode->clientCanCacheRead = false;
313 cifs_inode->clientCanCacheAll = false; 313 cifs_inode->clientCanCacheAll = false;
314 cifs_inode->delete_pending = false; 314 cifs_inode->delete_pending = false;
315 cifs_inode->invalid_mapping = false;
315 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 316 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
316 cifs_inode->server_eof = 0; 317 cifs_inode->server_eof = 0;
317 318
@@ -638,7 +639,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
638 setting the revalidate time to zero */ 639 setting the revalidate time to zero */
639 CIFS_I(file->f_path.dentry->d_inode)->time = 0; 640 CIFS_I(file->f_path.dentry->d_inode)->time = 0;
640 641
641 retval = cifs_revalidate(file->f_path.dentry); 642 retval = cifs_revalidate_file(file);
642 if (retval < 0) 643 if (retval < 0)
643 return (loff_t)retval; 644 return (loff_t)retval;
644 } 645 }
@@ -758,7 +759,7 @@ const struct file_operations cifs_file_ops = {
758}; 759};
759 760
760const struct file_operations cifs_file_direct_ops = { 761const struct file_operations cifs_file_direct_ops = {
761 /* no mmap, no aio, no readv - 762 /* no aio, no readv -
762 BB reevaluate whether they can be done with directio, no cache */ 763 BB reevaluate whether they can be done with directio, no cache */
763 .read = cifs_user_read, 764 .read = cifs_user_read,
764 .write = cifs_user_write, 765 .write = cifs_user_write,
@@ -767,6 +768,7 @@ const struct file_operations cifs_file_direct_ops = {
767 .lock = cifs_lock, 768 .lock = cifs_lock,
768 .fsync = cifs_fsync, 769 .fsync = cifs_fsync,
769 .flush = cifs_flush, 770 .flush = cifs_flush,
771 .mmap = cifs_file_mmap,
770 .splice_read = generic_file_splice_read, 772 .splice_read = generic_file_splice_read,
771#ifdef CONFIG_CIFS_POSIX 773#ifdef CONFIG_CIFS_POSIX
772 .unlocked_ioctl = cifs_ioctl, 774 .unlocked_ioctl = cifs_ioctl,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index ac2b24c192f8..7aa57ecdc437 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,7 +61,8 @@ extern int cifs_mkdir(struct inode *, struct dentry *, int);
61extern int cifs_rmdir(struct inode *, struct dentry *); 61extern int cifs_rmdir(struct inode *, struct dentry *);
62extern int cifs_rename(struct inode *, struct dentry *, struct inode *, 62extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
63 struct dentry *); 63 struct dentry *);
64extern int cifs_revalidate(struct dentry *); 64extern int cifs_revalidate_file(struct file *filp);
65extern int cifs_revalidate_dentry(struct dentry *);
65extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 66extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
66extern int cifs_setattr(struct dentry *, struct iattr *); 67extern int cifs_setattr(struct dentry *, struct iattr *);
67 68
@@ -113,5 +114,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
113extern const struct export_operations cifs_export_ops; 114extern const struct export_operations cifs_export_ops;
114#endif /* EXPERIMENTAL */ 115#endif /* EXPERIMENTAL */
115 116
116#define CIFS_VERSION "1.61" 117#define CIFS_VERSION "1.62"
117#endif /* _CIFSFS_H */ 118#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 5d0fde18039c..ecf0ffbe2b64 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -18,6 +18,7 @@
18 */ 18 */
19#include <linux/in.h> 19#include <linux/in.h>
20#include <linux/in6.h> 20#include <linux/in6.h>
21#include <linux/slab.h>
21#include <linux/slow-work.h> 22#include <linux/slow-work.h>
22#include "cifs_fs_sb.h" 23#include "cifs_fs_sb.h"
23#include "cifsacl.h" 24#include "cifsacl.h"
@@ -39,7 +40,7 @@
39 40
40/* 41/*
41 * MAX_REQ is the maximum number of requests that WE will send 42 * MAX_REQ is the maximum number of requests that WE will send
42 * on one socket concurently. It also matches the most common 43 * on one socket concurrently. It also matches the most common
43 * value of max multiplex returned by servers. We may 44 * value of max multiplex returned by servers. We may
44 * eventually want to use the negotiated value (in case 45 * eventually want to use the negotiated value (in case
45 * future servers can handle more) when we are more confident that 46 * future servers can handle more) when we are more confident that
@@ -149,6 +150,7 @@ struct TCP_Server_Info {
149 bool svlocal:1; /* local server or remote */ 150 bool svlocal:1; /* local server or remote */
150 bool noblocksnd; /* use blocking sendmsg */ 151 bool noblocksnd; /* use blocking sendmsg */
151 bool noautotune; /* do not autotune send buf sizes */ 152 bool noautotune; /* do not autotune send buf sizes */
153 bool tcp_nodelay;
152 atomic_t inFlight; /* number of requests on the wire to server */ 154 atomic_t inFlight; /* number of requests on the wire to server */
153#ifdef CONFIG_CIFS_STATS2 155#ifdef CONFIG_CIFS_STATS2
154 atomic_t inSend; /* requests trying to send */ 156 atomic_t inSend; /* requests trying to send */
@@ -204,7 +206,7 @@ struct cifsUidInfo {
204struct cifsSesInfo { 206struct cifsSesInfo {
205 struct list_head smb_ses_list; 207 struct list_head smb_ses_list;
206 struct list_head tcon_list; 208 struct list_head tcon_list;
207 struct semaphore sesSem; 209 struct mutex session_mutex;
208#if 0 210#if 0
209 struct cifsUidInfo *uidInfo; /* pointer to user info */ 211 struct cifsUidInfo *uidInfo; /* pointer to user info */
210#endif 212#endif
@@ -388,6 +390,7 @@ struct cifsInodeInfo {
388 bool clientCanCacheRead:1; /* read oplock */ 390 bool clientCanCacheRead:1; /* read oplock */
389 bool clientCanCacheAll:1; /* read and writebehind oplock */ 391 bool clientCanCacheAll:1; /* read and writebehind oplock */
390 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 392 bool delete_pending:1; /* DELETE_ON_CLOSE is set */
393 bool invalid_mapping:1; /* pagecache is invalid */
391 u64 server_eof; /* current file size on server */ 394 u64 server_eof; /* current file size on server */
392 u64 uniqueid; /* server inode number */ 395 u64 uniqueid; /* server inode number */
393 struct inode vfs_inode; 396 struct inode vfs_inode;
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 2d07f890a842..14d036d8db11 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -415,10 +415,10 @@ struct smb_hdr {
415 __u8 WordCount; 415 __u8 WordCount;
416} __attribute__((packed)); 416} __attribute__((packed));
417/* given a pointer to an smb_hdr retrieve the value of byte count */ 417/* given a pointer to an smb_hdr retrieve the value of byte count */
418#define BCC(smb_var) (*(__u16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) 418#define BCC(smb_var) (*(__u16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount)))
419#define BCC_LE(smb_var) (*(__le16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) 419#define BCC_LE(smb_var) (*(__le16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount)))
420/* given a pointer to an smb_hdr retrieve the pointer to the byte area */ 420/* given a pointer to an smb_hdr retrieve the pointer to the byte area */
421#define pByteArea(smb_var) ((unsigned char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount) + 2) 421#define pByteArea(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount) + 2)
422 422
423/* 423/*
424 * Computer Name Length (since Netbios name was length 16 with last byte 0x20) 424 * Computer Name Length (since Netbios name was length 16 with last byte 0x20)
@@ -1227,7 +1227,7 @@ typedef struct smb_com_setattr_rsp {
1227/* empty wct response to setattr */ 1227/* empty wct response to setattr */
1228 1228
1229/*******************************************************/ 1229/*******************************************************/
1230/* NT Transact structure defintions follow */ 1230/* NT Transact structure definitions follow */
1231/* Currently only ioctl, acl (get security descriptor) */ 1231/* Currently only ioctl, acl (get security descriptor) */
1232/* and notify are implemented */ 1232/* and notify are implemented */
1233/*******************************************************/ 1233/*******************************************************/
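
The BCC()/BCC_LE()/pByteArea() hunk above is plain macro hygiene: every use of the argument is now wrapped as (smb_var). A hypothetical illustration, not from the patch, of why the old form only worked for a bare identifier:

	/* old expansion of BCC(psmb + 1) -- 'psmb' is made up here:
	 *	(*(__u16 *)((char *)psmb + 1 + sizeof(struct smb_hdr)
	 *			+ (2 * psmb + 1->WordCount)))
	 * the cast and the '->' capture only part of the argument, so any
	 * compound expression mis-parses or fails to compile; with the
	 * added parentheses the whole argument stays one operand:
	 *	(*(__u16 *)((char *)(psmb + 1) + sizeof(struct smb_hdr)
	 *			+ (2 * (psmb + 1)->WordCount)))
	 */
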
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 5646727e33f5..39e47f46dea5 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -104,10 +104,12 @@ extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
104extern struct inode *cifs_iget(struct super_block *sb, 104extern struct inode *cifs_iget(struct super_block *sb,
105 struct cifs_fattr *fattr); 105 struct cifs_fattr *fattr);
106 106
107extern int cifs_get_file_info(struct file *filp);
107extern int cifs_get_inode_info(struct inode **pinode, 108extern int cifs_get_inode_info(struct inode **pinode,
108 const unsigned char *search_path, 109 const unsigned char *search_path,
109 FILE_ALL_INFO *pfile_info, 110 FILE_ALL_INFO *pfile_info,
110 struct super_block *sb, int xid, const __u16 *pfid); 111 struct super_block *sb, int xid, const __u16 *pfid);
112extern int cifs_get_file_info_unix(struct file *filp);
111extern int cifs_get_inode_info_unix(struct inode **pinode, 113extern int cifs_get_inode_info_unix(struct inode **pinode,
112 const unsigned char *search_path, 114 const unsigned char *search_path,
113 struct super_block *sb, int xid); 115 struct super_block *sb, int xid);
@@ -142,6 +144,8 @@ extern int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
142extern int CIFSFindClose(const int, struct cifsTconInfo *tcon, 144extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
143 const __u16 search_handle); 145 const __u16 search_handle);
144 146
147extern int CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
148 u16 netfid, FILE_ALL_INFO *pFindData);
145extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon, 149extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
146 const unsigned char *searchName, 150 const unsigned char *searchName,
147 FILE_ALL_INFO *findData, 151 FILE_ALL_INFO *findData,
@@ -152,6 +156,8 @@ extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
152 FILE_ALL_INFO *findData, 156 FILE_ALL_INFO *findData,
153 const struct nls_table *nls_codepage, int remap); 157 const struct nls_table *nls_codepage, int remap);
154 158
159extern int CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
160 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData);
155extern int CIFSSMBUnixQPathInfo(const int xid, 161extern int CIFSSMBUnixQPathInfo(const int xid,
156 struct cifsTconInfo *tcon, 162 struct cifsTconInfo *tcon,
157 const unsigned char *searchName, 163 const unsigned char *searchName,
@@ -363,13 +369,10 @@ extern int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
363 __u32 filter, struct file *file, int multishot, 369 __u32 filter, struct file *file, int multishot,
364 const struct nls_table *nls_codepage); 370 const struct nls_table *nls_codepage);
365extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon, 371extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
366 const unsigned char *searchName, char *EAData, 372 const unsigned char *searchName,
373 const unsigned char *ea_name, char *EAData,
367 size_t bufsize, const struct nls_table *nls_codepage, 374 size_t bufsize, const struct nls_table *nls_codepage,
368 int remap_special_chars); 375 int remap_special_chars);
369extern ssize_t CIFSSMBQueryEA(const int xid, struct cifsTconInfo *tcon,
370 const unsigned char *searchName, const unsigned char *ea_name,
371 unsigned char *ea_value, size_t buf_size,
372 const struct nls_table *nls_codepage, int remap_special_chars);
373extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon, 376extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon,
374 const char *fileName, const char *ea_name, 377 const char *fileName, const char *ea_name,
375 const void *ea_value, const __u16 ea_value_len, 378 const void *ea_value, const __u16 ea_value_len,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 941441d3e386..3f4fbd670507 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -30,6 +30,7 @@
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/kernel.h> 31#include <linux/kernel.h>
32#include <linux/vfs.h> 32#include <linux/vfs.h>
33#include <linux/slab.h>
33#include <linux/posix_acl_xattr.h> 34#include <linux/posix_acl_xattr.h>
34#include <asm/uaccess.h> 35#include <asm/uaccess.h>
35#include "cifspdu.h" 36#include "cifspdu.h"
@@ -170,19 +171,19 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
170 * need to prevent multiple threads trying to simultaneously 171 * need to prevent multiple threads trying to simultaneously
171 * reconnect the same SMB session 172 * reconnect the same SMB session
172 */ 173 */
173 down(&ses->sesSem); 174 mutex_lock(&ses->session_mutex);
174 if (ses->need_reconnect) 175 if (ses->need_reconnect)
175 rc = cifs_setup_session(0, ses, nls_codepage); 176 rc = cifs_setup_session(0, ses, nls_codepage);
176 177
177 /* do we need to reconnect tcon? */ 178 /* do we need to reconnect tcon? */
178 if (rc || !tcon->need_reconnect) { 179 if (rc || !tcon->need_reconnect) {
179 up(&ses->sesSem); 180 mutex_unlock(&ses->session_mutex);
180 goto out; 181 goto out;
181 } 182 }
182 183
183 mark_open_files_invalid(tcon); 184 mark_open_files_invalid(tcon);
184 rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); 185 rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
185 up(&ses->sesSem); 186 mutex_unlock(&ses->session_mutex);
186 cFYI(1, ("reconnect tcon rc = %d", rc)); 187 cFYI(1, ("reconnect tcon rc = %d", rc));
187 188
188 if (rc) 189 if (rc)
@@ -500,7 +501,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
500 } else if (pSMBr->hdr.WordCount == 13) { 501 } else if (pSMBr->hdr.WordCount == 13) {
501 cERROR(1, ("mount failed, cifs module not built " 502 cERROR(1, ("mount failed, cifs module not built "
502 "with CIFS_WEAK_PW_HASH support")); 503 "with CIFS_WEAK_PW_HASH support"));
503 rc = -EOPNOTSUPP; 504 rc = -EOPNOTSUPP;
504#endif /* WEAK_PW_HASH */ 505#endif /* WEAK_PW_HASH */
505 goto neg_err_exit; 506 goto neg_err_exit;
506 } else if (pSMBr->hdr.WordCount != 17) { 507 } else if (pSMBr->hdr.WordCount != 17) {
@@ -700,13 +701,13 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
700 if (!ses || !ses->server) 701 if (!ses || !ses->server)
701 return -EIO; 702 return -EIO;
702 703
703 down(&ses->sesSem); 704 mutex_lock(&ses->session_mutex);
704 if (ses->need_reconnect) 705 if (ses->need_reconnect)
705 goto session_already_dead; /* no need to send SMBlogoff if uid 706 goto session_already_dead; /* no need to send SMBlogoff if uid
706 already closed due to reconnect */ 707 already closed due to reconnect */
707 rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB); 708 rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB);
708 if (rc) { 709 if (rc) {
709 up(&ses->sesSem); 710 mutex_unlock(&ses->session_mutex);
710 return rc; 711 return rc;
711 } 712 }
712 713
@@ -721,7 +722,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
721 pSMB->AndXCommand = 0xFF; 722 pSMB->AndXCommand = 0xFF;
722 rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0); 723 rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0);
723session_already_dead: 724session_already_dead:
724 up(&ses->sesSem); 725 mutex_unlock(&ses->session_mutex);
725 726
726 /* if session dead then we do not need to do ulogoff, 727 /* if session dead then we do not need to do ulogoff,
727 since server closed smb session, no sense reporting 728 since server closed smb session, no sense reporting
@@ -3230,8 +3231,72 @@ QInfRetry:
3230 return rc; 3231 return rc;
3231} 3232}
3232 3233
3234int
3235CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
3236 u16 netfid, FILE_ALL_INFO *pFindData)
3237{
3238 struct smb_t2_qfi_req *pSMB = NULL;
3239 struct smb_t2_qfi_rsp *pSMBr = NULL;
3240 int rc = 0;
3241 int bytes_returned;
3242 __u16 params, byte_count;
3243
3244QFileInfoRetry:
3245 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3246 (void **) &pSMBr);
3247 if (rc)
3248 return rc;
3249
3250 params = 2 /* level */ + 2 /* fid */;
3251 pSMB->t2.TotalDataCount = 0;
3252 pSMB->t2.MaxParameterCount = cpu_to_le16(4);
3253 /* BB find exact max data count below from sess structure BB */
3254 pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
3255 pSMB->t2.MaxSetupCount = 0;
3256 pSMB->t2.Reserved = 0;
3257 pSMB->t2.Flags = 0;
3258 pSMB->t2.Timeout = 0;
3259 pSMB->t2.Reserved2 = 0;
3260 pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
3261 Fid) - 4);
3262 pSMB->t2.DataCount = 0;
3263 pSMB->t2.DataOffset = 0;
3264 pSMB->t2.SetupCount = 1;
3265 pSMB->t2.Reserved3 = 0;
3266 pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
3267 byte_count = params + 1 /* pad */ ;
3268 pSMB->t2.TotalParameterCount = cpu_to_le16(params);
3269 pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
3270 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
3271 pSMB->Pad = 0;
3272 pSMB->Fid = netfid;
3273 pSMB->hdr.smb_buf_length += byte_count;
3274
3275 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3276 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3277 if (rc) {
3278 cFYI(1, ("Send error in QFileInfo = %d", rc));
3279 } else { /* decode response */
3280 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3233 3281
3282 if (rc) /* BB add auto retry on EOPNOTSUPP? */
3283 rc = -EIO;
3284 else if (pSMBr->ByteCount < 40)
3285 rc = -EIO; /* bad smb */
3286 else if (pFindData) {
3287 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
3288 memcpy((char *) pFindData,
3289 (char *) &pSMBr->hdr.Protocol +
3290 data_offset, sizeof(FILE_ALL_INFO));
3291 } else
3292 rc = -ENOMEM;
3293 }
3294 cifs_buf_release(pSMB);
3295 if (rc == -EAGAIN)
3296 goto QFileInfoRetry;
3234 3297
3298 return rc;
3299}
3235 3300
3236int 3301int
3237CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon, 3302CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
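
CIFSSMBQFileInfo() above mirrors the path-based CIFSSMBQPathInfo() that follows, but drives TRANS2_QUERY_FILE_INFORMATION off an already-open netfid, so no path string has to be rebuilt or re-looked-up. A hedged sketch of the intended caller pattern -- the real consumer is cifs_get_file_info() in the fs/cifs/inode.c hunks further down; locals here are illustrative:

	FILE_ALL_INFO find_data;
	struct cifsFileInfo *cfile = filp->private_data;
	int rc;

	/* query attributes by open handle rather than by path */
	rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
	if (rc == 0)
		cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
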
@@ -3335,6 +3400,75 @@ QPathInfoRetry:
3335} 3400}
3336 3401
3337int 3402int
3403CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
3404 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData)
3405{
3406 struct smb_t2_qfi_req *pSMB = NULL;
3407 struct smb_t2_qfi_rsp *pSMBr = NULL;
3408 int rc = 0;
3409 int bytes_returned;
3410 __u16 params, byte_count;
3411
3412UnixQFileInfoRetry:
3413 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3414 (void **) &pSMBr);
3415 if (rc)
3416 return rc;
3417
3418 params = 2 /* level */ + 2 /* fid */;
3419 pSMB->t2.TotalDataCount = 0;
3420 pSMB->t2.MaxParameterCount = cpu_to_le16(4);
3421 /* BB find exact max data count below from sess structure BB */
3422 pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
3423 pSMB->t2.MaxSetupCount = 0;
3424 pSMB->t2.Reserved = 0;
3425 pSMB->t2.Flags = 0;
3426 pSMB->t2.Timeout = 0;
3427 pSMB->t2.Reserved2 = 0;
3428 pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
3429 Fid) - 4);
3430 pSMB->t2.DataCount = 0;
3431 pSMB->t2.DataOffset = 0;
3432 pSMB->t2.SetupCount = 1;
3433 pSMB->t2.Reserved3 = 0;
3434 pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
3435 byte_count = params + 1 /* pad */ ;
3436 pSMB->t2.TotalParameterCount = cpu_to_le16(params);
3437 pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
3438 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
3439 pSMB->Pad = 0;
3440 pSMB->Fid = netfid;
3441 pSMB->hdr.smb_buf_length += byte_count;
3442
3443 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3444 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3445 if (rc) {
3446 cFYI(1, ("Send error in UnixQFileInfo = %d", rc));
3447 } else { /* decode response */
3448 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3449
3450 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
3451 cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n"
3452 "Unix Extensions can be disabled on mount "
3453 "by specifying the nosfu mount option."));
3454 rc = -EIO; /* bad smb */
3455 } else {
3456 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
3457 memcpy((char *) pFindData,
3458 (char *) &pSMBr->hdr.Protocol +
3459 data_offset,
3460 sizeof(FILE_UNIX_BASIC_INFO));
3461 }
3462 }
3463
3464 cifs_buf_release(pSMB);
3465 if (rc == -EAGAIN)
3466 goto UnixQFileInfoRetry;
3467
3468 return rc;
3469}
3470
3471int
3338CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon, 3472CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
3339 const unsigned char *searchName, 3473 const unsigned char *searchName,
3340 FILE_UNIX_BASIC_INFO *pFindData, 3474 FILE_UNIX_BASIC_INFO *pFindData,
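
The Unix-extensions variant above repeats the same TRANS2 setup; the functional differences are confined to the information level and the minimum-response check:

	/* CIFSSMBQFileInfo:     SMB_QUERY_FILE_ALL_INFO,
	 *                       rejects replies with ByteCount < 40
	 * CIFSSMBUnixQFileInfo: SMB_QUERY_FILE_UNIX_BASIC,
	 *                       rejects ByteCount < sizeof(FILE_UNIX_BASIC_INFO) */
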
@@ -3886,7 +4020,7 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
3886 goto parse_DFS_referrals_exit; 4020 goto parse_DFS_referrals_exit;
3887 } 4021 }
3888 4022
3889 /* collect neccessary data from referrals */ 4023 /* collect necessary data from referrals */
3890 for (i = 0; i < *num_of_nodes; i++) { 4024 for (i = 0; i < *num_of_nodes; i++) {
3891 char *temp; 4025 char *temp;
3892 int max_len; 4026 int max_len;
@@ -5269,22 +5403,34 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
5269 cifs_buf_release(pSMB); 5403 cifs_buf_release(pSMB);
5270 return rc; 5404 return rc;
5271} 5405}
5406
5272#ifdef CONFIG_CIFS_XATTR 5407#ifdef CONFIG_CIFS_XATTR
5408/*
5409 * Do a path-based QUERY_ALL_EAS call and parse the result. This is a common
5410 * function used by listxattr and getxattr type calls. When ea_name is set,
5411 * it looks for that attribute name and stuffs that value into the EAData
5412 * buffer. When ea_name is NULL, it stuffs a list of attribute names into the
5413 * buffer. In both cases, the return value is either the length of the
5414 * resulting data or a negative error code. If EAData is a NULL pointer then
5415 * the data isn't copied to it, but the length is returned.
5416 */
5273ssize_t 5417ssize_t
5274CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon, 5418CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
5275 const unsigned char *searchName, 5419 const unsigned char *searchName, const unsigned char *ea_name,
5276 char *EAData, size_t buf_size, 5420 char *EAData, size_t buf_size,
5277 const struct nls_table *nls_codepage, int remap) 5421 const struct nls_table *nls_codepage, int remap)
5278{ 5422{
5279 /* BB assumes one setup word */ 5423 /* BB assumes one setup word */
5280 TRANSACTION2_QPI_REQ *pSMB = NULL; 5424 TRANSACTION2_QPI_REQ *pSMB = NULL;
5281 TRANSACTION2_QPI_RSP *pSMBr = NULL; 5425 TRANSACTION2_QPI_RSP *pSMBr = NULL;
5282 int rc = 0; 5426 int rc = 0;
5283 int bytes_returned; 5427 int bytes_returned;
5284 int name_len; 5428 int list_len;
5429 struct fealist *ea_response_data;
5285 struct fea *temp_fea; 5430 struct fea *temp_fea;
5286 char *temp_ptr; 5431 char *temp_ptr;
5287 __u16 params, byte_count; 5432 char *end_of_smb;
5433 __u16 params, byte_count, data_offset;
5288 5434
5289 cFYI(1, ("In Query All EAs path %s", searchName)); 5435 cFYI(1, ("In Query All EAs path %s", searchName));
5290QAllEAsRetry: 5436QAllEAsRetry:
@@ -5294,22 +5440,22 @@ QAllEAsRetry:
5294 return rc; 5440 return rc;
5295 5441
5296 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 5442 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
5297 name_len = 5443 list_len =
5298 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 5444 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
5299 PATH_MAX, nls_codepage, remap); 5445 PATH_MAX, nls_codepage, remap);
5300 name_len++; /* trailing null */ 5446 list_len++; /* trailing null */
5301 name_len *= 2; 5447 list_len *= 2;
5302 } else { /* BB improve the check for buffer overruns BB */ 5448 } else { /* BB improve the check for buffer overruns BB */
5303 name_len = strnlen(searchName, PATH_MAX); 5449 list_len = strnlen(searchName, PATH_MAX);
5304 name_len++; /* trailing null */ 5450 list_len++; /* trailing null */
5305 strncpy(pSMB->FileName, searchName, name_len); 5451 strncpy(pSMB->FileName, searchName, list_len);
5306 } 5452 }
5307 5453
5308 params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; 5454 params = 2 /* level */ + 4 /* reserved */ + list_len /* includes NUL */;
5309 pSMB->TotalDataCount = 0; 5455 pSMB->TotalDataCount = 0;
5310 pSMB->MaxParameterCount = cpu_to_le16(2); 5456 pSMB->MaxParameterCount = cpu_to_le16(2);
5311 /* BB find exact max SMB PDU from sess structure BB */ 5457 /* BB find exact max SMB PDU from sess structure BB */
5312 pSMB->MaxDataCount = cpu_to_le16(4000); 5458 pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
5313 pSMB->MaxSetupCount = 0; 5459 pSMB->MaxSetupCount = 0;
5314 pSMB->Reserved = 0; 5460 pSMB->Reserved = 0;
5315 pSMB->Flags = 0; 5461 pSMB->Flags = 0;
@@ -5334,237 +5480,117 @@ QAllEAsRetry:
5334 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5480 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5335 if (rc) { 5481 if (rc) {
5336 cFYI(1, ("Send error in QueryAllEAs = %d", rc)); 5482 cFYI(1, ("Send error in QueryAllEAs = %d", rc));
5337 } else { /* decode response */ 5483 goto QAllEAsOut;
5338 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 5484 }
5339 5485
5340 /* BB also check enough total bytes returned */ 5486
5341 /* BB we need to improve the validity checking 5487 /* BB also check enough total bytes returned */
5342 of these trans2 responses */ 5488 /* BB we need to improve the validity checking
5343 if (rc || (pSMBr->ByteCount < 4)) 5489 of these trans2 responses */
5344 rc = -EIO; /* bad smb */ 5490
5345 /* else if (pFindData){ 5491 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
5346 memcpy((char *) pFindData, 5492 if (rc || (pSMBr->ByteCount < 4)) {
5347 (char *) &pSMBr->hdr.Protocol + 5493 rc = -EIO; /* bad smb */
5348 data_offset, kl); 5494 goto QAllEAsOut;
5349 }*/ else {
5350 /* check that length of list is not more than bcc */
5351 /* check that each entry does not go beyond length
5352 of list */
5353 /* check that each element of each entry does not
5354 go beyond end of list */
5355 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
5356 struct fealist *ea_response_data;
5357 rc = 0;
5358 /* validate_trans2_offsets() */
5359 /* BB check if start of smb + data_offset > &bcc+ bcc */
5360 ea_response_data = (struct fealist *)
5361 (((char *) &pSMBr->hdr.Protocol) +
5362 data_offset);
5363 name_len = le32_to_cpu(ea_response_data->list_len);
5364 cFYI(1, ("ea length %d", name_len));
5365 if (name_len <= 8) {
5366 /* returned EA size zeroed at top of function */
5367 cFYI(1, ("empty EA list returned from server"));
5368 } else {
5369 /* account for ea list len */
5370 name_len -= 4;
5371 temp_fea = ea_response_data->list;
5372 temp_ptr = (char *)temp_fea;
5373 while (name_len > 0) {
5374 __u16 value_len;
5375 name_len -= 4;
5376 temp_ptr += 4;
5377 rc += temp_fea->name_len;
5378 /* account for prefix user. and trailing null */
5379 rc = rc + 5 + 1;
5380 if (rc < (int)buf_size) {
5381 memcpy(EAData, "user.", 5);
5382 EAData += 5;
5383 memcpy(EAData, temp_ptr,
5384 temp_fea->name_len);
5385 EAData += temp_fea->name_len;
5386 /* null terminate name */
5387 *EAData = 0;
5388 EAData = EAData + 1;
5389 } else if (buf_size == 0) {
5390 /* skip copy - calc size only */
5391 } else {
5392 /* stop before overrun buffer */
5393 rc = -ERANGE;
5394 break;
5395 }
5396 name_len -= temp_fea->name_len;
5397 temp_ptr += temp_fea->name_len;
5398 /* account for trailing null */
5399 name_len--;
5400 temp_ptr++;
5401 value_len =
5402 le16_to_cpu(temp_fea->value_len);
5403 name_len -= value_len;
5404 temp_ptr += value_len;
5405 /* BB check that temp_ptr is still
5406 within the SMB BB*/
5407
5408 /* no trailing null to account for
5409 in value len */
5410 /* go on to next EA */
5411 temp_fea = (struct fea *)temp_ptr;
5412 }
5413 }
5414 }
5415 } 5495 }
5416 cifs_buf_release(pSMB);
5417 if (rc == -EAGAIN)
5418 goto QAllEAsRetry;
5419 5496
5420 return (ssize_t)rc; 5497 /* check that length of list is not more than bcc */
5421} 5498 /* check that each entry does not go beyond length
5499 of list */
5500 /* check that each element of each entry does not
5501 go beyond end of list */
5502 /* validate_trans2_offsets() */
5503 /* BB check if start of smb + data_offset > &bcc+ bcc */
5422 5504
5423ssize_t CIFSSMBQueryEA(const int xid, struct cifsTconInfo *tcon, 5505 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
5424 const unsigned char *searchName, const unsigned char *ea_name, 5506 ea_response_data = (struct fealist *)
5425 unsigned char *ea_value, size_t buf_size, 5507 (((char *) &pSMBr->hdr.Protocol) + data_offset);
5426 const struct nls_table *nls_codepage, int remap)
5427{
5428 TRANSACTION2_QPI_REQ *pSMB = NULL;
5429 TRANSACTION2_QPI_RSP *pSMBr = NULL;
5430 int rc = 0;
5431 int bytes_returned;
5432 int name_len;
5433 struct fea *temp_fea;
5434 char *temp_ptr;
5435 __u16 params, byte_count;
5436 5508
5437 cFYI(1, ("In Query EA path %s", searchName)); 5509 list_len = le32_to_cpu(ea_response_data->list_len);
5438QEARetry: 5510 cFYI(1, ("ea length %d", list_len));
5439 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 5511 if (list_len <= 8) {
5440 (void **) &pSMBr); 5512 cFYI(1, ("empty EA list returned from server"));
5441 if (rc) 5513 goto QAllEAsOut;
5442 return rc; 5514 }
5443 5515
5444 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 5516 /* make sure list_len doesn't go past end of SMB */
5445 name_len = 5517 end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr);
5446 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 5518 if ((char *)ea_response_data + list_len > end_of_smb) {
5447 PATH_MAX, nls_codepage, remap); 5519 cFYI(1, ("EA list appears to go beyond SMB"));
5448 name_len++; /* trailing null */ 5520 rc = -EIO;
5449 name_len *= 2; 5521 goto QAllEAsOut;
5450 } else { /* BB improve the check for buffer overruns BB */
5451 name_len = strnlen(searchName, PATH_MAX);
5452 name_len++; /* trailing null */
5453 strncpy(pSMB->FileName, searchName, name_len);
5454 } 5522 }
5455 5523
5456 params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; 5524 /* account for ea list len */
5457 pSMB->TotalDataCount = 0; 5525 list_len -= 4;
5458 pSMB->MaxParameterCount = cpu_to_le16(2); 5526 temp_fea = ea_response_data->list;
5459 /* BB find exact max SMB PDU from sess structure BB */ 5527 temp_ptr = (char *)temp_fea;
5460 pSMB->MaxDataCount = cpu_to_le16(4000); 5528 while (list_len > 0) {
5461 pSMB->MaxSetupCount = 0; 5529 unsigned int name_len;
5462 pSMB->Reserved = 0; 5530 __u16 value_len;
5463 pSMB->Flags = 0; 5531
5464 pSMB->Timeout = 0; 5532 list_len -= 4;
5465 pSMB->Reserved2 = 0; 5533 temp_ptr += 4;
5466 pSMB->ParameterOffset = cpu_to_le16(offsetof( 5534 /* make sure we can read name_len and value_len */
5467 struct smb_com_transaction2_qpi_req, InformationLevel) - 4); 5535 if (list_len < 0) {
5468 pSMB->DataCount = 0; 5536 cFYI(1, ("EA entry goes beyond length of list"));
5469 pSMB->DataOffset = 0; 5537 rc = -EIO;
5470 pSMB->SetupCount = 1; 5538 goto QAllEAsOut;
5471 pSMB->Reserved3 = 0; 5539 }
5472 pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION);
5473 byte_count = params + 1 /* pad */ ;
5474 pSMB->TotalParameterCount = cpu_to_le16(params);
5475 pSMB->ParameterCount = pSMB->TotalParameterCount;
5476 pSMB->InformationLevel = cpu_to_le16(SMB_INFO_QUERY_ALL_EAS);
5477 pSMB->Reserved4 = 0;
5478 pSMB->hdr.smb_buf_length += byte_count;
5479 pSMB->ByteCount = cpu_to_le16(byte_count);
5480 5540
5481 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5541 name_len = temp_fea->name_len;
5482 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5542 value_len = le16_to_cpu(temp_fea->value_len);
5483 if (rc) { 5543 list_len -= name_len + 1 + value_len;
5484 cFYI(1, ("Send error in Query EA = %d", rc)); 5544 if (list_len < 0) {
5485 } else { /* decode response */ 5545 cFYI(1, ("EA entry goes beyond length of list"));
5486 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 5546 rc = -EIO;
5547 goto QAllEAsOut;
5548 }
5487 5549
5488 /* BB also check enough total bytes returned */ 5550 if (ea_name) {
5489 /* BB we need to improve the validity checking 5551 if (strncmp(ea_name, temp_ptr, name_len) == 0) {
5490 of these trans2 responses */ 5552 temp_ptr += name_len + 1;
5491 if (rc || (pSMBr->ByteCount < 4)) 5553 rc = value_len;
5492 rc = -EIO; /* bad smb */ 5554 if (buf_size == 0)
5493 /* else if (pFindData){ 5555 goto QAllEAsOut;
5494 memcpy((char *) pFindData, 5556 if ((size_t)value_len > buf_size) {
5495 (char *) &pSMBr->hdr.Protocol + 5557 rc = -ERANGE;
5496 data_offset, kl); 5558 goto QAllEAsOut;
5497 }*/ else {
5498 /* check that length of list is not more than bcc */
5499 /* check that each entry does not go beyond length
5500 of list */
5501 /* check that each element of each entry does not
5502 go beyond end of list */
5503 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
5504 struct fealist *ea_response_data;
5505 rc = -ENODATA;
5506 /* validate_trans2_offsets() */
5507 /* BB check if start of smb + data_offset > &bcc+ bcc*/
5508 ea_response_data = (struct fealist *)
5509 (((char *) &pSMBr->hdr.Protocol) +
5510 data_offset);
5511 name_len = le32_to_cpu(ea_response_data->list_len);
5512 cFYI(1, ("ea length %d", name_len));
5513 if (name_len <= 8) {
5514 /* returned EA size zeroed at top of function */
5515 cFYI(1, ("empty EA list returned from server"));
5516 } else {
5517 /* account for ea list len */
5518 name_len -= 4;
5519 temp_fea = ea_response_data->list;
5520 temp_ptr = (char *)temp_fea;
5521 /* loop through checking if we have a matching
5522 name and then return the associated value */
5523 while (name_len > 0) {
5524 __u16 value_len;
5525 name_len -= 4;
5526 temp_ptr += 4;
5527 value_len =
5528 le16_to_cpu(temp_fea->value_len);
5529 /* BB validate that value_len falls within SMB,
5530 even though maximum for name_len is 255 */
5531 if (memcmp(temp_fea->name, ea_name,
5532 temp_fea->name_len) == 0) {
5533 /* found a match */
5534 rc = value_len;
5535 /* account for prefix user. and trailing null */
5536 if (rc <= (int)buf_size) {
5537 memcpy(ea_value,
5538 temp_fea->name+temp_fea->name_len+1,
5539 rc);
5540 /* ea values, unlike ea
5541 names, are not null
5542 terminated */
5543 } else if (buf_size == 0) {
5544 /* skip copy - calc size only */
5545 } else {
5546 /* stop before overrun buffer */
5547 rc = -ERANGE;
5548 }
5549 break;
5550 }
5551 name_len -= temp_fea->name_len;
5552 temp_ptr += temp_fea->name_len;
5553 /* account for trailing null */
5554 name_len--;
5555 temp_ptr++;
5556 name_len -= value_len;
5557 temp_ptr += value_len;
5558 /* No trailing null to account for in
5559 value_len. Go on to next EA */
5560 temp_fea = (struct fea *)temp_ptr;
5561 } 5559 }
5560 memcpy(EAData, temp_ptr, value_len);
5561 goto QAllEAsOut;
5562 }
5563 } else {
5564 /* account for prefix user. and trailing null */
5565 rc += (5 + 1 + name_len);
5566 if (rc < (int) buf_size) {
5567 memcpy(EAData, "user.", 5);
5568 EAData += 5;
5569 memcpy(EAData, temp_ptr, name_len);
5570 EAData += name_len;
5571 /* null terminate name */
5572 *EAData = 0;
5573 ++EAData;
5574 } else if (buf_size == 0) {
5575 /* skip copy - calc size only */
5576 } else {
5577 /* stop before overrun buffer */
5578 rc = -ERANGE;
5579 break;
5562 } 5580 }
5563 } 5581 }
5582 temp_ptr += name_len + 1 + value_len;
5583 temp_fea = (struct fea *)temp_ptr;
5564 } 5584 }
5585
5586 /* didn't find the named attribute */
5587 if (ea_name)
5588 rc = -ENODATA;
5589
5590QAllEAsOut:
5565 cifs_buf_release(pSMB); 5591 cifs_buf_release(pSMB);
5566 if (rc == -EAGAIN) 5592 if (rc == -EAGAIN)
5567 goto QEARetry; 5593 goto QAllEAsRetry;
5568 5594
5569 return (ssize_t)rc; 5595 return (ssize_t)rc;
5570} 5596}
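
With CIFSSMBQueryEA() removed, both xattr entry points funnel through the one CIFSSMBQAllEAs() call whose contract is spelled out in the comment block above. A hedged caller-side sketch (buffer names are illustrative):

	ssize_t len;

	/* listxattr mode: ea_name == NULL returns "user."-prefixed,
	 * NUL-separated attribute names; buf_size == 0 just reports the
	 * length needed */
	len = CIFSSMBQAllEAs(xid, tcon, full_path, NULL,
			     list_buf, list_size, nls_codepage, remap);

	/* getxattr mode: ea_name set copies that attribute's value and
	 * returns its length; -ENODATA if absent, -ERANGE if the buffer
	 * is too small */
	len = CIFSSMBQAllEAs(xid, tcon, full_path, "SETFILEBITS",
			     value_buf, value_size, nls_codepage, remap);
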
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 63ea83ff687f..d9566bf8f917 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -23,6 +23,7 @@
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/list.h> 24#include <linux/list.h>
25#include <linux/wait.h> 25#include <linux/wait.h>
26#include <linux/slab.h>
26#include <linux/pagemap.h> 27#include <linux/pagemap.h>
27#include <linux/ctype.h> 28#include <linux/ctype.h>
28#include <linux/utsname.h> 29#include <linux/utsname.h>
@@ -98,7 +99,7 @@ struct smb_vol {
98 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ 99 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
99 unsigned int rsize; 100 unsigned int rsize;
100 unsigned int wsize; 101 unsigned int wsize;
101 unsigned int sockopt; 102 bool sockopt_tcp_nodelay:1;
102 unsigned short int port; 103 unsigned short int port;
103 char *prepath; 104 char *prepath;
104}; 105};
@@ -1142,9 +1143,11 @@ cifs_parse_mount_options(char *options, const char *devname,
1142 simple_strtoul(value, &value, 0); 1143 simple_strtoul(value, &value, 0);
1143 } 1144 }
1144 } else if (strnicmp(data, "sockopt", 5) == 0) { 1145 } else if (strnicmp(data, "sockopt", 5) == 0) {
1145 if (value && *value) { 1146 if (!value || !*value) {
1146 vol->sockopt = 1147 cERROR(1, ("no socket option specified"));
1147 simple_strtoul(value, &value, 0); 1148 continue;
1149 } else if (strnicmp(value, "TCP_NODELAY", 11) == 0) {
1150 vol->sockopt_tcp_nodelay = 1;
1148 } 1151 }
1149 } else if (strnicmp(data, "netbiosname", 4) == 0) { 1152 } else if (strnicmp(data, "netbiosname", 4) == 0) {
1150 if (!value || !*value || (*value == ' ')) { 1153 if (!value || !*value || (*value == ' ')) {
@@ -1514,6 +1517,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1514 1517
1515 tcp_ses->noblocksnd = volume_info->noblocksnd; 1518 tcp_ses->noblocksnd = volume_info->noblocksnd;
1516 tcp_ses->noautotune = volume_info->noautotune; 1519 tcp_ses->noautotune = volume_info->noautotune;
1520 tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay;
1517 atomic_set(&tcp_ses->inFlight, 0); 1521 atomic_set(&tcp_ses->inFlight, 0);
1518 init_waitqueue_head(&tcp_ses->response_q); 1522 init_waitqueue_head(&tcp_ses->response_q);
1519 init_waitqueue_head(&tcp_ses->request_q); 1523 init_waitqueue_head(&tcp_ses->request_q);
@@ -1764,6 +1768,7 @@ static int
1764ipv4_connect(struct TCP_Server_Info *server) 1768ipv4_connect(struct TCP_Server_Info *server)
1765{ 1769{
1766 int rc = 0; 1770 int rc = 0;
1771 int val;
1767 bool connected = false; 1772 bool connected = false;
1768 __be16 orig_port = 0; 1773 __be16 orig_port = 0;
1769 struct socket *socket = server->ssocket; 1774 struct socket *socket = server->ssocket;
@@ -1845,6 +1850,14 @@ ipv4_connect(struct TCP_Server_Info *server)
1845 socket->sk->sk_rcvbuf = 140 * 1024; 1850 socket->sk->sk_rcvbuf = 140 * 1024;
1846 } 1851 }
1847 1852
1853 if (server->tcp_nodelay) {
1854 val = 1;
1855 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
1856 (char *)&val, sizeof(val));
1857 if (rc)
1858 cFYI(1, ("set TCP_NODELAY socket option error %d", rc));
1859 }
1860
1848 cFYI(1, ("sndbuf %d rcvbuf %d rcvtimeo 0x%lx", 1861 cFYI(1, ("sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
1849 socket->sk->sk_sndbuf, 1862 socket->sk->sk_sndbuf,
1850 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo)); 1863 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo));
@@ -1916,6 +1929,7 @@ static int
1916ipv6_connect(struct TCP_Server_Info *server) 1929ipv6_connect(struct TCP_Server_Info *server)
1917{ 1930{
1918 int rc = 0; 1931 int rc = 0;
1932 int val;
1919 bool connected = false; 1933 bool connected = false;
1920 __be16 orig_port = 0; 1934 __be16 orig_port = 0;
1921 struct socket *socket = server->ssocket; 1935 struct socket *socket = server->ssocket;
@@ -1987,6 +2001,15 @@ ipv6_connect(struct TCP_Server_Info *server)
1987 */ 2001 */
1988 socket->sk->sk_rcvtimeo = 7 * HZ; 2002 socket->sk->sk_rcvtimeo = 7 * HZ;
1989 socket->sk->sk_sndtimeo = 5 * HZ; 2003 socket->sk->sk_sndtimeo = 5 * HZ;
2004
2005 if (server->tcp_nodelay) {
2006 val = 1;
2007 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
2008 (char *)&val, sizeof(val));
2009 if (rc)
2010 cFYI(1, ("set TCP_NODELAY socket option error %d", rc));
2011 }
2012
1990 server->ssocket = socket; 2013 server->ssocket = socket;
1991 2014
1992 return rc; 2015 return rc;
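
Both address families now honor the per-server tcp_nodelay bit set from the parsed mount option (e.g. -o sockopt=TCP_NODELAY); a kernel_setsockopt() failure is only logged via cFYI, presumably because falling back to Nagle-enabled behavior is harmless.
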
@@ -2287,12 +2310,12 @@ int
2287cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, 2310cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2288 char *mount_data_global, const char *devname) 2311 char *mount_data_global, const char *devname)
2289{ 2312{
2290 int rc = 0; 2313 int rc;
2291 int xid; 2314 int xid;
2292 struct smb_vol *volume_info; 2315 struct smb_vol *volume_info;
2293 struct cifsSesInfo *pSesInfo = NULL; 2316 struct cifsSesInfo *pSesInfo;
2294 struct cifsTconInfo *tcon = NULL; 2317 struct cifsTconInfo *tcon;
2295 struct TCP_Server_Info *srvTcp = NULL; 2318 struct TCP_Server_Info *srvTcp;
2296 char *full_path; 2319 char *full_path;
2297 char *mount_data = mount_data_global; 2320 char *mount_data = mount_data_global;
2298#ifdef CONFIG_CIFS_DFS_UPCALL 2321#ifdef CONFIG_CIFS_DFS_UPCALL
@@ -2301,6 +2324,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2301 int referral_walks_count = 0; 2324 int referral_walks_count = 0;
2302try_mount_again: 2325try_mount_again:
2303#endif 2326#endif
2327 rc = 0;
2328 tcon = NULL;
2329 pSesInfo = NULL;
2330 srvTcp = NULL;
2304 full_path = NULL; 2331 full_path = NULL;
2305 2332
2306 xid = GetXid(); 2333 xid = GetXid();
@@ -2362,13 +2389,13 @@ try_mount_again:
2362 */ 2389 */
2363 cifs_put_tcp_session(srvTcp); 2390 cifs_put_tcp_session(srvTcp);
2364 2391
2365 down(&pSesInfo->sesSem); 2392 mutex_lock(&pSesInfo->session_mutex);
2366 if (pSesInfo->need_reconnect) { 2393 if (pSesInfo->need_reconnect) {
2367 cFYI(1, ("Session needs reconnect")); 2394 cFYI(1, ("Session needs reconnect"));
2368 rc = cifs_setup_session(xid, pSesInfo, 2395 rc = cifs_setup_session(xid, pSesInfo,
2369 cifs_sb->local_nls); 2396 cifs_sb->local_nls);
2370 } 2397 }
2371 up(&pSesInfo->sesSem); 2398 mutex_unlock(&pSesInfo->session_mutex);
2372 } else if (!rc) { 2399 } else if (!rc) {
2373 cFYI(1, ("Existing smb sess not found")); 2400 cFYI(1, ("Existing smb sess not found"));
2374 pSesInfo = sesInfoAlloc(); 2401 pSesInfo = sesInfoAlloc();
@@ -2411,12 +2438,12 @@ try_mount_again:
2411 } 2438 }
2412 pSesInfo->linux_uid = volume_info->linux_uid; 2439 pSesInfo->linux_uid = volume_info->linux_uid;
2413 pSesInfo->overrideSecFlg = volume_info->secFlg; 2440 pSesInfo->overrideSecFlg = volume_info->secFlg;
2414 down(&pSesInfo->sesSem); 2441 mutex_lock(&pSesInfo->session_mutex);
2415 2442
2416 /* BB FIXME need to pass vol->secFlgs BB */ 2443 /* BB FIXME need to pass vol->secFlgs BB */
2417 rc = cifs_setup_session(xid, pSesInfo, 2444 rc = cifs_setup_session(xid, pSesInfo,
2418 cifs_sb->local_nls); 2445 cifs_sb->local_nls);
2419 up(&pSesInfo->sesSem); 2446 mutex_unlock(&pSesInfo->session_mutex);
2420 } 2447 }
2421 2448
2422 /* search for existing tcon to this server share */ 2449 /* search for existing tcon to this server share */
@@ -2597,6 +2624,7 @@ remote_path_check:
2597 2624
2598 cleanup_volume_info(&volume_info); 2625 cleanup_volume_info(&volume_info);
2599 referral_walks_count++; 2626 referral_walks_count++;
2627 FreeXid(xid);
2600 goto try_mount_again; 2628 goto try_mount_again;
2601 } 2629 }
2602#else /* No DFS support, return error on mount */ 2630#else /* No DFS support, return error on mount */
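
A small leak fix rides along in cifs_mount(): each DFS referral walk jumps back to try_mount_again and calls GetXid() afresh, so the added FreeXid(xid) before the goto releases the xid taken on the previous pass. The rc/tcon/pSesInfo/srvTcp reinitialization moved under the label serves the same retry path, keeping stale pointers from a failed pass out of the next one.
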
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 1f42f772865a..e9f7ecc2714b 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -214,7 +214,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
214 posix_flags |= SMB_O_EXCL; 214 posix_flags |= SMB_O_EXCL;
215 if (oflags & O_TRUNC) 215 if (oflags & O_TRUNC)
216 posix_flags |= SMB_O_TRUNC; 216 posix_flags |= SMB_O_TRUNC;
217 if (oflags & O_SYNC) 217 /* be safe and imply O_SYNC for O_DSYNC */
218 if (oflags & O_DSYNC)
218 posix_flags |= SMB_O_SYNC; 219 posix_flags |= SMB_O_SYNC;
219 if (oflags & O_DIRECTORY) 220 if (oflags & O_DIRECTORY)
220 posix_flags |= SMB_O_DIRECTORY; 221 posix_flags |= SMB_O_DIRECTORY;
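
The O_DSYNC test above (and the matching cifs_posix_convert_flags() hunk in fs/cifs/file.c below) tracks the 2.6.33 VFS rework in which O_SYNC became a composite flag:

	/* post-2.6.33 <asm-generic/fcntl.h>, shown for reference:
	 *	#define O_SYNC	(__O_SYNC | O_DSYNC)
	 * so (oflags & O_DSYNC) is also true for O_SYNC opens, and the
	 * "be safe" comment means a data-sync-only request still maps to
	 * SMB_O_SYNC, SMB having no weaker equivalent */
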
@@ -738,7 +739,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
738 int isValid = 1; 739 int isValid = 1;
739 740
740 if (direntry->d_inode) { 741 if (direntry->d_inode) {
741 if (cifs_revalidate(direntry)) 742 if (cifs_revalidate_dentry(direntry))
742 return 0; 743 return 0;
743 } else { 744 } else {
744 cFYI(1, ("neg dentry 0x%p name = %s", 745 cFYI(1, ("neg dentry 0x%p name = %s",
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 87948147d7ec..6f8a0e3fb25b 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -23,6 +23,7 @@
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */ 24 */
25 25
26#include <linux/slab.h>
26#include <keys/user-type.h> 27#include <keys/user-type.h>
27#include "dns_resolve.h" 28#include "dns_resolve.h"
28#include "cifsglob.h" 29#include "cifsglob.h"
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 75949d6a5f1b..6177f7cca16a 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -24,7 +24,7 @@
24 */ 24 */
25 25
26 /* 26 /*
27 * See Documentation/filesystems/Exporting 27 * See Documentation/filesystems/nfs/Exporting
28 * and examples in fs/exportfs 28 * and examples in fs/exportfs
29 * 29 *
30 * Since cifs is a network file system, an "fsid" must be included for 30 * Since cifs is a network file system, an "fsid" must be included for
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 429337eb7afe..058b390d3da8 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -31,6 +31,7 @@
31#include <linux/task_io_accounting_ops.h> 31#include <linux/task_io_accounting_ops.h>
32#include <linux/delay.h> 32#include <linux/delay.h>
33#include <linux/mount.h> 33#include <linux/mount.h>
34#include <linux/slab.h>
34#include <asm/div64.h> 35#include <asm/div64.h>
35#include "cifsfs.h" 36#include "cifsfs.h"
36#include "cifspdu.h" 37#include "cifspdu.h"
@@ -76,8 +77,10 @@ static inline fmode_t cifs_posix_convert_flags(unsigned int flags)
76 reopening a file. They had their effect on the original open */ 77 reopening a file. They had their effect on the original open */
77 if (flags & O_APPEND) 78 if (flags & O_APPEND)
78 posix_flags |= (fmode_t)O_APPEND; 79 posix_flags |= (fmode_t)O_APPEND;
79 if (flags & O_SYNC) 80 if (flags & O_DSYNC)
80 posix_flags |= (fmode_t)O_SYNC; 81 posix_flags |= (fmode_t)O_DSYNC;
82 if (flags & __O_SYNC)
83 posix_flags |= (fmode_t)__O_SYNC;
81 if (flags & O_DIRECTORY) 84 if (flags & O_DIRECTORY)
82 posix_flags |= (fmode_t)O_DIRECTORY; 85 posix_flags |= (fmode_t)O_DIRECTORY;
83 if (flags & O_NOFOLLOW) 86 if (flags & O_NOFOLLOW)
@@ -217,8 +220,8 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
217 cFYI(1, ("inode unchanged on server")); 220 cFYI(1, ("inode unchanged on server"));
218 } else { 221 } else {
219 if (file->f_path.dentry->d_inode->i_mapping) { 222 if (file->f_path.dentry->d_inode->i_mapping) {
220 /* BB no need to lock inode until after invalidate 223 /* BB no need to lock inode until after invalidate
221 since namei code should already have it locked? */ 224 since namei code should already have it locked? */
222 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping); 225 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
223 if (rc != 0) 226 if (rc != 0)
224 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc; 227 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
@@ -1888,11 +1891,10 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1888 1891
1889int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) 1892int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1890{ 1893{
1891 struct dentry *dentry = file->f_path.dentry;
1892 int rc, xid; 1894 int rc, xid;
1893 1895
1894 xid = GetXid(); 1896 xid = GetXid();
1895 rc = cifs_revalidate(dentry); 1897 rc = cifs_revalidate_file(file);
1896 if (rc) { 1898 if (rc) {
1897 cFYI(1, ("Validation prior to mmap failed, error=%d", rc)); 1899 cFYI(1, ("Validation prior to mmap failed, error=%d", rc));
1898 FreeXid(xid); 1900 FreeXid(xid);
@@ -2287,9 +2289,9 @@ cifs_oplock_break(struct slow_work *work)
2287 if (inode && S_ISREG(inode->i_mode)) { 2289 if (inode && S_ISREG(inode->i_mode)) {
2288#ifdef CONFIG_CIFS_EXPERIMENTAL 2290#ifdef CONFIG_CIFS_EXPERIMENTAL
2289 if (cinode->clientCanCacheAll == 0) 2291 if (cinode->clientCanCacheAll == 0)
2290 break_lease(inode, FMODE_READ); 2292 break_lease(inode, O_RDONLY);
2291 else if (cinode->clientCanCacheRead == 0) 2293 else if (cinode->clientCanCacheRead == 0)
2292 break_lease(inode, FMODE_WRITE); 2294 break_lease(inode, O_WRONLY);
2293#endif 2295#endif
2294 rc = filemap_fdatawrite(inode->i_mapping); 2296 rc = filemap_fdatawrite(inode->i_mapping);
2295 if (cinode->clientCanCacheRead == 0) { 2297 if (cinode->clientCanCacheRead == 0) {
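
The break_lease() hunk above follows the VFS switch of that helper from FMODE_* to O_*-style arguments: O_WRONLY asks for the break appropriate to an incoming writer, O_RDONLY for a reader. The old FMODE_* values only coincidentally resembled the expected bits, so the oplock-break path risked signalling the wrong lease-break class; note the call remains confined to CONFIG_CIFS_EXPERIMENTAL.
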
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index cababd8a52df..35ec11716213 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -20,6 +20,7 @@
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/stat.h> 22#include <linux/stat.h>
23#include <linux/slab.h>
23#include <linux/pagemap.h> 24#include <linux/pagemap.h>
24#include <asm/div64.h> 25#include <asm/div64.h>
25#include "cifsfs.h" 26#include "cifsfs.h"
@@ -77,6 +78,41 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
77 } 78 }
78} 79}
79 80
81/* check inode attributes against fattr. If they don't match, tag the
82 * inode for cache invalidation
83 */
84static void
85cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
86{
87 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
88
89 cFYI(1, ("%s: revalidating inode %llu", __func__, cifs_i->uniqueid));
90
91 if (inode->i_state & I_NEW) {
92 cFYI(1, ("%s: inode %llu is new", __func__, cifs_i->uniqueid));
93 return;
94 }
95
96 /* don't bother with revalidation if we have an oplock */
97 if (cifs_i->clientCanCacheRead) {
98 cFYI(1, ("%s: inode %llu is oplocked", __func__,
99 cifs_i->uniqueid));
100 return;
101 }
102
103 /* revalidate if mtime or size have changed */
104 if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
105 cifs_i->server_eof == fattr->cf_eof) {
106 cFYI(1, ("%s: inode %llu is unchanged", __func__,
107 cifs_i->uniqueid));
108 return;
109 }
110
111 cFYI(1, ("%s: invalidating inode %llu mapping", __func__,
112 cifs_i->uniqueid));
113 cifs_i->invalid_mapping = true;
114}
115
80/* populate an inode with info from a cifs_fattr struct */ 116/* populate an inode with info from a cifs_fattr struct */
81void 117void
82cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) 118cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
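
cifs_revalidate_cache() above is the producer half of the new invalid_mapping machinery introduced in cifsglob.h; a hedged map of how the pieces in this patch fit together:

	/* cifs_fattr_to_inode()
	 *   -> cifs_revalidate_cache()   compares cached mtime/server_eof
	 *                                against the fresh fattr; on mismatch
	 *                                sets cifs_i->invalid_mapping (skipped
	 *                                for I_NEW inodes and read-oplocked
	 *                                files)
	 * cifs_revalidate_file() / cifs_revalidate_dentry()
	 *   -> cifs_invalidate_mapping() writes back dirty pages, stashes any
	 *                                error in write_behind_rc, then calls
	 *                                invalidate_remote_inode() */
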
@@ -85,6 +121,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
85 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 121 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
86 unsigned long oldtime = cifs_i->time; 122 unsigned long oldtime = cifs_i->time;
87 123
124 cifs_revalidate_cache(inode, fattr);
125
88 inode->i_atime = fattr->cf_atime; 126 inode->i_atime = fattr->cf_atime;
89 inode->i_mtime = fattr->cf_mtime; 127 inode->i_mtime = fattr->cf_mtime;
90 inode->i_ctime = fattr->cf_ctime; 128 inode->i_ctime = fattr->cf_ctime;
@@ -111,6 +149,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
111 149
112 cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING; 150 cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
113 151
152 cifs_i->server_eof = fattr->cf_eof;
114 /* 153 /*
115 * Can't safely change the file size here if the client is writing to 154 * Can't safely change the file size here if the client is writing to
116 * it due to potential races. 155 * it due to potential races.
@@ -230,6 +269,31 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
230 fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL; 269 fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
231} 270}
232 271
272int cifs_get_file_info_unix(struct file *filp)
273{
274 int rc;
275 int xid;
276 FILE_UNIX_BASIC_INFO find_data;
277 struct cifs_fattr fattr;
278 struct inode *inode = filp->f_path.dentry->d_inode;
279 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
280 struct cifsTconInfo *tcon = cifs_sb->tcon;
281 struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
282
283 xid = GetXid();
284 rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
285 if (!rc) {
286 cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
287 } else if (rc == -EREMOTE) {
288 cifs_create_dfs_fattr(&fattr, inode->i_sb);
289 rc = 0;
290 }
291
292 cifs_fattr_to_inode(inode, &fattr);
293 FreeXid(xid);
294 return rc;
295}
296
233int cifs_get_inode_info_unix(struct inode **pinode, 297int cifs_get_inode_info_unix(struct inode **pinode,
234 const unsigned char *full_path, 298 const unsigned char *full_path,
235 struct super_block *sb, int xid) 299 struct super_block *sb, int xid)
@@ -366,7 +430,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
366 char ea_value[4]; 430 char ea_value[4];
367 __u32 mode; 431 __u32 mode;
368 432
369 rc = CIFSSMBQueryEA(xid, cifs_sb->tcon, path, "SETFILEBITS", 433 rc = CIFSSMBQAllEAs(xid, cifs_sb->tcon, path, "SETFILEBITS",
370 ea_value, 4 /* size of buf */, cifs_sb->local_nls, 434 ea_value, 4 /* size of buf */, cifs_sb->local_nls,
371 cifs_sb->mnt_cifs_flags & 435 cifs_sb->mnt_cifs_flags &
372 CIFS_MOUNT_MAP_SPECIAL_CHR); 436 CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -431,6 +495,47 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
431 fattr->cf_gid = cifs_sb->mnt_gid; 495 fattr->cf_gid = cifs_sb->mnt_gid;
432} 496}
433 497
498int cifs_get_file_info(struct file *filp)
499{
500 int rc;
501 int xid;
502 FILE_ALL_INFO find_data;
503 struct cifs_fattr fattr;
504 struct inode *inode = filp->f_path.dentry->d_inode;
505 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
506 struct cifsTconInfo *tcon = cifs_sb->tcon;
507 struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
508
509 xid = GetXid();
510 rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
511 if (rc == -EOPNOTSUPP || rc == -EINVAL) {
512 /*
513 * FIXME: legacy server -- fall back to path-based call?
514 * for now, just skip revalidating and mark inode for
515 * immediate reval.
516 */
517 rc = 0;
518 CIFS_I(inode)->time = 0;
519 goto cgfi_exit;
520 } else if (rc == -EREMOTE) {
521 cifs_create_dfs_fattr(&fattr, inode->i_sb);
522 rc = 0;
523 } else if (rc)
524 goto cgfi_exit;
525
526 /*
527 * don't bother with SFU junk here -- just mark inode as needing
528 * revalidation.
529 */
530 cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
531 fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
532 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
533 cifs_fattr_to_inode(inode, &fattr);
534cgfi_exit:
535 FreeXid(xid);
536 return rc;
537}
538
434int cifs_get_inode_info(struct inode **pinode, 539int cifs_get_inode_info(struct inode **pinode,
435 const unsigned char *full_path, FILE_ALL_INFO *pfindData, 540 const unsigned char *full_path, FILE_ALL_INFO *pfindData,
436 struct super_block *sb, int xid, const __u16 *pfid) 541 struct super_block *sb, int xid, const __u16 *pfid)
@@ -914,8 +1019,8 @@ undo_setattr:
914/* 1019/*
915 * If dentry->d_inode is null (usually meaning the cached dentry 1020 * If dentry->d_inode is null (usually meaning the cached dentry
916 * is a negative dentry) then we would attempt a standard SMB delete, but 1021 * is a negative dentry) then we would attempt a standard SMB delete, but
917 * if that fails we can not attempt the fall back mechanisms on EACESS 1022 * if that fails we can not attempt the fall back mechanisms on EACCES
918 * but will return the EACESS to the caller. Note that the VFS does not call 1023 * but will return the EACCES to the caller. Note that the VFS does not call
919 * unlink on negative dentries currently. 1024 * unlink on negative dentries currently.
920 */ 1025 */
921int cifs_unlink(struct inode *dir, struct dentry *dentry) 1026int cifs_unlink(struct inode *dir, struct dentry *dentry)
@@ -1388,135 +1493,103 @@ cifs_rename_exit:
1388 return rc; 1493 return rc;
1389} 1494}
1390 1495
1391int cifs_revalidate(struct dentry *direntry) 1496static bool
1497cifs_inode_needs_reval(struct inode *inode)
1392{ 1498{
1393 int xid; 1499 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
1394 int rc = 0, wbrc = 0;
1395 char *full_path;
1396 struct cifs_sb_info *cifs_sb;
1397 struct cifsInodeInfo *cifsInode;
1398 loff_t local_size;
1399 struct timespec local_mtime;
1400 bool invalidate_inode = false;
1401 1500
1402 if (direntry->d_inode == NULL) 1501 if (cifs_i->clientCanCacheRead)
1403 return -ENOENT; 1502 return false;
1404 1503
1405 cifsInode = CIFS_I(direntry->d_inode); 1504 if (!lookupCacheEnabled)
1505 return true;
1406 1506
1407 if (cifsInode == NULL) 1507 if (cifs_i->time == 0)
1408 return -ENOENT; 1508 return true;
1409 1509
1410 /* no sense revalidating inode info on file that no one can write */ 1510 /* FIXME: the actimeo should be tunable */
1411 if (CIFS_I(direntry->d_inode)->clientCanCacheRead) 1511 if (time_after_eq(jiffies, cifs_i->time + HZ))
1412 return rc; 1512 return true;
1513
1514 return false;
1515}
1516
1517/* check invalid_mapping flag and zap the cache if it's set */
1518static void
1519cifs_invalidate_mapping(struct inode *inode)
1520{
1521 int rc;
1522 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
1523
1524 cifs_i->invalid_mapping = false;
1525
1526 /* write back any cached data */
1527 if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
1528 rc = filemap_write_and_wait(inode->i_mapping);
1529 if (rc)
1530 cifs_i->write_behind_rc = rc;
1531 }
1532 invalidate_remote_inode(inode);
1533}
1534
1535int cifs_revalidate_file(struct file *filp)
1536{
1537 int rc = 0;
1538 struct inode *inode = filp->f_path.dentry->d_inode;
1539
1540 if (!cifs_inode_needs_reval(inode))
1541 goto check_inval;
1542
1543 if (CIFS_SB(inode->i_sb)->tcon->unix_ext)
1544 rc = cifs_get_file_info_unix(filp);
1545 else
1546 rc = cifs_get_file_info(filp);
1547
1548check_inval:
1549 if (CIFS_I(inode)->invalid_mapping)
1550 cifs_invalidate_mapping(inode);
1551
1552 return rc;
1553}
1554
1555/* revalidate a dentry's inode attributes */
1556int cifs_revalidate_dentry(struct dentry *dentry)
1557{
1558 int xid;
1559 int rc = 0;
1560 char *full_path = NULL;
1561 struct inode *inode = dentry->d_inode;
1562 struct super_block *sb = dentry->d_sb;
1563
1564 if (inode == NULL)
1565 return -ENOENT;
1413 1566
1414 xid = GetXid(); 1567 xid = GetXid();
1415 1568
1416 cifs_sb = CIFS_SB(direntry->d_sb); 1569 if (!cifs_inode_needs_reval(inode))
1570 goto check_inval;
1417 1571
1418 /* can not safely grab the rename sem here if rename calls revalidate 1572 /* can not safely grab the rename sem here if rename calls revalidate
1419 since that would deadlock */ 1573 since that would deadlock */
1420 full_path = build_path_from_dentry(direntry); 1574 full_path = build_path_from_dentry(dentry);
1421 if (full_path == NULL) { 1575 if (full_path == NULL) {
1422 rc = -ENOMEM; 1576 rc = -ENOMEM;
1423 FreeXid(xid); 1577 goto check_inval;
1424 return rc;
1425 }
1426 cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
1427 "jiffies %ld", full_path, direntry->d_inode,
1428 direntry->d_inode->i_count.counter, direntry,
1429 direntry->d_time, jiffies));
1430
1431 if (cifsInode->time == 0) {
1432 /* was set to zero previously to force revalidate */
1433 } else if (time_before(jiffies, cifsInode->time + HZ) &&
1434 lookupCacheEnabled) {
1435 if ((S_ISREG(direntry->d_inode->i_mode) == 0) ||
1436 (direntry->d_inode->i_nlink == 1)) {
1437 kfree(full_path);
1438 FreeXid(xid);
1439 return rc;
1440 } else {
1441 cFYI(1, ("Have to revalidate file due to hardlinks"));
1442 }
1443 }
1444
1445 /* save mtime and size */
1446 local_mtime = direntry->d_inode->i_mtime;
1447 local_size = direntry->d_inode->i_size;
1448
1449 if (cifs_sb->tcon->unix_ext) {
1450 rc = cifs_get_inode_info_unix(&direntry->d_inode, full_path,
1451 direntry->d_sb, xid);
1452 if (rc) {
1453 cFYI(1, ("error on getting revalidate info %d", rc));
1454/* if (rc != -ENOENT)
1455 rc = 0; */ /* BB should we cache info on
1456 certain errors? */
1457 }
1458 } else {
1459 rc = cifs_get_inode_info(&direntry->d_inode, full_path, NULL,
1460 direntry->d_sb, xid, NULL);
1461 if (rc) {
1462 cFYI(1, ("error on getting revalidate info %d", rc));
1463/* if (rc != -ENOENT)
1464 rc = 0; */ /* BB should we cache info on
1465 certain errors? */
1466 }
1467 } 1578 }
1468 /* should we remap certain errors, access denied?, to zero */
1469 1579
1470 /* if not oplocked, we invalidate inode pages if mtime or file size 1580 cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
1471 had changed on server */ 1581 "jiffies %ld", full_path, inode, inode->i_count.counter,
1582 dentry, dentry->d_time, jiffies));
1472 1583
1473 if (timespec_equal(&local_mtime, &direntry->d_inode->i_mtime) && 1584 if (CIFS_SB(sb)->tcon->unix_ext)
1474 (local_size == direntry->d_inode->i_size)) { 1585 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
1475 cFYI(1, ("cifs_revalidate - inode unchanged")); 1586 else
1476 } else { 1587 rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
1477 /* file may have changed on server */ 1588 xid, NULL);
1478 if (cifsInode->clientCanCacheRead) {
1479 /* no need to invalidate inode pages since we were the
1480 only ones who could have modified the file and the
1481 server copy is staler than ours */
1482 } else {
1483 invalidate_inode = true;
1484 }
1485 }
1486 1589
1487 /* can not grab this sem since kernel filesys locking documentation 1590check_inval:
1488 indicates i_mutex may be taken by the kernel on lookup and rename 1591 if (CIFS_I(inode)->invalid_mapping)
1489 which could deadlock if we grab the i_mutex here as well */ 1592 cifs_invalidate_mapping(inode);
1490/* mutex_lock(&direntry->d_inode->i_mutex);*/
1491 /* need to write out dirty pages here */
1492 if (direntry->d_inode->i_mapping) {
1493 /* do we need to lock inode until after invalidate completes
1494 below? */
1495 wbrc = filemap_fdatawrite(direntry->d_inode->i_mapping);
1496 if (wbrc)
1497 CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
1498 }
1499 if (invalidate_inode) {
1500 /* shrink_dcache not necessary now that cifs dentry ops
1501 are exported for negative dentries */
1502/* if (S_ISDIR(direntry->d_inode->i_mode))
1503 shrink_dcache_parent(direntry); */
1504 if (S_ISREG(direntry->d_inode->i_mode)) {
1505 if (direntry->d_inode->i_mapping) {
1506 wbrc = filemap_fdatawait(direntry->d_inode->i_mapping);
1507 if (wbrc)
1508 CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
1509 }
1510 /* may eventually have to do this for open files too */
1511 if (list_empty(&(cifsInode->openFileList))) {
1512 /* changed on server - flush read ahead pages */
1513 cFYI(1, ("Invalidating read ahead data on "
1514 "closed file"));
1515 invalidate_remote_inode(direntry->d_inode);
1516 }
1517 }
1518 }
1519/* mutex_unlock(&direntry->d_inode->i_mutex); */
1520 1593
1521 kfree(full_path); 1594 kfree(full_path);
1522 FreeXid(xid); 1595 FreeXid(xid);
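
The rewrite above splits the old monolithic cifs_revalidate() into a cheap "do we need to ask the server at all?" predicate plus a separate cache-zapping step. A runnable userspace analogue of cifs_inode_needs_reval() — the one-second timeout stands in for jiffies + HZ, and the field names are illustrative:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct cached_attrs {
        bool   can_cache_read;  /* cf. clientCanCacheRead (oplock held) */
        time_t fetched_at;      /* cf. cifs_i->time; 0 = force reval */
};

static bool needs_reval(const struct cached_attrs *a)
{
        if (a->can_cache_read)
                return false;           /* oplock: server can't change it */
        if (a->fetched_at == 0)
                return true;            /* explicitly poisoned */
        return time(NULL) >= a->fetched_at + 1;  /* 1s attribute timeout */
}

int main(void)
{
        struct cached_attrs a = { .can_cache_read = false,
                                  .fetched_at = time(NULL) };
        printf("fresh:    %d\n", needs_reval(&a));  /* 0 */
        a.fetched_at = 0;
        printf("poisoned: %d\n", needs_reval(&a));  /* 1 */
        return 0;
}
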
@@ -1526,7 +1599,7 @@ int cifs_revalidate(struct dentry *direntry)
1526int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, 1599int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1527 struct kstat *stat) 1600 struct kstat *stat)
1528{ 1601{
1529 int err = cifs_revalidate(dentry); 1602 int err = cifs_revalidate_dentry(dentry);
1530 if (!err) { 1603 if (!err) {
1531 generic_fillattr(dentry->d_inode, stat); 1604 generic_fillattr(dentry->d_inode, stat);
1532 stat->blksize = CIFS_MAX_MSGSIZE; 1605 stat->blksize = CIFS_MAX_MSGSIZE;
@@ -1762,8 +1835,18 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1762 CIFS_MOUNT_MAP_SPECIAL_CHR); 1835 CIFS_MOUNT_MAP_SPECIAL_CHR);
1763 } 1836 }
1764 1837
1765 if (!rc) 1838 if (!rc) {
1766 rc = inode_setattr(inode, attrs); 1839 rc = inode_setattr(inode, attrs);
1840
1841 /* force revalidate when any of these times are set since some
1842 of the fs types (eg ext3, fat) do not have fine enough
1843	   time granularity to match protocol, and we do not have
1844	   a way (yet) to query the server fs's time granularity (and
1845 whether it rounds times down).
1846 */
1847 if (!rc && (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME)))
1848 cifsInode->time = 0;
1849 }
1767out: 1850out:
1768 kfree(args); 1851 kfree(args);
1769 kfree(full_path); 1852 kfree(full_path);
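
The comment added in the hunk above is the whole story: a coarse-granularity server filesystem may round the timestamp we just set, so trusting the cached value would make the file look unchanged when the server now reports something else. A tiny illustrative demonstration of the mismatch, using FAT-style 2-second rounding as the assumed example:

#include <stdio.h>
#include <time.h>

int main(void)
{
        time_t requested = 1001;            /* odd second requested by client */
        time_t stored = requested & ~1L;    /* server rounds down to 1000 */
        printf("requested=%ld stored=%ld match=%d\n",
               (long)requested, (long)stored, requested == stored);
        return 0;                           /* match=0: cache must be refetched */
}
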
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index fc1e0487eaee..c1a9d4236a8c 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -20,6 +20,7 @@
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/stat.h> 22#include <linux/stat.h>
23#include <linux/slab.h>
23#include <linux/namei.h> 24#include <linux/namei.h>
24#include "cifsfs.h" 25#include "cifsfs.h"
25#include "cifspdu.h" 26#include "cifspdu.h"
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index d27d4ec6579b..d1474996a812 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -79,7 +79,7 @@ sesInfoAlloc(void)
79 ++ret_buf->ses_count; 79 ++ret_buf->ses_count;
80 INIT_LIST_HEAD(&ret_buf->smb_ses_list); 80 INIT_LIST_HEAD(&ret_buf->smb_ses_list);
81 INIT_LIST_HEAD(&ret_buf->tcon_list); 81 INIT_LIST_HEAD(&ret_buf->tcon_list);
82 init_MUTEX(&ret_buf->sesSem); 82 mutex_init(&ret_buf->session_mutex);
83 } 83 }
84 return ret_buf; 84 return ret_buf;
85} 85}
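
This one-liner is part of the tree-wide move away from binary semaphores used as mutexes. A kernel-style sketch of the conversion pattern (not buildable outside the tree; only sesSem/session_mutex appear in this hunk, the down/up call sites are assumed):

        /* before: a semaphore initialized to 1 acting as a mutex */
        struct semaphore sesSem;
        init_MUTEX(&ses->sesSem);
        down(&ses->sesSem);
        /* ... critical section ... */
        up(&ses->sesSem);

        /* after: a real mutex, visible to lockdep's ownership checks */
        struct mutex session_mutex;
        mutex_init(&ses->session_mutex);
        mutex_lock(&ses->session_mutex);
        /* ... critical section ... */
        mutex_unlock(&ses->session_mutex);
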
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f84062f9a985..18e0bc1fb593 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -22,6 +22,7 @@
22 */ 22 */
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/pagemap.h> 24#include <linux/pagemap.h>
25#include <linux/slab.h>
25#include <linux/stat.h> 26#include <linux/stat.h>
26#include "cifspdu.h" 27#include "cifspdu.h"
27#include "cifsglob.h" 28#include "cifsglob.h"
@@ -77,6 +78,11 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
77 78
78 cFYI(1, ("For %s", name->name)); 79 cFYI(1, ("For %s", name->name));
79 80
81 if (parent->d_op && parent->d_op->d_hash)
82 parent->d_op->d_hash(parent, name);
83 else
84 name->hash = full_name_hash(name->name, name->len);
85
80 dentry = d_lookup(parent, name); 86 dentry = d_lookup(parent, name);
81 if (dentry) { 87 if (dentry) {
82 /* FIXME: check for inode number changes? */ 88 /* FIXME: check for inode number changes? */
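
The point of hashing through parent->d_op->d_hash before d_lookup() is that a mount with custom dentry ops (e.g. a case-insensitive CIFS mount) hashes names through a normalizing function, so a qstr hashed with plain full_name_hash() would land in the wrong bucket. A runnable userspace analogue of the idea — djb2 here is purely illustrative, not the kernel's hash:

#include <ctype.h>
#include <stdio.h>

/* A case-insensitive "d_hash" lowercases before hashing so FOO and
 * foo land in the same hash bucket, as a nocase mount requires. */
static unsigned long djb2(const char *s, int fold)
{
        unsigned long h = 5381;
        for (; *s; s++)
                h = h * 33 + (fold ? tolower((unsigned char)*s)
                                   : (unsigned char)*s);
        return h;
}

int main(void)
{
        printf("default: %lu vs %lu\n", djb2("FOO", 0), djb2("foo", 0));
        printf("nocase:  %lu vs %lu\n", djb2("FOO", 1), djb2("foo", 1));
        return 0;
}
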
@@ -666,12 +672,11 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
666 min(len, max_len), nlt, 672 min(len, max_len), nlt,
667 cifs_sb->mnt_cifs_flags & 673 cifs_sb->mnt_cifs_flags &
668 CIFS_MOUNT_MAP_SPECIAL_CHR); 674 CIFS_MOUNT_MAP_SPECIAL_CHR);
675 pqst->len -= nls_nullsize(nlt);
669 } else { 676 } else {
670 pqst->name = filename; 677 pqst->name = filename;
671 pqst->len = len; 678 pqst->len = len;
672 } 679 }
673 pqst->hash = full_name_hash(pqst->name, pqst->len);
674/* cFYI(1, ("filldir on %s",pqst->name)); */
675 return rc; 680 return rc;
676} 681}
677 682
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 7085a6275c4c..7c3fd7463f44 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -29,6 +29,7 @@
29#include "ntlmssp.h" 29#include "ntlmssp.h"
30#include "nterr.h" 30#include "nterr.h"
31#include <linux/utsname.h> 31#include <linux/utsname.h>
32#include <linux/slab.h>
32#include "cifs_spnego.h" 33#include "cifs_spnego.h"
33 34
34extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, 35extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
@@ -223,9 +224,9 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
223 /* null user mount */ 224 /* null user mount */
224 *bcc_ptr = 0; 225 *bcc_ptr = 0;
225 *(bcc_ptr+1) = 0; 226 *(bcc_ptr+1) = 0;
226 } else { /* 300 should be long enough for any conceivable user name */ 227 } else {
227 bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->userName, 228 bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->userName,
228 300, nls_cp); 229 MAX_USERNAME_SIZE, nls_cp);
229 } 230 }
230 bcc_ptr += 2 * bytes_ret; 231 bcc_ptr += 2 * bytes_ret;
231 bcc_ptr += 2; /* account for null termination */ 232 bcc_ptr += 2; /* account for null termination */
@@ -246,11 +247,10 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
246 /* copy user */ 247 /* copy user */
247 if (ses->userName == NULL) { 248 if (ses->userName == NULL) {
248 /* BB what about null user mounts - check that we do this BB */ 249 /* BB what about null user mounts - check that we do this BB */
249 } else { /* 300 should be long enough for any conceivable user name */ 250 } else {
250 strncpy(bcc_ptr, ses->userName, 300); 251 strncpy(bcc_ptr, ses->userName, MAX_USERNAME_SIZE);
251 } 252 }
252 /* BB improve check for overflow */ 253 bcc_ptr += strnlen(ses->userName, MAX_USERNAME_SIZE);
253 bcc_ptr += strnlen(ses->userName, 300);
254 *bcc_ptr = 0; 254 *bcc_ptr = 0;
255 bcc_ptr++; /* account for null termination */ 255 bcc_ptr++; /* account for null termination */
256 256
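
Replacing the magic 300 with MAX_USERNAME_SIZE also preserves a subtle idiom worth noting: strncpy() does not NUL-terminate a string that fills the whole bound, so the code advances by strnlen() and writes the terminator by hand. A runnable sketch of the same idiom (the value 256 for MAX_USERNAME_SIZE is an assumption for illustration):

#include <stdio.h>
#include <string.h>

#define MAX_USERNAME_SIZE 256   /* assumed value; the macro is from the hunk */

int main(void)
{
        char buf[MAX_USERNAME_SIZE + 1];
        const char *user = "Administrator";

        strncpy(buf, user, MAX_USERNAME_SIZE);          /* bounded copy */
        buf[strnlen(user, MAX_USERNAME_SIZE)] = '\0';   /* explicit terminator */
        printf("%s\n", buf);
        return 0;
}
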
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index 224a1f478966..b6b6dcb500bf 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -371,7 +371,7 @@ E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
371 smbhash(p24 + 16, c8, p21 + 14, 1); 371 smbhash(p24 + 16, c8, p21 + 14, 1);
372} 372}
373 373
374#if 0 /* currently unsued */ 374#if 0 /* currently unused */
375static void 375static void
376D_P16(unsigned char *p14, unsigned char *in, unsigned char *out) 376D_P16(unsigned char *p14, unsigned char *in, unsigned char *out)
377{ 377{
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 93fb09a99c69..192ea51af20f 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -24,6 +24,7 @@
24*/ 24*/
25 25
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/slab.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
28#include <linux/string.h> 29#include <linux/string.h>
29#include <linux/kernel.h> 30#include <linux/kernel.h>
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 07b8e71544ee..ad081fe7eb18 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/list.h> 24#include <linux/list.h>
25#include <linux/gfp.h>
25#include <linux/wait.h> 26#include <linux/wait.h>
26#include <linux/net.h> 27#include <linux/net.h>
27#include <linux/delay.h> 28#include <linux/delay.h>
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a75afa3dd9e1..f555ce077d4f 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -21,6 +21,7 @@
21 21
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/posix_acl_xattr.h> 23#include <linux/posix_acl_xattr.h>
24#include <linux/slab.h>
24#include "cifsfs.h" 25#include "cifsfs.h"
25#include "cifspdu.h" 26#include "cifspdu.h"
26#include "cifsglob.h" 27#include "cifsglob.h"
@@ -244,7 +245,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
244 /* revalidate/getattr then populate from inode */ 245 /* revalidate/getattr then populate from inode */
245 } /* BB add else when above is implemented */ 246 } /* BB add else when above is implemented */
246 ea_name += 5; /* skip past user. prefix */ 247 ea_name += 5; /* skip past user. prefix */
247 rc = CIFSSMBQueryEA(xid, pTcon, full_path, ea_name, ea_value, 248 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
248 buf_size, cifs_sb->local_nls, 249 buf_size, cifs_sb->local_nls,
249 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 250 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
250 } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) { 251 } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) {
@@ -252,7 +253,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
252 goto get_ea_exit; 253 goto get_ea_exit;
253 254
254 ea_name += 4; /* skip past os2. prefix */ 255 ea_name += 4; /* skip past os2. prefix */
255 rc = CIFSSMBQueryEA(xid, pTcon, full_path, ea_name, ea_value, 256 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
256 buf_size, cifs_sb->local_nls, 257 buf_size, cifs_sb->local_nls,
257 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 258 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
258 } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, 259 } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
@@ -364,8 +365,8 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
364 /* if proc/fs/cifs/streamstoxattr is set then 365 /* if proc/fs/cifs/streamstoxattr is set then
365 search server for EAs or streams to 366 search server for EAs or streams to
366 returns as xattrs */ 367 returns as xattrs */
367 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, data, buf_size, 368 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data,
368 cifs_sb->local_nls, 369 buf_size, cifs_sb->local_nls,
369 cifs_sb->mnt_cifs_flags & 370 cifs_sb->mnt_cifs_flags &
370 CIFS_MOUNT_MAP_SPECIAL_CHR); 371 CIFS_MOUNT_MAP_SPECIAL_CHR);
371 372
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 4bb9d0a5decc..ccd98b0f2b0b 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -12,6 +12,7 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/slab.h>
15#include <linux/file.h> 16#include <linux/file.h>
16#include <linux/stat.h> 17#include <linux/stat.h>
17#include <linux/errno.h> 18#include <linux/errno.h>
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ffd42815fda1..4c813f2cdc52 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -17,6 +17,7 @@
17#include <linux/errno.h> 17#include <linux/errno.h>
18#include <linux/smp_lock.h> 18#include <linux/smp_lock.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/slab.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21 22
22#include <linux/coda.h> 23#include <linux/coda.h>
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 830f51abb971..a1695dcadd99 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -18,6 +18,7 @@
18#include <linux/smp_lock.h> 18#include <linux/smp_lock.h>
19#include <linux/file.h> 19#include <linux/file.h>
20#include <linux/vfs.h> 20#include <linux/vfs.h>
21#include <linux/slab.h>
21 22
22#include <asm/system.h> 23#include <asm/system.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 43c96ce29614..c6405ce3c50e 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -17,28 +17,25 @@ static struct ctl_table_header *fs_table_header;
17 17
18static ctl_table coda_table[] = { 18static ctl_table coda_table[] = {
19 { 19 {
20 .ctl_name = CTL_UNNUMBERED,
21 .procname = "timeout", 20 .procname = "timeout",
22 .data = &coda_timeout, 21 .data = &coda_timeout,
23 .maxlen = sizeof(int), 22 .maxlen = sizeof(int),
24 .mode = 0644, 23 .mode = 0644,
25 .proc_handler = &proc_dointvec 24 .proc_handler = proc_dointvec
26 }, 25 },
27 { 26 {
28 .ctl_name = CTL_UNNUMBERED,
29 .procname = "hard", 27 .procname = "hard",
30 .data = &coda_hard, 28 .data = &coda_hard,
31 .maxlen = sizeof(int), 29 .maxlen = sizeof(int),
32 .mode = 0644, 30 .mode = 0644,
33 .proc_handler = &proc_dointvec 31 .proc_handler = proc_dointvec
34 }, 32 },
35 { 33 {
36 .ctl_name = CTL_UNNUMBERED,
37 .procname = "fake_statfs", 34 .procname = "fake_statfs",
38 .data = &coda_fake_statfs, 35 .data = &coda_fake_statfs,
39 .maxlen = sizeof(int), 36 .maxlen = sizeof(int),
40 .mode = 0600, 37 .mode = 0600,
41 .proc_handler = &proc_dointvec 38 .proc_handler = proc_dointvec
42 }, 39 },
43 {} 40 {}
44}; 41};
@@ -46,7 +43,6 @@ static ctl_table coda_table[] = {
46#ifdef CONFIG_SYSCTL 43#ifdef CONFIG_SYSCTL
47static ctl_table fs_table[] = { 44static ctl_table fs_table[] = {
48 { 45 {
49 .ctl_name = CTL_UNNUMBERED,
50 .procname = "coda", 46 .procname = "coda",
51 .mode = 0555, 47 .mode = 0555,
52 .child = coda_table 48 .child = coda_table
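
These coda hunks follow the kernel-wide removal of binary sysctl numbers: entries are now matched purely by procname, and proc_handler is assigned as a plain function name rather than through an explicit &. A kernel-style sketch of the resulting table shape (not buildable on its own; coda_timeout is the variable from the hunk):

        static ctl_table example_table[] = {
                {
                        .procname     = "timeout",      /* no .ctl_name field */
                        .data         = &coda_timeout,
                        .maxlen       = sizeof(int),
                        .mode         = 0644,
                        .proc_handler = proc_dointvec,  /* plain name, no '&' */
                },
                {}      /* empty sentinel still terminates the table */
        };
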
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index c274d949179d..f09c5ed76f6c 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -26,6 +26,7 @@
26#include <linux/stat.h> 26#include <linux/stat.h>
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/slab.h>
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30#include <linux/vmalloc.h> 31#include <linux/vmalloc.h>
31#include <linux/vfs.h> 32#include <linux/vfs.h>
diff --git a/fs/compat.c b/fs/compat.c
index 6c19040ffeef..4b6ed03cc478 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -38,8 +38,6 @@
38#include <linux/dirent.h> 38#include <linux/dirent.h>
39#include <linux/fsnotify.h> 39#include <linux/fsnotify.h>
40#include <linux/highuid.h> 40#include <linux/highuid.h>
41#include <linux/sunrpc/svc.h>
42#include <linux/nfsd/nfsd.h>
43#include <linux/nfsd/syscall.h> 41#include <linux/nfsd/syscall.h>
44#include <linux/personality.h> 42#include <linux/personality.h>
45#include <linux/rwsem.h> 43#include <linux/rwsem.h>
@@ -51,6 +49,7 @@
51#include <linux/mm.h> 49#include <linux/mm.h>
52#include <linux/eventpoll.h> 50#include <linux/eventpoll.h>
53#include <linux/fs_struct.h> 51#include <linux/fs_struct.h>
52#include <linux/slab.h>
54 53
55#include <asm/uaccess.h> 54#include <asm/uaccess.h>
56#include <asm/mmu_context.h> 55#include <asm/mmu_context.h>
@@ -1797,6 +1796,24 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
1797 return ret; 1796 return ret;
1798} 1797}
1799 1798
1799struct compat_sel_arg_struct {
1800 compat_ulong_t n;
1801 compat_uptr_t inp;
1802 compat_uptr_t outp;
1803 compat_uptr_t exp;
1804 compat_uptr_t tvp;
1805};
1806
1807asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg)
1808{
1809 struct compat_sel_arg_struct a;
1810
1811 if (copy_from_user(&a, arg, sizeof(a)))
1812 return -EFAULT;
1813 return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
1814 compat_ptr(a.exp), compat_ptr(a.tvp));
1815}
1816
1800#ifdef HAVE_SET_RESTORE_SIGMASK 1817#ifdef HAVE_SET_RESTORE_SIGMASK
1801static long do_compat_pselect(int n, compat_ulong_t __user *inp, 1818static long do_compat_pselect(int n, compat_ulong_t __user *inp,
1802 compat_ulong_t __user *outp, compat_ulong_t __user *exp, 1819 compat_ulong_t __user *outp, compat_ulong_t __user *exp,
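
The new compat_sys_old_select() services the legacy select(2) ABI, where userspace passes one pointer to a five-slot argument block instead of five separate arguments; the handler copies the block in and widens each 32-bit pointer with compat_ptr(). A small userspace analogue of that calling convention — all names here are illustrative:

#include <stdio.h>

struct sel_arg_block {
        unsigned long n, inp, outp, exp, tvp;   /* five packed slots */
};

static long demo_old_select(const struct sel_arg_block *a)
{
        /* the kernel would unpack each slot and call the select path */
        return (long)a->n;
}

int main(void)
{
        struct sel_arg_block a = { .n = 8 };
        printf("nfds=%ld\n", demo_old_select(&a));
        return 0;
}
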
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 0adced2f296f..112e45a17e99 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -28,10 +28,12 @@
28 28
29#undef elfhdr 29#undef elfhdr
30#undef elf_phdr 30#undef elf_phdr
31#undef elf_shdr
31#undef elf_note 32#undef elf_note
32#undef elf_addr_t 33#undef elf_addr_t
33#define elfhdr elf32_hdr 34#define elfhdr elf32_hdr
34#define elf_phdr elf32_phdr 35#define elf_phdr elf32_phdr
36#define elf_shdr elf32_shdr
35#define elf_note elf32_note 37#define elf_note elf32_note
36#define elf_addr_t Elf32_Addr 38#define elf_addr_t Elf32_Addr
37 39
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index d84e7058c298..c32a1b6a856b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -23,7 +23,6 @@
23#include <linux/ioctl.h> 23#include <linux/ioctl.h>
24#include <linux/if.h> 24#include <linux/if.h>
25#include <linux/if_bridge.h> 25#include <linux/if_bridge.h>
26#include <linux/slab.h>
27#include <linux/raid/md_u.h> 26#include <linux/raid/md_u.h>
28#include <linux/kd.h> 27#include <linux/kd.h>
29#include <linux/route.h> 28#include <linux/route.h>
@@ -60,6 +59,7 @@
60#include <linux/i2c.h> 59#include <linux/i2c.h>
61#include <linux/i2c-dev.h> 60#include <linux/i2c-dev.h>
62#include <linux/atalk.h> 61#include <linux/atalk.h>
62#include <linux/gfp.h>
63 63
64#include <net/bluetooth/bluetooth.h> 64#include <net/bluetooth/bluetooth.h>
65#include <net/bluetooth/hci.h> 65#include <net/bluetooth/hci.h>
@@ -111,43 +111,40 @@
111#include <linux/dvb/frontend.h> 111#include <linux/dvb/frontend.h>
112#include <linux/dvb/video.h> 112#include <linux/dvb/video.h>
113 113
114#include <linux/sort.h>
115
114#ifdef CONFIG_SPARC 116#ifdef CONFIG_SPARC
115#include <asm/fbio.h> 117#include <asm/fbio.h>
116#endif 118#endif
117 119
118static int do_ioctl32_pointer(unsigned int fd, unsigned int cmd, 120static int w_long(unsigned int fd, unsigned int cmd,
119 unsigned long arg, struct file *f) 121 compat_ulong_t __user *argp)
120{
121 return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg));
122}
123
124static int w_long(unsigned int fd, unsigned int cmd, unsigned long arg)
125{ 122{
126 mm_segment_t old_fs = get_fs(); 123 mm_segment_t old_fs = get_fs();
127 int err; 124 int err;
128 unsigned long val; 125 unsigned long val;
129 126
130 set_fs (KERNEL_DS); 127 set_fs (KERNEL_DS);
131 err = sys_ioctl(fd, cmd, (unsigned long)&val); 128 err = sys_ioctl(fd, cmd, (unsigned long)&val);
132 set_fs (old_fs); 129 set_fs (old_fs);
133 if (!err && put_user(val, (u32 __user *)compat_ptr(arg))) 130 if (!err && put_user(val, argp))
134 return -EFAULT; 131 return -EFAULT;
135 return err; 132 return err;
136} 133}
137 134
138static int rw_long(unsigned int fd, unsigned int cmd, unsigned long arg) 135static int rw_long(unsigned int fd, unsigned int cmd,
136 compat_ulong_t __user *argp)
139{ 137{
140 mm_segment_t old_fs = get_fs(); 138 mm_segment_t old_fs = get_fs();
141 u32 __user *argptr = compat_ptr(arg);
142 int err; 139 int err;
143 unsigned long val; 140 unsigned long val;
144 141
145 if(get_user(val, argptr)) 142 if(get_user(val, argp))
146 return -EFAULT; 143 return -EFAULT;
147 set_fs (KERNEL_DS); 144 set_fs (KERNEL_DS);
148 err = sys_ioctl(fd, cmd, (unsigned long)&val); 145 err = sys_ioctl(fd, cmd, (unsigned long)&val);
149 set_fs (old_fs); 146 set_fs (old_fs);
150 if (!err && put_user(val, argptr)) 147 if (!err && put_user(val, argp))
151 return -EFAULT; 148 return -EFAULT;
152 return err; 149 return err;
153} 150}
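
The running theme of this and the following compat_ioctl.c hunks is hoisting the compat_ptr() widening out of each handler and into the dispatcher, so every handler receives a typed __user pointer instead of re-deriving it from a raw unsigned long. A kernel-style sketch of the conversion (illustrative names; not buildable on its own):

        /* before: every handler widened the argument itself */
        static int handler(unsigned int fd, unsigned int cmd,
                           unsigned long arg)
        {
                struct foo32 __user *p = compat_ptr(arg);
                /* ... */
        }

        /* after: the dispatcher calls compat_ptr() once; the prototype
         * now documents what the handler actually expects */
        static int handler(unsigned int fd, unsigned int cmd,
                           struct foo32 __user *p)
        {
                /* ... */
        }
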
@@ -161,7 +158,8 @@ struct compat_video_event {
161 } u; 158 } u;
162}; 159};
163 160
164static int do_video_get_event(unsigned int fd, unsigned int cmd, unsigned long arg) 161static int do_video_get_event(unsigned int fd, unsigned int cmd,
162 struct compat_video_event __user *up)
165{ 163{
166 struct video_event kevent; 164 struct video_event kevent;
167 mm_segment_t old_fs = get_fs(); 165 mm_segment_t old_fs = get_fs();
@@ -172,8 +170,6 @@ static int do_video_get_event(unsigned int fd, unsigned int cmd, unsigned long a
172 set_fs(old_fs); 170 set_fs(old_fs);
173 171
174 if (!err) { 172 if (!err) {
175 struct compat_video_event __user *up = compat_ptr(arg);
176
177 err = put_user(kevent.type, &up->type); 173 err = put_user(kevent.type, &up->type);
178 err |= put_user(kevent.timestamp, &up->timestamp); 174 err |= put_user(kevent.timestamp, &up->timestamp);
179 err |= put_user(kevent.u.size.w, &up->u.size.w); 175 err |= put_user(kevent.u.size.w, &up->u.size.w);
@@ -192,15 +188,14 @@ struct compat_video_still_picture {
192 int32_t size; 188 int32_t size;
193}; 189};
194 190
195static int do_video_stillpicture(unsigned int fd, unsigned int cmd, unsigned long arg) 191static int do_video_stillpicture(unsigned int fd, unsigned int cmd,
192 struct compat_video_still_picture __user *up)
196{ 193{
197 struct compat_video_still_picture __user *up;
198 struct video_still_picture __user *up_native; 194 struct video_still_picture __user *up_native;
199 compat_uptr_t fp; 195 compat_uptr_t fp;
200 int32_t size; 196 int32_t size;
201 int err; 197 int err;
202 198
203 up = (struct compat_video_still_picture __user *) arg;
204 err = get_user(fp, &up->iFrame); 199 err = get_user(fp, &up->iFrame);
205 err |= get_user(size, &up->size); 200 err |= get_user(size, &up->size);
206 if (err) 201 if (err)
@@ -224,14 +219,13 @@ struct compat_video_spu_palette {
224 compat_uptr_t palette; 219 compat_uptr_t palette;
225}; 220};
226 221
227static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, unsigned long arg) 222static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd,
223 struct compat_video_spu_palette __user *up)
228{ 224{
229 struct compat_video_spu_palette __user *up;
230 struct video_spu_palette __user *up_native; 225 struct video_spu_palette __user *up_native;
231 compat_uptr_t palp; 226 compat_uptr_t palp;
232 int length, err; 227 int length, err;
233 228
234 up = (struct compat_video_spu_palette __user *) arg;
235 err = get_user(palp, &up->palette); 229 err = get_user(palp, &up->palette);
236 err |= get_user(length, &up->length); 230 err |= get_user(length, &up->length);
237 231
@@ -246,428 +240,6 @@ static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, unsigned
246 return err; 240 return err;
247} 241}
248 242
249#ifdef CONFIG_NET
250static int do_siocgstamp(unsigned int fd, unsigned int cmd, unsigned long arg)
251{
252 struct compat_timeval __user *up = compat_ptr(arg);
253 struct timeval ktv;
254 mm_segment_t old_fs = get_fs();
255 int err;
256
257 set_fs(KERNEL_DS);
258 err = sys_ioctl(fd, cmd, (unsigned long)&ktv);
259 set_fs(old_fs);
260 if(!err) {
261 err = put_user(ktv.tv_sec, &up->tv_sec);
262 err |= __put_user(ktv.tv_usec, &up->tv_usec);
263 }
264 return err;
265}
266
267static int do_siocgstampns(unsigned int fd, unsigned int cmd, unsigned long arg)
268{
269 struct compat_timespec __user *up = compat_ptr(arg);
270 struct timespec kts;
271 mm_segment_t old_fs = get_fs();
272 int err;
273
274 set_fs(KERNEL_DS);
275 err = sys_ioctl(fd, cmd, (unsigned long)&kts);
276 set_fs(old_fs);
277 if (!err) {
278 err = put_user(kts.tv_sec, &up->tv_sec);
279 err |= __put_user(kts.tv_nsec, &up->tv_nsec);
280 }
281 return err;
282}
283
284struct ifmap32 {
285 compat_ulong_t mem_start;
286 compat_ulong_t mem_end;
287 unsigned short base_addr;
288 unsigned char irq;
289 unsigned char dma;
290 unsigned char port;
291};
292
293struct ifreq32 {
294#define IFHWADDRLEN 6
295#define IFNAMSIZ 16
296 union {
297 char ifrn_name[IFNAMSIZ]; /* if name, e.g. "en0" */
298 } ifr_ifrn;
299 union {
300 struct sockaddr ifru_addr;
301 struct sockaddr ifru_dstaddr;
302 struct sockaddr ifru_broadaddr;
303 struct sockaddr ifru_netmask;
304 struct sockaddr ifru_hwaddr;
305 short ifru_flags;
306 compat_int_t ifru_ivalue;
307 compat_int_t ifru_mtu;
308 struct ifmap32 ifru_map;
309 char ifru_slave[IFNAMSIZ]; /* Just fits the size */
310 char ifru_newname[IFNAMSIZ];
311 compat_caddr_t ifru_data;
312 /* XXXX? ifru_settings should be here */
313 } ifr_ifru;
314};
315
316struct ifconf32 {
317 compat_int_t ifc_len; /* size of buffer */
318 compat_caddr_t ifcbuf;
319};
320
321static int dev_ifname32(unsigned int fd, unsigned int cmd, unsigned long arg)
322{
323 struct ifreq __user *uifr;
324 int err;
325
326 uifr = compat_alloc_user_space(sizeof(struct ifreq));
327 if (copy_in_user(uifr, compat_ptr(arg), sizeof(struct ifreq32)))
328 return -EFAULT;
329
330 err = sys_ioctl(fd, SIOCGIFNAME, (unsigned long)uifr);
331 if (err)
332 return err;
333
334 if (copy_in_user(compat_ptr(arg), uifr, sizeof(struct ifreq32)))
335 return -EFAULT;
336
337 return 0;
338}
339
340static int dev_ifconf(unsigned int fd, unsigned int cmd, unsigned long arg)
341{
342 struct ifconf32 ifc32;
343 struct ifconf ifc;
344 struct ifconf __user *uifc;
345 struct ifreq32 __user *ifr32;
346 struct ifreq __user *ifr;
347 unsigned int i, j;
348 int err;
349
350 if (copy_from_user(&ifc32, compat_ptr(arg), sizeof(struct ifconf32)))
351 return -EFAULT;
352
353 if (ifc32.ifcbuf == 0) {
354 ifc32.ifc_len = 0;
355 ifc.ifc_len = 0;
356 ifc.ifc_req = NULL;
357 uifc = compat_alloc_user_space(sizeof(struct ifconf));
358 } else {
359 size_t len =((ifc32.ifc_len / sizeof (struct ifreq32)) + 1) *
360 sizeof (struct ifreq);
361 uifc = compat_alloc_user_space(sizeof(struct ifconf) + len);
362 ifc.ifc_len = len;
363 ifr = ifc.ifc_req = (void __user *)(uifc + 1);
364 ifr32 = compat_ptr(ifc32.ifcbuf);
365 for (i = 0; i < ifc32.ifc_len; i += sizeof (struct ifreq32)) {
366 if (copy_in_user(ifr, ifr32, sizeof(struct ifreq32)))
367 return -EFAULT;
368 ifr++;
369 ifr32++;
370 }
371 }
372 if (copy_to_user(uifc, &ifc, sizeof(struct ifconf)))
373 return -EFAULT;
374
375 err = sys_ioctl (fd, SIOCGIFCONF, (unsigned long)uifc);
376 if (err)
377 return err;
378
379 if (copy_from_user(&ifc, uifc, sizeof(struct ifconf)))
380 return -EFAULT;
381
382 ifr = ifc.ifc_req;
383 ifr32 = compat_ptr(ifc32.ifcbuf);
384 for (i = 0, j = 0;
385 i + sizeof (struct ifreq32) <= ifc32.ifc_len && j < ifc.ifc_len;
386 i += sizeof (struct ifreq32), j += sizeof (struct ifreq)) {
387 if (copy_in_user(ifr32, ifr, sizeof (struct ifreq32)))
388 return -EFAULT;
389 ifr32++;
390 ifr++;
391 }
392
393 if (ifc32.ifcbuf == 0) {
394 /* Translate from 64-bit structure multiple to
395 * a 32-bit one.
396 */
397 i = ifc.ifc_len;
398 i = ((i / sizeof(struct ifreq)) * sizeof(struct ifreq32));
399 ifc32.ifc_len = i;
400 } else {
401 ifc32.ifc_len = i;
402 }
403 if (copy_to_user(compat_ptr(arg), &ifc32, sizeof(struct ifconf32)))
404 return -EFAULT;
405
406 return 0;
407}
408
409static int ethtool_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
410{
411 struct ifreq __user *ifr;
412 struct ifreq32 __user *ifr32;
413 u32 data;
414 void __user *datap;
415
416 ifr = compat_alloc_user_space(sizeof(*ifr));
417 ifr32 = compat_ptr(arg);
418
419 if (copy_in_user(&ifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ))
420 return -EFAULT;
421
422 if (get_user(data, &ifr32->ifr_ifru.ifru_data))
423 return -EFAULT;
424
425 datap = compat_ptr(data);
426 if (put_user(datap, &ifr->ifr_ifru.ifru_data))
427 return -EFAULT;
428
429 return sys_ioctl(fd, cmd, (unsigned long) ifr);
430}
431
432static int bond_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
433{
434 struct ifreq kifr;
435 struct ifreq __user *uifr;
436 struct ifreq32 __user *ifr32 = compat_ptr(arg);
437 mm_segment_t old_fs;
438 int err;
439 u32 data;
440 void __user *datap;
441
442 switch (cmd) {
443 case SIOCBONDENSLAVE:
444 case SIOCBONDRELEASE:
445 case SIOCBONDSETHWADDR:
446 case SIOCBONDCHANGEACTIVE:
447 if (copy_from_user(&kifr, ifr32, sizeof(struct ifreq32)))
448 return -EFAULT;
449
450 old_fs = get_fs();
451 set_fs (KERNEL_DS);
452 err = sys_ioctl (fd, cmd, (unsigned long)&kifr);
453 set_fs (old_fs);
454
455 return err;
456 case SIOCBONDSLAVEINFOQUERY:
457 case SIOCBONDINFOQUERY:
458 uifr = compat_alloc_user_space(sizeof(*uifr));
459 if (copy_in_user(&uifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ))
460 return -EFAULT;
461
462 if (get_user(data, &ifr32->ifr_ifru.ifru_data))
463 return -EFAULT;
464
465 datap = compat_ptr(data);
466 if (put_user(datap, &uifr->ifr_ifru.ifru_data))
467 return -EFAULT;
468
469 return sys_ioctl (fd, cmd, (unsigned long)uifr);
470 default:
471 return -EINVAL;
472 };
473}
474
475static int siocdevprivate_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
476{
477 struct ifreq __user *u_ifreq64;
478 struct ifreq32 __user *u_ifreq32 = compat_ptr(arg);
479 char tmp_buf[IFNAMSIZ];
480 void __user *data64;
481 u32 data32;
482
483 if (copy_from_user(&tmp_buf[0], &(u_ifreq32->ifr_ifrn.ifrn_name[0]),
484 IFNAMSIZ))
485 return -EFAULT;
486 if (__get_user(data32, &u_ifreq32->ifr_ifru.ifru_data))
487 return -EFAULT;
488 data64 = compat_ptr(data32);
489
490 u_ifreq64 = compat_alloc_user_space(sizeof(*u_ifreq64));
491
492 /* Don't check these user accesses, just let that get trapped
493 * in the ioctl handler instead.
494 */
495 if (copy_to_user(&u_ifreq64->ifr_ifrn.ifrn_name[0], &tmp_buf[0],
496 IFNAMSIZ))
497 return -EFAULT;
498 if (__put_user(data64, &u_ifreq64->ifr_ifru.ifru_data))
499 return -EFAULT;
500
501 return sys_ioctl(fd, cmd, (unsigned long) u_ifreq64);
502}
503
504static int dev_ifsioc(unsigned int fd, unsigned int cmd, unsigned long arg)
505{
506 struct ifreq ifr;
507 struct ifreq32 __user *uifr32;
508 struct ifmap32 __user *uifmap32;
509 mm_segment_t old_fs;
510 int err;
511
512 uifr32 = compat_ptr(arg);
513 uifmap32 = &uifr32->ifr_ifru.ifru_map;
514 switch (cmd) {
515 case SIOCSIFMAP:
516 err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name));
517 err |= __get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start);
518 err |= __get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end);
519 err |= __get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr);
520 err |= __get_user(ifr.ifr_map.irq, &uifmap32->irq);
521 err |= __get_user(ifr.ifr_map.dma, &uifmap32->dma);
522 err |= __get_user(ifr.ifr_map.port, &uifmap32->port);
523 if (err)
524 return -EFAULT;
525 break;
526 case SIOCSHWTSTAMP:
527 if (copy_from_user(&ifr, uifr32, sizeof(*uifr32)))
528 return -EFAULT;
529 ifr.ifr_data = compat_ptr(uifr32->ifr_ifru.ifru_data);
530 break;
531 default:
532 if (copy_from_user(&ifr, uifr32, sizeof(*uifr32)))
533 return -EFAULT;
534 break;
535 }
536 old_fs = get_fs();
537 set_fs (KERNEL_DS);
538 err = sys_ioctl (fd, cmd, (unsigned long)&ifr);
539 set_fs (old_fs);
540 if (!err) {
541 switch (cmd) {
542 /* TUNSETIFF is defined as _IOW, it should be _IORW
543 * as the data is copied back to user space, but that
544 * cannot be fixed without breaking all existing apps.
545 */
546 case TUNSETIFF:
547 case TUNGETIFF:
548 case SIOCGIFFLAGS:
549 case SIOCGIFMETRIC:
550 case SIOCGIFMTU:
551 case SIOCGIFMEM:
552 case SIOCGIFHWADDR:
553 case SIOCGIFINDEX:
554 case SIOCGIFADDR:
555 case SIOCGIFBRDADDR:
556 case SIOCGIFDSTADDR:
557 case SIOCGIFNETMASK:
558 case SIOCGIFTXQLEN:
559 if (copy_to_user(uifr32, &ifr, sizeof(*uifr32)))
560 return -EFAULT;
561 break;
562 case SIOCGIFMAP:
563 err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name));
564 err |= __put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start);
565 err |= __put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end);
566 err |= __put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr);
567 err |= __put_user(ifr.ifr_map.irq, &uifmap32->irq);
568 err |= __put_user(ifr.ifr_map.dma, &uifmap32->dma);
569 err |= __put_user(ifr.ifr_map.port, &uifmap32->port);
570 if (err)
571 err = -EFAULT;
572 break;
573 }
574 }
575 return err;
576}
577
578struct rtentry32 {
579 u32 rt_pad1;
580 struct sockaddr rt_dst; /* target address */
581 struct sockaddr rt_gateway; /* gateway addr (RTF_GATEWAY) */
582 struct sockaddr rt_genmask; /* target network mask (IP) */
583 unsigned short rt_flags;
584 short rt_pad2;
585 u32 rt_pad3;
586 unsigned char rt_tos;
587 unsigned char rt_class;
588 short rt_pad4;
589 short rt_metric; /* +1 for binary compatibility! */
590 /* char * */ u32 rt_dev; /* forcing the device at add */
591 u32 rt_mtu; /* per route MTU/Window */
592 u32 rt_window; /* Window clamping */
593 unsigned short rt_irtt; /* Initial RTT */
594
595};
596
597struct in6_rtmsg32 {
598 struct in6_addr rtmsg_dst;
599 struct in6_addr rtmsg_src;
600 struct in6_addr rtmsg_gateway;
601 u32 rtmsg_type;
602 u16 rtmsg_dst_len;
603 u16 rtmsg_src_len;
604 u32 rtmsg_metric;
605 u32 rtmsg_info;
606 u32 rtmsg_flags;
607 s32 rtmsg_ifindex;
608};
609
610static int routing_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
611{
612 int ret;
613 void *r = NULL;
614 struct in6_rtmsg r6;
615 struct rtentry r4;
616 char devname[16];
617 u32 rtdev;
618 mm_segment_t old_fs = get_fs();
619
620 struct socket *mysock = sockfd_lookup(fd, &ret);
621
622 if (mysock && mysock->sk && mysock->sk->sk_family == AF_INET6) { /* ipv6 */
623 struct in6_rtmsg32 __user *ur6 = compat_ptr(arg);
624 ret = copy_from_user (&r6.rtmsg_dst, &(ur6->rtmsg_dst),
625 3 * sizeof(struct in6_addr));
626 ret |= __get_user (r6.rtmsg_type, &(ur6->rtmsg_type));
627 ret |= __get_user (r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len));
628 ret |= __get_user (r6.rtmsg_src_len, &(ur6->rtmsg_src_len));
629 ret |= __get_user (r6.rtmsg_metric, &(ur6->rtmsg_metric));
630 ret |= __get_user (r6.rtmsg_info, &(ur6->rtmsg_info));
631 ret |= __get_user (r6.rtmsg_flags, &(ur6->rtmsg_flags));
632 ret |= __get_user (r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex));
633
634 r = (void *) &r6;
635 } else { /* ipv4 */
636 struct rtentry32 __user *ur4 = compat_ptr(arg);
637 ret = copy_from_user (&r4.rt_dst, &(ur4->rt_dst),
638 3 * sizeof(struct sockaddr));
639 ret |= __get_user (r4.rt_flags, &(ur4->rt_flags));
640 ret |= __get_user (r4.rt_metric, &(ur4->rt_metric));
641 ret |= __get_user (r4.rt_mtu, &(ur4->rt_mtu));
642 ret |= __get_user (r4.rt_window, &(ur4->rt_window));
643 ret |= __get_user (r4.rt_irtt, &(ur4->rt_irtt));
644 ret |= __get_user (rtdev, &(ur4->rt_dev));
645 if (rtdev) {
646 ret |= copy_from_user (devname, compat_ptr(rtdev), 15);
647 r4.rt_dev = devname; devname[15] = 0;
648 } else
649 r4.rt_dev = NULL;
650
651 r = (void *) &r4;
652 }
653
654 if (ret) {
655 ret = -EFAULT;
656 goto out;
657 }
658
659 set_fs (KERNEL_DS);
660 ret = sys_ioctl (fd, cmd, (unsigned long) r);
661 set_fs (old_fs);
662
663out:
664 if (mysock)
665 sockfd_put(mysock);
666
667 return ret;
668}
669#endif
670
671#ifdef CONFIG_BLOCK 243#ifdef CONFIG_BLOCK
672typedef struct sg_io_hdr32 { 244typedef struct sg_io_hdr32 {
673 compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */ 245 compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */
@@ -721,16 +293,21 @@ static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iov
721 return 0; 293 return 0;
722} 294}
723 295
724static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) 296static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
297 sg_io_hdr32_t __user *sgio32)
725{ 298{
726 sg_io_hdr_t __user *sgio; 299 sg_io_hdr_t __user *sgio;
727 sg_io_hdr32_t __user *sgio32;
728 u16 iovec_count; 300 u16 iovec_count;
729 u32 data; 301 u32 data;
730 void __user *dxferp; 302 void __user *dxferp;
731 int err; 303 int err;
304 int interface_id;
305
306 if (get_user(interface_id, &sgio32->interface_id))
307 return -EFAULT;
308 if (interface_id != 'S')
309 return sys_ioctl(fd, cmd, (unsigned long)sgio32);
732 310
733 sgio32 = compat_ptr(arg);
734 if (get_user(iovec_count, &sgio32->iovec_count)) 311 if (get_user(iovec_count, &sgio32->iovec_count))
735 return -EFAULT; 312 return -EFAULT;
736 313
@@ -820,11 +397,11 @@ struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */
820 int unused; 397 int unused;
821}; 398};
822 399
823static int sg_grt_trans(unsigned int fd, unsigned int cmd, unsigned long arg) 400static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct
401 compat_sg_req_info __user *o)
824{ 402{
825 int err, i; 403 int err, i;
826 sg_req_info_t __user *r; 404 sg_req_info_t __user *r;
827 struct compat_sg_req_info __user *o = (void __user *)arg;
828 r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); 405 r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE);
829 err = sys_ioctl(fd,cmd,(unsigned long)r); 406 err = sys_ioctl(fd,cmd,(unsigned long)r);
830 if (err < 0) 407 if (err < 0)
@@ -852,9 +429,9 @@ struct sock_fprog32 {
852#define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32) 429#define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32)
853#define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32) 430#define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32)
854 431
855static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) 432static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd,
433 struct sock_fprog32 __user *u_fprog32)
856{ 434{
857 struct sock_fprog32 __user *u_fprog32 = compat_ptr(arg);
858 struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog)); 435 struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog));
859 void __user *fptr64; 436 void __user *fptr64;
860 u32 fptr32; 437 u32 fptr32;
@@ -891,15 +468,14 @@ struct ppp_idle32 {
891}; 468};
892#define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32) 469#define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32)
893 470
894static int ppp_gidle(unsigned int fd, unsigned int cmd, unsigned long arg) 471static int ppp_gidle(unsigned int fd, unsigned int cmd,
472 struct ppp_idle32 __user *idle32)
895{ 473{
896 struct ppp_idle __user *idle; 474 struct ppp_idle __user *idle;
897 struct ppp_idle32 __user *idle32;
898 __kernel_time_t xmit, recv; 475 __kernel_time_t xmit, recv;
899 int err; 476 int err;
900 477
901 idle = compat_alloc_user_space(sizeof(*idle)); 478 idle = compat_alloc_user_space(sizeof(*idle));
902 idle32 = compat_ptr(arg);
903 479
904 err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle); 480 err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle);
905 481
@@ -913,15 +489,14 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd, unsigned long arg)
913 return err; 489 return err;
914} 490}
915 491
916static int ppp_scompress(unsigned int fd, unsigned int cmd, unsigned long arg) 492static int ppp_scompress(unsigned int fd, unsigned int cmd,
493 struct ppp_option_data32 __user *odata32)
917{ 494{
918 struct ppp_option_data __user *odata; 495 struct ppp_option_data __user *odata;
919 struct ppp_option_data32 __user *odata32;
920 __u32 data; 496 __u32 data;
921 void __user *datap; 497 void __user *datap;
922 498
923 odata = compat_alloc_user_space(sizeof(*odata)); 499 odata = compat_alloc_user_space(sizeof(*odata));
924 odata32 = compat_ptr(arg);
925 500
926 if (get_user(data, &odata32->ptr)) 501 if (get_user(data, &odata32->ptr))
927 return -EFAULT; 502 return -EFAULT;
@@ -937,35 +512,6 @@ static int ppp_scompress(unsigned int fd, unsigned int cmd, unsigned long arg)
937 return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata); 512 return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata);
938} 513}
939 514
940static int ppp_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
941{
942 int err;
943
944 switch (cmd) {
945 case PPPIOCGIDLE32:
946 err = ppp_gidle(fd, cmd, arg);
947 break;
948
949 case PPPIOCSCOMPRESS32:
950 err = ppp_scompress(fd, cmd, arg);
951 break;
952
953 default:
954 do {
955 static int count;
956 if (++count <= 20)
957 printk("ppp_ioctl: Unknown cmd fd(%d) "
958 "cmd(%08x) arg(%08x)\n",
959 (int)fd, (unsigned int)cmd, (unsigned int)arg);
960 } while(0);
961 err = -EINVAL;
962 break;
963 };
964
965 return err;
966}
967
968
969#ifdef CONFIG_BLOCK 515#ifdef CONFIG_BLOCK
970struct mtget32 { 516struct mtget32 {
971 compat_long_t mt_type; 517 compat_long_t mt_type;
@@ -983,7 +529,7 @@ struct mtpos32 {
983}; 529};
984#define MTIOCPOS32 _IOR('m', 3, struct mtpos32) 530#define MTIOCPOS32 _IOR('m', 3, struct mtpos32)
985 531
986static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) 532static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
987{ 533{
988 mm_segment_t old_fs = get_fs(); 534 mm_segment_t old_fs = get_fs();
989 struct mtget get; 535 struct mtget get;
@@ -999,19 +545,10 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
999 kcmd = MTIOCPOS; 545 kcmd = MTIOCPOS;
1000 karg = &pos; 546 karg = &pos;
1001 break; 547 break;
1002 case MTIOCGET32: 548 default: /* MTIOCGET32 */
1003 kcmd = MTIOCGET; 549 kcmd = MTIOCGET;
1004 karg = &get; 550 karg = &get;
1005 break; 551 break;
1006 default:
1007 do {
1008 static int count;
1009 if (++count <= 20)
1010 printk("mt_ioctl: Unknown cmd fd(%d) "
1011 "cmd(%08x) arg(%08x)\n",
1012 (int)fd, (unsigned int)cmd, (unsigned int)arg);
1013 } while(0);
1014 return -EINVAL;
1015 } 552 }
1016 set_fs (KERNEL_DS); 553 set_fs (KERNEL_DS);
1017 err = sys_ioctl (fd, kcmd, (unsigned long)karg); 554 err = sys_ioctl (fd, kcmd, (unsigned long)karg);
@@ -1020,11 +557,11 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
1020 return err; 557 return err;
1021 switch (cmd) { 558 switch (cmd) {
1022 case MTIOCPOS32: 559 case MTIOCPOS32:
1023 upos32 = compat_ptr(arg); 560 upos32 = argp;
1024 err = __put_user(pos.mt_blkno, &upos32->mt_blkno); 561 err = __put_user(pos.mt_blkno, &upos32->mt_blkno);
1025 break; 562 break;
1026 case MTIOCGET32: 563 case MTIOCGET32:
1027 umget32 = compat_ptr(arg); 564 umget32 = argp;
1028 err = __put_user(get.mt_type, &umget32->mt_type); 565 err = __put_user(get.mt_type, &umget32->mt_type);
1029 err |= __put_user(get.mt_resid, &umget32->mt_resid); 566 err |= __put_user(get.mt_resid, &umget32->mt_resid);
1030 err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg); 567 err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg);
@@ -1039,162 +576,8 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
1039 576
1040#endif /* CONFIG_BLOCK */ 577#endif /* CONFIG_BLOCK */
1041 578
1042#ifdef CONFIG_VT 579static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
1043 580 compat_uid_t __user *argp)
1044static int vt_check(struct file *file)
1045{
1046 struct tty_struct *tty;
1047 struct inode *inode = file->f_path.dentry->d_inode;
1048 struct vc_data *vc;
1049
1050 if (file->f_op->unlocked_ioctl != tty_ioctl)
1051 return -EINVAL;
1052
1053 tty = (struct tty_struct *)file->private_data;
1054 if (tty_paranoia_check(tty, inode, "tty_ioctl"))
1055 return -EINVAL;
1056
1057 if (tty->ops->ioctl != vt_ioctl)
1058 return -EINVAL;
1059
1060 vc = (struct vc_data *)tty->driver_data;
1061 if (!vc_cons_allocated(vc->vc_num)) /* impossible? */
1062 return -ENOIOCTLCMD;
1063
1064 /*
1065 * To have permissions to do most of the vt ioctls, we either have
1066 * to be the owner of the tty, or have CAP_SYS_TTY_CONFIG.
1067 */
1068 if (current->signal->tty == tty || capable(CAP_SYS_TTY_CONFIG))
1069 return 1;
1070 return 0;
1071}
1072
1073struct consolefontdesc32 {
1074 unsigned short charcount; /* characters in font (256 or 512) */
1075 unsigned short charheight; /* scan lines per character (1-32) */
1076 compat_caddr_t chardata; /* font data in expanded form */
1077};
1078
1079static int do_fontx_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, struct file *file)
1080{
1081 struct consolefontdesc32 __user *user_cfd = compat_ptr(arg);
1082 struct console_font_op op;
1083 compat_caddr_t data;
1084 int i, perm;
1085
1086 perm = vt_check(file);
1087 if (perm < 0) return perm;
1088
1089 switch (cmd) {
1090 case PIO_FONTX:
1091 if (!perm)
1092 return -EPERM;
1093 op.op = KD_FONT_OP_SET;
1094 op.flags = 0;
1095 op.width = 8;
1096 if (get_user(op.height, &user_cfd->charheight) ||
1097 get_user(op.charcount, &user_cfd->charcount) ||
1098 get_user(data, &user_cfd->chardata))
1099 return -EFAULT;
1100 op.data = compat_ptr(data);
1101 return con_font_op(vc_cons[fg_console].d, &op);
1102 case GIO_FONTX:
1103 op.op = KD_FONT_OP_GET;
1104 op.flags = 0;
1105 op.width = 8;
1106 if (get_user(op.height, &user_cfd->charheight) ||
1107 get_user(op.charcount, &user_cfd->charcount) ||
1108 get_user(data, &user_cfd->chardata))
1109 return -EFAULT;
1110 if (!data)
1111 return 0;
1112 op.data = compat_ptr(data);
1113 i = con_font_op(vc_cons[fg_console].d, &op);
1114 if (i)
1115 return i;
1116 if (put_user(op.height, &user_cfd->charheight) ||
1117 put_user(op.charcount, &user_cfd->charcount) ||
1118 put_user((compat_caddr_t)(unsigned long)op.data,
1119 &user_cfd->chardata))
1120 return -EFAULT;
1121 return 0;
1122 }
1123 return -EINVAL;
1124}
1125
1126struct console_font_op32 {
1127 compat_uint_t op; /* operation code KD_FONT_OP_* */
1128 compat_uint_t flags; /* KD_FONT_FLAG_* */
1129 compat_uint_t width, height; /* font size */
1130 compat_uint_t charcount;
1131 compat_caddr_t data; /* font data with height fixed to 32 */
1132};
1133
1134static int do_kdfontop_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, struct file *file)
1135{
1136 struct console_font_op op;
1137 struct console_font_op32 __user *fontop = compat_ptr(arg);
1138 int perm = vt_check(file), i;
1139 struct vc_data *vc;
1140
1141 if (perm < 0) return perm;
1142
1143 if (copy_from_user(&op, fontop, sizeof(struct console_font_op32)))
1144 return -EFAULT;
1145 if (!perm && op.op != KD_FONT_OP_GET)
1146 return -EPERM;
1147 op.data = compat_ptr(((struct console_font_op32 *)&op)->data);
1148 op.flags |= KD_FONT_FLAG_OLD;
1149 vc = ((struct tty_struct *)file->private_data)->driver_data;
1150 i = con_font_op(vc, &op);
1151 if (i)
1152 return i;
1153 ((struct console_font_op32 *)&op)->data = (unsigned long)op.data;
1154 if (copy_to_user(fontop, &op, sizeof(struct console_font_op32)))
1155 return -EFAULT;
1156 return 0;
1157}
1158
1159struct unimapdesc32 {
1160 unsigned short entry_ct;
1161 compat_caddr_t entries;
1162};
1163
1164static int do_unimap_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, struct file *file)
1165{
1166 struct unimapdesc32 tmp;
1167 struct unimapdesc32 __user *user_ud = compat_ptr(arg);
1168 int perm = vt_check(file);
1169 struct vc_data *vc;
1170
1171 if (perm < 0)
1172 return perm;
1173 if (copy_from_user(&tmp, user_ud, sizeof tmp))
1174 return -EFAULT;
1175 if (tmp.entries)
1176 if (!access_ok(VERIFY_WRITE, compat_ptr(tmp.entries),
1177 tmp.entry_ct*sizeof(struct unipair)))
1178 return -EFAULT;
1179 vc = ((struct tty_struct *)file->private_data)->driver_data;
1180 switch (cmd) {
1181 case PIO_UNIMAP:
1182 if (!perm)
1183 return -EPERM;
1184 return con_set_unimap(vc, tmp.entry_ct,
1185 compat_ptr(tmp.entries));
1186 case GIO_UNIMAP:
1187 if (!perm && fg_console != vc->vc_num)
1188 return -EPERM;
1189 return con_get_unimap(vc, tmp.entry_ct, &(user_ud->entry_ct),
1190 compat_ptr(tmp.entries));
1191 }
1192 return 0;
1193}
1194
1195#endif /* CONFIG_VT */
1196
1197static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, unsigned long arg)
1198{ 581{
1199 mm_segment_t old_fs = get_fs(); 582 mm_segment_t old_fs = get_fs();
1200 __kernel_uid_t kuid; 583 __kernel_uid_t kuid;
@@ -1207,184 +590,15 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, unsigned long a
1207 set_fs(old_fs); 590 set_fs(old_fs);
1208 591
1209 if (err >= 0) 592 if (err >= 0)
1210 err = put_user(kuid, (compat_uid_t __user *)compat_ptr(arg)); 593 err = put_user(kuid, argp);
1211
1212 return err;
1213}
1214
1215struct atmif_sioc32 {
1216 compat_int_t number;
1217 compat_int_t length;
1218 compat_caddr_t arg;
1219};
1220
1221struct atm_iobuf32 {
1222 compat_int_t length;
1223 compat_caddr_t buffer;
1224};
1225
1226#define ATM_GETLINKRATE32 _IOW('a', ATMIOC_ITF+1, struct atmif_sioc32)
1227#define ATM_GETNAMES32 _IOW('a', ATMIOC_ITF+3, struct atm_iobuf32)
1228#define ATM_GETTYPE32 _IOW('a', ATMIOC_ITF+4, struct atmif_sioc32)
1229#define ATM_GETESI32 _IOW('a', ATMIOC_ITF+5, struct atmif_sioc32)
1230#define ATM_GETADDR32 _IOW('a', ATMIOC_ITF+6, struct atmif_sioc32)
1231#define ATM_RSTADDR32 _IOW('a', ATMIOC_ITF+7, struct atmif_sioc32)
1232#define ATM_ADDADDR32 _IOW('a', ATMIOC_ITF+8, struct atmif_sioc32)
1233#define ATM_DELADDR32 _IOW('a', ATMIOC_ITF+9, struct atmif_sioc32)
1234#define ATM_GETCIRANGE32 _IOW('a', ATMIOC_ITF+10, struct atmif_sioc32)
1235#define ATM_SETCIRANGE32 _IOW('a', ATMIOC_ITF+11, struct atmif_sioc32)
1236#define ATM_SETESI32 _IOW('a', ATMIOC_ITF+12, struct atmif_sioc32)
1237#define ATM_SETESIF32 _IOW('a', ATMIOC_ITF+13, struct atmif_sioc32)
1238#define ATM_GETSTAT32 _IOW('a', ATMIOC_SARCOM+0, struct atmif_sioc32)
1239#define ATM_GETSTATZ32 _IOW('a', ATMIOC_SARCOM+1, struct atmif_sioc32)
1240#define ATM_GETLOOP32 _IOW('a', ATMIOC_SARCOM+2, struct atmif_sioc32)
1241#define ATM_SETLOOP32 _IOW('a', ATMIOC_SARCOM+3, struct atmif_sioc32)
1242#define ATM_QUERYLOOP32 _IOW('a', ATMIOC_SARCOM+4, struct atmif_sioc32)
1243
1244static struct {
1245 unsigned int cmd32;
1246 unsigned int cmd;
1247} atm_ioctl_map[] = {
1248 { ATM_GETLINKRATE32, ATM_GETLINKRATE },
1249 { ATM_GETNAMES32, ATM_GETNAMES },
1250 { ATM_GETTYPE32, ATM_GETTYPE },
1251 { ATM_GETESI32, ATM_GETESI },
1252 { ATM_GETADDR32, ATM_GETADDR },
1253 { ATM_RSTADDR32, ATM_RSTADDR },
1254 { ATM_ADDADDR32, ATM_ADDADDR },
1255 { ATM_DELADDR32, ATM_DELADDR },
1256 { ATM_GETCIRANGE32, ATM_GETCIRANGE },
1257 { ATM_SETCIRANGE32, ATM_SETCIRANGE },
1258 { ATM_SETESI32, ATM_SETESI },
1259 { ATM_SETESIF32, ATM_SETESIF },
1260 { ATM_GETSTAT32, ATM_GETSTAT },
1261 { ATM_GETSTATZ32, ATM_GETSTATZ },
1262 { ATM_GETLOOP32, ATM_GETLOOP },
1263 { ATM_SETLOOP32, ATM_SETLOOP },
1264 { ATM_QUERYLOOP32, ATM_QUERYLOOP }
1265};
1266
1267#define NR_ATM_IOCTL ARRAY_SIZE(atm_ioctl_map)
1268
1269static int do_atm_iobuf(unsigned int fd, unsigned int cmd, unsigned long arg)
1270{
1271 struct atm_iobuf __user *iobuf;
1272 struct atm_iobuf32 __user *iobuf32;
1273 u32 data;
1274 void __user *datap;
1275 int len, err;
1276
1277 iobuf = compat_alloc_user_space(sizeof(*iobuf));
1278 iobuf32 = compat_ptr(arg);
1279
1280 if (get_user(len, &iobuf32->length) ||
1281 get_user(data, &iobuf32->buffer))
1282 return -EFAULT;
1283 datap = compat_ptr(data);
1284 if (put_user(len, &iobuf->length) ||
1285 put_user(datap, &iobuf->buffer))
1286 return -EFAULT;
1287
1288 err = sys_ioctl(fd, cmd, (unsigned long)iobuf);
1289
1290 if (!err) {
1291 if (copy_in_user(&iobuf32->length, &iobuf->length,
1292 sizeof(int)))
1293 err = -EFAULT;
1294 }
1295
1296 return err;
1297}
1298
1299static int do_atmif_sioc(unsigned int fd, unsigned int cmd, unsigned long arg)
1300{
1301 struct atmif_sioc __user *sioc;
1302 struct atmif_sioc32 __user *sioc32;
1303 u32 data;
1304 void __user *datap;
1305 int err;
1306
1307 sioc = compat_alloc_user_space(sizeof(*sioc));
1308 sioc32 = compat_ptr(arg);
1309
1310 if (copy_in_user(&sioc->number, &sioc32->number, 2 * sizeof(int)) ||
1311 get_user(data, &sioc32->arg))
1312 return -EFAULT;
1313 datap = compat_ptr(data);
1314 if (put_user(datap, &sioc->arg))
1315 return -EFAULT;
1316 594
1317 err = sys_ioctl(fd, cmd, (unsigned long) sioc);
1318
1319 if (!err) {
1320 if (copy_in_user(&sioc32->length, &sioc->length,
1321 sizeof(int)))
1322 err = -EFAULT;
1323 }
1324 return err; 595 return err;
1325} 596}
1326 597
1327static int do_atm_ioctl(unsigned int fd, unsigned int cmd32, unsigned long arg) 598static int ioc_settimeout(unsigned int fd, unsigned int cmd,
1328{ 599 compat_ulong_t __user *argp)
1329 int i;
1330 unsigned int cmd = 0;
1331
1332 switch (cmd32) {
1333 case SONET_GETSTAT:
1334 case SONET_GETSTATZ:
1335 case SONET_GETDIAG:
1336 case SONET_SETDIAG:
1337 case SONET_CLRDIAG:
1338 case SONET_SETFRAMING:
1339 case SONET_GETFRAMING:
1340 case SONET_GETFRSENSE:
1341 return do_atmif_sioc(fd, cmd32, arg);
1342 }
1343
1344 for (i = 0; i < NR_ATM_IOCTL; i++) {
1345 if (cmd32 == atm_ioctl_map[i].cmd32) {
1346 cmd = atm_ioctl_map[i].cmd;
1347 break;
1348 }
1349 }
1350 if (i == NR_ATM_IOCTL)
1351 return -EINVAL;
1352
1353 switch (cmd) {
1354 case ATM_GETNAMES:
1355 return do_atm_iobuf(fd, cmd, arg);
1356
1357 case ATM_GETLINKRATE:
1358 case ATM_GETTYPE:
1359 case ATM_GETESI:
1360 case ATM_GETADDR:
1361 case ATM_RSTADDR:
1362 case ATM_ADDADDR:
1363 case ATM_DELADDR:
1364 case ATM_GETCIRANGE:
1365 case ATM_SETCIRANGE:
1366 case ATM_SETESI:
1367 case ATM_SETESIF:
1368 case ATM_GETSTAT:
1369 case ATM_GETSTATZ:
1370 case ATM_GETLOOP:
1371 case ATM_SETLOOP:
1372 case ATM_QUERYLOOP:
1373 return do_atmif_sioc(fd, cmd, arg);
1374 }
1375
1376 return -EINVAL;
1377}
1378
1379static __used int
1380ret_einval(unsigned int fd, unsigned int cmd, unsigned long arg)
1381{
1382 return -EINVAL;
1383}
1384
1385static int ioc_settimeout(unsigned int fd, unsigned int cmd, unsigned long arg)
1386{ 600{
1387 return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, arg); 601 return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, argp);
1388} 602}
1389 603
1390/* Bluetooth ioctls */ 604/* Bluetooth ioctls */
@@ -1442,15 +656,15 @@ static int set_raw32_request(struct raw_config_request *req, struct raw32_config
1442 return ret ? -EFAULT : 0; 656 return ret ? -EFAULT : 0;
1443} 657}
1444 658
1445static int raw_ioctl(unsigned fd, unsigned cmd, unsigned long arg) 659static int raw_ioctl(unsigned fd, unsigned cmd,
660 struct raw32_config_request __user *user_req)
1446{ 661{
1447 int ret; 662 int ret;
1448 663
1449 switch (cmd) { 664 switch (cmd) {
1450 case RAW_SETBIND: 665 case RAW_SETBIND:
1451 case RAW_GETBIND: { 666 default: { /* RAW_GETBIND */
1452 struct raw_config_request req; 667 struct raw_config_request req;
1453 struct raw32_config_request __user *user_req = compat_ptr(arg);
1454 mm_segment_t oldfs = get_fs(); 668 mm_segment_t oldfs = get_fs();
1455 669
1456 if ((ret = get_raw32_request(&req, user_req))) 670 if ((ret = get_raw32_request(&req, user_req)))
@@ -1465,9 +679,6 @@ static int raw_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
1465 } 679 }
1466 break; 680 break;
1467 } 681 }
1468 default:
1469 ret = sys_ioctl(fd, cmd, arg);
1470 break;
1471 } 682 }
1472 return ret; 683 return ret;
1473} 684}
@@ -1495,11 +706,11 @@ struct serial_struct32 {
1495 compat_int_t reserved[1]; 706 compat_int_t reserved[1];
1496}; 707};
1497 708
1498static int serial_struct_ioctl(unsigned fd, unsigned cmd, unsigned long arg) 709static int serial_struct_ioctl(unsigned fd, unsigned cmd,
710 struct serial_struct32 __user *ss32)
1499{ 711{
1500 typedef struct serial_struct SS; 712 typedef struct serial_struct SS;
1501 typedef struct serial_struct32 SS32; 713 typedef struct serial_struct32 SS32;
1502 struct serial_struct32 __user *ss32 = compat_ptr(arg);
1503 int err; 714 int err;
1504 struct serial_struct ss; 715 struct serial_struct ss;
1505 mm_segment_t oldseg = get_fs(); 716 mm_segment_t oldseg = get_fs();
@@ -1537,96 +748,6 @@ static int serial_struct_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
1537 return err; 748 return err;
1538} 749}
1539 750
1540struct usbdevfs_ctrltransfer32 {
1541 u8 bRequestType;
1542 u8 bRequest;
1543 u16 wValue;
1544 u16 wIndex;
1545 u16 wLength;
1546 u32 timeout; /* in milliseconds */
1547 compat_caddr_t data;
1548};
1549
1550#define USBDEVFS_CONTROL32 _IOWR('U', 0, struct usbdevfs_ctrltransfer32)
1551
1552static int do_usbdevfs_control(unsigned int fd, unsigned int cmd, unsigned long arg)
1553{
1554 struct usbdevfs_ctrltransfer32 __user *p32 = compat_ptr(arg);
1555 struct usbdevfs_ctrltransfer __user *p;
1556 __u32 udata;
1557 p = compat_alloc_user_space(sizeof(*p));
1558 if (copy_in_user(p, p32, (sizeof(*p32) - sizeof(compat_caddr_t))) ||
1559 get_user(udata, &p32->data) ||
1560 put_user(compat_ptr(udata), &p->data))
1561 return -EFAULT;
1562 return sys_ioctl(fd, USBDEVFS_CONTROL, (unsigned long)p);
1563}
1564
1565
1566struct usbdevfs_bulktransfer32 {
1567 compat_uint_t ep;
1568 compat_uint_t len;
1569 compat_uint_t timeout; /* in milliseconds */
1570 compat_caddr_t data;
1571};
1572
1573#define USBDEVFS_BULK32 _IOWR('U', 2, struct usbdevfs_bulktransfer32)
1574
1575static int do_usbdevfs_bulk(unsigned int fd, unsigned int cmd, unsigned long arg)
1576{
1577 struct usbdevfs_bulktransfer32 __user *p32 = compat_ptr(arg);
1578 struct usbdevfs_bulktransfer __user *p;
1579 compat_uint_t n;
1580 compat_caddr_t addr;
1581
1582 p = compat_alloc_user_space(sizeof(*p));
1583
1584 if (get_user(n, &p32->ep) || put_user(n, &p->ep) ||
1585 get_user(n, &p32->len) || put_user(n, &p->len) ||
1586 get_user(n, &p32->timeout) || put_user(n, &p->timeout) ||
1587 get_user(addr, &p32->data) || put_user(compat_ptr(addr), &p->data))
1588 return -EFAULT;
1589
1590 return sys_ioctl(fd, USBDEVFS_BULK, (unsigned long)p);
1591}
1592
1593
1594/*
1595 * USBDEVFS_SUBMITURB, USBDEVFS_REAPURB and USBDEVFS_REAPURBNDELAY
1596 * are handled in usbdevfs core. -Christopher Li
1597 */
1598
1599struct usbdevfs_disconnectsignal32 {
1600 compat_int_t signr;
1601 compat_caddr_t context;
1602};
1603
1604#define USBDEVFS_DISCSIGNAL32 _IOR('U', 14, struct usbdevfs_disconnectsignal32)
1605
1606static int do_usbdevfs_discsignal(unsigned int fd, unsigned int cmd, unsigned long arg)
1607{
1608 struct usbdevfs_disconnectsignal kdis;
1609 struct usbdevfs_disconnectsignal32 __user *udis;
1610 mm_segment_t old_fs;
1611 u32 uctx;
1612 int err;
1613
1614 udis = compat_ptr(arg);
1615
1616 if (get_user(kdis.signr, &udis->signr) ||
1617 __get_user(uctx, &udis->context))
1618 return -EFAULT;
1619
1620 kdis.context = compat_ptr(uctx);
1621
1622 old_fs = get_fs();
1623 set_fs(KERNEL_DS);
1624 err = sys_ioctl(fd, USBDEVFS_DISCSIGNAL, (unsigned long) &kdis);
1625 set_fs(old_fs);
1626
1627 return err;
1628}
1629
1630/* 751/*
1631 * I2C layer ioctls 752 * I2C layer ioctls
1632 */ 753 */
@@ -1655,9 +776,9 @@ struct i2c_rdwr_aligned {
1655 struct i2c_msg msgs[0]; 776 struct i2c_msg msgs[0];
1656}; 777};
1657 778
1658static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) 779static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
780 struct i2c_rdwr_ioctl_data32 __user *udata)
1659{ 781{
1660 struct i2c_rdwr_ioctl_data32 __user *udata = compat_ptr(arg);
1661 struct i2c_rdwr_aligned __user *tdata; 782 struct i2c_rdwr_aligned __user *tdata;
1662 struct i2c_msg __user *tmsgs; 783 struct i2c_msg __user *tmsgs;
1663 struct i2c_msg32 __user *umsgs; 784 struct i2c_msg32 __user *umsgs;
@@ -1691,10 +812,10 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, unsigned long ar
1691 return sys_ioctl(fd, cmd, (unsigned long)tdata); 812 return sys_ioctl(fd, cmd, (unsigned long)tdata);
1692} 813}
1693 814
1694static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) 815static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd,
816 struct i2c_smbus_ioctl_data32 __user *udata)
1695{ 817{
1696 struct i2c_smbus_ioctl_data __user *tdata; 818 struct i2c_smbus_ioctl_data __user *tdata;
1697 struct i2c_smbus_ioctl_data32 __user *udata;
1698 compat_caddr_t datap; 819 compat_caddr_t datap;
1699 820
1700 tdata = compat_alloc_user_space(sizeof(*tdata)); 821 tdata = compat_alloc_user_space(sizeof(*tdata));
@@ -1703,7 +824,6 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a
1703 if (!access_ok(VERIFY_WRITE, tdata, sizeof(*tdata))) 824 if (!access_ok(VERIFY_WRITE, tdata, sizeof(*tdata)))
1704 return -EFAULT; 825 return -EFAULT;
1705 826
1706 udata = compat_ptr(arg);
1707 if (!access_ok(VERIFY_READ, udata, sizeof(*udata))) 827 if (!access_ok(VERIFY_READ, udata, sizeof(*udata)))
1708 return -EFAULT; 828 return -EFAULT;
1709 829
@@ -1718,27 +838,12 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a
1718 return sys_ioctl(fd, cmd, (unsigned long)tdata); 838 return sys_ioctl(fd, cmd, (unsigned long)tdata);
1719} 839}
1720 840
1721/* Since old style bridge ioctl's endup using SIOCDEVPRIVATE
1722 * for some operations; this forces use of the newer bridge-utils that
1723 * use compatible ioctls
1724 */
1725static int old_bridge_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
1726{
1727 u32 tmp;
1728
1729 if (get_user(tmp, (u32 __user *) arg))
1730 return -EFAULT;
1731 if (tmp == BRCTL_GET_VERSION)
1732 return BRCTL_VERSION + 1;
1733 return -EINVAL;
1734}
1735
1736#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) 841#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t)
1737#define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t) 842#define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t)
1738#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) 843#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t)
1739#define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t) 844#define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t)
1740 845
1741static int rtc_ioctl(unsigned fd, unsigned cmd, unsigned long arg) 846static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp)
1742{ 847{
1743 mm_segment_t oldfs = get_fs(); 848 mm_segment_t oldfs = get_fs();
1744 compat_ulong_t val32; 849 compat_ulong_t val32;
@@ -1756,29 +861,14 @@ static int rtc_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
1756 if (ret) 861 if (ret)
1757 return ret; 862 return ret;
1758 val32 = kval; 863 val32 = kval;
1759 return put_user(val32, (unsigned int __user *)arg); 864 return put_user(val32, (unsigned int __user *)argp);
1760 case RTC_IRQP_SET32: 865 case RTC_IRQP_SET32:
1761 return sys_ioctl(fd, RTC_IRQP_SET, arg); 866 return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp);
1762 case RTC_EPOCH_SET32: 867 case RTC_EPOCH_SET32:
1763 return sys_ioctl(fd, RTC_EPOCH_SET, arg); 868 return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp);
1764 default:
1765 /* unreached */
1766 return -ENOIOCTLCMD;
1767 } 869 }
1768}
1769 870
1770static int 871 return -ENOIOCTLCMD;
1771lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
1772{
1773 struct compat_timeval __user *tc = (struct compat_timeval __user *)arg;
1774 struct timeval __user *tn = compat_alloc_user_space(sizeof(struct timeval));
1775 struct timeval ts;
1776 if (get_user(ts.tv_sec, &tc->tv_sec) ||
1777 get_user(ts.tv_usec, &tc->tv_usec) ||
1778 put_user(ts.tv_sec, &tn->tv_sec) ||
1779 put_user(ts.tv_usec, &tn->tv_usec))
1780 return -EFAULT;
1781 return sys_ioctl(fd, cmd, (unsigned long)tn);
1782} 872}
1783 873
1784/* on ia32 l_start is on a 32-bit boundary */ 874/* on ia32 l_start is on a 32-bit boundary */
@@ -1798,9 +888,9 @@ struct space_resv_32 {
1798#define FS_IOC_RESVSP64_32 _IOW ('X', 42, struct space_resv_32) 888#define FS_IOC_RESVSP64_32 _IOW ('X', 42, struct space_resv_32)
1799 889
1800/* just account for different alignment */ 890/* just account for different alignment */
1801static int compat_ioctl_preallocate(struct file *file, unsigned long arg) 891static int compat_ioctl_preallocate(struct file *file,
892 struct space_resv_32 __user *p32)
1802{ 893{
1803 struct space_resv_32 __user *p32 = compat_ptr(arg);
1804 struct space_resv __user *p = compat_alloc_user_space(sizeof(*p)); 894 struct space_resv __user *p = compat_alloc_user_space(sizeof(*p));
1805 895
1806 if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || 896 if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) ||
@@ -1816,27 +906,13 @@ static int compat_ioctl_preallocate(struct file *file, unsigned long arg)
1816} 906}
1817#endif 907#endif
1818 908
909/*
910 * simple reversible transform to make our table more evenly
911 * distributed after sorting.
912 */
913#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff)
1819 914
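The transform is not merely reversible, it is an involution: both shifted terms move bits left by at least 17 places, so their composition vanishes in a 32-bit word, and applying XFORM twice yields the original command number. A quick user-space check of the round trip — a hypothetical test harness, not part of the patch:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* same transform as the kernel's XFORM() macro */
static uint32_t xform(uint32_t i)
{
        return (i ^ (i << 27) ^ (i << 17)) & 0xffffffff;
}

int main(void)
{
        uint32_t cmd;

        /* (i << 17) ^ (i << 27) is nilpotent in a 32-bit word, so
         * the transform undoes itself when applied a second time */
        for (cmd = 0; cmd < 0x100000; cmd++)
                assert(xform(xform(cmd)) == cmd);
        printf("XFORM is an involution on the tested range\n");
        return 0;
}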
1820typedef int (*ioctl_trans_handler_t)(unsigned int, unsigned int, 915#define COMPATIBLE_IOCTL(cmd) XFORM(cmd),
1821 unsigned long, struct file *);
1822
1823struct ioctl_trans {
1824 unsigned long cmd;
1825 ioctl_trans_handler_t handler;
1826 struct ioctl_trans *next;
1827};
1828
1829#define HANDLE_IOCTL(cmd,handler) \
1830 { (cmd), (ioctl_trans_handler_t)(handler) },
1831
1832/* pointer to compatible structure or no argument */
1833#define COMPATIBLE_IOCTL(cmd) \
1834 { (cmd), do_ioctl32_pointer },
1835
1836/* argument is an unsigned long integer, not a pointer */
1837#define ULONG_IOCTL(cmd) \
1838 { (cmd), (ioctl_trans_handler_t)sys_ioctl },
1839
1840/* ioctl should not be warned about even if it's not implemented. 916/* ioctl should not be warned about even if it's not implemented.
1841 Valid reasons to use this: 917 Valid reasons to use this:
1842 - It is implemented with ->compat_ioctl on some device, but programs 918 - It is implemented with ->compat_ioctl on some device, but programs
@@ -1846,7 +922,7 @@ struct ioctl_trans {
1846 Most other reasons are not valid. */ 922 Most other reasons are not valid. */
1847#define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd) 923#define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd)
1848 924
1849static struct ioctl_trans ioctl_start[] = { 925static unsigned int ioctl_pointer[] = {
1850/* compatible ioctls first */ 926/* compatible ioctls first */
1851COMPATIBLE_IOCTL(0x4B50) /* KDGHWCLK - not in the kernel, but don't complain */ 927COMPATIBLE_IOCTL(0x4B50) /* KDGHWCLK - not in the kernel, but don't complain */
1852COMPATIBLE_IOCTL(0x4B51) /* KDSHWCLK - not in the kernel, but don't complain */ 928COMPATIBLE_IOCTL(0x4B51) /* KDSHWCLK - not in the kernel, but don't complain */
@@ -1857,7 +933,6 @@ COMPATIBLE_IOCTL(TCSETA)
1857COMPATIBLE_IOCTL(TCSETAW) 933COMPATIBLE_IOCTL(TCSETAW)
1858COMPATIBLE_IOCTL(TCSETAF) 934COMPATIBLE_IOCTL(TCSETAF)
1859COMPATIBLE_IOCTL(TCSBRK) 935COMPATIBLE_IOCTL(TCSBRK)
1860ULONG_IOCTL(TCSBRKP)
1861COMPATIBLE_IOCTL(TCXONC) 936COMPATIBLE_IOCTL(TCXONC)
1862COMPATIBLE_IOCTL(TCFLSH) 937COMPATIBLE_IOCTL(TCFLSH)
1863COMPATIBLE_IOCTL(TCGETS) 938COMPATIBLE_IOCTL(TCGETS)
@@ -1867,7 +942,7 @@ COMPATIBLE_IOCTL(TCSETSF)
1867COMPATIBLE_IOCTL(TIOCLINUX) 942COMPATIBLE_IOCTL(TIOCLINUX)
1868COMPATIBLE_IOCTL(TIOCSBRK) 943COMPATIBLE_IOCTL(TIOCSBRK)
1869COMPATIBLE_IOCTL(TIOCCBRK) 944COMPATIBLE_IOCTL(TIOCCBRK)
1870ULONG_IOCTL(TIOCMIWAIT) 945COMPATIBLE_IOCTL(TIOCGSID)
1871COMPATIBLE_IOCTL(TIOCGICOUNT) 946COMPATIBLE_IOCTL(TIOCGICOUNT)
1872/* Little t */ 947/* Little t */
1873COMPATIBLE_IOCTL(TIOCGETD) 948COMPATIBLE_IOCTL(TIOCGETD)
@@ -1889,7 +964,6 @@ COMPATIBLE_IOCTL(TIOCSTI)
1889COMPATIBLE_IOCTL(TIOCOUTQ) 964COMPATIBLE_IOCTL(TIOCOUTQ)
1890COMPATIBLE_IOCTL(TIOCSPGRP) 965COMPATIBLE_IOCTL(TIOCSPGRP)
1891COMPATIBLE_IOCTL(TIOCGPGRP) 966COMPATIBLE_IOCTL(TIOCGPGRP)
1892ULONG_IOCTL(TIOCSCTTY)
1893COMPATIBLE_IOCTL(TIOCGPTN) 967COMPATIBLE_IOCTL(TIOCGPTN)
1894COMPATIBLE_IOCTL(TIOCSPTLCK) 968COMPATIBLE_IOCTL(TIOCSPTLCK)
1895COMPATIBLE_IOCTL(TIOCSERGETLSR) 969COMPATIBLE_IOCTL(TIOCSERGETLSR)
@@ -1912,44 +986,11 @@ COMPATIBLE_IOCTL(FIGETBSZ)
1912/* 'X' - originally XFS but some now in the VFS */ 986/* 'X' - originally XFS but some now in the VFS */
1913COMPATIBLE_IOCTL(FIFREEZE) 987COMPATIBLE_IOCTL(FIFREEZE)
1914COMPATIBLE_IOCTL(FITHAW) 988COMPATIBLE_IOCTL(FITHAW)
1915/* RAID */
1916COMPATIBLE_IOCTL(RAID_VERSION)
1917COMPATIBLE_IOCTL(GET_ARRAY_INFO)
1918COMPATIBLE_IOCTL(GET_DISK_INFO)
1919COMPATIBLE_IOCTL(PRINT_RAID_DEBUG)
1920COMPATIBLE_IOCTL(RAID_AUTORUN)
1921COMPATIBLE_IOCTL(CLEAR_ARRAY)
1922COMPATIBLE_IOCTL(ADD_NEW_DISK)
1923ULONG_IOCTL(HOT_REMOVE_DISK)
1924COMPATIBLE_IOCTL(SET_ARRAY_INFO)
1925COMPATIBLE_IOCTL(SET_DISK_INFO)
1926COMPATIBLE_IOCTL(WRITE_RAID_INFO)
1927COMPATIBLE_IOCTL(UNPROTECT_ARRAY)
1928COMPATIBLE_IOCTL(PROTECT_ARRAY)
1929ULONG_IOCTL(HOT_ADD_DISK)
1930ULONG_IOCTL(SET_DISK_FAULTY)
1931COMPATIBLE_IOCTL(RUN_ARRAY)
1932COMPATIBLE_IOCTL(STOP_ARRAY)
1933COMPATIBLE_IOCTL(STOP_ARRAY_RO)
1934COMPATIBLE_IOCTL(RESTART_ARRAY_RW)
1935COMPATIBLE_IOCTL(GET_BITMAP_FILE)
1936ULONG_IOCTL(SET_BITMAP_FILE)
1937/* Big K */
1938COMPATIBLE_IOCTL(PIO_FONT)
1939COMPATIBLE_IOCTL(GIO_FONT)
1940COMPATIBLE_IOCTL(PIO_CMAP)
1941COMPATIBLE_IOCTL(GIO_CMAP)
1942ULONG_IOCTL(KDSIGACCEPT)
1943COMPATIBLE_IOCTL(KDGETKEYCODE) 989COMPATIBLE_IOCTL(KDGETKEYCODE)
1944COMPATIBLE_IOCTL(KDSETKEYCODE) 990COMPATIBLE_IOCTL(KDSETKEYCODE)
1945ULONG_IOCTL(KIOCSOUND)
1946ULONG_IOCTL(KDMKTONE)
1947COMPATIBLE_IOCTL(KDGKBTYPE) 991COMPATIBLE_IOCTL(KDGKBTYPE)
1948ULONG_IOCTL(KDSETMODE)
1949COMPATIBLE_IOCTL(KDGETMODE) 992COMPATIBLE_IOCTL(KDGETMODE)
1950ULONG_IOCTL(KDSKBMODE)
1951COMPATIBLE_IOCTL(KDGKBMODE) 993COMPATIBLE_IOCTL(KDGKBMODE)
1952ULONG_IOCTL(KDSKBMETA)
1953COMPATIBLE_IOCTL(KDGKBMETA) 994COMPATIBLE_IOCTL(KDGKBMETA)
1954COMPATIBLE_IOCTL(KDGKBENT) 995COMPATIBLE_IOCTL(KDGKBENT)
1955COMPATIBLE_IOCTL(KDSKBENT) 996COMPATIBLE_IOCTL(KDSKBENT)
@@ -1959,15 +1000,7 @@ COMPATIBLE_IOCTL(KDGKBDIACR)
1959COMPATIBLE_IOCTL(KDSKBDIACR) 1000COMPATIBLE_IOCTL(KDSKBDIACR)
1960COMPATIBLE_IOCTL(KDKBDREP) 1001COMPATIBLE_IOCTL(KDKBDREP)
1961COMPATIBLE_IOCTL(KDGKBLED) 1002COMPATIBLE_IOCTL(KDGKBLED)
1962ULONG_IOCTL(KDSKBLED)
1963COMPATIBLE_IOCTL(KDGETLED) 1003COMPATIBLE_IOCTL(KDGETLED)
1964ULONG_IOCTL(KDSETLED)
1965COMPATIBLE_IOCTL(GIO_SCRNMAP)
1966COMPATIBLE_IOCTL(PIO_SCRNMAP)
1967COMPATIBLE_IOCTL(GIO_UNISCRNMAP)
1968COMPATIBLE_IOCTL(PIO_UNISCRNMAP)
1969COMPATIBLE_IOCTL(PIO_FONTRESET)
1970COMPATIBLE_IOCTL(PIO_UNIMAPCLR)
1971#ifdef CONFIG_BLOCK 1004#ifdef CONFIG_BLOCK
1972/* Big S */ 1005/* Big S */
1973COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) 1006COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN)
@@ -1979,32 +1012,9 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND)
1979COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) 1012COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST)
1980COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) 1013COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI)
1981#endif 1014#endif
1982/* Big T */ 1015/* Big V (don't complain on serial console) */
1983COMPATIBLE_IOCTL(TUNSETNOCSUM) 1016IGNORE_IOCTL(VT_OPENQRY)
1984COMPATIBLE_IOCTL(TUNSETDEBUG) 1017IGNORE_IOCTL(VT_GETMODE)
1985COMPATIBLE_IOCTL(TUNSETPERSIST)
1986COMPATIBLE_IOCTL(TUNSETOWNER)
1987COMPATIBLE_IOCTL(TUNSETLINK)
1988COMPATIBLE_IOCTL(TUNSETGROUP)
1989COMPATIBLE_IOCTL(TUNGETFEATURES)
1990COMPATIBLE_IOCTL(TUNSETOFFLOAD)
1991COMPATIBLE_IOCTL(TUNSETTXFILTER)
1992COMPATIBLE_IOCTL(TUNGETSNDBUF)
1993COMPATIBLE_IOCTL(TUNSETSNDBUF)
1994/* Big V */
1995COMPATIBLE_IOCTL(VT_SETMODE)
1996COMPATIBLE_IOCTL(VT_GETMODE)
1997COMPATIBLE_IOCTL(VT_GETSTATE)
1998COMPATIBLE_IOCTL(VT_OPENQRY)
1999ULONG_IOCTL(VT_ACTIVATE)
2000ULONG_IOCTL(VT_WAITACTIVE)
2001ULONG_IOCTL(VT_RELDISP)
2002ULONG_IOCTL(VT_DISALLOCATE)
2003COMPATIBLE_IOCTL(VT_RESIZE)
2004COMPATIBLE_IOCTL(VT_RESIZEX)
2005COMPATIBLE_IOCTL(VT_LOCKSWITCH)
2006COMPATIBLE_IOCTL(VT_UNLOCKSWITCH)
2007COMPATIBLE_IOCTL(VT_GETHIFONTMASK)
2008/* Little p (/dev/rtc, /dev/envctrl, etc.) */ 1018/* Little p (/dev/rtc, /dev/envctrl, etc.) */
2009COMPATIBLE_IOCTL(RTC_AIE_ON) 1019COMPATIBLE_IOCTL(RTC_AIE_ON)
2010COMPATIBLE_IOCTL(RTC_AIE_OFF) 1020COMPATIBLE_IOCTL(RTC_AIE_OFF)
@@ -2032,36 +1042,15 @@ COMPATIBLE_IOCTL(_IOW('p', 21, int[7])) /* RTCSET */
2032COMPATIBLE_IOCTL(MTIOCTOP) 1042COMPATIBLE_IOCTL(MTIOCTOP)
2033/* Socket level stuff */ 1043/* Socket level stuff */
2034COMPATIBLE_IOCTL(FIOQSIZE) 1044COMPATIBLE_IOCTL(FIOQSIZE)
2035COMPATIBLE_IOCTL(FIOSETOWN)
2036COMPATIBLE_IOCTL(SIOCSPGRP)
2037COMPATIBLE_IOCTL(FIOGETOWN)
2038COMPATIBLE_IOCTL(SIOCGPGRP)
2039COMPATIBLE_IOCTL(SIOCATMARK)
2040COMPATIBLE_IOCTL(SIOCSIFLINK)
2041COMPATIBLE_IOCTL(SIOCSIFENCAP)
2042COMPATIBLE_IOCTL(SIOCGIFENCAP)
2043COMPATIBLE_IOCTL(SIOCSIFNAME)
2044COMPATIBLE_IOCTL(SIOCSARP)
2045COMPATIBLE_IOCTL(SIOCGARP)
2046COMPATIBLE_IOCTL(SIOCDARP)
2047COMPATIBLE_IOCTL(SIOCSRARP)
2048COMPATIBLE_IOCTL(SIOCGRARP)
2049COMPATIBLE_IOCTL(SIOCDRARP)
2050COMPATIBLE_IOCTL(SIOCADDDLCI)
2051COMPATIBLE_IOCTL(SIOCDELDLCI)
2052COMPATIBLE_IOCTL(SIOCGMIIPHY)
2053COMPATIBLE_IOCTL(SIOCGMIIREG)
2054COMPATIBLE_IOCTL(SIOCSMIIREG)
2055COMPATIBLE_IOCTL(SIOCGIFVLAN)
2056COMPATIBLE_IOCTL(SIOCSIFVLAN)
2057COMPATIBLE_IOCTL(SIOCBRADDBR)
2058COMPATIBLE_IOCTL(SIOCBRDELBR)
2059#ifdef CONFIG_BLOCK 1045#ifdef CONFIG_BLOCK
1046/* loop */
1047IGNORE_IOCTL(LOOP_CLR_FD)
1048/* md calls this on random blockdevs */
1049IGNORE_IOCTL(RAID_VERSION)
2060/* SG stuff */ 1050/* SG stuff */
2061COMPATIBLE_IOCTL(SG_SET_TIMEOUT) 1051COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
2062COMPATIBLE_IOCTL(SG_GET_TIMEOUT) 1052COMPATIBLE_IOCTL(SG_GET_TIMEOUT)
2063COMPATIBLE_IOCTL(SG_EMULATED_HOST) 1053COMPATIBLE_IOCTL(SG_EMULATED_HOST)
2064ULONG_IOCTL(SG_SET_TRANSFORM)
2065COMPATIBLE_IOCTL(SG_GET_TRANSFORM) 1054COMPATIBLE_IOCTL(SG_GET_TRANSFORM)
2066COMPATIBLE_IOCTL(SG_SET_RESERVED_SIZE) 1055COMPATIBLE_IOCTL(SG_SET_RESERVED_SIZE)
2067COMPATIBLE_IOCTL(SG_GET_RESERVED_SIZE) 1056COMPATIBLE_IOCTL(SG_GET_RESERVED_SIZE)
@@ -2115,8 +1104,6 @@ COMPATIBLE_IOCTL(PPPIOCGCHAN)
2115/* PPPOX */ 1104/* PPPOX */
2116COMPATIBLE_IOCTL(PPPOEIOCSFWD) 1105COMPATIBLE_IOCTL(PPPOEIOCSFWD)
2117COMPATIBLE_IOCTL(PPPOEIOCDFWD) 1106COMPATIBLE_IOCTL(PPPOEIOCDFWD)
2118/* LP */
2119COMPATIBLE_IOCTL(LPGETSTATUS)
2120/* ppdev */ 1107/* ppdev */
2121COMPATIBLE_IOCTL(PPSETMODE) 1108COMPATIBLE_IOCTL(PPSETMODE)
2122COMPATIBLE_IOCTL(PPRSTATUS) 1109COMPATIBLE_IOCTL(PPRSTATUS)
@@ -2298,8 +1285,6 @@ COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS)
2298COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS) 1285COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS)
2299COMPATIBLE_IOCTL(OSS_GETVERSION) 1286COMPATIBLE_IOCTL(OSS_GETVERSION)
2300/* AUTOFS */ 1287/* AUTOFS */
2301ULONG_IOCTL(AUTOFS_IOC_READY)
2302ULONG_IOCTL(AUTOFS_IOC_FAIL)
2303COMPATIBLE_IOCTL(AUTOFS_IOC_CATATONIC) 1288COMPATIBLE_IOCTL(AUTOFS_IOC_CATATONIC)
2304COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER) 1289COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER)
2305COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE) 1290COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE)
@@ -2311,22 +1296,6 @@ COMPATIBLE_IOCTL(RAW_SETBIND)
2311COMPATIBLE_IOCTL(RAW_GETBIND) 1296COMPATIBLE_IOCTL(RAW_GETBIND)
2312/* SMB ioctls which do not need any translations */ 1297/* SMB ioctls which do not need any translations */
2313COMPATIBLE_IOCTL(SMB_IOC_NEWCONN) 1298COMPATIBLE_IOCTL(SMB_IOC_NEWCONN)
2314/* Little a */
2315COMPATIBLE_IOCTL(ATMSIGD_CTRL)
2316COMPATIBLE_IOCTL(ATMARPD_CTRL)
2317COMPATIBLE_IOCTL(ATMLEC_CTRL)
2318COMPATIBLE_IOCTL(ATMLEC_MCAST)
2319COMPATIBLE_IOCTL(ATMLEC_DATA)
2320COMPATIBLE_IOCTL(ATM_SETSC)
2321COMPATIBLE_IOCTL(SIOCSIFATMTCP)
2322COMPATIBLE_IOCTL(SIOCMKCLIP)
2323COMPATIBLE_IOCTL(ATMARP_MKIP)
2324COMPATIBLE_IOCTL(ATMARP_SETENTRY)
2325COMPATIBLE_IOCTL(ATMARP_ENCAP)
2326COMPATIBLE_IOCTL(ATMTCP_CREATE)
2327COMPATIBLE_IOCTL(ATMTCP_REMOVE)
2328COMPATIBLE_IOCTL(ATMMPC_CTRL)
2329COMPATIBLE_IOCTL(ATMMPC_DATA)
2330/* Watchdog */ 1299/* Watchdog */
2331COMPATIBLE_IOCTL(WDIOC_GETSUPPORT) 1300COMPATIBLE_IOCTL(WDIOC_GETSUPPORT)
2332COMPATIBLE_IOCTL(WDIOC_GETSTATUS) 1301COMPATIBLE_IOCTL(WDIOC_GETSTATUS)
@@ -2408,30 +1377,11 @@ COMPATIBLE_IOCTL(PCIIOC_CONTROLLER)
2408COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO) 1377COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO)
2409COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM) 1378COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM)
2410COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE) 1379COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE)
2411/* USB */
2412COMPATIBLE_IOCTL(USBDEVFS_RESETEP)
2413COMPATIBLE_IOCTL(USBDEVFS_SETINTERFACE)
2414COMPATIBLE_IOCTL(USBDEVFS_SETCONFIGURATION)
2415COMPATIBLE_IOCTL(USBDEVFS_GETDRIVER)
2416COMPATIBLE_IOCTL(USBDEVFS_DISCARDURB)
2417COMPATIBLE_IOCTL(USBDEVFS_CLAIMINTERFACE)
2418COMPATIBLE_IOCTL(USBDEVFS_RELEASEINTERFACE)
2419COMPATIBLE_IOCTL(USBDEVFS_CONNECTINFO)
2420COMPATIBLE_IOCTL(USBDEVFS_HUB_PORTINFO)
2421COMPATIBLE_IOCTL(USBDEVFS_RESET)
2422COMPATIBLE_IOCTL(USBDEVFS_SUBMITURB32)
2423COMPATIBLE_IOCTL(USBDEVFS_REAPURB32)
2424COMPATIBLE_IOCTL(USBDEVFS_REAPURBNDELAY32)
2425COMPATIBLE_IOCTL(USBDEVFS_CLEAR_HALT)
2426/* NBD */ 1380/* NBD */
2427ULONG_IOCTL(NBD_SET_SOCK)
2428ULONG_IOCTL(NBD_SET_BLKSIZE)
2429ULONG_IOCTL(NBD_SET_SIZE)
2430COMPATIBLE_IOCTL(NBD_DO_IT) 1381COMPATIBLE_IOCTL(NBD_DO_IT)
2431COMPATIBLE_IOCTL(NBD_CLEAR_SOCK) 1382COMPATIBLE_IOCTL(NBD_CLEAR_SOCK)
2432COMPATIBLE_IOCTL(NBD_CLEAR_QUE) 1383COMPATIBLE_IOCTL(NBD_CLEAR_QUE)
2433COMPATIBLE_IOCTL(NBD_PRINT_DEBUG) 1384COMPATIBLE_IOCTL(NBD_PRINT_DEBUG)
2434ULONG_IOCTL(NBD_SET_SIZE_BLOCKS)
2435COMPATIBLE_IOCTL(NBD_DISCONNECT) 1385COMPATIBLE_IOCTL(NBD_DISCONNECT)
2436/* i2c */ 1386/* i2c */
2437COMPATIBLE_IOCTL(I2C_SLAVE) 1387COMPATIBLE_IOCTL(I2C_SLAVE)
@@ -2531,131 +1481,13 @@ COMPATIBLE_IOCTL(JSIOCGAXES)
2531COMPATIBLE_IOCTL(JSIOCGBUTTONS) 1481COMPATIBLE_IOCTL(JSIOCGBUTTONS)
2532COMPATIBLE_IOCTL(JSIOCGNAME(0)) 1482COMPATIBLE_IOCTL(JSIOCGNAME(0))
2533 1483
2534/* now things that need handlers */
2535#ifdef CONFIG_NET
2536HANDLE_IOCTL(SIOCGIFNAME, dev_ifname32)
2537HANDLE_IOCTL(SIOCGIFCONF, dev_ifconf)
2538HANDLE_IOCTL(SIOCGIFFLAGS, dev_ifsioc)
2539HANDLE_IOCTL(SIOCSIFFLAGS, dev_ifsioc)
2540HANDLE_IOCTL(SIOCGIFMETRIC, dev_ifsioc)
2541HANDLE_IOCTL(SIOCSIFMETRIC, dev_ifsioc)
2542HANDLE_IOCTL(SIOCGIFMTU, dev_ifsioc)
2543HANDLE_IOCTL(SIOCSIFMTU, dev_ifsioc)
2544HANDLE_IOCTL(SIOCGIFMEM, dev_ifsioc)
2545HANDLE_IOCTL(SIOCSIFMEM, dev_ifsioc)
2546HANDLE_IOCTL(SIOCGIFHWADDR, dev_ifsioc)
2547HANDLE_IOCTL(SIOCSIFHWADDR, dev_ifsioc)
2548HANDLE_IOCTL(SIOCADDMULTI, dev_ifsioc)
2549HANDLE_IOCTL(SIOCDELMULTI, dev_ifsioc)
2550HANDLE_IOCTL(SIOCGIFINDEX, dev_ifsioc)
2551HANDLE_IOCTL(SIOCGIFMAP, dev_ifsioc)
2552HANDLE_IOCTL(SIOCSIFMAP, dev_ifsioc)
2553HANDLE_IOCTL(SIOCGIFADDR, dev_ifsioc)
2554HANDLE_IOCTL(SIOCSIFADDR, dev_ifsioc)
2555HANDLE_IOCTL(SIOCSIFHWBROADCAST, dev_ifsioc)
2556HANDLE_IOCTL(SIOCSHWTSTAMP, dev_ifsioc)
2557
2558/* ioctls used by appletalk ddp.c */
2559HANDLE_IOCTL(SIOCATALKDIFADDR, dev_ifsioc)
2560HANDLE_IOCTL(SIOCDIFADDR, dev_ifsioc)
2561HANDLE_IOCTL(SIOCSARP, dev_ifsioc)
2562HANDLE_IOCTL(SIOCDARP, dev_ifsioc)
2563
2564HANDLE_IOCTL(SIOCGIFBRDADDR, dev_ifsioc)
2565HANDLE_IOCTL(SIOCSIFBRDADDR, dev_ifsioc)
2566HANDLE_IOCTL(SIOCGIFDSTADDR, dev_ifsioc)
2567HANDLE_IOCTL(SIOCSIFDSTADDR, dev_ifsioc)
2568HANDLE_IOCTL(SIOCGIFNETMASK, dev_ifsioc)
2569HANDLE_IOCTL(SIOCSIFNETMASK, dev_ifsioc)
2570HANDLE_IOCTL(SIOCSIFPFLAGS, dev_ifsioc)
2571HANDLE_IOCTL(SIOCGIFPFLAGS, dev_ifsioc)
2572HANDLE_IOCTL(SIOCGIFTXQLEN, dev_ifsioc)
2573HANDLE_IOCTL(SIOCSIFTXQLEN, dev_ifsioc)
2574HANDLE_IOCTL(TUNSETIFF, dev_ifsioc)
2575HANDLE_IOCTL(TUNGETIFF, dev_ifsioc)
2576HANDLE_IOCTL(SIOCETHTOOL, ethtool_ioctl)
2577HANDLE_IOCTL(SIOCBONDENSLAVE, bond_ioctl)
2578HANDLE_IOCTL(SIOCBONDRELEASE, bond_ioctl)
2579HANDLE_IOCTL(SIOCBONDSETHWADDR, bond_ioctl)
2580HANDLE_IOCTL(SIOCBONDSLAVEINFOQUERY, bond_ioctl)
2581HANDLE_IOCTL(SIOCBONDINFOQUERY, bond_ioctl)
2582HANDLE_IOCTL(SIOCBONDCHANGEACTIVE, bond_ioctl)
2583HANDLE_IOCTL(SIOCADDRT, routing_ioctl)
2584HANDLE_IOCTL(SIOCDELRT, routing_ioctl)
2585HANDLE_IOCTL(SIOCBRADDIF, dev_ifsioc)
2586HANDLE_IOCTL(SIOCBRDELIF, dev_ifsioc)
2587/* Note SIOCRTMSG is no longer implemented, so this is safe and the user would have seen just an -EINVAL anyway. */
2588HANDLE_IOCTL(SIOCRTMSG, ret_einval)
2589HANDLE_IOCTL(SIOCGSTAMP, do_siocgstamp)
2590HANDLE_IOCTL(SIOCGSTAMPNS, do_siocgstampns)
2591#endif
2592#ifdef CONFIG_BLOCK
2593HANDLE_IOCTL(SG_IO,sg_ioctl_trans)
2594HANDLE_IOCTL(SG_GET_REQUEST_TABLE, sg_grt_trans)
2595#endif
2596HANDLE_IOCTL(PPPIOCGIDLE32, ppp_ioctl_trans)
2597HANDLE_IOCTL(PPPIOCSCOMPRESS32, ppp_ioctl_trans)
2598HANDLE_IOCTL(PPPIOCSPASS32, ppp_sock_fprog_ioctl_trans)
2599HANDLE_IOCTL(PPPIOCSACTIVE32, ppp_sock_fprog_ioctl_trans)
2600#ifdef CONFIG_BLOCK
2601HANDLE_IOCTL(MTIOCGET32, mt_ioctl_trans)
2602HANDLE_IOCTL(MTIOCPOS32, mt_ioctl_trans)
2603#endif
2604#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int)
2605HANDLE_IOCTL(AUTOFS_IOC_SETTIMEOUT32, ioc_settimeout)
2606#ifdef CONFIG_VT
2607HANDLE_IOCTL(PIO_FONTX, do_fontx_ioctl)
2608HANDLE_IOCTL(GIO_FONTX, do_fontx_ioctl)
2609HANDLE_IOCTL(PIO_UNIMAP, do_unimap_ioctl)
2610HANDLE_IOCTL(GIO_UNIMAP, do_unimap_ioctl)
2611HANDLE_IOCTL(KDFONTOP, do_kdfontop_ioctl)
2612#endif
2613/* One SMB ioctl needs translations. */
2614#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
2615HANDLE_IOCTL(SMB_IOC_GETMOUNTUID_32, do_smb_getmountuid)
2616HANDLE_IOCTL(ATM_GETLINKRATE32, do_atm_ioctl)
2617HANDLE_IOCTL(ATM_GETNAMES32, do_atm_ioctl)
2618HANDLE_IOCTL(ATM_GETTYPE32, do_atm_ioctl)
2619HANDLE_IOCTL(ATM_GETESI32, do_atm_ioctl)
2620HANDLE_IOCTL(ATM_GETADDR32, do_atm_ioctl)
2621HANDLE_IOCTL(ATM_RSTADDR32, do_atm_ioctl)
2622HANDLE_IOCTL(ATM_ADDADDR32, do_atm_ioctl)
2623HANDLE_IOCTL(ATM_DELADDR32, do_atm_ioctl)
2624HANDLE_IOCTL(ATM_GETCIRANGE32, do_atm_ioctl)
2625HANDLE_IOCTL(ATM_SETCIRANGE32, do_atm_ioctl)
2626HANDLE_IOCTL(ATM_SETESI32, do_atm_ioctl)
2627HANDLE_IOCTL(ATM_SETESIF32, do_atm_ioctl)
2628HANDLE_IOCTL(ATM_GETSTAT32, do_atm_ioctl)
2629HANDLE_IOCTL(ATM_GETSTATZ32, do_atm_ioctl)
2630HANDLE_IOCTL(ATM_GETLOOP32, do_atm_ioctl)
2631HANDLE_IOCTL(ATM_SETLOOP32, do_atm_ioctl)
2632HANDLE_IOCTL(ATM_QUERYLOOP32, do_atm_ioctl)
2633HANDLE_IOCTL(SONET_GETSTAT, do_atm_ioctl)
2634HANDLE_IOCTL(SONET_GETSTATZ, do_atm_ioctl)
2635HANDLE_IOCTL(SONET_GETDIAG, do_atm_ioctl)
2636HANDLE_IOCTL(SONET_SETDIAG, do_atm_ioctl)
2637HANDLE_IOCTL(SONET_CLRDIAG, do_atm_ioctl)
2638HANDLE_IOCTL(SONET_SETFRAMING, do_atm_ioctl)
2639HANDLE_IOCTL(SONET_GETFRAMING, do_atm_ioctl)
2640HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl)
2641/* block stuff */
2642#ifdef CONFIG_BLOCK
2643/* loop */
2644IGNORE_IOCTL(LOOP_CLR_FD)
2645/* Raw devices */
2646HANDLE_IOCTL(RAW_SETBIND, raw_ioctl)
2647HANDLE_IOCTL(RAW_GETBIND, raw_ioctl)
2648#endif
2649/* Serial */
2650HANDLE_IOCTL(TIOCGSERIAL, serial_struct_ioctl)
2651HANDLE_IOCTL(TIOCSSERIAL, serial_struct_ioctl)
2652#ifdef TIOCGLTC 1484#ifdef TIOCGLTC
2653COMPATIBLE_IOCTL(TIOCGLTC) 1485COMPATIBLE_IOCTL(TIOCGLTC)
2654COMPATIBLE_IOCTL(TIOCSLTC) 1486COMPATIBLE_IOCTL(TIOCSLTC)
2655#endif 1487#endif
2656#ifdef TIOCSTART 1488#ifdef TIOCSTART
2657/* 1489/*
2658 * For these two we have defintions in ioctls.h and/or termios.h on 1490 * For these two we have definitions in ioctls.h and/or termios.h on
2659 * some architectures but no actual implementation. Some applications 1491 * some architectures but no actual implementation. Some applications
2660 * like bash call them if they are defined in the headers, so we provide 1492 * like bash call them if they are defined in the headers, so we provide
2661 * entries here to avoid syslog message spew. 1493 * entries here to avoid syslog message spew.
@@ -2663,43 +1495,6 @@ COMPATIBLE_IOCTL(TIOCSLTC)
2663COMPATIBLE_IOCTL(TIOCSTART) 1495COMPATIBLE_IOCTL(TIOCSTART)
2664COMPATIBLE_IOCTL(TIOCSTOP) 1496COMPATIBLE_IOCTL(TIOCSTOP)
2665#endif 1497#endif
2666/* Usbdevfs */
2667HANDLE_IOCTL(USBDEVFS_CONTROL32, do_usbdevfs_control)
2668HANDLE_IOCTL(USBDEVFS_BULK32, do_usbdevfs_bulk)
2669HANDLE_IOCTL(USBDEVFS_DISCSIGNAL32, do_usbdevfs_discsignal)
2670COMPATIBLE_IOCTL(USBDEVFS_IOCTL32)
2671/* i2c */
2672HANDLE_IOCTL(I2C_FUNCS, w_long)
2673HANDLE_IOCTL(I2C_RDWR, do_i2c_rdwr_ioctl)
2674HANDLE_IOCTL(I2C_SMBUS, do_i2c_smbus_ioctl)
2675/* bridge */
2676HANDLE_IOCTL(SIOCSIFBR, old_bridge_ioctl)
2677HANDLE_IOCTL(SIOCGIFBR, old_bridge_ioctl)
2678/* Not implemented in the native kernel */
2679IGNORE_IOCTL(SIOCGIFCOUNT)
2680HANDLE_IOCTL(RTC_IRQP_READ32, rtc_ioctl)
2681HANDLE_IOCTL(RTC_IRQP_SET32, rtc_ioctl)
2682HANDLE_IOCTL(RTC_EPOCH_READ32, rtc_ioctl)
2683HANDLE_IOCTL(RTC_EPOCH_SET32, rtc_ioctl)
2684
2685/* dvb */
2686HANDLE_IOCTL(VIDEO_GET_EVENT, do_video_get_event)
2687HANDLE_IOCTL(VIDEO_STILLPICTURE, do_video_stillpicture)
2688HANDLE_IOCTL(VIDEO_SET_SPU_PALETTE, do_video_set_spu_palette)
2689
2690/* parport */
2691COMPATIBLE_IOCTL(LPTIME)
2692COMPATIBLE_IOCTL(LPCHAR)
2693COMPATIBLE_IOCTL(LPABORTOPEN)
2694COMPATIBLE_IOCTL(LPCAREFUL)
2695COMPATIBLE_IOCTL(LPWAIT)
2696COMPATIBLE_IOCTL(LPSETIRQ)
2697COMPATIBLE_IOCTL(LPGETSTATUS)
2698COMPATIBLE_IOCTL(LPGETSTATUS)
2699COMPATIBLE_IOCTL(LPRESET)
2700/* LPGETSTATS not implemented, but no kernels seem to compile it in anyway */
2701COMPATIBLE_IOCTL(LPGETFLAGS)
2702HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans)
2703 1498
2704/* fat 'r' ioctls. These are handled by fat with ->compat_ioctl, 1499/* fat 'r' ioctls. These are handled by fat with ->compat_ioctl,
2705 but we don't want warnings on other file systems. So declare 1500 but we don't want warnings on other file systems. So declare
@@ -2727,12 +1522,108 @@ IGNORE_IOCTL(FBIOGCURSOR32)
2727#endif 1522#endif
2728}; 1523};
2729 1524
2730#define IOCTL_HASHSIZE 256 1525/*
2731static struct ioctl_trans *ioctl32_hash_table[IOCTL_HASHSIZE]; 1526 * Convert common ioctl arguments based on their command number
2732 1527 *
2733static inline unsigned long ioctl32_hash(unsigned long cmd) 1528 * Please do not add any code in here. Instead, implement
 1529 * a compat_ioctl operation in the place that handles the
1530 * ioctl for the native case.
1531 */
1532static long do_ioctl_trans(int fd, unsigned int cmd,
1533 unsigned long arg, struct file *file)
2734{ 1534{
2735 return (((cmd >> 6) ^ (cmd >> 4) ^ cmd)) % IOCTL_HASHSIZE; 1535 void __user *argp = compat_ptr(arg);
1536
1537 switch (cmd) {
1538 case PPPIOCGIDLE32:
1539 return ppp_gidle(fd, cmd, argp);
1540 case PPPIOCSCOMPRESS32:
1541 return ppp_scompress(fd, cmd, argp);
1542 case PPPIOCSPASS32:
1543 case PPPIOCSACTIVE32:
1544 return ppp_sock_fprog_ioctl_trans(fd, cmd, argp);
1545#ifdef CONFIG_BLOCK
1546 case SG_IO:
1547 return sg_ioctl_trans(fd, cmd, argp);
1548 case SG_GET_REQUEST_TABLE:
1549 return sg_grt_trans(fd, cmd, argp);
1550 case MTIOCGET32:
1551 case MTIOCPOS32:
1552 return mt_ioctl_trans(fd, cmd, argp);
1553 /* Raw devices */
1554 case RAW_SETBIND:
1555 case RAW_GETBIND:
1556 return raw_ioctl(fd, cmd, argp);
1557#endif
1558#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int)
1559 case AUTOFS_IOC_SETTIMEOUT32:
1560 return ioc_settimeout(fd, cmd, argp);
1561 /* One SMB ioctl needs translations. */
1562#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
1563 case SMB_IOC_GETMOUNTUID_32:
1564 return do_smb_getmountuid(fd, cmd, argp);
1565 /* Serial */
1566 case TIOCGSERIAL:
1567 case TIOCSSERIAL:
1568 return serial_struct_ioctl(fd, cmd, argp);
1569 /* i2c */
1570 case I2C_FUNCS:
1571 return w_long(fd, cmd, argp);
1572 case I2C_RDWR:
1573 return do_i2c_rdwr_ioctl(fd, cmd, argp);
1574 case I2C_SMBUS:
1575 return do_i2c_smbus_ioctl(fd, cmd, argp);
1576 /* Not implemented in the native kernel */
1577 case RTC_IRQP_READ32:
1578 case RTC_IRQP_SET32:
1579 case RTC_EPOCH_READ32:
1580 case RTC_EPOCH_SET32:
1581 return rtc_ioctl(fd, cmd, argp);
1582
1583 /* dvb */
1584 case VIDEO_GET_EVENT:
1585 return do_video_get_event(fd, cmd, argp);
1586 case VIDEO_STILLPICTURE:
1587 return do_video_stillpicture(fd, cmd, argp);
1588 case VIDEO_SET_SPU_PALETTE:
1589 return do_video_set_spu_palette(fd, cmd, argp);
1590 }
1591
1592 /*
1593 * These take an integer instead of a pointer as 'arg',
1594 * so we must not do a compat_ptr() translation.
1595 */
1596 switch (cmd) {
1597 /* Big T */
1598 case TCSBRKP:
1599 case TIOCMIWAIT:
1600 case TIOCSCTTY:
1601 /* RAID */
1602 case HOT_REMOVE_DISK:
1603 case HOT_ADD_DISK:
1604 case SET_DISK_FAULTY:
1605 case SET_BITMAP_FILE:
1606 /* Big K */
1607 case KDSIGACCEPT:
1608 case KIOCSOUND:
1609 case KDMKTONE:
1610 case KDSETMODE:
1611 case KDSKBMODE:
1612 case KDSKBMETA:
1613 case KDSKBLED:
1614 case KDSETLED:
1615 /* AUTOFS */
1616 case AUTOFS_IOC_READY:
1617 case AUTOFS_IOC_FAIL:
1618 /* NBD */
1619 case NBD_SET_SOCK:
1620 case NBD_SET_BLKSIZE:
1621 case NBD_SET_SIZE:
1622 case NBD_SET_SIZE_BLOCKS:
1623 return do_vfs_ioctl(file, fd, cmd, arg);
1624 }
1625
1626 return -ENOIOCTLCMD;
2736} 1627}
2737 1628
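The comment at the top of do_ioctl_trans() asks that new conversions live next to the native handler rather than in this file. For a driver whose ioctl arguments have the same layout for 32-bit and 64-bit callers, that usually amounts to a one-line ->compat_ioctl that fixes up the pointer representation. A sketch, with all foo_* names hypothetical:

#include <linux/compat.h>
#include <linux/fs.h>
#include <linux/module.h>

static long foo_unlocked_ioctl(struct file *file, unsigned int cmd,
                               unsigned long arg)
{
        /* native handler; real command dispatch elided */
        return -ENOTTY;
}

/* 32-bit callers pass a compat pointer; widen it and share the
 * native handler, since the argument layout is identical */
static long foo_compat_ioctl(struct file *file, unsigned int cmd,
                             unsigned long arg)
{
        return foo_unlocked_ioctl(file, cmd,
                                  (unsigned long)compat_ptr(arg));
}

static const struct file_operations foo_fops = {
        .owner          = THIS_MODULE,
        .unlocked_ioctl = foo_unlocked_ioctl,
        .compat_ioctl   = foo_compat_ioctl,
};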
2738static void compat_ioctl_error(struct file *filp, unsigned int fd, 1629static void compat_ioctl_error(struct file *filp, unsigned int fd,
@@ -2764,12 +1655,33 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd,
2764 free_page((unsigned long)path); 1655 free_page((unsigned long)path);
2765} 1656}
2766 1657
1658static int compat_ioctl_check_table(unsigned int xcmd)
1659{
1660 int i;
1661 const int max = ARRAY_SIZE(ioctl_pointer) - 1;
1662
1663 BUILD_BUG_ON(max >= (1 << 16));
1664
1665 /* guess initial offset into table, assuming a
1666 normalized distribution */
1667 i = ((xcmd >> 16) * max) >> 16;
1668
1669 /* do linear search up first, until greater or equal */
1670 while (ioctl_pointer[i] < xcmd && i < max)
1671 i++;
1672
1673 /* then do linear search down */
1674 while (ioctl_pointer[i] > xcmd && i > 0)
1675 i--;
1676
1677 return ioctl_pointer[i] == xcmd;
1678}
1679
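The lookup above relies on the table being sorted and on XFORM spreading the keys roughly evenly over the 32-bit range: the scaled guess ((xcmd >> 16) * max) >> 16 then lands close to the key's sorted position, and the two linear scans only walk a few slots. The BUILD_BUG_ON keeps the 32-bit multiply from overflowing. The same idea over a standalone sorted array — a user-space sketch with toy key values:

#include <stdint.h>
#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* sorted table of (already transformed) command keys */
static const uint32_t table[] = {
        0x11111111, 0x33333333, 0x55555555,
        0x99999999, 0xbbbbbbbb, 0xdddddddd,
};

static int check_table(uint32_t xcmd)
{
        const int max = ARRAY_SIZE(table) - 1;
        /* guess an index, assuming evenly distributed keys */
        int i = ((xcmd >> 16) * max) >> 16;

        /* linear search up, then down, exactly as in the kernel */
        while (table[i] < xcmd && i < max)
                i++;
        while (table[i] > xcmd && i > 0)
                i--;
        return table[i] == xcmd;
}

int main(void)
{
        /* prints "1 0": first key is present, second is not */
        printf("%d %d\n", check_table(0x99999999), check_table(0x12345678));
        return 0;
}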
2767asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, 1680asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
2768 unsigned long arg) 1681 unsigned long arg)
2769{ 1682{
2770 struct file *filp; 1683 struct file *filp;
2771 int error = -EBADF; 1684 int error = -EBADF;
2772 struct ioctl_trans *t;
2773 int fput_needed; 1685 int fput_needed;
2774 1686
2775 filp = fget_light(fd, &fput_needed); 1687 filp = fget_light(fd, &fput_needed);
@@ -2797,7 +1709,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
2797#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 1709#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
2798 case FS_IOC_RESVSP_32: 1710 case FS_IOC_RESVSP_32:
2799 case FS_IOC_RESVSP64_32: 1711 case FS_IOC_RESVSP64_32:
2800 error = compat_ioctl_preallocate(filp, arg); 1712 error = compat_ioctl_preallocate(filp, compat_ptr(arg));
2801 goto out_fput; 1713 goto out_fput;
2802#else 1714#else
2803 case FS_IOC_RESVSP: 1715 case FS_IOC_RESVSP:
@@ -2826,18 +1738,11 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
2826 break; 1738 break;
2827 } 1739 }
2828 1740
2829 for (t = ioctl32_hash_table[ioctl32_hash(cmd)]; t; t = t->next) { 1741 if (compat_ioctl_check_table(XFORM(cmd)))
2830 if (t->cmd == cmd) 1742 goto found_handler;
2831 goto found_handler;
2832 }
2833 1743
2834#ifdef CONFIG_NET 1744 error = do_ioctl_trans(fd, cmd, arg, filp);
2835 if (S_ISSOCK(filp->f_path.dentry->d_inode->i_mode) && 1745 if (error == -ENOIOCTLCMD) {
2836 cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
2837 error = siocdevprivate_ioctl(fd, cmd, arg);
2838 } else
2839#endif
2840 {
2841 static int count; 1746 static int count;
2842 1747
2843 if (++count <= 50) 1748 if (++count <= 50)
@@ -2848,13 +1753,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
2848 goto out_fput; 1753 goto out_fput;
2849 1754
2850 found_handler: 1755 found_handler:
2851 if (t->handler) { 1756 arg = (unsigned long)compat_ptr(arg);
2852 lock_kernel();
2853 error = t->handler(fd, cmd, arg, filp);
2854 unlock_kernel();
2855 goto out_fput;
2856 }
2857
2858 do_ioctl: 1757 do_ioctl:
2859 error = do_vfs_ioctl(filp, fd, cmd, arg); 1758 error = do_vfs_ioctl(filp, fd, cmd, arg);
2860 out_fput: 1759 out_fput:
@@ -2863,35 +1762,22 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
2863 return error; 1762 return error;
2864} 1763}
2865 1764
2866static void ioctl32_insert_translation(struct ioctl_trans *trans) 1765static int __init init_sys32_ioctl_cmp(const void *p, const void *q)
2867{ 1766{
2868 unsigned long hash; 1767 unsigned int a, b;
2869 struct ioctl_trans *t; 1768 a = *(unsigned int *)p;
2870 1769 b = *(unsigned int *)q;
2871 hash = ioctl32_hash (trans->cmd); 1770 if (a > b)
2872 if (!ioctl32_hash_table[hash]) 1771 return 1;
2873 ioctl32_hash_table[hash] = trans; 1772 if (a < b)
2874 else { 1773 return -1;
2875 t = ioctl32_hash_table[hash]; 1774 return 0;
2876 while (t->next)
2877 t = t->next;
2878 trans->next = NULL;
2879 t->next = trans;
2880 }
2881} 1775}
2882 1776
2883static int __init init_sys32_ioctl(void) 1777static int __init init_sys32_ioctl(void)
2884{ 1778{
2885 int i; 1779 sort(ioctl_pointer, ARRAY_SIZE(ioctl_pointer), sizeof(*ioctl_pointer),
2886 1780 init_sys32_ioctl_cmp, NULL);
2887 for (i = 0; i < ARRAY_SIZE(ioctl_start); i++) {
2888 if (ioctl_start[i].next) {
2889 printk("ioctl translation %d bad\n",i);
2890 return -1;
2891 }
2892
2893 ioctl32_insert_translation(&ioctl_start[i]);
2894 }
2895 return 0; 1781 return 0;
2896} 1782}
2897__initcall(init_sys32_ioctl); 1783__initcall(init_sys32_ioctl);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index a2f746066c5d..c8af2d91174b 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -34,6 +34,7 @@
34#include <linux/capability.h> 34#include <linux/capability.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/lockdep.h> 36#include <linux/lockdep.h>
37#include <linux/slab.h>
37 38
38#include <linux/configfs.h> 39#include <linux/configfs.h>
39#include "configfs_internal.h" 40#include "configfs_internal.h"
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 8421cea7d8c7..8c8d64230c2d 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -29,6 +29,7 @@
29#include <linux/mount.h> 29#include <linux/mount.h>
30#include <linux/pagemap.h> 30#include <linux/pagemap.h>
31#include <linux/init.h> 31#include <linux/init.h>
32#include <linux/slab.h>
32 33
33#include <linux/configfs.h> 34#include <linux/configfs.h>
34#include "configfs_internal.h" 35#include "configfs_internal.h"
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index c8afa6b1d91d..0f3eb41d9201 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -27,6 +27,7 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/namei.h> 29#include <linux/namei.h>
30#include <linux/slab.h>
30 31
31#include <linux/configfs.h> 32#include <linux/configfs.h>
32#include "configfs_internal.h" 33#include "configfs_internal.h"
@@ -121,8 +122,10 @@ static int get_target(const char *symname, struct path *path,
121 ret = -ENOENT; 122 ret = -ENOENT;
122 path_put(path); 123 path_put(path);
123 } 124 }
124 } else 125 } else {
125 ret = -EPERM; 126 ret = -EPERM;
127 path_put(path);
128 }
126 } 129 }
127 130
128 return ret; 131 return ret;
diff --git a/fs/dcache.c b/fs/dcache.c
index a100fa35a48f..f1358e5c3a59 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -257,6 +257,7 @@ kill_it:
257 if (dentry) 257 if (dentry)
258 goto repeat; 258 goto repeat;
259} 259}
260EXPORT_SYMBOL(dput);
260 261
261/** 262/**
262 * d_invalidate - invalidate a dentry 263 * d_invalidate - invalidate a dentry
@@ -314,6 +315,7 @@ int d_invalidate(struct dentry * dentry)
314 spin_unlock(&dcache_lock); 315 spin_unlock(&dcache_lock);
315 return 0; 316 return 0;
316} 317}
318EXPORT_SYMBOL(d_invalidate);
317 319
318/* This should be called _only_ with dcache_lock held */ 320/* This should be called _only_ with dcache_lock held */
319 321
@@ -328,6 +330,7 @@ struct dentry * dget_locked(struct dentry *dentry)
328{ 330{
329 return __dget_locked(dentry); 331 return __dget_locked(dentry);
330} 332}
333EXPORT_SYMBOL(dget_locked);
331 334
332/** 335/**
333 * d_find_alias - grab a hashed alias of inode 336 * d_find_alias - grab a hashed alias of inode
@@ -384,6 +387,7 @@ struct dentry * d_find_alias(struct inode *inode)
384 } 387 }
385 return de; 388 return de;
386} 389}
390EXPORT_SYMBOL(d_find_alias);
387 391
388/* 392/*
389 * Try to kill dentries associated with this inode. 393 * Try to kill dentries associated with this inode.
@@ -408,6 +412,7 @@ restart:
408 } 412 }
409 spin_unlock(&dcache_lock); 413 spin_unlock(&dcache_lock);
410} 414}
415EXPORT_SYMBOL(d_prune_aliases);
411 416
412/* 417/*
413 * Throw away a dentry - free the inode, dput the parent. This requires that 418 * Throw away a dentry - free the inode, dput the parent. This requires that
@@ -610,6 +615,7 @@ void shrink_dcache_sb(struct super_block * sb)
610{ 615{
611 __shrink_dcache_sb(sb, NULL, 0); 616 __shrink_dcache_sb(sb, NULL, 0);
612} 617}
618EXPORT_SYMBOL(shrink_dcache_sb);
613 619
614/* 620/*
615 * destroy a single subtree of dentries for unmount 621 * destroy a single subtree of dentries for unmount
@@ -792,6 +798,7 @@ positive:
792 spin_unlock(&dcache_lock); 798 spin_unlock(&dcache_lock);
793 return 1; 799 return 1;
794} 800}
801EXPORT_SYMBOL(have_submounts);
795 802
796/* 803/*
797 * Search the dentry child list for the specified parent, 804 * Search the dentry child list for the specified parent,
@@ -876,6 +883,7 @@ void shrink_dcache_parent(struct dentry * parent)
876 while ((found = select_parent(parent)) != 0) 883 while ((found = select_parent(parent)) != 0)
877 __shrink_dcache_sb(sb, &found, 0); 884 __shrink_dcache_sb(sb, &found, 0);
878} 885}
886EXPORT_SYMBOL(shrink_dcache_parent);
879 887
880/* 888/*
881 * Scan `nr' dentries and return the number which remain. 889 * Scan `nr' dentries and return the number which remain.
@@ -968,6 +976,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
968 976
969 return dentry; 977 return dentry;
970} 978}
979EXPORT_SYMBOL(d_alloc);
971 980
972struct dentry *d_alloc_name(struct dentry *parent, const char *name) 981struct dentry *d_alloc_name(struct dentry *parent, const char *name)
973{ 982{
@@ -978,6 +987,7 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
978 q.hash = full_name_hash(q.name, q.len); 987 q.hash = full_name_hash(q.name, q.len);
979 return d_alloc(parent, &q); 988 return d_alloc(parent, &q);
980} 989}
990EXPORT_SYMBOL(d_alloc_name);
981 991
982/* the caller must hold dcache_lock */ 992/* the caller must hold dcache_lock */
983static void __d_instantiate(struct dentry *dentry, struct inode *inode) 993static void __d_instantiate(struct dentry *dentry, struct inode *inode)
@@ -1011,6 +1021,7 @@ void d_instantiate(struct dentry *entry, struct inode * inode)
1011 spin_unlock(&dcache_lock); 1021 spin_unlock(&dcache_lock);
1012 security_d_instantiate(entry, inode); 1022 security_d_instantiate(entry, inode);
1013} 1023}
1024EXPORT_SYMBOL(d_instantiate);
1014 1025
1015/** 1026/**
1016 * d_instantiate_unique - instantiate a non-aliased dentry 1027 * d_instantiate_unique - instantiate a non-aliased dentry
@@ -1107,6 +1118,7 @@ struct dentry * d_alloc_root(struct inode * root_inode)
1107 } 1118 }
1108 return res; 1119 return res;
1109} 1120}
1121EXPORT_SYMBOL(d_alloc_root);
1110 1122
1111static inline struct hlist_head *d_hash(struct dentry *parent, 1123static inline struct hlist_head *d_hash(struct dentry *parent,
1112 unsigned long hash) 1124 unsigned long hash)
@@ -1210,7 +1222,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1210 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); 1222 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
1211 spin_unlock(&dcache_lock); 1223 spin_unlock(&dcache_lock);
1212 security_d_instantiate(new, inode); 1224 security_d_instantiate(new, inode);
1213 d_rehash(dentry);
1214 d_move(new, dentry); 1225 d_move(new, dentry);
1215 iput(inode); 1226 iput(inode);
1216 } else { 1227 } else {
@@ -1224,6 +1235,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1224 d_add(dentry, inode); 1235 d_add(dentry, inode);
1225 return new; 1236 return new;
1226} 1237}
1238EXPORT_SYMBOL(d_splice_alias);
1227 1239
1228/** 1240/**
1229 * d_add_ci - lookup or allocate new dentry with case-exact name 1241 * d_add_ci - lookup or allocate new dentry with case-exact name
@@ -1313,6 +1325,7 @@ err_out:
1313 iput(inode); 1325 iput(inode);
1314 return ERR_PTR(error); 1326 return ERR_PTR(error);
1315} 1327}
1328EXPORT_SYMBOL(d_add_ci);
1316 1329
1317/** 1330/**
1318 * d_lookup - search for a dentry 1331 * d_lookup - search for a dentry
@@ -1356,6 +1369,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
1356 } while (read_seqretry(&rename_lock, seq)); 1369 } while (read_seqretry(&rename_lock, seq));
1357 return dentry; 1370 return dentry;
1358} 1371}
1372EXPORT_SYMBOL(d_lookup);
1359 1373
1360struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) 1374struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1361{ 1375{
@@ -1482,6 +1496,7 @@ int d_validate(struct dentry *dentry, struct dentry *dparent)
1482out: 1496out:
1483 return 0; 1497 return 0;
1484} 1498}
1499EXPORT_SYMBOL(d_validate);
1485 1500
1486/* 1501/*
1487 * When a file is deleted, we have two options: 1502 * When a file is deleted, we have two options:
@@ -1527,6 +1542,7 @@ void d_delete(struct dentry * dentry)
1527 1542
1528 fsnotify_nameremove(dentry, isdir); 1543 fsnotify_nameremove(dentry, isdir);
1529} 1544}
1545EXPORT_SYMBOL(d_delete);
1530 1546
1531static void __d_rehash(struct dentry * entry, struct hlist_head *list) 1547static void __d_rehash(struct dentry * entry, struct hlist_head *list)
1532{ 1548{
@@ -1555,6 +1571,7 @@ void d_rehash(struct dentry * entry)
1555 spin_unlock(&entry->d_lock); 1571 spin_unlock(&entry->d_lock);
1556 spin_unlock(&dcache_lock); 1572 spin_unlock(&dcache_lock);
1557} 1573}
1574EXPORT_SYMBOL(d_rehash);
1558 1575
1559/* 1576/*
1560 * When switching names, the actual string doesn't strictly have to 1577 * When switching names, the actual string doesn't strictly have to
@@ -1701,6 +1718,7 @@ void d_move(struct dentry * dentry, struct dentry * target)
1701 d_move_locked(dentry, target); 1718 d_move_locked(dentry, target);
1702 spin_unlock(&dcache_lock); 1719 spin_unlock(&dcache_lock);
1703} 1720}
1721EXPORT_SYMBOL(d_move);
1704 1722
1705/** 1723/**
1706 * d_ancestor - search for an ancestor 1724 * d_ancestor - search for an ancestor
@@ -1867,6 +1885,7 @@ shouldnt_be_hashed:
1867 spin_unlock(&dcache_lock); 1885 spin_unlock(&dcache_lock);
1868 BUG(); 1886 BUG();
1869} 1887}
1888EXPORT_SYMBOL_GPL(d_materialise_unique);
1870 1889
1871static int prepend(char **buffer, int *buflen, const char *str, int namelen) 1890static int prepend(char **buffer, int *buflen, const char *str, int namelen)
1872{ 1891{
@@ -2004,6 +2023,7 @@ char *d_path(const struct path *path, char *buf, int buflen)
2004 path_put(&root); 2023 path_put(&root);
2005 return res; 2024 return res;
2006} 2025}
2026EXPORT_SYMBOL(d_path);
2007 2027
2008/* 2028/*
2009 * Helper function for dentry_operations.d_dname() members 2029 * Helper function for dentry_operations.d_dname() members
@@ -2170,6 +2190,30 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
2170 return result; 2190 return result;
2171} 2191}
2172 2192
2193int path_is_under(struct path *path1, struct path *path2)
2194{
2195 struct vfsmount *mnt = path1->mnt;
2196 struct dentry *dentry = path1->dentry;
2197 int res;
2198 spin_lock(&vfsmount_lock);
2199 if (mnt != path2->mnt) {
2200 for (;;) {
2201 if (mnt->mnt_parent == mnt) {
2202 spin_unlock(&vfsmount_lock);
2203 return 0;
2204 }
2205 if (mnt->mnt_parent == path2->mnt)
2206 break;
2207 mnt = mnt->mnt_parent;
2208 }
2209 dentry = mnt->mnt_mountpoint;
2210 }
2211 res = is_subdir(dentry, path2->dentry);
2212 spin_unlock(&vfsmount_lock);
2213 return res;
2214}
2215EXPORT_SYMBOL(path_is_under);
2216
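path_is_under() extends is_subdir() across mount boundaries: it climbs path1's mount stack until it reaches path2's mount (returning 0 if it hits the root first), then runs the dentry ancestry test at the crossing point. A hedged sketch of a caller — the names are hypothetical, but this is the sort of containment check a pivot_root-style operation performs:

#include <linux/errno.h>
#include <linux/path.h>

/* path_is_under() declaration from this patch assumed in scope */

static int foo_check_under(struct path *child, struct path *ancestor)
{
        /* true when @child lives somewhere beneath @ancestor,
         * even if one or more mounts sit in between */
        if (!path_is_under(child, ancestor))
                return -EINVAL;
        return 0;
}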
2173void d_genocide(struct dentry *root) 2217void d_genocide(struct dentry *root)
2174{ 2218{
2175 struct dentry *this_parent = root; 2219 struct dentry *this_parent = root;
@@ -2227,6 +2271,7 @@ ino_t find_inode_number(struct dentry *dir, struct qstr *name)
2227 } 2271 }
2228 return ino; 2272 return ino;
2229} 2273}
2274EXPORT_SYMBOL(find_inode_number);
2230 2275
2231static __initdata unsigned long dhash_entries; 2276static __initdata unsigned long dhash_entries;
2232static int __init set_dhash_entries(char *str) 2277static int __init set_dhash_entries(char *str)
@@ -2296,6 +2341,7 @@ static void __init dcache_init(void)
2296 2341
2297/* SLAB cache for __getname() consumers */ 2342/* SLAB cache for __getname() consumers */
2298struct kmem_cache *names_cachep __read_mostly; 2343struct kmem_cache *names_cachep __read_mostly;
2344EXPORT_SYMBOL(names_cachep);
2299 2345
2300EXPORT_SYMBOL(d_genocide); 2346EXPORT_SYMBOL(d_genocide);
2301 2347
@@ -2325,26 +2371,3 @@ void __init vfs_caches_init(unsigned long mempages)
2325 bdev_cache_init(); 2371 bdev_cache_init();
2326 chrdev_init(); 2372 chrdev_init();
2327} 2373}
2328
2329EXPORT_SYMBOL(d_alloc);
2330EXPORT_SYMBOL(d_alloc_root);
2331EXPORT_SYMBOL(d_delete);
2332EXPORT_SYMBOL(d_find_alias);
2333EXPORT_SYMBOL(d_instantiate);
2334EXPORT_SYMBOL(d_invalidate);
2335EXPORT_SYMBOL(d_lookup);
2336EXPORT_SYMBOL(d_move);
2337EXPORT_SYMBOL_GPL(d_materialise_unique);
2338EXPORT_SYMBOL(d_path);
2339EXPORT_SYMBOL(d_prune_aliases);
2340EXPORT_SYMBOL(d_rehash);
2341EXPORT_SYMBOL(d_splice_alias);
2342EXPORT_SYMBOL(d_add_ci);
2343EXPORT_SYMBOL(d_validate);
2344EXPORT_SYMBOL(dget_locked);
2345EXPORT_SYMBOL(dput);
2346EXPORT_SYMBOL(find_inode_number);
2347EXPORT_SYMBOL(have_submounts);
2348EXPORT_SYMBOL(names_cachep);
2349EXPORT_SYMBOL(shrink_dcache_parent);
2350EXPORT_SYMBOL(shrink_dcache_sb);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index d22438ef7674..30a87b3dbcac 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -27,12 +27,15 @@
27#include <linux/fsnotify.h> 27#include <linux/fsnotify.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/magic.h> 29#include <linux/magic.h>
30#include <linux/slab.h>
30 31
31static struct vfsmount *debugfs_mount; 32static struct vfsmount *debugfs_mount;
32static int debugfs_mount_count; 33static int debugfs_mount_count;
33static bool debugfs_registered; 34static bool debugfs_registered;
34 35
35static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev) 36static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev,
37 void *data, const struct file_operations *fops)
38
36{ 39{
37 struct inode *inode = new_inode(sb); 40 struct inode *inode = new_inode(sb);
38 41
@@ -44,14 +47,18 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
44 init_special_inode(inode, mode, dev); 47 init_special_inode(inode, mode, dev);
45 break; 48 break;
46 case S_IFREG: 49 case S_IFREG:
47 inode->i_fop = &debugfs_file_operations; 50 inode->i_fop = fops ? fops : &debugfs_file_operations;
51 inode->i_private = data;
48 break; 52 break;
49 case S_IFLNK: 53 case S_IFLNK:
50 inode->i_op = &debugfs_link_operations; 54 inode->i_op = &debugfs_link_operations;
55 inode->i_fop = fops;
56 inode->i_private = data;
51 break; 57 break;
52 case S_IFDIR: 58 case S_IFDIR:
53 inode->i_op = &simple_dir_inode_operations; 59 inode->i_op = &simple_dir_inode_operations;
54 inode->i_fop = &simple_dir_operations; 60 inode->i_fop = fops ? fops : &simple_dir_operations;
61 inode->i_private = data;
55 62
56 /* directory inodes start off with i_nlink == 2 63 /* directory inodes start off with i_nlink == 2
57 * (for "." entry) */ 64 * (for "." entry) */
@@ -64,7 +71,8 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
64 71
65/* SMP-safe */ 72/* SMP-safe */
66static int debugfs_mknod(struct inode *dir, struct dentry *dentry, 73static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
67 int mode, dev_t dev) 74 int mode, dev_t dev, void *data,
75 const struct file_operations *fops)
68{ 76{
69 struct inode *inode; 77 struct inode *inode;
70 int error = -EPERM; 78 int error = -EPERM;
@@ -72,7 +80,7 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
72 if (dentry->d_inode) 80 if (dentry->d_inode)
73 return -EEXIST; 81 return -EEXIST;
74 82
75 inode = debugfs_get_inode(dir->i_sb, mode, dev); 83 inode = debugfs_get_inode(dir->i_sb, mode, dev, data, fops);
76 if (inode) { 84 if (inode) {
77 d_instantiate(dentry, inode); 85 d_instantiate(dentry, inode);
78 dget(dentry); 86 dget(dentry);
@@ -81,12 +89,13 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
81 return error; 89 return error;
82} 90}
83 91
84static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 92static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode,
93 void *data, const struct file_operations *fops)
85{ 94{
86 int res; 95 int res;
87 96
88 mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR; 97 mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
89 res = debugfs_mknod(dir, dentry, mode, 0); 98 res = debugfs_mknod(dir, dentry, mode, 0, data, fops);
90 if (!res) { 99 if (!res) {
91 inc_nlink(dir); 100 inc_nlink(dir);
92 fsnotify_mkdir(dir, dentry); 101 fsnotify_mkdir(dir, dentry);
@@ -94,18 +103,20 @@ static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
94 return res; 103 return res;
95} 104}
96 105
97static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode) 106static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode,
107 void *data, const struct file_operations *fops)
98{ 108{
99 mode = (mode & S_IALLUGO) | S_IFLNK; 109 mode = (mode & S_IALLUGO) | S_IFLNK;
100 return debugfs_mknod(dir, dentry, mode, 0); 110 return debugfs_mknod(dir, dentry, mode, 0, data, fops);
101} 111}
102 112
103static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode) 113static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode,
114 void *data, const struct file_operations *fops)
104{ 115{
105 int res; 116 int res;
106 117
107 mode = (mode & S_IALLUGO) | S_IFREG; 118 mode = (mode & S_IALLUGO) | S_IFREG;
108 res = debugfs_mknod(dir, dentry, mode, 0); 119 res = debugfs_mknod(dir, dentry, mode, 0, data, fops);
109 if (!res) 120 if (!res)
110 fsnotify_create(dir, dentry); 121 fsnotify_create(dir, dentry);
111 return res; 122 return res;
@@ -139,7 +150,9 @@ static struct file_system_type debug_fs_type = {
139 150
140static int debugfs_create_by_name(const char *name, mode_t mode, 151static int debugfs_create_by_name(const char *name, mode_t mode,
141 struct dentry *parent, 152 struct dentry *parent,
142 struct dentry **dentry) 153 struct dentry **dentry,
154 void *data,
155 const struct file_operations *fops)
143{ 156{
144 int error = 0; 157 int error = 0;
145 158
@@ -148,15 +161,8 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
148 * block. A pointer to that is in the struct vfsmount that we 161 * block. A pointer to that is in the struct vfsmount that we
149 * have around. 162 * have around.
150 */ 163 */
151 if (!parent) { 164 if (!parent)
152 if (debugfs_mount && debugfs_mount->mnt_sb) { 165 parent = debugfs_mount->mnt_sb->s_root;
153 parent = debugfs_mount->mnt_sb->s_root;
154 }
155 }
156 if (!parent) {
157 pr_debug("debugfs: Ah! can not find a parent!\n");
158 return -EFAULT;
159 }
160 166
161 *dentry = NULL; 167 *dentry = NULL;
162 mutex_lock(&parent->d_inode->i_mutex); 168 mutex_lock(&parent->d_inode->i_mutex);
@@ -164,13 +170,16 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
164 if (!IS_ERR(*dentry)) { 170 if (!IS_ERR(*dentry)) {
165 switch (mode & S_IFMT) { 171 switch (mode & S_IFMT) {
166 case S_IFDIR: 172 case S_IFDIR:
167 error = debugfs_mkdir(parent->d_inode, *dentry, mode); 173 error = debugfs_mkdir(parent->d_inode, *dentry, mode,
174 data, fops);
168 break; 175 break;
169 case S_IFLNK: 176 case S_IFLNK:
170 error = debugfs_link(parent->d_inode, *dentry, mode); 177 error = debugfs_link(parent->d_inode, *dentry, mode,
178 data, fops);
171 break; 179 break;
172 default: 180 default:
173 error = debugfs_create(parent->d_inode, *dentry, mode); 181 error = debugfs_create(parent->d_inode, *dentry, mode,
182 data, fops);
174 break; 183 break;
175 } 184 }
176 dput(*dentry); 185 dput(*dentry);
@@ -184,7 +193,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
184/** 193/**
185 * debugfs_create_file - create a file in the debugfs filesystem 194 * debugfs_create_file - create a file in the debugfs filesystem
186 * @name: a pointer to a string containing the name of the file to create. 195 * @name: a pointer to a string containing the name of the file to create.
187 * @mode: the permission that the file should have 196 * @mode: the permission that the file should have.
188 * @parent: a pointer to the parent dentry for this file. This should be a 197 * @parent: a pointer to the parent dentry for this file. This should be a
189 * directory dentry if set. If this paramater is NULL, then the 198 * directory dentry if set. If this paramater is NULL, then the
190 * file will be created in the root of the debugfs filesystem. 199 * file will be created in the root of the debugfs filesystem.
@@ -195,8 +204,8 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
195 * this file. 204 * this file.
196 * 205 *
197 * This is the basic "create a file" function for debugfs. It allows for a 206 * This is the basic "create a file" function for debugfs. It allows for a
198 * wide range of flexibility in createing a file, or a directory (if you 207 * wide range of flexibility in creating a file, or a directory (if you want
199 * want to create a directory, the debugfs_create_dir() function is 208 * to create a directory, the debugfs_create_dir() function is
200 * recommended to be used instead.) 209 * recommended to be used instead.)
201 * 210 *
202 * This function will return a pointer to a dentry if it succeeds. This 211 * This function will return a pointer to a dentry if it succeeds. This
@@ -221,19 +230,13 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode,
221 if (error) 230 if (error)
222 goto exit; 231 goto exit;
223 232
224 error = debugfs_create_by_name(name, mode, parent, &dentry); 233 error = debugfs_create_by_name(name, mode, parent, &dentry,
234 data, fops);
225 if (error) { 235 if (error) {
226 dentry = NULL; 236 dentry = NULL;
227 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 237 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
228 goto exit; 238 goto exit;
229 } 239 }
230
231 if (dentry->d_inode) {
232 if (data)
233 dentry->d_inode->i_private = data;
234 if (fops)
235 dentry->d_inode->i_fop = fops;
236 }
237exit: 240exit:
238 return dentry; 241 return dentry;
239} 242}
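These hunks plumb the caller's data and fops all the way down into debugfs_get_inode(), so i_private and i_fop are already set when d_instantiate() publishes the dentry. The block deleted at the end of debugfs_create_file() was the old approach: patch the inode after creation, which left a window in which a concurrent open() saw the default debugfs_file_operations and a NULL i_private. The defensive NULL-parent checks could also go, since this path only runs after simple_pin_fs() has succeeded and the mount root is therefore guaranteed. A minimal sketch of a caller, with example_state and example_fops as hypothetical placeholders:

	#include <linux/debugfs.h>
	#include <linux/fs.h>

	static u32 example_state;				/* hypothetical */
	static const struct file_operations example_fops;	/* hypothetical */

	static struct dentry *example_setup(void)
	{
		/* The file is fully wired up (fops + private data) the
		 * moment this returns; no post-creation fixup needed. */
		return debugfs_create_file("state", S_IRUGO, NULL,
					   &example_state, &example_fops);
	}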
@@ -494,7 +497,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
494 } 497 }
495 d_move(old_dentry, dentry); 498 d_move(old_dentry, dentry);
496 fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name, 499 fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name,
497 old_dentry->d_name.name, S_ISDIR(old_dentry->d_inode->i_mode), 500 S_ISDIR(old_dentry->d_inode->i_mode),
498 NULL, old_dentry); 501 NULL, old_dentry);
499 fsnotify_oldname_free(old_name); 502 fsnotify_oldname_free(old_name);
500 unlock_rename(new_dir, old_dir); 503 unlock_rename(new_dir, old_dir);
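The debugfs_rename() hunk looks like collateral from a slimmed-down fsnotify_move(): after d_move() the helper can read the new name from old_dentry itself, so passing old_dentry->d_name.name separately appears to have become redundant tree-wide.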
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index d5f8c96964be..0120247b41c0 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -15,6 +15,7 @@
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
18#include <linux/mount.h> 19#include <linux/mount.h>
19#include <linux/tty.h> 20#include <linux/tty.h>
20#include <linux/mutex.h> 21#include <linux/mutex.h>
@@ -517,11 +518,23 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
517 518
518struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) 519struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
519{ 520{
521 struct dentry *dentry;
522 struct tty_struct *tty;
523
520 BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); 524 BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
521 525
526 /* Ensure dentry has not been deleted by devpts_pty_kill() */
527 dentry = d_find_alias(pts_inode);
528 if (!dentry)
529 return NULL;
530
531 tty = NULL;
522 if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) 532 if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
523 return (struct tty_struct *)pts_inode->i_private; 533 tty = (struct tty_struct *)pts_inode->i_private;
524 return NULL; 534
535 dput(dentry);
536
537 return tty;
525} 538}
526 539
527void devpts_pty_kill(struct tty_struct *tty) 540void devpts_pty_kill(struct tty_struct *tty)
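devpts_get_tty() used to trust pts_inode->i_private unconditionally; pinning a dentry with d_find_alias() first means a pty that devpts_pty_kill() has already torn down yields NULL rather than a stale tty pointer. A hedged sketch of the pattern, not the exact devpts code:

	#include <linux/dcache.h>
	#include <linux/fs.h>

	static void *get_private_if_live(struct inode *inode)
	{
		/* d_find_alias() returns a referenced dentry for the
		 * inode, or NULL once every alias has been deleted. */
		struct dentry *dentry = d_find_alias(inode);
		void *priv;

		if (!dentry)
			return NULL;	/* node already removed */
		priv = inode->i_private;
		dput(dentry);
		return priv;
	}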
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 8b10b87dc01a..e82adc2debb7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -53,13 +53,6 @@
53 * 53 *
54 * If blkfactor is zero then the user's request was aligned to the filesystem's 54 * If blkfactor is zero then the user's request was aligned to the filesystem's
55 * blocksize. 55 * blocksize.
56 *
57 * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
58 * This determines whether we need to do the fancy locking which prevents
59 * direct-IO from being able to read uninitialised disk blocks. If its zero
60 * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
61 * not held for the entire direct write (taken briefly, initially, during a
62 * direct read though, but its never held for the duration of a direct-IO).
63 */ 56 */
64 57
65struct dio { 58struct dio {
@@ -68,7 +61,7 @@ struct dio {
68 struct inode *inode; 61 struct inode *inode;
69 int rw; 62 int rw;
70 loff_t i_size; /* i_size when submitted */ 63 loff_t i_size; /* i_size when submitted */
71 int lock_type; /* doesn't change */ 64 int flags; /* doesn't change */
72 unsigned blkbits; /* doesn't change */ 65 unsigned blkbits; /* doesn't change */
73 unsigned blkfactor; /* When we're using an alignment which 66 unsigned blkfactor; /* When we're using an alignment which
74 is finer than the filesystem's soft 67 is finer than the filesystem's soft
@@ -104,6 +97,18 @@ struct dio {
104 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ 97 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */
105 sector_t cur_page_block; /* Where it starts */ 98 sector_t cur_page_block; /* Where it starts */
106 99
100 /* BIO completion state */
101 spinlock_t bio_lock; /* protects BIO fields below */
102 unsigned long refcount; /* direct_io_worker() and bios */
103 struct bio *bio_list; /* singly linked via bi_private */
104 struct task_struct *waiter; /* waiting task (NULL if none) */
105
106 /* AIO related stuff */
107 struct kiocb *iocb; /* kiocb */
108 int is_async; /* is IO async ? */
109 int io_error; /* IO error in completion path */
110 ssize_t result; /* IO result */
111
107 /* 112 /*
108 * Page fetching state. These variables belong to dio_refill_pages(). 113 * Page fetching state. These variables belong to dio_refill_pages().
109 */ 114 */
@@ -115,22 +120,16 @@ struct dio {
115 * Page queue. These variables belong to dio_refill_pages() and 120 * Page queue. These variables belong to dio_refill_pages() and
116 * dio_get_page(). 121 * dio_get_page().
117 */ 122 */
118 struct page *pages[DIO_PAGES]; /* page buffer */
119 unsigned head; /* next page to process */ 123 unsigned head; /* next page to process */
120 unsigned tail; /* last valid page + 1 */ 124 unsigned tail; /* last valid page + 1 */
121 int page_errors; /* errno from get_user_pages() */ 125 int page_errors; /* errno from get_user_pages() */
122 126
123 /* BIO completion state */ 127 /*
124 spinlock_t bio_lock; /* protects BIO fields below */ 128 * pages[] (and any fields placed after it) are not zeroed out at
125 unsigned long refcount; /* direct_io_worker() and bios */ 129 * allocation time. Don't add new fields after pages[] unless you
126 struct bio *bio_list; /* singly linked via bi_private */ 130 * wish that they not be zeroed.
127 struct task_struct *waiter; /* waiting task (NULL if none) */ 131 */
128 132 struct page *pages[DIO_PAGES]; /* page buffer */
129 /* AIO related stuff */
130 struct kiocb *iocb; /* kiocb */
131 int is_async; /* is IO async ? */
132 int io_error; /* IO error in completion path */
133 ssize_t result; /* IO result */
134}; 133};
135 134
136/* 135/*
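The field reshuffle is not cosmetic: every field that must start life zeroed now sits in front of pages[], so the allocation site further down can switch from kzalloc() to kmalloc() plus a memset() bounded by offsetof(struct dio, pages), skipping the large page-pointer array whose zeroing, per the new comment, cost about half a percent in a database benchmark. A sketch of the idiom on a hypothetical struct:

	#include <linux/slab.h>
	#include <linux/stddef.h>
	#include <linux/string.h>

	struct example {			/* hypothetical layout */
		int state;			/* must start zeroed */
		void *cookie;			/* must start zeroed */
		struct page *pages[64];		/* deliberately last */
	};

	static struct example *example_alloc(void)
	{
		struct example *e = kmalloc(sizeof(*e), GFP_KERNEL);

		if (e)	/* clear only the fields before pages[] */
			memset(e, 0, offsetof(struct example, pages));
		return e;
	}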
@@ -240,7 +239,8 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
240 if (dio->end_io && dio->result) 239 if (dio->end_io && dio->result)
241 dio->end_io(dio->iocb, offset, transferred, 240 dio->end_io(dio->iocb, offset, transferred,
242 dio->map_bh.b_private); 241 dio->map_bh.b_private);
243 if (dio->lock_type == DIO_LOCKING) 242
243 if (dio->flags & DIO_LOCKING)
244 /* lockdep: non-owner release */ 244 /* lockdep: non-owner release */
245 up_read_non_owner(&dio->inode->i_alloc_sem); 245 up_read_non_owner(&dio->inode->i_alloc_sem);
246 246
@@ -515,21 +515,24 @@ static int get_more_blocks(struct dio *dio)
515 map_bh->b_state = 0; 515 map_bh->b_state = 0;
516 map_bh->b_size = fs_count << dio->inode->i_blkbits; 516 map_bh->b_size = fs_count << dio->inode->i_blkbits;
517 517
518 /*
519 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
520 * forbid block creations: only overwrites are permitted.
521 * We will return early to the caller once we see an
522 * unmapped buffer head returned, and the caller will fall
523 * back to buffered I/O.
524 *
525 * Otherwise the decision is left to the get_blocks method,
526 * which may decide to handle it or also return an unmapped
527 * buffer head.
528 */
518 create = dio->rw & WRITE; 529 create = dio->rw & WRITE;
519 if (dio->lock_type == DIO_LOCKING) { 530 if (dio->flags & DIO_SKIP_HOLES) {
520 if (dio->block_in_file < (i_size_read(dio->inode) >> 531 if (dio->block_in_file < (i_size_read(dio->inode) >>
521 dio->blkbits)) 532 dio->blkbits))
522 create = 0; 533 create = 0;
523 } else if (dio->lock_type == DIO_NO_LOCKING) {
524 create = 0;
525 } 534 }
526 535
527 /*
528 * For writes inside i_size we forbid block creations: only
529 * overwrites are permitted. We fall back to buffered writes
530 * at a higher level for inside-i_size block-instantiating
531 * writes.
532 */
533 ret = (*dio->get_block)(dio->inode, fs_startblk, 536 ret = (*dio->get_block)(dio->inode, fs_startblk,
534 map_bh, create); 537 map_bh, create);
535 } 538 }
@@ -1028,9 +1031,6 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1028 if (dio->bio) 1031 if (dio->bio)
1029 dio_bio_submit(dio); 1032 dio_bio_submit(dio);
1030 1033
1031 /* All IO is now issued, send it on its way */
1032 blk_run_address_space(inode->i_mapping);
1033
1034 /* 1034 /*
1035 * It is possible that, we return short IO due to end of file. 1035 * It is possible that, we return short IO due to end of file.
1036 * In that case, we need to release all the pages we got hold on. 1036 * In that case, we need to release all the pages we got hold on.
@@ -1042,7 +1042,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1042 * we can let i_mutex go now that its achieved its purpose 1042 * we can let i_mutex go now that its achieved its purpose
1043 * of protecting us from looking up uninitialized blocks. 1043 * of protecting us from looking up uninitialized blocks.
1044 */ 1044 */
1045 if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) 1045 if (rw == READ && (dio->flags & DIO_LOCKING))
1046 mutex_unlock(&dio->inode->i_mutex); 1046 mutex_unlock(&dio->inode->i_mutex);
1047 1047
1048 /* 1048 /*
@@ -1057,8 +1057,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1057 ((rw & READ) || (dio->result == dio->size))) 1057 ((rw & READ) || (dio->result == dio->size)))
1058 ret = -EIOCBQUEUED; 1058 ret = -EIOCBQUEUED;
1059 1059
1060 if (ret != -EIOCBQUEUED) 1060 if (ret != -EIOCBQUEUED) {
1061 /* All IO is now issued, send it on its way */
1062 blk_run_address_space(inode->i_mapping);
1061 dio_await_completion(dio); 1063 dio_await_completion(dio);
1064 }
1062 1065
1063 /* 1066 /*
1064 * Sync will always be dropping the final ref and completing the 1067 * Sync will always be dropping the final ref and completing the
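Relocating blk_run_address_space() under the ret != -EIOCBQUEUED branch means the queue is only kicked when this thread is about to sleep in dio_await_completion(); a request that went fully async returns immediately and leaves unplugging to the block layer's normal plug handling. The switch from WRITE_ODIRECT to WRITE_ODIRECT_PLUG earlier in the file presumably marks the same policy change: O_DIRECT writes now stay plugged unless the submitter is actually waiting on them.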
@@ -1086,30 +1089,28 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1086 1089
1087/* 1090/*
1088 * This is a library function for use by filesystem drivers. 1091 * This is a library function for use by filesystem drivers.
1089 * The locking rules are governed by the dio_lock_type parameter.
1090 * 1092 *
1091 * DIO_NO_LOCKING (no locking, for raw block device access) 1093 * The locking rules are governed by the flags parameter:
1092 * For writes, i_mutex is not held on entry; it is never taken. 1094 * - if the flags value contains DIO_LOCKING we use a fancy locking
1095 * scheme for dumb filesystems.
1096 * For writes this function is called under i_mutex and returns with
1097 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1098 * taken and dropped again before returning.
1099 * For reads and writes i_alloc_sem is taken in shared mode and released
1100 * on I/O completion (which may happen asynchronously after returning to
1101 * the caller).
1093 * 1102 *
1094 * DIO_LOCKING (simple locking for regular files) 1103 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1095 * For writes we are called under i_mutex and return with i_mutex held, even 1104 * internal locking but rather rely on the filesystem to synchronize
1096 * though it is internally dropped. 1105 * direct I/O reads/writes versus each other and truncate.
1097 * For reads, i_mutex is not held on entry, but it is taken and dropped before 1106 * For reads and writes both i_mutex and i_alloc_sem are not held on
1098 * returning. 1107 * entry and are never taken.
1099 *
1100 * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
1101 * uninitialised data, allowing parallel direct readers and writers)
1102 * For writes we are called without i_mutex, return without it, never touch it.
1103 * For reads we are called under i_mutex and return with i_mutex held, even
1104 * though it may be internally dropped.
1105 *
1106 * Additional i_alloc_sem locking requirements described inline below.
1107 */ 1108 */
1108ssize_t 1109ssize_t
1109__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1110__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1110 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1111 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1111 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1112 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1112 int dio_lock_type) 1113 int flags)
1113{ 1114{
1114 int seg; 1115 int seg;
1115 size_t size; 1116 size_t size;
@@ -1120,11 +1121,9 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1120 ssize_t retval = -EINVAL; 1121 ssize_t retval = -EINVAL;
1121 loff_t end = offset; 1122 loff_t end = offset;
1122 struct dio *dio; 1123 struct dio *dio;
1123 int release_i_mutex = 0;
1124 int acquire_i_mutex = 0;
1125 1124
1126 if (rw & WRITE) 1125 if (rw & WRITE)
1127 rw = WRITE_ODIRECT; 1126 rw = WRITE_ODIRECT_PLUG;
1128 1127
1129 if (bdev) 1128 if (bdev)
1130 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); 1129 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
@@ -1151,48 +1150,41 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1151 } 1150 }
1152 } 1151 }
1153 1152
1154 dio = kzalloc(sizeof(*dio), GFP_KERNEL); 1153 dio = kmalloc(sizeof(*dio), GFP_KERNEL);
1155 retval = -ENOMEM; 1154 retval = -ENOMEM;
1156 if (!dio) 1155 if (!dio)
1157 goto out; 1156 goto out;
1158
1159 /* 1157 /*
1160 * For block device access DIO_NO_LOCKING is used, 1158 * Believe it or not, zeroing out the page array caused a .5%
1161 * neither readers nor writers do any locking at all 1159 * performance regression in a database benchmark. So, we take
1162 * For regular files using DIO_LOCKING, 1160 * care to only zero out what's needed.
1163 * readers need to grab i_mutex and i_alloc_sem
1164 * writers need to grab i_alloc_sem only (i_mutex is already held)
1165 * For regular files using DIO_OWN_LOCKING,
1166 * neither readers nor writers take any locks here
1167 */ 1161 */
1168 dio->lock_type = dio_lock_type; 1162 memset(dio, 0, offsetof(struct dio, pages));
1169 if (dio_lock_type != DIO_NO_LOCKING) { 1163
1164 dio->flags = flags;
1165 if (dio->flags & DIO_LOCKING) {
1170 /* watch out for a 0 len io from a tricksy fs */ 1166 /* watch out for a 0 len io from a tricksy fs */
1171 if (rw == READ && end > offset) { 1167 if (rw == READ && end > offset) {
1172 struct address_space *mapping; 1168 struct address_space *mapping =
1169 iocb->ki_filp->f_mapping;
1173 1170
1174 mapping = iocb->ki_filp->f_mapping; 1171 /* will be released by direct_io_worker */
1175 if (dio_lock_type != DIO_OWN_LOCKING) { 1172 mutex_lock(&inode->i_mutex);
1176 mutex_lock(&inode->i_mutex);
1177 release_i_mutex = 1;
1178 }
1179 1173
1180 retval = filemap_write_and_wait_range(mapping, offset, 1174 retval = filemap_write_and_wait_range(mapping, offset,
1181 end - 1); 1175 end - 1);
1182 if (retval) { 1176 if (retval) {
1177 mutex_unlock(&inode->i_mutex);
1183 kfree(dio); 1178 kfree(dio);
1184 goto out; 1179 goto out;
1185 } 1180 }
1186
1187 if (dio_lock_type == DIO_OWN_LOCKING) {
1188 mutex_unlock(&inode->i_mutex);
1189 acquire_i_mutex = 1;
1190 }
1191 } 1181 }
1192 1182
1193 if (dio_lock_type == DIO_LOCKING) 1183 /*
1194 /* lockdep: not the owner will release it */ 1184 * Will be released at I/O completion, possibly in a
1195 down_read_non_owner(&inode->i_alloc_sem); 1185 * different thread.
1186 */
1187 down_read_non_owner(&inode->i_alloc_sem);
1196 } 1188 }
1197 1189
1198 /* 1190 /*
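With lock_type gone, callers compose behaviour from independent flags: DIO_LOCKING buys the i_mutex/i_alloc_sem choreography for filesystems that want it, DIO_SKIP_HOLES (tested in get_more_blocks() above) forbids block instantiation inside i_size so such writes fall back to buffered I/O, and raw block devices simply pass neither. A hedged sketch of a filesystem ->direct_IO method under that scheme; my_get_block is a stand-in for the filesystem's real mapping callback:

	#include <linux/fs.h>

	static int my_get_block(struct inode *inode, sector_t iblock,
				struct buffer_head *bh_result, int create);

	static ssize_t my_direct_IO(int rw, struct kiocb *iocb,
				    const struct iovec *iov, loff_t offset,
				    unsigned long nr_segs)
	{
		struct inode *inode = iocb->ki_filp->f_mapping->host;

		/* Dumb-locking filesystem: let direct-io.c take i_mutex
		 * and i_alloc_sem, and refuse to fill holes on writes. */
		return __blockdev_direct_IO(rw, iocb, inode,
					    inode->i_sb->s_bdev, iov, offset,
					    nr_segs, my_get_block, NULL,
					    DIO_LOCKING | DIO_SKIP_HOLES);
	}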
@@ -1210,24 +1202,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1210 /* 1202 /*
1211 * In case of error extending write may have instantiated a few 1203 * In case of error extending write may have instantiated a few
1212 * blocks outside i_size. Trim these off again for DIO_LOCKING. 1204 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1213 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by 1205 *
1214 * it's own meaner. 1206 * NOTE: filesystems with their own locking have to handle this
1207 * on their own.
1215 */ 1208 */
1216 if (unlikely(retval < 0 && (rw & WRITE))) { 1209 if (flags & DIO_LOCKING) {
1217 loff_t isize = i_size_read(inode); 1210 if (unlikely((rw & WRITE) && retval < 0)) {
1218 1211 loff_t isize = i_size_read(inode);
1219 if (end > isize && dio_lock_type == DIO_LOCKING) 1212 if (end > isize)
1220 vmtruncate(inode, isize); 1213 vmtruncate(inode, isize);
1214 }
1221 } 1215 }
1222 1216
1223 if (rw == READ && dio_lock_type == DIO_LOCKING)
1224 release_i_mutex = 0;
1225
1226out: 1217out:
1227 if (release_i_mutex)
1228 mutex_unlock(&inode->i_mutex);
1229 else if (acquire_i_mutex)
1230 mutex_lock(&inode->i_mutex);
1231 return retval; 1218 return retval;
1232} 1219}
1233EXPORT_SYMBOL(__blockdev_direct_IO); 1220EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index dc2ad6008b2d..4314f0d48d85 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb)
33 spin_unlock(&ast_queue_lock); 33 spin_unlock(&ast_queue_lock);
34} 34}
35 35
36void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode) 36void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode)
37{ 37{
38 if (lkb->lkb_flags & DLM_IFL_USER) { 38 if (lkb->lkb_flags & DLM_IFL_USER) {
39 dlm_user_add_ast(lkb, type, bastmode); 39 dlm_user_add_ast(lkb, type, mode);
40 return; 40 return;
41 } 41 }
42 42
@@ -44,10 +44,21 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
44 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) { 44 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
45 kref_get(&lkb->lkb_ref); 45 kref_get(&lkb->lkb_ref);
46 list_add_tail(&lkb->lkb_astqueue, &ast_queue); 46 list_add_tail(&lkb->lkb_astqueue, &ast_queue);
47 lkb->lkb_ast_first = type;
47 } 48 }
49
50 /* sanity check, this should not happen */
51
52 if ((type == AST_COMP) && (lkb->lkb_ast_type & AST_COMP))
53 log_print("repeat cast %d castmode %d lock %x %s",
54 mode, lkb->lkb_castmode,
55 lkb->lkb_id, lkb->lkb_resource->res_name);
56
48 lkb->lkb_ast_type |= type; 57 lkb->lkb_ast_type |= type;
49 if (bastmode) 58 if (type == AST_BAST)
50 lkb->lkb_bastmode = bastmode; 59 lkb->lkb_bastmode = mode;
60 else
61 lkb->lkb_castmode = mode;
51 spin_unlock(&ast_queue_lock); 62 spin_unlock(&ast_queue_lock);
52 63
53 set_bit(WAKE_ASTS, &astd_wakeflags); 64 set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -59,9 +70,9 @@ static void process_asts(void)
59 struct dlm_ls *ls = NULL; 70 struct dlm_ls *ls = NULL;
60 struct dlm_rsb *r = NULL; 71 struct dlm_rsb *r = NULL;
61 struct dlm_lkb *lkb; 72 struct dlm_lkb *lkb;
62 void (*cast) (void *astparam); 73 void (*castfn) (void *astparam);
63 void (*bast) (void *astparam, int mode); 74 void (*bastfn) (void *astparam, int mode);
64 int type = 0, bastmode; 75 int type, first, bastmode, castmode, do_bast, do_cast, last_castmode;
65 76
66repeat: 77repeat:
67 spin_lock(&ast_queue_lock); 78 spin_lock(&ast_queue_lock);
@@ -75,17 +86,48 @@ repeat:
75 list_del(&lkb->lkb_astqueue); 86 list_del(&lkb->lkb_astqueue);
76 type = lkb->lkb_ast_type; 87 type = lkb->lkb_ast_type;
77 lkb->lkb_ast_type = 0; 88 lkb->lkb_ast_type = 0;
89 first = lkb->lkb_ast_first;
90 lkb->lkb_ast_first = 0;
78 bastmode = lkb->lkb_bastmode; 91 bastmode = lkb->lkb_bastmode;
79 92 castmode = lkb->lkb_castmode;
93 castfn = lkb->lkb_astfn;
94 bastfn = lkb->lkb_bastfn;
80 spin_unlock(&ast_queue_lock); 95 spin_unlock(&ast_queue_lock);
81 cast = lkb->lkb_astfn;
82 bast = lkb->lkb_bastfn;
83
84 if ((type & AST_COMP) && cast)
85 cast(lkb->lkb_astparam);
86 96
87 if ((type & AST_BAST) && bast) 97 do_cast = (type & AST_COMP) && castfn;
88 bast(lkb->lkb_astparam, bastmode); 98 do_bast = (type & AST_BAST) && bastfn;
99
100 /* Skip a bast if its blocking mode is compatible with the
101 granted mode of the preceding cast. */
102
103 if (do_bast) {
104 if (first == AST_COMP)
105 last_castmode = castmode;
106 else
107 last_castmode = lkb->lkb_castmode_done;
108 if (dlm_modes_compat(bastmode, last_castmode))
109 do_bast = 0;
110 }
111
112 if (first == AST_COMP) {
113 if (do_cast)
114 castfn(lkb->lkb_astparam);
115 if (do_bast)
116 bastfn(lkb->lkb_astparam, bastmode);
117 } else if (first == AST_BAST) {
118 if (do_bast)
119 bastfn(lkb->lkb_astparam, bastmode);
120 if (do_cast)
121 castfn(lkb->lkb_astparam);
122 } else {
123 log_error(ls, "bad ast_first %d ast_type %d",
124 first, type);
125 }
126
127 if (do_cast)
128 lkb->lkb_castmode_done = castmode;
129 if (do_bast)
130 lkb->lkb_bastmode_done = bastmode;
89 131
90 /* this removes the reference added by dlm_add_ast 132 /* this removes the reference added by dlm_add_ast
91 and may result in the lkb being freed */ 133 and may result in the lkb being freed */
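Two behavioural fixes hide in this rework. First, lkb_ast_first records whether the cast or the bast was queued first, and process_asts() now delivers them in that order instead of always cast-first. Second, a bast can be suppressed when its blocking mode is compatible with the most recently granted mode, because the requester it would nag is not actually blocked: for example, a queued bast for a PR request can be dropped if the cast delivered just before it granted this lock CR, since PR and CR do not conflict. The lkb_castmode_done/lkb_bastmode_done fields that carry the "last delivered" modes across queueings are added to struct dlm_lkb in the dlm_internal.h hunk below.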
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 1b5fc5f428fd..bcb1aaba519d 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -13,7 +13,7 @@
13#ifndef __ASTD_DOT_H__ 13#ifndef __ASTD_DOT_H__
14#define __ASTD_DOT_H__ 14#define __ASTD_DOT_H__
15 15
16void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode); 16void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode);
17void dlm_del_ast(struct dlm_lkb *lkb); 17void dlm_del_ast(struct dlm_lkb *lkb);
18 18
19void dlm_astd_wake(void); 19void dlm_astd_wake(void);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index fd9859f92fad..b54bca03d92f 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -14,6 +14,7 @@
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/configfs.h> 16#include <linux/configfs.h>
17#include <linux/slab.h>
17#include <linux/in.h> 18#include <linux/in.h>
18#include <linux/in6.h> 19#include <linux/in6.h>
19#include <net/ipv6.h> 20#include <net/ipv6.h>
@@ -410,10 +411,10 @@ static struct config_group *make_cluster(struct config_group *g,
410 struct dlm_comms *cms = NULL; 411 struct dlm_comms *cms = NULL;
411 void *gps = NULL; 412 void *gps = NULL;
412 413
413 cl = kzalloc(sizeof(struct dlm_cluster), GFP_KERNEL); 414 cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS);
414 gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); 415 gps = kcalloc(3, sizeof(struct config_group *), GFP_NOFS);
415 sps = kzalloc(sizeof(struct dlm_spaces), GFP_KERNEL); 416 sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS);
416 cms = kzalloc(sizeof(struct dlm_comms), GFP_KERNEL); 417 cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS);
417 418
418 if (!cl || !gps || !sps || !cms) 419 if (!cl || !gps || !sps || !cms)
419 goto fail; 420 goto fail;
@@ -482,9 +483,9 @@ static struct config_group *make_space(struct config_group *g, const char *name)
482 struct dlm_nodes *nds = NULL; 483 struct dlm_nodes *nds = NULL;
483 void *gps = NULL; 484 void *gps = NULL;
484 485
485 sp = kzalloc(sizeof(struct dlm_space), GFP_KERNEL); 486 sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS);
486 gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL); 487 gps = kcalloc(2, sizeof(struct config_group *), GFP_NOFS);
487 nds = kzalloc(sizeof(struct dlm_nodes), GFP_KERNEL); 488 nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS);
488 489
489 if (!sp || !gps || !nds) 490 if (!sp || !gps || !nds)
490 goto fail; 491 goto fail;
@@ -536,7 +537,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name)
536{ 537{
537 struct dlm_comm *cm; 538 struct dlm_comm *cm;
538 539
539 cm = kzalloc(sizeof(struct dlm_comm), GFP_KERNEL); 540 cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS);
540 if (!cm) 541 if (!cm)
541 return ERR_PTR(-ENOMEM); 542 return ERR_PTR(-ENOMEM);
542 543
@@ -569,7 +570,7 @@ static struct config_item *make_node(struct config_group *g, const char *name)
569 struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); 570 struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
570 struct dlm_node *nd; 571 struct dlm_node *nd;
571 572
572 nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL); 573 nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS);
573 if (!nd) 574 if (!nd)
574 return ERR_PTR(-ENOMEM); 575 return ERR_PTR(-ENOMEM);
575 576
@@ -705,7 +706,7 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
705 if (cm->addr_count >= DLM_MAX_ADDR_COUNT) 706 if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
706 return -ENOSPC; 707 return -ENOSPC;
707 708
708 addr = kzalloc(sizeof(*addr), GFP_KERNEL); 709 addr = kzalloc(sizeof(*addr), GFP_NOFS);
709 if (!addr) 710 if (!addr)
710 return -ENOMEM; 711 return -ENOMEM;
711 712
@@ -868,7 +869,7 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
868 869
869 ids_count = sp->members_count; 870 ids_count = sp->members_count;
870 871
871 ids = kcalloc(ids_count, sizeof(int), GFP_KERNEL); 872 ids = kcalloc(ids_count, sizeof(int), GFP_NOFS);
872 if (!ids) { 873 if (!ids) {
873 rv = -ENOMEM; 874 rv = -ENOMEM;
874 goto out; 875 goto out;
@@ -886,7 +887,7 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
886 if (!new_count) 887 if (!new_count)
887 goto out_ids; 888 goto out_ids;
888 889
889 new = kcalloc(new_count, sizeof(int), GFP_KERNEL); 890 new = kcalloc(new_count, sizeof(int), GFP_NOFS);
890 if (!new) { 891 if (!new) {
891 kfree(ids); 892 kfree(ids);
892 rv = -ENOMEM; 893 rv = -ENOMEM;
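From here on, the dlm hunks share one theme: every allocation on a lock-processing path becomes GFP_NOFS, and the per-lockspace ls_allocation field (which chose GFP_NOFS only when DLM_LSFL_FS was set; see the dlm_internal.h and lockspace.c hunks below) is deleted outright. The point is that an allocation made while cluster locks are held must not let memory reclaim recurse into a filesystem that may itself be waiting on one of those locks, so the code stops distinguishing and is always safe.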
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 1c8bb8c3a82e..c6cf25158746 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -15,6 +15,7 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/debugfs.h> 17#include <linux/debugfs.h>
18#include <linux/slab.h>
18 19
19#include "dlm_internal.h" 20#include "dlm_internal.h"
20#include "lock.h" 21#include "lock.h"
@@ -256,7 +257,7 @@ static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
256 lkb->lkb_status, 257 lkb->lkb_status,
257 lkb->lkb_grmode, 258 lkb->lkb_grmode,
258 lkb->lkb_rqmode, 259 lkb->lkb_rqmode,
259 lkb->lkb_highbast, 260 lkb->lkb_bastmode,
260 rsb_lookup, 261 rsb_lookup,
261 lkb->lkb_wait_type, 262 lkb->lkb_wait_type,
262 lkb->lkb_lvbseq, 263 lkb->lkb_lvbseq,
@@ -404,7 +405,7 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
404 if (bucket >= ls->ls_rsbtbl_size) 405 if (bucket >= ls->ls_rsbtbl_size)
405 return NULL; 406 return NULL;
406 407
407 ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_KERNEL); 408 ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_NOFS);
408 if (!ri) 409 if (!ri)
409 return NULL; 410 return NULL;
410 if (n == 0) 411 if (n == 0)
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index c4dfa1dcc86f..7b84c1dbc82e 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -49,8 +49,7 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
49 spin_unlock(&ls->ls_recover_list_lock); 49 spin_unlock(&ls->ls_recover_list_lock);
50 50
51 if (!found) 51 if (!found)
52 de = kzalloc(sizeof(struct dlm_direntry) + len, 52 de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_NOFS);
53 ls->ls_allocation);
54 return de; 53 return de;
55} 54}
56 55
@@ -212,7 +211,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
212 211
213 dlm_dir_clear(ls); 212 dlm_dir_clear(ls);
214 213
215 last_name = kmalloc(DLM_RESNAME_MAXLEN, ls->ls_allocation); 214 last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_NOFS);
216 if (!last_name) 215 if (!last_name)
217 goto out; 216 goto out;
218 217
@@ -323,7 +322,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
323 if (namelen > DLM_RESNAME_MAXLEN) 322 if (namelen > DLM_RESNAME_MAXLEN)
324 return -EINVAL; 323 return -EINVAL;
325 324
326 de = kzalloc(sizeof(struct dlm_direntry) + namelen, ls->ls_allocation); 325 de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_NOFS);
327 if (!de) 326 if (!de)
328 return -ENOMEM; 327 return -ENOMEM;
329 328
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index d01ca0a711db..f632b58cd222 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -232,11 +232,17 @@ struct dlm_lkb {
232 int8_t lkb_status; /* granted, waiting, convert */ 232 int8_t lkb_status; /* granted, waiting, convert */
233 int8_t lkb_rqmode; /* requested lock mode */ 233 int8_t lkb_rqmode; /* requested lock mode */
234 int8_t lkb_grmode; /* granted lock mode */ 234 int8_t lkb_grmode; /* granted lock mode */
235 int8_t lkb_bastmode; /* requested mode */
236 int8_t lkb_highbast; /* highest mode bast sent for */ 235 int8_t lkb_highbast; /* highest mode bast sent for */
236
237 int8_t lkb_wait_type; /* type of reply waiting for */ 237 int8_t lkb_wait_type; /* type of reply waiting for */
238 int8_t lkb_wait_count; 238 int8_t lkb_wait_count;
239 int8_t lkb_ast_type; /* type of ast queued for */ 239 int8_t lkb_ast_type; /* type of ast queued for */
240 int8_t lkb_ast_first; /* type of first ast queued */
241
242 int8_t lkb_bastmode; /* req mode of queued bast */
243 int8_t lkb_castmode; /* gr mode of queued cast */
244 int8_t lkb_bastmode_done; /* last delivered bastmode */
245 int8_t lkb_castmode_done; /* last delivered castmode */
240 246
241 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ 247 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
242 struct list_head lkb_statequeue; /* rsb g/c/w list */ 248 struct list_head lkb_statequeue; /* rsb g/c/w list */
@@ -473,7 +479,6 @@ struct dlm_ls {
473 int ls_low_nodeid; 479 int ls_low_nodeid;
474 int ls_total_weight; 480 int ls_total_weight;
475 int *ls_node_array; 481 int *ls_node_array;
476 gfp_t ls_allocation;
477 482
478 struct dlm_rsb ls_stub_rsb; /* for returning errors */ 483 struct dlm_rsb ls_stub_rsb; /* for returning errors */
479 struct dlm_lkb ls_stub_lkb; /* for returning errors */ 484 struct dlm_lkb ls_stub_lkb; /* for returning errors */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index eb507c453c5f..17903b491298 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -56,6 +56,7 @@
56 L: receive_xxxx_reply() <- R: send_xxxx_reply() 56 L: receive_xxxx_reply() <- R: send_xxxx_reply()
57*/ 57*/
58#include <linux/types.h> 58#include <linux/types.h>
59#include <linux/slab.h>
59#include "dlm_internal.h" 60#include "dlm_internal.h"
60#include <linux/dlm_device.h> 61#include <linux/dlm_device.h>
61#include "memory.h" 62#include "memory.h"
@@ -307,7 +308,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
307 lkb->lkb_lksb->sb_status = rv; 308 lkb->lkb_lksb->sb_status = rv;
308 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags; 309 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
309 310
310 dlm_add_ast(lkb, AST_COMP, 0); 311 dlm_add_ast(lkb, AST_COMP, lkb->lkb_grmode);
311} 312}
312 313
313static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) 314static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -320,10 +321,12 @@ static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
320{ 321{
321 lkb->lkb_time_bast = ktime_get(); 322 lkb->lkb_time_bast = ktime_get();
322 323
323 if (is_master_copy(lkb)) 324 if (is_master_copy(lkb)) {
325 lkb->lkb_bastmode = rqmode; /* printed by debugfs */
324 send_bast(r, lkb, rqmode); 326 send_bast(r, lkb, rqmode);
325 else 327 } else {
326 dlm_add_ast(lkb, AST_BAST, rqmode); 328 dlm_add_ast(lkb, AST_BAST, rqmode);
329 }
327} 330}
328 331
329/* 332/*
@@ -2280,20 +2283,30 @@ static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
2280 if (can_be_queued(lkb)) { 2283 if (can_be_queued(lkb)) {
2281 error = -EINPROGRESS; 2284 error = -EINPROGRESS;
2282 add_lkb(r, lkb, DLM_LKSTS_WAITING); 2285 add_lkb(r, lkb, DLM_LKSTS_WAITING);
2283 send_blocking_asts(r, lkb);
2284 add_timeout(lkb); 2286 add_timeout(lkb);
2285 goto out; 2287 goto out;
2286 } 2288 }
2287 2289
2288 error = -EAGAIN; 2290 error = -EAGAIN;
2289 if (force_blocking_asts(lkb))
2290 send_blocking_asts_all(r, lkb);
2291 queue_cast(r, lkb, -EAGAIN); 2291 queue_cast(r, lkb, -EAGAIN);
2292
2293 out: 2292 out:
2294 return error; 2293 return error;
2295} 2294}
2296 2295
2296static void do_request_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
2297 int error)
2298{
2299 switch (error) {
2300 case -EAGAIN:
2301 if (force_blocking_asts(lkb))
2302 send_blocking_asts_all(r, lkb);
2303 break;
2304 case -EINPROGRESS:
2305 send_blocking_asts(r, lkb);
2306 break;
2307 }
2308}
2309
2297static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) 2310static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2298{ 2311{
2299 int error = 0; 2312 int error = 0;
@@ -2304,7 +2317,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2304 if (can_be_granted(r, lkb, 1, &deadlk)) { 2317 if (can_be_granted(r, lkb, 1, &deadlk)) {
2305 grant_lock(r, lkb); 2318 grant_lock(r, lkb);
2306 queue_cast(r, lkb, 0); 2319 queue_cast(r, lkb, 0);
2307 grant_pending_locks(r);
2308 goto out; 2320 goto out;
2309 } 2321 }
2310 2322
@@ -2334,7 +2346,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2334 if (_can_be_granted(r, lkb, 1)) { 2346 if (_can_be_granted(r, lkb, 1)) {
2335 grant_lock(r, lkb); 2347 grant_lock(r, lkb);
2336 queue_cast(r, lkb, 0); 2348 queue_cast(r, lkb, 0);
2337 grant_pending_locks(r);
2338 goto out; 2349 goto out;
2339 } 2350 }
2340 /* else fall through and move to convert queue */ 2351 /* else fall through and move to convert queue */
@@ -2344,28 +2355,47 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2344 error = -EINPROGRESS; 2355 error = -EINPROGRESS;
2345 del_lkb(r, lkb); 2356 del_lkb(r, lkb);
2346 add_lkb(r, lkb, DLM_LKSTS_CONVERT); 2357 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
2347 send_blocking_asts(r, lkb);
2348 add_timeout(lkb); 2358 add_timeout(lkb);
2349 goto out; 2359 goto out;
2350 } 2360 }
2351 2361
2352 error = -EAGAIN; 2362 error = -EAGAIN;
2353 if (force_blocking_asts(lkb))
2354 send_blocking_asts_all(r, lkb);
2355 queue_cast(r, lkb, -EAGAIN); 2363 queue_cast(r, lkb, -EAGAIN);
2356
2357 out: 2364 out:
2358 return error; 2365 return error;
2359} 2366}
2360 2367
2368static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
2369 int error)
2370{
2371 switch (error) {
2372 case 0:
2373 grant_pending_locks(r);
2374 /* grant_pending_locks also sends basts */
2375 break;
2376 case -EAGAIN:
2377 if (force_blocking_asts(lkb))
2378 send_blocking_asts_all(r, lkb);
2379 break;
2380 case -EINPROGRESS:
2381 send_blocking_asts(r, lkb);
2382 break;
2383 }
2384}
2385
2361static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) 2386static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2362{ 2387{
2363 remove_lock(r, lkb); 2388 remove_lock(r, lkb);
2364 queue_cast(r, lkb, -DLM_EUNLOCK); 2389 queue_cast(r, lkb, -DLM_EUNLOCK);
2365 grant_pending_locks(r);
2366 return -DLM_EUNLOCK; 2390 return -DLM_EUNLOCK;
2367} 2391}
2368 2392
2393static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
2394 int error)
2395{
2396 grant_pending_locks(r);
2397}
2398
2369/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */ 2399/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */
2370 2400
2371static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb) 2401static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -2375,12 +2405,18 @@ static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
2375 error = revert_lock(r, lkb); 2405 error = revert_lock(r, lkb);
2376 if (error) { 2406 if (error) {
2377 queue_cast(r, lkb, -DLM_ECANCEL); 2407 queue_cast(r, lkb, -DLM_ECANCEL);
2378 grant_pending_locks(r);
2379 return -DLM_ECANCEL; 2408 return -DLM_ECANCEL;
2380 } 2409 }
2381 return 0; 2410 return 0;
2382} 2411}
2383 2412
2413static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
2414 int error)
2415{
2416 if (error)
2417 grant_pending_locks(r);
2418}
2419
2384/* 2420/*
2385 * Four stage 3 varieties: 2421 * Four stage 3 varieties:
2386 * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock() 2422 * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
@@ -2402,11 +2438,15 @@ static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2402 goto out; 2438 goto out;
2403 } 2439 }
2404 2440
2405 if (is_remote(r)) 2441 if (is_remote(r)) {
2406 /* receive_request() calls do_request() on remote node */ 2442 /* receive_request() calls do_request() on remote node */
2407 error = send_request(r, lkb); 2443 error = send_request(r, lkb);
2408 else 2444 } else {
2409 error = do_request(r, lkb); 2445 error = do_request(r, lkb);
2446 /* for remote locks the request_reply is sent
2447 between do_request and do_request_effects */
2448 do_request_effects(r, lkb, error);
2449 }
2410 out: 2450 out:
2411 return error; 2451 return error;
2412} 2452}
@@ -2417,11 +2457,15 @@ static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2417{ 2457{
2418 int error; 2458 int error;
2419 2459
2420 if (is_remote(r)) 2460 if (is_remote(r)) {
2421 /* receive_convert() calls do_convert() on remote node */ 2461 /* receive_convert() calls do_convert() on remote node */
2422 error = send_convert(r, lkb); 2462 error = send_convert(r, lkb);
2423 else 2463 } else {
2424 error = do_convert(r, lkb); 2464 error = do_convert(r, lkb);
2465 /* for remote locks the convert_reply is sent
2466 between do_convert and do_convert_effects */
2467 do_convert_effects(r, lkb, error);
2468 }
2425 2469
2426 return error; 2470 return error;
2427} 2471}
@@ -2432,11 +2476,15 @@ static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2432{ 2476{
2433 int error; 2477 int error;
2434 2478
2435 if (is_remote(r)) 2479 if (is_remote(r)) {
2436 /* receive_unlock() calls do_unlock() on remote node */ 2480 /* receive_unlock() calls do_unlock() on remote node */
2437 error = send_unlock(r, lkb); 2481 error = send_unlock(r, lkb);
2438 else 2482 } else {
2439 error = do_unlock(r, lkb); 2483 error = do_unlock(r, lkb);
2484 /* for remote locks the unlock_reply is sent
2485 between do_unlock and do_unlock_effects */
2486 do_unlock_effects(r, lkb, error);
2487 }
2440 2488
2441 return error; 2489 return error;
2442} 2490}
@@ -2447,11 +2495,15 @@ static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2447{ 2495{
2448 int error; 2496 int error;
2449 2497
2450 if (is_remote(r)) 2498 if (is_remote(r)) {
2451 /* receive_cancel() calls do_cancel() on remote node */ 2499 /* receive_cancel() calls do_cancel() on remote node */
2452 error = send_cancel(r, lkb); 2500 error = send_cancel(r, lkb);
2453 else 2501 } else {
2454 error = do_cancel(r, lkb); 2502 error = do_cancel(r, lkb);
2503 /* for remote locks the cancel_reply is sent
2504 between do_cancel and do_cancel_effects */
2505 do_cancel_effects(r, lkb, error);
2506 }
2455 2507
2456 return error; 2508 return error;
2457} 2509}
@@ -2689,7 +2741,7 @@ static int _create_message(struct dlm_ls *ls, int mb_len,
2689 pass into lowcomms_commit and a message buffer (mb) that we 2741 pass into lowcomms_commit and a message buffer (mb) that we
2690 write our data into */ 2742 write our data into */
2691 2743
2692 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb); 2744 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
2693 if (!mh) 2745 if (!mh)
2694 return -ENOBUFS; 2746 return -ENOBUFS;
2695 2747
@@ -3191,6 +3243,7 @@ static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
3191 attach_lkb(r, lkb); 3243 attach_lkb(r, lkb);
3192 error = do_request(r, lkb); 3244 error = do_request(r, lkb);
3193 send_request_reply(r, lkb, error); 3245 send_request_reply(r, lkb, error);
3246 do_request_effects(r, lkb, error);
3194 3247
3195 unlock_rsb(r); 3248 unlock_rsb(r);
3196 put_rsb(r); 3249 put_rsb(r);
@@ -3226,15 +3279,19 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
3226 goto out; 3279 goto out;
3227 3280
3228 receive_flags(lkb, ms); 3281 receive_flags(lkb, ms);
3282
3229 error = receive_convert_args(ls, lkb, ms); 3283 error = receive_convert_args(ls, lkb, ms);
3230 if (error) 3284 if (error) {
3231 goto out_reply; 3285 send_convert_reply(r, lkb, error);
3286 goto out;
3287 }
3288
3232 reply = !down_conversion(lkb); 3289 reply = !down_conversion(lkb);
3233 3290
3234 error = do_convert(r, lkb); 3291 error = do_convert(r, lkb);
3235 out_reply:
3236 if (reply) 3292 if (reply)
3237 send_convert_reply(r, lkb, error); 3293 send_convert_reply(r, lkb, error);
3294 do_convert_effects(r, lkb, error);
3238 out: 3295 out:
3239 unlock_rsb(r); 3296 unlock_rsb(r);
3240 put_rsb(r); 3297 put_rsb(r);
@@ -3266,13 +3323,16 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
3266 goto out; 3323 goto out;
3267 3324
3268 receive_flags(lkb, ms); 3325 receive_flags(lkb, ms);
3326
3269 error = receive_unlock_args(ls, lkb, ms); 3327 error = receive_unlock_args(ls, lkb, ms);
3270 if (error) 3328 if (error) {
3271 goto out_reply; 3329 send_unlock_reply(r, lkb, error);
3330 goto out;
3331 }
3272 3332
3273 error = do_unlock(r, lkb); 3333 error = do_unlock(r, lkb);
3274 out_reply:
3275 send_unlock_reply(r, lkb, error); 3334 send_unlock_reply(r, lkb, error);
3335 do_unlock_effects(r, lkb, error);
3276 out: 3336 out:
3277 unlock_rsb(r); 3337 unlock_rsb(r);
3278 put_rsb(r); 3338 put_rsb(r);
@@ -3307,6 +3367,7 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
3307 3367
3308 error = do_cancel(r, lkb); 3368 error = do_cancel(r, lkb);
3309 send_cancel_reply(r, lkb, error); 3369 send_cancel_reply(r, lkb, error);
3370 do_cancel_effects(r, lkb, error);
3310 out: 3371 out:
3311 unlock_rsb(r); 3372 unlock_rsb(r);
3312 put_rsb(r); 3373 put_rsb(r);
@@ -4512,7 +4573,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
4512 } 4573 }
4513 4574
4514 if (flags & DLM_LKF_VALBLK) { 4575 if (flags & DLM_LKF_VALBLK) {
4515 ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL); 4576 ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
4516 if (!ua->lksb.sb_lvbptr) { 4577 if (!ua->lksb.sb_lvbptr) {
4517 kfree(ua); 4578 kfree(ua);
4518 __put_lkb(ls, lkb); 4579 __put_lkb(ls, lkb);
@@ -4582,7 +4643,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
4582 ua = lkb->lkb_ua; 4643 ua = lkb->lkb_ua;
4583 4644
4584 if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) { 4645 if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
4585 ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL); 4646 ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
4586 if (!ua->lksb.sb_lvbptr) { 4647 if (!ua->lksb.sb_lvbptr) {
4587 error = -ENOMEM; 4648 error = -ENOMEM;
4588 goto out_put; 4649 goto out_put;
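The other substantial change in lock.c is mechanical but deliberate: each of do_request/do_convert/do_unlock/do_cancel is split into a result phase and a do_*_effects() phase, and the receive-side handlers send their reply between the two. The requesting node therefore learns its own outcome before the blocking asts or grant_pending_locks() traffic triggered by that outcome reaches anyone else; on the local path the two calls simply run back to back. Restated from receive_request() above:

	error = do_request(r, lkb);		/* grant, queue, or -EAGAIN */
	send_request_reply(r, lkb, error);	/* requester hears first */
	do_request_effects(r, lkb, error);	/* then basts/pending grants */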
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d489fcc86713..f994a7dfda85 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -148,7 +148,7 @@ static void lockspace_kobj_release(struct kobject *k)
148 kfree(ls); 148 kfree(ls);
149} 149}
150 150
151static struct sysfs_ops dlm_attr_ops = { 151static const struct sysfs_ops dlm_attr_ops = {
152 .show = dlm_attr_show, 152 .show = dlm_attr_show,
153 .store = dlm_attr_store, 153 .store = dlm_attr_store,
154}; 154};
@@ -191,6 +191,18 @@ static int do_uevent(struct dlm_ls *ls, int in)
191 return error; 191 return error;
192} 192}
193 193
194static int dlm_uevent(struct kset *kset, struct kobject *kobj,
195 struct kobj_uevent_env *env)
196{
197 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
198
199 add_uevent_var(env, "LOCKSPACE=%s", ls->ls_name);
200 return 0;
201}
202
203static struct kset_uevent_ops dlm_uevent_ops = {
204 .uevent = dlm_uevent,
205};
194 206
195int __init dlm_lockspace_init(void) 207int __init dlm_lockspace_init(void)
196{ 208{
@@ -199,7 +211,7 @@ int __init dlm_lockspace_init(void)
199 INIT_LIST_HEAD(&lslist); 211 INIT_LIST_HEAD(&lslist);
200 spin_lock_init(&lslist_lock); 212 spin_lock_init(&lslist_lock);
201 213
202 dlm_kset = kset_create_and_add("dlm", NULL, kernel_kobj); 214 dlm_kset = kset_create_and_add("dlm", &dlm_uevent_ops, kernel_kobj);
203 if (!dlm_kset) { 215 if (!dlm_kset) {
204 printk(KERN_WARNING "%s: can not create kset\n", __func__); 216 printk(KERN_WARNING "%s: can not create kset\n", __func__);
205 return -ENOMEM; 217 return -ENOMEM;
@@ -430,7 +442,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
430 442
431 error = -ENOMEM; 443 error = -ENOMEM;
432 444
433 ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL); 445 ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS);
434 if (!ls) 446 if (!ls)
435 goto out; 447 goto out;
436 memcpy(ls->ls_name, name, namelen); 448 memcpy(ls->ls_name, name, namelen);
@@ -443,11 +455,6 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
443 if (flags & DLM_LSFL_TIMEWARN) 455 if (flags & DLM_LSFL_TIMEWARN)
444 set_bit(LSFL_TIMEWARN, &ls->ls_flags); 456 set_bit(LSFL_TIMEWARN, &ls->ls_flags);
445 457
446 if (flags & DLM_LSFL_FS)
447 ls->ls_allocation = GFP_NOFS;
448 else
449 ls->ls_allocation = GFP_KERNEL;
450
451 /* ls_exflags are forced to match among nodes, and we don't 458 /* ls_exflags are forced to match among nodes, and we don't
452 need to require all nodes to have some flags set */ 459 need to require all nodes to have some flags set */
453 ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS | 460 ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
@@ -456,7 +463,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
456 size = dlm_config.ci_rsbtbl_size; 463 size = dlm_config.ci_rsbtbl_size;
457 ls->ls_rsbtbl_size = size; 464 ls->ls_rsbtbl_size = size;
458 465
459 ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL); 466 ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_NOFS);
460 if (!ls->ls_rsbtbl) 467 if (!ls->ls_rsbtbl)
461 goto out_lsfree; 468 goto out_lsfree;
462 for (i = 0; i < size; i++) { 469 for (i = 0; i < size; i++) {
@@ -468,7 +475,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
468 size = dlm_config.ci_lkbtbl_size; 475 size = dlm_config.ci_lkbtbl_size;
469 ls->ls_lkbtbl_size = size; 476 ls->ls_lkbtbl_size = size;
470 477
471 ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL); 478 ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_NOFS);
472 if (!ls->ls_lkbtbl) 479 if (!ls->ls_lkbtbl)
473 goto out_rsbfree; 480 goto out_rsbfree;
474 for (i = 0; i < size; i++) { 481 for (i = 0; i < size; i++) {
@@ -480,7 +487,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
480 size = dlm_config.ci_dirtbl_size; 487 size = dlm_config.ci_dirtbl_size;
481 ls->ls_dirtbl_size = size; 488 ls->ls_dirtbl_size = size;
482 489
483 ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL); 490 ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_NOFS);
484 if (!ls->ls_dirtbl) 491 if (!ls->ls_dirtbl)
485 goto out_lkbfree; 492 goto out_lkbfree;
486 for (i = 0; i < size; i++) { 493 for (i = 0; i < size; i++) {
@@ -527,7 +534,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
527 mutex_init(&ls->ls_requestqueue_mutex); 534 mutex_init(&ls->ls_requestqueue_mutex);
528 mutex_init(&ls->ls_clear_proc_locks); 535 mutex_init(&ls->ls_clear_proc_locks);
529 536
530 ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); 537 ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
531 if (!ls->ls_recover_buf) 538 if (!ls->ls_recover_buf)
532 goto out_dirfree; 539 goto out_dirfree;
533 540
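Giving the "dlm" kset a kset_uevent_ops means the online/offline uevents a lockspace emits now carry a LOCKSPACE=<name> environment variable, so the userspace listener (dlm_controld, presumably) can identify the lockspace without parsing the kobject path. The remaining lockspace.c hunks are the GFP_NOFS conversion, the removal of the ls_allocation setup it obsoletes, and a constification of the sysfs_ops table.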
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 70736eb4b516..c0d35c620526 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -51,6 +51,7 @@
51#include <linux/file.h> 51#include <linux/file.h>
52#include <linux/mutex.h> 52#include <linux/mutex.h>
53#include <linux/sctp.h> 53#include <linux/sctp.h>
54#include <linux/slab.h>
54#include <net/sctp/user.h> 55#include <net/sctp/user.h>
55#include <net/ipv6.h> 56#include <net/ipv6.h>
56 57
@@ -1060,7 +1061,7 @@ static void init_local(void)
1060 if (dlm_our_addr(&sas, i)) 1061 if (dlm_our_addr(&sas, i))
1061 break; 1062 break;
1062 1063
1063 addr = kmalloc(sizeof(*addr), GFP_KERNEL); 1064 addr = kmalloc(sizeof(*addr), GFP_NOFS);
1064 if (!addr) 1065 if (!addr)
1065 break; 1066 break;
1066 memcpy(addr, &sas, sizeof(*addr)); 1067 memcpy(addr, &sas, sizeof(*addr));
@@ -1099,7 +1100,7 @@ static int sctp_listen_for_all(void)
1099 struct sockaddr_storage localaddr; 1100 struct sockaddr_storage localaddr;
1100 struct sctp_event_subscribe subscribe; 1101 struct sctp_event_subscribe subscribe;
1101 int result = -EINVAL, num = 1, i, addr_len; 1102 int result = -EINVAL, num = 1, i, addr_len;
1102 struct connection *con = nodeid2con(0, GFP_KERNEL); 1103 struct connection *con = nodeid2con(0, GFP_NOFS);
1103 int bufsize = NEEDED_RMEM; 1104 int bufsize = NEEDED_RMEM;
1104 1105
1105 if (!con) 1106 if (!con)
@@ -1171,7 +1172,7 @@ out:
1171static int tcp_listen_for_all(void) 1172static int tcp_listen_for_all(void)
1172{ 1173{
1173 struct socket *sock = NULL; 1174 struct socket *sock = NULL;
1174 struct connection *con = nodeid2con(0, GFP_KERNEL); 1175 struct connection *con = nodeid2con(0, GFP_NOFS);
1175 int result = -EINVAL; 1176 int result = -EINVAL;
1176 1177
1177 if (!con) 1178 if (!con)
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index b128775913b2..b12532e553f8 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -48,7 +48,7 @@ static int dlm_add_member(struct dlm_ls *ls, int nodeid)
48 struct dlm_member *memb; 48 struct dlm_member *memb;
49 int w, error; 49 int w, error;
50 50
51 memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation); 51 memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
52 if (!memb) 52 if (!memb)
53 return -ENOMEM; 53 return -ENOMEM;
54 54
@@ -143,7 +143,7 @@ static void make_member_array(struct dlm_ls *ls)
143 143
144 ls->ls_total_weight = total; 144 ls->ls_total_weight = total;
145 145
146 array = kmalloc(sizeof(int) * total, ls->ls_allocation); 146 array = kmalloc(sizeof(int) * total, GFP_NOFS);
147 if (!array) 147 if (!array)
148 return; 148 return;
149 149
@@ -226,7 +226,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
226 continue; 226 continue;
227 log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]); 227 log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]);
228 228
229 memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation); 229 memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
230 if (!memb) 230 if (!memb)
231 return -ENOMEM; 231 return -ENOMEM;
232 memb->nodeid = rv->new[i]; 232 memb->nodeid = rv->new[i];
@@ -312,7 +312,7 @@ int dlm_ls_stop(struct dlm_ls *ls)
312 /* 312 /*
313 * This in_recovery lock does two things: 313 * This in_recovery lock does two things:
314 * 1) Keeps this function from returning until all threads are out 314 * 1) Keeps this function from returning until all threads are out
315 * of locking routines and locking is truely stopped. 315 * of locking routines and locking is truly stopped.
316 * 2) Keeps any new requests from being processed until it's unlocked 316 * 2) Keeps any new requests from being processed until it's unlocked
317 * when recovery is complete. 317 * when recovery is complete.
318 */ 318 */
@@ -341,7 +341,7 @@ int dlm_ls_start(struct dlm_ls *ls)
341 int *ids = NULL, *new = NULL; 341 int *ids = NULL, *new = NULL;
342 int error, ids_count = 0, new_count = 0; 342 int error, ids_count = 0, new_count = 0;
343 343
344 rv = kzalloc(sizeof(struct dlm_recover), ls->ls_allocation); 344 rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS);
345 if (!rv) 345 if (!rv)
346 return -ENOMEM; 346 return -ENOMEM;
347 347
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index c1775b84ebab..8e0d00db004f 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls)
39{ 39{
40 char *p; 40 char *p;
41 41
42 p = kzalloc(ls->ls_lvblen, ls->ls_allocation); 42 p = kzalloc(ls->ls_lvblen, GFP_NOFS);
43 return p; 43 return p;
44} 44}
45 45
@@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
57 57
58 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); 58 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
59 59
60 r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation); 60 r = kzalloc(sizeof(*r) + namelen, GFP_NOFS);
61 return r; 61 return r;
62} 62}
63 63
@@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
72{ 72{
73 struct dlm_lkb *lkb; 73 struct dlm_lkb *lkb;
74 74
75 lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation); 75 lkb = kmem_cache_zalloc(lkb_cache, GFP_NOFS);
76 return lkb; 76 return lkb;
77} 77}
78 78
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 55ea369f43a9..2c6ad518100d 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -9,6 +9,7 @@
9#include <net/genetlink.h> 9#include <net/genetlink.h>
10#include <linux/dlm.h> 10#include <linux/dlm.h>
11#include <linux/dlm_netlink.h> 11#include <linux/dlm_netlink.h>
12#include <linux/gfp.h>
12 13
13#include "dlm_internal.h" 14#include "dlm_internal.h"
14 15
@@ -26,7 +27,7 @@ static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size)
26 struct sk_buff *skb; 27 struct sk_buff *skb;
27 void *data; 28 void *data;
28 29
29 skb = genlmsg_new(size, GFP_KERNEL); 30 skb = genlmsg_new(size, GFP_NOFS);
30 if (!skb) 31 if (!skb)
31 return -ENOMEM; 32 return -ENOMEM;
32 33
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 16f682e26c07..d45c02db6943 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -11,6 +11,7 @@
11#include <linux/poll.h> 11#include <linux/poll.h>
12#include <linux/dlm.h> 12#include <linux/dlm.h>
13#include <linux/dlm_plock.h> 13#include <linux/dlm_plock.h>
14#include <linux/slab.h>
14 15
15#include "dlm_internal.h" 16#include "dlm_internal.h"
16#include "lockspace.h" 17#include "lockspace.h"
@@ -82,7 +83,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
82 if (!ls) 83 if (!ls)
83 return -EINVAL; 84 return -EINVAL;
84 85
85 xop = kzalloc(sizeof(*xop), GFP_KERNEL); 86 xop = kzalloc(sizeof(*xop), GFP_NOFS);
86 if (!xop) { 87 if (!xop) {
87 rv = -ENOMEM; 88 rv = -ENOMEM;
88 goto out; 89 goto out;
@@ -143,7 +144,7 @@ out:
143} 144}
144EXPORT_SYMBOL_GPL(dlm_posix_lock); 145EXPORT_SYMBOL_GPL(dlm_posix_lock);
145 146
146/* Returns failure iff a succesful lock operation should be canceled */ 147/* Returns failure iff a successful lock operation should be canceled */
147static int dlm_plock_callback(struct plock_op *op) 148static int dlm_plock_callback(struct plock_op *op)
148{ 149{
149 struct file *file; 150 struct file *file;
@@ -211,7 +212,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
211 if (!ls) 212 if (!ls)
212 return -EINVAL; 213 return -EINVAL;
213 214
214 op = kzalloc(sizeof(*op), GFP_KERNEL); 215 op = kzalloc(sizeof(*op), GFP_NOFS);
215 if (!op) { 216 if (!op) {
216 rv = -ENOMEM; 217 rv = -ENOMEM;
217 goto out; 218 goto out;
@@ -266,7 +267,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
266 if (!ls) 267 if (!ls)
267 return -EINVAL; 268 return -EINVAL;
268 269
269 op = kzalloc(sizeof(*op), GFP_KERNEL); 270 op = kzalloc(sizeof(*op), GFP_NOFS);
270 if (!op) { 271 if (!op) {
271 rv = -ENOMEM; 272 rv = -ENOMEM;
272 goto out; 273 goto out;
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 67522c268c14..3c83a49a48a3 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -38,7 +38,7 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
38 char *mb; 38 char *mb;
39 int mb_len = sizeof(struct dlm_rcom) + len; 39 int mb_len = sizeof(struct dlm_rcom) + len;
40 40
41 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb); 41 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
42 if (!mh) { 42 if (!mh) {
43 log_print("create_rcom to %d type %d len %d ENOBUFS", 43 log_print("create_rcom to %d type %d len %d ENOBUFS",
44 to_nodeid, type, len); 44 to_nodeid, type, len);
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index 7a2307c08911..a44fa22890e1 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -35,7 +35,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
35 struct rq_entry *e; 35 struct rq_entry *e;
36 int length = ms->m_header.h_length - sizeof(struct dlm_message); 36 int length = ms->m_header.h_length - sizeof(struct dlm_message);
37 37
38 e = kmalloc(sizeof(struct rq_entry) + length, ls->ls_allocation); 38 e = kmalloc(sizeof(struct rq_entry) + length, GFP_NOFS);
39 if (!e) { 39 if (!e) {
40 log_print("dlm_add_requestqueue: out of memory len %d", length); 40 log_print("dlm_add_requestqueue: out of memory len %d", length);
41 return; 41 return;
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index ebce994ab0b7..8b6e73c47435 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. 2 * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved.
3 * 3 *
4 * This copyrighted material is made available to anyone wishing to use, 4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions 5 * modify, copy, or redistribute it subject to the terms and conditions
@@ -17,6 +17,7 @@
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/dlm.h> 18#include <linux/dlm.h>
19#include <linux/dlm_device.h> 19#include <linux/dlm_device.h>
20#include <linux/slab.h>
20 21
21#include "dlm_internal.h" 22#include "dlm_internal.h"
22#include "lockspace.h" 23#include "lockspace.h"
@@ -173,7 +174,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
173/* we could possibly check if the cancel of an orphan has resulted in the lkb 174/* we could possibly check if the cancel of an orphan has resulted in the lkb
174 being removed and then remove that lkb from the orphans list and free it */ 175 being removed and then remove that lkb from the orphans list and free it */
175 176
176void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode) 177void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
177{ 178{
178 struct dlm_ls *ls; 179 struct dlm_ls *ls;
179 struct dlm_user_args *ua; 180 struct dlm_user_args *ua;
@@ -206,8 +207,10 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
206 207
207 ast_type = lkb->lkb_ast_type; 208 ast_type = lkb->lkb_ast_type;
208 lkb->lkb_ast_type |= type; 209 lkb->lkb_ast_type |= type;
209 if (bastmode) 210 if (type == AST_BAST)
210 lkb->lkb_bastmode = bastmode; 211 lkb->lkb_bastmode = mode;
212 else
213 lkb->lkb_castmode = mode;
211 214
212 if (!ast_type) { 215 if (!ast_type) {
213 kref_get(&lkb->lkb_ref); 216 kref_get(&lkb->lkb_ref);
@@ -267,7 +270,7 @@ static int device_user_lock(struct dlm_user_proc *proc,
267 goto out; 270 goto out;
268 } 271 }
269 272
270 ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL); 273 ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
271 if (!ua) 274 if (!ua)
272 goto out; 275 goto out;
273 ua->proc = proc; 276 ua->proc = proc;
@@ -307,7 +310,7 @@ static int device_user_unlock(struct dlm_user_proc *proc,
307 if (!ls) 310 if (!ls)
308 return -ENOENT; 311 return -ENOENT;
309 312
310 ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL); 313 ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
311 if (!ua) 314 if (!ua)
312 goto out; 315 goto out;
313 ua->proc = proc; 316 ua->proc = proc;
@@ -352,7 +355,7 @@ static int dlm_device_register(struct dlm_ls *ls, char *name)
352 355
353 error = -ENOMEM; 356 error = -ENOMEM;
354 len = strlen(name) + strlen(name_prefix) + 2; 357 len = strlen(name) + strlen(name_prefix) + 2;
355 ls->ls_device.name = kzalloc(len, GFP_KERNEL); 358 ls->ls_device.name = kzalloc(len, GFP_NOFS);
356 if (!ls->ls_device.name) 359 if (!ls->ls_device.name)
357 goto fail; 360 goto fail;
358 361
@@ -520,7 +523,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
520#endif 523#endif
521 return -EINVAL; 524 return -EINVAL;
522 525
523 kbuf = kzalloc(count + 1, GFP_KERNEL); 526 kbuf = kzalloc(count + 1, GFP_NOFS);
524 if (!kbuf) 527 if (!kbuf)
525 return -ENOMEM; 528 return -ENOMEM;
526 529
@@ -546,7 +549,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
546 549
547 /* add 1 after namelen so that the name string is terminated */ 550 /* add 1 after namelen so that the name string is terminated */
548 kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1, 551 kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1,
549 GFP_KERNEL); 552 GFP_NOFS);
550 if (!kbuf) { 553 if (!kbuf) {
551 kfree(k32buf); 554 kfree(k32buf);
552 return -ENOMEM; 555 return -ENOMEM;
@@ -648,7 +651,7 @@ static int device_open(struct inode *inode, struct file *file)
648 if (!ls) 651 if (!ls)
649 return -ENOENT; 652 return -ENOENT;
650 653
651 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL); 654 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_NOFS);
652 if (!proc) { 655 if (!proc) {
653 dlm_put_lockspace(ls); 656 dlm_put_lockspace(ls);
654 return -ENOMEM; 657 return -ENOMEM;
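The dlm_user_add_ast() change above generalizes the old bastmode parameter: the single mode argument now lands in lkb_bastmode for a blocking AST and in the new lkb_castmode for a completion AST, keyed on the ast type rather than on the mode being nonzero. A hedged sketch of the dispatch, with stand-in constants and fields for the dlm_internal.h definitions:

#define DEMO_AST_COMP 1		/* stand-in for AST_COMP */
#define DEMO_AST_BAST 2		/* stand-in for AST_BAST */

struct demo_lkb {
	int lkb_ast_type;
	int lkb_bastmode;	/* mode carried by a blocking ast */
	int lkb_castmode;	/* mode carried by a completion ast */
};

static void demo_add_ast(struct demo_lkb *lkb, int type, int mode)
{
	lkb->lkb_ast_type |= type;
	if (type == DEMO_AST_BAST)
		lkb->lkb_bastmode = mode;
	else
		lkb->lkb_castmode = mode;
}

The old code could not record a completion mode at all, since bastmode == 0 doubled as "this is a completion ast".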
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 1c9686492286..f196091dd7ff 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. 2 * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved.
3 * 3 *
4 * This copyrighted material is made available to anyone wishing to use, 4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions 5 * modify, copy, or redistribute it subject to the terms and conditions
@@ -9,7 +9,7 @@
9#ifndef __USER_DOT_H__ 9#ifndef __USER_DOT_H__
10#define __USER_DOT_H__ 10#define __USER_DOT_H__
11 11
12void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode); 12void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode);
13int dlm_user_init(void); 13int dlm_user_init(void);
14void dlm_user_exit(void); 14void dlm_user_exit(void);
15int dlm_device_deregister(struct dlm_ls *ls); 15int dlm_device_deregister(struct dlm_ls *ls);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index fbb6e5eed697..efb2b9400391 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -33,6 +33,7 @@
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/file.h> 34#include <linux/file.h>
35#include <linux/scatterlist.h> 35#include <linux/scatterlist.h>
36#include <linux/slab.h>
36#include <asm/unaligned.h> 37#include <asm/unaligned.h>
37#include "ecryptfs_kernel.h" 38#include "ecryptfs_kernel.h"
38 39
@@ -1748,7 +1749,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1748 char *cipher_name, size_t *key_size) 1749 char *cipher_name, size_t *key_size)
1749{ 1750{
1750 char dummy_key[ECRYPTFS_MAX_KEY_BYTES]; 1751 char dummy_key[ECRYPTFS_MAX_KEY_BYTES];
1751 char *full_alg_name; 1752 char *full_alg_name = NULL;
1752 int rc; 1753 int rc;
1753 1754
1754 *key_tfm = NULL; 1755 *key_tfm = NULL;
@@ -1763,7 +1764,6 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1763 if (rc) 1764 if (rc)
1764 goto out; 1765 goto out;
1765 *key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC); 1766 *key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC);
1766 kfree(full_alg_name);
1767 if (IS_ERR(*key_tfm)) { 1767 if (IS_ERR(*key_tfm)) {
1768 rc = PTR_ERR(*key_tfm); 1768 rc = PTR_ERR(*key_tfm);
1769 printk(KERN_ERR "Unable to allocate crypto cipher with name " 1769 printk(KERN_ERR "Unable to allocate crypto cipher with name "
@@ -1786,6 +1786,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1786 goto out; 1786 goto out;
1787 } 1787 }
1788out: 1788out:
1789 kfree(full_alg_name);
1789 return rc; 1790 return rc;
1790} 1791}
1791 1792
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 2dda5ade75bc..906e803f7f79 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -26,6 +26,7 @@
26#include <linux/namei.h> 26#include <linux/namei.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/fs_stack.h> 28#include <linux/fs_stack.h>
29#include <linux/slab.h>
29#include "ecryptfs_kernel.h" 30#include "ecryptfs_kernel.h"
30 31
31/** 32/**
@@ -62,7 +63,7 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
62 struct inode *lower_inode = 63 struct inode *lower_inode =
63 ecryptfs_inode_to_lower(dentry->d_inode); 64 ecryptfs_inode_to_lower(dentry->d_inode);
64 65
65 fsstack_copy_attr_all(dentry->d_inode, lower_inode, NULL); 66 fsstack_copy_attr_all(dentry->d_inode, lower_inode);
66 } 67 }
67out: 68out:
68 return rc; 69 return rc;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 9e944057001b..e7440a6f5ebf 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -25,6 +25,7 @@
25 25
26#include <linux/file.h> 26#include <linux/file.h>
27#include <linux/poll.h> 27#include <linux/poll.h>
28#include <linux/slab.h>
28#include <linux/mount.h> 29#include <linux/mount.h>
29#include <linux/pagemap.h> 30#include <linux/pagemap.h>
30#include <linux/security.h> 31#include <linux/security.h>
@@ -158,7 +159,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
158 struct dentry *ecryptfs_dentry = file->f_path.dentry; 159 struct dentry *ecryptfs_dentry = file->f_path.dentry;
159 /* Private value of ecryptfs_dentry allocated in 160 /* Private value of ecryptfs_dentry allocated in
160 * ecryptfs_lookup() */ 161 * ecryptfs_lookup() */
161 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); 162 struct dentry *lower_dentry;
162 struct ecryptfs_file_info *file_info; 163 struct ecryptfs_file_info *file_info;
163 164
164 mount_crypt_stat = &ecryptfs_superblock_to_private( 165 mount_crypt_stat = &ecryptfs_superblock_to_private(
@@ -191,13 +192,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
191 | ECRYPTFS_ENCRYPTED); 192 | ECRYPTFS_ENCRYPTED);
192 } 193 }
193 mutex_unlock(&crypt_stat->cs_mutex); 194 mutex_unlock(&crypt_stat->cs_mutex);
194 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
195 && !(file->f_flags & O_RDONLY)) {
196 rc = -EPERM;
197 printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
198 "file must hence be opened RO\n", __func__);
199 goto out;
200 }
201 if (!ecryptfs_inode_to_private(inode)->lower_file) { 195 if (!ecryptfs_inode_to_private(inode)->lower_file) {
202 rc = ecryptfs_init_persistent_file(ecryptfs_dentry); 196 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
203 if (rc) { 197 if (rc) {
@@ -208,6 +202,13 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
208 goto out; 202 goto out;
209 } 203 }
210 } 204 }
205 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
206 && !(file->f_flags & O_RDONLY)) {
207 rc = -EPERM;
208 printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
209 "file must hence be opened RO\n", __func__);
210 goto out;
211 }
211 ecryptfs_set_file_lower( 212 ecryptfs_set_file_lower(
212 file, ecryptfs_inode_to_private(inode)->lower_file); 213 file, ecryptfs_inode_to_private(inode)->lower_file);
213 if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { 214 if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
@@ -299,7 +300,6 @@ static int ecryptfs_ioctl(struct inode *inode, struct file *file,
299const struct file_operations ecryptfs_dir_fops = { 300const struct file_operations ecryptfs_dir_fops = {
300 .readdir = ecryptfs_readdir, 301 .readdir = ecryptfs_readdir,
301 .ioctl = ecryptfs_ioctl, 302 .ioctl = ecryptfs_ioctl,
302 .mmap = generic_file_mmap,
303 .open = ecryptfs_open, 303 .open = ecryptfs_open,
304 .flush = ecryptfs_flush, 304 .flush = ecryptfs_flush,
305 .release = ecryptfs_release, 305 .release = ecryptfs_release,
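The reordering in ecryptfs_open() is a NULL-dereference fix, not a cosmetic move: the read-only check inspects lower_file->f_flags, but lower_file may not exist until ecryptfs_init_persistent_file() has run, so the check has to follow the initialization. A sketch with stand-in types and a hypothetical init helper, mirroring the flag test the diff itself uses:

#include <linux/fs.h>

struct demo_priv {
	struct file *lower_file;
};

static int demo_init_lower(struct demo_priv *priv);	/* hypothetical */

static int demo_open_check(struct demo_priv *priv, struct file *file)
{
	if (!priv->lower_file) {
		int rc = demo_init_lower(priv);

		if (rc)
			return rc;
	}
	/* Safe only now: lower_file is guaranteed to be set. */
	if ((priv->lower_file->f_flags & O_RDONLY) &&
	    !(file->f_flags & O_RDONLY))
		return -EPERM;
	return 0;
}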
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 056fed62d0de..d3362faf3852 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -31,6 +31,7 @@
31#include <linux/mount.h> 31#include <linux/mount.h>
32#include <linux/crypto.h> 32#include <linux/crypto.h>
33#include <linux/fs_stack.h> 33#include <linux/fs_stack.h>
34#include <linux/slab.h>
34#include <asm/unaligned.h> 35#include <asm/unaligned.h>
35#include "ecryptfs_kernel.h" 36#include "ecryptfs_kernel.h"
36 37
@@ -282,7 +283,8 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
282 goto out; 283 goto out;
283 } 284 }
284 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, 285 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
285 ecryptfs_dir_inode->i_sb, 1); 286 ecryptfs_dir_inode->i_sb,
287 ECRYPTFS_INTERPOSE_FLAG_D_ADD);
286 if (rc) { 288 if (rc) {
287 printk(KERN_ERR "%s: Error interposing; rc = [%d]\n", 289 printk(KERN_ERR "%s: Error interposing; rc = [%d]\n",
288 __func__, rc); 290 __func__, rc);
@@ -463,9 +465,6 @@ out_lock:
463 unlock_dir(lower_dir_dentry); 465 unlock_dir(lower_dir_dentry);
464 dput(lower_new_dentry); 466 dput(lower_new_dentry);
465 dput(lower_old_dentry); 467 dput(lower_old_dentry);
466 d_drop(lower_old_dentry);
467 d_drop(new_dentry);
468 d_drop(old_dentry);
469 return rc; 468 return rc;
470} 469}
471 470
@@ -614,6 +613,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
614 struct dentry *lower_new_dentry; 613 struct dentry *lower_new_dentry;
615 struct dentry *lower_old_dir_dentry; 614 struct dentry *lower_old_dir_dentry;
616 struct dentry *lower_new_dir_dentry; 615 struct dentry *lower_new_dir_dentry;
616 struct dentry *trap = NULL;
617 617
618 lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); 618 lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
619 lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); 619 lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
@@ -621,14 +621,24 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
621 dget(lower_new_dentry); 621 dget(lower_new_dentry);
622 lower_old_dir_dentry = dget_parent(lower_old_dentry); 622 lower_old_dir_dentry = dget_parent(lower_old_dentry);
623 lower_new_dir_dentry = dget_parent(lower_new_dentry); 623 lower_new_dir_dentry = dget_parent(lower_new_dentry);
624 lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); 624 trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
625 /* source should not be ancestor of target */
626 if (trap == lower_old_dentry) {
627 rc = -EINVAL;
628 goto out_lock;
629 }
630 /* target should not be ancestor of source */
631 if (trap == lower_new_dentry) {
632 rc = -ENOTEMPTY;
633 goto out_lock;
634 }
625 rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, 635 rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry,
626 lower_new_dir_dentry->d_inode, lower_new_dentry); 636 lower_new_dir_dentry->d_inode, lower_new_dentry);
627 if (rc) 637 if (rc)
628 goto out_lock; 638 goto out_lock;
629 fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode, NULL); 639 fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode);
630 if (new_dir != old_dir) 640 if (new_dir != old_dir)
631 fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode, NULL); 641 fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode);
632out_lock: 642out_lock:
633 unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); 643 unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
634 dput(lower_new_dentry->d_parent); 644 dput(lower_new_dentry->d_parent);
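lock_rename() returns the "trap" dentry, the point at which the two rename paths would collide when one directory lies inside the other. Comparing the trap against the source and target dentries, as the hunk above does on the lower filesystem, rejects the two degenerate topologies before vfs_rename() can be handed them. A condensed sketch of the contract:

#include <linux/fs.h>

static int demo_rename_lower(struct dentry *old_dir, struct dentry *old_dentry,
			     struct dentry *new_dir, struct dentry *new_dentry)
{
	struct dentry *trap;
	int rc;

	trap = lock_rename(old_dir, new_dir);
	if (trap == old_dentry) {	/* source is an ancestor of target */
		rc = -EINVAL;
		goto out;
	}
	if (trap == new_dentry) {	/* target is an ancestor of source */
		rc = -ENOTEMPTY;
		goto out;
	}
	rc = vfs_rename(old_dir->d_inode, old_dentry,
			new_dir->d_inode, new_dentry);
out:
	unlock_rename(old_dir, new_dir);
	return rc;
}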
@@ -715,31 +725,31 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
715 /* Released in ecryptfs_put_link(); only release here on error */ 725 /* Released in ecryptfs_put_link(); only release here on error */
716 buf = kmalloc(len, GFP_KERNEL); 726 buf = kmalloc(len, GFP_KERNEL);
717 if (!buf) { 727 if (!buf) {
718 rc = -ENOMEM; 728 buf = ERR_PTR(-ENOMEM);
719 goto out; 729 goto out;
720 } 730 }
721 old_fs = get_fs(); 731 old_fs = get_fs();
722 set_fs(get_ds()); 732 set_fs(get_ds());
723 rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); 733 rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
724 set_fs(old_fs); 734 set_fs(old_fs);
725 if (rc < 0) 735 if (rc < 0) {
726 goto out_free; 736 kfree(buf);
727 else 737 buf = ERR_PTR(rc);
738 } else
728 buf[rc] = '\0'; 739 buf[rc] = '\0';
729 rc = 0;
730 nd_set_link(nd, buf);
731 goto out;
732out_free:
733 kfree(buf);
734out: 740out:
735 return ERR_PTR(rc); 741 nd_set_link(nd, buf);
742 return NULL;
736} 743}
737 744
738static void 745static void
739ecryptfs_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr) 746ecryptfs_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr)
740{ 747{
741 /* Free the char* */ 748 char *buf = nd_get_link(nd);
742 kfree(nd_get_link(nd)); 749 if (!IS_ERR(buf)) {
750 /* Free the char* */
751 kfree(buf);
752 }
743} 753}
744 754
745/** 755/**
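The follow_link() rewrite above adopts the nd_set_link() error protocol: the method always stores something in the nameidata, either the link buffer or an ERR_PTR-encoded error, returns NULL as its cookie, and put_link() frees the buffer only when it is not an error pointer. A sketch of the paired methods under that convention:

#include <linux/err.h>
#include <linux/limits.h>
#include <linux/namei.h>
#include <linux/slab.h>

static void *demo_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	char *buf = kmalloc(PATH_MAX, GFP_KERNEL);

	if (!buf)
		buf = ERR_PTR(-ENOMEM);
	/* else: fill buf with the target path and NUL-terminate it */
	nd_set_link(nd, buf);
	return NULL;		/* no cookie needed by put_link */
}

static void demo_put_link(struct dentry *dentry, struct nameidata *nd,
			  void *cookie)
{
	char *buf = nd_get_link(nd);

	if (!IS_ERR(buf))
		kfree(buf);	/* only real buffers get freed */
}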
@@ -772,18 +782,23 @@ upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat,
772} 782}
773 783
774/** 784/**
775 * ecryptfs_truncate 785 * truncate_upper
776 * @dentry: The ecryptfs layer dentry 786 * @dentry: The ecryptfs layer dentry
777 * @new_length: The length to expand the file to 787 * @ia: Address of the ecryptfs inode's attributes
788 * @lower_ia: Address of the lower inode's attributes
778 * 789 *
779 * Function to handle truncations modifying the size of the file. Note 790 * Function to handle truncations modifying the size of the file. Note
780 * that the file sizes are interpolated. When expanding, we are simply 791 * that the file sizes are interpolated. When expanding, we are simply
781 * writing strings of 0's out. When truncating, we need to modify the 792 * writing strings of 0's out. When truncating, we truncate the upper
782 * underlying file size according to the page index interpolations. 793 * inode and update the lower_ia according to the page index
794 * interpolations. If ATTR_SIZE is set in lower_ia->ia_valid upon return,
795 * the caller must use lower_ia in a call to notify_change() to perform
796 * the truncation of the lower inode.
783 * 797 *
784 * Returns zero on success; non-zero otherwise 798 * Returns zero on success; non-zero otherwise
785 */ 799 */
786int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) 800static int truncate_upper(struct dentry *dentry, struct iattr *ia,
801 struct iattr *lower_ia)
787{ 802{
788 int rc = 0; 803 int rc = 0;
789 struct inode *inode = dentry->d_inode; 804 struct inode *inode = dentry->d_inode;
@@ -794,8 +809,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
794 loff_t lower_size_before_truncate; 809 loff_t lower_size_before_truncate;
795 loff_t lower_size_after_truncate; 810 loff_t lower_size_after_truncate;
796 811
797 if (unlikely((new_length == i_size))) 812 if (unlikely((ia->ia_size == i_size))) {
813 lower_ia->ia_valid &= ~ATTR_SIZE;
798 goto out; 814 goto out;
815 }
799 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; 816 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
800 /* Set up a fake ecryptfs file, this is used to interface with 817 /* Set up a fake ecryptfs file, this is used to interface with
801 * the file in the underlying filesystem so that the 818 * the file in the underlying filesystem so that the
@@ -815,28 +832,30 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
815 &fake_ecryptfs_file, 832 &fake_ecryptfs_file,
816 ecryptfs_inode_to_private(dentry->d_inode)->lower_file); 833 ecryptfs_inode_to_private(dentry->d_inode)->lower_file);
817 /* Switch on growing or shrinking file */ 834 /* Switch on growing or shrinking file */
818 if (new_length > i_size) { 835 if (ia->ia_size > i_size) {
819 char zero[] = { 0x00 }; 836 char zero[] = { 0x00 };
820 837
838 lower_ia->ia_valid &= ~ATTR_SIZE;
821 /* Write a single 0 at the last position of the file; 839 /* Write a single 0 at the last position of the file;
822 * this triggers code that will fill in 0's throughout 840 * this triggers code that will fill in 0's throughout
823 * the intermediate portion of the previous end of the 841 * the intermediate portion of the previous end of the
824 * file and the new and of the file */ 842 * file and the new and of the file */
825 rc = ecryptfs_write(&fake_ecryptfs_file, zero, 843 rc = ecryptfs_write(&fake_ecryptfs_file, zero,
826 (new_length - 1), 1); 844 (ia->ia_size - 1), 1);
827 } else { /* new_length < i_size_read(inode) */ 845 } else { /* ia->ia_size < i_size_read(inode) */
828 /* We're chopping off all the pages down do the page 846 /* We're chopping off all the pages down to the page
829 * in which new_length is located. Fill in the end of 847 * in which ia->ia_size is located. Fill in the end of
830 * that page from (new_length & ~PAGE_CACHE_MASK) to 848 * that page from (ia->ia_size & ~PAGE_CACHE_MASK) to
831 * PAGE_CACHE_SIZE with zeros. */ 849 * PAGE_CACHE_SIZE with zeros. */
832 size_t num_zeros = (PAGE_CACHE_SIZE 850 size_t num_zeros = (PAGE_CACHE_SIZE
833 - (new_length & ~PAGE_CACHE_MASK)); 851 - (ia->ia_size & ~PAGE_CACHE_MASK));
834 852
835 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 853 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
836 rc = vmtruncate(inode, new_length); 854 rc = vmtruncate(inode, ia->ia_size);
837 if (rc) 855 if (rc)
838 goto out_free; 856 goto out_free;
839 rc = vmtruncate(lower_dentry->d_inode, new_length); 857 lower_ia->ia_size = ia->ia_size;
858 lower_ia->ia_valid |= ATTR_SIZE;
840 goto out_free; 859 goto out_free;
841 } 860 }
842 if (num_zeros) { 861 if (num_zeros) {
@@ -848,7 +867,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
848 goto out_free; 867 goto out_free;
849 } 868 }
850 rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt, 869 rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt,
851 new_length, num_zeros); 870 ia->ia_size, num_zeros);
852 kfree(zeros_virt); 871 kfree(zeros_virt);
853 if (rc) { 872 if (rc) {
854 printk(KERN_ERR "Error attempting to zero out " 873 printk(KERN_ERR "Error attempting to zero out "
@@ -857,7 +876,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
857 goto out_free; 876 goto out_free;
858 } 877 }
859 } 878 }
860 vmtruncate(inode, new_length); 879 vmtruncate(inode, ia->ia_size);
861 rc = ecryptfs_write_inode_size_to_metadata(inode); 880 rc = ecryptfs_write_inode_size_to_metadata(inode);
862 if (rc) { 881 if (rc) {
863 printk(KERN_ERR "Problem with " 882 printk(KERN_ERR "Problem with "
@@ -870,10 +889,12 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
870 lower_size_before_truncate = 889 lower_size_before_truncate =
871 upper_size_to_lower_size(crypt_stat, i_size); 890 upper_size_to_lower_size(crypt_stat, i_size);
872 lower_size_after_truncate = 891 lower_size_after_truncate =
873 upper_size_to_lower_size(crypt_stat, new_length); 892 upper_size_to_lower_size(crypt_stat, ia->ia_size);
874 if (lower_size_after_truncate < lower_size_before_truncate) 893 if (lower_size_after_truncate < lower_size_before_truncate) {
875 vmtruncate(lower_dentry->d_inode, 894 lower_ia->ia_size = lower_size_after_truncate;
876 lower_size_after_truncate); 895 lower_ia->ia_valid |= ATTR_SIZE;
896 } else
897 lower_ia->ia_valid &= ~ATTR_SIZE;
877 } 898 }
878out_free: 899out_free:
879 if (ecryptfs_file_to_private(&fake_ecryptfs_file)) 900 if (ecryptfs_file_to_private(&fake_ecryptfs_file))
@@ -883,6 +904,33 @@ out:
883 return rc; 904 return rc;
884} 905}
885 906
907/**
908 * ecryptfs_truncate
909 * @dentry: The ecryptfs layer dentry
910 * @new_length: The length to expand the file to
911 *
912 * Simple function that handles the truncation of an eCryptfs inode and
913 * its corresponding lower inode.
914 *
915 * Returns zero on success; non-zero otherwise
916 */
917int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
918{
919 struct iattr ia = { .ia_valid = ATTR_SIZE, .ia_size = new_length };
920 struct iattr lower_ia = { .ia_valid = 0 };
921 int rc;
922
923 rc = truncate_upper(dentry, &ia, &lower_ia);
924 if (!rc && lower_ia.ia_valid & ATTR_SIZE) {
925 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
926
927 mutex_lock(&lower_dentry->d_inode->i_mutex);
928 rc = notify_change(lower_dentry, &lower_ia);
929 mutex_unlock(&lower_dentry->d_inode->i_mutex);
930 }
931 return rc;
932}
933
886static int 934static int
887ecryptfs_permission(struct inode *inode, int mask) 935ecryptfs_permission(struct inode *inode, int mask)
888{ 936{
@@ -905,6 +953,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
905{ 953{
906 int rc = 0; 954 int rc = 0;
907 struct dentry *lower_dentry; 955 struct dentry *lower_dentry;
956 struct iattr lower_ia;
908 struct inode *inode; 957 struct inode *inode;
909 struct inode *lower_inode; 958 struct inode *lower_inode;
910 struct ecryptfs_crypt_stat *crypt_stat; 959 struct ecryptfs_crypt_stat *crypt_stat;
@@ -943,15 +992,11 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
943 } 992 }
944 } 993 }
945 mutex_unlock(&crypt_stat->cs_mutex); 994 mutex_unlock(&crypt_stat->cs_mutex);
995 memcpy(&lower_ia, ia, sizeof(lower_ia));
996 if (ia->ia_valid & ATTR_FILE)
997 lower_ia.ia_file = ecryptfs_file_to_lower(ia->ia_file);
946 if (ia->ia_valid & ATTR_SIZE) { 998 if (ia->ia_valid & ATTR_SIZE) {
947 ecryptfs_printk(KERN_DEBUG, 999 rc = truncate_upper(dentry, ia, &lower_ia);
948 "ia->ia_valid = [0x%x] ATTR_SIZE" " = [0x%x]\n",
949 ia->ia_valid, ATTR_SIZE);
950 rc = ecryptfs_truncate(dentry, ia->ia_size);
951 /* ecryptfs_truncate handles resizing of the lower file */
952 ia->ia_valid &= ~ATTR_SIZE;
953 ecryptfs_printk(KERN_DEBUG, "ia->ia_valid = [%x]\n",
954 ia->ia_valid);
955 if (rc < 0) 1000 if (rc < 0)
956 goto out; 1001 goto out;
957 } 1002 }
@@ -960,14 +1005,29 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
960 * mode change is for clearing setuid/setgid bits. Allow lower fs 1005 * mode change is for clearing setuid/setgid bits. Allow lower fs
961 * to interpret this in its own way. 1006 * to interpret this in its own way.
962 */ 1007 */
963 if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) 1008 if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
964 ia->ia_valid &= ~ATTR_MODE; 1009 lower_ia.ia_valid &= ~ATTR_MODE;
965 1010
966 mutex_lock(&lower_dentry->d_inode->i_mutex); 1011 mutex_lock(&lower_dentry->d_inode->i_mutex);
967 rc = notify_change(lower_dentry, ia); 1012 rc = notify_change(lower_dentry, &lower_ia);
968 mutex_unlock(&lower_dentry->d_inode->i_mutex); 1013 mutex_unlock(&lower_dentry->d_inode->i_mutex);
969out: 1014out:
970 fsstack_copy_attr_all(inode, lower_inode, NULL); 1015 fsstack_copy_attr_all(inode, lower_inode);
1016 return rc;
1017}
1018
1019int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1020 struct kstat *stat)
1021{
1022 struct kstat lower_stat;
1023 int rc;
1024
1025 rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry),
1026 ecryptfs_dentry_to_lower(dentry), &lower_stat);
1027 if (!rc) {
1028 generic_fillattr(dentry->d_inode, stat);
1029 stat->blocks = lower_stat.blocks;
1030 }
971 return rc; 1031 return rc;
972} 1032}
973 1033
@@ -1100,6 +1160,7 @@ const struct inode_operations ecryptfs_dir_iops = {
1100const struct inode_operations ecryptfs_main_iops = { 1160const struct inode_operations ecryptfs_main_iops = {
1101 .permission = ecryptfs_permission, 1161 .permission = ecryptfs_permission,
1102 .setattr = ecryptfs_setattr, 1162 .setattr = ecryptfs_setattr,
1163 .getattr = ecryptfs_getattr,
1103 .setxattr = ecryptfs_setxattr, 1164 .setxattr = ecryptfs_setxattr,
1104 .getxattr = ecryptfs_getxattr, 1165 .getxattr = ecryptfs_getxattr,
1105 .listxattr = ecryptfs_listxattr, 1166 .listxattr = ecryptfs_listxattr,
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index a0a7847567e9..89c5476506ef 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -32,6 +32,7 @@
32#include <linux/random.h> 32#include <linux/random.h>
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/scatterlist.h> 34#include <linux/scatterlist.h>
35#include <linux/slab.h>
35#include "ecryptfs_kernel.h" 36#include "ecryptfs_kernel.h"
36 37
37/** 38/**
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index e14cf7e588db..d8c3a373aafa 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/slab.h>
25#include <linux/wait.h> 26#include <linux/wait.h>
26#include <linux/mount.h> 27#include <linux/mount.h>
27#include "ecryptfs_kernel.h" 28#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index c6ac85d6c701..af1a8f01ebac 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -35,7 +35,7 @@
35#include <linux/key.h> 35#include <linux/key.h>
36#include <linux/parser.h> 36#include <linux/parser.h>
37#include <linux/fs_stack.h> 37#include <linux/fs_stack.h>
38#include <linux/ima.h> 38#include <linux/slab.h>
39#include "ecryptfs_kernel.h" 39#include "ecryptfs_kernel.h"
40 40
41/** 41/**
@@ -119,7 +119,6 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
119 const struct cred *cred = current_cred(); 119 const struct cred *cred = current_cred();
120 struct ecryptfs_inode_info *inode_info = 120 struct ecryptfs_inode_info *inode_info =
121 ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); 121 ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
122 int opened_lower_file = 0;
123 int rc = 0; 122 int rc = 0;
124 123
125 mutex_lock(&inode_info->lower_file_mutex); 124 mutex_lock(&inode_info->lower_file_mutex);
@@ -136,12 +135,9 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
136 "for lower_dentry [0x%p] and lower_mnt [0x%p]; " 135 "for lower_dentry [0x%p] and lower_mnt [0x%p]; "
137 "rc = [%d]\n", lower_dentry, lower_mnt, rc); 136 "rc = [%d]\n", lower_dentry, lower_mnt, rc);
138 inode_info->lower_file = NULL; 137 inode_info->lower_file = NULL;
139 } else 138 }
140 opened_lower_file = 1;
141 } 139 }
142 mutex_unlock(&inode_info->lower_file_mutex); 140 mutex_unlock(&inode_info->lower_file_mutex);
143 if (opened_lower_file)
144 ima_counts_get(inode_info->lower_file);
145 return rc; 141 return rc;
146} 142}
147 143
@@ -194,7 +190,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
194 init_special_inode(inode, lower_inode->i_mode, 190 init_special_inode(inode, lower_inode->i_mode,
195 lower_inode->i_rdev); 191 lower_inode->i_rdev);
196 dentry->d_op = &ecryptfs_dops; 192 dentry->d_op = &ecryptfs_dops;
197 fsstack_copy_attr_all(inode, lower_inode, NULL); 193 fsstack_copy_attr_all(inode, lower_inode);
198 /* This size will be overwritten for real files w/ headers and 194 /* This size will be overwritten for real files w/ headers and
199 * other metadata */ 195 * other metadata */
200 fsstack_copy_inode_size(inode, lower_inode); 196 fsstack_copy_inode_size(inode, lower_inode);
@@ -590,8 +586,8 @@ out:
590 * with as much information as it can before needing 586 * with as much information as it can before needing
591 * the lower filesystem. 587 * the lower filesystem.
592 * ecryptfs_read_super(): this accesses the lower filesystem and uses 588 * ecryptfs_read_super(): this accesses the lower filesystem and uses
593 * ecryptfs_interpolate to perform most of the linking 589 * ecryptfs_interpose to perform most of the linking
594 * ecryptfs_interpolate(): links the lower filesystem into ecryptfs 590 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
595 */ 591 */
596static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags, 592static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
597 const char *dev_name, void *raw_data, 593 const char *dev_name, void *raw_data,
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index f1c17e87c5fb..2d8dbce9d485 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -20,6 +20,7 @@
20 * 02111-1307, USA. 20 * 02111-1307, USA.
21 */ 21 */
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/slab.h>
23#include <linux/user_namespace.h> 24#include <linux/user_namespace.h>
24#include <linux/nsproxy.h> 25#include <linux/nsproxy.h>
25#include "ecryptfs_kernel.h" 26#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 4ec8f61ccf5a..3745f612bcd4 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -24,6 +24,7 @@
24#include <linux/random.h> 24#include <linux/random.h>
25#include <linux/miscdevice.h> 25#include <linux/miscdevice.h>
26#include <linux/poll.h> 26#include <linux/poll.h>
27#include <linux/slab.h>
27#include <linux/wait.h> 28#include <linux/wait.h>
28#include <linux/module.h> 29#include <linux/module.h>
29#include "ecryptfs_kernel.h" 30#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index df4ce99d0597..d491237c98e7 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -32,6 +32,7 @@
32#include <linux/file.h> 32#include <linux/file.h>
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/scatterlist.h> 34#include <linux/scatterlist.h>
35#include <linux/slab.h>
35#include <asm/unaligned.h> 36#include <asm/unaligned.h>
36#include "ecryptfs_kernel.h" 37#include "ecryptfs_kernel.h"
37 38
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index b15a43a80ab7..fcef41c1d2cf 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -26,6 +26,7 @@
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/key.h> 28#include <linux/key.h>
29#include <linux/slab.h>
29#include <linux/seq_file.h> 30#include <linux/seq_file.h>
30#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
31#include <linux/file.h> 32#include <linux/file.h>
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 8b47e4200e65..6bd3f76fdf88 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -11,6 +11,7 @@
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/slab.h>
14#include <linux/list.h> 15#include <linux/list.h>
15#include <linux/spinlock.h> 16#include <linux/spinlock.h>
16#include <linux/anon_inodes.h> 17#include <linux/anon_inodes.h>
@@ -135,26 +136,71 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
135 return events; 136 return events;
136} 137}
137 138
138static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, 139static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
139 loff_t *ppos) 140{
141 *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
142 ctx->count -= *cnt;
143}
144
145/**
 146 * eventfd_ctx_remove_wait_queue - Reads the current counter and removes the wait queue.
147 * @ctx: [in] Pointer to eventfd context.
148 * @wait: [in] Wait queue to be removed.
 149 * @cnt: [out] Pointer to the 64-bit counter value.
150 *
151 * Returns zero if successful, or the following error codes:
152 *
153 * -EAGAIN : The operation would have blocked.
154 *
155 * This is used to atomically remove a wait queue entry from the eventfd wait
156 * queue head, and read/reset the counter value.
157 */
158int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
159 __u64 *cnt)
160{
161 unsigned long flags;
162
163 spin_lock_irqsave(&ctx->wqh.lock, flags);
164 eventfd_ctx_do_read(ctx, cnt);
165 __remove_wait_queue(&ctx->wqh, wait);
166 if (*cnt != 0 && waitqueue_active(&ctx->wqh))
167 wake_up_locked_poll(&ctx->wqh, POLLOUT);
168 spin_unlock_irqrestore(&ctx->wqh.lock, flags);
169
170 return *cnt != 0 ? 0 : -EAGAIN;
171}
172EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
173
174/**
 175 * eventfd_ctx_read - Reads the eventfd counter or waits if it is zero.
176 * @ctx: [in] Pointer to eventfd context.
 177 * @no_wait: [in] Nonzero if the operation should not block.
 178 * @cnt: [out] Pointer to the 64-bit counter value.
179 *
180 * Returns zero if successful, or the following error codes:
181 *
182 * -EAGAIN : The operation would have blocked but @no_wait was nonzero.
183 * -ERESTARTSYS : A signal interrupted the wait operation.
184 *
185 * If @no_wait is zero, the function might sleep until the eventfd internal
186 * counter becomes greater than zero.
187 */
188ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
140{ 189{
141 struct eventfd_ctx *ctx = file->private_data;
142 ssize_t res; 190 ssize_t res;
143 __u64 ucnt = 0;
144 DECLARE_WAITQUEUE(wait, current); 191 DECLARE_WAITQUEUE(wait, current);
145 192
146 if (count < sizeof(ucnt))
147 return -EINVAL;
148 spin_lock_irq(&ctx->wqh.lock); 193 spin_lock_irq(&ctx->wqh.lock);
194 *cnt = 0;
149 res = -EAGAIN; 195 res = -EAGAIN;
150 if (ctx->count > 0) 196 if (ctx->count > 0)
151 res = sizeof(ucnt); 197 res = 0;
152 else if (!(file->f_flags & O_NONBLOCK)) { 198 else if (!no_wait) {
153 __add_wait_queue(&ctx->wqh, &wait); 199 __add_wait_queue(&ctx->wqh, &wait);
154 for (res = 0;;) { 200 for (;;) {
155 set_current_state(TASK_INTERRUPTIBLE); 201 set_current_state(TASK_INTERRUPTIBLE);
156 if (ctx->count > 0) { 202 if (ctx->count > 0) {
157 res = sizeof(ucnt); 203 res = 0;
158 break; 204 break;
159 } 205 }
160 if (signal_pending(current)) { 206 if (signal_pending(current)) {
@@ -168,18 +214,32 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
168 __remove_wait_queue(&ctx->wqh, &wait); 214 __remove_wait_queue(&ctx->wqh, &wait);
169 __set_current_state(TASK_RUNNING); 215 __set_current_state(TASK_RUNNING);
170 } 216 }
171 if (likely(res > 0)) { 217 if (likely(res == 0)) {
172 ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; 218 eventfd_ctx_do_read(ctx, cnt);
173 ctx->count -= ucnt;
174 if (waitqueue_active(&ctx->wqh)) 219 if (waitqueue_active(&ctx->wqh))
175 wake_up_locked_poll(&ctx->wqh, POLLOUT); 220 wake_up_locked_poll(&ctx->wqh, POLLOUT);
176 } 221 }
177 spin_unlock_irq(&ctx->wqh.lock); 222 spin_unlock_irq(&ctx->wqh.lock);
178 if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
179 return -EFAULT;
180 223
181 return res; 224 return res;
182} 225}
226EXPORT_SYMBOL_GPL(eventfd_ctx_read);
227
228static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
229 loff_t *ppos)
230{
231 struct eventfd_ctx *ctx = file->private_data;
232 ssize_t res;
233 __u64 cnt;
234
235 if (count < sizeof(cnt))
236 return -EINVAL;
237 res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt);
238 if (res < 0)
239 return res;
240
241 return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt);
242}
183 243
184static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count, 244static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
185 loff_t *ppos) 245 loff_t *ppos)
@@ -339,7 +399,7 @@ struct file *eventfd_file_create(unsigned int count, int flags)
339 ctx->flags = flags; 399 ctx->flags = flags;
340 400
341 file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, 401 file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
342 flags & EFD_SHARED_FCNTL_FLAGS); 402 O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
343 if (IS_ERR(file)) 403 if (IS_ERR(file))
344 eventfd_free_ctx(ctx); 404 eventfd_free_ctx(ctx);
345 405
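The eventfd hunk exports the read path so other kernel code can consume a counter without going through a struct file; eventfd_ctx_remove_wait_queue() additionally serves callers that parked their own wait queue entry on the context. A hedged usage sketch; obtaining the ctx through something like eventfd_ctx_fdget() is an assumption about the surrounding API rather than something this diff shows:

#include <linux/eventfd.h>

/* Drain the counter without blocking; returns 0 when the counter was
 * read or was already zero, a negative errno otherwise. */
static int demo_drain(struct eventfd_ctx *ctx)
{
	__u64 cnt;
	ssize_t res = eventfd_ctx_read(ctx, 1 /* no_wait */, &cnt);

	if (res == -EAGAIN)
		return 0;	/* counter was zero, nothing pending */
	/* on success (res == 0), cnt holds the value read:
	 * 1 in EFD_SEMAPHORE mode, the whole counter otherwise */
	return (int)res;
}

The O_RDWR additions to anon_inode_getfile() here and anon_inode_getfd() in eventpoll.c make the f_mode of these pseudo-files reflect that they are genuinely read-write, rather than inheriting whatever the flag bits happened to imply.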
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 085c5c063420..bd056a5b4efc 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -251,10 +251,10 @@ ctl_table epoll_table[] = {
251 .data = &max_user_watches, 251 .data = &max_user_watches,
252 .maxlen = sizeof(int), 252 .maxlen = sizeof(int),
253 .mode = 0644, 253 .mode = 0644,
254 .proc_handler = &proc_dointvec_minmax, 254 .proc_handler = proc_dointvec_minmax,
255 .extra1 = &zero, 255 .extra1 = &zero,
256 }, 256 },
257 { .ctl_name = 0 } 257 { }
258}; 258};
259#endif /* CONFIG_SYSCTL */ 259#endif /* CONFIG_SYSCTL */
260 260
@@ -1206,7 +1206,7 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
1206 * a file structure and a free file descriptor. 1206 * a file structure and a free file descriptor.
1207 */ 1207 */
1208 error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, 1208 error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
1209 flags & O_CLOEXEC); 1209 O_RDWR | (flags & O_CLOEXEC));
1210 if (error < 0) 1210 if (error < 0)
1211 ep_free(ep); 1211 ep_free(ep);
1212 1212
diff --git a/fs/exec.c b/fs/exec.c
index ba112bd4a339..49cdaa19e5b9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -46,7 +46,6 @@
46#include <linux/proc_fs.h> 46#include <linux/proc_fs.h>
47#include <linux/mount.h> 47#include <linux/mount.h>
48#include <linux/security.h> 48#include <linux/security.h>
49#include <linux/ima.h>
50#include <linux/syscalls.h> 49#include <linux/syscalls.h>
51#include <linux/tsacct_kern.h> 50#include <linux/tsacct_kern.h>
52#include <linux/cn_proc.h> 51#include <linux/cn_proc.h>
@@ -196,7 +195,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
196 * to work from. 195 * to work from.
197 */ 196 */
198 rlim = current->signal->rlim; 197 rlim = current->signal->rlim;
199 if (size > rlim[RLIMIT_STACK].rlim_cur / 4) { 198 if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) {
200 put_page(page); 199 put_page(page);
201 return NULL; 200 return NULL;
202 } 201 }
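ACCESS_ONCE() pins the stack-limit check above to a single load: rlim_cur can be changed concurrently by setrlimit(), and without the annotation the compiler is free to reload it, letting the comparison and any later use observe different values. A sketch:

#include <linux/compiler.h>
#include <linux/resource.h>

static int demo_arg_too_big(struct rlimit *rlim, unsigned long size)
{
	/* one load; a racing setrlimit() cannot split the check */
	unsigned long cur = ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur);

	return size > cur / 4;
}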
@@ -247,6 +246,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
247 vma->vm_start = vma->vm_end - PAGE_SIZE; 246 vma->vm_start = vma->vm_end - PAGE_SIZE;
248 vma->vm_flags = VM_STACK_FLAGS; 247 vma->vm_flags = VM_STACK_FLAGS;
249 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 248 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
249 INIT_LIST_HEAD(&vma->anon_vma_chain);
250 err = insert_vm_struct(mm, vma); 250 err = insert_vm_struct(mm, vma);
251 if (err) 251 if (err)
252 goto err; 252 goto err;
@@ -517,7 +517,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
517 /* 517 /*
518 * cover the whole range: [new_start, old_end) 518 * cover the whole range: [new_start, old_end)
519 */ 519 */
520 vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL); 520 if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
521 return -ENOMEM;
521 522
522 /* 523 /*
523 * move the page tables downwards, on failure we rely on 524 * move the page tables downwards, on failure we rely on
@@ -548,15 +549,13 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
548 tlb_finish_mmu(tlb, new_end, old_end); 549 tlb_finish_mmu(tlb, new_end, old_end);
549 550
550 /* 551 /*
551 * shrink the vma to just the new range. 552 * Shrink the vma to just the new range. Always succeeds.
552 */ 553 */
553 vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); 554 vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
554 555
555 return 0; 556 return 0;
556} 557}
557 558
558#define EXTRA_STACK_VM_PAGES 20 /* random */
559
560/* 559/*
561 * Finalizes the stack vm_area_struct. The flags and permissions are updated, 560 * Finalizes the stack vm_area_struct. The flags and permissions are updated,
562 * the stack is optionally relocated, and some extra space is added. 561 * the stack is optionally relocated, and some extra space is added.
@@ -572,10 +571,13 @@ int setup_arg_pages(struct linux_binprm *bprm,
572 struct vm_area_struct *prev = NULL; 571 struct vm_area_struct *prev = NULL;
573 unsigned long vm_flags; 572 unsigned long vm_flags;
574 unsigned long stack_base; 573 unsigned long stack_base;
574 unsigned long stack_size;
575 unsigned long stack_expand;
576 unsigned long rlim_stack;
575 577
576#ifdef CONFIG_STACK_GROWSUP 578#ifdef CONFIG_STACK_GROWSUP
577 /* Limit stack size to 1GB */ 579 /* Limit stack size to 1GB */
578 stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max; 580 stack_base = rlimit_max(RLIMIT_STACK);
579 if (stack_base > (1 << 30)) 581 if (stack_base > (1 << 30))
580 stack_base = 1 << 30; 582 stack_base = 1 << 30;
581 583
@@ -628,10 +630,23 @@ int setup_arg_pages(struct linux_binprm *bprm,
628 goto out_unlock; 630 goto out_unlock;
629 } 631 }
630 632
633 stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
634 stack_size = vma->vm_end - vma->vm_start;
635 /*
636 * Align this down to a page boundary as expand_stack
637 * will align it up.
638 */
639 rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK;
631#ifdef CONFIG_STACK_GROWSUP 640#ifdef CONFIG_STACK_GROWSUP
632 stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE; 641 if (stack_size + stack_expand > rlim_stack)
642 stack_base = vma->vm_start + rlim_stack;
643 else
644 stack_base = vma->vm_end + stack_expand;
633#else 645#else
634 stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE; 646 if (stack_size + stack_expand > rlim_stack)
647 stack_base = vma->vm_end - rlim_stack;
648 else
649 stack_base = vma->vm_start - stack_expand;
635#endif 650#endif
636 ret = expand_stack(vma, stack_base); 651 ret = expand_stack(vma, stack_base);
637 if (ret) 652 if (ret)
@@ -703,6 +718,7 @@ static int exec_mmap(struct mm_struct *mm)
703 /* Notify parent that we're no longer interested in the old VM */ 718 /* Notify parent that we're no longer interested in the old VM */
704 tsk = current; 719 tsk = current;
705 old_mm = current->mm; 720 old_mm = current->mm;
721 sync_mm_rss(tsk, old_mm);
706 mm_release(tsk, old_mm); 722 mm_release(tsk, old_mm);
707 723
708 if (old_mm) { 724 if (old_mm) {
@@ -827,7 +843,9 @@ static int de_thread(struct task_struct *tsk)
827 attach_pid(tsk, PIDTYPE_PID, task_pid(leader)); 843 attach_pid(tsk, PIDTYPE_PID, task_pid(leader));
828 transfer_pid(leader, tsk, PIDTYPE_PGID); 844 transfer_pid(leader, tsk, PIDTYPE_PGID);
829 transfer_pid(leader, tsk, PIDTYPE_SID); 845 transfer_pid(leader, tsk, PIDTYPE_SID);
846
830 list_replace_rcu(&leader->tasks, &tsk->tasks); 847 list_replace_rcu(&leader->tasks, &tsk->tasks);
848 list_replace_init(&leader->sibling, &tsk->sibling);
831 849
832 tsk->group_leader = tsk; 850 tsk->group_leader = tsk;
833 leader->group_leader = tsk; 851 leader->group_leader = tsk;
@@ -924,6 +942,15 @@ char *get_task_comm(char *buf, struct task_struct *tsk)
924void set_task_comm(struct task_struct *tsk, char *buf) 942void set_task_comm(struct task_struct *tsk, char *buf)
925{ 943{
926 task_lock(tsk); 944 task_lock(tsk);
945
946 /*
947 * Threads may access current->comm without holding
948 * the task lock, so write the string carefully.
949 * Readers without a lock may see incomplete new
950 * names but are safe from non-terminating string reads.
951 */
952 memset(tsk->comm, 0, TASK_COMM_LEN);
953 wmb();
927 strlcpy(tsk->comm, buf, sizeof(tsk->comm)); 954 strlcpy(tsk->comm, buf, sizeof(tsk->comm));
928 task_unlock(tsk); 955 task_unlock(tsk);
929 perf_event_comm(tsk); 956 perf_event_comm(tsk);
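The memset-then-wmb sequence added to set_task_comm() is a publication ordering trick: zeroing the whole buffer before the copy guarantees lockless readers always find a NUL terminator, even if they race with the strlcpy(). A minimal sketch of the writer side, assuming wmb() is available from the arch barrier header of this era (asm/system.h):

#include <linux/string.h>
#include <asm/system.h>		/* wmb(); header location varies by version */

#define DEMO_COMM_LEN 16

static char demo_comm[DEMO_COMM_LEN];

/* Lockless readers may observe a truncated or empty name, but never a
 * buffer without a NUL, because the zeroing is ordered before the copy. */
static void demo_set_comm(const char *buf)
{
	memset(demo_comm, 0, DEMO_COMM_LEN);
	wmb();
	strlcpy(demo_comm, buf, DEMO_COMM_LEN);
}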
@@ -931,9 +958,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
931 958
932int flush_old_exec(struct linux_binprm * bprm) 959int flush_old_exec(struct linux_binprm * bprm)
933{ 960{
934 char * name; 961 int retval;
935 int i, ch, retval;
936 char tcomm[sizeof(current->comm)];
937 962
938 /* 963 /*
939 * Make sure we have a private signal table and that 964 * Make sure we have a private signal table and that
@@ -954,6 +979,25 @@ int flush_old_exec(struct linux_binprm * bprm)
954 979
955 bprm->mm = NULL; /* We're using it now */ 980 bprm->mm = NULL; /* We're using it now */
956 981
982 current->flags &= ~PF_RANDOMIZE;
983 flush_thread();
984 current->personality &= ~bprm->per_clear;
985
986 return 0;
987
988out:
989 return retval;
990}
991EXPORT_SYMBOL(flush_old_exec);
992
993void setup_new_exec(struct linux_binprm * bprm)
994{
995 int i, ch;
996 char * name;
997 char tcomm[sizeof(current->comm)];
998
999 arch_pick_mmap_layout(current->mm);
1000
957 /* This is the point of no return */ 1001 /* This is the point of no return */
958 current->sas_ss_sp = current->sas_ss_size = 0; 1002 current->sas_ss_sp = current->sas_ss_size = 0;
959 1003
@@ -975,9 +1019,6 @@ int flush_old_exec(struct linux_binprm * bprm)
975 tcomm[i] = '\0'; 1019 tcomm[i] = '\0';
976 set_task_comm(current, tcomm); 1020 set_task_comm(current, tcomm);
977 1021
978 current->flags &= ~PF_RANDOMIZE;
979 flush_thread();
980
981 /* Set the new mm task size. We have to do that late because it may 1022 /* Set the new mm task size. We have to do that late because it may
982 * depend on TIF_32BIT which is only updated in flush_thread() on 1023 * depend on TIF_32BIT which is only updated in flush_thread() on
983 * some architectures like powerpc 1024 * some architectures like powerpc
@@ -993,8 +1034,6 @@ int flush_old_exec(struct linux_binprm * bprm)
993 set_dumpable(current->mm, suid_dumpable); 1034 set_dumpable(current->mm, suid_dumpable);
994 } 1035 }
995 1036
996 current->personality &= ~bprm->per_clear;
997
998 /* 1037 /*
999 * Flush performance counters when crossing a 1038 * Flush performance counters when crossing a
1000 * security domain: 1039 * security domain:
@@ -1009,14 +1048,8 @@ int flush_old_exec(struct linux_binprm * bprm)
1009 1048
1010 flush_signal_handlers(current, 0); 1049 flush_signal_handlers(current, 0);
1011 flush_old_files(current->files); 1050 flush_old_files(current->files);
1012
1013 return 0;
1014
1015out:
1016 return retval;
1017} 1051}
1018 1052EXPORT_SYMBOL(setup_new_exec);
1019EXPORT_SYMBOL(flush_old_exec);
1020 1053
1021/* 1054/*
1022 * Prepare credentials and lock ->cred_guard_mutex. 1055 * Prepare credentials and lock ->cred_guard_mutex.
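The flush_old_exec()/setup_new_exec() split above gives every binfmt loader a clean two-phase contract: the first call may still fail and return with the old image intact, while the second runs past the point of no return and never fails. The loaders' new calling sequence, sketched:

#include <linux/binfmts.h>
#include <asm/ptrace.h>

static int demo_load_binary(struct linux_binprm *bprm, struct pt_regs *regs)
{
	int retval = flush_old_exec(bprm);

	if (retval)
		return retval;		/* old image still runnable */

	/* per-format personality and arch setup goes here */

	setup_new_exec(bprm);		/* irreversible from this point */

	/* map segments, set up the stack, start_thread(), ... */
	return 0;
}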
@@ -1209,9 +1242,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1209 retval = security_bprm_check(bprm); 1242 retval = security_bprm_check(bprm);
1210 if (retval) 1243 if (retval)
1211 return retval; 1244 return retval;
1212 retval = ima_bprm_check(bprm);
1213 if (retval)
1214 return retval;
1215 1245
1216 /* kernel module loader fixup */ 1246 /* kernel module loader fixup */
1217 /* so we don't try to load run modprobe in kernel space. */ 1247 /* so we don't try to load run modprobe in kernel space. */
@@ -1503,7 +1533,7 @@ static int format_corename(char *corename, long signr)
1503 /* core limit size */ 1533 /* core limit size */
1504 case 'c': 1534 case 'c':
1505 rc = snprintf(out_ptr, out_end - out_ptr, 1535 rc = snprintf(out_ptr, out_end - out_ptr,
1506 "%lu", current->signal->rlim[RLIMIT_CORE].rlim_cur); 1536 "%lu", rlimit(RLIMIT_CORE));
1507 if (rc > out_end - out_ptr) 1537 if (rc > out_end - out_ptr)
1508 goto out; 1538 goto out;
1509 out_ptr += rc; 1539 out_ptr += rc;
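
rlimit() replaces the open-coded signal->rlim dereference here and below. Assuming the accessor introduced alongside this series, its effect is equivalent to:

static inline unsigned long rlimit(unsigned int limit)
{
	/* current task's soft limit for the given resource */
	return current->signal->rlim[limit].rlim_cur;
}

The in-tree helper reads the value through a task_rlimit() indirection; the sketch only shows the resulting behaviour.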
@@ -1531,12 +1561,13 @@ out:
1531 return ispipe; 1561 return ispipe;
1532} 1562}
1533 1563
1534static int zap_process(struct task_struct *start) 1564static int zap_process(struct task_struct *start, int exit_code)
1535{ 1565{
1536 struct task_struct *t; 1566 struct task_struct *t;
1537 int nr = 0; 1567 int nr = 0;
1538 1568
1539 start->signal->flags = SIGNAL_GROUP_EXIT; 1569 start->signal->flags = SIGNAL_GROUP_EXIT;
1570 start->signal->group_exit_code = exit_code;
1540 start->signal->group_stop_count = 0; 1571 start->signal->group_stop_count = 0;
1541 1572
1542 t = start; 1573 t = start;
@@ -1561,8 +1592,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
1561 spin_lock_irq(&tsk->sighand->siglock); 1592 spin_lock_irq(&tsk->sighand->siglock);
1562 if (!signal_group_exit(tsk->signal)) { 1593 if (!signal_group_exit(tsk->signal)) {
1563 mm->core_state = core_state; 1594 mm->core_state = core_state;
1564 tsk->signal->group_exit_code = exit_code; 1595 nr = zap_process(tsk, exit_code);
1565 nr = zap_process(tsk);
1566 } 1596 }
1567 spin_unlock_irq(&tsk->sighand->siglock); 1597 spin_unlock_irq(&tsk->sighand->siglock);
1568 if (unlikely(nr < 0)) 1598 if (unlikely(nr < 0))
@@ -1611,7 +1641,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
1611 if (p->mm) { 1641 if (p->mm) {
1612 if (unlikely(p->mm == mm)) { 1642 if (unlikely(p->mm == mm)) {
1613 lock_task_sighand(p, &flags); 1643 lock_task_sighand(p, &flags);
1614 nr += zap_process(p); 1644 nr += zap_process(p, exit_code);
1615 unlock_task_sighand(p, &flags); 1645 unlock_task_sighand(p, &flags);
1616 } 1646 }
1617 break; 1647 break;
@@ -1718,14 +1748,19 @@ void set_dumpable(struct mm_struct *mm, int value)
1718 } 1748 }
1719} 1749}
1720 1750
1721int get_dumpable(struct mm_struct *mm) 1751static int __get_dumpable(unsigned long mm_flags)
1722{ 1752{
1723 int ret; 1753 int ret;
1724 1754
1725 ret = mm->flags & 0x3; 1755 ret = mm_flags & MMF_DUMPABLE_MASK;
1726 return (ret >= 2) ? 2 : ret; 1756 return (ret >= 2) ? 2 : ret;
1727} 1757}
1728 1758
1759int get_dumpable(struct mm_struct *mm)
1760{
1761 return __get_dumpable(mm->flags);
1762}
1763
1729static void wait_for_dump_helpers(struct file *file) 1764static void wait_for_dump_helpers(struct file *file)
1730{ 1765{
1731 struct pipe_inode_info *pipe; 1766 struct pipe_inode_info *pipe;
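
__get_dumpable() lets do_coredump() test a one-time snapshot of mm->flags (stored in cprm.mm_flags below) instead of re-reading a field that is not protected by any lock on this path, and it replaces the bare 0x3 with the named mask. The whole check amounts to:

	/* MMF_DUMPABLE_MASK covers the two dumpable bits of mm->flags */
	int dumpable = mm_flags & MMF_DUMPABLE_MASK;

	dumpable = (dumpable >= 2) ? 2 : dumpable;	/* 2 == suid-safe mode */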
@@ -1756,17 +1791,26 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1756 struct mm_struct *mm = current->mm; 1791 struct mm_struct *mm = current->mm;
1757 struct linux_binfmt * binfmt; 1792 struct linux_binfmt * binfmt;
1758 struct inode * inode; 1793 struct inode * inode;
1759 struct file * file;
1760 const struct cred *old_cred; 1794 const struct cred *old_cred;
1761 struct cred *cred; 1795 struct cred *cred;
1762 int retval = 0; 1796 int retval = 0;
1763 int flag = 0; 1797 int flag = 0;
1764 int ispipe = 0; 1798 int ispipe = 0;
1765 unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
1766 char **helper_argv = NULL; 1799 char **helper_argv = NULL;
1767 int helper_argc = 0; 1800 int helper_argc = 0;
1768 int dump_count = 0; 1801 int dump_count = 0;
1769 static atomic_t core_dump_count = ATOMIC_INIT(0); 1802 static atomic_t core_dump_count = ATOMIC_INIT(0);
1803 struct coredump_params cprm = {
1804 .signr = signr,
1805 .regs = regs,
1806 .limit = rlimit(RLIMIT_CORE),
1807 /*
1808 * We must use the same mm->flags while dumping core to avoid
1809 * inconsistency of bit flags, since this flag is not protected
1810 * by any locks.
1811 */
1812 .mm_flags = mm->flags,
1813 };
1770 1814
1771 audit_core_dumps(signr); 1815 audit_core_dumps(signr);
1772 1816
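
struct coredump_params gathers what used to travel as separate arguments into do_coredump() and the dump helpers. Reconstructed from the initializer above and the cprm.file uses below (a sketch, not quoted from the header):

struct coredump_params {
	long signr;		/* signal that triggered the dump */
	struct pt_regs *regs;
	struct file *file;	/* filled in later by do_coredump() */
	unsigned long limit;	/* RLIMIT_CORE snapshot */
	unsigned long mm_flags;	/* mm->flags snapshot, see comment above */
};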
@@ -1784,7 +1828,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1784 /* 1828 /*
1785 * If another thread got here first, or we are not dumpable, bail out. 1829 * If another thread got here first, or we are not dumpable, bail out.
1786 */ 1830 */
1787 if (mm->core_state || !get_dumpable(mm)) { 1831 if (mm->core_state || !__get_dumpable(cprm.mm_flags)) {
1788 up_write(&mm->mmap_sem); 1832 up_write(&mm->mmap_sem);
1789 put_cred(cred); 1833 put_cred(cred);
1790 goto fail; 1834 goto fail;
@@ -1795,7 +1839,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1795 * process nor do we know its entire history. We only know it 1839 * process nor do we know its entire history. We only know it
1796 * was tainted so we dump it as root in mode 2. 1840 * was tainted so we dump it as root in mode 2.
1797 */ 1841 */
1798 if (get_dumpable(mm) == 2) { /* Setuid core dump mode */ 1842 if (__get_dumpable(cprm.mm_flags) == 2) {
1843 /* Setuid core dump mode */
1799 flag = O_EXCL; /* Stop rewrite attacks */ 1844 flag = O_EXCL; /* Stop rewrite attacks */
1800 cred->fsuid = 0; /* Dump root private */ 1845 cred->fsuid = 0; /* Dump root private */
1801 } 1846 }
@@ -1822,15 +1867,15 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1822 ispipe = format_corename(corename, signr); 1867 ispipe = format_corename(corename, signr);
1823 unlock_kernel(); 1868 unlock_kernel();
1824 1869
1825 if ((!ispipe) && (core_limit < binfmt->min_coredump)) 1870 if ((!ispipe) && (cprm.limit < binfmt->min_coredump))
1826 goto fail_unlock; 1871 goto fail_unlock;
1827 1872
1828 if (ispipe) { 1873 if (ispipe) {
1829 if (core_limit == 0) { 1874 if (cprm.limit == 0) {
1830 /* 1875 /*
1831 * Normally core limits are irrelevant to pipes, since 1876 * Normally core limits are irrelevant to pipes, since
1832 * we're not writing to the file system, but we use 1877 * we're not writing to the file system, but we use
 1833 * core_limit of 0 here as a special value. Any 1878 * cprm.limit of 0 here as a special value. Any
1834 * non-zero limit gets set to RLIM_INFINITY below, but 1879 * non-zero limit gets set to RLIM_INFINITY below, but
1835 * a limit of 0 skips the dump. This is a consistent 1880 * a limit of 0 skips the dump. This is a consistent
1836 * way to catch recursive crashes. We can still crash 1881 * way to catch recursive crashes. We can still crash
@@ -1863,25 +1908,25 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1863 goto fail_dropcount; 1908 goto fail_dropcount;
1864 } 1909 }
1865 1910
1866 core_limit = RLIM_INFINITY; 1911 cprm.limit = RLIM_INFINITY;
1867 1912
1868 /* SIGPIPE can happen, but it's just never processed */ 1913 /* SIGPIPE can happen, but it's just never processed */
1869 if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL, 1914 if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL,
1870 &file)) { 1915 &cprm.file)) {
1871 printk(KERN_INFO "Core dump to %s pipe failed\n", 1916 printk(KERN_INFO "Core dump to %s pipe failed\n",
1872 corename); 1917 corename);
1873 goto fail_dropcount; 1918 goto fail_dropcount;
1874 } 1919 }
1875 } else 1920 } else
1876 file = filp_open(corename, 1921 cprm.file = filp_open(corename,
1877 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 1922 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1878 0600); 1923 0600);
1879 if (IS_ERR(file)) 1924 if (IS_ERR(cprm.file))
1880 goto fail_dropcount; 1925 goto fail_dropcount;
1881 inode = file->f_path.dentry->d_inode; 1926 inode = cprm.file->f_path.dentry->d_inode;
1882 if (inode->i_nlink > 1) 1927 if (inode->i_nlink > 1)
1883 goto close_fail; /* multiple links - don't dump */ 1928 goto close_fail; /* multiple links - don't dump */
1884 if (!ispipe && d_unhashed(file->f_path.dentry)) 1929 if (!ispipe && d_unhashed(cprm.file->f_path.dentry))
1885 goto close_fail; 1930 goto close_fail;
1886 1931
1887 /* AK: actually i see no reason to not allow this for named pipes etc., 1932 /* AK: actually i see no reason to not allow this for named pipes etc.,
@@ -1891,24 +1936,26 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1891 /* 1936 /*
 1892 * Don't allow local users to get cute and trick others into coredumping 1937 * Don't allow local users to get cute and trick others into coredumping
1893 * into their pre-created files: 1938 * into their pre-created files:
1939 * Note, this is not relevant for pipes
1894 */ 1940 */
1895 if (inode->i_uid != current_fsuid()) 1941 if (!ispipe && (inode->i_uid != current_fsuid()))
1896 goto close_fail; 1942 goto close_fail;
1897 if (!file->f_op) 1943 if (!cprm.file->f_op)
1898 goto close_fail; 1944 goto close_fail;
1899 if (!file->f_op->write) 1945 if (!cprm.file->f_op->write)
1900 goto close_fail; 1946 goto close_fail;
1901 if (!ispipe && do_truncate(file->f_path.dentry, 0, 0, file) != 0) 1947 if (!ispipe &&
1948 do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0)
1902 goto close_fail; 1949 goto close_fail;
1903 1950
1904 retval = binfmt->core_dump(signr, regs, file, core_limit); 1951 retval = binfmt->core_dump(&cprm);
1905 1952
1906 if (retval) 1953 if (retval)
1907 current->signal->group_exit_code |= 0x80; 1954 current->signal->group_exit_code |= 0x80;
1908close_fail: 1955close_fail:
1909 if (ispipe && core_pipe_limit) 1956 if (ispipe && core_pipe_limit)
1910 wait_for_dump_helpers(file); 1957 wait_for_dump_helpers(cprm.file);
1911 filp_close(file, NULL); 1958 filp_close(cprm.file, NULL);
1912fail_dropcount: 1959fail_dropcount:
1913 if (dump_count) 1960 if (dump_count)
1914 atomic_dec(&core_dump_count); 1961 atomic_dec(&core_dump_count);
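
The binfmt->core_dump(&cprm) call above implies the matching hook change in struct linux_binfmt, applied to the individual binary-format dumpers elsewhere in this series:

	/* old */
	int (*core_dump)(long signr, struct pt_regs *regs,
			 struct file *file, unsigned long limit);
	/* new */
	int (*core_dump)(struct coredump_params *cprm);

Each dumper now reads cprm->signr, cprm->regs, cprm->file, cprm->limit and the snapshotted cprm->mm_flags instead.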
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index cc2d22db119c..2d0f757fda3e 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -12,5 +12,5 @@
12# Kbuild - Gets included from the Kernels Makefile and build system 12# Kbuild - Gets included from the Kernels Makefile and build system
13# 13#
14 14
15exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o 15exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o
16obj-$(CONFIG_EXOFS_FS) += exofs.o 16obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index c6718e4817fe..f0d520312d8b 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -49,11 +49,14 @@
49#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */ 49#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */
50#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */ 50#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */
51#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */ 51#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */
52#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */
52#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ 53#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
53 54
54/* exofs Application specific page/attribute */ 55/* exofs Application specific page/attribute */
55# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) 56# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
56# define EXOFS_ATTR_INODE_DATA 1 57# define EXOFS_ATTR_INODE_DATA 1
58# define EXOFS_ATTR_INODE_FILE_LAYOUT 2
59# define EXOFS_ATTR_INODE_DIR_LAYOUT 3
57 60
58/* 61/*
59 * The maximum number of files we can have is limited by the size of the 62 * The maximum number of files we can have is limited by the size of the
@@ -78,17 +81,67 @@ enum {
78#define EXOFS_SUPER_MAGIC 0x5DF5 81#define EXOFS_SUPER_MAGIC 0x5DF5
79 82
80/* 83/*
81 * The file system control block - stored in an object's data (mainly, the one 84 * The file system control block - stored in object EXOFS_SUPER_ID's data.
82 * with ID EXOFS_SUPER_ID). This is where the in-memory superblock is stored 85 * This is where the in-memory superblock is stored on disk.
83 * on disk. Right now it just has a magic value, which is basically a sanity
84 * check on our ability to communicate with the object store.
85 */ 86 */
87enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
86struct exofs_fscb { 88struct exofs_fscb {
87 __le64 s_nextid; /* Highest object ID used */ 89 __le64 s_nextid; /* Highest object ID used */
88 __le32 s_numfiles; /* Number of files on fs */ 90 __le64 s_numfiles; /* Number of files on fs */
91 __le32 s_version; /* == EXOFS_FSCB_VER */
89 __le16 s_magic; /* Magic signature */ 92 __le16 s_magic; /* Magic signature */
90 __le16 s_newfs; /* Non-zero if this is a new fs */ 93 __le16 s_newfs; /* Non-zero if this is a new fs */
91}; 94
95 /* From here on it's a static part, only written by mkexofs */
 96 __le64 s_dev_table_oid; /* Reserved, not used */
97 __le64 s_dev_table_count; /* == 0 means no dev_table */
98} __packed;
99
100/*
101 * Describes the raid used in the FS. It is part of the device table.
 102 * This is taken from the pNFS-objects definition. In exofs we
 103 * use one raid policy throughout the filesystem. (NOTE: the funny
 104 * alignment at the beginning; we take care of it in exofs_device_table.)
105 */
106struct exofs_dt_data_map {
107 __le32 cb_num_comps;
108 __le64 cb_stripe_unit;
109 __le32 cb_group_width;
110 __le32 cb_group_depth;
111 __le32 cb_mirror_cnt;
112 __le32 cb_raid_algorithm;
113} __packed;
114
115/*
116 * This is an osd device information descriptor. It is a single entry in
117 * the exofs device table. It describes an osd target lun which
118 * contains data belonging to this FS. (Same partition_id on all devices)
119 */
120struct exofs_dt_device_info {
121 __le32 systemid_len;
122 u8 systemid[OSD_SYSTEMID_LEN];
123 __le64 long_name_offset; /* If !0 then offset-in-file */
124 __le32 osdname_len; /* */
 125 u8 osdname[44]; /* Embedded, usually an ASCII uuid */
126} __packed;
127
128/*
129 * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data.
 130 * It contains the raid used for this multi-device FS and an array of
131 * participating devices.
132 */
133struct exofs_device_table {
134 __le32 dt_version; /* == EXOFS_DT_VER */
135 struct exofs_dt_data_map dt_data_map; /* Raid policy to use */
136
 137 /* Reserved space for future use. Total including this:
138 * (8 * sizeof(le64))
139 */
140 __le64 __Resurved[4];
141
142 __le64 dt_num_devices; /* Array size */
143 struct exofs_dt_device_info dt_dev_table[]; /* Array of devices */
144} __packed;
92 145
93/**************************************************************************** 146/****************************************************************************
94 * inode-related things 147 * inode-related things
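
A mount-time consumer of the new table reads object EXOFS_DEVTABLE_ID and walks dt_dev_table[]. A sketch of the size arithmetic (exofs_dt_bytes() is a hypothetical helper, not part of the patch; all on-disk fields are little-endian):

static inline size_t exofs_dt_bytes(const struct exofs_device_table *dt)
{
	u64 n = le64_to_cpu(dt->dt_num_devices);

	/* fixed header plus one exofs_dt_device_info per device */
	return sizeof(*dt) + n * sizeof(struct exofs_dt_device_info);
}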
@@ -155,22 +208,41 @@ enum {
155 (((name_len) + offsetof(struct exofs_dir_entry, name) + \ 208 (((name_len) + offsetof(struct exofs_dir_entry, name) + \
156 EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND) 209 EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
157 210
158/************************* 211/*
159 * function declarations * 212 * The on-disk (optional) layout structure.
160 *************************/ 213 * sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT
161/* osd.c */ 214 * attribute, attached to any inode, usually to a directory.
162void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], 215 */
163 const struct osd_obj_id *obj); 216
217enum exofs_inode_layout_gen_functions {
218 LAYOUT_MOVING_WINDOW = 0,
219 LAYOUT_IMPLICT = 1,
220};
164 221
165int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid); 222struct exofs_on_disk_inode_layout {
166static inline int exofs_check_ok(struct osd_request *or) 223 __le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */
224 __le16 pad;
225 union {
226 /* gen_func == LAYOUT_MOVING_WINDOW (default) */
227 struct exofs_layout_sliding_window {
228 __le32 num_devices; /* first n devices in global-table*/
229 } sliding_window __packed;
230
231 /* gen_func == LAYOUT_IMPLICT */
232 struct exofs_layout_implict_list {
233 struct exofs_dt_data_map data_map;
234 /* Variable array of size data_map.cb_num_comps. These
235 * are device indexes of the devices in the global table
236 */
237 __le32 dev_indexes[];
238 } implict __packed;
239 };
240} __packed;
241
242static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs)
167{ 243{
168 return exofs_check_ok_resid(or, NULL, NULL); 244 return sizeof(struct exofs_on_disk_inode_layout) +
245 max_devs * sizeof(__le32);
169} 246}
170int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
171int exofs_async_op(struct osd_request *or,
172 osd_req_done_fn *async_done, void *caller_context, u8 *cred);
173
174int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
175 247
176#endif /*ifndef __EXOFS_COM_H__*/ 248#endif /*ifndef __EXOFS_COM_H__*/
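
exofs_on_disk_inode_layout_size() is sized for the worst case, the LAYOUT_IMPLICT variant whose trailing dev_indexes[] holds one __le32 per component. Callers therefore reserve room for every device in the filesystem when fetching the attribute, as exofs_get_inode() does later in this patch:

	/* room for an implict list spanning all devices in the layout */
	attr.len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);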
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 5ec72e020b22..8442e353309f 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -30,13 +30,17 @@
30 * along with exofs; if not, write to the Free Software 30 * along with exofs; if not, write to the Free Software
31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 */ 32 */
33#ifndef __EXOFS_H__
34#define __EXOFS_H__
33 35
34#include <linux/fs.h> 36#include <linux/fs.h>
35#include <linux/time.h> 37#include <linux/time.h>
36#include "common.h" 38#include "common.h"
37 39
38#ifndef __EXOFS_H__ 40/* FIXME: Remove once pnfs hits mainline
39#define __EXOFS_H__ 41 * #include <linux/exportfs/pnfs_osd_xdr.h>
42 */
43#include "pnfs.h"
40 44
41#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) 45#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
42 46
@@ -51,34 +55,110 @@
51/* u64 has problems with printk this will cast it to unsigned long long */ 55/* u64 has problems with printk this will cast it to unsigned long long */
52#define _LLU(x) (unsigned long long)(x) 56#define _LLU(x) (unsigned long long)(x)
53 57
58struct exofs_layout {
59 osd_id s_pid; /* partition ID of file system*/
60
61 /* Our way of looking at the data_map */
62 unsigned stripe_unit;
63 unsigned mirrors_p1;
64
65 unsigned group_width;
66 u64 group_depth;
67 unsigned group_count;
68
69 enum exofs_inode_layout_gen_functions lay_func;
70
71 unsigned s_numdevs; /* Num of devices in array */
72 struct osd_dev *s_ods[0]; /* Variable length */
73};
74
54/* 75/*
55 * our extension to the in-memory superblock 76 * our extension to the in-memory superblock
56 */ 77 */
57struct exofs_sb_info { 78struct exofs_sb_info {
58 struct osd_dev *s_dev; /* returned by get_osd_dev */ 79 struct exofs_fscb s_fscb; /* Written often, pre-allocate*/
59 osd_id s_pid; /* partition ID of file system*/
60 int s_timeout; /* timeout for OSD operations */ 80 int s_timeout; /* timeout for OSD operations */
61 uint64_t s_nextid; /* highest object ID used */ 81 uint64_t s_nextid; /* highest object ID used */
62 uint32_t s_numfiles; /* number of files on fs */ 82 uint32_t s_numfiles; /* number of files on fs */
63 spinlock_t s_next_gen_lock; /* spinlock for gen # update */ 83 spinlock_t s_next_gen_lock; /* spinlock for gen # update */
64 u32 s_next_generation; /* next gen # to use */ 84 u32 s_next_generation; /* next gen # to use */
65 atomic_t s_curr_pending; /* number of pending commands */ 85 atomic_t s_curr_pending; /* number of pending commands */
66 uint8_t s_cred[OSD_CAP_LEN]; /* all-powerful credential */ 86 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */
87
88 struct pnfs_osd_data_map data_map; /* Default raid to use
89 * FIXME: Needed ?
90 */
91/* struct exofs_layout dir_layout;*/ /* Default dir layout */
92 struct exofs_layout layout; /* Default files layout,
93 * contains the variable osd_dev
94 * array. Keep last */
95 struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */
67}; 96};
68 97
69/* 98/*
70 * our extension to the in-memory inode 99 * our extension to the in-memory inode
71 */ 100 */
72struct exofs_i_info { 101struct exofs_i_info {
102 struct inode vfs_inode; /* normal in-memory inode */
103 wait_queue_head_t i_wq; /* wait queue for inode */
73 unsigned long i_flags; /* various atomic flags */ 104 unsigned long i_flags; /* various atomic flags */
74 uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ 105 uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/
75 uint32_t i_dir_start_lookup; /* which page to start lookup */ 106 uint32_t i_dir_start_lookup; /* which page to start lookup */
76 wait_queue_head_t i_wq; /* wait queue for inode */
77 uint64_t i_commit_size; /* the object's written length */ 107 uint64_t i_commit_size; /* the object's written length */
78 uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ 108 uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */
79 struct inode vfs_inode; /* normal in-memory inode */
80}; 109};
81 110
111static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
112{
113 return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF;
114}
115
116struct exofs_io_state;
117typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private);
118
119struct exofs_io_state {
120 struct kref kref;
121
122 void *private;
123 exofs_io_done_fn done;
124
125 struct exofs_layout *layout;
126 struct osd_obj_id obj;
127 u8 *cred;
128
129 /* Global read/write IO*/
130 loff_t offset;
131 unsigned long length;
132 void *kern_buff;
133
134 struct page **pages;
135 unsigned nr_pages;
136 unsigned pgbase;
137 unsigned pages_consumed;
138
139 /* Attributes */
140 unsigned in_attr_len;
141 struct osd_attr *in_attr;
142 unsigned out_attr_len;
143 struct osd_attr *out_attr;
144
145 /* Variable array of size numdevs */
146 unsigned numdevs;
147 struct exofs_per_dev_state {
148 struct osd_request *or;
149 struct bio *bio;
150 loff_t offset;
151 unsigned length;
152 unsigned dev;
153 } per_dev[];
154};
155
156static inline unsigned exofs_io_state_size(unsigned numdevs)
157{
158 return sizeof(struct exofs_io_state) +
159 sizeof(struct exofs_per_dev_state) * numdevs;
160}
161
82/* 162/*
83 * our inode flags 163 * our inode flags
84 */ 164 */
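
exofs_get_io_state() pairs with exofs_io_state_size() so the state and its per_dev[] tail come from a single allocation. A minimal sketch of what the new ios.c presumably does (error handling trimmed, details unverified):

int exofs_get_io_state(struct exofs_layout *layout,
		       struct exofs_io_state **pios)
{
	struct exofs_io_state *ios;

	/* one per_dev slot for every device in the layout */
	ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL);
	if (unlikely(!ios))
		return -ENOMEM;

	ios->layout = layout;
	ios->obj.partition = layout->s_pid;
	*pios = ios;
	return 0;
}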
@@ -123,6 +203,12 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
123} 203}
124 204
125/* 205/*
206 * Given a layout, object_number and stripe_index return the associated global
207 * dev_index
208 */
209unsigned exofs_layout_od_id(struct exofs_layout *layout,
210 osd_id obj_no, unsigned layout_index);
211/*
126 * Maximum count of links to a file 212 * Maximum count of links to a file
127 */ 213 */
128#define EXOFS_LINK_MAX 32000 214#define EXOFS_LINK_MAX 32000
@@ -130,6 +216,43 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
130/************************* 216/*************************
131 * function declarations * 217 * function declarations *
132 *************************/ 218 *************************/
219
220/* ios.c */
221void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
222 const struct osd_obj_id *obj);
223int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
224 u64 offset, void *p, unsigned length);
225
226int exofs_get_io_state(struct exofs_layout *layout,
227 struct exofs_io_state **ios);
228void exofs_put_io_state(struct exofs_io_state *ios);
229
230int exofs_check_io(struct exofs_io_state *ios, u64 *resid);
231
232int exofs_sbi_create(struct exofs_io_state *ios);
233int exofs_sbi_remove(struct exofs_io_state *ios);
234int exofs_sbi_write(struct exofs_io_state *ios);
235int exofs_sbi_read(struct exofs_io_state *ios);
236
237int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr);
238
239int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len);
240static inline int exofs_oi_write(struct exofs_i_info *oi,
241 struct exofs_io_state *ios)
242{
243 ios->obj.id = exofs_oi_objno(oi);
244 ios->cred = oi->i_cred;
245 return exofs_sbi_write(ios);
246}
247
248static inline int exofs_oi_read(struct exofs_i_info *oi,
249 struct exofs_io_state *ios)
250{
251 ios->obj.id = exofs_oi_objno(oi);
252 ios->cred = oi->i_cred;
253 return exofs_sbi_read(ios);
254}
255
133/* inode.c */ 256/* inode.c */
134void exofs_truncate(struct inode *inode); 257void exofs_truncate(struct inode *inode);
135int exofs_setattr(struct dentry *, struct iattr *); 258int exofs_setattr(struct dentry *, struct iattr *);
@@ -138,7 +261,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
138 struct page **pagep, void **fsdata); 261 struct page **pagep, void **fsdata);
139extern struct inode *exofs_iget(struct super_block *, unsigned long); 262extern struct inode *exofs_iget(struct super_block *, unsigned long);
140struct inode *exofs_new_inode(struct inode *, int); 263struct inode *exofs_new_inode(struct inode *, int);
141extern int exofs_write_inode(struct inode *, int); 264extern int exofs_write_inode(struct inode *, struct writeback_control *wbc);
142extern void exofs_delete_inode(struct inode *); 265extern void exofs_delete_inode(struct inode *);
143 266
144/* dir.c: */ 267/* dir.c: */
@@ -169,6 +292,7 @@ extern const struct file_operations exofs_file_operations;
169 292
170/* inode.c */ 293/* inode.c */
171extern const struct address_space_operations exofs_aops; 294extern const struct address_space_operations exofs_aops;
295extern const struct osd_attr g_attr_logical_length;
172 296
173/* namei.c */ 297/* namei.c */
174extern const struct inode_operations exofs_dir_inode_operations; 298extern const struct inode_operations exofs_dir_inode_operations;
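
With these inlines every object I/O in the patch takes the same shape. A usage sketch for a synchronous read into a kernel buffer (buf, off and len are placeholders):

	struct exofs_io_state *ios;
	int ret;

	ret = exofs_get_io_state(&sbi->layout, &ios);
	if (ret)
		return ret;

	ios->kern_buff = buf;
	ios->offset = off;
	ios->length = len;
	ret = exofs_oi_read(oi, ios);	/* fills in ios->obj.id and ios->cred */

	exofs_put_io_state(ios);
	return ret;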
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 6c10f7476699..76d2a79ef93e 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -31,94 +31,117 @@
31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 */ 32 */
33 33
34#include <linux/slab.h>
34#include <linux/writeback.h> 35#include <linux/writeback.h>
35#include <linux/buffer_head.h> 36#include <linux/buffer_head.h>
36#include <scsi/scsi_device.h> 37#include <scsi/scsi_device.h>
37 38
38#include "exofs.h" 39#include "exofs.h"
39 40
40#ifdef CONFIG_EXOFS_DEBUG 41#define EXOFS_DBGMSG2(M...) do {} while (0)
41# define EXOFS_DEBUG_OBJ_ISIZE 1 42
42#endif 43enum { BIO_MAX_PAGES_KMALLOC =
44 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
45 MAX_PAGES_KMALLOC =
46 PAGE_SIZE / sizeof(struct page *),
47};
43 48
44struct page_collect { 49struct page_collect {
45 struct exofs_sb_info *sbi; 50 struct exofs_sb_info *sbi;
46 struct request_queue *req_q;
47 struct inode *inode; 51 struct inode *inode;
48 unsigned expected_pages; 52 unsigned expected_pages;
53 struct exofs_io_state *ios;
49 54
50 struct bio *bio; 55 struct page **pages;
56 unsigned alloc_pages;
51 unsigned nr_pages; 57 unsigned nr_pages;
52 unsigned long length; 58 unsigned long length;
53 loff_t pg_first; /* keep 64bit also in 32-arches */ 59 loff_t pg_first; /* keep 64bit also in 32-arches */
54}; 60};
55 61
56static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, 62static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
57 struct inode *inode) 63 struct inode *inode)
58{ 64{
59 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; 65 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
60 66
61 pcol->sbi = sbi; 67 pcol->sbi = sbi;
62 pcol->req_q = osd_request_queue(sbi->s_dev);
63 pcol->inode = inode; 68 pcol->inode = inode;
64 pcol->expected_pages = expected_pages; 69 pcol->expected_pages = expected_pages;
65 70
66 pcol->bio = NULL; 71 pcol->ios = NULL;
72 pcol->pages = NULL;
73 pcol->alloc_pages = 0;
67 pcol->nr_pages = 0; 74 pcol->nr_pages = 0;
68 pcol->length = 0; 75 pcol->length = 0;
69 pcol->pg_first = -1; 76 pcol->pg_first = -1;
70
71 EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino,
72 expected_pages);
73} 77}
74 78
75static void _pcol_reset(struct page_collect *pcol) 79static void _pcol_reset(struct page_collect *pcol)
76{ 80{
77 pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages); 81 pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
78 82
79 pcol->bio = NULL; 83 pcol->pages = NULL;
84 pcol->alloc_pages = 0;
80 pcol->nr_pages = 0; 85 pcol->nr_pages = 0;
81 pcol->length = 0; 86 pcol->length = 0;
82 pcol->pg_first = -1; 87 pcol->pg_first = -1;
83 EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n", 88 pcol->ios = NULL;
84 pcol->inode->i_ino, pcol->expected_pages);
85 89
86 /* this is probably the end of the loop but in writes 90 /* this is probably the end of the loop but in writes
87 * it might not end here. don't be left with nothing 91 * it might not end here. don't be left with nothing
88 */ 92 */
89 if (!pcol->expected_pages) 93 if (!pcol->expected_pages)
90 pcol->expected_pages = 128; 94 pcol->expected_pages = MAX_PAGES_KMALLOC;
91} 95}
92 96
93static int pcol_try_alloc(struct page_collect *pcol) 97static int pcol_try_alloc(struct page_collect *pcol)
94{ 98{
95 int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES); 99 unsigned pages = min_t(unsigned, pcol->expected_pages,
100 MAX_PAGES_KMALLOC);
101
102 if (!pcol->ios) { /* First time allocate io_state */
103 int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios);
104
105 if (ret)
106 return ret;
107 }
108
109 /* TODO: easily support bio chaining */
110 pages = min_t(unsigned, pages,
111 pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC);
96 112
97 for (; pages; pages >>= 1) { 113 for (; pages; pages >>= 1) {
98 pcol->bio = bio_alloc(GFP_KERNEL, pages); 114 pcol->pages = kmalloc(pages * sizeof(struct page *),
99 if (likely(pcol->bio)) 115 GFP_KERNEL);
116 if (likely(pcol->pages)) {
117 pcol->alloc_pages = pages;
100 return 0; 118 return 0;
119 }
101 } 120 }
102 121
103 EXOFS_ERR("Failed to kcalloc expected_pages=%u\n", 122 EXOFS_ERR("Failed to kmalloc expected_pages=%u\n",
104 pcol->expected_pages); 123 pcol->expected_pages);
105 return -ENOMEM; 124 return -ENOMEM;
106} 125}
107 126
108static void pcol_free(struct page_collect *pcol) 127static void pcol_free(struct page_collect *pcol)
109{ 128{
110 bio_put(pcol->bio); 129 kfree(pcol->pages);
111 pcol->bio = NULL; 130 pcol->pages = NULL;
131
132 if (pcol->ios) {
133 exofs_put_io_state(pcol->ios);
134 pcol->ios = NULL;
135 }
112} 136}
113 137
114static int pcol_add_page(struct page_collect *pcol, struct page *page, 138static int pcol_add_page(struct page_collect *pcol, struct page *page,
115 unsigned len) 139 unsigned len)
116{ 140{
117 int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0); 141 if (unlikely(pcol->nr_pages >= pcol->alloc_pages))
118 if (unlikely(len != added_len))
119 return -ENOMEM; 142 return -ENOMEM;
120 143
121 ++pcol->nr_pages; 144 pcol->pages[pcol->nr_pages++] = page;
122 pcol->length += len; 145 pcol->length += len;
123 return 0; 146 return 0;
124} 147}
@@ -161,32 +184,26 @@ static void update_write_page(struct page *page, int ret)
161/* Called at the end of reads, to optionally unlock pages and update their 184/* Called at the end of reads, to optionally unlock pages and update their
162 * status. 185 * status.
163 */ 186 */
164static int __readpages_done(struct osd_request *or, struct page_collect *pcol, 187static int __readpages_done(struct page_collect *pcol, bool do_unlock)
165 bool do_unlock)
166{ 188{
167 struct bio_vec *bvec;
168 int i; 189 int i;
169 u64 resid; 190 u64 resid;
170 u64 good_bytes; 191 u64 good_bytes;
171 u64 length = 0; 192 u64 length = 0;
172 int ret = exofs_check_ok_resid(or, &resid, NULL); 193 int ret = exofs_check_io(pcol->ios, &resid);
173
174 osd_end_request(or);
175 194
176 if (likely(!ret)) 195 if (likely(!ret))
177 good_bytes = pcol->length; 196 good_bytes = pcol->length;
178 else if (!resid)
179 good_bytes = 0;
180 else 197 else
181 good_bytes = pcol->length - resid; 198 good_bytes = pcol->length - resid;
182 199
183 EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx" 200 EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
184 " length=0x%lx nr_pages=%u\n", 201 " length=0x%lx nr_pages=%u\n",
185 pcol->inode->i_ino, _LLU(good_bytes), pcol->length, 202 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
186 pcol->nr_pages); 203 pcol->nr_pages);
187 204
188 __bio_for_each_segment(bvec, pcol->bio, i, 0) { 205 for (i = 0; i < pcol->nr_pages; i++) {
189 struct page *page = bvec->bv_page; 206 struct page *page = pcol->pages[i];
190 struct inode *inode = page->mapping->host; 207 struct inode *inode = page->mapping->host;
191 int page_stat; 208 int page_stat;
192 209
@@ -198,38 +215,37 @@ static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
198 else 215 else
199 page_stat = ret; 216 page_stat = ret;
200 217
201 EXOFS_DBGMSG(" readpages_done(0x%lx, 0x%lx) %s\n", 218 EXOFS_DBGMSG2(" readpages_done(0x%lx, 0x%lx) %s\n",
202 inode->i_ino, page->index, 219 inode->i_ino, page->index,
203 page_stat ? "bad_bytes" : "good_bytes"); 220 page_stat ? "bad_bytes" : "good_bytes");
204 221
205 ret = update_read_page(page, page_stat); 222 ret = update_read_page(page, page_stat);
206 if (do_unlock) 223 if (do_unlock)
207 unlock_page(page); 224 unlock_page(page);
208 length += bvec->bv_len; 225 length += PAGE_SIZE;
209 } 226 }
210 227
211 pcol_free(pcol); 228 pcol_free(pcol);
212 EXOFS_DBGMSG("readpages_done END\n"); 229 EXOFS_DBGMSG2("readpages_done END\n");
213 return ret; 230 return ret;
214} 231}
215 232
216/* callback of async reads */ 233/* callback of async reads */
217static void readpages_done(struct osd_request *or, void *p) 234static void readpages_done(struct exofs_io_state *ios, void *p)
218{ 235{
219 struct page_collect *pcol = p; 236 struct page_collect *pcol = p;
220 237
221 __readpages_done(or, pcol, true); 238 __readpages_done(pcol, true);
222 atomic_dec(&pcol->sbi->s_curr_pending); 239 atomic_dec(&pcol->sbi->s_curr_pending);
223 kfree(p); 240 kfree(pcol);
224} 241}
225 242
226static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) 243static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
227{ 244{
228 struct bio_vec *bvec;
229 int i; 245 int i;
230 246
231 __bio_for_each_segment(bvec, pcol->bio, i, 0) { 247 for (i = 0; i < pcol->nr_pages; i++) {
232 struct page *page = bvec->bv_page; 248 struct page *page = pcol->pages[i];
233 249
234 if (rw == READ) 250 if (rw == READ)
235 update_read_page(page, ret); 251 update_read_page(page, ret);
@@ -238,36 +254,29 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
238 254
239 unlock_page(page); 255 unlock_page(page);
240 } 256 }
241 pcol_free(pcol);
242} 257}
243 258
244static int read_exec(struct page_collect *pcol, bool is_sync) 259static int read_exec(struct page_collect *pcol, bool is_sync)
245{ 260{
246 struct exofs_i_info *oi = exofs_i(pcol->inode); 261 struct exofs_i_info *oi = exofs_i(pcol->inode);
247 struct osd_obj_id obj = {pcol->sbi->s_pid, 262 struct exofs_io_state *ios = pcol->ios;
248 pcol->inode->i_ino + EXOFS_OBJ_OFF};
249 struct osd_request *or = NULL;
250 struct page_collect *pcol_copy = NULL; 263 struct page_collect *pcol_copy = NULL;
251 loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
252 int ret; 264 int ret;
253 265
254 if (!pcol->bio) 266 if (!pcol->pages)
255 return 0; 267 return 0;
256 268
257 /* see comment in _readpage() about sync reads */ 269 /* see comment in _readpage() about sync reads */
258 WARN_ON(is_sync && (pcol->nr_pages != 1)); 270 WARN_ON(is_sync && (pcol->nr_pages != 1));
259 271
260 or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL); 272 ios->pages = pcol->pages;
261 if (unlikely(!or)) { 273 ios->nr_pages = pcol->nr_pages;
262 ret = -ENOMEM; 274 ios->length = pcol->length;
263 goto err; 275 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
264 }
265
266 osd_req_read(or, &obj, i_start, pcol->bio, pcol->length);
267 276
268 if (is_sync) { 277 if (is_sync) {
269 exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred); 278 exofs_oi_read(oi, pcol->ios);
270 return __readpages_done(or, pcol, false); 279 return __readpages_done(pcol, false);
271 } 280 }
272 281
273 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 282 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -277,14 +286,16 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
277 } 286 }
278 287
279 *pcol_copy = *pcol; 288 *pcol_copy = *pcol;
280 ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred); 289 ios->done = readpages_done;
290 ios->private = pcol_copy;
291 ret = exofs_oi_read(oi, ios);
281 if (unlikely(ret)) 292 if (unlikely(ret))
282 goto err; 293 goto err;
283 294
284 atomic_inc(&pcol->sbi->s_curr_pending); 295 atomic_inc(&pcol->sbi->s_curr_pending);
285 296
286 EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", 297 EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
287 obj.id, _LLU(i_start), pcol->length); 298 ios->obj.id, _LLU(ios->offset), pcol->length);
288 299
289 /* pages ownership was passed to pcol_copy */ 300 /* pages ownership was passed to pcol_copy */
290 _pcol_reset(pcol); 301 _pcol_reset(pcol);
@@ -293,12 +304,10 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
293err: 304err:
294 if (!is_sync) 305 if (!is_sync)
295 _unlock_pcol_pages(pcol, ret, READ); 306 _unlock_pcol_pages(pcol, ret, READ);
296 else /* Pages unlocked by caller in sync mode only free bio */ 307
297 pcol_free(pcol); 308 pcol_free(pcol);
298 309
299 kfree(pcol_copy); 310 kfree(pcol_copy);
300 if (or)
301 osd_end_request(or);
302 return ret; 311 return ret;
303} 312}
304 313
@@ -361,7 +370,7 @@ try_again:
361 goto try_again; 370 goto try_again;
362 } 371 }
363 372
364 if (!pcol->bio) { 373 if (!pcol->pages) {
365 ret = pcol_try_alloc(pcol); 374 ret = pcol_try_alloc(pcol);
366 if (unlikely(ret)) 375 if (unlikely(ret))
367 goto fail; 376 goto fail;
@@ -370,12 +379,12 @@ try_again:
370 if (len != PAGE_CACHE_SIZE) 379 if (len != PAGE_CACHE_SIZE)
371 zero_user(page, len, PAGE_CACHE_SIZE - len); 380 zero_user(page, len, PAGE_CACHE_SIZE - len);
372 381
373 EXOFS_DBGMSG(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n", 382 EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
374 inode->i_ino, page->index, len); 383 inode->i_ino, page->index, len);
375 384
376 ret = pcol_add_page(pcol, page, len); 385 ret = pcol_add_page(pcol, page, len);
377 if (ret) { 386 if (ret) {
378 EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p " 387 EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p "
379 "this_len=0x%zx nr_pages=%u length=0x%lx\n", 388 "this_len=0x%zx nr_pages=%u length=0x%lx\n",
380 page, len, pcol->nr_pages, pcol->length); 389 page, len, pcol->nr_pages, pcol->length);
381 390
@@ -419,9 +428,8 @@ static int _readpage(struct page *page, bool is_sync)
419 428
420 _pcol_init(&pcol, 1, page->mapping->host); 429 _pcol_init(&pcol, 1, page->mapping->host);
421 430
422 /* readpage_strip might call read_exec(,async) inside at several places 431 /* readpage_strip might call read_exec(,is_sync==false) at several
423 * but this is safe for is_async=0 since read_exec will not do anything 432 * places but not if we have a single page.
424 * when we have a single page.
425 */ 433 */
426 ret = readpage_strip(&pcol, page); 434 ret = readpage_strip(&pcol, page);
427 if (ret) { 435 if (ret) {
@@ -440,35 +448,30 @@ static int exofs_readpage(struct file *file, struct page *page)
440 return _readpage(page, false); 448 return _readpage(page, false);
441} 449}
442 450
443/* Callback for osd_write. All writes are asynchronouse */ 451/* Callback for osd_write. All writes are asynchronous */
444static void writepages_done(struct osd_request *or, void *p) 452static void writepages_done(struct exofs_io_state *ios, void *p)
445{ 453{
446 struct page_collect *pcol = p; 454 struct page_collect *pcol = p;
447 struct bio_vec *bvec;
448 int i; 455 int i;
449 u64 resid; 456 u64 resid;
450 u64 good_bytes; 457 u64 good_bytes;
451 u64 length = 0; 458 u64 length = 0;
459 int ret = exofs_check_io(ios, &resid);
452 460
453 int ret = exofs_check_ok_resid(or, NULL, &resid);
454
455 osd_end_request(or);
456 atomic_dec(&pcol->sbi->s_curr_pending); 461 atomic_dec(&pcol->sbi->s_curr_pending);
457 462
458 if (likely(!ret)) 463 if (likely(!ret))
459 good_bytes = pcol->length; 464 good_bytes = pcol->length;
460 else if (!resid)
461 good_bytes = 0;
462 else 465 else
463 good_bytes = pcol->length - resid; 466 good_bytes = pcol->length - resid;
464 467
465 EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx" 468 EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
466 " length=0x%lx nr_pages=%u\n", 469 " length=0x%lx nr_pages=%u\n",
467 pcol->inode->i_ino, _LLU(good_bytes), pcol->length, 470 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
468 pcol->nr_pages); 471 pcol->nr_pages);
469 472
470 __bio_for_each_segment(bvec, pcol->bio, i, 0) { 473 for (i = 0; i < pcol->nr_pages; i++) {
471 struct page *page = bvec->bv_page; 474 struct page *page = pcol->pages[i];
472 struct inode *inode = page->mapping->host; 475 struct inode *inode = page->mapping->host;
473 int page_stat; 476 int page_stat;
474 477
@@ -482,37 +485,27 @@ static void writepages_done(struct osd_request *or, void *p)
482 485
483 update_write_page(page, page_stat); 486 update_write_page(page, page_stat);
484 unlock_page(page); 487 unlock_page(page);
485 EXOFS_DBGMSG(" writepages_done(0x%lx, 0x%lx) status=%d\n", 488 EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n",
486 inode->i_ino, page->index, page_stat); 489 inode->i_ino, page->index, page_stat);
487 490
488 length += bvec->bv_len; 491 length += PAGE_SIZE;
489 } 492 }
490 493
491 pcol_free(pcol); 494 pcol_free(pcol);
492 kfree(pcol); 495 kfree(pcol);
493 EXOFS_DBGMSG("writepages_done END\n"); 496 EXOFS_DBGMSG2("writepages_done END\n");
494} 497}
495 498
496static int write_exec(struct page_collect *pcol) 499static int write_exec(struct page_collect *pcol)
497{ 500{
498 struct exofs_i_info *oi = exofs_i(pcol->inode); 501 struct exofs_i_info *oi = exofs_i(pcol->inode);
499 struct osd_obj_id obj = {pcol->sbi->s_pid, 502 struct exofs_io_state *ios = pcol->ios;
500 pcol->inode->i_ino + EXOFS_OBJ_OFF};
501 struct osd_request *or = NULL;
502 struct page_collect *pcol_copy = NULL; 503 struct page_collect *pcol_copy = NULL;
503 loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
504 int ret; 504 int ret;
505 505
506 if (!pcol->bio) 506 if (!pcol->pages)
507 return 0; 507 return 0;
508 508
509 or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
510 if (unlikely(!or)) {
511 EXOFS_ERR("write_exec: Faild to osd_start_request()\n");
512 ret = -ENOMEM;
513 goto err;
514 }
515
516 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 509 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
517 if (!pcol_copy) { 510 if (!pcol_copy) {
 518 EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n"); 511
@@ -522,17 +515,22 @@ static int write_exec(struct page_collect *pcol)
522 515
523 *pcol_copy = *pcol; 516 *pcol_copy = *pcol;
524 517
525 pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */ 518 ios->pages = pcol_copy->pages;
526 osd_req_write(or, &obj, i_start, pcol_copy->bio, pcol_copy->length); 519 ios->nr_pages = pcol_copy->nr_pages;
527 ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred); 520 ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT;
521 ios->length = pcol_copy->length;
522 ios->done = writepages_done;
523 ios->private = pcol_copy;
524
525 ret = exofs_oi_write(oi, ios);
528 if (unlikely(ret)) { 526 if (unlikely(ret)) {
 529 EXOFS_ERR("write_exec: exofs_async_op() Failed\n"); 527 EXOFS_ERR("write_exec: exofs_oi_write() Failed\n");
530 goto err; 528 goto err;
531 } 529 }
532 530
533 atomic_inc(&pcol->sbi->s_curr_pending); 531 atomic_inc(&pcol->sbi->s_curr_pending);
534 EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", 532 EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
535 pcol->inode->i_ino, pcol->pg_first, _LLU(i_start), 533 pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
536 pcol->length); 534 pcol->length);
537 /* pages ownership was passed to pcol_copy */ 535 /* pages ownership was passed to pcol_copy */
538 _pcol_reset(pcol); 536 _pcol_reset(pcol);
@@ -540,9 +538,9 @@ static int write_exec(struct page_collect *pcol)
540 538
541err: 539err:
542 _unlock_pcol_pages(pcol, ret, WRITE); 540 _unlock_pcol_pages(pcol, ret, WRITE);
541 pcol_free(pcol);
543 kfree(pcol_copy); 542 kfree(pcol_copy);
544 if (or) 543
545 osd_end_request(or);
546 return ret; 544 return ret;
547} 545}
548 546
@@ -586,6 +584,9 @@ static int writepage_strip(struct page *page,
586 if (PageError(page)) 584 if (PageError(page))
587 ClearPageError(page); 585 ClearPageError(page);
588 unlock_page(page); 586 unlock_page(page);
587 EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) "
588 "outside the limits\n",
589 inode->i_ino, page->index);
589 return 0; 590 return 0;
590 } 591 }
591 } 592 }
@@ -600,21 +601,24 @@ try_again:
600 ret = write_exec(pcol); 601 ret = write_exec(pcol);
601 if (unlikely(ret)) 602 if (unlikely(ret))
602 goto fail; 603 goto fail;
604
605 EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n",
606 inode->i_ino, page->index);
603 goto try_again; 607 goto try_again;
604 } 608 }
605 609
606 if (!pcol->bio) { 610 if (!pcol->pages) {
607 ret = pcol_try_alloc(pcol); 611 ret = pcol_try_alloc(pcol);
608 if (unlikely(ret)) 612 if (unlikely(ret))
609 goto fail; 613 goto fail;
610 } 614 }
611 615
612 EXOFS_DBGMSG(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n", 616 EXOFS_DBGMSG2(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
613 inode->i_ino, page->index, len); 617 inode->i_ino, page->index, len);
614 618
615 ret = pcol_add_page(pcol, page, len); 619 ret = pcol_add_page(pcol, page, len);
616 if (unlikely(ret)) { 620 if (unlikely(ret)) {
617 EXOFS_DBGMSG("Failed pcol_add_page " 621 EXOFS_DBGMSG2("Failed pcol_add_page "
618 "nr_pages=%u total_length=0x%lx\n", 622 "nr_pages=%u total_length=0x%lx\n",
619 pcol->nr_pages, pcol->length); 623 pcol->nr_pages, pcol->length);
620 624
@@ -634,6 +638,8 @@ try_again:
634 return 0; 638 return 0;
635 639
636fail: 640fail:
641 EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n",
642 inode->i_ino, page->index, ret);
637 set_bit(AS_EIO, &page->mapping->flags); 643 set_bit(AS_EIO, &page->mapping->flags);
638 unlock_page(page); 644 unlock_page(page);
639 return ret; 645 return ret;
@@ -652,14 +658,17 @@ static int exofs_writepages(struct address_space *mapping,
652 wbc->range_end >> PAGE_CACHE_SHIFT; 658 wbc->range_end >> PAGE_CACHE_SHIFT;
653 659
654 if (start || end) 660 if (start || end)
655 expected_pages = min(end - start + 1, 32L); 661 expected_pages = end - start + 1;
656 else 662 else
657 expected_pages = mapping->nrpages; 663 expected_pages = mapping->nrpages;
658 664
659 EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx" 665 if (expected_pages < 32L)
660 " m->nrpages=%lu start=0x%lx end=0x%lx\n", 666 expected_pages = 32L;
667
668 EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
669 "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n",
661 mapping->host->i_ino, wbc->range_start, wbc->range_end, 670 mapping->host->i_ino, wbc->range_start, wbc->range_end,
662 mapping->nrpages, start, end); 671 mapping->nrpages, start, end, expected_pages);
663 672
664 _pcol_init(&pcol, expected_pages, mapping->host); 673 _pcol_init(&pcol, expected_pages, mapping->host);
665 674
@@ -731,13 +740,28 @@ static int exofs_write_begin_export(struct file *file,
731 fsdata); 740 fsdata);
732} 741}
733 742
743static int exofs_write_end(struct file *file, struct address_space *mapping,
744 loff_t pos, unsigned len, unsigned copied,
745 struct page *page, void *fsdata)
746{
747 struct inode *inode = mapping->host;
 748 /* According to the comment in simple_write_end(), i_mutex is held */
749 loff_t i_size = inode->i_size;
750 int ret;
751
 752 ret = simple_write_end(file, mapping, pos, len, copied, page, fsdata);
753 if (i_size != inode->i_size)
754 mark_inode_dirty(inode);
755 return ret;
756}
757
734const struct address_space_operations exofs_aops = { 758const struct address_space_operations exofs_aops = {
735 .readpage = exofs_readpage, 759 .readpage = exofs_readpage,
736 .readpages = exofs_readpages, 760 .readpages = exofs_readpages,
737 .writepage = exofs_writepage, 761 .writepage = exofs_writepage,
738 .writepages = exofs_writepages, 762 .writepages = exofs_writepages,
739 .write_begin = exofs_write_begin_export, 763 .write_begin = exofs_write_begin_export,
740 .write_end = simple_write_end, 764 .write_end = exofs_write_end,
741}; 765};
742 766
743/****************************************************************************** 767/******************************************************************************
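
The wrapper exists because simple_write_end() updates i_size and dirties the page but never dirties the inode itself; without the extra mark_inode_dirty() on a size change, exofs_write_inode() would not run and the object's size attribute on the OSD would go stale.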
@@ -771,19 +795,28 @@ static int exofs_get_block(struct inode *inode, sector_t iblock,
771const struct osd_attr g_attr_logical_length = ATTR_DEF( 795const struct osd_attr g_attr_logical_length = ATTR_DEF(
772 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); 796 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
773 797
798static int _do_truncate(struct inode *inode)
799{
800 struct exofs_i_info *oi = exofs_i(inode);
801 loff_t isize = i_size_read(inode);
802 int ret;
803
804 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
805
806 nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
807
808 ret = exofs_oi_truncate(oi, (u64)isize);
809 EXOFS_DBGMSG("(0x%lx) size=0x%llx\n", inode->i_ino, isize);
810 return ret;
811}
812
774/* 813/*
775 * Truncate a file to the specified size - all we have to do is set the size 814 * Truncate a file to the specified size - all we have to do is set the size
776 * attribute. We make sure the object exists first. 815 * attribute. We make sure the object exists first.
777 */ 816 */
778void exofs_truncate(struct inode *inode) 817void exofs_truncate(struct inode *inode)
779{ 818{
780 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
781 struct exofs_i_info *oi = exofs_i(inode); 819 struct exofs_i_info *oi = exofs_i(inode);
782 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
783 struct osd_request *or;
784 struct osd_attr attr;
785 loff_t isize = i_size_read(inode);
786 __be64 newsize;
787 int ret; 820 int ret;
788 821
789 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) 822 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
@@ -793,22 +826,6 @@ void exofs_truncate(struct inode *inode)
793 return; 826 return;
794 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 827 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
795 return; 828 return;
796 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
797
798 nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
799
800 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
801 if (unlikely(!or)) {
802 EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n");
803 goto fail;
804 }
805
806 osd_req_set_attributes(or, &obj);
807
808 newsize = cpu_to_be64((u64)isize);
809 attr = g_attr_logical_length;
810 attr.val_ptr = &newsize;
811 osd_req_add_set_attr_list(or, &attr, 1);
812 829
813 /* if we are about to truncate an object, and it hasn't been 830 /* if we are about to truncate an object, and it hasn't been
814 * created yet, wait 831 * created yet, wait
@@ -816,8 +833,7 @@ void exofs_truncate(struct inode *inode)
816 if (unlikely(wait_obj_created(oi))) 833 if (unlikely(wait_obj_created(oi)))
817 goto fail; 834 goto fail;
818 835
819 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); 836 ret = _do_truncate(inode);
820 osd_end_request(or);
821 if (ret) 837 if (ret)
822 goto fail; 838 goto fail;
823 839
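
Truncation now funnels through exofs_oi_truncate() in ios.c instead of hand-building an OSD set-attr request, but conceptually it still just sets g_attr_logical_length, now once per device through the io_state layer. The attribute setup it replaces boils down to (sketch):

	__be64 newsize = cpu_to_be64((u64)isize);
	struct osd_attr attr = g_attr_logical_length;

	attr.val_ptr = &newsize;	/* then set on every device via the ios */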
@@ -845,67 +861,110 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
845 return error; 861 return error;
846} 862}
847 863
864static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF(
865 EXOFS_APAGE_FS_DATA,
866 EXOFS_ATTR_INODE_FILE_LAYOUT,
867 0);
868static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF(
869 EXOFS_APAGE_FS_DATA,
870 EXOFS_ATTR_INODE_DIR_LAYOUT,
871 0);
872
848/* 873/*
849 * Read an inode from the OSD, and return it as is. We also return the size 874 * Read the Linux inode info from the OSD, and return it as is. In exofs the
850 * attribute in the 'sanity' argument if we got compiled with debugging turned 875 * inode info is in an application specific page/attribute of the osd-object.
851 * on.
852 */ 876 */
853static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, 877static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
854 struct exofs_fcb *inode, uint64_t *sanity) 878 struct exofs_fcb *inode)
855{ 879{
856 struct exofs_sb_info *sbi = sb->s_fs_info; 880 struct exofs_sb_info *sbi = sb->s_fs_info;
857 struct osd_request *or; 881 struct osd_attr attrs[] = {
858 struct osd_attr attr; 882 [0] = g_attr_inode_data,
859 struct osd_obj_id obj = {sbi->s_pid, 883 [1] = g_attr_inode_file_layout,
860 oi->vfs_inode.i_ino + EXOFS_OBJ_OFF}; 884 [2] = g_attr_inode_dir_layout,
885 };
886 struct exofs_io_state *ios;
887 struct exofs_on_disk_inode_layout *layout;
861 int ret; 888 int ret;
862 889
863 exofs_make_credential(oi->i_cred, &obj); 890 ret = exofs_get_io_state(&sbi->layout, &ios);
864 891 if (unlikely(ret)) {
865 or = osd_start_request(sbi->s_dev, GFP_KERNEL); 892 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
866 if (unlikely(!or)) { 893 return ret;
867 EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n");
868 return -ENOMEM;
869 } 894 }
870 osd_req_get_attributes(or, &obj);
871 895
872 /* we need the inode attribute */ 896 ios->obj.id = exofs_oi_objno(oi);
873 osd_req_add_get_attr_list(or, &g_attr_inode_data, 1); 897 exofs_make_credential(oi->i_cred, &ios->obj);
898 ios->cred = oi->i_cred;
874 899
875#ifdef EXOFS_DEBUG_OBJ_ISIZE 900 attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
876 /* we get the size attributes to do a sanity check */ 901 attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
877 osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
878#endif
879 902
880 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); 903 ios->in_attr = attrs;
881 if (ret) 904 ios->in_attr_len = ARRAY_SIZE(attrs);
905
906 ret = exofs_sbi_read(ios);
907 if (unlikely(ret)) {
908 EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n",
909 _LLU(ios->obj.id), ret);
910 memset(inode, 0, sizeof(*inode));
911 inode->i_mode = 0040000 | (0777 & ~022);
 912 /* If the object is lost on the target we might as well enable its
 913 * deletion.
914 */
915 if ((ret == -ENOENT) || (ret == -EINVAL))
916 ret = 0;
882 goto out; 917 goto out;
918 }
883 919
884 attr = g_attr_inode_data; 920 ret = extract_attr_from_ios(ios, &attrs[0]);
885 ret = extract_attr_from_req(or, &attr);
886 if (ret) { 921 if (ret) {
887 EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n"); 922 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
888 goto out; 923 goto out;
889 } 924 }
925 WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE);
926 memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE);
890 927
891 WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE); 928 ret = extract_attr_from_ios(ios, &attrs[1]);
892 memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE); 929 if (ret) {
930 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
931 goto out;
932 }
933 if (attrs[1].len) {
934 layout = attrs[1].val_ptr;
935 if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
936 EXOFS_ERR("%s: unsupported files layout %d\n",
937 __func__, layout->gen_func);
938 ret = -ENOTSUPP;
939 goto out;
940 }
941 }
893 942
894#ifdef EXOFS_DEBUG_OBJ_ISIZE 943 ret = extract_attr_from_ios(ios, &attrs[2]);
895 attr = g_attr_logical_length;
896 ret = extract_attr_from_req(or, &attr);
897 if (ret) { 944 if (ret) {
898 EXOFS_ERR("ERROR: extract attr from or failed\n"); 945 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
899 goto out; 946 goto out;
900 } 947 }
901 *sanity = get_unaligned_be64(attr.val_ptr); 948 if (attrs[2].len) {
902#endif 949 layout = attrs[2].val_ptr;
950 if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
951 EXOFS_ERR("%s: unsupported meta-data layout %d\n",
952 __func__, layout->gen_func);
953 ret = -ENOTSUPP;
954 goto out;
955 }
956 }
903 957
904out: 958out:
905 osd_end_request(or); 959 exofs_put_io_state(ios);
906 return ret; 960 return ret;
907} 961}
908 962
963static void __oi_init(struct exofs_i_info *oi)
964{
965 init_waitqueue_head(&oi->i_wq);
966 oi->i_flags = 0;
967}
909/* 968/*
910 * Fill in an inode read from the OSD and set it up for use 969 * Fill in an inode read from the OSD and set it up for use
911 */ 970 */
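
Worth noting in the error path above: when the OSD read fails, exofs_get_inode() now synthesizes an empty inode (and clears ret for -ENOENT/-EINVAL) instead of failing the iget, so a lost object can still be unlinked. The mode literal decodes as:

	inode->i_mode = 0040000 | (0777 & ~022);	/* == S_IFDIR | 0755 */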
@@ -914,7 +973,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
914 struct exofs_i_info *oi; 973 struct exofs_i_info *oi;
915 struct exofs_fcb fcb; 974 struct exofs_fcb fcb;
916 struct inode *inode; 975 struct inode *inode;
917 uint64_t uninitialized_var(sanity);
918 int ret; 976 int ret;
919 977
920 inode = iget_locked(sb, ino); 978 inode = iget_locked(sb, ino);
@@ -923,13 +981,13 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 	if (!(inode->i_state & I_NEW))
 		return inode;
 	oi = exofs_i(inode);
+	__oi_init(oi);
 
 	/* read the inode from the osd */
-	ret = exofs_get_inode(sb, oi, &fcb, &sanity);
+	ret = exofs_get_inode(sb, oi, &fcb);
 	if (ret)
 		goto bad_inode;
 
-	init_waitqueue_head(&oi->i_wq);
 	set_obj_created(oi);
 
 	/* copy stuff from on-disk struct to in-memory struct */
@@ -947,15 +1005,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 	inode->i_blkbits = EXOFS_BLKSHIFT;
 	inode->i_generation = le32_to_cpu(fcb.i_generation);
 
-#ifdef EXOFS_DEBUG_OBJ_ISIZE
-	if ((inode->i_size != sanity) &&
-		(!exofs_inode_is_fast_symlink(inode))) {
-		EXOFS_ERR("WARNING: Size of object from inode and "
-			  "attributes differ (%lld != %llu)\n",
-			  inode->i_size, _LLU(sanity));
-	}
-#endif
-
 	oi->i_dir_start_lookup = 0;
 
 	if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
@@ -1020,23 +1069,30 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi)
  * set the obj_created flag so that other methods know that the object exists on
  * the OSD.
  */
-static void create_done(struct osd_request *or, void *p)
+static void create_done(struct exofs_io_state *ios, void *p)
 {
 	struct inode *inode = p;
 	struct exofs_i_info *oi = exofs_i(inode);
 	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
 	int ret;
 
-	ret = exofs_check_ok(or);
-	osd_end_request(or);
+	ret = exofs_check_io(ios, NULL);
+	exofs_put_io_state(ios);
+
 	atomic_dec(&sbi->s_curr_pending);
 
 	if (unlikely(ret)) {
 		EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
-			_LLU(sbi->s_pid), _LLU(inode->i_ino + EXOFS_OBJ_OFF));
-		make_bad_inode(inode);
-	} else
-		set_obj_created(oi);
+			_LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid));
+		/* TODO: When the FS is corrupted, creation can fail because
+		 * the object already exists. Get rid of this asynchronous
+		 * creation: if the object exists, increment the obj counter
+		 * and try the next object, until we succeed. All these
+		 * dangling objects will be made into lost files by
+		 * chkfs.exofs
+		 */
+	}
+
+	set_obj_created(oi);
 
 	atomic_dec(&inode->i_count);
 	wake_up(&oi->i_wq);
@@ -1051,8 +1107,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 	struct inode *inode;
 	struct exofs_i_info *oi;
 	struct exofs_sb_info *sbi;
-	struct osd_request *or;
-	struct osd_obj_id obj;
+	struct exofs_io_state *ios;
 	int ret;
 
 	sb = dir->i_sb;
@@ -1061,8 +1116,8 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 		return ERR_PTR(-ENOMEM);
 
 	oi = exofs_i(inode);
+	__oi_init(oi);
 
-	init_waitqueue_head(&oi->i_wq);
 	set_obj_2bcreated(oi);
 
 	sbi = sb->s_fs_info;
@@ -1089,28 +1144,28 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 
 	mark_inode_dirty(inode);
 
-	obj.partition = sbi->s_pid;
-	obj.id = inode->i_ino + EXOFS_OBJ_OFF;
-	exofs_make_credential(oi->i_cred, &obj);
-
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("exofs_new_inode: osd_start_request failed\n");
-		return ERR_PTR(-ENOMEM);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n");
+		return ERR_PTR(ret);
 	}
 
-	osd_req_create_object(or, &obj);
+	ios->obj.id = exofs_oi_objno(oi);
+	exofs_make_credential(oi->i_cred, &ios->obj);
 
 	/* increment the refcount so that the inode will still be around when we
 	 * reach the callback
 	 */
 	atomic_inc(&inode->i_count);
 
-	ret = exofs_async_op(or, create_done, inode, oi->i_cred);
+	ios->done = create_done;
+	ios->private = inode;
+	ios->cred = oi->i_cred;
+	ret = exofs_sbi_create(ios);
 	if (ret) {
 		atomic_dec(&inode->i_count);
-		osd_end_request(or);
-		return ERR_PTR(-EIO);
+		exofs_put_io_state(ios);
+		return ERR_PTR(ret);
 	}
 	atomic_inc(&sbi->s_curr_pending);
 
@@ -1128,11 +1183,11 @@ struct updatei_args {
 /*
  * Callback function from exofs_update_inode().
  */
-static void updatei_done(struct osd_request *or, void *p)
+static void updatei_done(struct exofs_io_state *ios, void *p)
 {
 	struct updatei_args *args = p;
 
-	osd_end_request(or);
+	exofs_put_io_state(ios);
 
 	atomic_dec(&args->sbi->s_curr_pending);
 
@@ -1148,16 +1203,17 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 	struct exofs_i_info *oi = exofs_i(inode);
 	struct super_block *sb = inode->i_sb;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
-	struct osd_request *or;
+	struct exofs_io_state *ios;
 	struct osd_attr attr;
 	struct exofs_fcb *fcb;
 	struct updatei_args *args;
 	int ret;
 
 	args = kzalloc(sizeof(*args), GFP_KERNEL);
-	if (!args)
+	if (!args) {
+		EXOFS_DBGMSG("Failed kzalloc of args\n");
 		return -ENOMEM;
+	}
 
 	fcb = &args->fcb;
 
@@ -1186,18 +1242,16 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 	} else
 		memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
 
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n");
-		ret = -ENOMEM;
+	ret = exofs_get_io_state(&sbi->layout, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
 		goto free_args;
 	}
 
-	osd_req_set_attributes(or, &obj);
-
 	attr = g_attr_inode_data;
 	attr.val_ptr = fcb;
-	osd_req_add_set_attr_list(or, &attr, 1);
+	ios->out_attr_len = 1;
+	ios->out_attr = &attr;
 
 	if (!obj_created(oi)) {
 		EXOFS_DBGMSG("!obj_created\n");
@@ -1206,43 +1260,42 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 		EXOFS_DBGMSG("wait_event done\n");
 	}
 
-	if (do_sync) {
-		ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
-		osd_end_request(or);
-		goto free_args;
-	} else {
+	if (!do_sync) {
 		args->sbi = sbi;
+		ios->done = updatei_done;
+		ios->private = args;
+	}
 
-		ret = exofs_async_op(or, updatei_done, args, oi->i_cred);
-		if (ret) {
-			osd_end_request(or);
-			goto free_args;
-		}
+	ret = exofs_oi_write(oi, ios);
+	if (!do_sync && !ret) {
 		atomic_inc(&sbi->s_curr_pending);
 		goto out; /* deallocation in updatei_done */
 	}
 
+	exofs_put_io_state(ios);
 free_args:
 	kfree(args);
 out:
-	EXOFS_DBGMSG("ret=>%d\n", ret);
+	EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n",
+		     inode->i_ino, do_sync, ret);
 	return ret;
 }
 
-int exofs_write_inode(struct inode *inode, int wait)
+int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	return exofs_update_inode(inode, wait);
+	return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
 }
 
 /*
  * Callback function from exofs_delete_inode() - don't have much cleaning up to
  * do.
  */
-static void delete_done(struct osd_request *or, void *p)
+static void delete_done(struct exofs_io_state *ios, void *p)
 {
-	struct exofs_sb_info *sbi;
-	osd_end_request(or);
-	sbi = p;
+	struct exofs_sb_info *sbi = p;
+
+	exofs_put_io_state(ios);
+
 	atomic_dec(&sbi->s_curr_pending);
 }
 
@@ -1256,8 +1309,7 @@ void exofs_delete_inode(struct inode *inode)
 	struct exofs_i_info *oi = exofs_i(inode);
 	struct super_block *sb = inode->i_sb;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
-	struct osd_request *or;
+	struct exofs_io_state *ios;
 	int ret;
 
 	truncate_inode_pages(&inode->i_data, 0);
@@ -1274,25 +1326,26 @@ void exofs_delete_inode(struct inode *inode)
 
 	clear_inode(inode);
 
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n");
+	ret = exofs_get_io_state(&sbi->layout, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__);
 		return;
 	}
 
-	osd_req_remove_object(or, &obj);
-
 	/* if we are deleting an obj that hasn't been created yet, wait */
 	if (!obj_created(oi)) {
 		BUG_ON(!obj_2bcreated(oi));
 		wait_event(oi->i_wq, obj_created(oi));
 	}
 
-	ret = exofs_async_op(or, delete_done, sbi, oi->i_cred);
+	ios->obj.id = exofs_oi_objno(oi);
+	ios->done = delete_done;
+	ios->private = sbi;
+	ios->cred = oi->i_cred;
+	ret = exofs_sbi_remove(ios);
 	if (ret) {
-		EXOFS_ERR(
-		       "ERROR: @exofs_delete_inode exofs_async_op failed\n");
-		osd_end_request(or);
+		EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__);
+		exofs_put_io_state(ios);
 		return;
 	}
 	atomic_inc(&sbi->s_curr_pending);
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
new file mode 100644
index 000000000000..4337cad7777b
--- /dev/null
+++ b/fs/exofs/ios.c
@@ -0,0 +1,823 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com)
4 * Copyright (C) 2008, 2009
5 * Boaz Harrosh <bharrosh@panasas.com>
6 *
7 * This file is part of exofs.
8 *
9 * exofs is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation. Since it is based on ext2, and the only
12 * valid version of GPL for the Linux kernel is version 2, the only valid
13 * version of GPL for exofs is version 2.
14 *
15 * exofs is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with exofs; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25#include <linux/slab.h>
26#include <scsi/scsi_device.h>
27#include <asm/div64.h>
28
29#include "exofs.h"
30
31#define EXOFS_DBGMSG2(M...) do {} while (0)
32/* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */
33
34void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
35{
36 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
37}
38
39int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
40 u64 offset, void *p, unsigned length)
41{
42 struct osd_request *or = osd_start_request(od, GFP_KERNEL);
43/* struct osd_sense_info osi = {.key = 0};*/
44 int ret;
45
46 if (unlikely(!or)) {
47 EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
48 return -ENOMEM;
49 }
50 ret = osd_req_read_kern(or, obj, offset, p, length);
51 if (unlikely(ret)) {
52 EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
53 goto out;
54 }
55
56 ret = osd_finalize_request(or, 0, cred, NULL);
57 if (unlikely(ret)) {
 58		EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
59 goto out;
60 }
61
62 ret = osd_execute_request(or);
63 if (unlikely(ret))
64 EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
65 /* osd_req_decode_sense(or, ret); */
66
67out:
68 osd_end_request(or);
69 return ret;
70}
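/* exofs_read_kern() above is the bootstrap path: a plain, single-device
 * synchronous read used while mounting (fscb, device table), before the
 * striping layout and the exofs_io_state machinery are available.
 */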
71
72int exofs_get_io_state(struct exofs_layout *layout,
73 struct exofs_io_state **pios)
74{
75 struct exofs_io_state *ios;
76
 77	/*TODO: Maybe use a kmem_cache per sbi of size
78 * exofs_io_state_size(layout->s_numdevs)
79 */
80 ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL);
81 if (unlikely(!ios)) {
 82		EXOFS_DBGMSG("Failed kzalloc bytes=%d\n",
83 exofs_io_state_size(layout->s_numdevs));
84 *pios = NULL;
85 return -ENOMEM;
86 }
87
88 ios->layout = layout;
89 ios->obj.partition = layout->s_pid;
90 *pios = ios;
91 return 0;
92}
93
94void exofs_put_io_state(struct exofs_io_state *ios)
95{
96 if (ios) {
97 unsigned i;
98
99 for (i = 0; i < ios->numdevs; i++) {
100 struct exofs_per_dev_state *per_dev = &ios->per_dev[i];
101
102 if (per_dev->or)
103 osd_end_request(per_dev->or);
104 if (per_dev->bio)
105 bio_put(per_dev->bio);
106 }
107
108 kfree(ios);
109 }
110}
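/* Typical calling convention, distilled from the callers in inode.c and
 * super.c (a sketch -- error handling and attribute setup abbreviated):
 *
 *	ret = exofs_get_io_state(&sbi->layout, &ios);
 *	ios->obj.id = exofs_oi_objno(oi);	// obj.partition preset above
 *	ios->cred = oi->i_cred;
 *	ios->in_attr = attrs;
 *	ios->in_attr_len = ARRAY_SIZE(attrs);
 *	ret = exofs_sbi_read(ios);		// synchronous: ios->done == NULL
 *	ret = extract_attr_from_ios(ios, &attrs[0]);
 *	exofs_put_io_state(ios);
 */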
111
112unsigned exofs_layout_od_id(struct exofs_layout *layout,
113 osd_id obj_no, unsigned layout_index)
114{
115/* switch (layout->lay_func) {
116 case LAYOUT_MOVING_WINDOW:
117 {*/
118 unsigned dev_mod = obj_no;
119
120 return (layout_index + dev_mod * layout->mirrors_p1) %
121 layout->s_numdevs;
122/* }
123 case LAYOUT_FUNC_IMPLICT:
124 return layout->devs[layout_index];
125 }*/
126}
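/* Example, with hypothetical numbers: for s_numdevs = 6 and mirrors_p1 = 2,
 * layout_index 0 of object 0 lands on device 0, of object 1 on device 2,
 * of object 2 on device 4, and object 3 wraps back to device 0 -- each
 * object slides its device window forward, hence the "moving window".
 */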
127
128static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios,
129 unsigned layout_index)
130{
131 return ios->layout->s_ods[
132 exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)];
133}
134
135static void _sync_done(struct exofs_io_state *ios, void *p)
136{
137 struct completion *waiting = p;
138
139 complete(waiting);
140}
141
142static void _last_io(struct kref *kref)
143{
144 struct exofs_io_state *ios = container_of(
145 kref, struct exofs_io_state, kref);
146
147 ios->done(ios, ios->private);
148}
149
150static void _done_io(struct osd_request *or, void *p)
151{
152 struct exofs_io_state *ios = p;
153
154 kref_put(&ios->kref, _last_io);
155}
156
157static int exofs_io_execute(struct exofs_io_state *ios)
158{
159 DECLARE_COMPLETION_ONSTACK(wait);
160 bool sync = (ios->done == NULL);
161 int i, ret;
162
163 if (sync) {
164 ios->done = _sync_done;
165 ios->private = &wait;
166 }
167
168 for (i = 0; i < ios->numdevs; i++) {
169 struct osd_request *or = ios->per_dev[i].or;
170 if (unlikely(!or))
171 continue;
172
173 ret = osd_finalize_request(or, 0, ios->cred, NULL);
174 if (unlikely(ret)) {
 175			EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n",
176 ret);
177 return ret;
178 }
179 }
180
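	/* One reference is taken per request submitted below, plus the one
	 * taken by kref_init for this thread; the final kref_put ensures
	 * ios->done fires exactly once, after the last completion (or
	 * immediately, if no request was started).
	 */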
181 kref_init(&ios->kref);
182
183 for (i = 0; i < ios->numdevs; i++) {
184 struct osd_request *or = ios->per_dev[i].or;
185 if (unlikely(!or))
186 continue;
187
188 kref_get(&ios->kref);
189 osd_execute_request_async(or, _done_io, ios);
190 }
191
192 kref_put(&ios->kref, _last_io);
193 ret = 0;
194
195 if (sync) {
196 wait_for_completion(&wait);
197 ret = exofs_check_io(ios, NULL);
198 }
199 return ret;
200}
201
202static void _clear_bio(struct bio *bio)
203{
204 struct bio_vec *bv;
205 unsigned i;
206
207 __bio_for_each_segment(bv, bio, i, 0) {
208 unsigned this_count = bv->bv_len;
209
210 if (likely(PAGE_SIZE == this_count))
211 clear_highpage(bv->bv_page);
212 else
213 zero_user(bv->bv_page, bv->bv_offset, this_count);
214 }
215}
216
217int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
218{
219 enum osd_err_priority acumulated_osd_err = 0;
220 int acumulated_lin_err = 0;
221 int i;
222
223 for (i = 0; i < ios->numdevs; i++) {
224 struct osd_sense_info osi;
225 struct osd_request *or = ios->per_dev[i].or;
226 int ret;
227
228 if (unlikely(!or))
229 continue;
230
231 ret = osd_req_decode_sense(or, &osi);
232 if (likely(!ret))
233 continue;
234
235 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
 236			/* start read offset is past the end of file */
 237			_clear_bio(ios->per_dev[i].bio);
 238			EXOFS_DBGMSG("start read offset past end of file "
239 "offset=0x%llx, length=0x%llx\n",
240 _LLU(ios->per_dev[i].offset),
241 _LLU(ios->per_dev[i].length));
242
243 continue; /* we recovered */
244 }
245
246 if (osi.osd_err_pri >= acumulated_osd_err) {
247 acumulated_osd_err = osi.osd_err_pri;
248 acumulated_lin_err = ret;
249 }
250 }
251
252 /* TODO: raid specific residual calculations */
253 if (resid) {
254 if (likely(!acumulated_lin_err))
255 *resid = 0;
256 else
257 *resid = ios->length;
258 }
259
260 return acumulated_lin_err;
261}
262
263/*
264 * L - logical offset into the file
265 *
266 * U - The number of bytes in a stripe within a group
267 *
268 * U = stripe_unit * group_width
269 *
270 * T - The number of bytes striped within a group of component objects
271 * (before advancing to the next group)
272 *
273 * T = stripe_unit * group_width * group_depth
274 *
275 * S - The number of bytes striped across all component objects
276 * before the pattern repeats
277 *
278 * S = stripe_unit * group_width * group_depth * group_count
279 *
280 * M - The "major" (i.e., across all components) stripe number
281 *
282 * M = L / S
283 *
284 * G - Counts the groups from the beginning of the major stripe
285 *
286 * G = (L - (M * S)) / T [or (L % S) / T]
287 *
288 * H - The byte offset within the group
289 *
290 * H = (L - (M * S)) % T [or (L % S) % T]
291 *
292 * N - The "minor" (i.e., across the group) stripe number
293 *
294 * N = H / U
295 *
 296 * C - The component index corresponding to L
297 *
298 * C = (H - (N * U)) / stripe_unit + G * group_width
299 * [or (L % U) / stripe_unit + G * group_width]
300 *
 301 * O - The component offset corresponding to L
302 *
303 * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
304 */
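/* Worked example (hypothetical parameters, mirrors_p1 == 1): with
 * stripe_unit = 64K, group_width = 4, group_depth = 2, group_count = 2
 * we get U = 256K, T = 512K and S = 1M. For L = 768K:
 * M = 0, G = 1, H = 768K - 512K = 256K, N = 1, giving
 * C = 0/64K + 1*4 = 4 and O = 0 + 1*64K + 0 = 64K -- byte 768K of the
 * file lives at offset 64K of component 4, the first device of the
 * second group.
 */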
305struct _striping_info {
306 u64 obj_offset;
307 u64 group_length;
308 u64 total_group_length;
309 u64 Major;
310 unsigned dev;
311 unsigned unit_off;
312};
313
314static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
315 struct _striping_info *si)
316{
317 u32 stripe_unit = ios->layout->stripe_unit;
318 u32 group_width = ios->layout->group_width;
319 u64 group_depth = ios->layout->group_depth;
320
321 u32 U = stripe_unit * group_width;
322 u64 T = U * group_depth;
323 u64 S = T * ios->layout->group_count;
324 u64 M = div64_u64(file_offset, S);
325
326 /*
327 G = (L - (M * S)) / T
328 H = (L - (M * S)) % T
329 */
330 u64 LmodS = file_offset - M * S;
331 u32 G = div64_u64(LmodS, T);
332 u64 H = LmodS - G * T;
333
334 u32 N = div_u64(H, U);
335
336 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
337 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
338 si->dev *= ios->layout->mirrors_p1;
339
340 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
341
342 si->obj_offset = si->unit_off + (N * stripe_unit) +
343 (M * group_depth * stripe_unit);
344
345 si->group_length = T - H;
346 si->total_group_length = T;
347 si->Major = M;
348}
349
350static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
351 unsigned pgbase, struct exofs_per_dev_state *per_dev,
352 int cur_len)
353{
354 unsigned pg = *cur_pg;
355 struct request_queue *q =
356 osd_request_queue(exofs_ios_od(ios, per_dev->dev));
357
358 per_dev->length += cur_len;
359
360 if (per_dev->bio == NULL) {
361 unsigned pages_in_stripe = ios->layout->group_width *
362 (ios->layout->stripe_unit / PAGE_SIZE);
363 unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
364 ios->layout->group_width;
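		/* bio_size is this device's share of the pages, nr_pages /
		 * group_width, rounded up by one full stripe of pages so the
		 * bio never runs out of vector slots mid-stripe-unit.
		 */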
365
366 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
367 if (unlikely(!per_dev->bio)) {
 368			EXOFS_DBGMSG("Failed to allocate BIO size=%u\n",
369 bio_size);
370 return -ENOMEM;
371 }
372 }
373
374 while (cur_len > 0) {
375 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
376 unsigned added_len;
377
378 BUG_ON(ios->nr_pages <= pg);
379 cur_len -= pglen;
380
381 added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg],
382 pglen, pgbase);
383 if (unlikely(pglen != added_len))
384 return -ENOMEM;
385 pgbase = 0;
386 ++pg;
387 }
388 BUG_ON(cur_len);
389
390 *cur_pg = pg;
391 return 0;
392}
393
394static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
395 struct _striping_info *si, unsigned first_comp)
396{
397 unsigned stripe_unit = ios->layout->stripe_unit;
398 unsigned mirrors_p1 = ios->layout->mirrors_p1;
399 unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
400 unsigned dev = si->dev;
401 unsigned first_dev = dev - (dev % devs_in_group);
402 unsigned comp = first_comp + (dev - first_dev);
403 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
404 unsigned cur_pg = ios->pages_consumed;
405 int ret = 0;
406
407 while (length) {
408 struct exofs_per_dev_state *per_dev = &ios->per_dev[comp];
409 unsigned cur_len, page_off = 0;
410
411 if (!per_dev->length) {
412 per_dev->dev = dev;
413 if (dev < si->dev) {
414 per_dev->offset = si->obj_offset + stripe_unit -
415 si->unit_off;
416 cur_len = stripe_unit;
417 } else if (dev == si->dev) {
418 per_dev->offset = si->obj_offset;
419 cur_len = stripe_unit - si->unit_off;
420 page_off = si->unit_off & ~PAGE_MASK;
421 BUG_ON(page_off && (page_off != ios->pgbase));
422 } else { /* dev > si->dev */
423 per_dev->offset = si->obj_offset - si->unit_off;
424 cur_len = stripe_unit;
425 }
426
427 if (max_comp < comp)
428 max_comp = comp;
429
430 dev += mirrors_p1;
431 dev = (dev % devs_in_group) + first_dev;
432 } else {
433 cur_len = stripe_unit;
434 }
435 if (cur_len >= length)
436 cur_len = length;
437
 438		ret = _add_stripe_unit(ios, &cur_pg, page_off, per_dev,
439 cur_len);
440 if (unlikely(ret))
441 goto out;
442
443 comp += mirrors_p1;
444 comp = (comp % devs_in_group) + first_comp;
445
446 length -= cur_len;
447 }
448out:
449 ios->numdevs = max_comp + mirrors_p1;
450 ios->pages_consumed = cur_pg;
451 return ret;
452}
453
454static int _prepare_for_striping(struct exofs_io_state *ios)
455{
456 u64 length = ios->length;
457 struct _striping_info si;
458 unsigned devs_in_group = ios->layout->group_width *
459 ios->layout->mirrors_p1;
460 unsigned first_comp = 0;
461 int ret = 0;
462
463 _calc_stripe_info(ios, ios->offset, &si);
464
465 if (!ios->pages) {
466 if (ios->kern_buff) {
467 struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
468
469 per_dev->offset = si.obj_offset;
470 per_dev->dev = si.dev;
471
472 /* no cross device without page array */
473 BUG_ON((ios->layout->group_width > 1) &&
474 (si.unit_off + ios->length >
475 ios->layout->stripe_unit));
476 }
477 ios->numdevs = ios->layout->mirrors_p1;
478 return 0;
479 }
480
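	/* Walk the I/O group by group: the first pass may start mid-group
	 * (si.group_length was set to T - H); every later pass covers a
	 * full group, advancing both the starting device window and the
	 * object offset before continuing.
	 */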
481 while (length) {
482 if (length < si.group_length)
483 si.group_length = length;
484
485 ret = _prepare_one_group(ios, si.group_length, &si, first_comp);
486 if (unlikely(ret))
487 goto out;
488
489 length -= si.group_length;
490
491 si.group_length = si.total_group_length;
492 si.unit_off = 0;
493 ++si.Major;
494 si.obj_offset = si.Major * ios->layout->stripe_unit *
495 ios->layout->group_depth;
496
497 si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group;
498 si.dev %= ios->layout->s_numdevs;
499
500 first_comp += devs_in_group;
501 first_comp %= ios->layout->s_numdevs;
502 }
503
504out:
505 return ret;
506}
507
508int exofs_sbi_create(struct exofs_io_state *ios)
509{
510 int i, ret;
511
512 for (i = 0; i < ios->layout->s_numdevs; i++) {
513 struct osd_request *or;
514
515 or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
516 if (unlikely(!or)) {
517 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
518 ret = -ENOMEM;
519 goto out;
520 }
521 ios->per_dev[i].or = or;
522 ios->numdevs++;
523
524 osd_req_create_object(or, &ios->obj);
525 }
526 ret = exofs_io_execute(ios);
527
528out:
529 return ret;
530}
531
532int exofs_sbi_remove(struct exofs_io_state *ios)
533{
534 int i, ret;
535
536 for (i = 0; i < ios->layout->s_numdevs; i++) {
537 struct osd_request *or;
538
539 or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
540 if (unlikely(!or)) {
541 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
542 ret = -ENOMEM;
543 goto out;
544 }
545 ios->per_dev[i].or = or;
546 ios->numdevs++;
547
548 osd_req_remove_object(or, &ios->obj);
549 }
550 ret = exofs_io_execute(ios);
551
552out:
553 return ret;
554}
555
556static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
557{
558 struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp];
559 unsigned dev = ios->per_dev[cur_comp].dev;
560 unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
561 int ret = 0;
562
563 if (ios->pages && !master_dev->length)
564 return 0; /* Just an empty slot */
565
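	/* The first component of the mirror set owns the master bio; every
	 * other mirror gets a private clone pointing at the same pages, so
	 * identical data is written to each replica.
	 */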
566 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
567 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
568 struct osd_request *or;
569
570 or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL);
571 if (unlikely(!or)) {
572 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
573 ret = -ENOMEM;
574 goto out;
575 }
576 per_dev->or = or;
577 per_dev->offset = master_dev->offset;
578
579 if (ios->pages) {
580 struct bio *bio;
581
582 if (per_dev != master_dev) {
583 bio = bio_kmalloc(GFP_KERNEL,
584 master_dev->bio->bi_max_vecs);
585 if (unlikely(!bio)) {
586 EXOFS_DBGMSG(
587 "Faild to allocate BIO size=%u\n",
588 master_dev->bio->bi_max_vecs);
589 ret = -ENOMEM;
590 goto out;
591 }
592
593 __bio_clone(bio, master_dev->bio);
594 bio->bi_bdev = NULL;
595 bio->bi_next = NULL;
596 per_dev->length = master_dev->length;
597 per_dev->bio = bio;
598 per_dev->dev = dev;
599 } else {
600 bio = master_dev->bio;
601 /* FIXME: bio_set_dir() */
602 bio->bi_rw |= (1 << BIO_RW);
603 }
604
605 osd_req_write(or, &ios->obj, per_dev->offset, bio,
606 per_dev->length);
607 EXOFS_DBGMSG("write(0x%llx) offset=0x%llx "
608 "length=0x%llx dev=%d\n",
609 _LLU(ios->obj.id), _LLU(per_dev->offset),
610 _LLU(per_dev->length), dev);
611 } else if (ios->kern_buff) {
612 ret = osd_req_write_kern(or, &ios->obj, per_dev->offset,
613 ios->kern_buff, ios->length);
614 if (unlikely(ret))
615 goto out;
616 EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
617 "length=0x%llx dev=%d\n",
618 _LLU(ios->obj.id), _LLU(per_dev->offset),
619 _LLU(ios->length), dev);
620 } else {
621 osd_req_set_attributes(or, &ios->obj);
622 EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
623 _LLU(ios->obj.id), ios->out_attr_len, dev);
624 }
625
626 if (ios->out_attr)
627 osd_req_add_set_attr_list(or, ios->out_attr,
628 ios->out_attr_len);
629
630 if (ios->in_attr)
631 osd_req_add_get_attr_list(or, ios->in_attr,
632 ios->in_attr_len);
633 }
634
635out:
636 return ret;
637}
638
639int exofs_sbi_write(struct exofs_io_state *ios)
640{
641 int i;
642 int ret;
643
644 ret = _prepare_for_striping(ios);
645 if (unlikely(ret))
646 return ret;
647
648 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
649 ret = _sbi_write_mirror(ios, i);
650 if (unlikely(ret))
651 return ret;
652 }
653
654 ret = exofs_io_execute(ios);
655 return ret;
656}
657
658static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp)
659{
660 struct osd_request *or;
661 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
662 unsigned first_dev = (unsigned)ios->obj.id;
663
664 if (ios->pages && !per_dev->length)
665 return 0; /* Just an empty slot */
666
667 first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
668 or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL);
669 if (unlikely(!or)) {
670 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
671 return -ENOMEM;
672 }
673 per_dev->or = or;
674
675 if (ios->pages) {
676 osd_req_read(or, &ios->obj, per_dev->offset,
677 per_dev->bio, per_dev->length);
678 EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
679 " dev=%d\n", _LLU(ios->obj.id),
680 _LLU(per_dev->offset), _LLU(per_dev->length),
681 first_dev);
682 } else if (ios->kern_buff) {
683 int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset,
684 ios->kern_buff, ios->length);
685 EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
686 "length=0x%llx dev=%d ret=>%d\n",
687 _LLU(ios->obj.id), _LLU(per_dev->offset),
688 _LLU(ios->length), first_dev, ret);
689 if (unlikely(ret))
690 return ret;
691 } else {
692 osd_req_get_attributes(or, &ios->obj);
693 EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
694 _LLU(ios->obj.id), ios->in_attr_len, first_dev);
695 }
696 if (ios->out_attr)
697 osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);
698
699 if (ios->in_attr)
700 osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);
701
702 return 0;
703}
704
705int exofs_sbi_read(struct exofs_io_state *ios)
706{
707 int i;
708 int ret;
709
710 ret = _prepare_for_striping(ios);
711 if (unlikely(ret))
712 return ret;
713
714 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
715 ret = _sbi_read_mirror(ios, i);
716 if (unlikely(ret))
717 return ret;
718 }
719
720 ret = exofs_io_execute(ios);
721 return ret;
722}
723
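/* Attributes are decoded from per_dev[0] only: an attribute-only ios
 * carries a single request on the first mirror (see _prepare_for_striping),
 * so that request holds the returned attribute list.
 */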
724int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
725{
726 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
727 void *iter = NULL;
728 int nelem;
729
730 do {
731 nelem = 1;
732 osd_req_decode_get_attr_list(ios->per_dev[0].or,
733 &cur_attr, &nelem, &iter);
734 if ((cur_attr.attr_page == attr->attr_page) &&
735 (cur_attr.attr_id == attr->attr_id)) {
736 attr->len = cur_attr.len;
737 attr->val_ptr = cur_attr.val_ptr;
738 return 0;
739 }
740 } while (iter);
741
742 return -EIO;
743}
744
745static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp,
746 struct osd_attr *attr)
747{
748 int last_comp = cur_comp + ios->layout->mirrors_p1;
749
750 for (; cur_comp < last_comp; ++cur_comp) {
751 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
752 struct osd_request *or;
753
754 or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL);
755 if (unlikely(!or)) {
756 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
757 return -ENOMEM;
758 }
759 per_dev->or = or;
760
761 osd_req_set_attributes(or, &ios->obj);
762 osd_req_add_set_attr_list(or, attr, 1);
763 }
764
765 return 0;
766}
767
768int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
769{
770 struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info;
771 struct exofs_io_state *ios;
772 struct exofs_trunc_attr {
773 struct osd_attr attr;
774 __be64 newsize;
775 } *size_attrs;
776 struct _striping_info si;
777 int i, ret;
778
779 ret = exofs_get_io_state(&sbi->layout, &ios);
780 if (unlikely(ret))
781 return ret;
782
783 size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs),
784 GFP_KERNEL);
785 if (unlikely(!size_attrs)) {
786 ret = -ENOMEM;
787 goto out;
788 }
789
790 ios->obj.id = exofs_oi_objno(oi);
791 ios->cred = oi->i_cred;
792
793 ios->numdevs = ios->layout->s_numdevs;
794 _calc_stripe_info(ios, size, &si);
795
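	/* Place the new EOF within the stripe: components before the one
	 * holding the last byte keep a whole stripe_unit in the final row,
	 * the component holding it is cut mid-unit, and the components
	 * after it are truncated at the start of that row.
	 */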
796 for (i = 0; i < ios->layout->group_width; ++i) {
797 struct exofs_trunc_attr *size_attr = &size_attrs[i];
798 u64 obj_size;
799
800 if (i < si.dev)
801 obj_size = si.obj_offset +
802 ios->layout->stripe_unit - si.unit_off;
803 else if (i == si.dev)
804 obj_size = si.obj_offset;
805 else /* i > si.dev */
806 obj_size = si.obj_offset - si.unit_off;
807
808 size_attr->newsize = cpu_to_be64(obj_size);
809 size_attr->attr = g_attr_logical_length;
810 size_attr->attr.val_ptr = &size_attr->newsize;
811
812 ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
813 &size_attr->attr);
814 if (unlikely(ret))
815 goto out;
816 }
817 ret = exofs_io_execute(ios);
818
819out:
820 kfree(size_attrs);
821 exofs_put_io_state(ios);
822 return ret;
823}
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
deleted file mode 100644
index 4372542df284..000000000000
--- a/fs/exofs/osd.c
+++ /dev/null
@@ -1,125 +0,0 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com)
4 * Copyright (C) 2008, 2009
5 * Boaz Harrosh <bharrosh@panasas.com>
6 *
7 * This file is part of exofs.
8 *
9 * exofs is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation. Since it is based on ext2, and the only
12 * valid version of GPL for the Linux kernel is version 2, the only valid
13 * version of GPL for exofs is version 2.
14 *
15 * exofs is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with exofs; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25#include <scsi/scsi_device.h>
26#include <scsi/osd_sense.h>
27
28#include "exofs.h"
29
30int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
31{
32 struct osd_sense_info osi;
33 int ret = osd_req_decode_sense(or, &osi);
34
35 if (ret) { /* translate to Linux codes */
36 if (osi.additional_code == scsi_invalid_field_in_cdb) {
37 if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
38 ret = -EFAULT;
39 if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
40 ret = -ENOENT;
41 else
42 ret = -EINVAL;
43 } else if (osi.additional_code == osd_quota_error)
44 ret = -ENOSPC;
45 else
46 ret = -EIO;
47 }
48
49 /* FIXME: should be include in osd_sense_info */
50 if (in_resid)
51 *in_resid = or->in.req ? or->in.req->resid_len : 0;
52
53 if (out_resid)
54 *out_resid = or->out.req ? or->out.req->resid_len : 0;
55
56 return ret;
57}
58
59void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
60{
61 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
62}
63
64/*
65 * Perform a synchronous OSD operation.
66 */
67int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
68{
69 int ret;
70
71 or->timeout = timeout;
72 ret = osd_finalize_request(or, 0, credential, NULL);
73 if (ret) {
74 EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
75 return ret;
76 }
77
78 ret = osd_execute_request(or);
79
80 if (ret)
81 EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
82 /* osd_req_decode_sense(or, ret); */
83 return ret;
84}
85
86/*
87 * Perform an asynchronous OSD operation.
88 */
89int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
90 void *caller_context, u8 *cred)
91{
92 int ret;
93
94 ret = osd_finalize_request(or, 0, cred, NULL);
95 if (ret) {
96 EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
97 return ret;
98 }
99
100 ret = osd_execute_request_async(or, async_done, caller_context);
101
102 if (ret)
103 EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
104 return ret;
105}
106
107int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
108{
109 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
110 void *iter = NULL;
111 int nelem;
112
113 do {
114 nelem = 1;
115 osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
116 if ((cur_attr.attr_page == attr->attr_page) &&
117 (cur_attr.attr_id == attr->attr_id)) {
118 attr->len = cur_attr.len;
119 attr->val_ptr = cur_attr.val_ptr;
120 return 0;
121 }
122 } while (iter);
123
124 return -EIO;
125}
diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h
new file mode 100644
index 000000000000..c52e9888b8ab
--- /dev/null
+++ b/fs/exofs/pnfs.h
@@ -0,0 +1,45 @@
1/*
2 * Copyright (C) 2008, 2009
3 * Boaz Harrosh <bharrosh@panasas.com>
4 *
5 * This file is part of exofs.
6 *
7 * exofs is free software; you can redistribute it and/or modify it under the
8 * terms of the GNU General Public License version 2 as published by the Free
9 * Software Foundation.
10 *
11 */
12
13/* FIXME: Remove this file once pnfs hits mainline */
14
15#ifndef __EXOFS_PNFS_H__
16#define __EXOFS_PNFS_H__
17
18#if ! defined(__PNFS_OSD_XDR_H__)
19
20enum pnfs_iomode {
21 IOMODE_READ = 1,
22 IOMODE_RW = 2,
23 IOMODE_ANY = 3,
24};
25
26/* Layout Structure */
27enum pnfs_osd_raid_algorithm4 {
28 PNFS_OSD_RAID_0 = 1,
29 PNFS_OSD_RAID_4 = 2,
30 PNFS_OSD_RAID_5 = 3,
31 PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */
32};
33
34struct pnfs_osd_data_map {
35 u32 odm_num_comps;
36 u64 odm_stripe_unit;
37 u32 odm_group_width;
38 u32 odm_group_depth;
39 u32 odm_mirror_cnt;
40 u32 odm_raid_algorithm;
41};
42
43#endif /* ! defined(__PNFS_OSD_XDR_H__) */
44
45#endif /* __EXOFS_PNFS_H__ */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 9f500dec3b59..18e57ea1e5b4 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -37,6 +37,7 @@
 #include <linux/vfs.h>
 #include <linux/random.h>
 #include <linux/exportfs.h>
+#include <linux/slab.h>
 
 #include "exofs.h"
 
@@ -203,49 +204,45 @@ int exofs_sync_fs(struct super_block *sb, int wait)
 {
 	struct exofs_sb_info *sbi;
 	struct exofs_fscb *fscb;
-	struct osd_request *or;
-	struct osd_obj_id obj;
+	struct exofs_io_state *ios;
 	int ret = -ENOMEM;
 
-	fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
-	if (!fscb) {
-		EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
-		return -ENOMEM;
-	}
-
 	lock_super(sb);
 	sbi = sb->s_fs_info;
+	fscb = &sbi->s_fscb;
+
+	ret = exofs_get_io_state(&sbi->layout, &ios);
+	if (ret)
+		goto out;
+
+	/* Note: We only write the changing part of the fscb, i.e. up to the
+	 * fscb->s_dev_table_oid member. There is no read-modify-write
+	 * here.
+	 */
+	ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
+	memset(fscb, 0, ios->length);
 	fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
 	fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
 	fscb->s_magic = cpu_to_le16(sb->s_magic);
 	fscb->s_newfs = 0;
+	fscb->s_version = EXOFS_FSCB_VER;
 
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("exofs_write_super: osd_start_request failed.\n");
-		goto out;
-	}
-
-	obj.partition = sbi->s_pid;
-	obj.id = EXOFS_SUPER_ID;
-	ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb));
-	if (unlikely(ret)) {
-		EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n");
-		goto out;
-	}
+	ios->obj.id = EXOFS_SUPER_ID;
+	ios->offset = 0;
+	ios->kern_buff = fscb;
+	ios->cred = sbi->s_cred;
 
-	ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
+	ret = exofs_sbi_write(ios);
 	if (unlikely(ret)) {
-		EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n");
+		EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
 		goto out;
 	}
 	sb->s_dirt = 0;
 
 out:
-	if (or)
-		osd_end_request(or);
+	EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
+	exofs_put_io_state(ios);
 	unlock_super(sb);
-	kfree(fscb);
 	return ret;
 }
 
@@ -257,6 +254,29 @@ static void exofs_write_super(struct super_block *sb)
 	sb->s_dirt = 0;
 }
 
+static void _exofs_print_device(const char *msg, const char *dev_path,
+				struct osd_dev *od, u64 pid)
+{
+	const struct osd_dev_info *odi = osduld_device_info(od);
+
+	printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n",
+	       msg, dev_path ?: "", odi->osdname, _LLU(pid));
+}
+
+void exofs_free_sbi(struct exofs_sb_info *sbi)
+{
+	while (sbi->layout.s_numdevs) {
+		int i = --sbi->layout.s_numdevs;
+		struct osd_dev *od = sbi->layout.s_ods[i];
+
+		if (od) {
+			sbi->layout.s_ods[i] = NULL;
+			osduld_put_device(od);
+		}
+	}
+	kfree(sbi);
+}
+
 /*
  * This function is called when the vfs is freeing the superblock. We just
  * need to free our own part.
@@ -279,11 +299,235 @@ static void exofs_put_super(struct super_block *sb)
 			msecs_to_jiffies(100));
 	}
 
-	osduld_put_device(sbi->s_dev);
-	kfree(sb->s_fs_info);
+	_exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0],
+			    sbi->layout.s_pid);
+
+	exofs_free_sbi(sbi);
 	sb->s_fs_info = NULL;
 }
 
+static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
+				    struct exofs_device_table *dt)
+{
+	u64 stripe_length;
+
+	sbi->data_map.odm_num_comps =
+		le32_to_cpu(dt->dt_data_map.cb_num_comps);
+	sbi->data_map.odm_stripe_unit =
+		le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
+	sbi->data_map.odm_group_width =
+		le32_to_cpu(dt->dt_data_map.cb_group_width);
+	sbi->data_map.odm_group_depth =
+		le32_to_cpu(dt->dt_data_map.cb_group_depth);
+	sbi->data_map.odm_mirror_cnt =
+		le32_to_cpu(dt->dt_data_map.cb_mirror_cnt);
+	sbi->data_map.odm_raid_algorithm =
+		le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
+
+/* FIXME: Only raid0 for now. If not so, do not mount */
+	if (sbi->data_map.odm_num_comps != numdevs) {
+		EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
+			  sbi->data_map.odm_num_comps, numdevs);
+		return -EINVAL;
+	}
+	if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) {
+		EXOFS_ERR("Only RAID_0 for now\n");
+		return -EINVAL;
+	}
+	if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) {
+		EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n",
+			  numdevs, sbi->data_map.odm_mirror_cnt);
+		return -EINVAL;
+	}
+
+	if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
+		EXOFS_ERR("Stripe Unit(0x%llx)"
+			  " must be a multiple of PAGE_SIZE(0x%lx)\n",
+			  _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE);
+		return -EINVAL;
+	}
+
+	sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
+	sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
+
+	if (sbi->data_map.odm_group_width) {
+		sbi->layout.group_width = sbi->data_map.odm_group_width;
+		sbi->layout.group_depth = sbi->data_map.odm_group_depth;
+		if (!sbi->layout.group_depth) {
+			EXOFS_ERR("group_depth == 0 && group_width != 0\n");
+			return -EINVAL;
+		}
+		sbi->layout.group_count = sbi->data_map.odm_num_comps /
+						sbi->layout.mirrors_p1 /
+						sbi->data_map.odm_group_width;
+	} else {
+		if (sbi->data_map.odm_group_depth) {
+			printk(KERN_NOTICE "Warning: group_depth ignored "
+				"group_width == 0 && group_depth == %d\n",
+				sbi->data_map.odm_group_depth);
+			sbi->data_map.odm_group_depth = 0;
+		}
+		sbi->layout.group_width = sbi->data_map.odm_num_comps /
+						sbi->layout.mirrors_p1;
+		sbi->layout.group_depth = -1;
+		sbi->layout.group_count = 1;
+	}
+
+	stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit;
+	if (stripe_length >= (1ULL << 32)) {
+		EXOFS_ERR("Total Stripe length(0x%llx)"
+			  " >= 32bit is not supported\n", _LLU(stripe_length));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* @odi is valid only as long as @fscb_dev is valid */
+static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
+			    struct osd_dev_info *odi)
+{
+	odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
+	memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len);
+
+	odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
+	odi->osdname = dt_dev->osdname;
+
+	/* FIXME support long names. Will need a _put function */
+	if (dt_dev->long_name_offset)
+		return -EINVAL;
+
+	/* Make sure osdname is printable!
+	 * mkexofs should give us space for a null-terminator else the
+	 * device-table is invalid.
+	 */
+	if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname)))
+		odi->osdname_len = sizeof(dt_dev->osdname) - 1;
+	dt_dev->osdname[odi->osdname_len] = 0;
+
+	/* If it's all zeros, something is bad: we read past end-of-obj */
+	return !(odi->systemid_len || odi->osdname_len);
+}
+
+static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
+				       unsigned table_count)
+{
+	struct exofs_sb_info *sbi = *psbi;
+	struct osd_dev *fscb_od;
+	struct osd_obj_id obj = {.partition = sbi->layout.s_pid,
+				 .id = EXOFS_DEVTABLE_ID};
+	struct exofs_device_table *dt;
+	unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
+			       sizeof(*dt);
+	unsigned numdevs, i;
+	int ret;
+
+	dt = kmalloc(table_bytes, GFP_KERNEL);
+	if (unlikely(!dt)) {
+		EXOFS_ERR("ERROR: allocating %x bytes for device table\n",
+			  table_bytes);
+		return -ENOMEM;
+	}
+
+	fscb_od = sbi->layout.s_ods[0];
+	sbi->layout.s_ods[0] = NULL;
+	sbi->layout.s_numdevs = 0;
+	ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes);
+	if (unlikely(ret)) {
+		EXOFS_ERR("ERROR: reading device table\n");
+		goto out;
+	}
+
+	numdevs = le64_to_cpu(dt->dt_num_devices);
+	if (unlikely(!numdevs)) {
+		ret = -EINVAL;
+		goto out;
+	}
+	WARN_ON(table_count != numdevs);
+
+	ret = _read_and_match_data_map(sbi, numdevs, dt);
+	if (unlikely(ret))
+		goto out;
+
+	if (likely(numdevs > 1)) {
+		unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]);
+
+		sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL);
+		if (unlikely(!sbi)) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		memset(&sbi->layout.s_ods[1], 0,
+		       size - sizeof(sbi->layout.s_ods[0]));
+		*psbi = sbi;
+	}
+
+	for (i = 0; i < numdevs; i++) {
+		struct exofs_fscb fscb;
+		struct osd_dev_info odi;
+		struct osd_dev *od;
+
+		if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) {
+			EXOFS_ERR("ERROR: Read all-zeros device entry\n");
+			ret = -EINVAL;
+			goto out;
+		}
+
+		printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
+		       i, odi.osdname);
+
+		/* On all devices the device table is identical. The user can
+		 * specify any one of the participating devices on the command
+		 * line. We always keep them in device-table order.
+		 */
+		if (fscb_od && osduld_device_same(fscb_od, &odi)) {
+			sbi->layout.s_ods[i] = fscb_od;
+			++sbi->layout.s_numdevs;
+			fscb_od = NULL;
+			continue;
+		}
+
+		od = osduld_info_lookup(&odi);
+		if (unlikely(IS_ERR(od))) {
+			ret = PTR_ERR(od);
+			EXOFS_ERR("ERROR: device requested is not found "
+				  "osd_name-%s =>%d\n", odi.osdname, ret);
+			goto out;
+		}
+
+		sbi->layout.s_ods[i] = od;
+		++sbi->layout.s_numdevs;
+
+		/* Read the fscb of the other devices to make sure the FS
+		 * partition is there.
+		 */
+		ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb,
+				      sizeof(fscb));
+		if (unlikely(ret)) {
+			EXOFS_ERR("ERROR: Malformed participating device "
+				  "error reading fscb osd_name-%s\n",
+				  odi.osdname);
+			goto out;
+		}
+
+		/* TODO: verify other information is correct and FS-uuid
+		 * matches. Benny, what did you say about device table
+		 * generation and old devices?
+		 */
+	}
+
+out:
+	kfree(dt);
+	if (unlikely(!ret && fscb_od)) {
+		EXOFS_ERR(
+		      "ERROR: Bad device-table, container device not present\n");
+		osduld_put_device(fscb_od);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
 /*
  * Read the superblock from the OSD and fill in the fields
  */
@@ -292,25 +536,32 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	struct inode *root;
 	struct exofs_mountopt *opts = data;
 	struct exofs_sb_info *sbi;	/*extended info        */
+	struct osd_dev *od;		/* Master device        */
 	struct exofs_fscb fscb;		/*on-disk superblock info */
-	struct osd_request *or = NULL;
 	struct osd_obj_id obj;
+	unsigned table_count;
 	int ret;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
-	sb->s_fs_info = sbi;
 
 	/* use mount options to fill superblock */
-	sbi->s_dev = osduld_path_lookup(opts->dev_name);
-	if (IS_ERR(sbi->s_dev)) {
-		ret = PTR_ERR(sbi->s_dev);
-		sbi->s_dev = NULL;
+	od = osduld_path_lookup(opts->dev_name);
+	if (IS_ERR(od)) {
+		ret = PTR_ERR(od);
 		goto free_sbi;
 	}
 
-	sbi->s_pid = opts->pid;
+	/* Default layout in case we do not have a device-table */
+	sbi->layout.stripe_unit = PAGE_SIZE;
+	sbi->layout.mirrors_p1 = 1;
+	sbi->layout.group_width = 1;
+	sbi->layout.group_depth = -1;
+	sbi->layout.group_count = 1;
+	sbi->layout.s_ods[0] = od;
+	sbi->layout.s_numdevs = 1;
+	sbi->layout.s_pid = opts->pid;
 	sbi->s_timeout = opts->timeout;
 
 	/* fill in some other data by hand */
@@ -323,35 +574,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_bdev = NULL;
 	sb->s_dev = 0;
 
-	/* read data from on-disk superblock object */
-	obj.partition = sbi->s_pid;
+	obj.partition = sbi->layout.s_pid;
 	obj.id = EXOFS_SUPER_ID;
 	exofs_make_credential(sbi->s_cred, &obj);
 
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		if (!silent)
-			EXOFS_ERR(
-			       "exofs_fill_super: osd_start_request failed.\n");
-		ret = -ENOMEM;
-		goto free_sbi;
-	}
-	ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb));
-	if (unlikely(ret)) {
-		if (!silent)
-			EXOFS_ERR(
-			       "exofs_fill_super: osd_req_read_kern failed.\n");
-		ret = -ENOMEM;
-		goto free_sbi;
-	}
-
-	ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
-	if (unlikely(ret)) {
-		if (!silent)
-			EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n");
-		ret = -EIO;
+	ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb));
+	if (unlikely(ret))
 		goto free_sbi;
-	}
 
 	sb->s_magic = le16_to_cpu(fscb.s_magic);
 	sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
@@ -364,12 +593,26 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		ret = -EINVAL;
 		goto free_sbi;
 	}
+	if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) {
+		EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
+			  EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
+		ret = -EINVAL;
+		goto free_sbi;
+	}
 
 	/* start generation numbers from a random point */
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
+	table_count = le64_to_cpu(fscb.s_dev_table_count);
+	if (table_count) {
+		ret = exofs_read_lookup_dev_table(&sbi, table_count);
+		if (unlikely(ret))
+			goto free_sbi;
+	}
+
 	/* set up operation vectors */
+	sb->s_fs_info = sbi;
 	sb->s_op = &exofs_sops;
 	sb->s_export_op = &exofs_export_ops;
 	root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
@@ -395,16 +638,15 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 	}
 
-	ret = 0;
-out:
-	if (or)
-		osd_end_request(or);
-	return ret;
+	_exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0],
+			    sbi->layout.s_pid);
+	return 0;
 
 free_sbi:
-	osduld_put_device(sbi->s_dev); /* NULL safe */
-	kfree(sbi);
-	goto out;
+	EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
+		  opts->dev_name, sbi->layout.s_pid, ret);
+	exofs_free_sbi(sbi);
+	return ret;
 }
 
 /*
@@ -433,7 +675,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct osd_obj_id obj = {sbi->s_pid, 0};
+	struct exofs_io_state *ios;
 	struct osd_attr attrs[] = {
 		ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
 			OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
@@ -442,32 +684,33 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	};
 	uint64_t capacity = ULLONG_MAX;
 	uint64_t used = ULLONG_MAX;
-	struct osd_request *or;
 	uint8_t cred_a[OSD_CAP_LEN];
 	int ret;
 
-	/* get used/capacity attributes */
-	exofs_make_credential(cred_a, &obj);
-
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n");
-		return -ENOMEM;
+	ret = exofs_get_io_state(&sbi->layout, &ios);
+	if (ret) {
+		EXOFS_DBGMSG("exofs_get_io_state failed.\n");
+		return ret;
 	}
 
-	osd_req_get_attributes(or, &obj);
-	osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs));
-	ret = exofs_sync_op(or, sbi->s_timeout, cred_a);
+	exofs_make_credential(cred_a, &ios->obj);
+	ios->cred = sbi->s_cred;
+	ios->in_attr = attrs;
+	ios->in_attr_len = ARRAY_SIZE(attrs);
+
+	ret = exofs_sbi_read(ios);
 	if (unlikely(ret))
 		goto out;
 
-	ret = extract_attr_from_req(or, &attrs[0]);
-	if (likely(!ret))
+	ret = extract_attr_from_ios(ios, &attrs[0]);
+	if (likely(!ret)) {
 		capacity = get_unaligned_be64(attrs[0].val_ptr);
-	else
+		if (unlikely(!capacity))
+			capacity = ULLONG_MAX;
+	} else
 		EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
 
-	ret = extract_attr_from_req(or, &attrs[1]);
+	ret = extract_attr_from_ios(ios, &attrs[1]);
 	if (likely(!ret))
 		used = get_unaligned_be64(attrs[1].val_ptr);
 	else
@@ -476,15 +719,15 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
476 /* fill in the stats buffer */ 719 /* fill in the stats buffer */
477 buf->f_type = EXOFS_SUPER_MAGIC; 720 buf->f_type = EXOFS_SUPER_MAGIC;
478 buf->f_bsize = EXOFS_BLKSIZE; 721 buf->f_bsize = EXOFS_BLKSIZE;
479 buf->f_blocks = (capacity >> EXOFS_BLKSHIFT); 722 buf->f_blocks = capacity >> 9;
480 buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT); 723 buf->f_bfree = (capacity - used) >> 9;
481 buf->f_bavail = buf->f_bfree; 724 buf->f_bavail = buf->f_bfree;
482 buf->f_files = sbi->s_numfiles; 725 buf->f_files = sbi->s_numfiles;
483 buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles; 726 buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
484 buf->f_namelen = EXOFS_NAME_LEN; 727 buf->f_namelen = EXOFS_NAME_LEN;
485 728
486out: 729out:
487 osd_end_request(or); 730 exofs_put_io_state(ios);
488 return ret; 731 return ret;
489} 732}
490 733
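
The statfs conversion above replaces per-request OSD plumbing (osd_start_request/osd_end_request) with the exofs_io_state abstraction. A minimal sketch of the resulting read-attribute pattern, assuming only the fields and helpers visible in this hunk (exofs_get_io_state, exofs_sbi_read, extract_attr_from_ios, exofs_put_io_state); exofs_read_one_attr itself is a hypothetical helper, not part of the patch:

static int exofs_read_one_attr(struct exofs_sb_info *sbi,
			       struct osd_attr *attr, u64 *out)
{
	struct exofs_io_state *ios;
	uint8_t cred_a[OSD_CAP_LEN];
	int ret;

	ret = exofs_get_io_state(&sbi->layout, &ios);
	if (unlikely(ret))
		return ret;

	/* build the capability against the object the io_state targets */
	exofs_make_credential(cred_a, &ios->obj);
	ios->cred = sbi->s_cred;
	ios->in_attr = attr;
	ios->in_attr_len = 1;

	ret = exofs_sbi_read(ios);		/* issue the OSD read */
	if (likely(!ret))
		ret = extract_attr_from_ios(ios, attr);
	if (likely(!ret))
		*out = get_unaligned_be64(attr->val_ptr);

	exofs_put_io_state(ios);		/* release on every path */
	return ret;
}
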
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 197c7db583c7..e9e175949a63 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -6,7 +6,7 @@
6 * and for mapping back from file handles to dentries. 6 * and for mapping back from file handles to dentries.
7 * 7 *
8 * For details on why we do all the strange and hairy things in here 8 * For details on why we do all the strange and hairy things in here
9 * take a look at Documentation/filesystems/Exporting. 9 * take a look at Documentation/filesystems/nfs/Exporting.
10 */ 10 */
11#include <linux/exportfs.h> 11#include <linux/exportfs.h>
12#include <linux/fs.h> 12#include <linux/fs.h>
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index a63d44256a70..a99e54318c3d 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -339,12 +339,12 @@ ext2_acl_chmod(struct inode *inode)
339 * Extended attribut handlers 339 * Extended attribut handlers
340 */ 340 */
341static size_t 341static size_t
342ext2_xattr_list_acl_access(struct inode *inode, char *list, size_t list_size, 342ext2_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_size,
343 const char *name, size_t name_len) 343 const char *name, size_t name_len, int type)
344{ 344{
345 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); 345 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
346 346
347 if (!test_opt(inode->i_sb, POSIX_ACL)) 347 if (!test_opt(dentry->d_sb, POSIX_ACL))
348 return 0; 348 return 0;
349 if (list && size <= list_size) 349 if (list && size <= list_size)
350 memcpy(list, POSIX_ACL_XATTR_ACCESS, size); 350 memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
@@ -352,12 +352,12 @@ ext2_xattr_list_acl_access(struct inode *inode, char *list, size_t list_size,
352} 352}
353 353
354static size_t 354static size_t
355ext2_xattr_list_acl_default(struct inode *inode, char *list, size_t list_size, 355ext2_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_size,
356 const char *name, size_t name_len) 356 const char *name, size_t name_len, int type)
357{ 357{
358 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); 358 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
359 359
360 if (!test_opt(inode->i_sb, POSIX_ACL)) 360 if (!test_opt(dentry->d_sb, POSIX_ACL))
361 return 0; 361 return 0;
362 if (list && size <= list_size) 362 if (list && size <= list_size)
363 memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); 363 memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
@@ -365,15 +365,18 @@ ext2_xattr_list_acl_default(struct inode *inode, char *list, size_t list_size,
365} 365}
366 366
367static int 367static int
368ext2_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) 368ext2_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
369 size_t size, int type)
369{ 370{
370 struct posix_acl *acl; 371 struct posix_acl *acl;
371 int error; 372 int error;
372 373
373 if (!test_opt(inode->i_sb, POSIX_ACL)) 374 if (strcmp(name, "") != 0)
375 return -EINVAL;
376 if (!test_opt(dentry->d_sb, POSIX_ACL))
374 return -EOPNOTSUPP; 377 return -EOPNOTSUPP;
375 378
376 acl = ext2_get_acl(inode, type); 379 acl = ext2_get_acl(dentry->d_inode, type);
377 if (IS_ERR(acl)) 380 if (IS_ERR(acl))
378 return PTR_ERR(acl); 381 return PTR_ERR(acl);
379 if (acl == NULL) 382 if (acl == NULL)
@@ -385,33 +388,17 @@ ext2_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
385} 388}
386 389
387static int 390static int
388ext2_xattr_get_acl_access(struct inode *inode, const char *name, 391ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
389 void *buffer, size_t size) 392 size_t size, int flags, int type)
390{
391 if (strcmp(name, "") != 0)
392 return -EINVAL;
393 return ext2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
394}
395
396static int
397ext2_xattr_get_acl_default(struct inode *inode, const char *name,
398 void *buffer, size_t size)
399{
400 if (strcmp(name, "") != 0)
401 return -EINVAL;
402 return ext2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
403}
404
405static int
406ext2_xattr_set_acl(struct inode *inode, int type, const void *value,
407 size_t size)
408{ 393{
409 struct posix_acl *acl; 394 struct posix_acl *acl;
410 int error; 395 int error;
411 396
412 if (!test_opt(inode->i_sb, POSIX_ACL)) 397 if (strcmp(name, "") != 0)
398 return -EINVAL;
399 if (!test_opt(dentry->d_sb, POSIX_ACL))
413 return -EOPNOTSUPP; 400 return -EOPNOTSUPP;
414 if (!is_owner_or_cap(inode)) 401 if (!is_owner_or_cap(dentry->d_inode))
415 return -EPERM; 402 return -EPERM;
416 403
417 if (value) { 404 if (value) {
@@ -426,41 +413,25 @@ ext2_xattr_set_acl(struct inode *inode, int type, const void *value,
426 } else 413 } else
427 acl = NULL; 414 acl = NULL;
428 415
429 error = ext2_set_acl(inode, type, acl); 416 error = ext2_set_acl(dentry->d_inode, type, acl);
430 417
431release_and_out: 418release_and_out:
432 posix_acl_release(acl); 419 posix_acl_release(acl);
433 return error; 420 return error;
434} 421}
435 422
436static int
437ext2_xattr_set_acl_access(struct inode *inode, const char *name,
438 const void *value, size_t size, int flags)
439{
440 if (strcmp(name, "") != 0)
441 return -EINVAL;
442 return ext2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
443}
444
445static int
446ext2_xattr_set_acl_default(struct inode *inode, const char *name,
447 const void *value, size_t size, int flags)
448{
449 if (strcmp(name, "") != 0)
450 return -EINVAL;
451 return ext2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
452}
453
454struct xattr_handler ext2_xattr_acl_access_handler = { 423struct xattr_handler ext2_xattr_acl_access_handler = {
455 .prefix = POSIX_ACL_XATTR_ACCESS, 424 .prefix = POSIX_ACL_XATTR_ACCESS,
425 .flags = ACL_TYPE_ACCESS,
456 .list = ext2_xattr_list_acl_access, 426 .list = ext2_xattr_list_acl_access,
457 .get = ext2_xattr_get_acl_access, 427 .get = ext2_xattr_get_acl,
458 .set = ext2_xattr_set_acl_access, 428 .set = ext2_xattr_set_acl,
459}; 429};
460 430
461struct xattr_handler ext2_xattr_acl_default_handler = { 431struct xattr_handler ext2_xattr_acl_default_handler = {
462 .prefix = POSIX_ACL_XATTR_DEFAULT, 432 .prefix = POSIX_ACL_XATTR_DEFAULT,
433 .flags = ACL_TYPE_DEFAULT,
463 .list = ext2_xattr_list_acl_default, 434 .list = ext2_xattr_list_acl_default,
464 .get = ext2_xattr_get_acl_default, 435 .get = ext2_xattr_get_acl,
465 .set = ext2_xattr_set_acl_default, 436 .set = ext2_xattr_set_acl,
466}; 437};
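
The ext2 ACL rework above collapses four per-type wrapper functions into a single get and a single set callback, with the ACL type carried in the handler's new .flags field. A minimal sketch of the dispatch this enables, assuming only the handler shape shown in this hunk; demo_getxattr is illustrative, not the kernel's actual dispatcher:

static int demo_getxattr(struct xattr_handler *h, struct dentry *dentry,
			 const char *suffix, void *buf, size_t size)
{
	/* 'suffix' is the attribute name with the handler prefix
	 * stripped; the handler receives its own .flags as 'type',
	 * so one function serves both ACL prefixes */
	return h->get(dentry, suffix, buf, size, h->flags);
}
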
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 7f8d2e5a7ea6..3cf038c055d7 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -13,6 +13,7 @@
13 13
14#include "ext2.h" 14#include "ext2.h"
15#include <linux/quotaops.h> 15#include <linux/quotaops.h>
16#include <linux/slab.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17#include <linux/buffer_head.h> 18#include <linux/buffer_head.h>
18#include <linux/capability.h> 19#include <linux/capability.h>
@@ -570,7 +571,7 @@ do_more:
570error_return: 571error_return:
571 brelse(bitmap_bh); 572 brelse(bitmap_bh);
572 release_blocks(sb, freed); 573 release_blocks(sb, freed);
573 vfs_dq_free_block(inode, freed); 574 dquot_free_block(inode, freed);
574} 575}
575 576
576/** 577/**
@@ -1236,6 +1237,7 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
1236 unsigned short windowsz = 0; 1237 unsigned short windowsz = 0;
1237 unsigned long ngroups; 1238 unsigned long ngroups;
1238 unsigned long num = *count; 1239 unsigned long num = *count;
1240 int ret;
1239 1241
1240 *errp = -ENOSPC; 1242 *errp = -ENOSPC;
1241 sb = inode->i_sb; 1243 sb = inode->i_sb;
@@ -1247,8 +1249,9 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
1247 /* 1249 /*
1248 * Check quota for allocation of this block. 1250 * Check quota for allocation of this block.
1249 */ 1251 */
1250 if (vfs_dq_alloc_block(inode, num)) { 1252 ret = dquot_alloc_block(inode, num);
1251 *errp = -EDQUOT; 1253 if (ret) {
1254 *errp = ret;
1252 return 0; 1255 return 0;
1253 } 1256 }
1254 1257
@@ -1409,7 +1412,7 @@ allocated:
1409 1412
1410 *errp = 0; 1413 *errp = 0;
1411 brelse(bitmap_bh); 1414 brelse(bitmap_bh);
1412 vfs_dq_free_block(inode, *count-num); 1415 dquot_free_block(inode, *count-num);
1413 *count = num; 1416 *count = num;
1414 return ret_block; 1417 return ret_block;
1415 1418
@@ -1420,7 +1423,7 @@ out:
1420 * Undo the block allocation 1423 * Undo the block allocation
1421 */ 1424 */
1422 if (!performed_allocation) 1425 if (!performed_allocation)
1423 vfs_dq_free_block(inode, *count); 1426 dquot_free_block(inode, *count);
1424 brelse(bitmap_bh); 1427 brelse(bitmap_bh);
1425 return 0; 1428 return 0;
1426} 1429}
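
The quota conversion above also changes the error convention: vfs_dq_alloc_block() returned non-zero on failure and callers hard-coded -EDQUOT, while dquot_alloc_block() returns an errno to propagate as-is. A hedged sketch of the resulting charge/undo pattern; demo_do_allocation is a hypothetical stand-in for ext2_new_blocks():

/* hypothetical allocator stub standing in for ext2_new_blocks() */
static int demo_do_allocation(struct inode *inode, unsigned long nr)
{
	return 0;
}

static int demo_charge_blocks(struct inode *inode, unsigned long nr)
{
	int ret = dquot_alloc_block(inode, nr);	/* 0 or -errno */

	if (ret)
		return ret;			/* e.g. -EDQUOT, unmodified */

	ret = demo_do_allocation(inode, nr);
	if (ret)
		dquot_free_block(inode, nr);	/* undo the quota charge */
	return ret;
}
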
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 6cde970b0a1a..7516957273ed 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -353,8 +353,8 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
353 * ext2_find_entry() 353 * ext2_find_entry()
354 * 354 *
355 * finds an entry in the specified directory with the wanted name. It 355 * finds an entry in the specified directory with the wanted name. It
356 * returns the page in which the entry was found, and the entry itself 356 * returns the page in which the entry was found (as a parameter - res_page),
357 * (as a parameter - res_dir). Page is returned mapped and unlocked. 357 * and the entry itself. Page is returned mapped and unlocked.
358 * Entry is guaranteed to be valid. 358 * Entry is guaranteed to be valid.
359 */ 359 */
360struct ext2_dir_entry_2 *ext2_find_entry (struct inode * dir, 360struct ext2_dir_entry_2 *ext2_find_entry (struct inode * dir,
@@ -721,5 +721,5 @@ const struct file_operations ext2_dir_operations = {
721#ifdef CONFIG_COMPAT 721#ifdef CONFIG_COMPAT
722 .compat_ioctl = ext2_compat_ioctl, 722 .compat_ioctl = ext2_compat_ioctl,
723#endif 723#endif
724 .fsync = simple_fsync, 724 .fsync = ext2_fsync,
725}; 725};
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 9a8a8e27a063..0b038e47ad2f 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -118,7 +118,7 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned);
118 118
119/* inode.c */ 119/* inode.c */
120extern struct inode *ext2_iget (struct super_block *, unsigned long); 120extern struct inode *ext2_iget (struct super_block *, unsigned long);
121extern int ext2_write_inode (struct inode *, int); 121extern int ext2_write_inode (struct inode *, struct writeback_control *);
122extern void ext2_delete_inode (struct inode *); 122extern void ext2_delete_inode (struct inode *);
123extern int ext2_sync_inode (struct inode *); 123extern int ext2_sync_inode (struct inode *);
124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); 124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
@@ -142,7 +142,7 @@ struct dentry *ext2_get_parent(struct dentry *child);
142/* super.c */ 142/* super.c */
143extern void ext2_error (struct super_block *, const char *, const char *, ...) 143extern void ext2_error (struct super_block *, const char *, const char *, ...)
144 __attribute__ ((format (printf, 3, 4))); 144 __attribute__ ((format (printf, 3, 4)));
145extern void ext2_warning (struct super_block *, const char *, const char *, ...) 145extern void ext2_msg(struct super_block *, const char *, const char *, ...)
146 __attribute__ ((format (printf, 3, 4))); 146 __attribute__ ((format (printf, 3, 4)));
147extern void ext2_update_dynamic_rev (struct super_block *sb); 147extern void ext2_update_dynamic_rev (struct super_block *sb);
148extern void ext2_write_super (struct super_block *); 148extern void ext2_write_super (struct super_block *);
@@ -155,6 +155,7 @@ extern void ext2_write_super (struct super_block *);
155extern const struct file_operations ext2_dir_operations; 155extern const struct file_operations ext2_dir_operations;
156 156
157/* file.c */ 157/* file.c */
158extern int ext2_fsync(struct file *file, struct dentry *dentry, int datasync);
158extern const struct inode_operations ext2_file_inode_operations; 159extern const struct inode_operations ext2_file_inode_operations;
159extern const struct file_operations ext2_file_operations; 160extern const struct file_operations ext2_file_operations;
160extern const struct file_operations ext2_xip_file_operations; 161extern const struct file_operations ext2_xip_file_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index a2f3afd1a1c1..5d198d0697fb 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -19,6 +19,8 @@
19 */ 19 */
20 20
21#include <linux/time.h> 21#include <linux/time.h>
22#include <linux/pagemap.h>
23#include <linux/quotaops.h>
22#include "ext2.h" 24#include "ext2.h"
23#include "xattr.h" 25#include "xattr.h"
24#include "acl.h" 26#include "acl.h"
@@ -38,6 +40,22 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
38 return 0; 40 return 0;
39} 41}
40 42
43int ext2_fsync(struct file *file, struct dentry *dentry, int datasync)
44{
45 int ret;
46 struct super_block *sb = dentry->d_inode->i_sb;
47 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
48
49 ret = simple_fsync(file, dentry, datasync);
50 if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) {
51 /* We don't really know where the IO error happened... */
52 ext2_error(sb, __func__,
53 "detected IO error when writing metadata buffers");
54 ret = -EIO;
55 }
56 return ret;
57}
58
41/* 59/*
42 * We have mostly NULL's here: the current defaults are ok for 60 * We have mostly NULL's here: the current defaults are ok for
43 * the ext2 filesystem. 61 * the ext2 filesystem.
@@ -53,9 +71,9 @@ const struct file_operations ext2_file_operations = {
53 .compat_ioctl = ext2_compat_ioctl, 71 .compat_ioctl = ext2_compat_ioctl,
54#endif 72#endif
55 .mmap = generic_file_mmap, 73 .mmap = generic_file_mmap,
56 .open = generic_file_open, 74 .open = dquot_file_open,
57 .release = ext2_release_file, 75 .release = ext2_release_file,
58 .fsync = simple_fsync, 76 .fsync = ext2_fsync,
59 .splice_read = generic_file_splice_read, 77 .splice_read = generic_file_splice_read,
60 .splice_write = generic_file_splice_write, 78 .splice_write = generic_file_splice_write,
61}; 79};
@@ -70,9 +88,9 @@ const struct file_operations ext2_xip_file_operations = {
70 .compat_ioctl = ext2_compat_ioctl, 88 .compat_ioctl = ext2_compat_ioctl,
71#endif 89#endif
72 .mmap = xip_file_mmap, 90 .mmap = xip_file_mmap,
73 .open = generic_file_open, 91 .open = dquot_file_open,
74 .release = ext2_release_file, 92 .release = ext2_release_file,
75 .fsync = simple_fsync, 93 .fsync = ext2_fsync,
76}; 94};
77#endif 95#endif
78 96
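
From userspace, the new ext2_fsync means a failed metadata writeback is no longer silently dropped: the AS_EIO bit on the block device mapping is folded into the fsync(2) return value. An illustrative userspace check (the path is hypothetical):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/ext2/file", O_RDWR);

	if (fd < 0)
		return 1;
	if (fsync(fd) < 0 && errno == EIO)
		fprintf(stderr, "ext2 reported a metadata write error\n");
	close(fd);
	return 0;
}
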
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 15387c9c17d8..ad7d572ee8dc 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -121,8 +121,8 @@ void ext2_free_inode (struct inode * inode)
121 if (!is_bad_inode(inode)) { 121 if (!is_bad_inode(inode)) {
122 /* Quota is already initialized in iput() */ 122 /* Quota is already initialized in iput() */
123 ext2_xattr_delete_inode(inode); 123 ext2_xattr_delete_inode(inode);
124 vfs_dq_free_inode(inode); 124 dquot_free_inode(inode);
125 vfs_dq_drop(inode); 125 dquot_drop(inode);
126 } 126 }
127 127
128 es = EXT2_SB(sb)->s_es; 128 es = EXT2_SB(sb)->s_es;
@@ -586,10 +586,10 @@ got:
586 goto fail_drop; 586 goto fail_drop;
587 } 587 }
588 588
589 if (vfs_dq_alloc_inode(inode)) { 589 dquot_initialize(inode);
590 err = -EDQUOT; 590 err = dquot_alloc_inode(inode);
591 if (err)
591 goto fail_drop; 592 goto fail_drop;
592 }
593 593
594 err = ext2_init_acl(inode, dir); 594 err = ext2_init_acl(inode, dir);
595 if (err) 595 if (err)
@@ -605,10 +605,10 @@ got:
605 return inode; 605 return inode;
606 606
607fail_free_drop: 607fail_free_drop:
608 vfs_dq_free_inode(inode); 608 dquot_free_inode(inode);
609 609
610fail_drop: 610fail_drop:
611 vfs_dq_drop(inode); 611 dquot_drop(inode);
612 inode->i_flags |= S_NOQUOTA; 612 inode->i_flags |= S_NOQUOTA;
613 inode->i_nlink = 0; 613 inode->i_nlink = 0;
614 unlock_new_inode(inode); 614 unlock_new_inode(inode);
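
The ialloc change above follows the new ordering rule: attach quota structures with dquot_initialize() before charging the inode, and propagate the errno from dquot_alloc_inode() instead of assuming -EDQUOT. A minimal sketch of that sequence (demo_charge_new_inode is illustrative):

static int demo_charge_new_inode(struct inode *inode)
{
	int err;

	dquot_initialize(inode);	/* attach dquots first */
	err = dquot_alloc_inode(inode);	/* 0 or -errno, e.g. -EDQUOT */
	if (err)
		dquot_drop(inode);	/* undo on failure */
	return err;
}
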
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index ade634076d0a..fc13cc119aad 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -41,6 +41,8 @@ MODULE_AUTHOR("Remy Card and others");
41MODULE_DESCRIPTION("Second Extended Filesystem"); 41MODULE_DESCRIPTION("Second Extended Filesystem");
42MODULE_LICENSE("GPL"); 42MODULE_LICENSE("GPL");
43 43
44static int __ext2_write_inode(struct inode *inode, int do_sync);
45
44/* 46/*
45 * Test whether an inode is a fast symlink. 47 * Test whether an inode is a fast symlink.
46 */ 48 */
@@ -58,13 +60,15 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode)
58 */ 60 */
59void ext2_delete_inode (struct inode * inode) 61void ext2_delete_inode (struct inode * inode)
60{ 62{
63 if (!is_bad_inode(inode))
64 dquot_initialize(inode);
61 truncate_inode_pages(&inode->i_data, 0); 65 truncate_inode_pages(&inode->i_data, 0);
62 66
63 if (is_bad_inode(inode)) 67 if (is_bad_inode(inode))
64 goto no_delete; 68 goto no_delete;
65 EXT2_I(inode)->i_dtime = get_seconds(); 69 EXT2_I(inode)->i_dtime = get_seconds();
66 mark_inode_dirty(inode); 70 mark_inode_dirty(inode);
67 ext2_write_inode(inode, inode_needs_sync(inode)); 71 __ext2_write_inode(inode, inode_needs_sync(inode));
68 72
69 inode->i_size = 0; 73 inode->i_size = 0;
70 if (inode->i_blocks) 74 if (inode->i_blocks)
@@ -137,7 +141,8 @@ static int ext2_block_to_path(struct inode *inode,
137 int final = 0; 141 int final = 0;
138 142
139 if (i_block < 0) { 143 if (i_block < 0) {
140 ext2_warning (inode->i_sb, "ext2_block_to_path", "block < 0"); 144 ext2_msg(inode->i_sb, KERN_WARNING,
145 "warning: %s: block < 0", __func__);
141 } else if (i_block < direct_blocks) { 146 } else if (i_block < direct_blocks) {
142 offsets[n++] = i_block; 147 offsets[n++] = i_block;
143 final = direct_blocks; 148 final = direct_blocks;
@@ -157,7 +162,8 @@ static int ext2_block_to_path(struct inode *inode,
157 offsets[n++] = i_block & (ptrs - 1); 162 offsets[n++] = i_block & (ptrs - 1);
158 final = ptrs; 163 final = ptrs;
159 } else { 164 } else {
160 ext2_warning (inode->i_sb, "ext2_block_to_path", "block > big"); 165 ext2_msg(inode->i_sb, KERN_WARNING,
166 "warning: %s: block is too big", __func__);
161 } 167 }
162 if (boundary) 168 if (boundary)
163 *boundary = final - 1 - (i_block & (ptrs - 1)); 169 *boundary = final - 1 - (i_block & (ptrs - 1));
@@ -1333,7 +1339,7 @@ bad_inode:
1333 return ERR_PTR(ret); 1339 return ERR_PTR(ret);
1334} 1340}
1335 1341
1336int ext2_write_inode(struct inode *inode, int do_sync) 1342static int __ext2_write_inode(struct inode *inode, int do_sync)
1337{ 1343{
1338 struct ext2_inode_info *ei = EXT2_I(inode); 1344 struct ext2_inode_info *ei = EXT2_I(inode);
1339 struct super_block *sb = inode->i_sb; 1345 struct super_block *sb = inode->i_sb;
@@ -1438,6 +1444,11 @@ int ext2_write_inode(struct inode *inode, int do_sync)
1438 return err; 1444 return err;
1439} 1445}
1440 1446
1447int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
1448{
1449 return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1450}
1451
1441int ext2_sync_inode(struct inode *inode) 1452int ext2_sync_inode(struct inode *inode)
1442{ 1453{
1443 struct writeback_control wbc = { 1454 struct writeback_control wbc = {
@@ -1455,9 +1466,12 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1455 error = inode_change_ok(inode, iattr); 1466 error = inode_change_ok(inode, iattr);
1456 if (error) 1467 if (error)
1457 return error; 1468 return error;
1469
1470 if (iattr->ia_valid & ATTR_SIZE)
1471 dquot_initialize(inode);
1458 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || 1472 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
1459 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { 1473 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
1460 error = vfs_dq_transfer(inode, iattr) ? -EDQUOT : 0; 1474 error = dquot_transfer(inode, iattr);
1461 if (error) 1475 if (error)
1462 return error; 1476 return error;
1463 } 1477 }
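
With ->write_inode now taking a writeback_control, internal callers that need a guaranteed synchronous write construct one with WB_SYNC_ALL, as ext2_sync_inode does. A minimal sketch under that assumption:

static int demo_sync_inode(struct inode *inode)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = 0,	/* metadata-only */
	};

	return sync_inode(inode, &wbc);
}
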
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index dd7175ce5606..71efb0e9a3f2 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <linux/pagemap.h> 33#include <linux/pagemap.h>
34#include <linux/quotaops.h>
34#include "ext2.h" 35#include "ext2.h"
35#include "xattr.h" 36#include "xattr.h"
36#include "acl.h" 37#include "acl.h"
@@ -99,24 +100,27 @@ struct dentry *ext2_get_parent(struct dentry *child)
99 */ 100 */
100static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) 101static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd)
101{ 102{
102 struct inode * inode = ext2_new_inode (dir, mode); 103 struct inode *inode;
103 int err = PTR_ERR(inode); 104
104 if (!IS_ERR(inode)) { 105 dquot_initialize(dir);
105 inode->i_op = &ext2_file_inode_operations; 106
106 if (ext2_use_xip(inode->i_sb)) { 107 inode = ext2_new_inode(dir, mode);
107 inode->i_mapping->a_ops = &ext2_aops_xip; 108 if (IS_ERR(inode))
108 inode->i_fop = &ext2_xip_file_operations; 109 return PTR_ERR(inode);
109 } else if (test_opt(inode->i_sb, NOBH)) { 110
110 inode->i_mapping->a_ops = &ext2_nobh_aops; 111 inode->i_op = &ext2_file_inode_operations;
111 inode->i_fop = &ext2_file_operations; 112 if (ext2_use_xip(inode->i_sb)) {
112 } else { 113 inode->i_mapping->a_ops = &ext2_aops_xip;
113 inode->i_mapping->a_ops = &ext2_aops; 114 inode->i_fop = &ext2_xip_file_operations;
114 inode->i_fop = &ext2_file_operations; 115 } else if (test_opt(inode->i_sb, NOBH)) {
115 } 116 inode->i_mapping->a_ops = &ext2_nobh_aops;
116 mark_inode_dirty(inode); 117 inode->i_fop = &ext2_file_operations;
117 err = ext2_add_nondir(dentry, inode); 118 } else {
119 inode->i_mapping->a_ops = &ext2_aops;
120 inode->i_fop = &ext2_file_operations;
118 } 121 }
119 return err; 122 mark_inode_dirty(inode);
123 return ext2_add_nondir(dentry, inode);
120} 124}
121 125
122static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) 126static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev)
@@ -127,6 +131,8 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_
127 if (!new_valid_dev(rdev)) 131 if (!new_valid_dev(rdev))
128 return -EINVAL; 132 return -EINVAL;
129 133
134 dquot_initialize(dir);
135
130 inode = ext2_new_inode (dir, mode); 136 inode = ext2_new_inode (dir, mode);
131 err = PTR_ERR(inode); 137 err = PTR_ERR(inode);
132 if (!IS_ERR(inode)) { 138 if (!IS_ERR(inode)) {
@@ -151,6 +157,8 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
151 if (l > sb->s_blocksize) 157 if (l > sb->s_blocksize)
152 goto out; 158 goto out;
153 159
160 dquot_initialize(dir);
161
154 inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO); 162 inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO);
155 err = PTR_ERR(inode); 163 err = PTR_ERR(inode);
156 if (IS_ERR(inode)) 164 if (IS_ERR(inode))
@@ -194,6 +202,8 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
194 if (inode->i_nlink >= EXT2_LINK_MAX) 202 if (inode->i_nlink >= EXT2_LINK_MAX)
195 return -EMLINK; 203 return -EMLINK;
196 204
205 dquot_initialize(dir);
206
197 inode->i_ctime = CURRENT_TIME_SEC; 207 inode->i_ctime = CURRENT_TIME_SEC;
198 inode_inc_link_count(inode); 208 inode_inc_link_count(inode);
199 atomic_inc(&inode->i_count); 209 atomic_inc(&inode->i_count);
@@ -216,6 +226,8 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
216 if (dir->i_nlink >= EXT2_LINK_MAX) 226 if (dir->i_nlink >= EXT2_LINK_MAX)
217 goto out; 227 goto out;
218 228
229 dquot_initialize(dir);
230
219 inode_inc_link_count(dir); 231 inode_inc_link_count(dir);
220 232
221 inode = ext2_new_inode (dir, S_IFDIR | mode); 233 inode = ext2_new_inode (dir, S_IFDIR | mode);
@@ -262,6 +274,8 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry)
262 struct page * page; 274 struct page * page;
263 int err = -ENOENT; 275 int err = -ENOENT;
264 276
277 dquot_initialize(dir);
278
265 de = ext2_find_entry (dir, &dentry->d_name, &page); 279 de = ext2_find_entry (dir, &dentry->d_name, &page);
266 if (!de) 280 if (!de)
267 goto out; 281 goto out;
@@ -304,6 +318,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
304 struct ext2_dir_entry_2 * old_de; 318 struct ext2_dir_entry_2 * old_de;
305 int err = -ENOENT; 319 int err = -ENOENT;
306 320
321 dquot_initialize(old_dir);
322 dquot_initialize(new_dir);
323
307 old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page); 324 old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page);
308 if (!old_de) 325 if (!old_de)
309 goto out; 326 goto out;
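
The namei changes above all apply one rule: any operation that may charge or release quota against a directory calls dquot_initialize() on it before the first modification, and rename does so for both directories. A sketch of the rename case (demo_rename_like is illustrative):

static int demo_rename_like(struct inode *old_dir, struct inode *new_dir)
{
	/* both directories may gain or lose blocks, so both need
	 * their quota structures attached up front */
	dquot_initialize(old_dir);
	dquot_initialize(new_dir);

	/* ... perform the rename proper ... */
	return 0;
}
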
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 1a9ffee47d56..42e4a303b675 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -58,27 +58,27 @@ void ext2_error (struct super_block * sb, const char * function,
58 } 58 }
59 59
60 va_start(args, fmt); 60 va_start(args, fmt);
61 printk(KERN_CRIT "EXT2-fs error (device %s): %s: ",sb->s_id, function); 61 printk(KERN_CRIT "EXT2-fs (%s): error: %s: ", sb->s_id, function);
62 vprintk(fmt, args); 62 vprintk(fmt, args);
63 printk("\n"); 63 printk("\n");
64 va_end(args); 64 va_end(args);
65 65
66 if (test_opt(sb, ERRORS_PANIC)) 66 if (test_opt(sb, ERRORS_PANIC))
67 panic("EXT2-fs panic from previous error\n"); 67 panic("EXT2-fs: panic from previous error\n");
68 if (test_opt(sb, ERRORS_RO)) { 68 if (test_opt(sb, ERRORS_RO)) {
69 printk("Remounting filesystem read-only\n"); 69 ext2_msg(sb, KERN_CRIT,
70 "error: remounting filesystem read-only");
70 sb->s_flags |= MS_RDONLY; 71 sb->s_flags |= MS_RDONLY;
71 } 72 }
72} 73}
73 74
74void ext2_warning (struct super_block * sb, const char * function, 75void ext2_msg(struct super_block *sb, const char *prefix,
75 const char * fmt, ...) 76 const char *fmt, ...)
76{ 77{
77 va_list args; 78 va_list args;
78 79
79 va_start(args, fmt); 80 va_start(args, fmt);
80 printk(KERN_WARNING "EXT2-fs warning (device %s): %s: ", 81 printk("%sEXT2-fs (%s): ", prefix, sb->s_id);
81 sb->s_id, function);
82 vprintk(fmt, args); 82 vprintk(fmt, args);
83 printk("\n"); 83 printk("\n");
84 va_end(args); 84 va_end(args);
@@ -91,9 +91,9 @@ void ext2_update_dynamic_rev(struct super_block *sb)
91 if (le32_to_cpu(es->s_rev_level) > EXT2_GOOD_OLD_REV) 91 if (le32_to_cpu(es->s_rev_level) > EXT2_GOOD_OLD_REV)
92 return; 92 return;
93 93
94 ext2_warning(sb, __func__, 94 ext2_msg(sb, KERN_WARNING,
95 "updating to rev %d because of new feature flag, " 95 "warning: updating to rev %d because of "
96 "running e2fsck is recommended", 96 "new feature flag, running e2fsck is recommended",
97 EXT2_DYNAMIC_REV); 97 EXT2_DYNAMIC_REV);
98 98
99 es->s_first_ino = cpu_to_le32(EXT2_GOOD_OLD_FIRST_INO); 99 es->s_first_ino = cpu_to_le32(EXT2_GOOD_OLD_FIRST_INO);
@@ -194,6 +194,8 @@ static void destroy_inodecache(void)
194static void ext2_clear_inode(struct inode *inode) 194static void ext2_clear_inode(struct inode *inode)
195{ 195{
196 struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info; 196 struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info;
197
198 dquot_drop(inode);
197 ext2_discard_reservation(inode); 199 ext2_discard_reservation(inode);
198 EXT2_I(inode)->i_block_alloc_info = NULL; 200 EXT2_I(inode)->i_block_alloc_info = NULL;
199 if (unlikely(rsv)) 201 if (unlikely(rsv))
@@ -419,10 +421,10 @@ static const match_table_t tokens = {
419 {Opt_err, NULL} 421 {Opt_err, NULL}
420}; 422};
421 423
422static int parse_options (char * options, 424static int parse_options(char *options, struct super_block *sb)
423 struct ext2_sb_info *sbi)
424{ 425{
425 char * p; 426 char *p;
427 struct ext2_sb_info *sbi = EXT2_SB(sb);
426 substring_t args[MAX_OPT_ARGS]; 428 substring_t args[MAX_OPT_ARGS];
427 int option; 429 int option;
428 430
@@ -505,7 +507,8 @@ static int parse_options (char * options,
505#else 507#else
506 case Opt_user_xattr: 508 case Opt_user_xattr:
507 case Opt_nouser_xattr: 509 case Opt_nouser_xattr:
508 printk("EXT2 (no)user_xattr options not supported\n"); 510 ext2_msg(sb, KERN_INFO, "(no)user_xattr options"
511 "not supported");
509 break; 512 break;
510#endif 513#endif
511#ifdef CONFIG_EXT2_FS_POSIX_ACL 514#ifdef CONFIG_EXT2_FS_POSIX_ACL
@@ -518,14 +521,15 @@ static int parse_options (char * options,
518#else 521#else
519 case Opt_acl: 522 case Opt_acl:
520 case Opt_noacl: 523 case Opt_noacl:
521 printk("EXT2 (no)acl options not supported\n"); 524 ext2_msg(sb, KERN_INFO,
525 "(no)acl options not supported");
522 break; 526 break;
523#endif 527#endif
524 case Opt_xip: 528 case Opt_xip:
525#ifdef CONFIG_EXT2_FS_XIP 529#ifdef CONFIG_EXT2_FS_XIP
526 set_opt (sbi->s_mount_opt, XIP); 530 set_opt (sbi->s_mount_opt, XIP);
527#else 531#else
528 printk("EXT2 xip option not supported\n"); 532 ext2_msg(sb, KERN_INFO, "xip option not supported");
529#endif 533#endif
530 break; 534 break;
531 535
@@ -542,19 +546,18 @@ static int parse_options (char * options,
542 case Opt_quota: 546 case Opt_quota:
543 case Opt_usrquota: 547 case Opt_usrquota:
544 case Opt_grpquota: 548 case Opt_grpquota:
545 printk(KERN_ERR 549 ext2_msg(sb, KERN_INFO,
546 "EXT2-fs: quota operations not supported.\n"); 550 "quota operations not supported");
547
548 break; 551 break;
549#endif 552#endif
550 553
551 case Opt_reservation: 554 case Opt_reservation:
552 set_opt(sbi->s_mount_opt, RESERVATION); 555 set_opt(sbi->s_mount_opt, RESERVATION);
553 printk("reservations ON\n"); 556 ext2_msg(sb, KERN_INFO, "reservations ON");
554 break; 557 break;
555 case Opt_noreservation: 558 case Opt_noreservation:
556 clear_opt(sbi->s_mount_opt, RESERVATION); 559 clear_opt(sbi->s_mount_opt, RESERVATION);
557 printk("reservations OFF\n"); 560 ext2_msg(sb, KERN_INFO, "reservations OFF");
558 break; 561 break;
559 case Opt_ignore: 562 case Opt_ignore:
560 break; 563 break;
@@ -573,34 +576,40 @@ static int ext2_setup_super (struct super_block * sb,
573 struct ext2_sb_info *sbi = EXT2_SB(sb); 576 struct ext2_sb_info *sbi = EXT2_SB(sb);
574 577
575 if (le32_to_cpu(es->s_rev_level) > EXT2_MAX_SUPP_REV) { 578 if (le32_to_cpu(es->s_rev_level) > EXT2_MAX_SUPP_REV) {
576 printk ("EXT2-fs warning: revision level too high, " 579 ext2_msg(sb, KERN_ERR,
577 "forcing read-only mode\n"); 580 "error: revision level too high, "
581 "forcing read-only mode");
578 res = MS_RDONLY; 582 res = MS_RDONLY;
579 } 583 }
580 if (read_only) 584 if (read_only)
581 return res; 585 return res;
582 if (!(sbi->s_mount_state & EXT2_VALID_FS)) 586 if (!(sbi->s_mount_state & EXT2_VALID_FS))
583 printk ("EXT2-fs warning: mounting unchecked fs, " 587 ext2_msg(sb, KERN_WARNING,
584 "running e2fsck is recommended\n"); 588 "warning: mounting unchecked fs, "
589 "running e2fsck is recommended");
585 else if ((sbi->s_mount_state & EXT2_ERROR_FS)) 590 else if ((sbi->s_mount_state & EXT2_ERROR_FS))
586 printk ("EXT2-fs warning: mounting fs with errors, " 591 ext2_msg(sb, KERN_WARNING,
587 "running e2fsck is recommended\n"); 592 "warning: mounting fs with errors, "
593 "running e2fsck is recommended");
588 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && 594 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
589 le16_to_cpu(es->s_mnt_count) >= 595 le16_to_cpu(es->s_mnt_count) >=
590 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) 596 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
591 printk ("EXT2-fs warning: maximal mount count reached, " 597 ext2_msg(sb, KERN_WARNING,
592 "running e2fsck is recommended\n"); 598 "warning: maximal mount count reached, "
599 "running e2fsck is recommended");
593 else if (le32_to_cpu(es->s_checkinterval) && 600 else if (le32_to_cpu(es->s_checkinterval) &&
594 (le32_to_cpu(es->s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= get_seconds())) 601 (le32_to_cpu(es->s_lastcheck) +
595 printk ("EXT2-fs warning: checktime reached, " 602 le32_to_cpu(es->s_checkinterval) <= get_seconds()))
596 "running e2fsck is recommended\n"); 603 ext2_msg(sb, KERN_WARNING,
604 "warning: checktime reached, "
605 "running e2fsck is recommended");
597 if (!le16_to_cpu(es->s_max_mnt_count)) 606 if (!le16_to_cpu(es->s_max_mnt_count))
598 es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT); 607 es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT);
599 le16_add_cpu(&es->s_mnt_count, 1); 608 le16_add_cpu(&es->s_mnt_count, 1);
600 ext2_write_super(sb); 609 ext2_write_super(sb);
601 if (test_opt (sb, DEBUG)) 610 if (test_opt (sb, DEBUG))
602 printk ("[EXT II FS %s, %s, bs=%lu, fs=%lu, gc=%lu, " 611 ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, "
603 "bpg=%lu, ipg=%lu, mo=%04lx]\n", 612 "bpg=%lu, ipg=%lu, mo=%04lx]",
604 EXT2FS_VERSION, EXT2FS_DATE, sb->s_blocksize, 613 EXT2FS_VERSION, EXT2FS_DATE, sb->s_blocksize,
605 sbi->s_frag_size, 614 sbi->s_frag_size,
606 sbi->s_groups_count, 615 sbi->s_groups_count,
@@ -767,7 +776,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
767 */ 776 */
768 blocksize = sb_min_blocksize(sb, BLOCK_SIZE); 777 blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
769 if (!blocksize) { 778 if (!blocksize) {
770 printk ("EXT2-fs: unable to set blocksize\n"); 779 ext2_msg(sb, KERN_ERR, "error: unable to set blocksize");
771 goto failed_sbi; 780 goto failed_sbi;
772 } 781 }
773 782
@@ -783,7 +792,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
783 } 792 }
784 793
785 if (!(bh = sb_bread(sb, logic_sb_block))) { 794 if (!(bh = sb_bread(sb, logic_sb_block))) {
786 printk ("EXT2-fs: unable to read superblock\n"); 795 ext2_msg(sb, KERN_ERR, "error: unable to read superblock");
787 goto failed_sbi; 796 goto failed_sbi;
788 } 797 }
789 /* 798 /*
@@ -826,7 +835,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
826 835
827 set_opt(sbi->s_mount_opt, RESERVATION); 836 set_opt(sbi->s_mount_opt, RESERVATION);
828 837
829 if (!parse_options ((char *) data, sbi)) 838 if (!parse_options((char *) data, sb))
830 goto failed_mount; 839 goto failed_mount;
831 840
832 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 841 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -840,8 +849,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
840 (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || 849 (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
841 EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) || 850 EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
842 EXT2_HAS_INCOMPAT_FEATURE(sb, ~0U))) 851 EXT2_HAS_INCOMPAT_FEATURE(sb, ~0U)))
843 printk("EXT2-fs warning: feature flags set on rev 0 fs, " 852 ext2_msg(sb, KERN_WARNING,
844 "running e2fsck is recommended\n"); 853 "warning: feature flags set on rev 0 fs, "
854 "running e2fsck is recommended");
845 /* 855 /*
846 * Check feature flags regardless of the revision level, since we 856 * Check feature flags regardless of the revision level, since we
847 * previously didn't change the revision level when setting the flags, 857 * previously didn't change the revision level when setting the flags,
@@ -849,16 +859,16 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
849 */ 859 */
850 features = EXT2_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP); 860 features = EXT2_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP);
851 if (features) { 861 if (features) {
852 printk("EXT2-fs: %s: couldn't mount because of " 862 ext2_msg(sb, KERN_ERR, "error: couldn't mount because of "
853 "unsupported optional features (%x).\n", 863 "unsupported optional features (%x)",
854 sb->s_id, le32_to_cpu(features)); 864 le32_to_cpu(features));
855 goto failed_mount; 865 goto failed_mount;
856 } 866 }
857 if (!(sb->s_flags & MS_RDONLY) && 867 if (!(sb->s_flags & MS_RDONLY) &&
858 (features = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))){ 868 (features = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))){
859 printk("EXT2-fs: %s: couldn't mount RDWR because of " 869 ext2_msg(sb, KERN_ERR, "error: couldn't mount RDWR because of "
860 "unsupported optional features (%x).\n", 870 "unsupported optional features (%x)",
861 sb->s_id, le32_to_cpu(features)); 871 le32_to_cpu(features));
862 goto failed_mount; 872 goto failed_mount;
863 } 873 }
864 874
@@ -866,7 +876,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
866 876
867 if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) { 877 if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) {
868 if (!silent) 878 if (!silent)
869 printk("XIP: Unsupported blocksize\n"); 879 ext2_msg(sb, KERN_ERR,
880 "error: unsupported blocksize for xip");
870 goto failed_mount; 881 goto failed_mount;
871 } 882 }
872 883
@@ -875,7 +886,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
875 brelse(bh); 886 brelse(bh);
876 887
877 if (!sb_set_blocksize(sb, blocksize)) { 888 if (!sb_set_blocksize(sb, blocksize)) {
878 printk(KERN_ERR "EXT2-fs: blocksize too small for device.\n"); 889 ext2_msg(sb, KERN_ERR, "error: blocksize is too small");
879 goto failed_sbi; 890 goto failed_sbi;
880 } 891 }
881 892
@@ -883,14 +894,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
883 offset = (sb_block*BLOCK_SIZE) % blocksize; 894 offset = (sb_block*BLOCK_SIZE) % blocksize;
884 bh = sb_bread(sb, logic_sb_block); 895 bh = sb_bread(sb, logic_sb_block);
885 if(!bh) { 896 if(!bh) {
886 printk("EXT2-fs: Couldn't read superblock on " 897 ext2_msg(sb, KERN_ERR, "error: couldn't read "
887 "2nd try.\n"); 898 "superblock on 2nd try");
888 goto failed_sbi; 899 goto failed_sbi;
889 } 900 }
890 es = (struct ext2_super_block *) (((char *)bh->b_data) + offset); 901 es = (struct ext2_super_block *) (((char *)bh->b_data) + offset);
891 sbi->s_es = es; 902 sbi->s_es = es;
892 if (es->s_magic != cpu_to_le16(EXT2_SUPER_MAGIC)) { 903 if (es->s_magic != cpu_to_le16(EXT2_SUPER_MAGIC)) {
893 printk ("EXT2-fs: Magic mismatch, very weird !\n"); 904 ext2_msg(sb, KERN_ERR, "error: magic mismatch");
894 goto failed_mount; 905 goto failed_mount;
895 } 906 }
896 } 907 }
@@ -906,7 +917,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
906 if ((sbi->s_inode_size < EXT2_GOOD_OLD_INODE_SIZE) || 917 if ((sbi->s_inode_size < EXT2_GOOD_OLD_INODE_SIZE) ||
907 !is_power_of_2(sbi->s_inode_size) || 918 !is_power_of_2(sbi->s_inode_size) ||
908 (sbi->s_inode_size > blocksize)) { 919 (sbi->s_inode_size > blocksize)) {
909 printk ("EXT2-fs: unsupported inode size: %d\n", 920 ext2_msg(sb, KERN_ERR,
921 "error: unsupported inode size: %d",
910 sbi->s_inode_size); 922 sbi->s_inode_size);
911 goto failed_mount; 923 goto failed_mount;
912 } 924 }
@@ -943,29 +955,33 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
943 955
944 if (sb->s_blocksize != bh->b_size) { 956 if (sb->s_blocksize != bh->b_size) {
945 if (!silent) 957 if (!silent)
946 printk ("VFS: Unsupported blocksize on dev " 958 ext2_msg(sb, KERN_ERR, "error: unsupported blocksize");
947 "%s.\n", sb->s_id);
948 goto failed_mount; 959 goto failed_mount;
949 } 960 }
950 961
951 if (sb->s_blocksize != sbi->s_frag_size) { 962 if (sb->s_blocksize != sbi->s_frag_size) {
952 printk ("EXT2-fs: fragsize %lu != blocksize %lu (not supported yet)\n", 963 ext2_msg(sb, KERN_ERR,
964 "error: fragsize %lu != blocksize %lu"
965 "(not supported yet)",
953 sbi->s_frag_size, sb->s_blocksize); 966 sbi->s_frag_size, sb->s_blocksize);
954 goto failed_mount; 967 goto failed_mount;
955 } 968 }
956 969
957 if (sbi->s_blocks_per_group > sb->s_blocksize * 8) { 970 if (sbi->s_blocks_per_group > sb->s_blocksize * 8) {
958 printk ("EXT2-fs: #blocks per group too big: %lu\n", 971 ext2_msg(sb, KERN_ERR,
972 "error: #blocks per group too big: %lu",
959 sbi->s_blocks_per_group); 973 sbi->s_blocks_per_group);
960 goto failed_mount; 974 goto failed_mount;
961 } 975 }
962 if (sbi->s_frags_per_group > sb->s_blocksize * 8) { 976 if (sbi->s_frags_per_group > sb->s_blocksize * 8) {
963 printk ("EXT2-fs: #fragments per group too big: %lu\n", 977 ext2_msg(sb, KERN_ERR,
978 "error: #fragments per group too big: %lu",
964 sbi->s_frags_per_group); 979 sbi->s_frags_per_group);
965 goto failed_mount; 980 goto failed_mount;
966 } 981 }
967 if (sbi->s_inodes_per_group > sb->s_blocksize * 8) { 982 if (sbi->s_inodes_per_group > sb->s_blocksize * 8) {
968 printk ("EXT2-fs: #inodes per group too big: %lu\n", 983 ext2_msg(sb, KERN_ERR,
984 "error: #inodes per group too big: %lu",
969 sbi->s_inodes_per_group); 985 sbi->s_inodes_per_group);
970 goto failed_mount; 986 goto failed_mount;
971 } 987 }
@@ -979,13 +995,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
979 EXT2_DESC_PER_BLOCK(sb); 995 EXT2_DESC_PER_BLOCK(sb);
980 sbi->s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL); 996 sbi->s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL);
981 if (sbi->s_group_desc == NULL) { 997 if (sbi->s_group_desc == NULL) {
982 printk ("EXT2-fs: not enough memory\n"); 998 ext2_msg(sb, KERN_ERR, "error: not enough memory");
983 goto failed_mount; 999 goto failed_mount;
984 } 1000 }
985 bgl_lock_init(sbi->s_blockgroup_lock); 1001 bgl_lock_init(sbi->s_blockgroup_lock);
986 sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); 1002 sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
987 if (!sbi->s_debts) { 1003 if (!sbi->s_debts) {
988 printk ("EXT2-fs: not enough memory\n"); 1004 ext2_msg(sb, KERN_ERR, "error: not enough memory");
989 goto failed_mount_group_desc; 1005 goto failed_mount_group_desc;
990 } 1006 }
991 for (i = 0; i < db_count; i++) { 1007 for (i = 0; i < db_count; i++) {
@@ -994,12 +1010,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
994 if (!sbi->s_group_desc[i]) { 1010 if (!sbi->s_group_desc[i]) {
995 for (j = 0; j < i; j++) 1011 for (j = 0; j < i; j++)
996 brelse (sbi->s_group_desc[j]); 1012 brelse (sbi->s_group_desc[j]);
997 printk ("EXT2-fs: unable to read group descriptors\n"); 1013 ext2_msg(sb, KERN_ERR,
1014 "error: unable to read group descriptors");
998 goto failed_mount_group_desc; 1015 goto failed_mount_group_desc;
999 } 1016 }
1000 } 1017 }
1001 if (!ext2_check_descriptors (sb)) { 1018 if (!ext2_check_descriptors (sb)) {
1002 printk ("EXT2-fs: group descriptors corrupted!\n"); 1019 ext2_msg(sb, KERN_ERR, "group descriptors corrupted");
1003 goto failed_mount2; 1020 goto failed_mount2;
1004 } 1021 }
1005 sbi->s_gdb_count = db_count; 1022 sbi->s_gdb_count = db_count;
@@ -1032,7 +1049,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1032 ext2_count_dirs(sb)); 1049 ext2_count_dirs(sb));
1033 } 1050 }
1034 if (err) { 1051 if (err) {
1035 printk(KERN_ERR "EXT2-fs: insufficient memory\n"); 1052 ext2_msg(sb, KERN_ERR, "error: insufficient memory");
1036 goto failed_mount3; 1053 goto failed_mount3;
1037 } 1054 }
1038 /* 1055 /*
@@ -1048,27 +1065,28 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1048 } 1065 }
1049 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 1066 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
1050 iput(root); 1067 iput(root);
1051 printk(KERN_ERR "EXT2-fs: corrupt root inode, run e2fsck\n"); 1068 ext2_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
1052 goto failed_mount3; 1069 goto failed_mount3;
1053 } 1070 }
1054 1071
1055 sb->s_root = d_alloc_root(root); 1072 sb->s_root = d_alloc_root(root);
1056 if (!sb->s_root) { 1073 if (!sb->s_root) {
1057 iput(root); 1074 iput(root);
1058 printk(KERN_ERR "EXT2-fs: get root inode failed\n"); 1075 ext2_msg(sb, KERN_ERR, "error: get root inode failed");
1059 ret = -ENOMEM; 1076 ret = -ENOMEM;
1060 goto failed_mount3; 1077 goto failed_mount3;
1061 } 1078 }
1062 if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) 1079 if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
1063 ext2_warning(sb, __func__, 1080 ext2_msg(sb, KERN_WARNING,
1064 "mounting ext3 filesystem as ext2"); 1081 "warning: mounting ext3 filesystem as ext2");
1065 ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY); 1082 ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY);
1066 return 0; 1083 return 0;
1067 1084
1068cantfind_ext2: 1085cantfind_ext2:
1069 if (!silent) 1086 if (!silent)
1070 printk("VFS: Can't find an ext2 filesystem on dev %s.\n", 1087 ext2_msg(sb, KERN_ERR,
1071 sb->s_id); 1088 "error: can't find an ext2 filesystem on dev %s.",
1089 sb->s_id);
1072 goto failed_mount; 1090 goto failed_mount;
1073failed_mount3: 1091failed_mount3:
1074 percpu_counter_destroy(&sbi->s_freeblocks_counter); 1092 percpu_counter_destroy(&sbi->s_freeblocks_counter);
@@ -1089,9 +1107,30 @@ failed_sbi:
1089 return ret; 1107 return ret;
1090} 1108}
1091 1109
1110static void ext2_clear_super_error(struct super_block *sb)
1111{
1112 struct buffer_head *sbh = EXT2_SB(sb)->s_sbh;
1113
1114 if (buffer_write_io_error(sbh)) {
1115 /*
1116 * Oh, dear. A previous attempt to write the
1117 * superblock failed. This could happen because the
1118 * USB device was yanked out. Or it could happen to
1119 * be a transient write error and maybe the block will
1120 * be remapped. Nothing we can do but to retry the
1121 * write and hope for the best.
1122 */
1123 printk(KERN_ERR "EXT2-fs: %s previous I/O error to "
1124 "superblock detected", sb->s_id);
1125 clear_buffer_write_io_error(sbh);
1126 set_buffer_uptodate(sbh);
1127 }
1128}
1129
1092static void ext2_commit_super (struct super_block * sb, 1130static void ext2_commit_super (struct super_block * sb,
1093 struct ext2_super_block * es) 1131 struct ext2_super_block * es)
1094{ 1132{
1133 ext2_clear_super_error(sb);
1095 es->s_wtime = cpu_to_le32(get_seconds()); 1134 es->s_wtime = cpu_to_le32(get_seconds());
1096 mark_buffer_dirty(EXT2_SB(sb)->s_sbh); 1135 mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
1097 sb->s_dirt = 0; 1136 sb->s_dirt = 0;
@@ -1099,6 +1138,7 @@ static void ext2_commit_super (struct super_block * sb,
1099 1138
1100static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es) 1139static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
1101{ 1140{
1141 ext2_clear_super_error(sb);
1102 es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb)); 1142 es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
1103 es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb)); 1143 es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
1104 es->s_wtime = cpu_to_le32(get_seconds()); 1144 es->s_wtime = cpu_to_le32(get_seconds());
@@ -1121,8 +1161,24 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
1121static int ext2_sync_fs(struct super_block *sb, int wait) 1161static int ext2_sync_fs(struct super_block *sb, int wait)
1122{ 1162{
1123 struct ext2_super_block *es = EXT2_SB(sb)->s_es; 1163 struct ext2_super_block *es = EXT2_SB(sb)->s_es;
1164 struct buffer_head *sbh = EXT2_SB(sb)->s_sbh;
1124 1165
1125 lock_kernel(); 1166 lock_kernel();
1167 if (buffer_write_io_error(sbh)) {
1168 /*
1169 * Oh, dear. A previous attempt to write the
1170 * superblock failed. This could happen because the
1171 * USB device was yanked out. Or it could happen to
1172 * be a transient write error and maybe the block will
1173 * be remapped. Nothing we can do but to retry the
1174 * write and hope for the best.
1175 */
1176 ext2_msg(sb, KERN_ERR,
1177 "previous I/O error to superblock detected\n");
1178 clear_buffer_write_io_error(sbh);
1179 set_buffer_uptodate(sbh);
1180 }
1181
1126 if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { 1182 if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
1127 ext2_debug("setting valid to 0\n"); 1183 ext2_debug("setting valid to 0\n");
1128 es->s_state &= cpu_to_le16(~EXT2_VALID_FS); 1184 es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
@@ -1170,7 +1226,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1170 /* 1226 /*
1171 * Allow the "check" option to be passed as a remount option. 1227 * Allow the "check" option to be passed as a remount option.
1172 */ 1228 */
1173 if (!parse_options (data, sbi)) { 1229 if (!parse_options(data, sb)) {
1174 err = -EINVAL; 1230 err = -EINVAL;
1175 goto restore_opts; 1231 goto restore_opts;
1176 } 1232 }
@@ -1182,7 +1238,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1182 EXT2_MOUNT_XIP if not */ 1238 EXT2_MOUNT_XIP if not */
1183 1239
1184 if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) { 1240 if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) {
1185 printk("XIP: Unsupported blocksize\n"); 1241 ext2_msg(sb, KERN_WARNING,
1242 "warning: unsupported blocksize for xip");
1186 err = -EINVAL; 1243 err = -EINVAL;
1187 goto restore_opts; 1244 goto restore_opts;
1188 } 1245 }
@@ -1191,8 +1248,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1191 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != 1248 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
1192 (old_mount_opt & EXT2_MOUNT_XIP)) && 1249 (old_mount_opt & EXT2_MOUNT_XIP)) &&
1193 invalidate_inodes(sb)) { 1250 invalidate_inodes(sb)) {
1194 ext2_warning(sb, __func__, "refusing change of xip flag " 1251 ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
1195 "with busy inodes while remounting"); 1252 "xip flag with busy inodes while remounting");
1196 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; 1253 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
1197 sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; 1254 sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
1198 } 1255 }
@@ -1216,9 +1273,10 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1216 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb, 1273 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb,
1217 ~EXT2_FEATURE_RO_COMPAT_SUPP); 1274 ~EXT2_FEATURE_RO_COMPAT_SUPP);
1218 if (ret) { 1275 if (ret) {
1219 printk("EXT2-fs: %s: couldn't remount RDWR because of " 1276 ext2_msg(sb, KERN_WARNING,
1220 "unsupported optional features (%x).\n", 1277 "warning: couldn't remount RDWR because of "
1221 sb->s_id, le32_to_cpu(ret)); 1278 "unsupported optional features (%x).",
1279 le32_to_cpu(ret));
1222 err = -EROFS; 1280 err = -EROFS;
1223 goto restore_opts; 1281 goto restore_opts;
1224 } 1282 }
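
The logging conversion above funnels everything through ext2_msg(), which takes a printk level as its prefix, stamps "EXT2-fs (<dev>): " itself, and appends the trailing newline, so format strings carry neither. A minimal usage sketch:

static void demo_report(struct super_block *sb, int nblocks)
{
	ext2_msg(sb, KERN_WARNING, "warning: only %d blocks left", nblocks);
	/* emits: "EXT2-fs (sda1): warning: only N blocks left\n" */
}
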
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 7913531ec6d5..e44dc92609be 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -60,6 +60,7 @@
60#include <linux/mbcache.h> 60#include <linux/mbcache.h>
61#include <linux/quotaops.h> 61#include <linux/quotaops.h>
62#include <linux/rwsem.h> 62#include <linux/rwsem.h>
63#include <linux/security.h>
63#include "ext2.h" 64#include "ext2.h"
64#include "xattr.h" 65#include "xattr.h"
65#include "acl.h" 66#include "acl.h"
@@ -249,8 +250,9 @@ cleanup:
249 * used / required on success. 250 * used / required on success.
250 */ 251 */
251static int 252static int
252ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) 253ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
253{ 254{
255 struct inode *inode = dentry->d_inode;
254 struct buffer_head *bh = NULL; 256 struct buffer_head *bh = NULL;
255 struct ext2_xattr_entry *entry; 257 struct ext2_xattr_entry *entry;
256 char *end; 258 char *end;
@@ -300,9 +302,10 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
300 ext2_xattr_handler(entry->e_name_index); 302 ext2_xattr_handler(entry->e_name_index);
301 303
302 if (handler) { 304 if (handler) {
303 size_t size = handler->list(inode, buffer, rest, 305 size_t size = handler->list(dentry, buffer, rest,
304 entry->e_name, 306 entry->e_name,
305 entry->e_name_len); 307 entry->e_name_len,
308 handler->flags);
306 if (buffer) { 309 if (buffer) {
307 if (size > rest) { 310 if (size > rest) {
308 error = -ERANGE; 311 error = -ERANGE;
@@ -330,7 +333,7 @@ cleanup:
330ssize_t 333ssize_t
331ext2_listxattr(struct dentry *dentry, char *buffer, size_t size) 334ext2_listxattr(struct dentry *dentry, char *buffer, size_t size)
332{ 335{
333 return ext2_xattr_list(dentry->d_inode, buffer, size); 336 return ext2_xattr_list(dentry, buffer, size);
334} 337}
335 338
336/* 339/*
@@ -641,8 +644,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
641 the inode. */ 644 the inode. */
642 ea_bdebug(new_bh, "reusing block"); 645 ea_bdebug(new_bh, "reusing block");
643 646
644 error = -EDQUOT; 647 error = dquot_alloc_block(inode, 1);
645 if (vfs_dq_alloc_block(inode, 1)) { 648 if (error) {
646 unlock_buffer(new_bh); 649 unlock_buffer(new_bh);
647 goto cleanup; 650 goto cleanup;
648 } 651 }
@@ -699,7 +702,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
699 * as if nothing happened and cleanup the unused block */ 702 * as if nothing happened and cleanup the unused block */
700 if (error && error != -ENOSPC) { 703 if (error && error != -ENOSPC) {
701 if (new_bh && new_bh != old_bh) 704 if (new_bh && new_bh != old_bh)
702 vfs_dq_free_block(inode, 1); 705 dquot_free_block(inode, 1);
703 goto cleanup; 706 goto cleanup;
704 } 707 }
705 } else 708 } else
@@ -731,7 +734,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
731 le32_add_cpu(&HDR(old_bh)->h_refcount, -1); 734 le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
732 if (ce) 735 if (ce)
733 mb_cache_entry_release(ce); 736 mb_cache_entry_release(ce);
734 vfs_dq_free_block(inode, 1); 737 dquot_free_block(inode, 1);
735 mark_buffer_dirty(old_bh); 738 mark_buffer_dirty(old_bh);
736 ea_bdebug(old_bh, "refcount now=%d", 739 ea_bdebug(old_bh, "refcount now=%d",
737 le32_to_cpu(HDR(old_bh)->h_refcount)); 740 le32_to_cpu(HDR(old_bh)->h_refcount));
@@ -794,7 +797,7 @@ ext2_xattr_delete_inode(struct inode *inode)
794 mark_buffer_dirty(bh); 797 mark_buffer_dirty(bh);
795 if (IS_SYNC(inode)) 798 if (IS_SYNC(inode))
796 sync_dirty_buffer(bh); 799 sync_dirty_buffer(bh);
797 vfs_dq_free_block(inode, 1); 800 dquot_free_block(inode, 1);
798 } 801 }
799 EXT2_I(inode)->i_file_acl = 0; 802 EXT2_I(inode)->i_file_acl = 0;
800 803
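
The listxattr walk above now hands each handler the dentry plus its own .flags, matching the widened ->list signature. A minimal sketch of one dispatch step (demo_list_one is illustrative):

static size_t demo_list_one(struct xattr_handler *h, struct dentry *dentry,
			    char *buffer, size_t rest,
			    const char *name, size_t name_len)
{
	/* returns the bytes this handler would emit; 0 means skip */
	return h->list(dentry, buffer, rest, name, name_len, h->flags);
}
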
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 70c0dbdcdcb7..b118c6383c6d 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -4,6 +4,7 @@
4 */ 4 */
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/slab.h>
7#include <linux/string.h> 8#include <linux/string.h>
8#include <linux/fs.h> 9#include <linux/fs.h>
9#include <linux/ext2_fs.h> 10#include <linux/ext2_fs.h>
@@ -11,8 +12,8 @@
11#include "xattr.h" 12#include "xattr.h"
12 13
13static size_t 14static size_t
14ext2_xattr_security_list(struct inode *inode, char *list, size_t list_size, 15ext2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
15 const char *name, size_t name_len) 16 const char *name, size_t name_len, int type)
16{ 17{
17 const int prefix_len = XATTR_SECURITY_PREFIX_LEN; 18 const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
18 const size_t total_len = prefix_len + name_len + 1; 19 const size_t total_len = prefix_len + name_len + 1;
@@ -26,22 +27,22 @@ ext2_xattr_security_list(struct inode *inode, char *list, size_t list_size,
26} 27}
27 28
28static int 29static int
29ext2_xattr_security_get(struct inode *inode, const char *name, 30ext2_xattr_security_get(struct dentry *dentry, const char *name,
30 void *buffer, size_t size) 31 void *buffer, size_t size, int type)
31{ 32{
32 if (strcmp(name, "") == 0) 33 if (strcmp(name, "") == 0)
33 return -EINVAL; 34 return -EINVAL;
34 return ext2_xattr_get(inode, EXT2_XATTR_INDEX_SECURITY, name, 35 return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name,
35 buffer, size); 36 buffer, size);
36} 37}
37 38
38static int 39static int
39ext2_xattr_security_set(struct inode *inode, const char *name, 40ext2_xattr_security_set(struct dentry *dentry, const char *name,
40 const void *value, size_t size, int flags) 41 const void *value, size_t size, int flags, int type)
41{ 42{
42 if (strcmp(name, "") == 0) 43 if (strcmp(name, "") == 0)
43 return -EINVAL; 44 return -EINVAL;
44 return ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, name, 45 return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name,
45 value, size, flags); 46 value, size, flags);
46} 47}
47 48
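
Every handler in this series moves to the same new shape: the generic xattr code passes the dentry instead of the bare inode, plus the handler's flags value in a trailing type argument. A sketch of a get callback in the new form (example_backend_get stands in for the filesystem's own lookup and is not a real kernel function):

    #include <linux/fs.h>
    #include <linux/string.h>
    #include <linux/xattr.h>

    static int example_xattr_get(struct dentry *dentry, const char *name,
                                 void *buffer, size_t size, int type)
    {
            /* an empty name means only the namespace prefix was given */
            if (strcmp(name, "") == 0)
                    return -EINVAL;
            /* the inode is still one dereference away when needed */
            return example_backend_get(dentry->d_inode, name, buffer, size);
    }
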
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index e8219f8eae9f..2a26d71f4771 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -13,8 +13,8 @@
13#include "xattr.h" 13#include "xattr.h"
14 14
15static size_t 15static size_t
16ext2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, 16ext2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
17 const char *name, size_t name_len) 17 const char *name, size_t name_len, int type)
18{ 18{
19 const int prefix_len = XATTR_TRUSTED_PREFIX_LEN; 19 const int prefix_len = XATTR_TRUSTED_PREFIX_LEN;
20 const size_t total_len = prefix_len + name_len + 1; 20 const size_t total_len = prefix_len + name_len + 1;
@@ -31,22 +31,22 @@ ext2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
31} 31}
32 32
33static int 33static int
34ext2_xattr_trusted_get(struct inode *inode, const char *name, 34ext2_xattr_trusted_get(struct dentry *dentry, const char *name,
35 void *buffer, size_t size) 35 void *buffer, size_t size, int type)
36{ 36{
37 if (strcmp(name, "") == 0) 37 if (strcmp(name, "") == 0)
38 return -EINVAL; 38 return -EINVAL;
39 return ext2_xattr_get(inode, EXT2_XATTR_INDEX_TRUSTED, name, 39 return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name,
40 buffer, size); 40 buffer, size);
41} 41}
42 42
43static int 43static int
44ext2_xattr_trusted_set(struct inode *inode, const char *name, 44ext2_xattr_trusted_set(struct dentry *dentry, const char *name,
45 const void *value, size_t size, int flags) 45 const void *value, size_t size, int flags, int type)
46{ 46{
47 if (strcmp(name, "") == 0) 47 if (strcmp(name, "") == 0)
48 return -EINVAL; 48 return -EINVAL;
49 return ext2_xattr_set(inode, EXT2_XATTR_INDEX_TRUSTED, name, 49 return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name,
50 value, size, flags); 50 value, size, flags);
51} 51}
52 52
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 92495d28c62f..3f6caf3684b4 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -12,13 +12,13 @@
12#include "xattr.h" 12#include "xattr.h"
13 13
14static size_t 14static size_t
15ext2_xattr_user_list(struct inode *inode, char *list, size_t list_size, 15ext2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
16 const char *name, size_t name_len) 16 const char *name, size_t name_len, int type)
17{ 17{
18 const size_t prefix_len = XATTR_USER_PREFIX_LEN; 18 const size_t prefix_len = XATTR_USER_PREFIX_LEN;
19 const size_t total_len = prefix_len + name_len + 1; 19 const size_t total_len = prefix_len + name_len + 1;
20 20
21 if (!test_opt(inode->i_sb, XATTR_USER)) 21 if (!test_opt(dentry->d_sb, XATTR_USER))
22 return 0; 22 return 0;
23 23
24 if (list && total_len <= list_size) { 24 if (list && total_len <= list_size) {
@@ -30,27 +30,28 @@ ext2_xattr_user_list(struct inode *inode, char *list, size_t list_size,
30} 30}
31 31
32static int 32static int
33ext2_xattr_user_get(struct inode *inode, const char *name, 33ext2_xattr_user_get(struct dentry *dentry, const char *name,
34 void *buffer, size_t size) 34 void *buffer, size_t size, int type)
35{ 35{
36 if (strcmp(name, "") == 0) 36 if (strcmp(name, "") == 0)
37 return -EINVAL; 37 return -EINVAL;
38 if (!test_opt(inode->i_sb, XATTR_USER)) 38 if (!test_opt(dentry->d_sb, XATTR_USER))
39 return -EOPNOTSUPP; 39 return -EOPNOTSUPP;
40 return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name, buffer, size); 40 return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_USER,
41 name, buffer, size);
41} 42}
42 43
43static int 44static int
44ext2_xattr_user_set(struct inode *inode, const char *name, 45ext2_xattr_user_set(struct dentry *dentry, const char *name,
45 const void *value, size_t size, int flags) 46 const void *value, size_t size, int flags, int type)
46{ 47{
47 if (strcmp(name, "") == 0) 48 if (strcmp(name, "") == 0)
48 return -EINVAL; 49 return -EINVAL;
49 if (!test_opt(inode->i_sb, XATTR_USER)) 50 if (!test_opt(dentry->d_sb, XATTR_USER))
50 return -EOPNOTSUPP; 51 return -EOPNOTSUPP;
51 52
52 return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name, 53 return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_USER,
53 value, size, flags); 54 name, value, size, flags);
54} 55}
55 56
56struct xattr_handler ext2_xattr_user_handler = { 57struct xattr_handler ext2_xattr_user_handler = {
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index c18fbf3e4068..322a56b2dfb1 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -69,8 +69,9 @@ void ext2_xip_verify_sb(struct super_block *sb)
69 if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) && 69 if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) &&
70 !sb->s_bdev->bd_disk->fops->direct_access) { 70 !sb->s_bdev->bd_disk->fops->direct_access) {
71 sbi->s_mount_opt &= (~EXT2_MOUNT_XIP); 71 sbi->s_mount_opt &= (~EXT2_MOUNT_XIP);
72 ext2_warning(sb, __func__, 72 ext2_msg(sb, KERN_WARNING,
73 "ignoring xip option - not supported by bdev"); 73 "warning: ignoring xip option - "
74 "not supported by bdev");
74 } 75 }
75} 76}
76 77
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index c9b0df376b5f..82ba34158661 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -366,12 +366,12 @@ out:
366 * Extended attribute handlers 366 * Extended attribute handlers
367 */ 367 */
368static size_t 368static size_t
369ext3_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len, 369ext3_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len,
370 const char *name, size_t name_len) 370 const char *name, size_t name_len, int type)
371{ 371{
372 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); 372 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
373 373
374 if (!test_opt(inode->i_sb, POSIX_ACL)) 374 if (!test_opt(dentry->d_sb, POSIX_ACL))
375 return 0; 375 return 0;
376 if (list && size <= list_len) 376 if (list && size <= list_len)
377 memcpy(list, POSIX_ACL_XATTR_ACCESS, size); 377 memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
@@ -379,12 +379,12 @@ ext3_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
379} 379}
380 380
381static size_t 381static size_t
382ext3_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, 382ext3_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len,
383 const char *name, size_t name_len) 383 const char *name, size_t name_len, int type)
384{ 384{
385 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); 385 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
386 386
387 if (!test_opt(inode->i_sb, POSIX_ACL)) 387 if (!test_opt(dentry->d_sb, POSIX_ACL))
388 return 0; 388 return 0;
389 if (list && size <= list_len) 389 if (list && size <= list_len)
390 memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); 390 memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
@@ -392,15 +392,18 @@ ext3_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
392} 392}
393 393
394static int 394static int
395ext3_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) 395ext3_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
396 size_t size, int type)
396{ 397{
397 struct posix_acl *acl; 398 struct posix_acl *acl;
398 int error; 399 int error;
399 400
400 if (!test_opt(inode->i_sb, POSIX_ACL)) 401 if (strcmp(name, "") != 0)
402 return -EINVAL;
403 if (!test_opt(dentry->d_sb, POSIX_ACL))
401 return -EOPNOTSUPP; 404 return -EOPNOTSUPP;
402 405
403 acl = ext3_get_acl(inode, type); 406 acl = ext3_get_acl(dentry->d_inode, type);
404 if (IS_ERR(acl)) 407 if (IS_ERR(acl))
405 return PTR_ERR(acl); 408 return PTR_ERR(acl);
406 if (acl == NULL) 409 if (acl == NULL)
@@ -412,31 +415,16 @@ ext3_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
412} 415}
413 416
414static int 417static int
415ext3_xattr_get_acl_access(struct inode *inode, const char *name, 418ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
416 void *buffer, size_t size) 419 size_t size, int flags, int type)
417{
418 if (strcmp(name, "") != 0)
419 return -EINVAL;
420 return ext3_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
421}
422
423static int
424ext3_xattr_get_acl_default(struct inode *inode, const char *name,
425 void *buffer, size_t size)
426{
427 if (strcmp(name, "") != 0)
428 return -EINVAL;
429 return ext3_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
430}
431
432static int
433ext3_xattr_set_acl(struct inode *inode, int type, const void *value,
434 size_t size)
435{ 420{
421 struct inode *inode = dentry->d_inode;
436 handle_t *handle; 422 handle_t *handle;
437 struct posix_acl *acl; 423 struct posix_acl *acl;
438 int error, retries = 0; 424 int error, retries = 0;
439 425
426 if (strcmp(name, "") != 0)
427 return -EINVAL;
440 if (!test_opt(inode->i_sb, POSIX_ACL)) 428 if (!test_opt(inode->i_sb, POSIX_ACL))
441 return -EOPNOTSUPP; 429 return -EOPNOTSUPP;
442 if (!is_owner_or_cap(inode)) 430 if (!is_owner_or_cap(inode))
@@ -468,34 +456,18 @@ release_and_out:
468 return error; 456 return error;
469} 457}
470 458
471static int
472ext3_xattr_set_acl_access(struct inode *inode, const char *name,
473 const void *value, size_t size, int flags)
474{
475 if (strcmp(name, "") != 0)
476 return -EINVAL;
477 return ext3_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
478}
479
480static int
481ext3_xattr_set_acl_default(struct inode *inode, const char *name,
482 const void *value, size_t size, int flags)
483{
484 if (strcmp(name, "") != 0)
485 return -EINVAL;
486 return ext3_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
487}
488
489struct xattr_handler ext3_xattr_acl_access_handler = { 459struct xattr_handler ext3_xattr_acl_access_handler = {
490 .prefix = POSIX_ACL_XATTR_ACCESS, 460 .prefix = POSIX_ACL_XATTR_ACCESS,
461 .flags = ACL_TYPE_ACCESS,
491 .list = ext3_xattr_list_acl_access, 462 .list = ext3_xattr_list_acl_access,
492 .get = ext3_xattr_get_acl_access, 463 .get = ext3_xattr_get_acl,
493 .set = ext3_xattr_set_acl_access, 464 .set = ext3_xattr_set_acl,
494}; 465};
495 466
496struct xattr_handler ext3_xattr_acl_default_handler = { 467struct xattr_handler ext3_xattr_acl_default_handler = {
497 .prefix = POSIX_ACL_XATTR_DEFAULT, 468 .prefix = POSIX_ACL_XATTR_DEFAULT,
469 .flags = ACL_TYPE_DEFAULT,
498 .list = ext3_xattr_list_acl_default, 470 .list = ext3_xattr_list_acl_default,
499 .get = ext3_xattr_get_acl_default, 471 .get = ext3_xattr_get_acl,
500 .set = ext3_xattr_set_acl_default, 472 .set = ext3_xattr_set_acl,
501}; 473};
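
The two-wrapper arrangement collapses because the new .flags field lets a single callback serve both ACL names: the generic xattr code forwards handler->flags as the type argument, so the ACL_TYPE_ACCESS/ACL_TYPE_DEFAULT dispatch the per-name wrappers used to do now comes for free. Roughly (example_* names illustrative):

    #include <linux/posix_acl.h>
    #include <linux/posix_acl_xattr.h>
    #include <linux/xattr.h>

    /* sketch: one get/set pair registered twice, told apart by .flags */
    struct xattr_handler example_acl_access_handler = {
            .prefix = POSIX_ACL_XATTR_ACCESS,
            .flags  = ACL_TYPE_ACCESS,      /* arrives as 'type' in callbacks */
            .list   = example_list_acl,
            .get    = example_get_acl,
            .set    = example_set_acl,
    };
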
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 27967f92e820..a177122a1b25 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -14,6 +14,7 @@
14#include <linux/time.h> 14#include <linux/time.h>
15#include <linux/capability.h> 15#include <linux/capability.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/slab.h>
17#include <linux/jbd.h> 18#include <linux/jbd.h>
18#include <linux/ext3_fs.h> 19#include <linux/ext3_fs.h>
19#include <linux/ext3_jbd.h> 20#include <linux/ext3_jbd.h>
@@ -676,7 +677,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
676 } 677 }
677 ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); 678 ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
678 if (dquot_freed_blocks) 679 if (dquot_freed_blocks)
679 vfs_dq_free_block(inode, dquot_freed_blocks); 680 dquot_free_block(inode, dquot_freed_blocks);
680 return; 681 return;
681} 682}
682 683
@@ -1502,8 +1503,9 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
1502 /* 1503 /*
1503 * Check quota for allocation of this block. 1504 * Check quota for allocation of this block.
1504 */ 1505 */
1505 if (vfs_dq_alloc_block(inode, num)) { 1506 err = dquot_alloc_block(inode, num);
1506 *errp = -EDQUOT; 1507 if (err) {
1508 *errp = err;
1507 return 0; 1509 return 0;
1508 } 1510 }
1509 1511
@@ -1713,7 +1715,7 @@ allocated:
1713 1715
1714 *errp = 0; 1716 *errp = 0;
1715 brelse(bitmap_bh); 1717 brelse(bitmap_bh);
1716 vfs_dq_free_block(inode, *count-num); 1718 dquot_free_block(inode, *count-num);
1717 *count = num; 1719 *count = num;
1718 return ret_block; 1720 return ret_block;
1719 1721
@@ -1728,7 +1730,7 @@ out:
1728 * Undo the block allocation 1730 * Undo the block allocation
1729 */ 1731 */
1730 if (!performed_allocation) 1732 if (!performed_allocation)
1731 vfs_dq_free_block(inode, *count); 1733 dquot_free_block(inode, *count);
1732 brelse(bitmap_bh); 1734 brelse(bitmap_bh);
1733 return 0; 1735 return 0;
1734} 1736}
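
ext3_new_blocks() shows the multi-block variant of the same conversion: num holds the requested count when the charge is taken and the allocated count by the time of the refund, so the overcharge is returned with dquot_free_block(inode, *count - num). Schematically (variable names as in the hunks above, the surrounding allocator logic elided):

    err = dquot_alloc_block(inode, num);    /* charge the full request */
    if (err) {
            *errp = err;
            return 0;
    }
    /* ... the allocator may satisfy fewer than num blocks ... */
    dquot_free_block(inode, *count - num);  /* refund the unused part */
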
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 388bbdfa0b4e..f55df0e61cbd 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -21,6 +21,7 @@
21#include <linux/time.h> 21#include <linux/time.h>
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/jbd.h> 23#include <linux/jbd.h>
24#include <linux/quotaops.h>
24#include <linux/ext3_fs.h> 25#include <linux/ext3_fs.h>
25#include <linux/ext3_jbd.h> 26#include <linux/ext3_jbd.h>
26#include "xattr.h" 27#include "xattr.h"
@@ -33,9 +34,9 @@
33 */ 34 */
34static int ext3_release_file (struct inode * inode, struct file * filp) 35static int ext3_release_file (struct inode * inode, struct file * filp)
35{ 36{
36 if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) { 37 if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) {
37 filemap_flush(inode->i_mapping); 38 filemap_flush(inode->i_mapping);
38 EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE; 39 ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
39 } 40 }
40 /* if we are the last writer on the inode, drop the block reservation */ 41 /* if we are the last writer on the inode, drop the block reservation */
41 if ((filp->f_mode & FMODE_WRITE) && 42 if ((filp->f_mode & FMODE_WRITE) &&
@@ -62,7 +63,7 @@ const struct file_operations ext3_file_operations = {
62 .compat_ioctl = ext3_compat_ioctl, 63 .compat_ioctl = ext3_compat_ioctl,
63#endif 64#endif
64 .mmap = generic_file_mmap, 65 .mmap = generic_file_mmap,
65 .open = generic_file_open, 66 .open = dquot_file_open,
66 .release = ext3_release_file, 67 .release = ext3_release_file,
67 .fsync = ext3_sync_file, 68 .fsync = ext3_sync_file,
68 .splice_read = generic_file_splice_read, 69 .splice_read = generic_file_splice_read,
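
Switching .open to dquot_file_open is what guarantees quota structures are attached before the first write through this file. The generic helper is expected to behave roughly like the sketch below, so read-only opens pay nothing extra:

    /* rough shape of the generic helper in fs/quota/dquot.c */
    int dquot_file_open(struct inode *inode, struct file *file)
    {
            int error = generic_file_open(inode, file);

            if (!error && (file->f_mode & FMODE_WRITE))
                    dquot_initialize(inode);
            return error;
    }
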
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index b39991285136..0d0e97ed3ff6 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -123,10 +123,10 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
123 * Note: we must free any quota before locking the superblock, 123 * Note: we must free any quota before locking the superblock,
124 * as writing the quota to disk may need the lock as well. 124 * as writing the quota to disk may need the lock as well.
125 */ 125 */
126 vfs_dq_init(inode); 126 dquot_initialize(inode);
127 ext3_xattr_delete_inode(handle, inode); 127 ext3_xattr_delete_inode(handle, inode);
128 vfs_dq_free_inode(inode); 128 dquot_free_inode(inode);
129 vfs_dq_drop(inode); 129 dquot_drop(inode);
130 130
131 is_directory = S_ISDIR(inode->i_mode); 131 is_directory = S_ISDIR(inode->i_mode);
132 132
@@ -582,16 +582,18 @@ got:
582 inode->i_generation = sbi->s_next_generation++; 582 inode->i_generation = sbi->s_next_generation++;
583 spin_unlock(&sbi->s_next_gen_lock); 583 spin_unlock(&sbi->s_next_gen_lock);
584 584
585 ei->i_state = EXT3_STATE_NEW; 585 ei->i_state_flags = 0;
586 ext3_set_inode_state(inode, EXT3_STATE_NEW);
587
586 ei->i_extra_isize = 588 ei->i_extra_isize =
587 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? 589 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
588 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; 590 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
589 591
590 ret = inode; 592 ret = inode;
591 if (vfs_dq_alloc_inode(inode)) { 593 dquot_initialize(inode);
592 err = -EDQUOT; 594 err = dquot_alloc_inode(inode);
595 if (err)
593 goto fail_drop; 596 goto fail_drop;
594 }
595 597
596 err = ext3_init_acl(handle, inode, dir); 598 err = ext3_init_acl(handle, inode, dir);
597 if (err) 599 if (err)
@@ -619,10 +621,10 @@ really_out:
619 return ret; 621 return ret;
620 622
621fail_free_drop: 623fail_free_drop:
622 vfs_dq_free_inode(inode); 624 dquot_free_inode(inode);
623 625
624fail_drop: 626fail_drop:
625 vfs_dq_drop(inode); 627 dquot_drop(inode);
626 inode->i_flags |= S_NOQUOTA; 628 inode->i_flags |= S_NOQUOTA;
627 inode->i_nlink = 0; 629 inode->i_nlink = 0;
628 unlock_new_inode(inode); 630 unlock_new_inode(inode);
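
The allocation path now mirrors the free path at the top of the file: dquot_initialize() attaches the dquots, dquot_alloc_inode() takes the charge and reports the errno itself, and the two failure labels unwind in order. Reduced to a skeleton (labels as in the hunk, the rest illustrative):

    dquot_initialize(inode);
    err = dquot_alloc_inode(inode);         /* returns 0 or e.g. -EDQUOT */
    if (err)
            goto fail_drop;

    /* ... finish inode setup; later errors jump to fail_free_drop ... */
    return ret;

    fail_free_drop:
            dquot_free_inode(inode);        /* undo the inode charge */
    fail_drop:
            dquot_drop(inode);              /* detach dquot references */
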
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 354ed3b47b30..ea33bdf0a300 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -196,6 +196,9 @@ void ext3_delete_inode (struct inode * inode)
196{ 196{
197 handle_t *handle; 197 handle_t *handle;
198 198
199 if (!is_bad_inode(inode))
200 dquot_initialize(inode);
201
199 truncate_inode_pages(&inode->i_data, 0); 202 truncate_inode_pages(&inode->i_data, 0);
200 203
201 if (is_bad_inode(inode)) 204 if (is_bad_inode(inode))
@@ -970,7 +973,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock,
970 if (max_blocks > DIO_MAX_BLOCKS) 973 if (max_blocks > DIO_MAX_BLOCKS)
971 max_blocks = DIO_MAX_BLOCKS; 974 max_blocks = DIO_MAX_BLOCKS;
972 handle = ext3_journal_start(inode, DIO_CREDITS + 975 handle = ext3_journal_start(inode, DIO_CREDITS +
973 2 * EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb)); 976 EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
974 if (IS_ERR(handle)) { 977 if (IS_ERR(handle)) {
975 ret = PTR_ERR(handle); 978 ret = PTR_ERR(handle);
976 goto out; 979 goto out;
@@ -1151,6 +1154,16 @@ static int do_journal_get_write_access(handle_t *handle,
1151 return ext3_journal_get_write_access(handle, bh); 1154 return ext3_journal_get_write_access(handle, bh);
1152} 1155}
1153 1156
1157/*
1158 * Truncate blocks that were not used by write. We have to truncate the
1159 * pagecache as well so that corresponding buffers get properly unmapped.
1160 */
1161static void ext3_truncate_failed_write(struct inode *inode)
1162{
1163 truncate_inode_pages(inode->i_mapping, inode->i_size);
1164 ext3_truncate(inode);
1165}
1166
1154static int ext3_write_begin(struct file *file, struct address_space *mapping, 1167static int ext3_write_begin(struct file *file, struct address_space *mapping,
1155 loff_t pos, unsigned len, unsigned flags, 1168 loff_t pos, unsigned len, unsigned flags,
1156 struct page **pagep, void **fsdata) 1169 struct page **pagep, void **fsdata)
@@ -1209,7 +1222,7 @@ write_begin_failed:
1209 unlock_page(page); 1222 unlock_page(page);
1210 page_cache_release(page); 1223 page_cache_release(page);
1211 if (pos + len > inode->i_size) 1224 if (pos + len > inode->i_size)
1212 ext3_truncate(inode); 1225 ext3_truncate_failed_write(inode);
1213 } 1226 }
1214 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) 1227 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1215 goto retry; 1228 goto retry;
@@ -1304,7 +1317,7 @@ static int ext3_ordered_write_end(struct file *file,
1304 page_cache_release(page); 1317 page_cache_release(page);
1305 1318
1306 if (pos + len > inode->i_size) 1319 if (pos + len > inode->i_size)
1307 ext3_truncate(inode); 1320 ext3_truncate_failed_write(inode);
1308 return ret ? ret : copied; 1321 return ret ? ret : copied;
1309} 1322}
1310 1323
@@ -1330,7 +1343,7 @@ static int ext3_writeback_write_end(struct file *file,
1330 page_cache_release(page); 1343 page_cache_release(page);
1331 1344
1332 if (pos + len > inode->i_size) 1345 if (pos + len > inode->i_size)
1333 ext3_truncate(inode); 1346 ext3_truncate_failed_write(inode);
1334 return ret ? ret : copied; 1347 return ret ? ret : copied;
1335} 1348}
1336 1349
@@ -1368,7 +1381,7 @@ static int ext3_journalled_write_end(struct file *file,
1368 */ 1381 */
1369 if (pos + len > inode->i_size && ext3_can_truncate(inode)) 1382 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1370 ext3_orphan_add(handle, inode); 1383 ext3_orphan_add(handle, inode);
1371 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; 1384 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1372 if (inode->i_size > EXT3_I(inode)->i_disksize) { 1385 if (inode->i_size > EXT3_I(inode)->i_disksize) {
1373 EXT3_I(inode)->i_disksize = inode->i_size; 1386 EXT3_I(inode)->i_disksize = inode->i_size;
1374 ret2 = ext3_mark_inode_dirty(handle, inode); 1387 ret2 = ext3_mark_inode_dirty(handle, inode);
@@ -1383,7 +1396,7 @@ static int ext3_journalled_write_end(struct file *file,
1383 page_cache_release(page); 1396 page_cache_release(page);
1384 1397
1385 if (pos + len > inode->i_size) 1398 if (pos + len > inode->i_size)
1386 ext3_truncate(inode); 1399 ext3_truncate_failed_write(inode);
1387 return ret ? ret : copied; 1400 return ret ? ret : copied;
1388} 1401}
1389 1402
@@ -1407,7 +1420,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1407 journal_t *journal; 1420 journal_t *journal;
1408 int err; 1421 int err;
1409 1422
1410 if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) { 1423 if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) {
1411 /* 1424 /*
1412 * This is a REALLY heavyweight approach, but the use of 1425 * This is a REALLY heavyweight approach, but the use of
1413 * bmap on dirty files is expected to be extremely rare: 1426 * bmap on dirty files is expected to be extremely rare:
@@ -1426,7 +1439,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1426 * everything they get. 1439 * everything they get.
1427 */ 1440 */
1428 1441
1429 EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA; 1442 ext3_clear_inode_state(inode, EXT3_STATE_JDATA);
1430 journal = EXT3_JOURNAL(inode); 1443 journal = EXT3_JOURNAL(inode);
1431 journal_lock_updates(journal); 1444 journal_lock_updates(journal);
1432 err = journal_flush(journal); 1445 err = journal_flush(journal);
@@ -1518,6 +1531,7 @@ static int ext3_ordered_writepage(struct page *page,
1518 int err; 1531 int err;
1519 1532
1520 J_ASSERT(PageLocked(page)); 1533 J_ASSERT(PageLocked(page));
1534 WARN_ON_ONCE(IS_RDONLY(inode));
1521 1535
1522 /* 1536 /*
1523 * We give up here if we're reentered, because it might be for a 1537 * We give up here if we're reentered, because it might be for a
@@ -1590,6 +1604,9 @@ static int ext3_writeback_writepage(struct page *page,
1590 int ret = 0; 1604 int ret = 0;
1591 int err; 1605 int err;
1592 1606
1607 J_ASSERT(PageLocked(page));
1608 WARN_ON_ONCE(IS_RDONLY(inode));
1609
1593 if (ext3_journal_current_handle()) 1610 if (ext3_journal_current_handle())
1594 goto out_fail; 1611 goto out_fail;
1595 1612
@@ -1632,6 +1649,9 @@ static int ext3_journalled_writepage(struct page *page,
1632 int ret = 0; 1649 int ret = 0;
1633 int err; 1650 int err;
1634 1651
1652 J_ASSERT(PageLocked(page));
1653 WARN_ON_ONCE(IS_RDONLY(inode));
1654
1635 if (ext3_journal_current_handle()) 1655 if (ext3_journal_current_handle())
1636 goto no_write; 1656 goto no_write;
1637 1657
@@ -1660,7 +1680,7 @@ static int ext3_journalled_writepage(struct page *page,
1660 PAGE_CACHE_SIZE, NULL, write_end_fn); 1680 PAGE_CACHE_SIZE, NULL, write_end_fn);
1661 if (ret == 0) 1681 if (ret == 0)
1662 ret = err; 1682 ret = err;
1663 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; 1683 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1664 unlock_page(page); 1684 unlock_page(page);
1665 } else { 1685 } else {
1666 /* 1686 /*
@@ -1775,8 +1795,9 @@ retry:
1775 handle = ext3_journal_start(inode, 2); 1795 handle = ext3_journal_start(inode, 2);
1776 if (IS_ERR(handle)) { 1796 if (IS_ERR(handle)) {
1777 /* This is really bad luck. We've written the data 1797 /* This is really bad luck. We've written the data
1778 * but cannot extend i_size. Bail out and pretend 1798 * but cannot extend i_size. Truncate allocated blocks
1779 * the write failed... */ 1799 * and pretend the write failed... */
1800 ext3_truncate(inode);
1780 ret = PTR_ERR(handle); 1801 ret = PTR_ERR(handle);
1781 goto out; 1802 goto out;
1782 } 1803 }
@@ -2033,7 +2054,7 @@ static Indirect *ext3_find_shared(struct inode *inode, int depth,
2033 int k, err; 2054 int k, err;
2034 2055
2035 *top = 0; 2056 *top = 0;
2036 /* Make k index the deepest non-null offest + 1 */ 2057 /* Make k index the deepest non-null offset + 1 */
2037 for (k = depth; k > 1 && !offsets[k-1]; k--) 2058 for (k = depth; k > 1 && !offsets[k-1]; k--)
2038 ; 2059 ;
2039 partial = ext3_get_branch(inode, k, offsets, chain, &err); 2060 partial = ext3_get_branch(inode, k, offsets, chain, &err);
@@ -2392,7 +2413,7 @@ void ext3_truncate(struct inode *inode)
2392 goto out_notrans; 2413 goto out_notrans;
2393 2414
2394 if (inode->i_size == 0 && ext3_should_writeback_data(inode)) 2415 if (inode->i_size == 0 && ext3_should_writeback_data(inode))
2395 ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE; 2416 ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
2396 2417
2397 /* 2418 /*
2398 * We have to lock the EOF page here, because lock_page() nests 2419 * We have to lock the EOF page here, because lock_page() nests
@@ -2711,7 +2732,7 @@ int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
2711{ 2732{
2712 /* We have all inode data except xattrs in memory here. */ 2733 /* We have all inode data except xattrs in memory here. */
2713 return __ext3_get_inode_loc(inode, iloc, 2734 return __ext3_get_inode_loc(inode, iloc,
2714 !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)); 2735 !ext3_test_inode_state(inode, EXT3_STATE_XATTR));
2715} 2736}
2716 2737
2717void ext3_set_inode_flags(struct inode *inode) 2738void ext3_set_inode_flags(struct inode *inode)
@@ -2790,7 +2811,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2790 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); 2811 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
2791 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; 2812 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2792 2813
2793 ei->i_state = 0; 2814 ei->i_state_flags = 0;
2794 ei->i_dir_start_lookup = 0; 2815 ei->i_dir_start_lookup = 0;
2795 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 2816 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2796 /* We now have enough fields to check if the inode was active or not. 2817 /* We now have enough fields to check if the inode was active or not.
@@ -2883,7 +2904,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2883 EXT3_GOOD_OLD_INODE_SIZE + 2904 EXT3_GOOD_OLD_INODE_SIZE +
2884 ei->i_extra_isize; 2905 ei->i_extra_isize;
2885 if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) 2906 if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
2886 ei->i_state |= EXT3_STATE_XATTR; 2907 ext3_set_inode_state(inode, EXT3_STATE_XATTR);
2887 } 2908 }
2888 } else 2909 } else
2889 ei->i_extra_isize = 0; 2910 ei->i_extra_isize = 0;
@@ -2945,7 +2966,7 @@ again:
2945 2966
2946 /* For fields not tracked in the in-memory inode, 2967 /* For fields not tracked in the in-memory inode,
2947 * initialise them to zero for new inodes. */ 2968 * initialise them to zero for new inodes. */
2948 if (ei->i_state & EXT3_STATE_NEW) 2969 if (ext3_test_inode_state(inode, EXT3_STATE_NEW))
2949 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); 2970 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
2950 2971
2951 ext3_get_inode_flags(ei); 2972 ext3_get_inode_flags(ei);
@@ -3042,7 +3063,7 @@ again:
3042 rc = ext3_journal_dirty_metadata(handle, bh); 3063 rc = ext3_journal_dirty_metadata(handle, bh);
3043 if (!err) 3064 if (!err)
3044 err = rc; 3065 err = rc;
3045 ei->i_state &= ~EXT3_STATE_NEW; 3066 ext3_clear_inode_state(inode, EXT3_STATE_NEW);
3046 3067
3047 atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); 3068 atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
3048out_brelse: 3069out_brelse:
@@ -3086,7 +3107,7 @@ out_brelse:
3086 * `stuff()' is running, and the new i_size will be lost. Plus the inode 3107 * `stuff()' is running, and the new i_size will be lost. Plus the inode
3087 * will no longer be on the superblock's dirty inode list. 3108 * will no longer be on the superblock's dirty inode list.
3088 */ 3109 */
3089int ext3_write_inode(struct inode *inode, int wait) 3110int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
3090{ 3111{
3091 if (current->flags & PF_MEMALLOC) 3112 if (current->flags & PF_MEMALLOC)
3092 return 0; 3113 return 0;
@@ -3097,7 +3118,7 @@ int ext3_write_inode(struct inode *inode, int wait)
3097 return -EIO; 3118 return -EIO;
3098 } 3119 }
3099 3120
3100 if (!wait) 3121 if (wbc->sync_mode != WB_SYNC_ALL)
3101 return 0; 3122 return 0;
3102 3123
3103 return ext3_force_commit(inode->i_sb); 3124 return ext3_force_commit(inode->i_sb);
@@ -3130,19 +3151,21 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
3130 if (error) 3151 if (error)
3131 return error; 3152 return error;
3132 3153
3154 if (ia_valid & ATTR_SIZE)
3155 dquot_initialize(inode);
3133 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 3156 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
3134 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 3157 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
3135 handle_t *handle; 3158 handle_t *handle;
3136 3159
3137 /* (user+group)*(old+new) structure, inode write (sb, 3160 /* (user+group)*(old+new) structure, inode write (sb,
3138 * inode block, ? - but truncate inode update has it) */ 3161 * inode block, ? - but truncate inode update has it) */
3139 handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+ 3162 handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
3140 EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3); 3163 EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3);
3141 if (IS_ERR(handle)) { 3164 if (IS_ERR(handle)) {
3142 error = PTR_ERR(handle); 3165 error = PTR_ERR(handle);
3143 goto err_out; 3166 goto err_out;
3144 } 3167 }
3145 error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; 3168 error = dquot_transfer(inode, attr);
3146 if (error) { 3169 if (error) {
3147 ext3_journal_stop(handle); 3170 ext3_journal_stop(handle);
3148 return error; 3171 return error;
@@ -3227,9 +3250,9 @@ static int ext3_writepage_trans_blocks(struct inode *inode)
3227 ret = 2 * (bpp + indirects) + 2; 3250 ret = 2 * (bpp + indirects) + 2;
3228 3251
3229#ifdef CONFIG_QUOTA 3252#ifdef CONFIG_QUOTA
3230 /* We know that structure was already allocated during vfs_dq_init so 3253 /* We know that structure was already allocated during dquot_initialize so
3231 * we will be updating only the data blocks + inodes */ 3254 * we will be updating only the data blocks + inodes */
3232 ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb); 3255 ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
3233#endif 3256#endif
3234 3257
3235 return ret; 3258 return ret;
@@ -3318,7 +3341,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3318 * i_size has been changed by generic_commit_write() and we thus need 3341 * i_size has been changed by generic_commit_write() and we thus need
3319 * to include the updated inode in the current transaction. 3342 * to include the updated inode in the current transaction.
3320 * 3343 *
3321 * Also, vfs_dq_alloc_space() will always dirty the inode when blocks 3344 * Also, dquot_alloc_space() will always dirty the inode when blocks
3322 * are allocated to the file. 3345 * are allocated to the file.
3323 * 3346 *
3324 * If the inode is marked synchronous, we don't honour that here - doing 3347 * If the inode is marked synchronous, we don't honour that here - doing
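
All the i_state manipulation above moves from open-coded flag arithmetic to ext3_{test,set,clear}_inode_state(). These accessors are presumably thin wrappers over atomic bitops on the renamed i_state_flags word, along these lines, which also means EXT3_STATE_NEW and friends become bit numbers rather than masks:

    #include <linux/bitops.h>
    #include <linux/ext3_fs.h>

    static inline int ext3_test_inode_state(struct inode *inode, int bit)
    {
            return test_bit(bit, &EXT3_I(inode)->i_state_flags);
    }

    static inline void ext3_set_inode_state(struct inode *inode, int bit)
    {
            set_bit(bit, &EXT3_I(inode)->i_state_flags);
    }

    static inline void ext3_clear_inode_state(struct inode *inode, int bit)
    {
            clear_bit(bit, &EXT3_I(inode)->i_state_flags);
    }
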
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index aad6400c9b77..ee184084ca42 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1696,10 +1696,12 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
1696 struct inode * inode; 1696 struct inode * inode;
1697 int err, retries = 0; 1697 int err, retries = 0;
1698 1698
1699 dquot_initialize(dir);
1700
1699retry: 1701retry:
1700 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 1702 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1701 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1703 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1702 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); 1704 EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1703 if (IS_ERR(handle)) 1705 if (IS_ERR(handle))
1704 return PTR_ERR(handle); 1706 return PTR_ERR(handle);
1705 1707
@@ -1730,10 +1732,12 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry,
1730 if (!new_valid_dev(rdev)) 1732 if (!new_valid_dev(rdev))
1731 return -EINVAL; 1733 return -EINVAL;
1732 1734
1735 dquot_initialize(dir);
1736
1733retry: 1737retry:
1734 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 1738 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1735 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1739 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1736 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); 1740 EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1737 if (IS_ERR(handle)) 1741 if (IS_ERR(handle))
1738 return PTR_ERR(handle); 1742 return PTR_ERR(handle);
1739 1743
@@ -1766,10 +1770,12 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
1766 if (dir->i_nlink >= EXT3_LINK_MAX) 1770 if (dir->i_nlink >= EXT3_LINK_MAX)
1767 return -EMLINK; 1771 return -EMLINK;
1768 1772
1773 dquot_initialize(dir);
1774
1769retry: 1775retry:
1770 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 1776 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1771 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1777 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1772 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); 1778 EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1773 if (IS_ERR(handle)) 1779 if (IS_ERR(handle))
1774 return PTR_ERR(handle); 1780 return PTR_ERR(handle);
1775 1781
@@ -1920,7 +1926,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode)
1920 struct ext3_iloc iloc; 1926 struct ext3_iloc iloc;
1921 int err = 0, rc; 1927 int err = 0, rc;
1922 1928
1923 lock_super(sb); 1929 mutex_lock(&EXT3_SB(sb)->s_orphan_lock);
1924 if (!list_empty(&EXT3_I(inode)->i_orphan)) 1930 if (!list_empty(&EXT3_I(inode)->i_orphan))
1925 goto out_unlock; 1931 goto out_unlock;
1926 1932
@@ -1929,9 +1935,13 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode)
1929 1935
1930 /* @@@ FIXME: Observation from aviro: 1936 /* @@@ FIXME: Observation from aviro:
1931 * I think I can trigger J_ASSERT in ext3_orphan_add(). We block 1937 * I think I can trigger J_ASSERT in ext3_orphan_add(). We block
1932 * here (on lock_super()), so race with ext3_link() which might bump 1938 * here (on s_orphan_lock), so race with ext3_link() which might bump
1933 * ->i_nlink. For, say it, character device. Not a regular file, 1939 * ->i_nlink. For, say it, character device. Not a regular file,
1934 * not a directory, not a symlink and ->i_nlink > 0. 1940 * not a directory, not a symlink and ->i_nlink > 0.
1941 *
1942 * tytso, 4/25/2009: I'm not sure how that could happen;
1943 * shouldn't the fs core protect us from these sort of
1944 * unlink()/link() races?
1935 */ 1945 */
1936 J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 1946 J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1937 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); 1947 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
@@ -1968,7 +1978,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode)
1968 jbd_debug(4, "orphan inode %lu will point to %d\n", 1978 jbd_debug(4, "orphan inode %lu will point to %d\n",
1969 inode->i_ino, NEXT_ORPHAN(inode)); 1979 inode->i_ino, NEXT_ORPHAN(inode));
1970out_unlock: 1980out_unlock:
1971 unlock_super(sb); 1981 mutex_unlock(&EXT3_SB(sb)->s_orphan_lock);
1972 ext3_std_error(inode->i_sb, err); 1982 ext3_std_error(inode->i_sb, err);
1973 return err; 1983 return err;
1974} 1984}
@@ -1986,11 +1996,9 @@ int ext3_orphan_del(handle_t *handle, struct inode *inode)
1986 struct ext3_iloc iloc; 1996 struct ext3_iloc iloc;
1987 int err = 0; 1997 int err = 0;
1988 1998
1989 lock_super(inode->i_sb); 1999 mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock);
1990 if (list_empty(&ei->i_orphan)) { 2000 if (list_empty(&ei->i_orphan))
1991 unlock_super(inode->i_sb); 2001 goto out;
1992 return 0;
1993 }
1994 2002
1995 ino_next = NEXT_ORPHAN(inode); 2003 ino_next = NEXT_ORPHAN(inode);
1996 prev = ei->i_orphan.prev; 2004 prev = ei->i_orphan.prev;
@@ -2040,7 +2048,7 @@ int ext3_orphan_del(handle_t *handle, struct inode *inode)
2040out_err: 2048out_err:
2041 ext3_std_error(inode->i_sb, err); 2049 ext3_std_error(inode->i_sb, err);
2042out: 2050out:
2043 unlock_super(inode->i_sb); 2051 mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock);
2044 return err; 2052 return err;
2045 2053
2046out_brelse: 2054out_brelse:
@@ -2058,7 +2066,9 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
2058 2066
2059 /* Initialize quotas before so that eventual writes go in 2067 /* Initialize quotas before so that eventual writes go in
2060 * separate transaction */ 2068 * separate transaction */
2061 vfs_dq_init(dentry->d_inode); 2069 dquot_initialize(dir);
2070 dquot_initialize(dentry->d_inode);
2071
2062 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); 2072 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
2063 if (IS_ERR(handle)) 2073 if (IS_ERR(handle))
2064 return PTR_ERR(handle); 2074 return PTR_ERR(handle);
@@ -2117,7 +2127,9 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
2117 2127
2118 /* Initialize quotas before so that eventual writes go 2128 /* Initialize quotas before so that eventual writes go
2119 * in separate transaction */ 2129 * in separate transaction */
2120 vfs_dq_init(dentry->d_inode); 2130 dquot_initialize(dir);
2131 dquot_initialize(dentry->d_inode);
2132
2121 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); 2133 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
2122 if (IS_ERR(handle)) 2134 if (IS_ERR(handle))
2123 return PTR_ERR(handle); 2135 return PTR_ERR(handle);
@@ -2172,10 +2184,12 @@ static int ext3_symlink (struct inode * dir,
2172 if (l > dir->i_sb->s_blocksize) 2184 if (l > dir->i_sb->s_blocksize)
2173 return -ENAMETOOLONG; 2185 return -ENAMETOOLONG;
2174 2186
2187 dquot_initialize(dir);
2188
2175retry: 2189retry:
2176 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 2190 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2177 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + 2191 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2178 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); 2192 EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2179 if (IS_ERR(handle)) 2193 if (IS_ERR(handle))
2180 return PTR_ERR(handle); 2194 return PTR_ERR(handle);
2181 2195
@@ -2226,6 +2240,9 @@ static int ext3_link (struct dentry * old_dentry,
2226 2240
2227 if (inode->i_nlink >= EXT3_LINK_MAX) 2241 if (inode->i_nlink >= EXT3_LINK_MAX)
2228 return -EMLINK; 2242 return -EMLINK;
2243
2244 dquot_initialize(dir);
2245
2229 /* 2246 /*
2230 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing 2247 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2231 * otherwise has the potential to corrupt the orphan inode list. 2248 * otherwise has the potential to corrupt the orphan inode list.
@@ -2276,12 +2293,15 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2276 struct ext3_dir_entry_2 * old_de, * new_de; 2293 struct ext3_dir_entry_2 * old_de, * new_de;
2277 int retval, flush_file = 0; 2294 int retval, flush_file = 0;
2278 2295
2296 dquot_initialize(old_dir);
2297 dquot_initialize(new_dir);
2298
2279 old_bh = new_bh = dir_bh = NULL; 2299 old_bh = new_bh = dir_bh = NULL;
2280 2300
2281 /* Initialize quotas before so that eventual writes go 2301 /* Initialize quotas before so that eventual writes go
2282 * in separate transaction */ 2302 * in separate transaction */
2283 if (new_dentry->d_inode) 2303 if (new_dentry->d_inode)
2284 vfs_dq_init(new_dentry->d_inode); 2304 dquot_initialize(new_dentry->d_inode);
2285 handle = ext3_journal_start(old_dir, 2 * 2305 handle = ext3_journal_start(old_dir, 2 *
2286 EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + 2306 EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2287 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); 2307 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
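
Each namei operation gains the same preamble: dquot_initialize() on every inode that may be charged, issued before ext3_journal_start(). Doing it there keeps any reads or writes of the quota files in their own transaction instead of inside the handle about to be started, which is what the "separate transaction" comments refer to. The pattern, schematically:

    /* quota-file I/O happens here, outside the handle below */
    dquot_initialize(dir);

    handle = ext3_journal_start(dir, credits);
    if (IS_ERR(handle))
            return PTR_ERR(handle);
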
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 8359e7b3dc89..54351ac7cef9 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -209,7 +209,7 @@ static int setup_new_group_blocks(struct super_block *sb,
209 if (IS_ERR(handle)) 209 if (IS_ERR(handle))
210 return PTR_ERR(handle); 210 return PTR_ERR(handle);
211 211
212 lock_super(sb); 212 mutex_lock(&sbi->s_resize_lock);
213 if (input->group != sbi->s_groups_count) { 213 if (input->group != sbi->s_groups_count) {
214 err = -EBUSY; 214 err = -EBUSY;
215 goto exit_journal; 215 goto exit_journal;
@@ -266,7 +266,7 @@ static int setup_new_group_blocks(struct super_block *sb,
266 goto exit_bh; 266 goto exit_bh;
267 267
268 if (IS_ERR(gdb = bclean(handle, sb, block))) { 268 if (IS_ERR(gdb = bclean(handle, sb, block))) {
269 err = PTR_ERR(bh); 269 err = PTR_ERR(gdb);
270 goto exit_bh; 270 goto exit_bh;
271 } 271 }
272 ext3_journal_dirty_metadata(handle, gdb); 272 ext3_journal_dirty_metadata(handle, gdb);
@@ -324,7 +324,7 @@ exit_bh:
324 brelse(bh); 324 brelse(bh);
325 325
326exit_journal: 326exit_journal:
327 unlock_super(sb); 327 mutex_unlock(&sbi->s_resize_lock);
328 if ((err2 = ext3_journal_stop(handle)) && !err) 328 if ((err2 = ext3_journal_stop(handle)) && !err)
329 err = err2; 329 err = err2;
330 330
@@ -662,11 +662,12 @@ exit_free:
662 * important part is that the new block and inode counts are in the backup 662 * important part is that the new block and inode counts are in the backup
663 * superblocks, and the location of the new group metadata in the GDT backups. 663 * superblocks, and the location of the new group metadata in the GDT backups.
664 * 664 *
665 * We do not need lock_super() for this, because these blocks are not 665 * We do not need to take the s_resize_lock for this, because these
666 * otherwise touched by the filesystem code when it is mounted. We don't 666 * blocks are not otherwise touched by the filesystem code when it is
667 * need to worry about last changing from sbi->s_groups_count, because the 667 * mounted. We don't need to worry about last changing from
668 * worst that can happen is that we do not copy the full number of backups 668 * sbi->s_groups_count, because the worst that can happen is that we
669 * at this time. The resize which changed s_groups_count will backup again. 669 * do not copy the full number of backups at this time. The resize
670 * which changed s_groups_count will backup again.
670 */ 671 */
671static void update_backups(struct super_block *sb, 672static void update_backups(struct super_block *sb,
672 int blk_off, char *data, int size) 673 int blk_off, char *data, int size)
@@ -825,7 +826,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
825 goto exit_put; 826 goto exit_put;
826 } 827 }
827 828
828 lock_super(sb); 829 mutex_lock(&sbi->s_resize_lock);
829 if (input->group != sbi->s_groups_count) { 830 if (input->group != sbi->s_groups_count) {
830 ext3_warning(sb, __func__, 831 ext3_warning(sb, __func__,
831 "multiple resizers run on filesystem!"); 832 "multiple resizers run on filesystem!");
@@ -856,7 +857,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
856 /* 857 /*
857 * OK, now we've set up the new group. Time to make it active. 858 * OK, now we've set up the new group. Time to make it active.
858 * 859 *
859 * Current kernels don't lock all allocations via lock_super(), 860 * We do not lock all allocations via s_resize_lock
860 * so we have to be safe wrt. concurrent accesses to the group 861 * so we have to be safe wrt. concurrent accesses to the group
861 * data. So we need to be careful to set all of the relevant 862 * data. So we need to be careful to set all of the relevant
862 * group descriptor data etc. *before* we enable the group. 863 * group descriptor data etc. *before* we enable the group.
@@ -900,12 +901,12 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
900 * 901 *
901 * The precise rules we use are: 902 * The precise rules we use are:
902 * 903 *
903 * * Writers of s_groups_count *must* hold lock_super 904 * * Writers of s_groups_count *must* hold s_resize_lock
904 * AND 905 * AND
905 * * Writers must perform a smp_wmb() after updating all dependent 906 * * Writers must perform a smp_wmb() after updating all dependent
906 * data and before modifying the groups count 907 * data and before modifying the groups count
907 * 908 *
908 * * Readers must hold lock_super() over the access 909 * * Readers must hold s_resize_lock over the access
909 * OR 910 * OR
910 * * Readers must perform an smp_rmb() after reading the groups count 911 * * Readers must perform an smp_rmb() after reading the groups count
911 * and before reading any dependent data. 912 * and before reading any dependent data.
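
The rules quoted above are a standard publication barrier pairing; both sides, sketched with the details of the group data elided:

    /* writer: runs under s_resize_lock */
    /* ... install group descriptor, bitmaps, free counts ... */
    smp_wmb();                      /* dependent data before the count */
    sbi->s_groups_count++;          /* the new group becomes visible */

    /* lock-free reader */
    ngroups = sbi->s_groups_count;
    smp_rmb();                      /* the count before dependent data */
    /* ... safe to read data for any group below ngroups ... */
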
@@ -936,7 +937,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
936 ext3_journal_dirty_metadata(handle, sbi->s_sbh); 937 ext3_journal_dirty_metadata(handle, sbi->s_sbh);
937 938
938exit_journal: 939exit_journal:
939 unlock_super(sb); 940 mutex_unlock(&sbi->s_resize_lock);
940 if ((err2 = ext3_journal_stop(handle)) && !err) 941 if ((err2 = ext3_journal_stop(handle)) && !err)
941 err = err2; 942 err = err2;
942 if (!err) { 943 if (!err) {
@@ -973,7 +974,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
973 974
974 /* We don't need to worry about locking wrt other resizers just 975 /* We don't need to worry about locking wrt other resizers just
975 * yet: we're going to revalidate es->s_blocks_count after 976 * yet: we're going to revalidate es->s_blocks_count after
976 * taking lock_super() below. */ 977 * taking the s_resize_lock below. */
977 o_blocks_count = le32_to_cpu(es->s_blocks_count); 978 o_blocks_count = le32_to_cpu(es->s_blocks_count);
978 o_groups_count = EXT3_SB(sb)->s_groups_count; 979 o_groups_count = EXT3_SB(sb)->s_groups_count;
979 980
@@ -1045,11 +1046,11 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
1045 goto exit_put; 1046 goto exit_put;
1046 } 1047 }
1047 1048
1048 lock_super(sb); 1049 mutex_lock(&EXT3_SB(sb)->s_resize_lock);
1049 if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { 1050 if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) {
1050 ext3_warning(sb, __func__, 1051 ext3_warning(sb, __func__,
1051 "multiple resizers run on filesystem!"); 1052 "multiple resizers run on filesystem!");
1052 unlock_super(sb); 1053 mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
1053 ext3_journal_stop(handle); 1054 ext3_journal_stop(handle);
1054 err = -EBUSY; 1055 err = -EBUSY;
1055 goto exit_put; 1056 goto exit_put;
@@ -1059,13 +1060,13 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
1059 EXT3_SB(sb)->s_sbh))) { 1060 EXT3_SB(sb)->s_sbh))) {
1060 ext3_warning(sb, __func__, 1061 ext3_warning(sb, __func__,
1061 "error %d on journal write access", err); 1062 "error %d on journal write access", err);
1062 unlock_super(sb); 1063 mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
1063 ext3_journal_stop(handle); 1064 ext3_journal_stop(handle);
1064 goto exit_put; 1065 goto exit_put;
1065 } 1066 }
1066 es->s_blocks_count = cpu_to_le32(o_blocks_count + add); 1067 es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
1067 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); 1068 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
1068 unlock_super(sb); 1069 mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
1069 ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, 1070 ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count,
1070 o_blocks_count + add); 1071 o_blocks_count + add);
1071 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); 1072 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 427496c4767c..1bee604cc6cd 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -135,12 +135,24 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn,
135 if (is_handle_aborted(handle)) 135 if (is_handle_aborted(handle))
136 return; 136 return;
137 137
138 printk(KERN_ERR "%s: aborting transaction: %s in %s\n", 138 printk(KERN_ERR "EXT3-fs: %s: aborting transaction: %s in %s\n",
139 caller, errstr, err_fn); 139 caller, errstr, err_fn);
140 140
141 journal_abort_handle(handle); 141 journal_abort_handle(handle);
142} 142}
143 143
144void ext3_msg(struct super_block *sb, const char *prefix,
145 const char *fmt, ...)
146{
147 va_list args;
148
149 va_start(args, fmt);
150 printk("%sEXT3-fs (%s): ", prefix, sb->s_id);
151 vprintk(fmt, args);
152 printk("\n");
153 va_end(args);
154}
155
144/* Deal with the reporting of failure conditions on a filesystem such as 156/* Deal with the reporting of failure conditions on a filesystem such as
145 * inconsistencies detected or read IO failures. 157 * inconsistencies detected or read IO failures.
146 * 158 *
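
ext3_msg() centralizes the "EXT3-fs (sdXN):" prefixing that the old printk() calls each repeated by hand; callers pass a printk level string that is concatenated in front of the banner. An illustrative call (the message text here is made up):

    ext3_msg(sb, KERN_WARNING,
             "warning: mounting fs with errors, running e2fsck is recommended");
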
@@ -152,7 +164,7 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn,
152 * write out the superblock safely. 164 * write out the superblock safely.
153 * 165 *
154 * We'll just use the journal_abort() error code to record an error in 166 * We'll just use the journal_abort() error code to record an error in
155 * the journal instead. On recovery, the journal will compain about 167 * the journal instead. On recovery, the journal will complain about
156 * that error until we've noted it down and cleared it. 168 * that error until we've noted it down and cleared it.
157 */ 169 */
158 170
@@ -169,17 +181,18 @@ static void ext3_handle_error(struct super_block *sb)
169 if (!test_opt (sb, ERRORS_CONT)) { 181 if (!test_opt (sb, ERRORS_CONT)) {
170 journal_t *journal = EXT3_SB(sb)->s_journal; 182 journal_t *journal = EXT3_SB(sb)->s_journal;
171 183
172 EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; 184 set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
173 if (journal) 185 if (journal)
174 journal_abort(journal, -EIO); 186 journal_abort(journal, -EIO);
175 } 187 }
176 if (test_opt (sb, ERRORS_RO)) { 188 if (test_opt (sb, ERRORS_RO)) {
177 printk (KERN_CRIT "Remounting filesystem read-only\n"); 189 ext3_msg(sb, KERN_CRIT,
190 "error: remounting filesystem read-only");
178 sb->s_flags |= MS_RDONLY; 191 sb->s_flags |= MS_RDONLY;
179 } 192 }
180 ext3_commit_super(sb, es, 1); 193 ext3_commit_super(sb, es, 1);
181 if (test_opt(sb, ERRORS_PANIC)) 194 if (test_opt(sb, ERRORS_PANIC))
182 panic("EXT3-fs (device %s): panic forced after error\n", 195 panic("EXT3-fs (%s): panic forced after error\n",
183 sb->s_id); 196 sb->s_id);
184} 197}
185 198
@@ -247,8 +260,7 @@ void __ext3_std_error (struct super_block * sb, const char * function,
247 return; 260 return;
248 261
249 errstr = ext3_decode_error(sb, errno, nbuf); 262 errstr = ext3_decode_error(sb, errno, nbuf);
250 printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n", 263 ext3_msg(sb, KERN_CRIT, "error in %s: %s", function, errstr);
251 sb->s_id, function, errstr);
252 264
253 ext3_handle_error(sb); 265 ext3_handle_error(sb);
254} 266}
@@ -268,24 +280,23 @@ void ext3_abort (struct super_block * sb, const char * function,
268{ 280{
269 va_list args; 281 va_list args;
270 282
271 printk (KERN_CRIT "ext3_abort called.\n");
272
273 va_start(args, fmt); 283 va_start(args, fmt);
274 printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function); 284 printk(KERN_CRIT "EXT3-fs (%s): error: %s: ", sb->s_id, function);
275 vprintk(fmt, args); 285 vprintk(fmt, args);
276 printk("\n"); 286 printk("\n");
277 va_end(args); 287 va_end(args);
278 288
279 if (test_opt(sb, ERRORS_PANIC)) 289 if (test_opt(sb, ERRORS_PANIC))
280 panic("EXT3-fs panic from previous error\n"); 290 panic("EXT3-fs: panic from previous error\n");
281 291
282 if (sb->s_flags & MS_RDONLY) 292 if (sb->s_flags & MS_RDONLY)
283 return; 293 return;
284 294
285 printk(KERN_CRIT "Remounting filesystem read-only\n"); 295 ext3_msg(sb, KERN_CRIT,
296 "error: remounting filesystem read-only");
286 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; 297 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
287 sb->s_flags |= MS_RDONLY; 298 sb->s_flags |= MS_RDONLY;
288 EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; 299 set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
289 if (EXT3_SB(sb)->s_journal) 300 if (EXT3_SB(sb)->s_journal)
290 journal_abort(EXT3_SB(sb)->s_journal, -EIO); 301 journal_abort(EXT3_SB(sb)->s_journal, -EIO);
291} 302}
@@ -296,7 +307,7 @@ void ext3_warning (struct super_block * sb, const char * function,
296 va_list args; 307 va_list args;
297 308
298 va_start(args, fmt); 309 va_start(args, fmt);
299 printk(KERN_WARNING "EXT3-fs warning (device %s): %s: ", 310 printk(KERN_WARNING "EXT3-fs (%s): warning: %s: ",
300 sb->s_id, function); 311 sb->s_id, function);
301 vprintk(fmt, args); 312 vprintk(fmt, args);
302 printk("\n"); 313 printk("\n");
@@ -310,10 +321,10 @@ void ext3_update_dynamic_rev(struct super_block *sb)
310 if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV) 321 if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV)
311 return; 322 return;
312 323
313 ext3_warning(sb, __func__, 324 ext3_msg(sb, KERN_WARNING,
314 "updating to rev %d because of new feature flag, " 325 "warning: updating to rev %d because of "
315 "running e2fsck is recommended", 326 "new feature flag, running e2fsck is recommended",
316 EXT3_DYNAMIC_REV); 327 EXT3_DYNAMIC_REV);
317 328
318 es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO); 329 es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO);
319 es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE); 330 es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE);
@@ -331,7 +342,7 @@ void ext3_update_dynamic_rev(struct super_block *sb)
331/* 342/*
332 * Open the external journal device 343 * Open the external journal device
333 */ 344 */
334static struct block_device *ext3_blkdev_get(dev_t dev) 345static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
335{ 346{
336 struct block_device *bdev; 347 struct block_device *bdev;
337 char b[BDEVNAME_SIZE]; 348 char b[BDEVNAME_SIZE];
@@ -342,8 +353,9 @@ static struct block_device *ext3_blkdev_get(dev_t dev)
342 return bdev; 353 return bdev;
343 354
344fail: 355fail:
345 printk(KERN_ERR "EXT3: failed to open journal device %s: %ld\n", 356 ext3_msg(sb, KERN_ERR, "error: failed to open journal device %s: %ld",
346 __bdevname(dev, b), PTR_ERR(bdev)); 357 __bdevname(dev, b), PTR_ERR(bdev));
358
347 return NULL; 359 return NULL;
348} 360}
349 361
@@ -378,13 +390,13 @@ static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
378{ 390{
379 struct list_head *l; 391 struct list_head *l;
380 392
381 printk(KERN_ERR "sb orphan head is %d\n", 393 ext3_msg(sb, KERN_ERR, "error: sb orphan head is %d",
382 le32_to_cpu(sbi->s_es->s_last_orphan)); 394 le32_to_cpu(sbi->s_es->s_last_orphan));
383 395
384 printk(KERN_ERR "sb_info orphan list:\n"); 396 ext3_msg(sb, KERN_ERR, "sb_info orphan list:");
385 list_for_each(l, &sbi->s_orphan) { 397 list_for_each(l, &sbi->s_orphan) {
386 struct inode *inode = orphan_list_entry(l); 398 struct inode *inode = orphan_list_entry(l);
387 printk(KERN_ERR " " 399 ext3_msg(sb, KERN_ERR, " "
388 "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", 400 "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
389 inode->i_sb->s_id, inode->i_ino, inode, 401 inode->i_sb->s_id, inode->i_ino, inode,
390 inode->i_mode, inode->i_nlink, 402 inode->i_mode, inode->i_nlink,
@@ -516,6 +528,8 @@ static void destroy_inodecache(void)
516static void ext3_clear_inode(struct inode *inode) 528static void ext3_clear_inode(struct inode *inode)
517{ 529{
518 struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info; 530 struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
531
532 dquot_drop(inode);
519 ext3_discard_reservation(inode); 533 ext3_discard_reservation(inode);
520 EXT3_I(inode)->i_block_alloc_info = NULL; 534 EXT3_I(inode)->i_block_alloc_info = NULL;
521 if (unlikely(rsv)) 535 if (unlikely(rsv))
@@ -527,9 +541,22 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl
527#if defined(CONFIG_QUOTA) 541#if defined(CONFIG_QUOTA)
528 struct ext3_sb_info *sbi = EXT3_SB(sb); 542 struct ext3_sb_info *sbi = EXT3_SB(sb);
529 543
530 if (sbi->s_jquota_fmt) 544 if (sbi->s_jquota_fmt) {
531 seq_printf(seq, ",jqfmt=%s", 545 char *fmtname = "";
532 (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0"); 546
547 switch (sbi->s_jquota_fmt) {
548 case QFMT_VFS_OLD:
549 fmtname = "vfsold";
550 break;
551 case QFMT_VFS_V0:
552 fmtname = "vfsv0";
553 break;
554 case QFMT_VFS_V1:
555 fmtname = "vfsv1";
556 break;
557 }
558 seq_printf(seq, ",jqfmt=%s", fmtname);
559 }
533 560
534 if (sbi->s_qf_names[USRQUOTA]) 561 if (sbi->s_qf_names[USRQUOTA])
535 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); 562 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
@@ -537,10 +564,10 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl
537 if (sbi->s_qf_names[GRPQUOTA]) 564 if (sbi->s_qf_names[GRPQUOTA])
538 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); 565 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
539 566
540 if (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) 567 if (test_opt(sb, USRQUOTA))
541 seq_puts(seq, ",usrquota"); 568 seq_puts(seq, ",usrquota");
542 569
543 if (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) 570 if (test_opt(sb, GRPQUOTA))
544 seq_puts(seq, ",grpquota"); 571 seq_puts(seq, ",grpquota");
545#endif 572#endif
546} 573}
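
The switch introduced above replaces a two-way ternary so that the new vfsv1 journaled-quota format is reported in /proc/mounts alongside the older two. The same mapping, compressed into a lookup (the QFMT_* values here are illustrative stand-ins, not the kernel constants):

#include <stdio.h>

enum { QFMT_VFS_OLD = 1, QFMT_VFS_V0 = 2, QFMT_VFS_V1 = 4 };

static const char *jqfmt_name(int fmt)
{
	switch (fmt) {
	case QFMT_VFS_OLD: return "vfsold";
	case QFMT_VFS_V0:  return "vfsv0";
	case QFMT_VFS_V1:  return "vfsv1";
	default:           return "";
	}
}

int main(void)
{
	printf(",jqfmt=%s\n", jqfmt_name(QFMT_VFS_V1));	/* ,jqfmt=vfsv1 */
	return 0;
}
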
@@ -631,11 +658,13 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
631 if (test_opt(sb, NOBH)) 658 if (test_opt(sb, NOBH))
632 seq_puts(seq, ",nobh"); 659 seq_puts(seq, ",nobh");
633 660
634 seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt & 661 seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS)));
635 EXT3_MOUNT_DATA_FLAGS));
636 if (test_opt(sb, DATA_ERR_ABORT)) 662 if (test_opt(sb, DATA_ERR_ABORT))
637 seq_puts(seq, ",data_err=abort"); 663 seq_puts(seq, ",data_err=abort");
638 664
665 if (test_opt(sb, NOLOAD))
666 seq_puts(seq, ",norecovery");
667
639 ext3_show_quota_options(seq, sb); 668 ext3_show_quota_options(seq, sb);
640 669
641 return 0; 670 return 0;
@@ -723,13 +752,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
723 const char *data, size_t len, loff_t off); 752 const char *data, size_t len, loff_t off);
724 753
725static const struct dquot_operations ext3_quota_operations = { 754static const struct dquot_operations ext3_quota_operations = {
726 .initialize = dquot_initialize,
727 .drop = dquot_drop,
728 .alloc_space = dquot_alloc_space,
729 .alloc_inode = dquot_alloc_inode,
730 .free_space = dquot_free_space,
731 .free_inode = dquot_free_inode,
732 .transfer = dquot_transfer,
733 .write_dquot = ext3_write_dquot, 755 .write_dquot = ext3_write_dquot,
734 .acquire_dquot = ext3_acquire_dquot, 756 .acquire_dquot = ext3_acquire_dquot,
735 .release_dquot = ext3_release_dquot, 757 .release_dquot = ext3_release_dquot,
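
With the generic methods gone from ext3_quota_operations, the filesystem calls the generic helpers directly where they are needed — dquot_drop() in ext3_clear_inode() above, dquot_initialize() in the orphan-cleanup hunk further down — and the table keeps only the journaling-aware hooks. A sketch of that direct-call shape (all names are illustrative stubs):

#include <stdio.h>

static void dquot_initialize_stub(const char *who)
{
	printf("%s: generic quota init\n", who);
}

static void dquot_drop_stub(const char *who)
{
	printf("%s: generic quota drop\n", who);
}

/* The callers name the generic helpers directly instead of routing
 * through .initialize/.drop method pointers. */
static void clear_inode(void)
{
	dquot_drop_stub("clear_inode");
}

static void orphan_cleanup(void)
{
	dquot_initialize_stub("orphan_cleanup");
}

int main(void)
{
	clear_inode();
	orphan_cleanup();
	return 0;
}
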
@@ -787,9 +809,9 @@ enum {
787 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 809 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
788 Opt_data_err_abort, Opt_data_err_ignore, 810 Opt_data_err_abort, Opt_data_err_ignore,
789 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 811 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
790 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 812 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
791 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 813 Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
792 Opt_grpquota 814 Opt_usrquota, Opt_grpquota
793}; 815};
794 816
795static const match_table_t tokens = { 817static const match_table_t tokens = {
@@ -818,6 +840,7 @@ static const match_table_t tokens = {
818 {Opt_reservation, "reservation"}, 840 {Opt_reservation, "reservation"},
819 {Opt_noreservation, "noreservation"}, 841 {Opt_noreservation, "noreservation"},
820 {Opt_noload, "noload"}, 842 {Opt_noload, "noload"},
843 {Opt_noload, "norecovery"},
821 {Opt_nobh, "nobh"}, 844 {Opt_nobh, "nobh"},
822 {Opt_bh, "bh"}, 845 {Opt_bh, "bh"},
823 {Opt_commit, "commit=%u"}, 846 {Opt_commit, "commit=%u"},
@@ -836,6 +859,7 @@ static const match_table_t tokens = {
836 {Opt_grpjquota, "grpjquota=%s"}, 859 {Opt_grpjquota, "grpjquota=%s"},
837 {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, 860 {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
838 {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, 861 {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
862 {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
839 {Opt_grpquota, "grpquota"}, 863 {Opt_grpquota, "grpquota"},
840 {Opt_noquota, "noquota"}, 864 {Opt_noquota, "noquota"},
841 {Opt_quota, "quota"}, 865 {Opt_quota, "quota"},
@@ -845,7 +869,7 @@ static const match_table_t tokens = {
845 {Opt_err, NULL}, 869 {Opt_err, NULL},
846}; 870};
847 871
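
Neither entry added to the table above needs new parsing code: "norecovery" is an alias mapped to the existing Opt_noload token, and jqfmt=vfsv1 reuses the jqfmt handling with one more case. A sketch of how several strings can share one token (an illustration, not the kernel's match_token()):

#include <stdio.h>
#include <string.h>

enum { Opt_err = 0, Opt_noload = 1 };

struct token { int id; const char *pattern; };

static const struct token tokens[] = {
	{ Opt_noload, "noload" },
	{ Opt_noload, "norecovery" },	/* alias: same token id */
	{ Opt_err, NULL },
};

static int match_option(const char *opt)
{
	const struct token *t;

	for (t = tokens; t->pattern; t++)
		if (!strcmp(opt, t->pattern))
			return t->id;
	return Opt_err;
}

int main(void)
{
	printf("%d %d\n", match_option("noload"), match_option("norecovery"));
	return 0;
}
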
848static ext3_fsblk_t get_sb_block(void **data) 872static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb)
849{ 873{
850 ext3_fsblk_t sb_block; 874 ext3_fsblk_t sb_block;
851 char *options = (char *) *data; 875 char *options = (char *) *data;
@@ -856,7 +880,7 @@ static ext3_fsblk_t get_sb_block(void **data)
856 /*todo: use simple_strtoll with >32bit ext3 */ 880 /*todo: use simple_strtoll with >32bit ext3 */
857 sb_block = simple_strtoul(options, &options, 0); 881 sb_block = simple_strtoul(options, &options, 0);
858 if (*options && *options != ',') { 882 if (*options && *options != ',') {
859 printk("EXT3-fs: Invalid sb specification: %s\n", 883 ext3_msg(sb, "error: invalid sb specification: %s",
860 (char *) *data); 884 (char *) *data);
861 return 1; 885 return 1;
862 } 886 }
@@ -866,6 +890,63 @@ static ext3_fsblk_t get_sb_block(void **data)
866 return sb_block; 890 return sb_block;
867} 891}
868 892
893#ifdef CONFIG_QUOTA
894static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
895{
896 struct ext3_sb_info *sbi = EXT3_SB(sb);
897 char *qname;
898
899 if (sb_any_quota_loaded(sb) &&
900 !sbi->s_qf_names[qtype]) {
901 ext3_msg(sb, KERN_ERR,
902 "Cannot change journaled "
903 "quota options when quota turned on");
904 return 0;
905 }
906 qname = match_strdup(args);
907 if (!qname) {
908 ext3_msg(sb, KERN_ERR,
909 "Not enough memory for storing quotafile name");
910 return 0;
911 }
912 if (sbi->s_qf_names[qtype] &&
913 strcmp(sbi->s_qf_names[qtype], qname)) {
914 ext3_msg(sb, KERN_ERR,
915 "%s quota file already specified", QTYPE2NAME(qtype));
916 kfree(qname);
917 return 0;
918 }
919 sbi->s_qf_names[qtype] = qname;
920 if (strchr(sbi->s_qf_names[qtype], '/')) {
921 ext3_msg(sb, KERN_ERR,
922 "quotafile must be on filesystem root");
923 kfree(sbi->s_qf_names[qtype]);
924 sbi->s_qf_names[qtype] = NULL;
925 return 0;
926 }
927 set_opt(sbi->s_mount_opt, QUOTA);
928 return 1;
929}
930
931 static int clear_qf_name(struct super_block *sb, int qtype)
932 {
933 struct ext3_sb_info *sbi = EXT3_SB(sb);
934
935 if (sb_any_quota_loaded(sb) &&
936 sbi->s_qf_names[qtype]) {
937 ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options"
938 " when quota turned on");
939 return 0;
940 }
941 /*
942 * The space will be released later when all options are confirmed
943 * to be correct
944 */
945 sbi->s_qf_names[qtype] = NULL;
946 return 1;
947}
948#endif
949
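
Both helpers use parse_options()'s return convention — 1 on success, 0 to reject the whole option string — so the call sites below can just propagate a zero to fail the mount. A sketch of that contract with hypothetical names:

#include <stdio.h>
#include <string.h>

/* Returns 1 if the option is accepted, 0 to abort option parsing. */
static int set_name(const char **slot, const char *value)
{
	if (*slot && strcmp(*slot, value) != 0) {
		fprintf(stderr, "error: quota file already specified\n");
		return 0;
	}
	*slot = value;
	return 1;
}

int main(void)
{
	const char *qf_name = NULL;

	/* Mirrors "if (!set_qf_name(...)) return 0;" in the parser. */
	if (!set_name(&qf_name, "aquota.user"))
		return 1;
	if (!set_name(&qf_name, "other.user"))
		return 1;	/* rejected: a different name was set */
	return 0;
}
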
869static int parse_options (char *options, struct super_block *sb, 950static int parse_options (char *options, struct super_block *sb,
870 unsigned int *inum, unsigned long *journal_devnum, 951 unsigned int *inum, unsigned long *journal_devnum,
871 ext3_fsblk_t *n_blocks_count, int is_remount) 952 ext3_fsblk_t *n_blocks_count, int is_remount)
@@ -876,8 +957,7 @@ static int parse_options (char *options, struct super_block *sb,
876 int data_opt = 0; 957 int data_opt = 0;
877 int option; 958 int option;
878#ifdef CONFIG_QUOTA 959#ifdef CONFIG_QUOTA
879 int qtype, qfmt; 960 int qfmt;
880 char *qname;
881#endif 961#endif
882 962
883 if (!options) 963 if (!options)
@@ -956,7 +1036,8 @@ static int parse_options (char *options, struct super_block *sb,
956#else 1036#else
957 case Opt_user_xattr: 1037 case Opt_user_xattr:
958 case Opt_nouser_xattr: 1038 case Opt_nouser_xattr:
959 printk("EXT3 (no)user_xattr options not supported\n"); 1039 ext3_msg(sb, KERN_INFO,
1040 "(no)user_xattr options not supported");
960 break; 1041 break;
961#endif 1042#endif
962#ifdef CONFIG_EXT3_FS_POSIX_ACL 1043#ifdef CONFIG_EXT3_FS_POSIX_ACL
@@ -969,7 +1050,8 @@ static int parse_options (char *options, struct super_block *sb,
969#else 1050#else
970 case Opt_acl: 1051 case Opt_acl:
971 case Opt_noacl: 1052 case Opt_noacl:
972 printk("EXT3 (no)acl options not supported\n"); 1053 ext3_msg(sb, KERN_INFO,
1054 "(no)acl options not supported");
973 break; 1055 break;
974#endif 1056#endif
975 case Opt_reservation: 1057 case Opt_reservation:
@@ -985,16 +1067,16 @@ static int parse_options (char *options, struct super_block *sb,
985 user to specify an existing inode to be the 1067 user to specify an existing inode to be the
986 journal file. */ 1068 journal file. */
987 if (is_remount) { 1069 if (is_remount) {
988 printk(KERN_ERR "EXT3-fs: cannot specify " 1070 ext3_msg(sb, KERN_ERR, "error: cannot specify "
989 "journal on remount\n"); 1071 "journal on remount");
990 return 0; 1072 return 0;
991 } 1073 }
992 set_opt (sbi->s_mount_opt, UPDATE_JOURNAL); 1074 set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
993 break; 1075 break;
994 case Opt_journal_inum: 1076 case Opt_journal_inum:
995 if (is_remount) { 1077 if (is_remount) {
996 printk(KERN_ERR "EXT3-fs: cannot specify " 1078 ext3_msg(sb, KERN_ERR, "error: cannot specify "
997 "journal on remount\n"); 1079 "journal on remount");
998 return 0; 1080 return 0;
999 } 1081 }
1000 if (match_int(&args[0], &option)) 1082 if (match_int(&args[0], &option))
@@ -1003,8 +1085,8 @@ static int parse_options (char *options, struct super_block *sb,
1003 break; 1085 break;
1004 case Opt_journal_dev: 1086 case Opt_journal_dev:
1005 if (is_remount) { 1087 if (is_remount) {
1006 printk(KERN_ERR "EXT3-fs: cannot specify " 1088 ext3_msg(sb, KERN_ERR, "error: cannot specify "
1007 "journal on remount\n"); 1089 "journal on remount");
1008 return 0; 1090 return 0;
1009 } 1091 }
1010 if (match_int(&args[0], &option)) 1092 if (match_int(&args[0], &option))
@@ -1033,21 +1115,19 @@ static int parse_options (char *options, struct super_block *sb,
1033 data_opt = EXT3_MOUNT_WRITEBACK_DATA; 1115 data_opt = EXT3_MOUNT_WRITEBACK_DATA;
1034 datacheck: 1116 datacheck:
1035 if (is_remount) { 1117 if (is_remount) {
1036 if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS) 1118 if (test_opt(sb, DATA_FLAGS) == data_opt)
1037 == data_opt)
1038 break; 1119 break;
1039 printk(KERN_ERR 1120 ext3_msg(sb, KERN_ERR,
1040 "EXT3-fs (device %s): Cannot change " 1121 "error: cannot change "
1041 "data mode on remount. The filesystem " 1122 "data mode on remount. The filesystem "
1042 "is mounted in data=%s mode and you " 1123 "is mounted in data=%s mode and you "
1043 "try to remount it in data=%s mode.\n", 1124 "try to remount it in data=%s mode.",
1044 sb->s_id, 1125 data_mode_string(test_opt(sb,
1045 data_mode_string(sbi->s_mount_opt & 1126 DATA_FLAGS)),
1046 EXT3_MOUNT_DATA_FLAGS),
1047 data_mode_string(data_opt)); 1127 data_mode_string(data_opt));
1048 return 0; 1128 return 0;
1049 } else { 1129 } else {
1050 sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS; 1130 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
1051 sbi->s_mount_opt |= data_opt; 1131 sbi->s_mount_opt |= data_opt;
1052 } 1132 }
1053 break; 1133 break;
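
The remount check above relies on EXT3_MOUNT_DATA_FLAGS being a mask that covers all three journaling modes, so test_opt(sb, DATA_FLAGS) extracts the current mode in one expression and switching modes means clearing the whole mask first. A sketch of the macro family; the flag values mirror the layout the code depends on but should be read as illustrative:

#include <stdio.h>

#define MOUNT_JOURNAL_DATA	0x0400	/* journal data and metadata */
#define MOUNT_ORDERED_DATA	0x0800	/* flush data before commit */
#define MOUNT_WRITEBACK_DATA	0x0C00	/* no data ordering */
#define MOUNT_DATA_FLAGS	0x0C00	/* mask over the three modes */

#define set_opt(opts, opt)	((opts) |= MOUNT_##opt)
#define clear_opt(opts, opt)	((opts) &= ~MOUNT_##opt)
#define test_opt(opts, opt)	((opts) & MOUNT_##opt)

int main(void)
{
	unsigned long mount_opt = 0;

	set_opt(mount_opt, ORDERED_DATA);
	/* Change mode: clear the full mask, then set the new bits. */
	clear_opt(mount_opt, DATA_FLAGS);
	set_opt(mount_opt, JOURNAL_DATA);
	printf("mode %#lx\n", test_opt(mount_opt, DATA_FLAGS));
	return 0;
}
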
@@ -1059,74 +1139,35 @@ static int parse_options (char *options, struct super_block *sb,
1059 break; 1139 break;
1060#ifdef CONFIG_QUOTA 1140#ifdef CONFIG_QUOTA
1061 case Opt_usrjquota: 1141 case Opt_usrjquota:
1062 qtype = USRQUOTA; 1142 if (!set_qf_name(sb, USRQUOTA, &args[0]))
1063 goto set_qf_name;
1064 case Opt_grpjquota:
1065 qtype = GRPQUOTA;
1066set_qf_name:
1067 if (sb_any_quota_loaded(sb) &&
1068 !sbi->s_qf_names[qtype]) {
1069 printk(KERN_ERR
1070 "EXT3-fs: Cannot change journaled "
1071 "quota options when quota turned on.\n");
1072 return 0; 1143 return 0;
1073 } 1144 break;
1074 qname = match_strdup(&args[0]); 1145 case Opt_grpjquota:
1075 if (!qname) { 1146 if (!set_qf_name(sb, GRPQUOTA, &args[0]))
1076 printk(KERN_ERR
1077 "EXT3-fs: not enough memory for "
1078 "storing quotafile name.\n");
1079 return 0;
1080 }
1081 if (sbi->s_qf_names[qtype] &&
1082 strcmp(sbi->s_qf_names[qtype], qname)) {
1083 printk(KERN_ERR
1084 "EXT3-fs: %s quota file already "
1085 "specified.\n", QTYPE2NAME(qtype));
1086 kfree(qname);
1087 return 0;
1088 }
1089 sbi->s_qf_names[qtype] = qname;
1090 if (strchr(sbi->s_qf_names[qtype], '/')) {
1091 printk(KERN_ERR
1092 "EXT3-fs: quotafile must be on "
1093 "filesystem root.\n");
1094 kfree(sbi->s_qf_names[qtype]);
1095 sbi->s_qf_names[qtype] = NULL;
1096 return 0; 1147 return 0;
1097 }
1098 set_opt(sbi->s_mount_opt, QUOTA);
1099 break; 1148 break;
1100 case Opt_offusrjquota: 1149 case Opt_offusrjquota:
1101 qtype = USRQUOTA; 1150 if (!clear_qf_name(sb, USRQUOTA))
1102 goto clear_qf_name; 1151 return 0;
1152 break;
1103 case Opt_offgrpjquota: 1153 case Opt_offgrpjquota:
1104 qtype = GRPQUOTA; 1154 if (!clear_qf_name(sb, GRPQUOTA))
1105clear_qf_name:
1106 if (sb_any_quota_loaded(sb) &&
1107 sbi->s_qf_names[qtype]) {
1108 printk(KERN_ERR "EXT3-fs: Cannot change "
1109 "journaled quota options when "
1110 "quota turned on.\n");
1111 return 0; 1155 return 0;
1112 }
1113 /*
1114 * The space will be released later when all options
1115 * are confirmed to be correct
1116 */
1117 sbi->s_qf_names[qtype] = NULL;
1118 break; 1156 break;
1119 case Opt_jqfmt_vfsold: 1157 case Opt_jqfmt_vfsold:
1120 qfmt = QFMT_VFS_OLD; 1158 qfmt = QFMT_VFS_OLD;
1121 goto set_qf_format; 1159 goto set_qf_format;
1122 case Opt_jqfmt_vfsv0: 1160 case Opt_jqfmt_vfsv0:
1123 qfmt = QFMT_VFS_V0; 1161 qfmt = QFMT_VFS_V0;
1162 goto set_qf_format;
1163 case Opt_jqfmt_vfsv1:
1164 qfmt = QFMT_VFS_V1;
1124set_qf_format: 1165set_qf_format:
1125 if (sb_any_quota_loaded(sb) && 1166 if (sb_any_quota_loaded(sb) &&
1126 sbi->s_jquota_fmt != qfmt) { 1167 sbi->s_jquota_fmt != qfmt) {
1127 printk(KERN_ERR "EXT3-fs: Cannot change " 1168 ext3_msg(sb, KERN_ERR, "error: cannot change "
1128 "journaled quota options when " 1169 "journaled quota options when "
1129 "quota turned on.\n"); 1170 "quota turned on.");
1130 return 0; 1171 return 0;
1131 } 1172 }
1132 sbi->s_jquota_fmt = qfmt; 1173 sbi->s_jquota_fmt = qfmt;
@@ -1142,8 +1183,8 @@ set_qf_format:
1142 break; 1183 break;
1143 case Opt_noquota: 1184 case Opt_noquota:
1144 if (sb_any_quota_loaded(sb)) { 1185 if (sb_any_quota_loaded(sb)) {
1145 printk(KERN_ERR "EXT3-fs: Cannot change quota " 1186 ext3_msg(sb, KERN_ERR, "error: cannot change "
1146 "options when quota turned on.\n"); 1187 "quota options when quota turned on.");
1147 return 0; 1188 return 0;
1148 } 1189 }
1149 clear_opt(sbi->s_mount_opt, QUOTA); 1190 clear_opt(sbi->s_mount_opt, QUOTA);
@@ -1154,8 +1195,8 @@ set_qf_format:
1154 case Opt_quota: 1195 case Opt_quota:
1155 case Opt_usrquota: 1196 case Opt_usrquota:
1156 case Opt_grpquota: 1197 case Opt_grpquota:
1157 printk(KERN_ERR 1198 ext3_msg(sb, KERN_ERR,
1158 "EXT3-fs: quota options not supported.\n"); 1199 "error: quota options not supported.");
1159 break; 1200 break;
1160 case Opt_usrjquota: 1201 case Opt_usrjquota:
1161 case Opt_grpjquota: 1202 case Opt_grpjquota:
@@ -1163,9 +1204,10 @@ set_qf_format:
1163 case Opt_offgrpjquota: 1204 case Opt_offgrpjquota:
1164 case Opt_jqfmt_vfsold: 1205 case Opt_jqfmt_vfsold:
1165 case Opt_jqfmt_vfsv0: 1206 case Opt_jqfmt_vfsv0:
1166 printk(KERN_ERR 1207 case Opt_jqfmt_vfsv1:
1167 "EXT3-fs: journaled quota options not " 1208 ext3_msg(sb, KERN_ERR,
1168 "supported.\n"); 1209 "error: journaled quota options not "
1210 "supported.");
1169 break; 1211 break;
1170 case Opt_noquota: 1212 case Opt_noquota:
1171 break; 1213 break;
@@ -1185,8 +1227,9 @@ set_qf_format:
1185 break; 1227 break;
1186 case Opt_resize: 1228 case Opt_resize:
1187 if (!is_remount) { 1229 if (!is_remount) {
1188 printk("EXT3-fs: resize option only available " 1230 ext3_msg(sb, KERN_ERR,
1189 "for remount\n"); 1231 "error: resize option only available "
1232 "for remount");
1190 return 0; 1233 return 0;
1191 } 1234 }
1192 if (match_int(&args[0], &option) != 0) 1235 if (match_int(&args[0], &option) != 0)
@@ -1200,41 +1243,35 @@ set_qf_format:
1200 clear_opt(sbi->s_mount_opt, NOBH); 1243 clear_opt(sbi->s_mount_opt, NOBH);
1201 break; 1244 break;
1202 default: 1245 default:
1203 printk (KERN_ERR 1246 ext3_msg(sb, KERN_ERR,
1204 "EXT3-fs: Unrecognized mount option \"%s\" " 1247 "error: unrecognized mount option \"%s\" "
1205 "or missing value\n", p); 1248 "or missing value", p);
1206 return 0; 1249 return 0;
1207 } 1250 }
1208 } 1251 }
1209#ifdef CONFIG_QUOTA 1252#ifdef CONFIG_QUOTA
1210 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 1253 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1211 if ((sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) && 1254 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
1212 sbi->s_qf_names[USRQUOTA])
1213 clear_opt(sbi->s_mount_opt, USRQUOTA); 1255 clear_opt(sbi->s_mount_opt, USRQUOTA);
1214 1256 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
1215 if ((sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) &&
1216 sbi->s_qf_names[GRPQUOTA])
1217 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1257 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1218 1258
1219 if ((sbi->s_qf_names[USRQUOTA] && 1259 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
1220 (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) || 1260 ext3_msg(sb, KERN_ERR, "error: old and new quota "
1221 (sbi->s_qf_names[GRPQUOTA] && 1261 "format mixing.");
1222 (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) {
1223 printk(KERN_ERR "EXT3-fs: old and new quota "
1224 "format mixing.\n");
1225 return 0; 1262 return 0;
1226 } 1263 }
1227 1264
1228 if (!sbi->s_jquota_fmt) { 1265 if (!sbi->s_jquota_fmt) {
1229 printk(KERN_ERR "EXT3-fs: journaled quota format " 1266 ext3_msg(sb, KERN_ERR, "error: journaled quota format "
1230 "not specified.\n"); 1267 "not specified.");
1231 return 0; 1268 return 0;
1232 } 1269 }
1233 } else { 1270 } else {
1234 if (sbi->s_jquota_fmt) { 1271 if (sbi->s_jquota_fmt) {
1235 printk(KERN_ERR "EXT3-fs: journaled quota format " 1272 ext3_msg(sb, KERN_ERR, "error: journaled quota format "
1236 "specified with no journaling " 1273 "specified with no journaling "
1237 "enabled.\n"); 1274 "enabled.");
1238 return 0; 1275 return 0;
1239 } 1276 }
1240 } 1277 }
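
The post-parse checks above enforce three rules: a mount may not mix old-style usrquota/grpquota flags with journaled quota files of the other type, journaled quota needs an explicit format, and a format without quota files is meaningless. The same decision logic in a standalone sketch (field names are illustrative):

#include <stdbool.h>
#include <stdio.h>

struct opts {
	const char *usr_qf_name;	/* journaled quota file or NULL */
	const char *grp_qf_name;
	bool usrquota, grpquota;	/* old-style quota flags */
	int jquota_fmt;			/* 0 = no format chosen */
};

static bool quota_opts_valid(struct opts *o)
{
	if (o->usr_qf_name || o->grp_qf_name) {
		/* A flag for a type that also has a journaled quota file
		 * is redundant and simply dropped... */
		if (o->usr_qf_name)
			o->usrquota = false;
		if (o->grp_qf_name)
			o->grpquota = false;
		/* ...any flag still standing mixes old and new formats. */
		if (o->usrquota || o->grpquota)
			return false;
		if (!o->jquota_fmt)
			return false;	/* journaled quota needs a format */
	} else if (o->jquota_fmt) {
		return false;		/* format without quota files */
	}
	return true;
}

int main(void)
{
	struct opts o = { .usr_qf_name = "aquota.user", .grpquota = true };

	printf("valid: %d\n", quota_opts_valid(&o));	/* 0: mixed formats */
	return 0;
}
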
@@ -1249,31 +1286,33 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
1249 int res = 0; 1286 int res = 0;
1250 1287
1251 if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) { 1288 if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) {
1252 printk (KERN_ERR "EXT3-fs warning: revision level too high, " 1289 ext3_msg(sb, KERN_ERR,
1253 "forcing read-only mode\n"); 1290 "error: revision level too high, "
1291 "forcing read-only mode");
1254 res = MS_RDONLY; 1292 res = MS_RDONLY;
1255 } 1293 }
1256 if (read_only) 1294 if (read_only)
1257 return res; 1295 return res;
1258 if (!(sbi->s_mount_state & EXT3_VALID_FS)) 1296 if (!(sbi->s_mount_state & EXT3_VALID_FS))
1259 printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, " 1297 ext3_msg(sb, KERN_WARNING,
1260 "running e2fsck is recommended\n"); 1298 "warning: mounting unchecked fs, "
1299 "running e2fsck is recommended");
1261 else if ((sbi->s_mount_state & EXT3_ERROR_FS)) 1300 else if ((sbi->s_mount_state & EXT3_ERROR_FS))
1262 printk (KERN_WARNING 1301 ext3_msg(sb, KERN_WARNING,
1263 "EXT3-fs warning: mounting fs with errors, " 1302 "warning: mounting fs with errors, "
1264 "running e2fsck is recommended\n"); 1303 "running e2fsck is recommended");
1265 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && 1304 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
1266 le16_to_cpu(es->s_mnt_count) >= 1305 le16_to_cpu(es->s_mnt_count) >=
1267 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) 1306 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
1268 printk (KERN_WARNING 1307 ext3_msg(sb, KERN_WARNING,
1269 "EXT3-fs warning: maximal mount count reached, " 1308 "warning: maximal mount count reached, "
1270 "running e2fsck is recommended\n"); 1309 "running e2fsck is recommended");
1271 else if (le32_to_cpu(es->s_checkinterval) && 1310 else if (le32_to_cpu(es->s_checkinterval) &&
1272 (le32_to_cpu(es->s_lastcheck) + 1311 (le32_to_cpu(es->s_lastcheck) +
1273 le32_to_cpu(es->s_checkinterval) <= get_seconds())) 1312 le32_to_cpu(es->s_checkinterval) <= get_seconds()))
1274 printk (KERN_WARNING 1313 ext3_msg(sb, KERN_WARNING,
1275 "EXT3-fs warning: checktime reached, " 1314 "warning: checktime reached, "
1276 "running e2fsck is recommended\n"); 1315 "running e2fsck is recommended");
1277#if 0 1316#if 0
1278 /* @@@ We _will_ want to clear the valid bit if we find 1317 /* @@@ We _will_ want to clear the valid bit if we find
1279 inconsistencies, to force a fsck at reboot. But for 1318 inconsistencies, to force a fsck at reboot. But for
@@ -1290,22 +1329,20 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
1290 1329
1291 ext3_commit_super(sb, es, 1); 1330 ext3_commit_super(sb, es, 1);
1292 if (test_opt(sb, DEBUG)) 1331 if (test_opt(sb, DEBUG))
1293 printk(KERN_INFO "[EXT3 FS bs=%lu, gc=%lu, " 1332 ext3_msg(sb, KERN_INFO, "[bs=%lu, gc=%lu, "
1294 "bpg=%lu, ipg=%lu, mo=%04lx]\n", 1333 "bpg=%lu, ipg=%lu, mo=%04lx]",
1295 sb->s_blocksize, 1334 sb->s_blocksize,
1296 sbi->s_groups_count, 1335 sbi->s_groups_count,
1297 EXT3_BLOCKS_PER_GROUP(sb), 1336 EXT3_BLOCKS_PER_GROUP(sb),
1298 EXT3_INODES_PER_GROUP(sb), 1337 EXT3_INODES_PER_GROUP(sb),
1299 sbi->s_mount_opt); 1338 sbi->s_mount_opt);
1300 1339
1301 printk(KERN_INFO "EXT3 FS on %s, ", sb->s_id);
1302 if (EXT3_SB(sb)->s_journal->j_inode == NULL) { 1340 if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
1303 char b[BDEVNAME_SIZE]; 1341 char b[BDEVNAME_SIZE];
1304 1342 ext3_msg(sb, KERN_INFO, "using external journal on %s",
1305 printk("external journal on %s\n",
1306 bdevname(EXT3_SB(sb)->s_journal->j_dev, b)); 1343 bdevname(EXT3_SB(sb)->s_journal->j_dev, b));
1307 } else { 1344 } else {
1308 printk("internal journal\n"); 1345 ext3_msg(sb, KERN_INFO, "using internal journal");
1309 } 1346 }
1310 return res; 1347 return res;
1311} 1348}
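
The reworked mount banner above decides between the two messages by checking whether the journal is backed by an inode: an internal journal lives in a regular file inside the filesystem, while an external one sits on its own block device and has no inode. A sketch of that distinction (illustrative struct, not the jbd journal_t):

#include <stdio.h>

struct journal {
	void *j_inode;		/* NULL for a device-backed journal */
	const char *j_devname;
};

static void report(const struct journal *j)
{
	if (j->j_inode == NULL)
		printf("using external journal on %s\n", j->j_devname);
	else
		printf("using internal journal\n");
}

int main(void)
{
	struct journal internal = { &internal, NULL };
	struct journal external = { NULL, "sdb1" };

	report(&internal);
	report(&external);
	return 0;
}
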
@@ -1399,8 +1436,8 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1399 } 1436 }
1400 1437
1401 if (bdev_read_only(sb->s_bdev)) { 1438 if (bdev_read_only(sb->s_bdev)) {
1402 printk(KERN_ERR "EXT3-fs: write access " 1439 ext3_msg(sb, KERN_ERR, "error: write access "
1403 "unavailable, skipping orphan cleanup.\n"); 1440 "unavailable, skipping orphan cleanup.");
1404 return; 1441 return;
1405 } 1442 }
1406 1443
@@ -1414,8 +1451,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1414 } 1451 }
1415 1452
1416 if (s_flags & MS_RDONLY) { 1453 if (s_flags & MS_RDONLY) {
1417 printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n", 1454 ext3_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
1418 sb->s_id);
1419 sb->s_flags &= ~MS_RDONLY; 1455 sb->s_flags &= ~MS_RDONLY;
1420 } 1456 }
1421#ifdef CONFIG_QUOTA 1457#ifdef CONFIG_QUOTA
@@ -1426,9 +1462,9 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1426 if (EXT3_SB(sb)->s_qf_names[i]) { 1462 if (EXT3_SB(sb)->s_qf_names[i]) {
1427 int ret = ext3_quota_on_mount(sb, i); 1463 int ret = ext3_quota_on_mount(sb, i);
1428 if (ret < 0) 1464 if (ret < 0)
1429 printk(KERN_ERR 1465 ext3_msg(sb, KERN_ERR,
1430 "EXT3-fs: Cannot turn on journaled " 1466 "error: cannot turn on journaled "
1431 "quota: error %d\n", ret); 1467 "quota: %d", ret);
1432 } 1468 }
1433 } 1469 }
1434#endif 1470#endif
@@ -1443,7 +1479,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1443 } 1479 }
1444 1480
1445 list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); 1481 list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
1446 vfs_dq_init(inode); 1482 dquot_initialize(inode);
1447 if (inode->i_nlink) { 1483 if (inode->i_nlink) {
1448 printk(KERN_DEBUG 1484 printk(KERN_DEBUG
1449 "%s: truncating inode %lu to %Ld bytes\n", 1485 "%s: truncating inode %lu to %Ld bytes\n",
@@ -1466,11 +1502,11 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1466#define PLURAL(x) (x), ((x)==1) ? "" : "s" 1502#define PLURAL(x) (x), ((x)==1) ? "" : "s"
1467 1503
1468 if (nr_orphans) 1504 if (nr_orphans)
1469 printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n", 1505 ext3_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
1470 sb->s_id, PLURAL(nr_orphans)); 1506 PLURAL(nr_orphans));
1471 if (nr_truncates) 1507 if (nr_truncates)
1472 printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n", 1508 ext3_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
1473 sb->s_id, PLURAL(nr_truncates)); 1509 PLURAL(nr_truncates));
1474#ifdef CONFIG_QUOTA 1510#ifdef CONFIG_QUOTA
1475 /* Turn quotas off */ 1511 /* Turn quotas off */
1476 for (i = 0; i < MAXQUOTAS; i++) { 1512 for (i = 0; i < MAXQUOTAS; i++) {
@@ -1554,7 +1590,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1554 struct ext3_super_block *es = NULL; 1590 struct ext3_super_block *es = NULL;
1555 struct ext3_sb_info *sbi; 1591 struct ext3_sb_info *sbi;
1556 ext3_fsblk_t block; 1592 ext3_fsblk_t block;
1557 ext3_fsblk_t sb_block = get_sb_block(&data); 1593 ext3_fsblk_t sb_block = get_sb_block(&data, sb);
1558 ext3_fsblk_t logic_sb_block; 1594 ext3_fsblk_t logic_sb_block;
1559 unsigned long offset = 0; 1595 unsigned long offset = 0;
1560 unsigned int journal_inum = 0; 1596 unsigned int journal_inum = 0;
@@ -1590,7 +1626,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1590 1626
1591 blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); 1627 blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
1592 if (!blocksize) { 1628 if (!blocksize) {
1593 printk(KERN_ERR "EXT3-fs: unable to set blocksize\n"); 1629 ext3_msg(sb, KERN_ERR, "error: unable to set blocksize");
1594 goto out_fail; 1630 goto out_fail;
1595 } 1631 }
1596 1632
@@ -1606,7 +1642,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1606 } 1642 }
1607 1643
1608 if (!(bh = sb_bread(sb, logic_sb_block))) { 1644 if (!(bh = sb_bread(sb, logic_sb_block))) {
1609 printk (KERN_ERR "EXT3-fs: unable to read superblock\n"); 1645 ext3_msg(sb, KERN_ERR, "error: unable to read superblock");
1610 goto out_fail; 1646 goto out_fail;
1611 } 1647 }
1612 /* 1648 /*
@@ -1636,11 +1672,11 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1636 set_opt(sbi->s_mount_opt, POSIX_ACL); 1672 set_opt(sbi->s_mount_opt, POSIX_ACL);
1637#endif 1673#endif
1638 if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA) 1674 if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA)
1639 sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA; 1675 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
1640 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED) 1676 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED)
1641 sbi->s_mount_opt |= EXT3_MOUNT_ORDERED_DATA; 1677 set_opt(sbi->s_mount_opt, ORDERED_DATA);
1642 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK) 1678 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK)
1643 sbi->s_mount_opt |= EXT3_MOUNT_WRITEBACK_DATA; 1679 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
1644 1680
1645 if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC) 1681 if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC)
1646 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 1682 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
@@ -1659,15 +1695,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1659 goto failed_mount; 1695 goto failed_mount;
1660 1696
1661 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 1697 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
1662 ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 1698 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
1663 1699
1664 if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV && 1700 if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
1665 (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || 1701 (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
1666 EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) || 1702 EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
1667 EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U))) 1703 EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
1668 printk(KERN_WARNING 1704 ext3_msg(sb, KERN_WARNING,
1669 "EXT3-fs warning: feature flags set on rev 0 fs, " 1705 "warning: feature flags set on rev 0 fs, "
1670 "running e2fsck is recommended\n"); 1706 "running e2fsck is recommended");
1671 /* 1707 /*
1672 * Check feature flags regardless of the revision level, since we 1708 * Check feature flags regardless of the revision level, since we
1673 * previously didn't change the revision level when setting the flags, 1709 * previously didn't change the revision level when setting the flags,
@@ -1675,25 +1711,25 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1675 */ 1711 */
1676 features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP); 1712 features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP);
1677 if (features) { 1713 if (features) {
1678 printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of " 1714 ext3_msg(sb, KERN_ERR,
1679 "unsupported optional features (%x).\n", 1715 "error: couldn't mount because of unsupported "
1680 sb->s_id, le32_to_cpu(features)); 1716 "optional features (%x)", le32_to_cpu(features));
1681 goto failed_mount; 1717 goto failed_mount;
1682 } 1718 }
1683 features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP); 1719 features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP);
1684 if (!(sb->s_flags & MS_RDONLY) && features) { 1720 if (!(sb->s_flags & MS_RDONLY) && features) {
1685 printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of " 1721 ext3_msg(sb, KERN_ERR,
1686 "unsupported optional features (%x).\n", 1722 "error: couldn't mount RDWR because of unsupported "
1687 sb->s_id, le32_to_cpu(features)); 1723 "optional features (%x)", le32_to_cpu(features));
1688 goto failed_mount; 1724 goto failed_mount;
1689 } 1725 }
1690 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); 1726 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
1691 1727
1692 if (blocksize < EXT3_MIN_BLOCK_SIZE || 1728 if (blocksize < EXT3_MIN_BLOCK_SIZE ||
1693 blocksize > EXT3_MAX_BLOCK_SIZE) { 1729 blocksize > EXT3_MAX_BLOCK_SIZE) {
1694 printk(KERN_ERR 1730 ext3_msg(sb, KERN_ERR,
1695 "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n", 1731 "error: couldn't mount because of unsupported "
1696 blocksize, sb->s_id); 1732 "filesystem blocksize %d", blocksize);
1697 goto failed_mount; 1733 goto failed_mount;
1698 } 1734 }
1699 1735
@@ -1704,30 +1740,31 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1704 * than the hardware sectorsize for the machine. 1740 * than the hardware sectorsize for the machine.
1705 */ 1741 */
1706 if (blocksize < hblock) { 1742 if (blocksize < hblock) {
1707 printk(KERN_ERR "EXT3-fs: blocksize %d too small for " 1743 ext3_msg(sb, KERN_ERR,
1708 "device blocksize %d.\n", blocksize, hblock); 1744 "error: fsblocksize %d too small for "
1745 "hardware sectorsize %d", blocksize, hblock);
1709 goto failed_mount; 1746 goto failed_mount;
1710 } 1747 }
1711 1748
1712 brelse (bh); 1749 brelse (bh);
1713 if (!sb_set_blocksize(sb, blocksize)) { 1750 if (!sb_set_blocksize(sb, blocksize)) {
1714 printk(KERN_ERR "EXT3-fs: bad blocksize %d.\n", 1751 ext3_msg(sb, KERN_ERR,
1715 blocksize); 1752 "error: bad blocksize %d", blocksize);
1716 goto out_fail; 1753 goto out_fail;
1717 } 1754 }
1718 logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; 1755 logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
1719 offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; 1756 offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
1720 bh = sb_bread(sb, logic_sb_block); 1757 bh = sb_bread(sb, logic_sb_block);
1721 if (!bh) { 1758 if (!bh) {
1722 printk(KERN_ERR 1759 ext3_msg(sb, KERN_ERR,
1723 "EXT3-fs: Can't read superblock on 2nd try.\n"); 1760 "error: can't read superblock on 2nd try");
1724 goto failed_mount; 1761 goto failed_mount;
1725 } 1762 }
1726 es = (struct ext3_super_block *)(((char *)bh->b_data) + offset); 1763 es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
1727 sbi->s_es = es; 1764 sbi->s_es = es;
1728 if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) { 1765 if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
1729 printk (KERN_ERR 1766 ext3_msg(sb, KERN_ERR,
1730 "EXT3-fs: Magic mismatch, very weird !\n"); 1767 "error: magic mismatch");
1731 goto failed_mount; 1768 goto failed_mount;
1732 } 1769 }
1733 } 1770 }
@@ -1743,8 +1780,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1743 if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) || 1780 if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) ||
1744 (!is_power_of_2(sbi->s_inode_size)) || 1781 (!is_power_of_2(sbi->s_inode_size)) ||
1745 (sbi->s_inode_size > blocksize)) { 1782 (sbi->s_inode_size > blocksize)) {
1746 printk (KERN_ERR 1783 ext3_msg(sb, KERN_ERR,
1747 "EXT3-fs: unsupported inode size: %d\n", 1784 "error: unsupported inode size: %d",
1748 sbi->s_inode_size); 1785 sbi->s_inode_size);
1749 goto failed_mount; 1786 goto failed_mount;
1750 } 1787 }
@@ -1752,8 +1789,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1752 sbi->s_frag_size = EXT3_MIN_FRAG_SIZE << 1789 sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
1753 le32_to_cpu(es->s_log_frag_size); 1790 le32_to_cpu(es->s_log_frag_size);
1754 if (blocksize != sbi->s_frag_size) { 1791 if (blocksize != sbi->s_frag_size) {
1755 printk(KERN_ERR 1792 ext3_msg(sb, KERN_ERR,
1756 "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n", 1793 "error: fragsize %lu != blocksize %u (unsupported)",
1757 sbi->s_frag_size, blocksize); 1794 sbi->s_frag_size, blocksize);
1758 goto failed_mount; 1795 goto failed_mount;
1759 } 1796 }
@@ -1789,31 +1826,31 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1789 } 1826 }
1790 1827
1791 if (sbi->s_blocks_per_group > blocksize * 8) { 1828 if (sbi->s_blocks_per_group > blocksize * 8) {
1792 printk (KERN_ERR 1829 ext3_msg(sb, KERN_ERR,
1793 "EXT3-fs: #blocks per group too big: %lu\n", 1830 "#blocks per group too big: %lu",
1794 sbi->s_blocks_per_group); 1831 sbi->s_blocks_per_group);
1795 goto failed_mount; 1832 goto failed_mount;
1796 } 1833 }
1797 if (sbi->s_frags_per_group > blocksize * 8) { 1834 if (sbi->s_frags_per_group > blocksize * 8) {
1798 printk (KERN_ERR 1835 ext3_msg(sb, KERN_ERR,
1799 "EXT3-fs: #fragments per group too big: %lu\n", 1836 "error: #fragments per group too big: %lu",
1800 sbi->s_frags_per_group); 1837 sbi->s_frags_per_group);
1801 goto failed_mount; 1838 goto failed_mount;
1802 } 1839 }
1803 if (sbi->s_inodes_per_group > blocksize * 8) { 1840 if (sbi->s_inodes_per_group > blocksize * 8) {
1804 printk (KERN_ERR 1841 ext3_msg(sb, KERN_ERR,
1805 "EXT3-fs: #inodes per group too big: %lu\n", 1842 "error: #inodes per group too big: %lu",
1806 sbi->s_inodes_per_group); 1843 sbi->s_inodes_per_group);
1807 goto failed_mount; 1844 goto failed_mount;
1808 } 1845 }
1809 1846
1810 if (le32_to_cpu(es->s_blocks_count) > 1847 if (le32_to_cpu(es->s_blocks_count) >
1811 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 1848 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
1812 printk(KERN_ERR "EXT3-fs: filesystem on %s:" 1849 ext3_msg(sb, KERN_ERR,
1813 " too large to mount safely\n", sb->s_id); 1850 "error: filesystem is too large to mount safely");
1814 if (sizeof(sector_t) < 8) 1851 if (sizeof(sector_t) < 8)
1815 printk(KERN_WARNING "EXT3-fs: CONFIG_LBDAF not " 1852 ext3_msg(sb, KERN_ERR,
1816 "enabled\n"); 1853 "error: CONFIG_LBDAF not enabled");
1817 goto failed_mount; 1854 goto failed_mount;
1818 } 1855 }
1819 1856
@@ -1827,7 +1864,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1827 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), 1864 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
1828 GFP_KERNEL); 1865 GFP_KERNEL);
1829 if (sbi->s_group_desc == NULL) { 1866 if (sbi->s_group_desc == NULL) {
1830 printk (KERN_ERR "EXT3-fs: not enough memory\n"); 1867 ext3_msg(sb, KERN_ERR,
1868 "error: not enough memory");
1831 goto failed_mount; 1869 goto failed_mount;
1832 } 1870 }
1833 1871
@@ -1837,14 +1875,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1837 block = descriptor_loc(sb, logic_sb_block, i); 1875 block = descriptor_loc(sb, logic_sb_block, i);
1838 sbi->s_group_desc[i] = sb_bread(sb, block); 1876 sbi->s_group_desc[i] = sb_bread(sb, block);
1839 if (!sbi->s_group_desc[i]) { 1877 if (!sbi->s_group_desc[i]) {
1840 printk (KERN_ERR "EXT3-fs: " 1878 ext3_msg(sb, KERN_ERR,
1841 "can't read group descriptor %d\n", i); 1879 "error: can't read group descriptor %d", i);
1842 db_count = i; 1880 db_count = i;
1843 goto failed_mount2; 1881 goto failed_mount2;
1844 } 1882 }
1845 } 1883 }
1846 if (!ext3_check_descriptors (sb)) { 1884 if (!ext3_check_descriptors (sb)) {
1847 printk(KERN_ERR "EXT3-fs: group descriptors corrupted!\n"); 1885 ext3_msg(sb, KERN_ERR,
1886 "error: group descriptors corrupted");
1848 goto failed_mount2; 1887 goto failed_mount2;
1849 } 1888 }
1850 sbi->s_gdb_count = db_count; 1889 sbi->s_gdb_count = db_count;
@@ -1862,7 +1901,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1862 ext3_count_dirs(sb)); 1901 ext3_count_dirs(sb));
1863 } 1902 }
1864 if (err) { 1903 if (err) {
1865 printk(KERN_ERR "EXT3-fs: insufficient memory\n"); 1904 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
1866 goto failed_mount3; 1905 goto failed_mount3;
1867 } 1906 }
1868 1907
@@ -1890,6 +1929,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1890 sb->dq_op = &ext3_quota_operations; 1929 sb->dq_op = &ext3_quota_operations;
1891#endif 1930#endif
1892 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 1931 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
1932 mutex_init(&sbi->s_orphan_lock);
1933 mutex_init(&sbi->s_resize_lock);
1893 1934
1894 sb->s_root = NULL; 1935 sb->s_root = NULL;
1895 1936
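
The two mutexes initialized here give the orphan list and online resize their own locks, which appears to be what lets later hunks in this patch drop the coarse lock_super()/unlock_super() pairs from ext3_mark_recovery_complete() and the remount path. A userspace sketch of the per-structure locking shape (the struct is an illustration, not ext3_sb_info):

#include <pthread.h>
#include <stdio.h>

struct sb_info {
	pthread_mutex_t orphan_lock;	/* protects the orphan list */
	pthread_mutex_t resize_lock;	/* serializes online resize */
	int orphan_count;
};

static void orphan_add(struct sb_info *sbi)
{
	pthread_mutex_lock(&sbi->orphan_lock);
	sbi->orphan_count++;		/* list manipulation goes here */
	pthread_mutex_unlock(&sbi->orphan_lock);
}

int main(void)
{
	struct sb_info sbi = {
		PTHREAD_MUTEX_INITIALIZER,
		PTHREAD_MUTEX_INITIALIZER,
		0,
	};

	orphan_add(&sbi);
	printf("orphans: %d\n", sbi.orphan_count);
	return 0;
}
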
@@ -1910,9 +1951,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1910 goto failed_mount3; 1951 goto failed_mount3;
1911 } else { 1952 } else {
1912 if (!silent) 1953 if (!silent)
1913 printk (KERN_ERR 1954 ext3_msg(sb, KERN_ERR,
1914 "ext3: No journal on filesystem on %s\n", 1955 "error: no journal found. "
1915 sb->s_id); 1956 "mounting ext3 over ext2?");
1916 goto failed_mount3; 1957 goto failed_mount3;
1917 } 1958 }
1918 1959
@@ -1934,8 +1975,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1934 case EXT3_MOUNT_WRITEBACK_DATA: 1975 case EXT3_MOUNT_WRITEBACK_DATA:
1935 if (!journal_check_available_features 1976 if (!journal_check_available_features
1936 (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) { 1977 (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
1937 printk(KERN_ERR "EXT3-fs: Journal does not support " 1978 ext3_msg(sb, KERN_ERR,
1938 "requested data journaling mode\n"); 1979 "error: journal does not support "
1980 "requested data journaling mode");
1939 goto failed_mount4; 1981 goto failed_mount4;
1940 } 1982 }
1941 default: 1983 default:
@@ -1944,8 +1986,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1944 1986
1945 if (test_opt(sb, NOBH)) { 1987 if (test_opt(sb, NOBH)) {
1946 if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) { 1988 if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) {
1947 printk(KERN_WARNING "EXT3-fs: Ignoring nobh option - " 1989 ext3_msg(sb, KERN_WARNING,
1948 "its supported only with writeback mode\n"); 1990 "warning: ignoring nobh option - "
1991 "it is supported only with writeback mode");
1949 clear_opt(sbi->s_mount_opt, NOBH); 1992 clear_opt(sbi->s_mount_opt, NOBH);
1950 } 1993 }
1951 } 1994 }
@@ -1956,39 +1999,32 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1956 1999
1957 root = ext3_iget(sb, EXT3_ROOT_INO); 2000 root = ext3_iget(sb, EXT3_ROOT_INO);
1958 if (IS_ERR(root)) { 2001 if (IS_ERR(root)) {
1959 printk(KERN_ERR "EXT3-fs: get root inode failed\n"); 2002 ext3_msg(sb, KERN_ERR, "error: get root inode failed");
1960 ret = PTR_ERR(root); 2003 ret = PTR_ERR(root);
1961 goto failed_mount4; 2004 goto failed_mount4;
1962 } 2005 }
1963 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 2006 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
1964 iput(root); 2007 iput(root);
1965 printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n"); 2008 ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
1966 goto failed_mount4; 2009 goto failed_mount4;
1967 } 2010 }
1968 sb->s_root = d_alloc_root(root); 2011 sb->s_root = d_alloc_root(root);
1969 if (!sb->s_root) { 2012 if (!sb->s_root) {
1970 printk(KERN_ERR "EXT3-fs: get root dentry failed\n"); 2013 ext3_msg(sb, KERN_ERR, "error: get root dentry failed");
1971 iput(root); 2014 iput(root);
1972 ret = -ENOMEM; 2015 ret = -ENOMEM;
1973 goto failed_mount4; 2016 goto failed_mount4;
1974 } 2017 }
1975 2018
1976 ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); 2019 ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
1977 /* 2020
1978 * akpm: core read_super() calls in here with the superblock locked.
1979 * That deadlocks, because orphan cleanup needs to lock the superblock
1980 * in numerous places. Here we just pop the lock - it's relatively
1981 * harmless, because we are now ready to accept write_super() requests,
1982 * and aviro says that's the only reason for hanging onto the
1983 * superblock lock.
1984 */
1985 EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; 2021 EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
1986 ext3_orphan_cleanup(sb, es); 2022 ext3_orphan_cleanup(sb, es);
1987 EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; 2023 EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
1988 if (needs_recovery) 2024 if (needs_recovery)
1989 printk (KERN_INFO "EXT3-fs: recovery complete.\n"); 2025 ext3_msg(sb, KERN_INFO, "recovery complete");
1990 ext3_mark_recovery_complete(sb, es); 2026 ext3_mark_recovery_complete(sb, es);
1991 printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n", 2027 ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode",
1992 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": 2028 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
1993 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": 2029 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
1994 "writeback"); 2030 "writeback");
@@ -1998,7 +2034,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1998 2034
1999cantfind_ext3: 2035cantfind_ext3:
2000 if (!silent) 2036 if (!silent)
2001 printk(KERN_ERR "VFS: Can't find ext3 filesystem on dev %s.\n", 2037 ext3_msg(sb, KERN_INFO,
2038 "error: can't find ext3 filesystem on dev %s.",
2002 sb->s_id); 2039 sb->s_id);
2003 goto failed_mount; 2040 goto failed_mount;
2004 2041
@@ -2066,27 +2103,27 @@ static journal_t *ext3_get_journal(struct super_block *sb,
2066 2103
2067 journal_inode = ext3_iget(sb, journal_inum); 2104 journal_inode = ext3_iget(sb, journal_inum);
2068 if (IS_ERR(journal_inode)) { 2105 if (IS_ERR(journal_inode)) {
2069 printk(KERN_ERR "EXT3-fs: no journal found.\n"); 2106 ext3_msg(sb, KERN_ERR, "error: no journal found");
2070 return NULL; 2107 return NULL;
2071 } 2108 }
2072 if (!journal_inode->i_nlink) { 2109 if (!journal_inode->i_nlink) {
2073 make_bad_inode(journal_inode); 2110 make_bad_inode(journal_inode);
2074 iput(journal_inode); 2111 iput(journal_inode);
2075 printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n"); 2112 ext3_msg(sb, KERN_ERR, "error: journal inode is deleted");
2076 return NULL; 2113 return NULL;
2077 } 2114 }
2078 2115
2079 jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", 2116 jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
2080 journal_inode, journal_inode->i_size); 2117 journal_inode, journal_inode->i_size);
2081 if (!S_ISREG(journal_inode->i_mode)) { 2118 if (!S_ISREG(journal_inode->i_mode)) {
2082 printk(KERN_ERR "EXT3-fs: invalid journal inode.\n"); 2119 ext3_msg(sb, KERN_ERR, "error: invalid journal inode");
2083 iput(journal_inode); 2120 iput(journal_inode);
2084 return NULL; 2121 return NULL;
2085 } 2122 }
2086 2123
2087 journal = journal_init_inode(journal_inode); 2124 journal = journal_init_inode(journal_inode);
2088 if (!journal) { 2125 if (!journal) {
2089 printk(KERN_ERR "EXT3-fs: Could not load journal inode\n"); 2126 ext3_msg(sb, KERN_ERR, "error: could not load journal inode");
2090 iput(journal_inode); 2127 iput(journal_inode);
2091 return NULL; 2128 return NULL;
2092 } 2129 }
@@ -2108,13 +2145,13 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2108 struct ext3_super_block * es; 2145 struct ext3_super_block * es;
2109 struct block_device *bdev; 2146 struct block_device *bdev;
2110 2147
2111 bdev = ext3_blkdev_get(j_dev); 2148 bdev = ext3_blkdev_get(j_dev, sb);
2112 if (bdev == NULL) 2149 if (bdev == NULL)
2113 return NULL; 2150 return NULL;
2114 2151
2115 if (bd_claim(bdev, sb)) { 2152 if (bd_claim(bdev, sb)) {
2116 printk(KERN_ERR 2153 ext3_msg(sb, KERN_ERR,
2117 "EXT3: failed to claim external journal device.\n"); 2154 "error: failed to claim external journal device");
2118 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 2155 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
2119 return NULL; 2156 return NULL;
2120 } 2157 }
@@ -2122,8 +2159,8 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2122 blocksize = sb->s_blocksize; 2159 blocksize = sb->s_blocksize;
2123 hblock = bdev_logical_block_size(bdev); 2160 hblock = bdev_logical_block_size(bdev);
2124 if (blocksize < hblock) { 2161 if (blocksize < hblock) {
2125 printk(KERN_ERR 2162 ext3_msg(sb, KERN_ERR,
2126 "EXT3-fs: blocksize too small for journal device.\n"); 2163 "error: blocksize too small for journal device");
2127 goto out_bdev; 2164 goto out_bdev;
2128 } 2165 }
2129 2166
@@ -2131,8 +2168,8 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2131 offset = EXT3_MIN_BLOCK_SIZE % blocksize; 2168 offset = EXT3_MIN_BLOCK_SIZE % blocksize;
2132 set_blocksize(bdev, blocksize); 2169 set_blocksize(bdev, blocksize);
2133 if (!(bh = __bread(bdev, sb_block, blocksize))) { 2170 if (!(bh = __bread(bdev, sb_block, blocksize))) {
2134 printk(KERN_ERR "EXT3-fs: couldn't read superblock of " 2171 ext3_msg(sb, KERN_ERR, "error: couldn't read superblock of "
2135 "external journal\n"); 2172 "external journal");
2136 goto out_bdev; 2173 goto out_bdev;
2137 } 2174 }
2138 2175
@@ -2140,14 +2177,14 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2140 if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || 2177 if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
2141 !(le32_to_cpu(es->s_feature_incompat) & 2178 !(le32_to_cpu(es->s_feature_incompat) &
2142 EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { 2179 EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
2143 printk(KERN_ERR "EXT3-fs: external journal has " 2180 ext3_msg(sb, KERN_ERR, "error: external journal has "
2144 "bad superblock\n"); 2181 "bad superblock");
2145 brelse(bh); 2182 brelse(bh);
2146 goto out_bdev; 2183 goto out_bdev;
2147 } 2184 }
2148 2185
2149 if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { 2186 if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
2150 printk(KERN_ERR "EXT3-fs: journal UUID does not match\n"); 2187 ext3_msg(sb, KERN_ERR, "error: journal UUID does not match");
2151 brelse(bh); 2188 brelse(bh);
2152 goto out_bdev; 2189 goto out_bdev;
2153 } 2190 }
@@ -2159,19 +2196,21 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2159 journal = journal_init_dev(bdev, sb->s_bdev, 2196 journal = journal_init_dev(bdev, sb->s_bdev,
2160 start, len, blocksize); 2197 start, len, blocksize);
2161 if (!journal) { 2198 if (!journal) {
2162 printk(KERN_ERR "EXT3-fs: failed to create device journal\n"); 2199 ext3_msg(sb, KERN_ERR,
2200 "error: failed to create device journal");
2163 goto out_bdev; 2201 goto out_bdev;
2164 } 2202 }
2165 journal->j_private = sb; 2203 journal->j_private = sb;
2166 ll_rw_block(READ, 1, &journal->j_sb_buffer); 2204 ll_rw_block(READ, 1, &journal->j_sb_buffer);
2167 wait_on_buffer(journal->j_sb_buffer); 2205 wait_on_buffer(journal->j_sb_buffer);
2168 if (!buffer_uptodate(journal->j_sb_buffer)) { 2206 if (!buffer_uptodate(journal->j_sb_buffer)) {
2169 printk(KERN_ERR "EXT3-fs: I/O error on journal device\n"); 2207 ext3_msg(sb, KERN_ERR, "I/O error on journal device");
2170 goto out_journal; 2208 goto out_journal;
2171 } 2209 }
2172 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { 2210 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
2173 printk(KERN_ERR "EXT3-fs: External journal has more than one " 2211 ext3_msg(sb, KERN_ERR,
2174 "user (unsupported) - %d\n", 2212 "error: external journal has more than one "
2213 "user (unsupported) - %d",
2175 be32_to_cpu(journal->j_superblock->s_nr_users)); 2214 be32_to_cpu(journal->j_superblock->s_nr_users));
2176 goto out_journal; 2215 goto out_journal;
2177 } 2216 }
@@ -2197,8 +2236,8 @@ static int ext3_load_journal(struct super_block *sb,
2197 2236
2198 if (journal_devnum && 2237 if (journal_devnum &&
2199 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 2238 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2200 printk(KERN_INFO "EXT3-fs: external journal device major/minor " 2239 ext3_msg(sb, KERN_INFO, "external journal device major/minor "
2201 "numbers have changed\n"); 2240 "numbers have changed");
2202 journal_dev = new_decode_dev(journal_devnum); 2241 journal_dev = new_decode_dev(journal_devnum);
2203 } else 2242 } else
2204 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); 2243 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
@@ -2213,21 +2252,21 @@ static int ext3_load_journal(struct super_block *sb,
2213 2252
2214 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) { 2253 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) {
2215 if (sb->s_flags & MS_RDONLY) { 2254 if (sb->s_flags & MS_RDONLY) {
2216 printk(KERN_INFO "EXT3-fs: INFO: recovery " 2255 ext3_msg(sb, KERN_INFO,
2217 "required on readonly filesystem.\n"); 2256 "recovery required on readonly filesystem");
2218 if (really_read_only) { 2257 if (really_read_only) {
2219 printk(KERN_ERR "EXT3-fs: write access " 2258 ext3_msg(sb, KERN_ERR, "error: write access "
2220 "unavailable, cannot proceed.\n"); 2259 "unavailable, cannot proceed");
2221 return -EROFS; 2260 return -EROFS;
2222 } 2261 }
2223 printk (KERN_INFO "EXT3-fs: write access will " 2262 ext3_msg(sb, KERN_INFO,
2224 "be enabled during recovery.\n"); 2263 "write access will be enabled during recovery");
2225 } 2264 }
2226 } 2265 }
2227 2266
2228 if (journal_inum && journal_dev) { 2267 if (journal_inum && journal_dev) {
2229 printk(KERN_ERR "EXT3-fs: filesystem has both journal " 2268 ext3_msg(sb, KERN_ERR, "error: filesystem has both journal "
2230 "and inode journals!\n"); 2269 "and inode journals");
2231 return -EINVAL; 2270 return -EINVAL;
2232 } 2271 }
2233 2272
@@ -2242,7 +2281,7 @@ static int ext3_load_journal(struct super_block *sb,
2242 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 2281 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
2243 err = journal_update_format(journal); 2282 err = journal_update_format(journal);
2244 if (err) { 2283 if (err) {
2245 printk(KERN_ERR "EXT3-fs: error updating journal.\n"); 2284 ext3_msg(sb, KERN_ERR, "error updating journal");
2246 journal_destroy(journal); 2285 journal_destroy(journal);
2247 return err; 2286 return err;
2248 } 2287 }
@@ -2254,7 +2293,7 @@ static int ext3_load_journal(struct super_block *sb,
2254 err = journal_load(journal); 2293 err = journal_load(journal);
2255 2294
2256 if (err) { 2295 if (err) {
2257 printk(KERN_ERR "EXT3-fs: error loading journal.\n"); 2296 ext3_msg(sb, KERN_ERR, "error loading journal");
2258 journal_destroy(journal); 2297 journal_destroy(journal);
2259 return err; 2298 return err;
2260 } 2299 }
@@ -2273,16 +2312,17 @@ static int ext3_load_journal(struct super_block *sb,
2273 return 0; 2312 return 0;
2274} 2313}
2275 2314
2276static int ext3_create_journal(struct super_block * sb, 2315static int ext3_create_journal(struct super_block *sb,
2277 struct ext3_super_block * es, 2316 struct ext3_super_block *es,
 				   unsigned int journal_inum)
 {
 	journal_t *journal;
 	int err;
 
 	if (sb->s_flags & MS_RDONLY) {
-		printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to "
-		       "create journal.\n");
+		ext3_msg(sb, KERN_ERR,
+			"error: readonly filesystem when trying to "
+			"create journal");
 		return -EROFS;
 	}
 
@@ -2290,12 +2330,12 @@ static int ext3_create_journal(struct super_block * sb,
 	if (!journal)
 		return -EINVAL;
 
-	printk(KERN_INFO "EXT3-fs: creating new journal on inode %u\n",
-	       journal_inum);
+	ext3_msg(sb, KERN_INFO, "creating new journal on inode %u",
+		journal_inum);
 
 	err = journal_create(journal);
 	if (err) {
-		printk(KERN_ERR "EXT3-fs: error creating journal.\n");
+		ext3_msg(sb, KERN_ERR, "error creating journal");
 		journal_destroy(journal);
 		return -EIO;
 	}
@@ -2359,13 +2399,11 @@ static void ext3_mark_recovery_complete(struct super_block * sb,
 	if (journal_flush(journal) < 0)
 		goto out;
 
-	lock_super(sb);
 	if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
 	    sb->s_flags & MS_RDONLY) {
 		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
 		ext3_commit_super(sb, es, 1);
 	}
-	unlock_super(sb);
 
 out:
 	journal_unlock_updates(journal);
@@ -2376,8 +2414,8 @@ out:
  * has recorded an error from a previous lifetime, move that error to the
  * main filesystem now.
  */
-static void ext3_clear_journal_err(struct super_block * sb,
-				   struct ext3_super_block * es)
+static void ext3_clear_journal_err(struct super_block *sb,
+				   struct ext3_super_block *es)
 {
 	journal_t *journal;
 	int j_errno;
@@ -2524,11 +2562,11 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 		goto restore_opts;
 	}
 
-	if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
+	if (test_opt(sb, ABORT))
 		ext3_abort(sb, __func__, "Abort forced by user");
 
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-		((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
 
 	es = sbi->s_es;
 
@@ -2536,7 +2574,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 
 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
 	    n_blocks_count > le32_to_cpu(es->s_blocks_count)) {
-		if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) {
+		if (test_opt(sb, ABORT)) {
 			err = -EROFS;
 			goto restore_opts;
 		}
@@ -2557,21 +2595,15 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 			    (sbi->s_mount_state & EXT3_VALID_FS))
 				es->s_state = cpu_to_le16(sbi->s_mount_state);
 
-			/*
-			 * We have to unlock super so that we can wait for
-			 * transactions.
-			 */
-			unlock_super(sb);
 			ext3_mark_recovery_complete(sb, es);
-			lock_super(sb);
 		} else {
 			__le32 ret;
 			if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
					~EXT3_FEATURE_RO_COMPAT_SUPP))) {
-				printk(KERN_WARNING "EXT3-fs: %s: couldn't "
-				       "remount RDWR because of unsupported "
-				       "optional features (%x).\n",
-				       sb->s_id, le32_to_cpu(ret));
+				ext3_msg(sb, KERN_WARNING,
+					"warning: couldn't remount RDWR "
+					"because of unsupported optional "
+					"features (%x)", le32_to_cpu(ret));
 				err = -EROFS;
 				goto restore_opts;
 			}
@@ -2582,11 +2614,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 			 * require a full umount/remount for now.
 			 */
 			if (es->s_last_orphan) {
-				printk(KERN_WARNING "EXT3-fs: %s: couldn't "
+				ext3_msg(sb, KERN_WARNING, "warning: couldn't "
 				       "remount RDWR because of unprocessed "
 				       "orphan inode list. Please "
-				       "umount/remount instead.\n",
-				       sb->s_id);
+				       "umount/remount instead.");
 				err = -EINVAL;
 				goto restore_opts;
 			}
@@ -2686,13 +2717,11 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
 	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
-	es->s_free_blocks_count = cpu_to_le32(buf->f_bfree);
 	buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
 	if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
 		buf->f_bavail = 0;
 	buf->f_files = le32_to_cpu(es->s_inodes_count);
 	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
-	es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
 	buf->f_namelen = EXT3_NAME_LEN;
 	fsid = le64_to_cpup((void *)es->s_uuid) ^
 	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
@@ -2706,7 +2735,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
  * Process 1                         Process 2
  * ext3_create()                     quota_sync()
  *   journal_start()                   write_dquot()
- *   vfs_dq_init()                       down(dqio_mutex)
+ *   dquot_initialize()                  down(dqio_mutex)
  *     down(dqio_mutex)                    journal_start()
  *
  */
@@ -2837,9 +2866,9 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
 	if (EXT3_SB(sb)->s_qf_names[type]) {
 		/* Quotafile not of fs root? */
 		if (path.dentry->d_parent != sb->s_root)
-			printk(KERN_WARNING
-				"EXT3-fs: Quota file not on filesystem root. "
-				"Journaled quota will not work.\n");
+			ext3_msg(sb, KERN_WARNING,
+				"warning: Quota file not on filesystem root. "
+				"Journaled quota will not work.");
 	}
 
 	/*
@@ -2914,65 +2943,65 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
 	sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
 	int err = 0;
 	int offset = off & (sb->s_blocksize - 1);
-	int tocopy;
 	int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL;
-	size_t towrite = len;
 	struct buffer_head *bh;
 	handle_t *handle = journal_current_handle();
 
 	if (!handle) {
-		printk(KERN_WARNING "EXT3-fs: Quota write (off=%Lu, len=%Lu)"
-			" cancelled because transaction is not started.\n",
+		ext3_msg(sb, KERN_WARNING,
+			"warning: quota write (off=%llu, len=%llu)"
+			" cancelled because transaction is not started.",
+			(unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
+
+	/*
+	 * Since we account only one data block in transaction credits,
+	 * then it is impossible to cross a block boundary.
+	 */
+	if (sb->s_blocksize - offset < len) {
+		ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+			" cancelled because not block aligned",
 			(unsigned long long)off, (unsigned long long)len);
 		return -EIO;
 	}
 	mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
-	while (towrite > 0) {
-		tocopy = sb->s_blocksize - offset < towrite ?
-				sb->s_blocksize - offset : towrite;
-		bh = ext3_bread(handle, inode, blk, 1, &err);
-		if (!bh)
+	bh = ext3_bread(handle, inode, blk, 1, &err);
+	if (!bh)
+		goto out;
+	if (journal_quota) {
+		err = ext3_journal_get_write_access(handle, bh);
+		if (err) {
+			brelse(bh);
 			goto out;
-		if (journal_quota) {
-			err = ext3_journal_get_write_access(handle, bh);
-			if (err) {
-				brelse(bh);
-				goto out;
-			}
-		}
-		lock_buffer(bh);
-		memcpy(bh->b_data+offset, data, tocopy);
-		flush_dcache_page(bh->b_page);
-		unlock_buffer(bh);
-		if (journal_quota)
-			err = ext3_journal_dirty_metadata(handle, bh);
-		else {
-			/* Always do at least ordered writes for quotas */
-			err = ext3_journal_dirty_data(handle, bh);
-			mark_buffer_dirty(bh);
 		}
-		brelse(bh);
-		if (err)
-			goto out;
-		offset = 0;
-		towrite -= tocopy;
-		data += tocopy;
-		blk++;
 	}
+	lock_buffer(bh);
+	memcpy(bh->b_data+offset, data, len);
+	flush_dcache_page(bh->b_page);
+	unlock_buffer(bh);
+	if (journal_quota)
+		err = ext3_journal_dirty_metadata(handle, bh);
+	else {
+		/* Always do at least ordered writes for quotas */
+		err = ext3_journal_dirty_data(handle, bh);
+		mark_buffer_dirty(bh);
+	}
+	brelse(bh);
 out:
-	if (len == towrite) {
+	if (err) {
 		mutex_unlock(&inode->i_mutex);
 		return err;
 	}
-	if (inode->i_size < off+len-towrite) {
-		i_size_write(inode, off+len-towrite);
+	if (inode->i_size < off + len) {
+		i_size_write(inode, off + len);
 		EXT3_I(inode)->i_disksize = inode->i_size;
 	}
 	inode->i_version++;
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	ext3_mark_inode_dirty(handle, inode);
 	mutex_unlock(&inode->i_mutex);
-	return len - towrite;
+	return len;
 }
 
 #endif
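The printk() to ext3_msg() conversions above all route messages through one helper so every line carries the "EXT3-fs (device):" prefix and a trailing newline, which is why the converted call sites drop both the explicit sb->s_id argument and the "\n". The helper's definition is outside these hunks; a minimal sketch of the usual shape of such a wrapper, assuming the conventional va_list/vprintk form:

	/* Sketch only; the real definition lives elsewhere in fs/ext3/super.c. */
	void ext3_msg(struct super_block *sb, const char *prefix,
		      const char *fmt, ...)
	{
		va_list args;

		va_start(args, fmt);
		printk("%sEXT3-fs (%s): ", prefix, sb->s_id);
		vprintk(fmt, args);
		printk("\n");
		va_end(args);
	}

Separately, the rewritten ext3_quota_write() can get away with a single ext3_bread()/memcpy() because the new alignment check rejects any write that would cross a block boundary, which is all the one data block accounted in the transaction credits can cover.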
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 545e37c4b91e..534a94c3a933 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -99,7 +99,7 @@ static struct buffer_head *ext3_xattr_cache_find(struct inode *,
 						 struct mb_cache_entry **);
 static void ext3_xattr_rehash(struct ext3_xattr_header *,
 			      struct ext3_xattr_entry *);
-static int ext3_xattr_list(struct inode *inode, char *buffer,
+static int ext3_xattr_list(struct dentry *dentry, char *buffer,
 			   size_t buffer_size);
 
 static struct mb_cache *ext3_xattr_cache;
@@ -147,7 +147,7 @@ ext3_xattr_handler(int name_index)
 ssize_t
 ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
 {
-	return ext3_xattr_list(dentry->d_inode, buffer, size);
+	return ext3_xattr_list(dentry, buffer, size);
 }
 
 static int
@@ -274,7 +274,7 @@ ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
 	void *end;
 	int error;
 
-	if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR))
+	if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
 		return -ENODATA;
 	error = ext3_get_inode_loc(inode, &iloc);
 	if (error)
@@ -332,7 +332,7 @@ ext3_xattr_get(struct inode *inode, int name_index, const char *name,
 }
 
 static int
-ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry,
+ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry,
 			char *buffer, size_t buffer_size)
 {
 	size_t rest = buffer_size;
@@ -342,9 +342,10 @@ ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry,
 			ext3_xattr_handler(entry->e_name_index);
 
 		if (handler) {
-			size_t size = handler->list(inode, buffer, rest,
+			size_t size = handler->list(dentry, buffer, rest,
 						    entry->e_name,
-						    entry->e_name_len);
+						    entry->e_name_len,
+						    handler->flags);
 			if (buffer) {
 				if (size > rest)
 					return -ERANGE;
@@ -357,8 +358,9 @@ ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry,
 }
 
 static int
-ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
+ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
+	struct inode *inode = dentry->d_inode;
 	struct buffer_head *bh = NULL;
 	int error;
 
@@ -383,7 +385,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
 		goto cleanup;
 	}
 	ext3_xattr_cache_insert(bh);
-	error = ext3_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size);
+	error = ext3_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
 
 cleanup:
 	brelse(bh);
@@ -392,15 +394,16 @@ cleanup:
 }
 
 static int
-ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
+ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
+	struct inode *inode = dentry->d_inode;
 	struct ext3_xattr_ibody_header *header;
 	struct ext3_inode *raw_inode;
 	struct ext3_iloc iloc;
 	void *end;
 	int error;
 
-	if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR))
+	if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
 		return 0;
 	error = ext3_get_inode_loc(inode, &iloc);
 	if (error)
@@ -411,7 +414,7 @@ ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
 	error = ext3_xattr_check_names(IFIRST(header), end);
 	if (error)
 		goto cleanup;
-	error = ext3_xattr_list_entries(inode, IFIRST(header),
+	error = ext3_xattr_list_entries(dentry, IFIRST(header),
 					buffer, buffer_size);
 
 cleanup:
@@ -430,12 +433,12 @@ cleanup:
 * used / required on success.
 */
 static int
-ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
+ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
 	int i_error, b_error;
 
-	down_read(&EXT3_I(inode)->xattr_sem);
-	i_error = ext3_xattr_ibody_list(inode, buffer, buffer_size);
+	down_read(&EXT3_I(dentry->d_inode)->xattr_sem);
+	i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size);
 	if (i_error < 0) {
 		b_error = 0;
 	} else {
@@ -443,11 +446,11 @@ ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
 			buffer += i_error;
 			buffer_size -= i_error;
 		}
-		b_error = ext3_xattr_block_list(inode, buffer, buffer_size);
+		b_error = ext3_xattr_block_list(dentry, buffer, buffer_size);
 		if (b_error < 0)
 			i_error = 0;
 	}
-	up_read(&EXT3_I(inode)->xattr_sem);
+	up_read(&EXT3_I(dentry->d_inode)->xattr_sem);
 	return i_error + b_error;
 }
 
@@ -497,7 +500,7 @@ ext3_xattr_release_block(handle_t *handle, struct inode *inode,
 		error = ext3_journal_dirty_metadata(handle, bh);
 		if (IS_SYNC(inode))
 			handle->h_sync = 1;
-		vfs_dq_free_block(inode, 1);
+		dquot_free_block(inode, 1);
 		ea_bdebug(bh, "refcount now=%d; releasing",
 			  le32_to_cpu(BHDR(bh)->h_refcount));
 		if (ce)
@@ -772,8 +775,8 @@ inserted:
 			else {
 				/* The old block is released after updating
 				   the inode. */
-				error = -EDQUOT;
-				if (vfs_dq_alloc_block(inode, 1))
+				error = dquot_alloc_block(inode, 1);
+				if (error)
 					goto cleanup;
 				error = ext3_journal_get_write_access(handle,
 								      new_bh);
@@ -847,7 +850,7 @@ cleanup:
 	return error;
 
 cleanup_dquot:
-	vfs_dq_free_block(inode, 1);
+	dquot_free_block(inode, 1);
 	goto cleanup;
 
 bad_block:
@@ -879,7 +882,7 @@ ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i,
 	is->s.base = is->s.first = IFIRST(header);
 	is->s.here = is->s.first;
 	is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
-	if (EXT3_I(inode)->i_state & EXT3_STATE_XATTR) {
+	if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) {
 		error = ext3_xattr_check_names(IFIRST(header), is->s.end);
 		if (error)
 			return error;
@@ -911,10 +914,10 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
 	header = IHDR(inode, ext3_raw_inode(&is->iloc));
 	if (!IS_LAST_ENTRY(s->first)) {
 		header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
-		EXT3_I(inode)->i_state |= EXT3_STATE_XATTR;
+		ext3_set_inode_state(inode, EXT3_STATE_XATTR);
 	} else {
 		header->h_magic = cpu_to_le32(0);
-		EXT3_I(inode)->i_state &= ~EXT3_STATE_XATTR;
+		ext3_clear_inode_state(inode, EXT3_STATE_XATTR);
 	}
 	return 0;
 }
@@ -960,10 +963,14 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	if (error)
 		goto cleanup;
 
-	if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) {
+	error = ext3_journal_get_write_access(handle, is.iloc.bh);
+	if (error)
+		goto cleanup;
+
+	if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) {
 		struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc);
 		memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
-		EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW;
+		ext3_clear_inode_state(inode, EXT3_STATE_NEW);
 	}
 
 	error = ext3_xattr_ibody_find(inode, &i, &is);
@@ -985,9 +992,6 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		if (flags & XATTR_CREATE)
 			goto cleanup;
 	}
-	error = ext3_journal_get_write_access(handle, is.iloc.bh);
-	if (error)
-		goto cleanup;
 	if (!value) {
 		if (!is.s.not_found)
 			error = ext3_xattr_ibody_set(handle, inode, &i, &is);
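These xattr hunks track a tree-wide VFS change: every xattr_handler callback now receives the dentry, plus the handler's flags value as a trailing type argument, instead of a bare inode. The shape of the callback table the handlers below implement, sketched from the signatures visible in this patch (see include/linux/xattr.h for the authoritative definition):

	/* Sketched from the signatures in this patch, not copied from it. */
	struct xattr_handler {
		const char *prefix;
		int flags;	/* private to the handler, e.g. an ACL type */
		size_t (*list)(struct dentry *dentry, char *list,
			       size_t list_size, const char *name,
			       size_t name_len, int handler_flags);
		int (*get)(struct dentry *dentry, const char *name,
			   void *buffer, size_t size, int handler_flags);
		int (*set)(struct dentry *dentry, const char *name,
			   const void *value, size_t size, int flags,
			   int handler_flags);
	};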
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 37b81097bdf2..3af91f476dff 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -4,6 +4,7 @@
 */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/ext3_jbd.h>
@@ -12,8 +13,8 @@
 #include "xattr.h"
 
 static size_t
-ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size,
-			 const char *name, size_t name_len)
+ext3_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
			 const char *name, size_t name_len, int type)
 {
 	const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
@@ -28,23 +29,23 @@ ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size,
 }
 
 static int
-ext3_xattr_security_get(struct inode *inode, const char *name,
-		void *buffer, size_t size)
+ext3_xattr_security_get(struct dentry *dentry, const char *name,
+		void *buffer, size_t size, int type)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	return ext3_xattr_get(inode, EXT3_XATTR_INDEX_SECURITY, name,
-			      buffer, size);
+	return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY,
+			      name, buffer, size);
 }
 
 static int
-ext3_xattr_security_set(struct inode *inode, const char *name,
-		const void *value, size_t size, int flags)
+ext3_xattr_security_set(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags, int type)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	return ext3_xattr_set(inode, EXT3_XATTR_INDEX_SECURITY, name,
-			      value, size, flags);
+	return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY,
+			      name, value, size, flags);
 }
 
 int
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index c7c41a410c4b..e5562845ed96 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -14,8 +14,8 @@
 #include "xattr.h"
 
 static size_t
-ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
-			const char *name, size_t name_len)
+ext3_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
+			const char *name, size_t name_len, int type)
 {
 	const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
@@ -32,22 +32,22 @@ ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
 }
 
 static int
-ext3_xattr_trusted_get(struct inode *inode, const char *name,
-		       void *buffer, size_t size)
+ext3_xattr_trusted_get(struct dentry *dentry, const char *name,
+		       void *buffer, size_t size, int type)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	return ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, name,
-			      buffer, size);
+	return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED,
+			      name, buffer, size);
 }
 
 static int
-ext3_xattr_trusted_set(struct inode *inode, const char *name,
-		       const void *value, size_t size, int flags)
+ext3_xattr_trusted_set(struct dentry *dentry, const char *name,
+		       const void *value, size_t size, int flags, int type)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	return ext3_xattr_set(inode, EXT3_XATTR_INDEX_TRUSTED, name,
+	return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, name,
 			      value, size, flags);
 }
 
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 430fe63b31b3..3bcfe9ee0a68 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -13,13 +13,13 @@
 #include "xattr.h"
 
 static size_t
-ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size,
-		     const char *name, size_t name_len)
+ext3_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
+		     const char *name, size_t name_len, int type)
 {
 	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
 
-	if (!test_opt(inode->i_sb, XATTR_USER))
+	if (!test_opt(dentry->d_sb, XATTR_USER))
 		return 0;
 
 	if (list && total_len <= list_size) {
@@ -31,26 +31,27 @@ ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size,
 }
 
 static int
-ext3_xattr_user_get(struct inode *inode, const char *name,
-		    void *buffer, size_t size)
+ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer,
+		    size_t size, int type)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	if (!test_opt(inode->i_sb, XATTR_USER))
+	if (!test_opt(dentry->d_sb, XATTR_USER))
 		return -EOPNOTSUPP;
-	return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, buffer, size);
+	return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_USER,
+			      name, buffer, size);
 }
 
 static int
-ext3_xattr_user_set(struct inode *inode, const char *name,
-		    const void *value, size_t size, int flags)
+ext3_xattr_user_set(struct dentry *dentry, const char *name,
+		    const void *value, size_t size, int flags, int type)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	if (!test_opt(inode->i_sb, XATTR_USER))
+	if (!test_opt(dentry->d_sb, XATTR_USER))
 		return -EOPNOTSUPP;
-	return ext3_xattr_set(inode, EXT3_XATTR_INDEX_USER, name,
-			      value, size, flags);
+	return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_USER,
+			      name, value, size, flags);
 }
 
 struct xattr_handler ext3_xattr_user_handler = {
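For reference, a minimal namespace handler under the converted API might look like the following (a hypothetical "demo." namespace, sketch only; the prefix/total_len conventions are copied from the handlers above):

	#define DEMO_PREFIX "demo."
	#define DEMO_PREFIX_LEN (sizeof(DEMO_PREFIX) - 1)

	/* Hypothetical handler: advertises "demo.<name>" in listxattr(). */
	static size_t demo_xattr_list(struct dentry *dentry, char *list,
				      size_t list_size, const char *name,
				      size_t name_len, int type)
	{
		const size_t total_len = DEMO_PREFIX_LEN + name_len + 1;

		if (list && total_len <= list_size) {
			memcpy(list, DEMO_PREFIX, DEMO_PREFIX_LEN);
			memcpy(list + DEMO_PREFIX_LEN, name, name_len);
			list[total_len - 1] = '\0';
		}
		return total_len;
	}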
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 9f2d45d75b1a..9ed1bb1f319f 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -26,6 +26,17 @@ config EXT4_FS
 
 	  If unsure, say N.
 
+config EXT4_USE_FOR_EXT23
+	bool "Use ext4 for ext2/ext3 file systems"
+	depends on EXT4_FS
+	depends on EXT3_FS=n || EXT2_FS=n
+	default y
+	help
+	  Allow the ext4 file system driver code to be used for ext2 or
+	  ext3 file system mounts.  This allows users to reduce their
+	  compiled kernel size by using one file system driver for
+	  ext2, ext3, and ext4 file systems.
+
 config EXT4_FS_XATTR
 	bool "Ext4 extended attributes"
 	depends on EXT4_FS
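The new EXT4_USE_FOR_EXT23 option is only selectable when at least one of the older drivers is compiled out, as the second depends line enforces. An illustrative .config fragment (an assumed configuration, not part of this patch) that satisfies it:

	CONFIG_EXT4_FS=y
	CONFIG_EXT4_USE_FOR_EXT23=y
	# CONFIG_EXT2_FS is not set
	# CONFIG_EXT3_FS is not set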
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 0df88b2a69b0..8a2a29d35a6f 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -364,12 +364,12 @@ out:
 * Extended attribute handlers
 */
 static size_t
-ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
-			   const char *name, size_t name_len)
+ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len,
+			   const char *name, size_t name_len, int type)
 {
 	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
 
-	if (!test_opt(inode->i_sb, POSIX_ACL))
+	if (!test_opt(dentry->d_sb, POSIX_ACL))
 		return 0;
 	if (list && size <= list_len)
 		memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
@@ -377,12 +377,12 @@ ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
 }
 
 static size_t
-ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
-			    const char *name, size_t name_len)
+ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len,
+			    const char *name, size_t name_len, int type)
 {
 	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
 
-	if (!test_opt(inode->i_sb, POSIX_ACL))
+	if (!test_opt(dentry->d_sb, POSIX_ACL))
 		return 0;
 	if (list && size <= list_len)
 		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
@@ -390,15 +390,18 @@ ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
 }
 
 static int
-ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
+ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
+		   size_t size, int type)
 {
 	struct posix_acl *acl;
 	int error;
 
-	if (!test_opt(inode->i_sb, POSIX_ACL))
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	if (!test_opt(dentry->d_sb, POSIX_ACL))
 		return -EOPNOTSUPP;
 
-	acl = ext4_get_acl(inode, type);
+	acl = ext4_get_acl(dentry->d_inode, type);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
 	if (acl == NULL)
@@ -410,31 +413,16 @@ ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
 }
 
 static int
-ext4_xattr_get_acl_access(struct inode *inode, const char *name,
-			  void *buffer, size_t size)
-{
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	return ext4_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
-}
-
-static int
-ext4_xattr_get_acl_default(struct inode *inode, const char *name,
-			   void *buffer, size_t size)
-{
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	return ext4_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
-}
-
-static int
-ext4_xattr_set_acl(struct inode *inode, int type, const void *value,
-		   size_t size)
+ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
+		   size_t size, int flags, int type)
 {
+	struct inode *inode = dentry->d_inode;
 	handle_t *handle;
 	struct posix_acl *acl;
 	int error, retries = 0;
 
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return -EOPNOTSUPP;
 	if (!is_owner_or_cap(inode))
@@ -466,34 +454,18 @@ release_and_out:
 	return error;
 }
 
-static int
-ext4_xattr_set_acl_access(struct inode *inode, const char *name,
-			  const void *value, size_t size, int flags)
-{
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	return ext4_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
-}
-
-static int
-ext4_xattr_set_acl_default(struct inode *inode, const char *name,
-			   const void *value, size_t size, int flags)
-{
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	return ext4_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
-}
-
 struct xattr_handler ext4_xattr_acl_access_handler = {
 	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.flags	= ACL_TYPE_ACCESS,
 	.list	= ext4_xattr_list_acl_access,
-	.get	= ext4_xattr_get_acl_access,
-	.set	= ext4_xattr_set_acl_access,
+	.get	= ext4_xattr_get_acl,
+	.set	= ext4_xattr_set_acl,
 };
 
 struct xattr_handler ext4_xattr_acl_default_handler = {
 	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.flags	= ACL_TYPE_DEFAULT,
 	.list	= ext4_xattr_list_acl_default,
-	.get	= ext4_xattr_get_acl_default,
-	.set	= ext4_xattr_set_acl_default,
+	.get	= ext4_xattr_get_acl,
+	.set	= ext4_xattr_set_acl,
};
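Folding the four access/default wrappers into a single ext4_xattr_get_acl()/ext4_xattr_set_acl() pair works because the generic xattr code hands handler->flags back to the callback as its trailing type argument, so the ACL type rides along with the handler. A sketch of that caller-side dispatch, with demo_* names standing in for the generic VFS code (an assumption about the caller, not part of this patch):

	/* Sketch: one callback serves both handlers because each handler's
	 * .flags (ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT) is forwarded as the
	 * trailing 'type' argument. */
	static int demo_dispatch_get(struct xattr_handler *handler,
				     struct dentry *dentry, const char *suffix,
				     void *buffer, size_t size)
	{
		return handler->get(dentry, suffix, buffer, size,
				    handler->flags);
	}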
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1d0418980f8d..d2f37a5516c7 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -97,8 +97,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 	/* If checksum is bad mark all blocks used to prevent allocation
 	 * essentially implementing a per-group read-only flag. */
 	if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
-		ext4_error(sb, __func__,
-			  "Checksum bad for group %u", block_group);
+		ext4_error(sb, "Checksum bad for group %u",
+			   block_group);
 		ext4_free_blks_set(sb, gdp, 0);
 		ext4_free_inodes_set(sb, gdp, 0);
 		ext4_itable_unused_set(sb, gdp, 0);
@@ -130,8 +130,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 * to make sure we calculate the right free blocks
 		 */
 		group_blocks = ext4_blocks_count(sbi->s_es) -
-			le32_to_cpu(sbi->s_es->s_first_data_block) -
-			(EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
+			ext4_group_first_block_no(sb, ngroups - 1);
 	} else {
 		group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
 	}
@@ -189,9 +188,6 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 * when a file system is mounted (see ext4_fill_super).
 */
 
-
-#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
-
 /**
 * ext4_get_group_desc() -- load group descriptor from disk
 * @sb:			super block
@@ -210,10 +206,8 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	if (block_group >= ngroups) {
-		ext4_error(sb, "ext4_get_group_desc",
-			   "block_group >= groups_count - "
-			   "block_group = %u, groups_count = %u",
-			   block_group, ngroups);
+		ext4_error(sb, "block_group >= groups_count - block_group = %u,"
+			   " groups_count = %u", block_group, ngroups);
 
 		return NULL;
 	}
@@ -221,8 +215,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
 	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
 	if (!sbi->s_group_desc[group_desc]) {
-		ext4_error(sb, "ext4_get_group_desc",
-			   "Group descriptor not loaded - "
+		ext4_error(sb, "Group descriptor not loaded - "
 			   "block_group = %u, group_desc = %u, desc = %u",
 			   block_group, group_desc, offset);
 		return NULL;
@@ -282,9 +275,7 @@ static int ext4_valid_block_bitmap(struct super_block *sb,
 	return 1;
 
 err_out:
-	ext4_error(sb, __func__,
-			"Invalid block bitmap - "
-			"block_group = %d, block = %llu",
+	ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu",
 			block_group, bitmap_blk);
 	return 0;
 }
@@ -311,8 +302,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	bitmap_blk = ext4_block_bitmap(sb, desc);
 	bh = sb_getblk(sb, bitmap_blk);
 	if (unlikely(!bh)) {
-		ext4_error(sb, __func__,
-			    "Cannot read block bitmap - "
+		ext4_error(sb, "Cannot read block bitmap - "
 			    "block_group = %u, block_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
@@ -354,8 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 		set_bitmap_uptodate(bh);
 		if (bh_submit_read(bh) < 0) {
 			put_bh(bh);
-			ext4_error(sb, __func__,
-				    "Cannot read block bitmap - "
+			ext4_error(sb, "Cannot read block bitmap - "
 				    "block_group = %u, block_bitmap = %llu",
 				    block_group, bitmap_blk);
 			return NULL;
@@ -419,8 +408,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
 	    in_range(block + count - 1, ext4_inode_table(sb, desc),
 		     sbi->s_itb_per_group)) {
-		ext4_error(sb, __func__,
-			   "Adding blocks in system zones - "
+		ext4_error(sb, "Adding blocks in system zones - "
 			   "Block = %llu, count = %lu",
 			   block, count);
 		goto error_return;
@@ -453,8 +441,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 		BUFFER_TRACE(bitmap_bh, "clear bit");
 		if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
 					   bit + i, bitmap_bh->b_data)) {
-			ext4_error(sb, __func__,
-				   "bit already cleared for block %llu",
+			ext4_error(sb, "bit already cleared for block %llu",
 				   (ext4_fsblk_t)(block + i));
 			BUFFER_TRACE(bitmap_bh, "bit already cleared");
 		} else {
@@ -499,44 +486,6 @@ error_return:
 }
 
 /**
- * ext4_free_blocks() -- Free given blocks and update quota
- * @handle:		handle for this transaction
- * @inode:		inode
- * @block:		start physical block to free
- * @count:		number of blocks to count
- * @metadata:		Are these metadata blocks
- */
-void ext4_free_blocks(handle_t *handle, struct inode *inode,
-		      ext4_fsblk_t block, unsigned long count,
-		      int metadata)
-{
-	struct super_block *sb;
-	unsigned long dquot_freed_blocks;
-
-	/* this isn't the right place to decide whether block is metadata
-	 * inode.c/extents.c knows better, but for safety ... */
-	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		metadata = 1;
-
-	/* We need to make sure we don't reuse
-	 * block released untill the transaction commit.
-	 * writeback mode have weak data consistency so
-	 * don't force data as metadata when freeing block
-	 * for writeback mode.
-	 */
-	if (metadata == 0 && !ext4_should_writeback_data(inode))
-		metadata = 1;
-
-	sb = inode->i_sb;
-
-	ext4_mb_free_blocks(handle, inode, block, count,
-			    metadata, &dquot_freed_blocks);
-	if (dquot_freed_blocks)
-		vfs_dq_free_block(inode, dquot_freed_blocks);
-	return;
-}
-
-/**
 * ext4_has_free_blocks()
 * @sbi:	in-core super block structure.
 * @nblocks:	number of needed blocks
@@ -761,7 +710,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
 static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
 					ext4_group_t group)
 {
-	return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0;
+	if (!ext4_bg_has_super(sb, group))
+		return 0;
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
+		return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+	else
+		return EXT4_SB(sb)->s_gdb_count;
 }
 
 /**
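Removing this wrapper is paired with the ext4.h change later in the patch, where ext4_free_blocks() is re-exported from the mballoc code with a flags word in place of the old int metadata argument. A hypothetical call site under the new signature (flag names from the EXT4_FREE_BLOCKS_* defines added in ext4.h, sketch only):

	/* Hypothetical call site: free 'count' metadata blocks and forget any
	 * buffers cached for them, instead of passing metadata = 1. */
	ext4_free_blocks(handle, inode, bh, block, count,
			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);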
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 50784ef07563..538c48655084 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -16,9 +16,9 @@
 #include <linux/module.h>
 #include <linux/swap.h>
 #include <linux/pagemap.h>
-#include <linux/version.h>
 #include <linux/blkdev.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include "ext4.h"
 
 struct ext4_system_zone {
@@ -160,7 +160,7 @@ int ext4_setup_system_zone(struct super_block *sb)
 		if (ext4_bg_has_super(sb, i) &&
 		    ((i < 5) || ((i % flex_size) == 0)))
 			add_system_zone(sbi, ext4_group_first_block_no(sb, i),
-					sbi->s_gdb_count + 1);
+					ext4_bg_num_gdb(sb, i) + 1);
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
 		if (ret)
@@ -206,14 +206,14 @@ void ext4_release_system_zone(struct super_block *sb)
 		entry = rb_entry(n, struct ext4_system_zone, node);
 		kmem_cache_free(ext4_system_zone_cachep, entry);
 		if (!parent)
-			EXT4_SB(sb)->system_blks.rb_node = NULL;
+			EXT4_SB(sb)->system_blks = RB_ROOT;
 		else if (parent->rb_left == n)
 			parent->rb_left = NULL;
 		else if (parent->rb_right == n)
 			parent->rb_right = NULL;
 		n = parent;
 	}
-	EXT4_SB(sb)->system_blks.rb_node = NULL;
+	EXT4_SB(sb)->system_blks = RB_ROOT;
 }
 
 /*
@@ -228,6 +228,7 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
 	struct rb_node *n = sbi->system_blks.rb_node;
 
 	if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+	    (start_blk + count < start_blk) ||
 	    (start_blk + count > ext4_blocks_count(sbi->s_es)))
 		return 0;
 	while (n) {
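The new (start_blk + count < start_blk) test in ext4_data_block_valid() catches unsigned wraparound that the upper-bound comparison alone would miss. A worked illustration:

	/* With a corrupted extent near the top of the 64-bit range,
	 * start_blk + count wraps to a small value and would sneak past
	 * (start_blk + count > ext4_blocks_count(...)); the wraparound
	 * test rejects it. */
	ext4_fsblk_t start_blk = ~0ULL - 10;	/* bogus block number */
	unsigned int count = 100;		/* start_blk + count == 89 */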
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 9dc93168e262..86cb6d86a048 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,10 +83,12 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
 		error_msg = "inode out of bounds";
 
 	if (error_msg != NULL)
-		ext4_error(dir->i_sb, function,
-			"bad entry in directory #%lu: %s - "
-			"offset=%u, inode=%u, rec_len=%d, name_len=%d",
-			dir->i_ino, error_msg, offset,
+		__ext4_error(dir->i_sb, function,
+			"bad entry in directory #%lu: %s - block=%llu"
+			"offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
+			dir->i_ino, error_msg,
+			(unsigned long long) bh->b_blocknr,
+			(unsigned) (offset%bh->b_size), offset,
 			le32_to_cpu(de->inode),
 			rlen, de->name_len);
 	return error_msg == NULL ? 1 : 0;
@@ -150,7 +152,7 @@ static int ext4_readdir(struct file *filp,
 		 */
 		if (!bh) {
 			if (!dir_has_error) {
-				ext4_error(sb, __func__, "directory #%lu "
+				ext4_error(sb, "directory #%lu "
					   "contains a hole at offset %Lu",
					   inode->i_ino,
					   (unsigned long long) filp->f_pos);
@@ -303,7 +305,7 @@ static void free_rb_tree_fname(struct rb_root *root)
 		kfree(old);
 	}
 	if (!parent)
-		root->rb_node = NULL;
+		*root = RB_ROOT;
 	else if (parent->rb_left == n)
 		parent->rb_left = NULL;
 	else if (parent->rb_right == n)
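The RB_ROOT hunks here and in block_validity.c replace field-by-field NULLing of struct rb_root with its initializer, so the reset stays correct even if the struct grows. The idiomatic teardown-and-reset pattern looks roughly like this (demo_node is a hypothetical stand-in type, sketch only; the code above uses a manual postorder walk instead):

	struct demo_node {
		struct rb_node node;
		/* payload ... */
	};

	/* Sketch: free every node, then reset the root in one assignment. */
	static void demo_free_tree(struct rb_root *root)
	{
		struct rb_node *n;

		while ((n = rb_first(root)) != NULL) {
			rb_erase(n, root);
			kfree(rb_entry(n, struct demo_node, node));
		}
		*root = RB_ROOT;
	}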
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8825515eeddd..bf938cf7c5f0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -53,6 +53,12 @@
 #define ext4_debug(f, a...)	do {} while (0)
 #endif
 
+#define EXT4_ERROR_INODE(inode, fmt, a...) \
+	ext4_error_inode(__func__, (inode), (fmt), ## a);
+
+#define EXT4_ERROR_FILE(file, fmt, a...)	\
+	ext4_error_file(__func__, (file), (fmt), ## a);
+
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
 
@@ -133,14 +139,14 @@ struct mpage_da_data {
 	int pages_written;
 	int retval;
 };
-#define	DIO_AIO_UNWRITTEN	0x1
+#define	EXT4_IO_UNWRITTEN	0x1
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished AIO list */
 	struct inode		*inode;		/* file being written to */
 	unsigned int		flag;		/* unwritten or not */
-	int			error;		/* I/O error code */
-	ext4_lblk_t		offset;		/* offset in the file */
-	size_t			size;		/* size of the extent */
+	struct page		*page;		/* page struct for buffer write */
+	loff_t			offset;		/* offset in the file */
+	ssize_t			size;		/* size of the extent */
 	struct work_struct	work;		/* data work queue */
 } ext4_io_end_t;
 
@@ -284,10 +290,12 @@ struct flex_groups {
 #define EXT4_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
 #define EXT4_HUGE_FILE_FL		0x00040000 /* Set to each huge file */
 #define EXT4_EXTENTS_FL			0x00080000 /* Inode uses extents */
+#define EXT4_EA_INODE_FL		0x00200000 /* Inode used for large EA */
+#define EXT4_EOFBLOCKS_FL		0x00400000 /* Blocks allocated beyond EOF */
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
-#define EXT4_FL_USER_VISIBLE		0x000BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE	0x000B80FF /* User modifiable flags */
+#define EXT4_FL_USER_VISIBLE		0x004BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE	0x004B80FF /* User modifiable flags */
 
 /* Flags that should be inherited by new inodes from their parent. */
 #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
@@ -313,17 +321,6 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
 	return flags & EXT4_OTHER_FLMASK;
 }
 
-/*
- * Inode dynamic state flags
- */
-#define EXT4_STATE_JDATA		0x00000001 /* journaled data exists */
-#define EXT4_STATE_NEW			0x00000002 /* inode is newly created */
-#define EXT4_STATE_XATTR		0x00000004 /* has in-inode xattrs */
-#define EXT4_STATE_NO_EXPAND		0x00000008 /* No space for expansion */
-#define EXT4_STATE_DA_ALLOC_CLOSE	0x00000010 /* Alloc DA blks on close */
-#define EXT4_STATE_EXT_MIGRATE		0x00000020 /* Inode is migrating */
-#define EXT4_STATE_DIO_UNWRITTEN	0x00000040 /* need convert on dio done*/
-
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
 	__u32 group;		/* Group number for this data */
@@ -361,19 +358,23 @@ struct ext4_new_group_data {
 	   so set the magic i_delalloc_reserve_flag after taking the
 	   inode allocation semaphore for */
 #define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0004
-	/* Call ext4_da_update_reserve_space() after successfully
-	   allocating the blocks */
-#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE	0x0008
 	/* caller is from the direct IO path, request to creation of an
 	   unitialized extents if not allocated, split the uninitialized
 	   extent if blocks has been preallocated already*/
-#define EXT4_GET_BLOCKS_DIO			0x0010
-#define EXT4_GET_BLOCKS_CONVERT			0x0020
-#define EXT4_GET_BLOCKS_DIO_CREATE_EXT		(EXT4_GET_BLOCKS_DIO|\
+#define EXT4_GET_BLOCKS_PRE_IO			0x0008
+#define EXT4_GET_BLOCKS_CONVERT			0x0010
+#define EXT4_GET_BLOCKS_IO_CREATE_EXT		(EXT4_GET_BLOCKS_PRE_IO|\
+					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+	/* Convert extent to initialized after IO complete */
+#define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
 					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
-	/* Convert extent to initialized after direct IO complete */
-#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
-					 EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+
+/*
+ * Flags used by ext4_free_blocks
+ */
+#define EXT4_FREE_BLOCKS_METADATA	0x0001
+#define EXT4_FREE_BLOCKS_FORGET		0x0002
+#define EXT4_FREE_BLOCKS_VALIDATED	0x0004
 
 /*
 * ioctl commands
@@ -627,7 +628,7 @@ struct ext4_inode_info {
 	 * near to their parent directory's inode.
 	 */
 	ext4_group_t	i_block_group;
-	__u32	i_state;		/* Dynamic state flags for ext4 */
+	unsigned long	i_state_flags;		/* Dynamic state flags */
 
 	ext4_lblk_t		i_dir_start_lookup;
 #ifdef CONFIG_EXT4_FS_XATTR
@@ -693,16 +694,30 @@ struct ext4_inode_info {
 	unsigned int i_reserved_meta_blocks;
 	unsigned int i_allocated_meta_blocks;
 	unsigned short i_delalloc_reserved_flag;
+	sector_t i_da_metadata_calc_last_lblock;
+	int i_da_metadata_calc_len;
 
 	/* on-disk additional length */
 	__u16 i_extra_isize;
 
 	spinlock_t i_block_reservation_lock;
+#ifdef CONFIG_QUOTA
+	/* quota space reservation, managed internally by quota code */
+	qsize_t i_reserved_quota;
+#endif
 
-	/* completed async DIOs that might need unwritten extents handling */
-	struct list_head i_aio_dio_complete_list;
+	/* completed IOs that might need unwritten extents handling */
+	struct list_head i_completed_io_list;
+	spinlock_t i_completed_io_lock;
 	/* current io_end structure for async DIO write*/
 	ext4_io_end_t *cur_aio_dio;
+
+	/*
+	 * Transactions that contain inode's metadata needed to complete
+	 * fsync and fdatasync, respectively.
+	 */
+	tid_t i_sync_tid;
+	tid_t i_datasync_tid;
 };
 
 /*
@@ -744,12 +759,14 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT4_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
+#define EXT4_MOUNT_DIOREAD_NOLOCK	0x400000 /* Enable support for dio read nolocking */
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION		0x2000000 /* i_version support */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY	0x20000000 /* Block validity checking */
+#define EXT4_MOUNT_DISCARD		0x40000000 /* Issue DISCARD requests */
 
 #define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
 #define set_opt(o, opt)			o |= EXT4_MOUNT_##opt
@@ -997,7 +1014,7 @@ struct ext4_sb_info {
 	atomic_t s_lock_busy;
 
 	/* locality groups */
-	struct ext4_locality_group *s_locality_groups;
+	struct ext4_locality_group __percpu *s_locality_groups;
 
 	/* for write statistics */
 	unsigned long s_sectors_written_start;
@@ -1033,6 +1050,34 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 		(ino >= EXT4_FIRST_INO(sb) &&
 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
 }
+
+/*
+ * Inode dynamic state flags
+ */
+enum {
+	EXT4_STATE_JDATA,		/* journaled data exists */
+	EXT4_STATE_NEW,			/* inode is newly created */
+	EXT4_STATE_XATTR,		/* has in-inode xattrs */
+	EXT4_STATE_NO_EXPAND,		/* No space for expansion */
+	EXT4_STATE_DA_ALLOC_CLOSE,	/* Alloc DA blks on close */
+	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
+	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
+};
+
+static inline int ext4_test_inode_state(struct inode *inode, int bit)
+{
+	return test_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
+
+static inline void ext4_set_inode_state(struct inode *inode, int bit)
+{
+	set_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
+
+static inline void ext4_clear_inode_state(struct inode *inode, int bit)
+{
+	clear_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
 * a kernel struct super_block.  This will allow us to call the feature-test
@@ -1109,6 +1154,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_FEATURE_INCOMPAT_64BIT		0x0080
 #define EXT4_FEATURE_INCOMPAT_MMP		0x0100
 #define EXT4_FEATURE_INCOMPAT_FLEX_BG		0x0200
+#define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
+#define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
 
 #define EXT4_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT4_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1324,8 +1371,6 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t block, unsigned long count, int metadata);
 extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 				ext4_fsblk_t block, unsigned long count);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
@@ -1384,16 +1429,15 @@ extern int ext4_mb_reserve_blocks(struct super_block *, int);
 extern void ext4_discard_preallocations(struct inode *);
 extern int __init init_ext4_mballoc(void);
 extern void exit_ext4_mballoc(void);
-extern void ext4_mb_free_blocks(handle_t *, struct inode *,
-		ext4_fsblk_t, unsigned long, int, unsigned long *);
+extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
+			     struct buffer_head *bh, ext4_fsblk_t block,
+			     unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 		ext4_group_t i, struct ext4_group_desc *desc);
 extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
 extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
						ext4_group_t, int);
 /* inode.c */
-int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
-		struct buffer_head *bh, ext4_fsblk_t blocknr);
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
						ext4_lblk_t, int, int *);
 struct buffer_head *ext4_bread(handle_t *, struct inode *,
@@ -1402,7 +1446,7 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
			struct buffer_head *bh_result, int create);
 
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
-extern int ext4_write_inode(struct inode *, int);
+extern int ext4_write_inode(struct inode *, struct writeback_control *);
 extern int ext4_setattr(struct dentry *, struct iattr *);
 extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
				struct kstat *stat);
@@ -1424,8 +1468,10 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
		struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
-extern qsize_t ext4_get_reserved_space(struct inode *inode);
+extern qsize_t *ext4_get_reserved_space(struct inode *inode);
1428extern int flush_aio_dio_completed_IO(struct inode *inode); 1472extern int flush_completed_IO(struct inode *inode);
1473extern void ext4_da_update_reserve_space(struct inode *inode,
1474 int used, int quota_claim);
1429/* ioctl.c */ 1475/* ioctl.c */
1430extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 1476extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1431extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); 1477extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1449,13 +1495,20 @@ extern int ext4_group_extend(struct super_block *sb,
1449 ext4_fsblk_t n_blocks_count); 1495 ext4_fsblk_t n_blocks_count);
1450 1496
1451/* super.c */ 1497/* super.c */
1452extern void ext4_error(struct super_block *, const char *, const char *, ...) 1498extern void __ext4_error(struct super_block *, const char *, const char *, ...)
1499 __attribute__ ((format (printf, 3, 4)));
1500#define ext4_error(sb, message...) __ext4_error(sb, __func__, ## message)
1501extern void ext4_error_inode(const char *, struct inode *, const char *, ...)
1502 __attribute__ ((format (printf, 3, 4)));
1503extern void ext4_error_file(const char *, struct file *, const char *, ...)
1453 __attribute__ ((format (printf, 3, 4))); 1504 __attribute__ ((format (printf, 3, 4)));
1454extern void __ext4_std_error(struct super_block *, const char *, int); 1505extern void __ext4_std_error(struct super_block *, const char *, int);
1455extern void ext4_abort(struct super_block *, const char *, const char *, ...) 1506extern void ext4_abort(struct super_block *, const char *, const char *, ...)
1456 __attribute__ ((format (printf, 3, 4))); 1507 __attribute__ ((format (printf, 3, 4)));
1457extern void ext4_warning(struct super_block *, const char *, const char *, ...) 1508extern void __ext4_warning(struct super_block *, const char *,
1509 const char *, ...)
1458 __attribute__ ((format (printf, 3, 4))); 1510 __attribute__ ((format (printf, 3, 4)));
1511#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, ## message)
1459extern void ext4_msg(struct super_block *, const char *, const char *, ...) 1512extern void ext4_msg(struct super_block *, const char *, const char *, ...)
1460 __attribute__ ((format (printf, 3, 4))); 1513 __attribute__ ((format (printf, 3, 4)));
1461extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, 1514extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
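ext4_error() and ext4_warning() become macros that splice the caller's __func__ into variadic helpers, so call sites no longer pass the function name by hand. A compilable sketch of the same wrapping technique (my_error/__my_error are illustrative names, not from the patch):

    #include <stdarg.h>
    #include <stdio.h>

    static void __my_error(const char *func, const char *fmt, ...)
        __attribute__ ((format (printf, 2, 3)));

    static void __my_error(const char *func, const char *fmt, ...)
    {
        va_list args;

        va_start(args, fmt);
        fprintf(stderr, "error (%s): ", func);
        vfprintf(stderr, fmt, args);
        fputc('\n', stderr);
        va_end(args);
    }

    /* GNU named variadic macro; ", ## message" drops the comma
     * when the caller passes only a format string. */
    #define my_error(message...) __my_error(__func__, ## message)

    int main(void)
    {
        my_error("bad block %llu", 1234ULL);
        return 0;
    }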
@@ -1728,7 +1781,7 @@ extern void ext4_ext_release(struct super_block *);
1728extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1781extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1729 loff_t len); 1782 loff_t len);
1730extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 1783extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1731 loff_t len); 1784 ssize_t len);
1732extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1785extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1733 sector_t block, unsigned int max_blocks, 1786 sector_t block, unsigned int max_blocks,
1734 struct buffer_head *bh, int flags); 1787 struct buffer_head *bh, int flags);
@@ -1740,6 +1793,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
1740 __u64 len, __u64 *moved_len); 1793 __u64 len, __u64 *moved_len);
1741 1794
1742 1795
1796/* ext4-private buffer-head state bits, allocated from the JBD-private range */
1797enum ext4_state_bits {
1798 BH_Uninit /* blocks are allocated but uninitialized on disk */
1799 = BH_JBDPrivateStart,
1800};
1801
1802BUFFER_FNS(Uninit, uninit)
1803TAS_BUFFER_FNS(Uninit, uninit)
1804
1743/* 1805/*
1744 * Add new method to test whether block and inode bitmaps are properly 1806
1745 * initialized. With uninit_bg reading the block from disk is not enough 1807 * initialized. With uninit_bg reading the block from disk is not enough
@@ -1757,6 +1819,8 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
1757 set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); 1819 set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
1758} 1820}
1759 1821
1822#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
1823
1760#endif /* __KERNEL__ */ 1824#endif /* __KERNEL__ */
1761 1825
1762#endif /* _EXT4_H */ 1826#endif /* _EXT4_H */
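The in_range() macro added above tests whether b falls inside [first, first + len). Two caveats worth noting from the definition: each argument may be evaluated more than once, so the arguments should be side-effect free, and with unsigned operands a len of 0 makes first + len - 1 wrap, so callers are expected to pass len >= 1. A quick standalone check:

    #include <assert.h>

    #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)

    int main(void)
    {
        /* block 105 lies inside the extent starting at 100 with length 8 */
        assert(in_range(105u, 100u, 8u));
        assert(in_range(100u, 100u, 8u));   /* first block is included */
        assert(!in_range(108u, 100u, 8u));  /* one past the end is not */
        return 0;
    }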
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 2ca686454e87..bdb6ce7e2eb4 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -225,7 +225,8 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
225 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); 225 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
226} 226}
227 227
228extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); 228extern int ext4_ext_calc_metadata_amount(struct inode *inode,
229 sector_t lblocks);
229extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); 230extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
230extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); 231extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
231extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); 232extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6a9409920dee..53d2764d71ca 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -4,6 +4,8 @@
4 4
5#include "ext4_jbd2.h" 5#include "ext4_jbd2.h"
6 6
7#include <trace/events/ext4.h>
8
7int __ext4_journal_get_undo_access(const char *where, handle_t *handle, 9int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
8 struct buffer_head *bh) 10 struct buffer_head *bh)
9{ 11{
@@ -32,35 +34,69 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle,
32 return err; 34 return err;
33} 35}
34 36
35int __ext4_journal_forget(const char *where, handle_t *handle, 37/*
36 struct buffer_head *bh) 38 * The ext4 forget function must perform a revoke if we are freeing data
39 * which has been journaled. Metadata (eg. indirect blocks) must be
40 * revoked in all cases.
41 *
42 * "bh" may be NULL: a metadata block may have been freed from memory
43 * but there may still be a record of it in the journal, and that record
44 * still needs to be revoked.
45 *
46 * If the handle isn't valid we're not journaling, but we still need to
47 * call into ext4_journal_revoke() to put the buffer head.
48 */
49int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
50 struct inode *inode, struct buffer_head *bh,
51 ext4_fsblk_t blocknr)
37{ 52{
38 int err = 0; 53 int err;
39 54
40 if (ext4_handle_valid(handle)) { 55 might_sleep();
41 err = jbd2_journal_forget(handle, bh); 56
42 if (err) 57 trace_ext4_forget(inode, is_metadata, blocknr);
43 ext4_journal_abort_handle(where, __func__, bh, 58 BUFFER_TRACE(bh, "enter");
44 handle, err); 59
45 } 60 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
46 else 61 "data mode %x\n",
62 bh, is_metadata, inode->i_mode,
63 test_opt(inode->i_sb, DATA_FLAGS));
64
65 /* In the no journal case, we can just do a bforget and return */
66 if (!ext4_handle_valid(handle)) {
47 bforget(bh); 67 bforget(bh);
48 return err; 68 return 0;
49} 69 }
50 70
51int __ext4_journal_revoke(const char *where, handle_t *handle, 71 /* Never use the revoke function if we are doing full data
52 ext4_fsblk_t blocknr, struct buffer_head *bh) 72 * journaling: there is no need to, and a V1 superblock won't
53{ 73 * support it. Otherwise, only skip the revoke on un-journaled
54 int err = 0; 74 * data blocks. */
55 75
56 if (ext4_handle_valid(handle)) { 76 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
57 err = jbd2_journal_revoke(handle, blocknr, bh); 77 (!is_metadata && !ext4_should_journal_data(inode))) {
58 if (err) 78 if (bh) {
59 ext4_journal_abort_handle(where, __func__, bh, 79 BUFFER_TRACE(bh, "call jbd2_journal_forget");
60 handle, err); 80 err = jbd2_journal_forget(handle, bh);
81 if (err)
82 ext4_journal_abort_handle(where, __func__, bh,
83 handle, err);
84 return err;
85 }
86 return 0;
61 } 87 }
62 else 88
63 bforget(bh); 89 /*
90 * data!=journal && (is_metadata || should_journal_data(inode))
91 */
92 BUFFER_TRACE(bh, "call jbd2_journal_revoke");
93 err = jbd2_journal_revoke(handle, blocknr, bh);
94 if (err) {
95 ext4_journal_abort_handle(where, __func__, bh, handle, err);
96 ext4_abort(inode->i_sb, __func__,
97 "error %d when attempting revoke", err);
98 }
99 BUFFER_TRACE(bh, "exit");
64 return err; 100 return err;
65} 101}
66 102
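The consolidated __ext4_forget() replaces the separate forget/revoke wrappers with one decision: no valid handle means a plain bforget(), un-journaled data blocks only need jbd2_journal_forget(), and everything else (metadata, or journaled data) must be revoked, except that full data journaling never revokes. A distilled sketch of just that decision table (standalone; names are illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    enum action { DO_BFORGET, DO_FORGET, DO_REVOKE };

    static enum action forget_action(bool handle_valid, bool fs_journals_all_data,
                                     bool is_metadata, bool inode_journals_data)
    {
        if (!handle_valid)                /* no journal: just drop the buffer */
            return DO_BFORGET;
        if (fs_journals_all_data ||       /* data=journal mode: never revoke */
            (!is_metadata && !inode_journals_data))
            return DO_FORGET;
        return DO_REVOKE;                 /* metadata, or journaled data */
    }

    int main(void)
    {
        printf("%d\n", forget_action(true, false, true, false));  /* 2: revoke */
        printf("%d\n", forget_action(true, false, false, false)); /* 1: forget */
        return 0;
    }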
@@ -89,14 +125,14 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
89 ext4_journal_abort_handle(where, __func__, bh, 125 ext4_journal_abort_handle(where, __func__, bh,
90 handle, err); 126 handle, err);
91 } else { 127 } else {
92 if (inode && bh) 128 if (inode)
93 mark_buffer_dirty_inode(bh, inode); 129 mark_buffer_dirty_inode(bh, inode);
94 else 130 else
95 mark_buffer_dirty(bh); 131 mark_buffer_dirty(bh);
96 if (inode && inode_needs_sync(inode)) { 132 if (inode && inode_needs_sync(inode)) {
97 sync_dirty_buffer(bh); 133 sync_dirty_buffer(bh);
98 if (buffer_req(bh) && !buffer_uptodate(bh)) { 134 if (buffer_req(bh) && !buffer_uptodate(bh)) {
99 ext4_error(inode->i_sb, __func__, 135 ext4_error(inode->i_sb,
100 "IO error syncing inode, " 136 "IO error syncing inode, "
101 "inode=%lu, block=%llu", 137 "inode=%lu, block=%llu",
102 inode->i_ino, 138 inode->i_ino,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index a2865980342f..b79ad5126468 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -49,7 +49,7 @@
49 49
50#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ 50#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
51 EXT4_XATTR_TRANS_BLOCKS - 2 + \ 51 EXT4_XATTR_TRANS_BLOCKS - 2 + \
52 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) 52 EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
53 53
54/* 54/*
55 * Define the number of metadata blocks we need to account to modify data. 55 * Define the number of metadata blocks we need to account to modify data.
@@ -57,7 +57,7 @@
57 * This include super block, inode block, quota blocks and xattr blocks 57 * This include super block, inode block, quota blocks and xattr blocks
58 */ 58 */
59#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ 59#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
60 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) 60 EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
61 61
62/* Delete operations potentially hit one directory's namespace plus an 62/* Delete operations potentially hit one directory's namespace plus an
63 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be 63 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
@@ -92,6 +92,7 @@
92 * but inode, sb and group updates are done only once */ 92 * but inode, sb and group updates are done only once */
93#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ 93#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
94 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) 94 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
95
95#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ 96#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
96 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) 97 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
97#else 98#else
@@ -99,6 +100,9 @@
99#define EXT4_QUOTA_INIT_BLOCKS(sb) 0 100#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
100#define EXT4_QUOTA_DEL_BLOCKS(sb) 0 101#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
101#endif 102#endif
103#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
104#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
105#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
102 106
103int 107int
104ext4_mark_iloc_dirty(handle_t *handle, 108ext4_mark_iloc_dirty(handle_t *handle,
@@ -116,12 +120,8 @@ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
116int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); 120int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
117 121
118/* 122/*
119 * Wrapper functions with which ext4 calls into JBD. The intent here is 123 * Wrapper functions with which ext4 calls into JBD.
120 * to allow these to be turned into appropriate stubs so ext4 can control
121 * ext2 filesystems, so ext2+ext4 systems only need one fs. This work hasn't
122 * been done yet.
123 */ 124 */
124
125void ext4_journal_abort_handle(const char *caller, const char *err_fn, 125void ext4_journal_abort_handle(const char *caller, const char *err_fn,
126 struct buffer_head *bh, handle_t *handle, int err); 126 struct buffer_head *bh, handle_t *handle, int err);
127 127
@@ -131,13 +131,9 @@ int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
131int __ext4_journal_get_write_access(const char *where, handle_t *handle, 131int __ext4_journal_get_write_access(const char *where, handle_t *handle,
132 struct buffer_head *bh); 132 struct buffer_head *bh);
133 133
134/* When called with an invalid handle, this will still do a put on the BH */ 134int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
135int __ext4_journal_forget(const char *where, handle_t *handle, 135 struct inode *inode, struct buffer_head *bh,
136 struct buffer_head *bh); 136 ext4_fsblk_t blocknr);
137
138/* When called with an invalid handle, this will still do a put on the BH */
139int __ext4_journal_revoke(const char *where, handle_t *handle,
140 ext4_fsblk_t blocknr, struct buffer_head *bh);
141 137
142int __ext4_journal_get_create_access(const char *where, 138int __ext4_journal_get_create_access(const char *where,
143 handle_t *handle, struct buffer_head *bh); 139 handle_t *handle, struct buffer_head *bh);
@@ -149,12 +145,11 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
149 __ext4_journal_get_undo_access(__func__, (handle), (bh)) 145 __ext4_journal_get_undo_access(__func__, (handle), (bh))
150#define ext4_journal_get_write_access(handle, bh) \ 146#define ext4_journal_get_write_access(handle, bh) \
151 __ext4_journal_get_write_access(__func__, (handle), (bh)) 147 __ext4_journal_get_write_access(__func__, (handle), (bh))
152#define ext4_journal_revoke(handle, blocknr, bh) \ 148#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
153 __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) 149 __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\
150 (block_nr))
154#define ext4_journal_get_create_access(handle, bh) \ 151#define ext4_journal_get_create_access(handle, bh) \
155 __ext4_journal_get_create_access(__func__, (handle), (bh)) 152 __ext4_journal_get_create_access(__func__, (handle), (bh))
156#define ext4_journal_forget(handle, bh) \
157 __ext4_journal_forget(__func__, (handle), (bh))
158#define ext4_handle_dirty_metadata(handle, inode, bh) \ 153#define ext4_handle_dirty_metadata(handle, inode, bh) \
159 __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) 154 __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh))
160 155
@@ -254,6 +249,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
254 return 0; 249 return 0;
255} 250}
256 251
252static inline void ext4_update_inode_fsync_trans(handle_t *handle,
253 struct inode *inode,
254 int datasync)
255{
256 struct ext4_inode_info *ei = EXT4_I(inode);
257
258 if (ext4_handle_valid(handle)) {
259 ei->i_sync_tid = handle->h_transaction->t_tid;
260 if (datasync)
261 ei->i_datasync_tid = handle->h_transaction->t_tid;
262 }
263}
264
257/* super.c */ 265/* super.c */
258int ext4_force_commit(struct super_block *sb); 266int ext4_force_commit(struct super_block *sb);
259 267
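ext4_update_inode_fsync_trans() records, per inode, the tid of the running transaction that last changed it, and separately the tid that matters for fdatasync. fsync can then skip forcing a journal commit when that transaction has already committed. A standalone sketch of the idea (the jbd2 side is paraphrased; wraparound-safe tid comparison is omitted):

    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned int tid_t;

    struct journal_state { tid_t committed_tid; };
    struct inode_state   { tid_t i_sync_tid, i_datasync_tid; };

    /* Called when a transaction modifies the inode. */
    static void update_fsync_trans(struct inode_state *ei, tid_t running_tid,
                                   bool matters_for_datasync)
    {
        ei->i_sync_tid = running_tid;
        if (matters_for_datasync)
            ei->i_datasync_tid = running_tid;
    }

    /* fsync/fdatasync only needs a commit if the recorded tid is newer. */
    static bool needs_commit(const struct journal_state *j,
                             const struct inode_state *ei, bool datasync)
    {
        tid_t want = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
        return want > j->committed_tid;
    }

    int main(void)
    {
        struct journal_state j = { .committed_tid = 7 };
        struct inode_state ei = { 0, 0 };

        update_fsync_trans(&ei, 8, true);
        printf("fsync commit needed: %d\n", needs_commit(&j, &ei, false)); /* 1 */
        return 0;
    }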
@@ -296,4 +304,28 @@ static inline int ext4_should_writeback_data(struct inode *inode)
296 return 0; 304 return 0;
297} 305}
298 306
307/*
308 * This function controls whether or not we should try to go down the
309 * dioread_nolock code paths, which makes it safe to avoid taking
310 * i_mutex for direct I/O reads. This only works for extent-based
311 * files, and it doesn't work for nobh or if data journaling is
312 * enabled, since the dioread_nolock code uses b_private to pass
313 * information back to the I/O completion handler, and this conflicts
314 * with the jbd's use of b_private.
315 */
316static inline int ext4_should_dioread_nolock(struct inode *inode)
317{
318 if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
319 return 0;
320 if (test_opt(inode->i_sb, NOBH))
321 return 0;
322 if (!S_ISREG(inode->i_mode))
323 return 0;
324 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
325 return 0;
326 if (ext4_should_journal_data(inode))
327 return 0;
328 return 1;
329}
330
299#endif /* _EXT4_JBD2_H */ 331#endif /* _EXT4_JBD2_H */
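ext4_should_dioread_nolock() above is a pure predicate over mount options and inode flags, written as a ladder of early returns so each disqualifying condition reads on its own line. The same shape, condensed into a standalone sketch (struct and field names are illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    struct mount_opts { bool dioread_nolock, nobh, journal_data; };
    struct file_state { bool is_regular, extent_based; };

    static bool should_dioread_nolock(const struct mount_opts *o,
                                      const struct file_state *f)
    {
        if (!o->dioread_nolock)
            return false;           /* feature not requested */
        if (o->nobh)
            return false;           /* needs buffer heads */
        if (!f->is_regular)
            return false;           /* regular files only */
        if (!f->extent_based)
            return false;           /* extent-mapped files only */
        if (o->journal_data)
            return false;           /* b_private clashes with jbd */
        return true;
    }

    int main(void)
    {
        struct mount_opts o = { .dioread_nolock = true };
        struct file_state f = { .is_regular = true, .extent_based = true };

        printf("%d\n", should_dioread_nolock(&o, &f)); /* 1 */
        return 0;
    }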
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 715264b4bae4..94c8ee81f5e1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -195,8 +195,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
195 if (S_ISREG(inode->i_mode)) 195 if (S_ISREG(inode->i_mode))
196 block_group++; 196 block_group++;
197 } 197 }
198 bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + 198 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
199 le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
200 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 199 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
201 200
202 /* 201 /*
@@ -296,29 +295,44 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
296 * to allocate @blocks 295 * to allocate @blocks
297 * Worst case is one block per extent 296
298 */ 297 */
299int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) 298int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
300{ 299{
301 int lcap, icap, rcap, leafs, idxs, num; 300 struct ext4_inode_info *ei = EXT4_I(inode);
302 int newextents = blocks; 301 int idxs, num = 0;
303
304 rcap = ext4_ext_space_root_idx(inode, 0);
305 lcap = ext4_ext_space_block(inode, 0);
306 icap = ext4_ext_space_block_idx(inode, 0);
307 302
308 /* number of new leaf blocks needed */ 303 idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
309 num = leafs = (newextents + lcap - 1) / lcap; 304 / sizeof(struct ext4_extent_idx));
310 305
311 /* 306 /*
312 * Worst case, we need separate index block(s) 307 * If the new delayed allocation block is contiguous with the
313 * to link all new leaf blocks 308 * previous da block, it can share index blocks with the
309 * previous block, so we only need to allocate a new index
310 * block every idxs leaf blocks. At idxs**2 blocks, we need
311 * an additional index block, and at idxs**3 blocks, yet
312 * another index block.
314 */ 313 */
315 idxs = (leafs + icap - 1) / icap; 314 if (ei->i_da_metadata_calc_len &&
316 do { 315 ei->i_da_metadata_calc_last_lblock+1 == lblock) {
317 num += idxs; 316 if ((ei->i_da_metadata_calc_len % idxs) == 0)
318 idxs = (idxs + icap - 1) / icap; 317 num++;
319 } while (idxs > rcap); 318 if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
319 num++;
320 if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
321 num++;
322 ei->i_da_metadata_calc_len = 0;
323 } else
324 ei->i_da_metadata_calc_len++;
325 ei->i_da_metadata_calc_last_lblock++;
326 return num;
327 }
320 328
321 return num; 329 /*
330 * In the worst case we need a new set of index blocks at
331 * every level of the inode's extent tree.
332 */
333 ei->i_da_metadata_calc_len = 1;
334 ei->i_da_metadata_calc_last_lblock = lblock;
335 return ext_depth(inode) + 1;
322} 336}
323 337
324static int 338static int
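To put numbers on the new calculation: with 4 KiB blocks, and the extent header and an index entry both being 12 bytes on disk (per the structs in ext4_extents.h), idxs works out to (4096 - 12) / 12 = 340. Within one contiguous run of delayed allocations, an extra metadata block is charged every 340 blocks, another every 340^2, and so on, while a run starting at a non-contiguous block is charged the full worst case of one block per tree level. Checking the arithmetic:

    #include <stdio.h>

    int main(void)
    {
        unsigned blocksize = 4096;          /* typical ext4 block size */
        unsigned hdr = 12, idx_entry = 12;  /* on-disk struct sizes */
        unsigned idxs = (blocksize - hdr) / idx_entry;

        printf("idxs   = %u\n", idxs);          /* 340 */
        printf("idxs^2 = %u\n", idxs * idxs);   /* 115600 */
        return 0;
    }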
@@ -425,7 +439,7 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
425 return 0; 439 return 0;
426 440
427corrupted: 441corrupted:
428 ext4_error(inode->i_sb, function, 442 __ext4_error(inode->i_sb, function,
429 "bad header/extent in inode #%lu: %s - magic %x, " 443 "bad header/extent in inode #%lu: %s - magic %x, "
430 "entries %u, max %u(%u), depth %u(%u)", 444 "entries %u, max %u(%u), depth %u(%u)",
431 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), 445 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
@@ -688,7 +702,12 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
688 } 702 }
689 eh = ext_block_hdr(bh); 703 eh = ext_block_hdr(bh);
690 ppos++; 704 ppos++;
691 BUG_ON(ppos > depth); 705 if (unlikely(ppos > depth)) {
706 put_bh(bh);
707 EXT4_ERROR_INODE(inode,
708 "ppos %d > depth %d", ppos, depth);
709 goto err;
710 }
692 path[ppos].p_bh = bh; 711 path[ppos].p_bh = bh;
693 path[ppos].p_hdr = eh; 712 path[ppos].p_hdr = eh;
694 i--; 713 i--;
@@ -734,7 +753,12 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
734 if (err) 753 if (err)
735 return err; 754 return err;
736 755
737 BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block)); 756 if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
757 EXT4_ERROR_INODE(inode,
758 "logical %d == ei_block %d!",
759 logical, le32_to_cpu(curp->p_idx->ei_block));
760 return -EIO;
761 }
738 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; 762 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
739 if (logical > le32_to_cpu(curp->p_idx->ei_block)) { 763 if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
740 /* insert after */ 764 /* insert after */
@@ -764,9 +788,17 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
764 ext4_idx_store_pblock(ix, ptr); 788 ext4_idx_store_pblock(ix, ptr);
765 le16_add_cpu(&curp->p_hdr->eh_entries, 1); 789 le16_add_cpu(&curp->p_hdr->eh_entries, 1);
766 790
767 BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries) 791 if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
768 > le16_to_cpu(curp->p_hdr->eh_max)); 792 > le16_to_cpu(curp->p_hdr->eh_max))) {
769 BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr)); 793 EXT4_ERROR_INODE(inode,
794 "logical %d == ei_block %d!",
795 logical, le32_to_cpu(curp->p_idx->ei_block));
796 return -EIO;
797 }
798 if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
799 EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
800 return -EIO;
801 }
770 802
771 err = ext4_ext_dirty(handle, inode, curp); 803 err = ext4_ext_dirty(handle, inode, curp);
772 ext4_std_error(inode->i_sb, err); 804 ext4_std_error(inode->i_sb, err);
@@ -804,7 +836,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
804 836
805 /* if current leaf will be split, then we should use 837 /* if current leaf will be split, then we should use
806 * border from split point */ 838 * border from split point */
807 BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr)); 839 if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
840 EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
841 return -EIO;
842 }
808 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { 843 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
809 border = path[depth].p_ext[1].ee_block; 844 border = path[depth].p_ext[1].ee_block;
810 ext_debug("leaf will be split." 845 ext_debug("leaf will be split."
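This hunk and the many that follow apply one mechanical transformation: a BUG_ON() that would crash the kernel on corrupted on-disk metadata becomes a logged error plus an -EIO return, with unlikely() keeping the check off the hot path. The shape of the rewrite as a compilable stand-in (report_error plays the role of EXT4_ERROR_INODE; the macros here are local stand-ins, not the kernel's):

    #include <stdio.h>

    #define unlikely(x) __builtin_expect(!!(x), 0)
    #define report_error(fmt, ...) fprintf(stderr, fmt "\n", ##__VA_ARGS__)
    #define EIO 5

    /* Before: BUG_ON(hdr == NULL) took the whole machine down.
     * After: the corruption is reported and the caller gets -EIO. */
    static int check_header(const void *hdr, int depth)
    {
        if (unlikely(hdr == NULL)) {
            report_error("path[%d].p_hdr == NULL", depth);
            return -EIO;
        }
        return 0;
    }

    int main(void)
    {
        printf("%d\n", check_header(NULL, 2)); /* -5 */
        return 0;
    }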
@@ -845,7 +880,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
845 880
846 /* initialize new leaf */ 881 /* initialize new leaf */
847 newblock = ablocks[--a]; 882 newblock = ablocks[--a];
848 BUG_ON(newblock == 0); 883 if (unlikely(newblock == 0)) {
884 EXT4_ERROR_INODE(inode, "newblock == 0!");
885 err = -EIO;
886 goto cleanup;
887 }
849 bh = sb_getblk(inode->i_sb, newblock); 888 bh = sb_getblk(inode->i_sb, newblock);
850 if (!bh) { 889 if (!bh) {
851 err = -EIO; 890 err = -EIO;
@@ -865,7 +904,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
865 ex = EXT_FIRST_EXTENT(neh); 904 ex = EXT_FIRST_EXTENT(neh);
866 905
867 /* move remainder of path[depth] to the new leaf */ 906 /* move remainder of path[depth] to the new leaf */
868 BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max); 907 if (unlikely(path[depth].p_hdr->eh_entries !=
908 path[depth].p_hdr->eh_max)) {
909 EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
910 path[depth].p_hdr->eh_entries,
911 path[depth].p_hdr->eh_max);
912 err = -EIO;
913 goto cleanup;
914 }
869 /* start copy from next extent */ 915 /* start copy from next extent */
870 /* TODO: we could do it by single memmove */ 916 /* TODO: we could do it by single memmove */
871 m = 0; 917 m = 0;
@@ -912,7 +958,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
912 958
913 /* create intermediate indexes */ 959 /* create intermediate indexes */
914 k = depth - at - 1; 960 k = depth - at - 1;
915 BUG_ON(k < 0); 961 if (unlikely(k < 0)) {
962 EXT4_ERROR_INODE(inode, "k %d < 0!", k);
963 err = -EIO;
964 goto cleanup;
965 }
916 if (k) 966 if (k)
917 ext_debug("create %d intermediate indices\n", k); 967 ext_debug("create %d intermediate indices\n", k);
918 /* insert new index into current index block */ 968 /* insert new index into current index block */
@@ -949,8 +999,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
949 999
950 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, 1000 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
951 EXT_MAX_INDEX(path[i].p_hdr)); 1001 EXT_MAX_INDEX(path[i].p_hdr));
952 BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) != 1002 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
953 EXT_LAST_INDEX(path[i].p_hdr)); 1003 EXT_LAST_INDEX(path[i].p_hdr))) {
1004 EXT4_ERROR_INODE(inode,
1005 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1006 le32_to_cpu(path[i].p_ext->ee_block));
1007 err = -EIO;
1008 goto cleanup;
1009 }
954 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { 1010 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
955 ext_debug("%d: move %d:%llu in new index %llu\n", i, 1011 ext_debug("%d: move %d:%llu in new index %llu\n", i,
956 le32_to_cpu(path[i].p_idx->ei_block), 1012 le32_to_cpu(path[i].p_idx->ei_block),
@@ -1007,7 +1063,8 @@ cleanup:
1007 for (i = 0; i < depth; i++) { 1063 for (i = 0; i < depth; i++) {
1008 if (!ablocks[i]) 1064 if (!ablocks[i])
1009 continue; 1065 continue;
1010 ext4_free_blocks(handle, inode, ablocks[i], 1, 1); 1066 ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
1067 EXT4_FREE_BLOCKS_METADATA);
1011 } 1068 }
1012 } 1069 }
1013 kfree(ablocks); 1070 kfree(ablocks);
@@ -1187,7 +1244,10 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1187 struct ext4_extent *ex; 1244 struct ext4_extent *ex;
1188 int depth, ee_len; 1245 int depth, ee_len;
1189 1246
1190 BUG_ON(path == NULL); 1247 if (unlikely(path == NULL)) {
1248 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1249 return -EIO;
1250 }
1191 depth = path->p_depth; 1251 depth = path->p_depth;
1192 *phys = 0; 1252 *phys = 0;
1193 1253
@@ -1201,15 +1261,33 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1201 ex = path[depth].p_ext; 1261 ex = path[depth].p_ext;
1202 ee_len = ext4_ext_get_actual_len(ex); 1262 ee_len = ext4_ext_get_actual_len(ex);
1203 if (*logical < le32_to_cpu(ex->ee_block)) { 1263 if (*logical < le32_to_cpu(ex->ee_block)) {
1204 BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); 1264 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1265 EXT4_ERROR_INODE(inode,
1266 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1267 *logical, le32_to_cpu(ex->ee_block));
1268 return -EIO;
1269 }
1205 while (--depth >= 0) { 1270 while (--depth >= 0) {
1206 ix = path[depth].p_idx; 1271 ix = path[depth].p_idx;
1207 BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); 1272 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1273 EXT4_ERROR_INODE(inode,
1274 "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1275 ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1276 EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1277 le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1278 depth);
1279 return -EIO;
1280 }
1208 } 1281 }
1209 return 0; 1282 return 0;
1210 } 1283 }
1211 1284
1212 BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); 1285 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1286 EXT4_ERROR_INODE(inode,
1287 "logical %d < ee_block %d + ee_len %d!",
1288 *logical, le32_to_cpu(ex->ee_block), ee_len);
1289 return -EIO;
1290 }
1213 1291
1214 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; 1292 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1215 *phys = ext_pblock(ex) + ee_len - 1; 1293 *phys = ext_pblock(ex) + ee_len - 1;
@@ -1235,7 +1313,10 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1235 int depth; /* Note, NOT eh_depth; depth from top of tree */ 1313 int depth; /* Note, NOT eh_depth; depth from top of tree */
1236 int ee_len; 1314 int ee_len;
1237 1315
1238 BUG_ON(path == NULL); 1316 if (unlikely(path == NULL)) {
1317 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1318 return -EIO;
1319 }
1239 depth = path->p_depth; 1320 depth = path->p_depth;
1240 *phys = 0; 1321 *phys = 0;
1241 1322
@@ -1249,17 +1330,32 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1249 ex = path[depth].p_ext; 1330 ex = path[depth].p_ext;
1250 ee_len = ext4_ext_get_actual_len(ex); 1331 ee_len = ext4_ext_get_actual_len(ex);
1251 if (*logical < le32_to_cpu(ex->ee_block)) { 1332 if (*logical < le32_to_cpu(ex->ee_block)) {
1252 BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); 1333 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1334 EXT4_ERROR_INODE(inode,
1335 "first_extent(path[%d].p_hdr) != ex",
1336 depth);
1337 return -EIO;
1338 }
1253 while (--depth >= 0) { 1339 while (--depth >= 0) {
1254 ix = path[depth].p_idx; 1340 ix = path[depth].p_idx;
1255 BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); 1341 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1342 EXT4_ERROR_INODE(inode,
1343 "ix != EXT_FIRST_INDEX *logical %d!",
1344 *logical);
1345 return -EIO;
1346 }
1256 } 1347 }
1257 *logical = le32_to_cpu(ex->ee_block); 1348 *logical = le32_to_cpu(ex->ee_block);
1258 *phys = ext_pblock(ex); 1349 *phys = ext_pblock(ex);
1259 return 0; 1350 return 0;
1260 } 1351 }
1261 1352
1262 BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); 1353 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1354 EXT4_ERROR_INODE(inode,
1355 "logical %d < ee_block %d + ee_len %d!",
1356 *logical, le32_to_cpu(ex->ee_block), ee_len);
1357 return -EIO;
1358 }
1263 1359
1264 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { 1360 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1265 /* next allocated block in this leaf */ 1361 /* next allocated block in this leaf */
@@ -1398,8 +1494,12 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1398 1494
1399 eh = path[depth].p_hdr; 1495 eh = path[depth].p_hdr;
1400 ex = path[depth].p_ext; 1496 ex = path[depth].p_ext;
1401 BUG_ON(ex == NULL); 1497
1402 BUG_ON(eh == NULL); 1498 if (unlikely(ex == NULL || eh == NULL)) {
1499 EXT4_ERROR_INODE(inode,
1500 "ex %p == NULL or eh %p == NULL", ex, eh);
1501 return -EIO;
1502 }
1403 1503
1404 if (depth == 0) { 1504 if (depth == 0) {
1405 /* there is no tree at all */ 1505 /* there is no tree at all */
@@ -1522,8 +1622,9 @@ int ext4_ext_try_to_merge(struct inode *inode,
1522 merge_done = 1; 1622 merge_done = 1;
1523 WARN_ON(eh->eh_entries == 0); 1623 WARN_ON(eh->eh_entries == 0);
1524 if (!eh->eh_entries) 1624 if (!eh->eh_entries)
1525 ext4_error(inode->i_sb, "ext4_ext_try_to_merge", 1625 ext4_error(inode->i_sb,
1526 "inode#%lu, eh->eh_entries = 0!", inode->i_ino); 1626 "inode#%lu, eh->eh_entries = 0!",
1627 inode->i_ino);
1527 } 1628 }
1528 1629
1529 return merge_done; 1630 return merge_done;
@@ -1596,13 +1697,19 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1596 ext4_lblk_t next; 1697 ext4_lblk_t next;
1597 unsigned uninitialized = 0; 1698 unsigned uninitialized = 0;
1598 1699
1599 BUG_ON(ext4_ext_get_actual_len(newext) == 0); 1700 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1701 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1702 return -EIO;
1703 }
1600 depth = ext_depth(inode); 1704 depth = ext_depth(inode);
1601 ex = path[depth].p_ext; 1705 ex = path[depth].p_ext;
1602 BUG_ON(path[depth].p_hdr == NULL); 1706 if (unlikely(path[depth].p_hdr == NULL)) {
1707 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1708 return -EIO;
1709 }
1603 1710
1604 /* try to insert block into found extent and return */ 1711 /* try to insert block into found extent and return */
1605 if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) 1712 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1606 && ext4_can_extents_be_merged(inode, ex, newext)) { 1713 && ext4_can_extents_be_merged(inode, ex, newext)) {
1607 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", 1714 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1608 ext4_ext_is_uninitialized(newext), 1715 ext4_ext_is_uninitialized(newext),
@@ -1723,7 +1830,7 @@ has_space:
1723 1830
1724merge: 1831merge:
1725 /* try to merge extents to the right */ 1832 /* try to merge extents to the right */
1726 if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) 1833 if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
1727 ext4_ext_try_to_merge(inode, path, nearex); 1834 ext4_ext_try_to_merge(inode, path, nearex);
1728 1835
1729 /* try to merge extents to the left */ 1836 /* try to merge extents to the left */
@@ -1761,7 +1868,9 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1761 while (block < last && block != EXT_MAX_BLOCK) { 1868 while (block < last && block != EXT_MAX_BLOCK) {
1762 num = last - block; 1869 num = last - block;
1763 /* find extent for this block */ 1870 /* find extent for this block */
1871 down_read(&EXT4_I(inode)->i_data_sem);
1764 path = ext4_ext_find_extent(inode, block, path); 1872 path = ext4_ext_find_extent(inode, block, path);
1873 up_read(&EXT4_I(inode)->i_data_sem);
1765 if (IS_ERR(path)) { 1874 if (IS_ERR(path)) {
1766 err = PTR_ERR(path); 1875 err = PTR_ERR(path);
1767 path = NULL; 1876 path = NULL;
@@ -1769,7 +1878,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1769 } 1878 }
1770 1879
1771 depth = ext_depth(inode); 1880 depth = ext_depth(inode);
1772 BUG_ON(path[depth].p_hdr == NULL); 1881 if (unlikely(path[depth].p_hdr == NULL)) {
1882 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1883 err = -EIO;
1884 break;
1885 }
1773 ex = path[depth].p_ext; 1886 ex = path[depth].p_ext;
1774 next = ext4_ext_next_allocated_block(path); 1887 next = ext4_ext_next_allocated_block(path);
1775 1888
@@ -1820,7 +1933,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1820 cbex.ec_type = EXT4_EXT_CACHE_EXTENT; 1933 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1821 } 1934 }
1822 1935
1823 BUG_ON(cbex.ec_len == 0); 1936 if (unlikely(cbex.ec_len == 0)) {
1937 EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
1938 err = -EIO;
1939 break;
1940 }
1824 err = func(inode, path, &cbex, ex, cbdata); 1941 err = func(inode, path, &cbex, ex, cbdata);
1825 ext4_ext_drop_refs(path); 1942 ext4_ext_drop_refs(path);
1826 1943
@@ -1934,7 +2051,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1934 2051
1935 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && 2052 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
1936 cex->ec_type != EXT4_EXT_CACHE_EXTENT); 2053 cex->ec_type != EXT4_EXT_CACHE_EXTENT);
1937 if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { 2054 if (in_range(block, cex->ec_block, cex->ec_len)) {
1938 ex->ee_block = cpu_to_le32(cex->ec_block); 2055 ex->ee_block = cpu_to_le32(cex->ec_block);
1939 ext4_ext_store_pblock(ex, cex->ec_start); 2056 ext4_ext_store_pblock(ex, cex->ec_start);
1940 ex->ee_len = cpu_to_le16(cex->ec_len); 2057 ex->ee_len = cpu_to_le16(cex->ec_len);
@@ -1957,14 +2074,16 @@ errout:
1957static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 2074static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1958 struct ext4_ext_path *path) 2075 struct ext4_ext_path *path)
1959{ 2076{
1960 struct buffer_head *bh;
1961 int err; 2077 int err;
1962 ext4_fsblk_t leaf; 2078 ext4_fsblk_t leaf;
1963 2079
1964 /* free index block */ 2080 /* free index block */
1965 path--; 2081 path--;
1966 leaf = idx_pblock(path->p_idx); 2082 leaf = idx_pblock(path->p_idx);
1967 BUG_ON(path->p_hdr->eh_entries == 0); 2083 if (unlikely(path->p_hdr->eh_entries == 0)) {
2084 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2085 return -EIO;
2086 }
1968 err = ext4_ext_get_access(handle, inode, path); 2087 err = ext4_ext_get_access(handle, inode, path);
1969 if (err) 2088 if (err)
1970 return err; 2089 return err;
@@ -1973,9 +2092,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1973 if (err) 2092 if (err)
1974 return err; 2093 return err;
1975 ext_debug("index is empty, remove it, free block %llu\n", leaf); 2094 ext_debug("index is empty, remove it, free block %llu\n", leaf);
1976 bh = sb_find_get_block(inode->i_sb, leaf); 2095 ext4_free_blocks(handle, inode, 0, leaf, 1,
1977 ext4_forget(handle, 1, inode, bh, leaf); 2096 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1978 ext4_free_blocks(handle, inode, leaf, 1, 1);
1979 return err; 2097 return err;
1980} 2098}
1981 2099
@@ -2042,12 +2160,11 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2042 struct ext4_extent *ex, 2160 struct ext4_extent *ex,
2043 ext4_lblk_t from, ext4_lblk_t to) 2161 ext4_lblk_t from, ext4_lblk_t to)
2044{ 2162{
2045 struct buffer_head *bh;
2046 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2163 unsigned short ee_len = ext4_ext_get_actual_len(ex);
2047 int i, metadata = 0; 2164 int flags = EXT4_FREE_BLOCKS_FORGET;
2048 2165
2049 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 2166 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2050 metadata = 1; 2167 flags |= EXT4_FREE_BLOCKS_METADATA;
2051#ifdef EXTENTS_STATS 2168#ifdef EXTENTS_STATS
2052 { 2169 {
2053 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2170 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2072,11 +2189,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2072 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2189 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2073 start = ext_pblock(ex) + ee_len - num; 2190 start = ext_pblock(ex) + ee_len - num;
2074 ext_debug("free last %u blocks starting %llu\n", num, start); 2191 ext_debug("free last %u blocks starting %llu\n", num, start);
2075 for (i = 0; i < num; i++) { 2192 ext4_free_blocks(handle, inode, 0, start, num, flags);
2076 bh = sb_find_get_block(inode->i_sb, start + i);
2077 ext4_forget(handle, 0, inode, bh, start + i);
2078 }
2079 ext4_free_blocks(handle, inode, start, num, metadata);
2080 } else if (from == le32_to_cpu(ex->ee_block) 2193 } else if (from == le32_to_cpu(ex->ee_block)
2081 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2194 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2082 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", 2195 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
@@ -2108,8 +2221,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2108 if (!path[depth].p_hdr) 2221 if (!path[depth].p_hdr)
2109 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 2222 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2110 eh = path[depth].p_hdr; 2223 eh = path[depth].p_hdr;
2111 BUG_ON(eh == NULL); 2224 if (unlikely(path[depth].p_hdr == NULL)) {
2112 2225 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2226 return -EIO;
2227 }
2113 /* find where to start removing */ 2228 /* find where to start removing */
2114 ex = EXT_LAST_EXTENT(eh); 2229 ex = EXT_LAST_EXTENT(eh);
2115 2230
@@ -2167,7 +2282,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2167 correct_index = 1; 2282 correct_index = 1;
2168 credits += (ext_depth(inode)) + 1; 2283 credits += (ext_depth(inode)) + 1;
2169 } 2284 }
2170 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 2285 credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2171 2286
2172 err = ext4_ext_truncate_extend_restart(handle, inode, credits); 2287 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2173 if (err) 2288 if (err)
@@ -2972,7 +3087,7 @@ fix_extent_len:
2972 ext4_ext_dirty(handle, inode, path + depth); 3087 ext4_ext_dirty(handle, inode, path + depth);
2973 return err; 3088 return err;
2974} 3089}
2975static int ext4_convert_unwritten_extents_dio(handle_t *handle, 3090static int ext4_convert_unwritten_extents_endio(handle_t *handle,
2976 struct inode *inode, 3091 struct inode *inode,
2977 struct ext4_ext_path *path) 3092 struct ext4_ext_path *path)
2978{ 3093{
@@ -3027,6 +3142,14 @@ out:
3027 return err; 3142 return err;
3028} 3143}
3029 3144
3145static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3146 sector_t block, int count)
3147{
3148 int i;
3149 for (i = 0; i < count; i++)
3150 unmap_underlying_metadata(bdev, block + i);
3151}
3152
3030static int 3153static int
3031ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3154ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3032 ext4_lblk_t iblock, unsigned int max_blocks, 3155 ext4_lblk_t iblock, unsigned int max_blocks,
@@ -3044,8 +3167,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3044 flags, allocated); 3167 flags, allocated);
3045 ext4_ext_show_leaf(inode, path); 3168 ext4_ext_show_leaf(inode, path);
3046 3169
3047 /* DIO get_block() before submit the IO, split the extent */ 3170 /* get_block() before submit the IO, split the extent */
3048 if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { 3171 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3049 ret = ext4_split_unwritten_extents(handle, 3172 ret = ext4_split_unwritten_extents(handle,
3050 inode, path, iblock, 3173 inode, path, iblock,
3051 max_blocks, flags); 3174 max_blocks, flags);
@@ -3055,15 +3178,19 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3055 * completed 3178 * completed
3056 */ 3179 */
3057 if (io) 3180 if (io)
3058 io->flag = DIO_AIO_UNWRITTEN; 3181 io->flag = EXT4_IO_UNWRITTEN;
3059 else 3182 else
3060 EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN; 3183 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3184 if (ext4_should_dioread_nolock(inode))
3185 set_buffer_uninit(bh_result);
3061 goto out; 3186 goto out;
3062 } 3187 }
3063 /* async DIO end_io complete, convert the filled extent to written */ 3188 /* IO end_io complete, convert the filled extent to written */
3064 if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { 3189 if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3065 ret = ext4_convert_unwritten_extents_dio(handle, inode, 3190 ret = ext4_convert_unwritten_extents_endio(handle, inode,
3066 path); 3191 path);
3192 if (ret >= 0)
3193 ext4_update_inode_fsync_trans(handle, inode, 1);
3067 goto out2; 3194 goto out2;
3068 } 3195 }
3069 /* buffered IO case */ 3196 /* buffered IO case */
@@ -3091,6 +3218,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3091 ret = ext4_ext_convert_to_initialized(handle, inode, 3218 ret = ext4_ext_convert_to_initialized(handle, inode,
3092 path, iblock, 3219 path, iblock,
3093 max_blocks); 3220 max_blocks);
3221 if (ret >= 0)
3222 ext4_update_inode_fsync_trans(handle, inode, 1);
3094out: 3223out:
3095 if (ret <= 0) { 3224 if (ret <= 0) {
3096 err = ret; 3225 err = ret;
@@ -3098,6 +3227,30 @@ out:
3098 } else 3227 } else
3099 allocated = ret; 3228 allocated = ret;
3100 set_buffer_new(bh_result); 3229 set_buffer_new(bh_result);
3230 /*
3231 * If we allocated more blocks than requested,
3232 * we need to make sure we unmap the extra blocks
3233 * allocated. The block actually needed will get
3234 * unmapped later when we find the buffer_head marked
3235 * new.
3236 */
3237 if (allocated > max_blocks) {
3238 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3239 newblock + max_blocks,
3240 allocated - max_blocks);
3241 allocated = max_blocks;
3242 }
3243
3244 /*
3245 * If fallocate was done at an offset that was already
3246 * delayed-allocated, block and quota reservations would
3247 * have been made in the delayed write path.
3248 * But fallocate would have already updated the quota and block
3249 * counts for this offset, so cancel those reservations.
3250 */
3251 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
3252 ext4_da_update_reserve_space(inode, allocated, 0);
3253
3101map_out: 3254map_out:
3102 set_buffer_mapped(bh_result); 3255 set_buffer_mapped(bh_result);
3103out1: 3256out1:
@@ -3138,7 +3291,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3138{ 3291{
3139 struct ext4_ext_path *path = NULL; 3292 struct ext4_ext_path *path = NULL;
3140 struct ext4_extent_header *eh; 3293 struct ext4_extent_header *eh;
3141 struct ext4_extent newex, *ex; 3294 struct ext4_extent newex, *ex, *last_ex;
3142 ext4_fsblk_t newblock; 3295 ext4_fsblk_t newblock;
3143 int err = 0, depth, ret, cache_type; 3296 int err = 0, depth, ret, cache_type;
3144 unsigned int allocated = 0; 3297 unsigned int allocated = 0;
@@ -3190,7 +3343,13 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3190 * this situation is possible, though, _during_ tree modification; 3343 * this situation is possible, though, _during_ tree modification;
3191 * this is why assert can't be put in ext4_ext_find_extent() 3344 * this is why assert can't be put in ext4_ext_find_extent()
3192 */ 3345 */
3193 BUG_ON(path[depth].p_ext == NULL && depth != 0); 3346 if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3347 EXT4_ERROR_INODE(inode, "bad extent address "
3348 "iblock: %d, depth: %d pblock %lld",
3349 iblock, depth, path[depth].p_block);
3350 err = -EIO;
3351 goto out2;
3352 }
3194 eh = path[depth].p_hdr; 3353 eh = path[depth].p_hdr;
3195 3354
3196 ex = path[depth].p_ext; 3355 ex = path[depth].p_ext;
@@ -3205,7 +3364,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3205 */ 3364 */
3206 ee_len = ext4_ext_get_actual_len(ex); 3365 ee_len = ext4_ext_get_actual_len(ex);
3207 /* if found extent covers block, simply return it */ 3366 /* if found extent covers block, simply return it */
3208 if (iblock >= ee_block && iblock < ee_block + ee_len) { 3367 if (in_range(iblock, ee_block, ee_len)) {
3209 newblock = iblock - ee_block + ee_start; 3368 newblock = iblock - ee_block + ee_start;
3210 /* number of remaining blocks in the extent */ 3369 /* number of remaining blocks in the extent */
3211 allocated = ee_len - (iblock - ee_block); 3370 allocated = ee_len - (iblock - ee_block);
@@ -3297,21 +3456,35 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3297 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ 3456 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
3298 ext4_ext_mark_uninitialized(&newex); 3457 ext4_ext_mark_uninitialized(&newex);
3299 /* 3458 /*
3300 * io_end structure was created for every async 3459 * io_end structure was created for every IO write to an
3301 * direct IO write to the middle of the file. 3460 * uninitialized extent. To avoid unnecessary conversion,
3302 * To avoid unnecessary conversion for every aio dio rewrite 3461 * here we flag the IO that really needs the conversion.
3303 * to the mid of file, here we flag the IO that really
3304 * needs the conversion.
3305 * For the non-async direct IO case, flag the inode state 3462 * For the non-async direct IO case, flag the inode state
3306 * that we need to perform the conversion when IO is done. 3463 * that we need to perform the conversion when IO is done.
3307 */ 3464 */
3308 if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { 3465 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3309 if (io) 3466 if (io)
3310 io->flag = DIO_AIO_UNWRITTEN; 3467 io->flag = EXT4_IO_UNWRITTEN;
3311 else 3468 else
3312 EXT4_I(inode)->i_state |= 3469 ext4_set_inode_state(inode,
3313 EXT4_STATE_DIO_UNWRITTEN;; 3470 EXT4_STATE_DIO_UNWRITTEN);
3314 } 3471 }
3472 if (ext4_should_dioread_nolock(inode))
3473 set_buffer_uninit(bh_result);
3474 }
3475
3476 if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
3477 if (unlikely(!eh->eh_entries)) {
3478 EXT4_ERROR_INODE(inode,
3479 "eh->eh_entries == 0 ee_block %d",
3480 le32_to_cpu(ex->ee_block));
3481 err = -EIO;
3482 goto out2;
3483 }
3484 last_ex = EXT_LAST_EXTENT(eh);
3485 if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
3486 + ext4_ext_get_actual_len(last_ex))
3487 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
3315 } 3488 }
3316 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3489 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3317 if (err) { 3490 if (err) {
@@ -3319,20 +3492,35 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3319 /* not a good idea to call discard here directly, 3492 /* not a good idea to call discard here directly,
3320 * but otherwise we'd need to call it every free() */ 3493 * but otherwise we'd need to call it every free() */
3321 ext4_discard_preallocations(inode); 3494 ext4_discard_preallocations(inode);
3322 ext4_free_blocks(handle, inode, ext_pblock(&newex), 3495 ext4_free_blocks(handle, inode, 0, ext_pblock(&newex),
3323 ext4_ext_get_actual_len(&newex), 0); 3496 ext4_ext_get_actual_len(&newex), 0);
3324 goto out2; 3497 goto out2;
3325 } 3498 }
3326 3499
3327 /* previous routine could use block we allocated */ 3500 /* previous routine could use block we allocated */
3328 newblock = ext_pblock(&newex); 3501 newblock = ext_pblock(&newex);
3329 allocated = ext4_ext_get_actual_len(&newex); 3502 allocated = ext4_ext_get_actual_len(&newex);
3503 if (allocated > max_blocks)
3504 allocated = max_blocks;
3330 set_buffer_new(bh_result); 3505 set_buffer_new(bh_result);
3331 3506
3332 /* Cache only when it is _not_ an uninitialized extent */ 3507 /*
3333 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) 3508 * Update reserved blocks/metadata blocks after successful
3509 * block allocation which had been deferred till now.
3510 */
3511 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
3512 ext4_da_update_reserve_space(inode, allocated, 1);
3513
3514 /*
3515 * Cache the extent and update transaction to commit on fdatasync only
3516 * when it is _not_ an uninitialized extent.
3517 */
3518 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3334 ext4_ext_put_in_cache(inode, iblock, allocated, newblock, 3519 ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
3335 EXT4_EXT_CACHE_EXTENT); 3520 EXT4_EXT_CACHE_EXTENT);
3521 ext4_update_inode_fsync_trans(handle, inode, 1);
3522 } else
3523 ext4_update_inode_fsync_trans(handle, inode, 0);
3336out: 3524out:
3337 if (allocated > max_blocks) 3525 if (allocated > max_blocks)
3338 allocated = max_blocks; 3526 allocated = max_blocks;
@@ -3431,6 +3619,13 @@ static void ext4_falloc_update_inode(struct inode *inode,
3431 i_size_write(inode, new_size); 3619 i_size_write(inode, new_size);
3432 if (new_size > EXT4_I(inode)->i_disksize) 3620 if (new_size > EXT4_I(inode)->i_disksize)
3433 ext4_update_i_disksize(inode, new_size); 3621 ext4_update_i_disksize(inode, new_size);
3622 } else {
3623 /*
3624 * Mark that we allocate beyond EOF so the subsequent truncate
3625 * can proceed even if the new size is the same as i_size.
3626 */
3627 if (new_size > i_size_read(inode))
3628 EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL;
3434 } 3629 }
3435 3630
3436} 3631}
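The new else branch covers fallocate with FALLOC_FL_KEEP_SIZE: i_size stays put, but EXT4_EOFBLOCKS_FL records that allocated blocks now exist past EOF, so a later truncate to the unchanged size still has blocks to trim. The userspace view of that case (Linux-specific; error handling elided):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC, 0600);
        struct stat st;

        /* preallocate 1 MiB past EOF without changing i_size */
        fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20);
        fstat(fd, &st);
        printf("size=%lld blocks=%lld\n",
               (long long)st.st_size, (long long)st.st_blocks);

        /* size is still 0, yet blocks were allocated; truncating to the
         * same size must free them - which is what the flag enables */
        ftruncate(fd, 0);
        close(fd);
        return 0;
    }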
@@ -3535,7 +3730,7 @@ retry:
3535 * Returns 0 on success. 3730 * Returns 0 on success.
3536 */ 3731 */
3537int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 3732int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3538 loff_t len) 3733 ssize_t len)
3539{ 3734{
3540 handle_t *handle; 3735 handle_t *handle;
3541 ext4_lblk_t block; 3736 ext4_lblk_t block;
@@ -3567,7 +3762,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3567 map_bh.b_state = 0; 3762 map_bh.b_state = 0;
3568 ret = ext4_get_blocks(handle, inode, block, 3763 ret = ext4_get_blocks(handle, inode, block,
3569 max_blocks, &map_bh, 3764 max_blocks, &map_bh,
3570 EXT4_GET_BLOCKS_DIO_CONVERT_EXT); 3765 EXT4_GET_BLOCKS_IO_CONVERT_EXT);
3571 if (ret <= 0) { 3766 if (ret <= 0) {
3572 WARN_ON(ret <= 0); 3767 WARN_ON(ret <= 0);
3573 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3768 printk(KERN_ERR "%s: ext4_ext_get_blocks "
@@ -3671,7 +3866,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
3671 int error = 0; 3866 int error = 0;
3672 3867
3673 /* in-inode? */ 3868 /* in-inode? */
3674 if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { 3869 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
3675 struct ext4_iloc iloc; 3870 struct ext4_iloc iloc;
3676 int offset; /* offset of xattr in inode */ 3871 int offset; /* offset of xattr in inode */
3677 3872
@@ -3699,7 +3894,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3699 __u64 start, __u64 len) 3894 __u64 start, __u64 len)
3700{ 3895{
3701 ext4_lblk_t start_blk; 3896 ext4_lblk_t start_blk;
3702 ext4_lblk_t len_blks;
3703 int error = 0; 3897 int error = 0;
3704 3898
3705 /* fallback to generic here if not in extents fmt */ 3899 /* fallback to generic here if not in extents fmt */
@@ -3713,17 +3907,21 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3713 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { 3907 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
3714 error = ext4_xattr_fiemap(inode, fieinfo); 3908 error = ext4_xattr_fiemap(inode, fieinfo);
3715 } else { 3909 } else {
3910 ext4_lblk_t len_blks;
3911 __u64 last_blk;
3912
3716 start_blk = start >> inode->i_sb->s_blocksize_bits; 3913 start_blk = start >> inode->i_sb->s_blocksize_bits;
3717 len_blks = len >> inode->i_sb->s_blocksize_bits; 3914 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
3915 if (last_blk >= EXT_MAX_BLOCK)
3916 last_blk = EXT_MAX_BLOCK-1;
3917 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
3718 3918
3719 /* 3919 /*
3720 * Walk the extent tree gathering extent information. 3920 * Walk the extent tree gathering extent information.
3721 * ext4_ext_fiemap_cb will push extents back to user. 3921 * ext4_ext_fiemap_cb will push extents back to user.
3722 */ 3922 */
3723 down_read(&EXT4_I(inode)->i_data_sem);
3724 error = ext4_ext_walk_space(inode, start_blk, len_blks, 3923 error = ext4_ext_walk_space(inode, start_blk, len_blks,
3725 ext4_ext_fiemap_cb, fieinfo); 3924 ext4_ext_fiemap_cb, fieinfo);
3726 up_read(&EXT4_I(inode)->i_data_sem);
3727 } 3925 }
3728 3926
3729 return error; 3927 return error;
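The rewritten range math is worth spelling out: FIEMAP passes a byte range, and the old len >> blkbits both rounded away partial blocks and could wrap the 32-bit ext4_lblk_t for the ~0ULL "whole file" request. The new code computes an inclusive last block and clamps it at EXT_MAX_BLOCK. A runnable model of just that arithmetic:

#include <stdint.h>
#include <stdio.h>

#define EXT_MAX_BLOCK 0xffffffffU /* largest logical block an extent tree can map */

/* Model of the fiemap range math above: turn a byte range into an
 * inclusive block range, clamped so len_blks fits the 32-bit
 * ext4_lblk_t even for the start=0, len=~0ULL whole-file request. */
static uint32_t fiemap_len_blks(uint64_t start, uint64_t len, int blkbits)
{
	uint32_t start_blk = (uint32_t)(start >> blkbits);
	uint64_t last_blk = (start + len - 1) >> blkbits;

	if (last_blk >= EXT_MAX_BLOCK)
		last_blk = EXT_MAX_BLOCK - 1;
	return (uint32_t)last_blk - start_blk + 1;
}

int main(void)
{
	/* whole-file request on a 4KiB-block fs: clamps instead of wrapping */
	printf("%u\n", (unsigned)fiemap_len_blks(0, ~0ULL, 12));
	/* bytes 4095..4096 straddle blocks 0..1: inclusive math says 2,
	 * whereas the old len >> blkbits would have said 0 */
	printf("%u\n", (unsigned)fiemap_len_blks(4095, 2, 12));
	return 0;
}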
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 9630583cef28..d0776e410f34 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -23,6 +23,7 @@
23#include <linux/jbd2.h> 23#include <linux/jbd2.h>
24#include <linux/mount.h> 24#include <linux/mount.h>
25#include <linux/path.h> 25#include <linux/path.h>
26#include <linux/quotaops.h>
26#include "ext4.h" 27#include "ext4.h"
27#include "ext4_jbd2.h" 28#include "ext4_jbd2.h"
28#include "xattr.h" 29#include "xattr.h"
@@ -35,9 +36,9 @@
35 */ 36 */
36static int ext4_release_file(struct inode *inode, struct file *filp) 37static int ext4_release_file(struct inode *inode, struct file *filp)
37{ 38{
38 if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) { 39 if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
39 ext4_alloc_da_blocks(inode); 40 ext4_alloc_da_blocks(inode);
40 EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE; 41 ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
41 } 42 }
42 /* if we are the last writer on the inode, drop the block reservation */ 43 /* if we are the last writer on the inode, drop the block reservation */
43 if ((filp->f_mode & FMODE_WRITE) && 44 if ((filp->f_mode & FMODE_WRITE) &&
@@ -116,18 +117,16 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
116 * devices or filesystem images. 117 * devices or filesystem images.
117 */ 118 */
118 memset(buf, 0, sizeof(buf)); 119 memset(buf, 0, sizeof(buf));
119 path.mnt = mnt->mnt_parent; 120 path.mnt = mnt;
120 path.dentry = mnt->mnt_mountpoint; 121 path.dentry = mnt->mnt_root;
121 path_get(&path);
122 cp = d_path(&path, buf, sizeof(buf)); 122 cp = d_path(&path, buf, sizeof(buf));
123 path_put(&path);
124 if (!IS_ERR(cp)) { 123 if (!IS_ERR(cp)) {
125 memcpy(sbi->s_es->s_last_mounted, cp, 124 memcpy(sbi->s_es->s_last_mounted, cp,
126 sizeof(sbi->s_es->s_last_mounted)); 125 sizeof(sbi->s_es->s_last_mounted));
127 sb->s_dirt = 1; 126 sb->s_dirt = 1;
128 } 127 }
129 } 128 }
130 return generic_file_open(inode, filp); 129 return dquot_file_open(inode, filp);
131} 130}
132 131
133const struct file_operations ext4_file_operations = { 132const struct file_operations ext4_file_operations = {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 2b1531266ee2..0d0c3239c1cd 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -51,25 +51,30 @@
51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
52{ 52{
53 struct inode *inode = dentry->d_inode; 53 struct inode *inode = dentry->d_inode;
54 struct ext4_inode_info *ei = EXT4_I(inode);
54 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 55 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
55 int err, ret = 0; 56 int ret;
57 tid_t commit_tid;
56 58
57 J_ASSERT(ext4_journal_current_handle() == NULL); 59 J_ASSERT(ext4_journal_current_handle() == NULL);
58 60
59 trace_ext4_sync_file(file, dentry, datasync); 61 trace_ext4_sync_file(file, dentry, datasync);
60 62
61 ret = flush_aio_dio_completed_IO(inode); 63 if (inode->i_sb->s_flags & MS_RDONLY)
64 return 0;
65
66 ret = flush_completed_IO(inode);
62 if (ret < 0) 67 if (ret < 0)
63 goto out; 68 return ret;
69
70 if (!journal)
71 return simple_fsync(file, dentry, datasync);
72
64 /* 73 /*
65 * data=writeback: 74 * data=writeback,ordered:
66 * The caller's filemap_fdatawrite()/wait will sync the data. 75 * The caller's filemap_fdatawrite()/wait will sync the data.
67 * sync_inode() will sync the metadata 76 * Metadata is in the journal, we wait for proper transaction to
68 * 77 * commit here.
69 * data=ordered:
70 * The caller's filemap_fdatawrite() will write the data and
71 * sync_inode() will write the inode if it is dirty. Then the caller's
72 * filemap_fdatawait() will wait on the pages.
73 * 78 *
74 * data=journal: 79 * data=journal:
75 * filemap_fdatawrite won't do anything (the buffers are clean). 80 * filemap_fdatawrite won't do anything (the buffers are clean).
@@ -79,32 +84,25 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
79 * (they were dirtied by commit). But that's OK - the blocks are 84 * (they were dirtied by commit). But that's OK - the blocks are
80 * safe in-journal, which is all fsync() needs to ensure. 85 * safe in-journal, which is all fsync() needs to ensure.
81 */ 86 */
82 if (ext4_should_journal_data(inode)) { 87 if (ext4_should_journal_data(inode))
83 ret = ext4_force_commit(inode->i_sb); 88 return ext4_force_commit(inode->i_sb);
84 goto out;
85 }
86 89
87 if (!journal)
88 ret = sync_mapping_buffers(inode->i_mapping);
89
90 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
91 goto out;
92
93 /*
94 * The VFS has written the file data. If the inode is unaltered
95 * then we need not start a commit.
96 */
97 if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
98 struct writeback_control wbc = {
99 .sync_mode = WB_SYNC_ALL,
100 .nr_to_write = 0, /* sys_fsync did this */
101 };
102 err = sync_inode(inode, &wbc);
103 if (ret == 0)
104 ret = err;
105 }
106out:
107 if (journal && (journal->j_flags & JBD2_BARRIER))
 90 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
 91 if (jbd2_log_start_commit(journal, commit_tid)) {
 92 /*
 93 * When the journal is on a different device than the
 94 * fs data disk, we need to issue the barrier in
 95 * writeback mode. (In ordered mode, the jbd2 layer
 96 * will take care of issuing the barrier. In
 97 * data=journal, all of the data blocks are written to
 98 * the journal device.)
 99 */
 100 if (ext4_should_writeback_data(inode) &&
 101 (journal->j_fs_dev != journal->j_dev) &&
 102 (journal->j_flags & JBD2_BARRIER))
 103 blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 104 jbd2_log_wait_commit(journal, commit_tid);
 105 } else if (journal->j_flags & JBD2_BARRIER)
108 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 106 blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
109 return ret; 107 return ret;
110} 108}
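The rewritten ext4_sync_file() replaces "write the inode and hope" with transaction tracking: every change stamps the inode with the tid of its transaction (i_sync_tid, plus i_datasync_tid when the change matters for data integrity), so fsync reduces to starting and waiting on one jbd2 commit. A userspace model -- the journal helpers here are toys, not the jbd2 API:

#include <stdbool.h>

/* Toy journal: committed_tid advances as the commit thread finishes
 * transactions. */
struct toy_journal { long committed_tid; };
struct toy_inode { long i_sync_tid, i_datasync_tid; };

static bool toy_log_start_commit(struct toy_journal *j, long tid)
{
	return tid > j->committed_tid; /* true: tid not yet durable, commit kicked */
}

static void toy_log_wait_commit(struct toy_journal *j, long tid)
{
	while (j->committed_tid < tid)
		j->committed_tid++; /* stands in for blocking on the commit thread */
}

/* Model of the new ext4_sync_file() core: pick the tid stamped on the
 * inode and start/wait on exactly that commit, instead of writing the
 * inode through the VFS. */
static void toy_fsync(struct toy_journal *j, struct toy_inode *inode, bool datasync)
{
	long commit_tid = datasync ? inode->i_datasync_tid : inode->i_sync_tid;

	if (toy_log_start_commit(j, commit_tid))
		toy_log_wait_commit(j, commit_tid);
	/* else: that transaction already committed; at most a barrier is needed */
}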
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f3624ead4f6c..57f6eef6ccd6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -76,8 +76,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
76 /* If checksum is bad mark all blocks and inodes use to prevent 76 /* If checksum is bad mark all blocks and inodes use to prevent
77 * allocation, essentially implementing a per-group read-only flag. */ 77 * allocation, essentially implementing a per-group read-only flag. */
78 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 78 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
79 ext4_error(sb, __func__, "Checksum bad for group %u", 79 ext4_error(sb, "Checksum bad for group %u", block_group);
80 block_group);
81 ext4_free_blks_set(sb, gdp, 0); 80 ext4_free_blks_set(sb, gdp, 0);
82 ext4_free_inodes_set(sb, gdp, 0); 81 ext4_free_inodes_set(sb, gdp, 0);
83 ext4_itable_unused_set(sb, gdp, 0); 82 ext4_itable_unused_set(sb, gdp, 0);
@@ -111,8 +110,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
111 bitmap_blk = ext4_inode_bitmap(sb, desc); 110 bitmap_blk = ext4_inode_bitmap(sb, desc);
112 bh = sb_getblk(sb, bitmap_blk); 111 bh = sb_getblk(sb, bitmap_blk);
113 if (unlikely(!bh)) { 112 if (unlikely(!bh)) {
114 ext4_error(sb, __func__, 113 ext4_error(sb, "Cannot read inode bitmap - "
115 "Cannot read inode bitmap - "
116 "block_group = %u, inode_bitmap = %llu", 114 "block_group = %u, inode_bitmap = %llu",
117 block_group, bitmap_blk); 115 block_group, bitmap_blk);
118 return NULL; 116 return NULL;
@@ -153,8 +151,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
153 set_bitmap_uptodate(bh); 151 set_bitmap_uptodate(bh);
154 if (bh_submit_read(bh) < 0) { 152 if (bh_submit_read(bh) < 0) {
155 put_bh(bh); 153 put_bh(bh);
156 ext4_error(sb, __func__, 154 ext4_error(sb, "Cannot read inode bitmap - "
157 "Cannot read inode bitmap - "
158 "block_group = %u, inode_bitmap = %llu", 155 "block_group = %u, inode_bitmap = %llu",
159 block_group, bitmap_blk); 156 block_group, bitmap_blk);
160 return NULL; 157 return NULL;
@@ -217,10 +214,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
217 * Note: we must free any quota before locking the superblock, 214 * Note: we must free any quota before locking the superblock,
218 * as writing the quota to disk may need the lock as well. 215 * as writing the quota to disk may need the lock as well.
219 */ 216 */
220 vfs_dq_init(inode); 217 dquot_initialize(inode);
221 ext4_xattr_delete_inode(handle, inode); 218 ext4_xattr_delete_inode(handle, inode);
222 vfs_dq_free_inode(inode); 219 dquot_free_inode(inode);
223 vfs_dq_drop(inode); 220 dquot_drop(inode);
224 221
225 is_directory = S_ISDIR(inode->i_mode); 222 is_directory = S_ISDIR(inode->i_mode);
226 223
@@ -229,8 +226,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
229 226
230 es = EXT4_SB(sb)->s_es; 227 es = EXT4_SB(sb)->s_es;
231 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { 228 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
232 ext4_error(sb, "ext4_free_inode", 229 ext4_error(sb, "reserved or nonexistent inode %lu", ino);
233 "reserved or nonexistent inode %lu", ino);
234 goto error_return; 230 goto error_return;
235 } 231 }
236 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); 232 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
@@ -248,8 +244,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
248 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), 244 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
249 bit, bitmap_bh->b_data); 245 bit, bitmap_bh->b_data);
250 if (!cleared) 246 if (!cleared)
251 ext4_error(sb, "ext4_free_inode", 247 ext4_error(sb, "bit already cleared for inode %lu", ino);
252 "bit already cleared for inode %lu", ino);
253 else { 248 else {
254 gdp = ext4_get_group_desc(sb, block_group, &bh2); 249 gdp = ext4_get_group_desc(sb, block_group, &bh2);
255 250
@@ -268,7 +263,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
268 ext4_group_t f; 263 ext4_group_t f;
269 264
270 f = ext4_flex_group(sbi, block_group); 265 f = ext4_flex_group(sbi, block_group);
271 atomic_dec(&sbi->s_flex_groups[f].free_inodes); 266 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
272 } 267 }
273 268
274 } 269 }
@@ -736,8 +731,7 @@ static int ext4_claim_inode(struct super_block *sb,
736 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || 731 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
737 ino > EXT4_INODES_PER_GROUP(sb)) { 732 ino > EXT4_INODES_PER_GROUP(sb)) {
738 ext4_unlock_group(sb, group); 733 ext4_unlock_group(sb, group);
739 ext4_error(sb, __func__, 734 ext4_error(sb, "reserved inode or inode > inodes count - "
740 "reserved inode or inode > inodes count - "
741 "block_group = %u, inode=%lu", group, 735 "block_group = %u, inode=%lu", group,
742 ino + group * EXT4_INODES_PER_GROUP(sb)); 736 ino + group * EXT4_INODES_PER_GROUP(sb));
743 return 1; 737 return 1;
@@ -779,7 +773,7 @@ static int ext4_claim_inode(struct super_block *sb,
779 if (sbi->s_log_groups_per_flex) { 773 if (sbi->s_log_groups_per_flex) {
780 ext4_group_t f = ext4_flex_group(sbi, group); 774 ext4_group_t f = ext4_flex_group(sbi, group);
781 775
782 atomic_inc(&sbi->s_flex_groups[f].free_inodes); 776 atomic_inc(&sbi->s_flex_groups[f].used_dirs);
783 } 777 }
784 } 778 }
785 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 779 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
@@ -904,7 +898,7 @@ repeat_in_this_group:
904 BUFFER_TRACE(inode_bitmap_bh, 898 BUFFER_TRACE(inode_bitmap_bh,
905 "call ext4_handle_dirty_metadata"); 899 "call ext4_handle_dirty_metadata");
906 err = ext4_handle_dirty_metadata(handle, 900 err = ext4_handle_dirty_metadata(handle,
907 inode, 901 NULL,
908 inode_bitmap_bh); 902 inode_bitmap_bh);
909 if (err) 903 if (err)
910 goto fail; 904 goto fail;
@@ -1029,15 +1023,16 @@ got:
1029 inode->i_generation = sbi->s_next_generation++; 1023 inode->i_generation = sbi->s_next_generation++;
1030 spin_unlock(&sbi->s_next_gen_lock); 1024 spin_unlock(&sbi->s_next_gen_lock);
1031 1025
1032 ei->i_state = EXT4_STATE_NEW; 1026 ei->i_state_flags = 0;
1027 ext4_set_inode_state(inode, EXT4_STATE_NEW);
1033 1028
1034 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 1029 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
1035 1030
1036 ret = inode; 1031 ret = inode;
1037 if (vfs_dq_alloc_inode(inode)) { 1032 dquot_initialize(inode);
1038 err = -EDQUOT; 1033 err = dquot_alloc_inode(inode);
1034 if (err)
1039 goto fail_drop; 1035 goto fail_drop;
1040 }
1041 1036
1042 err = ext4_init_acl(handle, inode, dir); 1037 err = ext4_init_acl(handle, inode, dir);
1043 if (err) 1038 if (err)
@@ -1074,10 +1069,10 @@ really_out:
1074 return ret; 1069 return ret;
1075 1070
1076fail_free_drop: 1071fail_free_drop:
1077 vfs_dq_free_inode(inode); 1072 dquot_free_inode(inode);
1078 1073
1079fail_drop: 1074fail_drop:
1080 vfs_dq_drop(inode); 1075 dquot_drop(inode);
1081 inode->i_flags |= S_NOQUOTA; 1076 inode->i_flags |= S_NOQUOTA;
1082 inode->i_nlink = 0; 1077 inode->i_nlink = 0;
1083 unlock_new_inode(inode); 1078 unlock_new_inode(inode);
@@ -1098,8 +1093,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
1098 1093
1099 /* Error cases - e2fsck has already cleaned up for us */ 1094 /* Error cases - e2fsck has already cleaned up for us */
1100 if (ino > max_ino) { 1095 if (ino > max_ino) {
1101 ext4_warning(sb, __func__, 1096 ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino);
1102 "bad orphan ino %lu! e2fsck was run?", ino);
1103 goto error; 1097 goto error;
1104 } 1098 }
1105 1099
@@ -1107,8 +1101,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
1107 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); 1101 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
1108 bitmap_bh = ext4_read_inode_bitmap(sb, block_group); 1102 bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
1109 if (!bitmap_bh) { 1103 if (!bitmap_bh) {
1110 ext4_warning(sb, __func__, 1104 ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
1111 "inode bitmap error for orphan %lu", ino);
1112 goto error; 1105 goto error;
1113 } 1106 }
1114 1107
@@ -1140,8 +1133,7 @@ iget_failed:
1140 err = PTR_ERR(inode); 1133 err = PTR_ERR(inode);
1141 inode = NULL; 1134 inode = NULL;
1142bad_orphan: 1135bad_orphan:
1143 ext4_warning(sb, __func__, 1136 ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino);
1144 "bad orphan inode %lu! e2fsck was run?", ino);
1145 printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", 1137 printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
1146 bit, (unsigned long long)bitmap_bh->b_blocknr, 1138 bit, (unsigned long long)bitmap_bh->b_blocknr,
1147 ext4_test_bit(bit, bitmap_bh->b_data)); 1139 ext4_test_bit(bit, bitmap_bh->b_data));
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2c8caa51addb..5381802d6052 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -38,6 +38,8 @@
38#include <linux/uio.h> 38#include <linux/uio.h>
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h> 40#include <linux/workqueue.h>
41#include <linux/kernel.h>
42#include <linux/slab.h>
41 43
42#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
43#include "xattr.h" 45#include "xattr.h"
@@ -71,58 +73,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
71} 73}
72 74
73/* 75/*
74 * The ext4 forget function must perform a revoke if we are freeing data
75 * which has been journaled. Metadata (eg. indirect blocks) must be
76 * revoked in all cases.
77 *
78 * "bh" may be NULL: a metadata block may have been freed from memory
79 * but there may still be a record of it in the journal, and that record
80 * still needs to be revoked.
81 *
82 * If the handle isn't valid we're not journaling, but we still need to
83 * call into ext4_journal_revoke() to put the buffer head.
84 */
85int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
86 struct buffer_head *bh, ext4_fsblk_t blocknr)
87{
88 int err;
89
90 might_sleep();
91
92 BUFFER_TRACE(bh, "enter");
93
94 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
95 "data mode %x\n",
96 bh, is_metadata, inode->i_mode,
97 test_opt(inode->i_sb, DATA_FLAGS));
98
99 /* Never use the revoke function if we are doing full data
100 * journaling: there is no need to, and a V1 superblock won't
101 * support it. Otherwise, only skip the revoke on un-journaled
102 * data blocks. */
103
104 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
105 (!is_metadata && !ext4_should_journal_data(inode))) {
106 if (bh) {
107 BUFFER_TRACE(bh, "call jbd2_journal_forget");
108 return ext4_journal_forget(handle, bh);
109 }
110 return 0;
111 }
112
113 /*
114 * data!=journal && (is_metadata || should_journal_data(inode))
115 */
116 BUFFER_TRACE(bh, "call ext4_journal_revoke");
117 err = ext4_journal_revoke(handle, blocknr, bh);
118 if (err)
119 ext4_abort(inode->i_sb, __func__,
120 "error %d when attempting revoke", err);
121 BUFFER_TRACE(bh, "exit");
122 return err;
123}
124
125/*
126 * Work out how many blocks we need to proceed with the next chunk of a 76 * Work out how many blocks we need to proceed with the next chunk of a
127 * truncate transaction. 77 * truncate transaction.
128 */ 78 */
@@ -222,6 +172,9 @@ void ext4_delete_inode(struct inode *inode)
222 handle_t *handle; 172 handle_t *handle;
223 int err; 173 int err;
224 174
175 if (!is_bad_inode(inode))
176 dquot_initialize(inode);
177
225 if (ext4_should_order_data(inode)) 178 if (ext4_should_order_data(inode))
226 ext4_begin_ordered_truncate(inode, 0); 179 ext4_begin_ordered_truncate(inode, 0);
227 truncate_inode_pages(&inode->i_data, 0); 180 truncate_inode_pages(&inode->i_data, 0);
@@ -246,7 +199,7 @@ void ext4_delete_inode(struct inode *inode)
246 inode->i_size = 0; 199 inode->i_size = 0;
247 err = ext4_mark_inode_dirty(handle, inode); 200 err = ext4_mark_inode_dirty(handle, inode);
248 if (err) { 201 if (err) {
249 ext4_warning(inode->i_sb, __func__, 202 ext4_warning(inode->i_sb,
250 "couldn't mark inode dirty (err %d)", err); 203 "couldn't mark inode dirty (err %d)", err);
251 goto stop_handle; 204 goto stop_handle;
252 } 205 }
@@ -264,7 +217,7 @@ void ext4_delete_inode(struct inode *inode)
264 if (err > 0) 217 if (err > 0)
265 err = ext4_journal_restart(handle, 3); 218 err = ext4_journal_restart(handle, 3);
266 if (err != 0) { 219 if (err != 0) {
267 ext4_warning(inode->i_sb, __func__, 220 ext4_warning(inode->i_sb,
268 "couldn't extend journal (err %d)", err); 221 "couldn't extend journal (err %d)", err);
269 stop_handle: 222 stop_handle:
270 ext4_journal_stop(handle); 223 ext4_journal_stop(handle);
@@ -375,8 +328,7 @@ static int ext4_block_to_path(struct inode *inode,
375 offsets[n++] = i_block & (ptrs - 1); 328 offsets[n++] = i_block & (ptrs - 1);
376 final = ptrs; 329 final = ptrs;
377 } else { 330 } else {
378 ext4_warning(inode->i_sb, "ext4_block_to_path", 331 ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
379 "block %lu > max in inode %lu",
380 i_block + direct_blocks + 332 i_block + direct_blocks +
381 indirect_blocks + double_blocks, inode->i_ino); 333 indirect_blocks + double_blocks, inode->i_ino);
382 } 334 }
@@ -396,7 +348,7 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
396 if (blk && 348 if (blk &&
397 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 349 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
398 blk, 1))) { 350 blk, 1))) {
399 ext4_error(inode->i_sb, function, 351 __ext4_error(inode->i_sb, function,
400 "invalid block reference %u " 352 "invalid block reference %u "
401 "in inode #%lu", blk, inode->i_ino); 353 "in inode #%lu", blk, inode->i_ino);
402 return -EIO; 354 return -EIO;
@@ -659,7 +611,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
659 if (*err) 611 if (*err)
660 goto failed_out; 612 goto failed_out;
661 613
662 BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS); 614 if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
615 EXT4_ERROR_INODE(inode,
616 "current_block %llu + count %lu > %d!",
617 current_block, count,
618 EXT4_MAX_BLOCK_FILE_PHYS);
619 *err = -EIO;
620 goto failed_out;
621 }
663 622
664 target -= count; 623 target -= count;
665 /* allocate blocks for indirect blocks */ 624 /* allocate blocks for indirect blocks */
@@ -695,7 +654,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
695 ar.flags = EXT4_MB_HINT_DATA; 654 ar.flags = EXT4_MB_HINT_DATA;
696 655
697 current_block = ext4_mb_new_blocks(handle, &ar, err); 656 current_block = ext4_mb_new_blocks(handle, &ar, err);
698 BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS); 657 if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
658 EXT4_ERROR_INODE(inode,
659 "current_block %llu + ar.len %d > %d!",
660 current_block, ar.len,
661 EXT4_MAX_BLOCK_FILE_PHYS);
662 *err = -EIO;
663 goto failed_out;
664 }
699 665
700 if (*err && (target == blks)) { 666 if (*err && (target == blks)) {
701 /* 667 /*
@@ -721,7 +687,7 @@ allocated:
721 return ret; 687 return ret;
722failed_out: 688failed_out:
723 for (i = 0; i < index; i++) 689 for (i = 0; i < index; i++)
724 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 690 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
725 return ret; 691 return ret;
726} 692}
727 693
@@ -817,14 +783,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
817 return err; 783 return err;
818failed: 784failed:
819 /* Allocation failed, free what we already allocated */ 785 /* Allocation failed, free what we already allocated */
786 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
820 for (i = 1; i <= n ; i++) { 787 for (i = 1; i <= n ; i++) {
821 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); 788 /*
822 ext4_journal_forget(handle, branch[i].bh); 789 * branch[i].bh is newly allocated, so there is no
790 * need to revoke the block, which is why we don't
791 * need to set EXT4_FREE_BLOCKS_METADATA.
792 */
793 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
794 EXT4_FREE_BLOCKS_FORGET);
823 } 795 }
824 for (i = 0; i < indirect_blks; i++) 796 for (i = n+1; i < indirect_blks; i++)
825 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 797 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
826 798
827 ext4_free_blocks(handle, inode, new_blocks[i], num, 0); 799 ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
828 800
829 return err; 801 return err;
830} 802}
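The unwind here deliberately avoids journal revokes: all of these blocks were allocated inside the current transaction, so they cannot appear in any earlier checkpoint, and EXT4_FREE_BLOCKS_FORGET is enough for the buffer-backed branch blocks. A sketch of the shape of the cleanup, with toy_free() standing in for ext4_free_blocks():

/* FORGET means "drop the attached buffer, no revoke record" -- safe
 * because these blocks are new in the current transaction. */
enum { TOY_FREE_BLOCKS_FORGET = 1 };

static void toy_free(unsigned long blk, unsigned long count, int flags)
{
	(void)blk; (void)count; (void)flags; /* return blocks to the allocator */
}

/* Model of the failure unwind in ext4_alloc_branch(): entries 1..n got
 * buffer heads wired in, n+1..indirect_blks-1 were allocated but never
 * attached, and new_blocks[indirect_blks] starts the data-block run. */
static void unwind_branch(unsigned long *new_blocks, int n, int indirect_blks,
			  unsigned long num)
{
	int i;

	toy_free(new_blocks[0], 1, 0);
	for (i = 1; i <= n; i++)
		toy_free(new_blocks[i], 1, TOY_FREE_BLOCKS_FORGET);
	for (i = n + 1; i < indirect_blks; i++)
		toy_free(new_blocks[i], 1, 0);
	toy_free(new_blocks[i], num, 0); /* i == indirect_blks here */
}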
@@ -903,12 +875,16 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
903 875
904err_out: 876err_out:
905 for (i = 1; i <= num; i++) { 877 for (i = 1; i <= num; i++) {
906 BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); 878 /*
907 ext4_journal_forget(handle, where[i].bh); 879 * branch[i].bh is newly allocated, so there is no
908 ext4_free_blocks(handle, inode, 880 * need to revoke the block, which is why we don't
909 le32_to_cpu(where[i-1].key), 1, 0); 881 * need to set EXT4_FREE_BLOCKS_METADATA.
882 */
883 ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
884 EXT4_FREE_BLOCKS_FORGET);
910 } 885 }
911 ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0); 886 ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
887 blks, 0);
912 888
913 return err; 889 return err;
914} 890}
@@ -1021,10 +997,12 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
1021 if (!err) 997 if (!err)
1022 err = ext4_splice_branch(handle, inode, iblock, 998 err = ext4_splice_branch(handle, inode, iblock,
1023 partial, indirect_blks, count); 999 partial, indirect_blks, count);
1024 else 1000 if (err)
1025 goto cleanup; 1001 goto cleanup;
1026 1002
1027 set_buffer_new(bh_result); 1003 set_buffer_new(bh_result);
1004
1005 ext4_update_inode_fsync_trans(handle, inode, 1);
1028got_it: 1006got_it:
1029 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 1007 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
1030 if (count > blocks_to_boundary) 1008 if (count > blocks_to_boundary)
@@ -1043,92 +1021,121 @@ out:
1043 return err; 1021 return err;
1044} 1022}
1045 1023
1046qsize_t ext4_get_reserved_space(struct inode *inode)
1047{
1048 unsigned long long total;
1049
1050 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1051 total = EXT4_I(inode)->i_reserved_data_blocks +
1052 EXT4_I(inode)->i_reserved_meta_blocks;
1053 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1054
1055 return total;
1056}
1024#ifdef CONFIG_QUOTA
1025qsize_t *ext4_get_reserved_space(struct inode *inode)
1026{
1027 return &EXT4_I(inode)->i_reserved_quota;
1028}
1029#endif
1030
1057/* 1031/*
1058 * Calculate the number of metadata blocks need to reserve 1032 * Calculate the number of metadata blocks need to reserve
1059 * to allocate @blocks for non extent file based file 1033 * to allocate a new block at @lblocks for non extent file based file
1060 */ 1034 */
1061static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) 1035static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1036 sector_t lblock)
1062{ 1037{
1063 int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); 1038 struct ext4_inode_info *ei = EXT4_I(inode);
1064 int ind_blks, dind_blks, tind_blks; 1039 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
1065 1040 int blk_bits;
1066 /* number of new indirect blocks needed */
1067 ind_blks = (blocks + icap - 1) / icap;
1068 1041
1069 dind_blks = (ind_blks + icap - 1) / icap; 1042 if (lblock < EXT4_NDIR_BLOCKS)
1043 return 0;
1070 1044
1071 tind_blks = 1; 1045 lblock -= EXT4_NDIR_BLOCKS;
1072 1046
1073 return ind_blks + dind_blks + tind_blks; 1047 if (ei->i_da_metadata_calc_len &&
1048 (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
1049 ei->i_da_metadata_calc_len++;
1050 return 0;
1051 }
1052 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
1053 ei->i_da_metadata_calc_len = 1;
1054 blk_bits = order_base_2(lblock);
1055 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
1074} 1056}
1075 1057
1076/* 1058/*
1077 * Calculate the number of metadata blocks need to reserve 1059 * Calculate the number of metadata blocks need to reserve
1078 * to allocate given number of blocks 1060 * to allocate a block located at @lblock
1079 */ 1061 */
1080static int ext4_calc_metadata_amount(struct inode *inode, int blocks) 1062static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
1081{ 1063{
1082 if (!blocks)
1083 return 0;
1084
1085 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 1064 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
1086 return ext4_ext_calc_metadata_amount(inode, blocks); 1065 return ext4_ext_calc_metadata_amount(inode, lblock);
1087 1066
1088 return ext4_indirect_calc_metadata_amount(inode, blocks); 1067 return ext4_indirect_calc_metadata_amount(inode, lblock);
1089} 1068}
1090 1069
1091static void ext4_da_update_reserve_space(struct inode *inode, int used)
1070/*
1071 * Called with i_data_sem down, which is important since we can call
1072 * ext4_discard_preallocations() from here.
1073 */
1074void ext4_da_update_reserve_space(struct inode *inode,
1075 int used, int quota_claim)
1092{ 1076{
1093 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1077 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1094 int total, mdb, mdb_free; 1078 struct ext4_inode_info *ei = EXT4_I(inode);
1095 1079 int mdb_free = 0, allocated_meta_blocks = 0;
1096 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1080
1097 /* recalculate the number of metablocks still need to be reserved */ 1081 spin_lock(&ei->i_block_reservation_lock);
1098 total = EXT4_I(inode)->i_reserved_data_blocks - used; 1082 trace_ext4_da_update_reserve_space(inode, used);
1099 mdb = ext4_calc_metadata_amount(inode, total); 1083 if (unlikely(used > ei->i_reserved_data_blocks)) {
1100 1084 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
1101 /* figure out how many metablocks to release */ 1085 "with only %d reserved data blocks\n",
1102 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1086 __func__, inode->i_ino, used,
1103 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; 1087 ei->i_reserved_data_blocks);
1104 1088 WARN_ON(1);
1105 if (mdb_free) { 1089 used = ei->i_reserved_data_blocks;
1106 /* Account for allocated meta_blocks */ 1090 }
1107 mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; 1091
1108 1092 /* Update per-inode reservations */
1109 /* update fs dirty blocks counter */ 1093 ei->i_reserved_data_blocks -= used;
1094 used += ei->i_allocated_meta_blocks;
1095 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
1096 allocated_meta_blocks = ei->i_allocated_meta_blocks;
1097 ei->i_allocated_meta_blocks = 0;
1098 percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
1099
1100 if (ei->i_reserved_data_blocks == 0) {
1101 /*
1102 * We can release all of the reserved metadata blocks
1103 * only when we have written all of the delayed
1104 * allocation blocks.
1105 */
1106 mdb_free = ei->i_reserved_meta_blocks;
1107 ei->i_reserved_meta_blocks = 0;
1108 ei->i_da_metadata_calc_len = 0;
1110 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); 1109 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
1111 EXT4_I(inode)->i_allocated_meta_blocks = 0;
1112 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1113 } 1110 }
1114
1115 /* update per-inode reservations */
1116 BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
1117 EXT4_I(inode)->i_reserved_data_blocks -= used;
1118 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1111 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1119 1112
1120 /* 1113 /* Update quota subsystem */
1121 * free those over-booking quota for metadata blocks 1114 if (quota_claim) {
1122 */ 1115 dquot_claim_block(inode, used);
1123 if (mdb_free) 1116 if (mdb_free)
1124 vfs_dq_release_reservation_block(inode, mdb_free); 1117 dquot_release_reservation_block(inode, mdb_free);
1118 } else {
1119 /*
1120 * We did fallocate with an offset that is already delayed
1121 * allocated. So on delayed allocated writeback we should
1122 * not update the quota for allocated blocks. But then
1123 * converting an fallocate region to initialized region would
1124 * have caused a metadata allocation. So claim quota for
1125 * that
1126 */
1127 if (allocated_meta_blocks)
1128 dquot_claim_block(inode, allocated_meta_blocks);
1129 dquot_release_reservation_block(inode, mdb_free + used);
1130 }
1125 1131
1126 /* 1132 /*
1127 * If we have done all the pending block allocations and if 1133 * If we have done all the pending block allocations and if
1128 * there aren't any writers on the inode, we can discard the 1134 * there aren't any writers on the inode, we can discard the
1129 * inode's preallocations. 1135 * inode's preallocations.
1130 */ 1136 */
1131 if (!total && (atomic_read(&inode->i_writecount) == 0)) 1137 if ((ei->i_reserved_data_blocks == 0) &&
1138 (atomic_read(&inode->i_writecount) == 0))
1132 ext4_discard_preallocations(inode); 1139 ext4_discard_preallocations(inode);
1133} 1140}
1134 1141
@@ -1136,7 +1143,7 @@ static int check_block_validity(struct inode *inode, const char *msg,
1136 sector_t logical, sector_t phys, int len) 1143 sector_t logical, sector_t phys, int len)
1137{ 1144{
1138 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1145 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
1139 ext4_error(inode->i_sb, msg, 1146 __ext4_error(inode->i_sb, msg,
1140 "inode #%lu logical block %llu mapped to %llu " 1147 "inode #%lu logical block %llu mapped to %llu "
1141 "(size %d)", inode->i_ino, 1148 "(size %d)", inode->i_ino,
1142 (unsigned long long) logical, 1149 (unsigned long long) logical,
@@ -1318,20 +1325,22 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1318 * i_data's format changing. Force the migrate 1325 * i_data's format changing. Force the migrate
1319 * to fail by clearing migrate flags 1326 * to fail by clearing migrate flags
1320 */ 1327 */
1321 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; 1328 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
1322 } 1329 }
1323 }
1324 1330
1331 /*
1332 * Update reserved blocks/metadata blocks after successful
1333 * block allocation which had been deferred till now. We don't
1334 * support fallocate for non extent files. So we can update
1335 * reserve space here.
1336 */
1337 if ((retval > 0) &&
1338 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
1339 ext4_da_update_reserve_space(inode, retval, 1);
1340 }
1325 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1341 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1326 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1342 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1327 1343
1328 /*
1329 * Update reserved blocks/metadata blocks after successful
1330 * block allocation which had been deferred till now.
1331 */
1332 if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
1333 ext4_da_update_reserve_space(inode, retval);
1334
1335 up_write((&EXT4_I(inode)->i_data_sem)); 1344 up_write((&EXT4_I(inode)->i_data_sem));
1336 if (retval > 0 && buffer_mapped(bh)) { 1345 if (retval > 0 && buffer_mapped(bh)) {
1337 int ret = check_block_validity(inode, "file system " 1346 int ret = check_block_validity(inode, "file system "
@@ -1534,6 +1543,18 @@ static int do_journal_get_write_access(handle_t *handle,
1534 return ext4_journal_get_write_access(handle, bh); 1543 return ext4_journal_get_write_access(handle, bh);
1535} 1544}
1536 1545
1546/*
1547 * Truncate blocks that were not used by write. We have to truncate the
1548 * pagecache as well so that corresponding buffers get properly unmapped.
1549 */
1550static void ext4_truncate_failed_write(struct inode *inode)
1551{
1552 truncate_inode_pages(inode->i_mapping, inode->i_size);
1553 ext4_truncate(inode);
1554}
1555
1556static int ext4_get_block_write(struct inode *inode, sector_t iblock,
1557 struct buffer_head *bh_result, int create);
1537static int ext4_write_begin(struct file *file, struct address_space *mapping, 1558static int ext4_write_begin(struct file *file, struct address_space *mapping,
1538 loff_t pos, unsigned len, unsigned flags, 1559 loff_t pos, unsigned len, unsigned flags,
1539 struct page **pagep, void **fsdata) 1560 struct page **pagep, void **fsdata)
@@ -1575,8 +1596,12 @@ retry:
1575 } 1596 }
1576 *pagep = page; 1597 *pagep = page;
1577 1598
1578 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1599 if (ext4_should_dioread_nolock(inode))
1579 ext4_get_block); 1600 ret = block_write_begin(file, mapping, pos, len, flags, pagep,
1601 fsdata, ext4_get_block_write);
1602 else
1603 ret = block_write_begin(file, mapping, pos, len, flags, pagep,
1604 fsdata, ext4_get_block);
1580 1605
1581 if (!ret && ext4_should_journal_data(inode)) { 1606 if (!ret && ext4_should_journal_data(inode)) {
1582 ret = walk_page_buffers(handle, page_buffers(page), 1607 ret = walk_page_buffers(handle, page_buffers(page),
@@ -1599,7 +1624,7 @@ retry:
1599 1624
1600 ext4_journal_stop(handle); 1625 ext4_journal_stop(handle);
1601 if (pos + len > inode->i_size) { 1626 if (pos + len > inode->i_size) {
1602 ext4_truncate(inode); 1627 ext4_truncate_failed_write(inode);
1603 /* 1628 /*
1604 * If truncate failed early the inode might 1629 * If truncate failed early the inode might
1605 * still be on the orphan list; we need to 1630 * still be on the orphan list; we need to
@@ -1709,7 +1734,7 @@ static int ext4_ordered_write_end(struct file *file,
1709 ret = ret2; 1734 ret = ret2;
1710 1735
1711 if (pos + len > inode->i_size) { 1736 if (pos + len > inode->i_size) {
1712 ext4_truncate(inode); 1737 ext4_truncate_failed_write(inode);
1713 /* 1738 /*
1714 * If truncate failed early the inode might still be 1739 * If truncate failed early the inode might still be
1715 * on the orphan list; we need to make sure the inode 1740 * on the orphan list; we need to make sure the inode
@@ -1751,7 +1776,7 @@ static int ext4_writeback_write_end(struct file *file,
1751 ret = ret2; 1776 ret = ret2;
1752 1777
1753 if (pos + len > inode->i_size) { 1778 if (pos + len > inode->i_size) {
1754 ext4_truncate(inode); 1779 ext4_truncate_failed_write(inode);
1755 /* 1780 /*
1756 * If truncate failed early the inode might still be 1781 * If truncate failed early the inode might still be
1757 * on the orphan list; we need to make sure the inode 1782 * on the orphan list; we need to make sure the inode
@@ -1793,7 +1818,7 @@ static int ext4_journalled_write_end(struct file *file,
1793 new_i_size = pos + copied; 1818 new_i_size = pos + copied;
1794 if (new_i_size > inode->i_size) 1819 if (new_i_size > inode->i_size)
1795 i_size_write(inode, pos+copied); 1820 i_size_write(inode, pos+copied);
1796 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 1821 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1797 if (new_i_size > EXT4_I(inode)->i_disksize) { 1822 if (new_i_size > EXT4_I(inode)->i_disksize) {
1798 ext4_update_i_disksize(inode, new_i_size); 1823 ext4_update_i_disksize(inode, new_i_size);
1799 ret2 = ext4_mark_inode_dirty(handle, inode); 1824 ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -1814,7 +1839,7 @@ static int ext4_journalled_write_end(struct file *file,
1814 if (!ret) 1839 if (!ret)
1815 ret = ret2; 1840 ret = ret2;
1816 if (pos + len > inode->i_size) { 1841 if (pos + len > inode->i_size) {
1817 ext4_truncate(inode); 1842 ext4_truncate_failed_write(inode);
1818 /* 1843 /*
1819 * If truncate failed early the inode might still be 1844 * If truncate failed early the inode might still be
1820 * on the orphan list; we need to make sure the inode 1845 * on the orphan list; we need to make sure the inode
@@ -1827,11 +1852,16 @@ static int ext4_journalled_write_end(struct file *file,
1827 return ret ? ret : copied; 1852 return ret ? ret : copied;
1828} 1853}
1829 1854
1830static int ext4_da_reserve_space(struct inode *inode, int nrblocks) 1855/*
1856 * Reserve a single block located at lblock
1857 */
1858static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1831{ 1859{
1832 int retries = 0; 1860 int retries = 0;
1833 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1861 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1834 unsigned long md_needed, mdblocks, total = 0; 1862 struct ext4_inode_info *ei = EXT4_I(inode);
1863 unsigned long md_needed, md_reserved;
1864 int ret;
1835 1865
1836 /* 1866 /*
1837 * recalculate the amount of metadata blocks to reserve 1867 * recalculate the amount of metadata blocks to reserve
@@ -1839,86 +1869,80 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1839 * worse case is one extent per block 1869 * worse case is one extent per block
1840 */ 1870 */
1841repeat: 1871repeat:
1842 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1872 spin_lock(&ei->i_block_reservation_lock);
1843 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; 1873 md_reserved = ei->i_reserved_meta_blocks;
1844 mdblocks = ext4_calc_metadata_amount(inode, total); 1874 md_needed = ext4_calc_metadata_amount(inode, lblock);
1845 BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); 1875 trace_ext4_da_reserve_space(inode, md_needed);
1846 1876 spin_unlock(&ei->i_block_reservation_lock);
1847 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
1848 total = md_needed + nrblocks;
1849 1877
1850 /* 1878 /*
1851 * Make quota reservation here to prevent quota overflow 1879 * Make quota reservation here to prevent quota overflow
1852 * later. Real quota accounting is done at pages writeout 1880 * later. Real quota accounting is done at pages writeout
1853 * time. 1881 * time.
1854 */ 1882 */
1855 if (vfs_dq_reserve_block(inode, total)) { 1883 ret = dquot_reserve_block(inode, md_needed + 1);
1856 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1884 if (ret)
1857 return -EDQUOT; 1885 return ret;
1858 }
1859 1886
1860 if (ext4_claim_free_blocks(sbi, total)) { 1887 if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
1861 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1888 dquot_release_reservation_block(inode, md_needed + 1);
1862 vfs_dq_release_reservation_block(inode, total);
1863 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1889 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1864 yield(); 1890 yield();
1865 goto repeat; 1891 goto repeat;
1866 } 1892 }
1867 return -ENOSPC; 1893 return -ENOSPC;
1868 } 1894 }
1869 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1895 spin_lock(&ei->i_block_reservation_lock);
1870 EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; 1896 ei->i_reserved_data_blocks++;
1897 ei->i_reserved_meta_blocks += md_needed;
1898 spin_unlock(&ei->i_block_reservation_lock);
1871 1899
1872 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1873 return 0; /* success */ 1900 return 0; /* success */
1874} 1901}
1875 1902
1876static void ext4_da_release_space(struct inode *inode, int to_free) 1903static void ext4_da_release_space(struct inode *inode, int to_free)
1877{ 1904{
1878 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1905 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1879 int total, mdb, mdb_free, release; 1906 struct ext4_inode_info *ei = EXT4_I(inode);
1880 1907
1881 if (!to_free) 1908 if (!to_free)
1882 return; /* Nothing to release, exit */ 1909 return; /* Nothing to release, exit */
1883 1910
1884 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1911 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1885 1912
1886 if (!EXT4_I(inode)->i_reserved_data_blocks) { 1913 if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1887 /* 1914 /*
1888 * if there is no reserved blocks, but we try to free some 1915 * if there aren't enough reserved blocks, then the
1889 * then the counter is messed up somewhere. 1916 * counter is messed up somewhere. Since this
1890 * but since this function is called from invalidate 1917 * function is called from invalidate page, it's
1891 * page, it's harmless to return without any action 1918 * harmless to return without any action.
1892 */ 1919 */
1893 printk(KERN_INFO "ext4 delalloc try to release %d reserved " 1920 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
1894 "blocks for inode %lu, but there is no reserved " 1921 "ino %lu, to_free %d with only %d reserved "
1895 "data blocks\n", to_free, inode->i_ino); 1922 "data blocks\n", inode->i_ino, to_free,
1896 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1923 ei->i_reserved_data_blocks);
1897 return; 1924 WARN_ON(1);
1925 to_free = ei->i_reserved_data_blocks;
1898 } 1926 }
1927 ei->i_reserved_data_blocks -= to_free;
1899 1928
1900 /* recalculate the number of metablocks still need to be reserved */ 1929 if (ei->i_reserved_data_blocks == 0) {
1901 total = EXT4_I(inode)->i_reserved_data_blocks - to_free; 1930 /*
1902 mdb = ext4_calc_metadata_amount(inode, total); 1931 * We can release all of the reserved metadata blocks
1903 1932 * only when we have written all of the delayed
1904 /* figure out how many metablocks to release */ 1933 * allocation blocks.
1905 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1934 */
1906 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; 1935 to_free += ei->i_reserved_meta_blocks;
1907 1936 ei->i_reserved_meta_blocks = 0;
1908 release = to_free + mdb_free; 1937 ei->i_da_metadata_calc_len = 0;
1909 1938 }
1910 /* update fs dirty blocks counter for truncate case */
1911 percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
1912 1939
1913 /* update per-inode reservations */ 1940 /* update fs dirty blocks counter */
1914 BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); 1941 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
1915 EXT4_I(inode)->i_reserved_data_blocks -= to_free;
1916 1942
1917 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1918 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1919 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1943 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1920 1944
1921 vfs_dq_release_reservation_block(inode, release); 1945 dquot_release_reservation_block(inode, to_free);
1922} 1946}
1923 1947
1924static void ext4_da_page_release_reservation(struct page *page, 1948static void ext4_da_page_release_reservation(struct page *page,
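Reservation and release are now symmetric around a single invariant: metadata reservations are pooled per inode and only returned once the last reserved data block has been written or discarded, since until then the worst case still applies. A toy model of the counter discipline (quota and the percpu dirty-block counter are reduced to comments):

struct toy_resv { int data, meta, calc_len; };

/* Model of ext4_da_reserve_space(): each call reserves one data block
 * plus whatever new metadata the estimate returns. */
static int toy_reserve(struct toy_resv *r, int md_needed)
{
	/* kernel order: dquot_reserve_block(md_needed + 1), then the
	 * free-block claim, and only then the counters below */
	r->data += 1;
	r->meta += md_needed;
	return 0;
}

/* Model of ext4_da_release_space(), called from invalidatepage. */
static void toy_release(struct toy_resv *r, int to_free)
{
	if (to_free > r->data)
		to_free = r->data; /* counters skewed: clamp, as the WARN_ON path does */
	r->data -= to_free;

	if (r->data == 0) {
		/* last delalloc data block gone: the whole metadata pool
		 * can finally be returned as well */
		to_free += r->meta;
		r->meta = 0;
		r->calc_len = 0;
	}
	/* 'to_free' is what the kernel subtracts from the dirty-block
	 * counter and releases from quota */
}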
@@ -2095,6 +2119,8 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2095 } else if (buffer_mapped(bh)) 2119 } else if (buffer_mapped(bh))
2096 BUG_ON(bh->b_blocknr != pblock); 2120 BUG_ON(bh->b_blocknr != pblock);
2097 2121
2122 if (buffer_uninit(exbh))
2123 set_buffer_uninit(bh);
2098 cur_logical++; 2124 cur_logical++;
2099 pblock++; 2125 pblock++;
2100 } while ((bh = bh->b_this_page) != head); 2126 } while ((bh = bh->b_this_page) != head);
@@ -2137,17 +2163,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2137 break; 2163 break;
2138 for (i = 0; i < nr_pages; i++) { 2164 for (i = 0; i < nr_pages; i++) {
2139 struct page *page = pvec.pages[i]; 2165 struct page *page = pvec.pages[i];
2140 index = page->index; 2166 if (page->index > end)
2141 if (index > end)
2142 break; 2167 break;
2143 index++;
2144
2145 BUG_ON(!PageLocked(page)); 2168 BUG_ON(!PageLocked(page));
2146 BUG_ON(PageWriteback(page)); 2169 BUG_ON(PageWriteback(page));
2147 block_invalidatepage(page, 0); 2170 block_invalidatepage(page, 0);
2148 ClearPageUptodate(page); 2171 ClearPageUptodate(page);
2149 unlock_page(page); 2172 unlock_page(page);
2150 } 2173 }
2174 index = pvec.pages[nr_pages - 1]->index + 1;
2175 pagevec_release(&pvec);
2151 } 2176 }
2152 return; 2177 return;
2153} 2178}
@@ -2223,10 +2248,12 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2223 * variables are updated after the blocks have been allocated. 2248 * variables are updated after the blocks have been allocated.
2224 */ 2249 */
2225 new.b_state = 0; 2250 new.b_state = 0;
2226 get_blocks_flags = (EXT4_GET_BLOCKS_CREATE | 2251 get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2227 EXT4_GET_BLOCKS_DELALLOC_RESERVE); 2252 if (ext4_should_dioread_nolock(mpd->inode))
2253 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2228 if (mpd->b_state & (1 << BH_Delay)) 2254 if (mpd->b_state & (1 << BH_Delay))
2229 get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE; 2255 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2256
2230 blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, 2257 blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
2231 &new, get_blocks_flags); 2258 &new, get_blocks_flags);
2232 if (blks < 0) { 2259 if (blks < 0) {
@@ -2524,7 +2551,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2524 * XXX: __block_prepare_write() unmaps passed block, 2551 * XXX: __block_prepare_write() unmaps passed block,
2525 * is it OK? 2552 * is it OK?
2526 */ 2553 */
2527 ret = ext4_da_reserve_space(inode, 1); 2554 ret = ext4_da_reserve_space(inode, iblock);
2528 if (ret) 2555 if (ret)
2529 /* not enough space to reserve */ 2556 /* not enough space to reserve */
2530 return ret; 2557 return ret;
@@ -2600,7 +2627,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
2600} 2627}
2601 2628
2602static int __ext4_journalled_writepage(struct page *page, 2629static int __ext4_journalled_writepage(struct page *page,
2603 struct writeback_control *wbc,
2604 unsigned int len) 2630 unsigned int len)
2605{ 2631{
2606 struct address_space *mapping = page->mapping; 2632 struct address_space *mapping = page->mapping;
@@ -2635,11 +2661,14 @@ static int __ext4_journalled_writepage(struct page *page,
2635 ret = err; 2661 ret = err;
2636 2662
2637 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); 2663 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
2638 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 2664 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
2639out: 2665out:
2640 return ret; 2666 return ret;
2641} 2667}
2642 2668
2669static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
2670static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2671
2643/* 2672/*
2644 * Note that we don't need to start a transaction unless we're journaling data 2673 * Note that we don't need to start a transaction unless we're journaling data
2645 * because we should have holes filled from ext4_page_mkwrite(). We even don't 2674 * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -2687,7 +2716,7 @@ static int ext4_writepage(struct page *page,
2687 int ret = 0; 2716 int ret = 0;
2688 loff_t size; 2717 loff_t size;
2689 unsigned int len; 2718 unsigned int len;
2690 struct buffer_head *page_bufs; 2719 struct buffer_head *page_bufs = NULL;
2691 struct inode *inode = page->mapping->host; 2720 struct inode *inode = page->mapping->host;
2692 2721
2693 trace_ext4_writepage(inode, page); 2722 trace_ext4_writepage(inode, page);
@@ -2758,12 +2787,16 @@ static int ext4_writepage(struct page *page,
2758 * doesn't seem much point in redirtying the page here. 2787 * doesn't seem much point in redirtying the page here.
2759 */ 2788 */
2760 ClearPageChecked(page); 2789 ClearPageChecked(page);
2761 return __ext4_journalled_writepage(page, wbc, len); 2790 return __ext4_journalled_writepage(page, len);
2762 } 2791 }
2763 2792
2764 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2793 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
2765 ret = nobh_writepage(page, noalloc_get_block_write, wbc); 2794 ret = nobh_writepage(page, noalloc_get_block_write, wbc);
2766 else 2795 else if (page_bufs && buffer_uninit(page_bufs)) {
2796 ext4_set_bh_endio(page_bufs, inode);
2797 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2798 wbc, ext4_end_io_buffer_write);
2799 } else
2767 ret = block_write_full_page(page, noalloc_get_block_write, 2800 ret = block_write_full_page(page, noalloc_get_block_write,
2768 wbc); 2801 wbc);
2769 2802
@@ -2788,7 +2821,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2788 * number of contiguous block. So we will limit 2821 * number of contiguous block. So we will limit
2789 * number of contiguous block to a sane value 2822 * number of contiguous block to a sane value
2790 */ 2823 */
2791 if (!(inode->i_flags & EXT4_EXTENTS_FL) && 2824 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
2792 (max_blocks > EXT4_MAX_TRANS_DATA)) 2825 (max_blocks > EXT4_MAX_TRANS_DATA))
2793 max_blocks = EXT4_MAX_TRANS_DATA; 2826 max_blocks = EXT4_MAX_TRANS_DATA;
2794 2827
@@ -2933,7 +2966,7 @@ retry:
2933 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, 2966 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
2934 &mpd); 2967 &mpd);
2935 /* 2968 /*
2936 * If we have a contigous extent of pages and we 2969 * If we have a contiguous extent of pages and we
2937 * haven't done the I/O yet, map the blocks and submit 2970 * haven't done the I/O yet, map the blocks and submit
2938 * them for I/O. 2971 * them for I/O.
2939 */ 2972 */
@@ -2999,8 +3032,7 @@ retry:
2999out_writepages: 3032out_writepages:
3000 if (!no_nrwrite_index_update) 3033 if (!no_nrwrite_index_update)
3001 wbc->no_nrwrite_index_update = 0; 3034 wbc->no_nrwrite_index_update = 0;
3002 if (wbc->nr_to_write > nr_to_writebump) 3035 wbc->nr_to_write -= nr_to_writebump;
3003 wbc->nr_to_write -= nr_to_writebump;
3004 wbc->range_start = range_start; 3036 wbc->range_start = range_start;
3005 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3037 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
3006 return ret; 3038 return ret;
@@ -3025,11 +3057,18 @@ static int ext4_nonda_switch(struct super_block *sb)
3025 if (2 * free_blocks < 3 * dirty_blocks || 3057 if (2 * free_blocks < 3 * dirty_blocks ||
3026 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { 3058 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
3027 /* 3059 /*
3028 * free block count is less that 150% of dirty blocks 3060 * free block count is less than 150% of dirty blocks
3029 * or free blocks is less that watermark 3061 * or free blocks is less than watermark
3030 */ 3062 */
3031 return 1; 3063 return 1;
3032 } 3064 }
3065 /*
3066 * Even if we don't switch but are nearing capacity,
3067 * start pushing delalloc when 1/2 of free blocks are dirty.
3068 */
3069 if (free_blocks < 2 * dirty_blocks)
3070 writeback_inodes_sb_if_idle(sb);
3071
3033 return 0; 3072 return 0;
3034} 3073}
3035 3074
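Both tests are integer forms of simple ratios: 2*free < 3*dirty is free < 1.5x dirty (fall back to non-delayed allocation), and the new check starts background writeback earlier, once free < 2x dirty. As a standalone predicate (the watermark value is illustrative; the kernel derives its own):

#include <stdbool.h>

#define TOY_WATERMARK 1024 /* illustrative; stands in for EXT4_FREEBLOCKS_WATERMARK */

/* Integer forms of the two thresholds in ext4_nonda_switch():
 * 2*free < 3*dirty <=> free < 1.5 * dirty (fall back to non-delalloc)
 * free < 2*dirty (nudge background writeback early) */
static bool toy_nonda_switch(long long free, long long dirty, bool *push_writeback)
{
	*push_writeback = false;
	if (2 * free < 3 * dirty || free < dirty + TOY_WATERMARK)
		return true;
	if (free < 2 * dirty)
		*push_writeback = true;
	return false;
}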
@@ -3037,7 +3076,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3037 loff_t pos, unsigned len, unsigned flags, 3076 loff_t pos, unsigned len, unsigned flags,
3038 struct page **pagep, void **fsdata) 3077 struct page **pagep, void **fsdata)
3039{ 3078{
3040 int ret, retries = 0; 3079 int ret, retries = 0, quota_retries = 0;
3041 struct page *page; 3080 struct page *page;
3042 pgoff_t index; 3081 pgoff_t index;
3043 unsigned from, to; 3082 unsigned from, to;
@@ -3091,11 +3130,27 @@ retry:
3091 * i_size_read because we hold i_mutex. 3130 * i_size_read because we hold i_mutex.
3092 */ 3131 */
3093 if (pos + len > inode->i_size) 3132 if (pos + len > inode->i_size)
3094 ext4_truncate(inode); 3133 ext4_truncate_failed_write(inode);
3095 } 3134 }
3096 3135
3097 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3136 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3098 goto retry; 3137 goto retry;
3138
3139 if ((ret == -EDQUOT) &&
3140 EXT4_I(inode)->i_reserved_meta_blocks &&
3141 (quota_retries++ < 3)) {
3142 /*
3143 * Since we often over-estimate the number of meta
3144 * data blocks required, we may sometimes get a
3145 * spurious out of quota error even though there would
3146 * be enough space once we write the data blocks and
3147 * find out how many meta data blocks were _really_
3148 * required. So try forcing the inode write to see if
3149 * that helps.
3150 */
3151 write_inode_now(inode, (quota_retries == 3));
3152 goto retry;
3153 }
3099out: 3154out:
3100 return ret; 3155 return ret;
3101} 3156}
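The retry exists because delalloc over-reserves metadata: an EDQUOT at write-begin may only reflect the estimate, and flushing the inode converts reservations into the real, smaller usage. The retry is bounded at three attempts, with a synchronous inode write on the last. The control flow, with hypothetical stand-ins for the real work:

#include <errno.h>
#include <stdbool.h>

/* Hypothetical stubs, here only to give the retry loop a shape that
 * compiles; they do not correspond to kernel functions. */
static int toy_try_write_begin(void) { return -EDQUOT; }
static bool toy_have_reserved_meta(void) { return true; }
static void toy_write_inode_now(bool sync) { (void)sync; }

/* Model of the bounded EDQUOT retry in ext4_da_write_begin(): up to
 * three retries, the last one after a synchronous inode write. */
static int write_begin_with_quota_retry(void)
{
	int quota_retries = 0;
	int ret;

	for (;;) {
		ret = toy_try_write_begin();
		if (ret != -EDQUOT || !toy_have_reserved_meta() ||
		    quota_retries >= 3)
			return ret;
		/* flushing the inode converts over-reserved metadata into
		 * the real, smaller usage, which may free up quota */
		toy_write_inode_now(++quota_retries == 3);
	}
}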
@@ -3284,7 +3339,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3284 filemap_write_and_wait(mapping); 3339 filemap_write_and_wait(mapping);
3285 } 3340 }
3286 3341
3287 if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 3342 if (EXT4_JOURNAL(inode) &&
3343 ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
3288 /* 3344 /*
3289 * This is a REALLY heavyweight approach, but the use of 3345 * This is a REALLY heavyweight approach, but the use of
3290 * bmap on dirty files is expected to be extremely rare: 3346 * bmap on dirty files is expected to be extremely rare:
@@ -3303,7 +3359,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3303 * everything they get. 3359 * everything they get.
3304 */ 3360 */
3305 3361
3306 EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; 3362 ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
3307 journal = EXT4_JOURNAL(inode); 3363 journal = EXT4_JOURNAL(inode);
3308 jbd2_journal_lock_updates(journal); 3364 jbd2_journal_lock_updates(journal);
3309 err = jbd2_journal_flush(journal); 3365 err = jbd2_journal_flush(journal);
@@ -3328,11 +3384,45 @@ ext4_readpages(struct file *file, struct address_space *mapping,
3328 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3384 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3329} 3385}
3330 3386
3387static void ext4_free_io_end(ext4_io_end_t *io)
3388{
3389 BUG_ON(!io);
3390 if (io->page)
3391 put_page(io->page);
3392 iput(io->inode);
3393 kfree(io);
3394}
3395
3396static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
3397{
3398 struct buffer_head *head, *bh;
3399 unsigned int curr_off = 0;
3400
3401 if (!page_has_buffers(page))
3402 return;
3403 head = bh = page_buffers(page);
3404 do {
3405 if (offset <= curr_off && test_clear_buffer_uninit(bh)
3406 && bh->b_private) {
3407 ext4_free_io_end(bh->b_private);
3408 bh->b_private = NULL;
3409 bh->b_end_io = NULL;
3410 }
3411 curr_off = curr_off + bh->b_size;
3412 bh = bh->b_this_page;
3413 } while (bh != head);
3414}
3415
3331static void ext4_invalidatepage(struct page *page, unsigned long offset) 3416static void ext4_invalidatepage(struct page *page, unsigned long offset)
3332{ 3417{
3333 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3418 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3334 3419
3335 /* 3420 /*
3421 * free any io_end structure allocated for buffers to be discarded
3422 */
3423 if (ext4_should_dioread_nolock(page->mapping->host))
3424 ext4_invalidatepage_free_endio(page, offset);
3425 /*
3336 * If it's a full truncate we just forget about the pending dirtying 3426 * If it's a full truncate we just forget about the pending dirtying
3337 */ 3427 */
3338 if (offset == 0) 3428 if (offset == 0)
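The new ext4_invalidatepage_free_endio() above walks the page's circular list of buffer heads and releases any pending io_end attached to buffers at or past the invalidation offset. A simplified standalone model of that ring walk (plain C; the buffer-uninit state test is dropped and the structures are toy versions):

#include <stdio.h>
#include <stdlib.h>

/* Toy model of a page's circular list of buffer heads. */
struct buf {
    size_t size;
    void *private_state;   /* the io_end in the real code */
    struct buf *next;      /* b_this_page: circular */
};

/* Free per-buffer state for every buffer that starts at or after
 * 'offset' within the page, mirroring the do/while ring walk. */
static void drop_state_from(struct buf *head, size_t offset)
{
    struct buf *b = head;
    size_t curr_off = 0;

    do {
        if (offset <= curr_off && b->private_state) {
            free(b->private_state);
            b->private_state = NULL;
        }
        curr_off += b->size;
        b = b->next;
    } while (b != head);
}

int main(void)
{
    struct buf b2 = { 1024, malloc(8), NULL };
    struct buf b1 = { 1024, malloc(8), &b2 };
    b2.next = &b1;                    /* close the ring */
    drop_state_from(&b1, 1024);       /* frees only b2's state */
    printf("b1=%p b2=%p\n", b1.private_state, b2.private_state);
    free(b1.private_state);
    return 0;
}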
@@ -3403,7 +3493,14 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3403 } 3493 }
3404 3494
3405retry: 3495retry:
3406 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3496 if (rw == READ && ext4_should_dioread_nolock(inode))
3497 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
3498 inode->i_sb->s_bdev, iov,
3499 offset, nr_segs,
3500 ext4_get_block, NULL);
3501 else
3502 ret = blockdev_direct_IO(rw, iocb, inode,
3503 inode->i_sb->s_bdev, iov,
3407 offset, nr_segs, 3504 offset, nr_segs,
3408 ext4_get_block, NULL); 3505 ext4_get_block, NULL);
3409 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3506 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3419,6 +3516,9 @@ retry:
3419 * but cannot extend i_size. Bail out and pretend 3516 * but cannot extend i_size. Bail out and pretend
3420 * the write failed... */ 3517 * the write failed... */
3421 ret = PTR_ERR(handle); 3518 ret = PTR_ERR(handle);
3519 if (inode->i_nlink)
3520 ext4_orphan_del(NULL, inode);
3521
3422 goto out; 3522 goto out;
3423 } 3523 }
3424 if (inode->i_nlink) 3524 if (inode->i_nlink)
@@ -3446,75 +3546,63 @@ out:
3446 return ret; 3546 return ret;
3447} 3547}
3448 3548
3449static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, 3549static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3450 struct buffer_head *bh_result, int create) 3550 struct buffer_head *bh_result, int create)
3451{ 3551{
3452 handle_t *handle = NULL; 3552 handle_t *handle = ext4_journal_current_handle();
3453 int ret = 0; 3553 int ret = 0;
3454 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 3554 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3455 int dio_credits; 3555 int dio_credits;
3556 int started = 0;
3456 3557
3457 ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", 3558 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3458 inode->i_ino, create); 3559 inode->i_ino, create);
3459 /* 3560 /*
3460 * DIO VFS code passes create = 0 flag for write to 3561 * ext4_get_block in prepare for a DIO write or buffer write.
3461 * the middle of file. It does this to avoid block 3562 * We allocate an uninitialized extent if blocks haven't been allocated.
3462 * allocation for holes, to prevent expose stale data 3563 * The extent will be converted to initialized after IO completes.
3463 * out when there is parallel buffered read (which does
3464 * not hold the i_mutex lock) while direct IO write has
3465 * not completed. DIO request on holes finally falls back
3466 * to buffered IO for this reason.
3467 *
3468 * For ext4 extent based file, since we support fallocate,
3469 * new allocated extent as uninitialized, for holes, we
3470 * could fallocate blocks for holes, thus parallel
3471 * buffered IO read will zero out the page when read on
3472 * a hole while parallel DIO write to the hole has not completed.
3473 *
3474 * when we come here, we know it's a direct IO write to
3475 * to the middle of file (<i_size)
3476 * so it's safe to override the create flag from VFS.
3477 */ 3564 */
3478 create = EXT4_GET_BLOCKS_DIO_CREATE_EXT; 3565 create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
3479 3566
3480 if (max_blocks > DIO_MAX_BLOCKS) 3567 if (!handle) {
3481 max_blocks = DIO_MAX_BLOCKS; 3568 if (max_blocks > DIO_MAX_BLOCKS)
3482 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); 3569 max_blocks = DIO_MAX_BLOCKS;
3483 handle = ext4_journal_start(inode, dio_credits); 3570 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3484 if (IS_ERR(handle)) { 3571 handle = ext4_journal_start(inode, dio_credits);
3485 ret = PTR_ERR(handle); 3572 if (IS_ERR(handle)) {
3486 goto out; 3573 ret = PTR_ERR(handle);
3574 goto out;
3575 }
3576 started = 1;
3487 } 3577 }
3578
3488 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, 3579 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3489 create); 3580 create);
3490 if (ret > 0) { 3581 if (ret > 0) {
3491 bh_result->b_size = (ret << inode->i_blkbits); 3582 bh_result->b_size = (ret << inode->i_blkbits);
3492 ret = 0; 3583 ret = 0;
3493 } 3584 }
3494 ext4_journal_stop(handle); 3585 if (started)
3586 ext4_journal_stop(handle);
3495out: 3587out:
3496 return ret; 3588 return ret;
3497} 3589}
3498 3590
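The reworked ext4_get_block_write() above now serves both DIO and buffered writers: if the caller already runs inside a transaction it joins it, otherwise it opens and later closes its own. A sketch of that join-or-start pattern (standalone C with a fake handle API, for illustration only):

#include <stdio.h>

/* Hypothetical journal handle API, for illustration only. */
struct handle { int open; };
static struct handle *current_handle;          /* per-task handle */

static struct handle *journal_start(int credits)
{
    static struct handle h;
    (void)credits;
    h.open = 1;
    return &h;
}
static void journal_stop(struct handle *h) { h->open = 0; }

/* Join a transaction the caller already opened, or open (and later
 * close) our own -- the 'started' flag remembers which case we hit. */
static int map_blocks(int credits)
{
    struct handle *h = current_handle;
    int started = 0;

    if (!h) {
        h = journal_start(credits);
        started = 1;
    }

    /* ... do the block mapping under 'h' ... */

    if (started)
        journal_stop(h);
    return 0;
}

int main(void)
{
    map_blocks(8);                 /* no current handle: start/stop our own */
    struct handle outer = { 1 };
    current_handle = &outer;
    map_blocks(8);                 /* joins the caller's handle */
    printf("outer still open: %d\n", outer.open);
    return 0;
}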
3499static void ext4_free_io_end(ext4_io_end_t *io) 3591static void dump_completed_IO(struct inode * inode)
3500{
3501 BUG_ON(!io);
3502 iput(io->inode);
3503 kfree(io);
3504}
3505static void dump_aio_dio_list(struct inode * inode)
3506{ 3592{
3507#ifdef EXT4_DEBUG 3593#ifdef EXT4_DEBUG
3508 struct list_head *cur, *before, *after; 3594 struct list_head *cur, *before, *after;
3509 ext4_io_end_t *io, *io0, *io1; 3595 ext4_io_end_t *io, *io0, *io1;
3596 unsigned long flags;
3510 3597
3511 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ 3598 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
3512 ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); 3599 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
3513 return; 3600 return;
3514 } 3601 }
3515 3602
3516 ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); 3603 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
3517 list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ 3604 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3605 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
3518 cur = &io->list; 3606 cur = &io->list;
3519 before = cur->prev; 3607 before = cur->prev;
3520 io0 = container_of(before, ext4_io_end_t, list); 3608 io0 = container_of(before, ext4_io_end_t, list);
@@ -3524,32 +3612,31 @@ static void dump_aio_dio_list(struct inode * inode)
3524 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", 3612 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3525 io, inode->i_ino, io0, io1); 3613 io, inode->i_ino, io0, io1);
3526 } 3614 }
3615 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3527#endif 3616#endif
3528} 3617}
3529 3618
3530/* 3619/*
3531 * check a range of space and convert unwritten extents to written. 3620 * check a range of space and convert unwritten extents to written.
3532 */ 3621 */
3533static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) 3622static int ext4_end_io_nolock(ext4_io_end_t *io)
3534{ 3623{
3535 struct inode *inode = io->inode; 3624 struct inode *inode = io->inode;
3536 loff_t offset = io->offset; 3625 loff_t offset = io->offset;
3537 size_t size = io->size; 3626 ssize_t size = io->size;
3538 int ret = 0; 3627 int ret = 0;
3539 3628
3540 ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," 3629 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
3541 "list->prev 0x%p\n", 3630 "list->prev 0x%p\n",
3542 io, inode->i_ino, io->list.next, io->list.prev); 3631 io, inode->i_ino, io->list.next, io->list.prev);
3543 3632
3544 if (list_empty(&io->list)) 3633 if (list_empty(&io->list))
3545 return ret; 3634 return ret;
3546 3635
3547 if (io->flag != DIO_AIO_UNWRITTEN) 3636 if (io->flag != EXT4_IO_UNWRITTEN)
3548 return ret; 3637 return ret;
3549 3638
3550 if (offset + size <= i_size_read(inode)) 3639 ret = ext4_convert_unwritten_extents(inode, offset, size);
3551 ret = ext4_convert_unwritten_extents(inode, offset, size);
3552
3553 if (ret < 0) { 3640 if (ret < 0) {
3554 printk(KERN_EMERG "%s: failed to convert unwritten" 3641 printk(KERN_EMERG "%s: failed to convert unwritten"
3555 "extents to written extents, error is %d" 3642 "extents to written extents, error is %d"
@@ -3562,50 +3649,64 @@ static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
3562 io->flag = 0; 3649 io->flag = 0;
3563 return ret; 3650 return ret;
3564} 3651}
3652
3565/* 3653/*
3566 * work on completed aio dio IO, to convert unwritten extents to extents 3654 * work on completed aio dio IO, to convert unwritten extents to extents
3567 */ 3655 */
3568static void ext4_end_aio_dio_work(struct work_struct *work) 3656static void ext4_end_io_work(struct work_struct *work)
3569{ 3657{
3570 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); 3658 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3571 struct inode *inode = io->inode; 3659 struct inode *inode = io->inode;
3572 int ret = 0; 3660 struct ext4_inode_info *ei = EXT4_I(inode);
3661 unsigned long flags;
3662 int ret;
3573 3663
3574 mutex_lock(&inode->i_mutex); 3664 mutex_lock(&inode->i_mutex);
3575 ret = ext4_end_aio_dio_nolock(io); 3665 ret = ext4_end_io_nolock(io);
3576 if (ret >= 0) { 3666 if (ret < 0) {
3577 if (!list_empty(&io->list)) 3667 mutex_unlock(&inode->i_mutex);
3578 list_del_init(&io->list); 3668 return;
3579 ext4_free_io_end(io);
3580 } 3669 }
3670
3671 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3672 if (!list_empty(&io->list))
3673 list_del_init(&io->list);
3674 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3581 mutex_unlock(&inode->i_mutex); 3675 mutex_unlock(&inode->i_mutex);
3676 ext4_free_io_end(io);
3582} 3677}
3678
3583/* 3679/*
3584 * This function is called from ext4_sync_file(). 3680 * This function is called from ext4_sync_file().
3585 * 3681 *
3586 * When AIO DIO IO is completed, the work to convert unwritten 3682 * When IO is completed, the work to convert unwritten extents to
3587 * extents to written is queued on workqueue but may not get immediately 3683 * written is queued on workqueue but may not get immediately
3588 * scheduled. When fsync is called, we need to ensure the 3684 * scheduled. When fsync is called, we need to ensure the
3589 * conversion is complete before fsync returns. 3685 * conversion is complete before fsync returns.
3590 * The inode keeps track of a list of completed AIO from DIO path 3686 * The inode keeps track of a list of pending/completed IO that
3591 * that might needs to do the conversion. This function walks through 3687 * might need to do the conversion. This function walks through
3592 * the list and convert the related unwritten extents to written. 3688 * the list and converts the related unwritten extents for completed IO
3689 * to written.
3690 * The function returns 0 on success, or a negative error code on failure.
3593 */ 3691 */
3594int flush_aio_dio_completed_IO(struct inode *inode) 3692int flush_completed_IO(struct inode *inode)
3595{ 3693{
3596 ext4_io_end_t *io; 3694 ext4_io_end_t *io;
3695 struct ext4_inode_info *ei = EXT4_I(inode);
3696 unsigned long flags;
3597 int ret = 0; 3697 int ret = 0;
3598 int ret2 = 0; 3698 int ret2 = 0;
3599 3699
3600 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) 3700 if (list_empty(&ei->i_completed_io_list))
3601 return ret; 3701 return ret;
3602 3702
3603 dump_aio_dio_list(inode); 3703 dump_completed_IO(inode);
3604 while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ 3704 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3605 io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, 3705 while (!list_empty(&ei->i_completed_io_list)){
3706 io = list_entry(ei->i_completed_io_list.next,
3606 ext4_io_end_t, list); 3707 ext4_io_end_t, list);
3607 /* 3708 /*
3608 * Calling ext4_end_aio_dio_nolock() to convert completed 3709 * Calling ext4_end_io_nolock() to convert completed
3609 * IO to written. 3710 * IO to written.
3610 * 3711 *
3611 * When ext4_sync_file() is called, run_queue() may already 3712 * When ext4_sync_file() is called, run_queue() may already
@@ -3618,20 +3719,23 @@ int flush_aio_dio_completed_IO(struct inode *inode)
3618 * avoid double converting from both fsync and background work 3719 * avoid double converting from both fsync and background work
3619 * queue work. 3720 * queue work.
3620 */ 3721 */
3621 ret = ext4_end_aio_dio_nolock(io); 3722 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3723 ret = ext4_end_io_nolock(io);
3724 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3622 if (ret < 0) 3725 if (ret < 0)
3623 ret2 = ret; 3726 ret2 = ret;
3624 else 3727 else
3625 list_del_init(&io->list); 3728 list_del_init(&io->list);
3626 } 3729 }
3730 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3627 return (ret2 < 0) ? ret2 : 0; 3731 return (ret2 < 0) ? ret2 : 0;
3628} 3732}
3629 3733
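flush_completed_IO() above must drop i_completed_io_lock around ext4_end_io_nolock(), since the conversion can sleep, and retake it before unlinking the entry. A standalone model of that unlock-process-relock drain (plain C with a pthread mutex standing in for the spinlock; the i_mutex handling is omitted):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy singly-linked completion list guarded by a mutex. */
struct io_end { int id; struct io_end *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct io_end *completed;

static int convert(struct io_end *io)   /* may sleep: call unlocked */
{
    printf("converting io %d\n", io->id);
    return 0;
}

static void flush_completed(void)
{
    pthread_mutex_lock(&lock);
    while (completed) {
        struct io_end *io = completed;

        /* Drop the lock around the (sleeping) conversion, as the
         * hunk drops i_completed_io_lock around ext4_end_io_nolock(),
         * then retake it before touching the list again. */
        pthread_mutex_unlock(&lock);
        int ret = convert(io);
        pthread_mutex_lock(&lock);

        if (ret < 0)
            break;
        completed = io->next;       /* unlink only on success */
        free(io);
    }
    pthread_mutex_unlock(&lock);
}

int main(void)
{
    for (int i = 0; i < 3; i++) {
        struct io_end *io = malloc(sizeof(*io));
        io->id = i; io->next = completed; completed = io;
    }
    flush_completed();
    return 0;
}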
3630static ext4_io_end_t *ext4_init_io_end (struct inode *inode) 3734static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
3631{ 3735{
3632 ext4_io_end_t *io = NULL; 3736 ext4_io_end_t *io = NULL;
3633 3737
3634 io = kmalloc(sizeof(*io), GFP_NOFS); 3738 io = kmalloc(sizeof(*io), flags);
3635 3739
3636 if (io) { 3740 if (io) {
3637 igrab(inode); 3741 igrab(inode);
@@ -3639,8 +3743,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
3639 io->flag = 0; 3743 io->flag = 0;
3640 io->offset = 0; 3744 io->offset = 0;
3641 io->size = 0; 3745 io->size = 0;
3642 io->error = 0; 3746 io->page = NULL;
3643 INIT_WORK(&io->work, ext4_end_aio_dio_work); 3747 INIT_WORK(&io->work, ext4_end_io_work);
3644 INIT_LIST_HEAD(&io->list); 3748 INIT_LIST_HEAD(&io->list);
3645 } 3749 }
3646 3750
@@ -3652,6 +3756,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3652{ 3756{
3653 ext4_io_end_t *io_end = iocb->private; 3757 ext4_io_end_t *io_end = iocb->private;
3654 struct workqueue_struct *wq; 3758 struct workqueue_struct *wq;
3759 unsigned long flags;
3760 struct ext4_inode_info *ei;
3655 3761
3656 /* if not async direct IO or dio with 0 bytes write, just return */ 3762 /* if not async direct IO or dio with 0 bytes write, just return */
3657 if (!io_end || !size) 3763 if (!io_end || !size)
@@ -3663,7 +3769,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3663 size); 3769 size);
3664 3770
3665 /* if not aio dio with unwritten extents, just free io and return */ 3771 /* if not aio dio with unwritten extents, just free io and return */
3666 if (io_end->flag != DIO_AIO_UNWRITTEN){ 3772 if (io_end->flag != EXT4_IO_UNWRITTEN){
3667 ext4_free_io_end(io_end); 3773 ext4_free_io_end(io_end);
3668 iocb->private = NULL; 3774 iocb->private = NULL;
3669 return; 3775 return;
@@ -3671,16 +3777,85 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3671 3777
3672 io_end->offset = offset; 3778 io_end->offset = offset;
3673 io_end->size = size; 3779 io_end->size = size;
3780 io_end->flag = EXT4_IO_UNWRITTEN;
3674 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3781 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3675 3782
3676 /* queue the work to convert unwritten extents to written */ 3783 /* queue the work to convert unwritten extents to written */
3677 queue_work(wq, &io_end->work); 3784 queue_work(wq, &io_end->work);
3678 3785
3679 /* Add the io_end to per-inode completed aio dio list*/ 3786 /* Add the io_end to per-inode completed aio dio list*/
3680 list_add_tail(&io_end->list, 3787 ei = EXT4_I(io_end->inode);
3681 &EXT4_I(io_end->inode)->i_aio_dio_complete_list); 3788 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3789 list_add_tail(&io_end->list, &ei->i_completed_io_list);
3790 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3682 iocb->private = NULL; 3791 iocb->private = NULL;
3683} 3792}
3793
3794static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3795{
3796 ext4_io_end_t *io_end = bh->b_private;
3797 struct workqueue_struct *wq;
3798 struct inode *inode;
3799 unsigned long flags;
3800
3801 if (!test_clear_buffer_uninit(bh) || !io_end)
3802 goto out;
3803
3804 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
3805 printk("sb umounted, discard end_io request for inode %lu\n",
3806 io_end->inode->i_ino);
3807 ext4_free_io_end(io_end);
3808 goto out;
3809 }
3810
3811 io_end->flag = EXT4_IO_UNWRITTEN;
3812 inode = io_end->inode;
3813
3814 /* Add the io_end to per-inode completed io list*/
3815 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3816 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
3817 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3818
3819 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
3820 /* queue the work to convert unwritten extents to written */
3821 queue_work(wq, &io_end->work);
3822out:
3823 bh->b_private = NULL;
3824 bh->b_end_io = NULL;
3825 clear_buffer_uninit(bh);
3826 end_buffer_async_write(bh, uptodate);
3827}
3828
3829static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3830{
3831 ext4_io_end_t *io_end;
3832 struct page *page = bh->b_page;
3833 loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
3834 size_t size = bh->b_size;
3835
3836retry:
3837 io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3838 if (!io_end) {
3839 if (printk_ratelimit())
3840 printk(KERN_WARNING "%s: allocation fail\n", __func__);
3841 schedule();
3842 goto retry;
3843 }
3844 io_end->offset = offset;
3845 io_end->size = size;
3846 /*
3847 * We need to hold a reference to the page to make sure it
3848 * doesn't get evicted before ext4_end_io_work() has a chance
3849 * to convert the extent from unwritten to written.
3850 */
3851 io_end->page = page;
3852 get_page(io_end->page);
3853
3854 bh->b_private = io_end;
3855 bh->b_end_io = ext4_end_io_buffer_write;
3856 return 0;
3857}
3858
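ext4_set_bh_endio() above allocates the io_end with GFP_ATOMIC (it can be called where sleeping in the allocator is not allowed), loops until the small allocation succeeds, and pins the page so it survives until the deferred work runs. A toy standalone version of that retry-and-pin shape (plain C; malloc and sched_yield stand in for kmalloc(GFP_ATOMIC) and schedule()):

#include <stdio.h>
#include <stdlib.h>
#include <sched.h>

/* Toy refcounted page, modelling get_page()/put_page(). */
struct page { int refcount; };
static void get_page(struct page *p) { p->refcount++; }
static void put_page(struct page *p) { p->refcount--; }

struct io_end { struct page *page; };

/* A small must-not-fail allocation is retried after yielding, and
 * the page is pinned so it outlives the pending completion work. */
static struct io_end *alloc_io_end_pinned(struct page *page)
{
    struct io_end *io;

    for (;;) {
        io = malloc(sizeof(*io));       /* kmalloc(GFP_ATOMIC) stand-in */
        if (io)
            break;
        fprintf(stderr, "allocation failed, retrying\n");
        sched_yield();                  /* schedule() stand-in */
    }

    io->page = page;
    get_page(page);                     /* pin until the work runs */
    return io;
}

int main(void)
{
    struct page pg = { 1 };
    struct io_end *io = alloc_io_end_pinned(&pg);
    printf("refcount while pending: %d\n", pg.refcount);   /* 2 */
    put_page(io->page);                 /* completion side drops the pin */
    free(io);
    return 0;
}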
3684/* 3859/*
3685 * For ext4 extent files, ext4 will do direct-io write to holes, 3860 * For ext4 extent files, ext4 will do direct-io write to holes,
3686 * preallocated extents, and those write extend the file, no need to 3861 * preallocated extents, and those write extend the file, no need to
@@ -3734,7 +3909,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3734 iocb->private = NULL; 3909 iocb->private = NULL;
3735 EXT4_I(inode)->cur_aio_dio = NULL; 3910 EXT4_I(inode)->cur_aio_dio = NULL;
3736 if (!is_sync_kiocb(iocb)) { 3911 if (!is_sync_kiocb(iocb)) {
3737 iocb->private = ext4_init_io_end(inode); 3912 iocb->private = ext4_init_io_end(inode, GFP_NOFS);
3738 if (!iocb->private) 3913 if (!iocb->private)
3739 return -ENOMEM; 3914 return -ENOMEM;
3740 /* 3915 /*
@@ -3750,7 +3925,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3750 ret = blockdev_direct_IO(rw, iocb, inode, 3925 ret = blockdev_direct_IO(rw, iocb, inode,
3751 inode->i_sb->s_bdev, iov, 3926 inode->i_sb->s_bdev, iov,
3752 offset, nr_segs, 3927 offset, nr_segs,
3753 ext4_get_block_dio_write, 3928 ext4_get_block_write,
3754 ext4_end_io_dio); 3929 ext4_end_io_dio);
3755 if (iocb->private) 3930 if (iocb->private)
3756 EXT4_I(inode)->cur_aio_dio = NULL; 3931 EXT4_I(inode)->cur_aio_dio = NULL;
@@ -3771,8 +3946,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3771 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3946 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3772 ext4_free_io_end(iocb->private); 3947 ext4_free_io_end(iocb->private);
3773 iocb->private = NULL; 3948 iocb->private = NULL;
3774 } else if (ret > 0 && (EXT4_I(inode)->i_state & 3949 } else if (ret > 0 && ext4_test_inode_state(inode,
3775 EXT4_STATE_DIO_UNWRITTEN)) { 3950 EXT4_STATE_DIO_UNWRITTEN)) {
3776 int err; 3951 int err;
3777 /* 3952 /*
3778 * for non AIO case, since the IO is already 3953 * for non AIO case, since the IO is already
@@ -3782,7 +3957,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3782 offset, ret); 3957 offset, ret);
3783 if (err < 0) 3958 if (err < 0)
3784 ret = err; 3959 ret = err;
3785 EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; 3960 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3786 } 3961 }
3787 return ret; 3962 return ret;
3788 } 3963 }
@@ -4064,7 +4239,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
4064 int k, err; 4239 int k, err;
4065 4240
4066 *top = 0; 4241 *top = 0;
4067 /* Make k index the deepest non-null offest + 1 */ 4242 /* Make k index the deepest non-null offset + 1 */
4068 for (k = depth; k > 1 && !offsets[k-1]; k--) 4243 for (k = depth; k > 1 && !offsets[k-1]; k--)
4069 ; 4244 ;
4070 partial = ext4_get_branch(inode, k, offsets, chain, &err); 4245 partial = ext4_get_branch(inode, k, offsets, chain, &err);
@@ -4113,13 +4288,27 @@ no_top:
4113 * We release `count' blocks on disk, but (last - first) may be greater 4288 * We release `count' blocks on disk, but (last - first) may be greater
4114 * than `count' because there can be holes in there. 4289 * than `count' because there can be holes in there.
4115 */ 4290 */
4116static void ext4_clear_blocks(handle_t *handle, struct inode *inode, 4291static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4117 struct buffer_head *bh, 4292 struct buffer_head *bh,
4118 ext4_fsblk_t block_to_free, 4293 ext4_fsblk_t block_to_free,
4119 unsigned long count, __le32 *first, 4294 unsigned long count, __le32 *first,
4120 __le32 *last) 4295 __le32 *last)
4121{ 4296{
4122 __le32 *p; 4297 __le32 *p;
4298 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4299
4300 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4301 flags |= EXT4_FREE_BLOCKS_METADATA;
4302
4303 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4304 count)) {
4305 ext4_error(inode->i_sb, "inode #%lu: "
4306 "attempt to clear blocks %llu len %lu, invalid",
4307 inode->i_ino, (unsigned long long) block_to_free,
4308 count);
4309 return 1;
4310 }
4311
4123 if (try_to_extend_transaction(handle, inode)) { 4312 if (try_to_extend_transaction(handle, inode)) {
4124 if (bh) { 4313 if (bh) {
4125 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4314 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
@@ -4134,27 +4323,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
4134 } 4323 }
4135 } 4324 }
4136 4325
4137 /* 4326 for (p = first; p < last; p++)
4138 * Any buffers which are on the journal will be in memory. We 4327 *p = 0;
4139 * find them on the hash table so jbd2_journal_revoke() will
4140 * run jbd2_journal_forget() on them. We've already detached
4141 * each block from the file, so bforget() in
4142 * jbd2_journal_forget() should be safe.
4143 *
4144 * AKPM: turn on bforget in jbd2_journal_forget()!!!
4145 */
4146 for (p = first; p < last; p++) {
4147 u32 nr = le32_to_cpu(*p);
4148 if (nr) {
4149 struct buffer_head *tbh;
4150
4151 *p = 0;
4152 tbh = sb_find_get_block(inode->i_sb, nr);
4153 ext4_forget(handle, 0, inode, tbh, nr);
4154 }
4155 }
4156 4328
4157 ext4_free_blocks(handle, inode, block_to_free, count, 0); 4329 ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
4330 return 0;
4158} 4331}
4159 4332
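ext4_clear_blocks() now refuses to free a run that falls outside the data area, turning a corrupted indirect block into a reported error instead of freeing arbitrary blocks. A minimal standalone sketch of such a range check (plain C; the real ext4_data_block_valid() also excludes system-zone blocks, which is omitted here):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative filesystem geometry. */
#define FIRST_DATA_BLOCK 1ULL
#define TOTAL_BLOCKS     100000ULL

/* A block run may be freed only if it lies entirely inside the
 * data area of the filesystem. */
static bool block_range_valid(unsigned long long start, unsigned long count)
{
    return start >= FIRST_DATA_BLOCK &&
           start + count <= TOTAL_BLOCKS &&
           start + count > start;              /* overflow check */
}

static int clear_blocks(unsigned long long start, unsigned long count)
{
    if (!block_range_valid(start, count)) {
        fprintf(stderr, "attempt to clear blocks %llu len %lu, invalid\n",
                start, count);
        return 1;       /* caller stops the truncate walk */
    }
    /* ... zero the pointers and free the run ... */
    return 0;
}

int main(void)
{
    printf("%d\n", clear_blocks(10, 32));       /* 0: ok */
    printf("%d\n", clear_blocks(99990, 32));    /* 1: past end */
    return 0;
}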
4160/** 4333/**
@@ -4210,9 +4383,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4210 } else if (nr == block_to_free + count) { 4383 } else if (nr == block_to_free + count) {
4211 count++; 4384 count++;
4212 } else { 4385 } else {
4213 ext4_clear_blocks(handle, inode, this_bh, 4386 if (ext4_clear_blocks(handle, inode, this_bh,
4214 block_to_free, 4387 block_to_free, count,
4215 count, block_to_free_p, p); 4388 block_to_free_p, p))
4389 break;
4216 block_to_free = nr; 4390 block_to_free = nr;
4217 block_to_free_p = p; 4391 block_to_free_p = p;
4218 count = 1; 4392 count = 1;
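The caller above accumulates contiguous block numbers into one run and flushes it through ext4_clear_blocks() whenever the sequence breaks, so frees are batched rather than issued one block at a time. A standalone C sketch of that coalescing loop (error handling and the buffer-head bookkeeping are omitted):

#include <stdio.h>

/* Stand-in for the free operation on one contiguous run. */
static void free_run(unsigned long start, unsigned long count)
{
    printf("free %lu..%lu\n", start, start + count - 1);
}

/* Extend the current run while block numbers stay contiguous, flush
 * it when the sequence breaks, and skip holes (zero entries). */
static void free_coalesced(const unsigned long *blocks, int n)
{
    unsigned long start = 0, count = 0;

    for (int i = 0; i < n; i++) {
        unsigned long nr = blocks[i];
        if (!nr)
            continue;                   /* a hole */
        if (count == 0) {
            start = nr; count = 1;
        } else if (nr == start + count) {
            count++;                    /* extends the run */
        } else {
            free_run(start, count);     /* run broken: flush it */
            start = nr; count = 1;
        }
    }
    if (count)
        free_run(start, count);
}

int main(void)
{
    unsigned long blocks[] = { 100, 101, 102, 0, 200, 201, 50 };
    free_coalesced(blocks, 7);   /* frees 100..102, 200..201, 50..50 */
    return 0;
}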
@@ -4236,7 +4410,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4236 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 4410 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4237 ext4_handle_dirty_metadata(handle, inode, this_bh); 4411 ext4_handle_dirty_metadata(handle, inode, this_bh);
4238 else 4412 else
4239 ext4_error(inode->i_sb, __func__, 4413 ext4_error(inode->i_sb,
4240 "circular indirect block detected, " 4414 "circular indirect block detected, "
4241 "inode=%lu, block=%llu", 4415 "inode=%lu, block=%llu",
4242 inode->i_ino, 4416 inode->i_ino,
@@ -4276,6 +4450,16 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4276 if (!nr) 4450 if (!nr)
4277 continue; /* A hole */ 4451 continue; /* A hole */
4278 4452
4453 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4454 nr, 1)) {
4455 ext4_error(inode->i_sb,
4456 "indirect mapped block in inode "
4457 "#%lu invalid (level %d, blk #%lu)",
4458 inode->i_ino, depth,
4459 (unsigned long) nr);
4460 break;
4461 }
4462
4279 /* Go read the buffer for the next level down */ 4463 /* Go read the buffer for the next level down */
4280 bh = sb_bread(inode->i_sb, nr); 4464 bh = sb_bread(inode->i_sb, nr);
4281 4465
@@ -4284,7 +4468,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4284 * (should be rare). 4468 * (should be rare).
4285 */ 4469 */
4286 if (!bh) { 4470 if (!bh) {
4287 ext4_error(inode->i_sb, "ext4_free_branches", 4471 ext4_error(inode->i_sb,
4288 "Read failure, inode=%lu, block=%llu", 4472 "Read failure, inode=%lu, block=%llu",
4289 inode->i_ino, nr); 4473 inode->i_ino, nr);
4290 continue; 4474 continue;
@@ -4342,7 +4526,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4342 blocks_for_truncate(inode)); 4526 blocks_for_truncate(inode));
4343 } 4527 }
4344 4528
4345 ext4_free_blocks(handle, inode, nr, 1, 1); 4529 ext4_free_blocks(handle, inode, 0, nr, 1,
4530 EXT4_FREE_BLOCKS_METADATA);
4346 4531
4347 if (parent_bh) { 4532 if (parent_bh) {
4348 /* 4533 /*
@@ -4427,8 +4612,10 @@ void ext4_truncate(struct inode *inode)
4427 if (!ext4_can_truncate(inode)) 4612 if (!ext4_can_truncate(inode))
4428 return; 4613 return;
4429 4614
4615 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
4616
4430 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4617 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4431 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; 4618 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4432 4619
4433 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 4620 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
4434 ext4_ext_truncate(inode); 4621 ext4_ext_truncate(inode);
@@ -4598,9 +4785,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
4598 4785
4599 bh = sb_getblk(sb, block); 4786 bh = sb_getblk(sb, block);
4600 if (!bh) { 4787 if (!bh) {
4601 ext4_error(sb, "ext4_get_inode_loc", "unable to read " 4788 ext4_error(sb, "unable to read inode block - "
4602 "inode block - inode=%lu, block=%llu", 4789 "inode=%lu, block=%llu", inode->i_ino, block);
4603 inode->i_ino, block);
4604 return -EIO; 4790 return -EIO;
4605 } 4791 }
4606 if (!buffer_uptodate(bh)) { 4792 if (!buffer_uptodate(bh)) {
@@ -4698,9 +4884,8 @@ make_io:
4698 submit_bh(READ_META, bh); 4884 submit_bh(READ_META, bh);
4699 wait_on_buffer(bh); 4885 wait_on_buffer(bh);
4700 if (!buffer_uptodate(bh)) { 4886 if (!buffer_uptodate(bh)) {
4701 ext4_error(sb, __func__, 4887 ext4_error(sb, "unable to read inode block - inode=%lu,"
4702 "unable to read inode block - inode=%lu, " 4888 " block=%llu", inode->i_ino, block);
4703 "block=%llu", inode->i_ino, block);
4704 brelse(bh); 4889 brelse(bh);
4705 return -EIO; 4890 return -EIO;
4706 } 4891 }
@@ -4714,7 +4899,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
4714{ 4899{
4715 /* We have all inode data except xattrs in memory here. */ 4900 /* We have all inode data except xattrs in memory here. */
4716 return __ext4_get_inode_loc(inode, iloc, 4901 return __ext4_get_inode_loc(inode, iloc,
4717 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); 4902 !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
4718} 4903}
4719 4904
4720void ext4_set_inode_flags(struct inode *inode) 4905void ext4_set_inode_flags(struct inode *inode)
@@ -4781,8 +4966,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4781 struct ext4_iloc iloc; 4966 struct ext4_iloc iloc;
4782 struct ext4_inode *raw_inode; 4967 struct ext4_inode *raw_inode;
4783 struct ext4_inode_info *ei; 4968 struct ext4_inode_info *ei;
4784 struct buffer_head *bh;
4785 struct inode *inode; 4969 struct inode *inode;
4970 journal_t *journal = EXT4_SB(sb)->s_journal;
4786 long ret; 4971 long ret;
4787 int block; 4972 int block;
4788 4973
@@ -4793,11 +4978,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4793 return inode; 4978 return inode;
4794 4979
4795 ei = EXT4_I(inode); 4980 ei = EXT4_I(inode);
4981 iloc.bh = 0;
4796 4982
4797 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4983 ret = __ext4_get_inode_loc(inode, &iloc, 0);
4798 if (ret < 0) 4984 if (ret < 0)
4799 goto bad_inode; 4985 goto bad_inode;
4800 bh = iloc.bh;
4801 raw_inode = ext4_raw_inode(&iloc); 4986 raw_inode = ext4_raw_inode(&iloc);
4802 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4987 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
4803 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4988 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
@@ -4808,7 +4993,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4808 } 4993 }
4809 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4994 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
4810 4995
4811 ei->i_state = 0; 4996 ei->i_state_flags = 0;
4812 ei->i_dir_start_lookup = 0; 4997 ei->i_dir_start_lookup = 0;
4813 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4998 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
4814 /* We now have enough fields to check if the inode was active or not. 4999 /* We now have enough fields to check if the inode was active or not.
@@ -4820,7 +5005,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4820 if (inode->i_mode == 0 || 5005 if (inode->i_mode == 0 ||
4821 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 5006 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
4822 /* this inode is deleted */ 5007 /* this inode is deleted */
4823 brelse(bh);
4824 ret = -ESTALE; 5008 ret = -ESTALE;
4825 goto bad_inode; 5009 goto bad_inode;
4826 } 5010 }
@@ -4837,6 +5021,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4837 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 5021 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
4838 inode->i_size = ext4_isize(raw_inode); 5022 inode->i_size = ext4_isize(raw_inode);
4839 ei->i_disksize = inode->i_size; 5023 ei->i_disksize = inode->i_size;
5024#ifdef CONFIG_QUOTA
5025 ei->i_reserved_quota = 0;
5026#endif
4840 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 5027 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
4841 ei->i_block_group = iloc.block_group; 5028 ei->i_block_group = iloc.block_group;
4842 ei->i_last_alloc_group = ~0; 5029 ei->i_last_alloc_group = ~0;
@@ -4848,11 +5035,35 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4848 ei->i_data[block] = raw_inode->i_block[block]; 5035 ei->i_data[block] = raw_inode->i_block[block];
4849 INIT_LIST_HEAD(&ei->i_orphan); 5036 INIT_LIST_HEAD(&ei->i_orphan);
4850 5037
5038 /*
5039 * Set transaction id's of transactions that have to be committed
5040 * to finish f[data]sync. We set them to the currently running transaction
5041 * as we cannot be sure that the inode or some of its metadata isn't
5042 * part of the transaction - the inode could have been reclaimed and
5043 * now it is reread from disk.
5044 */
5045 if (journal) {
5046 transaction_t *transaction;
5047 tid_t tid;
5048
5049 spin_lock(&journal->j_state_lock);
5050 if (journal->j_running_transaction)
5051 transaction = journal->j_running_transaction;
5052 else
5053 transaction = journal->j_committing_transaction;
5054 if (transaction)
5055 tid = transaction->t_tid;
5056 else
5057 tid = journal->j_commit_sequence;
5058 spin_unlock(&journal->j_state_lock);
5059 ei->i_sync_tid = tid;
5060 ei->i_datasync_tid = tid;
5061 }
5062
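The block above snapshots which transaction a later fsync/fdatasync must wait on: the running transaction if there is one, else the committing one, else the last committed sequence, all read under j_state_lock so the fields are seen consistently. A standalone model of that snapshot (plain C; a pthread mutex stands in for the spinlock and the journal struct is a toy):

#include <pthread.h>
#include <stdio.h>

/* Toy journal state, modelling the j_state_lock'ed snapshot. */
struct journal {
    pthread_mutex_t state_lock;
    int have_running, have_committing;
    unsigned running_tid, committing_tid, commit_sequence;
};

/* Prefer the running transaction, then the committing one, else the
 * last committed sequence; take the lock so the reads are atomic
 * with respect to a transaction changing state. */
static unsigned pick_sync_tid(struct journal *j)
{
    unsigned tid;

    pthread_mutex_lock(&j->state_lock);
    if (j->have_running)
        tid = j->running_tid;
    else if (j->have_committing)
        tid = j->committing_tid;
    else
        tid = j->commit_sequence;
    pthread_mutex_unlock(&j->state_lock);
    return tid;
}

int main(void)
{
    struct journal j = { PTHREAD_MUTEX_INITIALIZER, 0, 1, 0, 42, 41 };
    printf("sync tid = %u\n", pick_sync_tid(&j));   /* 42: committing */
    return 0;
}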
4851 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 5063 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4852 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 5064 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
4853 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 5065 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
4854 EXT4_INODE_SIZE(inode->i_sb)) { 5066 EXT4_INODE_SIZE(inode->i_sb)) {
4855 brelse(bh);
4856 ret = -EIO; 5067 ret = -EIO;
4857 goto bad_inode; 5068 goto bad_inode;
4858 } 5069 }
@@ -4865,7 +5076,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4865 EXT4_GOOD_OLD_INODE_SIZE + 5076 EXT4_GOOD_OLD_INODE_SIZE +
4866 ei->i_extra_isize; 5077 ei->i_extra_isize;
4867 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 5078 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
4868 ei->i_state |= EXT4_STATE_XATTR; 5079 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
4869 } 5080 }
4870 } else 5081 } else
4871 ei->i_extra_isize = 0; 5082 ei->i_extra_isize = 0;
@@ -4884,12 +5095,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4884 5095
4885 ret = 0; 5096 ret = 0;
4886 if (ei->i_file_acl && 5097 if (ei->i_file_acl &&
4887 ((ei->i_file_acl < 5098 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
4888 (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + 5099 ext4_error(sb, "bad extended attribute block %llu inode #%lu",
4889 EXT4_SB(sb)->s_gdb_count)) ||
4890 (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
4891 ext4_error(sb, __func__,
4892 "bad extended attribute block %llu in inode #%lu",
4893 ei->i_file_acl, inode->i_ino); 5100 ei->i_file_acl, inode->i_ino);
4894 ret = -EIO; 5101 ret = -EIO;
4895 goto bad_inode; 5102 goto bad_inode;
@@ -4905,10 +5112,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4905 /* Validate block references which are part of inode */ 5112 /* Validate block references which are part of inode */
4906 ret = ext4_check_inode_blockref(inode); 5113 ret = ext4_check_inode_blockref(inode);
4907 } 5114 }
4908 if (ret) { 5115 if (ret)
4909 brelse(bh);
4910 goto bad_inode; 5116 goto bad_inode;
4911 }
4912 5117
4913 if (S_ISREG(inode->i_mode)) { 5118 if (S_ISREG(inode->i_mode)) {
4914 inode->i_op = &ext4_file_inode_operations; 5119 inode->i_op = &ext4_file_inode_operations;
@@ -4936,10 +5141,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4936 init_special_inode(inode, inode->i_mode, 5141 init_special_inode(inode, inode->i_mode,
4937 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5142 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4938 } else { 5143 } else {
4939 brelse(bh);
4940 ret = -EIO; 5144 ret = -EIO;
4941 ext4_error(inode->i_sb, __func__, 5145 ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu",
4942 "bogus i_mode (%o) for inode=%lu",
4943 inode->i_mode, inode->i_ino); 5146 inode->i_mode, inode->i_ino);
4944 goto bad_inode; 5147 goto bad_inode;
4945 } 5148 }
@@ -4949,6 +5152,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4949 return inode; 5152 return inode;
4950 5153
4951bad_inode: 5154bad_inode:
5155 brelse(iloc.bh);
4952 iget_failed(inode); 5156 iget_failed(inode);
4953 return ERR_PTR(ret); 5157 return ERR_PTR(ret);
4954} 5158}
@@ -5010,7 +5214,7 @@ static int ext4_do_update_inode(handle_t *handle,
5010 5214
5011 /* For fields not not tracking in the in-memory inode, 5215 /* For fields not not tracking in the in-memory inode,
5012 * initialise them to zero for new inodes. */ 5216 * initialise them to zero for new inodes. */
5013 if (ei->i_state & EXT4_STATE_NEW) 5217 if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
5014 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 5218 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
5015 5219
5016 ext4_get_inode_flags(ei); 5220 ext4_get_inode_flags(ei);
@@ -5074,7 +5278,7 @@ static int ext4_do_update_inode(handle_t *handle,
5074 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 5278 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
5075 sb->s_dirt = 1; 5279 sb->s_dirt = 1;
5076 ext4_handle_sync(handle); 5280 ext4_handle_sync(handle);
5077 err = ext4_handle_dirty_metadata(handle, inode, 5281 err = ext4_handle_dirty_metadata(handle, NULL,
5078 EXT4_SB(sb)->s_sbh); 5282 EXT4_SB(sb)->s_sbh);
5079 } 5283 }
5080 } 5284 }
@@ -5103,11 +5307,12 @@ static int ext4_do_update_inode(handle_t *handle,
5103 } 5307 }
5104 5308
5105 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 5309 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
5106 rc = ext4_handle_dirty_metadata(handle, inode, bh); 5310 rc = ext4_handle_dirty_metadata(handle, NULL, bh);
5107 if (!err) 5311 if (!err)
5108 err = rc; 5312 err = rc;
5109 ei->i_state &= ~EXT4_STATE_NEW; 5313 ext4_clear_inode_state(inode, EXT4_STATE_NEW);
5110 5314
5315 ext4_update_inode_fsync_trans(handle, inode, 0);
5111out_brelse: 5316out_brelse:
5112 brelse(bh); 5317 brelse(bh);
5113 ext4_std_error(inode->i_sb, err); 5318 ext4_std_error(inode->i_sb, err);
@@ -5149,7 +5354,7 @@ out_brelse:
5149 * `stuff()' is running, and the new i_size will be lost. Plus the inode 5354 * `stuff()' is running, and the new i_size will be lost. Plus the inode
5150 * will no longer be on the superblock's dirty inode list. 5355 * will no longer be on the superblock's dirty inode list.
5151 */ 5356 */
5152int ext4_write_inode(struct inode *inode, int wait) 5357int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5153{ 5358{
5154 int err; 5359 int err;
5155 5360
@@ -5163,7 +5368,7 @@ int ext4_write_inode(struct inode *inode, int wait)
5163 return -EIO; 5368 return -EIO;
5164 } 5369 }
5165 5370
5166 if (!wait) 5371 if (wbc->sync_mode != WB_SYNC_ALL)
5167 return 0; 5372 return 0;
5168 5373
5169 err = ext4_force_commit(inode->i_sb); 5374 err = ext4_force_commit(inode->i_sb);
@@ -5173,13 +5378,11 @@ int ext4_write_inode(struct inode *inode, int wait)
5173 err = ext4_get_inode_loc(inode, &iloc); 5378 err = ext4_get_inode_loc(inode, &iloc);
5174 if (err) 5379 if (err)
5175 return err; 5380 return err;
5176 if (wait) 5381 if (wbc->sync_mode == WB_SYNC_ALL)
5177 sync_dirty_buffer(iloc.bh); 5382 sync_dirty_buffer(iloc.bh);
5178 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5383 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5179 ext4_error(inode->i_sb, __func__, 5384 ext4_error(inode->i_sb, "IO error syncing inode, "
5180 "IO error syncing inode, " 5385 "inode=%lu, block=%llu", inode->i_ino,
5181 "inode=%lu, block=%llu",
5182 inode->i_ino,
5183 (unsigned long long)iloc.bh->b_blocknr); 5386 (unsigned long long)iloc.bh->b_blocknr);
5184 err = -EIO; 5387 err = -EIO;
5185 } 5388 }
@@ -5221,19 +5424,21 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5221 if (error) 5424 if (error)
5222 return error; 5425 return error;
5223 5426
5427 if (ia_valid & ATTR_SIZE)
5428 dquot_initialize(inode);
5224 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 5429 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
5225 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 5430 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
5226 handle_t *handle; 5431 handle_t *handle;
5227 5432
5228 /* (user+group)*(old+new) structure, inode write (sb, 5433 /* (user+group)*(old+new) structure, inode write (sb,
5229 * inode block, ? - but truncate inode update has it) */ 5434 * inode block, ? - but truncate inode update has it) */
5230 handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ 5435 handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
5231 EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); 5436 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
5232 if (IS_ERR(handle)) { 5437 if (IS_ERR(handle)) {
5233 error = PTR_ERR(handle); 5438 error = PTR_ERR(handle);
5234 goto err_out; 5439 goto err_out;
5235 } 5440 }
5236 error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; 5441 error = dquot_transfer(inode, attr);
5237 if (error) { 5442 if (error) {
5238 ext4_journal_stop(handle); 5443 ext4_journal_stop(handle);
5239 return error; 5444 return error;
@@ -5260,7 +5465,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5260 } 5465 }
5261 5466
5262 if (S_ISREG(inode->i_mode) && 5467 if (S_ISREG(inode->i_mode) &&
5263 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { 5468 attr->ia_valid & ATTR_SIZE &&
5469 (attr->ia_size < inode->i_size ||
5470 (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
5264 handle_t *handle; 5471 handle_t *handle;
5265 5472
5266 handle = ext4_journal_start(inode, 3); 5473 handle = ext4_journal_start(inode, 3);
@@ -5291,6 +5498,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5291 goto err_out; 5498 goto err_out;
5292 } 5499 }
5293 } 5500 }
5501 /* ext4_truncate will clear the flag */
5502 if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
5503 ext4_truncate(inode);
5294 } 5504 }
5295 5505
5296 rc = inode_setattr(inode, attr); 5506 rc = inode_setattr(inode, attr);
@@ -5376,7 +5586,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5376 * worse case, the indexs blocks spread over different block groups 5586 * worse case, the indexs blocks spread over different block groups
5377 * 5587 *
5378 * If datablocks are discontiguous, they are possible to spread over 5588 * If datablocks are discontiguous, they are possible to spread over
5379 * different block groups too. If they are contiugous, with flexbg, 5589 * different block groups too. If they are contiguous, with flexbg,
5380 * they could still across block group boundary. 5590 * they could still across block group boundary.
5381 * 5591 *
5382 * Also account for superblock, inode, quota and xattr blocks 5592 * Also account for superblock, inode, quota and xattr blocks
@@ -5452,7 +5662,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
5452 * Calculate the journal credits for a chunk of data modification. 5662 * Calculate the journal credits for a chunk of data modification.
5453 * 5663 *
5454 * This is called from DIO, fallocate or whoever calling 5664 * This is called from DIO, fallocate or whoever calling
5455 * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks. 5665 * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
5456 * 5666 *
5457 * journal buffers for data blocks are not included here, as DIO 5667 * journal buffers for data blocks are not included here, as DIO
5458 * and fallocate do no need to journal data buffers. 5668 * and fallocate do no need to journal data buffers.
@@ -5529,8 +5739,8 @@ static int ext4_expand_extra_isize(struct inode *inode,
5529 entry = IFIRST(header); 5739 entry = IFIRST(header);
5530 5740
5531 /* No extended attributes present */ 5741 /* No extended attributes present */
5532 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || 5742 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
5533 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 5743 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
5534 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, 5744 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
5535 new_extra_isize); 5745 new_extra_isize);
5536 EXT4_I(inode)->i_extra_isize = new_extra_isize; 5746 EXT4_I(inode)->i_extra_isize = new_extra_isize;
@@ -5574,7 +5784,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5574 err = ext4_reserve_inode_write(handle, inode, &iloc); 5784 err = ext4_reserve_inode_write(handle, inode, &iloc);
5575 if (ext4_handle_valid(handle) && 5785 if (ext4_handle_valid(handle) &&
5576 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5786 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
5577 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { 5787 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
5578 /* 5788 /*
5579 * We need extra buffer credits since we may write into EA block 5789 * We need extra buffer credits since we may write into EA block
5580 * with this same handle. If journal_extend fails, then it will 5790 * with this same handle. If journal_extend fails, then it will
@@ -5588,10 +5798,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5588 sbi->s_want_extra_isize, 5798 sbi->s_want_extra_isize,
5589 iloc, handle); 5799 iloc, handle);
5590 if (ret) { 5800 if (ret) {
5591 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; 5801 ext4_set_inode_state(inode,
5802 EXT4_STATE_NO_EXPAND);
5592 if (mnt_count != 5803 if (mnt_count !=
5593 le16_to_cpu(sbi->s_es->s_mnt_count)) { 5804 le16_to_cpu(sbi->s_es->s_mnt_count)) {
5594 ext4_warning(inode->i_sb, __func__, 5805 ext4_warning(inode->i_sb,
5595 "Unable to expand inode %lu. Delete" 5806 "Unable to expand inode %lu. Delete"
5596 " some EAs or run e2fsck.", 5807 " some EAs or run e2fsck.",
5597 inode->i_ino); 5808 inode->i_ino);
@@ -5613,7 +5824,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5613 * i_size has been changed by generic_commit_write() and we thus need 5824 * i_size has been changed by generic_commit_write() and we thus need
5614 * to include the updated inode in the current transaction. 5825 * to include the updated inode in the current transaction.
5615 * 5826 *
5616 * Also, vfs_dq_alloc_block() will always dirty the inode when blocks 5827 * Also, dquot_alloc_block() will always dirty the inode when blocks
5617 * are allocated to the file. 5828 * are allocated to the file.
5618 * 5829 *
5619 * If the inode is marked synchronous, we don't honour that here - doing 5830 * If the inode is marked synchronous, we don't honour that here - doing
@@ -5655,7 +5866,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
5655 err = jbd2_journal_get_write_access(handle, iloc.bh); 5866 err = jbd2_journal_get_write_access(handle, iloc.bh);
5656 if (!err) 5867 if (!err)
5657 err = ext4_handle_dirty_metadata(handle, 5868 err = ext4_handle_dirty_metadata(handle,
5658 inode, 5869 NULL,
5659 iloc.bh); 5870 iloc.bh);
5660 brelse(iloc.bh); 5871 brelse(iloc.bh);
5661 } 5872 }
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index c1cdf613e725..016d0249294f 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -92,6 +92,15 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
92 flags &= ~EXT4_EXTENTS_FL; 92 flags &= ~EXT4_EXTENTS_FL;
93 } 93 }
94 94
95 if (flags & EXT4_EOFBLOCKS_FL) {
96 /* we don't support adding EOFBLOCKS flag */
97 if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
98 err = -EOPNOTSUPP;
99 goto flags_out;
100 }
101 } else if (oldflags & EXT4_EOFBLOCKS_FL)
102 ext4_truncate(inode);
103
95 handle = ext4_journal_start(inode, 1); 104 handle = ext4_journal_start(inode, 1);
96 if (IS_ERR(handle)) { 105 if (IS_ERR(handle)) {
97 err = PTR_ERR(handle); 106 err = PTR_ERR(handle);
@@ -221,31 +230,39 @@ setversion_out:
221 struct file *donor_filp; 230 struct file *donor_filp;
222 int err; 231 int err;
223 232
233 if (!(filp->f_mode & FMODE_READ) ||
234 !(filp->f_mode & FMODE_WRITE))
235 return -EBADF;
236
224 if (copy_from_user(&me, 237 if (copy_from_user(&me,
225 (struct move_extent __user *)arg, sizeof(me))) 238 (struct move_extent __user *)arg, sizeof(me)))
226 return -EFAULT; 239 return -EFAULT;
240 me.moved_len = 0;
227 241
228 donor_filp = fget(me.donor_fd); 242 donor_filp = fget(me.donor_fd);
229 if (!donor_filp) 243 if (!donor_filp)
230 return -EBADF; 244 return -EBADF;
231 245
232 if (!capable(CAP_DAC_OVERRIDE)) { 246 if (!(donor_filp->f_mode & FMODE_WRITE)) {
233 if ((current->real_cred->fsuid != inode->i_uid) || 247 err = -EBADF;
234 !(inode->i_mode & S_IRUSR) || 248 goto mext_out;
235 !(donor_filp->f_dentry->d_inode->i_mode &
236 S_IRUSR)) {
237 fput(donor_filp);
238 return -EACCES;
239 }
240 } 249 }
241 250
251 err = mnt_want_write(filp->f_path.mnt);
252 if (err)
253 goto mext_out;
254
242 err = ext4_move_extents(filp, donor_filp, me.orig_start, 255 err = ext4_move_extents(filp, donor_filp, me.orig_start,
243 me.donor_start, me.len, &me.moved_len); 256 me.donor_start, me.len, &me.moved_len);
244 fput(donor_filp); 257 mnt_drop_write(filp->f_path.mnt);
245 258 if (me.moved_len > 0)
246 if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) 259 file_remove_suid(donor_filp);
247 return -EFAULT;
248 260
261 if (copy_to_user((struct move_extent __user *)arg,
262 &me, sizeof(me)))
263 err = -EFAULT;
264mext_out:
265 fput(donor_filp);
249 return err; 266 return err;
250 } 267 }
251 268
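The rewritten EXT4_IOC_MOVE_EXT checks above replace the old CAP_DAC_OVERRIDE/uid test with plain open-mode checks: the origin descriptor must be open for both read and write, and the donor must be writable. A standalone sketch of that check (plain C; the FMODE_* values here are illustrative):

#include <errno.h>
#include <stdio.h>

#define FMODE_READ  0x1
#define FMODE_WRITE 0x2

/* Both files must have been opened with enough access for the
 * extent swap; otherwise fail with EBADF, as the hunk does. */
static int check_move_extent_modes(unsigned orig_mode, unsigned donor_mode)
{
    if (!(orig_mode & FMODE_READ) || !(orig_mode & FMODE_WRITE))
        return -EBADF;
    if (!(donor_mode & FMODE_WRITE))
        return -EBADF;
    return 0;
}

int main(void)
{
    printf("%d\n", check_move_extent_modes(FMODE_READ | FMODE_WRITE,
                                           FMODE_WRITE));            /* 0 */
    printf("%d\n", check_move_extent_modes(FMODE_READ, FMODE_WRITE)); /* -EBADF */
    return 0;
}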
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index bba12824defa..bde9d0b170c2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -23,6 +23,7 @@
23 23
24#include "mballoc.h" 24#include "mballoc.h"
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/slab.h>
26#include <trace/events/ext4.h> 27#include <trace/events/ext4.h>
27 28
28/* 29/*
@@ -69,7 +70,7 @@
69 * 70 *
70 * pa_lstart -> the logical start block for this prealloc space 71 * pa_lstart -> the logical start block for this prealloc space
71 * pa_pstart -> the physical start block for this prealloc space 72 * pa_pstart -> the physical start block for this prealloc space
72 * pa_len -> lenght for this prealloc space 73 * pa_len -> length for this prealloc space
73 * pa_free -> free space available in this prealloc space 74 * pa_free -> free space available in this prealloc space
74 * 75 *
75 * The inode preallocation space is used looking at the _logical_ start 76 * The inode preallocation space is used looking at the _logical_ start
@@ -142,7 +143,7 @@
142 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The 143 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
143 * value of s_mb_order2_reqs can be tuned via 144 * value of s_mb_order2_reqs can be tuned via
144 * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to 145 * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
145 * stripe size (sbi->s_stripe), we try to search for contigous block in 146 * stripe size (sbi->s_stripe), we try to search for contiguous block in
146 * stripe size. This should result in better allocation on RAID setups. If 147 * stripe size. This should result in better allocation on RAID setups. If
147 * not, we search in the specific group using bitmap for best extents. The 148 * not, we search in the specific group using bitmap for best extents. The
148 * tunable min_to_scan and max_to_scan control the behaviour here. 149 * tunable min_to_scan and max_to_scan control the behaviour here.
@@ -441,10 +442,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
441 for (i = 0; i < count; i++) { 442 for (i = 0; i < count; i++) {
442 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { 443 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
443 ext4_fsblk_t blocknr; 444 ext4_fsblk_t blocknr;
444 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); 445
446 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
445 blocknr += first + i; 447 blocknr += first + i;
446 blocknr +=
447 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
448 ext4_grp_locked_error(sb, e4b->bd_group, 448 ext4_grp_locked_error(sb, e4b->bd_group,
449 __func__, "double-free of inode" 449 __func__, "double-free of inode"
450 " %lu's block %llu(bit %u in group %u)", 450 " %lu's block %llu(bit %u in group %u)",
@@ -1255,10 +1255,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1255 1255
1256 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { 1256 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
1257 ext4_fsblk_t blocknr; 1257 ext4_fsblk_t blocknr;
1258 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); 1258
1259 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1259 blocknr += block; 1260 blocknr += block;
1260 blocknr +=
1261 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
1262 ext4_grp_locked_error(sb, e4b->bd_group, 1261 ext4_grp_locked_error(sb, e4b->bd_group,
1263 __func__, "double-free of inode" 1262 __func__, "double-free of inode"
1264 " %lu's block %llu(bit %u in group %u)", 1263 " %lu's block %llu(bit %u in group %u)",
@@ -1631,7 +1630,6 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1631 int max; 1630 int max;
1632 int err; 1631 int err;
1633 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1632 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1634 struct ext4_super_block *es = sbi->s_es;
1635 struct ext4_free_extent ex; 1633 struct ext4_free_extent ex;
1636 1634
1637 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) 1635 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
@@ -1648,8 +1646,8 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1648 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { 1646 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1649 ext4_fsblk_t start; 1647 ext4_fsblk_t start;
1650 1648
1651 start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + 1649 start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
1652 ex.fe_start + le32_to_cpu(es->s_first_data_block); 1650 ex.fe_start;
1653 /* use do_div to get remainder (would be 64-bit modulo) */ 1651 /* use do_div to get remainder (would be 64-bit modulo) */
1654 if (do_div(start, sbi->s_stripe) == 0) { 1652 if (do_div(start, sbi->s_stripe) == 0) {
1655 ac->ac_found++; 1653 ac->ac_found++;
@@ -1803,8 +1801,8 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1803 BUG_ON(sbi->s_stripe == 0); 1801 BUG_ON(sbi->s_stripe == 0);
1804 1802
1805 /* find first stripe-aligned block in group */ 1803 /* find first stripe-aligned block in group */
1806 first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb) 1804 first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
1807 + le32_to_cpu(sbi->s_es->s_first_data_block); 1805
1808 a = first_group_block + sbi->s_stripe - 1; 1806 a = first_group_block + sbi->s_stripe - 1;
1809 do_div(a, sbi->s_stripe); 1807 do_div(a, sbi->s_stripe);
1810 i = (a * sbi->s_stripe) - first_group_block; 1808 i = (a * sbi->s_stripe) - first_group_block;
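The stripe-alignment arithmetic above rounds the group's first block up to the next stripe multiple and converts it back into an in-group offset; do_div() is used because this is 64-bit division on 32-bit hosts. The same computation as a standalone C function (plain division stands in for do_div):

#include <stdio.h>

/* Round 'first_group_block' up to the next multiple of 'stripe' and
 * return that block's offset inside the group, as the sequence
 *   a = first_group_block + stripe - 1; do_div(a, stripe);
 *   i = a * stripe - first_group_block;
 * does in the hunk above. */
static unsigned long first_aligned_offset(unsigned long long first_group_block,
                                          unsigned long stripe)
{
    unsigned long long a = (first_group_block + stripe - 1) / stripe;
    return (unsigned long)(a * stripe - first_group_block);
}

int main(void)
{
    /* group starts at block 32769, stripe of 16: the next multiple
     * is 32784, i.e. offset 15 into the group */
    printf("%lu\n", first_aligned_offset(32769, 16));
    return 0;
}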
@@ -2256,7 +2254,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2256 2254
2257 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2255 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2258 init_rwsem(&meta_group_info[i]->alloc_sem); 2256 init_rwsem(&meta_group_info[i]->alloc_sem);
2259 meta_group_info[i]->bb_free_root.rb_node = NULL; 2257 meta_group_info[i]->bb_free_root = RB_ROOT;
2260 2258
2261#ifdef DOUBLE_CHECK 2259#ifdef DOUBLE_CHECK
2262 { 2260 {
@@ -2529,7 +2527,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2529 struct ext4_group_info *db; 2527 struct ext4_group_info *db;
2530 int err, count = 0, count2 = 0; 2528 int err, count = 0, count2 = 0;
2531 struct ext4_free_data *entry; 2529 struct ext4_free_data *entry;
2532 ext4_fsblk_t discard_block;
2533 struct list_head *l, *ltmp; 2530 struct list_head *l, *ltmp;
2534 2531
2535 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2532 list_for_each_safe(l, ltmp, &txn->t_private_list) {
@@ -2559,13 +2556,16 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2559 page_cache_release(e4b.bd_bitmap_page); 2556 page_cache_release(e4b.bd_bitmap_page);
2560 } 2557 }
2561 ext4_unlock_group(sb, entry->group); 2558 ext4_unlock_group(sb, entry->group);
2562 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) 2559 if (test_opt(sb, DISCARD)) {
2563 + entry->start_blk 2560 ext4_fsblk_t discard_block;
2564 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 2561
2565 trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, 2562 discard_block = entry->start_blk +
2566 entry->count); 2563 ext4_group_first_block_no(sb, entry->group);
2567 sb_issue_discard(sb, discard_block, entry->count); 2564 trace_ext4_discard_blocks(sb,
2568 2565 (unsigned long long)discard_block,
2566 entry->count);
2567 sb_issue_discard(sb, discard_block, entry->count);
2568 }
2569 kmem_cache_free(ext4_free_ext_cachep, entry); 2569 kmem_cache_free(ext4_free_ext_cachep, entry);
2570 ext4_mb_release_desc(&e4b); 2570 ext4_mb_release_desc(&e4b);
2571 } 2571 }
@@ -2698,14 +2698,11 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2698 if (err) 2698 if (err)
2699 goto out_err; 2699 goto out_err;
2700 2700
2701 block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb) 2701 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
2702 + ac->ac_b_ex.fe_start
2703 + le32_to_cpu(es->s_first_data_block);
2704 2702
2705 len = ac->ac_b_ex.fe_len; 2703 len = ac->ac_b_ex.fe_len;
2706 if (!ext4_data_block_valid(sbi, block, len)) { 2704 if (!ext4_data_block_valid(sbi, block, len)) {
2707 ext4_error(sb, __func__, 2705 ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2708 "Allocating blocks %llu-%llu which overlap "
2709 "fs metadata\n", block, block+len); 2706 "fs metadata\n", block, block+len);
2710 /* File system mounted not to panic on error 2707 /* File system mounted not to panic on error
2711 * Fix the bitmap and repeat the block allocation 2708 * Fix the bitmap and repeat the block allocation
@@ -2750,12 +2747,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2750 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) 2747 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2751 /* release all the reserved blocks if non delalloc */ 2748 /* release all the reserved blocks if non delalloc */
2752 percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); 2749 percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
2753 else {
2754 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
2755 ac->ac_b_ex.fe_len);
2756 /* convert reserved quota blocks to real quota blocks */
2757 vfs_dq_claim_block(ac->ac_inode, ac->ac_b_ex.fe_len);
2758 }
2759 2750
2760 if (sbi->s_log_groups_per_flex) { 2751 if (sbi->s_log_groups_per_flex) {
2761 ext4_group_t flex_group = ext4_flex_group(sbi, 2752 ext4_group_t flex_group = ext4_flex_group(sbi,
@@ -3006,6 +2997,24 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3006} 2997}
3007 2998
3008/* 2999/*
3000 * Called on failure; free up any blocks from the inode PA for this
3001 * context. We don't need this for MB_GROUP_PA because we only change
3002 * pa_free in ext4_mb_release_context(), but on failure, we've already
3003 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
3004 */
3005static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3006{
3007 struct ext4_prealloc_space *pa = ac->ac_pa;
3008 int len;
3009
3010 if (pa && pa->pa_type == MB_INODE_PA) {
3011 len = ac->ac_b_ex.fe_len;
3012 pa->pa_free += len;
3013 }
3014
3015}
3016
3017/*
3009 * use blocks preallocated to inode 3018 * use blocks preallocated to inode
3010 */ 3019 */
3011static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, 3020static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
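The helper above exists for ordering reasons: once the error path in ext4_mb_new_blocks() clears ac->ac_b_ex.fe_len, the record of how many blocks were carved out of the inode PA is gone, so the pa_free credit has to be restored first. A condensed sketch of the call site (the full hunk appears further down in this patch):

        } else if (*errp) {
                /* credit the blocks back while ac_b_ex.fe_len still
                 * records how many were taken from the inode PA */
                ext4_discard_allocated_blocks(ac);
                ac->ac_b_ex.fe_len = 0;
                ar->len = 0;
                ext4_mb_show_ac(ac);
        }

Group PAs need no such repair: as the comment notes, their pa_free is only touched in ext4_mb_release_context(), which already sees the zeroed length.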
@@ -3144,9 +3153,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3144 /* The max size of hash table is PREALLOC_TB_SIZE */ 3153 /* The max size of hash table is PREALLOC_TB_SIZE */
3145 order = PREALLOC_TB_SIZE - 1; 3154 order = PREALLOC_TB_SIZE - 1;
3146 3155
3147 goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + 3156 goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
3148 ac->ac_g_ex.fe_start +
3149 le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
3150 /* 3157 /*
3151 * search for the prealloc space that is having 3158 * search for the prealloc space that is having
3152 * minimal distance from the goal block. 3159 * minimal distance from the goal block.
@@ -3509,8 +3516,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3509 if (bit >= end) 3516 if (bit >= end)
3510 break; 3517 break;
3511 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3518 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3512 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + 3519 start = ext4_group_first_block_no(sb, group) + bit;
3513 le32_to_cpu(sbi->s_es->s_first_data_block);
3514 mb_debug(1, " free preallocated %u/%u in group %u\n", 3520 mb_debug(1, " free preallocated %u/%u in group %u\n",
3515 (unsigned) start, (unsigned) next - bit, 3521 (unsigned) start, (unsigned) next - bit,
3516 (unsigned) group); 3522 (unsigned) group);
@@ -3606,15 +3612,13 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3606 3612
3607 bitmap_bh = ext4_read_block_bitmap(sb, group); 3613 bitmap_bh = ext4_read_block_bitmap(sb, group);
3608 if (bitmap_bh == NULL) { 3614 if (bitmap_bh == NULL) {
3609 ext4_error(sb, __func__, "Error in reading block " 3615 ext4_error(sb, "Error reading block bitmap for %u", group);
3610 "bitmap for %u", group);
3611 return 0; 3616 return 0;
3612 } 3617 }
3613 3618
3614 err = ext4_mb_load_buddy(sb, group, &e4b); 3619 err = ext4_mb_load_buddy(sb, group, &e4b);
3615 if (err) { 3620 if (err) {
3616 ext4_error(sb, __func__, "Error in loading buddy " 3621 ext4_error(sb, "Error loading buddy information for %u", group);
3617 "information for %u", group);
3618 put_bh(bitmap_bh); 3622 put_bh(bitmap_bh);
3619 return 0; 3623 return 0;
3620 } 3624 }
@@ -3787,15 +3791,15 @@ repeat:
3787 3791
3788 err = ext4_mb_load_buddy(sb, group, &e4b); 3792 err = ext4_mb_load_buddy(sb, group, &e4b);
3789 if (err) { 3793 if (err) {
3790 ext4_error(sb, __func__, "Error in loading buddy " 3794 ext4_error(sb, "Error loading buddy information for %u",
3791 "information for %u", group); 3795 group);
3792 continue; 3796 continue;
3793 } 3797 }
3794 3798
3795 bitmap_bh = ext4_read_block_bitmap(sb, group); 3799 bitmap_bh = ext4_read_block_bitmap(sb, group);
3796 if (bitmap_bh == NULL) { 3800 if (bitmap_bh == NULL) {
3797 ext4_error(sb, __func__, "Error in reading block " 3801 ext4_error(sb, "Error reading block bitmap for %u",
3798 "bitmap for %u", group); 3802 group);
3799 ext4_mb_release_desc(&e4b); 3803 ext4_mb_release_desc(&e4b);
3800 continue; 3804 continue;
3801 } 3805 }
@@ -3921,7 +3925,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
3921 3925
3922 /* don't use group allocation for large files */ 3926 /* don't use group allocation for large files */
3923 size = max(size, isize); 3927 size = max(size, isize);
3924 if (size >= sbi->s_mb_stream_request) { 3928 if (size > sbi->s_mb_stream_request) {
3925 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 3929 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
3926 return; 3930 return;
3927 } 3931 }
@@ -3932,7 +3936,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
3932 * per cpu locality group is to reduce the contention between block 3936 * per cpu locality group is to reduce the contention between block
3933 * request from multiple CPUs. 3937 * request from multiple CPUs.
3934 */ 3938 */
3935 ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id()); 3939 ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups);
3936 3940
3937 /* we're going to use group allocation */ 3941 /* we're going to use group allocation */
3938 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; 3942 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
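The locality-group change above is a pure cleanup; under the percpu API of this kernel generation the two forms resolve to the same pointer. A minimal sketch of the equivalence (lg1/lg2 are illustrative locals, not from the patch):

        struct ext4_locality_group *lg1, *lg2;

        lg1 = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id());
        lg2 = __this_cpu_ptr(sbi->s_locality_groups);
        /* lg1 == lg2 unless the task migrates between the two lines */

The choice of CPU does not need to be exact here: the group only spreads allocator contention across CPUs, which is presumably why the preemption-tolerant raw_ variant was already in use.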
@@ -4060,8 +4064,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4060 4064
4061 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4065 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
4062 if (ext4_mb_load_buddy(sb, group, &e4b)) { 4066 if (ext4_mb_load_buddy(sb, group, &e4b)) {
4063 ext4_error(sb, __func__, "Error in loading buddy " 4067 ext4_error(sb, "Error loading buddy information for %u",
4064 "information for %u", group); 4068 group);
4065 continue; 4069 continue;
4066 } 4070 }
4067 ext4_lock_group(sb, group); 4071 ext4_lock_group(sb, group);
@@ -4237,7 +4241,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4237 return 0; 4241 return 0;
4238 } 4242 }
4239 reserv_blks = ar->len; 4243 reserv_blks = ar->len;
4240 while (ar->len && vfs_dq_alloc_block(ar->inode, ar->len)) { 4244 while (ar->len && dquot_alloc_block(ar->inode, ar->len)) {
4241 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4245 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4242 ar->len--; 4246 ar->len--;
4243 } 4247 }
@@ -4290,6 +4294,7 @@ repeat:
4290 ac->ac_status = AC_STATUS_CONTINUE; 4294 ac->ac_status = AC_STATUS_CONTINUE;
4291 goto repeat; 4295 goto repeat;
4292 } else if (*errp) { 4296 } else if (*errp) {
4297 ext4_discard_allocated_blocks(ac);
4293 ac->ac_b_ex.fe_len = 0; 4298 ac->ac_b_ex.fe_len = 0;
4294 ar->len = 0; 4299 ar->len = 0;
4295 ext4_mb_show_ac(ac); 4300 ext4_mb_show_ac(ac);
@@ -4313,7 +4318,7 @@ out2:
4313 kmem_cache_free(ext4_ac_cachep, ac); 4318 kmem_cache_free(ext4_ac_cachep, ac);
4314out1: 4319out1:
4315 if (inquota && ar->len < inquota) 4320 if (inquota && ar->len < inquota)
4316 vfs_dq_free_block(ar->inode, inquota - ar->len); 4321 dquot_free_block(ar->inode, inquota - ar->len);
4317out3: 4322out3:
4318 if (!ar->len) { 4323 if (!ar->len) {
4319 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4324 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
@@ -4422,18 +4427,24 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4422 return 0; 4427 return 0;
4423} 4428}
4424 4429
4425/* 4430/**
4426 * Main entry point into mballoc to free blocks 4431 * ext4_free_blocks() -- Free given blocks and update quota
4432 * @handle: handle for this transaction
4433 * @inode: inode
4434 * @block: start physical block to free
 4435 * @count: number of blocks to free
 4436 * @flags: EXT4_FREE_BLOCKS_* flags describing how the blocks are freed
4427 */ 4437 */
4428void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, 4438void ext4_free_blocks(handle_t *handle, struct inode *inode,
4429 ext4_fsblk_t block, unsigned long count, 4439 struct buffer_head *bh, ext4_fsblk_t block,
4430 int metadata, unsigned long *freed) 4440 unsigned long count, int flags)
4431{ 4441{
4432 struct buffer_head *bitmap_bh = NULL; 4442 struct buffer_head *bitmap_bh = NULL;
4433 struct super_block *sb = inode->i_sb; 4443 struct super_block *sb = inode->i_sb;
4434 struct ext4_allocation_context *ac = NULL; 4444 struct ext4_allocation_context *ac = NULL;
4435 struct ext4_group_desc *gdp; 4445 struct ext4_group_desc *gdp;
4436 struct ext4_super_block *es; 4446 struct ext4_super_block *es;
4447 unsigned long freed = 0;
4437 unsigned int overflow; 4448 unsigned int overflow;
4438 ext4_grpblk_t bit; 4449 ext4_grpblk_t bit;
4439 struct buffer_head *gd_bh; 4450 struct buffer_head *gd_bh;
@@ -4443,21 +4454,49 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4443 int err = 0; 4454 int err = 0;
4444 int ret; 4455 int ret;
4445 4456
4446 *freed = 0; 4457 if (bh) {
4458 if (block)
4459 BUG_ON(block != bh->b_blocknr);
4460 else
4461 block = bh->b_blocknr;
4462 }
4447 4463
4448 sbi = EXT4_SB(sb); 4464 sbi = EXT4_SB(sb);
4449 es = EXT4_SB(sb)->s_es; 4465 es = EXT4_SB(sb)->s_es;
4450 if (block < le32_to_cpu(es->s_first_data_block) || 4466 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4451 block + count < block || 4467 !ext4_data_block_valid(sbi, block, count)) {
4452 block + count > ext4_blocks_count(es)) { 4468 ext4_error(sb, "Freeing blocks not in datazone - "
4453 ext4_error(sb, __func__, 4469 "block = %llu, count = %lu", block, count);
4454 "Freeing blocks not in datazone - "
4455 "block = %llu, count = %lu", block, count);
4456 goto error_return; 4470 goto error_return;
4457 } 4471 }
4458 4472
4459 ext4_debug("freeing block %llu\n", block); 4473 ext4_debug("freeing block %llu\n", block);
4460 trace_ext4_free_blocks(inode, block, count, metadata); 4474 trace_ext4_free_blocks(inode, block, count, flags);
4475
4476 if (flags & EXT4_FREE_BLOCKS_FORGET) {
4477 struct buffer_head *tbh = bh;
4478 int i;
4479
4480 BUG_ON(bh && (count > 1));
4481
4482 for (i = 0; i < count; i++) {
4483 if (!bh)
4484 tbh = sb_find_get_block(inode->i_sb,
4485 block + i);
4486 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4487 inode, tbh, block + i);
4488 }
4489 }
4490
4491 /*
4492 * We need to make sure we don't reuse the freed block until
4493 * after the transaction is committed, which we can do by
4494 * treating the block as metadata, below. We make an
4495 * exception if the inode is to be written in writeback mode
4496 * since writeback mode has weak data consistency guarantees.
4497 */
4498 if (!ext4_should_writeback_data(inode))
4499 flags |= EXT4_FREE_BLOCKS_METADATA;
4461 4500
4462 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4501 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4463 if (ac) { 4502 if (ac) {
@@ -4495,8 +4534,7 @@ do_more:
4495 in_range(block + count - 1, ext4_inode_table(sb, gdp), 4534 in_range(block + count - 1, ext4_inode_table(sb, gdp),
4496 EXT4_SB(sb)->s_itb_per_group)) { 4535 EXT4_SB(sb)->s_itb_per_group)) {
4497 4536
4498 ext4_error(sb, __func__, 4537 ext4_error(sb, "Freeing blocks in system zone - "
4499 "Freeing blocks in system zone - "
4500 "Block = %llu, count = %lu", block, count); 4538 "Block = %llu, count = %lu", block, count);
4501 /* err = 0. ext4_std_error should be a no op */ 4539 /* err = 0. ext4_std_error should be a no op */
4502 goto error_return; 4540 goto error_return;
@@ -4533,7 +4571,8 @@ do_more:
4533 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4571 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4534 if (err) 4572 if (err)
4535 goto error_return; 4573 goto error_return;
4536 if (metadata && ext4_handle_valid(handle)) { 4574
4575 if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
4537 struct ext4_free_data *new_entry; 4576 struct ext4_free_data *new_entry;
4538 /* 4577 /*
4539 * blocks being freed are metadata. these blocks shouldn't 4578 * blocks being freed are metadata. these blocks shouldn't
@@ -4572,7 +4611,7 @@ do_more:
4572 4611
4573 ext4_mb_release_desc(&e4b); 4612 ext4_mb_release_desc(&e4b);
4574 4613
4575 *freed += count; 4614 freed += count;
4576 4615
4577 /* We dirtied the bitmap block */ 4616 /* We dirtied the bitmap block */
4578 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 4617 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -4592,6 +4631,8 @@ do_more:
4592 } 4631 }
4593 sb->s_dirt = 1; 4632 sb->s_dirt = 1;
4594error_return: 4633error_return:
4634 if (freed)
4635 dquot_free_block(inode, freed);
4595 brelse(bitmap_bh); 4636 brelse(bitmap_bh);
4596 ext4_std_error(sb, err); 4637 ext4_std_error(sb, err);
4597 if (ac) 4638 if (ac)
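Taken together, the hunks above fold three duties into the callee: quota release (the dquot_free_block() call at error_return), bforget handling (EXT4_FREE_BLOCKS_FORGET), and the reuse-after-commit decision (EXT4_FREE_BLOCKS_METADATA is forced unless the inode uses writeback mode). A before/after sketch for a caller freeing one metadata block; the old-style quota release is illustrative of the old division of labor, not a quote of a specific call site:

        /* before: caller gets a count back and settles quota itself */
        unsigned long freed;
        ext4_mb_free_blocks(handle, inode, blk, 1, 1 /* metadata */, &freed);
        vfs_dq_free_block(inode, freed);

        /* after: one call; behaviour is selected through flags */
        ext4_free_blocks(handle, inode, NULL /* no bh */, blk, 1,
                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);

The migrate.c hunks below show exactly this conversion, passing 0 in the buffer-head slot.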
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 0ca811061bc7..b619322c76f0 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -17,7 +17,6 @@
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/pagemap.h> 18#include <linux/pagemap.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/version.h>
21#include <linux/blkdev.h> 20#include <linux/blkdev.h>
22#include <linux/mutex.h> 21#include <linux/mutex.h>
23#include "ext4_jbd2.h" 22#include "ext4_jbd2.h"
@@ -221,16 +220,9 @@ struct ext4_buddy {
221#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 220#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
222#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 221#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
223 222
224#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
225
226static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 223static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
227 struct ext4_free_extent *fex) 224 struct ext4_free_extent *fex)
228{ 225{
229 ext4_fsblk_t block; 226 return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start;
230
231 block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb)
232 + fex->fe_start
233 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
234 return block;
235} 227}
236#endif 228#endif
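Every open-coded conversion removed by this patch reduces to the same identity, now kept in one place. As a comment-style sketch (ext4_group_first_block_no() is the helper from ext4.h):

        /*
         * ext4_group_first_block_no(sb, group)
         *     == (ext4_fsblk_t)group * EXT4_BLOCKS_PER_GROUP(sb)
         *        + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)
         *
         * so ext4_grp_offs_to_block() returns the same block as before,
         * with the 64-bit widening and the s_first_data_block offset
         * handled once instead of at every call site.
         */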
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a93d5b80f3e2..34dcfc52ef44 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -13,6 +13,7 @@
13 */ 13 */
14 14
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h>
16#include "ext4_jbd2.h" 17#include "ext4_jbd2.h"
17#include "ext4_extents.h" 18#include "ext4_extents.h"
18 19
@@ -238,7 +239,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
238 * So allocate a credit of 3. We may update 239 * So allocate a credit of 3. We may update
239 * quota (user and group). 240 * quota (user and group).
240 */ 241 */
241 needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 242 needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
242 243
243 if (ext4_journal_extend(handle, needed) != 0) 244 if (ext4_journal_extend(handle, needed) != 0)
244 retval = ext4_journal_restart(handle, needed); 245 retval = ext4_journal_restart(handle, needed);
@@ -262,13 +263,17 @@ static int free_dind_blocks(handle_t *handle,
262 for (i = 0; i < max_entries; i++) { 263 for (i = 0; i < max_entries; i++) {
263 if (tmp_idata[i]) { 264 if (tmp_idata[i]) {
264 extend_credit_for_blkdel(handle, inode); 265 extend_credit_for_blkdel(handle, inode);
265 ext4_free_blocks(handle, inode, 266 ext4_free_blocks(handle, inode, 0,
266 le32_to_cpu(tmp_idata[i]), 1, 1); 267 le32_to_cpu(tmp_idata[i]), 1,
268 EXT4_FREE_BLOCKS_METADATA |
269 EXT4_FREE_BLOCKS_FORGET);
267 } 270 }
268 } 271 }
269 put_bh(bh); 272 put_bh(bh);
270 extend_credit_for_blkdel(handle, inode); 273 extend_credit_for_blkdel(handle, inode);
271 ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); 274 ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
275 EXT4_FREE_BLOCKS_METADATA |
276 EXT4_FREE_BLOCKS_FORGET);
272 return 0; 277 return 0;
273} 278}
274 279
@@ -297,7 +302,9 @@ static int free_tind_blocks(handle_t *handle,
297 } 302 }
298 put_bh(bh); 303 put_bh(bh);
299 extend_credit_for_blkdel(handle, inode); 304 extend_credit_for_blkdel(handle, inode);
300 ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); 305 ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
306 EXT4_FREE_BLOCKS_METADATA |
307 EXT4_FREE_BLOCKS_FORGET);
301 return 0; 308 return 0;
302} 309}
303 310
@@ -308,8 +315,10 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
308 /* ei->i_data[EXT4_IND_BLOCK] */ 315 /* ei->i_data[EXT4_IND_BLOCK] */
309 if (i_data[0]) { 316 if (i_data[0]) {
310 extend_credit_for_blkdel(handle, inode); 317 extend_credit_for_blkdel(handle, inode);
311 ext4_free_blocks(handle, inode, 318 ext4_free_blocks(handle, inode, 0,
312 le32_to_cpu(i_data[0]), 1, 1); 319 le32_to_cpu(i_data[0]), 1,
320 EXT4_FREE_BLOCKS_METADATA |
321 EXT4_FREE_BLOCKS_FORGET);
313 } 322 }
314 323
315 /* ei->i_data[EXT4_DIND_BLOCK] */ 324 /* ei->i_data[EXT4_DIND_BLOCK] */
@@ -357,12 +366,12 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
357 * happened after we started the migrate. We need to 366 * happened after we started the migrate. We need to
358 * fail the migrate 367 * fail the migrate
359 */ 368 */
360 if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) { 369 if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) {
361 retval = -EAGAIN; 370 retval = -EAGAIN;
362 up_write(&EXT4_I(inode)->i_data_sem); 371 up_write(&EXT4_I(inode)->i_data_sem);
363 goto err_out; 372 goto err_out;
364 } else 373 } else
365 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; 374 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
366 /* 375 /*
367 * We have the extent map build with the tmp inode. 376 * We have the extent map build with the tmp inode.
368 * Now copy the i_data across 377 * Now copy the i_data across
@@ -419,7 +428,8 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
419 } 428 }
420 put_bh(bh); 429 put_bh(bh);
421 extend_credit_for_blkdel(handle, inode); 430 extend_credit_for_blkdel(handle, inode);
422 ext4_free_blocks(handle, inode, block, 1, 1); 431 ext4_free_blocks(handle, inode, 0, block, 1,
432 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
423 return retval; 433 return retval;
424} 434}
425 435
@@ -477,7 +487,7 @@ int ext4_ext_migrate(struct inode *inode)
477 handle = ext4_journal_start(inode, 487 handle = ext4_journal_start(inode,
478 EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 488 EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
479 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 489 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
480 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb) 490 EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
481 + 1); 491 + 1);
482 if (IS_ERR(handle)) { 492 if (IS_ERR(handle)) {
483 retval = PTR_ERR(handle); 493 retval = PTR_ERR(handle);
@@ -494,14 +504,10 @@ int ext4_ext_migrate(struct inode *inode)
494 } 504 }
495 i_size_write(tmp_inode, i_size_read(inode)); 505 i_size_write(tmp_inode, i_size_read(inode));
496 /* 506 /*
497 * We don't want the inode to be reclaimed 507 * Set the i_nlink to zero so it will be deleted later
498 * if we got interrupted in between. We have 508 * when we drop the inode reference.
499 * this tmp inode carrying reference to the
500 * data blocks of the original file. We set
501 * the i_nlink to zero at the last stage after
502 * switching the original file to extent format
503 */ 509 */
504 tmp_inode->i_nlink = 1; 510 tmp_inode->i_nlink = 0;
505 511
506 ext4_ext_tree_init(handle, tmp_inode); 512 ext4_ext_tree_init(handle, tmp_inode);
507 ext4_orphan_add(handle, tmp_inode); 513 ext4_orphan_add(handle, tmp_inode);
@@ -524,10 +530,20 @@ int ext4_ext_migrate(struct inode *inode)
524 * allocation. 530 * allocation.
525 */ 531 */
526 down_read((&EXT4_I(inode)->i_data_sem)); 532 down_read((&EXT4_I(inode)->i_data_sem));
527 EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE; 533 ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
528 up_read((&EXT4_I(inode)->i_data_sem)); 534 up_read((&EXT4_I(inode)->i_data_sem));
529 535
530 handle = ext4_journal_start(inode, 1); 536 handle = ext4_journal_start(inode, 1);
537 if (IS_ERR(handle)) {
538 /*
539 * It is impossible to update on-disk structures without
540 * a handle, so just rollback in-core changes and live other
541 * work to orphan_list_cleanup()
542 */
543 ext4_orphan_del(NULL, tmp_inode);
544 retval = PTR_ERR(handle);
545 goto out;
546 }
531 547
532 ei = EXT4_I(inode); 548 ei = EXT4_I(inode);
533 i_data = ei->i_data; 549 i_data = ei->i_data;
@@ -609,15 +625,8 @@ err_out:
609 625
610 /* Reset the extent details */ 626 /* Reset the extent details */
611 ext4_ext_tree_init(handle, tmp_inode); 627 ext4_ext_tree_init(handle, tmp_inode);
612
613 /*
614 * Set the i_nlink to zero so that
615 * generic_drop_inode really deletes the
616 * inode
617 */
618 tmp_inode->i_nlink = 0;
619
620 ext4_journal_stop(handle); 628 ext4_journal_stop(handle);
629out:
621 unlock_new_inode(tmp_inode); 630 unlock_new_inode(tmp_inode);
622 iput(tmp_inode); 631 iput(tmp_inode);
623 632
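The migrate changes above replace a fragile late i_nlink update with the standard orphan pattern: mark the temporary inode deleted up front, keep it on the orphan list while it still owns the file's data blocks, and let the final iput() reclaim everything. A condensed lifecycle sketch stitched together from the hunks above:

        tmp_inode->i_nlink = 0;             /* delete on last reference */
        ext4_ext_tree_init(handle, tmp_inode);
        ext4_orphan_add(handle, tmp_inode); /* crash mid-migration: orphan
                                             * processing reclaims it    */
        ...
        unlock_new_inode(tmp_inode);
        iput(tmp_inode);                    /* last ref drops: inode and
                                             * its blocks are freed      */

If the second ext4_journal_start() fails, the new error path only undoes the in-core orphan state (ext4_orphan_del(NULL, tmp_inode)), since without a handle the on-disk side must be left to orphan_list_cleanup().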
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 25b6b1457360..d1fc662cc311 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -15,6 +15,7 @@
15 15
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/slab.h>
18#include "ext4_jbd2.h" 19#include "ext4_jbd2.h"
19#include "ext4_extents.h" 20#include "ext4_extents.h"
20#include "ext4.h" 21#include "ext4.h"
@@ -77,12 +78,14 @@ static int
77mext_next_extent(struct inode *inode, struct ext4_ext_path *path, 78mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
78 struct ext4_extent **extent) 79 struct ext4_extent **extent)
79{ 80{
81 struct ext4_extent_header *eh;
80 int ppos, leaf_ppos = path->p_depth; 82 int ppos, leaf_ppos = path->p_depth;
81 83
82 ppos = leaf_ppos; 84 ppos = leaf_ppos;
83 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { 85 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
84 /* leaf block */ 86 /* leaf block */
85 *extent = ++path[ppos].p_ext; 87 *extent = ++path[ppos].p_ext;
88 path[ppos].p_block = ext_pblock(path[ppos].p_ext);
86 return 0; 89 return 0;
87 } 90 }
88 91
@@ -119,9 +122,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
119 ext_block_hdr(path[cur_ppos+1].p_bh); 122 ext_block_hdr(path[cur_ppos+1].p_bh);
120 } 123 }
121 124
125 path[leaf_ppos].p_ext = *extent = NULL;
126
127 eh = path[leaf_ppos].p_hdr;
128 if (le16_to_cpu(eh->eh_entries) == 0)
129 /* empty leaf is found */
130 return -ENODATA;
131
122 /* leaf block */ 132 /* leaf block */
123 path[leaf_ppos].p_ext = *extent = 133 path[leaf_ppos].p_ext = *extent =
124 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); 134 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
135 path[leaf_ppos].p_block =
136 ext_pblock(path[leaf_ppos].p_ext);
125 return 0; 137 return 0;
126 } 138 }
127 } 139 }
@@ -141,12 +153,12 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2,
141 int ret = 0; 153 int ret = 0;
142 154
143 if (inode1 == NULL) { 155 if (inode1 == NULL) {
144 ext4_error(inode2->i_sb, function, 156 __ext4_error(inode2->i_sb, function,
145 "Both inodes should not be NULL: " 157 "Both inodes should not be NULL: "
146 "inode1 NULL inode2 %lu", inode2->i_ino); 158 "inode1 NULL inode2 %lu", inode2->i_ino);
147 ret = -EIO; 159 ret = -EIO;
148 } else if (inode2 == NULL) { 160 } else if (inode2 == NULL) {
149 ext4_error(inode1->i_sb, function, 161 __ext4_error(inode1->i_sb, function,
150 "Both inodes should not be NULL: " 162 "Both inodes should not be NULL: "
151 "inode1 %lu inode2 NULL", inode1->i_ino); 163 "inode1 %lu inode2 NULL", inode1->i_ino);
152 ret = -EIO; 164 ret = -EIO;
@@ -155,40 +167,15 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2,
155} 167}
156 168
157/** 169/**
158 * mext_double_down_read - Acquire two inodes' read semaphore 170 * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
159 * 171 *
160 * @orig_inode: original inode structure 172 * @orig_inode: original inode structure
161 * @donor_inode: donor inode structure 173 * @donor_inode: donor inode structure
162 * Acquire read semaphore of the two inodes (orig and donor) by i_ino order. 174 * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
175 * i_ino order.
163 */ 176 */
164static void 177static void
165mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode) 178double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
166{
167 struct inode *first = orig_inode, *second = donor_inode;
168
169 /*
170 * Use the inode number to provide the stable locking order instead
171 * of its address, because the C language doesn't guarantee you can
172 * compare pointers that don't come from the same array.
173 */
174 if (donor_inode->i_ino < orig_inode->i_ino) {
175 first = donor_inode;
176 second = orig_inode;
177 }
178
179 down_read(&EXT4_I(first)->i_data_sem);
180 down_read(&EXT4_I(second)->i_data_sem);
181}
182
183/**
184 * mext_double_down_write - Acquire two inodes' write semaphore
185 *
186 * @orig_inode: original inode structure
187 * @donor_inode: donor inode structure
188 * Acquire write semaphore of the two inodes (orig and donor) by i_ino order.
189 */
190static void
191mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
192{ 179{
193 struct inode *first = orig_inode, *second = donor_inode; 180 struct inode *first = orig_inode, *second = donor_inode;
194 181
@@ -203,32 +190,18 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
203 } 190 }
204 191
205 down_write(&EXT4_I(first)->i_data_sem); 192 down_write(&EXT4_I(first)->i_data_sem);
206 down_write(&EXT4_I(second)->i_data_sem); 193 down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
207}
208
209/**
210 * mext_double_up_read - Release two inodes' read semaphore
211 *
212 * @orig_inode: original inode structure to be released its lock first
213 * @donor_inode: donor inode structure to be released its lock second
214 * Release read semaphore of two inodes (orig and donor).
215 */
216static void
217mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
218{
219 up_read(&EXT4_I(orig_inode)->i_data_sem);
220 up_read(&EXT4_I(donor_inode)->i_data_sem);
221} 194}
222 195
223/** 196/**
224 * mext_double_up_write - Release two inodes' write semaphore 197 * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
225 * 198 *
226 * @orig_inode: original inode structure to be released its lock first 199 * @orig_inode: original inode structure to be released its lock first
227 * @donor_inode: donor inode structure to be released its lock second 200 * @donor_inode: donor inode structure to be released its lock second
228 * Release write semaphore of two inodes (orig and donor). 201 * Release write lock of i_data_sem of two inodes (orig and donor).
229 */ 202 */
230static void 203static void
231mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) 204double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
232{ 205{
233 up_write(&EXT4_I(orig_inode)->i_data_sem); 206 up_write(&EXT4_I(orig_inode)->i_data_sem);
234 up_write(&EXT4_I(donor_inode)->i_data_sem); 207 up_write(&EXT4_I(donor_inode)->i_data_sem);
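Beyond widening read locking to write locking, the helper above encodes the usual two-lock discipline for same-class locks. A simplified restatement of the pattern it implements:

        static void double_down_write_data_sem(struct inode *orig,
                                               struct inode *donor)
        {
                struct inode *first = orig, *second = donor;

                /* i_ino gives a stable global order, so any two tasks
                 * locking the same pair always lock in the same order */
                if (donor->i_ino < orig->i_ino) {
                        first = donor;
                        second = orig;
                }
                down_write(&EXT4_I(first)->i_data_sem);
                /* same lock class twice: the nested annotation tells
                 * lockdep this ordering is intentional */
                down_write_nested(&EXT4_I(second)->i_data_sem,
                                  SINGLE_DEPTH_NESTING);
        }

The switch from down_write() to down_write_nested() matters because both i_data_sem instances share one lock class; taking the second with plain down_write() would trip lockdep's recursive-locking report.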
@@ -280,6 +253,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
280 } 253 }
281 254
282 o_start->ee_len = start_ext->ee_len; 255 o_start->ee_len = start_ext->ee_len;
256 eblock = le32_to_cpu(start_ext->ee_block);
283 new_flag = 1; 257 new_flag = 1;
284 258
285 } else if (start_ext->ee_len && new_ext->ee_len && 259 } else if (start_ext->ee_len && new_ext->ee_len &&
@@ -290,6 +264,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
290 * orig |------------------------------| 264 * orig |------------------------------|
291 */ 265 */
292 o_start->ee_len = start_ext->ee_len; 266 o_start->ee_len = start_ext->ee_len;
267 eblock = le32_to_cpu(start_ext->ee_block);
293 new_flag = 1; 268 new_flag = 1;
294 269
295 } else if (!start_ext->ee_len && new_ext->ee_len && 270 } else if (!start_ext->ee_len && new_ext->ee_len &&
@@ -503,7 +478,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
503 struct ext4_extent *oext, *o_start, *o_end, *prev_ext; 478 struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
504 struct ext4_extent new_ext, start_ext, end_ext; 479 struct ext4_extent new_ext, start_ext, end_ext;
505 ext4_lblk_t new_ext_end; 480 ext4_lblk_t new_ext_end;
506 ext4_fsblk_t new_phys_end;
507 int oext_alen, new_ext_alen, end_ext_alen; 481 int oext_alen, new_ext_alen, end_ext_alen;
508 int depth = ext_depth(orig_inode); 482 int depth = ext_depth(orig_inode);
509 int ret; 483 int ret;
@@ -517,7 +491,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
517 new_ext.ee_len = dext->ee_len; 491 new_ext.ee_len = dext->ee_len;
518 new_ext_alen = ext4_ext_get_actual_len(&new_ext); 492 new_ext_alen = ext4_ext_get_actual_len(&new_ext);
519 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; 493 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
520 new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1;
521 494
522 /* 495 /*
523 * Case: original extent is first 496 * Case: original extent is first
@@ -530,6 +503,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
530 le32_to_cpu(oext->ee_block) + oext_alen) { 503 le32_to_cpu(oext->ee_block) + oext_alen) {
531 start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - 504 start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
532 le32_to_cpu(oext->ee_block)); 505 le32_to_cpu(oext->ee_block));
506 start_ext.ee_block = oext->ee_block;
533 copy_extent_status(oext, &start_ext); 507 copy_extent_status(oext, &start_ext);
534 } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { 508 } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
535 prev_ext = oext - 1; 509 prev_ext = oext - 1;
@@ -543,6 +517,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
543 start_ext.ee_len = cpu_to_le16( 517 start_ext.ee_len = cpu_to_le16(
544 ext4_ext_get_actual_len(prev_ext) + 518 ext4_ext_get_actual_len(prev_ext) +
545 new_ext_alen); 519 new_ext_alen);
520 start_ext.ee_block = oext->ee_block;
546 copy_extent_status(prev_ext, &start_ext); 521 copy_extent_status(prev_ext, &start_ext);
547 new_ext.ee_len = 0; 522 new_ext.ee_len = 0;
548 } 523 }
@@ -554,7 +529,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
554 * new_ext |-------| 529 * new_ext |-------|
555 */ 530 */
556 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { 531 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
557 ext4_error(orig_inode->i_sb, __func__, 532 ext4_error(orig_inode->i_sb,
558 "new_ext_end(%u) should be less than or equal to " 533 "new_ext_end(%u) should be less than or equal to "
559 "oext->ee_block(%u) + oext_alen(%d) - 1", 534 "oext->ee_block(%u) + oext_alen(%d) - 1",
560 new_ext_end, le32_to_cpu(oext->ee_block), 535 new_ext_end, le32_to_cpu(oext->ee_block),
@@ -596,7 +571,7 @@ out:
596 * @tmp_oext: the extent that will belong to the donor inode 571 * @tmp_oext: the extent that will belong to the donor inode
597 * @orig_off: block offset of original inode 572 * @orig_off: block offset of original inode
598 * @donor_off: block offset of donor inode 573 * @donor_off: block offset of donor inode
599 * @max_count: the maximun length of extents 574 * @max_count: the maximum length of extents
600 * 575 *
601 * Return 0 on success, or a negative error value on failure. 576 * Return 0 on success, or a negative error value on failure.
602 */ 577 */
@@ -661,6 +636,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
661 * @donor_inode: donor inode 636 * @donor_inode: donor inode
662 * @from: block offset of orig_inode 637 * @from: block offset of orig_inode
663 * @count: block count to be replaced 638 * @count: block count to be replaced
639 * @err: pointer to save return value
664 * 640 *
665 * Replace original inode extents and donor inode extents page by page. 641 * Replace original inode extents and donor inode extents page by page.
666 * We implement this replacement in the following three steps: 642 * We implement this replacement in the following three steps:
@@ -671,33 +647,33 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
671 * 3. Change the block information of donor inode to point at the saved 647 * 3. Change the block information of donor inode to point at the saved
672 * original inode blocks in the dummy extents. 648 * original inode blocks in the dummy extents.
673 * 649 *
674 * Return 0 on success, or a negative error value on failure. 650 * Return replaced block count.
675 */ 651 */
676static int 652static int
677mext_replace_branches(handle_t *handle, struct inode *orig_inode, 653mext_replace_branches(handle_t *handle, struct inode *orig_inode,
678 struct inode *donor_inode, ext4_lblk_t from, 654 struct inode *donor_inode, ext4_lblk_t from,
679 ext4_lblk_t count) 655 ext4_lblk_t count, int *err)
680{ 656{
681 struct ext4_ext_path *orig_path = NULL; 657 struct ext4_ext_path *orig_path = NULL;
682 struct ext4_ext_path *donor_path = NULL; 658 struct ext4_ext_path *donor_path = NULL;
683 struct ext4_extent *oext, *dext; 659 struct ext4_extent *oext, *dext;
684 struct ext4_extent tmp_dext, tmp_oext; 660 struct ext4_extent tmp_dext, tmp_oext;
685 ext4_lblk_t orig_off = from, donor_off = from; 661 ext4_lblk_t orig_off = from, donor_off = from;
686 int err = 0;
687 int depth; 662 int depth;
688 int replaced_count = 0; 663 int replaced_count = 0;
689 int dext_alen; 664 int dext_alen;
690 665
691 mext_double_down_write(orig_inode, donor_inode); 666 /* Protect extent trees against block allocations via delalloc */
667 double_down_write_data_sem(orig_inode, donor_inode);
692 668
693 /* Get the original extent for the block "orig_off" */ 669 /* Get the original extent for the block "orig_off" */
694 err = get_ext_path(orig_inode, orig_off, &orig_path); 670 *err = get_ext_path(orig_inode, orig_off, &orig_path);
695 if (err) 671 if (*err)
696 goto out; 672 goto out;
697 673
698 /* Get the donor extent for the head */ 674 /* Get the donor extent for the head */
699 err = get_ext_path(donor_inode, donor_off, &donor_path); 675 *err = get_ext_path(donor_inode, donor_off, &donor_path);
700 if (err) 676 if (*err)
701 goto out; 677 goto out;
702 depth = ext_depth(orig_inode); 678 depth = ext_depth(orig_inode);
703 oext = orig_path[depth].p_ext; 679 oext = orig_path[depth].p_ext;
@@ -707,39 +683,39 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
707 dext = donor_path[depth].p_ext; 683 dext = donor_path[depth].p_ext;
708 tmp_dext = *dext; 684 tmp_dext = *dext;
709 685
710 err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 686 *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
711 donor_off, count); 687 donor_off, count);
712 if (err) 688 if (*err)
713 goto out; 689 goto out;
714 690
715 /* Loop for the donor extents */ 691 /* Loop for the donor extents */
716 while (1) { 692 while (1) {
717 /* The extent for donor must be found. */ 693 /* The extent for donor must be found. */
718 if (!dext) { 694 if (!dext) {
719 ext4_error(donor_inode->i_sb, __func__, 695 ext4_error(donor_inode->i_sb,
720 "The extent for donor must be found"); 696 "The extent for donor must be found");
721 err = -EIO; 697 *err = -EIO;
722 goto out; 698 goto out;
723 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { 699 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
724 ext4_error(donor_inode->i_sb, __func__, 700 ext4_error(donor_inode->i_sb,
725 "Donor offset(%u) and the first block of donor " 701 "Donor offset(%u) and the first block of donor "
726 "extent(%u) should be equal", 702 "extent(%u) should be equal",
727 donor_off, 703 donor_off,
728 le32_to_cpu(tmp_dext.ee_block)); 704 le32_to_cpu(tmp_dext.ee_block));
729 err = -EIO; 705 *err = -EIO;
730 goto out; 706 goto out;
731 } 707 }
732 708
733 /* Set donor extent to orig extent */ 709 /* Set donor extent to orig extent */
734 err = mext_leaf_block(handle, orig_inode, 710 *err = mext_leaf_block(handle, orig_inode,
735 orig_path, &tmp_dext, &orig_off); 711 orig_path, &tmp_dext, &orig_off);
736 if (err < 0) 712 if (*err)
737 goto out; 713 goto out;
738 714
739 /* Set orig extent to donor extent */ 715 /* Set orig extent to donor extent */
740 err = mext_leaf_block(handle, donor_inode, 716 *err = mext_leaf_block(handle, donor_inode,
741 donor_path, &tmp_oext, &donor_off); 717 donor_path, &tmp_oext, &donor_off);
742 if (err < 0) 718 if (*err)
743 goto out; 719 goto out;
744 720
745 dext_alen = ext4_ext_get_actual_len(&tmp_dext); 721 dext_alen = ext4_ext_get_actual_len(&tmp_dext);
@@ -753,35 +729,25 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
753 729
754 if (orig_path) 730 if (orig_path)
755 ext4_ext_drop_refs(orig_path); 731 ext4_ext_drop_refs(orig_path);
756 err = get_ext_path(orig_inode, orig_off, &orig_path); 732 *err = get_ext_path(orig_inode, orig_off, &orig_path);
757 if (err) 733 if (*err)
758 goto out; 734 goto out;
759 depth = ext_depth(orig_inode); 735 depth = ext_depth(orig_inode);
760 oext = orig_path[depth].p_ext; 736 oext = orig_path[depth].p_ext;
761 if (le32_to_cpu(oext->ee_block) +
762 ext4_ext_get_actual_len(oext) <= orig_off) {
763 err = 0;
764 goto out;
765 }
766 tmp_oext = *oext; 737 tmp_oext = *oext;
767 738
768 if (donor_path) 739 if (donor_path)
769 ext4_ext_drop_refs(donor_path); 740 ext4_ext_drop_refs(donor_path);
770 err = get_ext_path(donor_inode, donor_off, &donor_path); 741 *err = get_ext_path(donor_inode, donor_off, &donor_path);
771 if (err) 742 if (*err)
772 goto out; 743 goto out;
773 depth = ext_depth(donor_inode); 744 depth = ext_depth(donor_inode);
774 dext = donor_path[depth].p_ext; 745 dext = donor_path[depth].p_ext;
775 if (le32_to_cpu(dext->ee_block) +
776 ext4_ext_get_actual_len(dext) <= donor_off) {
777 err = 0;
778 goto out;
779 }
780 tmp_dext = *dext; 746 tmp_dext = *dext;
781 747
782 err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 748 *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
783 donor_off, count - replaced_count); 749 donor_off, count - replaced_count);
784 if (err) 750 if (*err)
785 goto out; 751 goto out;
786 } 752 }
787 753
@@ -795,8 +761,12 @@ out:
795 kfree(donor_path); 761 kfree(donor_path);
796 } 762 }
797 763
798 mext_double_up_write(orig_inode, donor_inode); 764 ext4_ext_invalidate_cache(orig_inode);
799 return err; 765 ext4_ext_invalidate_cache(donor_inode);
766
767 double_up_write_data_sem(orig_inode, donor_inode);
768
769 return replaced_count;
800} 770}
801 771
802/** 772/**
@@ -808,16 +778,17 @@ out:
808 * @data_offset_in_page: block index where data swapping starts 778 * @data_offset_in_page: block index where data swapping starts
809 * @block_len_in_page: the number of blocks to be swapped 779 * @block_len_in_page: the number of blocks to be swapped
810 * @uninit: orig extent is uninitialized or not 780 * @uninit: orig extent is uninitialized or not
781 * @err: pointer to save return value
811 * 782 *
812 * Save the data in original inode blocks and replace original inode extents 783 * Save the data in original inode blocks and replace original inode extents
813 * with donor inode extents by calling mext_replace_branches(). 784 * with donor inode extents by calling mext_replace_branches().
814 * Finally, write out the saved data in new original inode blocks. Return 0 785 * Finally, write out the saved data in new original inode blocks. Return
815 * on success, or a negative error value on failure. 786 * replaced block count.
816 */ 787 */
817static int 788static int
818move_extent_per_page(struct file *o_filp, struct inode *donor_inode, 789move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
819 pgoff_t orig_page_offset, int data_offset_in_page, 790 pgoff_t orig_page_offset, int data_offset_in_page,
820 int block_len_in_page, int uninit) 791 int block_len_in_page, int uninit, int *err)
821{ 792{
822 struct inode *orig_inode = o_filp->f_dentry->d_inode; 793 struct inode *orig_inode = o_filp->f_dentry->d_inode;
823 struct address_space *mapping = orig_inode->i_mapping; 794 struct address_space *mapping = orig_inode->i_mapping;
@@ -829,9 +800,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
829 long long offs = orig_page_offset << PAGE_CACHE_SHIFT; 800 long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
830 unsigned long blocksize = orig_inode->i_sb->s_blocksize; 801 unsigned long blocksize = orig_inode->i_sb->s_blocksize;
831 unsigned int w_flags = 0; 802 unsigned int w_flags = 0;
832 unsigned int tmp_data_len, data_len; 803 unsigned int tmp_data_size, data_size, replaced_size;
833 void *fsdata; 804 void *fsdata;
834 int ret, i, jblocks; 805 int i, jblocks;
806 int err2 = 0;
807 int replaced_count = 0;
835 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 808 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
836 809
837 /* 810 /*
@@ -841,8 +814,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
841 jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; 814 jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
842 handle = ext4_journal_start(orig_inode, jblocks); 815 handle = ext4_journal_start(orig_inode, jblocks);
843 if (IS_ERR(handle)) { 816 if (IS_ERR(handle)) {
844 ret = PTR_ERR(handle); 817 *err = PTR_ERR(handle);
845 return ret; 818 return 0;
846 } 819 }
847 820
848 if (segment_eq(get_fs(), KERNEL_DS)) 821 if (segment_eq(get_fs(), KERNEL_DS))
@@ -858,39 +831,36 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
858 * Just swap data blocks between orig and donor. 831 * Just swap data blocks between orig and donor.
859 */ 832 */
860 if (uninit) { 833 if (uninit) {
861 ret = mext_replace_branches(handle, orig_inode, 834 replaced_count = mext_replace_branches(handle, orig_inode,
862 donor_inode, orig_blk_offset, 835 donor_inode, orig_blk_offset,
863 block_len_in_page); 836 block_len_in_page, err);
864
865 /* Clear the inode cache not to refer to the old data */
866 ext4_ext_invalidate_cache(orig_inode);
867 ext4_ext_invalidate_cache(donor_inode);
868 goto out2; 837 goto out2;
869 } 838 }
870 839
871 offs = (long long)orig_blk_offset << orig_inode->i_blkbits; 840 offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
872 841
873 /* Calculate data_len */ 842 /* Calculate data_size */
874 if ((orig_blk_offset + block_len_in_page - 1) == 843 if ((orig_blk_offset + block_len_in_page - 1) ==
875 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { 844 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
876 /* Replace the last block */ 845 /* Replace the last block */
877 tmp_data_len = orig_inode->i_size & (blocksize - 1); 846 tmp_data_size = orig_inode->i_size & (blocksize - 1);
878 /* 847 /*
879 * If data_len equals zero, data_len is a multiple of the 848 * If data_size equals zero, data_size is a multiple of the
880 * blocksize, so set an appropriate value. 849 * blocksize, so set an appropriate value.
881 */ 850 */
882 if (tmp_data_len == 0) 851 if (tmp_data_size == 0)
883 tmp_data_len = blocksize; 852 tmp_data_size = blocksize;
884 853
885 data_len = tmp_data_len + 854 data_size = tmp_data_size +
886 ((block_len_in_page - 1) << orig_inode->i_blkbits); 855 ((block_len_in_page - 1) << orig_inode->i_blkbits);
887 } else { 856 } else
888 data_len = block_len_in_page << orig_inode->i_blkbits; 857 data_size = block_len_in_page << orig_inode->i_blkbits;
889 } 858
859 replaced_size = data_size;
890 860
891 ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags, 861 *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
892 &page, &fsdata); 862 &page, &fsdata);
893 if (unlikely(ret < 0)) 863 if (unlikely(*err < 0))
894 goto out; 864 goto out;
895 865
896 if (!PageUptodate(page)) { 866 if (!PageUptodate(page)) {
@@ -911,14 +881,17 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
911 /* Release old bh and drop refs */ 881 /* Release old bh and drop refs */
912 try_to_release_page(page, 0); 882 try_to_release_page(page, 0);
913 883
914 ret = mext_replace_branches(handle, orig_inode, donor_inode, 884 replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
915 orig_blk_offset, block_len_in_page); 885 orig_blk_offset, block_len_in_page,
916 if (ret < 0) 886 &err2);
917 goto out; 887 if (err2) {
918 888 if (replaced_count) {
919 /* Clear the inode cache not to refer to the old data */ 889 block_len_in_page = replaced_count;
920 ext4_ext_invalidate_cache(orig_inode); 890 replaced_size =
921 ext4_ext_invalidate_cache(donor_inode); 891 block_len_in_page << orig_inode->i_blkbits;
892 } else
893 goto out;
894 }
922 895
923 if (!page_has_buffers(page)) 896 if (!page_has_buffers(page))
924 create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); 897 create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
@@ -928,16 +901,16 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
928 bh = bh->b_this_page; 901 bh = bh->b_this_page;
929 902
930 for (i = 0; i < block_len_in_page; i++) { 903 for (i = 0; i < block_len_in_page; i++) {
931 ret = ext4_get_block(orig_inode, 904 *err = ext4_get_block(orig_inode,
932 (sector_t)(orig_blk_offset + i), bh, 0); 905 (sector_t)(orig_blk_offset + i), bh, 0);
933 if (ret < 0) 906 if (*err < 0)
934 goto out; 907 goto out;
935 908
936 if (bh->b_this_page != NULL) 909 if (bh->b_this_page != NULL)
937 bh = bh->b_this_page; 910 bh = bh->b_this_page;
938 } 911 }
939 912
940 ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len, 913 *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
941 page, fsdata); 914 page, fsdata);
942 page = NULL; 915 page = NULL;
943 916
@@ -951,18 +924,20 @@ out:
951out2: 924out2:
952 ext4_journal_stop(handle); 925 ext4_journal_stop(handle);
953 926
954 return ret < 0 ? ret : 0; 927 if (err2)
928 *err = err2;
929
930 return replaced_count;
955} 931}
956 932
957/** 933/**
958 * mext_check_argumants - Check whether move extent can be done 934 * mext_check_arguments - Check whether move extent can be done
959 * 935 *
960 * @orig_inode: original inode 936 * @orig_inode: original inode
961 * @donor_inode: donor inode 937 * @donor_inode: donor inode
962 * @orig_start: logical start offset in block for orig 938 * @orig_start: logical start offset in block for orig
963 * @donor_start: logical start offset in block for donor 939 * @donor_start: logical start offset in block for donor
964 * @len: the number of blocks to be moved 940 * @len: the number of blocks to be moved
965 * @moved_len: moved block length
966 * 941 *
967 * Check the arguments of ext4_move_extents() to decide whether the files 942 * Check the arguments of ext4_move_extents() to decide whether the files
968 * can be exchanged with each other. 943 * can be exchanged with each other.
@@ -970,18 +945,17 @@ out2:
970 */ 945 */
971static int 946static int
972mext_check_arguments(struct inode *orig_inode, 947mext_check_arguments(struct inode *orig_inode,
973 struct inode *donor_inode, __u64 orig_start, 948 struct inode *donor_inode, __u64 orig_start,
974 __u64 donor_start, __u64 *len, __u64 moved_len) 949 __u64 donor_start, __u64 *len)
975{ 950{
976 ext4_lblk_t orig_blocks, donor_blocks; 951 ext4_lblk_t orig_blocks, donor_blocks;
977 unsigned int blkbits = orig_inode->i_blkbits; 952 unsigned int blkbits = orig_inode->i_blkbits;
978 unsigned int blocksize = 1 << blkbits; 953 unsigned int blocksize = 1 << blkbits;
979 954
980 /* Regular file check */ 955 if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
981 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { 956 ext4_debug("ext4 move extent: suid or sgid is set"
982 ext4_debug("ext4 move extent: The argument files should be " 957 " to donor file [ino:orig %lu, donor %lu]\n",
983 "regular file [ino:orig %lu, donor %lu]\n", 958 orig_inode->i_ino, donor_inode->i_ino);
984 orig_inode->i_ino, donor_inode->i_ino);
985 return -EINVAL; 959 return -EINVAL;
986 } 960 }
987 961
@@ -1025,13 +999,6 @@ mext_check_arguments(struct inode *orig_inode,
1025 return -EINVAL; 999 return -EINVAL;
1026 } 1000 }
1027 1001
1028 if (moved_len) {
1029 ext4_debug("ext4 move extent: moved_len should be 0 "
1030 "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
1031 donor_inode->i_ino);
1032 return -EINVAL;
1033 }
1034
1035 if ((orig_start > EXT_MAX_BLOCK) || 1002 if ((orig_start > EXT_MAX_BLOCK) ||
1036 (donor_start > EXT_MAX_BLOCK) || 1003 (donor_start > EXT_MAX_BLOCK) ||
1037 (*len > EXT_MAX_BLOCK) || 1004 (*len > EXT_MAX_BLOCK) ||
@@ -1088,7 +1055,7 @@ mext_check_arguments(struct inode *orig_inode,
1088 } 1055 }
1089 1056
1090 if (!*len) { 1057 if (!*len) {
1091 ext4_debug("ext4 move extent: len shoudld not be 0 " 1058 ext4_debug("ext4 move extent: len should not be 0 "
1092 "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, 1059 "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
1093 donor_inode->i_ino); 1060 donor_inode->i_ino);
1094 return -EINVAL; 1061 return -EINVAL;
@@ -1232,16 +1199,24 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1232 return -EINVAL; 1199 return -EINVAL;
1233 } 1200 }
1234 1201
1235 /* protect orig and donor against a truncate */ 1202 /* Regular file check */
1203 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
1204 ext4_debug("ext4 move extent: The argument files should be "
1205 "regular file [ino:orig %lu, donor %lu]\n",
1206 orig_inode->i_ino, donor_inode->i_ino);
1207 return -EINVAL;
1208 }
1209
1210 /* Protect orig and donor inodes against a truncate */
1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1211 ret1 = mext_inode_double_lock(orig_inode, donor_inode);
1237 if (ret1 < 0) 1212 if (ret1 < 0)
1238 return ret1; 1213 return ret1;
1239 1214
1240 mext_double_down_read(orig_inode, donor_inode); 1215 /* Protect extent tree against block allocations via delalloc */
1216 double_down_write_data_sem(orig_inode, donor_inode);
1241 /* Check the filesystem environment whether move_extent can be done */ 1217 /* Check the filesystem environment whether move_extent can be done */
1242 ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, 1218 ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
1243 donor_start, &len, *moved_len); 1219 donor_start, &len);
1244 mext_double_up_read(orig_inode, donor_inode);
1245 if (ret1) 1220 if (ret1)
1246 goto out; 1221 goto out;
1247 1222
@@ -1355,36 +1330,39 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1355 seq_start = le32_to_cpu(ext_cur->ee_block); 1330 seq_start = le32_to_cpu(ext_cur->ee_block);
1356 rest_blocks = seq_blocks; 1331 rest_blocks = seq_blocks;
1357 1332
1358 /* Discard preallocations of two inodes */ 1333 /*
1359 down_write(&EXT4_I(orig_inode)->i_data_sem); 1334 * Up the semaphore to avoid the following problems:
1360 ext4_discard_preallocations(orig_inode); 1335 * a. transaction deadlock among ext4_journal_start,
1361 up_write(&EXT4_I(orig_inode)->i_data_sem); 1336 * ->write_begin via pagefault, and jbd2_journal_commit
1362 1337 * b. racing with ->readpage, ->write_begin, and ext4_get_block
1363 down_write(&EXT4_I(donor_inode)->i_data_sem); 1338 * in move_extent_per_page
1364 ext4_discard_preallocations(donor_inode); 1339 */
1365 up_write(&EXT4_I(donor_inode)->i_data_sem); 1340 double_up_write_data_sem(orig_inode, donor_inode);
1366 1341
1367 while (orig_page_offset <= seq_end_page) { 1342 while (orig_page_offset <= seq_end_page) {
1368 1343
1369 /* Swap original branches with new branches */ 1344 /* Swap original branches with new branches */
1370 ret1 = move_extent_per_page(o_filp, donor_inode, 1345 block_len_in_page = move_extent_per_page(
1346 o_filp, donor_inode,
1371 orig_page_offset, 1347 orig_page_offset,
1372 data_offset_in_page, 1348 data_offset_in_page,
1373 block_len_in_page, uninit); 1349 block_len_in_page, uninit,
1374 if (ret1 < 0) 1350 &ret1);
1375 goto out; 1351
1376 orig_page_offset++;
1377 /* Count how many blocks we have exchanged */ 1352 /* Count how many blocks we have exchanged */
1378 *moved_len += block_len_in_page; 1353 *moved_len += block_len_in_page;
1354 if (ret1 < 0)
1355 break;
1379 if (*moved_len > len) { 1356 if (*moved_len > len) {
1380 ext4_error(orig_inode->i_sb, __func__, 1357 ext4_error(orig_inode->i_sb,
1381 "We replaced blocks too much! " 1358 "We replaced blocks too much! "
1382 "sum of replaced: %llu requested: %llu", 1359 "sum of replaced: %llu requested: %llu",
1383 *moved_len, len); 1360 *moved_len, len);
1384 ret1 = -EIO; 1361 ret1 = -EIO;
1385 goto out; 1362 break;
1386 } 1363 }
1387 1364
1365 orig_page_offset++;
1388 data_offset_in_page = 0; 1366 data_offset_in_page = 0;
1389 rest_blocks -= block_len_in_page; 1367 rest_blocks -= block_len_in_page;
1390 if (rest_blocks > blocks_per_page) 1368 if (rest_blocks > blocks_per_page)
@@ -1393,6 +1371,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1393 block_len_in_page = rest_blocks; 1371 block_len_in_page = rest_blocks;
1394 } 1372 }
1395 1373
1374 double_down_write_data_sem(orig_inode, donor_inode);
1375 if (ret1 < 0)
1376 break;
1377
1396 /* Decrease buffer counter */ 1378 /* Decrease buffer counter */
1397 if (holecheck_path) 1379 if (holecheck_path)
1398 ext4_ext_drop_refs(holecheck_path); 1380 ext4_ext_drop_refs(holecheck_path);
@@ -1414,6 +1396,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1414 1396
1415 } 1397 }
1416out: 1398out:
1399 if (*moved_len) {
1400 ext4_discard_preallocations(orig_inode);
1401 ext4_discard_preallocations(donor_inode);
1402 }
1403
1417 if (orig_path) { 1404 if (orig_path) {
1418 ext4_ext_drop_refs(orig_path); 1405 ext4_ext_drop_refs(orig_path);
1419 kfree(orig_path); 1406 kfree(orig_path);
@@ -1422,7 +1409,7 @@ out:
1422 ext4_ext_drop_refs(holecheck_path); 1409 ext4_ext_drop_refs(holecheck_path);
1423 kfree(holecheck_path); 1410 kfree(holecheck_path);
1424 } 1411 }
1425 1412 double_up_write_data_sem(orig_inode, donor_inode);
1426 ret2 = mext_inode_double_unlock(orig_inode, donor_inode); 1413 ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
1427 1414
1428 if (ret1) 1415 if (ret1)
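After this patch move_extent.c follows one convention throughout: mext_replace_branches() and move_extent_per_page() return the number of blocks actually replaced and report failures through an int * out-parameter, so the main loop can account partial progress before deciding to stop. A condensed sketch of the loop body (err stands in for the patch's ret1):

        int err = 0;
        int replaced = move_extent_per_page(o_filp, donor_inode,
                                            orig_page_offset,
                                            data_offset_in_page,
                                            block_len_in_page,
                                            uninit, &err);
        *moved_len += replaced;   /* bank partial progress first ...  */
        if (err < 0)
                break;            /* ... then stop, keeping the count */

This is also why the preallocation discards moved under the out: label, gated on *moved_len: the preallocations are only stale once at least one block has actually changed hands.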
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6d2c1b897fc7..0c070fabd108 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -383,8 +383,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
383 if (root->info.hash_version != DX_HASH_TEA && 383 if (root->info.hash_version != DX_HASH_TEA &&
384 root->info.hash_version != DX_HASH_HALF_MD4 && 384 root->info.hash_version != DX_HASH_HALF_MD4 &&
385 root->info.hash_version != DX_HASH_LEGACY) { 385 root->info.hash_version != DX_HASH_LEGACY) {
386 ext4_warning(dir->i_sb, __func__, 386 ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
387 "Unrecognised inode hash code %d",
388 root->info.hash_version); 387 root->info.hash_version);
389 brelse(bh); 388 brelse(bh);
390 *err = ERR_BAD_DX_DIR; 389 *err = ERR_BAD_DX_DIR;
@@ -399,8 +398,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
399 hash = hinfo->hash; 398 hash = hinfo->hash;
400 399
401 if (root->info.unused_flags & 1) { 400 if (root->info.unused_flags & 1) {
402 ext4_warning(dir->i_sb, __func__, 401 ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
403 "Unimplemented inode hash flags: %#06x",
404 root->info.unused_flags); 402 root->info.unused_flags);
405 brelse(bh); 403 brelse(bh);
406 *err = ERR_BAD_DX_DIR; 404 *err = ERR_BAD_DX_DIR;
@@ -408,8 +406,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
408 } 406 }
409 407
410 if ((indirect = root->info.indirect_levels) > 1) { 408 if ((indirect = root->info.indirect_levels) > 1) {
411 ext4_warning(dir->i_sb, __func__, 409 ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
412 "Unimplemented inode hash depth: %#06x",
413 root->info.indirect_levels); 410 root->info.indirect_levels);
414 brelse(bh); 411 brelse(bh);
415 *err = ERR_BAD_DX_DIR; 412 *err = ERR_BAD_DX_DIR;
@@ -421,8 +418,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
421 418
422 if (dx_get_limit(entries) != dx_root_limit(dir, 419 if (dx_get_limit(entries) != dx_root_limit(dir,
423 root->info.info_length)) { 420 root->info.info_length)) {
424 ext4_warning(dir->i_sb, __func__, 421 ext4_warning(dir->i_sb, "dx entry: limit != root limit");
425 "dx entry: limit != root limit");
426 brelse(bh); 422 brelse(bh);
427 *err = ERR_BAD_DX_DIR; 423 *err = ERR_BAD_DX_DIR;
428 goto fail; 424 goto fail;
@@ -433,7 +429,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
433 { 429 {
434 count = dx_get_count(entries); 430 count = dx_get_count(entries);
435 if (!count || count > dx_get_limit(entries)) { 431 if (!count || count > dx_get_limit(entries)) {
436 ext4_warning(dir->i_sb, __func__, 432 ext4_warning(dir->i_sb,
437 "dx entry: no count or count > limit"); 433 "dx entry: no count or count > limit");
438 brelse(bh); 434 brelse(bh);
439 *err = ERR_BAD_DX_DIR; 435 *err = ERR_BAD_DX_DIR;
@@ -478,7 +474,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
478 goto fail2; 474 goto fail2;
479 at = entries = ((struct dx_node *) bh->b_data)->entries; 475 at = entries = ((struct dx_node *) bh->b_data)->entries;
480 if (dx_get_limit(entries) != dx_node_limit (dir)) { 476 if (dx_get_limit(entries) != dx_node_limit (dir)) {
481 ext4_warning(dir->i_sb, __func__, 477 ext4_warning(dir->i_sb,
482 "dx entry: limit != node limit"); 478 "dx entry: limit != node limit");
483 brelse(bh); 479 brelse(bh);
484 *err = ERR_BAD_DX_DIR; 480 *err = ERR_BAD_DX_DIR;
@@ -494,7 +490,7 @@ fail2:
494 } 490 }
495fail: 491fail:
496 if (*err == ERR_BAD_DX_DIR) 492 if (*err == ERR_BAD_DX_DIR)
497 ext4_warning(dir->i_sb, __func__, 493 ext4_warning(dir->i_sb,
498 "Corrupt dir inode %ld, running e2fsck is " 494 "Corrupt dir inode %ld, running e2fsck is "
499 "recommended.", dir->i_ino); 495 "recommended.", dir->i_ino);
500 return NULL; 496 return NULL;
@@ -947,9 +943,8 @@ restart:
947 wait_on_buffer(bh); 943 wait_on_buffer(bh);
948 if (!buffer_uptodate(bh)) { 944 if (!buffer_uptodate(bh)) {
949 /* read error, skip block & hope for the best */ 945 /* read error, skip block & hope for the best */
950 ext4_error(sb, __func__, "reading directory #%lu " 946 ext4_error(sb, "reading directory #%lu offset %lu",
951 "offset %lu", dir->i_ino, 947 dir->i_ino, (unsigned long)block);
952 (unsigned long)block);
953 brelse(bh); 948 brelse(bh);
954 goto next; 949 goto next;
955 } 950 }
@@ -1041,7 +1036,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
1041 retval = ext4_htree_next_block(dir, hash, frame, 1036 retval = ext4_htree_next_block(dir, hash, frame,
1042 frames, NULL); 1037 frames, NULL);
1043 if (retval < 0) { 1038 if (retval < 0) {
1044 ext4_warning(sb, __func__, 1039 ext4_warning(sb,
1045 "error reading index page in directory #%lu", 1040 "error reading index page in directory #%lu",
1046 dir->i_ino); 1041 dir->i_ino);
1047 *err = retval; 1042 *err = retval;
@@ -1071,14 +1066,13 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1071 __u32 ino = le32_to_cpu(de->inode); 1066 __u32 ino = le32_to_cpu(de->inode);
1072 brelse(bh); 1067 brelse(bh);
1073 if (!ext4_valid_inum(dir->i_sb, ino)) { 1068 if (!ext4_valid_inum(dir->i_sb, ino)) {
1074 ext4_error(dir->i_sb, "ext4_lookup", 1069 ext4_error(dir->i_sb, "bad inode number: %u", ino);
1075 "bad inode number: %u", ino);
1076 return ERR_PTR(-EIO); 1070 return ERR_PTR(-EIO);
1077 } 1071 }
1078 inode = ext4_iget(dir->i_sb, ino); 1072 inode = ext4_iget(dir->i_sb, ino);
1079 if (unlikely(IS_ERR(inode))) { 1073 if (unlikely(IS_ERR(inode))) {
1080 if (PTR_ERR(inode) == -ESTALE) { 1074 if (PTR_ERR(inode) == -ESTALE) {
1081 ext4_error(dir->i_sb, __func__, 1075 ext4_error(dir->i_sb,
1082 "deleted inode referenced: %u", 1076 "deleted inode referenced: %u",
1083 ino); 1077 ino);
1084 return ERR_PTR(-EIO); 1078 return ERR_PTR(-EIO);
@@ -1110,7 +1104,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
1110 brelse(bh); 1104 brelse(bh);
1111 1105
1112 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { 1106 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1113 ext4_error(child->d_inode->i_sb, "ext4_get_parent", 1107 ext4_error(child->d_inode->i_sb,
1114 "bad inode number: %u", ino); 1108 "bad inode number: %u", ino);
1115 return ERR_PTR(-EIO); 1109 return ERR_PTR(-EIO);
1116 } 1110 }
@@ -1292,9 +1286,6 @@ errout:
1292 * add_dirent_to_buf will attempt search the directory block for 1286 * add_dirent_to_buf will attempt search the directory block for
1293 * space. It will return -ENOSPC if no space is available, and -EIO 1287 * space. It will return -ENOSPC if no space is available, and -EIO
1294 * and -EEXIST if directory entry already exists. 1288 * and -EEXIST if directory entry already exists.
1295 *
1296 * NOTE! bh is NOT released in the case where ENOSPC is returned. In
1297 * all other cases bh is released.
1298 */ 1289 */
1299static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, 1290static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1300 struct inode *inode, struct ext4_dir_entry_2 *de, 1291 struct inode *inode, struct ext4_dir_entry_2 *de,
@@ -1315,14 +1306,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1315 top = bh->b_data + blocksize - reclen; 1306 top = bh->b_data + blocksize - reclen;
1316 while ((char *) de <= top) { 1307 while ((char *) de <= top) {
1317 if (!ext4_check_dir_entry("ext4_add_entry", dir, de, 1308 if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
1318 bh, offset)) { 1309 bh, offset))
1319 brelse(bh);
1320 return -EIO; 1310 return -EIO;
1321 } 1311 if (ext4_match(namelen, name, de))
1322 if (ext4_match(namelen, name, de)) {
1323 brelse(bh);
1324 return -EEXIST; 1312 return -EEXIST;
1325 }
1326 nlen = EXT4_DIR_REC_LEN(de->name_len); 1313 nlen = EXT4_DIR_REC_LEN(de->name_len);
1327 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); 1314 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
1328 if ((de->inode? rlen - nlen: rlen) >= reclen) 1315 if ((de->inode? rlen - nlen: rlen) >= reclen)
@@ -1337,7 +1324,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1337 err = ext4_journal_get_write_access(handle, bh); 1324 err = ext4_journal_get_write_access(handle, bh);
1338 if (err) { 1325 if (err) {
1339 ext4_std_error(dir->i_sb, err); 1326 ext4_std_error(dir->i_sb, err);
1340 brelse(bh);
1341 return err; 1327 return err;
1342 } 1328 }
1343 1329
@@ -1377,7 +1363,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1377 err = ext4_handle_dirty_metadata(handle, dir, bh); 1363 err = ext4_handle_dirty_metadata(handle, dir, bh);
1378 if (err) 1364 if (err)
1379 ext4_std_error(dir->i_sb, err); 1365 ext4_std_error(dir->i_sb, err);
1380 brelse(bh);
1381 return 0; 1366 return 0;
1382} 1367}
1383 1368
@@ -1419,7 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1419 de = (struct ext4_dir_entry_2 *)((char *)fde + 1404 de = (struct ext4_dir_entry_2 *)((char *)fde +
1420 ext4_rec_len_from_disk(fde->rec_len, blocksize)); 1405 ext4_rec_len_from_disk(fde->rec_len, blocksize));
1421 if ((char *) de >= (((char *) root) + blocksize)) { 1406 if ((char *) de >= (((char *) root) + blocksize)) {
1422 ext4_error(dir->i_sb, __func__, 1407 ext4_error(dir->i_sb,
1423 "invalid rec_len for '..' in inode %lu", 1408 "invalid rec_len for '..' in inode %lu",
1424 dir->i_ino); 1409 dir->i_ino);
1425 brelse(bh); 1410 brelse(bh);
@@ -1471,7 +1456,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1471 if (!(de)) 1456 if (!(de))
1472 return retval; 1457 return retval;
1473 1458
1474 return add_dirent_to_buf(handle, dentry, inode, de, bh); 1459 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1460 brelse(bh);
1461 return retval;
1475} 1462}
1476 1463
1477/* 1464/*
@@ -1514,8 +1501,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1514 if(!bh) 1501 if(!bh)
1515 return retval; 1502 return retval;
1516 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); 1503 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1517 if (retval != -ENOSPC) 1504 if (retval != -ENOSPC) {
1505 brelse(bh);
1518 return retval; 1506 return retval;
1507 }
1519 1508
1520 if (blocks == 1 && !dx_fallback && 1509 if (blocks == 1 && !dx_fallback &&
1521 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) 1510 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
@@ -1528,7 +1517,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1528 de = (struct ext4_dir_entry_2 *) bh->b_data; 1517 de = (struct ext4_dir_entry_2 *) bh->b_data;
1529 de->inode = 0; 1518 de->inode = 0;
1530 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1519 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1531 return add_dirent_to_buf(handle, dentry, inode, de, bh); 1520 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1521 brelse(bh);
1522 return retval;
1532} 1523}
1533 1524
1534/* 1525/*
@@ -1561,10 +1552,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1561 goto journal_error; 1552 goto journal_error;
1562 1553
1563 err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); 1554 err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1564 if (err != -ENOSPC) { 1555 if (err != -ENOSPC)
1565 bh = NULL;
1566 goto cleanup; 1556 goto cleanup;
1567 }
1568 1557
1569 /* Block full, should compress but for now just split */ 1558 /* Block full, should compress but for now just split */
1570 dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", 1559 dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
@@ -1580,8 +1569,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1580 1569
1581 if (levels && (dx_get_count(frames->entries) == 1570 if (levels && (dx_get_count(frames->entries) ==
1582 dx_get_limit(frames->entries))) { 1571 dx_get_limit(frames->entries))) {
1583 ext4_warning(sb, __func__, 1572 ext4_warning(sb, "Directory index full!");
1584 "Directory index full!");
1585 err = -ENOSPC; 1573 err = -ENOSPC;
1586 goto cleanup; 1574 goto cleanup;
1587 } 1575 }
@@ -1657,7 +1645,6 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1657 if (!de) 1645 if (!de)
1658 goto cleanup; 1646 goto cleanup;
1659 err = add_dirent_to_buf(handle, dentry, inode, de, bh); 1647 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1660 bh = NULL;
1661 goto cleanup; 1648 goto cleanup;
1662 1649
1663journal_error: 1650journal_error:
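
The add_dirent_to_buf() hunks above change buffer-head ownership: the function no longer calls brelse() on any path, success or failure, so each caller pairs exactly one bread with exactly one brelse(), as the reworked call sites in make_indexed_dir(), ext4_add_entry() and ext4_dx_add_entry() show. A sketch of that borrow-only convention, with get_buf/put_buf/try_add as hypothetical stand-ins for sb_bread()/brelse()/add_dirent_to_buf():

/*
 * The helper borrows the buffer and never releases it, so the
 * caller can pair one get with one put regardless of the result.
 */
#include <stdlib.h>
#include <stdio.h>

struct buf { char data[64]; };

static struct buf *get_buf(void) { return malloc(sizeof(struct buf)); }
static void put_buf(struct buf *b) { free(b); }

/* Borrows b; reports success or failure, but never frees it. */
static int try_add(struct buf *b, int key)
{
        if (!b)
                return -12;             /* -ENOMEM */
        return (key % 2) ? -28 : 0;     /* -ENOSPC on odd keys */
}

int main(void)
{
        struct buf *b = get_buf();
        int ret = try_add(b, 7);

        put_buf(b);             /* exactly one release, on every path */
        printf("ret=%d\n", ret);
        return 0;
}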
@@ -1772,10 +1759,12 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
1772 struct inode *inode; 1759 struct inode *inode;
1773 int err, retries = 0; 1760 int err, retries = 0;
1774 1761
1762 dquot_initialize(dir);
1763
1775retry: 1764retry:
1776 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1765 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1777 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1766 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1778 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); 1767 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1779 if (IS_ERR(handle)) 1768 if (IS_ERR(handle))
1780 return PTR_ERR(handle); 1769 return PTR_ERR(handle);
1781 1770
@@ -1806,10 +1795,12 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
1806 if (!new_valid_dev(rdev)) 1795 if (!new_valid_dev(rdev))
1807 return -EINVAL; 1796 return -EINVAL;
1808 1797
1798 dquot_initialize(dir);
1799
1809retry: 1800retry:
1810 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1801 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1811 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1802 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1812 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); 1803 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1813 if (IS_ERR(handle)) 1804 if (IS_ERR(handle))
1814 return PTR_ERR(handle); 1805 return PTR_ERR(handle);
1815 1806
@@ -1843,10 +1834,12 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1843 if (EXT4_DIR_LINK_MAX(dir)) 1834 if (EXT4_DIR_LINK_MAX(dir))
1844 return -EMLINK; 1835 return -EMLINK;
1845 1836
1837 dquot_initialize(dir);
1838
1846retry: 1839retry:
1847 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1840 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1848 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1841 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1849 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); 1842 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1850 if (IS_ERR(handle)) 1843 if (IS_ERR(handle))
1851 return PTR_ERR(handle); 1844 return PTR_ERR(handle);
1852 1845
@@ -1922,11 +1915,11 @@ static int empty_dir(struct inode *inode)
1922 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || 1915 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
1923 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { 1916 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
1924 if (err) 1917 if (err)
1925 ext4_error(inode->i_sb, __func__, 1918 ext4_error(inode->i_sb,
1926 "error %d reading directory #%lu offset 0", 1919 "error %d reading directory #%lu offset 0",
1927 err, inode->i_ino); 1920 err, inode->i_ino);
1928 else 1921 else
1929 ext4_warning(inode->i_sb, __func__, 1922 ext4_warning(inode->i_sb,
1930 "bad directory (dir #%lu) - no data block", 1923 "bad directory (dir #%lu) - no data block",
1931 inode->i_ino); 1924 inode->i_ino);
1932 return 1; 1925 return 1;
@@ -1937,7 +1930,7 @@ static int empty_dir(struct inode *inode)
1937 !le32_to_cpu(de1->inode) || 1930 !le32_to_cpu(de1->inode) ||
1938 strcmp(".", de->name) || 1931 strcmp(".", de->name) ||
1939 strcmp("..", de1->name)) { 1932 strcmp("..", de1->name)) {
1940 ext4_warning(inode->i_sb, "empty_dir", 1933 ext4_warning(inode->i_sb,
1941 "bad directory (dir #%lu) - no `.' or `..'", 1934 "bad directory (dir #%lu) - no `.' or `..'",
1942 inode->i_ino); 1935 inode->i_ino);
1943 brelse(bh); 1936 brelse(bh);
@@ -1955,7 +1948,7 @@ static int empty_dir(struct inode *inode)
1955 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); 1948 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
1956 if (!bh) { 1949 if (!bh) {
1957 if (err) 1950 if (err)
1958 ext4_error(sb, __func__, 1951 ext4_error(sb,
1959 "error %d reading directory" 1952 "error %d reading directory"
1960 " #%lu offset %u", 1953 " #%lu offset %u",
1961 err, inode->i_ino, offset); 1954 err, inode->i_ino, offset);
@@ -2026,11 +2019,18 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
2026 err = ext4_reserve_inode_write(handle, inode, &iloc); 2019 err = ext4_reserve_inode_write(handle, inode, &iloc);
2027 if (err) 2020 if (err)
2028 goto out_unlock; 2021 goto out_unlock;
2022 /*
2023 * Due to previous errors inode may be already a part of on-disk
2024 * orphan list. If so skip on-disk list modification.
2025 */
2026 if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
2027 (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
2028 goto mem_insert;
2029 2029
2030 /* Insert this inode at the head of the on-disk orphan list... */ 2030 /* Insert this inode at the head of the on-disk orphan list... */
2031 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); 2031 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
2032 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); 2032 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
2033 err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh); 2033 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
2034 rc = ext4_mark_iloc_dirty(handle, inode, &iloc); 2034 rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
2035 if (!err) 2035 if (!err)
2036 err = rc; 2036 err = rc;
@@ -2043,6 +2043,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
2043 * 2043 *
2044 * This is safe: on error we're going to ignore the orphan list 2044 * This is safe: on error we're going to ignore the orphan list
2045 * anyway on the next recovery. */ 2045 * anyway on the next recovery. */
2046mem_insert:
2046 if (!err) 2047 if (!err)
2047 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); 2048 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
2048 2049
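
The guard added to ext4_orphan_add() above makes the on-disk insertion idempotent: when NEXT_ORPHAN() already holds a plausible inode number, the inode is taken to already sit on the on-disk orphan list from an earlier failed attempt, and only the in-memory list is fixed up via the new mem_insert: label. A simplified standalone sketch of that guard, with all names hypothetical and the validity test reduced to the same bounds check the hunk uses:

/*
 * Skip the on-disk link update when the node already carries a
 * plausible "next" pointer, but always fix up the in-memory state.
 */
#include <stdio.h>

#define MAX_INO 1000

static unsigned head = 7;       /* like s_last_orphan; inode 7 is
                                 * already an on-disk orphan */

struct node { unsigned ino, next_orphan; int on_mem_list; };

static void orphan_add(struct node *n)
{
        /* Already linked on disk by a previous, partially failed add? */
        if (n->next_orphan && n->next_orphan <= MAX_INO)
                goto mem_insert;

        n->next_orphan = head;          /* splice at the list head */
        head = n->ino;
mem_insert:
        n->on_mem_list = 1;             /* in-memory list always updated */
}

int main(void)
{
        struct node a = { .ino = 12 };

        orphan_add(&a);
        orphan_add(&a);         /* second call must not re-link */
        printf("head=%u next=%u mem=%d\n", head, a.next_orphan,
               a.on_mem_list);
        return 0;
}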
@@ -2102,7 +2103,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2102 if (err) 2103 if (err)
2103 goto out_brelse; 2104 goto out_brelse;
2104 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); 2105 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
2105 err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh); 2106 err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
2106 } else { 2107 } else {
2107 struct ext4_iloc iloc2; 2108 struct ext4_iloc iloc2;
2108 struct inode *i_prev = 2109 struct inode *i_prev =
@@ -2142,7 +2143,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2142 2143
2143 /* Initialize quotas before so that eventual writes go in 2144 /* Initialize quotas before so that eventual writes go in
2144 * separate transaction */ 2145 * separate transaction */
2145 vfs_dq_init(dentry->d_inode); 2146 dquot_initialize(dir);
2147 dquot_initialize(dentry->d_inode);
2148
2146 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); 2149 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2147 if (IS_ERR(handle)) 2150 if (IS_ERR(handle))
2148 return PTR_ERR(handle); 2151 return PTR_ERR(handle);
@@ -2169,7 +2172,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2169 if (retval) 2172 if (retval)
2170 goto end_rmdir; 2173 goto end_rmdir;
2171 if (!EXT4_DIR_LINK_EMPTY(inode)) 2174 if (!EXT4_DIR_LINK_EMPTY(inode))
2172 ext4_warning(inode->i_sb, "ext4_rmdir", 2175 ext4_warning(inode->i_sb,
2173 "empty directory has too many links (%d)", 2176 "empty directory has too many links (%d)",
2174 inode->i_nlink); 2177 inode->i_nlink);
2175 inode->i_version++; 2178 inode->i_version++;
@@ -2201,7 +2204,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2201 2204
2202 /* Initialize quotas before so that eventual writes go 2205 /* Initialize quotas before so that eventual writes go
2203 * in separate transaction */ 2206 * in separate transaction */
2204 vfs_dq_init(dentry->d_inode); 2207 dquot_initialize(dir);
2208 dquot_initialize(dentry->d_inode);
2209
2205 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); 2210 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2206 if (IS_ERR(handle)) 2211 if (IS_ERR(handle))
2207 return PTR_ERR(handle); 2212 return PTR_ERR(handle);
@@ -2221,7 +2226,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2221 goto end_unlink; 2226 goto end_unlink;
2222 2227
2223 if (!inode->i_nlink) { 2228 if (!inode->i_nlink) {
2224 ext4_warning(inode->i_sb, "ext4_unlink", 2229 ext4_warning(inode->i_sb,
2225 "Deleting nonexistent file (%lu), %d", 2230 "Deleting nonexistent file (%lu), %d",
2226 inode->i_ino, inode->i_nlink); 2231 inode->i_ino, inode->i_nlink);
2227 inode->i_nlink = 1; 2232 inode->i_nlink = 1;
@@ -2256,10 +2261,12 @@ static int ext4_symlink(struct inode *dir,
2256 if (l > dir->i_sb->s_blocksize) 2261 if (l > dir->i_sb->s_blocksize)
2257 return -ENAMETOOLONG; 2262 return -ENAMETOOLONG;
2258 2263
2264 dquot_initialize(dir);
2265
2259retry: 2266retry:
2260 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2267 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2261 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + 2268 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2262 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); 2269 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2263 if (IS_ERR(handle)) 2270 if (IS_ERR(handle))
2264 return PTR_ERR(handle); 2271 return PTR_ERR(handle);
2265 2272
@@ -2314,6 +2321,8 @@ static int ext4_link(struct dentry *old_dentry,
2314 if (inode->i_nlink >= EXT4_LINK_MAX) 2321 if (inode->i_nlink >= EXT4_LINK_MAX)
2315 return -EMLINK; 2322 return -EMLINK;
2316 2323
2324 dquot_initialize(dir);
2325
2317 /* 2326 /*
2318 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing 2327 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2319 * otherwise has the potential to corrupt the orphan inode list. 2328 * otherwise has the potential to corrupt the orphan inode list.
@@ -2364,12 +2373,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2364 struct ext4_dir_entry_2 *old_de, *new_de; 2373 struct ext4_dir_entry_2 *old_de, *new_de;
2365 int retval, force_da_alloc = 0; 2374 int retval, force_da_alloc = 0;
2366 2375
2376 dquot_initialize(old_dir);
2377 dquot_initialize(new_dir);
2378
2367 old_bh = new_bh = dir_bh = NULL; 2379 old_bh = new_bh = dir_bh = NULL;
2368 2380
2369 /* Initialize quotas before so that eventual writes go 2381 /* Initialize quotas before so that eventual writes go
2370 * in separate transaction */ 2382 * in separate transaction */
2371 if (new_dentry->d_inode) 2383 if (new_dentry->d_inode)
2372 vfs_dq_init(new_dentry->d_inode); 2384 dquot_initialize(new_dentry->d_inode);
2373 handle = ext4_journal_start(old_dir, 2 * 2385 handle = ext4_journal_start(old_dir, 2 *
2374 EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + 2386 EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2375 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); 2387 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
@@ -2468,7 +2480,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2468 } 2480 }
2469 } 2481 }
2470 if (retval) { 2482 if (retval) {
2471 ext4_warning(old_dir->i_sb, "ext4_rename", 2483 ext4_warning(old_dir->i_sb,
2472 "Deleting old file (%lu), %d, error=%d", 2484 "Deleting old file (%lu), %d, error=%d",
2473 old_dir->i_ino, old_dir->i_nlink, retval); 2485 old_dir->i_ino, old_dir->i_nlink, retval);
2474 } 2486 }
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3cfc343c41b5..5692c48754a0 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -48,65 +48,54 @@ static int verify_group_input(struct super_block *sb,
48 48
49 ext4_get_group_no_and_offset(sb, start, NULL, &offset); 49 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
50 if (group != sbi->s_groups_count) 50 if (group != sbi->s_groups_count)
51 ext4_warning(sb, __func__, 51 ext4_warning(sb, "Cannot add at group %u (only %u groups)",
52 "Cannot add at group %u (only %u groups)",
53 input->group, sbi->s_groups_count); 52 input->group, sbi->s_groups_count);
54 else if (offset != 0) 53 else if (offset != 0)
55 ext4_warning(sb, __func__, "Last group not full"); 54 ext4_warning(sb, "Last group not full");
56 else if (input->reserved_blocks > input->blocks_count / 5) 55 else if (input->reserved_blocks > input->blocks_count / 5)
57 ext4_warning(sb, __func__, "Reserved blocks too high (%u)", 56 ext4_warning(sb, "Reserved blocks too high (%u)",
58 input->reserved_blocks); 57 input->reserved_blocks);
59 else if (free_blocks_count < 0) 58 else if (free_blocks_count < 0)
60 ext4_warning(sb, __func__, "Bad blocks count %u", 59 ext4_warning(sb, "Bad blocks count %u",
61 input->blocks_count); 60 input->blocks_count);
62 else if (!(bh = sb_bread(sb, end - 1))) 61 else if (!(bh = sb_bread(sb, end - 1)))
63 ext4_warning(sb, __func__, 62 ext4_warning(sb, "Cannot read last block (%llu)",
64 "Cannot read last block (%llu)",
65 end - 1); 63 end - 1);
66 else if (outside(input->block_bitmap, start, end)) 64 else if (outside(input->block_bitmap, start, end))
67 ext4_warning(sb, __func__, 65 ext4_warning(sb, "Block bitmap not in group (block %llu)",
68 "Block bitmap not in group (block %llu)",
69 (unsigned long long)input->block_bitmap); 66 (unsigned long long)input->block_bitmap);
70 else if (outside(input->inode_bitmap, start, end)) 67 else if (outside(input->inode_bitmap, start, end))
71 ext4_warning(sb, __func__, 68 ext4_warning(sb, "Inode bitmap not in group (block %llu)",
72 "Inode bitmap not in group (block %llu)",
73 (unsigned long long)input->inode_bitmap); 69 (unsigned long long)input->inode_bitmap);
74 else if (outside(input->inode_table, start, end) || 70 else if (outside(input->inode_table, start, end) ||
75 outside(itend - 1, start, end)) 71 outside(itend - 1, start, end))
76 ext4_warning(sb, __func__, 72 ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)",
77 "Inode table not in group (blocks %llu-%llu)",
78 (unsigned long long)input->inode_table, itend - 1); 73 (unsigned long long)input->inode_table, itend - 1);
79 else if (input->inode_bitmap == input->block_bitmap) 74 else if (input->inode_bitmap == input->block_bitmap)
80 ext4_warning(sb, __func__, 75 ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)",
81 "Block bitmap same as inode bitmap (%llu)",
82 (unsigned long long)input->block_bitmap); 76 (unsigned long long)input->block_bitmap);
83 else if (inside(input->block_bitmap, input->inode_table, itend)) 77 else if (inside(input->block_bitmap, input->inode_table, itend))
84 ext4_warning(sb, __func__, 78 ext4_warning(sb, "Block bitmap (%llu) in inode table "
85 "Block bitmap (%llu) in inode table (%llu-%llu)", 79 "(%llu-%llu)",
86 (unsigned long long)input->block_bitmap, 80 (unsigned long long)input->block_bitmap,
87 (unsigned long long)input->inode_table, itend - 1); 81 (unsigned long long)input->inode_table, itend - 1);
88 else if (inside(input->inode_bitmap, input->inode_table, itend)) 82 else if (inside(input->inode_bitmap, input->inode_table, itend))
89 ext4_warning(sb, __func__, 83 ext4_warning(sb, "Inode bitmap (%llu) in inode table "
90 "Inode bitmap (%llu) in inode table (%llu-%llu)", 84 "(%llu-%llu)",
91 (unsigned long long)input->inode_bitmap, 85 (unsigned long long)input->inode_bitmap,
92 (unsigned long long)input->inode_table, itend - 1); 86 (unsigned long long)input->inode_table, itend - 1);
93 else if (inside(input->block_bitmap, start, metaend)) 87 else if (inside(input->block_bitmap, start, metaend))
94 ext4_warning(sb, __func__, 88 ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)",
95 "Block bitmap (%llu) in GDT table"
96 " (%llu-%llu)",
97 (unsigned long long)input->block_bitmap, 89 (unsigned long long)input->block_bitmap,
98 start, metaend - 1); 90 start, metaend - 1);
99 else if (inside(input->inode_bitmap, start, metaend)) 91 else if (inside(input->inode_bitmap, start, metaend))
100 ext4_warning(sb, __func__, 92 ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)",
101 "Inode bitmap (%llu) in GDT table"
102 " (%llu-%llu)",
103 (unsigned long long)input->inode_bitmap, 93 (unsigned long long)input->inode_bitmap,
104 start, metaend - 1); 94 start, metaend - 1);
105 else if (inside(input->inode_table, start, metaend) || 95 else if (inside(input->inode_table, start, metaend) ||
106 inside(itend - 1, start, metaend)) 96 inside(itend - 1, start, metaend))
107 ext4_warning(sb, __func__, 97 ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table "
108 "Inode table (%llu-%llu) overlaps" 98 "(%llu-%llu)",
109 "GDT table (%llu-%llu)",
110 (unsigned long long)input->inode_table, 99 (unsigned long long)input->inode_table,
111 itend - 1, start, metaend - 1); 100 itend - 1, start, metaend - 1);
112 else 101 else
@@ -247,7 +236,7 @@ static int setup_new_group_blocks(struct super_block *sb,
247 goto exit_bh; 236 goto exit_bh;
248 237
249 if (IS_ERR(gdb = bclean(handle, sb, block))) { 238 if (IS_ERR(gdb = bclean(handle, sb, block))) {
250 err = PTR_ERR(bh); 239 err = PTR_ERR(gdb);
251 goto exit_bh; 240 goto exit_bh;
252 } 241 }
253 ext4_handle_dirty_metadata(handle, NULL, gdb); 242 ext4_handle_dirty_metadata(handle, NULL, gdb);
@@ -364,8 +353,7 @@ static int verify_reserved_gdb(struct super_block *sb,
364 while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) { 353 while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) {
365 if (le32_to_cpu(*p++) != 354 if (le32_to_cpu(*p++) !=
366 grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){ 355 grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){
367 ext4_warning(sb, __func__, 356 ext4_warning(sb, "reserved GDT %llu"
368 "reserved GDT %llu"
369 " missing grp %d (%llu)", 357 " missing grp %d (%llu)",
370 blk, grp, 358 blk, grp,
371 grp * 359 grp *
@@ -420,8 +408,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
420 */ 408 */
421 if (EXT4_SB(sb)->s_sbh->b_blocknr != 409 if (EXT4_SB(sb)->s_sbh->b_blocknr !=
422 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { 410 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
423 ext4_warning(sb, __func__, 411 ext4_warning(sb, "won't resize using backup superblock at %llu",
424 "won't resize using backup superblock at %llu",
425 (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); 412 (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
426 return -EPERM; 413 return -EPERM;
427 } 414 }
@@ -444,8 +431,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
444 431
445 data = (__le32 *)dind->b_data; 432 data = (__le32 *)dind->b_data;
446 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { 433 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
447 ext4_warning(sb, __func__, 434 ext4_warning(sb, "new group %u GDT block %llu not reserved",
448 "new group %u GDT block %llu not reserved",
449 input->group, gdblock); 435 input->group, gdblock);
450 err = -EINVAL; 436 err = -EINVAL;
451 goto exit_dind; 437 goto exit_dind;
@@ -468,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
468 GFP_NOFS); 454 GFP_NOFS);
469 if (!n_group_desc) { 455 if (!n_group_desc) {
470 err = -ENOMEM; 456 err = -ENOMEM;
471 ext4_warning(sb, __func__, 457 ext4_warning(sb,
472 "not enough memory for %lu groups", gdb_num + 1); 458 "not enough memory for %lu groups", gdb_num + 1);
473 goto exit_inode; 459 goto exit_inode;
474 } 460 }
@@ -567,8 +553,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
567 /* Get each reserved primary GDT block and verify it holds backups */ 553 /* Get each reserved primary GDT block and verify it holds backups */
568 for (res = 0; res < reserved_gdb; res++, blk++) { 554 for (res = 0; res < reserved_gdb; res++, blk++) {
569 if (le32_to_cpu(*data) != blk) { 555 if (le32_to_cpu(*data) != blk) {
570 ext4_warning(sb, __func__, 556 ext4_warning(sb, "reserved block %llu"
571 "reserved block %llu"
572 " not at offset %ld", 557 " not at offset %ld",
573 blk, 558 blk,
574 (long)(data - (__le32 *)dind->b_data)); 559 (long)(data - (__le32 *)dind->b_data));
@@ -713,8 +698,7 @@ static void update_backups(struct super_block *sb,
713 */ 698 */
714exit_err: 699exit_err:
715 if (err) { 700 if (err) {
716 ext4_warning(sb, __func__, 701 ext4_warning(sb, "can't update backup for group %u (err %d), "
717 "can't update backup for group %u (err %d), "
718 "forcing fsck on next reboot", group, err); 702 "forcing fsck on next reboot", group, err);
719 sbi->s_mount_state &= ~EXT4_VALID_FS; 703 sbi->s_mount_state &= ~EXT4_VALID_FS;
720 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); 704 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -753,20 +737,19 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
753 737
754 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, 738 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
755 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { 739 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
756 ext4_warning(sb, __func__, 740 ext4_warning(sb, "Can't resize non-sparse filesystem further");
757 "Can't resize non-sparse filesystem further");
758 return -EPERM; 741 return -EPERM;
759 } 742 }
760 743
761 if (ext4_blocks_count(es) + input->blocks_count < 744 if (ext4_blocks_count(es) + input->blocks_count <
762 ext4_blocks_count(es)) { 745 ext4_blocks_count(es)) {
763 ext4_warning(sb, __func__, "blocks_count overflow"); 746 ext4_warning(sb, "blocks_count overflow");
764 return -EINVAL; 747 return -EINVAL;
765 } 748 }
766 749
767 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < 750 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
768 le32_to_cpu(es->s_inodes_count)) { 751 le32_to_cpu(es->s_inodes_count)) {
769 ext4_warning(sb, __func__, "inodes_count overflow"); 752 ext4_warning(sb, "inodes_count overflow");
770 return -EINVAL; 753 return -EINVAL;
771 } 754 }
772 755
@@ -774,14 +757,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
774 if (!EXT4_HAS_COMPAT_FEATURE(sb, 757 if (!EXT4_HAS_COMPAT_FEATURE(sb,
775 EXT4_FEATURE_COMPAT_RESIZE_INODE) 758 EXT4_FEATURE_COMPAT_RESIZE_INODE)
776 || !le16_to_cpu(es->s_reserved_gdt_blocks)) { 759 || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
777 ext4_warning(sb, __func__, 760 ext4_warning(sb,
778 "No reserved GDT blocks, can't resize"); 761 "No reserved GDT blocks, can't resize");
779 return -EPERM; 762 return -EPERM;
780 } 763 }
781 inode = ext4_iget(sb, EXT4_RESIZE_INO); 764 inode = ext4_iget(sb, EXT4_RESIZE_INO);
782 if (IS_ERR(inode)) { 765 if (IS_ERR(inode)) {
783 ext4_warning(sb, __func__, 766 ext4_warning(sb, "Error opening resize inode");
784 "Error opening resize inode");
785 return PTR_ERR(inode); 767 return PTR_ERR(inode);
786 } 768 }
787 } 769 }
@@ -810,8 +792,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
810 792
811 mutex_lock(&sbi->s_resize_lock); 793 mutex_lock(&sbi->s_resize_lock);
812 if (input->group != sbi->s_groups_count) { 794 if (input->group != sbi->s_groups_count) {
813 ext4_warning(sb, __func__, 795 ext4_warning(sb, "multiple resizers run on filesystem!");
814 "multiple resizers run on filesystem!");
815 err = -EBUSY; 796 err = -EBUSY;
816 goto exit_journal; 797 goto exit_journal;
817 } 798 }
@@ -997,13 +978,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
997 " too large to resize to %llu blocks safely\n", 978 " too large to resize to %llu blocks safely\n",
998 sb->s_id, n_blocks_count); 979 sb->s_id, n_blocks_count);
999 if (sizeof(sector_t) < 8) 980 if (sizeof(sector_t) < 8)
1000 ext4_warning(sb, __func__, "CONFIG_LBDAF not enabled"); 981 ext4_warning(sb, "CONFIG_LBDAF not enabled");
1001 return -EINVAL; 982 return -EINVAL;
1002 } 983 }
1003 984
1004 if (n_blocks_count < o_blocks_count) { 985 if (n_blocks_count < o_blocks_count) {
1005 ext4_warning(sb, __func__, 986 ext4_warning(sb, "can't shrink FS - resize aborted");
1006 "can't shrink FS - resize aborted");
1007 return -EBUSY; 987 return -EBUSY;
1008 } 988 }
1009 989
@@ -1011,15 +991,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1011 ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); 991 ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
1012 992
1013 if (last == 0) { 993 if (last == 0) {
1014 ext4_warning(sb, __func__, 994 ext4_warning(sb, "need to use ext2online to resize further");
1015 "need to use ext2online to resize further");
1016 return -EPERM; 995 return -EPERM;
1017 } 996 }
1018 997
1019 add = EXT4_BLOCKS_PER_GROUP(sb) - last; 998 add = EXT4_BLOCKS_PER_GROUP(sb) - last;
1020 999
1021 if (o_blocks_count + add < o_blocks_count) { 1000 if (o_blocks_count + add < o_blocks_count) {
1022 ext4_warning(sb, __func__, "blocks_count overflow"); 1001 ext4_warning(sb, "blocks_count overflow");
1023 return -EINVAL; 1002 return -EINVAL;
1024 } 1003 }
1025 1004
@@ -1027,16 +1006,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1027 add = n_blocks_count - o_blocks_count; 1006 add = n_blocks_count - o_blocks_count;
1028 1007
1029 if (o_blocks_count + add < n_blocks_count) 1008 if (o_blocks_count + add < n_blocks_count)
1030 ext4_warning(sb, __func__, 1009 ext4_warning(sb, "will only finish group (%llu blocks, %u new)",
1031 "will only finish group (%llu"
1032 " blocks, %u new)",
1033 o_blocks_count + add, add); 1010 o_blocks_count + add, add);
1034 1011
1035 /* See if the device is actually as big as what was requested */ 1012 /* See if the device is actually as big as what was requested */
1036 bh = sb_bread(sb, o_blocks_count + add - 1); 1013 bh = sb_bread(sb, o_blocks_count + add - 1);
1037 if (!bh) { 1014 if (!bh) {
1038 ext4_warning(sb, __func__, 1015 ext4_warning(sb, "can't read last block, resize aborted");
1039 "can't read last block, resize aborted");
1040 return -ENOSPC; 1016 return -ENOSPC;
1041 } 1017 }
1042 brelse(bh); 1018 brelse(bh);
@@ -1047,14 +1023,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1047 handle = ext4_journal_start_sb(sb, 3); 1023 handle = ext4_journal_start_sb(sb, 3);
1048 if (IS_ERR(handle)) { 1024 if (IS_ERR(handle)) {
1049 err = PTR_ERR(handle); 1025 err = PTR_ERR(handle);
1050 ext4_warning(sb, __func__, "error %d on journal start", err); 1026 ext4_warning(sb, "error %d on journal start", err);
1051 goto exit_put; 1027 goto exit_put;
1052 } 1028 }
1053 1029
1054 mutex_lock(&EXT4_SB(sb)->s_resize_lock); 1030 mutex_lock(&EXT4_SB(sb)->s_resize_lock);
1055 if (o_blocks_count != ext4_blocks_count(es)) { 1031 if (o_blocks_count != ext4_blocks_count(es)) {
1056 ext4_warning(sb, __func__, 1032 ext4_warning(sb, "multiple resizers run on filesystem!");
1057 "multiple resizers run on filesystem!");
1058 mutex_unlock(&EXT4_SB(sb)->s_resize_lock); 1033 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1059 ext4_journal_stop(handle); 1034 ext4_journal_stop(handle);
1060 err = -EBUSY; 1035 err = -EBUSY;
@@ -1063,8 +1038,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1063 1038
1064 if ((err = ext4_journal_get_write_access(handle, 1039 if ((err = ext4_journal_get_write_access(handle,
1065 EXT4_SB(sb)->s_sbh))) { 1040 EXT4_SB(sb)->s_sbh))) {
1066 ext4_warning(sb, __func__, 1041 ext4_warning(sb, "error %d on journal write access", err);
1067 "error %d on journal write access", err);
1068 mutex_unlock(&EXT4_SB(sb)->s_resize_lock); 1042 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1069 ext4_journal_stop(handle); 1043 ext4_journal_stop(handle);
1070 goto exit_put; 1044 goto exit_put;
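
All of the ext4_warning() call sites above lose their explicit __func__ argument, and the super.c diff below renames the implementations to __ext4_error()/__ext4_warning(). The piece that ties the two together, presumably a wrapper macro in ext4.h that re-injects the caller's function name, is not part of this diff; a plausible sketch of the pattern:

/*
 * A variadic macro injects __func__ so call sites stay short.
 * my_warning/__my_warning are hypothetical stand-ins for the
 * ext4_warning/__ext4_warning pair.
 */
#include <stdarg.h>
#include <stdio.h>

static void __my_warning(const char *function, const char *fmt, ...)
{
        va_list args;

        va_start(args, fmt);
        printf("warning: %s: ", function);      /* prefix with caller */
        vprintf(fmt, args);
        printf("\n");
        va_end(args);
}

#define my_warning(fmt, ...) \
        __my_warning(__func__, fmt, ##__VA_ARGS__)

int main(void)
{
        my_warning("blocks_count overflow (%d)", 42);
        return 0;
}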
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d4ca92aab514..e14d22c170d5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -68,7 +68,21 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
68static int ext4_unfreeze(struct super_block *sb); 68static int ext4_unfreeze(struct super_block *sb);
69static void ext4_write_super(struct super_block *sb); 69static void ext4_write_super(struct super_block *sb);
70static int ext4_freeze(struct super_block *sb); 70static int ext4_freeze(struct super_block *sb);
71static int ext4_get_sb(struct file_system_type *fs_type, int flags,
72 const char *dev_name, void *data, struct vfsmount *mnt);
71 73
74#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
75static struct file_system_type ext3_fs_type = {
76 .owner = THIS_MODULE,
77 .name = "ext3",
78 .get_sb = ext4_get_sb,
79 .kill_sb = kill_block_super,
80 .fs_flags = FS_REQUIRES_DEV,
81};
82#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
83#else
84#define IS_EXT3_SB(sb) (0)
85#endif
72 86
73ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, 87ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
74 struct ext4_group_desc *bg) 88 struct ext4_group_desc *bg)
@@ -302,7 +316,7 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
302 * write out the superblock safely. 316 * write out the superblock safely.
303 * 317 *
304 * We'll just use the jbd2_journal_abort() error code to record an error in 318 * We'll just use the jbd2_journal_abort() error code to record an error in
305 * the journal instead. On recovery, the journal will compain about 319 * the journal instead. On recovery, the journal will complain about
306 * that error until we've noted it down and cleared it. 320 * that error until we've noted it down and cleared it.
307 */ 321 */
308 322
@@ -333,7 +347,7 @@ static void ext4_handle_error(struct super_block *sb)
333 sb->s_id); 347 sb->s_id);
334} 348}
335 349
336void ext4_error(struct super_block *sb, const char *function, 350void __ext4_error(struct super_block *sb, const char *function,
337 const char *fmt, ...) 351 const char *fmt, ...)
338{ 352{
339 va_list args; 353 va_list args;
@@ -347,6 +361,42 @@ void ext4_error(struct super_block *sb, const char *function,
347 ext4_handle_error(sb); 361 ext4_handle_error(sb);
348} 362}
349 363
364void ext4_error_inode(const char *function, struct inode *inode,
365 const char *fmt, ...)
366{
367 va_list args;
368
369 va_start(args, fmt);
370 printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ",
371 inode->i_sb->s_id, function, inode->i_ino, current->comm);
372 vprintk(fmt, args);
373 printk("\n");
374 va_end(args);
375
376 ext4_handle_error(inode->i_sb);
377}
378
379void ext4_error_file(const char *function, struct file *file,
380 const char *fmt, ...)
381{
382 va_list args;
383 struct inode *inode = file->f_dentry->d_inode;
384 char pathname[80], *path;
385
386 va_start(args, fmt);
387 path = d_path(&(file->f_path), pathname, sizeof(pathname));
388 if (!path)
389 path = "(unknown)";
390 printk(KERN_CRIT
391 "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ",
392 inode->i_sb->s_id, function, inode->i_ino, current->comm, path);
393 vprintk(fmt, args);
394 printk("\n");
395 va_end(args);
396
397 ext4_handle_error(inode->i_sb);
398}
399
350static const char *ext4_decode_error(struct super_block *sb, int errno, 400static const char *ext4_decode_error(struct super_block *sb, int errno,
351 char nbuf[16]) 401 char nbuf[16])
352{ 402{
@@ -450,7 +500,7 @@ void ext4_msg (struct super_block * sb, const char *prefix,
450 va_end(args); 500 va_end(args);
451} 501}
452 502
453void ext4_warning(struct super_block *sb, const char *function, 503void __ext4_warning(struct super_block *sb, const char *function,
454 const char *fmt, ...) 504 const char *fmt, ...)
455{ 505{
456 va_list args; 506 va_list args;
@@ -507,7 +557,7 @@ void ext4_update_dynamic_rev(struct super_block *sb)
507 if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) 557 if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
508 return; 558 return;
509 559
510 ext4_warning(sb, __func__, 560 ext4_warning(sb,
511 "updating to rev %d because of new feature flag, " 561 "updating to rev %d because of new feature flag, "
512 "running e2fsck is recommended", 562 "running e2fsck is recommended",
513 EXT4_DYNAMIC_REV); 563 EXT4_DYNAMIC_REV);
@@ -603,10 +653,6 @@ static void ext4_put_super(struct super_block *sb)
603 if (sb->s_dirt) 653 if (sb->s_dirt)
604 ext4_commit_super(sb, 1); 654 ext4_commit_super(sb, 1);
605 655
606 ext4_release_system_zone(sb);
607 ext4_mb_release(sb);
608 ext4_ext_release(sb);
609 ext4_xattr_put_super(sb);
610 if (sbi->s_journal) { 656 if (sbi->s_journal) {
611 err = jbd2_journal_destroy(sbi->s_journal); 657 err = jbd2_journal_destroy(sbi->s_journal);
612 sbi->s_journal = NULL; 658 sbi->s_journal = NULL;
@@ -614,6 +660,12 @@ static void ext4_put_super(struct super_block *sb)
614 ext4_abort(sb, __func__, 660 ext4_abort(sb, __func__,
615 "Couldn't clean up the journal"); 661 "Couldn't clean up the journal");
616 } 662 }
663
664 ext4_release_system_zone(sb);
665 ext4_mb_release(sb);
666 ext4_ext_release(sb);
667 ext4_xattr_put_super(sb);
668
617 if (!(sb->s_flags & MS_RDONLY)) { 669 if (!(sb->s_flags & MS_RDONLY)) {
618 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 670 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
619 es->s_state = cpu_to_le16(sbi->s_mount_state); 671 es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -700,10 +752,17 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
700 ei->i_reserved_data_blocks = 0; 752 ei->i_reserved_data_blocks = 0;
701 ei->i_reserved_meta_blocks = 0; 753 ei->i_reserved_meta_blocks = 0;
702 ei->i_allocated_meta_blocks = 0; 754 ei->i_allocated_meta_blocks = 0;
755 ei->i_da_metadata_calc_len = 0;
703 ei->i_delalloc_reserved_flag = 0; 756 ei->i_delalloc_reserved_flag = 0;
704 spin_lock_init(&(ei->i_block_reservation_lock)); 757 spin_lock_init(&(ei->i_block_reservation_lock));
705 INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); 758#ifdef CONFIG_QUOTA
759 ei->i_reserved_quota = 0;
760#endif
761 INIT_LIST_HEAD(&ei->i_completed_io_list);
762 spin_lock_init(&ei->i_completed_io_lock);
706 ei->cur_aio_dio = NULL; 763 ei->cur_aio_dio = NULL;
764 ei->i_sync_tid = 0;
765 ei->i_datasync_tid = 0;
707 766
708 return &ei->vfs_inode; 767 return &ei->vfs_inode;
709} 768}
@@ -753,6 +812,7 @@ static void destroy_inodecache(void)
753 812
754static void ext4_clear_inode(struct inode *inode) 813static void ext4_clear_inode(struct inode *inode)
755{ 814{
815 dquot_drop(inode);
756 ext4_discard_preallocations(inode); 816 ext4_discard_preallocations(inode);
757 if (EXT4_JOURNAL(inode)) 817 if (EXT4_JOURNAL(inode))
758 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, 818 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
@@ -765,9 +825,22 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
765#if defined(CONFIG_QUOTA) 825#if defined(CONFIG_QUOTA)
766 struct ext4_sb_info *sbi = EXT4_SB(sb); 826 struct ext4_sb_info *sbi = EXT4_SB(sb);
767 827
768 if (sbi->s_jquota_fmt) 828 if (sbi->s_jquota_fmt) {
769 seq_printf(seq, ",jqfmt=%s", 829 char *fmtname = "";
770 (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold" : "vfsv0"); 830
831 switch (sbi->s_jquota_fmt) {
832 case QFMT_VFS_OLD:
833 fmtname = "vfsold";
834 break;
835 case QFMT_VFS_V0:
836 fmtname = "vfsv0";
837 break;
838 case QFMT_VFS_V1:
839 fmtname = "vfsv1";
840 break;
841 }
842 seq_printf(seq, ",jqfmt=%s", fmtname);
843 }
771 844
772 if (sbi->s_qf_names[USRQUOTA]) 845 if (sbi->s_qf_names[USRQUOTA])
773 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); 846 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
@@ -775,10 +848,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
775 if (sbi->s_qf_names[GRPQUOTA]) 848 if (sbi->s_qf_names[GRPQUOTA])
776 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); 849 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
777 850
778 if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) 851 if (test_opt(sb, USRQUOTA))
779 seq_puts(seq, ",usrquota"); 852 seq_puts(seq, ",usrquota");
780 853
781 if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) 854 if (test_opt(sb, GRPQUOTA))
782 seq_puts(seq, ",grpquota"); 855 seq_puts(seq, ",grpquota");
783#endif 856#endif
784} 857}
@@ -899,6 +972,15 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
899 if (test_opt(sb, NO_AUTO_DA_ALLOC)) 972 if (test_opt(sb, NO_AUTO_DA_ALLOC))
900 seq_puts(seq, ",noauto_da_alloc"); 973 seq_puts(seq, ",noauto_da_alloc");
901 974
975 if (test_opt(sb, DISCARD))
976 seq_puts(seq, ",discard");
977
978 if (test_opt(sb, NOLOAD))
979 seq_puts(seq, ",norecovery");
980
981 if (test_opt(sb, DIOREAD_NOLOCK))
982 seq_puts(seq, ",dioread_nolock");
983
902 ext4_show_quota_options(seq, sb); 984 ext4_show_quota_options(seq, sb);
903 985
904 return 0; 986 return 0;
@@ -985,17 +1067,9 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
985 const char *data, size_t len, loff_t off); 1067 const char *data, size_t len, loff_t off);
986 1068
987static const struct dquot_operations ext4_quota_operations = { 1069static const struct dquot_operations ext4_quota_operations = {
988 .initialize = dquot_initialize, 1070#ifdef CONFIG_QUOTA
989 .drop = dquot_drop,
990 .alloc_space = dquot_alloc_space,
991 .reserve_space = dquot_reserve_space,
992 .claim_space = dquot_claim_space,
993 .release_rsv = dquot_release_reserved_space,
994 .get_reserved_space = ext4_get_reserved_space, 1071 .get_reserved_space = ext4_get_reserved_space,
995 .alloc_inode = dquot_alloc_inode, 1072#endif
996 .free_space = dquot_free_space,
997 .free_inode = dquot_free_inode,
998 .transfer = dquot_transfer,
999 .write_dquot = ext4_write_dquot, 1073 .write_dquot = ext4_write_dquot,
1000 .acquire_dquot = ext4_acquire_dquot, 1074 .acquire_dquot = ext4_acquire_dquot,
1001 .release_dquot = ext4_release_dquot, 1075 .release_dquot = ext4_release_dquot,
@@ -1074,12 +1148,14 @@ enum {
1074 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1148 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
1075 Opt_data_err_abort, Opt_data_err_ignore, 1149 Opt_data_err_abort, Opt_data_err_ignore,
1076 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1150 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1077 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1151 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
1078 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, 1152 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
1079 Opt_usrquota, Opt_grpquota, Opt_i_version, 1153 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version,
1080 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1154 Opt_stripe, Opt_delalloc, Opt_nodelalloc,
1081 Opt_block_validity, Opt_noblock_validity, 1155 Opt_block_validity, Opt_noblock_validity,
1082 Opt_inode_readahead_blks, Opt_journal_ioprio 1156 Opt_inode_readahead_blks, Opt_journal_ioprio,
1157 Opt_dioread_nolock, Opt_dioread_lock,
1158 Opt_discard, Opt_nodiscard,
1083}; 1159};
1084 1160
1085static const match_table_t tokens = { 1161static const match_table_t tokens = {
@@ -1104,6 +1180,7 @@ static const match_table_t tokens = {
1104 {Opt_acl, "acl"}, 1180 {Opt_acl, "acl"},
1105 {Opt_noacl, "noacl"}, 1181 {Opt_noacl, "noacl"},
1106 {Opt_noload, "noload"}, 1182 {Opt_noload, "noload"},
1183 {Opt_noload, "norecovery"},
1107 {Opt_nobh, "nobh"}, 1184 {Opt_nobh, "nobh"},
1108 {Opt_bh, "bh"}, 1185 {Opt_bh, "bh"},
1109 {Opt_commit, "commit=%u"}, 1186 {Opt_commit, "commit=%u"},
@@ -1125,6 +1202,7 @@ static const match_table_t tokens = {
1125 {Opt_grpjquota, "grpjquota=%s"}, 1202 {Opt_grpjquota, "grpjquota=%s"},
1126 {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, 1203 {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
1127 {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, 1204 {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
1205 {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
1128 {Opt_grpquota, "grpquota"}, 1206 {Opt_grpquota, "grpquota"},
1129 {Opt_noquota, "noquota"}, 1207 {Opt_noquota, "noquota"},
1130 {Opt_quota, "quota"}, 1208 {Opt_quota, "quota"},
@@ -1144,6 +1222,10 @@ static const match_table_t tokens = {
1144 {Opt_auto_da_alloc, "auto_da_alloc=%u"}, 1222 {Opt_auto_da_alloc, "auto_da_alloc=%u"},
1145 {Opt_auto_da_alloc, "auto_da_alloc"}, 1223 {Opt_auto_da_alloc, "auto_da_alloc"},
1146 {Opt_noauto_da_alloc, "noauto_da_alloc"}, 1224 {Opt_noauto_da_alloc, "noauto_da_alloc"},
1225 {Opt_dioread_nolock, "dioread_nolock"},
1226 {Opt_dioread_lock, "dioread_lock"},
1227 {Opt_discard, "discard"},
1228 {Opt_nodiscard, "nodiscard"},
1147 {Opt_err, NULL}, 1229 {Opt_err, NULL},
1148}; 1230};
1149 1231
@@ -1171,6 +1253,66 @@ static ext4_fsblk_t get_sb_block(void **data)
1171} 1253}
1172 1254
1173#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) 1255#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
1256static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n"
1257 "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
1258
1259#ifdef CONFIG_QUOTA
1260static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1261{
1262 struct ext4_sb_info *sbi = EXT4_SB(sb);
1263 char *qname;
1264
1265 if (sb_any_quota_loaded(sb) &&
1266 !sbi->s_qf_names[qtype]) {
1267 ext4_msg(sb, KERN_ERR,
1268 "Cannot change journaled "
1269 "quota options when quota turned on");
1270 return 0;
1271 }
1272 qname = match_strdup(args);
1273 if (!qname) {
1274 ext4_msg(sb, KERN_ERR,
1275 "Not enough memory for storing quotafile name");
1276 return 0;
1277 }
1278 if (sbi->s_qf_names[qtype] &&
1279 strcmp(sbi->s_qf_names[qtype], qname)) {
1280 ext4_msg(sb, KERN_ERR,
1281 "%s quota file already specified", QTYPE2NAME(qtype));
1282 kfree(qname);
1283 return 0;
1284 }
1285 sbi->s_qf_names[qtype] = qname;
1286 if (strchr(sbi->s_qf_names[qtype], '/')) {
1287 ext4_msg(sb, KERN_ERR,
1288 "quotafile must be on filesystem root");
1289 kfree(sbi->s_qf_names[qtype]);
1290 sbi->s_qf_names[qtype] = NULL;
1291 return 0;
1292 }
1293 set_opt(sbi->s_mount_opt, QUOTA);
1294 return 1;
1295}
1296
1297static int clear_qf_name(struct super_block *sb, int qtype)
1298{
1299
1300 struct ext4_sb_info *sbi = EXT4_SB(sb);
1301
1302 if (sb_any_quota_loaded(sb) &&
1303 sbi->s_qf_names[qtype]) {
1304 ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
1305 " when quota turned on");
1306 return 0;
1307 }
1308 /*
1309 * The space will be released later when all options are confirmed
1310 * to be correct
1311 */
1312 sbi->s_qf_names[qtype] = NULL;
1313 return 1;
1314}
1315#endif
1174 1316
1175static int parse_options(char *options, struct super_block *sb, 1317static int parse_options(char *options, struct super_block *sb,
1176 unsigned long *journal_devnum, 1318 unsigned long *journal_devnum,
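
The parse_options() additions above replace the shared set_qf_name:/clear_qf_name: goto targets with helpers that return 1 on success and 0 on failure, so each case in the option switch (shown further below) collapses to a call plus an early return 0. A minimal sketch of that shape; set_name(), parse_one() and the option spellings here are hypothetical stand-ins:

/*
 * Shared goto targets become small predicates; the dispatcher
 * stays flat and every failure propagates as a 0 return.
 */
#include <stdio.h>
#include <string.h>

static const char *qf_name[2];          /* [0]=usr, [1]=grp */

static int set_name(int qtype, const char *arg)
{
        if (qf_name[qtype] && strcmp(qf_name[qtype], arg)) {
                fprintf(stderr, "quota file already specified\n");
                return 0;               /* error, like set_qf_name() */
        }
        qf_name[qtype] = arg;
        return 1;
}

static int parse_one(const char *opt)
{
        if (!strncmp(opt, "usrjquota=", 10))
                return set_name(0, opt + 10);
        if (!strncmp(opt, "grpjquota=", 10))
                return set_name(1, opt + 10);
        return 1;                       /* ignore everything else */
}

int main(void)
{
        if (!parse_one("usrjquota=aquota.user"))
                return 1;
        printf("usr quota file: %s\n", qf_name[0]);
        return 0;
}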
@@ -1183,8 +1325,7 @@ static int parse_options(char *options, struct super_block *sb,
1183 int data_opt = 0; 1325 int data_opt = 0;
1184 int option; 1326 int option;
1185#ifdef CONFIG_QUOTA 1327#ifdef CONFIG_QUOTA
1186 int qtype, qfmt; 1328 int qfmt;
1187 char *qname;
1188#endif 1329#endif
1189 1330
1190 if (!options) 1331 if (!options)
@@ -1195,19 +1336,31 @@ static int parse_options(char *options, struct super_block *sb,
1195 if (!*p) 1336 if (!*p)
1196 continue; 1337 continue;
1197 1338
1339 /*
1340 * Initialize args struct so we know whether arg was
1341 * found; some options take optional arguments.
1342 */
1343 args[0].to = args[0].from = 0;
1198 token = match_token(p, tokens, args); 1344 token = match_token(p, tokens, args);
1199 switch (token) { 1345 switch (token) {
1200 case Opt_bsd_df: 1346 case Opt_bsd_df:
1347 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1201 clear_opt(sbi->s_mount_opt, MINIX_DF); 1348 clear_opt(sbi->s_mount_opt, MINIX_DF);
1202 break; 1349 break;
1203 case Opt_minix_df: 1350 case Opt_minix_df:
1351 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1204 set_opt(sbi->s_mount_opt, MINIX_DF); 1352 set_opt(sbi->s_mount_opt, MINIX_DF);
1353
1205 break; 1354 break;
1206 case Opt_grpid: 1355 case Opt_grpid:
1356 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1207 set_opt(sbi->s_mount_opt, GRPID); 1357 set_opt(sbi->s_mount_opt, GRPID);
1358
1208 break; 1359 break;
1209 case Opt_nogrpid: 1360 case Opt_nogrpid:
1361 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1210 clear_opt(sbi->s_mount_opt, GRPID); 1362 clear_opt(sbi->s_mount_opt, GRPID);
1363
1211 break; 1364 break;
1212 case Opt_resuid: 1365 case Opt_resuid:
1213 if (match_int(&args[0], &option)) 1366 if (match_int(&args[0], &option))
@@ -1344,14 +1497,13 @@ static int parse_options(char *options, struct super_block *sb,
1344 data_opt = EXT4_MOUNT_WRITEBACK_DATA; 1497 data_opt = EXT4_MOUNT_WRITEBACK_DATA;
1345 datacheck: 1498 datacheck:
1346 if (is_remount) { 1499 if (is_remount) {
1347 if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS) 1500 if (test_opt(sb, DATA_FLAGS) != data_opt) {
1348 != data_opt) {
1349 ext4_msg(sb, KERN_ERR, 1501 ext4_msg(sb, KERN_ERR,
1350 "Cannot change data mode on remount"); 1502 "Cannot change data mode on remount");
1351 return 0; 1503 return 0;
1352 } 1504 }
1353 } else { 1505 } else {
1354 sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS; 1506 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
1355 sbi->s_mount_opt |= data_opt; 1507 sbi->s_mount_opt |= data_opt;
1356 } 1508 }
1357 break; 1509 break;
@@ -1363,68 +1515,30 @@ static int parse_options(char *options, struct super_block *sb,
1363 break; 1515 break;
1364#ifdef CONFIG_QUOTA 1516#ifdef CONFIG_QUOTA
1365 case Opt_usrjquota: 1517 case Opt_usrjquota:
1366 qtype = USRQUOTA; 1518 if (!set_qf_name(sb, USRQUOTA, &args[0]))
1367 goto set_qf_name;
1368 case Opt_grpjquota:
1369 qtype = GRPQUOTA;
1370set_qf_name:
1371 if (sb_any_quota_loaded(sb) &&
1372 !sbi->s_qf_names[qtype]) {
1373 ext4_msg(sb, KERN_ERR,
1374 "Cannot change journaled "
1375 "quota options when quota turned on");
1376 return 0;
1377 }
1378 qname = match_strdup(&args[0]);
1379 if (!qname) {
1380 ext4_msg(sb, KERN_ERR,
1381 "Not enough memory for "
1382 "storing quotafile name");
1383 return 0; 1519 return 0;
1384 } 1520 break;
1385 if (sbi->s_qf_names[qtype] && 1521 case Opt_grpjquota:
1386 strcmp(sbi->s_qf_names[qtype], qname)) { 1522 if (!set_qf_name(sb, GRPQUOTA, &args[0]))
1387 ext4_msg(sb, KERN_ERR,
1388 "%s quota file already "
1389 "specified", QTYPE2NAME(qtype));
1390 kfree(qname);
1391 return 0;
1392 }
1393 sbi->s_qf_names[qtype] = qname;
1394 if (strchr(sbi->s_qf_names[qtype], '/')) {
1395 ext4_msg(sb, KERN_ERR,
1396 "quotafile must be on "
1397 "filesystem root");
1398 kfree(sbi->s_qf_names[qtype]);
1399 sbi->s_qf_names[qtype] = NULL;
1400 return 0; 1523 return 0;
1401 }
1402 set_opt(sbi->s_mount_opt, QUOTA);
1403 break; 1524 break;
1404 case Opt_offusrjquota: 1525 case Opt_offusrjquota:
1405 qtype = USRQUOTA; 1526 if (!clear_qf_name(sb, USRQUOTA))
1406 goto clear_qf_name; 1527 return 0;
1528 break;
1407 case Opt_offgrpjquota: 1529 case Opt_offgrpjquota:
1408 qtype = GRPQUOTA; 1530 if (!clear_qf_name(sb, GRPQUOTA))
1409clear_qf_name:
1410 if (sb_any_quota_loaded(sb) &&
1411 sbi->s_qf_names[qtype]) {
1412 ext4_msg(sb, KERN_ERR, "Cannot change "
1413 "journaled quota options when "
1414 "quota turned on");
1415 return 0; 1531 return 0;
1416 }
1417 /*
1418 * The space will be released later when all options
1419 * are confirmed to be correct
1420 */
1421 sbi->s_qf_names[qtype] = NULL;
1422 break; 1532 break;
1533
1423 case Opt_jqfmt_vfsold: 1534 case Opt_jqfmt_vfsold:
1424 qfmt = QFMT_VFS_OLD; 1535 qfmt = QFMT_VFS_OLD;
1425 goto set_qf_format; 1536 goto set_qf_format;
1426 case Opt_jqfmt_vfsv0: 1537 case Opt_jqfmt_vfsv0:
1427 qfmt = QFMT_VFS_V0; 1538 qfmt = QFMT_VFS_V0;
1539 goto set_qf_format;
1540 case Opt_jqfmt_vfsv1:
1541 qfmt = QFMT_VFS_V1;
1428set_qf_format: 1542set_qf_format:
1429 if (sb_any_quota_loaded(sb) && 1543 if (sb_any_quota_loaded(sb) &&
1430 sbi->s_jquota_fmt != qfmt) { 1544 sbi->s_jquota_fmt != qfmt) {
@@ -1467,6 +1581,7 @@ set_qf_format:
1467 case Opt_offgrpjquota: 1581 case Opt_offgrpjquota:
1468 case Opt_jqfmt_vfsold: 1582 case Opt_jqfmt_vfsold:
1469 case Opt_jqfmt_vfsv0: 1583 case Opt_jqfmt_vfsv0:
1584 case Opt_jqfmt_vfsv1:
1470 ext4_msg(sb, KERN_ERR, 1585 ext4_msg(sb, KERN_ERR,
1471 "journaled quota options not supported"); 1586 "journaled quota options not supported");
1472 break; 1587 break;
@@ -1480,10 +1595,11 @@ set_qf_format:
1480 clear_opt(sbi->s_mount_opt, BARRIER); 1595 clear_opt(sbi->s_mount_opt, BARRIER);
1481 break; 1596 break;
1482 case Opt_barrier: 1597 case Opt_barrier:
1483 if (match_int(&args[0], &option)) { 1598 if (args[0].from) {
1484 set_opt(sbi->s_mount_opt, BARRIER); 1599 if (match_int(&args[0], &option))
1485 break; 1600 return 0;
1486 } 1601 } else
1602 option = 1; /* No argument, default to 1 */
1487 if (option) 1603 if (option)
1488 set_opt(sbi->s_mount_opt, BARRIER); 1604 set_opt(sbi->s_mount_opt, BARRIER);
1489 else 1605 else
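
With args[0].from zeroed before match_token() (see the earlier hunk), a bare "barrier" can now be told apart from "barrier=<n>", and a malformed value fails the mount instead of being silently treated as "barrier on". A small userspace model of that parsing rule (plain strtol in place of match_int; names are illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Returns 1 and sets *barrier on success, 0 on a malformed argument. */
static int parse_barrier(const char *opt, int *barrier)
{
    const char *eq = strchr(opt, '=');
    long val = 1;                      /* no "=<n>": default to 1 */

    if (eq) {
        char *end;
        val = strtol(eq + 1, &end, 10);
        if (end == eq + 1 || *end)     /* empty value or trailing junk: reject */
            return 0;
    }
    *barrier = (val != 0);
    return 1;
}

int main(void)
{
    int b;

    printf("%d %d\n", parse_barrier("barrier", &b), b);    /* 1 1 */
    printf("%d %d\n", parse_barrier("barrier=0", &b), b);  /* 1 0 */
    printf("%d\n", parse_barrier("barrier=junk", &b));     /* 0 */
    return 0;
}
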
@@ -1556,15 +1672,28 @@ set_qf_format:
1556 set_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1672

1557 break; 1673 break;
1558 case Opt_auto_da_alloc: 1674 case Opt_auto_da_alloc:
1559 if (match_int(&args[0], &option)) { 1675 if (args[0].from) {
1560 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1676 if (match_int(&args[0], &option))
1561 break; 1677 return 0;
1562 } 1678 } else
1679 option = 1; /* No argument, default to 1 */
1563 if (option) 1680 if (option)
1564 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1681 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1565 else 1682 else
1566 set_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1683
1567 break; 1684 break;
1685 case Opt_discard:
1686 set_opt(sbi->s_mount_opt, DISCARD);
1687 break;
1688 case Opt_nodiscard:
1689 clear_opt(sbi->s_mount_opt, DISCARD);
1690 break;
1691 case Opt_dioread_nolock:
1692 set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
1693 break;
1694 case Opt_dioread_lock:
1695 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
1696 break;
1568 default: 1697 default:
1569 ext4_msg(sb, KERN_ERR, 1698 ext4_msg(sb, KERN_ERR,
1570 "Unrecognized mount option \"%s\" " 1699 "Unrecognized mount option \"%s\" "
@@ -1574,18 +1703,13 @@ set_qf_format:
1574 } 1703 }
1575#ifdef CONFIG_QUOTA 1704#ifdef CONFIG_QUOTA
1576 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 1705 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1577 if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) && 1706 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
1578 sbi->s_qf_names[USRQUOTA])
1579 clear_opt(sbi->s_mount_opt, USRQUOTA); 1707 clear_opt(sbi->s_mount_opt, USRQUOTA);
1580 1708
1581 if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) && 1709 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
1582 sbi->s_qf_names[GRPQUOTA])
1583 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1710 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1584 1711
1585 if ((sbi->s_qf_names[USRQUOTA] && 1712 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
1586 (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
1587 (sbi->s_qf_names[GRPQUOTA] &&
1588 (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
1589 ext4_msg(sb, KERN_ERR, "old and new quota " 1713 ext4_msg(sb, KERN_ERR, "old and new quota "
1590 "format mixing"); 1714 "format mixing");
1591 return 0; 1715 return 0;
@@ -1673,14 +1797,14 @@ static int ext4_fill_flex_info(struct super_block *sb)
1673 size_t size; 1797 size_t size;
1674 int i; 1798 int i;
1675 1799
1676 if (!sbi->s_es->s_log_groups_per_flex) { 1800 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
1801 groups_per_flex = 1 << sbi->s_log_groups_per_flex;
1802
1803 if (groups_per_flex < 2) {
1677 sbi->s_log_groups_per_flex = 0; 1804 sbi->s_log_groups_per_flex = 0;
1678 return 1; 1805 return 1;
1679 } 1806 }
1680 1807
1681 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
1682 groups_per_flex = 1 << sbi->s_log_groups_per_flex;
1683
1684 /* We allocate both existing and potentially added groups */ 1808 /* We allocate both existing and potentially added groups */
1685 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + 1809 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
1686 ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << 1810 ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
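
The reordering above makes ext4_fill_flex_info() read s_log_groups_per_flex before testing it, and rejects a degenerate flex group size of 1 as well as 0. A standalone model of the arithmetic with made-up superblock values:

#include <stdio.h>

int main(void)
{
    unsigned log_groups_per_flex = 4;        /* from the superblock */
    unsigned groups_per_flex = 1U << log_groups_per_flex;

    if (groups_per_flex < 2) {
        puts("flex_bg disabled");
        return 0;
    }

    unsigned groups_count = 1000, reserved_gdt = 32, desc_per_block_bits = 7;
    /* room for existing groups plus groups addable via reserved GDT blocks */
    unsigned flex_group_count =
        ((groups_count + groups_per_flex - 1) +
         ((reserved_gdt + 1) << desc_per_block_bits)) / groups_per_flex;

    printf("%u flex groups\n", flex_group_count);
    return 0;
}
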
@@ -1895,7 +2019,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
1895 } 2019 }
1896 2020
1897 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); 2021 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
1898 vfs_dq_init(inode); 2022 dquot_initialize(inode);
1899 if (inode->i_nlink) { 2023 if (inode->i_nlink) {
1900 ext4_msg(sb, KERN_DEBUG, 2024 ext4_msg(sb, KERN_DEBUG,
1901 "%s: truncating inode %lu to %lld bytes", 2025 "%s: truncating inode %lu to %lld bytes",
@@ -2099,11 +2223,8 @@ static int parse_strtoul(const char *buf,
2099{ 2223{
2100 char *endp; 2224 char *endp;
2101 2225
2102 while (*buf && isspace(*buf)) 2226 *value = simple_strtoul(skip_spaces(buf), &endp, 0);
2103 buf++; 2227 endp = skip_spaces(endp);
2104 *value = simple_strtoul(buf, &endp, 0);
2105 while (*endp && isspace(*endp))
2106 endp++;
2107 if (*endp || *value > max) 2228 if (*endp || *value > max)
2108 return -EINVAL; 2229 return -EINVAL;
2109 2230
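
parse_strtoul() now leans on skip_spaces() instead of two open-coded isspace() loops. A userspace equivalent (strtoul for simple_strtoul, plus a local skip_spaces since the kernel helper isn't available here):

#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static const char *skip_spaces(const char *s)
{
    while (isspace((unsigned char)*s))
        s++;
    return s;
}

static int parse_ulong(const char *buf, unsigned long max, unsigned long *value)
{
    char *endp;

    *value = strtoul(skip_spaces(buf), &endp, 0);
    if (*skip_spaces(endp) || *value > max)  /* trailing junk or out of range */
        return -EINVAL;
    return 0;
}

int main(void)
{
    unsigned long v;

    printf("%d %lu\n", parse_ulong("  42  ", 100, &v), v);  /* 0 42 */
    printf("%d\n", parse_ulong("42x", 100, &v));            /* -EINVAL */
    return 0;
}
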
@@ -2134,9 +2255,9 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2134 struct super_block *sb = sbi->s_buddy_cache->i_sb; 2255 struct super_block *sb = sbi->s_buddy_cache->i_sb;
2135 2256
2136 return snprintf(buf, PAGE_SIZE, "%llu\n", 2257 return snprintf(buf, PAGE_SIZE, "%llu\n",
2137 sbi->s_kbytes_written + 2258 (unsigned long long)(sbi->s_kbytes_written +
2138 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 2259 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2139 EXT4_SB(sb)->s_sectors_written_start) >> 1)); 2260 EXT4_SB(sb)->s_sectors_written_start) >> 1)));
2140} 2261}
2141 2262
2142static ssize_t inode_readahead_blks_store(struct ext4_attr *a, 2263static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
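
The added (unsigned long long) cast in lifetime_write_kbytes_show() keeps the %llu conversion honest: on 32-bit builds the summed expression need not be 64 bits wide, and passing a narrower integer to %llu is undefined behaviour. A tiny illustration (types chosen to show the hazard, not the kernel's exact ones):

#include <stdio.h>

int main(void)
{
    /* On 32-bit targets both of these are 32-bit, so without the cast
     * the sum would be passed to %llu at the wrong width. */
    unsigned long kbytes_written = 123456;
    unsigned long sectors_delta = 98765;

    printf("%llu\n",
           (unsigned long long)(kbytes_written + (sectors_delta >> 1)));
    return 0;
}
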
@@ -2251,7 +2372,7 @@ static void ext4_sb_release(struct kobject *kobj)
2251} 2372}
2252 2373
2253 2374
2254static struct sysfs_ops ext4_attr_ops = { 2375static const struct sysfs_ops ext4_attr_ops = {
2255 .show = ext4_attr_show, 2376 .show = ext4_attr_show,
2256 .store = ext4_attr_store, 2377 .store = ext4_attr_store,
2257}; 2378};
@@ -2391,8 +2512,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2391 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 2512 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
2392 if (def_mount_opts & EXT4_DEFM_DEBUG) 2513 if (def_mount_opts & EXT4_DEFM_DEBUG)
2393 set_opt(sbi->s_mount_opt, DEBUG); 2514 set_opt(sbi->s_mount_opt, DEBUG);
2394 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) 2515 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
2516 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
2517 "2.6.38");
2395 set_opt(sbi->s_mount_opt, GRPID); 2518 set_opt(sbi->s_mount_opt, GRPID);
2519 }
2396 if (def_mount_opts & EXT4_DEFM_UID16) 2520 if (def_mount_opts & EXT4_DEFM_UID16)
2397 set_opt(sbi->s_mount_opt, NO_UID32); 2521 set_opt(sbi->s_mount_opt, NO_UID32);
2398#ifdef CONFIG_EXT4_FS_XATTR 2522#ifdef CONFIG_EXT4_FS_XATTR
@@ -2404,11 +2528,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2404 set_opt(sbi->s_mount_opt, POSIX_ACL); 2528 set_opt(sbi->s_mount_opt, POSIX_ACL);
2405#endif 2529#endif
2406 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) 2530 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
2407 sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; 2531 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
2408 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) 2532 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
2409 sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA; 2533 set_opt(sbi->s_mount_opt, ORDERED_DATA);
2410 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) 2534 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
2411 sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA; 2535 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
2412 2536
2413 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) 2537 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
2414 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 2538 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
@@ -2429,14 +2553,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2429 * enable delayed allocation by default 2553 * enable delayed allocation by default
2430 * Use -o nodelalloc to turn it off 2554 * Use -o nodelalloc to turn it off
2431 */ 2555 */
2432 set_opt(sbi->s_mount_opt, DELALLOC); 2556 if (!IS_EXT3_SB(sb))
2557 set_opt(sbi->s_mount_opt, DELALLOC);
2433 2558
2434 if (!parse_options((char *) data, sb, &journal_devnum, 2559 if (!parse_options((char *) data, sb, &journal_devnum,
2435 &journal_ioprio, NULL, 0)) 2560 &journal_ioprio, NULL, 0))
2436 goto failed_mount; 2561 goto failed_mount;
2437 2562
2438 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 2563 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
2439 ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 2564 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
2440 2565
2441 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && 2566 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
2442 (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || 2567 (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
@@ -2721,31 +2846,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2721 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { 2846 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
2722 if (ext4_load_journal(sb, es, journal_devnum)) 2847 if (ext4_load_journal(sb, es, journal_devnum))
2723 goto failed_mount3; 2848 goto failed_mount3;
2724 if (!(sb->s_flags & MS_RDONLY) &&
2725 EXT4_SB(sb)->s_journal->j_failed_commit) {
2726 ext4_msg(sb, KERN_CRIT, "error: "
2727 "ext4_fill_super: Journal transaction "
2728 "%u is corrupt",
2729 EXT4_SB(sb)->s_journal->j_failed_commit);
2730 if (test_opt(sb, ERRORS_RO)) {
2731 ext4_msg(sb, KERN_CRIT,
2732 "Mounting filesystem read-only");
2733 sb->s_flags |= MS_RDONLY;
2734 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2735 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2736 }
2737 if (test_opt(sb, ERRORS_PANIC)) {
2738 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2739 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2740 ext4_commit_super(sb, 1);
2741 goto failed_mount4;
2742 }
2743 }
2744 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && 2849 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
2745 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { 2850 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
2746 ext4_msg(sb, KERN_ERR, "required journal recovery " 2851 ext4_msg(sb, KERN_ERR, "required journal recovery "
2747 "suppressed and not mounted read-only"); 2852 "suppressed and not mounted read-only");
2748 goto failed_mount4; 2853 goto failed_mount_wq;
2749 } else { 2854 } else {
2750 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 2855 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
2751 set_opt(sbi->s_mount_opt, WRITEBACK_DATA); 2856 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
@@ -2758,7 +2863,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2758 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, 2863 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
2759 JBD2_FEATURE_INCOMPAT_64BIT)) { 2864 JBD2_FEATURE_INCOMPAT_64BIT)) {
2760 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); 2865 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
2761 goto failed_mount4; 2866 goto failed_mount_wq;
2762 } 2867 }
2763 2868
2764 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 2869 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
@@ -2797,7 +2902,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2797 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { 2902 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
2798 ext4_msg(sb, KERN_ERR, "Journal does not support " 2903 ext4_msg(sb, KERN_ERR, "Journal does not support "
2799 "requested data journaling mode"); 2904 "requested data journaling mode");
2800 goto failed_mount4; 2905 goto failed_mount_wq;
2801 } 2906 }
2802 default: 2907 default:
2803 break; 2908 break;
@@ -2805,13 +2910,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2805 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 2910 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
2806 2911
2807no_journal: 2912no_journal:
2808
2809 if (test_opt(sb, NOBH)) { 2913 if (test_opt(sb, NOBH)) {
2810 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 2914 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
2811 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " 2915 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
2812 "its supported only with writeback mode"); 2916 "its supported only with writeback mode");
2813 clear_opt(sbi->s_mount_opt, NOBH); 2917 clear_opt(sbi->s_mount_opt, NOBH);
2814 } 2918 }
2919 if (test_opt(sb, DIOREAD_NOLOCK)) {
2920 ext4_msg(sb, KERN_WARNING, "dioread_nolock option is "
2921 "not supported with nobh mode");
2922 goto failed_mount_wq;
2923 }
2815 } 2924 }
2816 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 2925 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
2817 if (!EXT4_SB(sb)->dio_unwritten_wq) { 2926 if (!EXT4_SB(sb)->dio_unwritten_wq) {
@@ -2876,6 +2985,18 @@ no_journal:
2876 "requested data journaling mode"); 2985 "requested data journaling mode");
2877 clear_opt(sbi->s_mount_opt, DELALLOC); 2986 clear_opt(sbi->s_mount_opt, DELALLOC);
2878 } 2987 }
2988 if (test_opt(sb, DIOREAD_NOLOCK)) {
2989 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
2990 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
2991 "option - requested data journaling mode");
2992 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
2993 }
2994 if (sb->s_blocksize < PAGE_SIZE) {
2995 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
2996 "option - block size is too small");
2997 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
2998 }
2999 }
2879 3000
2880 err = ext4_setup_system_zone(sb); 3001 err = ext4_setup_system_zone(sb);
2881 if (err) { 3002 if (err) {
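
Together the two new checks in the hunk above drop dioread_nolock when it cannot work: with journaled data, or when the block size is below the page size. A compact model of the validation (constants are illustrative; the comments paraphrase the warning messages above):

#include <stdio.h>

enum { DATA_WRITEBACK, DATA_ORDERED, DATA_JOURNAL };

/* Returns 1 when dioread_nolock may stay enabled. */
static int dioread_nolock_ok(int data_mode, unsigned blocksize, unsigned page_size)
{
    if (data_mode == DATA_JOURNAL)
        return 0;               /* "requested data journaling mode" */
    if (blocksize < page_size)
        return 0;               /* "block size is too small" */
    return 1;
}

int main(void)
{
    printf("%d\n", dioread_nolock_ok(DATA_ORDERED, 4096, 4096)); /* 1 */
    printf("%d\n", dioread_nolock_ok(DATA_ORDERED, 1024, 4096)); /* 0 */
    printf("%d\n", dioread_nolock_ok(DATA_JOURNAL, 4096, 4096)); /* 0 */
    return 0;
}
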
@@ -3339,10 +3460,9 @@ static void ext4_clear_journal_err(struct super_block *sb,
3339 char nbuf[16]; 3460 char nbuf[16];
3340 3461
3341 errstr = ext4_decode_error(sb, j_errno, nbuf); 3462 errstr = ext4_decode_error(sb, j_errno, nbuf);
3342 ext4_warning(sb, __func__, "Filesystem error recorded " 3463 ext4_warning(sb, "Filesystem error recorded "
3343 "from previous mount: %s", errstr); 3464 "from previous mount: %s", errstr);
3344 ext4_warning(sb, __func__, "Marking fs in need of " 3465 ext4_warning(sb, "Marking fs in need of filesystem check.");
3345 "filesystem check.");
3346 3466
3347 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 3467 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
3348 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 3468 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
@@ -3493,7 +3613,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3493 ext4_abort(sb, __func__, "Abort forced by user"); 3613 ext4_abort(sb, __func__, "Abort forced by user");
3494 3614
3495 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 3615 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
3496 ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 3616 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
3497 3617
3498 es = sbi->s_es; 3618 es = sbi->s_es;
3499 3619
@@ -3668,13 +3788,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3668 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 3788 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
3669 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - 3789 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
3670 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); 3790 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
3671 ext4_free_blocks_count_set(es, buf->f_bfree);
3672 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 3791 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
3673 if (buf->f_bfree < ext4_r_blocks_count(es)) 3792 if (buf->f_bfree < ext4_r_blocks_count(es))
3674 buf->f_bavail = 0; 3793 buf->f_bavail = 0;
3675 buf->f_files = le32_to_cpu(es->s_inodes_count); 3794 buf->f_files = le32_to_cpu(es->s_inodes_count);
3676 buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); 3795 buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
3677 es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
3678 buf->f_namelen = EXT4_NAME_LEN; 3796 buf->f_namelen = EXT4_NAME_LEN;
3679 fsid = le64_to_cpup((void *)es->s_uuid) ^ 3797 fsid = le64_to_cpup((void *)es->s_uuid) ^
3680 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 3798 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
@@ -3689,7 +3807,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3689 * Process 1 Process 2 3807 * Process 1 Process 2
3690 * ext4_create() quota_sync() 3808 * ext4_create() quota_sync()
3691 * jbd2_journal_start() write_dquot() 3809 * jbd2_journal_start() write_dquot()
3692 * vfs_dq_init() down(dqio_mutex) 3810 * dquot_initialize() down(dqio_mutex)
3693 * down(dqio_mutex) jbd2_journal_start() 3811 * down(dqio_mutex) jbd2_journal_start()
3694 * 3812 *
3695 */ 3813 */
@@ -3898,9 +4016,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3898 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); 4016 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
3899 int err = 0; 4017 int err = 0;
3900 int offset = off & (sb->s_blocksize - 1); 4018 int offset = off & (sb->s_blocksize - 1);
3901 int tocopy;
3902 int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL; 4019 int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
3903 size_t towrite = len;
3904 struct buffer_head *bh; 4020 struct buffer_head *bh;
3905 handle_t *handle = journal_current_handle(); 4021 handle_t *handle = journal_current_handle();
3906 4022
@@ -3910,52 +4026,53 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3910 (unsigned long long)off, (unsigned long long)len); 4026 (unsigned long long)off, (unsigned long long)len);
3911 return -EIO; 4027 return -EIO;
3912 } 4028 }
4029 /*
4030 * Since we account for only one data block in the transaction
4031 * credits, it is impossible to cross a block boundary.
4032 */
4033 if (sb->s_blocksize - offset < len) {
4034 ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
4035 " cancelled because not block aligned",
4036 (unsigned long long)off, (unsigned long long)len);
4037 return -EIO;
4038 }
4039
3913 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); 4040 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
3914 while (towrite > 0) { 4041 bh = ext4_bread(handle, inode, blk, 1, &err);
3915 tocopy = sb->s_blocksize - offset < towrite ? 4042 if (!bh)
3916 sb->s_blocksize - offset : towrite; 4043 goto out;
3917 bh = ext4_bread(handle, inode, blk, 1, &err); 4044 if (journal_quota) {
3918 if (!bh) 4045 err = ext4_journal_get_write_access(handle, bh);
4046 if (err) {
4047 brelse(bh);
3919 goto out; 4048 goto out;
3920 if (journal_quota) {
3921 err = ext4_journal_get_write_access(handle, bh);
3922 if (err) {
3923 brelse(bh);
3924 goto out;
3925 }
3926 }
3927 lock_buffer(bh);
3928 memcpy(bh->b_data+offset, data, tocopy);
3929 flush_dcache_page(bh->b_page);
3930 unlock_buffer(bh);
3931 if (journal_quota)
3932 err = ext4_handle_dirty_metadata(handle, NULL, bh);
3933 else {
3934 /* Always do at least ordered writes for quotas */
3935 err = ext4_jbd2_file_inode(handle, inode);
3936 mark_buffer_dirty(bh);
3937 } 4049 }
3938 brelse(bh);
3939 if (err)
3940 goto out;
3941 offset = 0;
3942 towrite -= tocopy;
3943 data += tocopy;
3944 blk++;
3945 } 4050 }
4051 lock_buffer(bh);
4052 memcpy(bh->b_data+offset, data, len);
4053 flush_dcache_page(bh->b_page);
4054 unlock_buffer(bh);
4055 if (journal_quota)
4056 err = ext4_handle_dirty_metadata(handle, NULL, bh);
4057 else {
4058 /* Always do at least ordered writes for quotas */
4059 err = ext4_jbd2_file_inode(handle, inode);
4060 mark_buffer_dirty(bh);
4061 }
4062 brelse(bh);
3946out: 4063out:
3947 if (len == towrite) { 4064 if (err) {
3948 mutex_unlock(&inode->i_mutex); 4065 mutex_unlock(&inode->i_mutex);
3949 return err; 4066 return err;
3950 } 4067 }
3951 if (inode->i_size < off+len-towrite) { 4068 if (inode->i_size < off + len) {
3952 i_size_write(inode, off+len-towrite); 4069 i_size_write(inode, off + len);
3953 EXT4_I(inode)->i_disksize = inode->i_size; 4070 EXT4_I(inode)->i_disksize = inode->i_size;
3954 } 4071 }
3955 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 4072 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
3956 ext4_mark_inode_dirty(handle, inode); 4073 ext4_mark_inode_dirty(handle, inode);
3957 mutex_unlock(&inode->i_mutex); 4074 mutex_unlock(&inode->i_mutex);
3958 return len - towrite; 4075 return len;
3959} 4076}
3960 4077
3961#endif 4078#endif
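
The rewritten ext4_quota_write() above reserves credits for a single data block, so it now refuses any write that would span a block boundary and drops the old tocopy/towrite loop in favour of one copy. A userspace model of that control flow (a byte array stands in for the buffer_head; blocksize is assumed to be a power of two, as in ext4):

#include <stdio.h>
#include <string.h>

static long quota_write(unsigned char *block, size_t blocksize,
                        size_t off, const void *data, size_t len)
{
    size_t offset = off & (blocksize - 1);

    if (blocksize - offset < len)
        return -1;               /* would cross a block boundary: -EIO */
    memcpy(block + offset, data, len);
    return (long)len;
}

int main(void)
{
    unsigned char blk[4096];

    printf("%ld\n", quota_write(blk, sizeof blk, 100, "x", 1));   /* 1 */
    printf("%ld\n", quota_write(blk, sizeof blk, 4095, "xy", 2)); /* -1 */
    return 0;
}
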
@@ -3966,6 +4083,52 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags,
3966 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); 4083
3967} 4084}
3968 4085
4086#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4087static struct file_system_type ext2_fs_type = {
4088 .owner = THIS_MODULE,
4089 .name = "ext2",
4090 .get_sb = ext4_get_sb,
4091 .kill_sb = kill_block_super,
4092 .fs_flags = FS_REQUIRES_DEV,
4093};
4094
4095static inline void register_as_ext2(void)
4096{
4097 int err = register_filesystem(&ext2_fs_type);
4098 if (err)
4099 printk(KERN_WARNING
4100 "EXT4-fs: Unable to register as ext2 (%d)\n", err);
4101}
4102
4103static inline void unregister_as_ext2(void)
4104{
4105 unregister_filesystem(&ext2_fs_type);
4106}
4107MODULE_ALIAS("ext2");
4108#else
4109static inline void register_as_ext2(void) { }
4110static inline void unregister_as_ext2(void) { }
4111#endif
4112
4113#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4114static inline void register_as_ext3(void)
4115{
4116 int err = register_filesystem(&ext3_fs_type);
4117 if (err)
4118 printk(KERN_WARNING
4119 "EXT4-fs: Unable to register as ext3 (%d)\n", err);
4120}
4121
4122static inline void unregister_as_ext3(void)
4123{
4124 unregister_filesystem(&ext3_fs_type);
4125}
4126MODULE_ALIAS("ext3");
4127#else
4128static inline void register_as_ext3(void) { }
4129static inline void unregister_as_ext3(void) { }
4130#endif
4131
3969static struct file_system_type ext4_fs_type = { 4132static struct file_system_type ext4_fs_type = {
3970 .owner = THIS_MODULE, 4133 .owner = THIS_MODULE,
3971 .name = "ext4", 4134 .name = "ext4",
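
register_as_ext2()/register_as_ext3() compile away to empty inline stubs unless CONFIG_EXT4_USE_FOR_EXT23 is set and the native ext2/ext3 drivers are absent, which lets one module serve three filesystem names. A minimal model of the pattern (the macro stands in for the real Kconfig test):

#include <stdio.h>

#define USE_FOR_EXT23 1    /* stands in for the CONFIG_EXT4_USE_FOR_EXT23 test */

#if USE_FOR_EXT23
static void register_as_ext2(void)   { puts("registered ext2 alias"); }
static void unregister_as_ext2(void) { puts("unregistered ext2 alias"); }
#else
static void register_as_ext2(void)   { }
static void unregister_as_ext2(void) { }
#endif

int main(void)
{
    register_as_ext2();     /* module init */
    unregister_as_ext2();   /* error unwind and module exit must mirror it */
    return 0;
}
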
@@ -3995,11 +4158,15 @@ static int __init init_ext4_fs(void)
3995 err = init_inodecache(); 4158 err = init_inodecache();
3996 if (err) 4159 if (err)
3997 goto out1; 4160 goto out1;
4161 register_as_ext2();
4162 register_as_ext3();
3998 err = register_filesystem(&ext4_fs_type); 4163 err = register_filesystem(&ext4_fs_type);
3999 if (err) 4164 if (err)
4000 goto out; 4165 goto out;
4001 return 0; 4166 return 0;
4002out: 4167out:
4168 unregister_as_ext2();
4169 unregister_as_ext3();
4003 destroy_inodecache(); 4170 destroy_inodecache();
4004out1: 4171out1:
4005 exit_ext4_xattr(); 4172 exit_ext4_xattr();
@@ -4015,6 +4182,8 @@ out4:
4015 4182
4016static void __exit exit_ext4_fs(void) 4183static void __exit exit_ext4_fs(void)
4017{ 4184{
4185 unregister_as_ext2();
4186 unregister_as_ext3();
4018 unregister_filesystem(&ext4_fs_type); 4187 unregister_filesystem(&ext4_fs_type);
4019 destroy_inodecache(); 4188 destroy_inodecache();
4020 exit_ext4_xattr(); 4189 exit_ext4_xattr();
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fed5b01d7a8d..b4c5aa8489d8 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -92,7 +92,7 @@ static struct buffer_head *ext4_xattr_cache_find(struct inode *,
92 struct mb_cache_entry **); 92 struct mb_cache_entry **);
93static void ext4_xattr_rehash(struct ext4_xattr_header *, 93static void ext4_xattr_rehash(struct ext4_xattr_header *,
94 struct ext4_xattr_entry *); 94 struct ext4_xattr_entry *);
95static int ext4_xattr_list(struct inode *inode, char *buffer, 95static int ext4_xattr_list(struct dentry *dentry, char *buffer,
96 size_t buffer_size); 96 size_t buffer_size);
97 97
98static struct mb_cache *ext4_xattr_cache; 98static struct mb_cache *ext4_xattr_cache;
@@ -140,7 +140,7 @@ ext4_xattr_handler(int name_index)
140ssize_t 140ssize_t
141ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) 141ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
142{ 142{
143 return ext4_xattr_list(dentry->d_inode, buffer, size); 143 return ext4_xattr_list(dentry, buffer, size);
144} 144}
145 145
146static int 146static int
@@ -227,7 +227,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
227 ea_bdebug(bh, "b_count=%d, refcount=%d", 227 ea_bdebug(bh, "b_count=%d, refcount=%d",
228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
229 if (ext4_xattr_check_block(bh)) { 229 if (ext4_xattr_check_block(bh)) {
230bad_block: ext4_error(inode->i_sb, __func__, 230bad_block:
231 ext4_error(inode->i_sb,
231 "inode %lu: bad block %llu", inode->i_ino, 232 "inode %lu: bad block %llu", inode->i_ino,
232 EXT4_I(inode)->i_file_acl); 233 EXT4_I(inode)->i_file_acl);
233 error = -EIO; 234 error = -EIO;
@@ -267,7 +268,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
267 void *end; 268 void *end;
268 int error; 269 int error;
269 270
270 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) 271 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
271 return -ENODATA; 272 return -ENODATA;
272 error = ext4_get_inode_loc(inode, &iloc); 273 error = ext4_get_inode_loc(inode, &iloc);
273 if (error) 274 if (error)
@@ -325,7 +326,7 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name,
325} 326}
326 327
327static int 328static int
328ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry, 329ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
329 char *buffer, size_t buffer_size) 330 char *buffer, size_t buffer_size)
330{ 331{
331 size_t rest = buffer_size; 332 size_t rest = buffer_size;
@@ -335,9 +336,10 @@ ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry,
335 ext4_xattr_handler(entry->e_name_index); 336 ext4_xattr_handler(entry->e_name_index);
336 337
337 if (handler) { 338 if (handler) {
338 size_t size = handler->list(inode, buffer, rest, 339 size_t size = handler->list(dentry, buffer, rest,
339 entry->e_name, 340 entry->e_name,
340 entry->e_name_len); 341 entry->e_name_len,
342 handler->flags);
341 if (buffer) { 343 if (buffer) {
342 if (size > rest) 344 if (size > rest)
343 return -ERANGE; 345 return -ERANGE;
@@ -350,8 +352,9 @@ ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry,
350} 352}
351 353
352static int 354static int
353ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) 355ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
354{ 356{
357 struct inode *inode = dentry->d_inode;
355 struct buffer_head *bh = NULL; 358 struct buffer_head *bh = NULL;
356 int error; 359 int error;
357 360
@@ -369,14 +372,14 @@ ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
369 ea_bdebug(bh, "b_count=%d, refcount=%d", 372 ea_bdebug(bh, "b_count=%d, refcount=%d",
370 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 373 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
371 if (ext4_xattr_check_block(bh)) { 374 if (ext4_xattr_check_block(bh)) {
372 ext4_error(inode->i_sb, __func__, 375 ext4_error(inode->i_sb,
373 "inode %lu: bad block %llu", inode->i_ino, 376 "inode %lu: bad block %llu", inode->i_ino,
374 EXT4_I(inode)->i_file_acl); 377 EXT4_I(inode)->i_file_acl);
375 error = -EIO; 378 error = -EIO;
376 goto cleanup; 379 goto cleanup;
377 } 380 }
378 ext4_xattr_cache_insert(bh); 381 ext4_xattr_cache_insert(bh);
379 error = ext4_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size); 382 error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
380 383
381cleanup: 384cleanup:
382 brelse(bh); 385 brelse(bh);
@@ -385,15 +388,16 @@ cleanup:
385} 388}
386 389
387static int 390static int
388ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) 391ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
389{ 392{
393 struct inode *inode = dentry->d_inode;
390 struct ext4_xattr_ibody_header *header; 394 struct ext4_xattr_ibody_header *header;
391 struct ext4_inode *raw_inode; 395 struct ext4_inode *raw_inode;
392 struct ext4_iloc iloc; 396 struct ext4_iloc iloc;
393 void *end; 397 void *end;
394 int error; 398 int error;
395 399
396 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) 400 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
397 return 0; 401 return 0;
398 error = ext4_get_inode_loc(inode, &iloc); 402 error = ext4_get_inode_loc(inode, &iloc);
399 if (error) 403 if (error)
@@ -404,7 +408,7 @@ ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
404 error = ext4_xattr_check_names(IFIRST(header), end); 408 error = ext4_xattr_check_names(IFIRST(header), end);
405 if (error) 409 if (error)
406 goto cleanup; 410 goto cleanup;
407 error = ext4_xattr_list_entries(inode, IFIRST(header), 411 error = ext4_xattr_list_entries(dentry, IFIRST(header),
408 buffer, buffer_size); 412 buffer, buffer_size);
409 413
410cleanup: 414cleanup:
@@ -423,12 +427,12 @@ cleanup:
423 * used / required on success. 427 * used / required on success.
424 */ 428 */
425static int 429static int
426ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) 430ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
427{ 431{
428 int i_error, b_error; 432 int i_error, b_error;
429 433
430 down_read(&EXT4_I(inode)->xattr_sem); 434 down_read(&EXT4_I(dentry->d_inode)->xattr_sem);
431 i_error = ext4_xattr_ibody_list(inode, buffer, buffer_size); 435 i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
432 if (i_error < 0) { 436 if (i_error < 0) {
433 b_error = 0; 437 b_error = 0;
434 } else { 438 } else {
@@ -436,11 +440,11 @@ ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
436 buffer += i_error; 440 buffer += i_error;
437 buffer_size -= i_error; 441 buffer_size -= i_error;
438 } 442 }
439 b_error = ext4_xattr_block_list(inode, buffer, buffer_size); 443 b_error = ext4_xattr_block_list(dentry, buffer, buffer_size);
440 if (b_error < 0) 444 if (b_error < 0)
441 i_error = 0; 445 i_error = 0;
442 } 446 }
443 up_read(&EXT4_I(inode)->xattr_sem); 447 up_read(&EXT4_I(dentry->d_inode)->xattr_sem);
444 return i_error + b_error; 448 return i_error + b_error;
445} 449}
446 450
@@ -482,15 +486,16 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
482 ea_bdebug(bh, "refcount now=0; freeing"); 486 ea_bdebug(bh, "refcount now=0; freeing");
483 if (ce) 487 if (ce)
484 mb_cache_entry_free(ce); 488 mb_cache_entry_free(ce);
485 ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
486 get_bh(bh); 489 get_bh(bh);
487 ext4_forget(handle, 1, inode, bh, bh->b_blocknr); 490 ext4_free_blocks(handle, inode, bh, 0, 1,
491 EXT4_FREE_BLOCKS_METADATA |
492 EXT4_FREE_BLOCKS_FORGET);
488 } else { 493 } else {
489 le32_add_cpu(&BHDR(bh)->h_refcount, -1); 494 le32_add_cpu(&BHDR(bh)->h_refcount, -1);
490 error = ext4_handle_dirty_metadata(handle, inode, bh); 495 error = ext4_handle_dirty_metadata(handle, inode, bh);
491 if (IS_SYNC(inode)) 496 if (IS_SYNC(inode))
492 ext4_handle_sync(handle); 497 ext4_handle_sync(handle);
493 vfs_dq_free_block(inode, 1); 498 dquot_free_block(inode, 1);
494 ea_bdebug(bh, "refcount now=%d; releasing", 499 ea_bdebug(bh, "refcount now=%d; releasing",
495 le32_to_cpu(BHDR(bh)->h_refcount)); 500 le32_to_cpu(BHDR(bh)->h_refcount));
496 if (ce) 501 if (ce)
@@ -661,9 +666,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
661 atomic_read(&(bs->bh->b_count)), 666 atomic_read(&(bs->bh->b_count)),
662 le32_to_cpu(BHDR(bs->bh)->h_refcount)); 667 le32_to_cpu(BHDR(bs->bh)->h_refcount));
663 if (ext4_xattr_check_block(bs->bh)) { 668 if (ext4_xattr_check_block(bs->bh)) {
664 ext4_error(sb, __func__, 669 ext4_error(sb, "inode %lu: bad block %llu",
665 "inode %lu: bad block %llu", inode->i_ino, 670 inode->i_ino, EXT4_I(inode)->i_file_acl);
666 EXT4_I(inode)->i_file_acl);
667 error = -EIO; 671 error = -EIO;
668 goto cleanup; 672 goto cleanup;
669 } 673 }
@@ -783,8 +787,8 @@ inserted:
783 else { 787 else {
784 /* The old block is released after updating 788 /* The old block is released after updating
785 the inode. */ 789 the inode. */
786 error = -EDQUOT; 790 error = dquot_alloc_block(inode, 1);
787 if (vfs_dq_alloc_block(inode, 1)) 791 if (error)
788 goto cleanup; 792 goto cleanup;
789 error = ext4_journal_get_write_access(handle, 793 error = ext4_journal_get_write_access(handle,
790 new_bh); 794 new_bh);
@@ -832,7 +836,8 @@ inserted:
832 new_bh = sb_getblk(sb, block); 836 new_bh = sb_getblk(sb, block);
833 if (!new_bh) { 837 if (!new_bh) {
834getblk_failed: 838getblk_failed:
835 ext4_free_blocks(handle, inode, block, 1, 1); 839 ext4_free_blocks(handle, inode, 0, block, 1,
840 EXT4_FREE_BLOCKS_METADATA);
836 error = -EIO; 841 error = -EIO;
837 goto cleanup; 842 goto cleanup;
838 } 843 }
@@ -871,13 +876,12 @@ cleanup:
871 return error; 876 return error;
872 877
873cleanup_dquot: 878cleanup_dquot:
874 vfs_dq_free_block(inode, 1); 879 dquot_free_block(inode, 1);
875 goto cleanup; 880 goto cleanup;
876 881
877bad_block: 882bad_block:
878 ext4_error(inode->i_sb, __func__, 883 ext4_error(inode->i_sb, "inode %lu: bad block %llu",
879 "inode %lu: bad block %llu", inode->i_ino, 884 inode->i_ino, EXT4_I(inode)->i_file_acl);
880 EXT4_I(inode)->i_file_acl);
881 goto cleanup; 885 goto cleanup;
882 886
883#undef header 887#undef header
@@ -903,7 +907,7 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
903 is->s.base = is->s.first = IFIRST(header); 907 is->s.base = is->s.first = IFIRST(header);
904 is->s.here = is->s.first; 908 is->s.here = is->s.first;
905 is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; 909 is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
906 if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { 910 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
907 error = ext4_xattr_check_names(IFIRST(header), is->s.end); 911 error = ext4_xattr_check_names(IFIRST(header), is->s.end);
908 if (error) 912 if (error)
909 return error; 913 return error;
@@ -935,10 +939,10 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
935 header = IHDR(inode, ext4_raw_inode(&is->iloc)); 939 header = IHDR(inode, ext4_raw_inode(&is->iloc));
936 if (!IS_LAST_ENTRY(s->first)) { 940 if (!IS_LAST_ENTRY(s->first)) {
937 header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); 941 header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
938 EXT4_I(inode)->i_state |= EXT4_STATE_XATTR; 942 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
939 } else { 943 } else {
940 header->h_magic = cpu_to_le32(0); 944 header->h_magic = cpu_to_le32(0);
941 EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR; 945 ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
942 } 946 }
943 return 0; 947 return 0;
944} 948}
@@ -981,17 +985,21 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
981 if (strlen(name) > 255) 985 if (strlen(name) > 255)
982 return -ERANGE; 986 return -ERANGE;
983 down_write(&EXT4_I(inode)->xattr_sem); 987 down_write(&EXT4_I(inode)->xattr_sem);
984 no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND; 988 no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
985 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; 989 ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
986 990
987 error = ext4_get_inode_loc(inode, &is.iloc); 991 error = ext4_get_inode_loc(inode, &is.iloc);
988 if (error) 992 if (error)
989 goto cleanup; 993 goto cleanup;
990 994
991 if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { 995 error = ext4_journal_get_write_access(handle, is.iloc.bh);
996 if (error)
997 goto cleanup;
998
999 if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) {
992 struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); 1000 struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
993 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 1001 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
994 EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW; 1002 ext4_clear_inode_state(inode, EXT4_STATE_NEW);
995 } 1003 }
996 1004
997 error = ext4_xattr_ibody_find(inode, &i, &is); 1005 error = ext4_xattr_ibody_find(inode, &i, &is);
@@ -1013,9 +1021,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1013 if (flags & XATTR_CREATE) 1021 if (flags & XATTR_CREATE)
1014 goto cleanup; 1022 goto cleanup;
1015 } 1023 }
1016 error = ext4_journal_get_write_access(handle, is.iloc.bh);
1017 if (error)
1018 goto cleanup;
1019 if (!value) { 1024 if (!value) {
1020 if (!is.s.not_found) 1025 if (!is.s.not_found)
1021 error = ext4_xattr_ibody_set(handle, inode, &i, &is); 1026 error = ext4_xattr_ibody_set(handle, inode, &i, &is);
@@ -1046,7 +1051,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1046 ext4_xattr_update_super_block(handle, inode->i_sb); 1051 ext4_xattr_update_super_block(handle, inode->i_sb);
1047 inode->i_ctime = ext4_current_time(inode); 1052 inode->i_ctime = ext4_current_time(inode);
1048 if (!value) 1053 if (!value)
1049 EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; 1054 ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
1050 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); 1055 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
1051 /* 1056 /*
1052 * The bh is consumed by ext4_mark_iloc_dirty, even with 1057 * The bh is consumed by ext4_mark_iloc_dirty, even with
@@ -1061,7 +1066,7 @@ cleanup:
1061 brelse(is.iloc.bh); 1066 brelse(is.iloc.bh);
1062 brelse(bs.bh); 1067 brelse(bs.bh);
1063 if (no_expand == 0) 1068 if (no_expand == 0)
1064 EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; 1069 ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
1065 up_write(&EXT4_I(inode)->xattr_sem); 1070 up_write(&EXT4_I(inode)->xattr_sem);
1066 return error; 1071 return error;
1067} 1072}
@@ -1189,9 +1194,8 @@ retry:
1189 if (!bh) 1194 if (!bh)
1190 goto cleanup; 1195 goto cleanup;
1191 if (ext4_xattr_check_block(bh)) { 1196 if (ext4_xattr_check_block(bh)) {
1192 ext4_error(inode->i_sb, __func__, 1197 ext4_error(inode->i_sb, "inode %lu: bad block %llu",
1193 "inode %lu: bad block %llu", inode->i_ino, 1198 inode->i_ino, EXT4_I(inode)->i_file_acl);
1194 EXT4_I(inode)->i_file_acl);
1195 error = -EIO; 1199 error = -EIO;
1196 goto cleanup; 1200 goto cleanup;
1197 } 1201 }
@@ -1296,6 +1300,8 @@ retry:
1296 1300
1297 /* Remove the chosen entry from the inode */ 1301 /* Remove the chosen entry from the inode */
1298 error = ext4_xattr_ibody_set(handle, inode, &i, is); 1302 error = ext4_xattr_ibody_set(handle, inode, &i, is);
1303 if (error)
1304 goto cleanup;
1299 1305
1300 entry = IFIRST(header); 1306 entry = IFIRST(header);
1301 if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) 1307 if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
@@ -1326,6 +1332,8 @@ retry:
1326 goto cleanup; 1332 goto cleanup;
1327 kfree(b_entry_name); 1333 kfree(b_entry_name);
1328 kfree(buffer); 1334 kfree(buffer);
1335 b_entry_name = NULL;
1336 buffer = NULL;
1329 brelse(is->iloc.bh); 1337 brelse(is->iloc.bh);
1330 kfree(is); 1338 kfree(is);
1331 kfree(bs); 1339 kfree(bs);
@@ -1364,16 +1372,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
1364 goto cleanup; 1372 goto cleanup;
1365 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 1373 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1366 if (!bh) { 1374 if (!bh) {
1367 ext4_error(inode->i_sb, __func__, 1375 ext4_error(inode->i_sb, "inode %lu: block %llu read error",
1368 "inode %lu: block %llu read error", inode->i_ino, 1376 inode->i_ino, EXT4_I(inode)->i_file_acl);
1369 EXT4_I(inode)->i_file_acl);
1370 goto cleanup; 1377 goto cleanup;
1371 } 1378 }
1372 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || 1379 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
1373 BHDR(bh)->h_blocks != cpu_to_le32(1)) { 1380 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1374 ext4_error(inode->i_sb, __func__, 1381 ext4_error(inode->i_sb, "inode %lu: bad block %llu",
1375 "inode %lu: bad block %llu", inode->i_ino, 1382 inode->i_ino, EXT4_I(inode)->i_file_acl);
1376 EXT4_I(inode)->i_file_acl);
1377 goto cleanup; 1383 goto cleanup;
1378 } 1384 }
1379 ext4_xattr_release_block(handle, inode, bh); 1385 ext4_xattr_release_block(handle, inode, bh);
@@ -1498,7 +1504,7 @@ again:
1498 } 1504 }
1499 bh = sb_bread(inode->i_sb, ce->e_block); 1505 bh = sb_bread(inode->i_sb, ce->e_block);
1500 if (!bh) { 1506 if (!bh) {
1501 ext4_error(inode->i_sb, __func__, 1507 ext4_error(inode->i_sb,
1502 "inode %lu: block %lu read error", 1508 "inode %lu: block %lu read error",
1503 inode->i_ino, (unsigned long) ce->e_block); 1509 inode->i_ino, (unsigned long) ce->e_block);
1504 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= 1510 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index ca5f89fc6cae..8b145e98df07 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -7,13 +7,14 @@
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/security.h> 9#include <linux/security.h>
10#include <linux/slab.h>
10#include "ext4_jbd2.h" 11#include "ext4_jbd2.h"
11#include "ext4.h" 12#include "ext4.h"
12#include "xattr.h" 13#include "xattr.h"
13 14
14static size_t 15static size_t
15ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size, 16ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
16 const char *name, size_t name_len) 17 const char *name, size_t name_len, int type)
17{ 18{
18 const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; 19 const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
19 const size_t total_len = prefix_len + name_len + 1; 20 const size_t total_len = prefix_len + name_len + 1;
@@ -28,23 +29,23 @@ ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size,
28} 29}
29 30
30static int 31static int
31ext4_xattr_security_get(struct inode *inode, const char *name, 32ext4_xattr_security_get(struct dentry *dentry, const char *name,
32 void *buffer, size_t size) 33 void *buffer, size_t size, int type)
33{ 34{
34 if (strcmp(name, "") == 0) 35 if (strcmp(name, "") == 0)
35 return -EINVAL; 36 return -EINVAL;
36 return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, name, 37 return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY,
37 buffer, size); 38 name, buffer, size);
38} 39}
39 40
40static int 41static int
41ext4_xattr_security_set(struct inode *inode, const char *name, 42ext4_xattr_security_set(struct dentry *dentry, const char *name,
42 const void *value, size_t size, int flags) 43 const void *value, size_t size, int flags, int type)
43{ 44{
44 if (strcmp(name, "") == 0) 45 if (strcmp(name, "") == 0)
45 return -EINVAL; 46 return -EINVAL;
46 return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name, 47 return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY,
47 value, size, flags); 48 name, value, size, flags);
48} 49}
49 50
50int 51int
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index ac1a52cf2a37..15b50edc6587 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -14,8 +14,8 @@
14#include "xattr.h" 14#include "xattr.h"
15 15
16static size_t 16static size_t
17ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, 17ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
18 const char *name, size_t name_len) 18 const char *name, size_t name_len, int type)
19{ 19{
20 const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; 20 const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
21 const size_t total_len = prefix_len + name_len + 1; 21 const size_t total_len = prefix_len + name_len + 1;
@@ -32,23 +32,23 @@ ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
32} 32}
33 33
34static int 34static int
35ext4_xattr_trusted_get(struct inode *inode, const char *name, 35ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer,
36 void *buffer, size_t size) 36 size_t size, int type)
37{ 37{
38 if (strcmp(name, "") == 0) 38 if (strcmp(name, "") == 0)
39 return -EINVAL; 39 return -EINVAL;
40 return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED, name, 40 return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED,
41 buffer, size); 41 name, buffer, size);
42} 42}
43 43
44static int 44static int
45ext4_xattr_trusted_set(struct inode *inode, const char *name, 45ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
46 const void *value, size_t size, int flags) 46 const void *value, size_t size, int flags, int type)
47{ 47{
48 if (strcmp(name, "") == 0) 48 if (strcmp(name, "") == 0)
49 return -EINVAL; 49 return -EINVAL;
50 return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name, 50 return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED,
51 value, size, flags); 51 name, value, size, flags);
52} 52}
53 53
54struct xattr_handler ext4_xattr_trusted_handler = { 54struct xattr_handler ext4_xattr_trusted_handler = {
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index d91aa61b42aa..c4ce05746ce1 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -13,13 +13,13 @@
13#include "xattr.h" 13#include "xattr.h"
14 14
15static size_t 15static size_t
16ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, 16ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
17 const char *name, size_t name_len) 17 const char *name, size_t name_len, int type)
18{ 18{
19 const size_t prefix_len = XATTR_USER_PREFIX_LEN; 19 const size_t prefix_len = XATTR_USER_PREFIX_LEN;
20 const size_t total_len = prefix_len + name_len + 1; 20 const size_t total_len = prefix_len + name_len + 1;
21 21
22 if (!test_opt(inode->i_sb, XATTR_USER)) 22 if (!test_opt(dentry->d_sb, XATTR_USER))
23 return 0; 23 return 0;
24 24
25 if (list && total_len <= list_size) { 25 if (list && total_len <= list_size) {
@@ -31,26 +31,27 @@ ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
31} 31}
32 32
33static int 33static int
34ext4_xattr_user_get(struct inode *inode, const char *name, 34ext4_xattr_user_get(struct dentry *dentry, const char *name,
35 void *buffer, size_t size) 35 void *buffer, size_t size, int type)
36{ 36{
37 if (strcmp(name, "") == 0) 37 if (strcmp(name, "") == 0)
38 return -EINVAL; 38 return -EINVAL;
39 if (!test_opt(inode->i_sb, XATTR_USER)) 39 if (!test_opt(dentry->d_sb, XATTR_USER))
40 return -EOPNOTSUPP; 40 return -EOPNOTSUPP;
41 return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, name, buffer, size); 41 return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_USER,
42 name, buffer, size);
42} 43}
43 44
44static int 45static int
45ext4_xattr_user_set(struct inode *inode, const char *name, 46ext4_xattr_user_set(struct dentry *dentry, const char *name,
46 const void *value, size_t size, int flags) 47 const void *value, size_t size, int flags, int type)
47{ 48{
48 if (strcmp(name, "") == 0) 49 if (strcmp(name, "") == 0)
49 return -EINVAL; 50 return -EINVAL;
50 if (!test_opt(inode->i_sb, XATTR_USER)) 51 if (!test_opt(dentry->d_sb, XATTR_USER))
51 return -EOPNOTSUPP; 52 return -EOPNOTSUPP;
52 return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name, 53 return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_USER,
53 value, size, flags); 54 name, value, size, flags);
54} 55}
55 56
56struct xattr_handler ext4_xattr_user_handler = { 57struct xattr_handler ext4_xattr_user_handler = {
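
All of the xattr hunks above follow one interface change: the handler callbacks now receive the dentry plus a handler-specific type/flags argument instead of a bare inode. A simplified userspace model of the new shape (toy struct definitions, not the kernel's):

#include <stdio.h>

struct dentry { const char *d_name; };       /* toy stand-in */

struct xattr_handler {
    const char *prefix;
    int flags;                               /* passed back as "type" */
    int (*get)(struct dentry *dentry, const char *name,
               void *buffer, size_t size, int type);
};

static int demo_get(struct dentry *dentry, const char *name,
                    void *buffer, size_t size, int type)
{
    (void)type;
    return snprintf(buffer, size, "%s: %s", dentry->d_name, name);
}

static const struct xattr_handler demo_handler = {
    .prefix = "user.",
    .get    = demo_get,
};

int main(void)
{
    struct dentry d = { "somefile" };
    char buf[64];

    demo_handler.get(&d, "comment", buf, sizeof buf, demo_handler.flags);
    puts(buf);
    return 0;
}
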
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 923990e4f16e..113f0a1e565d 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/slab.h>
12#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
13#include "fat.h" 14#include "fat.h"
14 15
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 7db0979c6b72..e6efdfa0f6db 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -44,7 +44,8 @@ struct fat_mount_options {
44 nocase:1, /* Does this need case conversion? 0=need case conversion*/ 44 nocase:1, /* Does this need case conversion? 0=need case conversion*/
45 usefree:1, /* Use free_clusters for FAT32 */ 45 usefree:1, /* Use free_clusters for FAT32 */
46 tz_utc:1, /* Filesystem timestamps are in UTC */ 46 tz_utc:1, /* Filesystem timestamps are in UTC */
47 rodir:1; /* allow ATTR_RO for directory */ 47 rodir:1, /* allow ATTR_RO for directory */
48 discard:1; /* Issue discard requests on deletions */
48}; 49};
49 50
50#define FAT_HASH_BITS 8 51#define FAT_HASH_BITS 8
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index a81037721a6f..81184d3b75a3 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -566,16 +566,21 @@ int fat_free_clusters(struct inode *inode, int cluster)
566 goto error; 566 goto error;
567 } 567 }
568 568
569 /* 569 if (sbi->options.discard) {
570 * Issue discard for the sectors we no longer care about, 570 /*
571 * batching contiguous clusters into one request 571 * Issue discard for the sectors we no longer
572 */ 572 * care about, batching contiguous clusters
573 if (cluster != fatent.entry + 1) { 573 * into one request
574 int nr_clus = fatent.entry - first_cl + 1; 574 */
575 575 if (cluster != fatent.entry + 1) {
576 sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl), 576 int nr_clus = fatent.entry - first_cl + 1;
577 nr_clus * sbi->sec_per_clus); 577
578 first_cl = cluster; 578 sb_issue_discard(sb,
579 fat_clus_to_blknr(sbi, first_cl),
580 nr_clus * sbi->sec_per_clus);
581
582 first_cl = cluster;
583 }
579 } 584 }
580 585
581 ops->ent_put(&fatent, FAT_ENT_FREE); 586 ops->ent_put(&fatent, FAT_ENT_FREE);
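
The fatent.c hunk wraps the existing discard batching in an options.discard check, so runs of contiguous freed clusters are still merged into one request but nothing is issued unless the mount asked for it. A standalone model of the batching loop (cluster numbers are made up; issue_discard() stands in for sb_issue_discard()):

#include <stdio.h>

static void issue_discard(int first, int nr)
{
    printf("discard clusters %d..%d\n", first, first + nr - 1);
}

int main(void)
{
    int discard_enabled = 1;                     /* the new mount option */
    int clusters[] = { 10, 11, 12, 20, 21, -1 }; /* -1 terminates the chain */
    int first = clusters[0], prev = clusters[0];

    for (int i = 1; clusters[i] != -1; i++) {
        if (discard_enabled && clusters[i] != prev + 1) {
            issue_discard(first, prev - first + 1);
            first = clusters[i];
        }
        prev = clusters[i];
    }
    if (discard_enabled)                         /* flush the final run */
        issue_discard(first, prev - first + 1);
    return 0;
}
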
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 76b7961ab663..0ce143bd7d56 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -558,7 +558,7 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
558 buf->f_bavail = sbi->free_clusters; 558 buf->f_bavail = sbi->free_clusters;
559 buf->f_fsid.val[0] = (u32)id; 559 buf->f_fsid.val[0] = (u32)id;
560 buf->f_fsid.val[1] = (u32)(id >> 32); 560 buf->f_fsid.val[1] = (u32)(id >> 32);
561 buf->f_namelen = sbi->options.isvfat ? 260 : 12; 561 buf->f_namelen = sbi->options.isvfat ? FAT_LFN_LEN : 12;
562 562
563 return 0; 563 return 0;
564} 564}
@@ -577,7 +577,7 @@ static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi,
577 return i_pos; 577 return i_pos;
578} 578}
579 579
580static int fat_write_inode(struct inode *inode, int wait) 580static int __fat_write_inode(struct inode *inode, int wait)
581{ 581{
582 struct super_block *sb = inode->i_sb; 582 struct super_block *sb = inode->i_sb;
583 struct msdos_sb_info *sbi = MSDOS_SB(sb); 583 struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -634,9 +634,14 @@ retry:
634 return err; 634 return err;
635} 635}
636 636
637static int fat_write_inode(struct inode *inode, struct writeback_control *wbc)
638{
639 return __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
640}
641
637int fat_sync_inode(struct inode *inode) 642int fat_sync_inode(struct inode *inode)
638{ 643{
639 return fat_write_inode(inode, 1); 644 return __fat_write_inode(inode, 1);
640} 645}
641 646
642EXPORT_SYMBOL_GPL(fat_sync_inode); 647EXPORT_SYMBOL_GPL(fat_sync_inode);
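
->write_inode() now takes a struct writeback_control, so the sync decision moves from an int argument to wbc->sync_mode, while fat_sync_inode() keeps its synchronous behaviour by calling the renamed __fat_write_inode() directly. A reduced model (simplified types, no actual I/O):

#include <stdio.h>

enum wb_sync_modes { WB_SYNC_NONE, WB_SYNC_ALL };
struct writeback_control { enum wb_sync_modes sync_mode; };

static int __fat_write_inode(int wait)
{
    printf("writing inode, wait=%d\n", wait);
    return 0;
}

static int fat_write_inode(struct writeback_control *wbc)
{
    return __fat_write_inode(wbc->sync_mode == WB_SYNC_ALL);
}

static int fat_sync_inode(void)
{
    return __fat_write_inode(1);    /* always synchronous */
}

int main(void)
{
    struct writeback_control wbc = { WB_SYNC_NONE };

    fat_write_inode(&wbc);   /* wait=0 */
    fat_sync_inode();        /* wait=1 */
    return 0;
}
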
@@ -858,6 +863,8 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
858 seq_puts(m, ",errors=panic"); 863 seq_puts(m, ",errors=panic");
859 else 864 else
860 seq_puts(m, ",errors=remount-ro"); 865 seq_puts(m, ",errors=remount-ro");
866 if (opts->discard)
867 seq_puts(m, ",discard");
861 868
862 return 0; 869 return 0;
863} 870}
@@ -871,7 +878,7 @@ enum {
871 Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, 878 Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
872 Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, 879 Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
873 Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, 880 Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
874 Opt_err_panic, Opt_err_ro, Opt_err, 881 Opt_err_panic, Opt_err_ro, Opt_discard, Opt_err,
875}; 882};
876 883
877static const match_table_t fat_tokens = { 884static const match_table_t fat_tokens = {
@@ -899,6 +906,7 @@ static const match_table_t fat_tokens = {
899 {Opt_err_cont, "errors=continue"}, 906 {Opt_err_cont, "errors=continue"},
900 {Opt_err_panic, "errors=panic"}, 907 {Opt_err_panic, "errors=panic"},
901 {Opt_err_ro, "errors=remount-ro"}, 908 {Opt_err_ro, "errors=remount-ro"},
909 {Opt_discard, "discard"},
902 {Opt_obsolate, "conv=binary"}, 910 {Opt_obsolate, "conv=binary"},
903 {Opt_obsolate, "conv=text"}, 911 {Opt_obsolate, "conv=text"},
904 {Opt_obsolate, "conv=auto"}, 912 {Opt_obsolate, "conv=auto"},
@@ -1136,6 +1144,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
1136 case Opt_rodir: 1144 case Opt_rodir:
1137 opts->rodir = 1; 1145 opts->rodir = 1;
1138 break; 1146 break;
1147 case Opt_discard:
1148 opts->discard = 1;
1149 break;
1139 1150
1140 /* obsolete mount options */ 1151 /* obsolete mount options */
1141 case Opt_obsolate: 1152 case Opt_obsolate:
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 0f55f5cb732f..d3da05f26465 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -9,6 +9,7 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/buffer_head.h> 11#include <linux/buffer_head.h>
12#include <linux/time.h>
12#include "fat.h" 13#include "fat.h"
13 14
14/* 15/*
@@ -157,10 +158,6 @@ extern struct timezone sys_tz;
157#define SECS_PER_MIN 60 158#define SECS_PER_MIN 60
158#define SECS_PER_HOUR (60 * 60) 159#define SECS_PER_HOUR (60 * 60)
159#define SECS_PER_DAY (SECS_PER_HOUR * 24) 160#define SECS_PER_DAY (SECS_PER_HOUR * 24)
160#define UNIX_SECS_1980 315532800L
161#if BITS_PER_LONG == 64
162#define UNIX_SECS_2108 4354819200L
163#endif
164/* days between 1.1.70 and 1.1.80 (2 leap days) */ 161/* days between 1.1.70 and 1.1.80 (2 leap days) */
165#define DAYS_DELTA (365 * 10 + 2) 162#define DAYS_DELTA (365 * 10 + 2)
166/* 120 (2100 - 1980) isn't leap year */ 163/* 120 (2100 - 1980) isn't leap year */
@@ -213,58 +210,35 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
213void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts, 210void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
214 __le16 *time, __le16 *date, u8 *time_cs) 211 __le16 *time, __le16 *date, u8 *time_cs)
215{ 212{
216 time_t second = ts->tv_sec; 213 struct tm tm;
217 time_t day, leap_day, month, year; 214 time_to_tm(ts->tv_sec, sbi->options.tz_utc ? 0 :
215 -sys_tz.tz_minuteswest * 60, &tm);
218 216
219 if (!sbi->options.tz_utc) 217 /* FAT can only support years from 1980 to 2107 */
220 second -= sys_tz.tz_minuteswest * SECS_PER_MIN; 218 if (tm.tm_year < 1980 - 1900) {
221
222 /* Jan 1 GMT 00:00:00 1980. But what about another time zone? */
223 if (second < UNIX_SECS_1980) {
224 *time = 0; 219 *time = 0;
225 *date = cpu_to_le16((0 << 9) | (1 << 5) | 1); 220 *date = cpu_to_le16((0 << 9) | (1 << 5) | 1);
226 if (time_cs) 221 if (time_cs)
227 *time_cs = 0; 222 *time_cs = 0;
228 return; 223 return;
229 } 224 }
230#if BITS_PER_LONG == 64 225 if (tm.tm_year > 2107 - 1900) {
231 if (second >= UNIX_SECS_2108) {
232 *time = cpu_to_le16((23 << 11) | (59 << 5) | 29); 226 *time = cpu_to_le16((23 << 11) | (59 << 5) | 29);
233 *date = cpu_to_le16((127 << 9) | (12 << 5) | 31); 227 *date = cpu_to_le16((127 << 9) | (12 << 5) | 31);
234 if (time_cs) 228 if (time_cs)
235 *time_cs = 199; 229 *time_cs = 199;
236 return; 230 return;
237 } 231 }
238#endif
239 232
240 day = second / SECS_PER_DAY - DAYS_DELTA; 233 /* from 1900 -> from 1980 */
241 year = day / 365; 234 tm.tm_year -= 80;
242 leap_day = (year + 3) / 4; 235 /* 0~11 -> 1~12 */
243 if (year > YEAR_2100) /* 2100 isn't leap year */ 236 tm.tm_mon++;
244 leap_day--; 237 /* 0~59 -> 0~29(2sec counts) */
245 if (year * 365 + leap_day > day) 238 tm.tm_sec >>= 1;
246 year--;
247 leap_day = (year + 3) / 4;
248 if (year > YEAR_2100) /* 2100 isn't leap year */
249 leap_day--;
250 day -= year * 365 + leap_day;
251
252 if (IS_LEAP_YEAR(year) && day == days_in_year[3]) {
253 month = 2;
254 } else {
255 if (IS_LEAP_YEAR(year) && day > days_in_year[3])
256 day--;
257 for (month = 1; month < 12; month++) {
258 if (days_in_year[month + 1] > day)
259 break;
260 }
261 }
262 day -= days_in_year[month];
263 239
264 *time = cpu_to_le16(((second / SECS_PER_HOUR) % 24) << 11 240 *time = cpu_to_le16(tm.tm_hour << 11 | tm.tm_min << 5 | tm.tm_sec);
265 | ((second / SECS_PER_MIN) % 60) << 5 241 *date = cpu_to_le16(tm.tm_year << 9 | tm.tm_mon << 5 | tm.tm_mday);
266 | (second % SECS_PER_MIN) >> 1);
267 *date = cpu_to_le16((year << 9) | (month << 5) | (day + 1));
268 if (time_cs) 242 if (time_cs)
269 *time_cs = (ts->tv_sec & 1) * 100 + ts->tv_nsec / 10000000; 243 *time_cs = (ts->tv_sec & 1) * 100 + ts->tv_nsec / 10000000;
270} 244}
@@ -285,4 +259,3 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
285 } 259 }
286 return err; 260 return err;
287} 261}
288
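The rewrite above drops the hand-rolled epoch arithmetic in favor of the generic time_to_tm() helper, which breaks a Unix timestamp into calendar fields (applying the local-time offset unless the filesystem is mounted with tz=UTC). Since struct tm counts years from 1900 and months from 0, only the small adjustments shown in the hunk are needed before packing the FAT fields. A worked example of the resulting date word:

	/* 2010-03-30: year field = 2010 - 1980 = 30, month = 3, day = 30. */
	*date = cpu_to_le16(30 << 9 | 3 << 5 | 30);   /* == 0x3C7E */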
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index f565f24019b5..6fcc7e71fbaa 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -309,7 +309,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
309{ 309{
310 struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options; 310 struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
311 wchar_t *ip, *ext_start, *end, *name_start; 311 wchar_t *ip, *ext_start, *end, *name_start;
312 unsigned char base[9], ext[4], buf[8], *p; 312 unsigned char base[9], ext[4], buf[5], *p;
313 unsigned char charbuf[NLS_MAX_CHARSET_SIZE]; 313 unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
314 int chl, chi; 314 int chl, chi;
315 int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen; 315 int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
@@ -467,7 +467,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
467 return 0; 467 return 0;
468 } 468 }
469 469
470 i = jiffies & 0xffff; 470 i = jiffies;
471 sz = (jiffies >> 16) & 0x7; 471 sz = (jiffies >> 16) & 0x7;
472 if (baselen > 2) { 472 if (baselen > 2) {
473 baselen = numtail2_baselen; 473 baselen = numtail2_baselen;
@@ -476,7 +476,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
476 name_res[baselen + 4] = '~'; 476 name_res[baselen + 4] = '~';
477 name_res[baselen + 5] = '1' + sz; 477 name_res[baselen + 5] = '1' + sz;
478 while (1) { 478 while (1) {
479 sprintf(buf, "%04X", i); 479 snprintf(buf, sizeof(buf), "%04X", i & 0xffff);
480 memcpy(&name_res[baselen], buf, 4); 480 memcpy(&name_res[baselen], buf, 4);
481 if (vfat_find_form(dir, name_res) < 0) 481 if (vfat_find_form(dir, name_res) < 0)
482 break; 482 break;
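Why buf shrinks from 8 to 5 bytes and the mask moves into the format call: the retry loop a few lines below decrements i, so i (an int) can go negative, at which point "%04X" expands to eight hex digits and the old sprintf() wrote nine bytes (digits plus NUL) into an eight-byte buffer. Masking to 16 bits at print time caps the field at four digits, and snprintf() enforces the bound regardless. A minimal illustration with hypothetical values:

	int i = 5;
	char buf[8];

	i -= 11;                    /* retry step underflows: i == -6 */
	sprintf(buf, "%04X", i);    /* "FFFFFFFA" plus NUL = 9 bytes,
	                               one past the end of buf */
	snprintf(buf, sizeof(buf), "%04X", i & 0xffff);  /* "FFFA", safe */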
@@ -502,14 +502,14 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
502 *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname); 502 *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname);
503 if (*outlen < 0) 503 if (*outlen < 0)
504 return *outlen; 504 return *outlen;
505 else if (*outlen > 255) 505 else if (*outlen > FAT_LFN_LEN)
506 return -ENAMETOOLONG; 506 return -ENAMETOOLONG;
507 507
508 op = &outname[*outlen * sizeof(wchar_t)]; 508 op = &outname[*outlen * sizeof(wchar_t)];
509 } else { 509 } else {
510 if (nls) { 510 if (nls) {
511 for (i = 0, ip = name, op = outname, *outlen = 0; 511 for (i = 0, ip = name, op = outname, *outlen = 0;
512 i < len && *outlen <= 255; 512 i < len && *outlen <= FAT_LFN_LEN;
513 *outlen += 1) 513 *outlen += 1)
514 { 514 {
515 if (escape && (*ip == ':')) { 515 if (escape && (*ip == ':')) {
@@ -549,7 +549,7 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
549 return -ENAMETOOLONG; 549 return -ENAMETOOLONG;
550 } else { 550 } else {
551 for (i = 0, ip = name, op = outname, *outlen = 0; 551 for (i = 0, ip = name, op = outname, *outlen = 0;
552 i < len && *outlen <= 255; 552 i < len && *outlen <= FAT_LFN_LEN;
553 i++, *outlen += 1) 553 i++, *outlen += 1)
554 { 554 {
555 *op++ = *ip++; 555 *op++ = *ip++;
@@ -701,6 +701,15 @@ static int vfat_find(struct inode *dir, struct qstr *qname,
701 return fat_search_long(dir, qname->name, len, sinfo); 701 return fat_search_long(dir, qname->name, len, sinfo);
702} 702}
703 703
704/*
705 * (nfsd's) anonymous disconnected dentry?
706 * NOTE: !IS_ROOT() is not anonymous (i.e. d_splice_alias() did the job).
707 */
708static int vfat_d_anon_disconn(struct dentry *dentry)
709{
710 return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED);
711}
712
704static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, 713static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
705 struct nameidata *nd) 714 struct nameidata *nd)
706{ 715{
@@ -729,11 +738,11 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
729 } 738 }
730 739
731 alias = d_find_alias(inode); 740 alias = d_find_alias(inode);
732 if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) { 741 if (alias && !vfat_d_anon_disconn(alias)) {
733 /* 742 /*
734 * This inode has a non-DCACHE_DISCONNECTED dentry. This 743 * This inode has a non anonymous-DCACHE_DISCONNECTED
735 * means the user did ->lookup() by another name 744 * dentry. This means the user did ->lookup() by
736 * (longname vs its 8.3 alias) in the past. 745 * another name (longname vs its 8.3 alias) in the past.
737 * 746 *
738 * Switch to new one for reason of locality if possible. 747 * Switch to new one for reason of locality if possible.
739 */ 748 */
@@ -743,7 +752,9 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
743 iput(inode); 752 iput(inode);
744 unlock_super(sb); 753 unlock_super(sb);
745 return alias; 754 return alias;
746 } 755 } else
756 dput(alias);
757
747out: 758out:
748 unlock_super(sb); 759 unlock_super(sb);
749 dentry->d_op = sb->s_root->d_op; 760 dentry->d_op = sb->s_root->d_op;
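Two fixes ride together in the vfat_lookup() hunk: the reuse test is narrowed so only genuinely anonymous (nfsd-style) disconnected aliases are rejected, and a dentry reference leak is closed. d_find_alias() returns its result with a reference held, so the branch that declines to reuse the alias must drop it. The shape of the pattern (a sketch, not the committed code):

	struct dentry *alias = d_find_alias(inode);  /* takes a reference */

	if (alias && !vfat_d_anon_disconn(alias)) {
		/* ... reuse path ... */
		return alias;        /* reference handed to the caller */
	}
	dput(alias);                 /* otherwise drop it (NULL-safe) */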
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 2cf93ec40a67..452d02f9075e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -344,7 +344,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
344 switch (cmd) { 344 switch (cmd) {
345 case F_DUPFD: 345 case F_DUPFD:
346 case F_DUPFD_CLOEXEC: 346 case F_DUPFD_CLOEXEC:
347 if (arg >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) 347 if (arg >= rlimit(RLIMIT_NOFILE))
348 break; 348 break;
349 err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0); 349 err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0);
350 if (err >= 0) { 350 if (err >= 0) {
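rlimit() is the then-new accessor that replaces the open-coded dereference; a hedged sketch of its shape (the real helper lives in <linux/sched.h> and reads the calling task's soft limit):

	static inline unsigned long rlimit(unsigned int limit)
	{
		/* Soft (current) limit of the calling task. */
		return current->signal->rlim[limit].rlim_cur;
	}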
@@ -618,60 +618,90 @@ static DEFINE_RWLOCK(fasync_lock);
618static struct kmem_cache *fasync_cache __read_mostly; 618static struct kmem_cache *fasync_cache __read_mostly;
619 619
620/* 620/*
621 * fasync_helper() is used by almost all character device drivers 621 * Remove a fasync entry. If successfully removed, return
622 * to set up the fasync queue. It returns negative on error, 0 if it did 622 * positive and clear the FASYNC flag. If no entry exists,
623 * no changes and positive if it added/deleted the entry. 623 * do nothing and return 0.
624 *
625 * NOTE! It is very important that the FASYNC flag always
626 * match the state "is the filp on a fasync list".
627 *
628 * We always take the 'filp->f_lock' first, since fasync_lock
629 * needs to be irq-safe.
624 */ 630 */
625int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp) 631static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
626{ 632{
627 struct fasync_struct *fa, **fp; 633 struct fasync_struct *fa, **fp;
628 struct fasync_struct *new = NULL;
629 int result = 0; 634 int result = 0;
630 635
631 if (on) { 636 spin_lock(&filp->f_lock);
632 new = kmem_cache_alloc(fasync_cache, GFP_KERNEL); 637 write_lock_irq(&fasync_lock);
633 if (!new) 638 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
634 return -ENOMEM; 639 if (fa->fa_file != filp)
640 continue;
641 *fp = fa->fa_next;
642 kmem_cache_free(fasync_cache, fa);
643 filp->f_flags &= ~FASYNC;
644 result = 1;
645 break;
635 } 646 }
647 write_unlock_irq(&fasync_lock);
648 spin_unlock(&filp->f_lock);
649 return result;
650}
651
652/*
653 * Add a fasync entry. Return negative on error, positive if
654 * added, and zero if it did nothing but change an existing one.
655 *
656 * NOTE! It is very important that the FASYNC flag always
657 * match the state "is the filp on a fasync list".
658 */
659static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
660{
661 struct fasync_struct *new, *fa, **fp;
662 int result = 0;
663
664 new = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
665 if (!new)
666 return -ENOMEM;
636 667
637 /*
638 * We need to take f_lock first since it's not an IRQ-safe
639 * lock.
640 */
641 spin_lock(&filp->f_lock); 668 spin_lock(&filp->f_lock);
642 write_lock_irq(&fasync_lock); 669 write_lock_irq(&fasync_lock);
643 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { 670 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
644 if (fa->fa_file == filp) { 671 if (fa->fa_file != filp)
645 if(on) { 672 continue;
646 fa->fa_fd = fd; 673 fa->fa_fd = fd;
647 kmem_cache_free(fasync_cache, new); 674 kmem_cache_free(fasync_cache, new);
648 } else { 675 goto out;
649 *fp = fa->fa_next;
650 kmem_cache_free(fasync_cache, fa);
651 result = 1;
652 }
653 goto out;
654 }
655 } 676 }
656 677
657 if (on) { 678 new->magic = FASYNC_MAGIC;
658 new->magic = FASYNC_MAGIC; 679 new->fa_file = filp;
659 new->fa_file = filp; 680 new->fa_fd = fd;
660 new->fa_fd = fd; 681 new->fa_next = *fapp;
661 new->fa_next = *fapp; 682 *fapp = new;
662 *fapp = new; 683 result = 1;
663 result = 1; 684 filp->f_flags |= FASYNC;
664 } 685
665out: 686out:
666 if (on)
667 filp->f_flags |= FASYNC;
668 else
669 filp->f_flags &= ~FASYNC;
670 write_unlock_irq(&fasync_lock); 687 write_unlock_irq(&fasync_lock);
671 spin_unlock(&filp->f_lock); 688 spin_unlock(&filp->f_lock);
672 return result; 689 return result;
673} 690}
674 691
692/*
693 * fasync_helper() is used by almost all character device drivers
694 * to set up the fasync queue, and for regular files by the file
695 * lease code. It returns negative on error, 0 if it made no changes
696 * and positive if it added/deleted the entry.
697 */
698int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
699{
700 if (!on)
701 return fasync_remove_entry(filp, fapp);
702 return fasync_add_entry(fd, filp, fapp);
703}
704
675EXPORT_SYMBOL(fasync_helper); 705EXPORT_SYMBOL(fasync_helper);
676 706
677void __kill_fasync(struct fasync_struct *fa, int sig, int band) 707void __kill_fasync(struct fasync_struct *fa, int sig, int band)
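The external contract of fasync_helper() is unchanged by the split into add/remove halves, so driver code keeps the canonical pattern; mydev and async_queue below are illustrative names:

	static int mydev_fasync(int fd, struct file *filp, int on)
	{
		struct mydev *dev = filp->private_data;

		/* on != 0 adds this filp to the queue, on == 0 removes it. */
		return fasync_helper(fd, filp, on, &dev->async_queue);
	}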
diff --git a/fs/fifo.c b/fs/fifo.c
index f8f97b8b6d44..5d6606ffc2d2 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/sched.h> 14#include <linux/sched.h>
16#include <linux/pipe_fs_i.h> 15#include <linux/pipe_fs_i.h>
diff --git a/fs/file.c b/fs/file.c
index 87e129030ab1..34bb7f71d994 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -257,7 +257,7 @@ int expand_files(struct files_struct *files, int nr)
257 * N.B. For clone tasks sharing a files structure, this test 257 * N.B. For clone tasks sharing a files structure, this test
258 * will limit the total number of files that can be opened. 258 * will limit the total number of files that can be opened.
259 */ 259 */
260 if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) 260 if (nr >= rlimit(RLIMIT_NOFILE))
261 return -EMFILE; 261 return -EMFILE;
262 262
263 /* Do we need to expand? */ 263 /* Do we need to expand? */
@@ -478,7 +478,7 @@ repeat:
478 error = fd; 478 error = fd;
479#if 1 479#if 1
480 /* Sanity check */ 480 /* Sanity check */
481 if (rcu_dereference(fdt->fd[fd]) != NULL) { 481 if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {
482 printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); 482 printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
483 rcu_assign_pointer(fdt->fd[fd], NULL); 483 rcu_assign_pointer(fdt->fd[fd], NULL);
484 } 484 }
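Context for the _raw switch: with CONFIG_PROVE_RCU, plain rcu_dereference() asserts that an RCU read-side critical section is held, but alloc_fd() inspects the fdtable under files->file_lock instead. rcu_dereference_raw() is the annotation for "the caller supplies its own protection". The locking shape, reduced to a sketch:

	spin_lock(&files->file_lock);
	/* Slot protected by file_lock, not rcu_read_lock(), hence _raw. */
	if (rcu_dereference_raw(fdt->fd[fd]) != NULL)
		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
	spin_unlock(&files->file_lock);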
diff --git a/fs/file_table.c b/fs/file_table.c
index 8eb44042e009..32d12b78bac8 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -13,7 +13,6 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/security.h> 15#include <linux/security.h>
16#include <linux/ima.h>
17#include <linux/eventpoll.h> 16#include <linux/eventpoll.h>
18#include <linux/rcupdate.h> 17#include <linux/rcupdate.h>
19#include <linux/mount.h> 18#include <linux/mount.h>
@@ -22,9 +21,12 @@
22#include <linux/fsnotify.h> 21#include <linux/fsnotify.h>
23#include <linux/sysctl.h> 22#include <linux/sysctl.h>
24#include <linux/percpu_counter.h> 23#include <linux/percpu_counter.h>
24#include <linux/ima.h>
25 25
26#include <asm/atomic.h> 26#include <asm/atomic.h>
27 27
28#include "internal.h"
29
28/* sysctl tunables... */ 30/* sysctl tunables... */
29struct files_stat_struct files_stat = { 31struct files_stat_struct files_stat = {
30 .max_files = NR_FILE 32 .max_files = NR_FILE
@@ -148,8 +150,6 @@ fail:
148 return NULL; 150 return NULL;
149} 151}
150 152
151EXPORT_SYMBOL(get_empty_filp);
152
153/** 153/**
154 * alloc_file - allocate and initialize a 'struct file' 154 * alloc_file - allocate and initialize a 'struct file'
155 * @mnt: the vfsmount on which the file will reside 155 * @mnt: the vfsmount on which the file will reside
@@ -165,8 +165,8 @@ EXPORT_SYMBOL(get_empty_filp);
165 * If all the callers of init_file() are eliminated, its 165 * If all the callers of init_file() are eliminated, its
166 * code should be moved into this function. 166 * code should be moved into this function.
167 */ 167 */
168struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry, 168struct file *alloc_file(struct path *path, fmode_t mode,
169 fmode_t mode, const struct file_operations *fop) 169 const struct file_operations *fop)
170{ 170{
171 struct file *file; 171 struct file *file;
172 172
@@ -174,35 +174,8 @@ struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry,
174 if (!file) 174 if (!file)
175 return NULL; 175 return NULL;
176 176
177 init_file(file, mnt, dentry, mode, fop); 177 file->f_path = *path;
178 return file; 178 file->f_mapping = path->dentry->d_inode->i_mapping;
179}
180EXPORT_SYMBOL(alloc_file);
181
182/**
183 * init_file - initialize a 'struct file'
184 * @file: the already allocated 'struct file' to initialized
185 * @mnt: the vfsmount on which the file resides
186 * @dentry: the dentry representing this file
187 * @mode: the mode the file is opened with
188 * @fop: the 'struct file_operations' for this file
189 *
190 * Use this instead of setting the members directly. Doing so
191 * avoids making mistakes like forgetting the mntget() or
192 * forgetting to take a write on the mnt.
193 *
194 * Note: This is a crappy interface. It is here to make
195 * merging with the existing users of get_empty_filp()
196 * who have complex failure logic easier. All users
197 * of this should be moving to alloc_file().
198 */
199int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
200 fmode_t mode, const struct file_operations *fop)
201{
202 int error = 0;
203 file->f_path.dentry = dentry;
204 file->f_path.mnt = mntget(mnt);
205 file->f_mapping = dentry->d_inode->i_mapping;
206 file->f_mode = mode; 179 file->f_mode = mode;
207 file->f_op = fop; 180 file->f_op = fop;
208 181
@@ -212,14 +185,14 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
212 * visible. We do this for consistency, and so 185 * visible. We do this for consistency, and so
213 * that we can do debugging checks at __fput() 186 * that we can do debugging checks at __fput()
214 */ 187 */
215 if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) { 188 if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) {
216 file_take_write(file); 189 file_take_write(file);
217 error = mnt_clone_write(mnt); 190 WARN_ON(mnt_clone_write(path->mnt));
218 WARN_ON(error);
219 } 191 }
220 return error; 192 ima_counts_get(file);
193 return file;
221} 194}
222EXPORT_SYMBOL(init_file); 195EXPORT_SYMBOL(alloc_file);
223 196
224void fput(struct file *file) 197void fput(struct file *file)
225{ 198{
@@ -420,7 +393,9 @@ retry:
420 continue; 393 continue;
421 if (!(f->f_mode & FMODE_WRITE)) 394 if (!(f->f_mode & FMODE_WRITE))
422 continue; 395 continue;
396 spin_lock(&f->f_lock);
423 f->f_mode &= ~FMODE_WRITE; 397 f->f_mode &= ~FMODE_WRITE;
398 spin_unlock(&f->f_lock);
424 if (file_check_writeable(f) != 0) 399 if (file_check_writeable(f) != 0)
425 continue; 400 continue;
426 file_release_write(f); 401 file_release_write(f);
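With init_file() and the get_empty_filp() export gone, callers construct a struct path, take their own mnt and dentry references (which alloc_file() consumes via f_path = *path), and call the single entry point. A hedged sketch of a migrated caller; myfs_fops is a placeholder:

	struct path path;
	struct file *file;

	path.mnt = mntget(mnt);      /* both references are consumed */
	path.dentry = dget(dentry);  /* by alloc_file() on success   */

	file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &myfs_fops);
	if (!file)
		path_put(&path);     /* on failure they are still ours */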
diff --git a/fs/filesystems.c b/fs/filesystems.c
index a24c58e181db..68ba492d8eef 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -10,10 +10,10 @@
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/slab.h>
14#include <linux/kmod.h> 13#include <linux/kmod.h>
15#include <linux/init.h> 14#include <linux/init.h>
16#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18 18
19/* 19/*
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index ed8f0b0dd880..1429f3ae1e86 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -33,7 +33,6 @@
33#include <linux/fs.h> 33#include <linux/fs.h>
34#include <linux/buffer_head.h> 34#include <linux/buffer_head.h>
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/slab.h>
37#include <linux/pagemap.h> 36#include <linux/pagemap.h>
38 37
39#include "vxfs_extern.h" 38#include "vxfs_extern.h"
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9d5360c4c2af..781a322ccb45 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -16,6 +16,7 @@
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/spinlock.h> 18#include <linux/spinlock.h>
19#include <linux/slab.h>
19#include <linux/sched.h> 20#include <linux/sched.h>
20#include <linux/fs.h> 21#include <linux/fs.h>
21#include <linux/mm.h> 22#include <linux/mm.h>
@@ -242,6 +243,7 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
242/** 243/**
243 * bdi_start_writeback - start writeback 244 * bdi_start_writeback - start writeback
244 * @bdi: the backing device to write from 245 * @bdi: the backing device to write from
246 * @sb: write inodes from this super_block
245 * @nr_pages: the number of pages to write 247 * @nr_pages: the number of pages to write
246 * 248 *
247 * Description: 249 * Description:
@@ -380,10 +382,10 @@ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
380 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 382 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
381} 383}
382 384
383static int write_inode(struct inode *inode, int sync) 385static int write_inode(struct inode *inode, struct writeback_control *wbc)
384{ 386{
385 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) 387 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
386 return inode->i_sb->s_op->write_inode(inode, sync); 388 return inode->i_sb->s_op->write_inode(inode, wbc);
387 return 0; 389 return 0;
388} 390}
389 391
@@ -420,7 +422,6 @@ static int
420writeback_single_inode(struct inode *inode, struct writeback_control *wbc) 422writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
421{ 423{
422 struct address_space *mapping = inode->i_mapping; 424 struct address_space *mapping = inode->i_mapping;
423 int wait = wbc->sync_mode == WB_SYNC_ALL;
424 unsigned dirty; 425 unsigned dirty;
425 int ret; 426 int ret;
426 427
@@ -438,7 +439,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
438 * We'll have another go at writing back this inode when we 439 * We'll have another go at writing back this inode when we
439 * completed a full scan of b_io. 440 * completed a full scan of b_io.
440 */ 441 */
441 if (!wait) { 442 if (wbc->sync_mode != WB_SYNC_ALL) {
442 requeue_io(inode); 443 requeue_io(inode);
443 return 0; 444 return 0;
444 } 445 }
@@ -460,15 +461,20 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
460 461
461 ret = do_writepages(mapping, wbc); 462 ret = do_writepages(mapping, wbc);
462 463
463 /* Don't write the inode if only I_DIRTY_PAGES was set */ 464 /*
464 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 465 * Make sure to wait on the data before writing out the metadata.
465 int err = write_inode(inode, wait); 466 * This is important for filesystems that modify metadata on data
467 * I/O completion.
468 */
469 if (wbc->sync_mode == WB_SYNC_ALL) {
470 int err = filemap_fdatawait(mapping);
466 if (ret == 0) 471 if (ret == 0)
467 ret = err; 472 ret = err;
468 } 473 }
469 474
470 if (wait) { 475 /* Don't write the inode if only I_DIRTY_PAGES was set */
471 int err = filemap_fdatawait(mapping); 476 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
477 int err = write_inode(inode, wbc);
472 if (ret == 0) 478 if (ret == 0)
473 ret = err; 479 ret = err;
474 } 480 }
@@ -614,7 +620,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
614 struct writeback_control *wbc) 620 struct writeback_control *wbc)
615{ 621{
616 struct super_block *sb = wbc->sb, *pin_sb = NULL; 622 struct super_block *sb = wbc->sb, *pin_sb = NULL;
617 const int is_blkdev_sb = sb_is_blkdev_sb(sb);
618 const unsigned long start = jiffies; /* livelock avoidance */ 623 const unsigned long start = jiffies; /* livelock avoidance */
619 624
620 spin_lock(&inode_lock); 625 spin_lock(&inode_lock);
@@ -635,36 +640,11 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
635 continue; 640 continue;
636 } 641 }
637 642
638 if (!bdi_cap_writeback_dirty(wb->bdi)) {
639 redirty_tail(inode);
640 if (is_blkdev_sb) {
641 /*
642 * Dirty memory-backed blockdev: the ramdisk
643 * driver does this. Skip just this inode
644 */
645 continue;
646 }
647 /*
648 * Dirty memory-backed inode against a filesystem other
649 * than the kernel-internal bdev filesystem. Skip the
650 * entire superblock.
651 */
652 break;
653 }
654
655 if (inode->i_state & (I_NEW | I_WILL_FREE)) { 643 if (inode->i_state & (I_NEW | I_WILL_FREE)) {
656 requeue_io(inode); 644 requeue_io(inode);
657 continue; 645 continue;
658 } 646 }
659 647
660 if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
661 wbc->encountered_congestion = 1;
662 if (!is_blkdev_sb)
663 break; /* Skip a congested fs */
664 requeue_io(inode);
665 continue; /* Skip a congested blockdev */
666 }
667
668 /* 648 /*
669 * Was this inode dirtied after sync_sb_inodes was called? 649 * Was this inode dirtied after sync_sb_inodes was called?
670 * This keeps sync from extra jobs and livelock. 650 * This keeps sync from extra jobs and livelock.
@@ -756,6 +736,7 @@ static long wb_writeback(struct bdi_writeback *wb,
756 .sync_mode = args->sync_mode, 736 .sync_mode = args->sync_mode,
757 .older_than_this = NULL, 737 .older_than_this = NULL,
758 .for_kupdate = args->for_kupdate, 738 .for_kupdate = args->for_kupdate,
739 .for_background = args->for_background,
759 .range_cyclic = args->range_cyclic, 740 .range_cyclic = args->range_cyclic,
760 }; 741 };
761 unsigned long oldest_jif; 742 unsigned long oldest_jif;
@@ -787,7 +768,6 @@ static long wb_writeback(struct bdi_writeback *wb,
787 break; 768 break;
788 769
789 wbc.more_io = 0; 770 wbc.more_io = 0;
790 wbc.encountered_congestion = 0;
791 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 771 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
792 wbc.pages_skipped = 0; 772 wbc.pages_skipped = 0;
793 writeback_inodes_wb(wb, &wbc); 773 writeback_inodes_wb(wb, &wbc);
@@ -1213,6 +1193,23 @@ void writeback_inodes_sb(struct super_block *sb)
1213EXPORT_SYMBOL(writeback_inodes_sb); 1193EXPORT_SYMBOL(writeback_inodes_sb);
1214 1194
1215/** 1195/**
1196 * writeback_inodes_sb_if_idle - start writeback if none underway
1197 * @sb: the superblock
1198 *
1199 * Invoke writeback_inodes_sb if no writeback is currently underway.
1200 * Returns 1 if writeback was started, 0 if not.
1201 */
1202int writeback_inodes_sb_if_idle(struct super_block *sb)
1203{
1204 if (!writeback_in_progress(sb->s_bdi)) {
1205 writeback_inodes_sb(sb);
1206 return 1;
1207 } else
1208 return 0;
1209}
1210EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
1211
1212/**
1216 * sync_inodes_sb - sync sb inode pages 1213 * sync_inodes_sb - sync sb inode pages
1217 * @sb: the superblock 1214 * @sb: the superblock
1218 * 1215 *
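The intended caller of writeback_inodes_sb_if_idle() is a filesystem that wants to kick background writeback before giving up on an allocation, without queueing redundant work when a flush is already running. An illustrative sketch of that pattern; the myfs_* names are placeholders:

	static int myfs_reserve_space(struct inode *inode, unsigned int bytes)
	{
		int err = myfs_try_reserve(inode, bytes);

		/* Nudge writeback once, then retry; real callers poll or
		 * wait for the flush to make progress. */
		if (err == -ENOSPC && writeback_inodes_sb_if_idle(inode->i_sb))
			err = myfs_try_reserve(inode, bytes);
		return err;
	}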
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index 864dac20a242..cc94bb9563f2 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -1,7 +1,6 @@
1 1
2config FSCACHE 2config FSCACHE
3 tristate "General filesystem local caching manager" 3 tristate "General filesystem local caching manager"
4 depends on EXPERIMENTAL
5 select SLOW_WORK 4 select SLOW_WORK
6 help 5 help
7 This option enables a generic filesystem caching manager that can be 6 This option enables a generic filesystem caching manager that can be
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index e590242fa41a..1e1f286dd70e 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -12,6 +12,7 @@
12#define FSCACHE_DEBUG_LEVEL COOKIE 12#define FSCACHE_DEBUG_LEVEL COOKIE
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/seq_file.h> 14#include <linux/seq_file.h>
15#include <linux/slab.h>
15#include <linux/key.h> 16#include <linux/key.h>
16#include <keys/user-type.h> 17#include <keys/user-type.h>
17#include "internal.h" 18#include "internal.h"
@@ -91,7 +92,7 @@ EXPORT_SYMBOL(fscache_object_destroy);
91 */ 92 */
92static struct fscache_object *fscache_objlist_lookup(loff_t *_pos) 93static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
93{ 94{
94 struct fscache_object *pobj, *obj, *minobj = NULL; 95 struct fscache_object *pobj, *obj = NULL, *minobj = NULL;
95 struct rb_node *p; 96 struct rb_node *p;
96 unsigned long pos; 97 unsigned long pos;
97 98
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index e513ac599c8e..0b589a9b4ffc 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -53,7 +53,7 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
53static void fscache_object_slow_work_put_ref(struct slow_work *); 53static void fscache_object_slow_work_put_ref(struct slow_work *);
54static int fscache_object_slow_work_get_ref(struct slow_work *); 54static int fscache_object_slow_work_get_ref(struct slow_work *);
55static void fscache_object_slow_work_execute(struct slow_work *); 55static void fscache_object_slow_work_execute(struct slow_work *);
56#ifdef CONFIG_SLOW_WORK_PROC 56#ifdef CONFIG_SLOW_WORK_DEBUG
57static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *); 57static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
58#endif 58#endif
59static void fscache_initialise_object(struct fscache_object *); 59static void fscache_initialise_object(struct fscache_object *);
@@ -69,7 +69,7 @@ const struct slow_work_ops fscache_object_slow_work_ops = {
69 .get_ref = fscache_object_slow_work_get_ref, 69 .get_ref = fscache_object_slow_work_get_ref,
70 .put_ref = fscache_object_slow_work_put_ref, 70 .put_ref = fscache_object_slow_work_put_ref,
71 .execute = fscache_object_slow_work_execute, 71 .execute = fscache_object_slow_work_execute,
72#ifdef CONFIG_SLOW_WORK_PROC 72#ifdef CONFIG_SLOW_WORK_DEBUG
73 .desc = fscache_object_slow_work_desc, 73 .desc = fscache_object_slow_work_desc,
74#endif 74#endif
75}; 75};
@@ -364,7 +364,7 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
364/* 364/*
365 * describe an object for slow-work debugging 365 * describe an object for slow-work debugging
366 */ 366 */
367#ifdef CONFIG_SLOW_WORK_PROC 367#ifdef CONFIG_SLOW_WORK_DEBUG
368static void fscache_object_slow_work_desc(struct slow_work *work, 368static void fscache_object_slow_work_desc(struct slow_work *work,
369 struct seq_file *m) 369 struct seq_file *m)
370{ 370{
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 313e79a14266..f17cecafae44 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -14,6 +14,7 @@
14#define FSCACHE_DEBUG_LEVEL OPERATION 14#define FSCACHE_DEBUG_LEVEL OPERATION
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/seq_file.h> 16#include <linux/seq_file.h>
17#include <linux/slab.h>
17#include "internal.h" 18#include "internal.h"
18 19
19atomic_t fscache_op_debug_id; 20atomic_t fscache_op_debug_id;
@@ -500,7 +501,7 @@ static void fscache_op_execute(struct slow_work *work)
500/* 501/*
501 * describe an operation for slow-work debugging 502 * describe an operation for slow-work debugging
502 */ 503 */
503#ifdef CONFIG_SLOW_WORK_PROC 504#ifdef CONFIG_SLOW_WORK_DEBUG
504static void fscache_op_desc(struct slow_work *work, struct seq_file *m) 505static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
505{ 506{
506 struct fscache_operation *op = 507 struct fscache_operation *op =
@@ -517,7 +518,7 @@ const struct slow_work_ops fscache_op_slow_work_ops = {
517 .get_ref = fscache_op_get_ref, 518 .get_ref = fscache_op_get_ref,
518 .put_ref = fscache_op_put_ref, 519 .put_ref = fscache_op_put_ref,
519 .execute = fscache_op_execute, 520 .execute = fscache_op_execute,
520#ifdef CONFIG_SLOW_WORK_PROC 521#ifdef CONFIG_SLOW_WORK_DEBUG
521 .desc = fscache_op_desc, 522 .desc = fscache_op_desc,
522#endif 523#endif
523}; 524};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index c598ea4c4e7d..47aefd376e54 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -14,6 +14,7 @@
14#include <linux/fscache-cache.h> 14#include <linux/fscache-cache.h>
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/pagevec.h> 16#include <linux/pagevec.h>
17#include <linux/slab.h>
17#include "internal.h" 18#include "internal.h"
18 19
19/* 20/*
@@ -881,6 +882,7 @@ submit_failed:
881 goto nobufs; 882 goto nobufs;
882 883
883nobufs_unlock_obj: 884nobufs_unlock_obj:
885 spin_unlock(&cookie->stores_lock);
884 spin_unlock(&object->lock); 886 spin_unlock(&object->lock);
885nobufs: 887nobufs:
886 spin_unlock(&cookie->lock); 888 spin_unlock(&cookie->lock);
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index de792dcf3274..e1f8171278bd 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -44,6 +44,7 @@
44#include <linux/magic.h> 44#include <linux/magic.h>
45#include <linux/miscdevice.h> 45#include <linux/miscdevice.h>
46#include <linux/mutex.h> 46#include <linux/mutex.h>
47#include <linux/slab.h>
47#include <linux/spinlock.h> 48#include <linux/spinlock.h>
48#include <linux/stat.h> 49#include <linux/stat.h>
49 50
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 51d9e33d634f..eb7e9423691f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -865,13 +865,10 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
865 865
866 down_read(&fc->killsb); 866 down_read(&fc->killsb);
867 err = -ENOENT; 867 err = -ENOENT;
868 if (!fc->sb) 868 if (fc->sb) {
869 goto err_unlock; 869 err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
870 870 outarg.off, outarg.len);
871 err = fuse_reverse_inval_inode(fc->sb, outarg.ino, 871 }
872 outarg.off, outarg.len);
873
874err_unlock:
875 up_read(&fc->killsb); 872 up_read(&fc->killsb);
876 return err; 873 return err;
877 874
@@ -884,10 +881,15 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
884 struct fuse_copy_state *cs) 881 struct fuse_copy_state *cs)
885{ 882{
886 struct fuse_notify_inval_entry_out outarg; 883 struct fuse_notify_inval_entry_out outarg;
887 int err = -EINVAL; 884 int err = -ENOMEM;
888 char buf[FUSE_NAME_MAX+1]; 885 char *buf;
889 struct qstr name; 886 struct qstr name;
890 887
888 buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
889 if (!buf)
890 goto err;
891
892 err = -EINVAL;
891 if (size < sizeof(outarg)) 893 if (size < sizeof(outarg))
892 goto err; 894 goto err;
893 895
@@ -910,16 +912,14 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
910 912
911 down_read(&fc->killsb); 913 down_read(&fc->killsb);
912 err = -ENOENT; 914 err = -ENOENT;
913 if (!fc->sb) 915 if (fc->sb)
914 goto err_unlock; 916 err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
915
916 err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
917
918err_unlock:
919 up_read(&fc->killsb); 917 up_read(&fc->killsb);
918 kfree(buf);
920 return err; 919 return err;
921 920
922err: 921err:
922 kfree(buf);
923 fuse_copy_finish(cs); 923 fuse_copy_finish(cs);
924 return err; 924 return err;
925} 925}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c18913a777ae..a9f5e137f1d3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -828,6 +828,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
828 if (!page) 828 if (!page)
829 break; 829 break;
830 830
831 if (mapping_writably_mapped(mapping))
832 flush_dcache_page(page);
833
831 pagefault_disable(); 834 pagefault_disable();
832 tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); 835 tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
833 pagefault_enable(); 836 pagefault_enable();
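The new flush_dcache_page() call mirrors what generic_perform_write() already does for the page-cache write path: if the page is writably mapped into userspace, the kernel view must be flushed before writing through the kernel mapping, or an aliasing (e.g. VIVT) D-cache can operate on stale lines. The pattern in context:

	/* Keep the user and kernel views of the page coherent before
	 * the kernel-mapping write; a no-op on non-aliasing caches. */
	if (mapping_writably_mapped(mapping))
		flush_dcache_page(page);

	pagefault_disable();
	tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
	pagefault_enable();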
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1a822ce2b24b..ec14d19ce501 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -850,7 +850,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
850 req->in.args[0].size = sizeof(*arg); 850 req->in.args[0].size = sizeof(*arg);
851 req->in.args[0].value = arg; 851 req->in.args[0].value = arg;
852 req->out.numargs = 1; 852 req->out.numargs = 1;
853 /* Variable length arguement used for backward compatibility 853 /* Variable length argument used for backward compatibility
854 with interface version < 7.5. Rest of init_out is zeroed 854 with interface version < 7.5. Rest of init_out is zeroed
855 by do_get_request(), so a short reply is not a problem */ 855 by do_get_request(), so a short reply is not a problem */
856 req->out.argvar = 1; 856 req->out.argvar = 1;
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index e0b53aa7bbec..fe5df5457656 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -1,62 +1,59 @@
1/* 1/*
2 * fs/generic_acl.c
3 *
4 * (C) 2005 Andreas Gruenbacher <agruen@suse.de> 2 * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
5 * 3 *
6 * This file is released under the GPL. 4 * This file is released under the GPL.
5 *
6 * Generic ACL support for in-memory filesystems.
7 */ 7 */
8 8
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/gfp.h>
10#include <linux/fs.h> 11#include <linux/fs.h>
11#include <linux/generic_acl.h> 12#include <linux/generic_acl.h>
13#include <linux/posix_acl.h>
14#include <linux/posix_acl_xattr.h>
12 15
13/** 16
14 * generic_acl_list - Generic xattr_handler->list() operation 17static size_t
15 * @ops: Filesystem specific getacl and setacl callbacks 18generic_acl_list(struct dentry *dentry, char *list, size_t list_size,
16 */ 19 const char *name, size_t name_len, int type)
17size_t
18generic_acl_list(struct inode *inode, struct generic_acl_operations *ops,
19 int type, char *list, size_t list_size)
20{ 20{
21 struct posix_acl *acl; 21 struct posix_acl *acl;
22 const char *name; 22 const char *xname;
23 size_t size; 23 size_t size;
24 24
25 acl = ops->getacl(inode, type); 25 acl = get_cached_acl(dentry->d_inode, type);
26 if (!acl) 26 if (!acl)
27 return 0; 27 return 0;
28 posix_acl_release(acl); 28 posix_acl_release(acl);
29 29
30 switch(type) { 30 switch (type) {
31 case ACL_TYPE_ACCESS: 31 case ACL_TYPE_ACCESS:
32 name = POSIX_ACL_XATTR_ACCESS; 32 xname = POSIX_ACL_XATTR_ACCESS;
33 break; 33 break;
34 34 case ACL_TYPE_DEFAULT:
35 case ACL_TYPE_DEFAULT: 35 xname = POSIX_ACL_XATTR_DEFAULT;
36 name = POSIX_ACL_XATTR_DEFAULT; 36 break;
37 break; 37 default:
38 38 return 0;
39 default:
40 return 0;
41 } 39 }
42 size = strlen(name) + 1; 40 size = strlen(xname) + 1;
43 if (list && size <= list_size) 41 if (list && size <= list_size)
44 memcpy(list, name, size); 42 memcpy(list, xname, size);
45 return size; 43 return size;
46} 44}
47 45
48/** 46static int
49 * generic_acl_get - Generic xattr_handler->get() operation 47generic_acl_get(struct dentry *dentry, const char *name, void *buffer,
50 * @ops: Filesystem specific getacl and setacl callbacks 48 size_t size, int type)
51 */
52int
53generic_acl_get(struct inode *inode, struct generic_acl_operations *ops,
54 int type, void *buffer, size_t size)
55{ 49{
56 struct posix_acl *acl; 50 struct posix_acl *acl;
57 int error; 51 int error;
58 52
59 acl = ops->getacl(inode, type); 53 if (strcmp(name, "") != 0)
54 return -EINVAL;
55
56 acl = get_cached_acl(dentry->d_inode, type);
60 if (!acl) 57 if (!acl)
61 return -ENODATA; 58 return -ENODATA;
62 error = posix_acl_to_xattr(acl, buffer, size); 59 error = posix_acl_to_xattr(acl, buffer, size);
@@ -65,17 +62,16 @@ generic_acl_get(struct inode *inode, struct generic_acl_operations *ops,
65 return error; 62 return error;
66} 63}
67 64
68/** 65static int
69 * generic_acl_set - Generic xattr_handler->set() operation 66generic_acl_set(struct dentry *dentry, const char *name, const void *value,
70 * @ops: Filesystem specific getacl and setacl callbacks 67 size_t size, int flags, int type)
71 */
72int
73generic_acl_set(struct inode *inode, struct generic_acl_operations *ops,
74 int type, const void *value, size_t size)
75{ 68{
69 struct inode *inode = dentry->d_inode;
76 struct posix_acl *acl = NULL; 70 struct posix_acl *acl = NULL;
77 int error; 71 int error;
78 72
73 if (strcmp(name, "") != 0)
74 return -EINVAL;
79 if (S_ISLNK(inode->i_mode)) 75 if (S_ISLNK(inode->i_mode))
80 return -EOPNOTSUPP; 76 return -EOPNOTSUPP;
81 if (!is_owner_or_cap(inode)) 77 if (!is_owner_or_cap(inode))
@@ -91,28 +87,27 @@ generic_acl_set(struct inode *inode, struct generic_acl_operations *ops,
91 error = posix_acl_valid(acl); 87 error = posix_acl_valid(acl);
92 if (error) 88 if (error)
93 goto failed; 89 goto failed;
94 switch(type) { 90 switch (type) {
95 case ACL_TYPE_ACCESS: 91 case ACL_TYPE_ACCESS:
96 mode = inode->i_mode; 92 mode = inode->i_mode;
97 error = posix_acl_equiv_mode(acl, &mode); 93 error = posix_acl_equiv_mode(acl, &mode);
98 if (error < 0) 94 if (error < 0)
99 goto failed; 95 goto failed;
100 inode->i_mode = mode; 96 inode->i_mode = mode;
101 if (error == 0) { 97 if (error == 0) {
102 posix_acl_release(acl); 98 posix_acl_release(acl);
103 acl = NULL; 99 acl = NULL;
104 } 100 }
105 break; 101 break;
106 102 case ACL_TYPE_DEFAULT:
107 case ACL_TYPE_DEFAULT: 103 if (!S_ISDIR(inode->i_mode)) {
108 if (!S_ISDIR(inode->i_mode)) { 104 error = -EINVAL;
109 error = -EINVAL; 105 goto failed;
110 goto failed; 106 }
111 } 107 break;
112 break;
113 } 108 }
114 } 109 }
115 ops->setacl(inode, type, acl); 110 set_cached_acl(inode, type, acl);
116 error = 0; 111 error = 0;
117failed: 112failed:
118 posix_acl_release(acl); 113 posix_acl_release(acl);
@@ -121,14 +116,12 @@ failed:
121 116
122/** 117/**
123 * generic_acl_init - Take care of acl inheritance at @inode create time 118 * generic_acl_init - Take care of acl inheritance at @inode create time
124 * @ops: Filesystem specific getacl and setacl callbacks
125 * 119 *
126 * Files created inside a directory with a default ACL inherit the 120 * Files created inside a directory with a default ACL inherit the
127 * directory's default ACL. 121 * directory's default ACL.
128 */ 122 */
129int 123int
130generic_acl_init(struct inode *inode, struct inode *dir, 124generic_acl_init(struct inode *inode, struct inode *dir)
131 struct generic_acl_operations *ops)
132{ 125{
133 struct posix_acl *acl = NULL; 126 struct posix_acl *acl = NULL;
134 mode_t mode = inode->i_mode; 127 mode_t mode = inode->i_mode;
@@ -136,7 +129,7 @@ generic_acl_init(struct inode *inode, struct inode *dir,
136 129
137 inode->i_mode = mode & ~current_umask(); 130 inode->i_mode = mode & ~current_umask();
138 if (!S_ISLNK(inode->i_mode)) 131 if (!S_ISLNK(inode->i_mode))
139 acl = ops->getacl(dir, ACL_TYPE_DEFAULT); 132 acl = get_cached_acl(dir, ACL_TYPE_DEFAULT);
140 if (acl) { 133 if (acl) {
141 struct posix_acl *clone; 134 struct posix_acl *clone;
142 135
@@ -145,7 +138,7 @@ generic_acl_init(struct inode *inode, struct inode *dir,
145 error = -ENOMEM; 138 error = -ENOMEM;
146 if (!clone) 139 if (!clone)
147 goto cleanup; 140 goto cleanup;
148 ops->setacl(inode, ACL_TYPE_DEFAULT, clone); 141 set_cached_acl(inode, ACL_TYPE_DEFAULT, clone);
149 posix_acl_release(clone); 142 posix_acl_release(clone);
150 } 143 }
151 clone = posix_acl_clone(acl, GFP_KERNEL); 144 clone = posix_acl_clone(acl, GFP_KERNEL);
@@ -156,7 +149,7 @@ generic_acl_init(struct inode *inode, struct inode *dir,
156 if (error >= 0) { 149 if (error >= 0) {
157 inode->i_mode = mode; 150 inode->i_mode = mode;
158 if (error > 0) 151 if (error > 0)
159 ops->setacl(inode, ACL_TYPE_ACCESS, clone); 152 set_cached_acl(inode, ACL_TYPE_ACCESS, clone);
160 } 153 }
161 posix_acl_release(clone); 154 posix_acl_release(clone);
162 } 155 }
@@ -169,20 +162,19 @@ cleanup:
169 162
170/** 163/**
171 * generic_acl_chmod - change the access acl of @inode upon chmod() 164 * generic_acl_chmod - change the access acl of @inode upon chmod()
172 * @ops: Filesystem specific getacl and setacl callbacks
173 * 165 *
174 * A chmod also changes the permissions of the owner, group/mask, and 166 * A chmod also changes the permissions of the owner, group/mask, and
175 * other ACL entries. 167 * other ACL entries.
176 */ 168 */
177int 169int
178generic_acl_chmod(struct inode *inode, struct generic_acl_operations *ops) 170generic_acl_chmod(struct inode *inode)
179{ 171{
180 struct posix_acl *acl, *clone; 172 struct posix_acl *acl, *clone;
181 int error = 0; 173 int error = 0;
182 174
183 if (S_ISLNK(inode->i_mode)) 175 if (S_ISLNK(inode->i_mode))
184 return -EOPNOTSUPP; 176 return -EOPNOTSUPP;
185 acl = ops->getacl(inode, ACL_TYPE_ACCESS); 177 acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
186 if (acl) { 178 if (acl) {
187 clone = posix_acl_clone(acl, GFP_KERNEL); 179 clone = posix_acl_clone(acl, GFP_KERNEL);
188 posix_acl_release(acl); 180 posix_acl_release(acl);
@@ -190,8 +182,37 @@ generic_acl_chmod(struct inode *inode, struct generic_acl_operations *ops)
190 return -ENOMEM; 182 return -ENOMEM;
191 error = posix_acl_chmod_masq(clone, inode->i_mode); 183 error = posix_acl_chmod_masq(clone, inode->i_mode);
192 if (!error) 184 if (!error)
193 ops->setacl(inode, ACL_TYPE_ACCESS, clone); 185 set_cached_acl(inode, ACL_TYPE_ACCESS, clone);
194 posix_acl_release(clone); 186 posix_acl_release(clone);
195 } 187 }
196 return error; 188 return error;
197} 189}
190
191int
192generic_check_acl(struct inode *inode, int mask)
193{
194 struct posix_acl *acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
195
196 if (acl) {
197 int error = posix_acl_permission(inode, acl, mask);
198 posix_acl_release(acl);
199 return error;
200 }
201 return -EAGAIN;
202}
203
204struct xattr_handler generic_acl_access_handler = {
205 .prefix = POSIX_ACL_XATTR_ACCESS,
206 .flags = ACL_TYPE_ACCESS,
207 .list = generic_acl_list,
208 .get = generic_acl_get,
209 .set = generic_acl_set,
210};
211
212struct xattr_handler generic_acl_default_handler = {
213 .prefix = POSIX_ACL_XATTR_DEFAULT,
214 .flags = ACL_TYPE_DEFAULT,
215 .list = generic_acl_list,
216 .get = generic_acl_get,
217 .set = generic_acl_set,
218};
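These two handler objects are what an in-memory filesystem now plugs into its superblock instead of calling the removed generic_acl_* wrappers directly. A hedged sketch of the wiring, mirroring what shmem does later in this series (the myfs names are illustrative):

	static struct xattr_handler *myfs_xattr_handlers[] = {
		&generic_acl_access_handler,   /* system.posix_acl_access  */
		&generic_acl_default_handler,  /* system.posix_acl_default */
		NULL
	};

	/* at fill_super time: */
	sb->s_xattr = myfs_xattr_handlers;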
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 5971359d2090..a47b43107112 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -8,6 +8,7 @@ config GFS2_FS
8 select FS_POSIX_ACL 8 select FS_POSIX_ACL
9 select CRC32 9 select CRC32
10 select SLOW_WORK 10 select SLOW_WORK
11 select QUOTACTL
11 help 12 help
12 A cluster filesystem. 13 A cluster filesystem.
13 14
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3fc4e3ac7d84..87ee309d4c24 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -12,6 +12,7 @@
12#include <linux/spinlock.h> 12#include <linux/spinlock.h>
13#include <linux/completion.h> 13#include <linux/completion.h>
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/xattr.h>
15#include <linux/posix_acl.h> 16#include <linux/posix_acl.h>
16#include <linux/posix_acl_xattr.h> 17#include <linux/posix_acl_xattr.h>
17#include <linux/gfs2_ondisk.h> 18#include <linux/gfs2_ondisk.h>
@@ -26,108 +27,44 @@
26#include "trans.h" 27#include "trans.h"
27#include "util.h" 28#include "util.h"
28 29
29#define ACL_ACCESS 1 30static const char *gfs2_acl_name(int type)
30#define ACL_DEFAULT 0
31
32int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
33 struct gfs2_ea_request *er, int *remove, mode_t *mode)
34{ 31{
35 struct posix_acl *acl; 32 switch (type) {
36 int error; 33 case ACL_TYPE_ACCESS:
37 34 return GFS2_POSIX_ACL_ACCESS;
38 error = gfs2_acl_validate_remove(ip, access); 35 case ACL_TYPE_DEFAULT:
39 if (error) 36 return GFS2_POSIX_ACL_DEFAULT;
40 return error;
41
42 if (!er->er_data)
43 return -EINVAL;
44
45 acl = posix_acl_from_xattr(er->er_data, er->er_data_len);
46 if (IS_ERR(acl))
47 return PTR_ERR(acl);
48 if (!acl) {
49 *remove = 1;
50 return 0;
51 }
52
53 error = posix_acl_valid(acl);
54 if (error)
55 goto out;
56
57 if (access) {
58 error = posix_acl_equiv_mode(acl, mode);
59 if (!error)
60 *remove = 1;
61 else if (error > 0)
62 error = 0;
63 } 37 }
64 38 return NULL;
65out:
66 posix_acl_release(acl);
67 return error;
68}
69
70int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
71{
72 if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl)
73 return -EOPNOTSUPP;
74 if (!is_owner_or_cap(&ip->i_inode))
75 return -EPERM;
76 if (S_ISLNK(ip->i_inode.i_mode))
77 return -EOPNOTSUPP;
78 if (!access && !S_ISDIR(ip->i_inode.i_mode))
79 return -EACCES;
80
81 return 0;
82} 39}
83 40
84static int acl_get(struct gfs2_inode *ip, const char *name, 41static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type)
85 struct posix_acl **acl, struct gfs2_ea_location *el,
86 char **datap, unsigned int *lenp)
87{ 42{
43 struct posix_acl *acl;
44 const char *name;
88 char *data; 45 char *data;
89 unsigned int len; 46 int len;
90 int error;
91
92 el->el_bh = NULL;
93 47
94 if (!ip->i_eattr) 48 if (!ip->i_eattr)
95 return 0; 49 return NULL;
96
97 error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, el);
98 if (error)
99 return error;
100 if (!el->el_ea)
101 return 0;
102 if (!GFS2_EA_DATA_LEN(el->el_ea))
103 goto out;
104 50
105 len = GFS2_EA_DATA_LEN(el->el_ea); 51 acl = get_cached_acl(&ip->i_inode, type);
106 data = kmalloc(len, GFP_NOFS); 52 if (acl != ACL_NOT_CACHED)
107 error = -ENOMEM; 53 return acl;
108 if (!data)
109 goto out;
110 54
111 error = gfs2_ea_get_copy(ip, el, data, len); 55 name = gfs2_acl_name(type);
112 if (error < 0) 56 if (name == NULL)
113 goto out_kfree; 57 return ERR_PTR(-EINVAL);
114 error = 0;
115 58
116 if (acl) { 59 len = gfs2_xattr_acl_get(ip, name, &data);
117 *acl = posix_acl_from_xattr(data, len); 60 if (len < 0)
118 if (IS_ERR(*acl)) 61 return ERR_PTR(len);
119 error = PTR_ERR(*acl); 62 if (len == 0)
120 } 63 return NULL;
121 64
122out_kfree: 65 acl = posix_acl_from_xattr(data, len);
123 if (error || !datap) { 66 kfree(data);
124 kfree(data); 67 return acl;
125 } else {
126 *datap = data;
127 *lenp = len;
128 }
129out:
130 return error;
131} 68}
132 69
133/** 70/**
@@ -140,14 +77,12 @@ out:
140 77
141int gfs2_check_acl(struct inode *inode, int mask) 78int gfs2_check_acl(struct inode *inode, int mask)
142{ 79{
143 struct gfs2_ea_location el; 80 struct posix_acl *acl;
144 struct posix_acl *acl = NULL;
145 int error; 81 int error;
146 82
147 error = acl_get(GFS2_I(inode), GFS2_POSIX_ACL_ACCESS, &acl, &el, NULL, NULL); 83 acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
148 brelse(el.el_bh); 84 if (IS_ERR(acl))
149 if (error) 85 return PTR_ERR(acl);
150 return error;
151 86
152 if (acl) { 87 if (acl) {
153 error = posix_acl_permission(inode, acl, mask); 88 error = posix_acl_permission(inode, acl, mask);
@@ -158,57 +93,75 @@ int gfs2_check_acl(struct inode *inode, int mask)
158 return -EAGAIN; 93 return -EAGAIN;
159} 94}
160 95
161static int munge_mode(struct gfs2_inode *ip, mode_t mode) 96static int gfs2_set_mode(struct inode *inode, mode_t mode)
162{ 97{
163 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 98 int error = 0;
164 struct buffer_head *dibh;
165 int error;
166 99
167 error = gfs2_trans_begin(sdp, RES_DINODE, 0); 100 if (mode != inode->i_mode) {
168 if (error) 101 struct iattr iattr;
169 return error;
170 102
171 error = gfs2_meta_inode_buffer(ip, &dibh); 103 iattr.ia_valid = ATTR_MODE;
172 if (!error) { 104 iattr.ia_mode = mode;
173 gfs2_assert_withdraw(sdp, 105
174 (ip->i_inode.i_mode & S_IFMT) == (mode & S_IFMT)); 106 error = gfs2_setattr_simple(GFS2_I(inode), &iattr);
175 ip->i_inode.i_mode = mode;
176 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
177 gfs2_dinode_out(ip, dibh->b_data);
178 brelse(dibh);
179 } 107 }
180 108
181 gfs2_trans_end(sdp); 109 return error;
110}
111
112static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl)
113{
114 int error;
115 int len;
116 char *data;
117 const char *name = gfs2_acl_name(type);
182 118
183 return 0; 119 BUG_ON(name == NULL);
120 len = posix_acl_to_xattr(acl, NULL, 0);
121 if (len == 0)
122 return 0;
123 data = kmalloc(len, GFP_NOFS);
124 if (data == NULL)
125 return -ENOMEM;
126 error = posix_acl_to_xattr(acl, data, len);
127 if (error < 0)
128 goto out;
129 error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS);
130 if (!error)
131 set_cached_acl(inode, type, acl);
132out:
133 kfree(data);
134 return error;
184} 135}
185 136
186int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) 137int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode)
187{ 138{
188 struct gfs2_ea_location el;
189 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 139 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
190 struct posix_acl *acl = NULL, *clone; 140 struct posix_acl *acl, *clone;
191 mode_t mode = ip->i_inode.i_mode; 141 mode_t mode = inode->i_mode;
192 char *data = NULL; 142 int error = 0;
193 unsigned int len;
194 int error;
195 143
196 if (!sdp->sd_args.ar_posix_acl) 144 if (!sdp->sd_args.ar_posix_acl)
197 return 0; 145 return 0;
198 if (S_ISLNK(ip->i_inode.i_mode)) 146 if (S_ISLNK(inode->i_mode))
199 return 0; 147 return 0;
200 148
201 error = acl_get(dip, GFS2_POSIX_ACL_DEFAULT, &acl, &el, &data, &len); 149 acl = gfs2_acl_get(dip, ACL_TYPE_DEFAULT);
202 brelse(el.el_bh); 150 if (IS_ERR(acl))
203 if (error) 151 return PTR_ERR(acl);
204 return error;
205 if (!acl) { 152 if (!acl) {
206 mode &= ~current_umask(); 153 mode &= ~current_umask();
207 if (mode != ip->i_inode.i_mode) 154 if (mode != inode->i_mode)
208 error = munge_mode(ip, mode); 155 error = gfs2_set_mode(inode, mode);
209 return error; 156 return error;
210 } 157 }
211 158
159 if (S_ISDIR(inode->i_mode)) {
160 error = gfs2_acl_set(inode, ACL_TYPE_DEFAULT, acl);
161 if (error)
162 goto out;
163 }
164
212 clone = posix_acl_clone(acl, GFP_NOFS); 165 clone = posix_acl_clone(acl, GFP_NOFS);
213 error = -ENOMEM; 166 error = -ENOMEM;
214 if (!clone) 167 if (!clone)
@@ -216,43 +169,32 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
216 posix_acl_release(acl); 169 posix_acl_release(acl);
217 acl = clone; 170 acl = clone;
218 171
219 if (S_ISDIR(ip->i_inode.i_mode)) {
220 error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
221 GFS2_POSIX_ACL_DEFAULT, data, len, 0);
222 if (error)
223 goto out;
224 }
225
226 error = posix_acl_create_masq(acl, &mode); 172 error = posix_acl_create_masq(acl, &mode);
227 if (error < 0) 173 if (error < 0)
228 goto out; 174 goto out;
229 if (error == 0) 175 if (error == 0)
230 goto munge; 176 goto munge;
231 177
232 posix_acl_to_xattr(acl, data, len); 178 error = gfs2_acl_set(inode, ACL_TYPE_ACCESS, acl);
233 error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
234 GFS2_POSIX_ACL_ACCESS, data, len, 0);
235 if (error) 179 if (error)
236 goto out; 180 goto out;
237munge: 181munge:
238 error = munge_mode(ip, mode); 182 error = gfs2_set_mode(inode, mode);
239out: 183out:
240 posix_acl_release(acl); 184 posix_acl_release(acl);
241 kfree(data);
242 return error; 185 return error;
243} 186}
244 187
245int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) 188int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
246{ 189{
247 struct posix_acl *acl = NULL, *clone; 190 struct posix_acl *acl, *clone;
248 struct gfs2_ea_location el;
249 char *data; 191 char *data;
250 unsigned int len; 192 unsigned int len;
251 int error; 193 int error;
252 194
253 error = acl_get(ip, GFS2_POSIX_ACL_ACCESS, &acl, &el, &data, &len); 195 acl = gfs2_acl_get(ip, ACL_TYPE_ACCESS);
254 if (error) 196 if (IS_ERR(acl))
255 goto out_brelse; 197 return PTR_ERR(acl);
256 if (!acl) 198 if (!acl)
257 return gfs2_setattr_simple(ip, attr); 199 return gfs2_setattr_simple(ip, attr);
258 200
@@ -265,15 +207,138 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
265 207
266 error = posix_acl_chmod_masq(acl, attr->ia_mode); 208 error = posix_acl_chmod_masq(acl, attr->ia_mode);
267 if (!error) { 209 if (!error) {
210 len = posix_acl_to_xattr(acl, NULL, 0);
211 data = kmalloc(len, GFP_NOFS);
212 error = -ENOMEM;
213 if (data == NULL)
214 goto out;
268 posix_acl_to_xattr(acl, data, len); 215 posix_acl_to_xattr(acl, data, len);
269 error = gfs2_ea_acl_chmod(ip, &el, attr, data); 216 error = gfs2_xattr_acl_chmod(ip, attr, data);
217 kfree(data);
218 set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl);
270 } 219 }
271 220
272out: 221out:
273 posix_acl_release(acl); 222 posix_acl_release(acl);
274 kfree(data);
275out_brelse:
276 brelse(el.el_bh);
277 return error; 223 return error;
278} 224}
279 225
226static int gfs2_acl_type(const char *name)
227{
228 if (strcmp(name, GFS2_POSIX_ACL_ACCESS) == 0)
229 return ACL_TYPE_ACCESS;
230 if (strcmp(name, GFS2_POSIX_ACL_DEFAULT) == 0)
231 return ACL_TYPE_DEFAULT;
232 return -EINVAL;
233}
234
235static int gfs2_xattr_system_get(struct dentry *dentry, const char *name,
236 void *buffer, size_t size, int xtype)
237{
238 struct inode *inode = dentry->d_inode;
239 struct posix_acl *acl;
240 int type;
241 int error;
242
243 type = gfs2_acl_type(name);
244 if (type < 0)
245 return type;
246
247 acl = gfs2_acl_get(GFS2_I(inode), type);
248 if (IS_ERR(acl))
249 return PTR_ERR(acl);
250 if (acl == NULL)
251 return -ENODATA;
252
253 error = posix_acl_to_xattr(acl, buffer, size);
254 posix_acl_release(acl);
255
256 return error;
257}
258
259static int gfs2_xattr_system_set(struct dentry *dentry, const char *name,
260 const void *value, size_t size, int flags,
261 int xtype)
262{
263 struct inode *inode = dentry->d_inode;
264 struct gfs2_sbd *sdp = GFS2_SB(inode);
265 struct posix_acl *acl = NULL;
266 int error = 0, type;
267
268 if (!sdp->sd_args.ar_posix_acl)
269 return -EOPNOTSUPP;
270
271 type = gfs2_acl_type(name);
272 if (type < 0)
273 return type;
274 if (flags & XATTR_CREATE)
275 return -EINVAL;
276 if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
277 return value ? -EACCES : 0;
278 if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
279 return -EPERM;
280 if (S_ISLNK(inode->i_mode))
281 return -EOPNOTSUPP;
282
283 if (!value)
284 goto set_acl;
285
286 acl = posix_acl_from_xattr(value, size);
287 if (!acl) {
288 /*
289 * acl_set_file(3) may request that we set default ACLs with
290 * zero length -- defend (gracefully) against that here.
291 */
292 goto out;
293 }
294 if (IS_ERR(acl)) {
295 error = PTR_ERR(acl);
296 goto out;
297 }
298
299 error = posix_acl_valid(acl);
300 if (error)
301 goto out_release;
302
303 error = -EINVAL;
304 if (acl->a_count > GFS2_ACL_MAX_ENTRIES)
305 goto out_release;
306
307 if (type == ACL_TYPE_ACCESS) {
308 mode_t mode = inode->i_mode;
309 error = posix_acl_equiv_mode(acl, &mode);
310
311 if (error <= 0) {
312 posix_acl_release(acl);
313 acl = NULL;
314
315 if (error < 0)
316 return error;
317 }
318
319 error = gfs2_set_mode(inode, mode);
320 if (error)
321 goto out_release;
322 }
323
324set_acl:
325 error = __gfs2_xattr_set(inode, name, value, size, 0, GFS2_EATYPE_SYS);
326 if (!error) {
327 if (acl)
328 set_cached_acl(inode, type, acl);
329 else
330 forget_cached_acl(inode, type);
331 }
332out_release:
333 posix_acl_release(acl);
334out:
335 return error;
336}
337
338struct xattr_handler gfs2_xattr_system_handler = {
339 .prefix = XATTR_SYSTEM_PREFIX,
340 .flags = GFS2_EATYPE_SYS,
341 .get = gfs2_xattr_system_get,
342 .set = gfs2_xattr_system_set,
343};
344
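
The rewritten gfs2_acl_chmod() above uses the usual measure-then-fill convention: posix_acl_to_xattr(acl, NULL, 0) reports the encoded length, the buffer is kmalloc'd, and a second call fills it. A minimal userspace sketch of the same idiom, with snprintf standing in for the encoder (all names below are illustrative, not GFS2 code):

#include <stdio.h>
#include <stdlib.h>

static int encode(const char *src, void *buf, size_t size)
{
        int needed = snprintf(buf, size, "acl:%s", src);
        return needed + 1;              /* bytes required, incl. NUL */
}

int main(void)
{
        const char *acl = "u::rwx,g::r-x,o::r--";
        int len = encode(acl, NULL, 0); /* first pass: size only */
        char *data = malloc(len);
        if (!data)
                return 1;               /* -ENOMEM in the kernel path */
        encode(acl, data, len);         /* second pass: fill */
        printf("%d bytes: %s\n", len, data);
        free(data);
        return 0;
}
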
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 6751930bfb64..9306a2e6620c 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -13,26 +13,12 @@
13#include "incore.h" 13#include "incore.h"
14 14
15#define GFS2_POSIX_ACL_ACCESS "posix_acl_access" 15#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
16#define GFS2_POSIX_ACL_ACCESS_LEN 16
17#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" 16#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
18#define GFS2_POSIX_ACL_DEFAULT_LEN 17 17#define GFS2_ACL_MAX_ENTRIES 25
19 18
20#define GFS2_ACL_IS_ACCESS(name, len) \ 19extern int gfs2_check_acl(struct inode *inode, int mask);
21 ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \ 20extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
22 !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len))) 21extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
23 22extern struct xattr_handler gfs2_xattr_system_handler;
24#define GFS2_ACL_IS_DEFAULT(name, len) \
25 ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
26 !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
27
28struct gfs2_ea_request;
29
30int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
31 struct gfs2_ea_request *er,
32 int *remove, mode_t *mode);
33int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
34int gfs2_check_acl(struct inode *inode, int mask);
35int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
36int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
37 23
38#endif /* __ACL_DOT_H__ */ 24#endif /* __ACL_DOT_H__ */
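
acl.h now exports a single gfs2_xattr_system_handler instead of the name-matching macros; the VFS matches handlers by prefix and hands the remaining name to .get/.set, and gfs2_acl_type() then distinguishes the two ACL names by strcmp. A rough model of that dispatch in plain C, assuming a much simplified handler table (the kernel's struct xattr_handler carries more context):

#include <stdio.h>
#include <string.h>

struct handler {
        const char *prefix;
        int (*get)(const char *name);
};

static int system_get(const char *name)
{
        if (strcmp(name, "posix_acl_access") == 0)
                return 1;       /* ACL_TYPE_ACCESS */
        if (strcmp(name, "posix_acl_default") == 0)
                return 2;       /* ACL_TYPE_DEFAULT */
        return -22;             /* -EINVAL, as in gfs2_acl_type() */
}

static const struct handler handlers[] = {
        { .prefix = "system.", .get = system_get },
};

static int dispatch(const char *full)
{
        size_t i, n;
        for (i = 0; i < sizeof(handlers) / sizeof(handlers[0]); i++) {
                n = strlen(handlers[i].prefix);
                if (strncmp(full, handlers[i].prefix, n) == 0)
                        return handlers[i].get(full + n);
        }
        return -95;             /* -EOPNOTSUPP */
}

int main(void)
{
        printf("%d\n", dispatch("system.posix_acl_access"));  /* 1 */
        printf("%d\n", dispatch("system.posix_acl_default")); /* 2 */
        return 0;
}
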
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 694b5d48f036..0c1d0b82dcf1 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -269,7 +269,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
269 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 269 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
270 unsigned offset = i_size & (PAGE_CACHE_SIZE-1); 270 unsigned offset = i_size & (PAGE_CACHE_SIZE-1);
271 unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize); 271 unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
272 struct backing_dev_info *bdi = mapping->backing_dev_info;
273 int i; 272 int i;
274 int ret; 273 int ret;
275 274
@@ -313,11 +312,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
313 312
314 if (ret || (--(wbc->nr_to_write) <= 0)) 313 if (ret || (--(wbc->nr_to_write) <= 0))
315 ret = 1; 314 ret = 1;
316 if (wbc->nonblocking && bdi_write_congested(bdi)) {
317 wbc->encountered_congestion = 1;
318 ret = 1;
319 }
320
321 } 315 }
322 gfs2_trans_end(sdp); 316 gfs2_trans_end(sdp);
323 return ret; 317 return ret;
@@ -338,7 +332,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
338static int gfs2_write_cache_jdata(struct address_space *mapping, 332static int gfs2_write_cache_jdata(struct address_space *mapping,
339 struct writeback_control *wbc) 333 struct writeback_control *wbc)
340{ 334{
341 struct backing_dev_info *bdi = mapping->backing_dev_info;
342 int ret = 0; 335 int ret = 0;
343 int done = 0; 336 int done = 0;
344 struct pagevec pvec; 337 struct pagevec pvec;
@@ -348,11 +341,6 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
348 int scanned = 0; 341 int scanned = 0;
349 int range_whole = 0; 342 int range_whole = 0;
350 343
351 if (wbc->nonblocking && bdi_write_congested(bdi)) {
352 wbc->encountered_congestion = 1;
353 return 0;
354 }
355
356 pagevec_init(&pvec, 0); 344 pagevec_init(&pvec, 0);
357 if (wbc->range_cyclic) { 345 if (wbc->range_cyclic) {
358 index = mapping->writeback_index; /* Start from prev offset */ 346 index = mapping->writeback_index; /* Start from prev offset */
@@ -819,8 +807,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
819 mark_inode_dirty(inode); 807 mark_inode_dirty(inode);
820 } 808 }
821 809
822 if (inode == sdp->sd_rindex) 810 if (inode == sdp->sd_rindex) {
823 adjust_fs_space(inode); 811 adjust_fs_space(inode);
812 ip->i_gh.gh_flags |= GL_NOCACHE;
813 }
824 814
825 brelse(dibh); 815 brelse(dibh);
826 gfs2_trans_end(sdp); 816 gfs2_trans_end(sdp);
@@ -889,8 +879,10 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
889 mark_inode_dirty(inode); 879 mark_inode_dirty(inode);
890 } 880 }
891 881
892 if (inode == sdp->sd_rindex) 882 if (inode == sdp->sd_rindex) {
893 adjust_fs_space(inode); 883 adjust_fs_space(inode);
884 ip->i_gh.gh_flags |= GL_NOCACHE;
885 }
894 886
895 brelse(dibh); 887 brelse(dibh);
896 gfs2_trans_end(sdp); 888 gfs2_trans_end(sdp);
@@ -1069,8 +1061,8 @@ out:
1069 1061
1070int gfs2_releasepage(struct page *page, gfp_t gfp_mask) 1062int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
1071{ 1063{
1072 struct inode *aspace = page->mapping->host; 1064 struct address_space *mapping = page->mapping;
1073 struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info; 1065 struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
1074 struct buffer_head *bh, *head; 1066 struct buffer_head *bh, *head;
1075 struct gfs2_bufdata *bd; 1067 struct gfs2_bufdata *bd;
1076 1068
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6d47379e794b..5e411d5f4697 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
@@ -541,7 +540,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
541 *ptr++ = cpu_to_be64(bn++); 540 *ptr++ = cpu_to_be64(bn++);
542 break; 541 break;
543 } 542 }
544 } while (state != ALLOC_DATA); 543 } while ((state != ALLOC_DATA) || !dblock);
545 544
546 ip->i_height = height; 545 ip->i_height = height;
547 gfs2_add_inode_blocks(&ip->i_inode, alloced); 546 gfs2_add_inode_blocks(&ip->i_inode, alloced);
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 91beddadd388..bb7907bde3d8 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 297d7e5cebad..25fddc100f18 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -525,38 +525,6 @@ consist_inode:
525 return ERR_PTR(-EIO); 525 return ERR_PTR(-EIO);
526} 526}
527 527
528
529/**
530 * dirent_first - Return the first dirent
531 * @dip: the directory
532 * @bh: The buffer
533 * @dent: Pointer to list of dirents
534 *
535 * return first dirent whether bh points to leaf or stuffed dinode
536 *
537 * Returns: IS_LEAF, IS_DINODE, or -errno
538 */
539
540static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
541 struct gfs2_dirent **dent)
542{
543 struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data;
544
545 if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) {
546 if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh))
547 return -EIO;
548 *dent = (struct gfs2_dirent *)(bh->b_data +
549 sizeof(struct gfs2_leaf));
550 return IS_LEAF;
551 } else {
552 if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI))
553 return -EIO;
554 *dent = (struct gfs2_dirent *)(bh->b_data +
555 sizeof(struct gfs2_dinode));
556 return IS_DINODE;
557 }
558}
559
560static int dirent_check_reclen(struct gfs2_inode *dip, 528static int dirent_check_reclen(struct gfs2_inode *dip,
561 const struct gfs2_dirent *d, const void *end_p) 529 const struct gfs2_dirent *d, const void *end_p)
562{ 530{
@@ -1006,7 +974,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
1006 divider = (start + half_len) << (32 - dip->i_depth); 974 divider = (start + half_len) << (32 - dip->i_depth);
1007 975
1008 /* Copy the entries */ 976 /* Copy the entries */
1009 dirent_first(dip, obh, &dent); 977 dent = (struct gfs2_dirent *)(obh->b_data + sizeof(struct gfs2_leaf));
1010 978
1011 do { 979 do {
1012 next = dent; 980 next = dent;
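
With dirent_first() removed, dir_split_leaf() computes the first entry directly: in a leaf block it always sits immediately after the fixed gfs2_leaf header, and later entries are reached by adding each record's rec_len. A small standalone sketch of that layout and walk (toy structs, not the on-disk GFS2 format):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct leaf_hdr { unsigned int magic; };        /* fixed-size header */
struct dirent   { unsigned short rec_len; char name[14]; };

int main(void)
{
        unsigned char *block = calloc(1, 64);   /* one leaf block */
        struct dirent *d, *first;

        /* first dirent starts right past the header, as in the patch */
        first = d = (struct dirent *)(block + sizeof(struct leaf_hdr));
        d->rec_len = sizeof(*d);
        strcpy(d->name, "alpha");
        d = (struct dirent *)((char *)d + d->rec_len);  /* rec_len walk */
        d->rec_len = sizeof(*d);
        strcpy(d->name, "beta");

        for (d = first; ; d = (struct dirent *)((char *)d + d->rec_len)) {
                printf("entry: %s\n", d->name);
                if (strcmp(d->name, "beta") == 0)
                        break;
        }
        free(block);
        return 0;
}
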
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index d15876e9aa26..c22c21174833 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4eb308aa3234..e6dd2aec6f82 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -569,6 +569,40 @@ static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
569 return ret; 569 return ret;
570} 570}
571 571
572/**
573 * gfs2_file_aio_write - Perform a write to a file
574 * @iocb: The io context
575 * @iov: The data to write
576 * @nr_segs: Number of @iov segments
577 * @pos: The file position
578 *
579 * We have to do a lock/unlock here to refresh the inode size for
 580 * O_APPEND writes, otherwise we can end up writing at the wrong
581 * offset. There is still a race, but provided the app is using its
582 * own file locking, this will make O_APPEND work as expected.
583 *
584 */
585
586static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
587 unsigned long nr_segs, loff_t pos)
588{
589 struct file *file = iocb->ki_filp;
590
591 if (file->f_flags & O_APPEND) {
592 struct dentry *dentry = file->f_dentry;
593 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
594 struct gfs2_holder gh;
595 int ret;
596
597 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
598 if (ret)
599 return ret;
600 gfs2_glock_dq_uninit(&gh);
601 }
602
603 return generic_file_aio_write(iocb, iov, nr_segs, pos);
604}
605
572#ifdef CONFIG_GFS2_FS_LOCKING_DLM 606#ifdef CONFIG_GFS2_FS_LOCKING_DLM
573 607
574/** 608/**
@@ -606,7 +640,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
606 640
607 if (!(fl->fl_flags & FL_POSIX)) 641 if (!(fl->fl_flags & FL_POSIX))
608 return -ENOLCK; 642 return -ENOLCK;
609 if (__mandatory_lock(&ip->i_inode)) 643 if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK)
610 return -ENOLCK; 644 return -ENOLCK;
611 645
612 if (cmd == F_CANCELLK) { 646 if (cmd == F_CANCELLK) {
@@ -711,7 +745,7 @@ const struct file_operations gfs2_file_fops = {
711 .read = do_sync_read, 745 .read = do_sync_read,
712 .aio_read = generic_file_aio_read, 746 .aio_read = generic_file_aio_read,
713 .write = do_sync_write, 747 .write = do_sync_write,
714 .aio_write = generic_file_aio_write, 748 .aio_write = gfs2_file_aio_write,
715 .unlocked_ioctl = gfs2_ioctl, 749 .unlocked_ioctl = gfs2_ioctl,
716 .mmap = gfs2_mmap, 750 .mmap = gfs2_mmap,
717 .open = gfs2_open, 751 .open = gfs2_open,
@@ -741,7 +775,7 @@ const struct file_operations gfs2_file_fops_nolock = {
741 .read = do_sync_read, 775 .read = do_sync_read,
742 .aio_read = generic_file_aio_read, 776 .aio_read = generic_file_aio_read,
743 .write = do_sync_write, 777 .write = do_sync_write,
744 .aio_write = generic_file_aio_write, 778 .aio_write = gfs2_file_aio_write,
745 .unlocked_ioctl = gfs2_ioctl, 779 .unlocked_ioctl = gfs2_ioctl,
746 .mmap = gfs2_mmap, 780 .mmap = gfs2_mmap,
747 .open = gfs2_open, 781 .open = gfs2_open,
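
The new gfs2_file_aio_write() takes and immediately drops a shared glock so an O_APPEND writer revalidates its cached file size before entering the generic write path. A userspace sketch of that refresh pattern, using a pthread rwlock as a stand-in for the glock (names illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t glock = PTHREAD_RWLOCK_INITIALIZER;
static long authoritative_size = 4096;  /* size as other nodes see it */
static long cached_size;                /* this node's possibly stale copy */

static void refresh_size(void)
{
        pthread_rwlock_rdlock(&glock);  /* gfs2_glock_nq_init, LM_ST_SHARED */
        cached_size = authoritative_size;
        pthread_rwlock_unlock(&glock);  /* gfs2_glock_dq_uninit */
}

static void append_write(long nbytes)
{
        refresh_size();                 /* only done when O_APPEND is set */
        printf("appending %ld bytes at offset %ld\n", nbytes, cached_size);
}

int main(void)
{
        append_write(512);
        return 0;
}
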
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 8b674b1f3a55..454d4b4eb36b 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -19,7 +19,6 @@
19#include <linux/list.h> 19#include <linux/list.h>
20#include <linux/wait.h> 20#include <linux/wait.h>
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/rwsem.h>
23#include <asm/uaccess.h> 22#include <asm/uaccess.h>
24#include <linux/seq_file.h> 23#include <linux/seq_file.h>
25#include <linux/debugfs.h> 24#include <linux/debugfs.h>
@@ -60,7 +59,6 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
60#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) 59#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
61static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); 60static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
62 61
63static DECLARE_RWSEM(gfs2_umount_flush_sem);
64static struct dentry *gfs2_root; 62static struct dentry *gfs2_root;
65static struct workqueue_struct *glock_workqueue; 63static struct workqueue_struct *glock_workqueue;
66struct workqueue_struct *gfs2_delete_workqueue; 64struct workqueue_struct *gfs2_delete_workqueue;
@@ -154,12 +152,14 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp,
154static void glock_free(struct gfs2_glock *gl) 152static void glock_free(struct gfs2_glock *gl)
155{ 153{
156 struct gfs2_sbd *sdp = gl->gl_sbd; 154 struct gfs2_sbd *sdp = gl->gl_sbd;
157 struct inode *aspace = gl->gl_aspace; 155 struct address_space *mapping = gfs2_glock2aspace(gl);
156 struct kmem_cache *cachep = gfs2_glock_cachep;
158 157
159 if (aspace) 158 GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
160 gfs2_aspace_put(aspace);
161 trace_gfs2_glock_put(gl); 159 trace_gfs2_glock_put(gl);
162 sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl); 160 if (mapping)
161 cachep = gfs2_glock_aspace_cachep;
162 sdp->sd_lockstruct.ls_ops->lm_put_lock(cachep, gl);
163} 163}
164 164
165/** 165/**
@@ -241,15 +241,14 @@ int gfs2_glock_put(struct gfs2_glock *gl)
241 int rv = 0; 241 int rv = 0;
242 242
243 write_lock(gl_lock_addr(gl->gl_hash)); 243 write_lock(gl_lock_addr(gl->gl_hash));
244 if (atomic_dec_and_test(&gl->gl_ref)) { 244 if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) {
245 hlist_del(&gl->gl_list); 245 hlist_del(&gl->gl_list);
246 write_unlock(gl_lock_addr(gl->gl_hash));
247 spin_lock(&lru_lock);
248 if (!list_empty(&gl->gl_lru)) { 246 if (!list_empty(&gl->gl_lru)) {
249 list_del_init(&gl->gl_lru); 247 list_del_init(&gl->gl_lru);
250 atomic_dec(&lru_count); 248 atomic_dec(&lru_count);
251 } 249 }
252 spin_unlock(&lru_lock); 250 spin_unlock(&lru_lock);
251 write_unlock(gl_lock_addr(gl->gl_hash));
253 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); 252 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
254 glock_free(gl); 253 glock_free(gl);
255 rv = 1; 254 rv = 1;
@@ -513,7 +512,6 @@ retry:
513 GLOCK_BUG_ON(gl, 1); 512 GLOCK_BUG_ON(gl, 1);
514 } 513 }
515 spin_unlock(&gl->gl_spin); 514 spin_unlock(&gl->gl_spin);
516 gfs2_glock_put(gl);
517 return; 515 return;
518 } 516 }
519 517
@@ -524,8 +522,6 @@ retry:
524 if (glops->go_xmote_bh) { 522 if (glops->go_xmote_bh) {
525 spin_unlock(&gl->gl_spin); 523 spin_unlock(&gl->gl_spin);
526 rv = glops->go_xmote_bh(gl, gh); 524 rv = glops->go_xmote_bh(gl, gh);
527 if (rv == -EAGAIN)
528 return;
529 spin_lock(&gl->gl_spin); 525 spin_lock(&gl->gl_spin);
530 if (rv) { 526 if (rv) {
531 do_error(gl, rv); 527 do_error(gl, rv);
@@ -540,7 +536,6 @@ out:
540 clear_bit(GLF_LOCK, &gl->gl_flags); 536 clear_bit(GLF_LOCK, &gl->gl_flags);
541out_locked: 537out_locked:
542 spin_unlock(&gl->gl_spin); 538 spin_unlock(&gl->gl_spin);
543 gfs2_glock_put(gl);
544} 539}
545 540
546static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, 541static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
@@ -600,7 +595,6 @@ __acquires(&gl->gl_spin)
600 595
601 if (!(ret & LM_OUT_ASYNC)) { 596 if (!(ret & LM_OUT_ASYNC)) {
602 finish_xmote(gl, ret); 597 finish_xmote(gl, ret);
603 gfs2_glock_hold(gl);
604 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 598 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
605 gfs2_glock_put(gl); 599 gfs2_glock_put(gl);
606 } else { 600 } else {
@@ -672,12 +666,17 @@ out:
672 return; 666 return;
673 667
674out_sched: 668out_sched:
669 clear_bit(GLF_LOCK, &gl->gl_flags);
670 smp_mb__after_clear_bit();
675 gfs2_glock_hold(gl); 671 gfs2_glock_hold(gl);
676 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 672 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
677 gfs2_glock_put_nolock(gl); 673 gfs2_glock_put_nolock(gl);
674 return;
675
678out_unlock: 676out_unlock:
679 clear_bit(GLF_LOCK, &gl->gl_flags); 677 clear_bit(GLF_LOCK, &gl->gl_flags);
680 goto out; 678 smp_mb__after_clear_bit();
679 return;
681} 680}
682 681
683static void delete_work_func(struct work_struct *work) 682static void delete_work_func(struct work_struct *work)
@@ -707,10 +706,12 @@ static void glock_work_func(struct work_struct *work)
707{ 706{
708 unsigned long delay = 0; 707 unsigned long delay = 0;
709 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); 708 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
709 int drop_ref = 0;
710 710
711 if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) 711 if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) {
712 finish_xmote(gl, gl->gl_reply); 712 finish_xmote(gl, gl->gl_reply);
713 down_read(&gfs2_umount_flush_sem); 713 drop_ref = 1;
714 }
714 spin_lock(&gl->gl_spin); 715 spin_lock(&gl->gl_spin);
715 if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && 716 if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
716 gl->gl_state != LM_ST_UNLOCKED && 717 gl->gl_state != LM_ST_UNLOCKED &&
@@ -723,10 +724,11 @@ static void glock_work_func(struct work_struct *work)
723 } 724 }
724 run_queue(gl, 0); 725 run_queue(gl, 0);
725 spin_unlock(&gl->gl_spin); 726 spin_unlock(&gl->gl_spin);
726 up_read(&gfs2_umount_flush_sem);
727 if (!delay || 727 if (!delay ||
728 queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) 728 queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
729 gfs2_glock_put(gl); 729 gfs2_glock_put(gl);
730 if (drop_ref)
731 gfs2_glock_put(gl);
730} 732}
731 733
732/** 734/**
@@ -746,10 +748,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
746 const struct gfs2_glock_operations *glops, int create, 748 const struct gfs2_glock_operations *glops, int create,
747 struct gfs2_glock **glp) 749 struct gfs2_glock **glp)
748{ 750{
751 struct super_block *s = sdp->sd_vfs;
749 struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type }; 752 struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
750 struct gfs2_glock *gl, *tmp; 753 struct gfs2_glock *gl, *tmp;
751 unsigned int hash = gl_hash(sdp, &name); 754 unsigned int hash = gl_hash(sdp, &name);
752 int error; 755 struct address_space *mapping;
753 756
754 read_lock(gl_lock_addr(hash)); 757 read_lock(gl_lock_addr(hash));
755 gl = search_bucket(hash, sdp, &name); 758 gl = search_bucket(hash, sdp, &name);
@@ -761,10 +764,14 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
761 if (!create) 764 if (!create)
762 return -ENOENT; 765 return -ENOENT;
763 766
764 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL); 767 if (glops->go_flags & GLOF_ASPACE)
768 gl = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_KERNEL);
769 else
770 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
765 if (!gl) 771 if (!gl)
766 return -ENOMEM; 772 return -ENOMEM;
767 773
774 atomic_inc(&sdp->sd_glock_disposal);
768 gl->gl_flags = 0; 775 gl->gl_flags = 0;
769 gl->gl_name = name; 776 gl->gl_name = name;
770 atomic_set(&gl->gl_ref, 1); 777 atomic_set(&gl->gl_ref, 1);
@@ -779,18 +786,18 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
779 gl->gl_tchange = jiffies; 786 gl->gl_tchange = jiffies;
780 gl->gl_object = NULL; 787 gl->gl_object = NULL;
781 gl->gl_sbd = sdp; 788 gl->gl_sbd = sdp;
782 gl->gl_aspace = NULL;
783 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); 789 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
784 INIT_WORK(&gl->gl_delete, delete_work_func); 790 INIT_WORK(&gl->gl_delete, delete_work_func);
785 791
786 /* If this glock protects actual on-disk data or metadata blocks, 792 mapping = gfs2_glock2aspace(gl);
787 create a VFS inode to manage the pages/buffers holding them. */ 793 if (mapping) {
788 if (glops == &gfs2_inode_glops || glops == &gfs2_rgrp_glops) { 794 mapping->a_ops = &gfs2_meta_aops;
789 gl->gl_aspace = gfs2_aspace_get(sdp); 795 mapping->host = s->s_bdev->bd_inode;
790 if (!gl->gl_aspace) { 796 mapping->flags = 0;
791 error = -ENOMEM; 797 mapping_set_gfp_mask(mapping, GFP_NOFS);
792 goto fail; 798 mapping->assoc_mapping = NULL;
793 } 799 mapping->backing_dev_info = s->s_bdi;
800 mapping->writeback_index = 0;
794 } 801 }
795 802
796 write_lock(gl_lock_addr(hash)); 803 write_lock(gl_lock_addr(hash));
@@ -807,10 +814,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
807 *glp = gl; 814 *glp = gl;
808 815
809 return 0; 816 return 0;
810
811fail:
812 kmem_cache_free(gfs2_glock_cachep, gl);
813 return error;
814} 817}
815 818
816/** 819/**
@@ -1361,10 +1364,6 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
1361 list_del_init(&gl->gl_lru); 1364 list_del_init(&gl->gl_lru);
1362 atomic_dec(&lru_count); 1365 atomic_dec(&lru_count);
1363 1366
1364 /* Check if glock is about to be freed */
1365 if (atomic_read(&gl->gl_ref) == 0)
1366 continue;
1367
1368 /* Test for being demotable */ 1367 /* Test for being demotable */
1369 if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { 1368 if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
1370 gfs2_glock_hold(gl); 1369 gfs2_glock_hold(gl);
@@ -1375,10 +1374,11 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
1375 handle_callback(gl, LM_ST_UNLOCKED, 0); 1374 handle_callback(gl, LM_ST_UNLOCKED, 0);
1376 nr--; 1375 nr--;
1377 } 1376 }
1377 clear_bit(GLF_LOCK, &gl->gl_flags);
1378 smp_mb__after_clear_bit();
1378 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1379 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1379 gfs2_glock_put_nolock(gl); 1380 gfs2_glock_put_nolock(gl);
1380 spin_unlock(&gl->gl_spin); 1381 spin_unlock(&gl->gl_spin);
1381 clear_bit(GLF_LOCK, &gl->gl_flags);
1382 spin_lock(&lru_lock); 1382 spin_lock(&lru_lock);
1383 continue; 1383 continue;
1384 } 1384 }
@@ -1508,35 +1508,13 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1508 1508
1509void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) 1509void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
1510{ 1510{
1511 unsigned long t;
1512 unsigned int x; 1511 unsigned int x;
1513 int cont;
1514 1512
1515 t = jiffies; 1513 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1516 1514 examine_bucket(clear_glock, sdp, x);
1517 for (;;) { 1515 flush_workqueue(glock_workqueue);
1518 cont = 0; 1516 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
1519 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) { 1517 gfs2_dump_lockstate(sdp);
1520 if (examine_bucket(clear_glock, sdp, x))
1521 cont = 1;
1522 }
1523
1524 if (!cont)
1525 break;
1526
1527 if (time_after_eq(jiffies,
1528 t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
1529 fs_warn(sdp, "Unmount seems to be stalled. "
1530 "Dumping lock state...\n");
1531 gfs2_dump_lockstate(sdp);
1532 t = jiffies;
1533 }
1534
1535 down_write(&gfs2_umount_flush_sem);
1536 invalidate_inodes(sdp->sd_vfs);
1537 up_write(&gfs2_umount_flush_sem);
1538 msleep(10);
1539 }
1540} 1518}
1541 1519
1542void gfs2_glock_finish_truncate(struct gfs2_inode *ip) 1520void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
@@ -1680,7 +1658,7 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1680 dtime *= 1000000/HZ; /* demote time in uSec */ 1658 dtime *= 1000000/HZ; /* demote time in uSec */
1681 if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) 1659 if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
1682 dtime = 0; 1660 dtime = 0;
1683 gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu a:%d r:%d\n", 1661 gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d r:%d\n",
1684 state2str(gl->gl_state), 1662 state2str(gl->gl_state),
1685 gl->gl_name.ln_type, 1663 gl->gl_name.ln_type,
1686 (unsigned long long)gl->gl_name.ln_number, 1664 (unsigned long long)gl->gl_name.ln_number,
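
gfs2_glock_put() above folds the reference drop and the lru_lock acquisition into one atomic_dec_and_lock() call, closing the window in which a zero-reference glock was still visible on the LRU (which is also why the shrinker's "about to be freed" check could go). A sketch of the dec-and-lock contract with C11 atomics and a pthread mutex, simplified to a single object:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int gl_ref = 1;
static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;

/* Returns 1 with *lock held iff this call dropped the last reference. */
static int dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
{
        int old = atomic_load(cnt);
        while (old > 1)                 /* fast path: not the last ref */
                if (atomic_compare_exchange_weak(cnt, &old, old - 1))
                        return 0;
        pthread_mutex_lock(lock);       /* slow path: may reach zero */
        if (atomic_fetch_sub(cnt, 1) == 1)
                return 1;               /* last ref; lock stays held */
        pthread_mutex_unlock(lock);
        return 0;
}

int main(void)
{
        if (dec_and_lock(&gl_ref, &lru_lock)) {
                /* safe: nobody can find the object via the LRU anymore */
                printf("last reference dropped under lru_lock\n");
                pthread_mutex_unlock(&lru_lock);
        }
        return 0;
}
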
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index c609894ec0d0..2bda1911b156 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -123,7 +123,7 @@ struct lm_lockops {
123 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); 123 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
124 void (*lm_unmount) (struct gfs2_sbd *sdp); 124 void (*lm_unmount) (struct gfs2_sbd *sdp);
125 void (*lm_withdraw) (struct gfs2_sbd *sdp); 125 void (*lm_withdraw) (struct gfs2_sbd *sdp);
126 void (*lm_put_lock) (struct kmem_cache *cachep, void *gl); 126 void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
127 unsigned int (*lm_lock) (struct gfs2_glock *gl, 127 unsigned int (*lm_lock) (struct gfs2_glock *gl,
128 unsigned int req_state, unsigned int flags); 128 unsigned int req_state, unsigned int flags);
129 void (*lm_cancel) (struct gfs2_glock *gl); 129 void (*lm_cancel) (struct gfs2_glock *gl);
@@ -180,13 +180,11 @@ static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
180 return gl->gl_state == LM_ST_SHARED; 180 return gl->gl_state == LM_ST_SHARED;
181} 181}
182 182
183static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl) 183static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
184{ 184{
185 int ret; 185 if (gl->gl_ops->go_flags & GLOF_ASPACE)
186 spin_lock(&gl->gl_spin); 186 return (struct address_space *)(gl + 1);
187 ret = test_bit(GLF_DEMOTE, &gl->gl_flags); 187 return NULL;
188 spin_unlock(&gl->gl_spin);
189 return ret;
190} 188}
191 189
192int gfs2_glock_get(struct gfs2_sbd *sdp, 190int gfs2_glock_get(struct gfs2_sbd *sdp,
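
gfs2_glock2aspace() relies on the address_space being co-allocated directly behind the glock in the new gfs2_glock_aspace_cachep objects, so (gl + 1) finds the mapping and stepping one glock back recovers the container (as gfs2_mapping2sbd() does). A minimal model of that layout trick; the slab guarantees alignment in the kernel, and the toy structs below share alignment:

#include <stdio.h>
#include <stdlib.h>

struct glock  { long state; };
struct aspace { long nrpages; };

static struct aspace *glock2aspace(struct glock *gl)
{
        return (struct aspace *)(gl + 1);       /* object right behind gl */
}

static struct glock *aspace2glock(struct aspace *m)
{
        return (struct glock *)m - 1;           /* step back over the glock */
}

int main(void)
{
        /* one allocation holds both objects, like the new slab cache */
        struct glock *gl = malloc(sizeof(struct glock) +
                                  sizeof(struct aspace));
        if (!gl)
                return 1;
        struct aspace *m = glock2aspace(gl);
        m->nrpages = 0;
        printf("round trip ok: %d\n", aspace2glock(m) == gl);
        free(gl);
        return 0;
}
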
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 6985eef06c39..49f97d3bb690 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -7,12 +7,12 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h> 13#include <linux/gfs2_ondisk.h>
15#include <linux/bio.h> 14#include <linux/bio.h>
15#include <linux/posix_acl.h>
16 16
17#include "gfs2.h" 17#include "gfs2.h"
18#include "incore.h" 18#include "incore.h"
@@ -86,7 +86,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
86 86
87static void rgrp_go_sync(struct gfs2_glock *gl) 87static void rgrp_go_sync(struct gfs2_glock *gl)
88{ 88{
89 struct address_space *metamapping = gl->gl_aspace->i_mapping; 89 struct address_space *metamapping = gfs2_glock2aspace(gl);
90 int error; 90 int error;
91 91
92 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) 92 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
@@ -112,7 +112,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
112 112
113static void rgrp_go_inval(struct gfs2_glock *gl, int flags) 113static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
114{ 114{
115 struct address_space *mapping = gl->gl_aspace->i_mapping; 115 struct address_space *mapping = gfs2_glock2aspace(gl);
116 116
117 BUG_ON(!(flags & DIO_METADATA)); 117 BUG_ON(!(flags & DIO_METADATA));
118 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); 118 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
@@ -133,7 +133,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
133static void inode_go_sync(struct gfs2_glock *gl) 133static void inode_go_sync(struct gfs2_glock *gl)
134{ 134{
135 struct gfs2_inode *ip = gl->gl_object; 135 struct gfs2_inode *ip = gl->gl_object;
136 struct address_space *metamapping = gl->gl_aspace->i_mapping; 136 struct address_space *metamapping = gfs2_glock2aspace(gl);
137 int error; 137 int error;
138 138
139 if (ip && !S_ISREG(ip->i_inode.i_mode)) 139 if (ip && !S_ISREG(ip->i_inode.i_mode))
@@ -182,10 +182,12 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
182 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); 182 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
183 183
184 if (flags & DIO_METADATA) { 184 if (flags & DIO_METADATA) {
185 struct address_space *mapping = gl->gl_aspace->i_mapping; 185 struct address_space *mapping = gfs2_glock2aspace(gl);
186 truncate_inode_pages(mapping, 0); 186 truncate_inode_pages(mapping, 0);
187 if (ip) 187 if (ip) {
188 set_bit(GIF_INVALID, &ip->i_flags); 188 set_bit(GIF_INVALID, &ip->i_flags);
189 forget_all_cached_acls(&ip->i_inode);
190 }
189 } 191 }
190 192
191 if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) 193 if (ip == GFS2_I(gl->gl_sbd->sd_rindex))
@@ -279,7 +281,8 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
279 281
280static int rgrp_go_demote_ok(const struct gfs2_glock *gl) 282static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
281{ 283{
282 return !gl->gl_aspace->i_mapping->nrpages; 284 const struct address_space *mapping = (const struct address_space *)(gl + 1);
285 return !mapping->nrpages;
283} 286}
284 287
285/** 288/**
@@ -384,8 +387,7 @@ static void iopen_go_callback(struct gfs2_glock *gl)
384 struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; 387 struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
385 388
386 if (gl->gl_demote_state == LM_ST_UNLOCKED && 389 if (gl->gl_demote_state == LM_ST_UNLOCKED &&
387 gl->gl_state == LM_ST_SHARED && 390 gl->gl_state == LM_ST_SHARED && ip) {
388 ip && test_bit(GIF_USER, &ip->i_flags)) {
389 gfs2_glock_hold(gl); 391 gfs2_glock_hold(gl);
390 if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) 392 if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
391 gfs2_glock_put_nolock(gl); 393 gfs2_glock_put_nolock(gl);
@@ -404,6 +406,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
404 .go_dump = inode_go_dump, 406 .go_dump = inode_go_dump,
405 .go_type = LM_TYPE_INODE, 407 .go_type = LM_TYPE_INODE,
406 .go_min_hold_time = HZ / 5, 408 .go_min_hold_time = HZ / 5,
409 .go_flags = GLOF_ASPACE,
407}; 410};
408 411
409const struct gfs2_glock_operations gfs2_rgrp_glops = { 412const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -415,6 +418,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
415 .go_dump = gfs2_rgrp_dump, 418 .go_dump = gfs2_rgrp_dump,
416 .go_type = LM_TYPE_RGRP, 419 .go_type = LM_TYPE_RGRP,
417 .go_min_hold_time = HZ / 5, 420 .go_min_hold_time = HZ / 5,
421 .go_flags = GLOF_ASPACE,
418}; 422};
419 423
420const struct gfs2_glock_operations gfs2_trans_glops = { 424const struct gfs2_glock_operations gfs2_trans_glops = {
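
inode_go_inval() now calls forget_all_cached_acls() whenever metadata is invalidated, so a node never trusts an ACL cached under a glock it has since lost. The VFS ACL cache is, in essence, a per-inode pointer with a not-cached sentinel; a deliberately simplified single-slot model of the set/forget pair used in the acl.c changes above:

#include <stdio.h>

#define ACL_NOT_CACHED ((void *)-1)     /* sentinel: cache holds nothing */

struct inode { void *i_acl; };

static void set_cached_acl(struct inode *i, void *acl)
{
        i->i_acl = acl;
}

static void forget_cached_acl(struct inode *i)
{
        i->i_acl = ACL_NOT_CACHED;
}

int main(void)
{
        struct inode ino = { ACL_NOT_CACHED };
        int acl_obj = 42;               /* stands in for a posix_acl */

        set_cached_acl(&ino, &acl_obj);
        printf("cached after set: %d\n", ino.i_acl != ACL_NOT_CACHED);
        forget_cached_acl(&ino);        /* what inode_go_inval triggers */
        printf("cached after invalidate: %d\n",
               ino.i_acl != ACL_NOT_CACHED);
        return 0;
}
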
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 6edb423f90b3..3aac46f6853e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -162,6 +162,8 @@ struct gfs2_glock_operations {
162 void (*go_callback) (struct gfs2_glock *gl); 162 void (*go_callback) (struct gfs2_glock *gl);
163 const int go_type; 163 const int go_type;
164 const unsigned long go_min_hold_time; 164 const unsigned long go_min_hold_time;
165 const unsigned long go_flags;
166#define GLOF_ASPACE 1
165}; 167};
166 168
167enum { 169enum {
@@ -225,7 +227,6 @@ struct gfs2_glock {
225 227
226 struct gfs2_sbd *gl_sbd; 228 struct gfs2_sbd *gl_sbd;
227 229
228 struct inode *gl_aspace;
229 struct list_head gl_ail_list; 230 struct list_head gl_ail_list;
230 atomic_t gl_ail_count; 231 atomic_t gl_ail_count;
231 struct delayed_work gl_work; 232 struct delayed_work gl_work;
@@ -258,7 +259,6 @@ enum {
258 GIF_INVALID = 0, 259 GIF_INVALID = 0,
259 GIF_QD_LOCKED = 1, 260 GIF_QD_LOCKED = 1,
260 GIF_SW_PAGED = 3, 261 GIF_SW_PAGED = 3,
261 GIF_USER = 4, /* user inode, not metadata addr space */
262}; 262};
263 263
264 264
@@ -429,7 +429,11 @@ struct gfs2_args {
429 unsigned int ar_meta:1; /* mount metafs */ 429 unsigned int ar_meta:1; /* mount metafs */
430 unsigned int ar_discard:1; /* discard requests */ 430 unsigned int ar_discard:1; /* discard requests */
431 unsigned int ar_errors:2; /* errors=withdraw | panic */ 431 unsigned int ar_errors:2; /* errors=withdraw | panic */
432 unsigned int ar_nobarrier:1; /* do not send barriers */
432 int ar_commit; /* Commit interval */ 433 int ar_commit; /* Commit interval */
434 int ar_statfs_quantum; /* The fast statfs interval */
435 int ar_quota_quantum; /* The quota interval */
436 int ar_statfs_percent; /* The % change to force sync */
433}; 437};
434 438
435struct gfs2_tune { 439struct gfs2_tune {
@@ -447,7 +451,6 @@ struct gfs2_tune {
447 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ 451 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
448 unsigned int gt_new_files_jdata; 452 unsigned int gt_new_files_jdata;
449 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ 453 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
450 unsigned int gt_stall_secs; /* Detects trouble! */
451 unsigned int gt_complain_secs; 454 unsigned int gt_complain_secs;
452 unsigned int gt_statfs_quantum; 455 unsigned int gt_statfs_quantum;
453 unsigned int gt_statfs_slow; 456 unsigned int gt_statfs_slow;
@@ -540,6 +543,8 @@ struct gfs2_sbd {
540 struct gfs2_holder sd_live_gh; 543 struct gfs2_holder sd_live_gh;
541 struct gfs2_glock *sd_rename_gl; 544 struct gfs2_glock *sd_rename_gl;
542 struct gfs2_glock *sd_trans_gl; 545 struct gfs2_glock *sd_trans_gl;
546 wait_queue_head_t sd_glock_wait;
547 atomic_t sd_glock_disposal;
543 548
544 /* Inode Stuff */ 549 /* Inode Stuff */
545 550
@@ -558,6 +563,7 @@ struct gfs2_sbd {
558 spinlock_t sd_statfs_spin; 563 spinlock_t sd_statfs_spin;
559 struct gfs2_statfs_change_host sd_statfs_master; 564 struct gfs2_statfs_change_host sd_statfs_master;
560 struct gfs2_statfs_change_host sd_statfs_local; 565 struct gfs2_statfs_change_host sd_statfs_local;
566 int sd_statfs_force_sync;
561 567
562 /* Resource group stuff */ 568 /* Resource group stuff */
563 569
@@ -610,7 +616,7 @@ struct gfs2_sbd {
610 unsigned int sd_log_blks_reserved; 616 unsigned int sd_log_blks_reserved;
611 unsigned int sd_log_commited_buf; 617 unsigned int sd_log_commited_buf;
612 unsigned int sd_log_commited_databuf; 618 unsigned int sd_log_commited_databuf;
613 unsigned int sd_log_commited_revoke; 619 int sd_log_commited_revoke;
614 620
615 unsigned int sd_log_num_buf; 621 unsigned int sd_log_num_buf;
616 unsigned int sd_log_num_revoke; 622 unsigned int sd_log_num_revoke;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index fb15d3b1f409..b1bf2694fb2b 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -45,7 +45,7 @@ static int iget_test(struct inode *inode, void *opaque)
45 struct gfs2_inode *ip = GFS2_I(inode); 45 struct gfs2_inode *ip = GFS2_I(inode);
46 u64 *no_addr = opaque; 46 u64 *no_addr = opaque;
47 47
48 if (ip->i_no_addr == *no_addr && test_bit(GIF_USER, &ip->i_flags)) 48 if (ip->i_no_addr == *no_addr)
49 return 1; 49 return 1;
50 50
51 return 0; 51 return 0;
@@ -58,7 +58,6 @@ static int iget_set(struct inode *inode, void *opaque)
58 58
59 inode->i_ino = (unsigned long)*no_addr; 59 inode->i_ino = (unsigned long)*no_addr;
60 ip->i_no_addr = *no_addr; 60 ip->i_no_addr = *no_addr;
61 set_bit(GIF_USER, &ip->i_flags);
62 return 0; 61 return 0;
63} 62}
64 63
@@ -84,7 +83,7 @@ static int iget_skip_test(struct inode *inode, void *opaque)
84 struct gfs2_inode *ip = GFS2_I(inode); 83 struct gfs2_inode *ip = GFS2_I(inode);
85 struct gfs2_skip_data *data = opaque; 84 struct gfs2_skip_data *data = opaque;
86 85
87 if (ip->i_no_addr == data->no_addr && test_bit(GIF_USER, &ip->i_flags)){ 86 if (ip->i_no_addr == data->no_addr) {
88 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){ 87 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){
89 data->skipped = 1; 88 data->skipped = 1;
90 return 0; 89 return 0;
@@ -103,7 +102,6 @@ static int iget_skip_set(struct inode *inode, void *opaque)
103 return 1; 102 return 1;
104 inode->i_ino = (unsigned long)(data->no_addr); 103 inode->i_ino = (unsigned long)(data->no_addr);
105 ip->i_no_addr = data->no_addr; 104 ip->i_no_addr = data->no_addr;
106 set_bit(GIF_USER, &ip->i_flags);
107 return 0; 105 return 0;
108} 106}
109 107
@@ -125,7 +123,7 @@ static struct inode *gfs2_iget_skip(struct super_block *sb,
125 * directory entry when gfs2_inode_lookup() is invoked. Part of the code 123 * directory entry when gfs2_inode_lookup() is invoked. Part of the code
126 * segment inside gfs2_inode_lookup code needs to get moved around. 124 * segment inside gfs2_inode_lookup code needs to get moved around.
127 * 125 *
128 * Clean up I_LOCK and I_NEW as well. 126 * Clears I_NEW as well.
129 **/ 127 **/
130 128
131void gfs2_set_iop(struct inode *inode) 129void gfs2_set_iop(struct inode *inode)
@@ -801,7 +799,8 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
801 return err; 799 return err;
802 } 800 }
803 801
804 err = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SECURITY, name, value, len, 0); 802 err = __gfs2_xattr_set(&ip->i_inode, name, value, len, 0,
803 GFS2_EATYPE_SECURITY);
805 kfree(value); 804 kfree(value);
806 kfree(name); 805 kfree(name);
807 806
@@ -871,7 +870,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
871 if (error) 870 if (error)
872 goto fail_gunlock2; 871 goto fail_gunlock2;
873 872
874 error = gfs2_acl_create(dip, GFS2_I(inode)); 873 error = gfs2_acl_create(dip, inode);
875 if (error) 874 if (error)
876 goto fail_gunlock2; 875 goto fail_gunlock2;
877 876
@@ -947,9 +946,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
947 946
948 str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); 947 str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
949 str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI); 948 str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI);
950 str->di_header.__pad0 = 0;
951 str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI); 949 str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
952 str->di_header.__pad1 = 0;
953 str->di_num.no_addr = cpu_to_be64(ip->i_no_addr); 950 str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
954 str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); 951 str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
955 str->di_mode = cpu_to_be32(ip->i_inode.i_mode); 952 str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 46df988323bc..0e0470ed34c2 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/dlm.h> 11#include <linux/dlm.h>
12#include <linux/slab.h>
12#include <linux/types.h> 13#include <linux/types.h>
13#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
14 15
@@ -21,6 +22,7 @@ static void gdlm_ast(void *arg)
21{ 22{
22 struct gfs2_glock *gl = arg; 23 struct gfs2_glock *gl = arg;
23 unsigned ret = gl->gl_state; 24 unsigned ret = gl->gl_state;
25 struct gfs2_sbd *sdp = gl->gl_sbd;
24 26
25 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); 27 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
26 28
@@ -29,7 +31,12 @@ static void gdlm_ast(void *arg)
29 31
30 switch (gl->gl_lksb.sb_status) { 32 switch (gl->gl_lksb.sb_status) {
31 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ 33 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
32 kmem_cache_free(gfs2_glock_cachep, gl); 34 if (gl->gl_ops->go_flags & GLOF_ASPACE)
35 kmem_cache_free(gfs2_glock_aspace_cachep, gl);
36 else
37 kmem_cache_free(gfs2_glock_cachep, gl);
38 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
39 wake_up(&sdp->sd_glock_wait);
33 return; 40 return;
34 case -DLM_ECANCEL: /* Cancel while getting lock */ 41 case -DLM_ECANCEL: /* Cancel while getting lock */
35 ret |= LM_OUT_CANCELED; 42 ret |= LM_OUT_CANCELED;
@@ -164,14 +171,16 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl,
164 return LM_OUT_ASYNC; 171 return LM_OUT_ASYNC;
165} 172}
166 173
167static void gdlm_put_lock(struct kmem_cache *cachep, void *ptr) 174static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
168{ 175{
169 struct gfs2_glock *gl = ptr; 176 struct gfs2_sbd *sdp = gl->gl_sbd;
170 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; 177 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
171 int error; 178 int error;
172 179
173 if (gl->gl_lksb.sb_lkid == 0) { 180 if (gl->gl_lksb.sb_lkid == 0) {
174 kmem_cache_free(cachep, gl); 181 kmem_cache_free(cachep, gl);
182 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
183 wake_up(&sdp->sd_glock_wait);
175 return; 184 return;
176 } 185 }
177 186
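
Every glock allocation now bumps sd_glock_disposal, and each free path (gdlm_ast, gdlm_put_lock, and the nolock variant added later in ops_fstype.c) decrements it and wakes sd_glock_wait, letting unmount wait for the last asynchronous DLM completion. A pthread model of that completion count, assuming three outstanding glocks:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int disposal = 3;         /* sd_glock_disposal: glocks left */
static pthread_mutex_t mu = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wq = PTHREAD_COND_INITIALIZER;

static void *free_glock(void *arg)
{
        (void)arg;
        /* ...kmem_cache_free() of the glock would happen here... */
        if (atomic_fetch_sub(&disposal, 1) == 1) {      /* dec_and_test */
                pthread_mutex_lock(&mu);
                pthread_cond_signal(&wq);               /* wake_up() */
                pthread_mutex_unlock(&mu);
        }
        return NULL;
}

int main(void)
{
        pthread_t t[3];
        int i;

        for (i = 0; i < 3; i++)
                pthread_create(&t[i], NULL, free_glock, NULL);

        pthread_mutex_lock(&mu);                        /* wait_event() */
        while (atomic_load(&disposal) != 0)
                pthread_cond_wait(&wq, &mu);
        pthread_mutex_unlock(&mu);
        printf("all glocks disposed; unmount may proceed\n");

        for (i = 0; i < 3; i++)
                pthread_join(t[i], NULL);
        return 0;
}
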
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 13c6237c5f67..e5bf4b59d46e 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -417,7 +417,7 @@ static unsigned int calc_reserved(struct gfs2_sbd *sdp)
417 databufhdrs_needed = (sdp->sd_log_commited_databuf + 417 databufhdrs_needed = (sdp->sd_log_commited_databuf +
418 (dbuf_limit - 1)) / dbuf_limit; 418 (dbuf_limit - 1)) / dbuf_limit;
419 419
420 if (sdp->sd_log_commited_revoke) 420 if (sdp->sd_log_commited_revoke > 0)
421 revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, 421 revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
422 sizeof(u64)); 422 sizeof(u64));
423 423
@@ -596,7 +596,9 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
596 memset(lh, 0, sizeof(struct gfs2_log_header)); 596 memset(lh, 0, sizeof(struct gfs2_log_header));
597 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); 597 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
598 lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); 598 lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
599 lh->lh_header.__pad0 = cpu_to_be64(0);
599 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); 600 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
601 lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
600 lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++); 602 lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
601 lh->lh_flags = cpu_to_be32(flags); 603 lh->lh_flags = cpu_to_be32(flags);
602 lh->lh_tail = cpu_to_be32(tail); 604 lh->lh_tail = cpu_to_be32(tail);
@@ -788,7 +790,6 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
788 gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) || 790 gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) ||
789 (((int)sdp->sd_log_commited_databuf) >= 0)); 791 (((int)sdp->sd_log_commited_databuf) >= 0));
790 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; 792 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
791 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
792 reserved = calc_reserved(sdp); 793 reserved = calc_reserved(sdp);
793 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); 794 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved);
794 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; 795 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
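
sd_log_commited_revoke becomes a plain int and the reservation check becomes "> 0" because the running total (revokes added minus revokes removed per transaction) can dip below zero transiently; with an unsigned counter that underflow wraps, looks like pending revokes, and made the removed ">= 0" assertion treat a legitimate transient negative as corruption. A two-line demonstration:

#include <stdio.h>

int main(void)
{
        unsigned int old_style = 0;     /* old: unsigned int */
        int new_style = 0;              /* new: int */

        /* one revoke added, two removed in this transaction */
        old_style += 1 - 2;
        new_style += 1 - 2;

        if (old_style)                  /* wraps: looks like pending work */
                printf("unsigned: %u pending?!\n", old_style);
        if (new_style > 0)              /* the new check: correctly skipped */
                printf("signed: reserving\n");
        else
                printf("signed: %d, nothing to reserve\n", new_style);
        return 0;
}
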
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 9969ff062c5b..adc260fbea90 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -132,6 +132,7 @@ static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type)
132static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) 132static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
133{ 133{
134 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); 134 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
135 struct gfs2_meta_header *mh;
135 struct gfs2_trans *tr; 136 struct gfs2_trans *tr;
136 137
137 lock_buffer(bd->bd_bh); 138 lock_buffer(bd->bd_bh);
@@ -148,6 +149,9 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
148 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); 149 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
149 gfs2_meta_check(sdp, bd->bd_bh); 150 gfs2_meta_check(sdp, bd->bd_bh);
150 gfs2_pin(sdp, bd->bd_bh); 151 gfs2_pin(sdp, bd->bd_bh);
152 mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
153 mh->__pad0 = cpu_to_be64(0);
154 mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
151 sdp->sd_log_num_buf++; 155 sdp->sd_log_num_buf++;
152 list_add(&le->le_list, &sdp->sd_log_le_buf); 156 list_add(&le->le_list, &sdp->sd_log_le_buf);
153 tr->tr_num_buf_new++; 157 tr->tr_num_buf_new++;
@@ -524,9 +528,9 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
524 gfs2_pin(sdp, bd->bd_bh); 528 gfs2_pin(sdp, bd->bd_bh);
525 tr->tr_num_databuf_new++; 529 tr->tr_num_databuf_new++;
526 sdp->sd_log_num_databuf++; 530 sdp->sd_log_num_databuf++;
527 list_add(&le->le_list, &sdp->sd_log_le_databuf); 531 list_add_tail(&le->le_list, &sdp->sd_log_le_databuf);
528 } else { 532 } else {
529 list_add(&le->le_list, &sdp->sd_log_le_ordered); 533 list_add_tail(&le->le_list, &sdp->sd_log_le_ordered);
530 } 534 }
531out: 535out:
532 gfs2_log_unlock(sdp); 536 gfs2_log_unlock(sdp);
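
databuf_lo_add() switches from list_add() to list_add_tail(), so iteration over the ordered and journaled data lists now sees buffers in submission order rather than reversed. The head-versus-tail insertion difference in miniature, with a toy singly linked list:

#include <stdio.h>

struct node { int v; struct node *next; };

int main(void)
{
        struct node a = {1, NULL}, b = {2, NULL}, c = {3, NULL};
        struct node *head, *n;

        /* list_add(): insert at head; iteration reverses submission */
        head = &a; b.next = head; head = &b; c.next = head; head = &c;
        for (n = head; n; n = n->next)
                printf("lifo: %d\n", n->v);     /* 3 2 1 */

        /* list_add_tail(): append; iteration preserves submission */
        struct node x = {1, NULL}, y = {2, NULL}, z = {3, NULL};
        x.next = &y; y.next = &z;
        for (n = &x; n; n = n->next)
                printf("fifo: %d\n", n->v);     /* 1 2 3 */
        return 0;
}
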
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 5b31f7741a8f..a88fadc704bb 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -52,6 +52,22 @@ static void gfs2_init_glock_once(void *foo)
52 atomic_set(&gl->gl_ail_count, 0); 52 atomic_set(&gl->gl_ail_count, 0);
53} 53}
54 54
55static void gfs2_init_gl_aspace_once(void *foo)
56{
57 struct gfs2_glock *gl = foo;
58 struct address_space *mapping = (struct address_space *)(gl + 1);
59
60 gfs2_init_glock_once(gl);
61 memset(mapping, 0, sizeof(*mapping));
62 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
63 spin_lock_init(&mapping->tree_lock);
64 spin_lock_init(&mapping->i_mmap_lock);
65 INIT_LIST_HEAD(&mapping->private_list);
66 spin_lock_init(&mapping->private_lock);
67 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
68 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
69}
70
55/** 71/**
56 * init_gfs2_fs - Register GFS2 as a filesystem 72 * init_gfs2_fs - Register GFS2 as a filesystem
57 * 73 *
@@ -78,6 +94,14 @@ static int __init init_gfs2_fs(void)
78 if (!gfs2_glock_cachep) 94 if (!gfs2_glock_cachep)
79 goto fail; 95 goto fail;
80 96
97 gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock (aspace)",
98 sizeof(struct gfs2_glock) +
99 sizeof(struct address_space),
100 0, 0, gfs2_init_gl_aspace_once);
101
102 if (!gfs2_glock_aspace_cachep)
103 goto fail;
104
81 gfs2_inode_cachep = kmem_cache_create("gfs2_inode", 105 gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
82 sizeof(struct gfs2_inode), 106 sizeof(struct gfs2_inode),
83 0, SLAB_RECLAIM_ACCOUNT| 107 0, SLAB_RECLAIM_ACCOUNT|
@@ -144,6 +168,9 @@ fail:
144 if (gfs2_inode_cachep) 168 if (gfs2_inode_cachep)
145 kmem_cache_destroy(gfs2_inode_cachep); 169 kmem_cache_destroy(gfs2_inode_cachep);
146 170
171 if (gfs2_glock_aspace_cachep)
172 kmem_cache_destroy(gfs2_glock_aspace_cachep);
173
147 if (gfs2_glock_cachep) 174 if (gfs2_glock_cachep)
148 kmem_cache_destroy(gfs2_glock_cachep); 175 kmem_cache_destroy(gfs2_glock_cachep);
149 176
@@ -169,6 +196,7 @@ static void __exit exit_gfs2_fs(void)
169 kmem_cache_destroy(gfs2_rgrpd_cachep); 196 kmem_cache_destroy(gfs2_rgrpd_cachep);
170 kmem_cache_destroy(gfs2_bufdata_cachep); 197 kmem_cache_destroy(gfs2_bufdata_cachep);
171 kmem_cache_destroy(gfs2_inode_cachep); 198 kmem_cache_destroy(gfs2_inode_cachep);
199 kmem_cache_destroy(gfs2_glock_aspace_cachep);
172 kmem_cache_destroy(gfs2_glock_cachep); 200 kmem_cache_destroy(gfs2_glock_cachep);
173 201
174 gfs2_sys_uninit(); 202 gfs2_sys_uninit();
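
gfs2_init_gl_aspace_once() is a slab constructor: it runs once when a cache object is first created, not on every kmem_cache_alloc(), which is why it can unconditionally initialize the embedded address_space's locks and lists. A toy object pool with the same construct-once behaviour (illustrative only):

#include <stdio.h>

struct obj { int ctor_runs; int in_use; };

#define POOL 4
static struct obj pool[POOL];
static int populated;

static void ctor(struct obj *o)         /* the slab constructor */
{
        o->ctor_runs++;                 /* kernel: init locks, lists, tree */
}

static struct obj *cache_alloc(void)
{
        int i;

        if (!populated) {               /* first touch: construct all */
                for (i = 0; i < POOL; i++)
                        ctor(&pool[i]);
                populated = 1;
        }
        for (i = 0; i < POOL; i++)
                if (!pool[i].in_use) {
                        pool[i].in_use = 1;
                        return &pool[i];
                }
        return NULL;
}

static void cache_free(struct obj *o)
{
        o->in_use = 0;                  /* must be left re-initialized */
}

int main(void)
{
        struct obj *a = cache_alloc();
        cache_free(a);
        a = cache_alloc();              /* same object, ctor not rerun */
        printf("ctor ran %d time(s)\n", a->ctor_runs);  /* prints 1 */
        return 0;
}
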
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index cb8d7a93d5ec..0bb12c80937a 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -93,49 +93,13 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
93 return err; 93 return err;
94} 94}
95 95
96static const struct address_space_operations aspace_aops = { 96const struct address_space_operations gfs2_meta_aops = {
97 .writepage = gfs2_aspace_writepage, 97 .writepage = gfs2_aspace_writepage,
98 .releasepage = gfs2_releasepage, 98 .releasepage = gfs2_releasepage,
99 .sync_page = block_sync_page, 99 .sync_page = block_sync_page,
100}; 100};
101 101
102/** 102/**
103 * gfs2_aspace_get - Create and initialize a struct inode structure
104 * @sdp: the filesystem the aspace is in
105 *
106 * Right now a struct inode is just a struct inode. Maybe Linux
107 * will supply a more lightweight address space construct (that works)
108 * in the future.
109 *
110 * Make sure pages/buffers in this aspace aren't in high memory.
111 *
112 * Returns: the aspace
113 */
114
115struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
116{
117 struct inode *aspace;
118 struct gfs2_inode *ip;
119
120 aspace = new_inode(sdp->sd_vfs);
121 if (aspace) {
122 mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS);
123 aspace->i_mapping->a_ops = &aspace_aops;
124 aspace->i_size = ~0ULL;
125 ip = GFS2_I(aspace);
126 clear_bit(GIF_USER, &ip->i_flags);
127 insert_inode_hash(aspace);
128 }
129 return aspace;
130}
131
132void gfs2_aspace_put(struct inode *aspace)
133{
134 remove_inode_hash(aspace);
135 iput(aspace);
136}
137
138/**
139 * gfs2_meta_sync - Sync all buffers associated with a glock 103 * gfs2_meta_sync - Sync all buffers associated with a glock
140 * @gl: The glock 104 * @gl: The glock
141 * 105 *
@@ -143,7 +107,7 @@ void gfs2_aspace_put(struct inode *aspace)
143 107
144void gfs2_meta_sync(struct gfs2_glock *gl) 108void gfs2_meta_sync(struct gfs2_glock *gl)
145{ 109{
146 struct address_space *mapping = gl->gl_aspace->i_mapping; 110 struct address_space *mapping = gfs2_glock2aspace(gl);
147 int error; 111 int error;
148 112
149 filemap_fdatawrite(mapping); 113 filemap_fdatawrite(mapping);
@@ -164,7 +128,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
164 128
165struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) 129struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
166{ 130{
167 struct address_space *mapping = gl->gl_aspace->i_mapping; 131 struct address_space *mapping = gfs2_glock2aspace(gl);
168 struct gfs2_sbd *sdp = gl->gl_sbd; 132 struct gfs2_sbd *sdp = gl->gl_sbd;
169 struct page *page; 133 struct page *page;
170 struct buffer_head *bh; 134 struct buffer_head *bh;
@@ -344,8 +308,10 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
344 308
345void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta) 309void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta)
346{ 310{
347 struct gfs2_sbd *sdp = GFS2_SB(bh->b_page->mapping->host); 311 struct address_space *mapping = bh->b_page->mapping;
312 struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
348 struct gfs2_bufdata *bd = bh->b_private; 313 struct gfs2_bufdata *bd = bh->b_private;
314
349 if (test_clear_buffer_pinned(bh)) { 315 if (test_clear_buffer_pinned(bh)) {
350 list_del_init(&bd->bd_le.le_list); 316 list_del_init(&bd->bd_le.le_list);
351 if (meta) { 317 if (meta) {
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index de270c2f9b63..6a1d9ba16411 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -37,8 +37,16 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
37 0, from_head - to_head); 37 0, from_head - to_head);
38} 38}
39 39
40struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp); 40extern const struct address_space_operations gfs2_meta_aops;
41void gfs2_aspace_put(struct inode *aspace); 41
42static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
43{
44 struct inode *inode = mapping->host;
45 if (mapping->a_ops == &gfs2_meta_aops)
46 return (((struct gfs2_glock *)mapping) - 1)->gl_sbd;
47 else
48 return inode->i_sb->s_fs_info;
49}
42 50
43void gfs2_meta_sync(struct gfs2_glock *gl); 51void gfs2_meta_sync(struct gfs2_glock *gl);
44 52
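
gfs2_mapping2sbd() uses the a_ops pointer as a type tag: a mapping whose ops are gfs2_meta_aops is known to be embedded in a glock, so the container arithmetic is safe, while any other mapping belongs to an ordinary inode. The same vtable-identity trick in miniature:

#include <stdio.h>

struct a_ops { int unused; };
static const struct a_ops meta_aops = { 0 };    /* identity matters, not contents */
static const struct a_ops file_aops = { 0 };

struct mapping { const struct a_ops *a_ops; };
struct glock   { long id; struct mapping map; };

static int is_glock_mapping(const struct mapping *m)
{
        return m->a_ops == &meta_aops;  /* pointer identity as type tag */
}

int main(void)
{
        struct glock gl = { 7, { &meta_aops } };
        struct mapping plain = { &file_aops };

        printf("%d %d\n", is_glock_mapping(&gl.map),
               is_glock_mapping(&plain));       /* 1 0 */
        return 0;
}
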
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 52fb6c048981..c1309ed1c496 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -18,6 +18,7 @@
18#include <linux/mount.h> 18#include <linux/mount.h>
19#include <linux/gfs2_ondisk.h> 19#include <linux/gfs2_ondisk.h>
20#include <linux/slow-work.h> 20#include <linux/slow-work.h>
21#include <linux/quotaops.h>
21 22
22#include "gfs2.h" 23#include "gfs2.h"
23#include "incore.h" 24#include "incore.h"
@@ -62,13 +63,9 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
62 gt->gt_quota_warn_period = 10; 63 gt->gt_quota_warn_period = 10;
63 gt->gt_quota_scale_num = 1; 64 gt->gt_quota_scale_num = 1;
64 gt->gt_quota_scale_den = 1; 65 gt->gt_quota_scale_den = 1;
65 gt->gt_quota_quantum = 60;
66 gt->gt_new_files_jdata = 0; 66 gt->gt_new_files_jdata = 0;
67 gt->gt_max_readahead = 1 << 18; 67 gt->gt_max_readahead = 1 << 18;
68 gt->gt_stall_secs = 600;
69 gt->gt_complain_secs = 10; 68 gt->gt_complain_secs = 10;
70 gt->gt_statfs_quantum = 30;
71 gt->gt_statfs_slow = 0;
72} 69}
73 70
74static struct gfs2_sbd *init_sbd(struct super_block *sb) 71static struct gfs2_sbd *init_sbd(struct super_block *sb)
@@ -84,6 +81,8 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
84 81
85 gfs2_tune_init(&sdp->sd_tune); 82 gfs2_tune_init(&sdp->sd_tune);
86 83
84 init_waitqueue_head(&sdp->sd_glock_wait);
85 atomic_set(&sdp->sd_glock_disposal, 0);
87 spin_lock_init(&sdp->sd_statfs_spin); 86 spin_lock_init(&sdp->sd_statfs_spin);
88 87
89 spin_lock_init(&sdp->sd_rindex_spin); 88 spin_lock_init(&sdp->sd_rindex_spin);
@@ -725,7 +724,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
725 goto fail; 724 goto fail;
726 } 725 }
727 726
728 error = -EINVAL; 727 error = -EUSERS;
729 if (!gfs2_jindex_size(sdp)) { 728 if (!gfs2_jindex_size(sdp)) {
730 fs_err(sdp, "no journals!\n"); 729 fs_err(sdp, "no journals!\n");
731 goto fail_jindex; 730 goto fail_jindex;
@@ -985,16 +984,24 @@ static const match_table_t nolock_tokens = {
985 { Opt_err, NULL }, 984 { Opt_err, NULL },
986}; 985};
987 986
987static void nolock_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
988{
989 struct gfs2_sbd *sdp = gl->gl_sbd;
990 kmem_cache_free(cachep, gl);
991 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
992 wake_up(&sdp->sd_glock_wait);
993}
994
988static const struct lm_lockops nolock_ops = { 995static const struct lm_lockops nolock_ops = {
989 .lm_proto_name = "lock_nolock", 996 .lm_proto_name = "lock_nolock",
990 .lm_put_lock = kmem_cache_free, 997 .lm_put_lock = nolock_put_lock,
991 .lm_tokens = &nolock_tokens, 998 .lm_tokens = &nolock_tokens,
992}; 999};
993 1000
994/** 1001/**
995 * gfs2_lm_mount - mount a locking protocol 1002 * gfs2_lm_mount - mount a locking protocol
996 * @sdp: the filesystem 1003 * @sdp: the filesystem
997 * @args: mount arguements 1004 * @args: mount arguments
998 * @silent: if 1, don't complain if the FS isn't a GFS2 fs 1005 * @silent: if 1, don't complain if the FS isn't a GFS2 fs
999 * 1006 *
1000 * Returns: errno 1007 * Returns: errno
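init_sbd() now seeds sd_glock_wait and sd_glock_disposal, and nolock_put_lock() pairs with them: each freed glock decrements the counter, and the final decrement wakes anyone waiting for all glocks to be disposed of. A hedged userspace analogue of that dec-and-test-then-wake pattern follows; pthreads stand in for the kernel's atomic_dec_and_test() plus wait queue.

/*
 * Userspace analogue of sd_glock_disposal/sd_glock_wait: the thread
 * dropping the last object wakes whoever waits for the count to hit 0.
 */
#include <pthread.h>
#include <stdatomic.h>

static atomic_int disposal;		/* like sd_glock_disposal */
static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;

static void put_object(void)
{
	/* fetch_sub returns the old value: 1 means we just hit zero */
	if (atomic_fetch_sub(&disposal, 1) == 1) {
		pthread_mutex_lock(&lk);
		pthread_cond_broadcast(&waitq);
		pthread_mutex_unlock(&lk);
	}
}

static void wait_for_disposal(void)
{
	pthread_mutex_lock(&lk);
	while (atomic_load(&disposal) != 0)
		pthread_cond_wait(&waitq, &lk);
	pthread_mutex_unlock(&lk);
}

int main(void)
{
	atomic_store(&disposal, 2);	/* two live objects */
	put_object();
	put_object();
	wait_for_disposal();		/* returns at once: count is 0 */
	return 0;
}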
@@ -1114,7 +1121,7 @@ void gfs2_online_uevent(struct gfs2_sbd *sdp)
1114 * Returns: errno 1121 * Returns: errno
1115 */ 1122 */
1116 1123
1117static int fill_super(struct super_block *sb, void *data, int silent) 1124static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent)
1118{ 1125{
1119 struct gfs2_sbd *sdp; 1126 struct gfs2_sbd *sdp;
1120 struct gfs2_holder mount_gh; 1127 struct gfs2_holder mount_gh;
@@ -1125,17 +1132,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1125 printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n"); 1132 printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n");
1126 return -ENOMEM; 1133 return -ENOMEM;
1127 } 1134 }
1128 1135 sdp->sd_args = *args;
1129 sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
1130 sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
1131 sdp->sd_args.ar_commit = 60;
1132 sdp->sd_args.ar_errors = GFS2_ERRORS_DEFAULT;
1133
1134 error = gfs2_mount_args(sdp, &sdp->sd_args, data);
1135 if (error) {
1136 printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
1137 goto fail;
1138 }
1139 1136
1140 if (sdp->sd_args.ar_spectator) { 1137 if (sdp->sd_args.ar_spectator) {
1141 sb->s_flags |= MS_RDONLY; 1138 sb->s_flags |= MS_RDONLY;
@@ -1143,11 +1140,15 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1143 } 1140 }
1144 if (sdp->sd_args.ar_posix_acl) 1141 if (sdp->sd_args.ar_posix_acl)
1145 sb->s_flags |= MS_POSIXACL; 1142 sb->s_flags |= MS_POSIXACL;
1143 if (sdp->sd_args.ar_nobarrier)
1144 set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
1146 1145
1147 sb->s_magic = GFS2_MAGIC; 1146 sb->s_magic = GFS2_MAGIC;
1148 sb->s_op = &gfs2_super_ops; 1147 sb->s_op = &gfs2_super_ops;
1149 sb->s_export_op = &gfs2_export_ops; 1148 sb->s_export_op = &gfs2_export_ops;
1150 sb->s_xattr = gfs2_xattr_handlers; 1149 sb->s_xattr = gfs2_xattr_handlers;
1150 sb->s_qcop = &gfs2_quotactl_ops;
1151 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
1151 sb->s_time_gran = 1; 1152 sb->s_time_gran = 1;
1152 sb->s_maxbytes = MAX_LFS_FILESIZE; 1153 sb->s_maxbytes = MAX_LFS_FILESIZE;
1153 1154
@@ -1160,6 +1161,15 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1160 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; 1161 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
1161 1162
1162 sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit; 1163 sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit;
1164 sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
1165 if (sdp->sd_args.ar_statfs_quantum) {
1166 sdp->sd_tune.gt_statfs_slow = 0;
1167 sdp->sd_tune.gt_statfs_quantum = sdp->sd_args.ar_statfs_quantum;
1168 }
1169 else {
1170 sdp->sd_tune.gt_statfs_slow = 1;
1171 sdp->sd_tune.gt_statfs_quantum = 30;
1172 }
1163 1173
1164 error = init_names(sdp, silent); 1174 error = init_names(sdp, silent);
1165 if (error) 1175 if (error)
@@ -1230,10 +1240,9 @@ fail_sb:
1230fail_locking: 1240fail_locking:
1231 init_locking(sdp, &mount_gh, UNDO); 1241 init_locking(sdp, &mount_gh, UNDO);
1232fail_lm: 1242fail_lm:
1243 invalidate_inodes(sb);
1233 gfs2_gl_hash_clear(sdp); 1244 gfs2_gl_hash_clear(sdp);
1234 gfs2_lm_unmount(sdp); 1245 gfs2_lm_unmount(sdp);
1235 while (invalidate_inodes(sb))
1236 yield();
1237fail_sys: 1246fail_sys:
1238 gfs2_sys_fs_del(sdp); 1247 gfs2_sys_fs_del(sdp);
1239fail: 1248fail:
@@ -1243,18 +1252,127 @@ fail:
1243 return error; 1252 return error;
1244} 1253}
1245 1254
1246static int gfs2_get_sb(struct file_system_type *fs_type, int flags, 1255static int set_gfs2_super(struct super_block *s, void *data)
1247 const char *dev_name, void *data, struct vfsmount *mnt)
1248{ 1256{
1249 return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); 1257 s->s_bdev = data;
1258 s->s_dev = s->s_bdev->bd_dev;
1259
1260 /*
1261 * We set the bdi here to the queue backing, file systems can
1262 * overwrite this in ->fill_super()
1263 */
1264 s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
1265 return 0;
1250} 1266}
1251 1267
1252static int test_meta_super(struct super_block *s, void *ptr) 1268static int test_gfs2_super(struct super_block *s, void *ptr)
1253{ 1269{
1254 struct block_device *bdev = ptr; 1270 struct block_device *bdev = ptr;
1255 return (bdev == s->s_bdev); 1271 return (bdev == s->s_bdev);
1256} 1272}
1257 1273
1274/**
1275 * gfs2_get_sb - Get the GFS2 superblock
1276 * @fs_type: The GFS2 filesystem type
1277 * @flags: Mount flags
1278 * @dev_name: The name of the device
1279 * @data: The mount arguments
1280 * @mnt: The vfsmnt for this mount
1281 *
 1282 * Q. Why not use get_sb_bdev()?
1283 * A. We need to select one of two root directories to mount, independent
1284 * of whether this is the initial, or subsequent, mount of this sb
1285 *
1286 * Returns: 0 or -ve on error
1287 */
1288
1289static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1290 const char *dev_name, void *data, struct vfsmount *mnt)
1291{
1292 struct block_device *bdev;
1293 struct super_block *s;
1294 fmode_t mode = FMODE_READ;
1295 int error;
1296 struct gfs2_args args;
1297 struct gfs2_sbd *sdp;
1298
1299 if (!(flags & MS_RDONLY))
1300 mode |= FMODE_WRITE;
1301
1302 bdev = open_bdev_exclusive(dev_name, mode, fs_type);
1303 if (IS_ERR(bdev))
1304 return PTR_ERR(bdev);
1305
1306 /*
1307 * once the super is inserted into the list by sget, s_umount
1308 * will protect the lockfs code from trying to start a snapshot
1309 * while we are mounting
1310 */
1311 mutex_lock(&bdev->bd_fsfreeze_mutex);
1312 if (bdev->bd_fsfreeze_count > 0) {
1313 mutex_unlock(&bdev->bd_fsfreeze_mutex);
1314 error = -EBUSY;
1315 goto error_bdev;
1316 }
1317 s = sget(fs_type, test_gfs2_super, set_gfs2_super, bdev);
1318 mutex_unlock(&bdev->bd_fsfreeze_mutex);
1319 error = PTR_ERR(s);
1320 if (IS_ERR(s))
1321 goto error_bdev;
1322
1323 memset(&args, 0, sizeof(args));
1324 args.ar_quota = GFS2_QUOTA_DEFAULT;
1325 args.ar_data = GFS2_DATA_DEFAULT;
1326 args.ar_commit = 60;
1327 args.ar_statfs_quantum = 30;
1328 args.ar_quota_quantum = 60;
1329 args.ar_errors = GFS2_ERRORS_DEFAULT;
1330
1331 error = gfs2_mount_args(&args, data);
1332 if (error) {
1333 printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
1334 if (s->s_root)
1335 goto error_super;
1336 deactivate_locked_super(s);
1337 return error;
1338 }
1339
1340 if (s->s_root) {
1341 error = -EBUSY;
1342 if ((flags ^ s->s_flags) & MS_RDONLY)
1343 goto error_super;
1344 close_bdev_exclusive(bdev, mode);
1345 } else {
1346 char b[BDEVNAME_SIZE];
1347
1348 s->s_flags = flags;
1349 s->s_mode = mode;
1350 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
1351 sb_set_blocksize(s, block_size(bdev));
1352 error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
1353 if (error) {
1354 deactivate_locked_super(s);
1355 return error;
1356 }
1357 s->s_flags |= MS_ACTIVE;
1358 bdev->bd_super = s;
1359 }
1360
1361 sdp = s->s_fs_info;
1362 mnt->mnt_sb = s;
1363 if (args.ar_meta)
1364 mnt->mnt_root = dget(sdp->sd_master_dir);
1365 else
1366 mnt->mnt_root = dget(sdp->sd_root_dir);
1367 return 0;
1368
1369error_super:
1370 deactivate_locked_super(s);
1371error_bdev:
1372 close_bdev_exclusive(bdev, mode);
1373 return error;
1374}
1375
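gfs2_get_sb() open-codes what get_sb_bdev() would otherwise do so it can pick between sd_master_dir and sd_root_dir afterwards. The heart of it is sget(), a find-or-create lookup driven by a test() callback (does this existing super match the bdev?) and a set() callback (initialise a fresh one). A simplified sketch of that shape follows; the names are illustrative and the sb_lock serialisation of the real sget() is omitted.

/*
 * Simplified shape of sget(): find an existing super via test(), or
 * allocate one and initialise it via set().
 */
#include <stdlib.h>

struct super {
	void *key;			/* like s_bdev */
	struct super *next;
};

static struct super *supers;

static struct super *sget_like(int (*test)(struct super *, void *),
			       int (*set)(struct super *, void *),
			       void *key)
{
	struct super *s;

	for (s = supers; s; s = s->next)
		if (test(s, key))
			return s;	/* existing mount: reuse it */

	s = calloc(1, sizeof(*s));
	if (!s)
		return NULL;
	if (set(s, key)) {		/* may refuse, like set_meta_super */
		free(s);
		return NULL;
	}
	s->next = supers;
	supers = s;
	return s;			/* caller runs fill_super on it */
}

static int test_bdev(struct super *s, void *key) { return s->key == key; }
static int set_bdev(struct super *s, void *key) { s->key = key; return 0; }

int main(void)
{
	int bdev;			/* any unique address as key */
	struct super *a = sget_like(test_bdev, set_bdev, &bdev);
	struct super *b = sget_like(test_bdev, set_bdev, &bdev);

	return !(a && a == b);		/* second call finds the first */
}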
1258static int set_meta_super(struct super_block *s, void *ptr) 1376static int set_meta_super(struct super_block *s, void *ptr)
1259{ 1377{
1260 return -EINVAL; 1378 return -EINVAL;
@@ -1274,13 +1392,17 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
1274 dev_name, error); 1392 dev_name, error);
1275 return error; 1393 return error;
1276 } 1394 }
1277 s = sget(&gfs2_fs_type, test_meta_super, set_meta_super, 1395 s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super,
1278 path.dentry->d_inode->i_sb->s_bdev); 1396 path.dentry->d_inode->i_sb->s_bdev);
1279 path_put(&path); 1397 path_put(&path);
1280 if (IS_ERR(s)) { 1398 if (IS_ERR(s)) {
1281 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); 1399 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
1282 return PTR_ERR(s); 1400 return PTR_ERR(s);
1283 } 1401 }
1402 if ((flags ^ s->s_flags) & MS_RDONLY) {
1403 deactivate_locked_super(s);
1404 return -EBUSY;
1405 }
1284 sdp = s->s_fs_info; 1406 sdp = s->s_fs_info;
1285 mnt->mnt_sb = s; 1407 mnt->mnt_sb = s;
1286 mnt->mnt_root = dget(sdp->sd_master_dir); 1408 mnt->mnt_root = dget(sdp->sd_master_dir);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 247436c10deb..4e64352d49de 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -748,7 +748,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
748 struct gfs2_rgrpd *nrgd; 748 struct gfs2_rgrpd *nrgd;
749 unsigned int num_gh; 749 unsigned int num_gh;
750 int dir_rename = 0; 750 int dir_rename = 0;
751 int alloc_required; 751 int alloc_required = 0;
752 unsigned int x; 752 unsigned int x;
753 int error; 753 int error;
754 754
@@ -867,7 +867,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
867 goto out_gunlock; 867 goto out_gunlock;
868 } 868 }
869 869
870 alloc_required = error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); 870 if (nip == NULL)
871 alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
872 error = alloc_required;
871 if (error < 0) 873 if (error < 0)
872 goto out_gunlock; 874 goto out_gunlock;
873 error = 0; 875 error = 0;
@@ -974,121 +976,62 @@ out:
974} 976}
975 977
976/** 978/**
977 * gfs2_readlinki - return the contents of a symlink 979 * gfs2_follow_link - Follow a symbolic link
978 * @ip: the symlink's inode 980 * @dentry: The dentry of the link
979 * @buf: a pointer to the buffer to be filled 981 * @nd: Data that we pass to vfs_follow_link()
980 * @len: a pointer to the length of @buf
981 * 982 *
982 * If @buf is too small, a piece of memory is kmalloc()ed and needs 983 * This can handle symlinks of any size.
983 * to be freed by the caller.
984 * 984 *
985 * Returns: errno 985 * Returns: 0 on success or error code
986 */ 986 */
987 987
988static int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) 988static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
989{ 989{
990 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
990 struct gfs2_holder i_gh; 991 struct gfs2_holder i_gh;
991 struct buffer_head *dibh; 992 struct buffer_head *dibh;
992 unsigned int x; 993 unsigned int x;
994 char *buf;
993 int error; 995 int error;
994 996
995 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); 997 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
996 error = gfs2_glock_nq(&i_gh); 998 error = gfs2_glock_nq(&i_gh);
997 if (error) { 999 if (error) {
998 gfs2_holder_uninit(&i_gh); 1000 gfs2_holder_uninit(&i_gh);
999 return error; 1001 nd_set_link(nd, ERR_PTR(error));
1002 return NULL;
1000 } 1003 }
1001 1004
1002 if (!ip->i_disksize) { 1005 if (!ip->i_disksize) {
1003 gfs2_consist_inode(ip); 1006 gfs2_consist_inode(ip);
1004 error = -EIO; 1007 buf = ERR_PTR(-EIO);
1005 goto out; 1008 goto out;
1006 } 1009 }
1007 1010
1008 error = gfs2_meta_inode_buffer(ip, &dibh); 1011 error = gfs2_meta_inode_buffer(ip, &dibh);
1009 if (error) 1012 if (error) {
1013 buf = ERR_PTR(error);
1010 goto out; 1014 goto out;
1011
1012 x = ip->i_disksize + 1;
1013 if (x > *len) {
1014 *buf = kmalloc(x, GFP_NOFS);
1015 if (!*buf) {
1016 error = -ENOMEM;
1017 goto out_brelse;
1018 }
1019 } 1015 }
1020 1016
1021 memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x); 1017 x = ip->i_disksize + 1;
1022 *len = x; 1018 buf = kmalloc(x, GFP_NOFS);
1023 1019 if (!buf)
1024out_brelse: 1020 buf = ERR_PTR(-ENOMEM);
1021 else
1022 memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
1025 brelse(dibh); 1023 brelse(dibh);
1026out: 1024out:
1027 gfs2_glock_dq_uninit(&i_gh); 1025 gfs2_glock_dq_uninit(&i_gh);
1028 return error; 1026 nd_set_link(nd, buf);
1029} 1027 return NULL;
1030
1031/**
1032 * gfs2_readlink - Read the value of a symlink
1033 * @dentry: the symlink
1034 * @buf: the buffer to read the symlink data into
1035 * @size: the size of the buffer
1036 *
1037 * Returns: errno
1038 */
1039
1040static int gfs2_readlink(struct dentry *dentry, char __user *user_buf,
1041 int user_size)
1042{
1043 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
1044 char array[GFS2_FAST_NAME_SIZE], *buf = array;
1045 unsigned int len = GFS2_FAST_NAME_SIZE;
1046 int error;
1047
1048 error = gfs2_readlinki(ip, &buf, &len);
1049 if (error)
1050 return error;
1051
1052 if (user_size > len - 1)
1053 user_size = len - 1;
1054
1055 if (copy_to_user(user_buf, buf, user_size))
1056 error = -EFAULT;
1057 else
1058 error = user_size;
1059
1060 if (buf != array)
1061 kfree(buf);
1062
1063 return error;
1064} 1028}
1065 1029
1066/** 1030static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1067 * gfs2_follow_link - Follow a symbolic link
1068 * @dentry: The dentry of the link
1069 * @nd: Data that we pass to vfs_follow_link()
1070 *
1071 * This can handle symlinks of any size. It is optimised for symlinks
1072 * under GFS2_FAST_NAME_SIZE.
1073 *
1074 * Returns: 0 on success or error code
1075 */
1076
1077static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
1078{ 1031{
1079 struct gfs2_inode *ip = GFS2_I(dentry->d_inode); 1032 char *s = nd_get_link(nd);
1080 char array[GFS2_FAST_NAME_SIZE], *buf = array; 1033 if (!IS_ERR(s))
1081 unsigned int len = GFS2_FAST_NAME_SIZE; 1034 kfree(s);
1082 int error;
1083
1084 error = gfs2_readlinki(ip, &buf, &len);
1085 if (!error) {
1086 error = vfs_follow_link(nd, buf);
1087 if (buf != array)
1088 kfree(buf);
1089 }
1090
1091 return ERR_PTR(error);
1092} 1035}
1093 1036
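The rewritten gfs2_follow_link() allocates the link body, parks either the buffer or an ERR_PTR-encoded errno in the nameidata via nd_set_link(), and gfs2_put_link() frees it once the VFS has consumed it. The ERR_PTR/IS_ERR encoding packs a small negative errno into the top range of pointer values; the userspace sketch below reproduces the idiom (the casts are implementation-defined in portable C, so this is illustration only).

/* The ERR_PTR/IS_ERR pointer-or-errno encoding, open-coded. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_ERRNO 4095

static void *ERR_PTR(intptr_t err) { return (void *)err; }
static intptr_t PTR_ERR(const void *p) { return (intptr_t)p; }
static int IS_ERR(const void *p)
{
	return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
}

static char *read_link(int fail)
{
	char *buf;

	if (fail)
		return ERR_PTR(-EIO);	/* errno travels in the pointer */
	buf = malloc(7);
	if (!buf)
		return ERR_PTR(-ENOMEM);
	strcpy(buf, "target");
	return buf;
}

int main(void)
{
	char *s = read_link(0);

	if (IS_ERR(s))			/* the nd_get_link() consumer side */
		return (int)-PTR_ERR(s);
	printf("%s\n", s);
	free(s);			/* the gfs2_put_link() side */
	return 0;
}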
1094/** 1037/**
@@ -1423,8 +1366,9 @@ const struct inode_operations gfs2_dir_iops = {
1423}; 1366};
1424 1367
1425const struct inode_operations gfs2_symlink_iops = { 1368const struct inode_operations gfs2_symlink_iops = {
1426 .readlink = gfs2_readlink, 1369 .readlink = generic_readlink,
1427 .follow_link = gfs2_follow_link, 1370 .follow_link = gfs2_follow_link,
1371 .put_link = gfs2_put_link,
1428 .permission = gfs2_permission, 1372 .permission = gfs2_permission,
1429 .setattr = gfs2_setattr, 1373 .setattr = gfs2_setattr,
1430 .getattr = gfs2_getattr, 1374 .getattr = gfs2_getattr,
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 2e9b9326bfc9..6dbcbad6ab17 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -15,7 +15,7 @@
15 * fuzziness in the current usage value of IDs that are being used on different 15 * fuzziness in the current usage value of IDs that are being used on different
16 * nodes in the cluster simultaneously. So, it is possible for a user on 16 * nodes in the cluster simultaneously. So, it is possible for a user on
 17 * multiple nodes to overrun their quota, but that overrun is controllable. 17 * multiple nodes to overrun their quota, but that overrun is controllable.
18 * Since quota tags are part of transactions, there is no need to a quota check 18 * Since quota tags are part of transactions, there is no need for a quota check
19 * program to be run on node crashes or anything like that. 19 * program to be run on node crashes or anything like that.
20 * 20 *
 21 * There are a couple of knobs that let the administrator manage the quota 21 * There are a couple of knobs that let the administrator manage the quota
@@ -47,6 +47,8 @@
47#include <linux/gfs2_ondisk.h> 47#include <linux/gfs2_ondisk.h>
48#include <linux/kthread.h> 48#include <linux/kthread.h>
49#include <linux/freezer.h> 49#include <linux/freezer.h>
50#include <linux/quota.h>
51#include <linux/dqblk_xfs.h>
50 52
51#include "gfs2.h" 53#include "gfs2.h"
52#include "incore.h" 54#include "incore.h"
@@ -65,13 +67,6 @@
65#define QUOTA_USER 1 67#define QUOTA_USER 1
66#define QUOTA_GROUP 0 68#define QUOTA_GROUP 0
67 69
68struct gfs2_quota_host {
69 u64 qu_limit;
70 u64 qu_warn;
71 s64 qu_value;
72 u32 qu_ll_next;
73};
74
75struct gfs2_quota_change_host { 70struct gfs2_quota_change_host {
76 u64 qc_change; 71 u64 qc_change;
77 u32 qc_flags; /* GFS2_QCF_... */ 72 u32 qc_flags; /* GFS2_QCF_... */
@@ -164,7 +159,7 @@ fail:
164 return error; 159 return error;
165} 160}
166 161
167static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create, 162static int qd_get(struct gfs2_sbd *sdp, int user, u32 id,
168 struct gfs2_quota_data **qdp) 163 struct gfs2_quota_data **qdp)
169{ 164{
170 struct gfs2_quota_data *qd = NULL, *new_qd = NULL; 165 struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
@@ -202,7 +197,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
202 197
203 spin_unlock(&qd_lru_lock); 198 spin_unlock(&qd_lru_lock);
204 199
205 if (qd || !create) { 200 if (qd) {
206 if (new_qd) { 201 if (new_qd) {
207 gfs2_glock_put(new_qd->qd_gl); 202 gfs2_glock_put(new_qd->qd_gl);
208 kmem_cache_free(gfs2_quotad_cachep, new_qd); 203 kmem_cache_free(gfs2_quotad_cachep, new_qd);
@@ -461,12 +456,12 @@ static void qd_unlock(struct gfs2_quota_data *qd)
461 qd_put(qd); 456 qd_put(qd);
462} 457}
463 458
464static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, int create, 459static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id,
465 struct gfs2_quota_data **qdp) 460 struct gfs2_quota_data **qdp)
466{ 461{
467 int error; 462 int error;
468 463
469 error = qd_get(sdp, user, id, create, qdp); 464 error = qd_get(sdp, user, id, qdp);
470 if (error) 465 if (error)
471 return error; 466 return error;
472 467
@@ -508,20 +503,20 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
508 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) 503 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
509 return 0; 504 return 0;
510 505
511 error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, CREATE, qd); 506 error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, qd);
512 if (error) 507 if (error)
513 goto out; 508 goto out;
514 al->al_qd_num++; 509 al->al_qd_num++;
515 qd++; 510 qd++;
516 511
517 error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, CREATE, qd); 512 error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, qd);
518 if (error) 513 if (error)
519 goto out; 514 goto out;
520 al->al_qd_num++; 515 al->al_qd_num++;
521 qd++; 516 qd++;
522 517
523 if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) { 518 if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) {
524 error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd); 519 error = qdsb_get(sdp, QUOTA_USER, uid, qd);
525 if (error) 520 if (error)
526 goto out; 521 goto out;
527 al->al_qd_num++; 522 al->al_qd_num++;
@@ -529,7 +524,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
529 } 524 }
530 525
531 if (gid != NO_QUOTA_CHANGE && gid != ip->i_inode.i_gid) { 526 if (gid != NO_QUOTA_CHANGE && gid != ip->i_inode.i_gid) {
532 error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd); 527 error = qdsb_get(sdp, QUOTA_GROUP, gid, qd);
533 if (error) 528 if (error)
534 goto out; 529 goto out;
535 al->al_qd_num++; 530 al->al_qd_num++;
@@ -617,48 +612,36 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
617 mutex_unlock(&sdp->sd_quota_mutex); 612 mutex_unlock(&sdp->sd_quota_mutex);
618} 613}
619 614
620static void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf)
621{
622 const struct gfs2_quota *str = buf;
623
624 qu->qu_limit = be64_to_cpu(str->qu_limit);
625 qu->qu_warn = be64_to_cpu(str->qu_warn);
626 qu->qu_value = be64_to_cpu(str->qu_value);
627 qu->qu_ll_next = be32_to_cpu(str->qu_ll_next);
628}
629
630static void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf)
631{
632 struct gfs2_quota *str = buf;
633
634 str->qu_limit = cpu_to_be64(qu->qu_limit);
635 str->qu_warn = cpu_to_be64(qu->qu_warn);
636 str->qu_value = cpu_to_be64(qu->qu_value);
637 str->qu_ll_next = cpu_to_be32(qu->qu_ll_next);
638 memset(&str->qu_reserved, 0, sizeof(str->qu_reserved));
639}
640
641/** 615/**
642 * gfs2_adjust_quota 616 * gfs2_adjust_quota - adjust record of current block usage
617 * @ip: The quota inode
618 * @loc: Offset of the entry in the quota file
619 * @change: The amount of usage change to record
620 * @qd: The quota data
621 * @fdq: The updated limits to record
643 * 622 *
644 * This function was mostly borrowed from gfs2_block_truncate_page which was 623 * This function was mostly borrowed from gfs2_block_truncate_page which was
645 * in turn mostly borrowed from ext3 624 * in turn mostly borrowed from ext3
625 *
626 * Returns: 0 or -ve on error
646 */ 627 */
628
647static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, 629static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
648 s64 change, struct gfs2_quota_data *qd) 630 s64 change, struct gfs2_quota_data *qd,
631 struct fs_disk_quota *fdq)
649{ 632{
650 struct inode *inode = &ip->i_inode; 633 struct inode *inode = &ip->i_inode;
651 struct address_space *mapping = inode->i_mapping; 634 struct address_space *mapping = inode->i_mapping;
652 unsigned long index = loc >> PAGE_CACHE_SHIFT; 635 unsigned long index = loc >> PAGE_CACHE_SHIFT;
653 unsigned offset = loc & (PAGE_CACHE_SIZE - 1); 636 unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
654 unsigned blocksize, iblock, pos; 637 unsigned blocksize, iblock, pos;
655 struct buffer_head *bh; 638 struct buffer_head *bh, *dibh;
656 struct page *page; 639 struct page *page;
657 void *kaddr; 640 void *kaddr;
658 char *ptr; 641 struct gfs2_quota *qp;
659 struct gfs2_quota_host qp;
660 s64 value; 642 s64 value;
661 int err = -EIO; 643 int err = -EIO;
644 u64 size;
662 645
663 if (gfs2_is_stuffed(ip)) 646 if (gfs2_is_stuffed(ip))
664 gfs2_unstuff_dinode(ip, NULL); 647 gfs2_unstuff_dinode(ip, NULL);
@@ -700,18 +683,38 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
700 gfs2_trans_add_bh(ip->i_gl, bh, 0); 683 gfs2_trans_add_bh(ip->i_gl, bh, 0);
701 684
702 kaddr = kmap_atomic(page, KM_USER0); 685 kaddr = kmap_atomic(page, KM_USER0);
703 ptr = kaddr + offset; 686 qp = kaddr + offset;
704 gfs2_quota_in(&qp, ptr); 687 value = (s64)be64_to_cpu(qp->qu_value) + change;
705 qp.qu_value += change; 688 qp->qu_value = cpu_to_be64(value);
706 value = qp.qu_value; 689 qd->qd_qb.qb_value = qp->qu_value;
707 gfs2_quota_out(&qp, ptr); 690 if (fdq) {
691 if (fdq->d_fieldmask & FS_DQ_BSOFT) {
692 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
693 qd->qd_qb.qb_warn = qp->qu_warn;
694 }
695 if (fdq->d_fieldmask & FS_DQ_BHARD) {
696 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
697 qd->qd_qb.qb_limit = qp->qu_limit;
698 }
699 }
708 flush_dcache_page(page); 700 flush_dcache_page(page);
709 kunmap_atomic(kaddr, KM_USER0); 701 kunmap_atomic(kaddr, KM_USER0);
710 err = 0; 702
711 qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC); 703 err = gfs2_meta_inode_buffer(ip, &dibh);
712 qd->qd_qb.qb_value = cpu_to_be64(value); 704 if (err)
713 ((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_magic = cpu_to_be32(GFS2_MAGIC); 705 goto unlock;
714 ((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_value = cpu_to_be64(value); 706
707 size = loc + sizeof(struct gfs2_quota);
708 if (size > inode->i_size) {
709 ip->i_disksize = size;
710 i_size_write(inode, size);
711 }
712 inode->i_mtime = inode->i_atime = CURRENT_TIME;
713 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
714 gfs2_dinode_out(ip, dibh->b_data);
715 brelse(dibh);
716 mark_inode_dirty(inode);
717
715unlock: 718unlock:
716 unlock_page(page); 719 unlock_page(page);
717 page_cache_release(page); 720 page_cache_release(page);
@@ -739,9 +742,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
739 return -ENOMEM; 742 return -ENOMEM;
740 743
741 sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); 744 sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
745 mutex_lock_nested(&ip->i_inode.i_mutex, I_MUTEX_QUOTA);
742 for (qx = 0; qx < num_qd; qx++) { 746 for (qx = 0; qx < num_qd; qx++) {
743 error = gfs2_glock_nq_init(qda[qx]->qd_gl, 747 error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE,
744 LM_ST_EXCLUSIVE,
745 GL_NOCACHE, &ghs[qx]); 748 GL_NOCACHE, &ghs[qx]);
746 if (error) 749 if (error)
747 goto out; 750 goto out;
@@ -795,9 +798,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
795 for (x = 0; x < num_qd; x++) { 798 for (x = 0; x < num_qd; x++) {
796 qd = qda[x]; 799 qd = qda[x];
797 offset = qd2offset(qd); 800 offset = qd2offset(qd);
798 error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, 801 error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, qd, NULL);
799 (struct gfs2_quota_data *)
800 qd);
801 if (error) 802 if (error)
802 goto out_end_trans; 803 goto out_end_trans;
803 804
@@ -817,21 +818,44 @@ out_gunlock:
817out: 818out:
818 while (qx--) 819 while (qx--)
819 gfs2_glock_dq_uninit(&ghs[qx]); 820 gfs2_glock_dq_uninit(&ghs[qx]);
821 mutex_unlock(&ip->i_inode.i_mutex);
820 kfree(ghs); 822 kfree(ghs);
821 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl); 823 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
822 return error; 824 return error;
823} 825}
824 826
827static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
828{
829 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
830 struct gfs2_quota q;
831 struct gfs2_quota_lvb *qlvb;
832 loff_t pos;
833 int error;
834
835 memset(&q, 0, sizeof(struct gfs2_quota));
836 pos = qd2offset(qd);
837 error = gfs2_internal_read(ip, NULL, (char *)&q, &pos, sizeof(q));
838 if (error < 0)
839 return error;
840
841 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
842 qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
843 qlvb->__pad = 0;
844 qlvb->qb_limit = q.qu_limit;
845 qlvb->qb_warn = q.qu_warn;
846 qlvb->qb_value = q.qu_value;
847 qd->qd_qb = *qlvb;
848
849 return 0;
850}
851
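Note that update_qd() copies qu_limit/qu_warn/qu_value into the lock value block without byte-swapping: both the on-disk record and the LVB hold big-endian values, and readers such as gfs2_quota_check() convert with be64_to_cpu() only at the point of use. A small sketch of that keep-it-big-endian convention; the conversion helper is open-coded here rather than the kernel's.

/* Raw copy mirrors update_qd(); convert only where a CPU value is needed. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t be64_bytes_to_cpu(const unsigned char b[8])
{
	uint64_t r = 0;
	int i;

	for (i = 0; i < 8; i++)
		r = (r << 8) | b[i];
	return r;
}

int main(void)
{
	unsigned char disk_qu_limit[8] = { 0, 0, 0, 0, 0, 0, 0, 100 };
	unsigned char lvb_qb_limit[8];

	memcpy(lvb_qb_limit, disk_qu_limit, 8);	/* no swap, both __be64 */

	printf("limit = %llu blocks\n",		/* convert at use only */
	       (unsigned long long)be64_bytes_to_cpu(lvb_qb_limit));
	return 0;
}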
825static int do_glock(struct gfs2_quota_data *qd, int force_refresh, 852static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
826 struct gfs2_holder *q_gh) 853 struct gfs2_holder *q_gh)
827{ 854{
828 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 855 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
829 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); 856 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
830 struct gfs2_holder i_gh; 857 struct gfs2_holder i_gh;
831 struct gfs2_quota_host q;
832 char buf[sizeof(struct gfs2_quota)];
833 int error; 858 int error;
834 struct gfs2_quota_lvb *qlvb;
835 859
836restart: 860restart:
837 error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh); 861 error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
@@ -841,11 +865,9 @@ restart:
841 qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; 865 qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
842 866
843 if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) { 867 if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
844 loff_t pos;
845 gfs2_glock_dq_uninit(q_gh); 868 gfs2_glock_dq_uninit(q_gh);
846 error = gfs2_glock_nq_init(qd->qd_gl, 869 error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE,
847 LM_ST_EXCLUSIVE, GL_NOCACHE, 870 GL_NOCACHE, q_gh);
848 q_gh);
849 if (error) 871 if (error)
850 return error; 872 return error;
851 873
@@ -853,29 +875,14 @@ restart:
853 if (error) 875 if (error)
854 goto fail; 876 goto fail;
855 877
856 memset(buf, 0, sizeof(struct gfs2_quota)); 878 error = update_qd(sdp, qd);
857 pos = qd2offset(qd); 879 if (error)
858 error = gfs2_internal_read(ip, NULL, buf, &pos,
859 sizeof(struct gfs2_quota));
860 if (error < 0)
861 goto fail_gunlock; 880 goto fail_gunlock;
862 881
863 gfs2_glock_dq_uninit(&i_gh); 882 gfs2_glock_dq_uninit(&i_gh);
864 883 gfs2_glock_dq_uninit(q_gh);
865 gfs2_quota_in(&q, buf); 884 force_refresh = 0;
866 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; 885 goto restart;
867 qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
868 qlvb->__pad = 0;
869 qlvb->qb_limit = cpu_to_be64(q.qu_limit);
870 qlvb->qb_warn = cpu_to_be64(q.qu_warn);
871 qlvb->qb_value = cpu_to_be64(q.qu_value);
872 qd->qd_qb = *qlvb;
873
874 if (gfs2_glock_is_blocking(qd->qd_gl)) {
875 gfs2_glock_dq_uninit(q_gh);
876 force_refresh = 0;
877 goto restart;
878 }
879 } 886 }
880 887
881 return 0; 888 return 0;
@@ -995,7 +1002,7 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
995{ 1002{
996 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 1003 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
997 1004
998 printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\r\n", 1005 printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n",
999 sdp->sd_fsname, type, 1006 sdp->sd_fsname, type,
1000 (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group", 1007 (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
1001 qd->qd_id); 1008 qd->qd_id);
@@ -1032,6 +1039,10 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
1032 1039
1033 if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) { 1040 if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
1034 print_message(qd, "exceeded"); 1041 print_message(qd, "exceeded");
1042 quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ?
1043 USRQUOTA : GRPQUOTA, qd->qd_id,
1044 sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN);
1045
1035 error = -EDQUOT; 1046 error = -EDQUOT;
1036 break; 1047 break;
1037 } else if (be64_to_cpu(qd->qd_qb.qb_warn) && 1048 } else if (be64_to_cpu(qd->qd_qb.qb_warn) &&
@@ -1039,6 +1050,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
1039 time_after_eq(jiffies, qd->qd_last_warn + 1050 time_after_eq(jiffies, qd->qd_last_warn +
1040 gfs2_tune_get(sdp, 1051 gfs2_tune_get(sdp,
1041 gt_quota_warn_period) * HZ)) { 1052 gt_quota_warn_period) * HZ)) {
1053 quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ?
1054 USRQUOTA : GRPQUOTA, qd->qd_id,
1055 sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN);
1042 error = print_message(qd, "warning"); 1056 error = print_message(qd, "warning");
1043 qd->qd_last_warn = jiffies; 1057 qd->qd_last_warn = jiffies;
1044 } 1058 }
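The warning hooks slot into a two-tier policy: crossing qb_limit fails the allocation with -EDQUOT, while crossing qb_warn only emits a rate-limited warning, now mirrored to userspace via quota_send_warning(). A compact sketch of the policy, with made-up numbers:

/* Hard limit refuses the change; soft limit warns only. */
#include <stdio.h>

#define EDQUOT_ERR 122			/* -EDQUOT on Linux */

static int check_quota(long long value, long long hard, long long soft)
{
	if (hard && value > hard)
		return -EDQUOT_ERR;	/* over hard limit: fail */
	if (soft && value > soft)
		printf("soft limit exceeded: warn only\n");
	return 0;
}

int main(void)
{
	printf("%d\n", check_quota(150, 200, 100));	/* warns, returns 0 */
	printf("%d\n", check_quota(250, 200, 100));	/* returns -122 */
	return 0;
}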
@@ -1069,8 +1083,9 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
1069 } 1083 }
1070} 1084}
1071 1085
1072int gfs2_quota_sync(struct gfs2_sbd *sdp) 1086int gfs2_quota_sync(struct super_block *sb, int type, int wait)
1073{ 1087{
1088 struct gfs2_sbd *sdp = sb->s_fs_info;
1074 struct gfs2_quota_data **qda; 1089 struct gfs2_quota_data **qda;
1075 unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync); 1090 unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
1076 unsigned int num_qd; 1091 unsigned int num_qd;
@@ -1112,13 +1127,18 @@ int gfs2_quota_sync(struct gfs2_sbd *sdp)
1112 return error; 1127 return error;
1113} 1128}
1114 1129
1130static int gfs2_quota_sync_timeo(struct super_block *sb, int type)
1131{
1132 return gfs2_quota_sync(sb, type, 0);
1133}
1134
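gfs2_quota_sync_timeo() exists purely as a signature adapter: the quotactl entry point gained (sb, type, wait), while quotad's periodic callback slot still expects (sb, type). The generic shape, with illustrative names:

/* Wrap a three-argument entry point to fit a two-argument callback. */
#include <stdio.h>

static int quota_sync(void *sb, int type, int wait)
{
	printf("sync: type=%d wait=%d\n", type, wait);
	return 0;
}

static int quota_sync_timeo(void *sb, int type)
{
	return quota_sync(sb, type, 0);	/* timer-driven: don't wait */
}

static void check_timeo(int (*fxn)(void *, int), void *sb)
{
	fxn(sb, 0);
}

int main(void)
{
	check_timeo(quota_sync_timeo, NULL);
	return 0;
}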
1115int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id) 1135int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
1116{ 1136{
1117 struct gfs2_quota_data *qd; 1137 struct gfs2_quota_data *qd;
1118 struct gfs2_holder q_gh; 1138 struct gfs2_holder q_gh;
1119 int error; 1139 int error;
1120 1140
1121 error = qd_get(sdp, user, id, CREATE, &qd); 1141 error = qd_get(sdp, user, id, &qd);
1122 if (error) 1142 if (error)
1123 return error; 1143 return error;
1124 1144
@@ -1127,7 +1147,6 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
1127 gfs2_glock_dq_uninit(&q_gh); 1147 gfs2_glock_dq_uninit(&q_gh);
1128 1148
1129 qd_put(qd); 1149 qd_put(qd);
1130
1131 return error; 1150 return error;
1132} 1151}
1133 1152
@@ -1298,12 +1317,12 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
1298} 1317}
1299 1318
1300static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg, 1319static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
1301 int (*fxn)(struct gfs2_sbd *sdp), 1320 int (*fxn)(struct super_block *sb, int type),
1302 unsigned long t, unsigned long *timeo, 1321 unsigned long t, unsigned long *timeo,
1303 unsigned int *new_timeo) 1322 unsigned int *new_timeo)
1304{ 1323{
1305 if (t >= *timeo) { 1324 if (t >= *timeo) {
1306 int error = fxn(sdp); 1325 int error = fxn(sdp->sd_vfs, 0);
1307 quotad_error(sdp, msg, error); 1326 quotad_error(sdp, msg, error);
1308 *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ; 1327 *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ;
1309 } else { 1328 } else {
@@ -1330,6 +1349,14 @@ static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
1330 } 1349 }
1331} 1350}
1332 1351
1352void gfs2_wake_up_statfs(struct gfs2_sbd *sdp) {
1353 if (!sdp->sd_statfs_force_sync) {
1354 sdp->sd_statfs_force_sync = 1;
1355 wake_up(&sdp->sd_quota_wait);
1356 }
1357}
1358
1359
1333/** 1360/**
1334 * gfs2_quotad - Write cached quota changes into the quota file 1361 * gfs2_quotad - Write cached quota changes into the quota file
1335 * @sdp: Pointer to GFS2 superblock 1362 * @sdp: Pointer to GFS2 superblock
@@ -1349,11 +1376,18 @@ int gfs2_quotad(void *data)
1349 while (!kthread_should_stop()) { 1376 while (!kthread_should_stop()) {
1350 1377
1351 /* Update the master statfs file */ 1378 /* Update the master statfs file */
1352 quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t, 1379 if (sdp->sd_statfs_force_sync) {
1353 &statfs_timeo, &tune->gt_statfs_quantum); 1380 int error = gfs2_statfs_sync(sdp->sd_vfs, 0);
1381 quotad_error(sdp, "statfs", error);
1382 statfs_timeo = gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
1383 }
1384 else
1385 quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
1386 &statfs_timeo,
1387 &tune->gt_statfs_quantum);
1354 1388
1355 /* Update quota file */ 1389 /* Update quota file */
1356 quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, 1390 quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t,
1357 &quotad_timeo, &tune->gt_quota_quantum); 1391 &quotad_timeo, &tune->gt_quota_quantum);
1358 1392
1359 /* Check for & recover partially truncated inodes */ 1393 /* Check for & recover partially truncated inodes */
@@ -1367,7 +1401,7 @@ int gfs2_quotad(void *data)
1367 spin_lock(&sdp->sd_trunc_lock); 1401 spin_lock(&sdp->sd_trunc_lock);
1368 empty = list_empty(&sdp->sd_trunc_list); 1402 empty = list_empty(&sdp->sd_trunc_list);
1369 spin_unlock(&sdp->sd_trunc_lock); 1403 spin_unlock(&sdp->sd_trunc_lock);
1370 if (empty) 1404 if (empty && !sdp->sd_statfs_force_sync)
1371 t -= schedule_timeout(t); 1405 t -= schedule_timeout(t);
1372 else 1406 else
1373 t = 0; 1407 t = 0;
@@ -1377,3 +1411,181 @@ int gfs2_quotad(void *data)
1377 return 0; 1411 return 0;
1378} 1412}
1379 1413
1414static int gfs2_quota_get_xstate(struct super_block *sb,
1415 struct fs_quota_stat *fqs)
1416{
1417 struct gfs2_sbd *sdp = sb->s_fs_info;
1418
1419 memset(fqs, 0, sizeof(struct fs_quota_stat));
1420 fqs->qs_version = FS_QSTAT_VERSION;
1421 if (sdp->sd_args.ar_quota == GFS2_QUOTA_ON)
1422 fqs->qs_flags = (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
1423 else if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT)
1424 fqs->qs_flags = (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
1425 if (sdp->sd_quota_inode) {
1426 fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr;
1427 fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks;
1428 }
1429 fqs->qs_uquota.qfs_nextents = 1; /* unsupported */
 1430 fqs->qs_gquota = fqs->qs_uquota; /* it's the same inode in both cases */
1431 fqs->qs_incoredqs = atomic_read(&qd_lru_count);
1432 return 0;
1433}
1434
1435static int gfs2_xquota_get(struct super_block *sb, int type, qid_t id,
1436 struct fs_disk_quota *fdq)
1437{
1438 struct gfs2_sbd *sdp = sb->s_fs_info;
1439 struct gfs2_quota_lvb *qlvb;
1440 struct gfs2_quota_data *qd;
1441 struct gfs2_holder q_gh;
1442 int error;
1443
1444 memset(fdq, 0, sizeof(struct fs_disk_quota));
1445
1446 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
1447 return -ESRCH; /* Crazy XFS error code */
1448
1449 if (type == USRQUOTA)
1450 type = QUOTA_USER;
1451 else if (type == GRPQUOTA)
1452 type = QUOTA_GROUP;
1453 else
1454 return -EINVAL;
1455
1456 error = qd_get(sdp, type, id, &qd);
1457 if (error)
1458 return error;
1459 error = do_glock(qd, FORCE, &q_gh);
1460 if (error)
1461 goto out;
1462
1463 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
1464 fdq->d_version = FS_DQUOT_VERSION;
1465 fdq->d_flags = (type == QUOTA_USER) ? XFS_USER_QUOTA : XFS_GROUP_QUOTA;
1466 fdq->d_id = id;
1467 fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit);
1468 fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn);
1469 fdq->d_bcount = be64_to_cpu(qlvb->qb_value);
1470
1471 gfs2_glock_dq_uninit(&q_gh);
1472out:
1473 qd_put(qd);
1474 return error;
1475}
1476
1477/* GFS2 only supports a subset of the XFS fields */
1478#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
1479
1480static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id,
1481 struct fs_disk_quota *fdq)
1482{
1483 struct gfs2_sbd *sdp = sb->s_fs_info;
1484 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
1485 struct gfs2_quota_data *qd;
1486 struct gfs2_holder q_gh, i_gh;
1487 unsigned int data_blocks, ind_blocks;
1488 unsigned int blocks = 0;
1489 int alloc_required;
1490 struct gfs2_alloc *al;
1491 loff_t offset;
1492 int error;
1493
1494 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
1495 return -ESRCH; /* Crazy XFS error code */
1496
1497 switch(type) {
1498 case USRQUOTA:
1499 type = QUOTA_USER;
1500 if (fdq->d_flags != XFS_USER_QUOTA)
1501 return -EINVAL;
1502 break;
1503 case GRPQUOTA:
1504 type = QUOTA_GROUP;
1505 if (fdq->d_flags != XFS_GROUP_QUOTA)
1506 return -EINVAL;
1507 break;
1508 default:
1509 return -EINVAL;
1510 }
1511
1512 if (fdq->d_fieldmask & ~GFS2_FIELDMASK)
1513 return -EINVAL;
1514 if (fdq->d_id != id)
1515 return -EINVAL;
1516
1517 error = qd_get(sdp, type, id, &qd);
1518 if (error)
1519 return error;
1520
1521 mutex_lock(&ip->i_inode.i_mutex);
1522 error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh);
1523 if (error)
1524 goto out_put;
1525 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1526 if (error)
1527 goto out_q;
1528
1529 /* Check for existing entry, if none then alloc new blocks */
1530 error = update_qd(sdp, qd);
1531 if (error)
1532 goto out_i;
1533
1534 /* If nothing has changed, this is a no-op */
1535 if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
1536 (fdq->d_blk_softlimit == be64_to_cpu(qd->qd_qb.qb_warn)))
1537 fdq->d_fieldmask ^= FS_DQ_BSOFT;
1538 if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
1539 (fdq->d_blk_hardlimit == be64_to_cpu(qd->qd_qb.qb_limit)))
1540 fdq->d_fieldmask ^= FS_DQ_BHARD;
1541 if (fdq->d_fieldmask == 0)
1542 goto out_i;
1543
1544 offset = qd2offset(qd);
1545 error = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota),
1546 &alloc_required);
1547 if (error)
1548 goto out_i;
1549 if (alloc_required) {
1550 al = gfs2_alloc_get(ip);
1551 if (al == NULL)
1552 goto out_i;
1553 gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
1554 &data_blocks, &ind_blocks);
1555 blocks = al->al_requested = 1 + data_blocks + ind_blocks;
1556 error = gfs2_inplace_reserve(ip);
1557 if (error)
1558 goto out_alloc;
1559 }
1560
1561 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
1562 if (error)
1563 goto out_release;
1564
1565 /* Apply changes */
1566 error = gfs2_adjust_quota(ip, offset, 0, qd, fdq);
1567
1568 gfs2_trans_end(sdp);
1569out_release:
1570 if (alloc_required) {
1571 gfs2_inplace_release(ip);
1572out_alloc:
1573 gfs2_alloc_put(ip);
1574 }
1575out_i:
1576 gfs2_glock_dq_uninit(&i_gh);
1577out_q:
1578 gfs2_glock_dq_uninit(&q_gh);
1579out_put:
1580 mutex_unlock(&ip->i_inode.i_mutex);
1581 qd_put(qd);
1582 return error;
1583}
1584
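gfs2_xquota_set() trims the request down to what actually changes: any field in d_fieldmask whose requested value already matches the on-disk copy is XOR-ed out, and an empty mask short-circuits before any allocation or transaction. A sketch of that trimming follows; the mask bit values follow the XFS quota ABI but are repeated here only for illustration.

/* Drop mask bits whose value already matches; bail if nothing is left. */
#include <stdio.h>

#define FS_DQ_BSOFT 0x2
#define FS_DQ_BHARD 0x8

int main(void)
{
	unsigned int mask = FS_DQ_BSOFT | FS_DQ_BHARD;
	unsigned long long cur_soft = 100, cur_hard = 200;
	unsigned long long new_soft = 100, new_hard = 300;

	if ((mask & FS_DQ_BSOFT) && new_soft == cur_soft)
		mask ^= FS_DQ_BSOFT;	/* unchanged: clear the bit */
	if ((mask & FS_DQ_BHARD) && new_hard == cur_hard)
		mask ^= FS_DQ_BHARD;

	puts(mask ? "write needed" : "no-op");	/* prints: write needed */
	return 0;
}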
1585const struct quotactl_ops gfs2_quotactl_ops = {
1586 .quota_sync = gfs2_quota_sync,
1587 .get_xstate = gfs2_quota_get_xstate,
1588 .get_xquota = gfs2_xquota_get,
1589 .set_xquota = gfs2_xquota_set,
1590};
1591
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 0fa5fa63d0e8..195f60c8bd14 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -25,13 +25,15 @@ extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
25extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, 25extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
26 u32 uid, u32 gid); 26 u32 uid, u32 gid);
27 27
28extern int gfs2_quota_sync(struct gfs2_sbd *sdp); 28extern int gfs2_quota_sync(struct super_block *sb, int type, int wait);
29extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); 29extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
30 30
31extern int gfs2_quota_init(struct gfs2_sbd *sdp); 31extern int gfs2_quota_init(struct gfs2_sbd *sdp);
32extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp); 32extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
33extern int gfs2_quotad(void *data); 33extern int gfs2_quotad(void *data);
34 34
35extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp);
36
35static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) 37static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
36{ 38{
37 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 39 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
@@ -50,5 +52,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
50} 52}
51 53
52extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask); 54extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask);
55extern const struct quotactl_ops gfs2_quotactl_ops;
53 56
54#endif /* __QUOTA_DOT_H__ */ 57#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 09fa31965576..4b9bece3d437 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -410,7 +410,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
410 memset(lh, 0, sizeof(struct gfs2_log_header)); 410 memset(lh, 0, sizeof(struct gfs2_log_header));
411 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); 411 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
412 lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); 412 lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
413 lh->lh_header.__pad0 = cpu_to_be64(0);
413 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); 414 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
415 lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
414 lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1); 416 lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
415 lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT); 417 lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
416 lh->lh_blkno = cpu_to_be32(lblock); 418 lh->lh_blkno = cpu_to_be32(lblock);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8f1cfb02a6cb..503b842f3ba2 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -591,11 +591,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
591 u64 rgrp_count = ip->i_disksize; 591 u64 rgrp_count = ip->i_disksize;
592 int error; 592 int error;
593 593
594 if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) { 594 do_div(rgrp_count, sizeof(struct gfs2_rindex));
595 gfs2_consist_inode(ip);
596 return -EIO;
597 }
598
599 clear_rgrpdi(sdp); 595 clear_rgrpdi(sdp);
600 596
601 file_ra_state_init(&ra_state, inode->i_mapping); 597 file_ra_state_init(&ra_state, inode->i_mapping);
@@ -915,7 +911,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
915struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) 911struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
916{ 912{
917 BUG_ON(ip->i_alloc != NULL); 913 BUG_ON(ip->i_alloc != NULL);
918 ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_KERNEL); 914 ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS);
919 return ip->i_alloc; 915 return ip->i_alloc;
920} 916}
921 917
@@ -1710,11 +1706,16 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
1710{ 1706{
1711 struct gfs2_rgrpd *rgd; 1707 struct gfs2_rgrpd *rgd;
1712 struct gfs2_holder ri_gh, rgd_gh; 1708 struct gfs2_holder ri_gh, rgd_gh;
1709 struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
1710 int ri_locked = 0;
1713 int error; 1711 int error;
1714 1712
1715 error = gfs2_rindex_hold(sdp, &ri_gh); 1713 if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
1716 if (error) 1714 error = gfs2_rindex_hold(sdp, &ri_gh);
1717 goto fail; 1715 if (error)
1716 goto fail;
1717 ri_locked = 1;
1718 }
1718 1719
1719 error = -EINVAL; 1720 error = -EINVAL;
1720 rgd = gfs2_blk2rgrpd(sdp, no_addr); 1721 rgd = gfs2_blk2rgrpd(sdp, no_addr);
@@ -1730,7 +1731,8 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
1730 1731
1731 gfs2_glock_dq_uninit(&rgd_gh); 1732 gfs2_glock_dq_uninit(&rgd_gh);
1732fail_rindex: 1733fail_rindex:
1733 gfs2_glock_dq_uninit(&ri_gh); 1734 if (ri_locked)
1735 gfs2_glock_dq_uninit(&ri_gh);
1734fail: 1736fail:
1735 return error; 1737 return error;
1736} 1738}
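The rgrp.c change makes gfs2_check_blk_type() safe to call with the rindex glock already held: it acquires (and therefore releases) the lock only when gfs2_glock_is_locked_by_me() says the caller does not hold it, tracked via ri_locked. A userspace sketch of the take-only-if-not-mine control flow; the owner bookkeeping below is deliberately naive and not a substitute for the glock machinery.

/* Take the lock only if this thread does not already hold it. */
#include <pthread.h>

static pthread_mutex_t ri_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_t ri_owner;
static int ri_held;

static int locked_by_me(void)
{
	return ri_held && pthread_equal(ri_owner, pthread_self());
}

static int check_blk_type(void)
{
	int ri_locked = 0;

	if (!locked_by_me()) {		/* caller may already hold it */
		pthread_mutex_lock(&ri_lock);
		ri_owner = pthread_self();
		ri_held = 1;
		ri_locked = 1;
	}

	/* ... look up and verify the resource group here ... */

	if (ri_locked) {		/* drop only what we took */
		ri_held = 0;
		pthread_mutex_unlock(&ri_lock);
	}
	return 0;
}

int main(void)
{
	return check_blk_type();
}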
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b4106ddaaa98..f07119d89557 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -10,6 +10,8 @@
10#ifndef __RGRP_DOT_H__ 10#ifndef __RGRP_DOT_H__
11#define __RGRP_DOT_H__ 11#define __RGRP_DOT_H__
12 12
13#include <linux/slab.h>
14
13struct gfs2_rgrpd; 15struct gfs2_rgrpd;
14struct gfs2_sbd; 16struct gfs2_sbd;
15struct gfs2_holder; 17struct gfs2_holder;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 0ec3ec672de1..50aac606b990 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -21,6 +21,8 @@
21#include <linux/gfs2_ondisk.h> 21#include <linux/gfs2_ondisk.h>
22#include <linux/crc32.h> 22#include <linux/crc32.h>
23#include <linux/time.h> 23#include <linux/time.h>
24#include <linux/wait.h>
25#include <linux/writeback.h>
24 26
25#include "gfs2.h" 27#include "gfs2.h"
26#include "incore.h" 28#include "incore.h"
@@ -70,6 +72,11 @@ enum {
70 Opt_commit, 72 Opt_commit,
71 Opt_err_withdraw, 73 Opt_err_withdraw,
72 Opt_err_panic, 74 Opt_err_panic,
75 Opt_statfs_quantum,
76 Opt_statfs_percent,
77 Opt_quota_quantum,
78 Opt_barrier,
79 Opt_nobarrier,
73 Opt_error, 80 Opt_error,
74}; 81};
75 82
@@ -101,18 +108,23 @@ static const match_table_t tokens = {
101 {Opt_commit, "commit=%d"}, 108 {Opt_commit, "commit=%d"},
102 {Opt_err_withdraw, "errors=withdraw"}, 109 {Opt_err_withdraw, "errors=withdraw"},
103 {Opt_err_panic, "errors=panic"}, 110 {Opt_err_panic, "errors=panic"},
111 {Opt_statfs_quantum, "statfs_quantum=%d"},
112 {Opt_statfs_percent, "statfs_percent=%d"},
113 {Opt_quota_quantum, "quota_quantum=%d"},
114 {Opt_barrier, "barrier"},
115 {Opt_nobarrier, "nobarrier"},
104 {Opt_error, NULL} 116 {Opt_error, NULL}
105}; 117};
106 118
107/** 119/**
108 * gfs2_mount_args - Parse mount options 120 * gfs2_mount_args - Parse mount options
109 * @sdp: 121 * @args: The structure into which the parsed options will be written
110 * @data: 122 * @options: The options to parse
111 * 123 *
112 * Return: errno 124 * Return: errno
113 */ 125 */
114 126
115int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) 127int gfs2_mount_args(struct gfs2_args *args, char *options)
116{ 128{
117 char *o; 129 char *o;
118 int token; 130 int token;
@@ -157,7 +169,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
157 break; 169 break;
158 case Opt_debug: 170 case Opt_debug:
159 if (args->ar_errors == GFS2_ERRORS_PANIC) { 171 if (args->ar_errors == GFS2_ERRORS_PANIC) {
160 fs_info(sdp, "-o debug and -o errors=panic " 172 printk(KERN_WARNING "GFS2: -o debug and -o errors=panic "
161 "are mutually exclusive.\n"); 173 "are mutually exclusive.\n");
162 return -EINVAL; 174 return -EINVAL;
163 } 175 }
@@ -210,7 +222,29 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
210 case Opt_commit: 222 case Opt_commit:
211 rv = match_int(&tmp[0], &args->ar_commit); 223 rv = match_int(&tmp[0], &args->ar_commit);
212 if (rv || args->ar_commit <= 0) { 224 if (rv || args->ar_commit <= 0) {
213 fs_info(sdp, "commit mount option requires a positive numeric argument\n"); 225 printk(KERN_WARNING "GFS2: commit mount option requires a positive numeric argument\n");
226 return rv ? rv : -EINVAL;
227 }
228 break;
229 case Opt_statfs_quantum:
230 rv = match_int(&tmp[0], &args->ar_statfs_quantum);
231 if (rv || args->ar_statfs_quantum < 0) {
232 printk(KERN_WARNING "GFS2: statfs_quantum mount option requires a non-negative numeric argument\n");
233 return rv ? rv : -EINVAL;
234 }
235 break;
236 case Opt_quota_quantum:
237 rv = match_int(&tmp[0], &args->ar_quota_quantum);
238 if (rv || args->ar_quota_quantum <= 0) {
239 printk(KERN_WARNING "GFS2: quota_quantum mount option requires a positive numeric argument\n");
240 return rv ? rv : -EINVAL;
241 }
242 break;
243 case Opt_statfs_percent:
244 rv = match_int(&tmp[0], &args->ar_statfs_percent);
245 if (rv || args->ar_statfs_percent < 0 ||
246 args->ar_statfs_percent > 100) {
247 printk(KERN_WARNING "statfs_percent mount option requires a numeric argument between 0 and 100\n");
214 return rv ? rv : -EINVAL; 248 return rv ? rv : -EINVAL;
215 } 249 }
216 break; 250 break;
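The new mount options follow the existing pattern: match the token, parse the integer, range-check it, and reject with -EINVAL otherwise (statfs_quantum accepts zero, which selects the slow statfs path; quota_quantum must be strictly positive). A standalone sketch of one such option parser, using strtol in place of the kernel's match_int():

/* Parse "statfs_quantum=<n>" with the same range check as above. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_statfs_quantum(const char *opt, int *out)
{
	const char *pfx = "statfs_quantum=";
	char *end;
	long v;

	if (strncmp(opt, pfx, strlen(pfx)) != 0)
		return 1;			/* not this option */
	v = strtol(opt + strlen(pfx), &end, 10);
	if (*end != '\0' || v < 0)
		return -EINVAL;			/* must be non-negative */
	*out = (int)v;
	return 0;
}

int main(void)
{
	int q;

	if (parse_statfs_quantum("statfs_quantum=30", &q) == 0)
		printf("statfs_quantum = %d\n", q);
	return 0;
}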
@@ -219,15 +253,21 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
219 break; 253 break;
220 case Opt_err_panic: 254 case Opt_err_panic:
221 if (args->ar_debug) { 255 if (args->ar_debug) {
222 fs_info(sdp, "-o debug and -o errors=panic " 256 printk(KERN_WARNING "GFS2: -o debug and -o errors=panic "
223 "are mutually exclusive.\n"); 257 "are mutually exclusive.\n");
224 return -EINVAL; 258 return -EINVAL;
225 } 259 }
226 args->ar_errors = GFS2_ERRORS_PANIC; 260 args->ar_errors = GFS2_ERRORS_PANIC;
227 break; 261 break;
262 case Opt_barrier:
263 args->ar_nobarrier = 0;
264 break;
265 case Opt_nobarrier:
266 args->ar_nobarrier = 1;
267 break;
228 case Opt_error: 268 case Opt_error:
229 default: 269 default:
230 fs_info(sdp, "invalid mount option: %s\n", o); 270 printk(KERN_WARNING "GFS2: invalid mount option: %s\n", o);
231 return -EINVAL; 271 return -EINVAL;
232 } 272 }
233 } 273 }
@@ -442,7 +482,10 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
442{ 482{
443 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); 483 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
444 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; 484 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
485 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
445 struct buffer_head *l_bh; 486 struct buffer_head *l_bh;
487 s64 x, y;
488 int need_sync = 0;
446 int error; 489 int error;
447 490
448 error = gfs2_meta_inode_buffer(l_ip, &l_bh); 491 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
@@ -456,9 +499,17 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
456 l_sc->sc_free += free; 499 l_sc->sc_free += free;
457 l_sc->sc_dinodes += dinodes; 500 l_sc->sc_dinodes += dinodes;
458 gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode)); 501 gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode));
502 if (sdp->sd_args.ar_statfs_percent) {
503 x = 100 * l_sc->sc_free;
504 y = m_sc->sc_free * sdp->sd_args.ar_statfs_percent;
505 if (x >= y || x <= -y)
506 need_sync = 1;
507 }
459 spin_unlock(&sdp->sd_statfs_spin); 508 spin_unlock(&sdp->sd_statfs_spin);
460 509
461 brelse(l_bh); 510 brelse(l_bh);
511 if (need_sync)
512 gfs2_wake_up_statfs(sdp);
462} 513}
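The statfs_percent trigger above compares the local delta against a percentage of the master count without any division: with x = 100 * l_sc->sc_free and y = m_sc->sc_free * percent, the test |x| >= y is equivalent to |local| >= (percent/100) * master. A worked example with made-up numbers:

/* percent = 5, master free = 1,000,000 blocks, local delta = -60,000. */
#include <stdio.h>

int main(void)
{
	long long master_free = 1000000;	/* m_sc->sc_free */
	long long local_free = -60000;		/* l_sc->sc_free */
	int percent = 5;			/* ar_statfs_percent */

	long long x = 100 * local_free;		/* -6,000,000 */
	long long y = master_free * percent;	/*  5,000,000 */
	int need_sync = (x >= y || x <= -y);	/* |x| >= y */

	printf("need_sync = %d\n", need_sync);	/* 1: 6% is over 5% */
	return 0;
}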
463 514
464void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, 515void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
@@ -484,8 +535,9 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
484 gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); 535 gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
485} 536}
486 537
487int gfs2_statfs_sync(struct gfs2_sbd *sdp) 538int gfs2_statfs_sync(struct super_block *sb, int type)
488{ 539{
540 struct gfs2_sbd *sdp = sb->s_fs_info;
489 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); 541 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
490 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); 542 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
491 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; 543 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
@@ -521,6 +573,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp)
521 goto out_bh2; 573 goto out_bh2;
522 574
523 update_statfs(sdp, m_bh, l_bh); 575 update_statfs(sdp, m_bh, l_bh);
576 sdp->sd_statfs_force_sync = 0;
524 577
525 gfs2_trans_end(sdp); 578 gfs2_trans_end(sdp);
526 579
@@ -659,7 +712,7 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
659 * Returns: errno 712 * Returns: errno
660 */ 713 */
661 714
662static int gfs2_write_inode(struct inode *inode, int sync) 715static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
663{ 716{
664 struct gfs2_inode *ip = GFS2_I(inode); 717 struct gfs2_inode *ip = GFS2_I(inode);
665 struct gfs2_sbd *sdp = GFS2_SB(inode); 718 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -670,8 +723,7 @@ static int gfs2_write_inode(struct inode *inode, int sync)
670 int ret = 0; 723 int ret = 0;
671 724
672 /* Check this is a "normal" inode, etc */ 725 /* Check this is a "normal" inode, etc */
673 if (!test_bit(GIF_USER, &ip->i_flags) || 726 if (current->flags & PF_MEMALLOC)
674 (current->flags & PF_MEMALLOC))
675 return 0; 727 return 0;
676 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 728 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
677 if (ret) 729 if (ret)
@@ -694,7 +746,7 @@ static int gfs2_write_inode(struct inode *inode, int sync)
694do_unlock: 746do_unlock:
695 gfs2_glock_dq_uninit(&gh); 747 gfs2_glock_dq_uninit(&gh);
696do_flush: 748do_flush:
697 if (sync != 0) 749 if (wbc->sync_mode == WB_SYNC_ALL)
698 gfs2_log_flush(GFS2_SB(inode), ip->i_gl); 750 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
699 return ret; 751 return ret;
700} 752}
@@ -712,8 +764,8 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
712 int error; 764 int error;
713 765
714 flush_workqueue(gfs2_delete_workqueue); 766 flush_workqueue(gfs2_delete_workqueue);
715 gfs2_quota_sync(sdp); 767 gfs2_quota_sync(sdp->sd_vfs, 0, 1);
716 gfs2_statfs_sync(sdp); 768 gfs2_statfs_sync(sdp->sd_vfs, 0);
717 769
718 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, 770 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
719 &t_gh); 771 &t_gh);
@@ -808,6 +860,7 @@ restart:
808 gfs2_clear_rgrpd(sdp); 860 gfs2_clear_rgrpd(sdp);
809 gfs2_jindex_free(sdp); 861 gfs2_jindex_free(sdp);
810 /* Take apart glock structures and buffer lists */ 862 /* Take apart glock structures and buffer lists */
863 invalidate_inodes(sdp->sd_vfs);
811 gfs2_gl_hash_clear(sdp); 864 gfs2_gl_hash_clear(sdp);
812 /* Unmount the locking protocol */ 865 /* Unmount the locking protocol */
813 gfs2_lm_unmount(sdp); 866 gfs2_lm_unmount(sdp);
@@ -1061,8 +1114,13 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1061 1114
1062 spin_lock(&gt->gt_spin); 1115 spin_lock(&gt->gt_spin);
1063 args.ar_commit = gt->gt_log_flush_secs; 1116 args.ar_commit = gt->gt_log_flush_secs;
1117 args.ar_quota_quantum = gt->gt_quota_quantum;
1118 if (gt->gt_statfs_slow)
1119 args.ar_statfs_quantum = 0;
1120 else
1121 args.ar_statfs_quantum = gt->gt_statfs_quantum;
1064 spin_unlock(&gt->gt_spin); 1122 spin_unlock(&gt->gt_spin);
1065 error = gfs2_mount_args(sdp, &args, data); 1123 error = gfs2_mount_args(&args, data);
1066 if (error) 1124 if (error)
1067 return error; 1125 return error;
1068 1126
@@ -1097,8 +1155,21 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1097 sb->s_flags |= MS_POSIXACL; 1155 sb->s_flags |= MS_POSIXACL;
1098 else 1156 else
1099 sb->s_flags &= ~MS_POSIXACL; 1157 sb->s_flags &= ~MS_POSIXACL;
1158 if (sdp->sd_args.ar_nobarrier)
1159 set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
1160 else
1161 clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
1100 spin_lock(&gt->gt_spin); 1162 spin_lock(&gt->gt_spin);
1101 gt->gt_log_flush_secs = args.ar_commit; 1163 gt->gt_log_flush_secs = args.ar_commit;
1164 gt->gt_quota_quantum = args.ar_quota_quantum;
1165 if (args.ar_statfs_quantum) {
1166 gt->gt_statfs_slow = 0;
1167 gt->gt_statfs_quantum = args.ar_statfs_quantum;
1168 }
1169 else {
1170 gt->gt_statfs_slow = 1;
1171 gt->gt_statfs_quantum = 30;
1172 }
1102 spin_unlock(&gt->gt_spin); 1173 spin_unlock(&gt->gt_spin);
1103 1174
1104 gfs2_online_uevent(sdp); 1175 gfs2_online_uevent(sdp);
@@ -1124,7 +1195,7 @@ static void gfs2_drop_inode(struct inode *inode)
1124{ 1195{
1125 struct gfs2_inode *ip = GFS2_I(inode); 1196 struct gfs2_inode *ip = GFS2_I(inode);
1126 1197
1127 if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) { 1198 if (inode->i_nlink) {
1128 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; 1199 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
1129 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) 1200 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
1130 clear_nlink(inode); 1201 clear_nlink(inode);
@@ -1142,18 +1213,12 @@ static void gfs2_clear_inode(struct inode *inode)
1142{ 1213{
1143 struct gfs2_inode *ip = GFS2_I(inode); 1214 struct gfs2_inode *ip = GFS2_I(inode);
1144 1215
1145 /* This tells us its a "real" inode and not one which only 1216 ip->i_gl->gl_object = NULL;
1146 * serves to contain an address space (see rgrp.c, meta_io.c) 1217 gfs2_glock_put(ip->i_gl);
1147 * which therefore doesn't have its own glocks. 1218 ip->i_gl = NULL;
1148 */ 1219 if (ip->i_iopen_gh.gh_gl) {
1149 if (test_bit(GIF_USER, &ip->i_flags)) { 1220 ip->i_iopen_gh.gh_gl->gl_object = NULL;
1150 ip->i_gl->gl_object = NULL; 1221 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
1151 gfs2_glock_put(ip->i_gl);
1152 ip->i_gl = NULL;
1153 if (ip->i_iopen_gh.gh_gl) {
1154 ip->i_iopen_gh.gh_gl->gl_object = NULL;
1155 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
1156 }
1157 } 1222 }
1158} 1223}
1159 1224
@@ -1179,7 +1244,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1179{ 1244{
1180 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; 1245 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
1181 struct gfs2_args *args = &sdp->sd_args; 1246 struct gfs2_args *args = &sdp->sd_args;
1182 int lfsecs; 1247 int val;
1183 1248
1184 if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir)) 1249 if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
1185 seq_printf(s, ",meta"); 1250 seq_printf(s, ",meta");
@@ -1240,9 +1305,17 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1240 } 1305 }
1241 if (args->ar_discard) 1306 if (args->ar_discard)
1242 seq_printf(s, ",discard"); 1307 seq_printf(s, ",discard");
1243 lfsecs = sdp->sd_tune.gt_log_flush_secs; 1308 val = sdp->sd_tune.gt_log_flush_secs;
1244 if (lfsecs != 60) 1309 if (val != 60)
1245 seq_printf(s, ",commit=%d", lfsecs); 1310 seq_printf(s, ",commit=%d", val);
1311 val = sdp->sd_tune.gt_statfs_quantum;
1312 if (val != 30)
1313 seq_printf(s, ",statfs_quantum=%d", val);
1314 val = sdp->sd_tune.gt_quota_quantum;
1315 if (val != 60)
1316 seq_printf(s, ",quota_quantum=%d", val);
1317 if (args->ar_statfs_percent)
1318 seq_printf(s, ",statfs_percent=%d", args->ar_statfs_percent);
1246 if (args->ar_errors != GFS2_ERRORS_DEFAULT) { 1319 if (args->ar_errors != GFS2_ERRORS_DEFAULT) {
1247 const char *state; 1320 const char *state;
1248 1321
@@ -1259,6 +1332,9 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1259 } 1332 }
1260 seq_printf(s, ",errors=%s", state); 1333 seq_printf(s, ",errors=%s", state);
1261 } 1334 }
1335 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
1336 seq_printf(s, ",nobarrier");
1337
1262 return 0; 1338 return 0;
1263} 1339}
1264 1340
@@ -1277,9 +1353,6 @@ static void gfs2_delete_inode(struct inode *inode)
1277 struct gfs2_holder gh; 1353 struct gfs2_holder gh;
1278 int error; 1354 int error;
1279 1355
1280 if (!test_bit(GIF_USER, &ip->i_flags))
1281 goto out;
1282
1283 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 1356 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1284 if (unlikely(error)) { 1357 if (unlikely(error)) {
1285 gfs2_glock_dq_uninit(&ip->i_iopen_gh); 1358 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
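The hunks above track the VFS change that hands ->write_inode() a struct writeback_control instead of a bare sync flag: gfs2_write_inode() now skips only reclaim-driven writeback (the GIF_USER tests are dropped throughout), and the journal is flushed just for data-integrity writeback. A minimal sketch of the resulting shape, with the gfs2 specifics elided and the function name invented:

    #include <linux/fs.h>
    #include <linux/sched.h>
    #include <linux/writeback.h>

    static int example_write_inode(struct inode *inode,
                                   struct writeback_control *wbc)
    {
            /* Writeback entered from memory reclaim is skipped, as in the hunk. */
            if (current->flags & PF_MEMALLOC)
                    return 0;

            /* ... take the glock exclusively, write the dinode back, unlock ... */

            /* Only a data-integrity sync forces the journal flush. */
            if (wbc->sync_mode == WB_SYNC_ALL) {
                    /* gfs2_log_flush(GFS2_SB(inode), ip->i_gl) in the patch */
            }
            return 0;
    }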
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 235db3682885..3df60f2d84e3 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -27,7 +27,7 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
27 27
28extern void gfs2_jindex_free(struct gfs2_sbd *sdp); 28extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
29 29
30extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data); 30extern int gfs2_mount_args(struct gfs2_args *args, char *data);
31 31
32extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); 32extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
33extern int gfs2_jdesc_check(struct gfs2_jdesc *jd); 33extern int gfs2_jdesc_check(struct gfs2_jdesc *jd);
@@ -44,7 +44,7 @@ extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
44 const void *buf); 44 const void *buf);
45extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, 45extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
46 struct buffer_head *l_bh); 46 struct buffer_head *l_bh);
47extern int gfs2_statfs_sync(struct gfs2_sbd *sdp); 47extern int gfs2_statfs_sync(struct super_block *sb, int type);
48 48
49extern int gfs2_freeze_fs(struct gfs2_sbd *sdp); 49extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
50extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); 50extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
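gfs2_quota_sync() and gfs2_statfs_sync() now take the super_block plus extra arguments instead of the gfs2_sbd, presumably so they can sit in generic VFS callback tables. Reading the call sites, the extra arguments look like a quota type and a wait flag; that is an inference from the callers, not something the patch states:

    /* before */
    gfs2_quota_sync(sdp);
    gfs2_statfs_sync(sdp);

    /* after -- (super_block, type[, wait]) by my reading of the call sites */
    gfs2_quota_sync(sdp->sd_vfs, 0, 1);
    gfs2_statfs_sync(sdp->sd_vfs, 0);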
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 446329728d52..54fd98425991 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -8,7 +8,6 @@
8 */ 8 */
9 9
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h> 11#include <linux/spinlock.h>
13#include <linux/completion.h> 12#include <linux/completion.h>
14#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
@@ -49,7 +48,7 @@ static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
49 return a->store ? a->store(sdp, buf, len) : len; 48 return a->store ? a->store(sdp, buf, len) : len;
50} 49}
51 50
52static struct sysfs_ops gfs2_attr_ops = { 51static const struct sysfs_ops gfs2_attr_ops = {
53 .show = gfs2_attr_show, 52 .show = gfs2_attr_show,
54 .store = gfs2_attr_store, 53 .store = gfs2_attr_store,
55}; 54};
@@ -85,11 +84,7 @@ static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf)
85 buf[0] = '\0'; 84 buf[0] = '\0';
86 if (!gfs2_uuid_valid(uuid)) 85 if (!gfs2_uuid_valid(uuid))
87 return 0; 86 return 0;
88 return snprintf(buf, PAGE_SIZE, "%02X%02X%02X%02X-%02X%02X-" 87 return snprintf(buf, PAGE_SIZE, "%pUB\n", uuid);
89 "%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n",
90 uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], uuid[5],
91 uuid[6], uuid[7], uuid[8], uuid[9], uuid[10], uuid[11],
92 uuid[12], uuid[13], uuid[14], uuid[15]);
93} 88}
94 89
95static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) 90static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
@@ -158,7 +153,7 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
158 if (simple_strtol(buf, NULL, 0) != 1) 153 if (simple_strtol(buf, NULL, 0) != 1)
159 return -EINVAL; 154 return -EINVAL;
160 155
161 gfs2_statfs_sync(sdp); 156 gfs2_statfs_sync(sdp->sd_vfs, 0);
162 return len; 157 return len;
163} 158}
164 159
@@ -171,13 +166,14 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
171 if (simple_strtol(buf, NULL, 0) != 1) 166 if (simple_strtol(buf, NULL, 0) != 1)
172 return -EINVAL; 167 return -EINVAL;
173 168
174 gfs2_quota_sync(sdp); 169 gfs2_quota_sync(sdp->sd_vfs, 0, 1);
175 return len; 170 return len;
176} 171}
177 172
178static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf, 173static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
179 size_t len) 174 size_t len)
180{ 175{
176 int error;
181 u32 id; 177 u32 id;
182 178
183 if (!capable(CAP_SYS_ADMIN)) 179 if (!capable(CAP_SYS_ADMIN))
@@ -185,13 +181,14 @@ static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
185 181
186 id = simple_strtoul(buf, NULL, 0); 182 id = simple_strtoul(buf, NULL, 0);
187 183
188 gfs2_quota_refresh(sdp, 1, id); 184 error = gfs2_quota_refresh(sdp, 1, id);
189 return len; 185 return error ? error : len;
190} 186}
191 187
192static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf, 188static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
193 size_t len) 189 size_t len)
194{ 190{
191 int error;
195 u32 id; 192 u32 id;
196 193
197 if (!capable(CAP_SYS_ADMIN)) 194 if (!capable(CAP_SYS_ADMIN))
@@ -199,8 +196,8 @@ static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
199 196
200 id = simple_strtoul(buf, NULL, 0); 197 id = simple_strtoul(buf, NULL, 0);
201 198
202 gfs2_quota_refresh(sdp, 0, id); 199 error = gfs2_quota_refresh(sdp, 0, id);
203 return len; 200 return error ? error : len;
204} 201}
205 202
206static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len) 203static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
@@ -480,7 +477,6 @@ TUNE_ATTR(complain_secs, 0);
480TUNE_ATTR(statfs_slow, 0); 477TUNE_ATTR(statfs_slow, 0);
481TUNE_ATTR(new_files_jdata, 0); 478TUNE_ATTR(new_files_jdata, 0);
482TUNE_ATTR(quota_simul_sync, 1); 479TUNE_ATTR(quota_simul_sync, 1);
483TUNE_ATTR(stall_secs, 1);
484TUNE_ATTR(statfs_quantum, 1); 480TUNE_ATTR(statfs_quantum, 1);
485TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); 481TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
486 482
@@ -493,7 +489,6 @@ static struct attribute *tune_attrs[] = {
493 &tune_attr_complain_secs.attr, 489 &tune_attr_complain_secs.attr,
494 &tune_attr_statfs_slow.attr, 490 &tune_attr_statfs_slow.attr,
495 &tune_attr_quota_simul_sync.attr, 491 &tune_attr_quota_simul_sync.attr,
496 &tune_attr_stall_secs.attr,
497 &tune_attr_statfs_quantum.attr, 492 &tune_attr_statfs_quantum.attr,
498 &tune_attr_quota_scale.attr, 493 &tune_attr_quota_scale.attr,
499 &tune_attr_new_files_jdata.attr, 494 &tune_attr_new_files_jdata.attr,
@@ -573,18 +568,12 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
573 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); 568 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
574 if (!sdp->sd_args.ar_spectator) 569 if (!sdp->sd_args.ar_spectator)
575 add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid); 570 add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
576 if (gfs2_uuid_valid(uuid)) { 571 if (gfs2_uuid_valid(uuid))
577 add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-" 572 add_uevent_var(env, "UUID=%pUB", uuid);
578 "%02X%02X-%02X%02X%02X%02X%02X%02X",
579 uuid[0], uuid[1], uuid[2], uuid[3], uuid[4],
580 uuid[5], uuid[6], uuid[7], uuid[8], uuid[9],
581 uuid[10], uuid[11], uuid[12], uuid[13],
582 uuid[14], uuid[15]);
583 }
584 return 0; 573 return 0;
585} 574}
586 575
587static struct kset_uevent_ops gfs2_uevent_ops = { 576static const struct kset_uevent_ops gfs2_uevent_ops = {
588 .uevent = gfs2_uevent, 577 .uevent = gfs2_uevent,
589}; 578};
590 579
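Both UUID hunks in this file replace a sixteen-argument snprintf() with %pUB, the kernel vsnprintf extension that formats a 16-byte UUID in upper case with the usual 8-4-4-4-12 grouping. Minimal usage, function name invented:

    #include <linux/kernel.h>
    #include <linux/types.h>

    static void example_log_uuid(const u8 uuid[16])
    {
            pr_info("UUID=%pUB\n", uuid);   /* pointer to 16 UUID bytes */
    }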
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index f6a7efa34eb9..53511291fe36 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
@@ -21,6 +20,7 @@
21#include "util.h" 20#include "util.h"
22 21
23struct kmem_cache *gfs2_glock_cachep __read_mostly; 22struct kmem_cache *gfs2_glock_cachep __read_mostly;
23struct kmem_cache *gfs2_glock_aspace_cachep __read_mostly;
24struct kmem_cache *gfs2_inode_cachep __read_mostly; 24struct kmem_cache *gfs2_inode_cachep __read_mostly;
25struct kmem_cache *gfs2_bufdata_cachep __read_mostly; 25struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
26struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; 26struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 33e96b0ce9ab..b432e04600de 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -145,6 +145,7 @@ gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__);
145 145
146 146
147extern struct kmem_cache *gfs2_glock_cachep; 147extern struct kmem_cache *gfs2_glock_cachep;
148extern struct kmem_cache *gfs2_glock_aspace_cachep;
148extern struct kmem_cache *gfs2_inode_cachep; 149extern struct kmem_cache *gfs2_inode_cachep;
149extern struct kmem_cache *gfs2_bufdata_cachep; 150extern struct kmem_cache *gfs2_bufdata_cachep;
150extern struct kmem_cache *gfs2_rgrpd_cachep; 151extern struct kmem_cache *gfs2_rgrpd_cachep;
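The new gfs2_glock_aspace_cachep pairs with the GIF_USER removals earlier in this patch: glocks that carry a metadata address space are apparently allocated from their own slab with the mapping embedded, rather than borrowing a private inode. A hypothetical layout, not taken from the patch:

    #include <linux/fs.h>

    struct example_glock { int stub; };     /* stand-in for struct gfs2_glock */

    struct example_glock_aspace {
            struct example_glock    glock;          /* ordinary glock fields */
            struct address_space    mapping;        /* metadata pages attach here */
    };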
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 8a0f8ef6ee27..c2ebdf2c01d4 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -186,8 +186,8 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
186 return 0; 186 return 0;
187} 187}
188 188
189int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, 189static int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
190 struct gfs2_ea_location *el) 190 struct gfs2_ea_location *el)
191{ 191{
192 struct ea_find ef; 192 struct ea_find ef;
193 int error; 193 int error;
@@ -516,8 +516,8 @@ out:
516 return error; 516 return error;
517} 517}
518 518
519int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, 519static int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
520 char *data, size_t size) 520 char *data, size_t size)
521{ 521{
522 int ret; 522 int ret;
523 size_t len = GFS2_EA_DATA_LEN(el->el_ea); 523 size_t len = GFS2_EA_DATA_LEN(el->el_ea);
@@ -534,21 +534,50 @@ int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
534 return len; 534 return len;
535} 535}
536 536
537int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **ppdata)
538{
539 struct gfs2_ea_location el;
540 int error;
541 int len;
542 char *data;
543
544 error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, &el);
545 if (error)
546 return error;
547 if (!el.el_ea)
548 goto out;
549 if (!GFS2_EA_DATA_LEN(el.el_ea))
550 goto out;
551
552 len = GFS2_EA_DATA_LEN(el.el_ea);
553 data = kmalloc(len, GFP_NOFS);
554 error = -ENOMEM;
555 if (data == NULL)
556 goto out;
557
558 error = gfs2_ea_get_copy(ip, &el, data, len);
559 if (error == 0)
560 error = len;
561 *ppdata = data;
562out:
563 brelse(el.el_bh);
564 return error;
565}
566
537/** 567/**
538 * gfs2_xattr_get - Get a GFS2 extended attribute 568 * gfs2_xattr_get - Get a GFS2 extended attribute
539 * @inode: The inode 569 * @inode: The inode
540 * @type: The type of extended attribute
541 * @name: The name of the extended attribute 570 * @name: The name of the extended attribute
542 * @buffer: The buffer to write the result into 571 * @buffer: The buffer to write the result into
543 * @size: The size of the buffer 572 * @size: The size of the buffer
573 * @type: The type of extended attribute
544 * 574 *
545 * Returns: actual size of data on success, -errno on error 575 * Returns: actual size of data on success, -errno on error
546 */ 576 */
547 577static int gfs2_xattr_get(struct dentry *dentry, const char *name,
548int gfs2_xattr_get(struct inode *inode, int type, const char *name, 578 void *buffer, size_t size, int type)
549 void *buffer, size_t size)
550{ 579{
551 struct gfs2_inode *ip = GFS2_I(inode); 580 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
552 struct gfs2_ea_location el; 581 struct gfs2_ea_location el;
553 int error; 582 int error;
554 583
@@ -1089,7 +1118,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
1089 1118
1090/** 1119/**
1091 * gfs2_xattr_remove - Remove a GFS2 extended attribute 1120 * gfs2_xattr_remove - Remove a GFS2 extended attribute
1092 * @inode: The inode 1121 * @ip: The inode
1093 * @type: The type of the extended attribute 1122 * @type: The type of the extended attribute
1094 * @name: The name of the extended attribute 1123 * @name: The name of the extended attribute
1095 * 1124 *
@@ -1100,9 +1129,8 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
1100 * Returns: 0, or errno on failure 1129 * Returns: 0, or errno on failure
1101 */ 1130 */
1102 1131
1103static int gfs2_xattr_remove(struct inode *inode, int type, const char *name) 1132static int gfs2_xattr_remove(struct gfs2_inode *ip, int type, const char *name)
1104{ 1133{
1105 struct gfs2_inode *ip = GFS2_I(inode);
1106 struct gfs2_ea_location el; 1134 struct gfs2_ea_location el;
1107 int error; 1135 int error;
1108 1136
@@ -1126,24 +1154,24 @@ static int gfs2_xattr_remove(struct inode *inode, int type, const char *name)
1126} 1154}
1127 1155
1128/** 1156/**
1129 * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute 1157 * __gfs2_xattr_set - Set (or remove) a GFS2 extended attribute
1130 * @inode: The inode 1158 * @ip: The inode
1131 * @type: The type of the extended attribute
1132 * @name: The name of the extended attribute 1159 * @name: The name of the extended attribute
1133 * @value: The value of the extended attribute (NULL for remove) 1160 * @value: The value of the extended attribute (NULL for remove)
1134 * @size: The size of the @value argument 1161 * @size: The size of the @value argument
1135 * @flags: Create or Replace 1162 * @flags: Create or Replace
1163 * @type: The type of the extended attribute
1136 * 1164 *
1137 * See gfs2_xattr_remove() for details of the removal of xattrs. 1165 * See gfs2_xattr_remove() for details of the removal of xattrs.
1138 * 1166 *
1139 * Returns: 0 or errno on failure 1167 * Returns: 0 or errno on failure
1140 */ 1168 */
1141 1169
1142int gfs2_xattr_set(struct inode *inode, int type, const char *name, 1170int __gfs2_xattr_set(struct inode *inode, const char *name,
1143 const void *value, size_t size, int flags) 1171 const void *value, size_t size, int flags, int type)
1144{ 1172{
1145 struct gfs2_sbd *sdp = GFS2_SB(inode);
1146 struct gfs2_inode *ip = GFS2_I(inode); 1173 struct gfs2_inode *ip = GFS2_I(inode);
1174 struct gfs2_sbd *sdp = GFS2_SB(inode);
1147 struct gfs2_ea_location el; 1175 struct gfs2_ea_location el;
1148 unsigned int namel = strlen(name); 1176 unsigned int namel = strlen(name);
1149 int error; 1177 int error;
@@ -1154,7 +1182,7 @@ int gfs2_xattr_set(struct inode *inode, int type, const char *name,
1154 return -ERANGE; 1182 return -ERANGE;
1155 1183
1156 if (value == NULL) 1184 if (value == NULL)
1157 return gfs2_xattr_remove(inode, type, name); 1185 return gfs2_xattr_remove(ip, type, name);
1158 1186
1159 if (ea_check_size(sdp, namel, size)) 1187 if (ea_check_size(sdp, namel, size))
1160 return -ERANGE; 1188 return -ERANGE;
@@ -1194,6 +1222,13 @@ int gfs2_xattr_set(struct inode *inode, int type, const char *name,
1194 return error; 1222 return error;
1195} 1223}
1196 1224
1225static int gfs2_xattr_set(struct dentry *dentry, const char *name,
1226 const void *value, size_t size, int flags, int type)
1227{
1228 return __gfs2_xattr_set(dentry->d_inode, name, value,
1229 size, flags, type);
1230}
1231
1197static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip, 1232static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
1198 struct gfs2_ea_header *ea, char *data) 1233 struct gfs2_ea_header *ea, char *data)
1199{ 1234{
@@ -1259,23 +1294,29 @@ fail:
1259 return error; 1294 return error;
1260} 1295}
1261 1296
1262int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el, 1297int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
1263 struct iattr *attr, char *data)
1264{ 1298{
1299 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1300 struct gfs2_ea_location el;
1265 struct buffer_head *dibh; 1301 struct buffer_head *dibh;
1266 int error; 1302 int error;
1267 1303
1268 if (GFS2_EA_IS_STUFFED(el->el_ea)) { 1304 error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
1269 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0); 1305 if (error)
1270 if (error) 1306 return error;
1271 return error;
1272 1307
1273 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); 1308 if (GFS2_EA_IS_STUFFED(el.el_ea)) {
1274 memcpy(GFS2_EA2DATA(el->el_ea), data, 1309 error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0);
1275 GFS2_EA_DATA_LEN(el->el_ea)); 1310 if (error == 0) {
1276 } else 1311 gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1);
1277 error = ea_acl_chmod_unstuffed(ip, el->el_ea, data); 1312 memcpy(GFS2_EA2DATA(el.el_ea), data,
1313 GFS2_EA_DATA_LEN(el.el_ea));
1314 }
1315 } else {
1316 error = ea_acl_chmod_unstuffed(ip, el.el_ea, data);
1317 }
1278 1318
1319 brelse(el.el_bh);
1279 if (error) 1320 if (error)
1280 return error; 1321 return error;
1281 1322
@@ -1288,8 +1329,7 @@ int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
1288 brelse(dibh); 1329 brelse(dibh);
1289 } 1330 }
1290 1331
1291 gfs2_trans_end(GFS2_SB(&ip->i_inode)); 1332 gfs2_trans_end(sdp);
1292
1293 return error; 1333 return error;
1294} 1334}
1295 1335
@@ -1495,58 +1535,18 @@ out_alloc:
1495 return error; 1535 return error;
1496} 1536}
1497 1537
1498static int gfs2_xattr_user_get(struct inode *inode, const char *name,
1499 void *buffer, size_t size)
1500{
1501 return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size);
1502}
1503
1504static int gfs2_xattr_user_set(struct inode *inode, const char *name,
1505 const void *value, size_t size, int flags)
1506{
1507 return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
1508}
1509
1510static int gfs2_xattr_system_get(struct inode *inode, const char *name,
1511 void *buffer, size_t size)
1512{
1513 return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size);
1514}
1515
1516static int gfs2_xattr_system_set(struct inode *inode, const char *name,
1517 const void *value, size_t size, int flags)
1518{
1519 return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags);
1520}
1521
1522static int gfs2_xattr_security_get(struct inode *inode, const char *name,
1523 void *buffer, size_t size)
1524{
1525 return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size);
1526}
1527
1528static int gfs2_xattr_security_set(struct inode *inode, const char *name,
1529 const void *value, size_t size, int flags)
1530{
1531 return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags);
1532}
1533
1534static struct xattr_handler gfs2_xattr_user_handler = { 1538static struct xattr_handler gfs2_xattr_user_handler = {
1535 .prefix = XATTR_USER_PREFIX, 1539 .prefix = XATTR_USER_PREFIX,
1536 .get = gfs2_xattr_user_get, 1540 .flags = GFS2_EATYPE_USR,
1537 .set = gfs2_xattr_user_set, 1541 .get = gfs2_xattr_get,
1542 .set = gfs2_xattr_set,
1538}; 1543};
1539 1544
1540static struct xattr_handler gfs2_xattr_security_handler = { 1545static struct xattr_handler gfs2_xattr_security_handler = {
1541 .prefix = XATTR_SECURITY_PREFIX, 1546 .prefix = XATTR_SECURITY_PREFIX,
1542 .get = gfs2_xattr_security_get, 1547 .flags = GFS2_EATYPE_SECURITY,
1543 .set = gfs2_xattr_security_set, 1548 .get = gfs2_xattr_get,
1544}; 1549 .set = gfs2_xattr_set,
1545
1546static struct xattr_handler gfs2_xattr_system_handler = {
1547 .prefix = XATTR_SYSTEM_PREFIX,
1548 .get = gfs2_xattr_system_get,
1549 .set = gfs2_xattr_system_set,
1550}; 1550};
1551 1551
1552struct xattr_handler *gfs2_xattr_handlers[] = { 1552struct xattr_handler *gfs2_xattr_handlers[] = {
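The newly added gfs2_xattr_acl_get() allocates the result buffer itself and, by my reading of the code above, returns the data length on success, 0 when the attribute is absent or empty, and -errno on failure, leaving the caller to kfree() the buffer. A usage sketch under that assumption:

    char *data = NULL;
    int len;

    len = gfs2_xattr_acl_get(ip, GFS2_POSIX_ACL_ACCESS, &data);
    if (len < 0)
            return len;             /* -errno */
    if (len > 0) {
            /* ... consume len bytes of ACL data ... */
            kfree(data);
    }
    /* len == 0: no attribute; data was never allocated */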
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
index cbdfd7743733..d392f8358f2f 100644
--- a/fs/gfs2/xattr.h
+++ b/fs/gfs2/xattr.h
@@ -53,20 +53,15 @@ struct gfs2_ea_location {
53 struct gfs2_ea_header *el_prev; 53 struct gfs2_ea_header *el_prev;
54}; 54};
55 55
56extern int gfs2_xattr_get(struct inode *inode, int type, const char *name, 56extern int __gfs2_xattr_set(struct inode *inode, const char *name,
57 void *buffer, size_t size); 57 const void *value, size_t size,
58extern int gfs2_xattr_set(struct inode *inode, int type, const char *name, 58 int flags, int type);
59 const void *value, size_t size, int flags);
60extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size); 59extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
61extern int gfs2_ea_dealloc(struct gfs2_inode *ip); 60extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
62 61
63/* Exported to acl.c */ 62/* Exported to acl.c */
64 63
65extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, 64extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data);
66 struct gfs2_ea_location *el); 65extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data);
67extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
68 char *data, size_t size);
69extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
70 struct iattr *attr, char *data);
71 66
72#endif /* __EATTR_DOT_H__ */ 67#endif /* __EATTR_DOT_H__ */
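The handler table now stores the EA type in .flags and points every prefix at the shared gfs2_xattr_get()/gfs2_xattr_set() pair; the VFS passes the matched handler's .flags back as the trailing type argument, which is what keeps the shared functions generic. A stripped-down sketch of the pattern, names invented:

    #include <linux/errno.h>
    #include <linux/fs.h>
    #include <linux/xattr.h>

    static int example_get(struct dentry *dentry, const char *name,
                           void *buffer, size_t size, int type)
    {
            /* "type" arrives as the matched handler's .flags value */
            return -EOPNOTSUPP;
    }

    static struct xattr_handler example_user_handler = {
            .prefix = XATTR_USER_PREFIX,
            .flags  = 1,            /* e.g. GFS2_EATYPE_USR, echoed back as "type" */
            .get    = example_get,
    };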
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 0d200068d0af..cdb41a1f6a64 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/slab.h>
12#include <linux/swap.h> 13#include <linux/swap.h>
13 14
14#include "btree.h" 15#include "btree.h"
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 052f214ea6f0..38a0a9917d7f 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/slab.h>
12#include <linux/log2.h> 13#include <linux/log2.h>
13 14
14#include "btree.h" 15#include "btree.h"
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index 6d98f116ca03..424b0337f524 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -289,6 +289,10 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name,
289 err = hfs_brec_find(&src_fd); 289 err = hfs_brec_find(&src_fd);
290 if (err) 290 if (err)
291 goto out; 291 goto out;
292 if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) {
293 err = -EIO;
294 goto out;
295 }
292 296
293 hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset, 297 hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset,
294 src_fd.entrylength); 298 src_fd.entrylength);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 7c69b98a2e45..2b3b8611b41b 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -79,6 +79,11 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
79 filp->f_pos++; 79 filp->f_pos++;
80 /* fall through */ 80 /* fall through */
81 case 1: 81 case 1:
82 if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
83 err = -EIO;
84 goto out;
85 }
86
82 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); 87 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
83 if (entry.type != HFS_CDR_THD) { 88 if (entry.type != HFS_CDR_THD) {
84 printk(KERN_ERR "hfs: bad catalog folder thread\n"); 89 printk(KERN_ERR "hfs: bad catalog folder thread\n");
@@ -109,6 +114,12 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
109 err = -EIO; 114 err = -EIO;
110 goto out; 115 goto out;
111 } 116 }
117
118 if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
119 err = -EIO;
120 goto out;
121 }
122
112 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); 123 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
113 type = entry.type; 124 type = entry.type;
114 len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName); 125 len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName);
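The catalog.c hunk and both dir.c hunks (and hfs_fill_super further down) add the same guard: validate fd.entrylength before hfs_bnode_read() copies the record into an on-stack buffer, so a corrupted catalog b-tree cannot overrun it. Restated as a standalone helper with a hypothetical name:

    #include <linux/errno.h>

    /* entrylength is a signed length read from the b-tree; bufsize is the
     * size of the on-stack record it is about to be copied into. */
    static int example_check_entrylength(int entrylength, size_t bufsize)
    {
            if (entrylength < 0 || (size_t)entrylength > bufsize)
                    return -EIO;    /* treat it as on-disk corruption */
            return 0;
    }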
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 052387e11671..fe35e3b626c4 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -188,7 +188,7 @@ extern const struct address_space_operations hfs_btree_aops;
188 188
189extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int); 189extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int);
190extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); 190extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
191extern int hfs_write_inode(struct inode *, int); 191extern int hfs_write_inode(struct inode *, struct writeback_control *);
192extern int hfs_inode_setattr(struct dentry *, struct iattr *); 192extern int hfs_inode_setattr(struct dentry *, struct iattr *);
193extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, 193extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
194 __be32 log_size, __be32 phys_size, u32 clump_size); 194 __be32 log_size, __be32 phys_size, u32 clump_size);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index a1cbff2b4d99..14f5cb1b9fdc 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -381,7 +381,7 @@ void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext,
381 HFS_SB(inode->i_sb)->alloc_blksz); 381 HFS_SB(inode->i_sb)->alloc_blksz);
382} 382}
383 383
384int hfs_write_inode(struct inode *inode, int unused) 384int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
385{ 385{
386 struct inode *main_inode = inode; 386 struct inode *main_inode = inode;
387 struct hfs_find_data fd; 387 struct hfs_find_data fd;
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 8bbe03c3f6d5..86428f5ac991 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -11,6 +11,7 @@
11#include <linux/cdrom.h> 11#include <linux/cdrom.h>
12#include <linux/genhd.h> 12#include <linux/genhd.h>
13#include <linux/nls.h> 13#include <linux/nls.h>
14#include <linux/slab.h>
14 15
15#include "hfs_fs.h" 16#include "hfs_fs.h"
16#include "btree.h" 17#include "btree.h"
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index f7fcbe49da72..0a81eb7111f3 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -19,6 +19,7 @@
19#include <linux/nls.h> 19#include <linux/nls.h>
20#include <linux/parser.h> 20#include <linux/parser.h>
21#include <linux/seq_file.h> 21#include <linux/seq_file.h>
22#include <linux/slab.h>
22#include <linux/smp_lock.h> 23#include <linux/smp_lock.h>
23#include <linux/vfs.h> 24#include <linux/vfs.h>
24 25
@@ -409,8 +410,13 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
409 /* try to get the root inode */ 410 /* try to get the root inode */
410 hfs_find_init(HFS_SB(sb)->cat_tree, &fd); 411 hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
411 res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd); 412 res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd);
412 if (!res) 413 if (!res) {
414 if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) {
415 res = -EIO;
416 goto bail;
417 }
413 hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength); 418 hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength);
419 }
414 if (res) { 420 if (res) {
415 hfs_find_exit(&fd); 421 hfs_find_exit(&fd);
416 goto bail_no_root; 422 goto bail_no_root;
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 3fcbb0e1f6fc..572628b4b07d 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -15,6 +15,7 @@
15#include <linux/nls.h> 15#include <linux/nls.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/seq_file.h> 17#include <linux/seq_file.h>
18#include <linux/slab.h>
18#include "hfsplus_fs.h" 19#include "hfsplus_fs.h"
19 20
20enum { 21enum {
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 43022f3d5148..74b473a8ef92 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -87,7 +87,8 @@ bad_inode:
87 return ERR_PTR(err); 87 return ERR_PTR(err);
88} 88}
89 89
90static int hfsplus_write_inode(struct inode *inode, int unused) 90static int hfsplus_write_inode(struct inode *inode,
91 struct writeback_control *wbc)
91{ 92{
92 struct hfsplus_vh *vhdr; 93 struct hfsplus_vh *vhdr;
93 int ret = 0; 94 int ret = 0;
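hfs and hfsplus pick up the same ->write_inode() prototype change applied to gfs2 above; the writeback_control pointer replaces an int that hfs literally named "unused". Assuming the post-change struct super_operations, the wiring itself does not change, only the signature:

    #include <linux/fs.h>
    #include <linux/writeback.h>

    static int example_write_inode(struct inode *inode,
                                   struct writeback_control *wbc);

    static const struct super_operations example_sops = {
            .write_inode    = example_write_inode,
    };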
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 032604e5ef2c..3a029d8f4cf1 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -11,6 +11,7 @@
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/statfs.h> 13#include <linux/statfs.h>
14#include <linux/slab.h>
14#include <linux/seq_file.h> 15#include <linux/seq_file.h>
15#include <linux/mount.h> 16#include <linux/mount.h>
16#include "hostfs.h" 17#include "hostfs.h"
diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c
index 1aa88c4e0964..6a2f04bf3df0 100644
--- a/fs/hpfs/anode.c
+++ b/fs/hpfs/anode.c
@@ -353,7 +353,7 @@ int hpfs_ea_read(struct super_block *s, secno a, int ano, unsigned pos,
353} 353}
354 354
355int hpfs_ea_write(struct super_block *s, secno a, int ano, unsigned pos, 355int hpfs_ea_write(struct super_block *s, secno a, int ano, unsigned pos,
356 unsigned len, char *buf) 356 unsigned len, const char *buf)
357{ 357{
358 struct buffer_head *bh; 358 struct buffer_head *bh;
359 char *data; 359 char *data;
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index b6fca543544c..eac5f96323e3 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -6,6 +6,7 @@
6 * general buffer i/o 6 * general buffer i/o
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/slab.h>
9#include "hpfs_fn.h" 10#include "hpfs_fn.h"
10 11
11void hpfs_lock_creation(struct super_block *s) 12void hpfs_lock_creation(struct super_block *s)
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 940d6d150bee..67d9d36b3d5f 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -20,8 +20,8 @@ static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
20 20
21 if (l == 1) if (qstr->name[0]=='.') goto x; 21 if (l == 1) if (qstr->name[0]=='.') goto x;
22 if (l == 2) if (qstr->name[0]=='.' || qstr->name[1]=='.') goto x; 22 if (l == 2) if (qstr->name[0]=='.' || qstr->name[1]=='.') goto x;
23 hpfs_adjust_length((char *)qstr->name, &l); 23 hpfs_adjust_length(qstr->name, &l);
24 /*if (hpfs_chk_name((char *)qstr->name,&l))*/ 24 /*if (hpfs_chk_name(qstr->name,&l))*/
25 /*return -ENAMETOOLONG;*/ 25 /*return -ENAMETOOLONG;*/
26 /*return -ENOENT;*/ 26 /*return -ENOENT;*/
27 x: 27 x:
@@ -38,14 +38,16 @@ static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qst
38{ 38{
39 unsigned al=a->len; 39 unsigned al=a->len;
40 unsigned bl=b->len; 40 unsigned bl=b->len;
41 hpfs_adjust_length((char *)a->name, &al); 41 hpfs_adjust_length(a->name, &al);
42 /*hpfs_adjust_length((char *)b->name, &bl);*/ 42 /*hpfs_adjust_length(b->name, &bl);*/
43 /* 'a' is the qstr of an already existing dentry, so the name 43 /* 'a' is the qstr of an already existing dentry, so the name
44 * must be valid. 'b' must be validated first. 44 * must be valid. 'b' must be validated first.
45 */ 45 */
46 46
47 if (hpfs_chk_name((char *)b->name, &bl)) return 1; 47 if (hpfs_chk_name(b->name, &bl))
48 if (hpfs_compare_names(dentry->d_sb, (char *)a->name, al, (char *)b->name, bl, 0)) return 1; 48 return 1;
49 if (hpfs_compare_names(dentry->d_sb, a->name, al, b->name, bl, 0))
50 return 1;
49 return 0; 51 return 0;
50} 52}
51 53
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 8865c94f55f6..2338130cceba 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
10#include <linux/slab.h>
10#include "hpfs_fn.h" 11#include "hpfs_fn.h"
11 12
12static int hpfs_dir_release(struct inode *inode, struct file *filp) 13static int hpfs_dir_release(struct inode *inode, struct file *filp)
@@ -59,7 +60,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
59 struct hpfs_dirent *de; 60 struct hpfs_dirent *de;
60 int lc; 61 int lc;
61 long old_pos; 62 long old_pos;
62 char *tempname; 63 unsigned char *tempname;
63 int c1, c2 = 0; 64 int c1, c2 = 0;
64 int ret = 0; 65 int ret = 0;
65 66
@@ -158,11 +159,11 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
158 tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3); 159 tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3);
159 if (filldir(dirent, tempname, de->namelen, old_pos, de->fnode, DT_UNKNOWN) < 0) { 160 if (filldir(dirent, tempname, de->namelen, old_pos, de->fnode, DT_UNKNOWN) < 0) {
160 filp->f_pos = old_pos; 161 filp->f_pos = old_pos;
161 if (tempname != (char *)de->name) kfree(tempname); 162 if (tempname != de->name) kfree(tempname);
162 hpfs_brelse4(&qbh); 163 hpfs_brelse4(&qbh);
163 goto out; 164 goto out;
164 } 165 }
165 if (tempname != (char *)de->name) kfree(tempname); 166 if (tempname != de->name) kfree(tempname);
166 hpfs_brelse4(&qbh); 167 hpfs_brelse4(&qbh);
167 } 168 }
168out: 169out:
@@ -187,7 +188,7 @@ out:
187 188
188struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 189struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
189{ 190{
190 const char *name = dentry->d_name.name; 191 const unsigned char *name = dentry->d_name.name;
191 unsigned len = dentry->d_name.len; 192 unsigned len = dentry->d_name.len;
192 struct quad_buffer_head qbh; 193 struct quad_buffer_head qbh;
193 struct hpfs_dirent *de; 194 struct hpfs_dirent *de;
@@ -197,7 +198,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
197 struct hpfs_inode_info *hpfs_result; 198 struct hpfs_inode_info *hpfs_result;
198 199
199 lock_kernel(); 200 lock_kernel();
200 if ((err = hpfs_chk_name((char *)name, &len))) { 201 if ((err = hpfs_chk_name(name, &len))) {
201 if (err == -ENAMETOOLONG) { 202 if (err == -ENAMETOOLONG) {
202 unlock_kernel(); 203 unlock_kernel();
203 return ERR_PTR(-ENAMETOOLONG); 204 return ERR_PTR(-ENAMETOOLONG);
@@ -209,7 +210,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
209 * '.' and '..' will never be passed here. 210 * '.' and '..' will never be passed here.
210 */ 211 */
211 212
212 de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *) name, len, NULL, &qbh); 213 de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, NULL, &qbh);
213 214
214 /* 215 /*
215 * This is not really a bailout, just means file not found. 216 * This is not really a bailout, just means file not found.
@@ -250,7 +251,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
250 hpfs_result = hpfs_i(result); 251 hpfs_result = hpfs_i(result);
251 if (!de->directory) hpfs_result->i_parent_dir = dir->i_ino; 252 if (!de->directory) hpfs_result->i_parent_dir = dir->i_ino;
252 253
253 hpfs_decide_conv(result, (char *)name, len); 254 hpfs_decide_conv(result, name, len);
254 255
255 if (de->has_acl || de->has_xtd_perm) if (!(dir->i_sb->s_flags & MS_RDONLY)) { 256 if (de->has_acl || de->has_xtd_perm) if (!(dir->i_sb->s_flags & MS_RDONLY)) {
256 hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures"); 257 hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures");
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c
index fe83c2b7d2d8..9b2ffadfc8c4 100644
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -158,7 +158,8 @@ static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno
158 158
159/* Add an entry to dnode and don't care if it grows over 2048 bytes */ 159/* Add an entry to dnode and don't care if it grows over 2048 bytes */
160 160
161struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d, unsigned char *name, 161struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d,
162 const unsigned char *name,
162 unsigned namelen, secno down_ptr) 163 unsigned namelen, secno down_ptr)
163{ 164{
164 struct hpfs_dirent *de; 165 struct hpfs_dirent *de;
@@ -223,7 +224,7 @@ static void fix_up_ptrs(struct super_block *s, struct dnode *d)
223/* Add an entry to dnode and do dnode splitting if required */ 224/* Add an entry to dnode and do dnode splitting if required */
224 225
225static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, 226static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
226 unsigned char *name, unsigned namelen, 227 const unsigned char *name, unsigned namelen,
227 struct hpfs_dirent *new_de, dnode_secno down_ptr) 228 struct hpfs_dirent *new_de, dnode_secno down_ptr)
228{ 229{
229 struct quad_buffer_head qbh, qbh1, qbh2; 230 struct quad_buffer_head qbh, qbh1, qbh2;
@@ -231,7 +232,7 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
231 dnode_secno adno, rdno; 232 dnode_secno adno, rdno;
232 struct hpfs_dirent *de; 233 struct hpfs_dirent *de;
233 struct hpfs_dirent nde; 234 struct hpfs_dirent nde;
234 char *nname; 235 unsigned char *nname;
235 int h; 236 int h;
236 int pos; 237 int pos;
237 struct buffer_head *bh; 238 struct buffer_head *bh;
@@ -305,7 +306,9 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
305 pos++; 306 pos++;
306 } 307 }
307 copy_de(new_de = &nde, de); 308 copy_de(new_de = &nde, de);
308 memcpy(name = nname, de->name, namelen = de->namelen); 309 memcpy(nname, de->name, de->namelen);
310 name = nname;
311 namelen = de->namelen;
309 for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, 4); 312 for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, 4);
310 down_ptr = adno; 313 down_ptr = adno;
311 set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0); 314 set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0);
@@ -368,7 +371,8 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
368 * I hope, now it's finally bug-free. 371 * I hope, now it's finally bug-free.
369 */ 372 */
370 373
371int hpfs_add_dirent(struct inode *i, unsigned char *name, unsigned namelen, 374int hpfs_add_dirent(struct inode *i,
375 const unsigned char *name, unsigned namelen,
372 struct hpfs_dirent *new_de, int cdepth) 376 struct hpfs_dirent *new_de, int cdepth)
373{ 377{
374 struct hpfs_inode_info *hpfs_inode = hpfs_i(i); 378 struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
@@ -897,7 +901,8 @@ struct hpfs_dirent *map_pos_dirent(struct inode *inode, loff_t *posp,
897 901
898/* Find a dirent in tree */ 902/* Find a dirent in tree */
899 903
900struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno, char *name, unsigned len, 904struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno,
905 const unsigned char *name, unsigned len,
901 dnode_secno *dd, struct quad_buffer_head *qbh) 906 dnode_secno *dd, struct quad_buffer_head *qbh)
902{ 907{
903 struct dnode *dnode; 908 struct dnode *dnode;
@@ -988,8 +993,8 @@ void hpfs_remove_dtree(struct super_block *s, dnode_secno dno)
988struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, 993struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno,
989 struct fnode *f, struct quad_buffer_head *qbh) 994 struct fnode *f, struct quad_buffer_head *qbh)
990{ 995{
991 char *name1; 996 unsigned char *name1;
992 char *name2; 997 unsigned char *name2;
993 int name1len, name2len; 998 int name1len, name2len;
994 struct dnode *d; 999 struct dnode *d;
995 dnode_secno dno, downd; 1000 dnode_secno dno, downd;
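One dnode.c hunk is more than a cast fix: with name now const-qualified, the old memcpy(name = nname, ...) would pass a const-qualified pointer as memcpy()'s destination, so the assignment is split out of the call. The same dance in isolation:

    #include <linux/string.h>

    static void example_split(unsigned char *nname,
                              const unsigned char *src, unsigned int n)
    {
            const unsigned char *name;

            memcpy(nname, src, n);  /* copy through the writable pointer */
            name = nname;           /* then let the const view alias it */
            (void)name;
    }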
diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c
index 547a8384571f..45e53d972b42 100644
--- a/fs/hpfs/ea.c
+++ b/fs/hpfs/ea.c
@@ -62,8 +62,8 @@ static char *get_indirect_ea(struct super_block *s, int ano, secno a, int size)
62 return ret; 62 return ret;
63} 63}
64 64
65static void set_indirect_ea(struct super_block *s, int ano, secno a, char *data, 65static void set_indirect_ea(struct super_block *s, int ano, secno a,
66 int size) 66 const char *data, int size)
67{ 67{
68 hpfs_ea_write(s, a, ano, 0, size, data); 68 hpfs_ea_write(s, a, ano, 0, size, data);
69} 69}
@@ -186,7 +186,8 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
186 * This driver can't change sizes of eas ('cause I just don't need it). 186 * This driver can't change sizes of eas ('cause I just don't need it).
187 */ 187 */
188 188
189void hpfs_set_ea(struct inode *inode, struct fnode *fnode, char *key, char *data, int size) 189void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
190 const char *data, int size)
190{ 191{
191 fnode_secno fno = inode->i_ino; 192 fnode_secno fno = inode->i_ino;
192 struct super_block *s = inode->i_sb; 193 struct super_block *s = inode->i_sb;
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 701ca54c0867..97bf738cd5d6 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -215,7 +215,7 @@ secno hpfs_bplus_lookup(struct super_block *, struct inode *, struct bplus_heade
215secno hpfs_add_sector_to_btree(struct super_block *, secno, int, unsigned); 215secno hpfs_add_sector_to_btree(struct super_block *, secno, int, unsigned);
216void hpfs_remove_btree(struct super_block *, struct bplus_header *); 216void hpfs_remove_btree(struct super_block *, struct bplus_header *);
217int hpfs_ea_read(struct super_block *, secno, int, unsigned, unsigned, char *); 217int hpfs_ea_read(struct super_block *, secno, int, unsigned, unsigned, char *);
218int hpfs_ea_write(struct super_block *, secno, int, unsigned, unsigned, char *); 218int hpfs_ea_write(struct super_block *, secno, int, unsigned, unsigned, const char *);
219void hpfs_ea_remove(struct super_block *, secno, int, unsigned); 219void hpfs_ea_remove(struct super_block *, secno, int, unsigned);
220void hpfs_truncate_btree(struct super_block *, secno, int, unsigned); 220void hpfs_truncate_btree(struct super_block *, secno, int, unsigned);
221void hpfs_remove_fnode(struct super_block *, fnode_secno fno); 221void hpfs_remove_fnode(struct super_block *, fnode_secno fno);
@@ -244,13 +244,17 @@ extern const struct file_operations hpfs_dir_ops;
244 244
245void hpfs_add_pos(struct inode *, loff_t *); 245void hpfs_add_pos(struct inode *, loff_t *);
246void hpfs_del_pos(struct inode *, loff_t *); 246void hpfs_del_pos(struct inode *, loff_t *);
247struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *, unsigned char *, unsigned, secno); 247struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *,
248int hpfs_add_dirent(struct inode *, unsigned char *, unsigned, struct hpfs_dirent *, int); 248 const unsigned char *, unsigned, secno);
249int hpfs_add_dirent(struct inode *, const unsigned char *, unsigned,
250 struct hpfs_dirent *, int);
249int hpfs_remove_dirent(struct inode *, dnode_secno, struct hpfs_dirent *, struct quad_buffer_head *, int); 251int hpfs_remove_dirent(struct inode *, dnode_secno, struct hpfs_dirent *, struct quad_buffer_head *, int);
250void hpfs_count_dnodes(struct super_block *, dnode_secno, int *, int *, int *); 252void hpfs_count_dnodes(struct super_block *, dnode_secno, int *, int *, int *);
251dnode_secno hpfs_de_as_down_as_possible(struct super_block *, dnode_secno dno); 253dnode_secno hpfs_de_as_down_as_possible(struct super_block *, dnode_secno dno);
252struct hpfs_dirent *map_pos_dirent(struct inode *, loff_t *, struct quad_buffer_head *); 254struct hpfs_dirent *map_pos_dirent(struct inode *, loff_t *, struct quad_buffer_head *);
253struct hpfs_dirent *map_dirent(struct inode *, dnode_secno, char *, unsigned, dnode_secno *, struct quad_buffer_head *); 255struct hpfs_dirent *map_dirent(struct inode *, dnode_secno,
256 const unsigned char *, unsigned, dnode_secno *,
257 struct quad_buffer_head *);
254void hpfs_remove_dtree(struct super_block *, dnode_secno); 258void hpfs_remove_dtree(struct super_block *, dnode_secno);
255struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct fnode *, struct quad_buffer_head *); 259struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct fnode *, struct quad_buffer_head *);
256 260
@@ -259,7 +263,8 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct f
259void hpfs_ea_ext_remove(struct super_block *, secno, int, unsigned); 263void hpfs_ea_ext_remove(struct super_block *, secno, int, unsigned);
260int hpfs_read_ea(struct super_block *, struct fnode *, char *, char *, int); 264int hpfs_read_ea(struct super_block *, struct fnode *, char *, char *, int);
261char *hpfs_get_ea(struct super_block *, struct fnode *, char *, int *); 265char *hpfs_get_ea(struct super_block *, struct fnode *, char *, int *);
262void hpfs_set_ea(struct inode *, struct fnode *, char *, char *, int); 266void hpfs_set_ea(struct inode *, struct fnode *, const char *,
267 const char *, int);
263 268
264/* file.c */ 269/* file.c */
265 270
@@ -282,7 +287,7 @@ void hpfs_delete_inode(struct inode *);
282 287
283unsigned *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *); 288unsigned *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
284unsigned *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *); 289unsigned *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
285char *hpfs_load_code_page(struct super_block *, secno); 290unsigned char *hpfs_load_code_page(struct super_block *, secno);
286secno *hpfs_load_bitmap_directory(struct super_block *, secno bmp); 291secno *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
287struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **); 292struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **);
288struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **); 293struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **);
@@ -292,12 +297,13 @@ dnode_secno hpfs_fnode_dno(struct super_block *s, ino_t ino);
292/* name.c */ 297/* name.c */
293 298
294unsigned char hpfs_upcase(unsigned char *, unsigned char); 299unsigned char hpfs_upcase(unsigned char *, unsigned char);
295int hpfs_chk_name(unsigned char *, unsigned *); 300int hpfs_chk_name(const unsigned char *, unsigned *);
296char *hpfs_translate_name(struct super_block *, unsigned char *, unsigned, int, int); 301unsigned char *hpfs_translate_name(struct super_block *, unsigned char *, unsigned, int, int);
297int hpfs_compare_names(struct super_block *, unsigned char *, unsigned, unsigned char *, unsigned, int); 302int hpfs_compare_names(struct super_block *, const unsigned char *, unsigned,
298int hpfs_is_name_long(unsigned char *, unsigned); 303 const unsigned char *, unsigned, int);
299void hpfs_adjust_length(unsigned char *, unsigned *); 304int hpfs_is_name_long(const unsigned char *, unsigned);
300void hpfs_decide_conv(struct inode *, unsigned char *, unsigned); 305void hpfs_adjust_length(const unsigned char *, unsigned *);
306void hpfs_decide_conv(struct inode *, const unsigned char *, unsigned);
301 307
302/* namei.c */ 308/* namei.c */
303 309
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index fe703ae46bc7..1042a9bc97f3 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
10#include <linux/slab.h>
10#include "hpfs_fn.h" 11#include "hpfs_fn.h"
11 12
12void hpfs_init_inode(struct inode *i) 13void hpfs_init_inode(struct inode *i)
@@ -46,7 +47,7 @@ void hpfs_read_inode(struct inode *i)
46 struct fnode *fnode; 47 struct fnode *fnode;
47 struct super_block *sb = i->i_sb; 48 struct super_block *sb = i->i_sb;
48 struct hpfs_inode_info *hpfs_inode = hpfs_i(i); 49 struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
49 unsigned char *ea; 50 void *ea;
50 int ea_size; 51 int ea_size;
51 52
52 if (!(fnode = hpfs_map_fnode(sb, i->i_ino, &bh))) { 53 if (!(fnode = hpfs_map_fnode(sb, i->i_ino, &bh))) {
@@ -112,7 +113,7 @@ void hpfs_read_inode(struct inode *i)
112 } 113 }
113 } 114 }
114 if (fnode->dirflag) { 115 if (fnode->dirflag) {
115 unsigned n_dnodes, n_subdirs; 116 int n_dnodes, n_subdirs;
116 i->i_mode |= S_IFDIR; 117 i->i_mode |= S_IFDIR;
117 i->i_op = &hpfs_dir_iops; 118 i->i_op = &hpfs_dir_iops;
118 i->i_fop = &hpfs_dir_ops; 119 i->i_fop = &hpfs_dir_ops;
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index c4724589b2eb..840d033ecee8 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -35,7 +35,7 @@ unsigned int *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
35 * lowercasing table 35 * lowercasing table
36 */ 36 */
37 37
38char *hpfs_load_code_page(struct super_block *s, secno cps) 38unsigned char *hpfs_load_code_page(struct super_block *s, secno cps)
39{ 39{
40 struct buffer_head *bh; 40 struct buffer_head *bh;
41 secno cpds; 41 secno cpds;
@@ -71,7 +71,7 @@ char *hpfs_load_code_page(struct super_block *s, secno cps)
71 brelse(bh); 71 brelse(bh);
72 return NULL; 72 return NULL;
73 } 73 }
74 ptr = (char *)cpd + cpd->offs[cpi] + 6; 74 ptr = (unsigned char *)cpd + cpd->offs[cpi] + 6;
75 if (!(cp_table = kmalloc(256, GFP_KERNEL))) { 75 if (!(cp_table = kmalloc(256, GFP_KERNEL))) {
76 printk("HPFS: out of memory for code page table\n"); 76 printk("HPFS: out of memory for code page table\n");
77 brelse(bh); 77 brelse(bh);
@@ -217,7 +217,7 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno,
217 if ((dnode = hpfs_map_4sectors(s, secno, qbh, DNODE_RD_AHEAD))) 217 if ((dnode = hpfs_map_4sectors(s, secno, qbh, DNODE_RD_AHEAD)))
218 if (hpfs_sb(s)->sb_chk) { 218 if (hpfs_sb(s)->sb_chk) {
219 unsigned p, pp = 0; 219 unsigned p, pp = 0;
220 unsigned char *d = (char *)dnode; 220 unsigned char *d = (unsigned char *)dnode;
221 int b = 0; 221 int b = 0;
222 if (dnode->magic != DNODE_MAGIC) { 222 if (dnode->magic != DNODE_MAGIC) {
223 hpfs_error(s, "bad magic on dnode %08x", secno); 223 hpfs_error(s, "bad magic on dnode %08x", secno);
diff --git a/fs/hpfs/name.c b/fs/hpfs/name.c
index 1f4a964384eb..f24736d7a439 100644
--- a/fs/hpfs/name.c
+++ b/fs/hpfs/name.c
@@ -8,16 +8,16 @@
8 8
9#include "hpfs_fn.h" 9#include "hpfs_fn.h"
10 10
11static char *text_postfix[]={ 11static const char *text_postfix[]={
12".ASM", ".BAS", ".BAT", ".C", ".CC", ".CFG", ".CMD", ".CON", ".CPP", ".DEF", 12".ASM", ".BAS", ".BAT", ".C", ".CC", ".CFG", ".CMD", ".CON", ".CPP", ".DEF",
13".DOC", ".DPR", ".ERX", ".H", ".HPP", ".HTM", ".HTML", ".JAVA", ".LOG", ".PAS", 13".DOC", ".DPR", ".ERX", ".H", ".HPP", ".HTM", ".HTML", ".JAVA", ".LOG", ".PAS",
14".RC", ".TEX", ".TXT", ".Y", ""}; 14".RC", ".TEX", ".TXT", ".Y", ""};
15 15
16static char *text_prefix[]={ 16static const char *text_prefix[]={
17"AUTOEXEC.", "CHANGES", "COPYING", "CONFIG.", "CREDITS", "FAQ", "FILE_ID.DIZ", 17"AUTOEXEC.", "CHANGES", "COPYING", "CONFIG.", "CREDITS", "FAQ", "FILE_ID.DIZ",
18"MAKEFILE", "READ.ME", "README", "TERMCAP", ""}; 18"MAKEFILE", "READ.ME", "README", "TERMCAP", ""};
19 19
20void hpfs_decide_conv(struct inode *inode, unsigned char *name, unsigned len) 20void hpfs_decide_conv(struct inode *inode, const unsigned char *name, unsigned len)
21{ 21{
22 struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); 22 struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
23 int i; 23 int i;
@@ -71,7 +71,7 @@ static inline unsigned char locase(unsigned char *dir, unsigned char a)
71 return dir[a]; 71 return dir[a];
72} 72}
73 73
74int hpfs_chk_name(unsigned char *name, unsigned *len) 74int hpfs_chk_name(const unsigned char *name, unsigned *len)
75{ 75{
76 int i; 76 int i;
77 if (*len > 254) return -ENAMETOOLONG; 77 if (*len > 254) return -ENAMETOOLONG;
@@ -83,10 +83,10 @@ int hpfs_chk_name(unsigned char *name, unsigned *len)
83 return 0; 83 return 0;
84} 84}
85 85
86char *hpfs_translate_name(struct super_block *s, unsigned char *from, 86unsigned char *hpfs_translate_name(struct super_block *s, unsigned char *from,
87 unsigned len, int lc, int lng) 87 unsigned len, int lc, int lng)
88{ 88{
89 char *to; 89 unsigned char *to;
90 int i; 90 int i;
91 if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) { 91 if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) {
92 printk("HPFS: Long name flag mismatch - name "); 92 printk("HPFS: Long name flag mismatch - name ");
@@ -103,8 +103,9 @@ char *hpfs_translate_name(struct super_block *s, unsigned char *from,
103 return to; 103 return to;
104} 104}
105 105
106int hpfs_compare_names(struct super_block *s, unsigned char *n1, unsigned l1, 106int hpfs_compare_names(struct super_block *s,
107 unsigned char *n2, unsigned l2, int last) 107 const unsigned char *n1, unsigned l1,
108 const unsigned char *n2, unsigned l2, int last)
108{ 109{
109 unsigned l = l1 < l2 ? l1 : l2; 110 unsigned l = l1 < l2 ? l1 : l2;
110 unsigned i; 111 unsigned i;
@@ -120,7 +121,7 @@ int hpfs_compare_names(struct super_block *s, unsigned char *n1, unsigned l1,
120 return 0; 121 return 0;
121} 122}
122 123
123int hpfs_is_name_long(unsigned char *name, unsigned len) 124int hpfs_is_name_long(const unsigned char *name, unsigned len)
124{ 125{
125 int i,j; 126 int i,j;
126 for (i = 0; i < len && name[i] != '.'; i++) 127 for (i = 0; i < len && name[i] != '.'; i++)
@@ -134,7 +135,7 @@ int hpfs_is_name_long(unsigned char *name, unsigned len)
 
 /* OS/2 clears dots and spaces at the end of file name, so we have to */
 
-void hpfs_adjust_length(unsigned char *name, unsigned *len)
+void hpfs_adjust_length(const unsigned char *name, unsigned *len)
 {
 	if (!*len) return;
 	if (*len == 1 && name[0] == '.') return;
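Editorial aside: everything in this fs/hpfs/name.c hunk is const-correctness. Once the helpers accept const unsigned char *, callers can hand them dentry->d_name.name (itself const) directly, which is exactly why the (char *) casts disappear in fs/hpfs/namei.c below. A minimal userspace sketch of the idea, with an invented chk_name() standing in for hpfs_chk_name() (not kernel code):

#include <stdio.h>

/* hypothetical helper mirroring the const-qualified signature above */
static int chk_name(const unsigned char *name, unsigned len)
{
	return len > 0 && len <= 254 && name[0] != '.';
}

int main(void)
{
	const unsigned char *name = (const unsigned char *)"README";

	/* with a non-const parameter this call would need an ugly cast */
	printf("valid: %d\n", chk_name(name, 6));
	return 0;
}

The payoff is purely at compile time: the cast-free calls let the compiler verify that read-only name buffers are never modified.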
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 82b9c4ba9ed0..11c2b4080f65 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -11,7 +11,7 @@
 
 static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct quad_buffer_head qbh0;
 	struct buffer_head *bh;
@@ -24,7 +24,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	int r;
 	struct hpfs_dirent dee;
 	int err;
-	if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err;
+	if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
 	lock_kernel();
 	err = -ENOSPC;
 	fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
@@ -62,7 +62,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	result->i_mode &= ~0222;
 
 	mutex_lock(&hpfs_i(dir)->i_mutex);
-	r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0);
+	r = hpfs_add_dirent(dir, name, len, &dee, 0);
 	if (r == 1)
 		goto bail3;
 	if (r == -1) {
@@ -121,7 +121,7 @@ bail:
 
 static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct inode *result = NULL;
 	struct buffer_head *bh;
@@ -130,7 +130,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
 	int r;
 	struct hpfs_dirent dee;
 	int err;
-	if ((err = hpfs_chk_name((char *)name, &len)))
+	if ((err = hpfs_chk_name(name, &len)))
 		return err==-ENOENT ? -EINVAL : err;
 	lock_kernel();
 	err = -ENOSPC;
@@ -155,7 +155,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
 	result->i_op = &hpfs_file_iops;
 	result->i_fop = &hpfs_file_ops;
 	result->i_nlink = 1;
-	hpfs_decide_conv(result, (char *)name, len);
+	hpfs_decide_conv(result, name, len);
 	hpfs_i(result)->i_parent_dir = dir->i_ino;
 	result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, dee.creation_date);
 	result->i_ctime.tv_nsec = 0;
@@ -170,7 +170,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
 	hpfs_i(result)->mmu_private = 0;
 
 	mutex_lock(&hpfs_i(dir)->i_mutex);
-	r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0);
+	r = hpfs_add_dirent(dir, name, len, &dee, 0);
 	if (r == 1)
 		goto bail2;
 	if (r == -1) {
@@ -211,7 +211,7 @@ bail:
 
 static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct buffer_head *bh;
 	struct fnode *fnode;
@@ -220,7 +220,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
 	struct hpfs_dirent dee;
 	struct inode *result = NULL;
 	int err;
-	if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err;
+	if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
 	if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM;
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
@@ -256,7 +256,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
 	init_special_inode(result, mode, rdev);
 
 	mutex_lock(&hpfs_i(dir)->i_mutex);
-	r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0);
+	r = hpfs_add_dirent(dir, name, len, &dee, 0);
 	if (r == 1)
 		goto bail2;
 	if (r == -1) {
@@ -289,7 +289,7 @@ bail:
 
 static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *symlink)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct buffer_head *bh;
 	struct fnode *fnode;
@@ -298,7 +298,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
 	struct hpfs_dirent dee;
 	struct inode *result;
 	int err;
-	if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err;
+	if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
 	lock_kernel();
 	if (hpfs_sb(dir->i_sb)->sb_eas < 2) {
 		unlock_kernel();
@@ -335,7 +335,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
 	result->i_data.a_ops = &hpfs_symlink_aops;
 
 	mutex_lock(&hpfs_i(dir)->i_mutex);
-	r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0);
+	r = hpfs_add_dirent(dir, name, len, &dee, 0);
 	if (r == 1)
 		goto bail2;
 	if (r == -1) {
@@ -345,7 +345,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
 	fnode->len = len;
 	memcpy(fnode->name, name, len > 15 ? 15 : len);
 	fnode->up = dir->i_ino;
-	hpfs_set_ea(result, fnode, "SYMLINK", (char *)symlink, strlen(symlink));
+	hpfs_set_ea(result, fnode, "SYMLINK", symlink, strlen(symlink));
 	mark_buffer_dirty(bh);
 	brelse(bh);
 
@@ -369,7 +369,7 @@ bail:
 
 static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct quad_buffer_head qbh;
 	struct hpfs_dirent *de;
@@ -381,12 +381,12 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
 	int err;
 
 	lock_kernel();
-	hpfs_adjust_length((char *)name, &len);
+	hpfs_adjust_length(name, &len);
 again:
 	mutex_lock(&hpfs_i(inode)->i_parent_mutex);
 	mutex_lock(&hpfs_i(dir)->i_mutex);
 	err = -ENOENT;
-	de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *)name, len, &dno, &qbh);
+	de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
 	if (!de)
 		goto out;
 
@@ -413,22 +413,25 @@ again:
 
 	mutex_unlock(&hpfs_i(dir)->i_mutex);
 	mutex_unlock(&hpfs_i(inode)->i_parent_mutex);
-	d_drop(dentry);
-	spin_lock(&dentry->d_lock);
-	if (atomic_read(&dentry->d_count) > 1 ||
-	    generic_permission(inode, MAY_WRITE, NULL) ||
+	dentry_unhash(dentry);
+	if (!d_unhashed(dentry)) {
+		dput(dentry);
+		unlock_kernel();
+		return -ENOSPC;
+	}
+	if (generic_permission(inode, MAY_WRITE, NULL) ||
 	    !S_ISREG(inode->i_mode) ||
 	    get_write_access(inode)) {
-		spin_unlock(&dentry->d_lock);
 		d_rehash(dentry);
+		dput(dentry);
 	} else {
 		struct iattr newattrs;
-		spin_unlock(&dentry->d_lock);
 		/*printk("HPFS: truncating file before delete.\n");*/
 		newattrs.ia_size = 0;
 		newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
 		err = notify_change(dentry, &newattrs);
 		put_write_access(inode);
+		dput(dentry);
 		if (!err)
 			goto again;
 	}
@@ -451,7 +454,7 @@ out:
 
 static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct quad_buffer_head qbh;
 	struct hpfs_dirent *de;
@@ -462,12 +465,12 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
 	int err;
 	int r;
 
-	hpfs_adjust_length((char *)name, &len);
+	hpfs_adjust_length(name, &len);
 	lock_kernel();
 	mutex_lock(&hpfs_i(inode)->i_parent_mutex);
 	mutex_lock(&hpfs_i(dir)->i_mutex);
 	err = -ENOENT;
-	de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *)name, len, &dno, &qbh);
+	de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
 	if (!de)
 		goto out;
 
@@ -546,10 +549,10 @@ const struct address_space_operations hpfs_symlink_aops = {
 static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		       struct inode *new_dir, struct dentry *new_dentry)
 {
-	char *old_name = (char *)old_dentry->d_name.name;
-	int old_len = old_dentry->d_name.len;
-	char *new_name = (char *)new_dentry->d_name.name;
-	int new_len = new_dentry->d_name.len;
+	const unsigned char *old_name = old_dentry->d_name.name;
+	unsigned old_len = old_dentry->d_name.len;
+	const unsigned char *new_name = new_dentry->d_name.name;
+	unsigned new_len = new_dentry->d_name.len;
 	struct inode *i = old_dentry->d_inode;
 	struct inode *new_inode = new_dentry->d_inode;
 	struct quad_buffer_head qbh, qbh1;
@@ -560,9 +563,9 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct buffer_head *bh;
 	struct fnode *fnode;
 	int err;
-	if ((err = hpfs_chk_name((char *)new_name, &new_len))) return err;
+	if ((err = hpfs_chk_name(new_name, &new_len))) return err;
 	err = 0;
-	hpfs_adjust_length((char *)old_name, &old_len);
+	hpfs_adjust_length(old_name, &old_len);
 
 	lock_kernel();
 	/* order doesn't matter, due to VFS exclusion */
@@ -579,7 +582,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto end1;
 	}
 
-	if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, (char *)old_name, old_len, &dno, &qbh))) {
+	if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) {
 		hpfs_error(i->i_sb, "lookup succeeded but map dirent failed");
 		err = -ENOENT;
 		goto end1;
@@ -590,7 +593,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (new_inode) {
 		int r;
 		if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 1)) != 2) {
-			if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, (char *)new_name, new_len, NULL, &qbh1))) {
+			if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, new_name, new_len, NULL, &qbh1))) {
 				clear_nlink(new_inode);
 				copy_de(nde, &de);
 				memcpy(nde->name, new_name, new_len);
@@ -618,7 +621,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	}
 
 	if (new_dir == old_dir)
-		if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, (char *)old_name, old_len, &dno, &qbh))) {
+		if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) {
 			hpfs_unlock_creation(i->i_sb);
 			hpfs_error(i->i_sb, "lookup succeeded but map dirent failed at #2");
 			err = -ENOENT;
@@ -648,7 +651,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		brelse(bh);
 	}
 	hpfs_i(i)->i_conv = hpfs_sb(i->i_sb)->sb_conv;
-	hpfs_decide_conv(i, (char *)new_name, new_len);
+	hpfs_decide_conv(i, new_name, new_len);
 end1:
 	if (old_dir != new_dir)
 		mutex_unlock(&hpfs_i(new_dir)->i_mutex);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index f2feaa06bf26..aa53842c599c 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -14,6 +14,8 @@
 #include <linux/magic.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
+#include <linux/bitmap.h>
+#include <linux/slab.h>
 
 /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
 
@@ -115,15 +117,13 @@ static void hpfs_put_super(struct super_block *s)
 unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
 {
 	struct quad_buffer_head qbh;
-	unsigned *bits;
-	unsigned i, count;
-	if (!(bits = hpfs_map_4sectors(s, secno, &qbh, 4))) return 0;
-	count = 0;
-	for (i = 0; i < 2048 / sizeof(unsigned); i++) {
-		unsigned b;
-		if (!bits[i]) continue;
-		for (b = bits[i]; b; b>>=1) count += b & 1;
-	}
+	unsigned long *bits;
+	unsigned count;
+
+	bits = hpfs_map_4sectors(s, secno, &qbh, 4);
+	if (!bits)
+		return 0;
+	count = bitmap_weight(bits, 2048 * BITS_PER_BYTE);
 	hpfs_brelse4(&qbh);
 	return count;
 }
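The hpfs_count_one_bitmap() rewrite above swaps an open-coded shift-and-count loop for bitmap_weight(), which counts the set bits in the first nbits of an unsigned long array. A rough userspace equivalent, counting whole words only (the kernel helper also masks a trailing partial word) and assuming the GCC/Clang __builtin_popcountl builtin:

#include <stdio.h>

static unsigned weight(const unsigned long *bits, unsigned nwords)
{
	unsigned i, count = 0;

	for (i = 0; i < nwords; i++)
		count += __builtin_popcountl(bits[i]);	/* set bits per word */
	return count;
}

int main(void)
{
	unsigned long map[4] = { 0xffUL, 0x1UL, 0, ~0UL };

	/* prints 9 set bits plus one full word's worth (73 on LP64) */
	printf("%u bits set\n", weight(map, 4));
	return 0;
}

Besides being shorter, the library routine typically compiles down to hardware popcount instructions, which the bit-by-bit loop it replaces could not.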
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index a5089a6dd67a..2e4dfa8593da 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -646,22 +646,27 @@ static const struct super_operations hppfs_sbops = {
 static int hppfs_readlink(struct dentry *dentry, char __user *buffer,
 			  int buflen)
 {
-	struct dentry *proc_dentry;
-
-	proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
+	struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
 	return proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer,
 						    buflen);
 }
 
 static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
-	struct dentry *proc_dentry;
-
-	proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
+	struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
 
 	return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd);
 }
 
+static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd,
+			   void *cookie)
+{
+	struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
+
+	if (proc_dentry->d_inode->i_op->put_link)
+		proc_dentry->d_inode->i_op->put_link(proc_dentry, nd, cookie);
+}
+
 static const struct inode_operations hppfs_dir_iops = {
 	.lookup		= hppfs_lookup,
 };
@@ -669,6 +674,7 @@ static const struct inode_operations hppfs_dir_iops = {
 static const struct inode_operations hppfs_link_iops = {
 	.readlink	= hppfs_readlink,
 	.follow_link	= hppfs_follow_link,
+	.put_link	= hppfs_put_link,
 };
 
674static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) 680static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
@@ -712,7 +718,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
 	struct vfsmount *proc_mnt;
 	int err = -ENOENT;
 
-	proc_mnt = do_kern_mount("proc", 0, "proc", NULL);
+	proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt);
 	if (IS_ERR(proc_mnt))
 		goto out;
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 87a1258953b8..a0bbd3d1b41a 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -30,7 +30,6 @@
 #include <linux/dnotify.h>
 #include <linux/statfs.h>
 #include <linux/security.h>
-#include <linux/ima.h>
 #include <linux/magic.h>
 
 #include <asm/uaccess.h>
@@ -922,7 +921,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
 	int error = -ENOMEM;
 	struct file *file;
 	struct inode *inode;
-	struct dentry *dentry, *root;
+	struct path path;
+	struct dentry *root;
 	struct qstr quick_string;
 
 	*user = NULL;
@@ -944,10 +944,11 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
 	quick_string.name = name;
 	quick_string.len = strlen(quick_string.name);
 	quick_string.hash = 0;
-	dentry = d_alloc(root, &quick_string);
-	if (!dentry)
+	path.dentry = d_alloc(root, &quick_string);
+	if (!path.dentry)
 		goto out_shm_unlock;
 
+	path.mnt = mntget(hugetlbfs_vfsmount);
 	error = -ENOSPC;
 	inode = hugetlbfs_get_inode(root->d_sb, current_fsuid(),
 				current_fsgid(), S_IFREG | S_IRWXUGO, 0);
@@ -960,24 +961,22 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
 			acctflag))
 		goto out_inode;
 
-	d_instantiate(dentry, inode);
+	d_instantiate(path.dentry, inode);
 	inode->i_size = size;
 	inode->i_nlink = 0;
 
 	error = -ENFILE;
-	file = alloc_file(hugetlbfs_vfsmount, dentry,
-			FMODE_WRITE | FMODE_READ,
+	file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
 			&hugetlbfs_file_operations);
 	if (!file)
 		goto out_dentry; /* inode is already attached */
-	ima_counts_get(file);
 
 	return file;
 
 out_inode:
 	iput(inode);
 out_dentry:
-	dput(dentry);
+	path_put(&path);
 out_shm_unlock:
 	if (*user) {
 		user_shm_unlock(size, *user);
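The hugetlbfs change above moves from passing a bare dentry plus a global vfsmount to passing a struct path, which bundles the vfsmount and dentry references so a single path_put() on the error path drops both. A toy userspace analogue of that pairing; all types and helpers here are invented for illustration, not kernel API:

#include <stdio.h>

struct mnt  { int refs; };
struct dent { int refs; };
struct path { struct mnt *mnt; struct dent *dentry; };

static void path_put(struct path *p)
{
	p->mnt->refs--;		/* stands in for mntput() */
	p->dentry->refs--;	/* stands in for dput() */
}

int main(void)
{
	struct mnt  m = { .refs = 1 };
	struct dent d = { .refs = 1 };
	struct path p = { .mnt = &m, .dentry = &d };

	path_put(&p);	/* one call releases both halves */
	printf("mnt=%d dentry=%d\n", m.refs, d.refs);
	return 0;
}

Keeping the two refcounted halves in one aggregate removes a whole class of "dropped the dentry but leaked the mount" bugs on error paths like out_dentry above.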
diff --git a/fs/inode.c b/fs/inode.c
index 4d8e3be55976..407bf392e20a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -8,7 +8,6 @@
 #include <linux/mm.h>
 #include <linux/dcache.h>
 #include <linux/init.h>
-#include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/writeback.h>
 #include <linux/module.h>
@@ -18,7 +17,6 @@
 #include <linux/hash.h>
 #include <linux/swap.h>
 #include <linux/security.h>
-#include <linux/ima.h>
 #include <linux/pagemap.h>
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
@@ -114,7 +112,7 @@ static void wake_up_inode(struct inode *inode)
 	 * Prevent speculative execution through spin_unlock(&inode_lock);
 	 */
 	smp_mb();
-	wake_up_bit(&inode->i_state, __I_LOCK);
+	wake_up_bit(&inode->i_state, __I_NEW);
 }
 
120/** 118/**
@@ -157,11 +155,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 
 	if (security_inode_alloc(inode))
 		goto out;
-
-	/* allocate and initialize an i_integrity */
-	if (ima_inode_alloc(inode))
-		goto out_free_security;
-
 	spin_lock_init(&inode->i_lock);
 	lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
 
@@ -201,9 +194,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 #endif
 
 	return 0;
-
-out_free_security:
-	security_inode_free(inode);
 out:
 	return -ENOMEM;
 }
@@ -235,7 +225,6 @@ static struct inode *alloc_inode(struct super_block *sb)
 void __destroy_inode(struct inode *inode)
 {
 	BUG_ON(inode_has_buffers(inode));
-	ima_inode_free(inode);
 	security_inode_free(inode);
 	fsnotify_inode_delete(inode);
 #ifdef CONFIG_FS_POSIX_ACL
@@ -324,7 +313,6 @@ void clear_inode(struct inode *inode)
 	BUG_ON(!(inode->i_state & I_FREEING));
 	BUG_ON(inode->i_state & I_CLEAR);
 	inode_sync_wait(inode);
-	vfs_dq_drop(inode);
 	if (inode->i_sb->s_op->clear_inode)
 		inode->i_sb->s_op->clear_inode(inode);
 	if (S_ISBLK(inode->i_mode) && inode->i_bdev)
@@ -700,17 +688,17 @@ void unlock_new_inode(struct inode *inode)
 	}
 #endif
 	/*
-	 * This is special! We do not need the spinlock when clearing I_LOCK,
+	 * This is special! We do not need the spinlock when clearing I_NEW,
 	 * because we're guaranteed that nobody else tries to do anything about
 	 * the state of the inode when it is locked, as we just created it (so
-	 * there can be no old holders that haven't tested I_LOCK).
+	 * there can be no old holders that haven't tested I_NEW).
 	 * However we must emit the memory barrier so that other CPUs reliably
-	 * see the clearing of I_LOCK after the other inode initialisation has
+	 * see the clearing of I_NEW after the other inode initialisation has
 	 * completed.
 	 */
 	smp_mb();
-	WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW));
-	inode->i_state &= ~(I_LOCK|I_NEW);
+	WARN_ON(!(inode->i_state & I_NEW));
+	inode->i_state &= ~I_NEW;
 	wake_up_inode(inode);
 }
 EXPORT_SYMBOL(unlock_new_inode);
@@ -741,7 +729,7 @@ static struct inode *get_new_inode(struct super_block *sb,
 		goto set_failed;
 
 	__inode_add_to_lists(sb, head, inode);
-	inode->i_state = I_LOCK|I_NEW;
+	inode->i_state = I_NEW;
 	spin_unlock(&inode_lock);
 
 	/* Return the locked inode with I_NEW set, the
@@ -788,7 +776,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
 	if (!old) {
 		inode->i_ino = ino;
 		__inode_add_to_lists(sb, head, inode);
-		inode->i_state = I_LOCK|I_NEW;
+		inode->i_state = I_NEW;
 		spin_unlock(&inode_lock);
 
 		/* Return the locked inode with I_NEW set, the
@@ -1093,7 +1081,7 @@ int insert_inode_locked(struct inode *inode)
 	ino_t ino = inode->i_ino;
 	struct hlist_head *head = inode_hashtable + hash(sb, ino);
 
-	inode->i_state |= I_LOCK|I_NEW;
+	inode->i_state |= I_NEW;
 	while (1) {
 		struct hlist_node *node;
 		struct inode *old = NULL;
@@ -1130,7 +1118,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 	struct super_block *sb = inode->i_sb;
 	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
 
-	inode->i_state |= I_LOCK|I_NEW;
+	inode->i_state |= I_NEW;
 
 	while (1) {
 		struct hlist_node *node;
@@ -1221,8 +1209,6 @@ void generic_delete_inode(struct inode *inode)
 
 	if (op->delete_inode) {
 		void (*delete)(struct inode *) = op->delete_inode;
-		if (!is_bad_inode(inode))
-			vfs_dq_init(inode);
 		/* Filesystems implementing their own
 		 * s_op->delete_inode are required to call
 		 * truncate_inode_pages and clear_inode()
@@ -1520,7 +1506,7 @@ EXPORT_SYMBOL(inode_wait);
  * until the deletion _might_ have completed. Callers are responsible
  * to recheck inode state.
  *
- * It doesn't matter if I_LOCK is not set initially, a call to
+ * It doesn't matter if I_NEW is not set initially, a call to
  * wake_up_inode() after removing from the hash list will DTRT.
  *
  * This is called with inode_lock held.
@@ -1528,8 +1514,8 @@ EXPORT_SYMBOL(inode_wait);
 static void __wait_on_freeing_inode(struct inode *inode)
 {
 	wait_queue_head_t *wq;
-	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_LOCK);
-	wq = bit_waitqueue(&inode->i_state, __I_LOCK);
+	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
+	wq = bit_waitqueue(&inode->i_state, __I_NEW);
 	prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
 	spin_unlock(&inode_lock);
 	schedule();
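The fs/inode.c hunks above fold the I_LOCK bit into I_NEW; the pattern they preserve is the classic publish protocol spelled out in the unlock_new_inode() comment: finish initialising the object, then clear the flag behind a memory barrier so any waiter that observes the flag cleared also observes the initialised fields. A minimal userspace sketch of that idea using C11 atomics (build with -pthread); the names and single flag bit are illustrative, not kernel API:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define OBJ_NEW 0x1u

struct obj {
	int payload;
	atomic_uint state;
};

static struct obj o = { .state = OBJ_NEW };

static void *waiter(void *arg)
{
	/* spin until NEW clears; acquire pairs with the release below */
	while (atomic_load_explicit(&o.state, memory_order_acquire) & OBJ_NEW)
		;
	printf("payload = %d\n", o.payload);	/* guaranteed to print 42 */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	o.payload = 42;					/* initialise first... */
	atomic_fetch_and_explicit(&o.state, ~OBJ_NEW,
				  memory_order_release);	/* ...then publish */
	pthread_join(t, NULL);
	return 0;
}

The kernel version additionally parks waiters on a per-bit waitqueue instead of spinning, which is why wake_up_bit()/bit_waitqueue() had to be retargeted from __I_LOCK to __I_NEW in the same patch.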
diff --git a/fs/internal.h b/fs/internal.h
index 515175b8b72e..8a03a5447bdf 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -70,6 +70,8 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
 
 extern void __init mnt_init(void);
 
+extern spinlock_t vfsmount_lock;
+
 /*
  * fs_struct.c
  */
@@ -79,8 +81,16 @@ extern void chroot_fs_refs(struct path *, struct path *);
  * file_table.c
  */
 extern void mark_files_ro(struct super_block *);
+extern struct file *get_empty_filp(void);
 
 /*
  * super.c
  */
 extern int do_remount_sb(struct super_block *, int, void *, int);
+
+/*
+ * open.c
+ */
+struct nameidata;
+extern struct file *nameidata_to_filp(struct nameidata *);
+extern void release_open_intent(struct nameidata *);
diff --git a/fs/ioprio.c b/fs/ioprio.c
index c7c0b28d7d21..748cfb92dcc6 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -19,6 +19,7 @@
  * See also Documentation/block/ioprio.txt
  *
  */
+#include <linux/gfp.h>
 #include <linux/kernel.h>
 #include <linux/ioprio.h>
 #include <linux/blkdev.h>
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index defb932eee9a..0b3fa7974fa8 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -36,286 +36,323 @@ static void *zisofs_zlib_workspace;
 static DEFINE_MUTEX(zisofs_zlib_lock);
 
 /*
- * When decompressing, we typically obtain more than one page
- * per reference. We inject the additional pages into the page
- * cache as a form of readahead.
+ * Read data of @inode from @block_start to @block_end and uncompress
+ * to one zisofs block. Store the data in the @pages array with @pcount
+ * entries. Start storing at offset @poffset of the first page.
  */
-static int zisofs_readpage(struct file *file, struct page *page)
+static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
+				      loff_t block_end, int pcount,
+				      struct page **pages, unsigned poffset,
+				      int *errp)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
-	struct address_space *mapping = inode->i_mapping;
-	unsigned int maxpage, xpage, fpage, blockindex;
-	unsigned long offset;
-	unsigned long blockptr, blockendptr, cstart, cend, csize;
-	struct buffer_head *bh, *ptrbh[2];
-	unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
-	unsigned int bufshift = ISOFS_BUFFER_BITS(inode);
-	unsigned long bufmask = bufsize - 1;
-	int err = -EIO;
-	int i;
-	unsigned int header_size = ISOFS_I(inode)->i_format_parm[0];
 	unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1];
-	/* unsigned long zisofs_block_size = 1UL << zisofs_block_shift; */
-	unsigned int zisofs_block_page_shift = zisofs_block_shift-PAGE_CACHE_SHIFT;
-	unsigned long zisofs_block_pages = 1UL << zisofs_block_page_shift;
-	unsigned long zisofs_block_page_mask = zisofs_block_pages-1;
-	struct page *pages[zisofs_block_pages];
-	unsigned long index = page->index;
-	int indexblocks;
-
-	/* We have already been given one page, this is the one
-	   we must do. */
-	xpage = index & zisofs_block_page_mask;
-	pages[xpage] = page;
-
-	/* The remaining pages need to be allocated and inserted */
-	offset = index & ~zisofs_block_page_mask;
-	blockindex = offset >> zisofs_block_page_shift;
-	maxpage = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
-	/*
-	 * If this page is wholly outside i_size we just return zero;
-	 * do_generic_file_read() will handle this for us
-	 */
-	if (page->index >= maxpage) {
-		SetPageUptodate(page);
-		unlock_page(page);
+	unsigned int bufsize = ISOFS_BUFFER_SIZE(inode);
+	unsigned int bufshift = ISOFS_BUFFER_BITS(inode);
+	unsigned int bufmask = bufsize - 1;
+	int i, block_size = block_end - block_start;
+	z_stream stream = { .total_out = 0,
+			    .avail_in = 0,
+			    .avail_out = 0, };
+	int zerr;
+	int needblocks = (block_size + (block_start & bufmask) + bufmask)
+				>> bufshift;
+	int haveblocks;
+	blkcnt_t blocknum;
+	struct buffer_head *bhs[needblocks + 1];
+	int curbh, curpage;
+
+	if (block_size > deflateBound(1UL << zisofs_block_shift)) {
+		*errp = -EIO;
 		return 0;
 	}
-
-	maxpage = min(zisofs_block_pages, maxpage-offset);
-
-	for ( i = 0 ; i < maxpage ; i++, offset++ ) {
-		if ( i != xpage ) {
-			pages[i] = grab_cache_page_nowait(mapping, offset);
-		}
-		page = pages[i];
-		if ( page ) {
-			ClearPageError(page);
-			kmap(page);
+	/* Empty block? */
+	if (block_size == 0) {
+		for ( i = 0 ; i < pcount ; i++ ) {
+			if (!pages[i])
+				continue;
+			memset(page_address(pages[i]), 0, PAGE_CACHE_SIZE);
+			flush_dcache_page(pages[i]);
+			SetPageUptodate(pages[i]);
 		}
+		return ((loff_t)pcount) << PAGE_CACHE_SHIFT;
 	}
 
-	/* This is the last page filled, plus one; used in case of abort. */
-	fpage = 0;
+	/* Because zlib is not thread-safe, do all the I/O at the top. */
+	blocknum = block_start >> bufshift;
+	memset(bhs, 0, (needblocks + 1) * sizeof(struct buffer_head *));
+	haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks);
+	ll_rw_block(READ, haveblocks, bhs);
 
-	/* Find the pointer to this specific chunk */
-	/* Note: we're not using isonum_731() here because the data is known aligned */
-	/* Note: header_size is in 32-bit words (4 bytes) */
-	blockptr = (header_size + blockindex) << 2;
-	blockendptr = blockptr + 4;
+	curbh = 0;
+	curpage = 0;
+	/*
+	 * First block is special since it may be fractional. We also wait for
+	 * it before grabbing the zlib mutex; odds are that the subsequent
+	 * blocks are going to come in in short order so we don't hold the zlib
+	 * mutex longer than necessary.
+	 */
 
-	indexblocks = ((blockptr^blockendptr) >> bufshift) ? 2 : 1;
-	ptrbh[0] = ptrbh[1] = NULL;
+	if (!bhs[0])
+		goto b_eio;
 
-	if ( isofs_get_blocks(inode, blockptr >> bufshift, ptrbh, indexblocks) != indexblocks ) {
-		if ( ptrbh[0] ) brelse(ptrbh[0]);
-		printk(KERN_DEBUG "zisofs: Null buffer on reading block table, inode = %lu, block = %lu\n",
-		       inode->i_ino, blockptr >> bufshift);
-		goto eio;
-	}
-	ll_rw_block(READ, indexblocks, ptrbh);
-
-	bh = ptrbh[0];
-	if ( !bh || (wait_on_buffer(bh), !buffer_uptodate(bh)) ) {
-		printk(KERN_DEBUG "zisofs: Failed to read block table, inode = %lu, block = %lu\n",
-		       inode->i_ino, blockptr >> bufshift);
-		if ( ptrbh[1] )
-			brelse(ptrbh[1]);
-		goto eio;
-	}
-	cstart = le32_to_cpu(*(__le32 *)(bh->b_data + (blockptr & bufmask)));
-
-	if ( indexblocks == 2 ) {
-		/* We just crossed a block boundary. Switch to the next block */
-		brelse(bh);
-		bh = ptrbh[1];
-		if ( !bh || (wait_on_buffer(bh), !buffer_uptodate(bh)) ) {
-			printk(KERN_DEBUG "zisofs: Failed to read block table, inode = %lu, block = %lu\n",
-			       inode->i_ino, blockendptr >> bufshift);
-			goto eio;
-		}
+	wait_on_buffer(bhs[0]);
+	if (!buffer_uptodate(bhs[0])) {
+		*errp = -EIO;
+		goto b_eio;
 	}
-	cend = le32_to_cpu(*(__le32 *)(bh->b_data + (blockendptr & bufmask)));
-	brelse(bh);
 
-	if (cstart > cend)
-		goto eio;
+	stream.workspace = zisofs_zlib_workspace;
+	mutex_lock(&zisofs_zlib_lock);
 
-	csize = cend-cstart;
-
-	if (csize > deflateBound(1UL << zisofs_block_shift))
-		goto eio;
-
-	/* Now page[] contains an array of pages, any of which can be NULL,
-	   and the locks on which we hold.  We should now read the data and
-	   release the pages.  If the pages are NULL the decompressed data
-	   for that particular page should be discarded. */
-
-	if ( csize == 0 ) {
-		/* This data block is empty. */
-
-		for ( fpage = 0 ; fpage < maxpage ; fpage++ ) {
-			if ( (page = pages[fpage]) != NULL ) {
-				memset(page_address(page), 0, PAGE_CACHE_SIZE);
-
-				flush_dcache_page(page);
-				SetPageUptodate(page);
-				kunmap(page);
-				unlock_page(page);
-				if ( fpage == xpage )
-					err = 0; /* The critical page */
-				else
-					page_cache_release(page);
+	zerr = zlib_inflateInit(&stream);
+	if (zerr != Z_OK) {
+		if (zerr == Z_MEM_ERROR)
+			*errp = -ENOMEM;
+		else
+			*errp = -EIO;
+		printk(KERN_DEBUG "zisofs: zisofs_inflateInit returned %d\n",
+		       zerr);
+		goto z_eio;
+	}
+
+	while (curpage < pcount && curbh < haveblocks &&
+	       zerr != Z_STREAM_END) {
+		if (!stream.avail_out) {
+			if (pages[curpage]) {
+				stream.next_out = page_address(pages[curpage])
+						+ poffset;
+				stream.avail_out = PAGE_CACHE_SIZE - poffset;
+				poffset = 0;
+			} else {
+				stream.next_out = (void *)&zisofs_sink_page;
+				stream.avail_out = PAGE_CACHE_SIZE;
 			}
 		}
-	} else {
-		/* This data block is compressed. */
-		z_stream stream;
-		int bail = 0, left_out = -1;
-		int zerr;
-		int needblocks = (csize + (cstart & bufmask) + bufmask) >> bufshift;
-		int haveblocks;
-		struct buffer_head *bhs[needblocks+1];
-		struct buffer_head **bhptr;
-
-		/* Because zlib is not thread-safe, do all the I/O at the top. */
-
-		blockptr = cstart >> bufshift;
-		memset(bhs, 0, (needblocks+1)*sizeof(struct buffer_head *));
-		haveblocks = isofs_get_blocks(inode, blockptr, bhs, needblocks);
-		ll_rw_block(READ, haveblocks, bhs);
-
-		bhptr = &bhs[0];
-		bh = *bhptr++;
-
-		/* First block is special since it may be fractional.
-		   We also wait for it before grabbing the zlib
-		   mutex; odds are that the subsequent blocks are
-		   going to come in in short order so we don't hold
-		   the zlib mutex longer than necessary. */
-
-		if ( !bh || (wait_on_buffer(bh), !buffer_uptodate(bh)) ) {
-			printk(KERN_DEBUG "zisofs: Hit null buffer, fpage = %d, xpage = %d, csize = %ld\n",
-			       fpage, xpage, csize);
-			goto b_eio;
-		}
-		stream.next_in = bh->b_data + (cstart & bufmask);
-		stream.avail_in = min(bufsize-(cstart & bufmask), csize);
-		csize -= stream.avail_in;
-
-		stream.workspace = zisofs_zlib_workspace;
-		mutex_lock(&zisofs_zlib_lock);
-
-		zerr = zlib_inflateInit(&stream);
-		if ( zerr != Z_OK ) {
-			if ( err && zerr == Z_MEM_ERROR )
-				err = -ENOMEM;
-			printk(KERN_DEBUG "zisofs: zisofs_inflateInit returned %d\n",
-			       zerr);
-			goto z_eio;
+		if (!stream.avail_in) {
+			wait_on_buffer(bhs[curbh]);
+			if (!buffer_uptodate(bhs[curbh])) {
+				*errp = -EIO;
+				break;
+			}
+			stream.next_in = bhs[curbh]->b_data +
+						(block_start & bufmask);
+			stream.avail_in = min_t(unsigned, bufsize -
+						(block_start & bufmask),
+						block_size);
+			block_size -= stream.avail_in;
+			block_start = 0;
 		}
 
-		while ( !bail && fpage < maxpage ) {
-			page = pages[fpage];
-			if ( page )
-				stream.next_out = page_address(page);
-			else
-				stream.next_out = (void *)&zisofs_sink_page;
-			stream.avail_out = PAGE_CACHE_SIZE;
-
-			while ( stream.avail_out ) {
-				int ao, ai;
-				if ( stream.avail_in == 0 && left_out ) {
-					if ( !csize ) {
-						printk(KERN_WARNING "zisofs: ZF read beyond end of input\n");
-						bail = 1;
-						break;
-					} else {
-						bh = *bhptr++;
-						if ( !bh ||
-						     (wait_on_buffer(bh), !buffer_uptodate(bh)) ) {
-							/* Reached an EIO */
-							printk(KERN_DEBUG "zisofs: Hit null buffer, fpage = %d, xpage = %d, csize = %ld\n",
-							       fpage, xpage, csize);
-
-							bail = 1;
-							break;
-						}
-						stream.next_in = bh->b_data;
-						stream.avail_in = min(csize,bufsize);
-						csize -= stream.avail_in;
-					}
-				}
-				ao = stream.avail_out;  ai = stream.avail_in;
-				zerr = zlib_inflate(&stream, Z_SYNC_FLUSH);
-				left_out = stream.avail_out;
-				if ( zerr == Z_BUF_ERROR && stream.avail_in == 0 )
-					continue;
-				if ( zerr != Z_OK ) {
-					/* EOF, error, or trying to read beyond end of input */
-					if ( err && zerr == Z_MEM_ERROR )
-						err = -ENOMEM;
-					if ( zerr != Z_STREAM_END )
-						printk(KERN_DEBUG "zisofs: zisofs_inflate returned %d, inode = %lu, index = %lu, fpage = %d, xpage = %d, avail_in = %d, avail_out = %d, ai = %d, ao = %d\n",
-						       zerr, inode->i_ino, index,
-						       fpage, xpage,
-						       stream.avail_in, stream.avail_out,
-						       ai, ao);
-					bail = 1;
-					break;
+		while (stream.avail_out && stream.avail_in) {
+			zerr = zlib_inflate(&stream, Z_SYNC_FLUSH);
+			if (zerr == Z_BUF_ERROR && stream.avail_in == 0)
+				break;
+			if (zerr == Z_STREAM_END)
+				break;
+			if (zerr != Z_OK) {
+				/* EOF, error, or trying to read beyond end of input */
+				if (zerr == Z_MEM_ERROR)
+					*errp = -ENOMEM;
+				else {
+					printk(KERN_DEBUG
+					       "zisofs: zisofs_inflate returned"
+					       " %d, inode = %lu,"
+					       " page idx = %d, bh idx = %d,"
+					       " avail_in = %d,"
+					       " avail_out = %d\n",
+					       zerr, inode->i_ino, curpage,
+					       curbh, stream.avail_in,
+					       stream.avail_out);
+					*errp = -EIO;
 				}
+				goto inflate_out;
 			}
+		}
 
-			if ( stream.avail_out && zerr == Z_STREAM_END ) {
-				/* Fractional page written before EOF.  This may
-				   be the last page in the file. */
-				memset(stream.next_out, 0, stream.avail_out);
-				stream.avail_out = 0;
+		if (!stream.avail_out) {
+			/* This page completed */
+			if (pages[curpage]) {
+				flush_dcache_page(pages[curpage]);
+				SetPageUptodate(pages[curpage]);
 			}
+			curpage++;
+		}
+		if (!stream.avail_in)
+			curbh++;
+	}
+inflate_out:
+	zlib_inflateEnd(&stream);
 
-			if ( !stream.avail_out ) {
-				/* This page completed */
-				if ( page ) {
-					flush_dcache_page(page);
-					SetPageUptodate(page);
-					kunmap(page);
-					unlock_page(page);
-					if ( fpage == xpage )
-						err = 0; /* The critical page */
-					else
-						page_cache_release(page);
-				}
-				fpage++;
-			}
+z_eio:
+	mutex_unlock(&zisofs_zlib_lock);
+
+b_eio:
+	for (i = 0; i < haveblocks; i++)
+		brelse(bhs[i]);
+	return stream.total_out;
+}
+
+/*
+ * Uncompress data so that pages[full_page] is fully uptodate and possibly
+ * fills in other pages if we have data for them.
+ */
+static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
+			     struct page **pages)
+{
+	loff_t start_off, end_off;
+	loff_t block_start, block_end;
+	unsigned int header_size = ISOFS_I(inode)->i_format_parm[0];
+	unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1];
+	unsigned int blockptr;
+	loff_t poffset = 0;
+	blkcnt_t cstart_block, cend_block;
+	struct buffer_head *bh;
+	unsigned int blkbits = ISOFS_BUFFER_BITS(inode);
+	unsigned int blksize = 1 << blkbits;
+	int err;
+	loff_t ret;
+
+	BUG_ON(!pages[full_page]);
+
+	/*
+	 * We want to read at least 'full_page' page. Because we have to
+	 * uncompress the whole compression block anyway, fill the surrounding
+	 * pages with the data we have anyway...
+	 */
+	start_off = page_offset(pages[full_page]);
+	end_off = min_t(loff_t, start_off + PAGE_CACHE_SIZE, inode->i_size);
+
+	cstart_block = start_off >> zisofs_block_shift;
+	cend_block = (end_off + (1 << zisofs_block_shift) - 1)
+			>> zisofs_block_shift;
+
+	WARN_ON(start_off - (full_page << PAGE_CACHE_SHIFT) !=
+		((cstart_block << zisofs_block_shift) & PAGE_CACHE_MASK));
+
+	/* Find the pointer to this specific chunk */
+	/* Note: we're not using isonum_731() here because the data is known aligned */
+	/* Note: header_size is in 32-bit words (4 bytes) */
+	blockptr = (header_size + cstart_block) << 2;
+	bh = isofs_bread(inode, blockptr >> blkbits);
+	if (!bh)
+		return -EIO;
+	block_start = le32_to_cpu(*(__le32 *)
+				(bh->b_data + (blockptr & (blksize - 1))));
+
+	while (cstart_block < cend_block && pcount > 0) {
+		/* Load end of the compressed block in the file */
+		blockptr += 4;
+		/* Traversed to next block? */
+		if (!(blockptr & (blksize - 1))) {
+			brelse(bh);
+
+			bh = isofs_bread(inode, blockptr >> blkbits);
+			if (!bh)
+				return -EIO;
+		}
+		block_end = le32_to_cpu(*(__le32 *)
+				(bh->b_data + (blockptr & (blksize - 1))));
+		if (block_start > block_end) {
+			brelse(bh);
+			return -EIO;
+		}
+		err = 0;
+		ret = zisofs_uncompress_block(inode, block_start, block_end,
+					      pcount, pages, poffset, &err);
+		poffset += ret;
+		pages += poffset >> PAGE_CACHE_SHIFT;
+		pcount -= poffset >> PAGE_CACHE_SHIFT;
+		full_page -= poffset >> PAGE_CACHE_SHIFT;
+		poffset &= ~PAGE_CACHE_MASK;
+
+		if (err) {
+			brelse(bh);
+			/*
+			 * Did we finish reading the page we really wanted
+			 * to read?
+			 */
+			if (full_page < 0)
+				return 0;
+			return err;
 		}
-		zlib_inflateEnd(&stream);
 
-	z_eio:
-		mutex_unlock(&zisofs_zlib_lock);
+		block_start = block_end;
+		cstart_block++;
+	}
+
+	if (poffset && *pages) {
+		memset(page_address(*pages) + poffset, 0,
+		       PAGE_CACHE_SIZE - poffset);
+		flush_dcache_page(*pages);
+		SetPageUptodate(*pages);
+	}
+	return 0;
+}
 
-	b_eio:
-		for ( i = 0 ; i < haveblocks ; i++ ) {
-			if ( bhs[i] )
-				brelse(bhs[i]);
+/*
+ * When decompressing, we typically obtain more than one page
+ * per reference. We inject the additional pages into the page
+ * cache as a form of readahead.
+ */
+static int zisofs_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+	int err;
+	int i, pcount, full_page;
+	unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1];
+	unsigned int zisofs_pages_per_cblock =
+		PAGE_CACHE_SHIFT <= zisofs_block_shift ?
+		(1 << (zisofs_block_shift - PAGE_CACHE_SHIFT)) : 0;
+	struct page *pages[max_t(unsigned, zisofs_pages_per_cblock, 1)];
+	pgoff_t index = page->index, end_index;
+
+	end_index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	/*
+	 * If this page is wholly outside i_size we just return zero;
+	 * do_generic_file_read() will handle this for us
+	 */
+	if (index >= end_index) {
+		SetPageUptodate(page);
+		unlock_page(page);
+		return 0;
+	}
+
+	if (PAGE_CACHE_SHIFT <= zisofs_block_shift) {
+		/* We have already been given one page, this is the one
+		   we must do. */
+		full_page = index & (zisofs_pages_per_cblock - 1);
+		pcount = min_t(int, zisofs_pages_per_cblock,
+			end_index - (index & ~(zisofs_pages_per_cblock - 1)));
+		index -= full_page;
+	} else {
+		full_page = 0;
+		pcount = 1;
+	}
+	pages[full_page] = page;
+
+	for (i = 0; i < pcount; i++, index++) {
+		if (i != full_page)
+			pages[i] = grab_cache_page_nowait(mapping, index);
+		if (pages[i]) {
+			ClearPageError(pages[i]);
+			kmap(pages[i]);
 		}
 	}
 
-eio:
+	err = zisofs_fill_pages(inode, full_page, pcount, pages);
 
 	/* Release any residual pages, do not SetPageUptodate */
-	while ( fpage < maxpage ) {
-		page = pages[fpage];
-		if ( page ) {
-			flush_dcache_page(page);
-			if ( fpage == xpage )
-				SetPageError(page);
-			kunmap(page);
-			unlock_page(page);
-			if ( fpage != xpage )
-				page_cache_release(page);
+	for (i = 0; i < pcount; i++) {
+		if (pages[i]) {
+			flush_dcache_page(pages[i]);
+			if (i == full_page && err)
+				SetPageError(pages[i]);
+			kunmap(pages[i]);
+			unlock_page(pages[i]);
+			if (i != full_page)
+				page_cache_release(pages[i]);
 		}
-		fpage++;
 	}
 
 	/* At this point, err contains 0 or -EIO depending on the "critical" page */
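The heart of the zisofs rewrite above is the refill loop in zisofs_uncompress_block(): keep calling the decompressor, topping up avail_in from the next input block and avail_out from the next output page, until the stream ends. The same loop shape can be demonstrated against userspace zlib (build with -lz); the 64-byte chunk sizes below are arbitrary stand-ins for buffer_heads and page-cache pages:

#include <stdio.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
	unsigned char raw[256], comp[512], out[256];
	z_stream zs;
	uLongf clen = sizeof(comp);
	size_t in_off = 0, out_off = 0;
	int zerr;

	memset(raw, 'x', sizeof(raw));
	if (compress(comp, &clen, raw, sizeof(raw)) != Z_OK)
		return 1;

	memset(&zs, 0, sizeof(zs));
	if (inflateInit(&zs) != Z_OK)
		return 1;
	do {
		if (zs.avail_in == 0 && in_off < clen) {	/* next "block" */
			zs.next_in = comp + in_off;
			zs.avail_in = (clen - in_off < 64) ? clen - in_off : 64;
			in_off += zs.avail_in;
		}
		if (zs.avail_out == 0 && out_off < sizeof(out)) { /* next "page" */
			zs.next_out = out + out_off;
			zs.avail_out = 64;
			out_off += 64;
		}
		zerr = inflate(&zs, Z_SYNC_FLUSH);
	} while (zerr == Z_OK);
	inflateEnd(&zs);

	printf("%s: %lu bytes out\n",
	       zerr == Z_STREAM_END ? "ok" : "error", zs.total_out);
	return zerr == Z_STREAM_END ? 0 : 1;
}

Because the stream state carries over between calls, input and output chunk boundaries need not line up, which is exactly what lets the kernel version support compression blocks larger than PAGE_CACHE_SIZE.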
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 8ba5441063be..b9ab69b3a482 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -11,6 +11,7 @@
  * isofs directory handling functions
  */
 #include <linux/smp_lock.h>
+#include <linux/gfp.h>
 #include "isofs.h"
 
 int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode)
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index e81a30593ba9..ed752cb38474 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -9,7 +9,7 @@
  *
  * The following files are helpful:
  *
- *	Documentation/filesystems/Exporting
+ *	Documentation/filesystems/nfs/Exporting
  *	fs/exportfs/expfs.c.
  */
 
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index eaa831311c9c..ab438beb867c 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/smp_lock.h>
+#include <linux/gfp.h>
 #include "isofs.h"
 
 /*
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index c2fb2dd0131f..96a685c550fd 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -518,8 +518,7 @@ repeat:
 			if (algo == SIG('p', 'z')) {
 				int block_shift =
 					isonum_711(&rr->u.ZF.parms[1]);
-				if (block_shift < PAGE_CACHE_SHIFT
-				    || block_shift > 17) {
+				if (block_shift > 17) {
 					printk(KERN_WARNING "isofs: "
 						"Can't handle ZF block "
 						"size of 2^%d\n",
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 4bd882548c45..ecb44c94ba8d 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -17,7 +17,6 @@
 #include <linux/fs.h>
 #include <linux/jbd.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/bio.h>
@@ -862,12 +861,12 @@ restart_loop:
 		/* A buffer which has been freed while still being
 		 * journaled by a previous transaction may end up still
 		 * being dirty here, but we want to avoid writing back
-		 * that buffer in the future now that the last use has
-		 * been committed.  That's not only a performance gain,
-		 * it also stops aliasing problems if the buffer is left
-		 * behind for writeback and gets reallocated for another
+		 * that buffer in the future after the "add to orphan"
+		 * operation has been committed.  That's not only a performance
+		 * gain, it also stops aliasing problems if the buffer is
+		 * left behind for writeback and gets reallocated for another
 		 * use in a different page. */
-		if (buffer_freed(bh)) {
+		if (buffer_freed(bh) && !jh->b_next_transaction) {
 			clear_buffer_freed(bh);
 			clear_buffer_jbddirty(bh);
 		}
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 4160afad6d00..bd224eec9b07 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1913,7 +1913,7 @@ static void __init jbd_create_debugfs_entry(void)
 {
 	jbd_debugfs_dir = debugfs_create_dir("jbd", NULL);
 	if (jbd_debugfs_dir)
-		jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO,
+		jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR,
 					      jbd_debugfs_dir,
 					      &journal_enable_debug);
 }
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index cb1a49ae605e..54c9bc9e1b17 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -20,7 +20,6 @@
 #include <linux/fs.h>
 #include <linux/jbd.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #endif
 
 /*
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 006f9ad838a2..5ae71e75a491 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1398,7 +1398,7 @@ int journal_stop(handle_t *handle)
  * the case where our storage is so fast that it is more optimal to go
  * ahead and force a flush and wait for the transaction to be committed
  * than it is to wait for an arbitrary amount of time for new writers to
- * join the transaction. We acheive this by measuring how long it takes
+ * join the transaction. We achieve this by measuring how long it takes
  * to commit a transaction, and compare it with how long this
  * transaction has been running, and if run time < commit time then we
  * sleep for the delta and commit. This greatly helps super fast disks
@@ -1864,6 +1864,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 	if (!jh)
 		goto zap_buffer_no_jh;
 
+	/*
+	 * We cannot remove the buffer from checkpoint lists until the
+	 * transaction adding inode to orphan list (let's call it T)
+	 * is committed.  Otherwise if the transaction changing the
+	 * buffer would be cleaned from the journal before T is
+	 * committed, a crash will cause that the correct contents of
+	 * the buffer will be lost.  On the other hand we have to
+	 * clear the buffer dirty bit at latest at the moment when the
+	 * transaction marking the buffer as freed in the filesystem
+	 * structures is committed because from that moment on the
+	 * buffer can be reallocated and used by a different page.
+	 * Since the block hasn't been freed yet but the inode has
+	 * already been added to orphan list, it is safe for us to add
+	 * the buffer to BJ_Forget list of the newest transaction.
+	 */
 	transaction = jh->b_transaction;
 	if (transaction == NULL) {
 		/* First case: not on any transaction.  If it
@@ -1929,16 +1944,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1929 goto zap_buffer; 1944 goto zap_buffer;
1930 } 1945 }
1931 /* 1946 /*
1932 * If it is committing, we simply cannot touch it. We 1947 * The buffer is committing, we simply cannot touch
1933 * can remove it's next_transaction pointer from the 1948 * it. So we just set j_next_transaction to the
1934 * running transaction if that is set, but nothing 1949 * running transaction (if there is one) and mark
1935 * else. */ 1950 * buffer as freed so that commit code knows it should
1951 * clear dirty bits when it is done with the buffer.
1952 */
1936 set_buffer_freed(bh); 1953 set_buffer_freed(bh);
1937 if (jh->b_next_transaction) { 1954 if (journal->j_running_transaction && buffer_jbddirty(bh))
1938 J_ASSERT(jh->b_next_transaction == 1955 jh->b_next_transaction = journal->j_running_transaction;
1939 journal->j_running_transaction);
1940 jh->b_next_transaction = NULL;
1941 }
1942 journal_put_journal_head(jh); 1956 journal_put_journal_head(jh);
1943 spin_unlock(&journal->j_list_lock); 1957 spin_unlock(&journal->j_list_lock);
1944 jbd_unlock_bh_state(bh); 1958 jbd_unlock_bh_state(bh);
@@ -2120,7 +2134,7 @@ void journal_file_buffer(struct journal_head *jh,
2120 */ 2134 */
2121void __journal_refile_buffer(struct journal_head *jh) 2135void __journal_refile_buffer(struct journal_head *jh)
2122{ 2136{
2123 int was_dirty; 2137 int was_dirty, jlist;
2124 struct buffer_head *bh = jh2bh(jh); 2138 struct buffer_head *bh = jh2bh(jh);
2125 2139
2126 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); 2140 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
@@ -2142,8 +2156,13 @@ void __journal_refile_buffer(struct journal_head *jh)
2142 __journal_temp_unlink_buffer(jh); 2156 __journal_temp_unlink_buffer(jh);
2143 jh->b_transaction = jh->b_next_transaction; 2157 jh->b_transaction = jh->b_next_transaction;
2144 jh->b_next_transaction = NULL; 2158 jh->b_next_transaction = NULL;
2145 __journal_file_buffer(jh, jh->b_transaction, 2159 if (buffer_freed(bh))
2146 jh->b_modified ? BJ_Metadata : BJ_Reserved); 2160 jlist = BJ_Forget;
2161 else if (jh->b_modified)
2162 jlist = BJ_Metadata;
2163 else
2164 jlist = BJ_Reserved;
2165 __journal_file_buffer(jh, jh->b_transaction, jlist);
2147 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); 2166 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
2148 2167
2149 if (was_dirty) 2168 if (was_dirty)
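
Taken together, the transaction.c changes rework how jbd handles a buffer that is truncated away while a previous transaction is still committing: journal_unmap_buffer() now marks it freed and points b_next_transaction at the running transaction, and __journal_refile_buffer() later files it on BJ_Forget so the next commit discards it instead of journaling it again. The new filing decision, restated as a sketch:

    /* Where a buffer goes when it migrates to b_next_transaction: */
    int jlist;

    if (buffer_freed(bh))           /* block freed while still journaled */
            jlist = BJ_Forget;      /* next commit drops it, no rewrite  */
    else if (jh->b_modified)        /* dirtied under the current handle  */
            jlist = BJ_Metadata;    /* next commit writes it to the log  */
    else
            jlist = BJ_Reserved;    /* credits reserved, nothing logged  */
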
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index ca0f5eb62b20..30beb11ef928 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -22,6 +22,7 @@
22#include <linux/jbd2.h> 22#include <linux/jbd2.h>
23#include <linux/errno.h> 23#include <linux/errno.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/blkdev.h>
25#include <trace/events/jbd2.h> 26#include <trace/events/jbd2.h>
26 27
27/* 28/*
@@ -506,6 +507,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
506 if (blocknr < journal->j_tail) 507 if (blocknr < journal->j_tail)
507 freed = freed + journal->j_last - journal->j_first; 508 freed = freed + journal->j_last - journal->j_first;
508 509
510 trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed);
509 jbd_debug(1, 511 jbd_debug(1,
510 "Cleaning journal tail from %d to %d (offset %lu), " 512 "Cleaning journal tail from %d to %d (offset %lu), "
511 "freeing %lu\n", 513 "freeing %lu\n",
@@ -515,6 +517,20 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
515 journal->j_tail_sequence = first_tid; 517 journal->j_tail_sequence = first_tid;
516 journal->j_tail = blocknr; 518 journal->j_tail = blocknr;
517 spin_unlock(&journal->j_state_lock); 519 spin_unlock(&journal->j_state_lock);
520
521 /*
522 * If there is an external journal, we need to make sure that
523 * any data blocks that were recently written out --- perhaps
524 * by jbd2_log_do_checkpoint() --- are flushed out before we
525 * drop the transactions from the external journal. It's
526 * unlikely this will be necessary, especially with a
527 * appropriately sized journal, but we need this to guarantee
528 * correctness. Fortunately jbd2_cleanup_journal_tail()
529 * doesn't get called all that often.
530 */
531 if ((journal->j_fs_dev != journal->j_dev) &&
532 (journal->j_flags & JBD2_BARRIER))
533 blkdev_issue_flush(journal->j_fs_dev, NULL);
518 if (!(journal->j_flags & JBD2_ABORT)) 534 if (!(journal->j_flags & JBD2_ABORT))
519 jbd2_journal_update_superblock(journal, 1); 535 jbd2_journal_update_superblock(journal, 1);
520 return 0; 536 return 0;
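
The new flush closes a window specific to external journals: jbd2_log_do_checkpoint() writes data blocks to the filesystem device (j_fs_dev), but the journal tail, and hence the decision to let old transactions be overwritten, lives on the journal device (j_dev). If the tail advances while those blocks still sit in the fs device's volatile write cache, a crash loses both the data and its redo copy. The guard in isolation (note blkdev_issue_flush() took a second error_sector argument in this era):

    /* Only needed when the journal is on a separate device and
     * barriers are in use; a no-op for the common internal journal. */
    if ((journal->j_fs_dev != journal->j_dev) &&
        (journal->j_flags & JBD2_BARRIER))
            blkdev_issue_flush(journal->j_fs_dev, NULL);
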
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index d4cfd6d2779e..671da7fb7ffd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -259,6 +259,7 @@ static int journal_submit_data_buffers(journal_t *journal,
259 ret = err; 259 ret = err;
260 spin_lock(&journal->j_list_lock); 260 spin_lock(&journal->j_list_lock);
261 J_ASSERT(jinode->i_transaction == commit_transaction); 261 J_ASSERT(jinode->i_transaction == commit_transaction);
262 commit_transaction->t_flushed_data_blocks = 1;
262 jinode->i_flags &= ~JI_COMMIT_RUNNING; 263 jinode->i_flags &= ~JI_COMMIT_RUNNING;
263 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 264 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
264 } 265 }
@@ -286,7 +287,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
286 if (err) { 287 if (err) {
287 /* 288 /*
288 * Because AS_EIO is cleared by 289 * Because AS_EIO is cleared by
289 * wait_on_page_writeback_range(), set it again so 290 * filemap_fdatawait_range(), set it again so
290 * that user process can get -EIO from fsync(). 291 * that user process can get -EIO from fsync().
291 */ 292 */
292 set_bit(AS_EIO, 293 set_bit(AS_EIO,
@@ -636,6 +637,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
636 JBUFFER_TRACE(jh, "ph3: write metadata"); 637 JBUFFER_TRACE(jh, "ph3: write metadata");
637 flags = jbd2_journal_write_metadata_buffer(commit_transaction, 638 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
638 jh, &new_jh, blocknr); 639 jh, &new_jh, blocknr);
640 if (flags < 0) {
641 jbd2_journal_abort(journal, flags);
642 continue;
643 }
639 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); 644 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
640 wbuf[bufs++] = jh2bh(new_jh); 645 wbuf[bufs++] = jh2bh(new_jh);
641 646
@@ -704,8 +709,17 @@ start_journal_io:
704 } 709 }
705 } 710 }
706 711
707 /* Done it all: now write the commit record asynchronously. */ 712 /*
713 * If the journal is not located on the file system device,
714 * then we must flush the file system device before we issue
715 * the commit record
716 */
717 if (commit_transaction->t_flushed_data_blocks &&
718 (journal->j_fs_dev != journal->j_dev) &&
719 (journal->j_flags & JBD2_BARRIER))
720 blkdev_issue_flush(journal->j_fs_dev, NULL);
708 721
722 /* Done it all: now write the commit record asynchronously. */
709 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 723 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
710 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 724 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
711 err = journal_submit_commit_record(journal, commit_transaction, 725 err = journal_submit_commit_record(journal, commit_transaction,
@@ -716,13 +730,6 @@ start_journal_io:
716 blkdev_issue_flush(journal->j_dev, NULL); 730 blkdev_issue_flush(journal->j_dev, NULL);
717 } 731 }
718 732
719 /*
720 * This is the right place to wait for data buffers both for ASYNC
721 * and !ASYNC commit. If commit is ASYNC, we need to wait only after
722 * the commit block went to disk (which happens above). If commit is
723 * SYNC, we need to wait for data buffers before we start writing
724 * commit block, which happens below in such setting.
725 */
726 err = journal_finish_inode_data_buffers(journal, commit_transaction); 733 err = journal_finish_inode_data_buffers(journal, commit_transaction);
727 if (err) { 734 if (err) {
728 printk(KERN_WARNING 735 printk(KERN_WARNING
@@ -876,8 +883,7 @@ restart_loop:
876 spin_unlock(&journal->j_list_lock); 883 spin_unlock(&journal->j_list_lock);
877 bh = jh2bh(jh); 884 bh = jh2bh(jh);
878 jbd_lock_bh_state(bh); 885 jbd_lock_bh_state(bh);
879 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || 886 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
880 jh->b_transaction == journal->j_running_transaction);
881 887
882 /* 888 /*
883 * If there is undo-protected committed data against 889 * If there is undo-protected committed data against
@@ -923,12 +929,12 @@ restart_loop:
923 /* A buffer which has been freed while still being 929 /* A buffer which has been freed while still being
924 * journaled by a previous transaction may end up still 930 * journaled by a previous transaction may end up still
925 * being dirty here, but we want to avoid writing back 931 * being dirty here, but we want to avoid writing back
926 * that buffer in the future now that the last use has 932 * that buffer in the future after the "add to orphan"
927 * been committed. That's not only a performance gain, 933 * operation been committed, That's not only a performance
928 * it also stops aliasing problems if the buffer is left 934 * gain, it also stops aliasing problems if the buffer is
929 * behind for writeback and gets reallocated for another 935 * left behind for writeback and gets reallocated for another
930 * use in a different page. */ 936 * use in a different page. */
931 if (buffer_freed(bh)) { 937 if (buffer_freed(bh) && !jh->b_next_transaction) {
932 clear_buffer_freed(bh); 938 clear_buffer_freed(bh);
933 clear_buffer_jbddirty(bh); 939 clear_buffer_jbddirty(bh);
934 } 940 }
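
The commit.c side pairs with the checkpoint change above: journal_submit_data_buffers() now records in t_flushed_data_blocks that ordered data went to the fs device, and before the commit record is issued that device is flushed if the journal is external. The resulting write ordering, sketched as comments:

    /*
     * Commit ordering with an external journal (a sketch, not code):
     *
     *   1. data blocks   -> j_fs_dev   (sets t_flushed_data_blocks)
     *   2. flush j_fs_dev               (hunk added above)
     *   3. commit record -> j_dev
     *   4. flush j_dev                  (JBD2_BARRIER path)
     *
     * Skipping step 2 lets the commit record become durable while the
     * ordered data it vouches for is still in a volatile write cache.
     */
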
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index fed85388ee86..c03d4dce4d76 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -39,6 +39,8 @@
39#include <linux/seq_file.h> 39#include <linux/seq_file.h>
40#include <linux/math64.h> 40#include <linux/math64.h>
41#include <linux/hash.h> 41#include <linux/hash.h>
42#include <linux/log2.h>
43#include <linux/vmalloc.h>
42 44
43#define CREATE_TRACE_POINTS 45#define CREATE_TRACE_POINTS
44#include <trace/events/jbd2.h> 46#include <trace/events/jbd2.h>
@@ -78,6 +80,7 @@ EXPORT_SYMBOL(jbd2_journal_errno);
78EXPORT_SYMBOL(jbd2_journal_ack_err); 80EXPORT_SYMBOL(jbd2_journal_ack_err);
79EXPORT_SYMBOL(jbd2_journal_clear_err); 81EXPORT_SYMBOL(jbd2_journal_clear_err);
80EXPORT_SYMBOL(jbd2_log_wait_commit); 82EXPORT_SYMBOL(jbd2_log_wait_commit);
83EXPORT_SYMBOL(jbd2_log_start_commit);
81EXPORT_SYMBOL(jbd2_journal_start_commit); 84EXPORT_SYMBOL(jbd2_journal_start_commit);
82EXPORT_SYMBOL(jbd2_journal_force_commit_nested); 85EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
83EXPORT_SYMBOL(jbd2_journal_wipe); 86EXPORT_SYMBOL(jbd2_journal_wipe);
@@ -92,6 +95,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
92 95
93static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 96static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
94static void __journal_abort_soft (journal_t *journal, int errno); 97static void __journal_abort_soft (journal_t *journal, int errno);
98static int jbd2_journal_create_slab(size_t slab_size);
95 99
96/* 100/*
97 * Helper function used to manage commit timeouts 101 * Helper function used to manage commit timeouts
@@ -358,6 +362,10 @@ repeat:
358 362
359 jbd_unlock_bh_state(bh_in); 363 jbd_unlock_bh_state(bh_in);
360 tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); 364 tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
365 if (!tmp) {
366 jbd2_journal_put_journal_head(new_jh);
367 return -ENOMEM;
368 }
361 jbd_lock_bh_state(bh_in); 369 jbd_lock_bh_state(bh_in);
362 if (jh_in->b_frozen_data) { 370 if (jh_in->b_frozen_data) {
363 jbd2_free(tmp, bh_in->b_size); 371 jbd2_free(tmp, bh_in->b_size);
@@ -809,7 +817,7 @@ static journal_t * journal_init_common (void)
809 journal_t *journal; 817 journal_t *journal;
810 int err; 818 int err;
811 819
812 journal = kzalloc(sizeof(*journal), GFP_KERNEL|__GFP_NOFAIL); 820 journal = kzalloc(sizeof(*journal), GFP_KERNEL);
813 if (!journal) 821 if (!journal)
814 goto fail; 822 goto fail;
815 823
@@ -1243,11 +1251,25 @@ int jbd2_journal_load(journal_t *journal)
1243 } 1251 }
1244 } 1252 }
1245 1253
1254 /*
1255 * Create a slab for this blocksize
1256 */
1257 err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
1258 if (err)
1259 return err;
1260
1246 /* Let the recovery code check whether it needs to recover any 1261 /* Let the recovery code check whether it needs to recover any
1247 * data from the journal. */ 1262 * data from the journal. */
1248 if (jbd2_journal_recover(journal)) 1263 if (jbd2_journal_recover(journal))
1249 goto recovery_error; 1264 goto recovery_error;
1250 1265
1266 if (journal->j_failed_commit) {
1267 printk(KERN_ERR "JBD2: journal transaction %u on %s "
1268 "is corrupt.\n", journal->j_failed_commit,
1269 journal->j_devname);
1270 return -EIO;
1271 }
1272
1251 /* OK, we've finished with the dynamic journal bits: 1273 /* OK, we've finished with the dynamic journal bits:
1252 * reinitialise the dynamic contents of the superblock in memory 1274 * reinitialise the dynamic contents of the superblock in memory
1253 * and reset them on disk. */ 1275 * and reset them on disk. */
@@ -1795,6 +1817,127 @@ size_t journal_tag_bytes(journal_t *journal)
1795} 1817}
1796 1818
1797/* 1819/*
1820 * JBD memory management
1821 *
1822 * These functions are used to allocate block-sized chunks of memory
1823 * used for making copies of buffer_head data. Very often it will be
1824 * page-sized chunks of data, but sometimes it will be in
1825 * sub-page-size chunks. (For example, 16k pages on Power systems
1826 * with a 4k block file system.) For blocks smaller than a page, we
1827 * use a SLAB allocator. There are slab caches for each block size,
1828 * which are allocated at mount time, if necessary, and we only free
1829 * (all of) the slab caches when/if the jbd2 module is unloaded. For
1830 * this reason we don't need to a mutex to protect access to
1831 * jbd2_slab[] allocating or releasing memory; only in
1832 * jbd2_journal_create_slab().
1833 */
1834#define JBD2_MAX_SLABS 8
1835static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
1836static DECLARE_MUTEX(jbd2_slab_create_sem);
1837
1838static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
1839 "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
1840 "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
1841};
1842
1843
1844static void jbd2_journal_destroy_slabs(void)
1845{
1846 int i;
1847
1848 for (i = 0; i < JBD2_MAX_SLABS; i++) {
1849 if (jbd2_slab[i])
1850 kmem_cache_destroy(jbd2_slab[i]);
1851 jbd2_slab[i] = NULL;
1852 }
1853}
1854
1855static int jbd2_journal_create_slab(size_t size)
1856{
1857 int i = order_base_2(size) - 10;
1858 size_t slab_size;
1859
1860 if (size == PAGE_SIZE)
1861 return 0;
1862
1863 if (i >= JBD2_MAX_SLABS)
1864 return -EINVAL;
1865
1866 if (unlikely(i < 0))
1867 i = 0;
1868 down(&jbd2_slab_create_sem);
1869 if (jbd2_slab[i]) {
1870 up(&jbd2_slab_create_sem);
1871 return 0; /* Already created */
1872 }
1873
1874 slab_size = 1 << (i+10);
1875 jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
1876 slab_size, 0, NULL);
1877 up(&jbd2_slab_create_sem);
1878 if (!jbd2_slab[i]) {
1879 printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
1880 return -ENOMEM;
1881 }
1882 return 0;
1883}
1884
1885static struct kmem_cache *get_slab(size_t size)
1886{
1887 int i = order_base_2(size) - 10;
1888
1889 BUG_ON(i >= JBD2_MAX_SLABS);
1890 if (unlikely(i < 0))
1891 i = 0;
1892 BUG_ON(jbd2_slab[i] == 0);
1893 return jbd2_slab[i];
1894}
1895
1896void *jbd2_alloc(size_t size, gfp_t flags)
1897{
1898 void *ptr;
1899
1900 BUG_ON(size & (size-1)); /* Must be a power of 2 */
1901
1902 flags |= __GFP_REPEAT;
1903 if (size == PAGE_SIZE)
1904 ptr = (void *)__get_free_pages(flags, 0);
1905 else if (size > PAGE_SIZE) {
1906 int order = get_order(size);
1907
1908 if (order < 3)
1909 ptr = (void *)__get_free_pages(flags, order);
1910 else
1911 ptr = vmalloc(size);
1912 } else
1913 ptr = kmem_cache_alloc(get_slab(size), flags);
1914
1915 /* Check alignment; SLUB has gotten this wrong in the past,
1916 * and this can lead to user data corruption! */
1917 BUG_ON(((unsigned long) ptr) & (size-1));
1918
1919 return ptr;
1920}
1921
1922void jbd2_free(void *ptr, size_t size)
1923{
1924 if (size == PAGE_SIZE) {
1925 free_pages((unsigned long)ptr, 0);
1926 return;
1927 }
1928 if (size > PAGE_SIZE) {
1929 int order = get_order(size);
1930
1931 if (order < 3)
1932 free_pages((unsigned long)ptr, order);
1933 else
1934 vfree(ptr);
1935 return;
1936 }
1937 kmem_cache_free(get_slab(size), ptr);
1938};
1939
1940/*
1798 * Journal_head storage management 1941 * Journal_head storage management
1799 */ 1942 */
1800static struct kmem_cache *jbd2_journal_head_cache; 1943static struct kmem_cache *jbd2_journal_head_cache;
@@ -2103,7 +2246,8 @@ static void __init jbd2_create_debugfs_entry(void)
2103{ 2246{
2104 jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL); 2247 jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL);
2105 if (jbd2_debugfs_dir) 2248 if (jbd2_debugfs_dir)
2106 jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, S_IRUGO, 2249 jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME,
2250 S_IRUGO | S_IWUSR,
2107 jbd2_debugfs_dir, 2251 jbd2_debugfs_dir,
2108 &jbd2_journal_enable_debug); 2252 &jbd2_journal_enable_debug);
2109} 2253}
@@ -2191,6 +2335,7 @@ static void jbd2_journal_destroy_caches(void)
2191 jbd2_journal_destroy_revoke_caches(); 2335 jbd2_journal_destroy_revoke_caches();
2192 jbd2_journal_destroy_jbd2_journal_head_cache(); 2336 jbd2_journal_destroy_jbd2_journal_head_cache();
2193 jbd2_journal_destroy_handle_cache(); 2337 jbd2_journal_destroy_handle_cache();
2338 jbd2_journal_destroy_slabs();
2194} 2339}
2195 2340
2196static int __init journal_init(void) 2341static int __init journal_init(void)
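
The bulk of the journal.c change introduces per-blocksize slab caches behind jbd2_alloc()/jbd2_free(), per the new comment block's motivation (sub-page block sizes such as 4k blocks on 16k-page Power, where the allocation must still be block-aligned, as the BUG_ON in jbd2_alloc() enforces). Despite its name, jbd2_slab_create_sem is a semaphore (DECLARE_MUTEX declares a count-1 semaphore in this era) and only serializes cache creation. The size-to-cache index math in isolation:

    #include <linux/log2.h>

    /* Map a power-of-two block size onto jbd2_slab[]:
     * 1 KiB -> 0, 2 KiB -> 1, 4 KiB -> 2, ... 128 KiB -> 7. */
    static int slab_index(size_t size)
    {
            int i = order_base_2(size) - 10;  /* log2(size) - log2(1024) */

            if (i < 0)          /* blocks under 1 KiB share the 1k cache */
                    i = 0;
            return i;           /* callers reject i >= JBD2_MAX_SLABS (8) */
    }

PAGE_SIZE requests bypass the slabs entirely via __get_free_pages(), and anything larger goes to the page allocator or vmalloc depending on order, as jbd2_alloc() above shows.
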
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 73063285b13f..049281b7cb89 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -20,7 +20,6 @@
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/jbd2.h> 21#include <linux/jbd2.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/slab.h>
24#include <linux/crc32.h> 23#include <linux/crc32.h>
25#endif 24#endif
26 25
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a0512700542f..bfc70f57900f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1727,6 +1727,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1727 if (!jh) 1727 if (!jh)
1728 goto zap_buffer_no_jh; 1728 goto zap_buffer_no_jh;
1729 1729
1730 /*
1731 * We cannot remove the buffer from checkpoint lists until the
1732 * transaction adding inode to orphan list (let's call it T)
1733 * is committed. Otherwise if the transaction changing the
1734 * buffer would be cleaned from the journal before T is
1735 * committed, a crash will cause that the correct contents of
1736 * the buffer will be lost. On the other hand we have to
1737 * clear the buffer dirty bit at latest at the moment when the
1738 * transaction marking the buffer as freed in the filesystem
1739 * structures is committed because from that moment on the
1740 * buffer can be reallocated and used by a different page.
1741 * Since the block hasn't been freed yet but the inode has
1742 * already been added to orphan list, it is safe for us to add
1743 * the buffer to BJ_Forget list of the newest transaction.
1744 */
1730 transaction = jh->b_transaction; 1745 transaction = jh->b_transaction;
1731 if (transaction == NULL) { 1746 if (transaction == NULL) {
1732 /* First case: not on any transaction. If it 1747 /* First case: not on any transaction. If it
@@ -1783,16 +1798,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1783 } else if (transaction == journal->j_committing_transaction) { 1798 } else if (transaction == journal->j_committing_transaction) {
1784 JBUFFER_TRACE(jh, "on committing transaction"); 1799 JBUFFER_TRACE(jh, "on committing transaction");
1785 /* 1800 /*
1786 * If it is committing, we simply cannot touch it. We 1801 * The buffer is committing, we simply cannot touch
1787 * can remove it's next_transaction pointer from the 1802 * it. So we just set j_next_transaction to the
1788 * running transaction if that is set, but nothing 1803 * running transaction (if there is one) and mark
1789 * else. */ 1804 * buffer as freed so that commit code knows it should
1805 * clear dirty bits when it is done with the buffer.
1806 */
1790 set_buffer_freed(bh); 1807 set_buffer_freed(bh);
1791 if (jh->b_next_transaction) { 1808 if (journal->j_running_transaction && buffer_jbddirty(bh))
1792 J_ASSERT(jh->b_next_transaction == 1809 jh->b_next_transaction = journal->j_running_transaction;
1793 journal->j_running_transaction);
1794 jh->b_next_transaction = NULL;
1795 }
1796 jbd2_journal_put_journal_head(jh); 1810 jbd2_journal_put_journal_head(jh);
1797 spin_unlock(&journal->j_list_lock); 1811 spin_unlock(&journal->j_list_lock);
1798 jbd_unlock_bh_state(bh); 1812 jbd_unlock_bh_state(bh);
@@ -1969,7 +1983,7 @@ void jbd2_journal_file_buffer(struct journal_head *jh,
1969 */ 1983 */
1970void __jbd2_journal_refile_buffer(struct journal_head *jh) 1984void __jbd2_journal_refile_buffer(struct journal_head *jh)
1971{ 1985{
1972 int was_dirty; 1986 int was_dirty, jlist;
1973 struct buffer_head *bh = jh2bh(jh); 1987 struct buffer_head *bh = jh2bh(jh);
1974 1988
1975 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); 1989 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
@@ -1991,8 +2005,13 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
1991 __jbd2_journal_temp_unlink_buffer(jh); 2005 __jbd2_journal_temp_unlink_buffer(jh);
1992 jh->b_transaction = jh->b_next_transaction; 2006 jh->b_transaction = jh->b_next_transaction;
1993 jh->b_next_transaction = NULL; 2007 jh->b_next_transaction = NULL;
1994 __jbd2_journal_file_buffer(jh, jh->b_transaction, 2008 if (buffer_freed(bh))
1995 jh->b_modified ? BJ_Metadata : BJ_Reserved); 2009 jlist = BJ_Forget;
2010 else if (jh->b_modified)
2011 jlist = BJ_Metadata;
2012 else
2013 jlist = BJ_Reserved;
2014 __jbd2_journal_file_buffer(jh, jh->b_transaction, jlist);
1996 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); 2015 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
1997 2016
1998 if (was_dirty) 2017 if (was_dirty)
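
This mirrors the jbd changes earlier in the series, with the same three-way refiling split. The committing-transaction case of journal_unmap_buffer(), isolated as a sketch:

    /* Block freed while its last change is mid-commit: we cannot touch
     * the committing transaction, so mark the buffer freed and, if it
     * is still journal-dirty, hand it to the running transaction; the
     * commit code will clear the dirty bits when done with it. */
    set_buffer_freed(bh);
    if (journal->j_running_transaction && buffer_jbddirty(bh))
            jh->b_next_transaction = journal->j_running_transaction;

Together with the BJ_Forget filing in __jbd2_journal_refile_buffer() and the tightened assertion in commit.c, this keeps the buffer's journaled copy alive until the transaction that frees the block (and put the inode on the orphan list) commits.
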
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 7edb62e97419..7cdc3196476a 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -350,8 +350,8 @@ int jffs2_acl_chmod(struct inode *inode)
350 return rc; 350 return rc;
351} 351}
352 352
353static size_t jffs2_acl_access_listxattr(struct inode *inode, char *list, size_t list_size, 353static size_t jffs2_acl_access_listxattr(struct dentry *dentry, char *list,
354 const char *name, size_t name_len) 354 size_t list_size, const char *name, size_t name_len, int type)
355{ 355{
356 const int retlen = sizeof(POSIX_ACL_XATTR_ACCESS); 356 const int retlen = sizeof(POSIX_ACL_XATTR_ACCESS);
357 357
@@ -360,8 +360,8 @@ static size_t jffs2_acl_access_listxattr(struct inode *inode, char *list, size_t
360 return retlen; 360 return retlen;
361} 361}
362 362
363static size_t jffs2_acl_default_listxattr(struct inode *inode, char *list, size_t list_size, 363static size_t jffs2_acl_default_listxattr(struct dentry *dentry, char *list,
364 const char *name, size_t name_len) 364 size_t list_size, const char *name, size_t name_len, int type)
365{ 365{
366 const int retlen = sizeof(POSIX_ACL_XATTR_DEFAULT); 366 const int retlen = sizeof(POSIX_ACL_XATTR_DEFAULT);
367 367
@@ -370,12 +370,16 @@ static size_t jffs2_acl_default_listxattr(struct inode *inode, char *list, size_
370 return retlen; 370 return retlen;
371} 371}
372 372
373static int jffs2_acl_getxattr(struct inode *inode, int type, void *buffer, size_t size) 373static int jffs2_acl_getxattr(struct dentry *dentry, const char *name,
374 void *buffer, size_t size, int type)
374{ 375{
375 struct posix_acl *acl; 376 struct posix_acl *acl;
376 int rc; 377 int rc;
377 378
378 acl = jffs2_get_acl(inode, type); 379 if (name[0] != '\0')
380 return -EINVAL;
381
382 acl = jffs2_get_acl(dentry->d_inode, type);
379 if (IS_ERR(acl)) 383 if (IS_ERR(acl))
380 return PTR_ERR(acl); 384 return PTR_ERR(acl);
381 if (!acl) 385 if (!acl)
@@ -386,26 +390,15 @@ static int jffs2_acl_getxattr(struct inode *inode, int type, void *buffer, size_
386 return rc; 390 return rc;
387} 391}
388 392
389static int jffs2_acl_access_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) 393static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
390{ 394 const void *value, size_t size, int flags, int type)
391 if (name[0] != '\0')
392 return -EINVAL;
393 return jffs2_acl_getxattr(inode, ACL_TYPE_ACCESS, buffer, size);
394}
395
396static int jffs2_acl_default_getxattr(struct inode *inode, const char *name, void *buffer, size_t size)
397{
398 if (name[0] != '\0')
399 return -EINVAL;
400 return jffs2_acl_getxattr(inode, ACL_TYPE_DEFAULT, buffer, size);
401}
402
403static int jffs2_acl_setxattr(struct inode *inode, int type, const void *value, size_t size)
404{ 395{
405 struct posix_acl *acl; 396 struct posix_acl *acl;
406 int rc; 397 int rc;
407 398
408 if (!is_owner_or_cap(inode)) 399 if (name[0] != '\0')
400 return -EINVAL;
401 if (!is_owner_or_cap(dentry->d_inode))
409 return -EPERM; 402 return -EPERM;
410 403
411 if (value) { 404 if (value) {
@@ -420,38 +413,24 @@ static int jffs2_acl_setxattr(struct inode *inode, int type, const void *value,
420 } else { 413 } else {
421 acl = NULL; 414 acl = NULL;
422 } 415 }
423 rc = jffs2_set_acl(inode, type, acl); 416 rc = jffs2_set_acl(dentry->d_inode, type, acl);
424 out: 417 out:
425 posix_acl_release(acl); 418 posix_acl_release(acl);
426 return rc; 419 return rc;
427} 420}
428 421
429static int jffs2_acl_access_setxattr(struct inode *inode, const char *name,
430 const void *buffer, size_t size, int flags)
431{
432 if (name[0] != '\0')
433 return -EINVAL;
434 return jffs2_acl_setxattr(inode, ACL_TYPE_ACCESS, buffer, size);
435}
436
437static int jffs2_acl_default_setxattr(struct inode *inode, const char *name,
438 const void *buffer, size_t size, int flags)
439{
440 if (name[0] != '\0')
441 return -EINVAL;
442 return jffs2_acl_setxattr(inode, ACL_TYPE_DEFAULT, buffer, size);
443}
444
445struct xattr_handler jffs2_acl_access_xattr_handler = { 422struct xattr_handler jffs2_acl_access_xattr_handler = {
446 .prefix = POSIX_ACL_XATTR_ACCESS, 423 .prefix = POSIX_ACL_XATTR_ACCESS,
424 .flags = ACL_TYPE_DEFAULT,
447 .list = jffs2_acl_access_listxattr, 425 .list = jffs2_acl_access_listxattr,
448 .get = jffs2_acl_access_getxattr, 426 .get = jffs2_acl_getxattr,
449 .set = jffs2_acl_access_setxattr, 427 .set = jffs2_acl_setxattr,
450}; 428};
451 429
452struct xattr_handler jffs2_acl_default_xattr_handler = { 430struct xattr_handler jffs2_acl_default_xattr_handler = {
453 .prefix = POSIX_ACL_XATTR_DEFAULT, 431 .prefix = POSIX_ACL_XATTR_DEFAULT,
432 .flags = ACL_TYPE_DEFAULT,
454 .list = jffs2_acl_default_listxattr, 433 .list = jffs2_acl_default_listxattr,
455 .get = jffs2_acl_default_getxattr, 434 .get = jffs2_acl_getxattr,
456 .set = jffs2_acl_default_setxattr, 435 .set = jffs2_acl_setxattr,
457}; 436};
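
The jffs2 ACL conversion shows the new xattr handler contract used throughout this series: handlers receive a dentry plus the handler's .flags value as a trailing type argument, which lets one get/set body replace the per-type access/default wrappers. A minimal get handler under the new prototype (names are illustrative, not jffs2's):

    static int example_acl_getxattr(struct dentry *dentry, const char *name,
                                    void *buffer, size_t size, int type)
    {
            struct posix_acl *acl;
            int rc;

            if (name[0] != '\0')        /* only the bare prefix matches */
                    return -EINVAL;

            acl = example_get_acl(dentry->d_inode, type);  /* hypothetical */
            if (IS_ERR(acl))
                    return PTR_ERR(acl);
            if (!acl)
                    return -ENODATA;
            rc = posix_acl_to_xattr(acl, buffer, size);
            posix_acl_release(acl);
            return rc;
    }

One oddity worth flagging: both handlers above are registered with .flags = ACL_TYPE_DEFAULT, including the POSIX_ACL_XATTR_ACCESS one, which a reader would expect to carry ACL_TYPE_ACCESS; this looks like a copy-paste slip in the conversion.
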
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index f25e70c1b51c..f0294410868d 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -177,7 +177,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
177 spin_unlock(&jffs2_compressor_list_lock); 177 spin_unlock(&jffs2_compressor_list_lock);
178 break; 178 break;
179 default: 179 default:
180 printk(KERN_ERR "JFFS2: unknow compression mode.\n"); 180 printk(KERN_ERR "JFFS2: unknown compression mode.\n");
181 } 181 }
182 out: 182 out:
183 if (ret == JFFS2_COMPR_NONE) { 183 if (ret == JFFS2_COMPR_NONE) {
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index 90cb60d09787..cd02acafde8a 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -11,7 +11,6 @@
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/slab.h>
15#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/lzo.h> 16#include <linux/lzo.h>
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index cfd301a5edfc..b46661a42758 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -14,7 +14,6 @@
14#endif 14#endif
15 15
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/slab.h>
18#include <linux/zlib.h> 17#include <linux/zlib.h>
19#include <linux/zutil.h> 18#include <linux/zutil.h>
20#include "nodelist.h" 19#include "nodelist.h"
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 5544d31c066b..ec3538413926 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -15,6 +15,7 @@
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/jffs2.h> 16#include <linux/jffs2.h>
17#include <linux/mtd/mtd.h> 17#include <linux/mtd/mtd.h>
18#include <linux/slab.h>
18#include "nodelist.h" 19#include "nodelist.h"
19#include "debug.h" 20#include "debug.h"
20 21
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index b7b74e299142..e7291c161a19 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/time.h> 14#include <linux/time.h>
16#include <linux/pagemap.h> 15#include <linux/pagemap.h>
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 090c556ffed2..3b6f2fa12cff 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -700,7 +700,8 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_
700 struct jffs2_raw_inode ri; 700 struct jffs2_raw_inode ri;
701 struct jffs2_node_frag *last_frag; 701 struct jffs2_node_frag *last_frag;
702 union jffs2_device_node dev; 702 union jffs2_device_node dev;
703 char *mdata = NULL, mdatalen = 0; 703 char *mdata = NULL;
704 int mdatalen = 0;
704 uint32_t alloclen, ilen; 705 uint32_t alloclen, ilen;
705 int ret; 706 int ret;
706 707
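
A subtle C fix: in a combined declaration the * binds to the declarator, not the type, so the old line made mdatalen a plain char rather than a pointer or an int, silently truncating metadata lengths (a symlink target length, for instance) to eight bits. The pitfall in two lines:

    char *mdata = NULL, mdatalen = 0;  /* mdata is char *, mdatalen is char! */

    char *mdata = NULL;                /* the fix: separate declarations */
    int  mdatalen = 0;
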
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 87c6f555e1a0..af02bd138469 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -15,7 +15,6 @@
15#include <linux/mtd/mtd.h> 15#include <linux/mtd/mtd.h>
16#include <linux/rbtree.h> 16#include <linux/rbtree.h>
17#include <linux/crc32.h> 17#include <linux/crc32.h>
18#include <linux/slab.h>
19#include <linux/pagemap.h> 18#include <linux/pagemap.h>
20#include "nodelist.h" 19#include "nodelist.h"
21 20
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 21a052915aa9..191359dde4e1 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/mtd/mtd.h> 13#include <linux/mtd/mtd.h>
15#include <linux/compiler.h> 14#include <linux/compiler.h>
16#include <linux/sched.h> /* For cond_resched() */ 15#include <linux/sched.h> /* For cond_resched() */
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 1a80301004b8..d32ee9412cb9 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -567,7 +567,7 @@ static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
567 else BUG(); 567 else BUG();
568 } 568 }
569 } 569 }
570 list->rb_node = NULL; 570 *list = RB_ROOT;
571} 571}
572 572
573static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd) 573static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd)
@@ -931,7 +931,7 @@ static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_re
931 * Helper function for jffs2_get_inode_nodes(). 931 * Helper function for jffs2_get_inode_nodes().
932 * The function detects whether more data should be read and reads it if yes. 932 * The function detects whether more data should be read and reads it if yes.
933 * 933 *
934 * Returns: 0 on succes; 934 * Returns: 0 on success;
935 * negative error code on failure. 935 * negative error code on failure.
936 */ 936 */
937static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, 937static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
@@ -1284,7 +1284,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
1284 f->target = NULL; 1284 f->target = NULL;
1285 mutex_unlock(&f->sem); 1285 mutex_unlock(&f->sem);
1286 jffs2_do_clear_inode(c, f); 1286 jffs2_do_clear_inode(c, f);
1287 return -ret; 1287 return ret;
1288 } 1288 }
1289 1289
1290 f->target[je32_to_cpu(latest_node->csize)] = '\0'; 1290 f->target[je32_to_cpu(latest_node->csize)] = '\0';
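
Three small fixes here: the empty rb-tree is now reset with the canonical RB_ROOT initializer instead of poking ->rb_node directly (which keeps working should struct rb_root ever grow fields), a "succes" typo, and an errno sign bug, since ret at that point already holds a negative errno, so "return -ret" handed callers a positive value they would not treat as an error. The rb_root idiom:

    #include <linux/rbtree.h>

    struct rb_root tree = RB_ROOT;  /* empty tree, defined as { NULL, } */

    /* resetting an existing root: */
    tree = RB_ROOT;                 /* preferred over tree.rb_node = NULL */
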
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index 02c39c64ecb3..eaccee058583 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -44,26 +44,28 @@ int jffs2_init_security(struct inode *inode, struct inode *dir)
44} 44}
45 45
46/* ---- XATTR Handler for "security.*" ----------------- */ 46/* ---- XATTR Handler for "security.*" ----------------- */
47static int jffs2_security_getxattr(struct inode *inode, const char *name, 47static int jffs2_security_getxattr(struct dentry *dentry, const char *name,
48 void *buffer, size_t size) 48 void *buffer, size_t size, int type)
49{ 49{
50 if (!strcmp(name, "")) 50 if (!strcmp(name, ""))
51 return -EINVAL; 51 return -EINVAL;
52 52
53 return do_jffs2_getxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size); 53 return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY,
54 name, buffer, size);
54} 55}
55 56
56static int jffs2_security_setxattr(struct inode *inode, const char *name, const void *buffer, 57static int jffs2_security_setxattr(struct dentry *dentry, const char *name,
57 size_t size, int flags) 58 const void *buffer, size_t size, int flags, int type)
58{ 59{
59 if (!strcmp(name, "")) 60 if (!strcmp(name, ""))
60 return -EINVAL; 61 return -EINVAL;
61 62
62 return do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size, flags); 63 return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY,
64 name, buffer, size, flags);
63} 65}
64 66
65static size_t jffs2_security_listxattr(struct inode *inode, char *list, size_t list_size, 67static size_t jffs2_security_listxattr(struct dentry *dentry, char *list,
66 const char *name, size_t name_len) 68 size_t list_size, const char *name, size_t name_len, int type)
67{ 69{
68 size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1; 70 size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1;
69 71
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 6caf1e1ee26d..800171dca53b 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -23,7 +23,7 @@
23 23
24int jffs2_sum_init(struct jffs2_sb_info *c) 24int jffs2_sum_init(struct jffs2_sb_info *c)
25{ 25{
26 uint32_t sum_size = max_t(uint32_t, c->sector_size, MAX_SUMMARY_SIZE); 26 uint32_t sum_size = min_t(uint32_t, c->sector_size, MAX_SUMMARY_SIZE);
27 27
28 c->summary = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL); 28 c->summary = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL);
29 29
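
A one-word fix with real effect: sum_size feeds the allocation of the per-mount summary collection buffer, and max_t made MAX_SUMMARY_SIZE a floor instead of a cap, so flashes with large erase blocks allocated a full block's worth. Assuming the header's MAX_SUMMARY_SIZE of 32 KiB:

    #include <linux/kernel.h>

    uint32_t sector_size = 131072;  /* a 128 KiB erase block */
    uint32_t old_size = max_t(uint32_t, sector_size, MAX_SUMMARY_SIZE);
                                    /* 131072: whole erase block, too big */
    uint32_t new_size = min_t(uint32_t, sector_size, MAX_SUMMARY_SIZE);
                                    /* 32768: capped as intended */
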
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 4ec11e8bda8c..b955626071c2 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/namei.h> 14#include <linux/namei.h>
16#include "nodelist.h" 15#include "nodelist.h"
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index ca29440e9435..c819eb0e982d 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -12,7 +12,6 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/crc32.h> 14#include <linux/crc32.h>
15#include <linux/slab.h>
16#include <linux/pagemap.h> 15#include <linux/pagemap.h>
17#include <linux/mtd/mtd.h> 16#include <linux/mtd/mtd.h>
18#include "nodelist.h" 17#include "nodelist.h"
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 082e844ab2db..9e75c62c85d6 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -31,7 +31,7 @@
31 * is used to release xattr name/value pair and detach from c->xattrindex. 31 * is used to release xattr name/value pair and detach from c->xattrindex.
32 * reclaim_xattr_datum(c) 32 * reclaim_xattr_datum(c)
33 * is used to reclaim xattr name/value pairs on the xattr name/value pair cache when 33 * is used to reclaim xattr name/value pairs on the xattr name/value pair cache when
34 * memory usage by cache is over c->xdatum_mem_threshold. Currentry, this threshold 34 * memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold
35 * is hard coded as 32KiB. 35 * is hard coded as 32KiB.
36 * do_verify_xattr_datum(c, xd) 36 * do_verify_xattr_datum(c, xd)
37 * is used to load the xdatum informations without name/value pair from the medium. 37 * is used to load the xdatum informations without name/value pair from the medium.
@@ -990,9 +990,11 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
990 if (!xhandle) 990 if (!xhandle)
991 continue; 991 continue;
992 if (buffer) { 992 if (buffer) {
993 rc = xhandle->list(inode, buffer+len, size-len, xd->xname, xd->name_len); 993 rc = xhandle->list(dentry, buffer+len, size-len,
994 xd->xname, xd->name_len, xd->flags);
994 } else { 995 } else {
995 rc = xhandle->list(inode, NULL, 0, xd->xname, xd->name_len); 996 rc = xhandle->list(dentry, NULL, 0, xd->xname,
997 xd->name_len, xd->flags);
996 } 998 }
997 if (rc < 0) 999 if (rc < 0)
998 goto out; 1000 goto out;
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index 8ec5765ef348..3e5a5e356e05 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -16,24 +16,26 @@
16#include <linux/mtd/mtd.h> 16#include <linux/mtd/mtd.h>
17#include "nodelist.h" 17#include "nodelist.h"
18 18
19static int jffs2_trusted_getxattr(struct inode *inode, const char *name, 19static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name,
20 void *buffer, size_t size) 20 void *buffer, size_t size, int type)
21{ 21{
22 if (!strcmp(name, "")) 22 if (!strcmp(name, ""))
23 return -EINVAL; 23 return -EINVAL;
24 return do_jffs2_getxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size); 24 return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED,
25 name, buffer, size);
25} 26}
26 27
27static int jffs2_trusted_setxattr(struct inode *inode, const char *name, const void *buffer, 28static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name,
28 size_t size, int flags) 29 const void *buffer, size_t size, int flags, int type)
29{ 30{
30 if (!strcmp(name, "")) 31 if (!strcmp(name, ""))
31 return -EINVAL; 32 return -EINVAL;
32 return do_jffs2_setxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags); 33 return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED,
34 name, buffer, size, flags);
33} 35}
34 36
35static size_t jffs2_trusted_listxattr(struct inode *inode, char *list, size_t list_size, 37static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list,
36 const char *name, size_t name_len) 38 size_t list_size, const char *name, size_t name_len, int type)
37{ 39{
38 size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1; 40 size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1;
39 41
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 8bbeab90ada1..8544af67dffe 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -16,24 +16,26 @@
16#include <linux/mtd/mtd.h> 16#include <linux/mtd/mtd.h>
17#include "nodelist.h" 17#include "nodelist.h"
18 18
19static int jffs2_user_getxattr(struct inode *inode, const char *name, 19static int jffs2_user_getxattr(struct dentry *dentry, const char *name,
20 void *buffer, size_t size) 20 void *buffer, size_t size, int type)
21{ 21{
22 if (!strcmp(name, "")) 22 if (!strcmp(name, ""))
23 return -EINVAL; 23 return -EINVAL;
24 return do_jffs2_getxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size); 24 return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_USER,
25 name, buffer, size);
25} 26}
26 27
27static int jffs2_user_setxattr(struct inode *inode, const char *name, const void *buffer, 28static int jffs2_user_setxattr(struct dentry *dentry, const char *name,
28 size_t size, int flags) 29 const void *buffer, size_t size, int flags, int type)
29{ 30{
30 if (!strcmp(name, "")) 31 if (!strcmp(name, ""))
31 return -EINVAL; 32 return -EINVAL;
32 return do_jffs2_setxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size, flags); 33 return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_USER,
34 name, buffer, size, flags);
33} 35}
34 36
35static size_t jffs2_user_listxattr(struct inode *inode, char *list, size_t list_size, 37static size_t jffs2_user_listxattr(struct dentry *dentry, char *list,
36 const char *name, size_t name_len) 38 size_t list_size, const char *name, size_t name_len, int type)
37{ 39{
38 size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1; 40 size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1;
39 41
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index d66477c34306..1057a4998e4e 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -19,8 +19,8 @@
19 */ 19 */
20 20
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/slab.h>
22#include <linux/fs.h> 23#include <linux/fs.h>
23#include <linux/quotaops.h>
24#include <linux/posix_acl_xattr.h> 24#include <linux/posix_acl_xattr.h>
25#include "jfs_incore.h" 25#include "jfs_incore.h"
26#include "jfs_txnmgr.h" 26#include "jfs_txnmgr.h"
@@ -174,7 +174,7 @@ cleanup:
174 return rc; 174 return rc;
175} 175}
176 176
177static int jfs_acl_chmod(struct inode *inode) 177int jfs_acl_chmod(struct inode *inode)
178{ 178{
179 struct posix_acl *acl, *clone; 179 struct posix_acl *acl, *clone;
180 int rc; 180 int rc;
@@ -205,26 +205,3 @@ static int jfs_acl_chmod(struct inode *inode)
205 posix_acl_release(clone); 205 posix_acl_release(clone);
206 return rc; 206 return rc;
207} 207}
208
209int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
210{
211 struct inode *inode = dentry->d_inode;
212 int rc;
213
214 rc = inode_change_ok(inode, iattr);
215 if (rc)
216 return rc;
217
218 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
219 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
220 if (vfs_dq_transfer(inode, iattr))
221 return -EDQUOT;
222 }
223
224 rc = inode_setattr(inode, iattr);
225
226 if (!rc && (iattr->ia_valid & ATTR_MODE))
227 rc = jfs_acl_chmod(inode);
228
229 return rc;
230}
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 2b70fa78e4a7..14ba982b3f24 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/quotaops.h>
21#include "jfs_incore.h" 22#include "jfs_incore.h"
22#include "jfs_inode.h" 23#include "jfs_inode.h"
23#include "jfs_dmap.h" 24#include "jfs_dmap.h"
@@ -47,7 +48,7 @@ static int jfs_open(struct inode *inode, struct file *file)
47{ 48{
48 int rc; 49 int rc;
49 50
50 if ((rc = generic_file_open(inode, file))) 51 if ((rc = dquot_file_open(inode, file)))
51 return rc; 52 return rc;
52 53
53 /* 54 /*
@@ -88,14 +89,40 @@ static int jfs_release(struct inode *inode, struct file *file)
88 return 0; 89 return 0;
89} 90}
90 91
92int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
93{
94 struct inode *inode = dentry->d_inode;
95 int rc;
96
97 rc = inode_change_ok(inode, iattr);
98 if (rc)
99 return rc;
100
101 if (iattr->ia_valid & ATTR_SIZE)
102 dquot_initialize(inode);
103 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
104 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
105 rc = dquot_transfer(inode, iattr);
106 if (rc)
107 return rc;
108 }
109
110 rc = inode_setattr(inode, iattr);
111
112 if (!rc && (iattr->ia_valid & ATTR_MODE))
113 rc = jfs_acl_chmod(inode);
114
115 return rc;
116}
117
91const struct inode_operations jfs_file_inode_operations = { 118const struct inode_operations jfs_file_inode_operations = {
92 .truncate = jfs_truncate, 119 .truncate = jfs_truncate,
93 .setxattr = jfs_setxattr, 120 .setxattr = jfs_setxattr,
94 .getxattr = jfs_getxattr, 121 .getxattr = jfs_getxattr,
95 .listxattr = jfs_listxattr, 122 .listxattr = jfs_listxattr,
96 .removexattr = jfs_removexattr, 123 .removexattr = jfs_removexattr,
97#ifdef CONFIG_JFS_POSIX_ACL
98 .setattr = jfs_setattr, 124 .setattr = jfs_setattr,
125#ifdef CONFIG_JFS_POSIX_ACL
99 .check_acl = jfs_check_acl, 126 .check_acl = jfs_check_acl,
100#endif 127#endif
101}; 128};
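
jfs_setattr moves out from behind CONFIG_JFS_POSIX_ACL (every build needs it now for quota) and adopts the reworked quota API used across this series: the old vfs_dq_* helpers returned a bare truth value, forcing callers to invent -EDQUOT, while the dquot_* replacements return the real errno. The caller-side conversion pattern:

    /* before: boolean API, error code guessed by the caller */
    if (vfs_dq_transfer(inode, iattr))
            return -EDQUOT;

    /* after: the quota layer reports the actual failure (-EDQUOT, -EIO, ...) */
    rc = dquot_transfer(inode, iattr);
    if (rc)
            return rc;

The same mechanical rewrite recurs below in jfs_dtree.c, jfs_extent.c and jfs_inode.c for dquot_alloc_block() and dquot_alloc_inode(). The dquot_file_open() in jfs_open() wraps generic_file_open() and, if memory serves, initializes quota for writable opens.
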
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index b2ae190a77ba..9dd126276c9f 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -22,6 +22,7 @@
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/pagemap.h> 23#include <linux/pagemap.h>
24#include <linux/quotaops.h> 24#include <linux/quotaops.h>
25#include <linux/writeback.h>
25#include "jfs_incore.h" 26#include "jfs_incore.h"
26#include "jfs_inode.h" 27#include "jfs_inode.h"
27#include "jfs_filsys.h" 28#include "jfs_filsys.h"
@@ -120,8 +121,10 @@ int jfs_commit_inode(struct inode *inode, int wait)
120 return rc; 121 return rc;
121} 122}
122 123
123int jfs_write_inode(struct inode *inode, int wait) 124int jfs_write_inode(struct inode *inode, struct writeback_control *wbc)
124{ 125{
126 int wait = wbc->sync_mode == WB_SYNC_ALL;
127
125 if (test_cflag(COMMIT_Nolink, inode)) 128 if (test_cflag(COMMIT_Nolink, inode))
126 return 0; 129 return 0;
127 /* 130 /*
@@ -146,6 +149,9 @@ void jfs_delete_inode(struct inode *inode)
146{ 149{
147 jfs_info("In jfs_delete_inode, inode = 0x%p", inode); 150 jfs_info("In jfs_delete_inode, inode = 0x%p", inode);
148 151
152 if (!is_bad_inode(inode))
153 dquot_initialize(inode);
154
149 if (!is_bad_inode(inode) && 155 if (!is_bad_inode(inode) &&
150 (JFS_IP(inode)->fileset == FILESYSTEM_I)) { 156 (JFS_IP(inode)->fileset == FILESYSTEM_I)) {
151 truncate_inode_pages(&inode->i_data, 0); 157 truncate_inode_pages(&inode->i_data, 0);
@@ -158,9 +164,9 @@ void jfs_delete_inode(struct inode *inode)
158 /* 164 /*
159 * Free the inode from the quota allocation. 165 * Free the inode from the quota allocation.
160 */ 166 */
161 vfs_dq_init(inode); 167 dquot_initialize(inode);
162 vfs_dq_free_inode(inode); 168 dquot_free_inode(inode);
163 vfs_dq_drop(inode); 169 dquot_drop(inode);
164 } 170 }
165 171
166 clear_inode(inode); 172 clear_inode(inode);
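
jfs_write_inode is converted to the new ->write_inode signature, which passes the writeback_control instead of a precomputed wait flag, so filesystems derive synchronicity from wbc->sync_mode themselves. The new contract, sketched with an illustrative helper:

    #include <linux/fs.h>
    #include <linux/writeback.h>

    static int example_write_inode(struct inode *inode,
                                   struct writeback_control *wbc)
    {
            /* WB_SYNC_ALL covers sync(2)/fsync-driven writeback;
             * WB_SYNC_NONE is opportunistic background writeback. */
            int wait = wbc->sync_mode == WB_SYNC_ALL;

            return example_commit_inode(inode, wait);  /* hypothetical */
    }

The delete path also gains dquot_initialize() calls so quota state is set up before dquot_free_inode() runs.
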
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index b07bd417ef85..54e07559878d 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -22,7 +22,7 @@
22 22
23int jfs_check_acl(struct inode *, int); 23int jfs_check_acl(struct inode *, int);
24int jfs_init_acl(tid_t, struct inode *, struct inode *); 24int jfs_init_acl(tid_t, struct inode *, struct inode *);
25int jfs_setattr(struct dentry *, struct iattr *); 25int jfs_acl_chmod(struct inode *inode);
26 26
27#else 27#else
28 28
@@ -32,5 +32,10 @@ static inline int jfs_init_acl(tid_t tid, struct inode *inode,
32 return 0; 32 return 0;
33} 33}
34 34
35static inline int jfs_acl_chmod(struct inode *inode)
36{
37 return 0;
38}
39
35#endif 40#endif
36#endif /* _H_JFS_ACL */ 41#endif /* _H_JFS_ACL */
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 2bc7d8aa5740..6c4dfcbf3f55 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/slab.h>
20#include "jfs_incore.h" 21#include "jfs_incore.h"
21#include "jfs_superblock.h" 22#include "jfs_superblock.h"
22#include "jfs_dmap.h" 23#include "jfs_dmap.h"
@@ -755,7 +756,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
755 * allocation group. 756 * allocation group.
756 */ 757 */
757 if ((blkno & (bmp->db_agsize - 1)) == 0) 758 if ((blkno & (bmp->db_agsize - 1)) == 0)
758 /* check if the AG is currenly being written to. 759 /* check if the AG is currently being written to.
759 * if so, call dbNextAG() to find a non-busy 760 * if so, call dbNextAG() to find a non-busy
760 * AG with sufficient free space. 761 * AG with sufficient free space.
761 */ 762 */
@@ -3337,7 +3338,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3337 for (i = 0, n = 0; i < agno; n++) { 3338 for (i = 0, n = 0; i < agno; n++) {
3338 bmp->db_agfree[n] = 0; /* init collection point */ 3339 bmp->db_agfree[n] = 0; /* init collection point */
3339 3340
3340 /* coalesce cotiguous k AGs; */ 3341 /* coalesce contiguous k AGs; */
3341 for (j = 0; j < k && i < agno; j++, i++) { 3342 for (j = 0; j < k && i < agno; j++, i++) {
3342 /* merge AGi to AGn */ 3343 /* merge AGi to AGn */
3343 bmp->db_agfree[n] += bmp->db_agfree[i]; 3344 bmp->db_agfree[n] += bmp->db_agfree[i];
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 925871e9887b..9197a1b0d02d 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -102,6 +102,7 @@
102 102
103#include <linux/fs.h> 103#include <linux/fs.h>
104#include <linux/quotaops.h> 104#include <linux/quotaops.h>
105#include <linux/slab.h>
105#include "jfs_incore.h" 106#include "jfs_incore.h"
106#include "jfs_superblock.h" 107#include "jfs_superblock.h"
107#include "jfs_filsys.h" 108#include "jfs_filsys.h"
@@ -381,10 +382,10 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
381 * It's time to move the inline table to an external 382 * It's time to move the inline table to an external
382 * page and begin to build the xtree 383 * page and begin to build the xtree
383 */ 384 */
384 if (vfs_dq_alloc_block(ip, sbi->nbperpage)) 385 if (dquot_alloc_block(ip, sbi->nbperpage))
385 goto clean_up; 386 goto clean_up;
386 if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) { 387 if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) {
387 vfs_dq_free_block(ip, sbi->nbperpage); 388 dquot_free_block(ip, sbi->nbperpage);
388 goto clean_up; 389 goto clean_up;
389 } 390 }
390 391
@@ -408,7 +409,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
408 memcpy(&jfs_ip->i_dirtable, temp_table, 409 memcpy(&jfs_ip->i_dirtable, temp_table,
409 sizeof (temp_table)); 410 sizeof (temp_table));
410 dbFree(ip, xaddr, sbi->nbperpage); 411 dbFree(ip, xaddr, sbi->nbperpage);
411 vfs_dq_free_block(ip, sbi->nbperpage); 412 dquot_free_block(ip, sbi->nbperpage);
412 goto clean_up; 413 goto clean_up;
413 } 414 }
414 ip->i_size = PSIZE; 415 ip->i_size = PSIZE;
@@ -1027,10 +1028,9 @@ static int dtSplitUp(tid_t tid,
1027 n = xlen; 1028 n = xlen;
1028 1029
1029 /* Allocate blocks to quota. */ 1030 /* Allocate blocks to quota. */
1030 if (vfs_dq_alloc_block(ip, n)) { 1031 rc = dquot_alloc_block(ip, n);
1031 rc = -EDQUOT; 1032 if (rc)
1032 goto extendOut; 1033 goto extendOut;
1033 }
1034 quota_allocation += n; 1034 quota_allocation += n;
1035 1035
1036 if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, 1036 if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen,
@@ -1308,7 +1308,7 @@ static int dtSplitUp(tid_t tid,
1308 1308
1309 /* Rollback quota allocation */ 1309 /* Rollback quota allocation */
1310 if (rc && quota_allocation) 1310 if (rc && quota_allocation)
1311 vfs_dq_free_block(ip, quota_allocation); 1311 dquot_free_block(ip, quota_allocation);
1312 1312
1313 dtSplitUp_Exit: 1313 dtSplitUp_Exit:
1314 1314
@@ -1369,9 +1369,10 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1369 return -EIO; 1369 return -EIO;
1370 1370
1371 /* Allocate blocks to quota. */ 1371 /* Allocate blocks to quota. */
1372 if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { 1372 rc = dquot_alloc_block(ip, lengthPXD(pxd));
1373 if (rc) {
1373 release_metapage(rmp); 1374 release_metapage(rmp);
1374 return -EDQUOT; 1375 return rc;
1375 } 1376 }
1376 1377
1377 jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); 1378 jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);
@@ -1892,6 +1893,7 @@ static int dtSplitRoot(tid_t tid,
1892 struct dt_lock *dtlck; 1893 struct dt_lock *dtlck;
1893 struct tlock *tlck; 1894 struct tlock *tlck;
1894 struct lv *lv; 1895 struct lv *lv;
1896 int rc;
1895 1897
1896 /* get split root page */ 1898 /* get split root page */
1897 smp = split->mp; 1899 smp = split->mp;
@@ -1916,9 +1918,10 @@ static int dtSplitRoot(tid_t tid,
1916 rp = rmp->data; 1918 rp = rmp->data;
1917 1919
1918 /* Allocate blocks to quota. */ 1920 /* Allocate blocks to quota. */
1919 if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { 1921 rc = dquot_alloc_block(ip, lengthPXD(pxd));
1922 if (rc) {
1920 release_metapage(rmp); 1923 release_metapage(rmp);
1921 return -EDQUOT; 1924 return rc;
1922 } 1925 }
1923 1926
1924 BT_MARK_DIRTY(rmp, ip); 1927 BT_MARK_DIRTY(rmp, ip);
@@ -2287,7 +2290,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2287 xlen = lengthPXD(&fp->header.self); 2290 xlen = lengthPXD(&fp->header.self);
2288 2291
2289 /* Free quota allocation. */ 2292 /* Free quota allocation. */
2290 vfs_dq_free_block(ip, xlen); 2293 dquot_free_block(ip, xlen);
2291 2294
2292 /* free/invalidate its buffer page */ 2295 /* free/invalidate its buffer page */
2293 discard_metapage(fmp); 2296 discard_metapage(fmp);
@@ -2363,7 +2366,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2363 xlen = lengthPXD(&p->header.self); 2366 xlen = lengthPXD(&p->header.self);
2364 2367
2365 /* Free quota allocation */ 2368 /* Free quota allocation */
2366 vfs_dq_free_block(ip, xlen); 2369 dquot_free_block(ip, xlen);
2367 2370
2368 /* free/invalidate its buffer page */ 2371 /* free/invalidate its buffer page */
2369 discard_metapage(mp); 2372 discard_metapage(mp);
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 41d6045dbeb0..5d3bbd10f8db 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -141,10 +141,11 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
141 } 141 }
142 142
143 /* Allocate blocks to quota. */ 143 /* Allocate blocks to quota. */
144 if (vfs_dq_alloc_block(ip, nxlen)) { 144 rc = dquot_alloc_block(ip, nxlen);
145 if (rc) {
145 dbFree(ip, nxaddr, (s64) nxlen); 146 dbFree(ip, nxaddr, (s64) nxlen);
146 mutex_unlock(&JFS_IP(ip)->commit_mutex); 147 mutex_unlock(&JFS_IP(ip)->commit_mutex);
147 return -EDQUOT; 148 return rc;
148 } 149 }
149 150
150 /* determine the value of the extent flag */ 151 /* determine the value of the extent flag */
@@ -164,7 +165,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
164 */ 165 */
165 if (rc) { 166 if (rc) {
166 dbFree(ip, nxaddr, nxlen); 167 dbFree(ip, nxaddr, nxlen);
167 vfs_dq_free_block(ip, nxlen); 168 dquot_free_block(ip, nxlen);
168 mutex_unlock(&JFS_IP(ip)->commit_mutex); 169 mutex_unlock(&JFS_IP(ip)->commit_mutex);
169 return (rc); 170 return (rc);
170 } 171 }
@@ -256,10 +257,11 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
256 goto exit; 257 goto exit;
257 258
258 /* Allocat blocks to quota. */ 259 /* Allocat blocks to quota. */
259 if (vfs_dq_alloc_block(ip, nxlen)) { 260 rc = dquot_alloc_block(ip, nxlen);
261 if (rc) {
260 dbFree(ip, nxaddr, (s64) nxlen); 262 dbFree(ip, nxaddr, (s64) nxlen);
261 mutex_unlock(&JFS_IP(ip)->commit_mutex); 263 mutex_unlock(&JFS_IP(ip)->commit_mutex);
262 return -EDQUOT; 264 return rc;
263 } 265 }
264 266
265 delta = nxlen - xlen; 267 delta = nxlen - xlen;
@@ -297,7 +299,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
297 /* extend the extent */ 299 /* extend the extent */
298 if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) { 300 if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) {
299 dbFree(ip, xaddr + xlen, delta); 301 dbFree(ip, xaddr + xlen, delta);
300 vfs_dq_free_block(ip, nxlen); 302 dquot_free_block(ip, nxlen);
301 goto exit; 303 goto exit;
302 } 304 }
303 } else { 305 } else {
@@ -308,7 +310,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
308 */ 310 */
309 if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) { 311 if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) {
310 dbFree(ip, nxaddr, nxlen); 312 dbFree(ip, nxaddr, nxlen);
311 vfs_dq_free_block(ip, nxlen); 313 dquot_free_block(ip, nxlen);
312 goto exit; 314 goto exit;
313 } 315 }
314 } 316 }
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 0fc30407f039..f8332dc8eeb2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -45,6 +45,7 @@
45#include <linux/buffer_head.h> 45#include <linux/buffer_head.h>
46#include <linux/pagemap.h> 46#include <linux/pagemap.h>
47#include <linux/quotaops.h> 47#include <linux/quotaops.h>
48#include <linux/slab.h>
48 49
49#include "jfs_incore.h" 50#include "jfs_incore.h"
50#include "jfs_inode.h" 51#include "jfs_inode.h"
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index dc0e02159ac9..829921b67765 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -116,10 +116,10 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
116 /* 116 /*
117 * Allocate inode to quota. 117 * Allocate inode to quota.
118 */ 118 */
119 if (vfs_dq_alloc_inode(inode)) { 119 dquot_initialize(inode);
120 rc = -EDQUOT; 120 rc = dquot_alloc_inode(inode);
121 if (rc)
121 goto fail_drop; 122 goto fail_drop;
122 }
123 123
124 inode->i_mode = mode; 124 inode->i_mode = mode;
125 /* inherit flags from parent */ 125 /* inherit flags from parent */
@@ -162,7 +162,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
162 return inode; 162 return inode;
163 163
164fail_drop: 164fail_drop:
165 vfs_dq_drop(inode); 165 dquot_drop(inode);
166 inode->i_flags |= S_NOQUOTA; 166 inode->i_flags |= S_NOQUOTA;
167fail_unlock: 167fail_unlock:
168 inode->i_nlink = 0; 168 inode->i_nlink = 0;
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 1eff7db34d63..79e2c79661df 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -26,7 +26,7 @@ extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); 26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
27extern struct inode *jfs_iget(struct super_block *, unsigned long); 27extern struct inode *jfs_iget(struct super_block *, unsigned long);
28extern int jfs_commit_inode(struct inode *, int); 28extern int jfs_commit_inode(struct inode *, int);
29extern int jfs_write_inode(struct inode*, int); 29extern int jfs_write_inode(struct inode *, struct writeback_control *);
30extern void jfs_delete_inode(struct inode *); 30extern void jfs_delete_inode(struct inode *);
31extern void jfs_dirty_inode(struct inode *); 31extern void jfs_dirty_inode(struct inode *);
32extern void jfs_truncate(struct inode *); 32extern void jfs_truncate(struct inode *);
@@ -40,6 +40,7 @@ extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid,
40 int fh_len, int fh_type); 40 int fh_len, int fh_type);
41extern void jfs_set_inode_flags(struct inode *); 41extern void jfs_set_inode_flags(struct inode *);
42extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); 42extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
43extern int jfs_setattr(struct dentry *, struct iattr *);
43 44
44extern const struct address_space_operations jfs_aops; 45extern const struct address_space_operations jfs_aops;
45extern const struct inode_operations jfs_dir_inode_operations; 46extern const struct inode_operations jfs_dir_inode_operations;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 335c4de6552d..c51af2a14516 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -70,6 +70,7 @@
70#include <linux/delay.h> 70#include <linux/delay.h>
71#include <linux/mutex.h> 71#include <linux/mutex.h>
72#include <linux/seq_file.h> 72#include <linux/seq_file.h>
73#include <linux/slab.h>
73#include "jfs_incore.h" 74#include "jfs_incore.h"
74#include "jfs_filsys.h" 75#include "jfs_filsys.h"
75#include "jfs_metapage.h" 76#include "jfs_metapage.h"
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 07b6c5dfb4b6..48b44bd8267b 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -21,6 +21,7 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/bio.h> 23#include <linux/bio.h>
24#include <linux/slab.h>
24#include <linux/init.h> 25#include <linux/init.h>
25#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
26#include <linux/mempool.h> 27#include <linux/mempool.h>
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index f26e4d03ada5..d945ea76b445 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1292,7 +1292,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1292 */ 1292 */
1293 /* 1293 /*
1294 * I believe this code is no longer needed. Splitting I_LOCK 1294 * I believe this code is no longer needed. Splitting I_LOCK
1295 * into two bits, I_LOCK and I_SYNC should prevent this 1295 * into two bits, I_NEW and I_SYNC should prevent this
1296 * deadlock as well. But since I don't have a JFS testload 1296 * deadlock as well. But since I don't have a JFS testload
1297 * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done. 1297 * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done.
1298 * Joern 1298 * Joern
diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h
index 3fbb3a225590..8f0f02cb6ca6 100644
--- a/fs/jfs/jfs_unicode.h
+++ b/fs/jfs/jfs_unicode.h
@@ -19,6 +19,7 @@
19#ifndef _H_JFS_UNICODE 19#ifndef _H_JFS_UNICODE
20#define _H_JFS_UNICODE 20#define _H_JFS_UNICODE
21 21
22#include <linux/slab.h>
22#include <asm/byteorder.h> 23#include <asm/byteorder.h>
23#include "jfs_types.h" 24#include "jfs_types.h"
24 25
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index d654a6458648..6c50871e6220 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -585,10 +585,10 @@ int xtInsert(tid_t tid, /* transaction id */
585 hint = addressXAD(xad) + lengthXAD(xad) - 1; 585 hint = addressXAD(xad) + lengthXAD(xad) - 1;
586 } else 586 } else
587 hint = 0; 587 hint = 0;
588 if ((rc = vfs_dq_alloc_block(ip, xlen))) 588 if ((rc = dquot_alloc_block(ip, xlen)))
589 goto out; 589 goto out;
590 if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) { 590 if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) {
591 vfs_dq_free_block(ip, xlen); 591 dquot_free_block(ip, xlen);
592 goto out; 592 goto out;
593 } 593 }
594 } 594 }
@@ -617,7 +617,7 @@ int xtInsert(tid_t tid, /* transaction id */
617 /* undo data extent allocation */ 617 /* undo data extent allocation */
618 if (*xaddrp == 0) { 618 if (*xaddrp == 0) {
619 dbFree(ip, xaddr, (s64) xlen); 619 dbFree(ip, xaddr, (s64) xlen);
620 vfs_dq_free_block(ip, xlen); 620 dquot_free_block(ip, xlen);
621 } 621 }
622 return rc; 622 return rc;
623 } 623 }
@@ -985,10 +985,9 @@ xtSplitPage(tid_t tid, struct inode *ip,
985 rbn = addressPXD(pxd); 985 rbn = addressPXD(pxd);
986 986
987 /* Allocate blocks to quota. */ 987 /* Allocate blocks to quota. */
988 if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { 988 rc = dquot_alloc_block(ip, lengthPXD(pxd));
989 rc = -EDQUOT; 989 if (rc)
990 goto clean_up; 990 goto clean_up;
991 }
992 991
993 quota_allocation += lengthPXD(pxd); 992 quota_allocation += lengthPXD(pxd);
994 993
@@ -1195,7 +1194,7 @@ xtSplitPage(tid_t tid, struct inode *ip,
1195 1194
1196 /* Rollback quota allocation. */ 1195 /* Rollback quota allocation. */
1197 if (quota_allocation) 1196 if (quota_allocation)
1198 vfs_dq_free_block(ip, quota_allocation); 1197 dquot_free_block(ip, quota_allocation);
1199 1198
1200 return (rc); 1199 return (rc);
1201} 1200}
@@ -1235,6 +1234,7 @@ xtSplitRoot(tid_t tid,
1235 struct pxdlist *pxdlist; 1234 struct pxdlist *pxdlist;
1236 struct tlock *tlck; 1235 struct tlock *tlck;
1237 struct xtlock *xtlck; 1236 struct xtlock *xtlck;
1237 int rc;
1238 1238
1239 sp = &JFS_IP(ip)->i_xtroot; 1239 sp = &JFS_IP(ip)->i_xtroot;
1240 1240
@@ -1252,9 +1252,10 @@ xtSplitRoot(tid_t tid,
1252 return -EIO; 1252 return -EIO;
1253 1253
1254 /* Allocate blocks to quota. */ 1254 /* Allocate blocks to quota. */
1255 if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { 1255 rc = dquot_alloc_block(ip, lengthPXD(pxd));
1256 if (rc) {
1256 release_metapage(rmp); 1257 release_metapage(rmp);
1257 return -EDQUOT; 1258 return rc;
1258 } 1259 }
1259 1260
1260 jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp); 1261 jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp);
@@ -3680,7 +3681,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3680 ip->i_size = newsize; 3681 ip->i_size = newsize;
3681 3682
3682 /* update quota allocation to reflect freed blocks */ 3683 /* update quota allocation to reflect freed blocks */
3683 vfs_dq_free_block(ip, nfreed); 3684 dquot_free_block(ip, nfreed);
3684 3685
3685 /* 3686 /*
3686 * free tlock of invalidated pages 3687 * free tlock of invalidated pages
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index c79a4270f083..4a3e9f39c21d 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -85,6 +85,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
85 85
86 jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name); 86 jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name);
87 87
88 dquot_initialize(dip);
89
88 /* 90 /*
89 * search parent directory for entry/freespace 91 * search parent directory for entry/freespace
90 * (dtSearch() returns parent directory page pinned) 92 * (dtSearch() returns parent directory page pinned)
@@ -215,6 +217,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
215 217
216 jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name); 218 jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name);
217 219
220 dquot_initialize(dip);
221
218 /* link count overflow on parent directory ? */ 222 /* link count overflow on parent directory ? */
219 if (dip->i_nlink == JFS_LINK_MAX) { 223 if (dip->i_nlink == JFS_LINK_MAX) {
220 rc = -EMLINK; 224 rc = -EMLINK;
@@ -356,7 +360,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
356 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); 360 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
357 361
358 /* Init inode for quota operations. */ 362 /* Init inode for quota operations. */
359 vfs_dq_init(ip); 363 dquot_initialize(dip);
364 dquot_initialize(ip);
360 365
361 /* directory must be empty to be removed */ 366 /* directory must be empty to be removed */
362 if (!dtEmpty(ip)) { 367 if (!dtEmpty(ip)) {
@@ -483,7 +488,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
483 jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name); 488 jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name);
484 489
485 /* Init inode for quota operations. */ 490 /* Init inode for quota operations. */
486 vfs_dq_init(ip); 491 dquot_initialize(dip);
492 dquot_initialize(ip);
487 493
488 if ((rc = get_UCSname(&dname, dentry))) 494 if ((rc = get_UCSname(&dname, dentry)))
489 goto out; 495 goto out;
@@ -805,6 +811,8 @@ static int jfs_link(struct dentry *old_dentry,
805 if (ip->i_nlink == 0) 811 if (ip->i_nlink == 0)
806 return -ENOENT; 812 return -ENOENT;
807 813
814 dquot_initialize(dir);
815
808 tid = txBegin(ip->i_sb, 0); 816 tid = txBegin(ip->i_sb, 0);
809 817
810 mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); 818 mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT);
@@ -896,6 +904,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
896 904
897 jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name); 905 jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name);
898 906
907 dquot_initialize(dip);
908
899 ssize = strlen(name) + 1; 909 ssize = strlen(name) + 1;
900 910
901 /* 911 /*
@@ -1087,6 +1097,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1087 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, 1097 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
1088 new_dentry->d_name.name); 1098 new_dentry->d_name.name);
1089 1099
1100 dquot_initialize(old_dir);
1101 dquot_initialize(new_dir);
1102
1090 old_ip = old_dentry->d_inode; 1103 old_ip = old_dentry->d_inode;
1091 new_ip = new_dentry->d_inode; 1104 new_ip = new_dentry->d_inode;
1092 1105
@@ -1136,7 +1149,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1136 } else if (new_ip) { 1149 } else if (new_ip) {
1137 IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); 1150 IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL);
1138 /* Init inode for quota operations. */ 1151 /* Init inode for quota operations. */
1139 vfs_dq_init(new_ip); 1152 dquot_initialize(new_ip);
1140 } 1153 }
1141 1154
1142 /* 1155 /*
@@ -1360,6 +1373,8 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1360 1373
1361 jfs_info("jfs_mknod: %s", dentry->d_name.name); 1374 jfs_info("jfs_mknod: %s", dentry->d_name.name);
1362 1375
1376 dquot_initialize(dir);
1377
1363 if ((rc = get_UCSname(&dname, dentry))) 1378 if ((rc = get_UCSname(&dname, dentry)))
1364 goto out; 1379 goto out;
1365 1380
@@ -1541,8 +1556,8 @@ const struct inode_operations jfs_dir_inode_operations = {
1541 .getxattr = jfs_getxattr, 1556 .getxattr = jfs_getxattr,
1542 .listxattr = jfs_listxattr, 1557 .listxattr = jfs_listxattr,
1543 .removexattr = jfs_removexattr, 1558 .removexattr = jfs_removexattr,
1544#ifdef CONFIG_JFS_POSIX_ACL
1545 .setattr = jfs_setattr, 1559 .setattr = jfs_setattr,
1560#ifdef CONFIG_JFS_POSIX_ACL
1546 .check_acl = jfs_check_acl, 1561 .check_acl = jfs_check_acl,
1547#endif 1562#endif
1548}; 1563};
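The namei.c hunks above replace the old vfs_dq_init() call on the victim inode with dquot_initialize() calls on every inode the operation will charge, parent directories included. A hedged sketch of the resulting shape for an unlink-style operation:

	/* Sketch only (not JFS code): the parent loses a dirent and the
	 * victim's link count drops, so both need their dquots set up
	 * before the transaction begins. */
	static int example_unlink(struct inode *dir, struct dentry *dentry)
	{
		struct inode *inode = dentry->d_inode;

		dquot_initialize(dir);
		dquot_initialize(inode);

		/* ... filesystem-specific unlink work ... */
		return 0;
	}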
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 2234c73fc577..157382fa6256 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -30,6 +30,7 @@
30#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
31#include <linux/exportfs.h> 31#include <linux/exportfs.h>
32#include <linux/crc32.h> 32#include <linux/crc32.h>
33#include <linux/slab.h>
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <linux/seq_file.h> 35#include <linux/seq_file.h>
35#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
@@ -131,6 +132,11 @@ static void jfs_destroy_inode(struct inode *inode)
131 kmem_cache_free(jfs_inode_cachep, ji); 132 kmem_cache_free(jfs_inode_cachep, ji);
132} 133}
133 134
135static void jfs_clear_inode(struct inode *inode)
136{
137 dquot_drop(inode);
138}
139
134static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) 140static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
135{ 141{
136 struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb); 142 struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb);
@@ -524,7 +530,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
524 * Page cache is indexed by long. 530 * Page cache is indexed by long.
525 * I would use MAX_LFS_FILESIZE, but it's only half as big 531 * I would use MAX_LFS_FILESIZE, but it's only half as big
526 */ 532 */
527 sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, sb->s_maxbytes); 533 sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, (u64)sb->s_maxbytes);
528#endif 534#endif
529 sb->s_time_gran = 1; 535 sb->s_time_gran = 1;
530 return 0; 536 return 0;
@@ -745,6 +751,7 @@ static const struct super_operations jfs_super_operations = {
745 .dirty_inode = jfs_dirty_inode, 751 .dirty_inode = jfs_dirty_inode,
746 .write_inode = jfs_write_inode, 752 .write_inode = jfs_write_inode,
747 .delete_inode = jfs_delete_inode, 753 .delete_inode = jfs_delete_inode,
754 .clear_inode = jfs_clear_inode,
748 .put_super = jfs_put_super, 755 .put_super = jfs_put_super,
749 .sync_fs = jfs_sync_fs, 756 .sync_fs = jfs_sync_fs,
750 .freeze_fs = jfs_freeze, 757 .freeze_fs = jfs_freeze,
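With quota handling moved into the filesystem, the super.c hunk adds a ->clear_inode hook so dquot references are released when an inode is evicted from the inode cache. The wiring, reduced to a sketch with hypothetical names:

	static void example_clear_inode(struct inode *inode)
	{
		dquot_drop(inode);	/* release the inode's dquot references */
	}

	static const struct super_operations example_super_ops = {
		/* ... */
		.clear_inode	= example_clear_inode,
	};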
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index fad364548bc9..fa96bbb26343 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -21,6 +21,7 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/xattr.h> 22#include <linux/xattr.h>
23#include <linux/posix_acl_xattr.h> 23#include <linux/posix_acl_xattr.h>
24#include <linux/slab.h>
24#include <linux/quotaops.h> 25#include <linux/quotaops.h>
25#include <linux/security.h> 26#include <linux/security.h>
26#include "jfs_incore.h" 27#include "jfs_incore.h"
@@ -260,14 +261,14 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
260 nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits; 261 nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits;
261 262
262 /* Allocate new blocks to quota. */ 263 /* Allocate new blocks to quota. */
263 if (vfs_dq_alloc_block(ip, nblocks)) { 264 rc = dquot_alloc_block(ip, nblocks);
264 return -EDQUOT; 265 if (rc)
265 } 266 return rc;
266 267
267 rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno); 268 rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno);
268 if (rc) { 269 if (rc) {
269 /*Rollback quota allocation. */ 270 /*Rollback quota allocation. */
270 vfs_dq_free_block(ip, nblocks); 271 dquot_free_block(ip, nblocks);
271 return rc; 272 return rc;
272 } 273 }
273 274
@@ -332,7 +333,7 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
332 333
333 failed: 334 failed:
334 /* Rollback quota allocation. */ 335 /* Rollback quota allocation. */
335 vfs_dq_free_block(ip, nblocks); 336 dquot_free_block(ip, nblocks);
336 337
337 dbFree(ip, blkno, nblocks); 338 dbFree(ip, blkno, nblocks);
338 return rc; 339 return rc;
@@ -538,7 +539,8 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
538 539
539 if (blocks_needed > current_blocks) { 540 if (blocks_needed > current_blocks) {
540 /* Allocate new blocks to quota. */ 541 /* Allocate new blocks to quota. */
541 if (vfs_dq_alloc_block(inode, blocks_needed)) 542 rc = dquot_alloc_block(inode, blocks_needed);
543 if (rc)
542 return -EDQUOT; 544 return -EDQUOT;
543 545
544 quota_allocation = blocks_needed; 546 quota_allocation = blocks_needed;
@@ -602,7 +604,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
602 clean_up: 604 clean_up:
603 /* Rollback quota allocation */ 605 /* Rollback quota allocation */
604 if (quota_allocation) 606 if (quota_allocation)
605 vfs_dq_free_block(inode, quota_allocation); 607 dquot_free_block(inode, quota_allocation);
606 608
607 return (rc); 609 return (rc);
608} 610}
@@ -677,7 +679,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf,
677 679
678 /* If old blocks exist, they must be removed from quota allocation. */ 680 /* If old blocks exist, they must be removed from quota allocation. */
679 if (old_blocks) 681 if (old_blocks)
680 vfs_dq_free_block(inode, old_blocks); 682 dquot_free_block(inode, old_blocks);
681 683
682 inode->i_ctime = CURRENT_TIME; 684 inode->i_ctime = CURRENT_TIME;
683 685
diff --git a/fs/libfs.c b/fs/libfs.c
index 219576c52d80..ea9a6cc9b35c 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -5,6 +5,7 @@
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/slab.h>
8#include <linux/mount.h> 9#include <linux/mount.h>
9#include <linux/vfs.h> 10#include <linux/vfs.h>
10#include <linux/mutex.h> 11#include <linux/mutex.h>
@@ -338,28 +339,14 @@ int simple_readpage(struct file *file, struct page *page)
338 return 0; 339 return 0;
339} 340}
340 341
341int simple_prepare_write(struct file *file, struct page *page,
342 unsigned from, unsigned to)
343{
344 if (!PageUptodate(page)) {
345 if (to - from != PAGE_CACHE_SIZE)
346 zero_user_segments(page,
347 0, from,
348 to, PAGE_CACHE_SIZE);
349 }
350 return 0;
351}
352
353int simple_write_begin(struct file *file, struct address_space *mapping, 342int simple_write_begin(struct file *file, struct address_space *mapping,
354 loff_t pos, unsigned len, unsigned flags, 343 loff_t pos, unsigned len, unsigned flags,
355 struct page **pagep, void **fsdata) 344 struct page **pagep, void **fsdata)
356{ 345{
357 struct page *page; 346 struct page *page;
358 pgoff_t index; 347 pgoff_t index;
359 unsigned from;
360 348
361 index = pos >> PAGE_CACHE_SHIFT; 349 index = pos >> PAGE_CACHE_SHIFT;
362 from = pos & (PAGE_CACHE_SIZE - 1);
363 350
364 page = grab_cache_page_write_begin(mapping, index, flags); 351 page = grab_cache_page_write_begin(mapping, index, flags);
365 if (!page) 352 if (!page)
@@ -367,43 +354,59 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
367 354
368 *pagep = page; 355 *pagep = page;
369 356
370 return simple_prepare_write(file, page, from, from+len); 357 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
371} 358 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
372 359
373static int simple_commit_write(struct file *file, struct page *page, 360 zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE);
374 unsigned from, unsigned to) 361 }
375{
376 struct inode *inode = page->mapping->host;
377 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
378
379 if (!PageUptodate(page))
380 SetPageUptodate(page);
381 /*
382 * No need to use i_size_read() here, the i_size
383 * cannot change under us because we hold the i_mutex.
384 */
385 if (pos > inode->i_size)
386 i_size_write(inode, pos);
387 set_page_dirty(page);
388 return 0; 362 return 0;
389} 363}
390 364
365/**
366 * simple_write_end - .write_end helper for non-block-device FSes
367 * @available: See .write_end of address_space_operations
368 * @file: "
369 * @mapping: "
370 * @pos: "
371 * @len: "
372 * @copied: "
373 * @page: "
374 * @fsdata: "
375 *
376 * simple_write_end does the minimum needed for updating a page after writing is
377 * done. It has the same API signature as the .write_end of
378 * address_space_operations vector. So it can just be set onto .write_end for
379 * FSes that don't need any other processing. i_mutex is assumed to be held.
380 * Block based filesystems should use generic_write_end().
381 * NOTE: Even though i_size might get updated by this function, mark_inode_dirty
382 * is not called, so a filesystem that actually does store data in .write_inode
383 * should extend on what's done here with a call to mark_inode_dirty() in the
384 * case that i_size has changed.
385 */
391int simple_write_end(struct file *file, struct address_space *mapping, 386int simple_write_end(struct file *file, struct address_space *mapping,
392 loff_t pos, unsigned len, unsigned copied, 387 loff_t pos, unsigned len, unsigned copied,
393 struct page *page, void *fsdata) 388 struct page *page, void *fsdata)
394{ 389{
395 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 390 struct inode *inode = page->mapping->host;
391 loff_t last_pos = pos + copied;
396 392
397 /* zero the stale part of the page if we did a short copy */ 393 /* zero the stale part of the page if we did a short copy */
398 if (copied < len) { 394 if (copied < len) {
399 void *kaddr = kmap_atomic(page, KM_USER0); 395 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
400 memset(kaddr + from + copied, 0, len - copied); 396
401 flush_dcache_page(page); 397 zero_user(page, from + copied, len - copied);
402 kunmap_atomic(kaddr, KM_USER0);
403 } 398 }
404 399
405 simple_commit_write(file, page, from, from+copied); 400 if (!PageUptodate(page))
401 SetPageUptodate(page);
402 /*
403 * No need to use i_size_read() here, the i_size
404 * cannot change under us because we hold the i_mutex.
405 */
406 if (last_pos > inode->i_size)
407 i_size_write(inode, last_pos);
406 408
409 set_page_dirty(page);
407 unlock_page(page); 410 unlock_page(page);
408 page_cache_release(page); 411 page_cache_release(page);
409 412
@@ -848,13 +851,11 @@ EXPORT_SYMBOL(simple_write_end);
848EXPORT_SYMBOL(simple_dir_inode_operations); 851EXPORT_SYMBOL(simple_dir_inode_operations);
849EXPORT_SYMBOL(simple_dir_operations); 852EXPORT_SYMBOL(simple_dir_operations);
850EXPORT_SYMBOL(simple_empty); 853EXPORT_SYMBOL(simple_empty);
851EXPORT_SYMBOL(d_alloc_name);
852EXPORT_SYMBOL(simple_fill_super); 854EXPORT_SYMBOL(simple_fill_super);
853EXPORT_SYMBOL(simple_getattr); 855EXPORT_SYMBOL(simple_getattr);
854EXPORT_SYMBOL(simple_link); 856EXPORT_SYMBOL(simple_link);
855EXPORT_SYMBOL(simple_lookup); 857EXPORT_SYMBOL(simple_lookup);
856EXPORT_SYMBOL(simple_pin_fs); 858EXPORT_SYMBOL(simple_pin_fs);
857EXPORT_UNUSED_SYMBOL(simple_prepare_write);
858EXPORT_SYMBOL(simple_readpage); 859EXPORT_SYMBOL(simple_readpage);
859EXPORT_SYMBOL(simple_release_fs); 860EXPORT_SYMBOL(simple_release_fs);
860EXPORT_SYMBOL(simple_rename); 861EXPORT_SYMBOL(simple_rename);
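After this cleanup, a small non-block-backed filesystem needs only the two surviving helpers for its write path; block-based filesystems should pair write_begin with generic_write_end() instead, per the kernel-doc above. A minimal sketch of the wiring (hypothetical aops name, ramfs-style):

	static const struct address_space_operations example_aops = {
		.readpage	= simple_readpage,
		.write_begin	= simple_write_begin,
		.write_end	= simple_write_end,
	};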
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index fc9032dc8862..64fd427c993c 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/slab.h>
11#include <linux/time.h> 12#include <linux/time.h>
12#include <linux/nfs_fs.h> 13#include <linux/nfs_fs.h>
13#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index c81249fef11f..7932c399fab4 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/smp_lock.h> 10#include <linux/smp_lock.h>
11#include <linux/slab.h>
11#include <linux/types.h> 12#include <linux/types.h>
12#include <linux/errno.h> 13#include <linux/errno.h>
13#include <linux/fs.h> 14#include <linux/fs.h>
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 4600c2037b8b..bb464d12104c 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -479,8 +479,8 @@ again: mutex_lock(&nlm_host_mutex);
479 } 479 }
480 } 480 }
481 } 481 }
482
483 mutex_unlock(&nlm_host_mutex); 482 mutex_unlock(&nlm_host_mutex);
483 nsm_release(nsm);
484} 484}
485 485
486/* 486/*
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index f956651d0f65..e3015464fbab 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -10,6 +10,7 @@
10#include <linux/utsname.h> 10#include <linux/utsname.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/ktime.h> 12#include <linux/ktime.h>
13#include <linux/slab.h>
13 14
14#include <linux/sunrpc/clnt.h> 15#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/xprtsock.h> 16#include <linux/sunrpc/xprtsock.h>
@@ -349,9 +350,9 @@ retry:
349 * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle 350 * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
350 * @info: pointer to NLMPROC_SM_NOTIFY arguments 351 * @info: pointer to NLMPROC_SM_NOTIFY arguments
351 * 352 *
352 * Returns a matching nsm_handle if found in the nsm cache; the returned 353 * Returns a matching nsm_handle if found in the nsm cache. The returned
353 * nsm_handle's reference count is bumped and sm_monitored is cleared. 354 * nsm_handle's reference count is bumped. Otherwise returns NULL if some
354 * Otherwise returns NULL if some error occurred. 355 * error occurred.
355 */ 356 */
356struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info) 357struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
357{ 358{
@@ -370,12 +371,6 @@ struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
370 atomic_inc(&cached->sm_count); 371 atomic_inc(&cached->sm_count);
371 spin_unlock(&nsm_lock); 372 spin_unlock(&nsm_lock);
372 373
373 /*
374 * During subsequent lock activity, force a fresh
375 * notification to be set up for this host.
376 */
377 cached->sm_monitored = 0;
378
379 dprintk("lockd: host %s (%s) rebooted, cnt %d\n", 374 dprintk("lockd: host %s (%s) rebooted, cnt %d\n",
380 cached->sm_name, cached->sm_addrbuf, 375 cached->sm_name, cached->sm_addrbuf,
381 atomic_read(&cached->sm_count)); 376 atomic_read(&cached->sm_count));
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 1a54ae14a192..f1bacf1a0391 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -21,7 +21,6 @@
21#include <linux/errno.h> 21#include <linux/errno.h>
22#include <linux/in.h> 22#include <linux/in.h>
23#include <linux/uio.h> 23#include <linux/uio.h>
24#include <linux/slab.h>
25#include <linux/smp.h> 24#include <linux/smp.h>
26#include <linux/smp_lock.h> 25#include <linux/smp_lock.h>
27#include <linux/mutex.h> 26#include <linux/mutex.h>
@@ -243,11 +242,9 @@ static int make_socks(struct svc_serv *serv)
243 if (err < 0) 242 if (err < 0)
244 goto out_err; 243 goto out_err;
245 244
246#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
247 err = create_lockd_family(serv, PF_INET6); 245 err = create_lockd_family(serv, PF_INET6);
248 if (err < 0 && err != -EAFNOSUPPORT) 246 if (err < 0 && err != -EAFNOSUPPORT)
249 goto out_err; 247 goto out_err;
250#endif /* CONFIG_IPV6 || CONFIG_IPV6_MODULE */
251 248
252 warned = 0; 249 warned = 0;
253 return 0; 250 return 0;
@@ -371,82 +368,74 @@ EXPORT_SYMBOL_GPL(lockd_down);
371 368
372static ctl_table nlm_sysctls[] = { 369static ctl_table nlm_sysctls[] = {
373 { 370 {
374 .ctl_name = CTL_UNNUMBERED,
375 .procname = "nlm_grace_period", 371 .procname = "nlm_grace_period",
376 .data = &nlm_grace_period, 372 .data = &nlm_grace_period,
377 .maxlen = sizeof(unsigned long), 373 .maxlen = sizeof(unsigned long),
378 .mode = 0644, 374 .mode = 0644,
379 .proc_handler = &proc_doulongvec_minmax, 375 .proc_handler = proc_doulongvec_minmax,
380 .extra1 = (unsigned long *) &nlm_grace_period_min, 376 .extra1 = (unsigned long *) &nlm_grace_period_min,
381 .extra2 = (unsigned long *) &nlm_grace_period_max, 377 .extra2 = (unsigned long *) &nlm_grace_period_max,
382 }, 378 },
383 { 379 {
384 .ctl_name = CTL_UNNUMBERED,
385 .procname = "nlm_timeout", 380 .procname = "nlm_timeout",
386 .data = &nlm_timeout, 381 .data = &nlm_timeout,
387 .maxlen = sizeof(unsigned long), 382 .maxlen = sizeof(unsigned long),
388 .mode = 0644, 383 .mode = 0644,
389 .proc_handler = &proc_doulongvec_minmax, 384 .proc_handler = proc_doulongvec_minmax,
390 .extra1 = (unsigned long *) &nlm_timeout_min, 385 .extra1 = (unsigned long *) &nlm_timeout_min,
391 .extra2 = (unsigned long *) &nlm_timeout_max, 386 .extra2 = (unsigned long *) &nlm_timeout_max,
392 }, 387 },
393 { 388 {
394 .ctl_name = CTL_UNNUMBERED,
395 .procname = "nlm_udpport", 389 .procname = "nlm_udpport",
396 .data = &nlm_udpport, 390 .data = &nlm_udpport,
397 .maxlen = sizeof(int), 391 .maxlen = sizeof(int),
398 .mode = 0644, 392 .mode = 0644,
399 .proc_handler = &proc_dointvec_minmax, 393 .proc_handler = proc_dointvec_minmax,
400 .extra1 = (int *) &nlm_port_min, 394 .extra1 = (int *) &nlm_port_min,
401 .extra2 = (int *) &nlm_port_max, 395 .extra2 = (int *) &nlm_port_max,
402 }, 396 },
403 { 397 {
404 .ctl_name = CTL_UNNUMBERED,
405 .procname = "nlm_tcpport", 398 .procname = "nlm_tcpport",
406 .data = &nlm_tcpport, 399 .data = &nlm_tcpport,
407 .maxlen = sizeof(int), 400 .maxlen = sizeof(int),
408 .mode = 0644, 401 .mode = 0644,
409 .proc_handler = &proc_dointvec_minmax, 402 .proc_handler = proc_dointvec_minmax,
410 .extra1 = (int *) &nlm_port_min, 403 .extra1 = (int *) &nlm_port_min,
411 .extra2 = (int *) &nlm_port_max, 404 .extra2 = (int *) &nlm_port_max,
412 }, 405 },
413 { 406 {
414 .ctl_name = CTL_UNNUMBERED,
415 .procname = "nsm_use_hostnames", 407 .procname = "nsm_use_hostnames",
416 .data = &nsm_use_hostnames, 408 .data = &nsm_use_hostnames,
417 .maxlen = sizeof(int), 409 .maxlen = sizeof(int),
418 .mode = 0644, 410 .mode = 0644,
419 .proc_handler = &proc_dointvec, 411 .proc_handler = proc_dointvec,
420 }, 412 },
421 { 413 {
422 .ctl_name = CTL_UNNUMBERED,
423 .procname = "nsm_local_state", 414 .procname = "nsm_local_state",
424 .data = &nsm_local_state, 415 .data = &nsm_local_state,
425 .maxlen = sizeof(int), 416 .maxlen = sizeof(int),
426 .mode = 0644, 417 .mode = 0644,
427 .proc_handler = &proc_dointvec, 418 .proc_handler = proc_dointvec,
428 }, 419 },
429 { .ctl_name = 0 } 420 { }
430}; 421};
431 422
432static ctl_table nlm_sysctl_dir[] = { 423static ctl_table nlm_sysctl_dir[] = {
433 { 424 {
434 .ctl_name = CTL_UNNUMBERED,
435 .procname = "nfs", 425 .procname = "nfs",
436 .mode = 0555, 426 .mode = 0555,
437 .child = nlm_sysctls, 427 .child = nlm_sysctls,
438 }, 428 },
439 { .ctl_name = 0 } 429 { }
440}; 430};
441 431
442static ctl_table nlm_sysctl_root[] = { 432static ctl_table nlm_sysctl_root[] = {
443 { 433 {
444 .ctl_name = CTL_FS,
445 .procname = "fs", 434 .procname = "fs",
446 .mode = 0555, 435 .mode = 0555,
447 .child = nlm_sysctl_dir, 436 .child = nlm_sysctl_dir,
448 }, 437 },
449 { .ctl_name = 0 } 438 { }
450}; 439};
451 440
452#endif /* CONFIG_SYSCTL */ 441#endif /* CONFIG_SYSCTL */
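The sysctl hunk follows the tree-wide removal of binary sysctl numbers: every .ctl_name/CTL_UNNUMBERED field goes away, .proc_handler takes the function itself rather than its address, and the table terminator shrinks to an empty entry. A sketch of a post-conversion table, with hypothetical names:

	static int example_knob;

	static ctl_table example_sysctls[] = {
		{
			.procname	= "example_knob",
			.data		= &example_knob,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec,
		},
		{ }	/* empty terminator; no .ctl_name sentinel */
	};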
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index bd173a6ca3b1..031c6569a134 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -9,12 +9,7 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/slab.h>
13#include <linux/smp_lock.h> 12#include <linux/smp_lock.h>
14#include <linux/in.h>
15#include <linux/sunrpc/svc.h>
16#include <linux/sunrpc/clnt.h>
17#include <linux/nfsd/nfsd.h>
18#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
19#include <linux/lockd/share.h> 14#include <linux/lockd/share.h>
20 15
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index d1001790fa9a..84055d31bfc5 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/types.h> 23#include <linux/types.h>
24#include <linux/slab.h>
24#include <linux/errno.h> 25#include <linux/errno.h>
25#include <linux/kernel.h> 26#include <linux/kernel.h>
26#include <linux/sched.h> 27#include <linux/sched.h>
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index e1d28ddd2169..0f2ab741ae7c 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -9,12 +9,7 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/slab.h>
13#include <linux/smp_lock.h> 12#include <linux/smp_lock.h>
14#include <linux/in.h>
15#include <linux/sunrpc/svc.h>
16#include <linux/sunrpc/clnt.h>
17#include <linux/nfsd/nfsd.h>
18#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
19#include <linux/lockd/share.h> 14#include <linux/lockd/share.h>
20 15
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index ad478da7ca63..d0ef94cfb3da 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -10,6 +10,7 @@
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/in.h> 12#include <linux/in.h>
13#include <linux/slab.h>
13#include <linux/mutex.h> 14#include <linux/mutex.h>
14#include <linux/sunrpc/svc.h> 15#include <linux/sunrpc/svc.h>
15#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
diff --git a/fs/locks.c b/fs/locks.c
index a8794f233bc9..ab24d49fc048 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1182,8 +1182,9 @@ int __break_lease(struct inode *inode, unsigned int mode)
1182 struct file_lock *fl; 1182 struct file_lock *fl;
1183 unsigned long break_time; 1183 unsigned long break_time;
1184 int i_have_this_lease = 0; 1184 int i_have_this_lease = 0;
1185 int want_write = (mode & O_ACCMODE) != O_RDONLY;
1185 1186
1186 new_fl = lease_alloc(NULL, mode & FMODE_WRITE ? F_WRLCK : F_RDLCK); 1187 new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
1187 1188
1188 lock_kernel(); 1189 lock_kernel();
1189 1190
@@ -1197,7 +1198,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
1197 if (fl->fl_owner == current->files) 1198 if (fl->fl_owner == current->files)
1198 i_have_this_lease = 1; 1199 i_have_this_lease = 1;
1199 1200
1200 if (mode & FMODE_WRITE) { 1201 if (want_write) {
1201 /* If we want write access, we have to revoke any lease. */ 1202 /* If we want write access, we have to revoke any lease. */
1202 future = F_UNLCK | F_INPROGRESS; 1203 future = F_UNLCK | F_INPROGRESS;
1203 } else if (flock->fl_type & F_INPROGRESS) { 1204 } else if (flock->fl_type & F_INPROGRESS) {
@@ -1454,7 +1455,7 @@ EXPORT_SYMBOL(generic_setlease);
1454 * leases held by processes on this node. 1455 * leases held by processes on this node.
1455 * 1456 *
1456 * There is also no break_lease method; filesystems that 1457 * There is also no break_lease method; filesystems that
1457 * handle their own leases shoud break leases themselves from the 1458 * handle their own leases should break leases themselves from the
1458 * filesystem's open, create, and (on truncate) setattr methods. 1459 * filesystem's open, create, and (on truncate) setattr methods.
1459 * 1460 *
1460 * Warning: the only current setlease methods exist only to disable 1461 * Warning: the only current setlease methods exist only to disable
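The locks.c fix turns on which namespace `mode` belongs to: __break_lease() receives O_* open flags, where O_WRONLY is 1 and O_RDWR is 2, while FMODE_WRITE also happens to be 2, so the old `mode & FMODE_WRITE` test caught O_RDWR but silently missed O_WRONLY opens. Deriving the intent from the access mode covers both, as the sketch below shows:

	/* O_RDONLY == 0, O_WRONLY == 1, O_RDWR == 2: any access mode other
	 * than O_RDONLY means the opener wants write access. */
	int want_write = (mode & O_ACCMODE) != O_RDONLY;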
diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig
new file mode 100644
index 000000000000..daf9a9b32dd3
--- /dev/null
+++ b/fs/logfs/Kconfig
@@ -0,0 +1,17 @@
1config LOGFS
2 tristate "LogFS file system (EXPERIMENTAL)"
3 depends on (MTD || BLOCK) && EXPERIMENTAL
4 select ZLIB_INFLATE
5 select ZLIB_DEFLATE
6 select CRC32
7 select BTREE
8 help
9 Flash filesystem aimed to scale efficiently to large devices.
10 In comparison to JFFS2 it offers significantly faster mount
11 times and potentially less RAM usage, although the latter has
12 not been measured yet.
13
14 In its current state it is still very experimental and should
15 not be used for other than testing purposes.
16
17 If unsure, say N.
diff --git a/fs/logfs/Makefile b/fs/logfs/Makefile
new file mode 100644
index 000000000000..4820027787ee
--- /dev/null
+++ b/fs/logfs/Makefile
@@ -0,0 +1,13 @@
1obj-$(CONFIG_LOGFS) += logfs.o
2
3logfs-y += compr.o
4logfs-y += dir.o
5logfs-y += file.o
6logfs-y += gc.o
7logfs-y += inode.o
8logfs-y += journal.o
9logfs-y += readwrite.o
10logfs-y += segment.o
11logfs-y += super.o
12logfs-$(CONFIG_BLOCK) += dev_bdev.o
13logfs-$(CONFIG_MTD) += dev_mtd.o
diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c
new file mode 100644
index 000000000000..44bbfd249abc
--- /dev/null
+++ b/fs/logfs/compr.c
@@ -0,0 +1,95 @@
1/*
2 * fs/logfs/compr.c - compression routines
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/vmalloc.h>
10#include <linux/zlib.h>
11
12#define COMPR_LEVEL 3
13
14static DEFINE_MUTEX(compr_mutex);
15static struct z_stream_s stream;
16
17int logfs_compress(void *in, void *out, size_t inlen, size_t outlen)
18{
19 int err, ret;
20
21 ret = -EIO;
22 mutex_lock(&compr_mutex);
23 err = zlib_deflateInit(&stream, COMPR_LEVEL);
24 if (err != Z_OK)
25 goto error;
26
27 stream.next_in = in;
28 stream.avail_in = inlen;
29 stream.total_in = 0;
30 stream.next_out = out;
31 stream.avail_out = outlen;
32 stream.total_out = 0;
33
34 err = zlib_deflate(&stream, Z_FINISH);
35 if (err != Z_STREAM_END)
36 goto error;
37
38 err = zlib_deflateEnd(&stream);
39 if (err != Z_OK)
40 goto error;
41
42 if (stream.total_out >= stream.total_in)
43 goto error;
44
45 ret = stream.total_out;
46error:
47 mutex_unlock(&compr_mutex);
48 return ret;
49}
50
51int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen)
52{
53 int err, ret;
54
55 ret = -EIO;
56 mutex_lock(&compr_mutex);
57 err = zlib_inflateInit(&stream);
58 if (err != Z_OK)
59 goto error;
60
61 stream.next_in = in;
62 stream.avail_in = inlen;
63 stream.total_in = 0;
64 stream.next_out = out;
65 stream.avail_out = outlen;
66 stream.total_out = 0;
67
68 err = zlib_inflate(&stream, Z_FINISH);
69 if (err != Z_STREAM_END)
70 goto error;
71
72 err = zlib_inflateEnd(&stream);
73 if (err != Z_OK)
74 goto error;
75
76 ret = 0;
77error:
78 mutex_unlock(&compr_mutex);
79 return ret;
80}
81
82int __init logfs_compr_init(void)
83{
84 size_t size = max(zlib_deflate_workspacesize(),
85 zlib_inflate_workspacesize());
86 stream.workspace = vmalloc(size);
87 if (!stream.workspace)
88 return -ENOMEM;
89 return 0;
90}
91
92void logfs_compr_exit(void)
93{
94 vfree(stream.workspace);
95}
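Note that logfs_compress() above deliberately reports failure (-EIO) when the output would not be smaller than the input, which gives callers a natural fallback to storing the block raw. A hypothetical usage sketch, with write_raw() and write_compressed() standing in for the real store paths:

	static int store_block(void *in, void *out)	/* two PAGE_SIZE buffers */
	{
		int clen = logfs_compress(in, out, PAGE_SIZE, PAGE_SIZE);

		if (clen < 0)
			return write_raw(in, PAGE_SIZE);	/* hypothetical */
		/* clen bytes of compressed data; the inverse is
		 * logfs_uncompress(out, in, clen, PAGE_SIZE) */
		return write_compressed(out, clen);		/* hypothetical */
	}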
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
new file mode 100644
index 000000000000..243c00071f76
--- /dev/null
+++ b/fs/logfs/dev_bdev.c
@@ -0,0 +1,333 @@
1/*
2 * fs/logfs/dev_bdev.c - Device access methods for block devices
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/bio.h>
10#include <linux/blkdev.h>
11#include <linux/buffer_head.h>
12#include <linux/gfp.h>
13
14#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
15
16static void request_complete(struct bio *bio, int err)
17{
18 complete((struct completion *)bio->bi_private);
19}
20
21static int sync_request(struct page *page, struct block_device *bdev, int rw)
22{
23 struct bio bio;
24 struct bio_vec bio_vec;
25 struct completion complete;
26
27 bio_init(&bio);
28 bio.bi_io_vec = &bio_vec;
29 bio_vec.bv_page = page;
30 bio_vec.bv_len = PAGE_SIZE;
31 bio_vec.bv_offset = 0;
32 bio.bi_vcnt = 1;
33 bio.bi_idx = 0;
34 bio.bi_size = PAGE_SIZE;
35 bio.bi_bdev = bdev;
36 bio.bi_sector = page->index * (PAGE_SIZE >> 9);
37 init_completion(&complete);
38 bio.bi_private = &complete;
39 bio.bi_end_io = request_complete;
40
41 submit_bio(rw, &bio);
42 generic_unplug_device(bdev_get_queue(bdev));
43 wait_for_completion(&complete);
44 return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
45}
46
47static int bdev_readpage(void *_sb, struct page *page)
48{
49 struct super_block *sb = _sb;
50 struct block_device *bdev = logfs_super(sb)->s_bdev;
51 int err;
52
53 err = sync_request(page, bdev, READ);
54 if (err) {
55 ClearPageUptodate(page);
56 SetPageError(page);
57 } else {
58 SetPageUptodate(page);
59 ClearPageError(page);
60 }
61 unlock_page(page);
62 return err;
63}
64
65static DECLARE_WAIT_QUEUE_HEAD(wq);
66
67static void writeseg_end_io(struct bio *bio, int err)
68{
69 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
70 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
71 struct super_block *sb = bio->bi_private;
72 struct logfs_super *super = logfs_super(sb);
73 struct page *page;
74
75 BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
76 BUG_ON(err);
77 BUG_ON(bio->bi_vcnt == 0);
78 do {
79 page = bvec->bv_page;
80 if (--bvec >= bio->bi_io_vec)
81 prefetchw(&bvec->bv_page->flags);
82
83 end_page_writeback(page);
84 page_cache_release(page);
85 } while (bvec >= bio->bi_io_vec);
86 bio_put(bio);
87 if (atomic_dec_and_test(&super->s_pending_writes))
88 wake_up(&wq);
89}
90
91static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
92 size_t nr_pages)
93{
94 struct logfs_super *super = logfs_super(sb);
95 struct address_space *mapping = super->s_mapping_inode->i_mapping;
96 struct bio *bio;
97 struct page *page;
98 struct request_queue *q = bdev_get_queue(sb->s_bdev);
99 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
100 int i;
101
102 if (max_pages > BIO_MAX_PAGES)
103 max_pages = BIO_MAX_PAGES;
104 bio = bio_alloc(GFP_NOFS, max_pages);
105 BUG_ON(!bio);
106
107 for (i = 0; i < nr_pages; i++) {
108 if (i >= max_pages) {
109 /* Block layer cannot split bios :( */
110 bio->bi_vcnt = i;
111 bio->bi_idx = 0;
112 bio->bi_size = i * PAGE_SIZE;
113 bio->bi_bdev = super->s_bdev;
114 bio->bi_sector = ofs >> 9;
115 bio->bi_private = sb;
116 bio->bi_end_io = writeseg_end_io;
117 atomic_inc(&super->s_pending_writes);
118 submit_bio(WRITE, bio);
119
120 ofs += i * PAGE_SIZE;
121 index += i;
122 nr_pages -= i;
123 i = 0;
124
125 bio = bio_alloc(GFP_NOFS, max_pages);
126 BUG_ON(!bio);
127 }
128 page = find_lock_page(mapping, index + i);
129 BUG_ON(!page);
130 bio->bi_io_vec[i].bv_page = page;
131 bio->bi_io_vec[i].bv_len = PAGE_SIZE;
132 bio->bi_io_vec[i].bv_offset = 0;
133
134 BUG_ON(PageWriteback(page));
135 set_page_writeback(page);
136 unlock_page(page);
137 }
138 bio->bi_vcnt = nr_pages;
139 bio->bi_idx = 0;
140 bio->bi_size = nr_pages * PAGE_SIZE;
141 bio->bi_bdev = super->s_bdev;
142 bio->bi_sector = ofs >> 9;
143 bio->bi_private = sb;
144 bio->bi_end_io = writeseg_end_io;
145 atomic_inc(&super->s_pending_writes);
146 submit_bio(WRITE, bio);
147 return 0;
148}
149
150static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
151{
152 struct logfs_super *super = logfs_super(sb);
153 int head;
154
155 BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO);
156
157 if (len == 0) {
158 /* This can happen when the object fits perfectly into a
159 * segment, the segment gets written per sync and subsequently
160 * closed.
161 */
162 return;
163 }
164 head = ofs & (PAGE_SIZE - 1);
165 if (head) {
166 ofs -= head;
167 len += head;
168 }
169 len = PAGE_ALIGN(len);
170 __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
171 generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev));
172}
173
174
175static void erase_end_io(struct bio *bio, int err)
176{
177 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
178 struct super_block *sb = bio->bi_private;
179 struct logfs_super *super = logfs_super(sb);
180
181 BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
182 BUG_ON(err);
183 BUG_ON(bio->bi_vcnt == 0);
184 bio_put(bio);
185 if (atomic_dec_and_test(&super->s_pending_writes))
186 wake_up(&wq);
187}
188
189static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
190 size_t nr_pages)
191{
192 struct logfs_super *super = logfs_super(sb);
193 struct bio *bio;
194 struct request_queue *q = bdev_get_queue(sb->s_bdev);
195 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
196 int i;
197
198 if (max_pages > BIO_MAX_PAGES)
199 max_pages = BIO_MAX_PAGES;
200 bio = bio_alloc(GFP_NOFS, max_pages);
201 BUG_ON(!bio);
202
203 for (i = 0; i < nr_pages; i++) {
204 if (i >= max_pages) {
205 /* Block layer cannot split bios :( */
206 bio->bi_vcnt = i;
207 bio->bi_idx = 0;
208 bio->bi_size = i * PAGE_SIZE;
209 bio->bi_bdev = super->s_bdev;
210 bio->bi_sector = ofs >> 9;
211 bio->bi_private = sb;
212 bio->bi_end_io = erase_end_io;
213 atomic_inc(&super->s_pending_writes);
214 submit_bio(WRITE, bio);
215
216 ofs += i * PAGE_SIZE;
217 index += i;
218 nr_pages -= i;
219 i = 0;
220
221 bio = bio_alloc(GFP_NOFS, max_pages);
222 BUG_ON(!bio);
223 }
224 bio->bi_io_vec[i].bv_page = super->s_erase_page;
225 bio->bi_io_vec[i].bv_len = PAGE_SIZE;
226 bio->bi_io_vec[i].bv_offset = 0;
227 }
228 bio->bi_vcnt = nr_pages;
229 bio->bi_idx = 0;
230 bio->bi_size = nr_pages * PAGE_SIZE;
231 bio->bi_bdev = super->s_bdev;
232 bio->bi_sector = ofs >> 9;
233 bio->bi_private = sb;
234 bio->bi_end_io = erase_end_io;
235 atomic_inc(&super->s_pending_writes);
236 submit_bio(WRITE, bio);
237 return 0;
238}
239
240static int bdev_erase(struct super_block *sb, loff_t to, size_t len,
241 int ensure_write)
242{
243 struct logfs_super *super = logfs_super(sb);
244
245 BUG_ON(to & (PAGE_SIZE - 1));
246 BUG_ON(len & (PAGE_SIZE - 1));
247
248 if (super->s_flags & LOGFS_SB_FLAG_RO)
249 return -EROFS;
250
251 if (ensure_write) {
252 /*
253 * Object store doesn't care whether erases happen or not.
254 * But for the journal they are required. Otherwise a scan
255 * can find an old commit entry and assume it is the current
256 * one, travelling back in time.
257 */
258 do_erase(sb, to, to >> PAGE_SHIFT, len >> PAGE_SHIFT);
259 }
260
261 return 0;
262}
263
264static void bdev_sync(struct super_block *sb)
265{
266 struct logfs_super *super = logfs_super(sb);
267
268 wait_event(wq, atomic_read(&super->s_pending_writes) == 0);
269}
270
271static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs)
272{
273 struct logfs_super *super = logfs_super(sb);
274 struct address_space *mapping = super->s_mapping_inode->i_mapping;
275 filler_t *filler = bdev_readpage;
276
277 *ofs = 0;
278 return read_cache_page(mapping, 0, filler, sb);
279}
280
281static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs)
282{
283 struct logfs_super *super = logfs_super(sb);
284 struct address_space *mapping = super->s_mapping_inode->i_mapping;
285 filler_t *filler = bdev_readpage;
286 u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000;
287 pgoff_t index = pos >> PAGE_SHIFT;
288
289 *ofs = pos;
290 return read_cache_page(mapping, index, filler, sb);
291}
292
293static int bdev_write_sb(struct super_block *sb, struct page *page)
294{
295 struct block_device *bdev = logfs_super(sb)->s_bdev;
296
297 /* Nothing special to do for block devices. */
298 return sync_request(page, bdev, WRITE);
299}
300
301static void bdev_put_device(struct super_block *sb)
302{
303 close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
304}
305
306static const struct logfs_device_ops bd_devops = {
307 .find_first_sb = bdev_find_first_sb,
308 .find_last_sb = bdev_find_last_sb,
309 .write_sb = bdev_write_sb,
310 .readpage = bdev_readpage,
311 .writeseg = bdev_writeseg,
312 .erase = bdev_erase,
313 .sync = bdev_sync,
314 .put_device = bdev_put_device,
315};
316
317int logfs_get_sb_bdev(struct file_system_type *type, int flags,
318 const char *devname, struct vfsmount *mnt)
319{
320 struct block_device *bdev;
321
322 bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type);
323 if (IS_ERR(bdev))
324 return PTR_ERR(bdev);
325
326 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
327 int mtdnr = MINOR(bdev->bd_dev);
328 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
329 return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
330 }
331
332 return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt);
333}
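A note on the arithmetic used throughout dev_bdev.c: bio positions are counted in 512-byte sectors, so byte offsets and page indices convert with a shift by 9. Assuming 4 KiB pages, PAGE_SIZE >> 9 == 8 sectors per page:

	sector_t first = page->index * (PAGE_SIZE >> 9);  /* page 3 -> sector 24 */
	sector_t here  = ofs >> 9;                        /* byte 0x3000 -> sector 24 */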
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
new file mode 100644
index 000000000000..cafb6ef2e05b
--- /dev/null
+++ b/fs/logfs/dev_mtd.c
@@ -0,0 +1,254 @@
1/*
2 * fs/logfs/dev_mtd.c - Device access methods for MTD
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/completion.h>
10#include <linux/mount.h>
11#include <linux/sched.h>
12
13#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
14
15static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf)
16{
17 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
18 size_t retlen;
19 int ret;
20
21 ret = mtd->read(mtd, ofs, len, &retlen, buf);
22 BUG_ON(ret == -EINVAL);
23 if (ret)
24 return ret;
25
26 /* Not sure if we should loop instead. */
27 if (retlen != len)
28 return -EIO;
29
30 return 0;
31}
32
33static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
34{
35 struct logfs_super *super = logfs_super(sb);
36 struct mtd_info *mtd = super->s_mtd;
37 size_t retlen;
38 loff_t page_start, page_end;
39 int ret;
40
41 if (super->s_flags & LOGFS_SB_FLAG_RO)
42 return -EROFS;
43
44 BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs));
45 BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift);
46 BUG_ON(len > PAGE_CACHE_SIZE);
47 page_start = ofs & PAGE_CACHE_MASK;
48 page_end = PAGE_CACHE_ALIGN(ofs + len) - 1;
49 ret = mtd->write(mtd, ofs, len, &retlen, buf);
50 if (ret || (retlen != len))
51 return -EIO;
52
53 return 0;
54}
55
56/*
57 * For as long as I can remember (since about 2001) mtd->erase has been an
58 * asynchronous interface lacking the first driver to actually use the
59 * asynchronous properties. So just to prevent the first implementor of such
60 * a thing from breaking logfs in 2350, we do the usual pointless dance to
61 * declare a completion variable and wait for completion before returning
62 * from mtd_erase(). What an exercise in futility!
63 */
64static void logfs_erase_callback(struct erase_info *ei)
65{
66 complete((struct completion *)ei->priv);
67}
68
69static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len)
70{
71 struct logfs_super *super = logfs_super(sb);
72 struct address_space *mapping = super->s_mapping_inode->i_mapping;
73 struct page *page;
74 pgoff_t index = ofs >> PAGE_SHIFT;
75
76 for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) {
77 page = find_get_page(mapping, index);
78 if (!page)
79 continue;
80 memset(page_address(page), 0xFF, PAGE_SIZE);
81 page_cache_release(page);
82 }
83 return 0;
84}
85
86static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len,
87 int ensure_write)
88{
89 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
90 struct erase_info ei;
91 DECLARE_COMPLETION_ONSTACK(complete);
92 int ret;
93
94 BUG_ON(len % mtd->erasesize);
95 if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO)
96 return -EROFS;
97
98 memset(&ei, 0, sizeof(ei));
99 ei.mtd = mtd;
100 ei.addr = ofs;
101 ei.len = len;
102 ei.callback = logfs_erase_callback;
103 ei.priv = (long)&complete;
104 ret = mtd->erase(mtd, &ei);
105 if (ret)
106 return -EIO;
107
108 wait_for_completion(&complete);
109 if (ei.state != MTD_ERASE_DONE)
110 return -EIO;
111 return mtd_erase_mapping(sb, ofs, len);
112}
113
114static void mtd_sync(struct super_block *sb)
115{
116 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
117
118 if (mtd->sync)
119 mtd->sync(mtd);
120}
121
122static int mtd_readpage(void *_sb, struct page *page)
123{
124 struct super_block *sb = _sb;
125 int err;
126
127 err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
128 page_address(page));
129 if (err == -EUCLEAN) {
130 err = 0;
131 /* FIXME: force GC this segment */
132 }
133 if (err) {
134 ClearPageUptodate(page);
135 SetPageError(page);
136 } else {
137 SetPageUptodate(page);
138 ClearPageError(page);
139 }
140 unlock_page(page);
141 return err;
142}
143
144static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs)
145{
146 struct logfs_super *super = logfs_super(sb);
147 struct address_space *mapping = super->s_mapping_inode->i_mapping;
148 filler_t *filler = mtd_readpage;
149 struct mtd_info *mtd = super->s_mtd;
150
151 if (!mtd->block_isbad)
152 return NULL;
153
154 *ofs = 0;
155 while (mtd->block_isbad(mtd, *ofs)) {
156 *ofs += mtd->erasesize;
157 if (*ofs >= mtd->size)
158 return NULL;
159 }
160 BUG_ON(*ofs & ~PAGE_MASK);
161 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
162}
163
164static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs)
165{
166 struct logfs_super *super = logfs_super(sb);
167 struct address_space *mapping = super->s_mapping_inode->i_mapping;
168 filler_t *filler = mtd_readpage;
169 struct mtd_info *mtd = super->s_mtd;
170
171 if (!mtd->block_isbad)
172 return NULL;
173
174 *ofs = mtd->size - mtd->erasesize;
175 while (mtd->block_isbad(mtd, *ofs)) {
176 *ofs -= mtd->erasesize;
177 if (*ofs <= 0)
178 return NULL;
179 }
180 *ofs = *ofs + mtd->erasesize - 0x1000;
181 BUG_ON(*ofs & ~PAGE_MASK);
182 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
183}
184
185static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
186 size_t nr_pages)
187{
188 struct logfs_super *super = logfs_super(sb);
189 struct address_space *mapping = super->s_mapping_inode->i_mapping;
190 struct page *page;
191 int i, err;
192
193 for (i = 0; i < nr_pages; i++) {
194 page = find_lock_page(mapping, index + i);
195 BUG_ON(!page);
196
197 err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
198 page_address(page));
199 unlock_page(page);
200 page_cache_release(page);
201 if (err)
202 return err;
203 }
204 return 0;
205}
206
207static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
208{
209 struct logfs_super *super = logfs_super(sb);
210 int head;
211
212 if (super->s_flags & LOGFS_SB_FLAG_RO)
213 return;
214
215 if (len == 0) {
216 /* This can happen when the object fits perfectly into a
217 * segment, the segment gets written per sync and subsequently
218 * closed.
219 */
220 return;
221 }
222 head = ofs & (PAGE_SIZE - 1);
223 if (head) {
224 ofs -= head;
225 len += head;
226 }
227 len = PAGE_ALIGN(len);
228 __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
229}
230
231static void mtd_put_device(struct super_block *sb)
232{
233 put_mtd_device(logfs_super(sb)->s_mtd);
234}
235
236static const struct logfs_device_ops mtd_devops = {
237 .find_first_sb = mtd_find_first_sb,
238 .find_last_sb = mtd_find_last_sb,
239 .readpage = mtd_readpage,
240 .writeseg = mtd_writeseg,
241 .erase = mtd_erase,
242 .sync = mtd_sync,
243 .put_device = mtd_put_device,
244};
245
246int logfs_get_sb_mtd(struct file_system_type *type, int flags,
247 int mtdnr, struct vfsmount *mnt)
248{
249 struct mtd_info *mtd;
250 const struct logfs_device_ops *devops = &mtd_devops;
251
252 mtd = get_mtd_device(NULL, mtdnr);
253 return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
254}
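The completion dance mtd_erase() performs generalizes to any asynchronous callback interface: park an on-stack completion in the request's private field and wait on it. A generic sketch, where struct some_async_op and start_async_op() are hypothetical:

	struct some_async_op {				/* hypothetical request type */
		void (*callback)(struct some_async_op *);
		unsigned long priv;
	};

	static void op_done(struct some_async_op *op)
	{
		complete((struct completion *)op->priv);
	}

	static int do_op_sync(struct some_async_op *op)
	{
		DECLARE_COMPLETION_ONSTACK(done);

		op->callback = op_done;
		op->priv = (unsigned long)&done;
		if (start_async_op(op))			/* hypothetical submit */
			return -EIO;
		wait_for_completion(&done);		/* block until callback fires */
		return 0;
	}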
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
new file mode 100644
index 000000000000..2396a85c0f55
--- /dev/null
+++ b/fs/logfs/dir.c
@@ -0,0 +1,827 @@
1/*
2 * fs/logfs/dir.c - directory-related code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/slab.h>
10
11/*
12 * Atomic dir operations
13 *
14 * Directory operations are by default not atomic. Dentries and Inodes are
14 * created/removed/altered in separate operations. Therefore we need to do
16 * a small amount of journaling.
17 *
18 * Create, link, mkdir, mknod and symlink all share the same function to do
19 * the work: __logfs_create. This function works in two atomic steps:
20 * 1. allocate inode (remember in journal)
21 * 2. allocate dentry (clear journal)
22 *
23 * We can only get interrupted between the two steps; until step 2
24 * completes, the inode we just created is remembered in the anchor.
25 * On next mount, if we were interrupted, we delete the inode. From a
26 * user's point of view the operation never happened.
27 *
28 * Unlink and rmdir also share the same function: unlink. Again, this
29 * function works in two atomic steps
30 * 1. remove dentry (remember inode in journal)
31 * 2. unlink inode (clear journal)
32 *
33 * And again, on the next mount, if we were interrupted, we delete the inode.
35 * From a user's point of view the operation succeeded.
35 *
36 * Rename is the real pain to deal with, harder than all the other methods
37 * combined. Depending on the circumstances we can run into three cases.
38 * A "target rename" where the target dentry already existed, a "local
39 * rename" where both parent directories are identical or a "cross-directory
40 * rename" in the remaining case.
41 *
42 * Local rename is atomic, as the old dentry is simply rewritten with a new
43 * name.
44 *
45 * Cross-directory rename works in two steps, similar to __logfs_create and
46 * logfs_unlink:
47 * 1. Write new dentry (remember old dentry in journal)
48 * 2. Remove old dentry (clear journal)
49 *
50 * Here we remember a dentry instead of an inode. On next mount, if we were
51 * interrupted, we delete the dentry. From a user's point of view, the
52 * operation succeeded.
53 *
54 * Target rename works in three atomic steps:
55 * 1. Attach old inode to new dentry (remember old dentry and new inode)
56 * 2. Remove old dentry (still remember the new inode)
57 * 3. Remove victim inode
58 *
59 * Here we remember both an inode and a dentry. If we get interrupted
60 * between steps 1 and 2, we delete both the dentry and the inode. If
61 * we get interrupted between steps 2 and 3, we delete just the inode.
62 * In either case, the remaining objects are deleted on next mount. From
63 * a user's point of view, the operation succeeded.
64 */
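The crash-recovery rules spelled out above condense into a small decision table. Here is a self-contained sketch of that table, not part of the patch; the real recovery path is logfs_replay_journal() later in this file, which keys off the s_victim_ino and s_rename_dir/s_rename_pos fields saved in the journal rather than an explicit state enum, so all names below are illustrative:

#include <stdio.h>

enum crash_point {
	AFTER_CREATE_1,		/* inode written, dentry not yet */
	AFTER_UNLINK_1,		/* dentry gone, inode still present */
	AFTER_CROSS_RENAME_1,	/* new dentry written, old one remains */
	AFTER_TARGET_RENAME_1,	/* old dentry and victim inode remain */
	AFTER_TARGET_RENAME_2,	/* only the victim inode remains */
};

static const char *replay_action(enum crash_point cp)
{
	switch (cp) {
	case AFTER_CREATE_1:
		return "delete inode (create never happened)";
	case AFTER_UNLINK_1:
		return "delete inode (unlink succeeded)";
	case AFTER_CROSS_RENAME_1:
		return "delete old dentry (rename succeeded)";
	case AFTER_TARGET_RENAME_1:
		return "delete old dentry, then victim inode";
	case AFTER_TARGET_RENAME_2:
		return "delete victim inode";
	}
	return "nothing to do";
}

int main(void)
{
	printf("%s\n", replay_action(AFTER_CREATE_1));
	return 0;
}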
65
66static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd,
67 loff_t pos)
68{
69 return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL);
70}
71
72static int write_inode(struct inode *inode)
73{
74 return __logfs_write_inode(inode, WF_LOCK);
75}
76
77static s64 dir_seek_data(struct inode *inode, s64 pos)
78{
79 s64 new_pos = logfs_seek_data(inode, pos);
80
81 return max(pos, new_pos - 1);
82}
83
84static int beyond_eof(struct inode *inode, loff_t bix)
85{
86 loff_t pos = bix << inode->i_sb->s_blocksize_bits;
87 return pos >= i_size_read(inode);
88}
89
90/*
91 * Prime value was chosen to be roughly 256 + 26. r5 hash uses 11,
92 * so short names (len <= 9) don't even occupy the complete 32bit name
93 * space. A prime >256 ensures short names quickly spread the 32bit
94 * name space. Add about 26 for the estimated amount of information
95 * of each character and pick a prime nearby, preferably a bit-sparse
96 * one.
97 */
98static u32 hash_32(const char *s, int len, u32 seed)
99{
100 u32 hash = seed;
101 int i;
102
103 for (i = 0; i < len; i++)
104 hash = hash * 293 + s[i];
105 return hash;
106}
107
108/*
109 * We have to satisfy several conflicting requirements here. Small
110 * directories should stay fairly compact and not require too many
111 * indirect blocks. The number of possible locations for a given hash
112 * should be small to make lookup() fast. And we should try hard not
113 * to overflow the 32bit name space or nfs and 32bit host systems will
114 * be unhappy.
115 *
116 * So we use the following scheme. First we reduce the hash to 0..15
117 * and try a direct block. If that is occupied we reduce the hash to
118 * 16..255 and try an indirect block. Same for 2x and 3x indirect
119 * blocks. Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff,
120 * but use buckets containing sixteen entries instead of a single one.
121 *
122 * Using 16 entries should allow for a reasonable amount of hash
123 * collisions, so the 32bit name space can be packed fairly tight
124 * before overflowing. Oh and currently we don't overflow but return
125 * an error.
126 *
127 * How likely are collisions? Doing the appropriate math is beyond me
128 * and the Bronstein textbook. But running a test program to brute
129 * force collisions for a couple of days showed that on average the
130 * first collision occurs after 598M entries, with 290M being the
131 * smallest result. Obviously 21 entries could already cause a
132 * collision if all entries are carefully chosen.
133 */
134static pgoff_t hash_index(u32 hash, int round)
135{
136 u32 i0_blocks = I0_BLOCKS;
137 u32 i1_blocks = I1_BLOCKS;
138 u32 i2_blocks = I2_BLOCKS;
139 u32 i3_blocks = I3_BLOCKS;
140
141 switch (round) {
142 case 0:
143 return hash % i0_blocks;
144 case 1:
145 return i0_blocks + hash % (i1_blocks - i0_blocks);
146 case 2:
147 return i1_blocks + hash % (i2_blocks - i1_blocks);
148 case 3:
149 return i2_blocks + hash % (i3_blocks - i2_blocks);
150 case 4 ... 19:
151 return i3_blocks + 16 * (hash % (((1<<31) - i3_blocks) / 16))
152 + round - 4;
153 }
154 BUG();
155}
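To make the probe order above concrete, here is a self-contained userspace sketch of the lookup sequence, not part of the patch. The I* block counts are illustrative stand-ins; the real I0_BLOCKS..I3_BLOCKS come from logfs.h and differ:

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-ins for logfs.h's I*_BLOCKS (real values differ). */
#define I0 16u
#define I1 256u
#define I2 4096u
#define I3 65536u

static uint32_t ex_hash_32(const char *s, int len, uint32_t seed)
{
	uint32_t hash = seed;
	int i;

	for (i = 0; i < len; i++)
		hash = hash * 293 + s[i];
	return hash;
}

/* Same shape as hash_index(): one candidate block per probe round. */
static uint64_t ex_hash_index(uint32_t hash, int round)
{
	switch (round) {
	case 0:
		return hash % I0;
	case 1:
		return I0 + hash % (I1 - I0);
	case 2:
		return I1 + hash % (I2 - I1);
	case 3:
		return I2 + hash % (I3 - I2);
	default:	/* rounds 4..19 walk one 16-entry bucket */
		return I3 + 16 * (hash % (((1u << 31) - I3) / 16))
			+ round - 4;
	}
}

int main(void)
{
	uint32_t hash = ex_hash_32("example", 7, 0);
	int round;

	for (round = 0; round < 20; round++)
		printf("round %2d -> block %llu\n", round,
		       (unsigned long long)ex_hash_index(hash, round));
	return 0;
}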
156
157static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry)
158{
159 struct qstr *name = &dentry->d_name;
160 struct page *page;
161 struct logfs_disk_dentry *dd;
162 u32 hash = hash_32(name->name, name->len, 0);
163 pgoff_t index;
164 int round;
165
166 if (name->len > LOGFS_MAX_NAMELEN)
167 return ERR_PTR(-ENAMETOOLONG);
168
169 for (round = 0; round < 20; round++) {
170 index = hash_index(hash, round);
171
172 if (beyond_eof(dir, index))
173 return NULL;
174 if (!logfs_exist_block(dir, index))
175 continue;
176 page = read_cache_page(dir->i_mapping, index,
177 (filler_t *)logfs_readpage, NULL);
178 if (IS_ERR(page))
179 return page;
180 dd = kmap_atomic(page, KM_USER0);
181 BUG_ON(dd->namelen == 0);
182
183 if (name->len != be16_to_cpu(dd->namelen) ||
184 memcmp(name->name, dd->name, name->len)) {
185 kunmap_atomic(dd, KM_USER0);
186 page_cache_release(page);
187 continue;
188 }
189
190 kunmap_atomic(dd, KM_USER0);
191 return page;
192 }
193 return NULL;
194}
195
196static int logfs_remove_inode(struct inode *inode)
197{
198 int ret;
199
200 inode->i_nlink--;
201 ret = write_inode(inode);
202 LOGFS_BUG_ON(ret, inode->i_sb);
203 return ret;
204}
205
206static void abort_transaction(struct inode *inode, struct logfs_transaction *ta)
207{
208 if (logfs_inode(inode)->li_block)
209 logfs_inode(inode)->li_block->ta = NULL;
210 kfree(ta);
211}
212
213static int logfs_unlink(struct inode *dir, struct dentry *dentry)
214{
215 struct logfs_super *super = logfs_super(dir->i_sb);
216 struct inode *inode = dentry->d_inode;
217 struct logfs_transaction *ta;
218 struct page *page;
219 pgoff_t index;
220 int ret;
221
222 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
223 if (!ta)
224 return -ENOMEM;
225
226 ta->state = UNLINK_1;
227 ta->ino = inode->i_ino;
228
229 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
230
231 page = logfs_get_dd_page(dir, dentry);
232 if (!page) {
233 kfree(ta);
234 return -ENOENT;
235 }
236 if (IS_ERR(page)) {
237 kfree(ta);
238 return PTR_ERR(page);
239 }
240 index = page->index;
241 page_cache_release(page);
242
243 mutex_lock(&super->s_dirop_mutex);
244 logfs_add_transaction(dir, ta);
245
246 ret = logfs_delete(dir, index, NULL);
247 if (!ret)
248 ret = write_inode(dir);
249
250 if (ret) {
251 abort_transaction(dir, ta);
252 printk(KERN_ERR"LOGFS: unable to delete inode\n");
253 goto out;
254 }
255
256 ta->state = UNLINK_2;
257 logfs_add_transaction(inode, ta);
258 ret = logfs_remove_inode(inode);
259out:
260 mutex_unlock(&super->s_dirop_mutex);
261 return ret;
262}
263
264static inline int logfs_empty_dir(struct inode *dir)
265{
266 u64 data;
267
268 data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits;
269 return data >= i_size_read(dir);
270}
271
272static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
273{
274 struct inode *inode = dentry->d_inode;
275
276 if (!logfs_empty_dir(inode))
277 return -ENOTEMPTY;
278
279 return logfs_unlink(dir, dentry);
280}
281
282/* FIXME: readdir currently has its own dir_walk code. I don't see a good
283 * way to combine the two copies */
284#define IMPLICIT_NODES 2
285static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
286{
287 struct inode *dir = file->f_dentry->d_inode;
288 loff_t pos = file->f_pos - IMPLICIT_NODES;
289 struct page *page;
290 struct logfs_disk_dentry *dd;
291 int full;
292
293 BUG_ON(pos < 0);
294 for (;; pos++) {
295 if (beyond_eof(dir, pos))
296 break;
297 if (!logfs_exist_block(dir, pos)) {
298 /* deleted dentry */
299 pos = dir_seek_data(dir, pos);
300 continue;
301 }
302 page = read_cache_page(dir->i_mapping, pos,
303 (filler_t *)logfs_readpage, NULL);
304 if (IS_ERR(page))
305 return PTR_ERR(page);
306 dd = kmap(page);
307 BUG_ON(dd->namelen == 0);
308
309 full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
310 pos, be64_to_cpu(dd->ino), dd->type);
311 kunmap(page);
312 page_cache_release(page);
313 if (full)
314 break;
315 }
316
317 file->f_pos = pos + IMPLICIT_NODES;
318 return 0;
319}
320
321static int logfs_readdir(struct file *file, void *buf, filldir_t filldir)
322{
323 struct inode *inode = file->f_dentry->d_inode;
324 ino_t pino = parent_ino(file->f_dentry);
325 int err;
326
327 if (file->f_pos < 0)
328 return -EINVAL;
329
330 if (file->f_pos == 0) {
331 if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0)
332 return 0;
333 file->f_pos++;
334 }
335 if (file->f_pos == 1) {
336 if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0)
337 return 0;
338 file->f_pos++;
339 }
340
341 err = __logfs_readdir(file, buf, filldir);
342 return err;
343}
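The position bookkeeping above is subtle: f_pos 0 and 1 are synthesized entries, and everything else is shifted by IMPLICIT_NODES. A minimal sketch of the mapping, not part of the patch; fpos_to_block() is hypothetical, written only to show the convention:

/* Sketch of the f_pos convention used by logfs_readdir() above:
 *
 *   f_pos == 0  ->  "."  (synthesized)
 *   f_pos == 1  ->  ".." (synthesized)
 *   f_pos >= 2  ->  dentry block number f_pos - IMPLICIT_NODES
 */
static inline long long fpos_to_block(long long f_pos)
{
	return f_pos - 2;	/* IMPLICIT_NODES; < 0 means synthetic entry */
}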
344
345static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name)
346{
347 dd->namelen = cpu_to_be16(name->len);
348 memcpy(dd->name, name->name, name->len);
349}
350
351static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry,
352 struct nameidata *nd)
353{
354 struct page *page;
355 struct logfs_disk_dentry *dd;
356 pgoff_t index;
357 u64 ino = 0;
358 struct inode *inode;
359
360 page = logfs_get_dd_page(dir, dentry);
361 if (IS_ERR(page))
362 return ERR_CAST(page);
363 if (!page) {
364 d_add(dentry, NULL);
365 return NULL;
366 }
367 index = page->index;
368 dd = kmap_atomic(page, KM_USER0);
369 ino = be64_to_cpu(dd->ino);
370 kunmap_atomic(dd, KM_USER0);
371 page_cache_release(page);
372
373 inode = logfs_iget(dir->i_sb, ino);
374 if (IS_ERR(inode)) {
375 printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)\n",
376 ino, dir->i_ino, index);
377 return ERR_CAST(inode);
378 }
379 return d_splice_alias(inode, dentry);
380}
381
382static void grow_dir(struct inode *dir, loff_t index)
383{
384 index = (index + 1) << dir->i_sb->s_blocksize_bits;
385 if (i_size_read(dir) < index)
386 i_size_write(dir, index);
387}
388
389static int logfs_write_dir(struct inode *dir, struct dentry *dentry,
390 struct inode *inode)
391{
392 struct page *page;
393 struct logfs_disk_dentry *dd;
394 u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0);
395 pgoff_t index;
396 int round, err;
397
398 for (round = 0; round < 20; round++) {
399 index = hash_index(hash, round);
400
401 if (logfs_exist_block(dir, index))
402 continue;
403 page = find_or_create_page(dir->i_mapping, index, GFP_KERNEL);
404 if (!page)
405 return -ENOMEM;
406
407 dd = kmap_atomic(page, KM_USER0);
408 memset(dd, 0, sizeof(*dd));
409 dd->ino = cpu_to_be64(inode->i_ino);
410 dd->type = logfs_type(inode);
411 logfs_set_name(dd, &dentry->d_name);
412 kunmap_atomic(dd, KM_USER0);
413
414 err = logfs_write_buf(dir, page, WF_LOCK);
415 unlock_page(page);
416 page_cache_release(page);
417 if (!err)
418 grow_dir(dir, index);
419 return err;
420 }
421 /* FIXME: Is there a better return value? In most cases neither
422 * the filesystem nor the directory are full. But we have had
423 * too many collisions for this particular hash and no fallback.
424 */
425 return -ENOSPC;
426}
427
428static int __logfs_create(struct inode *dir, struct dentry *dentry,
429 struct inode *inode, const char *dest, long destlen)
430{
431 struct logfs_super *super = logfs_super(dir->i_sb);
432 struct logfs_inode *li = logfs_inode(inode);
433 struct logfs_transaction *ta;
434 int ret;
435
436 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
437 if (!ta)
438 return -ENOMEM;
439
440 ta->state = CREATE_1;
441 ta->ino = inode->i_ino;
442 mutex_lock(&super->s_dirop_mutex);
443 logfs_add_transaction(inode, ta);
444
445 if (dest) {
446 /* symlink */
447 ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL);
448 if (!ret)
449 ret = write_inode(inode);
450 } else {
451 /* creat/mkdir/mknod */
452 ret = write_inode(inode);
453 }
454 if (ret) {
455 abort_transaction(inode, ta);
456 li->li_flags |= LOGFS_IF_STILLBORN;
457 /* FIXME: truncate symlink */
458 inode->i_nlink--;
459 iput(inode);
460 goto out;
461 }
462
463 ta->state = CREATE_2;
464 logfs_add_transaction(dir, ta);
465 ret = logfs_write_dir(dir, dentry, inode);
466 /* sync directory */
467 if (!ret)
468 ret = write_inode(dir);
469
470 if (ret) {
471 logfs_del_transaction(dir, ta);
472 ta->state = CREATE_2;
473 logfs_add_transaction(inode, ta);
474 logfs_remove_inode(inode);
475 iput(inode);
476 goto out;
477 }
478 d_instantiate(dentry, inode);
479out:
480 mutex_unlock(&super->s_dirop_mutex);
481 return ret;
482}
483
484static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
485{
486 struct inode *inode;
487
488 /*
489 * FIXME: why do we have to fill in S_IFDIR, while the mode is
490 * correct for mknod, creat, etc.? Smells like the vfs *should*
491 * do it for us but for some reason fails to do so.
492 */
493 inode = logfs_new_inode(dir, S_IFDIR | mode);
494 if (IS_ERR(inode))
495 return PTR_ERR(inode);
496
497 inode->i_op = &logfs_dir_iops;
498 inode->i_fop = &logfs_dir_fops;
499
500 return __logfs_create(dir, dentry, inode, NULL, 0);
501}
502
503static int logfs_create(struct inode *dir, struct dentry *dentry, int mode,
504 struct nameidata *nd)
505{
506 struct inode *inode;
507
508 inode = logfs_new_inode(dir, mode);
509 if (IS_ERR(inode))
510 return PTR_ERR(inode);
511
512 inode->i_op = &logfs_reg_iops;
513 inode->i_fop = &logfs_reg_fops;
514 inode->i_mapping->a_ops = &logfs_reg_aops;
515
516 return __logfs_create(dir, dentry, inode, NULL, 0);
517}
518
519static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
520 dev_t rdev)
521{
522 struct inode *inode;
523
524 if (dentry->d_name.len > LOGFS_MAX_NAMELEN)
525 return -ENAMETOOLONG;
526
527 inode = logfs_new_inode(dir, mode);
528 if (IS_ERR(inode))
529 return PTR_ERR(inode);
530
531 init_special_inode(inode, mode, rdev);
532
533 return __logfs_create(dir, dentry, inode, NULL, 0);
534}
535
536static int logfs_symlink(struct inode *dir, struct dentry *dentry,
537 const char *target)
538{
539 struct inode *inode;
540 size_t destlen = strlen(target) + 1;
541
542 if (destlen > dir->i_sb->s_blocksize)
543 return -ENAMETOOLONG;
544
545 inode = logfs_new_inode(dir, S_IFLNK | 0777);
546 if (IS_ERR(inode))
547 return PTR_ERR(inode);
548
549 inode->i_op = &logfs_symlink_iops;
550 inode->i_mapping->a_ops = &logfs_reg_aops;
551
552 return __logfs_create(dir, dentry, inode, target, destlen);
553}
554
555static int logfs_permission(struct inode *inode, int mask)
556{
557 return generic_permission(inode, mask, NULL);
558}
559
560static int logfs_link(struct dentry *old_dentry, struct inode *dir,
561 struct dentry *dentry)
562{
563 struct inode *inode = old_dentry->d_inode;
564
565 if (inode->i_nlink >= LOGFS_LINK_MAX)
566 return -EMLINK;
567
568 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
569 atomic_inc(&inode->i_count);
570 inode->i_nlink++;
571 mark_inode_dirty_sync(inode);
572
573 return __logfs_create(dir, dentry, inode, NULL, 0);
574}
575
576static int logfs_get_dd(struct inode *dir, struct dentry *dentry,
577 struct logfs_disk_dentry *dd, loff_t *pos)
578{
579 struct page *page;
580 void *map;
581
582 page = logfs_get_dd_page(dir, dentry);
583 if (IS_ERR(page))
584 return PTR_ERR(page);
585 *pos = page->index;
586 map = kmap_atomic(page, KM_USER0);
587 memcpy(dd, map, sizeof(*dd));
588 kunmap_atomic(map, KM_USER0);
589 page_cache_release(page);
590 return 0;
591}
592
593static int logfs_delete_dd(struct inode *dir, loff_t pos)
594{
595 /*
596 * Getting called with pos somewhere beyond eof is either a goofup
597 * within this file or means someone maliciously edited the
598 * (crc-protected) journal.
599 */
600 BUG_ON(beyond_eof(dir, pos));
601 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
602 log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos);
603 return logfs_delete(dir, pos, NULL);
604}
605
606/*
607 * Cross-directory rename, target does not exist. Just a little nasty.
608 * Create a new dentry in the target dir, then remove the old dentry,
609 * all the while taking care to remember our operation in the journal.
610 */
611static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
612 struct inode *new_dir, struct dentry *new_dentry)
613{
614 struct logfs_super *super = logfs_super(old_dir->i_sb);
615 struct logfs_disk_dentry dd;
616 struct logfs_transaction *ta;
617 loff_t pos;
618 int err;
619
620 /* 1. locate source dd */
621 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
622 if (err)
623 return err;
624
625 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
626 if (!ta)
627 return -ENOMEM;
628
629 ta->state = CROSS_RENAME_1;
630 ta->dir = old_dir->i_ino;
631 ta->pos = pos;
632
633 /* 2. write target dd */
634 mutex_lock(&super->s_dirop_mutex);
635 logfs_add_transaction(new_dir, ta);
636 err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode);
637 if (!err)
638 err = write_inode(new_dir);
639
640 if (err) {
641 super->s_rename_dir = 0;
642 super->s_rename_pos = 0;
643 abort_transaction(new_dir, ta);
644 goto out;
645 }
646
647 /* 3. remove source dd */
648 ta->state = CROSS_RENAME_2;
649 logfs_add_transaction(old_dir, ta);
650 err = logfs_delete_dd(old_dir, pos);
651 if (!err)
652 err = write_inode(old_dir);
653 LOGFS_BUG_ON(err, old_dir->i_sb);
654out:
655 mutex_unlock(&super->s_dirop_mutex);
656 return err;
657}
658
659static int logfs_replace_inode(struct inode *dir, struct dentry *dentry,
660 struct logfs_disk_dentry *dd, struct inode *inode)
661{
662 loff_t pos;
663 int err;
664
665 err = logfs_get_dd(dir, dentry, dd, &pos);
666 if (err)
667 return err;
668 dd->ino = cpu_to_be64(inode->i_ino);
669 dd->type = logfs_type(inode);
670
671 err = write_dir(dir, dd, pos);
672 if (err)
673 return err;
674 log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos,
675 dd->name, be64_to_cpu(dd->ino));
676 return write_inode(dir);
677}
678
679/* Target dentry exists - the worst case. We need to attach the source
680 * inode to the target dentry, then remove the orphaned target inode and
681 * source dentry.
682 */
683static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry,
684 struct inode *new_dir, struct dentry *new_dentry)
685{
686 struct logfs_super *super = logfs_super(old_dir->i_sb);
687 struct inode *old_inode = old_dentry->d_inode;
688 struct inode *new_inode = new_dentry->d_inode;
689 int isdir = S_ISDIR(old_inode->i_mode);
690 struct logfs_disk_dentry dd;
691 struct logfs_transaction *ta;
692 loff_t pos;
693 int err;
694
695 BUG_ON(isdir != S_ISDIR(new_inode->i_mode));
696 if (isdir) {
697 if (!logfs_empty_dir(new_inode))
698 return -ENOTEMPTY;
699 }
700
701 /* 1. locate source dd */
702 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
703 if (err)
704 return err;
705
706 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
707 if (!ta)
708 return -ENOMEM;
709
710 ta->state = TARGET_RENAME_1;
711 ta->dir = old_dir->i_ino;
712 ta->pos = pos;
713 ta->ino = new_inode->i_ino;
714
715 /* 2. attach source inode to target dd */
716 mutex_lock(&super->s_dirop_mutex);
717 logfs_add_transaction(new_dir, ta);
718 err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode);
719 if (err) {
720 super->s_rename_dir = 0;
721 super->s_rename_pos = 0;
722 super->s_victim_ino = 0;
723 abort_transaction(new_dir, ta);
724 goto out;
725 }
726
727 /* 3. remove source dd */
728 ta->state = TARGET_RENAME_2;
729 logfs_add_transaction(old_dir, ta);
730 err = logfs_delete_dd(old_dir, pos);
731 if (!err)
732 err = write_inode(old_dir);
733 LOGFS_BUG_ON(err, old_dir->i_sb);
734
735 /* 4. remove target inode */
736 ta->state = TARGET_RENAME_3;
737 logfs_add_transaction(new_inode, ta);
738 err = logfs_remove_inode(new_inode);
739
740out:
741 mutex_unlock(&super->s_dirop_mutex);
742 return err;
743}
744
745static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry,
746 struct inode *new_dir, struct dentry *new_dentry)
747{
748 if (new_dentry->d_inode)
749 return logfs_rename_target(old_dir, old_dentry,
750 new_dir, new_dentry);
751 return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry);
752}
753
754/* No locking done here, as this is called before .get_sb() returns. */
755int logfs_replay_journal(struct super_block *sb)
756{
757 struct logfs_super *super = logfs_super(sb);
758 struct inode *inode;
759 u64 ino, pos;
760 int err;
761
762 if (super->s_victim_ino) {
763 /* delete victim inode */
764 ino = super->s_victim_ino;
765 printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino);
766 inode = logfs_iget(sb, ino);
767 if (IS_ERR(inode))
768 goto fail;
769
770 LOGFS_BUG_ON(i_size_read(inode) > 0, sb);
771 super->s_victim_ino = 0;
772 err = logfs_remove_inode(inode);
773 iput(inode);
774 if (err) {
775 super->s_victim_ino = ino;
776 goto fail;
777 }
778 }
779 if (super->s_rename_dir) {
780 /* delete old dd from rename */
781 ino = super->s_rename_dir;
782 pos = super->s_rename_pos;
783 printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n",
784 ino, pos);
785 inode = logfs_iget(sb, ino);
786 if (IS_ERR(inode))
787 goto fail;
788
789 super->s_rename_dir = 0;
790 super->s_rename_pos = 0;
791 err = logfs_delete_dd(inode, pos);
792 iput(inode);
793 if (err) {
794 super->s_rename_dir = ino;
795 super->s_rename_pos = pos;
796 goto fail;
797 }
798 }
799 return 0;
800fail:
801 LOGFS_BUG(sb);
802 return -EIO;
803}
804
805const struct inode_operations logfs_symlink_iops = {
806 .readlink = generic_readlink,
807 .follow_link = page_follow_link_light,
808};
809
810const struct inode_operations logfs_dir_iops = {
811 .create = logfs_create,
812 .link = logfs_link,
813 .lookup = logfs_lookup,
814 .mkdir = logfs_mkdir,
815 .mknod = logfs_mknod,
816 .rename = logfs_rename,
817 .rmdir = logfs_rmdir,
818 .permission = logfs_permission,
819 .symlink = logfs_symlink,
820 .unlink = logfs_unlink,
821};
822const struct file_operations logfs_dir_fops = {
823 .fsync = logfs_fsync,
824 .ioctl = logfs_ioctl,
825 .readdir = logfs_readdir,
826 .read = generic_read_dir,
827};
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
new file mode 100644
index 000000000000..370f367a933e
--- /dev/null
+++ b/fs/logfs/file.c
@@ -0,0 +1,263 @@
1/*
2 * fs/logfs/file.c - prepare_write, commit_write and friends
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/sched.h>
10#include <linux/writeback.h>
11
12static int logfs_write_begin(struct file *file, struct address_space *mapping,
13 loff_t pos, unsigned len, unsigned flags,
14 struct page **pagep, void **fsdata)
15{
16 struct inode *inode = mapping->host;
17 struct page *page;
18 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
19
20 page = grab_cache_page_write_begin(mapping, index, flags);
21 if (!page)
22 return -ENOMEM;
23 *pagep = page;
24
25 if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
26 return 0;
27 if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
28 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
29 unsigned end = start + len;
30
31 /* Reading beyond i_size is simple: memset to zero */
32 zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
33 return 0;
34 }
35 return logfs_readpage_nolock(page);
36}
37
38static int logfs_write_end(struct file *file, struct address_space *mapping,
39 loff_t pos, unsigned len, unsigned copied, struct page *page,
40 void *fsdata)
41{
42 struct inode *inode = mapping->host;
43 pgoff_t index = page->index;
44 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
45 unsigned end = start + copied;
46 int ret = 0;
47
48 BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize);
49 BUG_ON(page->index > I3_BLOCKS);
50
51 if (copied < len) {
52 /*
53 * Short write of a non-initialized page. Just tell userspace
54 * to retry the entire page.
55 */
56 if (!PageUptodate(page)) {
57 copied = 0;
58 goto out;
59 }
60 }
61 if (copied == 0)
62 goto out; /* FIXME: do we need to update inode? */
63
64 if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) {
65 i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end);
66 mark_inode_dirty_sync(inode);
67 }
68
69 SetPageUptodate(page);
70 if (!PageDirty(page)) {
71 if (!get_page_reserve(inode, page))
72 __set_page_dirty_nobuffers(page);
73 else
74 ret = logfs_write_buf(inode, page, WF_LOCK);
75 }
76out:
77 unlock_page(page);
78 page_cache_release(page);
79 return ret ? ret : copied;
80}
81
82int logfs_readpage(struct file *file, struct page *page)
83{
84 int ret;
85
86 ret = logfs_readpage_nolock(page);
87 unlock_page(page);
88 return ret;
89}
90
91/* Clear the page's dirty flag in the radix tree. */
92/* TODO: mucking with PageWriteback is silly. Add a generic function to clear
93 * the dirty bit from the radix tree for filesystems that don't have to wait
94 * for page writeback to finish (i.e. any compressing filesystem).
95 */
96static void clear_radix_tree_dirty(struct page *page)
97{
98 BUG_ON(PagePrivate(page) || page->private);
99 set_page_writeback(page);
100 end_page_writeback(page);
101}
102
103static int __logfs_writepage(struct page *page)
104{
105 struct inode *inode = page->mapping->host;
106 int err;
107
108 err = logfs_write_buf(inode, page, WF_LOCK);
109 if (err)
110 set_page_dirty(page);
111 else
112 clear_radix_tree_dirty(page);
113 unlock_page(page);
114 return err;
115}
116
117static int logfs_writepage(struct page *page, struct writeback_control *wbc)
118{
119 struct inode *inode = page->mapping->host;
120 loff_t i_size = i_size_read(inode);
121 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
122 unsigned offset;
123 u64 bix;
124 level_t level;
125
126 log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index,
127 page);
128
129 logfs_unpack_index(page->index, &bix, &level);
130
131 /* Indirect blocks are never truncated */
132 if (level != 0)
133 return __logfs_writepage(page);
134
135 /*
136 * TODO: everything below is a near-verbatim copy of nobh_writepage().
137 * The relevant bits should be factored out after logfs is merged.
138 */
139
140 /* Is the page fully inside i_size? */
141 if (bix < end_index)
142 return __logfs_writepage(page);
143
144 /* Is the page fully outside i_size? (truncate in progress) */
145 offset = i_size & (PAGE_CACHE_SIZE-1);
146 if (bix > end_index || offset == 0) {
147 unlock_page(page);
148 return 0; /* don't care */
149 }
150
151 /*
152 * The page straddles i_size. It must be zeroed out on each and every
153 * writepage invocation because it may be mmapped. "A file is mapped
154 * in multiples of the page size. For a file that is not a multiple of
155 * the page size, the remaining memory is zeroed when mapped, and
156 * writes to that region are not written out to the file."
157 */
158 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
159 return __logfs_writepage(page);
160}
161
162static void logfs_invalidatepage(struct page *page, unsigned long offset)
163{
164 move_page_to_btree(page);
165 BUG_ON(PagePrivate(page) || page->private);
166}
167
168static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
169{
170 return 0; /* None of these are easy to release */
171}
172
173
174int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
175 unsigned long arg)
176{
177 struct logfs_inode *li = logfs_inode(inode);
178 unsigned int oldflags, flags;
179 int err;
180
181 switch (cmd) {
182 case FS_IOC_GETFLAGS:
183 flags = li->li_flags & LOGFS_FL_USER_VISIBLE;
184 return put_user(flags, (int __user *)arg);
185 case FS_IOC_SETFLAGS:
186 if (IS_RDONLY(inode))
187 return -EROFS;
188
189 if (!is_owner_or_cap(inode))
190 return -EACCES;
191
192 err = get_user(flags, (int __user *)arg);
193 if (err)
194 return err;
195
196 mutex_lock(&inode->i_mutex);
197 oldflags = li->li_flags;
198 flags &= LOGFS_FL_USER_MODIFIABLE;
199 flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE;
200 li->li_flags = flags;
201 mutex_unlock(&inode->i_mutex);
202
203 inode->i_ctime = CURRENT_TIME;
204 mark_inode_dirty_sync(inode);
205 return 0;
206
207 default:
208 return -ENOTTY;
209 }
210}
211
212int logfs_fsync(struct file *file, struct dentry *dentry, int datasync)
213{
214 struct super_block *sb = dentry->d_inode->i_sb;
215 struct logfs_super *super = logfs_super(sb);
216
217 /* FIXME: write anchor */
218 super->s_devops->sync(sb);
219 return 0;
220}
221
222static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
223{
224 struct inode *inode = dentry->d_inode;
225 int err = 0;
226
227 if (attr->ia_valid & ATTR_SIZE)
228 err = logfs_truncate(inode, attr->ia_size);
229 attr->ia_valid &= ~ATTR_SIZE;
230
231 if (!err)
232 err = inode_change_ok(inode, attr);
233 if (!err)
234 err = inode_setattr(inode, attr);
235 return err;
236}
237
238const struct inode_operations logfs_reg_iops = {
239 .setattr = logfs_setattr,
240};
241
242const struct file_operations logfs_reg_fops = {
243 .aio_read = generic_file_aio_read,
244 .aio_write = generic_file_aio_write,
245 .fsync = logfs_fsync,
246 .ioctl = logfs_ioctl,
247 .llseek = generic_file_llseek,
248 .mmap = generic_file_readonly_mmap,
249 .open = generic_file_open,
250 .read = do_sync_read,
251 .write = do_sync_write,
252};
253
254const struct address_space_operations logfs_reg_aops = {
255 .invalidatepage = logfs_invalidatepage,
256 .readpage = logfs_readpage,
257 .releasepage = logfs_releasepage,
258 .set_page_dirty = __set_page_dirty_nobuffers,
259 .writepage = logfs_writepage,
260 .writepages = generic_writepages,
261 .write_begin = logfs_write_begin,
262 .write_end = logfs_write_end,
263};
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
new file mode 100644
index 000000000000..84e36f52fe95
--- /dev/null
+++ b/fs/logfs/gc.c
@@ -0,0 +1,731 @@
1/*
2 * fs/logfs/gc.c - garbage collection code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/sched.h>
10#include <linux/slab.h>
11
12/*
13 * Wear leveling needs to kick in when the difference between low erase
14 * counts and high erase counts gets too big. A good value for "too big"
15 * may be somewhat below 10% of maximum erase count for the device.
16 * Why not 397, to pick a nice round number with no specific meaning? :)
17 *
18 * WL_RATELIMIT is the minimum time between two wear level events. A huge
19 * number of segments may fulfil the requirements for wear leveling at the
20 * same time. If that happens we don't want to cause a latency from hell,
21 * but just gently pick one segment every so often and minimize overhead.
22 */
23#define WL_DELTA 397
24#define WL_RATELIMIT 100
25#define MAX_OBJ_ALIASES 2600
26#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */
27#define LIST_SIZE 64 /* base size of candidate lists */
28#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */
29#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */
30
31static int no_free_segments(struct super_block *sb)
32{
33 struct logfs_super *super = logfs_super(sb);
34
35 return super->s_free_list.count;
36}
37
38/* journal has distance -1, top-most ifile layer distance 0 */
39static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
40{
41 struct logfs_super *super = logfs_super(sb);
42 u8 gc_level = (__force u8)__gc_level;
43
44 switch (gc_level) {
45 case 0: /* fall through */
46 case 1: /* fall through */
47 case 2: /* fall through */
48 case 3:
49 /* file data or indirect blocks */
50 return super->s_ifile_levels + super->s_iblock_levels - gc_level;
51 case 6: /* fall through */
52 case 7: /* fall through */
53 case 8: /* fall through */
54 case 9:
55 /* inode file data or indirect blocks */
56 return super->s_ifile_levels - (gc_level - 6);
57 default:
58 printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
59 gc_level);
60 WARN_ON(1);
61 return super->s_ifile_levels + super->s_iblock_levels;
62 }
63}
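A worked example of the distance mapping, using hypothetical level counts s_ifile_levels = 3 and s_iblock_levels = 4: file data (gc_level 0) gets distance 3 + 4 - 0 = 7, its top indirect level (gc_level 3) gets 3 + 4 - 3 = 4, inode-file data (gc_level 6) gets 3 - (6 - 6) = 3, and the top inode-file level (gc_level 9) gets 3 - (9 - 6) = 0, matching the "top-most ifile layer distance 0" rule in the comment above.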
64
65static int segment_is_reserved(struct super_block *sb, u32 segno)
66{
67 struct logfs_super *super = logfs_super(sb);
68 struct logfs_area *area;
69 void *reserved;
70 int i;
71
72 /* Some segments are reserved. Just pretend they were all valid */
73 reserved = btree_lookup32(&super->s_reserved_segments, segno);
74 if (reserved)
75 return 1;
76
77 /* Currently open segments */
78 for_each_area(i) {
79 area = super->s_area[i];
80 if (area->a_is_open && area->a_segno == segno)
81 return 1;
82 }
83
84 return 0;
85}
86
87static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
88{
89 BUG();
90}
91
92/*
93 * Returns the bytes consumed by valid objects in this segment. Object headers
94 * are counted, the segment header is not.
95 */
96static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
97 gc_level_t *gc_level)
98{
99 struct logfs_segment_entry se;
100 u32 ec_level;
101
102 logfs_get_segment_entry(sb, segno, &se);
103 if (se.ec_level == cpu_to_be32(BADSEG) ||
104 se.valid == cpu_to_be32(RESERVED))
105 return RESERVED;
106
107 ec_level = be32_to_cpu(se.ec_level);
108 *ec = ec_level >> 4;
109 *gc_level = GC_LEVEL(ec_level & 0xf);
110 return be32_to_cpu(se.valid);
111}
112
113static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
114 u64 bix, gc_level_t gc_level)
115{
116 struct inode *inode;
117 int err, cookie;
118
119 inode = logfs_safe_iget(sb, ino, &cookie);
120 err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
121 BUG_ON(err);
122 logfs_safe_iput(inode, cookie);
123}
124
125static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist)
126{
127 struct logfs_super *super = logfs_super(sb);
128 struct logfs_segment_header sh;
129 struct logfs_object_header oh;
130 u64 ofs, ino, bix;
131 u32 seg_ofs, logical_segno, cleaned = 0;
132 int err, len, valid;
133 gc_level_t gc_level;
134
135 LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);
136
137 btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
138 err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
139 BUG_ON(err);
140 gc_level = GC_LEVEL(sh.level);
141 logical_segno = be32_to_cpu(sh.segno);
142 if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
143 logfs_mark_segment_bad(sb, segno);
144 cleaned = -1;
145 goto out;
146 }
147
148 for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
149 seg_ofs + sizeof(oh) < super->s_segsize; ) {
150 ofs = dev_ofs(sb, logical_segno, seg_ofs);
151 err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
152 &oh);
153 BUG_ON(err);
154
155 if (!memchr_inv(&oh, 0xff, sizeof(oh)))
156 break;
157
158 if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
159 logfs_mark_segment_bad(sb, segno);
160 cleaned = super->s_segsize - 1;
161 goto out;
162 }
163
164 ino = be64_to_cpu(oh.ino);
165 bix = be64_to_cpu(oh.bix);
166 len = sizeof(oh) + be16_to_cpu(oh.len);
167 valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
168 if (valid == 1) {
169 logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
170 cleaned += len;
171 } else if (valid == 2) {
172 /* Will be invalid upon journal commit */
173 cleaned += len;
174 }
175 seg_ofs += len;
176 }
177out:
178 btree_remove32(&super->s_reserved_segments, segno);
179 return cleaned;
180}
181
182static struct gc_candidate *add_list(struct gc_candidate *cand,
183 struct candidate_list *list)
184{
185 struct rb_node **p = &list->rb_tree.rb_node;
186 struct rb_node *parent = NULL;
187 struct gc_candidate *cur;
188 int comp;
189
190 cand->list = list;
191 while (*p) {
192 parent = *p;
193 cur = rb_entry(parent, struct gc_candidate, rb_node);
194
195 if (list->sort_by_ec)
196 comp = cand->erase_count < cur->erase_count;
197 else
198 comp = cand->valid < cur->valid;
199
200 if (comp)
201 p = &parent->rb_left;
202 else
203 p = &parent->rb_right;
204 }
205 rb_link_node(&cand->rb_node, parent, p);
206 rb_insert_color(&cand->rb_node, &list->rb_tree);
207
208 if (list->count <= list->maxcount) {
209 list->count++;
210 return NULL;
211 }
212 cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
213 rb_erase(&cand->rb_node, &list->rb_tree);
214 cand->list = NULL;
215 return cand;
216}
217
218static void remove_from_list(struct gc_candidate *cand)
219{
220 struct candidate_list *list = cand->list;
221
222 rb_erase(&cand->rb_node, &list->rb_tree);
223 list->count--;
224}
225
226static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
227{
228 struct logfs_super *super = logfs_super(sb);
229
230 btree_remove32(&super->s_cand_tree, cand->segno);
231 kfree(cand);
232}
233
234u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
235{
236 struct gc_candidate *cand;
237 u32 segno;
238
239 BUG_ON(list->count == 0);
240
241 cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
242 remove_from_list(cand);
243 segno = cand->segno;
244 if (ec)
245 *ec = cand->erase_count;
246 free_candidate(sb, cand);
247 return segno;
248}
249
250/*
251 * We have several lists to manage segments with. The reserve_list is used to
252 * deal with bad blocks. We try to keep the best (lowest ec) segments on this
253 * list.
254 * The free_list contains free segments for normal usage. It usually gets the
255 * second pick after the reserve_list. But when the free_list is running short
256 * it is more important to keep the free_list full than to keep a reserve.
257 *
258 * Segments that are not free are put onto a per-level low_list. If we have
259 * to run garbage collection, we pick a candidate from there. All segments on
260 * those lists should have at least some free space so GC will make progress.
261 *
262 * And last we have the ec_list, which is used to pick segments for wear
263 * leveling.
264 *
265 * If all appropriate lists are full, we simply free the candidate and forget
266 * about that segment for a while. We have better candidates for each purpose.
267 */
268static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
269{
270 struct logfs_super *super = logfs_super(sb);
271 u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;
272
273 if (cand->valid == 0) {
274 /* 100% free segments */
275 log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
276 cand->segno, cand->erase_count,
277 dev_ofs(sb, cand->segno, 0));
278 cand = add_list(cand, &super->s_reserve_list);
279 if (cand) {
280 log_gc_noisy("add free segment %x (ec %x) at %llx\n",
281 cand->segno, cand->erase_count,
282 dev_ofs(sb, cand->segno, 0));
283 cand = add_list(cand, &super->s_free_list);
284 }
285 } else {
286 /* good candidates for Garbage Collection */
287 if (cand->valid < full)
288 cand = add_list(cand, &super->s_low_list[cand->dist]);
289 /* good candidates for wear leveling,
290 * segments that were recently written get ignored */
291 if (cand)
292 cand = add_list(cand, &super->s_ec_list);
293 }
294 if (cand)
295 free_candidate(sb, cand);
296}
297
298static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
299 u8 dist)
300{
301 struct logfs_super *super = logfs_super(sb);
302 struct gc_candidate *cand;
303
304 cand = kmalloc(sizeof(*cand), GFP_NOFS);
305 if (!cand)
306 return -ENOMEM;
307
308 cand->segno = segno;
309 cand->valid = valid;
310 cand->erase_count = ec;
311 cand->dist = dist;
312
313 btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
314 __add_candidate(sb, cand);
315 return 0;
316}
317
318static void remove_segment_from_lists(struct super_block *sb, u32 segno)
319{
320 struct logfs_super *super = logfs_super(sb);
321 struct gc_candidate *cand;
322
323 cand = btree_lookup32(&super->s_cand_tree, segno);
324 if (cand) {
325 remove_from_list(cand);
326 free_candidate(sb, cand);
327 }
328}
329
330static void scan_segment(struct super_block *sb, u32 segno)
331{
332 u32 valid, ec = 0;
333 gc_level_t gc_level = 0;
334 u8 dist;
335
336 if (segment_is_reserved(sb, segno))
337 return;
338
339 remove_segment_from_lists(sb, segno);
340 valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
341 if (valid == RESERVED)
342 return;
343
344 dist = root_distance(sb, gc_level);
345 add_candidate(sb, segno, valid, ec, dist);
346}
347
348static struct gc_candidate *first_in_list(struct candidate_list *list)
349{
350 if (list->count == 0)
351 return NULL;
352 return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
353}
354
355/*
356 * Find the best segment for garbage collection. Main criterion is
357 * the segment requiring the least effort to clean. Secondary
358 * criterion is to GC on the lowest level available.
359 *
360 * So we search the least effort segment on the lowest level first,
361 * then move up and pick another segment iff it requires significantly
362 * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
363 */
364static struct gc_candidate *get_candidate(struct super_block *sb)
365{
366 struct logfs_super *super = logfs_super(sb);
367 int i, max_dist;
368 struct gc_candidate *cand = NULL, *this;
369
370 max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS);
371
372 for (i = max_dist; i >= 0; i--) {
373 this = first_in_list(&super->s_low_list[i]);
374 if (!this)
375 continue;
376 if (!cand)
377 cand = this;
378 if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
379 cand = this;
380 }
381 return cand;
382}
383
384static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
385{
386 struct logfs_super *super = logfs_super(sb);
387 gc_level_t gc_level;
388 u32 cleaned, valid, segno, ec;
389 u8 dist;
390
391 if (!cand) {
392 log_gc("GC attempted, but no candidate found\n");
393 return 0;
394 }
395
396 segno = cand->segno;
397 dist = cand->dist;
398 valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
399 free_candidate(sb, cand);
400 log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
401 segno, (u64)segno << super->s_segshift,
402 dist, no_free_segments(sb), valid,
403 super->s_free_bytes);
404 cleaned = logfs_gc_segment(sb, segno, dist);
405 log_gc("GC segment #%02x complete - now %x valid\n", segno,
406 valid - cleaned);
407 BUG_ON(cleaned != valid);
408 return 1;
409}
410
411static int logfs_gc_once(struct super_block *sb)
412{
413 struct gc_candidate *cand;
414
415 cand = get_candidate(sb);
416 if (cand)
417 remove_from_list(cand);
418 return __logfs_gc_once(sb, cand);
419}
420
421/* returns 1 if a wrap occurs, 0 otherwise */
422static int logfs_scan_some(struct super_block *sb)
423{
424 struct logfs_super *super = logfs_super(sb);
425 u32 segno;
426 int i, ret = 0;
427
428 segno = super->s_sweeper;
429 for (i = SCAN_RATIO; i > 0; i--) {
430 segno++;
431 if (segno >= super->s_no_segs) {
432 segno = 0;
433 ret = 1;
434 /* Break out of the loop. We want to read a single
435 * block from the segment size on next invocation if
436 * SCAN_RATIO is set to match block size
437 */
438 break;
439 }
440
441 scan_segment(sb, segno);
442 }
443 super->s_sweeper = segno;
444 return ret;
445}
446
447/*
448 * In principle, this function should loop forever, looking for GC candidates
449 * and moving data. LogFS is designed in such a way that this loop is
450 * guaranteed to terminate.
451 *
452 * Limiting the loop to some iterations serves purely to catch cases when
453 * these guarantees have failed. An actual endless loop is an obvious bug
454 * and should be reported as such.
455 */
456static void __logfs_gc_pass(struct super_block *sb, int target)
457{
458 struct logfs_super *super = logfs_super(sb);
459 struct logfs_block *block;
460 int round, progress, last_progress = 0;
461
462 if (no_free_segments(sb) >= target &&
463 super->s_no_object_aliases < MAX_OBJ_ALIASES)
464 return;
465
466 log_gc("__logfs_gc_pass(%x)\n", target);
467 for (round = 0; round < SCAN_ROUNDS; ) {
468 if (no_free_segments(sb) >= target)
469 goto write_alias;
470
471 /* Sync in-memory state with on-medium state in case they
472 * diverged */
473 logfs_write_anchor(sb);
474 round += logfs_scan_some(sb);
475 if (no_free_segments(sb) >= target)
476 goto write_alias;
477 progress = logfs_gc_once(sb);
478 if (progress)
479 last_progress = round;
480 else if (round - last_progress > 2)
481 break;
482 continue;
483
484 /*
485 * The goto logic is nasty, I just don't know a better way to
486 * code it. GC is supposed to ensure two things:
487 * 1. Enough free segments are available.
488 * 2. The number of aliases is bounded.
489 * When 1. is achieved, we take a look at 2. and write back
490 * some alias-containing blocks, if necessary. However, after
491 * each such write we need to go back to 1., as writes can
492 * consume free segments.
493 */
494write_alias:
495 if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
496 return;
497 if (list_empty(&super->s_object_alias)) {
498 /* All aliases are still in btree */
499 return;
500 }
501 log_gc("Write back one alias\n");
502 block = list_entry(super->s_object_alias.next,
503 struct logfs_block, alias_list);
504 block->ops->write_block(block);
505 /*
506 * To round off the nasty goto logic, we reset round here. It
507 * is a safety-net for GC not making any progress and limited
508 * to something reasonably small. If we incremented it for every
509 * single alias, the loop could terminate rather quickly.
510 */
511 round = 0;
512 }
513 LOGFS_BUG(sb);
514}
515
516static int wl_ratelimit(struct super_block *sb, u64 *next_event)
517{
518 struct logfs_super *super = logfs_super(sb);
519
520 if (*next_event < super->s_gec) {
521 *next_event = super->s_gec + WL_RATELIMIT;
522 return 0;
523 }
524 return 1;
525}
526
527static void logfs_wl_pass(struct super_block *sb)
528{
529 struct logfs_super *super = logfs_super(sb);
530 struct gc_candidate *wl_cand, *free_cand;
531
532 if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
533 return;
534
535 wl_cand = first_in_list(&super->s_ec_list);
536 if (!wl_cand)
537 return;
538 free_cand = first_in_list(&super->s_free_list);
539 if (!free_cand)
540 return;
541
542 if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
543 remove_from_list(wl_cand);
544 __logfs_gc_once(sb, wl_cand);
545 }
546}
547
548/*
549 * The journal needs wear leveling as well. But moving the journal is an
550 * expensive operation so we try to avoid it as much as possible. And if we
551 * have to do it, we move the whole journal, not individual segments.
552 *
553 * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
554 * calculations. First we check whether moving the journal would be a
555 * significant improvement. That means that a) the current journal segments
556 * have more wear than the future journal segments and b) the current journal
557 * segments have more wear than normal ostore segments.
558 * Rationale for b) is that we don't have to move the journal if it is aging
559 * less than the ostore, even if the reserve segments age even less (they are
560 * excluded from wear leveling, after all).
561 * Next we check that the superblocks have less wear than the journal. Since
562 * moving the journal requires writing the superblocks, we have to protect the
563 * superblocks even more than the journal.
564 *
565 * Also we double the acceptable wear difference, compared to ostore wear
566 * leveling. Journal data is read and rewritten rapidly, comparatively. So
567 * soft errors have much less time to accumulate and we allow the journal to
568 * be a bit worse than the ostore.
569 */
570static void logfs_journal_wl_pass(struct super_block *sb)
571{
572 struct logfs_super *super = logfs_super(sb);
573 struct gc_candidate *cand;
574 u32 min_journal_ec = -1, max_reserve_ec = 0;
575 int i;
576
577 if (wl_ratelimit(sb, &super->s_wl_gec_journal))
578 return;
579
580 if (super->s_reserve_list.count < super->s_no_journal_segs) {
581 /* Reserve is not full enough to move complete journal */
582 return;
583 }
584
585 journal_for_each(i)
586 if (super->s_journal_seg[i])
587 min_journal_ec = min(min_journal_ec,
588 super->s_journal_ec[i]);
589 cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
590 struct gc_candidate, rb_node);
591 max_reserve_ec = cand->erase_count;
592 for (i = 0; i < 2; i++) {
593 struct logfs_segment_entry se;
594 u32 segno = seg_no(sb, super->s_sb_ofs[i]);
595 u32 ec;
596
597 logfs_get_segment_entry(sb, segno, &se);
598 ec = be32_to_cpu(se.ec_level) >> 4;
599 max_reserve_ec = max(max_reserve_ec, ec);
600 }
601
602 if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
603 do_logfs_journal_wl_pass(sb);
604 }
605}
606
607void logfs_gc_pass(struct super_block *sb)
608{
609 struct logfs_super *super = logfs_super(sb);
610
611 //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
612 /* Write journal before free space is getting saturated with dirty
613 * objects.
614 */
615 if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
616 + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
617 logfs_write_anchor(sb);
618 __logfs_gc_pass(sb, super->s_total_levels);
619 logfs_wl_pass(sb);
620 logfs_journal_wl_pass(sb);
621}
622
623static int check_area(struct super_block *sb, int i)
624{
625 struct logfs_super *super = logfs_super(sb);
626 struct logfs_area *area = super->s_area[i];
627 struct logfs_object_header oh;
628 u32 segno = area->a_segno;
629 u32 ofs = area->a_used_bytes;
630 __be32 crc;
631 int err;
632
633 if (!area->a_is_open)
634 return 0;
635
636 for (ofs = area->a_used_bytes;
637 ofs <= super->s_segsize - sizeof(oh);
638 ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) {
639 err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh);
640 if (err)
641 return err;
642
643 if (!memchr_inv(&oh, 0xff, sizeof(oh)))
644 break;
645
646 crc = logfs_crc32(&oh, sizeof(oh) - 4, 4);
647 if (crc != oh.crc) {
648 printk(KERN_INFO "interrupted header at %llx\n",
649 dev_ofs(sb, segno, ofs));
650 return 0;
651 }
652 }
653 if (ofs != area->a_used_bytes) {
654 printk(KERN_INFO "%x bytes unaccounted data found at %llx\n",
655 ofs - area->a_used_bytes,
656 dev_ofs(sb, segno, area->a_used_bytes));
657 area->a_used_bytes = ofs;
658 }
659 return 0;
660}
661
662int logfs_check_areas(struct super_block *sb)
663{
664 int i, err;
665
666 for_each_area(i) {
667 err = check_area(sb, i);
668 if (err)
669 return err;
670 }
671 return 0;
672}
673
674static void logfs_init_candlist(struct candidate_list *list, int maxcount,
675 int sort_by_ec)
676{
677 list->count = 0;
678 list->maxcount = maxcount;
679 list->sort_by_ec = sort_by_ec;
680 list->rb_tree = RB_ROOT;
681}
682
683int logfs_init_gc(struct super_block *sb)
684{
685 struct logfs_super *super = logfs_super(sb);
686 int i;
687
688 btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
689 logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
690 logfs_init_candlist(&super->s_reserve_list,
691 super->s_bad_seg_reserve, 1);
692 for_each_area(i)
693 logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
694 logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
695 return 0;
696}
697
698static void logfs_cleanup_list(struct super_block *sb,
699 struct candidate_list *list)
700{
701 struct gc_candidate *cand;
702
703 while (list->count) {
704 cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
705 rb_node);
706 remove_from_list(cand);
707 free_candidate(sb, cand);
708 }
709 BUG_ON(list->rb_tree.rb_node);
710}
711
712void logfs_cleanup_gc(struct super_block *sb)
713{
714 struct logfs_super *super = logfs_super(sb);
715 int i;
716
717 if (!super->s_free_list.count)
718 return;
719
720 /*
721 * FIXME: The btree may still contain a single empty node. So we
722 * call the grim visitor to clean up that mess. Btree code should
723 * do it for us, really.
724 */
725 btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
726 logfs_cleanup_list(sb, &super->s_free_list);
727 logfs_cleanup_list(sb, &super->s_reserve_list);
728 for_each_area(i)
729 logfs_cleanup_list(sb, &super->s_low_list[i]);
730 logfs_cleanup_list(sb, &super->s_ec_list);
731}
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
new file mode 100644
index 000000000000..14ed27274da2
--- /dev/null
+++ b/fs/logfs/inode.c
@@ -0,0 +1,418 @@
1/*
2 * fs/logfs/inode.c - inode handling code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/slab.h>
10#include <linux/writeback.h>
11#include <linux/backing-dev.h>
12
13/*
14 * How soon to reuse old inode numbers? LogFS doesn't store deleted inodes
15 * on the medium. It therefore also lacks a method to store the previous
16 * generation number for deleted inodes. Instead a single generation number
17 * is stored which will be used for new inodes. Being just a 32bit counter,
18 * this can obviously wrap relatively quickly. So we only reuse inodes if we
19 * know that a fair number of inodes can be created before we have to increment
20 * the generation again - effectively adding some bits to the counter.
21 * But being too aggressive here means we keep a very large and very sparse
22 * inode file, wasting space on indirect blocks.
23 * So what is a good value? Beats me. 64k seems moderately bad on both
24 * fronts, so let's use that for now...
25 *
26 * NFS sucks, as everyone already knows.
27 */
28#define INOS_PER_WRAP (0x10000)
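A rough worked number for the trade-off above, under this reading of the scheme: INOS_PER_WRAP = 0x10000 = 2^16, so handing out 64k fresh inode numbers per generation increment effectively widens the 32-bit generation counter by about 16 bits — on the order of 2^48 create/delete cycles before an (ino, generation) pair can repeat.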
29
30/*
31 * Logfs' requirement to read inodes for garbage collection makes life a bit
32 * harder. GC may have to read inodes that are in I_FREEING state, when they
33 * are being written out - and waiting for GC to make progress, naturally.
34 *
35 * So we cannot just call iget() or some variant of it, but first have to check
36 * whether the inode in question might be in I_FREEING state. Therefore we
37 * maintain our own per-sb list of "almost deleted" inodes and check against
38 * that list first. Normally this should be at most 1-2 entries long.
39 *
40 * Also, inodes have logfs-specific reference counting on top of what the vfs
41 * does. When .destroy_inode is called, normally the reference count will drop
42 * to zero and the inode gets deleted. But if GC accessed the inode, its
43 * refcount will remain nonzero and final deletion will have to wait.
44 *
45 * As a result we have two sets of functions to get/put inodes:
46 * logfs_safe_iget/logfs_safe_iput - safe to call from GC context
47 * logfs_iget/iput - normal version
48 */
49static struct kmem_cache *logfs_inode_cache;
50
51static DEFINE_SPINLOCK(logfs_inode_lock);
52
53static void logfs_inode_setops(struct inode *inode)
54{
55 switch (inode->i_mode & S_IFMT) {
56 case S_IFDIR:
57 inode->i_op = &logfs_dir_iops;
58 inode->i_fop = &logfs_dir_fops;
59 inode->i_mapping->a_ops = &logfs_reg_aops;
60 break;
61 case S_IFREG:
62 inode->i_op = &logfs_reg_iops;
63 inode->i_fop = &logfs_reg_fops;
64 inode->i_mapping->a_ops = &logfs_reg_aops;
65 break;
66 case S_IFLNK:
67 inode->i_op = &logfs_symlink_iops;
68 inode->i_mapping->a_ops = &logfs_reg_aops;
69 break;
70 case S_IFSOCK: /* fall through */
71 case S_IFBLK: /* fall through */
72 case S_IFCHR: /* fall through */
73 case S_IFIFO:
74 init_special_inode(inode, inode->i_mode, inode->i_rdev);
75 break;
76 default:
77 BUG();
78 }
79}
80
81static struct inode *__logfs_iget(struct super_block *sb, ino_t ino)
82{
83 struct inode *inode = iget_locked(sb, ino);
84 int err;
85
86 if (!inode)
87 return ERR_PTR(-ENOMEM);
88 if (!(inode->i_state & I_NEW))
89 return inode;
90
91 err = logfs_read_inode(inode);
92 if (err || inode->i_nlink == 0) {
93 /* inode->i_nlink == 0 can be true when called from
94 * block validator */
95 /* set i_nlink to 0 to prevent caching */
96 inode->i_nlink = 0;
97 logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE;
98 iget_failed(inode);
99 if (!err)
100 err = -ENOENT;
101 return ERR_PTR(err);
102 }
103
104 logfs_inode_setops(inode);
105 unlock_new_inode(inode);
106 return inode;
107}
108
109struct inode *logfs_iget(struct super_block *sb, ino_t ino)
110{
111 BUG_ON(ino == LOGFS_INO_MASTER);
112 BUG_ON(ino == LOGFS_INO_SEGFILE);
113 return __logfs_iget(sb, ino);
114}
115
116/*
117 * is_cached is set to 1 if we hand out a cached inode, 0 otherwise.
118 * this allows logfs_iput to do the right thing later
119 */
120struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
121{
122 struct logfs_super *super = logfs_super(sb);
123 struct logfs_inode *li;
124
125 if (ino == LOGFS_INO_MASTER)
126 return super->s_master_inode;
127 if (ino == LOGFS_INO_SEGFILE)
128 return super->s_segfile_inode;
129
130 spin_lock(&logfs_inode_lock);
131 list_for_each_entry(li, &super->s_freeing_list, li_freeing_list)
132 if (li->vfs_inode.i_ino == ino) {
133 li->li_refcount++;
134 spin_unlock(&logfs_inode_lock);
135 *is_cached = 1;
136 return &li->vfs_inode;
137 }
138 spin_unlock(&logfs_inode_lock);
139
140 *is_cached = 0;
141 return __logfs_iget(sb, ino);
142}
143
144static void __logfs_destroy_inode(struct inode *inode)
145{
146 struct logfs_inode *li = logfs_inode(inode);
147
148 BUG_ON(li->li_block);
149 list_del(&li->li_freeing_list);
150 kmem_cache_free(logfs_inode_cache, li);
151}
152
153static void logfs_destroy_inode(struct inode *inode)
154{
155 struct logfs_inode *li = logfs_inode(inode);
156
157 BUG_ON(list_empty(&li->li_freeing_list));
158 spin_lock(&logfs_inode_lock);
159 li->li_refcount--;
160 if (li->li_refcount == 0)
161 __logfs_destroy_inode(inode);
162 spin_unlock(&logfs_inode_lock);
163}
164
165void logfs_safe_iput(struct inode *inode, int is_cached)
166{
167 if (inode->i_ino == LOGFS_INO_MASTER)
168 return;
169 if (inode->i_ino == LOGFS_INO_SEGFILE)
170 return;
171
172 if (is_cached) {
173 logfs_destroy_inode(inode);
174 return;
175 }
176
177 iput(inode);
178}
179
180static void logfs_init_inode(struct super_block *sb, struct inode *inode)
181{
182 struct logfs_inode *li = logfs_inode(inode);
183 int i;
184
185 li->li_flags = 0;
186 li->li_height = 0;
187 li->li_used_bytes = 0;
188 li->li_block = NULL;
189 inode->i_uid = 0;
190 inode->i_gid = 0;
191 inode->i_size = 0;
192 inode->i_blocks = 0;
193 inode->i_ctime = CURRENT_TIME;
194 inode->i_mtime = CURRENT_TIME;
195 inode->i_nlink = 1;
196 INIT_LIST_HEAD(&li->li_freeing_list);
197
198 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
199 li->li_data[i] = 0;
200
201 return;
202}
203
204static struct inode *logfs_alloc_inode(struct super_block *sb)
205{
206 struct logfs_inode *li;
207
208 li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS);
209 if (!li)
210 return NULL;
211 logfs_init_inode(sb, &li->vfs_inode);
212 return &li->vfs_inode;
213}
214
215/*
216 * In logfs inodes are written to an inode file. The inode file, like any
217 * other file, is managed with an inode. The inode file's inode, aka master
218 * inode, requires special handling in several respects. First, it cannot be
219 * written to the inode file, so it is stored in the journal instead.
220 *
221 * Secondly, this inode cannot be written back and destroyed before all other
222 * inodes have been written. The ordering is important. Linux' VFS is happily
223 * unaware of the ordering constraint and would ordinarily destroy the master
224 * inode at umount time while other inodes are still in use and dirty. Not
225 * good.
226 *
227 * So logfs makes sure the master inode is not written until all other inodes
228 * have been destroyed. Sadly, this method has another side-effect. The VFS
229 * will notice one remaining inode and print a frightening warning message.
230 * Worse, it is impossible to judge whether such a warning was caused by the
231 * master inode or whether other inodes have leaked as well.
232 *
233 * Our attempt at solving this is with logfs_new_meta_inode() below. Its
234 * purpose is to create a new inode that will not trigger the warning if such
235 * an inode is still in use. An ugly hack, no doubt. Suggestions for
236 * improvement are welcome.
237 */
238struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino)
239{
240 struct inode *inode;
241
242 inode = logfs_alloc_inode(sb);
243 if (!inode)
244 return ERR_PTR(-ENOMEM);
245
246 inode->i_mode = S_IFREG;
247 inode->i_ino = ino;
248 inode->i_sb = sb;
249
250 /* This is a blatant copy of alloc_inode code. We'd need alloc_inode
251 * to be nonstatic, alas. */
252 {
253 struct address_space * const mapping = &inode->i_data;
254
255 mapping->a_ops = &logfs_reg_aops;
256 mapping->host = inode;
257 mapping->flags = 0;
258 mapping_set_gfp_mask(mapping, GFP_NOFS);
259 mapping->assoc_mapping = NULL;
260 mapping->backing_dev_info = &default_backing_dev_info;
261 inode->i_mapping = mapping;
262 inode->i_nlink = 1;
263 }
264
265 return inode;
266}
267
268struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino)
269{
270 struct inode *inode;
271 int err;
272
273 inode = logfs_new_meta_inode(sb, ino);
274 if (IS_ERR(inode))
275 return inode;
276
277 err = logfs_read_inode(inode);
278 if (err) {
279 destroy_meta_inode(inode);
280 return ERR_PTR(err);
281 }
282 logfs_inode_setops(inode);
283 return inode;
284}
285
286static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
287{
288 int ret;
289 long flags = WF_LOCK;
290
291 /* Can only happen if creat() failed. Safe to skip. */
292 if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
293 return 0;
294
295 ret = __logfs_write_inode(inode, flags);
296 LOGFS_BUG_ON(ret, inode->i_sb);
297 return ret;
298}
299
300void destroy_meta_inode(struct inode *inode)
301{
302 if (inode) {
303 if (inode->i_data.nrpages)
304 truncate_inode_pages(&inode->i_data, 0);
305 logfs_clear_inode(inode);
306 kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
307 }
308}
309
310/* called with inode_lock held */
311static void logfs_drop_inode(struct inode *inode)
312{
313 struct logfs_super *super = logfs_super(inode->i_sb);
314 struct logfs_inode *li = logfs_inode(inode);
315
316 spin_lock(&logfs_inode_lock);
317 list_move(&li->li_freeing_list, &super->s_freeing_list);
318 spin_unlock(&logfs_inode_lock);
319 generic_drop_inode(inode);
320}
321
322static void logfs_set_ino_generation(struct super_block *sb,
323 struct inode *inode)
324{
325 struct logfs_super *super = logfs_super(sb);
326 u64 ino;
327
328 mutex_lock(&super->s_journal_mutex);
329 ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino);
330 super->s_last_ino = ino;
331 super->s_inos_till_wrap--;
332 if (super->s_inos_till_wrap < 0) {
333 super->s_last_ino = LOGFS_RESERVED_INOS;
334 super->s_generation++;
335 super->s_inos_till_wrap = INOS_PER_WRAP;
336 }
337 inode->i_ino = ino;
338 inode->i_generation = super->s_generation;
339 mutex_unlock(&super->s_journal_mutex);
340}
341
342struct inode *logfs_new_inode(struct inode *dir, int mode)
343{
344 struct super_block *sb = dir->i_sb;
345 struct inode *inode;
346
347 inode = new_inode(sb);
348 if (!inode)
349 return ERR_PTR(-ENOMEM);
350
351 logfs_init_inode(sb, inode);
352
353 /* inherit parent flags */
354 logfs_inode(inode)->li_flags |=
355 logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED;
356
357 inode->i_mode = mode;
358 logfs_set_ino_generation(sb, inode);
359
360 inode->i_uid = current_fsuid();
361 inode->i_gid = current_fsgid();
362 if (dir->i_mode & S_ISGID) {
363 inode->i_gid = dir->i_gid;
364 if (S_ISDIR(mode))
365 inode->i_mode |= S_ISGID;
366 }
367
368 logfs_inode_setops(inode);
369 insert_inode_hash(inode);
370
371 return inode;
372}
373
374static void logfs_init_once(void *_li)
375{
376 struct logfs_inode *li = _li;
377 int i;
378
379 li->li_flags = 0;
380 li->li_used_bytes = 0;
381 li->li_refcount = 1;
382 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
383 li->li_data[i] = 0;
384 inode_init_once(&li->vfs_inode);
385}
386
387static int logfs_sync_fs(struct super_block *sb, int wait)
388{
389 /* FIXME: write anchor */
390 logfs_super(sb)->s_devops->sync(sb);
391 return 0;
392}
393
394const struct super_operations logfs_super_operations = {
395 .alloc_inode = logfs_alloc_inode,
396 .clear_inode = logfs_clear_inode,
397 .delete_inode = logfs_delete_inode,
398 .destroy_inode = logfs_destroy_inode,
399 .drop_inode = logfs_drop_inode,
400 .write_inode = logfs_write_inode,
401 .statfs = logfs_statfs,
402 .sync_fs = logfs_sync_fs,
403};
404
405int logfs_init_inode_cache(void)
406{
407 logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
408 sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
409 logfs_init_once);
410 if (!logfs_inode_cache)
411 return -ENOMEM;
412 return 0;
413}
414
415void logfs_destroy_inode_cache(void)
416{
417 kmem_cache_destroy(logfs_inode_cache);
418}
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
new file mode 100644
index 000000000000..33bd260b8309
--- /dev/null
+++ b/fs/logfs/journal.c
@@ -0,0 +1,891 @@
1/*
2 * fs/logfs/journal.c - journal handling code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/slab.h>
10
11static void logfs_calc_free(struct super_block *sb)
12{
13 struct logfs_super *super = logfs_super(sb);
14 u64 reserve, no_segs = super->s_no_segs;
15 s64 free;
16 int i;
17
18 /* superblock segments */
19 no_segs -= 2;
20 super->s_no_journal_segs = 0;
21 /* journal */
22 journal_for_each(i)
23 if (super->s_journal_seg[i]) {
24 no_segs--;
25 super->s_no_journal_segs++;
26 }
27
28 /* open segments plus one extra per level for GC */
29 no_segs -= 2 * super->s_total_levels;
30
31 free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE);
32 free -= super->s_used_bytes;
33 /* just a bit extra */
34 free -= super->s_total_levels * 4096;
35
36 /* Bad blocks are 'paid' for with speed reserve - the filesystem
37 * simply gets slower as bad blocks accumulate. Until the bad blocks
38 * exceed the speed reserve - then the filesystem gets smaller.
39 */
40 reserve = super->s_bad_segments + super->s_bad_seg_reserve;
41 reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE;
42 reserve = max(reserve, super->s_speed_reserve);
43 free -= reserve;
44 if (free < 0)
45 free = 0;
46
47 super->s_free_bytes = free;
48}
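
/*
 * Worked example with made-up numbers: 1024 segments, 4 journal segments
 * and 6 total levels leave 1024 - 2 - 4 - 2*6 = 1006 segments of payload;
 * used bytes, the per-level slack and the larger of the bad-segment and
 * speed reserves are then subtracted from that.
 */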
49
50static void reserve_sb_and_journal(struct super_block *sb)
51{
52 struct logfs_super *super = logfs_super(sb);
53 struct btree_head32 *head = &super->s_reserved_segments;
54 int i, err;
55
56 err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1,
57 GFP_KERNEL);
58 BUG_ON(err);
59
60 err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1,
61 GFP_KERNEL);
62 BUG_ON(err);
63
64 journal_for_each(i) {
65 if (!super->s_journal_seg[i])
66 continue;
67 err = btree_insert32(head, super->s_journal_seg[i], (void *)1,
68 GFP_KERNEL);
69 BUG_ON(err);
70 }
71}
72
73static void read_dynsb(struct super_block *sb,
74 struct logfs_je_dynsb *dynsb)
75{
76 struct logfs_super *super = logfs_super(sb);
77
78 super->s_gec = be64_to_cpu(dynsb->ds_gec);
79 super->s_sweeper = be64_to_cpu(dynsb->ds_sweeper);
80 super->s_victim_ino = be64_to_cpu(dynsb->ds_victim_ino);
81 super->s_rename_dir = be64_to_cpu(dynsb->ds_rename_dir);
82 super->s_rename_pos = be64_to_cpu(dynsb->ds_rename_pos);
83 super->s_used_bytes = be64_to_cpu(dynsb->ds_used_bytes);
84 super->s_generation = be32_to_cpu(dynsb->ds_generation);
85}
86
87static void read_anchor(struct super_block *sb,
88 struct logfs_je_anchor *da)
89{
90 struct logfs_super *super = logfs_super(sb);
91 struct inode *inode = super->s_master_inode;
92 struct logfs_inode *li = logfs_inode(inode);
93 int i;
94
95 super->s_last_ino = be64_to_cpu(da->da_last_ino);
96 li->li_flags = 0;
97 li->li_height = da->da_height;
98 i_size_write(inode, be64_to_cpu(da->da_size));
99 li->li_used_bytes = be64_to_cpu(da->da_used_bytes);
100
101 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
102 li->li_data[i] = be64_to_cpu(da->da_data[i]);
103}
104
105static void read_erasecount(struct super_block *sb,
106 struct logfs_je_journal_ec *ec)
107{
108 struct logfs_super *super = logfs_super(sb);
109 int i;
110
111 journal_for_each(i)
112 super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]);
113}
114
115static int read_area(struct super_block *sb, struct logfs_je_area *a)
116{
117 struct logfs_super *super = logfs_super(sb);
118 	struct logfs_area *area;
119 u64 ofs;
120 u32 writemask = ~(super->s_writesize - 1);
121
122 if (a->gc_level >= LOGFS_NO_AREAS)
123 return -EIO;
124 if (a->vim != VIM_DEFAULT)
125 return -EIO; /* TODO: close area and continue */
126 	area = super->s_area[a->gc_level];
127 area->a_used_bytes = be32_to_cpu(a->used_bytes);
128 area->a_written_bytes = area->a_used_bytes & writemask;
129 area->a_segno = be32_to_cpu(a->segno);
130 if (area->a_segno)
131 area->a_is_open = 1;
132
133 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
134 if (super->s_writesize > 1)
135 logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
136 else
137 logfs_buf_recover(area, ofs, NULL, 0);
138 return 0;
139}
140
141static void *unpack(void *from, void *to)
142{
143 struct logfs_journal_header *jh = from;
144 void *data = from + sizeof(struct logfs_journal_header);
145 int err;
146 size_t inlen, outlen;
147
148 inlen = be16_to_cpu(jh->h_len);
149 outlen = be16_to_cpu(jh->h_datalen);
150
151 if (jh->h_compr == COMPR_NONE)
152 memcpy(to, data, inlen);
153 else {
154 err = logfs_uncompress(data, to, inlen, outlen);
155 BUG_ON(err);
156 }
157 return to;
158}
159
160static int __read_je_header(struct super_block *sb, u64 ofs,
161 struct logfs_journal_header *jh)
162{
163 struct logfs_super *super = logfs_super(sb);
164 size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
165 + MAX_JOURNAL_HEADER;
166 u16 type, len, datalen;
167 int err;
168
169 /* read header only */
170 err = wbuf_read(sb, ofs, sizeof(*jh), jh);
171 if (err)
172 return err;
173 type = be16_to_cpu(jh->h_type);
174 len = be16_to_cpu(jh->h_len);
175 datalen = be16_to_cpu(jh->h_datalen);
176 if (len > sb->s_blocksize)
177 return -EIO;
178 if ((type < JE_FIRST) || (type > JE_LAST))
179 return -EIO;
180 if (datalen > bufsize)
181 return -EIO;
182 return 0;
183}
184
185static int __read_je_payload(struct super_block *sb, u64 ofs,
186 struct logfs_journal_header *jh)
187{
188 u16 len;
189 int err;
190
191 len = be16_to_cpu(jh->h_len);
192 err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1);
193 if (err)
194 return err;
195 if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) {
196 /* Old code was confused. It forgot about the header length
197 * and stopped calculating the crc 16 bytes before the end
198 * of data - ick!
199 * FIXME: Remove this hack once the old code is fixed.
200 */
201 if (jh->h_crc == logfs_crc32(jh, len, 4))
202 WARN_ON_ONCE(1);
203 else
204 return -EIO;
205 }
206 return 0;
207}
208
209/*
210 * jh needs to be large enough to hold the complete entry, not just the header
211 */
212static int __read_je(struct super_block *sb, u64 ofs,
213 struct logfs_journal_header *jh)
214{
215 int err;
216
217 err = __read_je_header(sb, ofs, jh);
218 if (err)
219 return err;
220 return __read_je_payload(sb, ofs, jh);
221}
222
223static int read_je(struct super_block *sb, u64 ofs)
224{
225 struct logfs_super *super = logfs_super(sb);
226 struct logfs_journal_header *jh = super->s_compressed_je;
227 void *scratch = super->s_je;
228 u16 type, datalen;
229 int err;
230
231 err = __read_je(sb, ofs, jh);
232 if (err)
233 return err;
234 type = be16_to_cpu(jh->h_type);
235 datalen = be16_to_cpu(jh->h_datalen);
236
237 switch (type) {
238 case JE_DYNSB:
239 read_dynsb(sb, unpack(jh, scratch));
240 break;
241 case JE_ANCHOR:
242 read_anchor(sb, unpack(jh, scratch));
243 break;
244 case JE_ERASECOUNT:
245 read_erasecount(sb, unpack(jh, scratch));
246 break;
247 case JE_AREA:
248 		err = read_area(sb, unpack(jh, scratch));
249 break;
250 case JE_OBJ_ALIAS:
251 err = logfs_load_object_aliases(sb, unpack(jh, scratch),
252 datalen);
253 break;
254 default:
255 WARN_ON_ONCE(1);
256 return -EIO;
257 }
258 return err;
259}
260
261static int logfs_read_segment(struct super_block *sb, u32 segno)
262{
263 struct logfs_super *super = logfs_super(sb);
264 struct logfs_journal_header *jh = super->s_compressed_je;
265 u64 ofs, seg_ofs = dev_ofs(sb, segno, 0);
266 u32 h_ofs, last_ofs = 0;
267 u16 len, datalen, last_len = 0;
268 int i, err;
269
270 /* search for most recent commit */
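	/* Journal entries start at 16-byte boundaries and sizeof(*jh) is 16,
	 * so stepping h_ofs by sizeof(*jh) visits every possible header
	 * position; recognized commits are then stepped over wholesale. */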
271 for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) {
272 ofs = seg_ofs + h_ofs;
273 err = __read_je_header(sb, ofs, jh);
274 if (err)
275 continue;
276 if (jh->h_type != cpu_to_be16(JE_COMMIT))
277 continue;
278 err = __read_je_payload(sb, ofs, jh);
279 if (err)
280 continue;
281 len = be16_to_cpu(jh->h_len);
282 datalen = be16_to_cpu(jh->h_datalen);
283 if ((datalen > sizeof(super->s_je_array)) ||
284 (datalen % sizeof(__be64)))
285 continue;
286 last_ofs = h_ofs;
287 last_len = datalen;
288 h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh);
289 }
290 /* read commit */
291 if (last_ofs == 0)
292 return -ENOENT;
293 ofs = seg_ofs + last_ofs;
294 log_journal("Read commit from %llx\n", ofs);
295 err = __read_je(sb, ofs, jh);
296 BUG_ON(err); /* We should have caught it in the scan loop already */
297 if (err)
298 return err;
299 /* uncompress */
300 unpack(jh, super->s_je_array);
301 super->s_no_je = last_len / sizeof(__be64);
302 /* iterate over array */
303 for (i = 0; i < super->s_no_je; i++) {
304 err = read_je(sb, be64_to_cpu(super->s_je_array[i]));
305 if (err)
306 return err;
307 }
308 super->s_journal_area->a_segno = segno;
309 return 0;
310}
311
312static u64 read_gec(struct super_block *sb, u32 segno)
313{
314 struct logfs_segment_header sh;
315 __be32 crc;
316 int err;
317
318 if (!segno)
319 return 0;
320 err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
321 if (err)
322 return 0;
323 crc = logfs_crc32(&sh, sizeof(sh), 4);
324 if (crc != sh.crc) {
325 WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull));
326 /* Most likely it was just erased */
327 return 0;
328 }
329 return be64_to_cpu(sh.gec);
330}
331
332static int logfs_read_journal(struct super_block *sb)
333{
334 struct logfs_super *super = logfs_super(sb);
335 u64 gec[LOGFS_JOURNAL_SEGS], max;
336 u32 segno;
337 int i, max_i;
338
339 max = 0;
340 max_i = -1;
341 journal_for_each(i) {
342 segno = super->s_journal_seg[i];
343 gec[i] = read_gec(sb, super->s_journal_seg[i]);
344 if (gec[i] > max) {
345 max = gec[i];
346 max_i = i;
347 }
348 }
349 if (max_i == -1)
350 return -EIO;
351 /* FIXME: Try older segments in case of error */
352 return logfs_read_segment(sb, super->s_journal_seg[max_i]);
353}
354
355/*
356 * First search the current segment (outer loop), then pick the next segment
357 * in the array, skipping any zero entries (inner loop).
358 */
359static void journal_get_free_segment(struct logfs_area *area)
360{
361 struct logfs_super *super = logfs_super(area->a_sb);
362 int i;
363
364 journal_for_each(i) {
365 if (area->a_segno != super->s_journal_seg[i])
366 continue;
367
368 do {
369 i++;
370 if (i == LOGFS_JOURNAL_SEGS)
371 i = 0;
372 } while (!super->s_journal_seg[i]);
373
374 area->a_segno = super->s_journal_seg[i];
375 area->a_erase_count = ++(super->s_journal_ec[i]);
376 log_journal("Journal now at %x (ec %x)\n", area->a_segno,
377 area->a_erase_count);
378 return;
379 }
380 BUG();
381}
382
383static void journal_get_erase_count(struct logfs_area *area)
384{
385 /* erase count is stored globally and incremented in
386 * journal_get_free_segment() - nothing to do here */
387}
388
389static int journal_erase_segment(struct logfs_area *area)
390{
391 struct super_block *sb = area->a_sb;
392 struct logfs_segment_header sh;
393 u64 ofs;
394 int err;
395
396 err = logfs_erase_segment(sb, area->a_segno, 1);
397 if (err)
398 return err;
399
400 sh.pad = 0;
401 sh.type = SEG_JOURNAL;
402 sh.level = 0;
403 sh.segno = cpu_to_be32(area->a_segno);
404 sh.ec = cpu_to_be32(area->a_erase_count);
405 sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
406 sh.crc = logfs_crc32(&sh, sizeof(sh), 4);
407
408 	/* Enabling this causes a bug in segment.c, so leave it disabled for now. */
409 //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0);
410
411 ofs = dev_ofs(sb, area->a_segno, 0);
412 area->a_used_bytes = ALIGN(sizeof(sh), 16);
413 logfs_buf_write(area, ofs, &sh, sizeof(sh));
414 return 0;
415}
416
417static size_t __logfs_write_header(struct logfs_super *super,
418 struct logfs_journal_header *jh, size_t len, size_t datalen,
419 u16 type, u8 compr)
420{
421 jh->h_len = cpu_to_be16(len);
422 jh->h_type = cpu_to_be16(type);
423 jh->h_datalen = cpu_to_be16(datalen);
424 jh->h_compr = compr;
425 jh->h_pad[0] = 'H';
426 jh->h_pad[1] = 'E';
427 jh->h_pad[2] = 'A';
428 jh->h_pad[3] = 'D';
429 jh->h_pad[4] = 'R';
430 jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4);
431 return ALIGN(len, 16) + sizeof(*jh);
432}
433
434static size_t logfs_write_header(struct logfs_super *super,
435 struct logfs_journal_header *jh, size_t datalen, u16 type)
436{
437 size_t len = datalen;
438
439 return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE);
440}
441
442static inline size_t logfs_journal_erasecount_size(struct logfs_super *super)
443{
444 return LOGFS_JOURNAL_SEGS * sizeof(__be32);
445}
446
447static void *logfs_write_erasecount(struct super_block *sb, void *_ec,
448 u16 *type, size_t *len)
449{
450 struct logfs_super *super = logfs_super(sb);
451 struct logfs_je_journal_ec *ec = _ec;
452 int i;
453
454 journal_for_each(i)
455 ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]);
456 *type = JE_ERASECOUNT;
457 *len = logfs_journal_erasecount_size(super);
458 return ec;
459}
460
461static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore,
462 size_t ignore2)
463{
464 struct logfs_shadow *shadow = _shadow;
465 struct super_block *sb = (void *)_sb;
466 struct logfs_super *super = logfs_super(sb);
467
468 /* consume new space */
469 super->s_free_bytes -= shadow->new_len;
470 super->s_used_bytes += shadow->new_len;
471 super->s_dirty_used_bytes -= shadow->new_len;
472
473 /* free up old space */
474 super->s_free_bytes += shadow->old_len;
475 super->s_used_bytes -= shadow->old_len;
476 super->s_dirty_free_bytes -= shadow->old_len;
477
478 logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len);
479 logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len);
480
481 log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n",
482 shadow->ino, shadow->bix, shadow->gc_level,
483 shadow->old_ofs, shadow->new_ofs,
484 shadow->old_len, shadow->new_len);
485 mempool_free(shadow, super->s_shadow_pool);
486}
487
488static void account_shadows(struct super_block *sb)
489{
490 struct logfs_super *super = logfs_super(sb);
491 struct inode *inode = super->s_master_inode;
492 struct logfs_inode *li = logfs_inode(inode);
493 struct shadow_tree *tree = &super->s_shadow_tree;
494
495 btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow);
496 btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow);
497
498 if (li->li_block) {
499 /*
500 * We never actually use the structure, when attached to the
501 * master inode. But it is easier to always free it here than
502 * to have checks in several places elsewhere when allocating
503 * it.
504 */
505 li->li_block->ops->free_block(sb, li->li_block);
506 }
507 BUG_ON((s64)li->li_used_bytes < 0);
508}
509
510static void *__logfs_write_anchor(struct super_block *sb, void *_da,
511 u16 *type, size_t *len)
512{
513 struct logfs_super *super = logfs_super(sb);
514 struct logfs_je_anchor *da = _da;
515 struct inode *inode = super->s_master_inode;
516 struct logfs_inode *li = logfs_inode(inode);
517 int i;
518
519 da->da_height = li->li_height;
520 da->da_last_ino = cpu_to_be64(super->s_last_ino);
521 da->da_size = cpu_to_be64(i_size_read(inode));
522 da->da_used_bytes = cpu_to_be64(li->li_used_bytes);
523 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
524 da->da_data[i] = cpu_to_be64(li->li_data[i]);
525 *type = JE_ANCHOR;
526 *len = sizeof(*da);
527 return da;
528}
529
530static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb,
531 u16 *type, size_t *len)
532{
533 struct logfs_super *super = logfs_super(sb);
534 struct logfs_je_dynsb *dynsb = _dynsb;
535
536 dynsb->ds_gec = cpu_to_be64(super->s_gec);
537 dynsb->ds_sweeper = cpu_to_be64(super->s_sweeper);
538 dynsb->ds_victim_ino = cpu_to_be64(super->s_victim_ino);
539 dynsb->ds_rename_dir = cpu_to_be64(super->s_rename_dir);
540 dynsb->ds_rename_pos = cpu_to_be64(super->s_rename_pos);
541 dynsb->ds_used_bytes = cpu_to_be64(super->s_used_bytes);
542 dynsb->ds_generation = cpu_to_be32(super->s_generation);
543 *type = JE_DYNSB;
544 *len = sizeof(*dynsb);
545 return dynsb;
546}
547
548static void write_wbuf(struct super_block *sb, struct logfs_area *area,
549 void *wbuf)
550{
551 struct logfs_super *super = logfs_super(sb);
552 struct address_space *mapping = super->s_mapping_inode->i_mapping;
553 u64 ofs;
554 pgoff_t index;
555 int page_ofs;
556 struct page *page;
557
558 ofs = dev_ofs(sb, area->a_segno,
559 area->a_used_bytes & ~(super->s_writesize - 1));
560 index = ofs >> PAGE_SHIFT;
561 page_ofs = ofs & (PAGE_SIZE - 1);
562
563 page = find_lock_page(mapping, index);
564 BUG_ON(!page);
565 memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize);
566 unlock_page(page);
567}
568
569static void *logfs_write_area(struct super_block *sb, void *_a,
570 u16 *type, size_t *len)
571{
572 struct logfs_super *super = logfs_super(sb);
573 struct logfs_area *area = super->s_area[super->s_sum_index];
574 struct logfs_je_area *a = _a;
575
576 a->vim = VIM_DEFAULT;
577 a->gc_level = super->s_sum_index;
578 a->used_bytes = cpu_to_be32(area->a_used_bytes);
579 a->segno = cpu_to_be32(area->a_segno);
580 if (super->s_writesize > 1)
581 write_wbuf(sb, area, a + 1);
582
583 *type = JE_AREA;
584 *len = sizeof(*a) + super->s_writesize;
585 return a;
586}
587
588static void *logfs_write_commit(struct super_block *sb, void *h,
589 u16 *type, size_t *len)
590{
591 struct logfs_super *super = logfs_super(sb);
592
593 *type = JE_COMMIT;
594 *len = super->s_no_je * sizeof(__be64);
595 return super->s_je_array;
596}
597
598static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
599 size_t len)
600{
601 struct logfs_super *super = logfs_super(sb);
602 void *header = super->s_compressed_je;
603 void *data = header + sizeof(struct logfs_journal_header);
604 ssize_t compr_len, pad_len;
605 u8 compr = COMPR_ZLIB;
606
607 if (len == 0)
608 return logfs_write_header(super, header, 0, type);
609
610 compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
611 if (compr_len < 0 || type == JE_ANCHOR) {
612 BUG_ON(len > sb->s_blocksize);
613 memcpy(data, buf, len);
614 compr_len = len;
615 compr = COMPR_NONE;
616 }
617
618 pad_len = ALIGN(compr_len, 16);
619 memset(data + compr_len, 0, pad_len - compr_len);
620
621 return __logfs_write_header(super, header, compr_len, len, type, compr);
622}
623
624static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes,
625 int must_pad)
626{
627 u32 writesize = logfs_super(area->a_sb)->s_writesize;
628 s32 ofs;
629 int ret;
630
631 ret = logfs_open_area(area, *bytes);
632 if (ret)
633 return -EAGAIN;
634
635 ofs = area->a_used_bytes;
636 area->a_used_bytes += *bytes;
637
638 if (must_pad) {
639 area->a_used_bytes = ALIGN(area->a_used_bytes, writesize);
640 *bytes = area->a_used_bytes - ofs;
641 }
642
643 return dev_ofs(area->a_sb, area->a_segno, ofs);
644}
645
646static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type,
647 size_t buf_len)
648{
649 struct logfs_super *super = logfs_super(sb);
650 struct logfs_area *area = super->s_journal_area;
651 struct logfs_journal_header *jh = super->s_compressed_je;
652 size_t len;
653 int must_pad = 0;
654 s64 ofs;
655
656 len = __logfs_write_je(sb, buf, type, buf_len);
657 if (jh->h_type == cpu_to_be16(JE_COMMIT))
658 must_pad = 1;
659
660 ofs = logfs_get_free_bytes(area, &len, must_pad);
661 if (ofs < 0)
662 return ofs;
663 logfs_buf_write(area, ofs, super->s_compressed_je, len);
664 super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs);
665 return 0;
666}
667
668static int logfs_write_je(struct super_block *sb,
669 void* (*write)(struct super_block *sb, void *scratch,
670 u16 *type, size_t *len))
671{
672 void *buf;
673 size_t len;
674 u16 type;
675
676 buf = write(sb, logfs_super(sb)->s_je, &type, &len);
677 return logfs_write_je_buf(sb, buf, type, len);
678}
679
680int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
681 level_t level, int child_no, __be64 val)
682{
683 struct logfs_super *super = logfs_super(sb);
684 struct logfs_obj_alias *oa = super->s_je;
685 int err = 0, fill = super->s_je_fill;
686
687 log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n",
688 fill, ino, bix, level, child_no, be64_to_cpu(val));
689 oa[fill].ino = cpu_to_be64(ino);
690 oa[fill].bix = cpu_to_be64(bix);
691 oa[fill].val = val;
692 oa[fill].level = (__force u8)level;
693 oa[fill].child_no = cpu_to_be16(child_no);
694 fill++;
695 if (fill >= sb->s_blocksize / sizeof(*oa)) {
696 err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize);
697 fill = 0;
698 }
699
700 super->s_je_fill = fill;
701 return err;
702}
703
704static int logfs_write_obj_aliases(struct super_block *sb)
705{
706 struct logfs_super *super = logfs_super(sb);
707 int err;
708
709 log_journal("logfs_write_obj_aliases: %d aliases to write\n",
710 super->s_no_object_aliases);
711 super->s_je_fill = 0;
712 err = logfs_write_obj_aliases_pagecache(sb);
713 if (err)
714 return err;
715
716 if (super->s_je_fill)
717 err = logfs_write_je_buf(sb, super->s_je, JE_OBJ_ALIAS,
718 super->s_je_fill
719 * sizeof(struct logfs_obj_alias));
720 return err;
721}
722
723/*
724 * Write all journal entries. The goto logic ensures that all journal entries
725 * are written whenever a new segment is used. It is ugly and potentially a
726 * bit wasteful, but robustness is more important. With this we can *always*
727 * erase all journal segments except the one containing the most recent commit.
728 */
729void logfs_write_anchor(struct super_block *sb)
730{
731 struct logfs_super *super = logfs_super(sb);
732 struct logfs_area *area = super->s_journal_area;
733 int i, err;
734
735 if (!(super->s_flags & LOGFS_SB_FLAG_DIRTY))
736 return;
737 super->s_flags &= ~LOGFS_SB_FLAG_DIRTY;
738
739 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
740 mutex_lock(&super->s_journal_mutex);
741
742 /* Do this first or suffer corruption */
743 logfs_sync_segments(sb);
744 account_shadows(sb);
745
746again:
747 super->s_no_je = 0;
748 for_each_area(i) {
749 if (!super->s_area[i]->a_is_open)
750 continue;
751 super->s_sum_index = i;
752 err = logfs_write_je(sb, logfs_write_area);
753 if (err)
754 goto again;
755 }
756 err = logfs_write_obj_aliases(sb);
757 if (err)
758 goto again;
759 err = logfs_write_je(sb, logfs_write_erasecount);
760 if (err)
761 goto again;
762 err = logfs_write_je(sb, __logfs_write_anchor);
763 if (err)
764 goto again;
765 err = logfs_write_je(sb, logfs_write_dynsb);
766 if (err)
767 goto again;
768 /*
769 * Order is imperative. First we sync all writes, including the
770 * non-committed journal writes. Then we write the final commit and
771 * sync the current journal segment.
772 * There is a theoretical bug here. Syncing the journal segment will
773 * write a number of journal entries and the final commit. All these
774 * are written in a single operation. If the device layer writes the
775 * data back-to-front, the commit will precede the other journal
776 * entries, leaving a race window.
777 * Two fixes are possible. Preferred is to fix the device layer to
778 * ensure writes happen front-to-back. Alternatively we can insert
779 * another logfs_sync_area() super->s_devops->sync() combo before
780 * writing the commit.
781 */
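	/*
	 * A sketch of the alternative fix mentioned above (deliberately not
	 * enabled here):
	 *
	 *	logfs_sync_area(area);
	 *	super->s_devops->sync(sb);
	 *	err = logfs_write_je(sb, logfs_write_commit);
	 */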
782 /*
783 * On another subject, super->s_devops->sync is usually not necessary.
784 * Unless called from sys_sync or friends, a barrier would suffice.
785 */
786 super->s_devops->sync(sb);
787 err = logfs_write_je(sb, logfs_write_commit);
788 if (err)
789 goto again;
790 log_journal("Write commit to %llx\n",
791 be64_to_cpu(super->s_je_array[super->s_no_je - 1]));
792 logfs_sync_area(area);
793 BUG_ON(area->a_used_bytes != area->a_written_bytes);
794 super->s_devops->sync(sb);
795
796 mutex_unlock(&super->s_journal_mutex);
797 return;
798}
799
800void do_logfs_journal_wl_pass(struct super_block *sb)
801{
802 struct logfs_super *super = logfs_super(sb);
803 struct logfs_area *area = super->s_journal_area;
804 struct btree_head32 *head = &super->s_reserved_segments;
805 u32 segno, ec;
806 int i, err;
807
808 log_journal("Journal requires wear-leveling.\n");
809 /* Drop old segments */
810 journal_for_each(i)
811 if (super->s_journal_seg[i]) {
812 btree_remove32(head, super->s_journal_seg[i]);
813 logfs_set_segment_unreserved(sb,
814 super->s_journal_seg[i],
815 super->s_journal_ec[i]);
816 super->s_journal_seg[i] = 0;
817 super->s_journal_ec[i] = 0;
818 }
819 /* Get new segments */
820 for (i = 0; i < super->s_no_journal_segs; i++) {
821 segno = get_best_cand(sb, &super->s_reserve_list, &ec);
822 super->s_journal_seg[i] = segno;
823 super->s_journal_ec[i] = ec;
824 logfs_set_segment_reserved(sb, segno);
825 err = btree_insert32(head, segno, (void *)1, GFP_KERNEL);
826 BUG_ON(err); /* mempool should prevent this */
827 err = logfs_erase_segment(sb, segno, 1);
828 BUG_ON(err); /* FIXME: remount-ro would be nicer */
829 }
830 /* Manually move journal_area */
831 freeseg(sb, area->a_segno);
832 area->a_segno = super->s_journal_seg[0];
833 area->a_is_open = 0;
834 area->a_used_bytes = 0;
835 /* Write journal */
836 logfs_write_anchor(sb);
837 /* Write superblocks */
838 err = logfs_write_sb(sb);
839 BUG_ON(err);
840}
841
842static const struct logfs_area_ops journal_area_ops = {
843 .get_free_segment = journal_get_free_segment,
844 .get_erase_count = journal_get_erase_count,
845 .erase_segment = journal_erase_segment,
846};
847
848int logfs_init_journal(struct super_block *sb)
849{
850 struct logfs_super *super = logfs_super(sb);
851 size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
852 + MAX_JOURNAL_HEADER;
853 int ret = -ENOMEM;
854
855 mutex_init(&super->s_journal_mutex);
856 btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool);
857
858 super->s_je = kzalloc(bufsize, GFP_KERNEL);
859 if (!super->s_je)
860 return ret;
861
862 super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL);
863 if (!super->s_compressed_je)
864 return ret;
865
866 super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER);
867 if (IS_ERR(super->s_master_inode))
868 return PTR_ERR(super->s_master_inode);
869
870 ret = logfs_read_journal(sb);
871 if (ret)
872 return -EIO;
873
874 reserve_sb_and_journal(sb);
875 logfs_calc_free(sb);
876
877 super->s_journal_area->a_ops = &journal_area_ops;
878 return 0;
879}
880
881void logfs_cleanup_journal(struct super_block *sb)
882{
883 struct logfs_super *super = logfs_super(sb);
884
885 btree_grim_visitor32(&super->s_reserved_segments, 0, NULL);
886 destroy_meta_inode(super->s_master_inode);
887 super->s_master_inode = NULL;
888
889 kfree(super->s_compressed_je);
890 kfree(super->s_je);
891}
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
new file mode 100644
index 000000000000..b84b0eec6024
--- /dev/null
+++ b/fs/logfs/logfs.h
@@ -0,0 +1,725 @@
1/*
2 * fs/logfs/logfs.h
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Private header for logfs.
9 */
10#ifndef FS_LOGFS_LOGFS_H
11#define FS_LOGFS_LOGFS_H
12
13#undef __CHECK_ENDIAN__
14#define __CHECK_ENDIAN__
15
16#include <linux/btree.h>
17#include <linux/crc32.h>
18#include <linux/fs.h>
19#include <linux/kernel.h>
20#include <linux/mempool.h>
21#include <linux/pagemap.h>
22#include <linux/mtd/mtd.h>
23#include "logfs_abi.h"
24
25#define LOGFS_DEBUG_SUPER (0x0001)
26#define LOGFS_DEBUG_SEGMENT (0x0002)
27#define LOGFS_DEBUG_JOURNAL (0x0004)
28#define LOGFS_DEBUG_DIR (0x0008)
29#define LOGFS_DEBUG_FILE (0x0010)
30#define LOGFS_DEBUG_INODE (0x0020)
31#define LOGFS_DEBUG_READWRITE (0x0040)
32#define LOGFS_DEBUG_GC (0x0080)
33#define LOGFS_DEBUG_GC_NOISY (0x0100)
34#define LOGFS_DEBUG_ALIASES (0x0200)
35#define LOGFS_DEBUG_BLOCKMOVE (0x0400)
36#define LOGFS_DEBUG_ALL (0xffffffff)
37
39/*
40 * To enable specific log messages, simply define LOGFS_DEBUG to match any
41 * or all of the above.
42 */
43#ifndef LOGFS_DEBUG
44#define LOGFS_DEBUG (0)
45#endif
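
/*
 * Example (hypothetical build tweak): to see only journal and GC messages,
 * define
 *
 *	#define LOGFS_DEBUG	(LOGFS_DEBUG_JOURNAL | LOGFS_DEBUG_GC)
 *
 * before the #ifndef guard above.
 */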
46
47#define log_cond(cond, fmt, arg...) do { \
48 if (cond) \
49 printk(KERN_DEBUG fmt, ##arg); \
50} while (0)
51
52#define log_super(fmt, arg...) \
53 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg)
54#define log_segment(fmt, arg...) \
55 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg)
56#define log_journal(fmt, arg...) \
57 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg)
58#define log_dir(fmt, arg...) \
59 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg)
60#define log_file(fmt, arg...) \
61 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg)
62#define log_inode(fmt, arg...) \
63 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg)
64#define log_readwrite(fmt, arg...) \
65 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg)
66#define log_gc(fmt, arg...) \
67 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg)
68#define log_gc_noisy(fmt, arg...) \
69 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg)
70#define log_aliases(fmt, arg...) \
71 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg)
72#define log_blockmove(fmt, arg...) \
73 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg)
74
75#define PG_pre_locked PG_owner_priv_1
76#define PagePreLocked(page) test_bit(PG_pre_locked, &(page)->flags)
77#define SetPagePreLocked(page) set_bit(PG_pre_locked, &(page)->flags)
78#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags)
79
80/* FIXME: This should really be somewhere in the 64bit area. */
81#define LOGFS_LINK_MAX (1<<30)
82
83/* Read-only filesystem */
84#define LOGFS_SB_FLAG_RO 0x0001
85#define LOGFS_SB_FLAG_DIRTY 0x0002
86#define LOGFS_SB_FLAG_OBJ_ALIAS 0x0004
87#define LOGFS_SB_FLAG_SHUTDOWN 0x0008
88
89/* Write Control Flags */
90#define WF_LOCK 0x01 /* take write lock */
91#define WF_WRITE 0x02 /* write block */
92#define WF_DELETE 0x04 /* delete old block */
93
94typedef u8 __bitwise level_t;
95typedef u8 __bitwise gc_level_t;
96
97#define LEVEL(level) ((__force level_t)(level))
98#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level))
99
100#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)), \
101 (__force level_t)((__force u8)(level) - 1) )
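
/*
 * The (void)((level) == LEVEL(1)) part of SUBLEVEL() is a compile-time
 * type check only: comparing against a level_t lets sparse flag callers
 * that pass the wrong bitwise type, and the result is discarded.
 */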
102
103/**
104 * struct logfs_area - area management information
105 *
106 * @a_sb: the superblock this area belongs to
107 * @a_is_open: 1 if the area is currently open, else 0
108 * @a_segno: segment number of area
109 * @a_written_bytes: number of bytes already written back
110 * @a_used_bytes: number of used bytes
111 * @a_ops: area operations (either journal or ostore)
112 * @a_erase_count: erase count
113 * @a_level: GC level
114 */
115struct logfs_area { /* a segment open for writing */
116 struct super_block *a_sb;
117 int a_is_open;
118 u32 a_segno;
119 u32 a_written_bytes;
120 u32 a_used_bytes;
121 const struct logfs_area_ops *a_ops;
122 u32 a_erase_count;
123 gc_level_t a_level;
124};
125
126/**
127 * struct logfs_area_ops - area operations
128 *
129 * @get_free_segment: fill area->a_segno with a free segment's number
130 * @get_erase_count: fill area->a_erase_count (needs area->a_segno)
131 * @erase_segment: erase and setup segment
132 */
133struct logfs_area_ops {
134 void (*get_free_segment)(struct logfs_area *area);
135 void (*get_erase_count)(struct logfs_area *area);
136 int (*erase_segment)(struct logfs_area *area);
137};
138
139/**
140 * struct logfs_device_ops - device access operations
141 *
142 * @readpage: read one page (mm page)
143 * @writeseg: write one segment; may be a partial segment
144 * @erase: erase one segment
145 * @sync: wait until outstanding writes have reached the device
146 * @put_device: release the underlying device
147 */
148struct logfs_device_ops {
149 struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
150 struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs);
151 int (*write_sb)(struct super_block *sb, struct page *page);
152 int (*readpage)(void *_sb, struct page *page);
153 void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
154 int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
155 int ensure_write);
156 void (*sync)(struct super_block *sb);
157 void (*put_device)(struct super_block *sb);
158};
159
160/**
161 * struct candidate_list - list of similar candidates
162 */
163struct candidate_list {
164 struct rb_root rb_tree;
165 int count;
166 int maxcount;
167 int sort_by_ec;
168};
169
170/**
171 * struct gc_candidate - "candidate" segment to be garbage collected next
172 *
173 * @list: list (either free or low)
174 * @segno: segment number
175 * @valid: number of valid bytes
176 * @erase_count: erase count of segment
177 * @dist: distance from tree root
178 *
179 * Candidates can be on two lists. The free list contains electees rather
180 * than candidates - segments that no longer contain any valid data. The
181 * low list contains candidates to be picked for GC. It should be kept
182 * short. It is not required to always pick a perfect candidate. In the
183 * worst case GC will have to move more data than absolutely necessary.
184 */
185struct gc_candidate {
186 struct rb_node rb_node;
187 struct candidate_list *list;
188 u32 segno;
189 u32 valid;
190 u32 erase_count;
191 u8 dist;
192};
193
194/**
195 * struct logfs_journal_entry - temporary structure used during journal scan
196 *
197 * @used: non-zero once the slot holds a scanned entry
198 * @version: normalized version
199 * @len: length; @datalen: data length
200 * @offset: offset
201 */
202struct logfs_journal_entry {
203 int used;
204 s16 version;
205 u16 len;
206 u16 datalen;
207 u64 offset;
208};
209
210enum transaction_state {
211 CREATE_1 = 1,
212 CREATE_2,
213 UNLINK_1,
214 UNLINK_2,
215 CROSS_RENAME_1,
216 CROSS_RENAME_2,
217 TARGET_RENAME_1,
218 TARGET_RENAME_2,
219 TARGET_RENAME_3
220};
221
222/**
223 * struct logfs_transaction - essential fields to support atomic dirops
224 *
225 * @ino: target inode
226 * @dir: inode of directory containing dentry
227 * @pos: pos of dentry in directory
228 */
229struct logfs_transaction {
230 enum transaction_state state;
231 u64 ino;
232 u64 dir;
233 u64 pos;
234};
235
236/**
237 * struct logfs_shadow - old block in the shadow of a not-yet-committed new one
238 * @old_ofs: offset of old block on medium
239 * @new_ofs: offset of new block on medium
240 * @ino: inode number
241 * @bix: block index
242 * @old_len: size of old block, including header
243 * @new_len: size of new block, including header
244 * @gc_level: GC level of the block
245 */
246struct logfs_shadow {
247 u64 old_ofs;
248 u64 new_ofs;
249 u64 ino;
250 u64 bix;
251 int old_len;
252 int new_len;
253 gc_level_t gc_level;
254};
255
256/**
257 * struct shadow_tree
258 * @new: shadows where old_ofs==0, indexed by new_ofs
259 * @old: shadows where old_ofs!=0, indexed by old_ofs
260 */
261struct shadow_tree {
262 struct btree_head64 new;
263 struct btree_head64 old;
264};
265
266struct object_alias_item {
267 struct list_head list;
268 __be64 val;
269 int child_no;
270};
271
272/**
273 * struct logfs_block - contains any block state
274 * @type: indirect block or inode
275 * @full: number of fully populated children
276 * @partial: number of partially populated children
277 *
278 * Most blocks are directly represented by page cache pages. But when a block
279 * becomes dirty, is part of a transaction, contains aliases or is otherwise
280 * special, a struct logfs_block is allocated to track the additional state.
281 * Inodes are very similar to indirect blocks, so they can also get one of
282 * these structures added when appropriate.
283 */
284#define BLOCK_INDIRECT 1 /* Indirect block */
285#define BLOCK_INODE 2 /* Inode */
286struct logfs_block_ops;
287struct logfs_block {
288 struct list_head alias_list;
289 struct list_head item_list;
290 struct super_block *sb;
291 u64 ino;
292 u64 bix;
293 level_t level;
294 struct page *page;
295 struct inode *inode;
296 struct logfs_transaction *ta;
297 unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG];
298 struct logfs_block_ops *ops;
299 int full;
300 int partial;
301 int reserved_bytes;
302};
303
304typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
305 level_t level, int child_no, __be64 val);
306struct logfs_block_ops {
307 void (*write_block)(struct logfs_block *block);
308 gc_level_t (*block_level)(struct logfs_block *block);
309 	void (*free_block)(struct super_block *sb, struct logfs_block *block);
310 int (*write_alias)(struct super_block *sb,
311 struct logfs_block *block,
312 write_alias_t *write_one_alias);
313};
314
315struct logfs_super {
316 struct mtd_info *s_mtd; /* underlying device */
317 struct block_device *s_bdev; /* underlying device */
318 const struct logfs_device_ops *s_devops;/* device access */
319 struct inode *s_master_inode; /* inode file */
320 struct inode *s_segfile_inode; /* segment file */
321 struct inode *s_mapping_inode; /* device mapping */
322 	atomic_t s_pending_writes; /* outstanding bios */
323 long s_flags;
324 mempool_t *s_btree_pool; /* for btree nodes */
325 mempool_t *s_alias_pool; /* aliases in segment.c */
326 u64 s_feature_incompat;
327 u64 s_feature_ro_compat;
328 u64 s_feature_compat;
329 u64 s_feature_flags;
330 u64 s_sb_ofs[2];
331 struct page *s_erase_page; /* for dev_bdev.c */
332 /* alias.c fields */
333 struct btree_head32 s_segment_alias; /* remapped segments */
334 int s_no_object_aliases;
335 struct list_head s_object_alias; /* remapped objects */
336 struct btree_head128 s_object_alias_tree; /* remapped objects */
337 struct mutex s_object_alias_mutex;
338 /* dir.c fields */
339 struct mutex s_dirop_mutex; /* for creat/unlink/rename */
340 u64 s_victim_ino; /* used for atomic dir-ops */
341 u64 s_rename_dir; /* source directory ino */
342 u64 s_rename_pos; /* position of source dd */
343 /* gc.c fields */
344 long s_segsize; /* size of a segment */
345 int s_segshift; /* log2 of segment size */
346 	long s_segmask; /* (1 << s_segshift) - 1 */
347 long s_no_segs; /* segments on device */
348 long s_no_journal_segs; /* segments used for journal */
349 long s_no_blocks; /* blocks per segment */
350 long s_writesize; /* minimum write size */
351 int s_writeshift; /* log2 of write size */
352 u64 s_size; /* filesystem size */
353 struct logfs_area *s_area[LOGFS_NO_AREAS]; /* open segment array */
354 u64 s_gec; /* global erase count */
355 u64 s_wl_gec_ostore; /* time of last wl event */
356 u64 s_wl_gec_journal; /* time of last wl event */
357 u64 s_sweeper; /* current sweeper pos */
358 u8 s_ifile_levels; /* max level of ifile */
359 u8 s_iblock_levels; /* max level of regular files */
360 u8 s_data_levels; /* # of segments to leaf block*/
361 u8 s_total_levels; /* sum of above three */
362 struct btree_head32 s_cand_tree; /* all candidates */
363 struct candidate_list s_free_list; /* 100% free segments */
364 struct candidate_list s_reserve_list; /* Bad segment reserve */
365 struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */
366 struct candidate_list s_ec_list; /* wear level candidates */
367 struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc. */
368 /* inode.c fields */
369 u64 s_last_ino; /* highest ino used */
370 long s_inos_till_wrap;
371 u32 s_generation; /* i_generation for new files */
372 struct list_head s_freeing_list; /* inodes being freed */
373 /* journal.c fields */
374 struct mutex s_journal_mutex;
375 void *s_je; /* journal entry to compress */
376 void *s_compressed_je; /* block to write to journal */
377 u32 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */
378 u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */
379 u64 s_last_version;
380 struct logfs_area *s_journal_area; /* open journal segment */
381 __be64 s_je_array[64];
382 int s_no_je;
383
384 int s_sum_index; /* for the 12 summaries */
385 struct shadow_tree s_shadow_tree;
386 int s_je_fill; /* index of current je */
387 /* readwrite.c fields */
388 struct mutex s_write_mutex;
389 int s_lock_count;
390 mempool_t *s_block_pool; /* struct logfs_block pool */
391 mempool_t *s_shadow_pool; /* struct logfs_shadow pool */
392 /*
393 * Space accounting:
394 * - s_used_bytes specifies space used to store valid data objects.
395 * - s_dirty_used_bytes is space used to store non-committed data
396 * objects. Those objects have already been written themselves,
397 * but they don't become valid until all indirect blocks up to the
398 * journal have been written as well.
399 * - s_dirty_free_bytes is space used to store the old copy of a
400 * replaced object, as long as the replacement is non-committed.
401 * In other words, it is the amount of space freed when all dirty
402 * blocks are written back.
403 * - s_free_bytes is the amount of free space available for any
404 * purpose.
405 * - s_root_reserve is the amount of free space available only to
406 * the root user. Non-privileged users can no longer write once
407 * this watermark has been reached.
408 * - s_speed_reserve is space which remains unused to speed up
409 * garbage collection performance.
410 * - s_dirty_pages is the space reserved for currently dirty pages.
411 * It is a pessimistic estimate, so some/most will get freed on
412 * page writeback.
413 *
414 * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size
415 */
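	/*
	 * Example: while a replaced 4k object is still uncommitted, its old
	 * copy is counted in s_dirty_free_bytes and its new copy in
	 * s_dirty_used_bytes; the journal commit then moves both into the
	 * plain used/free accounting.
	 */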
416 u64 s_free_bytes;
417 u64 s_used_bytes;
418 u64 s_dirty_free_bytes;
419 u64 s_dirty_used_bytes;
420 u64 s_root_reserve;
421 u64 s_speed_reserve;
422 u64 s_dirty_pages;
423 /* Bad block handling:
424 * - s_bad_seg_reserve is a number of segments usually kept
425 * free. When encountering bad blocks, the affected segment's data
426 * is _temporarily_ moved to a reserved segment.
427 * - s_bad_segments is the number of known bad segments.
428 */
429 u32 s_bad_seg_reserve;
430 u32 s_bad_segments;
431};
432
433/**
434 * struct logfs_inode - in-memory inode
435 *
436 * @vfs_inode: struct inode
437 * @li_data: data pointers
438 * @li_used_bytes: number of used bytes
439 * @li_freeing_list: used to track inodes currently being freed
440 * @li_flags: inode flags
441 * @li_refcount: number of internal (GC-induced) references
442 */
443struct logfs_inode {
444 struct inode vfs_inode;
445 u64 li_data[LOGFS_EMBEDDED_FIELDS];
446 u64 li_used_bytes;
447 struct list_head li_freeing_list;
448 struct logfs_block *li_block;
449 u32 li_flags;
450 u8 li_height;
451 int li_refcount;
452};
453
454#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++)
455#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++)
456#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--)
457
458/* compr.c */
459int logfs_compress(void *in, void *out, size_t inlen, size_t outlen);
460int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen);
461int __init logfs_compr_init(void);
462void logfs_compr_exit(void);
463
464/* dev_bdev.c */
465#ifdef CONFIG_BLOCK
466int logfs_get_sb_bdev(struct file_system_type *type, int flags,
467 const char *devname, struct vfsmount *mnt);
468#else
469static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
470 const char *devname, struct vfsmount *mnt)
471{
472 return -ENODEV;
473}
474#endif
475
476/* dev_mtd.c */
477#ifdef CONFIG_MTD
478int logfs_get_sb_mtd(struct file_system_type *type, int flags,
479 int mtdnr, struct vfsmount *mnt);
480#else
481static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags,
482 int mtdnr, struct vfsmount *mnt)
483{
484 return -ENODEV;
485}
486#endif
487
488/* dir.c */
489extern const struct inode_operations logfs_symlink_iops;
490extern const struct inode_operations logfs_dir_iops;
491extern const struct file_operations logfs_dir_fops;
492int logfs_replay_journal(struct super_block *sb);
493
494/* file.c */
495extern const struct inode_operations logfs_reg_iops;
496extern const struct file_operations logfs_reg_fops;
497extern const struct address_space_operations logfs_reg_aops;
498int logfs_readpage(struct file *file, struct page *page);
499int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
500 unsigned long arg);
501int logfs_fsync(struct file *file, struct dentry *dentry, int datasync);
502
503/* gc.c */
504u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
505void logfs_gc_pass(struct super_block *sb);
506int logfs_check_areas(struct super_block *sb);
507int logfs_init_gc(struct super_block *sb);
508void logfs_cleanup_gc(struct super_block *sb);
509
510/* inode.c */
511extern const struct super_operations logfs_super_operations;
512struct inode *logfs_iget(struct super_block *sb, ino_t ino);
513struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie);
514void logfs_safe_iput(struct inode *inode, int cookie);
515struct inode *logfs_new_inode(struct inode *dir, int mode);
516struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino);
517struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino);
518int logfs_init_inode_cache(void);
519void logfs_destroy_inode_cache(void);
520void destroy_meta_inode(struct inode *inode);
521void logfs_set_blocks(struct inode *inode, u64 no);
522/* these logically belong into inode.c but actually reside in readwrite.c */
523int logfs_read_inode(struct inode *inode);
524int __logfs_write_inode(struct inode *inode, long flags);
525void logfs_delete_inode(struct inode *inode);
526void logfs_clear_inode(struct inode *inode);
527
528/* journal.c */
529void logfs_write_anchor(struct super_block *sb);
530int logfs_init_journal(struct super_block *sb);
531void logfs_cleanup_journal(struct super_block *sb);
532int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
533 level_t level, int child_no, __be64 val);
534void do_logfs_journal_wl_pass(struct super_block *sb);
535
536/* readwrite.c */
537pgoff_t logfs_pack_index(u64 bix, level_t level);
538void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level);
539int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
540 loff_t bix, long flags, struct shadow_tree *shadow_tree);
541int logfs_readpage_nolock(struct page *page);
542int logfs_write_buf(struct inode *inode, struct page *page, long flags);
543int logfs_delete(struct inode *inode, pgoff_t index,
544 struct shadow_tree *shadow_tree);
545int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
546 gc_level_t gc_level, long flags);
547int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
548 gc_level_t gc_level);
549int logfs_truncate(struct inode *inode, u64 size);
550u64 logfs_seek_hole(struct inode *inode, u64 bix);
551u64 logfs_seek_data(struct inode *inode, u64 bix);
552int logfs_open_segfile(struct super_block *sb);
553int logfs_init_rw(struct super_block *sb);
554void logfs_cleanup_rw(struct super_block *sb);
555void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta);
556void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta);
557void logfs_write_block(struct logfs_block *block, long flags);
558int logfs_write_obj_aliases_pagecache(struct super_block *sb);
559void logfs_get_segment_entry(struct super_block *sb, u32 segno,
560 struct logfs_segment_entry *se);
561void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment);
562void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
563 gc_level_t gc_level);
564void logfs_set_segment_reserved(struct super_block *sb, u32 segno);
565void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec);
566struct logfs_block *__alloc_block(struct super_block *sb,
567 u64 ino, u64 bix, level_t level);
568void __free_block(struct super_block *sb, struct logfs_block *block);
569void btree_write_block(struct logfs_block *block);
570void initialize_block_counters(struct page *page, struct logfs_block *block,
571 __be64 *array, int page_is_empty);
572int logfs_exist_block(struct inode *inode, u64 bix);
573int get_page_reserve(struct inode *inode, struct page *page);
574extern struct logfs_block_ops indirect_block_ops;
575
576/* segment.c */
577int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase);
578int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf);
579int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix,
580 level_t level);
581int logfs_segment_write(struct inode *inode, struct page *page,
582 struct logfs_shadow *shadow);
583int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow);
584int logfs_load_object_aliases(struct super_block *sb,
585 struct logfs_obj_alias *oa, int count);
586void move_page_to_btree(struct page *page);
587int logfs_init_mapping(struct super_block *sb);
588void logfs_sync_area(struct logfs_area *area);
589void logfs_sync_segments(struct super_block *sb);
590void freeseg(struct super_block *sb, u32 segno);
591
592/* area handling */
593int logfs_init_areas(struct super_block *sb);
594void logfs_cleanup_areas(struct super_block *sb);
595int logfs_open_area(struct logfs_area *area, size_t bytes);
596void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
597 int use_filler);
598
599static inline void logfs_buf_write(struct logfs_area *area, u64 ofs,
600 void *buf, size_t len)
601{
602 __logfs_buf_write(area, ofs, buf, len, 0);
603}
604
605static inline void logfs_buf_recover(struct logfs_area *area, u64 ofs,
606 void *buf, size_t len)
607{
608 __logfs_buf_write(area, ofs, buf, len, 1);
609}
610
611/* super.c */
612struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index);
613void emergency_read_end(struct page *page);
614void logfs_crash_dump(struct super_block *sb);
615void *memchr_inv(const void *s, int c, size_t n);
616int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
617int logfs_get_sb_device(struct file_system_type *type, int flags,
618 struct mtd_info *mtd, struct block_device *bdev,
619 const struct logfs_device_ops *devops, struct vfsmount *mnt);
620int logfs_check_ds(struct logfs_disk_super *ds);
621int logfs_write_sb(struct super_block *sb);
622
623static inline struct logfs_super *logfs_super(struct super_block *sb)
624{
625 return sb->s_fs_info;
626}
627
628static inline struct logfs_inode *logfs_inode(struct inode *inode)
629{
630 return container_of(inode, struct logfs_inode, vfs_inode);
631}
632
633static inline void logfs_set_ro(struct super_block *sb)
634{
635 logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO;
636}
637
638#define LOGFS_BUG(sb) do { \
639 struct super_block *__sb = sb; \
640 logfs_crash_dump(__sb); \
641 logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO; \
642 BUG(); \
643} while (0)
644
645#define LOGFS_BUG_ON(condition, sb) \
646 do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0)
647
648static inline __be32 logfs_crc32(void *data, size_t len, size_t skip)
649{
650 return cpu_to_be32(crc32(~0, data+skip, len-skip));
651}
652
653static inline u8 logfs_type(struct inode *inode)
654{
655 return (inode->i_mode >> 12) & 15;
656}
657
658static inline pgoff_t logfs_index(struct super_block *sb, u64 pos)
659{
660 return pos >> sb->s_blocksize_bits;
661}
662
663static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs)
664{
665 return ((u64)segno << logfs_super(sb)->s_segshift) + ofs;
666}
667
668static inline u32 seg_no(struct super_block *sb, u64 ofs)
669{
670 return ofs >> logfs_super(sb)->s_segshift;
671}
672
673static inline u32 seg_ofs(struct super_block *sb, u64 ofs)
674{
675 return ofs & logfs_super(sb)->s_segmask;
676}
677
678static inline u64 seg_align(struct super_block *sb, u64 ofs)
679{
680 return ofs & ~logfs_super(sb)->s_segmask;
681}
682
683static inline struct logfs_block *logfs_block(struct page *page)
684{
685 return (void *)page->private;
686}
687
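/*
 * Editorial note, not part of the original patch: gc_level encodes both
 * the tree level and whether a block belongs to the ifile (levels 6-11)
 * or a regular file (levels 0-5, see logfs_abi.h). shrink_level() folds
 * a gc_level back to a plain tree level; expand_level() is the reverse.
 */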
688static inline level_t shrink_level(gc_level_t __level)
689{
690 u8 level = (__force u8)__level;
691
692 if (level >= LOGFS_MAX_LEVELS)
693 level -= LOGFS_MAX_LEVELS;
694 return (__force level_t)level;
695}
696
697static inline gc_level_t expand_level(u64 ino, level_t __level)
698{
699 u8 level = (__force u8)__level;
700
701 if (ino == LOGFS_INO_MASTER) {
702 /* ifile has separate areas */
703 level += LOGFS_MAX_LEVELS;
704 }
705 return (__force gc_level_t)level;
706}
707
708static inline int logfs_block_shift(struct super_block *sb, level_t level)
709{
710 level = shrink_level((__force gc_level_t)level);
711 return (__force int)level * (sb->s_blocksize_bits - 3);
712}
713
714static inline u64 logfs_block_mask(struct super_block *sb, level_t level)
715{
716 return ~0ull << logfs_block_shift(sb, level);
717}
718
719static inline struct logfs_area *get_area(struct super_block *sb,
720 gc_level_t gc_level)
721{
722 return logfs_super(sb)->s_area[(__force u8)gc_level];
723}
724
725#endif
diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h
new file mode 100644
index 000000000000..f674725663fe
--- /dev/null
+++ b/fs/logfs/logfs_abi.h
@@ -0,0 +1,629 @@
1/*
2 * fs/logfs/logfs_abi.h
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Public header for logfs.
9 */
10#ifndef FS_LOGFS_LOGFS_ABI_H
11#define FS_LOGFS_LOGFS_ABI_H
12
13/* For out-of-kernel compiles */
14#ifndef BUILD_BUG_ON
15#define BUILD_BUG_ON(condition) /**/
16#endif
17
18#define SIZE_CHECK(type, size) \
19static inline void check_##type(void) \
20{ \
21 BUILD_BUG_ON(sizeof(struct type) != (size)); \
22}
23
24/*
25 * Throughout the logfs code, we're constantly dealing with blocks at
26 * various positions or offsets. To remove confusion, we strictly
27 * distinguish between a "position" - the logical position within a
28 * file and an "offset" - the physical location within the device.
29 *
30 * Any usage of the term offset for a logical location or position for
31 * a physical one is a bug and should get fixed.
32 */
33
34/*
35 * Block are allocated in one of several segments depending on their
36 * level. The following levels are used:
37 * 0 - regular data block
38 * 1 - i1 indirect blocks
39 * 2 - i2 indirect blocks
40 * 3 - i3 indirect blocks
41 * 4 - i4 indirect blocks
42 * 5 - i5 indirect blocks
43 * 6 - ifile data blocks
44 * 7 - ifile i1 indirect blocks
45 * 8 - ifile i2 indirect blocks
46 * 9 - ifile i3 indirect blocks
47 * 10 - ifile i4 indirect blocks
48 * 11 - ifile i5 indirect blocks
49 * Potential levels to be used in the future:
50 * 12 - gc recycled blocks, long-lived data
51 * 13 - replacement blocks, short-lived data
52 *
53 * Levels 1-11 are necessary for robust gc operations and help separate
54 * short-lived metadata from longer-lived file data. In the future,
55 * file data should get separated into several segments based on simple
56 * heuristics. Old data recycled during gc operation is expected to be
57 * long-lived. New data is of uncertain life expectancy. New data
58 * used to replace older blocks in existing files is expected to be
59 * short-lived.
60 */
61
62
63/* Magic numbers. 64bit for superblock, 32bit for statfs f_type */
64#define LOGFS_MAGIC 0x7a3a8e5cb9d5bf67ull
65#define LOGFS_MAGIC_U32 0xc97e8168u
66
67/*
68 * Various blocksize related macros. Blocksize is currently fixed at 4KiB.
69 * Sooner or later that should become configurable and the macros replaced
70 * by something superblock-dependent. Pointers in indirect blocks are and
71 * will remain 64bit.
72 *
73 * LOGFS_BLOCKSIZE - self-explaining
74 * LOGFS_BLOCK_FACTOR - number of pointers per indirect block
75 * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts
76 */
77#define LOGFS_BLOCKSIZE (4096ull)
78#define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64))
79#define LOGFS_BLOCK_BITS (9)
80
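/*
 * Illustrative check in the style of SIZE_CHECK above (an editorial
 * addition, not part of the original patch): with 4KiB blocks and 64bit
 * pointers an indirect block holds 4096/8 == 512 pointers, and
 * 512 == 1 << 9, matching LOGFS_BLOCK_BITS.
 */
static inline void check_block_factor(void)
{
	BUILD_BUG_ON(LOGFS_BLOCK_FACTOR != 512);
	BUILD_BUG_ON(LOGFS_BLOCK_FACTOR != 1ull << LOGFS_BLOCK_BITS);
}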
81/*
82 * Number of blocks at various levels of indirection. There are 16 direct
83 * block pointers plus a single indirect pointer.
84 */
85#define I0_BLOCKS (16)
86#define I1_BLOCKS LOGFS_BLOCK_FACTOR
87#define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS)
88#define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS)
89#define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS)
90#define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS)
91
92#define INDIRECT_INDEX I0_BLOCKS
93#define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1)
94
95/*
96 * Sizes at which files require another level of indirection. Files smaller
97 * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
98 * similar to ext2 fast symlinks.
99 *
100 * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
101 * direct pointers, else through the 1x indirect pointer and so forth.
102 */
103#define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64))
104#define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE)
105#define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE)
106#define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE)
107#define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE)
108#define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE)
109#define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE)
110
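/*
 * Worked out with the fixed 4KiB blocksize above, each level multiplies
 * the reachable range by 512:
 *	LOGFS_I0_SIZE =     16 * 4KiB =  64KiB
 *	LOGFS_I1_SIZE =    512 * 4KiB =   2MiB
 *	LOGFS_I2_SIZE =  512^2 * 4KiB =   1GiB
 *	LOGFS_I3_SIZE =  512^3 * 4KiB = 512GiB
 *	LOGFS_I4_SIZE =  512^4 * 4KiB = 256TiB
 *	LOGFS_I5_SIZE =  512^5 * 4KiB = 128PiB
 */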
111/*
112 * Each indirect block pointer must have this flag set, if all block pointers
113 * behind it are set, i.e. there is no hole hidden in the shadow of this
114 * indirect block pointer.
115 */
116#define LOGFS_FULLY_POPULATED (1ULL << 63)
117#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED)
118
119/*
120 * LogFS needs to separate data into levels. Each level is defined as the
121 * maximal possible distance from the master inode (inode of the inode file).
122 * Data blocks reside on level 0, 1x indirect block on level 1, etc.
123 * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
124 * This effort is necessary to guarantee garbage collection to always make
125 * progress.
126 *
127 * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks,
128 * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is
129 * the maximal number of levels for one file.
130 * LOGFS_NO_AREAS is twice that, as the inode file and regular files are
131 * effectively stacked on top of each other.
132 */
133#define LOGFS_MAX_INDIRECT (5)
134#define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1)
135#define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS)
136
137/* Maximum size of filenames */
138#define LOGFS_MAX_NAMELEN (255)
139
140/* Number of segments in the primary journal. */
141#define LOGFS_JOURNAL_SEGS (16)
142
143/* Maximum number of free/erased/etc. segments in journal entries */
144#define MAX_CACHED_SEGS (64)
145
146
147/*
148 * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store,
149 * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including
150 * its header,
151 * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for
152 * its segment header and the padded space at the end when no further objects
153 * fit.
154 */
155#define LOGFS_OBJECT_HEADERSIZE (0x1c)
156#define LOGFS_SEGMENT_HEADERSIZE (0x18)
157#define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
158#define LOGFS_SEGMENT_RESERVE \
159 (LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
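/*
 * With the current constants: 0x18 + (0x1c + 4096) - 1 == 4147 bytes
 * reserved per segment.
 */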
160
161/*
162 * Segment types:
163 * SEG_SUPER - superblock segments
164 * SEG_JOURNAL - journal segments
165 * SEG_OSTORE - object store segments
166 */
167enum {
168 SEG_SUPER = 0x01,
169 SEG_JOURNAL = 0x02,
170 SEG_OSTORE = 0x03,
171};
172
173/**
174 * struct logfs_segment_header - per-segment header in the ostore
175 *
176 * @crc: crc32 of header (there is no data)
177 * @pad: unused, must be 0
178 * @type: segment type, see above
179 * @level: GC level for all objects in this segment
180 * @segno: segment number
181 * @ec: erase count for this segment
182 * @gec: global erase count at time of writing
183 */
184struct logfs_segment_header {
185 __be32 crc;
186 __be16 pad;
187 __u8 type;
188 __u8 level;
189 __be32 segno;
190 __be32 ec;
191 __be64 gec;
192};
193
194SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
195
196#define LOGFS_FEATURES_INCOMPAT (0ull)
197#define LOGFS_FEATURES_RO_COMPAT (0ull)
198#define LOGFS_FEATURES_COMPAT (0ull)
199
200/**
201 * struct logfs_disk_super - on-medium superblock
202 *
203 * @ds_magic: magic number, must equal LOGFS_MAGIC
204 * @ds_crc: crc32 of structure starting with the next field
205 * @ds_ifile_levels: maximum number of levels for ifile
206 * @ds_iblock_levels: maximum number of levels for regular files
207 * @ds_data_levels: number of separate levels for data
208 * @ds_segment_shift: log2 of segment size
209 * @ds_block_shift: log2 of block size
210 * @ds_write_shift: log2 of write size
211 * @pad0: reserved, must be 0
212 * @ds_filesystem_size: size of the filesystem in bytes
213 * @ds_segment_size: size of a segment in bytes
214 * @ds_bad_seg_reserve: number of segments reserved to handle bad blocks
215 * @ds_feature_incompat: incompatible filesystem features
216 * @ds_feature_ro_compat: read-only compatible filesystem features
217 * @ds_feature_compat: compatible filesystem features
218 * @ds_feature_flags: filesystem flags
219 * @ds_root_reserve: bytes reserved for the superuser
220 * @ds_speed_reserve: bytes reserved to speed up GC
221 * @ds_journal_seg: segments used by primary journal
222 * @ds_super_ofs: device offsets of the two superblock copies
223 *
224 * Contains only read-only fields. Read-write fields like the amount of used
225 * space is tracked in the dynamic superblock, which is stored in the journal.
226 */
227struct logfs_disk_super {
228 struct logfs_segment_header ds_sh;
229 __be64 ds_magic;
230
231 __be32 ds_crc;
232 __u8 ds_ifile_levels;
233 __u8 ds_iblock_levels;
234 __u8 ds_data_levels;
235 __u8 ds_segment_shift;
236 __u8 ds_block_shift;
237 __u8 ds_write_shift;
238 __u8 pad0[6];
239
240 __be64 ds_filesystem_size;
241 __be32 ds_segment_size;
242 __be32 ds_bad_seg_reserve;
243
244 __be64 ds_feature_incompat;
245 __be64 ds_feature_ro_compat;
246
247 __be64 ds_feature_compat;
248 __be64 ds_feature_flags;
249
250 __be64 ds_root_reserve;
251 __be64 ds_speed_reserve;
252
253 __be32 ds_journal_seg[LOGFS_JOURNAL_SEGS];
254
255 __be64 ds_super_ofs[2];
256 __be64 pad3[8];
257};
258
259SIZE_CHECK(logfs_disk_super, 256);
260
261/*
262 * Object types:
263 * OBJ_BLOCK - Data or indirect block
264 * OBJ_INODE - Inode
265 * OBJ_DENTRY - Dentry
266 */
267enum {
268 OBJ_BLOCK = 0x04,
269 OBJ_INODE = 0x05,
270 OBJ_DENTRY = 0x06,
271};
272
273/**
274 * struct logfs_object_header - per-object header in the ostore
275 *
276 * @crc: crc32 of header, excluding data_crc
277 * @len: length of data
278 * @type: object type, see above
279 * @compr: compression type
280 * @ino: inode number
281 * @bix: block index
282 * @data_crc: crc32 of payload
283 */
284struct logfs_object_header {
285 __be32 crc;
286 __be16 len;
287 __u8 type;
288 __u8 compr;
289 __be64 ino;
290 __be64 bix;
291 __be32 data_crc;
292} __attribute__((packed));
293
294SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE);
295
296/*
297 * Reserved inode numbers:
298 * LOGFS_INO_MASTER - master inode (for inode file)
299 * LOGFS_INO_ROOT - root directory
300 * LOGFS_INO_SEGFILE - per-segment used bytes and erase count
301 */
302enum {
303 LOGFS_INO_MAPPING = 0x00,
304 LOGFS_INO_MASTER = 0x01,
305 LOGFS_INO_ROOT = 0x02,
306 LOGFS_INO_SEGFILE = 0x03,
307 LOGFS_RESERVED_INOS = 0x10,
308};
309
310/*
311 * Inode flags. High bits should never be written to the medium. They are
312 * reserved for in-memory usage.
313 * Low bits should either remain in sync with the corresponding FS_*_FL or
314 * reuse slots that obviously don't make sense for logfs.
315 *
316 * LOGFS_IF_DIRTY Inode must be written back
317 * LOGFS_IF_ZOMBIE Inode has been deleted
318 * LOGFS_IF_STILLBORN -ENOSPC happened when creating inode
319 */
320#define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */
321#define LOGFS_IF_DIRTY 0x20000000
322#define LOGFS_IF_ZOMBIE 0x40000000
323#define LOGFS_IF_STILLBORN 0x80000000
324
325/* Flags available to chattr */
326#define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED)
327#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED)
328/* Flags inherited from parent directory on file/directory creation */
329#define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED)
330
331/**
332 * struct logfs_disk_inode - on-medium inode
333 *
334 * @di_mode: file mode
335 * @di_pad: reserved, must be 0
336 * @di_flags: inode flags, see above
337 * @di_uid: user id
338 * @di_gid: group id
339 * @di_ctime: change time
340 * @di_mtime: modify time
341 * @di_refcount: reference count (aka nlink or link count)
342 * @di_generation: inode generation, for nfs
343 * @di_used_bytes: number of bytes used
344 * @di_size: file size
345 * @di_data: data pointers
346 */
347struct logfs_disk_inode {
348 __be16 di_mode;
349 __u8 di_height;
350 __u8 di_pad;
351 __be32 di_flags;
352 __be32 di_uid;
353 __be32 di_gid;
354
355 __be64 di_ctime;
356 __be64 di_mtime;
357
358 __be64 di_atime;
359 __be32 di_refcount;
360 __be32 di_generation;
361
362 __be64 di_used_bytes;
363 __be64 di_size;
364
365 __be64 di_data[LOGFS_EMBEDDED_FIELDS];
366};
367
368SIZE_CHECK(logfs_disk_inode, 200);
369
370#define INODE_POINTER_OFS \
371 (offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64))
372#define INODE_USED_OFS \
373 (offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64))
374#define INODE_SIZE_OFS \
375 (offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64))
376#define INODE_HEIGHT_OFS (0)
377
378/**
379 * struct logfs_disk_dentry - on-medium dentry structure
380 *
381 * @ino: inode number
382 * @namelen: length of file name
383 * @type: file type, identical to bits 12..15 of mode
384 * @name: file name
385 */
386/* FIXME: add 6 bytes of padding to remove the __packed */
387struct logfs_disk_dentry {
388 __be64 ino;
389 __be16 namelen;
390 __u8 type;
391 __u8 name[LOGFS_MAX_NAMELEN];
392} __attribute__((packed));
393
394SIZE_CHECK(logfs_disk_dentry, 266);
395
396#define RESERVED 0xffffffff
397#define BADSEG 0xffffffff
398/**
399 * struct logfs_segment_entry - segment file entry
400 *
401 * @ec_level: erase count and level
402 * @valid: number of valid bytes
403 *
404 * Segment file contains one entry for every segment. ec_level contains the
405 * erasecount in the upper 28 bits and the level in the lower 4 bits. An
406 * ec_level of BADSEG (-1) identifies bad segments. valid contains the number
407 * of valid bytes or RESERVED (-1 again) if the segment is used for either the
408 * superblock or the journal, or when the segment is bad.
409 */
410struct logfs_segment_entry {
411 __be32 ec_level;
412 __be32 valid;
413};
414
415SIZE_CHECK(logfs_segment_entry, 8);
416
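/*
 * Hypothetical helpers (an editorial sketch, not part of the original
 * patch) making the ec_level packing described above explicit. They
 * operate on cpu-order values; the on-medium field is big-endian.
 */
static inline u32 pack_ec_level(u32 ec, u8 level)
{
	return (ec << 4) | (level & 0xf);
}

static inline u32 unpack_ec(u32 ec_level)
{
	return ec_level >> 4;
}

static inline u8 unpack_level(u32 ec_level)
{
	return ec_level & 0xf;
}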
417/**
418 * struct logfs_journal_header - header for journal entries (JEs)
419 *
420 * @h_crc: crc32 of journal entry
421 * @h_len: length of compressed journal entry,
422 * not including header
423 * @h_datalen: length of uncompressed data
424 * @h_type: JE type
425 * @h_compr: compression type
426 * @h_pad: reserved
427 */
428struct logfs_journal_header {
429 __be32 h_crc;
430 __be16 h_len;
431 __be16 h_datalen;
432 __be16 h_type;
433 __u8 h_compr;
434 __u8 h_pad[5];
435};
436
437SIZE_CHECK(logfs_journal_header, 16);
438
439/*
440 * Life expectancy of data.
441 * VIM_DEFAULT - default vim
442 * VIM_SEGFILE - for segment file only - very short-lived
443 * VIM_GC - GC'd data - likely long-lived (not yet defined below)
444 */
445enum logfs_vim {
446 VIM_DEFAULT = 0,
447 VIM_SEGFILE = 1,
448};
449
450/**
451 * struct logfs_je_area - wbuf header
452 *
453 * @segno: segment number of area
454 * @used_bytes: number of bytes already used
455 * @gc_level: GC level
456 * @vim: life expectancy of data
457 *
458 * "Areas" are segments currently being used for writing. There is at least
459 * one area per GC level. Several may be used to separate long-lived from
460 * short-lived data. If an area with unknown vim is encountered, it can
461 * simply be closed.
462 * The write buffer immediately follows this header.
463 */
464struct logfs_je_area {
465 __be32 segno;
466 __be32 used_bytes;
467 __u8 gc_level;
468 __u8 vim;
469} __attribute__((packed));
470
471SIZE_CHECK(logfs_je_area, 10);
472
473#define MAX_JOURNAL_HEADER \
474 (sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))
475
476/**
477 * struct logfs_je_dynsb - dynamic superblock
478 *
479 * @ds_gec: global erase count
480 * @ds_sweeper: current position of GC "sweeper"
481 * @ds_rename_dir: source directory ino (see dir.c documentation)
482 * @ds_rename_pos: position of source dd (see dir.c documentation)
483 * @ds_victim_ino: victims of incomplete dir operation (see dir.c)
484 * @ds_victim_parent: parent inode of victim (see dir.c)
485 * @ds_used_bytes: number of used bytes
486 */
487struct logfs_je_dynsb {
488 __be64 ds_gec;
489 __be64 ds_sweeper;
490
491 __be64 ds_rename_dir;
492 __be64 ds_rename_pos;
493
494 __be64 ds_victim_ino;
495 __be64 ds_victim_parent; /* XXX */
496
497 __be64 ds_used_bytes;
498 __be32 ds_generation;
499 __be32 pad;
500};
501
502SIZE_CHECK(logfs_je_dynsb, 64);
503
504/**
505 * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
506 *
507 * @da_size: size of inode file
508 * @da_last_ino: last created inode
509 * @da_used_bytes: number of bytes used
510 * @da_data: data pointers
511 */
512struct logfs_je_anchor {
513 __be64 da_size;
514 __be64 da_last_ino;
515
516 __be64 da_used_bytes;
517 u8 da_height;
518 u8 pad[7];
519
520 __be64 da_data[LOGFS_EMBEDDED_FIELDS];
521};
522
523SIZE_CHECK(logfs_je_anchor, 168);
524
525/**
526 * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
527 *
528 * @so_segment: segments used for 2nd journal
529 *
530 * Length of the array is given by h_len field in the header.
531 */
532struct logfs_je_spillout {
533 __be64 so_segment[0];
534};
535
536SIZE_CHECK(logfs_je_spillout, 0);
537
538/**
539 * struct logfs_je_journal_ec - erase counts for all journal segments
540 *
541 * @ec: erase count
542 *
543 * Length of the array is given by h_len field in the header.
544 */
545struct logfs_je_journal_ec {
546 __be32 ec[0];
547};
548
549SIZE_CHECK(logfs_je_journal_ec, 0);
550
551/**
552 * struct logfs_je_free_segments - list of free segments with erase count
553 */
554struct logfs_je_free_segments {
555 __be32 segno;
556 __be32 ec;
557};
558
559SIZE_CHECK(logfs_je_free_segments, 8);
560
561/**
562 * struct logfs_seg_alias - list of segment aliases
563 */
564struct logfs_seg_alias {
565 __be32 old_segno;
566 __be32 new_segno;
567};
568
569SIZE_CHECK(logfs_seg_alias, 8);
570
571/**
572 * struct logfs_obj_alias - list of object aliases
573 */
574struct logfs_obj_alias {
575 __be64 ino;
576 __be64 bix;
577 __be64 val;
578 u8 level;
579 u8 pad[5];
580 __be16 child_no;
581};
582
583SIZE_CHECK(logfs_obj_alias, 32);
584
585/**
586 * Compression types.
587 *
588 * COMPR_NONE - uncompressed
589 * COMPR_ZLIB - compressed with zlib
590 */
591enum {
592 COMPR_NONE = 0,
593 COMPR_ZLIB = 1,
594};
595
596/*
597 * Journal entries come in groups of 16. First group contains unique
598 * entries, next groups contain one entry per level
599 *
600 * JE_FIRST - smallest possible journal entry number
601 *
602 * JEG_BASE - base group, containing unique entries
603 * JE_COMMIT - commit entry, validates all previous entries
604 * JE_DYNSB - dynamic superblock, anything that ought to be in the
605 * superblock but cannot because it is read-write data
606 * JE_ANCHOR - anchor aka master inode aka inode file's inode
607 * JE_ERASECOUNT - erase counts for all journal segments
608 * JE_SPILLOUT - unused
609 * JE_OBJ_ALIAS - object aliases
610 * JE_AREA - area description
611 *
612 * JE_LAST - largest possible journal entry number
613 */
614enum {
615 JE_FIRST = 0x01,
616
617 JEG_BASE = 0x00,
618 JE_COMMIT = 0x02,
619 JE_DYNSB = 0x03,
620 JE_ANCHOR = 0x04,
621 JE_ERASECOUNT = 0x05,
622 JE_SPILLOUT = 0x06,
623 JE_OBJ_ALIAS = 0x0d,
624 JE_AREA = 0x0e,
625
626 JE_LAST = 0x0e,
627};
628
629#endif
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
new file mode 100644
index 000000000000..bff40253dfb2
--- /dev/null
+++ b/fs/logfs/readwrite.c
@@ -0,0 +1,2258 @@
1/*
2 * fs/logfs/readwrite.c
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 *
 9 * Actually contains eight sets of very similar functions:
10 * read read blocks from a file
11 * seek_hole find next hole
12 * seek_data find next data block
13 * valid check whether a block still belongs to a file
14 * write write blocks to a file
15 * delete delete a block (for directories and ifile)
16 * rewrite move existing blocks of a file to a new location (gc helper)
17 * truncate truncate a file
18 */
19#include "logfs.h"
20#include <linux/sched.h>
21#include <linux/slab.h>
22
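/*
 * Editorial note, not part of the original patch: unpacking a page
 * index leaves the low bits of the block index zeroed. Indices below
 * I0_BLOCKS (or I1_BLOCKS etc. for higher levels) are served by a lower
 * level, so adjust_bix() clamps to the first index the given level is
 * actually responsible for.
 */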
23static u64 adjust_bix(u64 bix, level_t level)
24{
25 switch (level) {
26 case 0:
27 return bix;
28 case LEVEL(1):
29 return max_t(u64, bix, I0_BLOCKS);
30 case LEVEL(2):
31 return max_t(u64, bix, I1_BLOCKS);
32 case LEVEL(3):
33 return max_t(u64, bix, I2_BLOCKS);
34 case LEVEL(4):
35 return max_t(u64, bix, I3_BLOCKS);
36 case LEVEL(5):
37 return max_t(u64, bix, I4_BLOCKS);
38 default:
39 WARN_ON(1);
40 return bix;
41 }
42}
43
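/* Editorial comment: maxbix(h) == 1 << (9*h), the number of block
 * indices reachable through an indirect tree of height h, e.g. 512 for
 * height 1 and 512^2 for height 2.
 */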
44static inline u64 maxbix(u8 height)
45{
46 return 1ULL << (LOGFS_BLOCK_BITS * height);
47}
48
49/**
50 * The inode address space is cut in two halves. Lower half belongs to data
51 * pages, upper half to indirect blocks. If the high bit (INDIRECT_BIT) is
52 * set, the actual block index (bix) and level can be derived from the page
53 * index.
54 *
55 * The low bits of the block index are zeroed after packing and
56 * unpacking. Since the lowest 9 bits per level of indirection are
57 * ignored at that level anyway, this is harmless.
58 */
59#define ARCH_SHIFT (BITS_PER_LONG - 32)
60#define INDIRECT_BIT (0x80000000UL << ARCH_SHIFT)
61#define LEVEL_SHIFT (28 + ARCH_SHIFT)
62static inline pgoff_t first_indirect_block(void)
63{
64 return INDIRECT_BIT | (1ULL << LEVEL_SHIFT);
65}
66
67pgoff_t logfs_pack_index(u64 bix, level_t level)
68{
69 pgoff_t index;
70
71 BUG_ON(bix >= INDIRECT_BIT);
72 if (level == 0)
73 return bix;
74
75 index = INDIRECT_BIT;
76 index |= (__force long)level << LEVEL_SHIFT;
77 index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS);
78 return index;
79}
80
81void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level)
82{
83 u8 __level;
84
85 if (!(index & INDIRECT_BIT)) {
86 *bix = index;
87 *level = 0;
88 return;
89 }
90
91 __level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT;
92 *level = LEVEL(__level);
93 *bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT;
94 *bix = adjust_bix(*bix, *level);
95 return;
96}
97#undef ARCH_SHIFT
98#undef INDIRECT_BIT
99#undef LEVEL_SHIFT
100
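/*
 * Worked example (editorial; 64bit arch, so ARCH_SHIFT is 32): packing
 * bix 0x40200 at level 2 drops the low 18 bits (2 * LOGFS_BLOCK_BITS)
 * and tags the result with the indirect bit and the level. Unpacking
 * restores level 2 and bix 0x40000; the dropped low bits are harmless
 * because a level-2 block ignores them anyway.
 */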
101/*
102 * Time is stored as nanoseconds since the epoch.
103 */
104static struct timespec be64_to_timespec(__be64 betime)
105{
106 return ns_to_timespec(be64_to_cpu(betime));
107}
108
109static __be64 timespec_to_be64(struct timespec tsp)
110{
111 return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec);
112}
113
114static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode *inode)
115{
116 struct logfs_inode *li = logfs_inode(inode);
117 int i;
118
119 inode->i_mode = be16_to_cpu(di->di_mode);
120 li->li_height = di->di_height;
121 li->li_flags = be32_to_cpu(di->di_flags);
122 inode->i_uid = be32_to_cpu(di->di_uid);
123 inode->i_gid = be32_to_cpu(di->di_gid);
124 inode->i_size = be64_to_cpu(di->di_size);
125 logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes));
126 inode->i_atime = be64_to_timespec(di->di_atime);
127 inode->i_ctime = be64_to_timespec(di->di_ctime);
128 inode->i_mtime = be64_to_timespec(di->di_mtime);
129 inode->i_nlink = be32_to_cpu(di->di_refcount);
130 inode->i_generation = be32_to_cpu(di->di_generation);
131
132 switch (inode->i_mode & S_IFMT) {
133 case S_IFSOCK: /* fall through */
134 case S_IFBLK: /* fall through */
135 case S_IFCHR: /* fall through */
136 case S_IFIFO:
137 inode->i_rdev = be64_to_cpu(di->di_data[0]);
138 break;
139 case S_IFDIR: /* fall through */
140 case S_IFREG: /* fall through */
141 case S_IFLNK:
142 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
143 li->li_data[i] = be64_to_cpu(di->di_data[i]);
144 break;
145 default:
146 BUG();
147 }
148}
149
150static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode *di)
151{
152 struct logfs_inode *li = logfs_inode(inode);
153 int i;
154
155 di->di_mode = cpu_to_be16(inode->i_mode);
156 di->di_height = li->li_height;
157 di->di_pad = 0;
158 di->di_flags = cpu_to_be32(li->li_flags);
159 di->di_uid = cpu_to_be32(inode->i_uid);
160 di->di_gid = cpu_to_be32(inode->i_gid);
161 di->di_size = cpu_to_be64(i_size_read(inode));
162 di->di_used_bytes = cpu_to_be64(li->li_used_bytes);
163 di->di_atime = timespec_to_be64(inode->i_atime);
164 di->di_ctime = timespec_to_be64(inode->i_ctime);
165 di->di_mtime = timespec_to_be64(inode->i_mtime);
166 di->di_refcount = cpu_to_be32(inode->i_nlink);
167 di->di_generation = cpu_to_be32(inode->i_generation);
168
169 switch (inode->i_mode & S_IFMT) {
170 case S_IFSOCK: /* fall through */
171 case S_IFBLK: /* fall through */
172 case S_IFCHR: /* fall through */
173 case S_IFIFO:
174 di->di_data[0] = cpu_to_be64(inode->i_rdev);
175 break;
176 case S_IFDIR: /* fall through */
177 case S_IFREG: /* fall through */
178 case S_IFLNK:
179 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
180 di->di_data[i] = cpu_to_be64(li->li_data[i]);
181 break;
182 default:
183 BUG();
184 }
185}
186
187static void __logfs_set_blocks(struct inode *inode)
188{
189 struct super_block *sb = inode->i_sb;
190 struct logfs_inode *li = logfs_inode(inode);
191
192 inode->i_blocks = ULONG_MAX;
193 if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX)
194 inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9;
195}
196
197void logfs_set_blocks(struct inode *inode, u64 bytes)
198{
199 struct logfs_inode *li = logfs_inode(inode);
200
201 li->li_used_bytes = bytes;
202 __logfs_set_blocks(inode);
203}
204
205static void prelock_page(struct super_block *sb, struct page *page, int lock)
206{
207 struct logfs_super *super = logfs_super(sb);
208
209 BUG_ON(!PageLocked(page));
210 if (lock) {
211 BUG_ON(PagePreLocked(page));
212 SetPagePreLocked(page);
213 } else {
214 /* We are in GC path. */
215 if (PagePreLocked(page))
216 super->s_lock_count++;
217 else
218 SetPagePreLocked(page);
219 }
220}
221
222static void preunlock_page(struct super_block *sb, struct page *page, int lock)
223{
224 struct logfs_super *super = logfs_super(sb);
225
226 BUG_ON(!PageLocked(page));
227 if (lock)
228 ClearPagePreLocked(page);
229 else {
230 /* We are in GC path. */
231 BUG_ON(!PagePreLocked(page));
232 if (super->s_lock_count)
233 super->s_lock_count--;
234 else
235 ClearPagePreLocked(page);
236 }
237}
238
239/*
240 * Logfs is prone to an AB-BA deadlock where one task tries to acquire
241 * s_write_mutex with a locked page and GC tries to get that page while holding
242 * s_write_mutex.
1243 * To solve this issue logfs will ignore the page lock iff the holder of the
1244 * page lock is waiting for s_write_mutex. We annotate this fact by setting
1245 * PG_pre_locked in addition to PG_locked.
246 */
247static void logfs_get_wblocks(struct super_block *sb, struct page *page,
248 int lock)
249{
250 struct logfs_super *super = logfs_super(sb);
251
252 if (page)
253 prelock_page(sb, page, lock);
254
255 if (lock) {
256 mutex_lock(&super->s_write_mutex);
257 logfs_gc_pass(sb);
258 /* FIXME: We also have to check for shadowed space
259 * and mempool fill grade */
260 }
261}
262
263static void logfs_put_wblocks(struct super_block *sb, struct page *page,
264 int lock)
265{
266 struct logfs_super *super = logfs_super(sb);
267
268 if (page)
269 preunlock_page(sb, page, lock);
270 /* Order matters - we must clear PG_pre_locked before releasing
271 * s_write_mutex or we could race against another task. */
272 if (lock)
273 mutex_unlock(&super->s_write_mutex);
274}
275
276static struct page *logfs_get_read_page(struct inode *inode, u64 bix,
277 level_t level)
278{
279 return find_or_create_page(inode->i_mapping,
280 logfs_pack_index(bix, level), GFP_NOFS);
281}
282
283static void logfs_put_read_page(struct page *page)
284{
285 unlock_page(page);
286 page_cache_release(page);
287}
288
289static void logfs_lock_write_page(struct page *page)
290{
291 int loop = 0;
292
293 while (unlikely(!trylock_page(page))) {
294 if (loop++ > 0x1000) {
295 /* Has been observed once so far... */
296 printk(KERN_ERR "stack at %p\n", &loop);
297 BUG();
298 }
299 if (PagePreLocked(page)) {
300 /* Holder of page lock is waiting for us, it
301 * is safe to use this page. */
302 break;
303 }
304 /* Some other process has this page locked and has
305 * nothing to do with us. Wait for it to finish.
306 */
307 schedule();
308 }
309 BUG_ON(!PageLocked(page));
310}
311
312static struct page *logfs_get_write_page(struct inode *inode, u64 bix,
313 level_t level)
314{
315 struct address_space *mapping = inode->i_mapping;
316 pgoff_t index = logfs_pack_index(bix, level);
317 struct page *page;
318 int err;
319
320repeat:
321 page = find_get_page(mapping, index);
322 if (!page) {
323 page = __page_cache_alloc(GFP_NOFS);
324 if (!page)
325 return NULL;
326 err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS);
327 if (unlikely(err)) {
328 page_cache_release(page);
329 if (err == -EEXIST)
330 goto repeat;
331 return NULL;
332 }
333 } else logfs_lock_write_page(page);
334 BUG_ON(!PageLocked(page));
335 return page;
336}
337
338static void logfs_unlock_write_page(struct page *page)
339{
340 if (!PagePreLocked(page))
341 unlock_page(page);
342}
343
344static void logfs_put_write_page(struct page *page)
345{
346 logfs_unlock_write_page(page);
347 page_cache_release(page);
348}
349
350static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level,
351 int rw)
352{
353 if (rw == READ)
354 return logfs_get_read_page(inode, bix, level);
355 else
356 return logfs_get_write_page(inode, bix, level);
357}
358
359static void logfs_put_page(struct page *page, int rw)
360{
361 if (rw == READ)
362 logfs_put_read_page(page);
363 else
364 logfs_put_write_page(page);
365}
366
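/*
 * Editorial comment: __get_bits(val, skip, no) skips 'skip' groups of
 * 'no' bits and extracts the following 'no' bits. Callers use
 * get_bits(bix, SUBLEVEL(level)) to find the slot of the child pointer
 * inside a 'level' indirect block: (bix >> (9 * (level-1))) & 511.
 */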
367static unsigned long __get_bits(u64 val, int skip, int no)
368{
369 u64 ret = val;
370
371 ret >>= skip * no;
372 ret <<= 64 - no;
373 ret >>= 64 - no;
374 return ret;
375}
376
377static unsigned long get_bits(u64 val, level_t skip)
378{
379 return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS);
380}
381
382static inline void init_shadow_tree(struct super_block *sb,
383 struct shadow_tree *tree)
384{
385 struct logfs_super *super = logfs_super(sb);
386
387 btree_init_mempool64(&tree->new, super->s_btree_pool);
388 btree_init_mempool64(&tree->old, super->s_btree_pool);
389}
390
391static void indirect_write_block(struct logfs_block *block)
392{
393 struct page *page;
394 struct inode *inode;
395 int ret;
396
397 page = block->page;
398 inode = page->mapping->host;
399 logfs_lock_write_page(page);
400 ret = logfs_write_buf(inode, page, 0);
401 logfs_unlock_write_page(page);
402 /*
403 * This needs some rework. Unless you want your filesystem to run
404 * completely synchronously (you don't), the filesystem will always
405 * report writes as 'successful' before the actual work has been
406 * done. The actual work gets done here and this is where any errors
407 * will show up. And there isn't much we can do about it, really.
408 *
409 * Some attempts to fix the errors (move from bad blocks, retry io,...)
410 * have already been done, so anything left should be either a broken
411 * device or a bug somewhere in logfs itself. Being relatively new,
412 * the odds currently favor a bug, so for now the line below isn't
413 * entirely tasteless.
414 */
415 BUG_ON(ret);
416}
417
418static void inode_write_block(struct logfs_block *block)
419{
420 struct inode *inode;
421 int ret;
422
423 inode = block->inode;
424 if (inode->i_ino == LOGFS_INO_MASTER)
425 logfs_write_anchor(inode->i_sb);
426 else {
427 ret = __logfs_write_inode(inode, 0);
428 /* see indirect_write_block comment */
429 BUG_ON(ret);
430 }
431}
432
433static gc_level_t inode_block_level(struct logfs_block *block)
434{
435 BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER);
436 return GC_LEVEL(LOGFS_MAX_LEVELS);
437}
438
439static gc_level_t indirect_block_level(struct logfs_block *block)
440{
441 struct page *page;
442 struct inode *inode;
443 u64 bix;
444 level_t level;
445
446 page = block->page;
447 inode = page->mapping->host;
448 logfs_unpack_index(page->index, &bix, &level);
449 return expand_level(inode->i_ino, level);
450}
451
452/*
453 * This silences a false, yet annoying gcc warning. I hate it when my editor
454 * jumps into bitops.h each time I recompile this file.
455 * TODO: Complain to gcc folks about this and upgrade compiler.
456 */
457static unsigned long fnb(const unsigned long *addr,
458 unsigned long size, unsigned long offset)
459{
460 return find_next_bit(addr, size, offset);
461}
462
463static __be64 inode_val0(struct inode *inode)
464{
465 struct logfs_inode *li = logfs_inode(inode);
466 u64 val;
467
468 /*
469 * Explicit shifting generates good code, but must match the format
470 * of the structure. Add some paranoia just in case.
471 */
472 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0);
473 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2);
474 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4);
475
476 val = (u64)inode->i_mode << 48 |
477 (u64)li->li_height << 40 |
478 (u64)li->li_flags;
479 return cpu_to_be64(val);
480}
481
482static int inode_write_alias(struct super_block *sb,
483 struct logfs_block *block, write_alias_t *write_one_alias)
484{
485 struct inode *inode = block->inode;
486 struct logfs_inode *li = logfs_inode(inode);
487 unsigned long pos;
488 u64 ino, bix;
489 __be64 val;
490 level_t level;
491 int err;
492
493 for (pos = 0; ; pos++) {
494 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
495 if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS)
496 return 0;
497
498 switch (pos) {
499 case INODE_HEIGHT_OFS:
500 val = inode_val0(inode);
501 break;
502 case INODE_USED_OFS:
503 val = cpu_to_be64(li->li_used_bytes);
504 break;
505 case INODE_SIZE_OFS:
506 val = cpu_to_be64(i_size_read(inode));
507 break;
508 case INODE_POINTER_OFS ... INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1:
509 val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]);
510 break;
511 default:
512 BUG();
513 }
514
515 ino = LOGFS_INO_MASTER;
516 bix = inode->i_ino;
517 level = LEVEL(0);
518 err = write_one_alias(sb, ino, bix, level, pos, val);
519 if (err)
520 return err;
521 }
522}
523
524static int indirect_write_alias(struct super_block *sb,
525 struct logfs_block *block, write_alias_t *write_one_alias)
526{
527 unsigned long pos;
528 struct page *page = block->page;
529 u64 ino, bix;
530 __be64 *child, val;
531 level_t level;
532 int err;
533
534 for (pos = 0; ; pos++) {
535 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
536 if (pos >= LOGFS_BLOCK_FACTOR)
537 return 0;
538
539 ino = page->mapping->host->i_ino;
540 logfs_unpack_index(page->index, &bix, &level);
541 child = kmap_atomic(page, KM_USER0);
542 val = child[pos];
543 kunmap_atomic(child, KM_USER0);
544 err = write_one_alias(sb, ino, bix, level, pos, val);
545 if (err)
546 return err;
547 }
548}
549
550int logfs_write_obj_aliases_pagecache(struct super_block *sb)
551{
552 struct logfs_super *super = logfs_super(sb);
553 struct logfs_block *block;
554 int err;
555
556 list_for_each_entry(block, &super->s_object_alias, alias_list) {
557 err = block->ops->write_alias(sb, block, write_alias_journal);
558 if (err)
559 return err;
560 }
561 return 0;
562}
563
564void __free_block(struct super_block *sb, struct logfs_block *block)
565{
566 BUG_ON(!list_empty(&block->item_list));
567 list_del(&block->alias_list);
568 mempool_free(block, logfs_super(sb)->s_block_pool);
569}
570
571static void inode_free_block(struct super_block *sb, struct logfs_block *block)
572{
573 struct inode *inode = block->inode;
574
575 logfs_inode(inode)->li_block = NULL;
576 __free_block(sb, block);
577}
578
579static void indirect_free_block(struct super_block *sb,
580 struct logfs_block *block)
581{
582 ClearPagePrivate(block->page);
583 block->page->private = 0;
584 __free_block(sb, block);
585}
586
587
588static struct logfs_block_ops inode_block_ops = {
589 .write_block = inode_write_block,
590 .block_level = inode_block_level,
591 .free_block = inode_free_block,
592 .write_alias = inode_write_alias,
593};
594
595struct logfs_block_ops indirect_block_ops = {
596 .write_block = indirect_write_block,
597 .block_level = indirect_block_level,
598 .free_block = indirect_free_block,
599 .write_alias = indirect_write_alias,
600};
601
602struct logfs_block *__alloc_block(struct super_block *sb,
603 u64 ino, u64 bix, level_t level)
604{
605 struct logfs_super *super = logfs_super(sb);
606 struct logfs_block *block;
607
608 block = mempool_alloc(super->s_block_pool, GFP_NOFS);
609 memset(block, 0, sizeof(*block));
610 INIT_LIST_HEAD(&block->alias_list);
611 INIT_LIST_HEAD(&block->item_list);
612 block->sb = sb;
613 block->ino = ino;
614 block->bix = bix;
615 block->level = level;
616 return block;
617}
618
619static void alloc_inode_block(struct inode *inode)
620{
621 struct logfs_inode *li = logfs_inode(inode);
622 struct logfs_block *block;
623
624 if (li->li_block)
625 return;
626
627 block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0);
628 block->inode = inode;
629 li->li_block = block;
630 block->ops = &inode_block_ops;
631}
632
633void initialize_block_counters(struct page *page, struct logfs_block *block,
634 __be64 *array, int page_is_empty)
635{
636 u64 ptr;
637 int i, start;
638
639 block->partial = 0;
640 block->full = 0;
641 start = 0;
642 if (page->index < first_indirect_block()) {
643 /* Counters are pointless on level 0 */
644 return;
645 }
646 if (page->index == first_indirect_block()) {
647 /* Skip unused pointers */
648 start = I0_BLOCKS;
649 block->full = I0_BLOCKS;
650 }
651 if (!page_is_empty) {
652 for (i = start; i < LOGFS_BLOCK_FACTOR; i++) {
653 ptr = be64_to_cpu(array[i]);
654 if (ptr)
655 block->partial++;
656 if (ptr & LOGFS_FULLY_POPULATED)
657 block->full++;
658 }
659 }
660}
661
662static void alloc_data_block(struct inode *inode, struct page *page)
663{
664 struct logfs_block *block;
665 u64 bix;
666 level_t level;
667
668 if (PagePrivate(page))
669 return;
670
671 logfs_unpack_index(page->index, &bix, &level);
672 block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);
673 block->page = page;
674 SetPagePrivate(page);
675 page->private = (unsigned long)block;
676 block->ops = &indirect_block_ops;
677}
678
679static void alloc_indirect_block(struct inode *inode, struct page *page,
680 int page_is_empty)
681{
682 struct logfs_block *block;
683 __be64 *array;
684
685 if (PagePrivate(page))
686 return;
687
688 alloc_data_block(inode, page);
689
690 block = logfs_block(page);
691 array = kmap_atomic(page, KM_USER0);
692 initialize_block_counters(page, block, array, page_is_empty);
693 kunmap_atomic(array, KM_USER0);
694}
695
696static void block_set_pointer(struct page *page, int index, u64 ptr)
697{
698 struct logfs_block *block = logfs_block(page);
699 __be64 *array;
700 u64 oldptr;
701
702 BUG_ON(!block);
703 array = kmap_atomic(page, KM_USER0);
704 oldptr = be64_to_cpu(array[index]);
705 array[index] = cpu_to_be64(ptr);
706 kunmap_atomic(array, KM_USER0);
707 SetPageUptodate(page);
708
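	/* Editorial comment: incrementally track how many slots are
	 * non-zero (partial) and how many subtrees are fully populated
	 * (full) as this pointer changes. */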
709 block->full += !!(ptr & LOGFS_FULLY_POPULATED)
710 - !!(oldptr & LOGFS_FULLY_POPULATED);
711 block->partial += !!ptr - !!oldptr;
712}
713
714static u64 block_get_pointer(struct page *page, int index)
715{
716 __be64 *block;
717 u64 ptr;
718
719 block = kmap_atomic(page, KM_USER0);
720 ptr = be64_to_cpu(block[index]);
721 kunmap_atomic(block, KM_USER0);
722 return ptr;
723}
724
725static int logfs_read_empty(struct page *page)
726{
727 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
728 return 0;
729}
730
731static int logfs_read_direct(struct inode *inode, struct page *page)
732{
733 struct logfs_inode *li = logfs_inode(inode);
734 pgoff_t index = page->index;
735 u64 block;
736
737 block = li->li_data[index];
738 if (!block)
739 return logfs_read_empty(page);
740
741 return logfs_segment_read(inode, page, block, index, 0);
742}
743
744static int logfs_read_loop(struct inode *inode, struct page *page,
745 int rw_context)
746{
747 struct logfs_inode *li = logfs_inode(inode);
748 u64 bix, bofs = li->li_data[INDIRECT_INDEX];
749 level_t level, target_level;
750 int ret;
751 struct page *ipage;
752
753 logfs_unpack_index(page->index, &bix, &target_level);
754 if (!bofs)
755 return logfs_read_empty(page);
756
757 if (bix >= maxbix(li->li_height))
758 return logfs_read_empty(page);
759
760 for (level = LEVEL(li->li_height);
761 (__force u8)level > (__force u8)target_level;
762 level = SUBLEVEL(level)) {
763 ipage = logfs_get_page(inode, bix, level, rw_context);
764 if (!ipage)
765 return -ENOMEM;
766
767 ret = logfs_segment_read(inode, ipage, bofs, bix, level);
768 if (ret) {
769 logfs_put_read_page(ipage);
770 return ret;
771 }
772
773 bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
774 logfs_put_page(ipage, rw_context);
775 if (!bofs)
776 return logfs_read_empty(page);
777 }
778
779 return logfs_segment_read(inode, page, bofs, bix, 0);
780}
781
782static int logfs_read_block(struct inode *inode, struct page *page,
783 int rw_context)
784{
785 pgoff_t index = page->index;
786
787 if (index < I0_BLOCKS)
788 return logfs_read_direct(inode, page);
789 return logfs_read_loop(inode, page, rw_context);
790}
791
792static int logfs_exist_loop(struct inode *inode, u64 bix)
793{
794 struct logfs_inode *li = logfs_inode(inode);
795 u64 bofs = li->li_data[INDIRECT_INDEX];
796 level_t level;
797 int ret;
798 struct page *ipage;
799
800 if (!bofs)
801 return 0;
802 if (bix >= maxbix(li->li_height))
803 return 0;
804
805 for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
806 ipage = logfs_get_read_page(inode, bix, level);
807 if (!ipage)
808 return -ENOMEM;
809
810 ret = logfs_segment_read(inode, ipage, bofs, bix, level);
811 if (ret) {
812 logfs_put_read_page(ipage);
813 return ret;
814 }
815
816 bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
817 logfs_put_read_page(ipage);
818 if (!bofs)
819 return 0;
820 }
821
822 return 1;
823}
824
825int logfs_exist_block(struct inode *inode, u64 bix)
826{
827 struct logfs_inode *li = logfs_inode(inode);
828
829 if (bix < I0_BLOCKS)
830 return !!li->li_data[bix];
831 return logfs_exist_loop(inode, bix);
832}
833
834static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data)
835{
836 struct logfs_inode *li = logfs_inode(inode);
837
838 for (; bix < I0_BLOCKS; bix++)
839 if (data ^ (li->li_data[bix] == 0))
840 return bix;
841 return I0_BLOCKS;
842}
843
844static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data)
845{
846 struct logfs_inode *li = logfs_inode(inode);
847 __be64 *rblock;
848 u64 increment, bofs = li->li_data[INDIRECT_INDEX];
849 level_t level;
850 int ret, slot;
851 struct page *page;
852
853 BUG_ON(!bofs);
854
855 for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
856 increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1));
857 page = logfs_get_read_page(inode, bix, level);
858 if (!page)
859 return bix;
860
861 ret = logfs_segment_read(inode, page, bofs, bix, level);
862 if (ret) {
863 logfs_put_read_page(page);
864 return bix;
865 }
866
867 slot = get_bits(bix, SUBLEVEL(level));
868 rblock = kmap_atomic(page, KM_USER0);
869 while (slot < LOGFS_BLOCK_FACTOR) {
870 if (data && (rblock[slot] != 0))
871 break;
872 if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED))
873 break;
874 slot++;
875 bix += increment;
876 bix &= ~(increment - 1);
877 }
878 if (slot >= LOGFS_BLOCK_FACTOR) {
879 kunmap_atomic(rblock, KM_USER0);
880 logfs_put_read_page(page);
881 return bix;
882 }
883 bofs = be64_to_cpu(rblock[slot]);
884 kunmap_atomic(rblock, KM_USER0);
885 logfs_put_read_page(page);
886 if (!bofs) {
887 BUG_ON(data);
888 return bix;
889 }
890 }
891 return bix;
892}
893
894/**
895 * logfs_seek_hole - find next hole starting at a given block index
896 * @inode: inode to search in
897 * @bix: block index to start searching
898 *
899 * Returns next hole. If the file doesn't contain any further holes, the
900 * block address next to eof is returned instead.
901 */
902u64 logfs_seek_hole(struct inode *inode, u64 bix)
903{
904 struct logfs_inode *li = logfs_inode(inode);
905
906 if (bix < I0_BLOCKS) {
907 bix = seek_holedata_direct(inode, bix, 0);
908 if (bix < I0_BLOCKS)
909 return bix;
910 }
911
912 if (!li->li_data[INDIRECT_INDEX])
913 return bix;
914 else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
915 bix = maxbix(li->li_height);
916 else {
917 bix = seek_holedata_loop(inode, bix, 0);
918 if (bix < maxbix(li->li_height))
919 return bix;
920 /* Should not happen anymore. But if some port writes semi-
921 * corrupt images (as this one used to) we might run into it.
922 */
923 WARN_ON_ONCE(bix == maxbix(li->li_height));
924 }
925
926 return bix;
927}
928
929static u64 __logfs_seek_data(struct inode *inode, u64 bix)
930{
931 struct logfs_inode *li = logfs_inode(inode);
932
933 if (bix < I0_BLOCKS) {
934 bix = seek_holedata_direct(inode, bix, 1);
935 if (bix < I0_BLOCKS)
936 return bix;
937 }
938
939 if (bix < maxbix(li->li_height)) {
940 if (!li->li_data[INDIRECT_INDEX])
941 bix = maxbix(li->li_height);
942 else
943 return seek_holedata_loop(inode, bix, 1);
944 }
945
946 return bix;
947}
948
949/**
950 * logfs_seek_data - find next data block after a given block index
951 * @inode: inode to search in
952 * @bix: block index to start searching
953 *
954 * Returns next data block. If the file doesn't contain any further data
955 * blocks, the last block in the file is returned instead.
956 */
957u64 logfs_seek_data(struct inode *inode, u64 bix)
958{
959 struct super_block *sb = inode->i_sb;
960 u64 ret, end;
961
962 ret = __logfs_seek_data(inode, bix);
963 end = i_size_read(inode) >> sb->s_blocksize_bits;
964 if (ret >= end)
965 ret = max(bix, end);
966 return ret;
967}
968
969static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs)
970{
971 return pure_ofs(li->li_data[bix]) == ofs;
972}
973
974static int __logfs_is_valid_loop(struct inode *inode, u64 bix,
975 u64 ofs, u64 bofs)
976{
977 struct logfs_inode *li = logfs_inode(inode);
978 level_t level;
979 int ret;
980 struct page *page;
981
982 for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
983 page = logfs_get_write_page(inode, bix, level);
984 BUG_ON(!page);
985
986 ret = logfs_segment_read(inode, page, bofs, bix, level);
987 if (ret) {
988 logfs_put_write_page(page);
989 return 0;
990 }
991
992 bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level)));
993 logfs_put_write_page(page);
994 if (!bofs)
995 return 0;
996
997 if (pure_ofs(bofs) == ofs)
998 return 1;
999 }
1000 return 0;
1001}
1002
1003static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs)
1004{
1005 struct logfs_inode *li = logfs_inode(inode);
1006 u64 bofs = li->li_data[INDIRECT_INDEX];
1007
1008 if (!bofs)
1009 return 0;
1010
1011 if (bix >= maxbix(li->li_height))
1012 return 0;
1013
1014 if (pure_ofs(bofs) == ofs)
1015 return 1;
1016
1017 return __logfs_is_valid_loop(inode, bix, ofs, bofs);
1018}
1019
1020static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs)
1021{
1022 struct logfs_inode *li = logfs_inode(inode);
1023
1024 if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1)
1025 return 0;
1026
1027 if (bix < I0_BLOCKS)
1028 return logfs_is_valid_direct(li, bix, ofs);
1029 return logfs_is_valid_loop(inode, bix, ofs);
1030}
1031
1032/**
1033 * logfs_is_valid_block - check whether this block is still valid
1034 *
1035 * @sb: superblock
1036 * @ofs: block physical offset
1037 * @ino: block inode number
1038 * @bix: block index
1039 * @gc_level: block GC level
1040 *
1041 * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will
1042 * become invalid once the journal is written.
1043 */
1044int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
1045 gc_level_t gc_level)
1046{
1047 struct logfs_super *super = logfs_super(sb);
1048 struct inode *inode;
1049 int ret, cookie;
1050
1051 /* Umount closes a segment with free blocks remaining. Those
1052 * blocks are by definition invalid. */
1053 if (ino == -1)
1054 return 0;
1055
1056 LOGFS_BUG_ON((u64)(u_long)ino != ino, sb);
1057
1058 inode = logfs_safe_iget(sb, ino, &cookie);
1059 if (IS_ERR(inode))
1060 goto invalid;
1061
1062 ret = __logfs_is_valid_block(inode, bix, ofs);
1063 logfs_safe_iput(inode, cookie);
1064 if (ret)
1065 return ret;
1066
1067invalid:
1068 /* Block is nominally invalid, but may still sit in the shadow tree,
1069 * waiting for a journal commit.
1070 */
1071 if (btree_lookup64(&super->s_shadow_tree.old, ofs))
1072 return 2;
1073 return 0;
1074}
1075
1076int logfs_readpage_nolock(struct page *page)
1077{
1078 struct inode *inode = page->mapping->host;
1079 int ret = -EIO;
1080
1081 ret = logfs_read_block(inode, page, READ);
1082
1083 if (ret) {
1084 ClearPageUptodate(page);
1085 SetPageError(page);
1086 } else {
1087 SetPageUptodate(page);
1088 ClearPageError(page);
1089 }
1090 flush_dcache_page(page);
1091
1092 return ret;
1093}
1094
1095static int logfs_reserve_bytes(struct inode *inode, int bytes)
1096{
1097 struct logfs_super *super = logfs_super(inode->i_sb);
1098 u64 available = super->s_free_bytes + super->s_dirty_free_bytes
1099 - super->s_dirty_used_bytes - super->s_dirty_pages;
1100
1101 if (!bytes)
1102 return 0;
1103
1104 if (available < bytes)
1105 return -ENOSPC;
1106
1107 if (available < bytes + super->s_root_reserve &&
1108 !capable(CAP_SYS_RESOURCE))
1109 return -ENOSPC;
1110
1111 return 0;
1112}
1113
1114int get_page_reserve(struct inode *inode, struct page *page)
1115{
1116 struct logfs_super *super = logfs_super(inode->i_sb);
1117 int ret;
1118
1119 if (logfs_block(page) && logfs_block(page)->reserved_bytes)
1120 return 0;
1121
1122 logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
1123 ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE);
1124 if (!ret) {
1125 alloc_data_block(inode, page);
1126 logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
1127 super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
1128 }
1129 logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
1130 return ret;
1131}
1132
1133/*
1134 * We are protected by write lock. Push victims up to superblock level
1135 * and release transaction when appropriate.
1136 */
1137/* FIXME: This is currently called from the wrong spots. */
1138static void logfs_handle_transaction(struct inode *inode,
1139 struct logfs_transaction *ta)
1140{
1141 struct logfs_super *super = logfs_super(inode->i_sb);
1142
1143 if (!ta)
1144 return;
1145 logfs_inode(inode)->li_block->ta = NULL;
1146
1147 if (inode->i_ino != LOGFS_INO_MASTER) {
1148 BUG(); /* FIXME: Yes, this needs more thought */
1149 /* just remember the transaction until inode is written */
1150 //BUG_ON(logfs_inode(inode)->li_transaction);
1151 //logfs_inode(inode)->li_transaction = ta;
1152 return;
1153 }
1154
1155 switch (ta->state) {
1156 case CREATE_1: /* fall through */
1157 case UNLINK_1:
1158 BUG_ON(super->s_victim_ino);
1159 super->s_victim_ino = ta->ino;
1160 break;
1161 case CREATE_2: /* fall through */
1162 case UNLINK_2:
1163 BUG_ON(super->s_victim_ino != ta->ino);
1164 super->s_victim_ino = 0;
1165 /* transaction ends here - free it */
1166 kfree(ta);
1167 break;
1168 case CROSS_RENAME_1:
1169 BUG_ON(super->s_rename_dir);
1170 BUG_ON(super->s_rename_pos);
1171 super->s_rename_dir = ta->dir;
1172 super->s_rename_pos = ta->pos;
1173 break;
1174 case CROSS_RENAME_2:
1175 BUG_ON(super->s_rename_dir != ta->dir);
1176 BUG_ON(super->s_rename_pos != ta->pos);
1177 super->s_rename_dir = 0;
1178 super->s_rename_pos = 0;
1179 kfree(ta);
1180 break;
1181 case TARGET_RENAME_1:
1182 BUG_ON(super->s_rename_dir);
1183 BUG_ON(super->s_rename_pos);
1184 BUG_ON(super->s_victim_ino);
1185 super->s_rename_dir = ta->dir;
1186 super->s_rename_pos = ta->pos;
1187 super->s_victim_ino = ta->ino;
1188 break;
1189 case TARGET_RENAME_2:
1190 BUG_ON(super->s_rename_dir != ta->dir);
1191 BUG_ON(super->s_rename_pos != ta->pos);
1192 BUG_ON(super->s_victim_ino != ta->ino);
1193 super->s_rename_dir = 0;
1194 super->s_rename_pos = 0;
1195 break;
1196 case TARGET_RENAME_3:
1197 BUG_ON(super->s_rename_dir);
1198 BUG_ON(super->s_rename_pos);
1199 BUG_ON(super->s_victim_ino != ta->ino);
1200 super->s_victim_ino = 0;
1201 kfree(ta);
1202 break;
1203 default:
1204 BUG();
1205 }
1206}
1207
1208/*
1209 * Not strictly a reservation, but rather a check that we still have enough
1210 * space to satisfy the write.
1211 */
1212static int logfs_reserve_blocks(struct inode *inode, int blocks)
1213{
1214 return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE);
1215}
1216
1217struct write_control {
1218 u64 ofs;
1219 long flags;
1220};
1221
1222static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix,
1223 level_t level, u64 old_ofs)
1224{
1225 struct logfs_super *super = logfs_super(inode->i_sb);
1226 struct logfs_shadow *shadow;
1227
1228 shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS);
1229 memset(shadow, 0, sizeof(*shadow));
1230 shadow->ino = inode->i_ino;
1231 shadow->bix = bix;
1232 shadow->gc_level = expand_level(inode->i_ino, level);
1233 shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED;
1234 return shadow;
1235}
1236
1237static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
1238{
1239 struct logfs_super *super = logfs_super(inode->i_sb);
1240
1241 mempool_free(shadow, super->s_shadow_pool);
1242}
1243
1244/**
1245 * fill_shadow_tree - Propagate shadow tree changes due to a write
1246 * @inode: Inode owning the page
1247 * @page: Struct page that was written
1248 * @shadow: Shadow for the current write
1249 *
1250 * Writes in logfs can result in two semi-valid objects. The old object
1251 * is still valid as long as it can be reached by following pointers on
1252 * the medium. Only when writes propagate all the way up to the journal
1253 * has the new object safely replaced the old one.
1254 *
1255 * To handle this problem, a struct logfs_shadow is used to represent
1256 * every single write. It is attached to the indirect block, which is
1257 * marked dirty. When the indirect block is written, its shadows are
1258 * handed up to the next indirect block (or inode). Ultimately they
1259 * will reach the master inode and be freed upon journal commit.
1260 *
1261 * This function handles a single step in the propagation. It adds the
1262 * shadow for the current write to the tree, along with any shadows in
1263 * the page's tree, in case it was an indirect block. If a page is
1264 * written, the inode parameter is left NULL, if an inode is written,
1265 * the page parameter is left NULL.
1266 */
1267static void fill_shadow_tree(struct inode *inode, struct page *page,
1268 struct logfs_shadow *shadow)
1269{
1270 struct logfs_super *super = logfs_super(inode->i_sb);
1271 struct logfs_block *block = logfs_block(page);
1272 struct shadow_tree *tree = &super->s_shadow_tree;
1273
1274 if (PagePrivate(page)) {
1275 if (block->alias_map)
1276 super->s_no_object_aliases -= bitmap_weight(
1277 block->alias_map, LOGFS_BLOCK_FACTOR);
1278 logfs_handle_transaction(inode, block->ta);
1279 block->ops->free_block(inode->i_sb, block);
1280 }
1281 if (shadow) {
1282 if (shadow->old_ofs)
1283 btree_insert64(&tree->old, shadow->old_ofs, shadow,
1284 GFP_NOFS);
1285 else
1286 btree_insert64(&tree->new, shadow->new_ofs, shadow,
1287 GFP_NOFS);
1288
1289 super->s_dirty_used_bytes += shadow->new_len;
1290 super->s_dirty_free_bytes += shadow->old_len;
1291 }
1292}
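/*
 * Editor's sketch, not part of the original file: the bookkeeping that
 * fill_shadow_tree() performs, in isolation. A shadow that replaces an
 * existing object is filed in tree->old under its old offset, a shadow
 * for a freshly allocated block in tree->new; the two dirty counters
 * tell the journal how many bytes a commit will move. The dump helper
 * below is hypothetical and mirrors the log_segment() format used
 * elsewhere in this series.
 */
#if 0
static void sketch_dump_shadow(struct logfs_shadow *shadow)
{
	printk(KERN_DEBUG "shadow(%llx, %llx) %llx(%x)->%llx(%x)\n",
			shadow->ino, shadow->bix,
			shadow->old_ofs, shadow->old_len,
			shadow->new_ofs, shadow->new_len);
}
#endif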
1293
1294static void logfs_set_alias(struct super_block *sb, struct logfs_block *block,
1295 long child_no)
1296{
1297 struct logfs_super *super = logfs_super(sb);
1298
1299 if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) {
1300 /* Aliases in the master inode are pointless. */
1301 return;
1302 }
1303
1304 if (!test_bit(child_no, block->alias_map)) {
1305 set_bit(child_no, block->alias_map);
1306 super->s_no_object_aliases++;
1307 }
1308 list_move_tail(&block->alias_list, &super->s_object_alias);
1309}
1310
1311/*
1312 * Object aliases can and often do change the size and occupied space of a
1313 * file. So not only do we have to change the pointers, we also have to
1314 * change inode->i_size and li->li_used_bytes. This is done by setting
1315 * another two object aliases for the inode itself.
1316 */
1317static void set_iused(struct inode *inode, struct logfs_shadow *shadow)
1318{
1319 struct logfs_inode *li = logfs_inode(inode);
1320
1321 if (shadow->new_len == shadow->old_len)
1322 return;
1323
1324 alloc_inode_block(inode);
1325 li->li_used_bytes += shadow->new_len - shadow->old_len;
1326 __logfs_set_blocks(inode);
1327 logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS);
1328 logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS);
1329}
1330
1331static int logfs_write_i0(struct inode *inode, struct page *page,
1332 struct write_control *wc)
1333{
1334 struct logfs_shadow *shadow;
1335 u64 bix;
1336 level_t level;
1337 int full, err = 0;
1338
1339 logfs_unpack_index(page->index, &bix, &level);
1340 if (wc->ofs == 0)
1341 if (logfs_reserve_blocks(inode, 1))
1342 return -ENOSPC;
1343
1344 shadow = alloc_shadow(inode, bix, level, wc->ofs);
1345 if (wc->flags & WF_WRITE)
1346 err = logfs_segment_write(inode, page, shadow);
1347 if (wc->flags & WF_DELETE)
1348 logfs_segment_delete(inode, shadow);
1349 if (err) {
1350 free_shadow(inode, shadow);
1351 return err;
1352 }
1353
1354 set_iused(inode, shadow);
1355 full = 1;
1356 if (level != 0) {
1357 alloc_indirect_block(inode, page, 0);
1358 full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR;
1359 }
1360 fill_shadow_tree(inode, page, shadow);
1361 wc->ofs = shadow->new_ofs;
1362 if (wc->ofs && full)
1363 wc->ofs |= LOGFS_FULLY_POPULATED;
1364 return 0;
1365}
1366
1367static int logfs_write_direct(struct inode *inode, struct page *page,
1368 long flags)
1369{
1370 struct logfs_inode *li = logfs_inode(inode);
1371 struct write_control wc = {
1372 .ofs = li->li_data[page->index],
1373 .flags = flags,
1374 };
1375 int err;
1376
1377 alloc_inode_block(inode);
1378
1379 err = logfs_write_i0(inode, page, &wc);
1380 if (err)
1381 return err;
1382
1383 li->li_data[page->index] = wc.ofs;
1384 logfs_set_alias(inode->i_sb, li->li_block,
1385 page->index + INODE_POINTER_OFS);
1386 return 0;
1387}
1388
1389static int ptr_change(u64 ofs, struct page *page)
1390{
1391 struct logfs_block *block = logfs_block(page);
1392 int empty0, empty1, full0, full1;
1393
1394 empty0 = ofs == 0;
1395 empty1 = block->partial == 0;
1396 if (empty0 != empty1)
1397 return 1;
1398
1399 /* The !! is necessary to shrink result to int */
1400 full0 = !!(ofs & LOGFS_FULLY_POPULATED);
1401 full1 = block->full == LOGFS_BLOCK_FACTOR;
1402 if (full0 != full1)
1403 return 1;
1404 return 0;
1405}
1406
1407static int __logfs_write_rec(struct inode *inode, struct page *page,
1408 struct write_control *this_wc,
1409 pgoff_t bix, level_t target_level, level_t level)
1410{
1411 int ret, page_empty = 0;
1412 int child_no = get_bits(bix, SUBLEVEL(level));
1413 struct page *ipage;
1414 struct write_control child_wc = {
1415 .flags = this_wc->flags,
1416 };
1417
1418 ipage = logfs_get_write_page(inode, bix, level);
1419 if (!ipage)
1420 return -ENOMEM;
1421
1422 if (this_wc->ofs) {
1423 ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
1424 if (ret)
1425 goto out;
1426 } else if (!PageUptodate(ipage)) {
1427 page_empty = 1;
1428 logfs_read_empty(ipage);
1429 }
1430
1431 child_wc.ofs = block_get_pointer(ipage, child_no);
1432
1433 if ((__force u8)level - 1 > (__force u8)target_level)
1434 ret = __logfs_write_rec(inode, page, &child_wc, bix,
1435 target_level, SUBLEVEL(level));
1436 else
1437 ret = logfs_write_i0(inode, page, &child_wc);
1438
1439 if (ret)
1440 goto out;
1441
1442 alloc_indirect_block(inode, ipage, page_empty);
1443 block_set_pointer(ipage, child_no, child_wc.ofs);
1444 /* FIXME: first condition seems superfluous */
1445 if (child_wc.ofs || logfs_block(ipage)->partial)
1446 this_wc->flags |= WF_WRITE;
1447 /* the condition on this_wc->ofs ensures that we won't consume extra
1448 * space for indirect blocks in the future, which we cannot reserve */
1449 if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage))
1450 ret = logfs_write_i0(inode, ipage, this_wc);
1451 else
1452 logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no);
1453out:
1454 logfs_put_write_page(ipage);
1455 return ret;
1456}
1457
1458static int logfs_write_rec(struct inode *inode, struct page *page,
1459 pgoff_t bix, level_t target_level, long flags)
1460{
1461 struct logfs_inode *li = logfs_inode(inode);
1462 struct write_control wc = {
1463 .ofs = li->li_data[INDIRECT_INDEX],
1464 .flags = flags,
1465 };
1466 int ret;
1467
1468 alloc_inode_block(inode);
1469
1470 if (li->li_height > (__force u8)target_level)
1471 ret = __logfs_write_rec(inode, page, &wc, bix, target_level,
1472 LEVEL(li->li_height));
1473 else
1474 ret = logfs_write_i0(inode, page, &wc);
1475 if (ret)
1476 return ret;
1477
1478 if (li->li_data[INDIRECT_INDEX] != wc.ofs) {
1479 li->li_data[INDIRECT_INDEX] = wc.ofs;
1480 logfs_set_alias(inode->i_sb, li->li_block,
1481 INDIRECT_INDEX + INODE_POINTER_OFS);
1482 }
1483 return ret;
1484}
1485
1486void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta)
1487{
1488 alloc_inode_block(inode);
1489 logfs_inode(inode)->li_block->ta = ta;
1490}
1491
1492void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta)
1493{
1494 struct logfs_block *block = logfs_inode(inode)->li_block;
1495
1496 if (block && block->ta)
1497 block->ta = NULL;
1498}
1499
1500static int grow_inode(struct inode *inode, u64 bix, level_t level)
1501{
1502 struct logfs_inode *li = logfs_inode(inode);
1503 u8 height = (__force u8)level;
1504 struct page *page;
1505 struct write_control wc = {
1506 .flags = WF_WRITE,
1507 };
1508 int err;
1509
1510 BUG_ON(height > 5 || li->li_height > 5);
1511 while (height > li->li_height || bix >= maxbix(li->li_height)) {
1512 page = logfs_get_write_page(inode, I0_BLOCKS + 1,
1513 LEVEL(li->li_height + 1));
1514 if (!page)
1515 return -ENOMEM;
1516 logfs_read_empty(page);
1517 alloc_indirect_block(inode, page, 1);
1518 block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]);
1519 err = logfs_write_i0(inode, page, &wc);
1520 logfs_put_write_page(page);
1521 if (err)
1522 return err;
1523 li->li_data[INDIRECT_INDEX] = wc.ofs;
1524 wc.ofs = 0;
1525 li->li_height++;
1526 logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS);
1527 }
1528 return 0;
1529}
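/*
 * Editor's sketch, not part of the original file: why one extra level
 * per loop iteration is enough. Assuming 4KiB blocks and 8-byte
 * pointers (512-way fan-out, LOGFS_BLOCK_FACTOR), every indirection
 * level multiplies the reachable file range by 512. Plain userspace
 * C99; the constants are assumptions stated here, not taken from
 * logfs.h.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned long long reach = 1;	/* blocks reachable at height 0 */
	int h;

	for (h = 0; h <= 3; h++) {
		printf("height %d: %llu blocks, %llu MiB at 4KiB\n",
		       h, reach, reach * 4096 >> 20);
		reach *= 512;		/* one more indirection level */
	}
	return 0;
}
#endif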
1530
1531static int __logfs_write_buf(struct inode *inode, struct page *page, long flags)
1532{
1533 struct logfs_super *super = logfs_super(inode->i_sb);
1534 pgoff_t index = page->index;
1535 u64 bix;
1536 level_t level;
1537 int err;
1538
1539 flags |= WF_WRITE | WF_DELETE;
1540 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1541
1542 logfs_unpack_index(index, &bix, &level);
1543 if (logfs_block(page) && logfs_block(page)->reserved_bytes)
1544 super->s_dirty_pages -= logfs_block(page)->reserved_bytes;
1545
1546 if (index < I0_BLOCKS)
1547 return logfs_write_direct(inode, page, flags);
1548
1549 bix = adjust_bix(bix, level);
1550 err = grow_inode(inode, bix, level);
1551 if (err)
1552 return err;
1553 return logfs_write_rec(inode, page, bix, level, flags);
1554}
1555
1556int logfs_write_buf(struct inode *inode, struct page *page, long flags)
1557{
1558 struct super_block *sb = inode->i_sb;
1559 int ret;
1560
1561 logfs_get_wblocks(sb, page, flags & WF_LOCK);
1562 ret = __logfs_write_buf(inode, page, flags);
1563 logfs_put_wblocks(sb, page, flags & WF_LOCK);
1564 return ret;
1565}
1566
1567static int __logfs_delete(struct inode *inode, struct page *page)
1568{
1569 long flags = WF_DELETE;
1570
1571 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1572
1573 if (page->index < I0_BLOCKS)
1574 return logfs_write_direct(inode, page, flags);
1575 return logfs_write_rec(inode, page, page->index, 0, flags);
1576}
1577
1578int logfs_delete(struct inode *inode, pgoff_t index,
1579 struct shadow_tree *shadow_tree)
1580{
1581 struct super_block *sb = inode->i_sb;
1582 struct page *page;
1583 int ret;
1584
1585 page = logfs_get_read_page(inode, index, 0);
1586 if (!page)
1587 return -ENOMEM;
1588
1589 logfs_get_wblocks(sb, page, 1);
1590 ret = __logfs_delete(inode, page);
1591 logfs_put_wblocks(sb, page, 1);
1592
1593 logfs_put_read_page(page);
1594
1595 return ret;
1596}
1597
1598int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
1599 gc_level_t gc_level, long flags)
1600{
1601 level_t level = shrink_level(gc_level);
1602 struct page *page;
1603 int err;
1604
1605 page = logfs_get_write_page(inode, bix, level);
1606 if (!page)
1607 return -ENOMEM;
1608
1609 err = logfs_segment_read(inode, page, ofs, bix, level);
1610 if (!err) {
1611 if (level != 0)
1612 alloc_indirect_block(inode, page, 0);
1613 err = logfs_write_buf(inode, page, flags);
1614 if (!err && shrink_level(gc_level) == 0) {
1615 /* Rewrite cannot mark the inode dirty but has to
1616 * write it immediately.
1617 * Q: Can't we just create an alias for the inode
1618 * instead? And if not, why not?
1619 */
1620 if (inode->i_ino == LOGFS_INO_MASTER)
1621 logfs_write_anchor(inode->i_sb);
1622 else {
1623 err = __logfs_write_inode(inode, flags);
1624 }
1625 }
1626 }
1627 logfs_put_write_page(page);
1628 return err;
1629}
1630
1631static int truncate_data_block(struct inode *inode, struct page *page,
1632 u64 ofs, struct logfs_shadow *shadow, u64 size)
1633{
1634 loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits;
1635 u64 bix;
1636 level_t level;
1637 int err;
1638
1639 /* Does truncation happen within this page? */
1640 if (size <= pageofs || size - pageofs >= PAGE_SIZE)
1641 return 0;
1642
1643 logfs_unpack_index(page->index, &bix, &level);
1644 BUG_ON(level != 0);
1645
1646 err = logfs_segment_read(inode, page, ofs, bix, level);
1647 if (err)
1648 return err;
1649
1650 zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE);
1651 return logfs_segment_write(inode, page, shadow);
1652}
1653
1654static int logfs_truncate_i0(struct inode *inode, struct page *page,
1655 struct write_control *wc, u64 size)
1656{
1657 struct logfs_shadow *shadow;
1658 u64 bix;
1659 level_t level;
1660 int err = 0;
1661
1662 logfs_unpack_index(page->index, &bix, &level);
1663 BUG_ON(level != 0);
1664 shadow = alloc_shadow(inode, bix, level, wc->ofs);
1665
1666 err = truncate_data_block(inode, page, wc->ofs, shadow, size);
1667 if (err) {
1668 free_shadow(inode, shadow);
1669 return err;
1670 }
1671
1672 logfs_segment_delete(inode, shadow);
1673 set_iused(inode, shadow);
1674 fill_shadow_tree(inode, page, shadow);
1675 wc->ofs = shadow->new_ofs;
1676 return 0;
1677}
1678
1679static int logfs_truncate_direct(struct inode *inode, u64 size)
1680{
1681 struct logfs_inode *li = logfs_inode(inode);
1682 struct write_control wc;
1683 struct page *page;
1684 int e;
1685 int err;
1686
1687 alloc_inode_block(inode);
1688
1689 for (e = I0_BLOCKS - 1; e >= 0; e--) {
1690 if (size > (e+1) * LOGFS_BLOCKSIZE)
1691 break;
1692
1693 wc.ofs = li->li_data[e];
1694 if (!wc.ofs)
1695 continue;
1696
1697 page = logfs_get_write_page(inode, e, 0);
1698 if (!page)
1699 return -ENOMEM;
1700 err = logfs_segment_read(inode, page, wc.ofs, e, 0);
1701 if (err) {
1702 logfs_put_write_page(page);
1703 return err;
1704 }
1705 err = logfs_truncate_i0(inode, page, &wc, size);
1706 logfs_put_write_page(page);
1707 if (err)
1708 return err;
1709
1710 li->li_data[e] = wc.ofs;
1711 }
1712 return 0;
1713}
1714
1715/* FIXME: these need to become per-sb once we support different blocksizes */
1716static u64 __logfs_step[] = {
1717 1,
1718 I1_BLOCKS,
1719 I2_BLOCKS,
1720 I3_BLOCKS,
1721};
1722
1723static u64 __logfs_start_index[] = {
1724 I0_BLOCKS,
1725 I1_BLOCKS,
1726 I2_BLOCKS,
1727 I3_BLOCKS
1728};
1729
1730static inline u64 logfs_step(level_t level)
1731{
1732 return __logfs_step[(__force u8)level];
1733}
1734
1735static inline u64 logfs_factor(u8 level)
1736{
1737 return __logfs_step[level] * LOGFS_BLOCKSIZE;
1738}
1739
1740static inline u64 logfs_start_index(level_t level)
1741{
1742 return __logfs_start_index[(__force u8)level];
1743}
1744
1745static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level)
1746{
1747 logfs_unpack_index(index, bix, level);
1748 if (*bix <= logfs_start_index(SUBLEVEL(*level)))
1749 *bix = 0;
1750}
1751
1752static int __logfs_truncate_rec(struct inode *inode, struct page *ipage,
1753 struct write_control *this_wc, u64 size)
1754{
1755 int truncate_happened = 0;
1756 int e, err = 0;
1757 u64 bix, child_bix, next_bix;
1758 level_t level;
1759 struct page *page;
1760 struct write_control child_wc = { /* FIXME: flags */ };
1761
1762 logfs_unpack_raw_index(ipage->index, &bix, &level);
1763 err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
1764 if (err)
1765 return err;
1766
1767 for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) {
1768 child_bix = bix + e * logfs_step(SUBLEVEL(level));
1769 next_bix = child_bix + logfs_step(SUBLEVEL(level));
1770 if (size > next_bix * LOGFS_BLOCKSIZE)
1771 break;
1772
1773 child_wc.ofs = pure_ofs(block_get_pointer(ipage, e));
1774 if (!child_wc.ofs)
1775 continue;
1776
1777 page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level));
1778 if (!page)
1779 return -ENOMEM;
1780
1781 if ((__force u8)level > 1)
1782 err = __logfs_truncate_rec(inode, page, &child_wc, size);
1783 else
1784 err = logfs_truncate_i0(inode, page, &child_wc, size);
1785 logfs_put_write_page(page);
1786 if (err)
1787 return err;
1788
1789 truncate_happened = 1;
1790 alloc_indirect_block(inode, ipage, 0);
1791 block_set_pointer(ipage, e, child_wc.ofs);
1792 }
1793
1794 if (!truncate_happened) {
1795 printk(KERN_WARNING "ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size);
1796 return 0;
1797 }
1798
1799 this_wc->flags = WF_DELETE;
1800 if (logfs_block(ipage)->partial)
1801 this_wc->flags |= WF_WRITE;
1802
1803 return logfs_write_i0(inode, ipage, this_wc);
1804}
1805
1806static int logfs_truncate_rec(struct inode *inode, u64 size)
1807{
1808 struct logfs_inode *li = logfs_inode(inode);
1809 struct write_control wc = {
1810 .ofs = li->li_data[INDIRECT_INDEX],
1811 };
1812 struct page *page;
1813 int err;
1814
1815 alloc_inode_block(inode);
1816
1817 if (!wc.ofs)
1818 return 0;
1819
1820 page = logfs_get_write_page(inode, 0, LEVEL(li->li_height));
1821 if (!page)
1822 return -ENOMEM;
1823
1824 err = __logfs_truncate_rec(inode, page, &wc, size);
1825 logfs_put_write_page(page);
1826 if (err)
1827 return err;
1828
1829 if (li->li_data[INDIRECT_INDEX] != wc.ofs)
1830 li->li_data[INDIRECT_INDEX] = wc.ofs;
1831 return 0;
1832}
1833
1834static int __logfs_truncate(struct inode *inode, u64 size)
1835{
1836 int ret;
1837
1838 if (size >= logfs_factor(logfs_inode(inode)->li_height))
1839 return 0;
1840
1841 ret = logfs_truncate_rec(inode, size);
1842 if (ret)
1843 return ret;
1844
1845 return logfs_truncate_direct(inode, size);
1846}
1847
1848int logfs_truncate(struct inode *inode, u64 size)
1849{
1850 struct super_block *sb = inode->i_sb;
1851 int err;
1852
1853 logfs_get_wblocks(sb, NULL, 1);
1854 err = __logfs_truncate(inode, size);
1855 if (!err)
1856 err = __logfs_write_inode(inode, 0);
1857 logfs_put_wblocks(sb, NULL, 1);
1858
1859 if (!err)
1860 err = vmtruncate(inode, size);
1861
1862 /* I don't trust error recovery yet. */
1863 WARN_ON(err);
1864 return err;
1865}
1866
1867static void move_page_to_inode(struct inode *inode, struct page *page)
1868{
1869 struct logfs_inode *li = logfs_inode(inode);
1870 struct logfs_block *block = logfs_block(page);
1871
1872 if (!block)
1873 return;
1874
1875 log_blockmove("move_page_to_inode(%llx, %llx, %x)\n",
1876 block->ino, block->bix, block->level);
1877 BUG_ON(li->li_block);
1878 block->ops = &inode_block_ops;
1879 block->inode = inode;
1880 li->li_block = block;
1881
1882 block->page = NULL;
1883 page->private = 0;
1884 ClearPagePrivate(page);
1885}
1886
1887static void move_inode_to_page(struct page *page, struct inode *inode)
1888{
1889 struct logfs_inode *li = logfs_inode(inode);
1890 struct logfs_block *block = li->li_block;
1891
1892 if (!block)
1893 return;
1894
1895 log_blockmove("move_inode_to_page(%llx, %llx, %x)\n",
1896 block->ino, block->bix, block->level);
1897 BUG_ON(PagePrivate(page));
1898 block->ops = &indirect_block_ops;
1899 block->page = page;
1900 page->private = (unsigned long)block;
1901 SetPagePrivate(page);
1902
1903 block->inode = NULL;
1904 li->li_block = NULL;
1905}
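/*
 * Editor's note, not part of the original file: move_page_to_inode()
 * and move_inode_to_page() form a round trip. While an inode is cached,
 * its pending aliases ride on li->li_block; once the inode is
 * serialized into its ifile page, the same struct logfs_block moves to
 * page->private. The hypothetical assertion below states the invariant
 * the two BUG_ONs above enforce pairwise: a block never has two homes.
 */
#if 0
static void sketch_assert_one_home(struct inode *inode, struct page *page)
{
	BUG_ON(logfs_inode(inode)->li_block && PagePrivate(page));
}
#endif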
1906
1907int logfs_read_inode(struct inode *inode)
1908{
1909 struct super_block *sb = inode->i_sb;
1910 struct logfs_super *super = logfs_super(sb);
1911 struct inode *master_inode = super->s_master_inode;
1912 struct page *page;
1913 struct logfs_disk_inode *di;
1914 u64 ino = inode->i_ino;
1915
1916 if (ino << sb->s_blocksize_bits > i_size_read(master_inode))
1917 return -ENODATA;
1918 if (!logfs_exist_block(master_inode, ino))
1919 return -ENODATA;
1920
1921 page = read_cache_page(master_inode->i_mapping, ino,
1922 (filler_t *)logfs_readpage, NULL);
1923 if (IS_ERR(page))
1924 return PTR_ERR(page);
1925
1926 di = kmap_atomic(page, KM_USER0);
1927 logfs_disk_to_inode(di, inode);
1928 kunmap_atomic(di, KM_USER0);
1929 move_page_to_inode(inode, page);
1930 page_cache_release(page);
1931 return 0;
1932}
1933
1934/* Caller must logfs_put_write_page(page); */
1935static struct page *inode_to_page(struct inode *inode)
1936{
1937 struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode;
1938 struct logfs_disk_inode *di;
1939 struct page *page;
1940
1941 BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
1942
1943 page = logfs_get_write_page(master_inode, inode->i_ino, 0);
1944 if (!page)
1945 return NULL;
1946
1947 di = kmap_atomic(page, KM_USER0);
1948 logfs_inode_to_disk(inode, di);
1949 kunmap_atomic(di, KM_USER0);
1950 move_inode_to_page(page, inode);
1951 return page;
1952}
1953
1954/* Cheaper version of write_inode. All changes are concealed in
1955 * aliases, which are moved back. No write to the medium happens.
1956 */
1957void logfs_clear_inode(struct inode *inode)
1958{
1959 struct super_block *sb = inode->i_sb;
1960 struct logfs_inode *li = logfs_inode(inode);
1961 struct logfs_block *block = li->li_block;
1962 struct page *page;
1963
1964 /* Only deleted files may be dirty at this point */
1965 BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
1966 if (!block)
1967 return;
1968 if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
1969 block->ops->free_block(inode->i_sb, block);
1970 return;
1971 }
1972
1973 BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
1974 page = inode_to_page(inode);
1975 BUG_ON(!page); /* FIXME: Use emergency page */
1976 logfs_put_write_page(page);
1977}
1978
1979static int do_write_inode(struct inode *inode)
1980{
1981 struct super_block *sb = inode->i_sb;
1982 struct inode *master_inode = logfs_super(sb)->s_master_inode;
1983 loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits;
1984 struct page *page;
1985 int err;
1986
1987 BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
1988 /* FIXME: lock inode */
1989
1990 if (i_size_read(master_inode) < size)
1991 i_size_write(master_inode, size);
1992
1993 /* TODO: Tell vfs this inode is clean now */
1994
1995 page = inode_to_page(inode);
1996 if (!page)
1997 return -ENOMEM;
1998
1999 /* FIXME: transaction is part of logfs_block now. Is that enough? */
2000 err = logfs_write_buf(master_inode, page, 0);
2001 logfs_put_write_page(page);
2002 return err;
2003}
2004
2005static void logfs_mod_segment_entry(struct super_block *sb, u32 segno,
2006 int write,
2007 void (*change_se)(struct logfs_segment_entry *, long),
2008 long arg)
2009{
2010 struct logfs_super *super = logfs_super(sb);
2011 struct inode *inode;
2012 struct page *page;
2013 struct logfs_segment_entry *se;
2014 pgoff_t page_no;
2015 int child_no;
2016
2017 page_no = segno >> (sb->s_blocksize_bits - 3);
2018 child_no = segno & ((sb->s_blocksize >> 3) - 1);
2019
2020 inode = super->s_segfile_inode;
2021 page = logfs_get_write_page(inode, page_no, 0);
2022 BUG_ON(!page); /* FIXME: We need some reserve page for this case */
2023 if (!PageUptodate(page))
2024 logfs_read_block(inode, page, WRITE);
2025
2026 if (write)
2027 alloc_indirect_block(inode, page, 0);
2028 se = kmap_atomic(page, KM_USER0);
2029 change_se(se + child_no, arg);
2030 if (write) {
2031 logfs_set_alias(sb, logfs_block(page), child_no);
2032 BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize);
2033 }
2034 kunmap_atomic(se, KM_USER0);
2035
2036 logfs_put_write_page(page);
2037}
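/*
 * Editor's sketch, not part of the original file: the index math at the
 * top of logfs_mod_segment_entry(). A segment entry is 8 bytes
 * (ec_level plus valid), so a 4KiB block of the segment file holds 512
 * entries. Standalone C; the 4KiB block size is an assumption.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned blocksize_bits = 12;		/* assumed 4KiB blocks */
	unsigned segno = 1234;
	unsigned page_no = segno >> (blocksize_bits - 3);
	unsigned child_no = segno & (((1u << blocksize_bits) >> 3) - 1);

	/* segment 1234 lives in page 2 of the segfile, entry slot 210 */
	printf("segno %u -> page %u, entry %u\n", segno, page_no, child_no);
	return 0;
}
#endif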
2038
2039static void __get_segment_entry(struct logfs_segment_entry *se, long _target)
2040{
2041 struct logfs_segment_entry *target = (void *)_target;
2042
2043 *target = *se;
2044}
2045
2046void logfs_get_segment_entry(struct super_block *sb, u32 segno,
2047 struct logfs_segment_entry *se)
2048{
2049 logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se);
2050}
2051
2052static void __set_segment_used(struct logfs_segment_entry *se, long increment)
2053{
2054 u32 valid;
2055
2056 valid = be32_to_cpu(se->valid);
2057 valid += increment;
2058 se->valid = cpu_to_be32(valid);
2059}
2060
2061void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment)
2062{
2063 struct logfs_super *super = logfs_super(sb);
2064 u32 segno = ofs >> super->s_segshift;
2065
2066 if (!increment)
2067 return;
2068
2069 logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment);
2070}
2071
2072static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level)
2073{
2074 se->ec_level = cpu_to_be32(ec_level);
2075}
2076
2077void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
2078 gc_level_t gc_level)
2079{
2080 u32 ec_level = ec << 4 | (__force u8)gc_level;
2081
2082 logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level);
2083}
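/*
 * Editor's sketch, not part of the original file: a round trip of the
 * ec_level packing above, erase count in the high 28 bits and gc level
 * in the low 4. ostore_get_erase_count() in segment.c unpacks it with
 * the matching shift.
 */
#if 0
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t ec = 5, gc_level = 3;
	uint32_t ec_level = ec << 4 | gc_level;

	assert(ec_level == 0x53);
	assert(ec_level >> 4 == ec);
	assert((ec_level & 0xf) == gc_level);
	return 0;
}
#endif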
2084
2085static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore)
2086{
2087 se->valid = cpu_to_be32(RESERVED);
2088}
2089
2090void logfs_set_segment_reserved(struct super_block *sb, u32 segno)
2091{
2092 logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0);
2093}
2094
2095static void __set_segment_unreserved(struct logfs_segment_entry *se,
2096 long ec_level)
2097{
2098 se->valid = 0;
2099 se->ec_level = cpu_to_be32(ec_level);
2100}
2101
2102void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)
2103{
2104 u32 ec_level = ec << 4;
2105
2106 logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved,
2107 ec_level);
2108}
2109
2110int __logfs_write_inode(struct inode *inode, long flags)
2111{
2112 struct super_block *sb = inode->i_sb;
2113 int ret;
2114
2115 logfs_get_wblocks(sb, NULL, flags & WF_LOCK);
2116 ret = do_write_inode(inode);
2117 logfs_put_wblocks(sb, NULL, flags & WF_LOCK);
2118 return ret;
2119}
2120
2121static int do_delete_inode(struct inode *inode)
2122{
2123 struct super_block *sb = inode->i_sb;
2124 struct inode *master_inode = logfs_super(sb)->s_master_inode;
2125 struct page *page;
2126 int ret;
2127
2128 page = logfs_get_write_page(master_inode, inode->i_ino, 0);
2129 if (!page)
2130 return -ENOMEM;
2131
2132 move_inode_to_page(page, inode);
2133
2134 logfs_get_wblocks(sb, page, 1);
2135 ret = __logfs_delete(master_inode, page);
2136 logfs_put_wblocks(sb, page, 1);
2137
2138 logfs_put_write_page(page);
2139 return ret;
2140}
2141
2142/*
2143 * ZOMBIE inodes have already been deleted and should remain dead; they
2144 * are only revisited for validity checking. No need to kill them again here.
2145 */
2146void logfs_delete_inode(struct inode *inode)
2147{
2148 struct logfs_inode *li = logfs_inode(inode);
2149
2150 if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
2151 li->li_flags |= LOGFS_IF_ZOMBIE;
2152 if (i_size_read(inode) > 0)
2153 logfs_truncate(inode, 0);
2154 do_delete_inode(inode);
2155 }
2156 truncate_inode_pages(&inode->i_data, 0);
2157 clear_inode(inode);
2158}
2159
2160void btree_write_block(struct logfs_block *block)
2161{
2162 struct inode *inode;
2163 struct page *page;
2164 int err, cookie;
2165
2166 inode = logfs_safe_iget(block->sb, block->ino, &cookie);
2167 page = logfs_get_write_page(inode, block->bix, block->level);
2168
2169 err = logfs_readpage_nolock(page);
2170 BUG_ON(err);
2171 BUG_ON(!PagePrivate(page));
2172 BUG_ON(logfs_block(page) != block);
2173 err = __logfs_write_buf(inode, page, 0);
2174 BUG_ON(err);
2175 BUG_ON(PagePrivate(page) || page->private);
2176
2177 logfs_put_write_page(page);
2178 logfs_safe_iput(inode, cookie);
2179}
2180
2181/**
2182 * logfs_inode_write - write inode or dentry objects
2183 *
2184 * @inode: parent inode (ifile or directory)
2185 * @buf: object to write (inode or dentry)
2186 * @count: object size
2187 * @bix: object number (file position in blocks/objects)
2188 * @flags: write flags; WF_LOCK requests taking the write lock
2190 * @shadow_tree: shadow below this inode
2191 *
2192 * FIXME: All callers of this put a 200-300 byte variable on the stack,
2193 * only to call here and do a memcpy from that stack variable. A good
2194 * example of wasted performance and stack space.
2195 */
2196int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
2197 loff_t bix, long flags, struct shadow_tree *shadow_tree)
2198{
2199 loff_t pos = bix << inode->i_sb->s_blocksize_bits;
2200 int err;
2201 struct page *page;
2202 void *pagebuf;
2203
2204 BUG_ON(pos & (LOGFS_BLOCKSIZE-1));
2205 BUG_ON(count > LOGFS_BLOCKSIZE);
2206 page = logfs_get_write_page(inode, bix, 0);
2207 if (!page)
2208 return -ENOMEM;
2209
2210 pagebuf = kmap_atomic(page, KM_USER0);
2211 memcpy(pagebuf, buf, count);
2212 flush_dcache_page(page);
2213 kunmap_atomic(pagebuf, KM_USER0);
2214
2215 if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE)
2216 i_size_write(inode, pos + LOGFS_BLOCKSIZE);
2217
2218 err = logfs_write_buf(inode, page, flags);
2219 logfs_put_write_page(page);
2220 return err;
2221}
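/*
 * Editor's sketch, not part of the original file: the position math at
 * the top of logfs_inode_write(). With assumed 4KiB blocks, object
 * number 3 starts at byte 0x3000, which trivially satisfies the
 * alignment BUG_ON.
 */
#if 0
#include <assert.h>

int main(void)
{
	unsigned long long bix = 3, blocksize_bits = 12;
	unsigned long long pos = bix << blocksize_bits;

	assert(pos == 0x3000);
	assert((pos & ((1ULL << blocksize_bits) - 1)) == 0);
	return 0;
}
#endif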
2222
2223int logfs_open_segfile(struct super_block *sb)
2224{
2225 struct logfs_super *super = logfs_super(sb);
2226 struct inode *inode;
2227
2228 inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE);
2229 if (IS_ERR(inode))
2230 return PTR_ERR(inode);
2231 super->s_segfile_inode = inode;
2232 return 0;
2233}
2234
2235int logfs_init_rw(struct super_block *sb)
2236{
2237 struct logfs_super *super = logfs_super(sb);
2238 int min_fill = 3 * super->s_no_blocks;
2239
2240 INIT_LIST_HEAD(&super->s_object_alias);
2241 mutex_init(&super->s_write_mutex);
2242 super->s_block_pool = mempool_create_kmalloc_pool(min_fill,
2243 sizeof(struct logfs_block));
2244 super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill,
2245 sizeof(struct logfs_shadow));
2246 return 0;
2247}
2248
2249void logfs_cleanup_rw(struct super_block *sb)
2250{
2251 struct logfs_super *super = logfs_super(sb);
2252
2253 destroy_meta_inode(super->s_segfile_inode);
2254 if (super->s_block_pool)
2255 mempool_destroy(super->s_block_pool);
2256 if (super->s_shadow_pool)
2257 mempool_destroy(super->s_shadow_pool);
2258}
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
new file mode 100644
index 000000000000..801a3a141625
--- /dev/null
+++ b/fs/logfs/segment.c
@@ -0,0 +1,936 @@
1/*
2 * fs/logfs/segment.c - Handling the Object Store
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Object store or ostore makes up the complete device with the exception of
9 * the superblock and journal areas. Apart from its own metadata it stores
10 * three kinds of objects: inodes, dentries and blocks, both data and indirect.
11 */
12#include "logfs.h"
13#include <linux/slab.h>
14
15static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
16{
17 struct logfs_super *super = logfs_super(sb);
18 struct btree_head32 *head = &super->s_reserved_segments;
19 int err;
20
21 err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
22 if (err)
23 return err;
24 logfs_super(sb)->s_bad_segments++;
25 /* FIXME: write to journal */
26 return 0;
27}
28
29int logfs_erase_segment(struct super_block *sb, u32 segno, int ensure_erase)
30{
31 struct logfs_super *super = logfs_super(sb);
32
33 super->s_gec++;
34
35 return super->s_devops->erase(sb, (u64)segno << super->s_segshift,
36 super->s_segsize, ensure_erase);
37}
38
39static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes)
40{
41 s32 ofs;
42
43 logfs_open_area(area, bytes);
44
45 ofs = area->a_used_bytes;
46 area->a_used_bytes += bytes;
47 BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize);
48
49 return dev_ofs(area->a_sb, area->a_segno, ofs);
50}
51
52static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
53 int use_filler)
54{
55 struct logfs_super *super = logfs_super(sb);
56 struct address_space *mapping = super->s_mapping_inode->i_mapping;
57 filler_t *filler = super->s_devops->readpage;
58 struct page *page;
59
60 BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS);
61 if (use_filler)
62 page = read_cache_page(mapping, index, filler, sb);
63 else {
64 page = find_or_create_page(mapping, index, GFP_NOFS);
65 unlock_page(page);
66 }
67 return page;
68}
69
70void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
71 int use_filler)
72{
73 pgoff_t index = ofs >> PAGE_SHIFT;
74 struct page *page;
75 long offset = ofs & (PAGE_SIZE-1);
76 long copylen;
77
78 /* Only logfs_wbuf_recover may use len==0 */
79 BUG_ON(!len && !use_filler);
80 do {
81 copylen = min((ulong)len, PAGE_SIZE - offset);
82
83 page = get_mapping_page(area->a_sb, index, use_filler);
84 BUG_ON(!page); /* FIXME: reserve a pool */
85 SetPageUptodate(page);
86 memcpy(page_address(page) + offset, buf, copylen);
87 SetPagePrivate(page);
88 page_cache_release(page);
89
90 buf += copylen;
91 len -= copylen;
92 offset = 0;
93 index++;
94 } while (len);
95}
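/*
 * Editor's sketch, not part of the original file: the loop above for a
 * write that straddles a page boundary. At ofs 4090 with len 100 and
 * 4KiB pages, 6 bytes land at the end of the first page and 94 at the
 * start of the next. Standalone C mirroring the copylen arithmetic.
 */
#if 0
#include <stdio.h>

#define PG 4096UL

int main(void)
{
	unsigned long ofs = 4090, len = 100;
	unsigned long index = ofs / PG, offset = ofs % PG, copylen;

	while (len) {
		copylen = len < PG - offset ? len : PG - offset;
		printf("page %lu: copy %lu bytes at offset %lu\n",
		       index, copylen, offset);
		len -= copylen;
		offset = 0;
		index++;
	}
	return 0;
}
#endif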
96
97static void pad_partial_page(struct logfs_area *area)
98{
99 struct super_block *sb = area->a_sb;
100 struct page *page;
101 u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
102 pgoff_t index = ofs >> PAGE_SHIFT;
103 long offset = ofs & (PAGE_SIZE-1);
104 u32 len = PAGE_SIZE - offset;
105
106 if (len % PAGE_SIZE) {
107 page = get_mapping_page(sb, index, 0);
108 BUG_ON(!page); /* FIXME: reserve a pool */
109 memset(page_address(page) + offset, 0xff, len);
110 SetPagePrivate(page);
111 page_cache_release(page);
112 }
113}
114
115static void pad_full_pages(struct logfs_area *area)
116{
117 struct super_block *sb = area->a_sb;
118 struct logfs_super *super = logfs_super(sb);
119 u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
120 u32 len = super->s_segsize - area->a_used_bytes;
121 pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT;
122 pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT;
123 struct page *page;
124
125 while (no_indizes) {
126 page = get_mapping_page(sb, index, 0);
127 BUG_ON(!page); /* FIXME: reserve a pool */
128 SetPageUptodate(page);
129 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
130 SetPagePrivate(page);
131 page_cache_release(page);
132 index++;
133 no_indizes--;
134 }
135}
136
137/*
138 * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
139 * Also make sure we allocate (and memset) all pages for final writeout.
140 */
141static void pad_wbuf(struct logfs_area *area, int final)
142{
143 pad_partial_page(area);
144 if (final)
145 pad_full_pages(area);
146}
147
148/*
149 * We have to be careful with the alias tree. Since lookup is done by bix,
150 * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with
151 * indirect blocks. So always use it through accessor functions.
152 */
153static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix,
154 level_t level)
155{
156 struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
157 pgoff_t index = logfs_pack_index(bix, level);
158
159 return btree_lookup128(head, ino, index);
160}
161
162static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix,
163 level_t level, void *val)
164{
165 struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
166 pgoff_t index = logfs_pack_index(bix, level);
167
168 return btree_insert128(head, ino, index, val, GFP_NOFS);
169}
170
171static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
172 write_alias_t *write_one_alias)
173{
174 struct object_alias_item *item;
175 int err;
176
177 list_for_each_entry(item, &block->item_list, list) {
178 err = write_alias_journal(sb, block->ino, block->bix,
179 block->level, item->child_no, item->val);
180 if (err)
181 return err;
182 }
183 return 0;
184}
185
186static gc_level_t btree_block_level(struct logfs_block *block)
187{
188 return expand_level(block->ino, block->level);
189}
190
191static struct logfs_block_ops btree_block_ops = {
192 .write_block = btree_write_block,
193 .block_level = btree_block_level,
194 .free_block = __free_block,
195 .write_alias = btree_write_alias,
196};
197
198int logfs_load_object_aliases(struct super_block *sb,
199 struct logfs_obj_alias *oa, int count)
200{
201 struct logfs_super *super = logfs_super(sb);
202 struct logfs_block *block;
203 struct object_alias_item *item;
204 u64 ino, bix;
205 level_t level;
206 int i, err;
207
208 super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
209 count /= sizeof(*oa);
210 for (i = 0; i < count; i++) {
211 item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
212 if (!item)
213 return -ENOMEM;
214 memset(item, 0, sizeof(*item));
215
216 super->s_no_object_aliases++;
217 item->val = oa[i].val;
218 item->child_no = be16_to_cpu(oa[i].child_no);
219
220 ino = be64_to_cpu(oa[i].ino);
221 bix = be64_to_cpu(oa[i].bix);
222 level = LEVEL(oa[i].level);
223
224 log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n",
225 ino, bix, level, item->child_no,
226 be64_to_cpu(item->val));
227 block = alias_tree_lookup(sb, ino, bix, level);
228 if (!block) {
229 block = __alloc_block(sb, ino, bix, level);
230 block->ops = &btree_block_ops;
231 err = alias_tree_insert(sb, ino, bix, level, block);
232 BUG_ON(err); /* mempool empty */
233 }
234 if (test_and_set_bit(item->child_no, block->alias_map)) {
235 printk(KERN_ERR"LogFS: Alias collision detected\n");
236 return -EIO;
237 }
238 list_move_tail(&block->alias_list, &super->s_object_alias);
239 list_add(&item->list, &block->item_list);
240 }
241 return 0;
242}
243
244static void kill_alias(void *_block, unsigned long ignore0,
245 u64 ignore1, u64 ignore2, size_t ignore3)
246{
247 struct logfs_block *block = _block;
248 struct super_block *sb = block->sb;
249 struct logfs_super *super = logfs_super(sb);
250 struct object_alias_item *item;
251
252 while (!list_empty(&block->item_list)) {
253 item = list_entry(block->item_list.next, typeof(*item), list);
254 list_del(&item->list);
255 mempool_free(item, super->s_alias_pool);
256 }
257 block->ops->free_block(sb, block);
258}
259
260static int obj_type(struct inode *inode, level_t level)
261{
262 if (level == 0) {
263 if (S_ISDIR(inode->i_mode))
264 return OBJ_DENTRY;
265 if (inode->i_ino == LOGFS_INO_MASTER)
266 return OBJ_INODE;
267 }
268 return OBJ_BLOCK;
269}
270
271static int obj_len(struct super_block *sb, int obj_type)
272{
273 switch (obj_type) {
274 case OBJ_DENTRY:
275 return sizeof(struct logfs_disk_dentry);
276 case OBJ_INODE:
277 return sizeof(struct logfs_disk_inode);
278 case OBJ_BLOCK:
279 return sb->s_blocksize;
280 default:
281 BUG();
282 }
283}
284
285static int __logfs_segment_write(struct inode *inode, void *buf,
286 struct logfs_shadow *shadow, int type, int len, int compr)
287{
288 struct logfs_area *area;
289 struct super_block *sb = inode->i_sb;
290 s64 ofs;
291 struct logfs_object_header h;
292 int acc_len;
293
294 if (shadow->gc_level == 0)
295 acc_len = len;
296 else
297 acc_len = obj_len(sb, type);
298
299 area = get_area(sb, shadow->gc_level);
300 ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE);
301 LOGFS_BUG_ON(ofs <= 0, sb);
302 /*
303 * Order is important. logfs_get_free_bytes(), by modifying the
304 * segment file, may modify the content of the very page we're about
305 * to write now. Which is fine, as long as the calculated crc and
306 * written data still match. So do the modifications _before_
307 * calculating the crc.
308 */
309
310 h.len = cpu_to_be16(len);
311 h.type = type;
312 h.compr = compr;
313 h.ino = cpu_to_be64(inode->i_ino);
314 h.bix = cpu_to_be64(shadow->bix);
315 h.crc = logfs_crc32(&h, sizeof(h) - 4, 4);
316 h.data_crc = logfs_crc32(buf, len, 0);
317
318 logfs_buf_write(area, ofs, &h, sizeof(h));
319 logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len);
320
321 shadow->new_ofs = ofs;
322 shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE;
323
324 return 0;
325}
326
327static s64 logfs_segment_write_compress(struct inode *inode, void *buf,
328 struct logfs_shadow *shadow, int type, int len)
329{
330 struct super_block *sb = inode->i_sb;
331 void *compressor_buf = logfs_super(sb)->s_compressed_je;
332 ssize_t compr_len;
333 int ret;
334
335 mutex_lock(&logfs_super(sb)->s_journal_mutex);
336 compr_len = logfs_compress(buf, compressor_buf, len, len);
337
338 if (compr_len >= 0) {
339 ret = __logfs_segment_write(inode, compressor_buf, shadow,
340 type, compr_len, COMPR_ZLIB);
341 } else {
342 ret = __logfs_segment_write(inode, buf, shadow, type, len,
343 COMPR_NONE);
344 }
345 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
346 return ret;
347}
348
349/**
350 * logfs_segment_write - write data block to object store
351 * @inode: inode containing data
 * @page: page holding the data block
 * @shadow: shadow entry describing this write
352 *
353 * Returns an errno or zero.
354 */
355int logfs_segment_write(struct inode *inode, struct page *page,
356 struct logfs_shadow *shadow)
357{
358 struct super_block *sb = inode->i_sb;
359 struct logfs_super *super = logfs_super(sb);
360 int do_compress, type, len;
361 int ret;
362 void *buf;
363
364 super->s_flags |= LOGFS_SB_FLAG_DIRTY;
365 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
366 do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED;
367 if (shadow->gc_level != 0) {
368 /* temporarily disable compression for indirect blocks */
369 do_compress = 0;
370 }
371
372 type = obj_type(inode, shrink_level(shadow->gc_level));
373 len = obj_len(sb, type);
374 buf = kmap(page);
375 if (do_compress)
376 ret = logfs_segment_write_compress(inode, buf, shadow, type,
377 len);
378 else
379 ret = __logfs_segment_write(inode, buf, shadow, type, len,
380 COMPR_NONE);
381 kunmap(page);
382
383 log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n",
384 shadow->ino, shadow->bix, shadow->gc_level,
385 shadow->old_ofs, shadow->new_ofs,
386 shadow->old_len, shadow->new_len);
387 /* This BUG_ON once caught a locking bug and is worth keeping. */
388 BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1)));
389 return ret;
390}
391
392int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf)
393{
394 pgoff_t index = ofs >> PAGE_SHIFT;
395 struct page *page;
396 long offset = ofs & (PAGE_SIZE-1);
397 long copylen;
398
399 while (len) {
400 copylen = min((ulong)len, PAGE_SIZE - offset);
401
402 page = get_mapping_page(sb, index, 1);
403 if (IS_ERR(page))
404 return PTR_ERR(page);
405 memcpy(buf, page_address(page) + offset, copylen);
406 page_cache_release(page);
407
408 buf += copylen;
409 len -= copylen;
410 offset = 0;
411 index++;
412 }
413 return 0;
414}
415
416/*
417 * The "position" of indirect blocks is ambiguous. It can be the position
418 * of any data block somewhere behind this indirect block. So we need to
419 * normalize the positions through logfs_block_mask() before comparing.
420 */
421static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level)
422{
423 return (pos1 & logfs_block_mask(sb, level)) !=
424 (pos2 & logfs_block_mask(sb, level));
425}
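/*
 * Editor's sketch, not part of the original file: the normalization the
 * comment above describes. Assuming a 512-way fan-out, so the level-l
 * mask is ~(512^l - 1), data block positions 14, 15 and 16 all collapse
 * to position 0 of the same level-1 indirect block and therefore
 * compare equal. The mask formula is an assumption, not copied from
 * logfs_block_mask().
 */
#if 0
#include <stdio.h>

static unsigned long long sketch_block_mask(int level)
{
	return ~((1ULL << (9 * level)) - 1);	/* assumed 512-way fan-out */
}

int main(void)
{
	unsigned long long pos;

	for (pos = 14; pos <= 16; pos++)
		printf("pos %llu -> %llu at level 1\n",
		       pos, pos & sketch_block_mask(1));
	return 0;
}
#endif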
426
427#if 0
428static int read_seg_header(struct super_block *sb, u64 ofs,
429 struct logfs_segment_header *sh)
430{
431 __be32 crc;
432 int err;
433
434 err = wbuf_read(sb, ofs, sizeof(*sh), sh);
435 if (err)
436 return err;
437 crc = logfs_crc32(sh, sizeof(*sh), 4);
438 if (crc != sh->crc) {
439 printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
440 "got %x\n", ofs, be32_to_cpu(sh->crc),
441 be32_to_cpu(crc));
442 return -EIO;
443 }
444 return 0;
445}
446#endif
447
448static int read_obj_header(struct super_block *sb, u64 ofs,
449 struct logfs_object_header *oh)
450{
451 __be32 crc;
452 int err;
453
454 err = wbuf_read(sb, ofs, sizeof(*oh), oh);
455 if (err)
456 return err;
457 crc = logfs_crc32(oh, sizeof(*oh) - 4, 4);
458 if (crc != oh->crc) {
459 printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
460 "got %x\n", ofs, be32_to_cpu(oh->crc),
461 be32_to_cpu(crc));
462 return -EIO;
463 }
464 return 0;
465}
466
467static void move_btree_to_page(struct inode *inode, struct page *page,
468 __be64 *data)
469{
470 struct super_block *sb = inode->i_sb;
471 struct logfs_super *super = logfs_super(sb);
472 struct btree_head128 *head = &super->s_object_alias_tree;
473 struct logfs_block *block;
474 struct object_alias_item *item, *next;
475
476 if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS))
477 return;
478
479 block = btree_remove128(head, inode->i_ino, page->index);
480 if (!block)
481 return;
482
483 log_blockmove("move_btree_to_page(%llx, %llx, %x)\n",
484 block->ino, block->bix, block->level);
485 list_for_each_entry_safe(item, next, &block->item_list, list) {
486 data[item->child_no] = item->val;
487 list_del(&item->list);
488 mempool_free(item, super->s_alias_pool);
489 }
490 block->page = page;
491 SetPagePrivate(page);
492 page->private = (unsigned long)block;
493 block->ops = &indirect_block_ops;
494 initialize_block_counters(page, block, data, 0);
495}
496
497/*
498 * This silences a false, yet annoying gcc warning. I hate it when my editor
499 * jumps into bitops.h each time I recompile this file.
500 * TODO: Complain to gcc folks about this and upgrade compiler.
501 */
502static unsigned long fnb(const unsigned long *addr,
503 unsigned long size, unsigned long offset)
504{
505 return find_next_bit(addr, size, offset);
506}
507
508void move_page_to_btree(struct page *page)
509{
510 struct logfs_block *block = logfs_block(page);
511 struct super_block *sb = block->sb;
512 struct logfs_super *super = logfs_super(sb);
513 struct object_alias_item *item;
514 unsigned long pos;
515 __be64 *child;
516 int err;
517
518 if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) {
519 block->ops->free_block(sb, block);
520 return;
521 }
522 log_blockmove("move_page_to_btree(%llx, %llx, %x)\n",
523 block->ino, block->bix, block->level);
524 super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
525
526 for (pos = 0; ; pos++) {
527 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
528 if (pos >= LOGFS_BLOCK_FACTOR)
529 break;
530
531 item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
532 BUG_ON(!item); /* mempool empty */
533 memset(item, 0, sizeof(*item));
534
535 child = kmap_atomic(page, KM_USER0);
536 item->val = child[pos];
537 kunmap_atomic(child, KM_USER0);
538 item->child_no = pos;
539 list_add(&item->list, &block->item_list);
540 }
541 block->page = NULL;
542 ClearPagePrivate(page);
543 page->private = 0;
544 block->ops = &btree_block_ops;
545 err = alias_tree_insert(block->sb, block->ino, block->bix, block->level,
546 block);
547 BUG_ON(err); /* mempool empty */
548 ClearPageUptodate(page);
549}
550
551static int __logfs_segment_read(struct inode *inode, void *buf,
552 u64 ofs, u64 bix, level_t level)
553{
554 struct super_block *sb = inode->i_sb;
555 void *compressor_buf = logfs_super(sb)->s_compressed_je;
556 struct logfs_object_header oh;
557 __be32 crc;
558 u16 len;
559 int err, block_len;
560
561 block_len = obj_len(sb, obj_type(inode, level));
562 err = read_obj_header(sb, ofs, &oh);
563 if (err)
564 goto out_err;
565
566 err = -EIO;
567 if (be64_to_cpu(oh.ino) != inode->i_ino
568 || check_pos(sb, be64_to_cpu(oh.bix), bix, level)) {
569 printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: "
570 "expected (%lx, %llx), got (%llx, %llx)\n",
571 ofs, inode->i_ino, bix,
572 be64_to_cpu(oh.ino), be64_to_cpu(oh.bix));
573 goto out_err;
574 }
575
576 len = be16_to_cpu(oh.len);
577
578 switch (oh.compr) {
579 case COMPR_NONE:
580 err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf);
581 if (err)
582 goto out_err;
583 crc = logfs_crc32(buf, len, 0);
584 if (crc != oh.data_crc) {
585 printk(KERN_ERR"LOGFS: uncompressed data crc error at "
586 "%llx: expected %x, got %x\n", ofs,
587 be32_to_cpu(oh.data_crc),
588 be32_to_cpu(crc));
589 goto out_err;
590 }
591 break;
592 case COMPR_ZLIB:
593 mutex_lock(&logfs_super(sb)->s_journal_mutex);
594 err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len,
595 compressor_buf);
596 if (err) {
597 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
598 goto out_err;
599 }
600 crc = logfs_crc32(compressor_buf, len, 0);
601 if (crc != oh.data_crc) {
602 printk(KERN_ERR"LOGFS: compressed data crc error at "
603 "%llx: expected %x, got %x\n", ofs,
604 be32_to_cpu(oh.data_crc),
605 be32_to_cpu(crc));
606 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
607 goto out_err;
608 }
609 err = logfs_uncompress(compressor_buf, buf, len, block_len);
610 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
611 if (err) {
612 printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs);
613 goto out_err;
614 }
615 break;
616 default:
617 LOGFS_BUG(sb);
618 err = -EIO;
619 goto out_err;
620 }
621 return 0;
622
623out_err:
624 logfs_set_ro(sb);
625 printk(KERN_ERR"LOGFS: device is read-only now\n");
626 LOGFS_BUG(sb);
627 return err;
628}
629
630/**
631 * logfs_segment_read - read data block from object store
632 * @inode: inode containing data
633 * @page: page to read the object into
634 * @ofs: physical data offset
635 * @bix: block index
636 * @level: block level
637 *
638 * Returns 0 on success or a negative errno.
639 */
640int logfs_segment_read(struct inode *inode, struct page *page,
641 u64 ofs, u64 bix, level_t level)
642{
643 int err;
644 void *buf;
645
646 if (PageUptodate(page))
647 return 0;
648
649 ofs &= ~LOGFS_FULLY_POPULATED;
650
651 buf = kmap(page);
652 err = __logfs_segment_read(inode, buf, ofs, bix, level);
653 if (!err) {
654 move_btree_to_page(inode, page, buf);
655 SetPageUptodate(page);
656 }
657 kunmap(page);
658 log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n",
659 inode->i_ino, bix, level, ofs, err);
660 return err;
661}
662
663int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
664{
665 struct super_block *sb = inode->i_sb;
666 struct logfs_super *super = logfs_super(sb);
667 struct logfs_object_header h;
668 u16 len;
669 int err;
670
671 super->s_flags |= LOGFS_SB_FLAG_DIRTY;
672 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
673 BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED);
674 if (!shadow->old_ofs)
675 return 0;
676
677 log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n",
678 shadow->ino, shadow->bix, shadow->gc_level,
679 shadow->old_ofs, shadow->new_ofs,
680 shadow->old_len, shadow->new_len);
681 err = read_obj_header(sb, shadow->old_ofs, &h);
682 LOGFS_BUG_ON(err, sb);
683 LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb);
684 LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix),
685 shrink_level(shadow->gc_level)), sb);
686
687 if (shadow->gc_level == 0)
688 len = be16_to_cpu(h.len);
689 else
690 len = obj_len(sb, h.type);
691 shadow->old_len = len + sizeof(h);
692 return 0;
693}
694
695void freeseg(struct super_block *sb, u32 segno)
696{
697 struct logfs_super *super = logfs_super(sb);
698 struct address_space *mapping = super->s_mapping_inode->i_mapping;
699 struct page *page;
700 u64 ofs, start, end;
701
702 start = dev_ofs(sb, segno, 0);
703 end = dev_ofs(sb, segno + 1, 0);
704 for (ofs = start; ofs < end; ofs += PAGE_SIZE) {
705 page = find_get_page(mapping, ofs >> PAGE_SHIFT);
706 if (!page)
707 continue;
708 ClearPagePrivate(page);
709 page_cache_release(page);
710 }
711}
712
713int logfs_open_area(struct logfs_area *area, size_t bytes)
714{
715 struct super_block *sb = area->a_sb;
716 struct logfs_super *super = logfs_super(sb);
717 int err, closed = 0;
718
719 if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize)
720 return 0;
721
722 if (area->a_is_open) {
723 u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
724 u32 len = super->s_segsize - area->a_written_bytes;
725
726 log_gc("logfs_close_area(%x)\n", area->a_segno);
727 pad_wbuf(area, 1);
728 super->s_devops->writeseg(area->a_sb, ofs, len);
729 freeseg(sb, area->a_segno);
730 closed = 1;
731 }
732
733 area->a_used_bytes = 0;
734 area->a_written_bytes = 0;
735again:
736 area->a_ops->get_free_segment(area);
737 area->a_ops->get_erase_count(area);
738
739 log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level);
740 err = area->a_ops->erase_segment(area);
741 if (err) {
742 printk(KERN_WARNING "LogFS: Error erasing segment %x\n",
743 area->a_segno);
744 logfs_mark_segment_bad(sb, area->a_segno);
745 goto again;
746 }
747 area->a_is_open = 1;
748 return closed;
749}
750
751void logfs_sync_area(struct logfs_area *area)
752{
753 struct super_block *sb = area->a_sb;
754 struct logfs_super *super = logfs_super(sb);
755 u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
756 u32 len = (area->a_used_bytes - area->a_written_bytes);
757
758 if (super->s_writesize)
759 len &= ~(super->s_writesize - 1);
760 if (len == 0)
761 return;
762 pad_wbuf(area, 0);
763 super->s_devops->writeseg(sb, ofs, len);
764 area->a_written_bytes += len;
765}
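/*
 * Editor's sketch, not part of the original file: the write-size
 * alignment above. On flash with an assumed 256-byte write unit, 1000
 * pending bytes are flushed as 768; the remaining 232 stay buffered
 * until the next sync or until pad_wbuf() pads them out at segment
 * close.
 */
#if 0
#include <assert.h>

int main(void)
{
	unsigned writesize = 256;	/* assumed device write unit */
	unsigned pending = 1000;
	unsigned len = pending & ~(writesize - 1);

	assert(len == 768);
	assert(pending - len == 232);
	return 0;
}
#endif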
766
767void logfs_sync_segments(struct super_block *sb)
768{
769 struct logfs_super *super = logfs_super(sb);
770 int i;
771
772 for_each_area(i)
773 logfs_sync_area(super->s_area[i]);
774}
775
776/*
777 * Pick a free segment to be used for this area. Effectively takes a
778 * candidate from the free list (not really a candidate anymore).
779 */
780static void ostore_get_free_segment(struct logfs_area *area)
781{
782 struct super_block *sb = area->a_sb;
783 struct logfs_super *super = logfs_super(sb);
784
785 if (super->s_free_list.count == 0) {
786 printk(KERN_ERR"LOGFS: ran out of free segments\n");
787 LOGFS_BUG(sb);
788 }
789
790 area->a_segno = get_best_cand(sb, &super->s_free_list, NULL);
791}
792
793static void ostore_get_erase_count(struct logfs_area *area)
794{
795 struct logfs_segment_entry se;
796 u32 ec_level;
797
798 logfs_get_segment_entry(area->a_sb, area->a_segno, &se);
799 BUG_ON(se.ec_level == cpu_to_be32(BADSEG) ||
800 se.valid == cpu_to_be32(RESERVED));
801
802 ec_level = be32_to_cpu(se.ec_level);
803 area->a_erase_count = (ec_level >> 4) + 1;
804}
805
806static int ostore_erase_segment(struct logfs_area *area)
807{
808 struct super_block *sb = area->a_sb;
809 struct logfs_segment_header sh;
810 u64 ofs;
811 int err;
812
813 err = logfs_erase_segment(sb, area->a_segno, 0);
814 if (err)
815 return err;
816
817 sh.pad = 0;
818 sh.type = SEG_OSTORE;
819 sh.level = (__force u8)area->a_level;
820 sh.segno = cpu_to_be32(area->a_segno);
821 sh.ec = cpu_to_be32(area->a_erase_count);
822 sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
823 sh.crc = logfs_crc32(&sh, sizeof(sh), 4);
824
825 logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count,
826 area->a_level);
827
828 ofs = dev_ofs(sb, area->a_segno, 0);
829 area->a_used_bytes = sizeof(sh);
830 logfs_buf_write(area, ofs, &sh, sizeof(sh));
831 return 0;
832}
833
834static const struct logfs_area_ops ostore_area_ops = {
835 .get_free_segment = ostore_get_free_segment,
836 .get_erase_count = ostore_get_erase_count,
837 .erase_segment = ostore_erase_segment,
838};
839
840static void free_area(struct logfs_area *area)
841{
842 if (area)
843 freeseg(area->a_sb, area->a_segno);
844 kfree(area);
845}
846
847static struct logfs_area *alloc_area(struct super_block *sb)
848{
849 struct logfs_area *area;
850
851 area = kzalloc(sizeof(*area), GFP_KERNEL);
852 if (!area)
853 return NULL;
854
855 area->a_sb = sb;
856 return area;
857}
858
859static void map_invalidatepage(struct page *page, unsigned long l)
860{
861 BUG();
862}
863
864static int map_releasepage(struct page *page, gfp_t g)
865{
866 /* Don't release these pages */
867 return 0;
868}
869
870static const struct address_space_operations mapping_aops = {
871 .invalidatepage = map_invalidatepage,
872 .releasepage = map_releasepage,
873 .set_page_dirty = __set_page_dirty_nobuffers,
874};
875
876int logfs_init_mapping(struct super_block *sb)
877{
878 struct logfs_super *super = logfs_super(sb);
879 struct address_space *mapping;
880 struct inode *inode;
881
882 inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING);
883 if (IS_ERR(inode))
884 return PTR_ERR(inode);
885 super->s_mapping_inode = inode;
886 mapping = inode->i_mapping;
887 mapping->a_ops = &mapping_aops;
888 /* Would it be possible to use __GFP_HIGHMEM as well? */
889 mapping_set_gfp_mask(mapping, GFP_NOFS);
890 return 0;
891}
892
893int logfs_init_areas(struct super_block *sb)
894{
895 struct logfs_super *super = logfs_super(sb);
896 int i = -1;
897
898 super->s_alias_pool = mempool_create_kmalloc_pool(600,
899 sizeof(struct object_alias_item));
900 if (!super->s_alias_pool)
901 return -ENOMEM;
902
903 super->s_journal_area = alloc_area(sb);
904 if (!super->s_journal_area)
905 goto err;
906
907 for_each_area(i) {
908 super->s_area[i] = alloc_area(sb);
909 if (!super->s_area[i])
910 goto err;
911 super->s_area[i]->a_level = GC_LEVEL(i);
912 super->s_area[i]->a_ops = &ostore_area_ops;
913 }
914 btree_init_mempool128(&super->s_object_alias_tree,
915 super->s_btree_pool);
916 return 0;
917
918err:
919 for (i--; i >= 0; i--)
920 free_area(super->s_area[i]);
921 free_area(super->s_journal_area);
922 mempool_destroy(super->s_alias_pool);
923 return -ENOMEM;
924}
925
926void logfs_cleanup_areas(struct super_block *sb)
927{
928 struct logfs_super *super = logfs_super(sb);
929 int i;
930
931 btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias);
932 for_each_area(i)
933 free_area(super->s_area[i]);
934 free_area(super->s_journal_area);
935 destroy_meta_inode(super->s_mapping_inode);
936}
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
new file mode 100644
index 000000000000..b60bfac3263c
--- /dev/null
+++ b/fs/logfs/super.c
@@ -0,0 +1,650 @@
1/*
2 * fs/logfs/super.c
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Generally contains mount/umount code and also serves as a dump area for
 9 * any functions that don't fit elsewhere and don't justify a file of their
10 * own.
11 */
12#include "logfs.h"
13#include <linux/bio.h>
14#include <linux/slab.h>
15#include <linux/mtd/mtd.h>
16#include <linux/statfs.h>
17#include <linux/buffer_head.h>
18
19static DEFINE_MUTEX(emergency_mutex);
20static struct page *emergency_page;
21
22struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index)
23{
24 filler_t *filler = (filler_t *)mapping->a_ops->readpage;
25 struct page *page;
26 int err;
27
28 page = read_cache_page(mapping, index, filler, NULL);
 29	if (!IS_ERR(page))
30 return page;
31
32 /* No more pages available, switch to emergency page */
33 printk(KERN_INFO"Logfs: Using emergency page\n");
34 mutex_lock(&emergency_mutex);
35 err = filler(NULL, emergency_page);
36 if (err) {
37 mutex_unlock(&emergency_mutex);
38 printk(KERN_EMERG"Logfs: Error reading emergency page\n");
39 return ERR_PTR(err);
40 }
41 return emergency_page;
42}
43
44void emergency_read_end(struct page *page)
45{
46 if (page == emergency_page)
47 mutex_unlock(&emergency_mutex);
48 else
49 page_cache_release(page);
50}
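/*
 * A minimal userspace sketch of the fallback pattern above (illustrative
 * names, not logfs code): one buffer preallocated at startup plus a mutex
 * guarantees forward progress when the allocator fails, and the release
 * path must tell the emergency buffer apart from a normal one, exactly as
 * emergency_read_end() does.
 * Build: cc -pthread fallback_demo.c
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t emergency_mutex = PTHREAD_MUTEX_INITIALIZER;
static char emergency_buf[4096];	/* allocated once, up front */

static char *buf_get(void)
{
	char *buf = malloc(4096);

	if (buf)
		return buf;
	/* No memory left: serialize all users through the emergency buffer */
	pthread_mutex_lock(&emergency_mutex);
	return emergency_buf;
}

static void buf_put(char *buf)
{
	if (buf == emergency_buf)
		pthread_mutex_unlock(&emergency_mutex);
	else
		free(buf);
}

int main(void)
{
	char *p = buf_get();

	snprintf(p, 4096, "hello");
	printf("%s via %s buffer\n", p,
	       p == emergency_buf ? "emergency" : "heap");
	buf_put(p);
	return 0;
}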
51
52static void dump_segfile(struct super_block *sb)
53{
54 struct logfs_super *super = logfs_super(sb);
55 struct logfs_segment_entry se;
56 u32 segno;
57
58 for (segno = 0; segno < super->s_no_segs; segno++) {
59 logfs_get_segment_entry(sb, segno, &se);
60 printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level),
61 be32_to_cpu(se.valid));
62 if (++segno < super->s_no_segs) {
63 logfs_get_segment_entry(sb, segno, &se);
64 printk(" %6x %8x", be32_to_cpu(se.ec_level),
65 be32_to_cpu(se.valid));
66 }
67 if (++segno < super->s_no_segs) {
68 logfs_get_segment_entry(sb, segno, &se);
69 printk(" %6x %8x", be32_to_cpu(se.ec_level),
70 be32_to_cpu(se.valid));
71 }
72 if (++segno < super->s_no_segs) {
73 logfs_get_segment_entry(sb, segno, &se);
74 printk(" %6x %8x", be32_to_cpu(se.ec_level),
75 be32_to_cpu(se.valid));
76 }
77 printk("\n");
78 }
79}
80
81/*
82 * logfs_crash_dump - dump debug information to device
83 *
84 * The LogFS superblock only occupies part of a segment. This function will
85 * write as much debug information as it can gather into the spare space.
86 */
87void logfs_crash_dump(struct super_block *sb)
88{
89 dump_segfile(sb);
90}
91
92/*
93 * TODO: move to lib/string.c
94 */
95/**
 96 * memchr_inv - Find an unmatched character in an area of memory.
97 * @s: The memory area
98 * @c: The byte to search for
99 * @n: The size of the area.
100 *
101 * returns the address of the first character other than @c, or %NULL
102 * if the whole buffer contains just @c.
103 */
104void *memchr_inv(const void *s, int c, size_t n)
105{
106 const unsigned char *p = s;
107 while (n-- != 0)
108 if ((unsigned char)c != *p++)
109 return (void *)(p - 1);
110
111 return NULL;
112}
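/*
 * A userspace sketch of how memchr_inv() is typically used (the 0xFF scan
 * is an illustrative example, plausibly why the helper exists here): flash
 * erases to all-0xFF, so scanning for a byte that is *not* 0xFF tells you
 * whether a segment is really clean, and where the first dirty byte sits.
 * Build: cc memchr_inv_demo.c
 */
#include <stdio.h>
#include <string.h>

static void *memchr_inv(const void *s, int c, size_t n)
{
	const unsigned char *p = s;

	while (n-- != 0)
		if ((unsigned char)c != *p++)
			return (void *)(p - 1);
	return NULL;
}

int main(void)
{
	unsigned char seg[64];
	void *hit;

	memset(seg, 0xFF, sizeof(seg));
	printf("clean segment: %p\n", memchr_inv(seg, 0xFF, sizeof(seg)));

	seg[17] = 0x00;
	hit = memchr_inv(seg, 0xFF, sizeof(seg));
	printf("first dirty byte at offset %td\n",
	       (unsigned char *)hit - seg);	/* prints 17 */
	return 0;
}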
113
114/*
115 * FIXME: There should be a reserve for root, similar to ext2.
116 */
117int logfs_statfs(struct dentry *dentry, struct kstatfs *stats)
118{
119 struct super_block *sb = dentry->d_sb;
120 struct logfs_super *super = logfs_super(sb);
121
122 stats->f_type = LOGFS_MAGIC_U32;
123 stats->f_bsize = sb->s_blocksize;
124 stats->f_blocks = super->s_size >> LOGFS_BLOCK_BITS >> 3;
125 stats->f_bfree = super->s_free_bytes >> sb->s_blocksize_bits;
126 stats->f_bavail = super->s_free_bytes >> sb->s_blocksize_bits;
127 stats->f_files = 0;
128 stats->f_ffree = 0;
129 stats->f_namelen = LOGFS_MAX_NAMELEN;
130 return 0;
131}
132
133static int logfs_sb_set(struct super_block *sb, void *_super)
134{
135 struct logfs_super *super = _super;
136
137 sb->s_fs_info = super;
138 sb->s_mtd = super->s_mtd;
139 sb->s_bdev = super->s_bdev;
140 return 0;
141}
142
143static int logfs_sb_test(struct super_block *sb, void *_super)
144{
145 struct logfs_super *super = _super;
146 struct mtd_info *mtd = super->s_mtd;
147
148 if (mtd && sb->s_mtd == mtd)
149 return 1;
150 if (super->s_bdev && sb->s_bdev == super->s_bdev)
151 return 1;
152 return 0;
153}
154
155static void set_segment_header(struct logfs_segment_header *sh, u8 type,
156 u8 level, u32 segno, u32 ec)
157{
158 sh->pad = 0;
159 sh->type = type;
160 sh->level = level;
161 sh->segno = cpu_to_be32(segno);
162 sh->ec = cpu_to_be32(ec);
163 sh->gec = cpu_to_be64(segno);
164 sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4);
165}
166
167static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds,
168 u32 segno, u32 ec)
169{
170 struct logfs_super *super = logfs_super(sb);
171 struct logfs_segment_header *sh = &ds->ds_sh;
172 int i;
173
174 memset(ds, 0, sizeof(*ds));
175 set_segment_header(sh, SEG_SUPER, 0, segno, ec);
176
177 ds->ds_ifile_levels = super->s_ifile_levels;
178 ds->ds_iblock_levels = super->s_iblock_levels;
179 ds->ds_data_levels = super->s_data_levels; /* XXX: Remove */
180 ds->ds_segment_shift = super->s_segshift;
181 ds->ds_block_shift = sb->s_blocksize_bits;
182 ds->ds_write_shift = super->s_writeshift;
183 ds->ds_filesystem_size = cpu_to_be64(super->s_size);
184 ds->ds_segment_size = cpu_to_be32(super->s_segsize);
185 ds->ds_bad_seg_reserve = cpu_to_be32(super->s_bad_seg_reserve);
186 ds->ds_feature_incompat = cpu_to_be64(super->s_feature_incompat);
 187	ds->ds_feature_ro_compat = cpu_to_be64(super->s_feature_ro_compat);
188 ds->ds_feature_compat = cpu_to_be64(super->s_feature_compat);
189 ds->ds_feature_flags = cpu_to_be64(super->s_feature_flags);
190 ds->ds_root_reserve = cpu_to_be64(super->s_root_reserve);
191 ds->ds_speed_reserve = cpu_to_be64(super->s_speed_reserve);
192 journal_for_each(i)
193 ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]);
194 ds->ds_magic = cpu_to_be64(LOGFS_MAGIC);
195 ds->ds_crc = logfs_crc32(ds, sizeof(*ds),
196 LOGFS_SEGMENT_HEADERSIZE + 12);
197}
198
199static int write_one_sb(struct super_block *sb,
200 struct page *(*find_sb)(struct super_block *sb, u64 *ofs))
201{
202 struct logfs_super *super = logfs_super(sb);
203 struct logfs_disk_super *ds;
204 struct logfs_segment_entry se;
205 struct page *page;
206 u64 ofs;
207 u32 ec, segno;
208 int err;
209
210 page = find_sb(sb, &ofs);
211 if (!page)
212 return -EIO;
213 ds = page_address(page);
214 segno = seg_no(sb, ofs);
215 logfs_get_segment_entry(sb, segno, &se);
216 ec = be32_to_cpu(se.ec_level) >> 4;
217 ec++;
218 logfs_set_segment_erased(sb, segno, ec, 0);
219 logfs_write_ds(sb, ds, segno, ec);
220 err = super->s_devops->write_sb(sb, page);
221 page_cache_release(page);
222 return err;
223}
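/*
 * The "be32_to_cpu(se.ec_level) >> 4" above implies that a segment entry
 * packs its erase count in the upper 28 bits and the GC level in the low
 * 4. A hedged sketch of that packing; the helper names are hypothetical,
 * not logfs API.
 * Build: cc ec_level_demo.c
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t pack_ec_level(uint32_t ec, uint8_t level)
{
	return (ec << 4) | (level & 0xf);
}

static uint32_t unpack_ec(uint32_t v)	 { return v >> 4; }
static uint8_t	unpack_level(uint32_t v) { return v & 0xf; }

int main(void)
{
	uint32_t v = pack_ec_level(1000, 3);

	assert(unpack_ec(v) == 1000 && unpack_level(v) == 3);
	/* write_one_sb() bumps the erase count and resets the level to 0: */
	v = pack_ec_level(unpack_ec(v) + 1, 0);
	printf("ec=%u level=%u\n", unpack_ec(v), unpack_level(v));
	return 0;
}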
224
225int logfs_write_sb(struct super_block *sb)
226{
227 struct logfs_super *super = logfs_super(sb);
228 int err;
229
230 /* First superblock */
231 err = write_one_sb(sb, super->s_devops->find_first_sb);
232 if (err)
233 return err;
234
235 /* Last superblock */
236 err = write_one_sb(sb, super->s_devops->find_last_sb);
237 if (err)
238 return err;
239 return 0;
240}
241
242static int ds_cmp(const void *ds0, const void *ds1)
243{
244 size_t len = sizeof(struct logfs_disk_super);
245
246 /* We know the segment headers differ, so ignore them */
247 len -= LOGFS_SEGMENT_HEADERSIZE;
248 ds0 += LOGFS_SEGMENT_HEADERSIZE;
249 ds1 += LOGFS_SEGMENT_HEADERSIZE;
250 return memcmp(ds0, ds1, len);
251}
252
253static int logfs_recover_sb(struct super_block *sb)
254{
255 struct logfs_super *super = logfs_super(sb);
256 struct logfs_disk_super _ds0, *ds0 = &_ds0;
257 struct logfs_disk_super _ds1, *ds1 = &_ds1;
258 int err, valid0, valid1;
259
260 /* read first superblock */
261 err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0);
262 if (err)
263 return err;
264 /* read last superblock */
265 err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1);
266 if (err)
267 return err;
268 valid0 = logfs_check_ds(ds0) == 0;
269 valid1 = logfs_check_ds(ds1) == 0;
270
271 if (!valid0 && valid1) {
272 printk(KERN_INFO"First superblock is invalid - fixing.\n");
273 return write_one_sb(sb, super->s_devops->find_first_sb);
274 }
275 if (valid0 && !valid1) {
276 printk(KERN_INFO"Last superblock is invalid - fixing.\n");
277 return write_one_sb(sb, super->s_devops->find_last_sb);
278 }
279 if (valid0 && valid1 && ds_cmp(ds0, ds1)) {
280 printk(KERN_INFO"Superblocks don't match - fixing.\n");
281 return logfs_write_sb(sb);
282 }
283 /* If neither is valid now, something's wrong. Didn't we properly
284 * check them before?!? */
285 BUG_ON(!valid0 && !valid1);
286 return 0;
287}
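/*
 * The repair policy of logfs_recover_sb(), condensed into one pure
 * function (a hedged summary sketch, same decisions as the code above):
 * Build: cc recover_demo.c
 */
#include <stdio.h>

static const char *recover_action(int valid0, int valid1, int differ)
{
	if (!valid0 && valid1)
		return "rewrite first copy";
	if (valid0 && !valid1)
		return "rewrite last copy";
	if (valid0 && valid1 && differ)
		return "rewrite both copies";
	if (valid0 && valid1)
		return "nothing to do";
	return "BUG: both invalid";	/* earlier mount checks rule this out */
}

int main(void)
{
	printf("%s\n", recover_action(0, 1, 0));
	printf("%s\n", recover_action(1, 1, 1));
	printf("%s\n", recover_action(1, 1, 0));
	return 0;
}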
288
289static int logfs_make_writeable(struct super_block *sb)
290{
291 int err;
292
293 err = logfs_open_segfile(sb);
294 if (err)
295 return err;
296
297 /* Repair any broken superblock copies */
298 err = logfs_recover_sb(sb);
299 if (err)
300 return err;
301
302 /* Check areas for trailing unaccounted data */
303 err = logfs_check_areas(sb);
304 if (err)
305 return err;
306
307 /* Do one GC pass before any data gets dirtied */
308 logfs_gc_pass(sb);
309
310 /* after all initializations are done, replay the journal
311 * for rw-mounts, if necessary */
312 err = logfs_replay_journal(sb);
313 if (err)
314 return err;
315
316 return 0;
317}
318
319static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
320{
321 struct logfs_super *super = logfs_super(sb);
322 struct inode *rootdir;
323 int err;
324
325 /* root dir */
326 rootdir = logfs_iget(sb, LOGFS_INO_ROOT);
327 if (IS_ERR(rootdir))
328 goto fail;
329
330 sb->s_root = d_alloc_root(rootdir);
331 if (!sb->s_root)
332 goto fail2;
333
334 super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
335 if (!super->s_erase_page)
336 goto fail2;
337 memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);
338
339 /* FIXME: check for read-only mounts */
340 err = logfs_make_writeable(sb);
341 if (err)
342 goto fail3;
343
344 log_super("LogFS: Finished mounting\n");
345 simple_set_mnt(mnt, sb);
346 return 0;
347
348fail3:
349 __free_page(super->s_erase_page);
350fail2:
351 iput(rootdir);
352fail:
353 iput(logfs_super(sb)->s_master_inode);
354 return -EIO;
355}
356
357int logfs_check_ds(struct logfs_disk_super *ds)
358{
359 struct logfs_segment_header *sh = &ds->ds_sh;
360
361 if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC))
362 return -EINVAL;
363 if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4))
364 return -EINVAL;
365 if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds),
366 LOGFS_SEGMENT_HEADERSIZE + 12))
367 return -EINVAL;
368 return 0;
369}
370
371static struct page *find_super_block(struct super_block *sb)
372{
373 struct logfs_super *super = logfs_super(sb);
374 struct page *first, *last;
375
376 first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]);
377 if (!first || IS_ERR(first))
378 return NULL;
379 last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
 380	if (!last || IS_ERR(last)) {
381 page_cache_release(first);
382 return NULL;
383 }
384
385 if (!logfs_check_ds(page_address(first))) {
386 page_cache_release(last);
387 return first;
388 }
389
390 /* First one didn't work, try the second superblock */
391 if (!logfs_check_ds(page_address(last))) {
392 page_cache_release(first);
393 return last;
394 }
395
396 /* Neither worked, sorry folks */
397 page_cache_release(first);
398 page_cache_release(last);
399 return NULL;
400}
401
402static int __logfs_read_sb(struct super_block *sb)
403{
404 struct logfs_super *super = logfs_super(sb);
405 struct page *page;
406 struct logfs_disk_super *ds;
407 int i;
408
409 page = find_super_block(sb);
410 if (!page)
411 return -EIO;
412
413 ds = page_address(page);
414 super->s_size = be64_to_cpu(ds->ds_filesystem_size);
415 super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve);
416 super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve);
417 super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve);
418 super->s_segsize = 1 << ds->ds_segment_shift;
419 super->s_segmask = (1 << ds->ds_segment_shift) - 1;
420 super->s_segshift = ds->ds_segment_shift;
421 sb->s_blocksize = 1 << ds->ds_block_shift;
422 sb->s_blocksize_bits = ds->ds_block_shift;
423 super->s_writesize = 1 << ds->ds_write_shift;
424 super->s_writeshift = ds->ds_write_shift;
425 super->s_no_segs = super->s_size >> super->s_segshift;
426 super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits;
427 super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat);
428 super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat);
429 super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat);
430 super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags);
431
432 journal_for_each(i)
433 super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]);
434
435 super->s_ifile_levels = ds->ds_ifile_levels;
436 super->s_iblock_levels = ds->ds_iblock_levels;
437 super->s_data_levels = ds->ds_data_levels;
438 super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels
439 + super->s_data_levels;
440 page_cache_release(page);
441 return 0;
442}
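/*
 * Worked example of the shift-derived geometry read above, for a
 * hypothetical 64 MiB device with 128 KiB segments (ds_segment_shift =
 * 17) and 4 KiB blocks (ds_block_shift = 12):
 * Build: cc geometry_demo.c
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t size = 64ULL << 20;		/* ds_filesystem_size */
	int segshift = 17;
	int blockshift = 12;

	uint32_t segsize = 1u << segshift;	/* 131072 */
	uint32_t segmask = segsize - 1;		/* 0x1ffff */
	uint64_t no_segs = size >> segshift;	/* 512 segments */
	uint32_t no_blocks = segsize >> blockshift; /* 32 blocks/segment */

	printf("segsize=%u segmask=%#x no_segs=%llu blocks/seg=%u\n",
	       segsize, segmask, (unsigned long long)no_segs, no_blocks);
	return 0;
}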
443
444static int logfs_read_sb(struct super_block *sb, int read_only)
445{
446 struct logfs_super *super = logfs_super(sb);
447 int ret;
448
449 super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL);
450 if (!super->s_btree_pool)
451 return -ENOMEM;
452
453 btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool);
454 btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool);
455
456 ret = logfs_init_mapping(sb);
457 if (ret)
458 return ret;
459
460 ret = __logfs_read_sb(sb);
461 if (ret)
462 return ret;
463
464 if (super->s_feature_incompat & ~LOGFS_FEATURES_INCOMPAT)
465 return -EIO;
466 if ((super->s_feature_ro_compat & ~LOGFS_FEATURES_RO_COMPAT) &&
467 !read_only)
468 return -EIO;
469
470 mutex_init(&super->s_dirop_mutex);
471 mutex_init(&super->s_object_alias_mutex);
472 INIT_LIST_HEAD(&super->s_freeing_list);
473
474 ret = logfs_init_rw(sb);
475 if (ret)
476 return ret;
477
478 ret = logfs_init_areas(sb);
479 if (ret)
480 return ret;
481
482 ret = logfs_init_gc(sb);
483 if (ret)
484 return ret;
485
486 ret = logfs_init_journal(sb);
487 if (ret)
488 return ret;
489
490 return 0;
491}
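/*
 * The two feature checks above follow the ext2 convention: unknown
 * incompat bits make the filesystem unmountable, unknown ro_compat bits
 * only forbid a read/write mount, and unknown compat bits are ignored.
 * A sketch with made-up mask values (not the real LOGFS_FEATURES_*):
 * Build: cc features_demo.c
 */
#include <stdint.h>
#include <stdio.h>

#define KNOWN_INCOMPAT	0x0001ULL
#define KNOWN_RO_COMPAT	0x0003ULL

static int check_features(uint64_t incompat, uint64_t ro_compat, int ro)
{
	if (incompat & ~KNOWN_INCOMPAT)
		return -1;			/* refuse any mount */
	if ((ro_compat & ~KNOWN_RO_COMPAT) && !ro)
		return -1;			/* refuse only rw mounts */
	return 0;
}

int main(void)
{
	printf("rw with unknown ro bit: %d\n", check_features(0x1, 0x4, 0));
	printf("ro with unknown ro bit: %d\n", check_features(0x1, 0x4, 1));
	return 0;
}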
492
493static void logfs_kill_sb(struct super_block *sb)
494{
495 struct logfs_super *super = logfs_super(sb);
496
497 log_super("LogFS: Start unmounting\n");
498 /* Alias entries slow down mount, so evict as many as possible */
499 sync_filesystem(sb);
500 logfs_write_anchor(sb);
501
502 /*
503 * From this point on alias entries are simply dropped - and any
504 * writes to the object store are considered bugs.
505 */
506 super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
507 log_super("LogFS: Now in shutdown\n");
508 generic_shutdown_super(sb);
509
510 BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes);
511
512 logfs_cleanup_gc(sb);
513 logfs_cleanup_journal(sb);
514 logfs_cleanup_areas(sb);
515 logfs_cleanup_rw(sb);
516 if (super->s_erase_page)
517 __free_page(super->s_erase_page);
518 super->s_devops->put_device(sb);
519 mempool_destroy(super->s_btree_pool);
520 mempool_destroy(super->s_alias_pool);
521 kfree(super);
522 log_super("LogFS: Finished unmounting\n");
523}
524
525int logfs_get_sb_device(struct file_system_type *type, int flags,
526 struct mtd_info *mtd, struct block_device *bdev,
527 const struct logfs_device_ops *devops, struct vfsmount *mnt)
528{
529 struct logfs_super *super;
530 struct super_block *sb;
531 int err = -ENOMEM;
532 static int mount_count;
533
534 log_super("LogFS: Start mount %x\n", mount_count++);
535 super = kzalloc(sizeof(*super), GFP_KERNEL);
536 if (!super)
537 goto err0;
538
539 super->s_mtd = mtd;
540 super->s_bdev = bdev;
541 err = -EINVAL;
542 sb = sget(type, logfs_sb_test, logfs_sb_set, super);
543 if (IS_ERR(sb))
544 goto err0;
545
546 if (sb->s_root) {
547 /* Device is already in use */
548 err = 0;
549 simple_set_mnt(mnt, sb);
550 goto err0;
551 }
552
553 super->s_devops = devops;
554
555 /*
556 * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache
557 * only covers 16TB and the upper 8TB are used for indirect blocks.
 558	 * On 64bit systems we could bump up the limit, but that would make
559 * the filesystem incompatible with 32bit systems.
560 */
561 sb->s_maxbytes = (1ull << 43) - 1;
562 sb->s_op = &logfs_super_operations;
563 sb->s_flags = flags | MS_NOATIME;
564
565 err = logfs_read_sb(sb, sb->s_flags & MS_RDONLY);
566 if (err)
567 goto err1;
568
569 sb->s_flags |= MS_ACTIVE;
570 err = logfs_get_sb_final(sb, mnt);
571 if (err)
572 goto err1;
573 return 0;
574
575err1:
576 deactivate_locked_super(sb);
577 return err;
578err0:
579 kfree(super);
 580	/* devops->put_device(sb); */
581 return err;
582}
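/*
 * The s_maxbytes comment in logfs_get_sb_device() above, spelled out:
 * 2^43 bytes is 8 TiB, and a 32-bit pgoff_t with 4 KiB pages spans
 * 2^32 * 2^12 = 2^44 bytes = 16 TiB of page cache, so an 8 TiB file
 * leaves the upper half of the index space for indirect blocks.
 * Build: cc maxbytes_demo.c
 */
#include <stdio.h>

int main(void)
{
	unsigned long long maxbytes = (1ULL << 43) - 1;
	unsigned long long cache_span = 1ULL << (32 + 12);

	printf("s_maxbytes   = %llu (%llu TiB - 1 byte)\n",
	       maxbytes, (maxbytes + 1) >> 40);
	printf("32-bit cache = %llu TiB\n", cache_span >> 40);
	return 0;
}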
583
584static int logfs_get_sb(struct file_system_type *type, int flags,
585 const char *devname, void *data, struct vfsmount *mnt)
586{
587 ulong mtdnr;
588
589 if (!devname)
590 return logfs_get_sb_bdev(type, flags, devname, mnt);
591 if (strncmp(devname, "mtd", 3))
592 return logfs_get_sb_bdev(type, flags, devname, mnt);
593
594 {
595 char *garbage;
596 mtdnr = simple_strtoul(devname+3, &garbage, 0);
597 if (*garbage)
598 return -EINVAL;
599 }
600
601 return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
602}
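/*
 * A userspace sketch of the device-name dispatch above (illustrative
 * names): "mtd" followed purely by digits selects an MTD number via
 * strtoul, any trailing garbage is rejected just like the *garbage check,
 * and everything else falls through to the block-device path.
 * Build: cc devname_demo.c
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Returns 1 for an MTD name (storing the number), 0 for a block device
 * path, -1 for a malformed MTD name. */
static int parse_devname(const char *devname, unsigned long *mtdnr)
{
	char *end;

	if (!devname || strncmp(devname, "mtd", 3))
		return 0;
	*mtdnr = strtoul(devname + 3, &end, 0);
	if (*end)
		return -1;
	return 1;
}

int main(void)
{
	unsigned long nr = 0;

	printf("/dev/sdb1 -> %d\n", parse_devname("/dev/sdb1", &nr));
	printf("mtd2      -> %d (nr=%lu)\n", parse_devname("mtd2", &nr), nr);
	printf("mtd2x     -> %d\n", parse_devname("mtd2x", &nr));
	return 0;
}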
603
604static struct file_system_type logfs_fs_type = {
605 .owner = THIS_MODULE,
606 .name = "logfs",
607 .get_sb = logfs_get_sb,
608 .kill_sb = logfs_kill_sb,
609 .fs_flags = FS_REQUIRES_DEV,
610
611};
612
613static int __init logfs_init(void)
614{
615 int ret;
616
617 emergency_page = alloc_pages(GFP_KERNEL, 0);
618 if (!emergency_page)
619 return -ENOMEM;
620
621 ret = logfs_compr_init();
622 if (ret)
623 goto out1;
624
625 ret = logfs_init_inode_cache();
626 if (ret)
627 goto out2;
628
629 return register_filesystem(&logfs_fs_type);
630out2:
631 logfs_compr_exit();
632out1:
633 __free_pages(emergency_page, 0);
634 return ret;
635}
636
637static void __exit logfs_exit(void)
638{
639 unregister_filesystem(&logfs_fs_type);
640 logfs_destroy_inode_cache();
641 logfs_compr_exit();
642 __free_pages(emergency_page, 0);
643}
644
645module_init(logfs_init);
646module_exit(logfs_exit);
647
648MODULE_LICENSE("GPL v2");
649MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
650MODULE_DESCRIPTION("scalable flash filesystem");
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 74ea82d72164..756f8c93780c 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -17,8 +17,10 @@
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/highuid.h> 18#include <linux/highuid.h>
19#include <linux/vfs.h> 19#include <linux/vfs.h>
20#include <linux/writeback.h>
20 21
21static int minix_write_inode(struct inode * inode, int wait); 22static int minix_write_inode(struct inode *inode,
23 struct writeback_control *wbc);
22static int minix_statfs(struct dentry *dentry, struct kstatfs *buf); 24static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
23static int minix_remount (struct super_block * sb, int * flags, char * data); 25static int minix_remount (struct super_block * sb, int * flags, char * data);
24 26
@@ -552,7 +554,7 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
552 return bh; 554 return bh;
553} 555}
554 556
555static int minix_write_inode(struct inode *inode, int wait) 557static int minix_write_inode(struct inode *inode, struct writeback_control *wbc)
556{ 558{
557 int err = 0; 559 int err = 0;
558 struct buffer_head *bh; 560 struct buffer_head *bh;
@@ -563,7 +565,7 @@ static int minix_write_inode(struct inode *inode, int wait)
563 bh = V2_minix_update_inode(inode); 565 bh = V2_minix_update_inode(inode);
564 if (!bh) 566 if (!bh)
565 return -EIO; 567 return -EIO;
566 if (wait && buffer_dirty(bh)) { 568 if (wbc->sync_mode == WB_SYNC_ALL && buffer_dirty(bh)) {
567 sync_dirty_buffer(bh); 569 sync_dirty_buffer(bh);
568 if (buffer_req(bh) && !buffer_uptodate(bh)) { 570 if (buffer_req(bh) && !buffer_uptodate(bh)) {
569 printk("IO error syncing minix inode [%s:%08lx]\n", 571 printk("IO error syncing minix inode [%s:%08lx]\n",
diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c
index 82d6554b02fe..282e15ad8cd8 100644
--- a/fs/minix/itree_v1.c
+++ b/fs/minix/itree_v1.c
@@ -1,4 +1,5 @@
1#include <linux/buffer_head.h> 1#include <linux/buffer_head.h>
2#include <linux/slab.h>
2#include "minix.h" 3#include "minix.h"
3 4
4enum {DEPTH = 3, DIRECT = 7}; /* Only double indirect */ 5enum {DEPTH = 3, DIRECT = 7}; /* Only double indirect */
diff --git a/fs/mpage.c b/fs/mpage.c
index 42381bd6543b..fd56ca2ea556 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -16,6 +16,7 @@
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/kdev_t.h> 18#include <linux/kdev_t.h>
19#include <linux/gfp.h>
19#include <linux/bio.h> 20#include <linux/bio.h>
20#include <linux/fs.h> 21#include <linux/fs.h>
21#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
@@ -561,7 +562,7 @@ page_is_mapped:
561 if (page->index >= end_index) { 562 if (page->index >= end_index) {
562 /* 563 /*
563 * The page straddles i_size. It must be zeroed out on each 564 * The page straddles i_size. It must be zeroed out on each
564 * and every writepage invokation because it may be mmapped. 565 * and every writepage invocation because it may be mmapped.
565 * "A file is mapped in multiples of the page size. For a file 566 * "A file is mapped in multiples of the page size. For a file
566 * that is not a multiple of the page size, the remaining memory 567 * that is not a multiple of the page size, the remaining memory
567 * is zeroed when mapped, and writes to that region are not 568 * is zeroed when mapped, and writes to that region are not
diff --git a/fs/namei.c b/fs/namei.c
index d11f404667e9..a7dce91a7e42 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -19,7 +19,6 @@
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/namei.h> 21#include <linux/namei.h>
22#include <linux/quotaops.h>
23#include <linux/pagemap.h> 22#include <linux/pagemap.h>
24#include <linux/fsnotify.h> 23#include <linux/fsnotify.h>
25#include <linux/personality.h> 24#include <linux/personality.h>
@@ -35,7 +34,7 @@
35#include <linux/fs_struct.h> 34#include <linux/fs_struct.h>
36#include <asm/uaccess.h> 35#include <asm/uaccess.h>
37 36
38#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) 37#include "internal.h"
39 38
40/* [Feb-1997 T. Schoebel-Theuer] 39/* [Feb-1997 T. Schoebel-Theuer]
41 * Fundamental changes in the pathname lookup mechanisms (namei) 40 * Fundamental changes in the pathname lookup mechanisms (namei)
@@ -108,8 +107,6 @@
108 * any extra contention... 107 * any extra contention...
109 */ 108 */
110 109
111static int __link_path_walk(const char *name, struct nameidata *nd);
112
113/* In order to reduce some races, while at the same time doing additional 110/* In order to reduce some races, while at the same time doing additional
114 * checking and hopefully speeding things up, we copy filenames to the 111 * checking and hopefully speeding things up, we copy filenames to the
115 * kernel data space before using them.. 112 * kernel data space before using them..
@@ -234,6 +231,7 @@ int generic_permission(struct inode *inode, int mask,
234 /* 231 /*
235 * Searching includes executable on directories, else just read. 232 * Searching includes executable on directories, else just read.
236 */ 233 */
234 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
237 if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) 235 if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
238 if (capable(CAP_DAC_READ_SEARCH)) 236 if (capable(CAP_DAC_READ_SEARCH))
239 return 0; 237 return 0;
@@ -414,36 +412,55 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
414} 412}
415 413
416/* 414/*
417 * Internal lookup() using the new generic dcache. 415 * force_reval_path - force revalidation of a dentry
418 * SMP-safe 416 *
417 * In some situations the path walking code will trust dentries without
418 * revalidating them. This causes problems for filesystems that depend on
419 * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set
420 * (which indicates that it's possible for the dentry to go stale), force
421 * a d_revalidate call before proceeding.
422 *
423 * Returns 0 if the revalidation was successful. If the revalidation fails,
 424 * either return the error returned by d_revalidate or -ESTALE if
 425 * it returned 0. If d_revalidate returns 0, we attempt to
426 * invalidate the dentry. It's up to the caller to handle putting references
427 * to the path if necessary.
419 */ 428 */
420static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd) 429static int
430force_reval_path(struct path *path, struct nameidata *nd)
421{ 431{
422 struct dentry * dentry = __d_lookup(parent, name); 432 int status;
433 struct dentry *dentry = path->dentry;
423 434
424 /* lockess __d_lookup may fail due to concurrent d_move() 435 /*
425 * in some unrelated directory, so try with d_lookup 436 * only check on filesystems where it's possible for the dentry to
437 * become stale. It's assumed that if this flag is set then the
438 * d_revalidate op will also be defined.
426 */ 439 */
427 if (!dentry) 440 if (!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))
428 dentry = d_lookup(parent, name); 441 return 0;
429 442
430 if (dentry && dentry->d_op && dentry->d_op->d_revalidate) 443 status = dentry->d_op->d_revalidate(dentry, nd);
431 dentry = do_revalidate(dentry, nd); 444 if (status > 0)
445 return 0;
432 446
433 return dentry; 447 if (!status) {
448 d_invalidate(dentry);
449 status = -ESTALE;
450 }
451 return status;
434} 452}
435 453
436/* 454/*
437 * Short-cut version of permission(), for calling by 455 * Short-cut version of permission(), for calling on directories
438 * path_walk(), when dcache lock is held. Combines parts 456 * during pathname resolution. Combines parts of permission()
439 * of permission() and generic_permission(), and tests ONLY for 457 * and generic_permission(), and tests ONLY for MAY_EXEC permission.
440 * MAY_EXEC permission.
441 * 458 *
442 * If appropriate, check DAC only. If not appropriate, or 459 * If appropriate, check DAC only. If not appropriate, or
443 * short-cut DAC fails, then call permission() to do more 460 * short-cut DAC fails, then call ->permission() to do more
444 * complete permission check. 461 * complete permission check.
445 */ 462 */
446static int exec_permission_lite(struct inode *inode) 463static int exec_permission(struct inode *inode)
447{ 464{
448 int ret; 465 int ret;
449 466
@@ -465,99 +482,6 @@ ok:
465 return security_inode_permission(inode, MAY_EXEC); 482 return security_inode_permission(inode, MAY_EXEC);
466} 483}
467 484
468/*
469 * This is called when everything else fails, and we actually have
470 * to go to the low-level filesystem to find out what we should do..
471 *
472 * We get the directory semaphore, and after getting that we also
473 * make sure that nobody added the entry to the dcache in the meantime..
474 * SMP-safe
475 */
476static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
477{
478 struct dentry * result;
479 struct inode *dir = parent->d_inode;
480
481 mutex_lock(&dir->i_mutex);
482 /*
483 * First re-do the cached lookup just in case it was created
484 * while we waited for the directory semaphore..
485 *
486 * FIXME! This could use version numbering or similar to
487 * avoid unnecessary cache lookups.
488 *
489 * The "dcache_lock" is purely to protect the RCU list walker
490 * from concurrent renames at this point (we mustn't get false
491 * negatives from the RCU list walk here, unlike the optimistic
492 * fast walk).
493 *
494 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
495 */
496 result = d_lookup(parent, name);
497 if (!result) {
498 struct dentry *dentry;
499
500 /* Don't create child dentry for a dead directory. */
501 result = ERR_PTR(-ENOENT);
502 if (IS_DEADDIR(dir))
503 goto out_unlock;
504
505 dentry = d_alloc(parent, name);
506 result = ERR_PTR(-ENOMEM);
507 if (dentry) {
508 result = dir->i_op->lookup(dir, dentry, nd);
509 if (result)
510 dput(dentry);
511 else
512 result = dentry;
513 }
514out_unlock:
515 mutex_unlock(&dir->i_mutex);
516 return result;
517 }
518
519 /*
520 * Uhhuh! Nasty case: the cache was re-populated while
521 * we waited on the semaphore. Need to revalidate.
522 */
523 mutex_unlock(&dir->i_mutex);
524 if (result->d_op && result->d_op->d_revalidate) {
525 result = do_revalidate(result, nd);
526 if (!result)
527 result = ERR_PTR(-ENOENT);
528 }
529 return result;
530}
531
532/*
533 * Wrapper to retry pathname resolution whenever the underlying
534 * file system returns an ESTALE.
535 *
536 * Retry the whole path once, forcing real lookup requests
537 * instead of relying on the dcache.
538 */
539static __always_inline int link_path_walk(const char *name, struct nameidata *nd)
540{
541 struct path save = nd->path;
542 int result;
543
544 /* make sure the stuff we saved doesn't go away */
545 path_get(&save);
546
547 result = __link_path_walk(name, nd);
548 if (result == -ESTALE) {
549 /* nd->path had been dropped */
550 nd->path = save;
551 path_get(&nd->path);
552 nd->flags |= LOOKUP_REVAL;
553 result = __link_path_walk(name, nd);
554 }
555
556 path_put(&save);
557
558 return result;
559}
560
561static __always_inline void set_root(struct nameidata *nd) 485static __always_inline void set_root(struct nameidata *nd)
562{ 486{
563 if (!nd->root.mnt) { 487 if (!nd->root.mnt) {
@@ -569,10 +493,10 @@ static __always_inline void set_root(struct nameidata *nd)
569 } 493 }
570} 494}
571 495
496static int link_path_walk(const char *, struct nameidata *);
497
572static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) 498static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
573{ 499{
574 int res = 0;
575 char *name;
576 if (IS_ERR(link)) 500 if (IS_ERR(link))
577 goto fail; 501 goto fail;
578 502
@@ -583,22 +507,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
583 path_get(&nd->root); 507 path_get(&nd->root);
584 } 508 }
585 509
586 res = link_path_walk(link, nd); 510 return link_path_walk(link, nd);
587 if (nd->depth || res || nd->last_type!=LAST_NORM)
588 return res;
589 /*
590 * If it is an iterative symlinks resolution in open_namei() we
591 * have to copy the last component. And all that crap because of
592 * bloody create() on broken symlinks. Furrfu...
593 */
594 name = __getname();
595 if (unlikely(!name)) {
596 path_put(&nd->path);
597 return -ENOMEM;
598 }
599 strcpy(name, nd->last.name);
600 nd->last.name = name;
601 return 0;
602fail: 511fail:
603 path_put(&nd->path); 512 path_put(&nd->path);
604 return PTR_ERR(link); 513 return PTR_ERR(link);
@@ -620,10 +529,10 @@ static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
620 nd->path.dentry = path->dentry; 529 nd->path.dentry = path->dentry;
621} 530}
622 531
623static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd) 532static __always_inline int
533__do_follow_link(struct path *path, struct nameidata *nd, void **p)
624{ 534{
625 int error; 535 int error;
626 void *cookie;
627 struct dentry *dentry = path->dentry; 536 struct dentry *dentry = path->dentry;
628 537
629 touch_atime(path->mnt, dentry); 538 touch_atime(path->mnt, dentry);
@@ -634,18 +543,20 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
634 dget(dentry); 543 dget(dentry);
635 } 544 }
636 mntget(path->mnt); 545 mntget(path->mnt);
637 cookie = dentry->d_inode->i_op->follow_link(dentry, nd); 546 nd->last_type = LAST_BIND;
638 error = PTR_ERR(cookie); 547 *p = dentry->d_inode->i_op->follow_link(dentry, nd);
639 if (!IS_ERR(cookie)) { 548 error = PTR_ERR(*p);
549 if (!IS_ERR(*p)) {
640 char *s = nd_get_link(nd); 550 char *s = nd_get_link(nd);
641 error = 0; 551 error = 0;
642 if (s) 552 if (s)
643 error = __vfs_follow_link(nd, s); 553 error = __vfs_follow_link(nd, s);
644 if (dentry->d_inode->i_op->put_link) 554 else if (nd->last_type == LAST_BIND) {
645 dentry->d_inode->i_op->put_link(dentry, nd, cookie); 555 error = force_reval_path(&nd->path, nd);
556 if (error)
557 path_put(&nd->path);
558 }
646 } 559 }
647 path_put(path);
648
649 return error; 560 return error;
650} 561}
651 562
@@ -658,6 +569,7 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
658 */ 569 */
659static inline int do_follow_link(struct path *path, struct nameidata *nd) 570static inline int do_follow_link(struct path *path, struct nameidata *nd)
660{ 571{
572 void *cookie;
661 int err = -ELOOP; 573 int err = -ELOOP;
662 if (current->link_count >= MAX_NESTED_LINKS) 574 if (current->link_count >= MAX_NESTED_LINKS)
663 goto loop; 575 goto loop;
@@ -671,7 +583,10 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd)
671 current->link_count++; 583 current->link_count++;
672 current->total_link_count++; 584 current->total_link_count++;
673 nd->depth++; 585 nd->depth++;
674 err = __do_follow_link(path, nd); 586 err = __do_follow_link(path, nd, &cookie);
587 if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
588 path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
589 path_put(path);
675 current->link_count--; 590 current->link_count--;
676 nd->depth--; 591 nd->depth--;
677 return err; 592 return err;
@@ -757,33 +672,20 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
757 set_root(nd); 672 set_root(nd);
758 673
759 while(1) { 674 while(1) {
760 struct vfsmount *parent;
761 struct dentry *old = nd->path.dentry; 675 struct dentry *old = nd->path.dentry;
762 676
763 if (nd->path.dentry == nd->root.dentry && 677 if (nd->path.dentry == nd->root.dentry &&
764 nd->path.mnt == nd->root.mnt) { 678 nd->path.mnt == nd->root.mnt) {
765 break; 679 break;
766 } 680 }
767 spin_lock(&dcache_lock);
768 if (nd->path.dentry != nd->path.mnt->mnt_root) { 681 if (nd->path.dentry != nd->path.mnt->mnt_root) {
769 nd->path.dentry = dget(nd->path.dentry->d_parent); 682 /* rare case of legitimate dget_parent()... */
770 spin_unlock(&dcache_lock); 683 nd->path.dentry = dget_parent(nd->path.dentry);
771 dput(old); 684 dput(old);
772 break; 685 break;
773 } 686 }
774 spin_unlock(&dcache_lock); 687 if (!follow_up(&nd->path))
775 spin_lock(&vfsmount_lock);
776 parent = nd->path.mnt->mnt_parent;
777 if (parent == nd->path.mnt) {
778 spin_unlock(&vfsmount_lock);
779 break; 688 break;
780 }
781 mntget(parent);
782 nd->path.dentry = dget(nd->path.mnt->mnt_mountpoint);
783 spin_unlock(&vfsmount_lock);
784 dput(old);
785 mntput(nd->path.mnt);
786 nd->path.mnt = parent;
787 } 689 }
788 follow_mount(&nd->path); 690 follow_mount(&nd->path);
789} 691}
@@ -797,8 +699,19 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
797 struct path *path) 699 struct path *path)
798{ 700{
799 struct vfsmount *mnt = nd->path.mnt; 701 struct vfsmount *mnt = nd->path.mnt;
800 struct dentry *dentry = __d_lookup(nd->path.dentry, name); 702 struct dentry *dentry, *parent;
703 struct inode *dir;
704 /*
705 * See if the low-level filesystem might want
706 * to use its own hash..
707 */
708 if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
709 int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, name);
710 if (err < 0)
711 return err;
712 }
801 713
714 dentry = __d_lookup(nd->path.dentry, name);
802 if (!dentry) 715 if (!dentry)
803 goto need_lookup; 716 goto need_lookup;
804 if (dentry->d_op && dentry->d_op->d_revalidate) 717 if (dentry->d_op && dentry->d_op->d_revalidate)
@@ -810,7 +723,59 @@ done:
810 return 0; 723 return 0;
811 724
812need_lookup: 725need_lookup:
813 dentry = real_lookup(nd->path.dentry, name, nd); 726 parent = nd->path.dentry;
727 dir = parent->d_inode;
728
729 mutex_lock(&dir->i_mutex);
730 /*
731 * First re-do the cached lookup just in case it was created
732 * while we waited for the directory semaphore..
733 *
734 * FIXME! This could use version numbering or similar to
735 * avoid unnecessary cache lookups.
736 *
737 * The "dcache_lock" is purely to protect the RCU list walker
738 * from concurrent renames at this point (we mustn't get false
739 * negatives from the RCU list walk here, unlike the optimistic
740 * fast walk).
741 *
742 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
743 */
744 dentry = d_lookup(parent, name);
745 if (!dentry) {
746 struct dentry *new;
747
748 /* Don't create child dentry for a dead directory. */
749 dentry = ERR_PTR(-ENOENT);
750 if (IS_DEADDIR(dir))
751 goto out_unlock;
752
753 new = d_alloc(parent, name);
754 dentry = ERR_PTR(-ENOMEM);
755 if (new) {
756 dentry = dir->i_op->lookup(dir, new, nd);
757 if (dentry)
758 dput(new);
759 else
760 dentry = new;
761 }
762out_unlock:
763 mutex_unlock(&dir->i_mutex);
764 if (IS_ERR(dentry))
765 goto fail;
766 goto done;
767 }
768
769 /*
770 * Uhhuh! Nasty case: the cache was re-populated while
771 * we waited on the semaphore. Need to revalidate.
772 */
773 mutex_unlock(&dir->i_mutex);
774 if (dentry->d_op && dentry->d_op->d_revalidate) {
775 dentry = do_revalidate(dentry, nd);
776 if (!dentry)
777 dentry = ERR_PTR(-ENOENT);
778 }
814 if (IS_ERR(dentry)) 779 if (IS_ERR(dentry))
815 goto fail; 780 goto fail;
816 goto done; 781 goto done;
@@ -828,6 +793,17 @@ fail:
828} 793}
829 794
830/* 795/*
796 * This is a temporary kludge to deal with "automount" symlinks; proper
797 * solution is to trigger them on follow_mount(), so that do_lookup()
798 * would DTRT. To be killed before 2.6.34-final.
799 */
800static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
801{
802 return inode && unlikely(inode->i_op->follow_link) &&
803 ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode));
804}
805
806/*
831 * Name resolution. 807 * Name resolution.
832 * This is the basic name resolution function, turning a pathname into 808 * This is the basic name resolution function, turning a pathname into
833 * the final dentry. We expect 'base' to be positive and a directory. 809 * the final dentry. We expect 'base' to be positive and a directory.
@@ -835,7 +811,7 @@ fail:
835 * Returns 0 and nd will have valid dentry and mnt on success. 811 * Returns 0 and nd will have valid dentry and mnt on success.
836 * Returns error and drops reference to input namei data on failure. 812 * Returns error and drops reference to input namei data on failure.
837 */ 813 */
838static int __link_path_walk(const char *name, struct nameidata *nd) 814static int link_path_walk(const char *name, struct nameidata *nd)
839{ 815{
840 struct path next; 816 struct path next;
841 struct inode *inode; 817 struct inode *inode;
@@ -858,7 +834,7 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
858 unsigned int c; 834 unsigned int c;
859 835
860 nd->flags |= LOOKUP_CONTINUE; 836 nd->flags |= LOOKUP_CONTINUE;
861 err = exec_permission_lite(inode); 837 err = exec_permission(inode);
862 if (err) 838 if (err)
863 break; 839 break;
864 840
@@ -898,16 +874,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
898 case 1: 874 case 1:
899 continue; 875 continue;
900 } 876 }
901 /*
902 * See if the low-level filesystem might want
903 * to use its own hash..
904 */
905 if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
906 err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
907 &this);
908 if (err < 0)
909 break;
910 }
911 /* This does the actual lookups.. */ 877 /* This does the actual lookups.. */
912 err = do_lookup(nd, &this, &next); 878 err = do_lookup(nd, &this, &next);
913 if (err) 879 if (err)
@@ -953,18 +919,11 @@ last_component:
953 case 1: 919 case 1:
954 goto return_reval; 920 goto return_reval;
955 } 921 }
956 if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
957 err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
958 &this);
959 if (err < 0)
960 break;
961 }
962 err = do_lookup(nd, &this, &next); 922 err = do_lookup(nd, &this, &next);
963 if (err) 923 if (err)
964 break; 924 break;
965 inode = next.dentry->d_inode; 925 inode = next.dentry->d_inode;
966 if ((lookup_flags & LOOKUP_FOLLOW) 926 if (follow_on_final(inode, lookup_flags)) {
967 && inode && inode->i_op->follow_link) {
968 err = do_follow_link(&next, nd); 927 err = do_follow_link(&next, nd);
969 if (err) 928 if (err)
970 goto return_err; 929 goto return_err;
@@ -1017,8 +976,27 @@ return_err:
1017 976
1018static int path_walk(const char *name, struct nameidata *nd) 977static int path_walk(const char *name, struct nameidata *nd)
1019{ 978{
979 struct path save = nd->path;
980 int result;
981
1020 current->total_link_count = 0; 982 current->total_link_count = 0;
1021 return link_path_walk(name, nd); 983
984 /* make sure the stuff we saved doesn't go away */
985 path_get(&save);
986
987 result = link_path_walk(name, nd);
988 if (result == -ESTALE) {
989 /* nd->path had been dropped */
990 current->total_link_count = 0;
991 nd->path = save;
992 path_get(&nd->path);
993 nd->flags |= LOOKUP_REVAL;
994 result = link_path_walk(name, nd);
995 }
996
997 path_put(&save);
998
999 return result;
1022} 1000}
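/*
 * The retry shape of path_walk() above, reduced to a hedged userspace
 * sketch (hypothetical types; the real code saves and restores a
 * refcounted struct path): snapshot the start, walk once trusting the
 * dcache, and on -ESTALE restore the snapshot and retry with
 * revalidation forced.
 * Build: cc retry_demo.c
 */
#include <errno.h>
#include <stdio.h>

struct walk_state {
	const char *start;
	int force_reval;	/* LOOKUP_REVAL analogue */
};

static int try_walk(struct walk_state *st, const char *name)
{
	(void)name;
	/* Pretend the cached attempt went stale unless revalidating. */
	return st->force_reval ? 0 : -ESTALE;
}

static int walk_with_retry(struct walk_state *st, const char *name)
{
	struct walk_state save = *st;	/* path_get(&save) analogue */
	int err = try_walk(st, name);

	if (err == -ESTALE) {
		*st = save;		/* nd->path had been dropped */
		st->force_reval = 1;
		err = try_walk(st, name);
	}
	return err;			/* path_put(&save) analogue */
}

int main(void)
{
	struct walk_state st = { "/", 0 };

	printf("walk -> %d\n", walk_with_retry(&st, "a/b/c"));
	return 0;
}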
1023 1001
1024static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) 1002static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
@@ -1141,36 +1119,6 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1141 return retval; 1119 return retval;
1142} 1120}
1143 1121
1144/**
1145 * path_lookup_open - lookup a file path with open intent
1146 * @dfd: the directory to use as base, or AT_FDCWD
1147 * @name: pointer to file name
1148 * @lookup_flags: lookup intent flags
1149 * @nd: pointer to nameidata
1150 * @open_flags: open intent flags
1151 */
1152static int path_lookup_open(int dfd, const char *name,
1153 unsigned int lookup_flags, struct nameidata *nd, int open_flags)
1154{
1155 struct file *filp = get_empty_filp();
1156 int err;
1157
1158 if (filp == NULL)
1159 return -ENFILE;
1160 nd->intent.open.file = filp;
1161 nd->intent.open.flags = open_flags;
1162 nd->intent.open.create_mode = 0;
1163 err = do_path_lookup(dfd, name, lookup_flags|LOOKUP_OPEN, nd);
1164 if (IS_ERR(nd->intent.open.file)) {
1165 if (err == 0) {
1166 err = PTR_ERR(nd->intent.open.file);
1167 path_put(&nd->path);
1168 }
1169 } else if (err != 0)
1170 release_open_intent(nd);
1171 return err;
1172}
1173
1174static struct dentry *__lookup_hash(struct qstr *name, 1122static struct dentry *__lookup_hash(struct qstr *name,
1175 struct dentry *base, struct nameidata *nd) 1123 struct dentry *base, struct nameidata *nd)
1176{ 1124{
@@ -1191,7 +1139,17 @@ static struct dentry *__lookup_hash(struct qstr *name,
1191 goto out; 1139 goto out;
1192 } 1140 }
1193 1141
1194 dentry = cached_lookup(base, name, nd); 1142 dentry = __d_lookup(base, name);
1143
 1144	/* lockless __d_lookup may fail due to concurrent d_move()
1145 * in some unrelated directory, so try with d_lookup
1146 */
1147 if (!dentry)
1148 dentry = d_lookup(base, name);
1149
1150 if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
1151 dentry = do_revalidate(dentry, nd);
1152
1195 if (!dentry) { 1153 if (!dentry) {
1196 struct dentry *new; 1154 struct dentry *new;
1197 1155
@@ -1223,7 +1181,7 @@ static struct dentry *lookup_hash(struct nameidata *nd)
1223{ 1181{
1224 int err; 1182 int err;
1225 1183
1226 err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC); 1184 err = exec_permission(nd->path.dentry->d_inode);
1227 if (err) 1185 if (err)
1228 return ERR_PTR(err); 1186 return ERR_PTR(err);
1229 return __lookup_hash(&nd->last, nd->path.dentry, nd); 1187 return __lookup_hash(&nd->last, nd->path.dentry, nd);
@@ -1273,29 +1231,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1273 if (err) 1231 if (err)
1274 return ERR_PTR(err); 1232 return ERR_PTR(err);
1275 1233
1276 err = inode_permission(base->d_inode, MAY_EXEC); 1234 err = exec_permission(base->d_inode);
1277 if (err)
1278 return ERR_PTR(err);
1279 return __lookup_hash(&this, base, NULL);
1280}
1281
1282/**
1283 * lookup_one_noperm - bad hack for sysfs
1284 * @name: pathname component to lookup
1285 * @base: base directory to lookup from
1286 *
1287 * This is a variant of lookup_one_len that doesn't perform any permission
1288 * checks. It's a horrible hack to work around the braindead sysfs
1289 * architecture and should not be used anywhere else.
1290 *
1291 * DON'T USE THIS FUNCTION EVER, thanks.
1292 */
1293struct dentry *lookup_one_noperm(const char *name, struct dentry *base)
1294{
1295 int err;
1296 struct qstr this;
1297
1298 err = __lookup_one_len(name, &this, base, strlen(name));
1299 if (err) 1235 if (err)
1300 return ERR_PTR(err); 1236 return ERR_PTR(err);
1301 return __lookup_hash(&this, base, NULL); 1237 return __lookup_hash(&this, base, NULL);
@@ -1381,7 +1317,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
1381 return -ENOENT; 1317 return -ENOENT;
1382 1318
1383 BUG_ON(victim->d_parent->d_inode != dir); 1319 BUG_ON(victim->d_parent->d_inode != dir);
1384 audit_inode_child(victim->d_name.name, victim, dir); 1320 audit_inode_child(victim, dir);
1385 1321
1386 error = inode_permission(dir, MAY_WRITE | MAY_EXEC); 1322 error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
1387 if (error) 1323 if (error)
@@ -1422,22 +1358,6 @@ static inline int may_create(struct inode *dir, struct dentry *child)
1422 return inode_permission(dir, MAY_WRITE | MAY_EXEC); 1358 return inode_permission(dir, MAY_WRITE | MAY_EXEC);
1423} 1359}
1424 1360
1425/*
1426 * O_DIRECTORY translates into forcing a directory lookup.
1427 */
1428static inline int lookup_flags(unsigned int f)
1429{
1430 unsigned long retval = LOOKUP_FOLLOW;
1431
1432 if (f & O_NOFOLLOW)
1433 retval &= ~LOOKUP_FOLLOW;
1434
1435 if (f & O_DIRECTORY)
1436 retval |= LOOKUP_DIRECTORY;
1437
1438 return retval;
1439}
1440
1441/* 1361/*
1442 * p1 and p2 should be directories on the same fs. 1362 * p1 and p2 should be directories on the same fs.
1443 */ 1363 */
@@ -1495,7 +1415,6 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1495 error = security_inode_create(dir, dentry, mode); 1415 error = security_inode_create(dir, dentry, mode);
1496 if (error) 1416 if (error)
1497 return error; 1417 return error;
1498 vfs_dq_init(dir);
1499 error = dir->i_op->create(dir, dentry, mode, nd); 1418 error = dir->i_op->create(dir, dentry, mode, nd);
1500 if (!error) 1419 if (!error)
1501 fsnotify_create(dir, dentry); 1420 fsnotify_create(dir, dentry);
@@ -1533,69 +1452,45 @@ int may_open(struct path *path, int acc_mode, int flag)
1533 if (error) 1452 if (error)
1534 return error; 1453 return error;
1535 1454
1536 error = ima_path_check(path, acc_mode ?
1537 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
1538 ACC_MODE(flag) & (MAY_READ | MAY_WRITE),
1539 IMA_COUNT_UPDATE);
1540
1541 if (error)
1542 return error;
1543 /* 1455 /*
1544 * An append-only file must be opened in append mode for writing. 1456 * An append-only file must be opened in append mode for writing.
1545 */ 1457 */
1546 if (IS_APPEND(inode)) { 1458 if (IS_APPEND(inode)) {
1547 error = -EPERM; 1459 if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
1548 if ((flag & FMODE_WRITE) && !(flag & O_APPEND)) 1460 return -EPERM;
1549 goto err_out;
1550 if (flag & O_TRUNC) 1461 if (flag & O_TRUNC)
1551 goto err_out; 1462 return -EPERM;
1552 } 1463 }
1553 1464
1554 /* O_NOATIME can only be set by the owner or superuser */ 1465 /* O_NOATIME can only be set by the owner or superuser */
1555 if (flag & O_NOATIME) 1466 if (flag & O_NOATIME && !is_owner_or_cap(inode))
1556 if (!is_owner_or_cap(inode)) { 1467 return -EPERM;
1557 error = -EPERM;
1558 goto err_out;
1559 }
1560 1468
1561 /* 1469 /*
1562 * Ensure there are no outstanding leases on the file. 1470 * Ensure there are no outstanding leases on the file.
1563 */ 1471 */
1564 error = break_lease(inode, flag); 1472 return break_lease(inode, flag);
1565 if (error) 1473}
1566 goto err_out;
1567
1568 if (flag & O_TRUNC) {
1569 error = get_write_access(inode);
1570 if (error)
1571 goto err_out;
1572
1573 /*
1574 * Refuse to truncate files with mandatory locks held on them.
1575 */
1576 error = locks_verify_locked(inode);
1577 if (!error)
1578 error = security_path_truncate(path, 0,
1579 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
1580 if (!error) {
1581 vfs_dq_init(inode);
1582
1583 error = do_truncate(dentry, 0,
1584 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
1585 NULL);
1586 }
1587 put_write_access(inode);
1588 if (error)
1589 goto err_out;
1590 } else
1591 if (flag & FMODE_WRITE)
1592 vfs_dq_init(inode);
1593 1474
1594 return 0; 1475static int handle_truncate(struct path *path)
1595err_out: 1476{
1596 ima_counts_put(path, acc_mode ? 1477 struct inode *inode = path->dentry->d_inode;
1597 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) : 1478 int error = get_write_access(inode);
1598 ACC_MODE(flag) & (MAY_READ | MAY_WRITE)); 1479 if (error)
1480 return error;
1481 /*
1482 * Refuse to truncate files with mandatory locks held on them.
1483 */
1484 error = locks_verify_locked(inode);
1485 if (!error)
1486 error = security_path_truncate(path, 0,
1487 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
1488 if (!error) {
1489 error = do_truncate(path->dentry, 0,
1490 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
1491 NULL);
1492 }
1493 put_write_access(inode);
1599 return error; 1494 return error;
1600} 1495}
1601 1496
@@ -1605,7 +1500,7 @@ err_out:
1605 * what get passed to sys_open(). 1500 * what get passed to sys_open().
1606 */ 1501 */
1607static int __open_namei_create(struct nameidata *nd, struct path *path, 1502static int __open_namei_create(struct nameidata *nd, struct path *path,
1608 int flag, int mode) 1503 int open_flag, int mode)
1609{ 1504{
1610 int error; 1505 int error;
1611 struct dentry *dir = nd->path.dentry; 1506 struct dentry *dir = nd->path.dentry;
@@ -1623,7 +1518,7 @@ out_unlock:
1623 if (error) 1518 if (error)
1624 return error; 1519 return error;
1625 /* Don't check for write permission, don't truncate */ 1520 /* Don't check for write permission, don't truncate */
1626 return may_open(&nd->path, 0, flag & ~O_TRUNC); 1521 return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
1627} 1522}
1628 1523
1629/* 1524/*
@@ -1650,7 +1545,7 @@ static inline int open_to_namei_flags(int flag)
1650 return flag; 1545 return flag;
1651} 1546}
1652 1547
1653static int open_will_write_to_fs(int flag, struct inode *inode) 1548static int open_will_truncate(int flag, struct inode *inode)
1654{ 1549{
1655 /* 1550 /*
1656 * We'll never write to the fs underlying 1551 * We'll never write to the fs underlying
@@ -1661,100 +1556,133 @@ static int open_will_write_to_fs(int flag, struct inode *inode)
1661 return (flag & O_TRUNC); 1556 return (flag & O_TRUNC);
1662} 1557}
1663 1558
1664/* 1559static struct file *finish_open(struct nameidata *nd,
1665 * Note that the low bits of the passed in "open_flag" 1560 int open_flag, int acc_mode)
1666 * are not the same as in the local variable "flag". See
1667 * open_to_namei_flags() for more details.
1668 */
1669struct file *do_filp_open(int dfd, const char *pathname,
1670 int open_flag, int mode, int acc_mode)
1671{ 1561{
1672 struct file *filp; 1562 struct file *filp;
1673 struct nameidata nd; 1563 int will_truncate;
1674 int error; 1564 int error;
1675 struct path path;
1676 struct dentry *dir;
1677 int count = 0;
1678 int will_write;
1679 int flag = open_to_namei_flags(open_flag);
1680 1565
1681 if (!acc_mode) 1566 will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
1682 acc_mode = MAY_OPEN | ACC_MODE(flag); 1567 if (will_truncate) {
1568 error = mnt_want_write(nd->path.mnt);
1569 if (error)
1570 goto exit;
1571 }
1572 error = may_open(&nd->path, acc_mode, open_flag);
1573 if (error) {
1574 if (will_truncate)
1575 mnt_drop_write(nd->path.mnt);
1576 goto exit;
1577 }
1578 filp = nameidata_to_filp(nd);
1579 if (!IS_ERR(filp)) {
1580 error = ima_file_check(filp, acc_mode);
1581 if (error) {
1582 fput(filp);
1583 filp = ERR_PTR(error);
1584 }
1585 }
1586 if (!IS_ERR(filp)) {
1587 if (will_truncate) {
1588 error = handle_truncate(&nd->path);
1589 if (error) {
1590 fput(filp);
1591 filp = ERR_PTR(error);
1592 }
1593 }
1594 }
1595 /*
1596 * It is now safe to drop the mnt write
1597 * because the filp has had a write taken
1598 * on its behalf.
1599 */
1600 if (will_truncate)
1601 mnt_drop_write(nd->path.mnt);
1602 return filp;
1683 1603
1684 /* O_TRUNC implies we need access checks for write permissions */ 1604exit:
1685 if (flag & O_TRUNC) 1605 if (!IS_ERR(nd->intent.open.file))
1686 acc_mode |= MAY_WRITE; 1606 release_open_intent(nd);
1607 path_put(&nd->path);
1608 return ERR_PTR(error);
1609}
1687 1610
1688 /* Allow the LSM permission hook to distinguish append 1611static struct file *do_last(struct nameidata *nd, struct path *path,
1689 access from general write access. */ 1612 int open_flag, int acc_mode,
1690 if (flag & O_APPEND) 1613 int mode, const char *pathname)
1691 acc_mode |= MAY_APPEND; 1614{
1615 struct dentry *dir = nd->path.dentry;
1616 struct file *filp;
1617 int error = -EISDIR;
1692 1618
1693 /* 1619 switch (nd->last_type) {
1694 * The simplest case - just a plain lookup. 1620 case LAST_DOTDOT:
1695 */ 1621 follow_dotdot(nd);
1696 if (!(flag & O_CREAT)) { 1622 dir = nd->path.dentry;
1697 error = path_lookup_open(dfd, pathname, lookup_flags(flag), 1623 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
1698 &nd, flag); 1624 if (!dir->d_op->d_revalidate(dir, nd)) {
1699 if (error) 1625 error = -ESTALE;
1700 return ERR_PTR(error); 1626 goto exit;
1627 }
1628 }
1629 /* fallthrough */
1630 case LAST_DOT:
1631 case LAST_ROOT:
1632 if (open_flag & O_CREAT)
1633 goto exit;
1634 /* fallthrough */
1635 case LAST_BIND:
1636 audit_inode(pathname, dir);
1701 goto ok; 1637 goto ok;
1702 } 1638 }
1703 1639
1704 /* 1640 /* trailing slashes? */
1705 * Create - we need to know the parent. 1641 if (nd->last.name[nd->last.len]) {
1706 */ 1642 if (open_flag & O_CREAT)
1707 error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); 1643 goto exit;
1708 if (error) 1644 nd->flags |= LOOKUP_DIRECTORY;
1709 return ERR_PTR(error);
1710 error = path_walk(pathname, &nd);
1711 if (error) {
1712 if (nd.root.mnt)
1713 path_put(&nd.root);
1714 return ERR_PTR(error);
1715 } 1645 }
1716 if (unlikely(!audit_dummy_context()))
1717 audit_inode(pathname, nd.path.dentry);
1718 1646
1719 /* 1647 /* just plain open? */
1720 * We have the parent and last component. First of all, check 1648 if (!(open_flag & O_CREAT)) {
1721 * that we are not asked to creat(2) an obvious directory - that 1649 error = do_lookup(nd, &nd->last, path);
1722 * will not do. 1650 if (error)
1723 */ 1651 goto exit;
1724 error = -EISDIR; 1652 error = -ENOENT;
1725 if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len]) 1653 if (!path->dentry->d_inode)
1726 goto exit_parent; 1654 goto exit_dput;
1655 if (path->dentry->d_inode->i_op->follow_link)
1656 return NULL;
1657 error = -ENOTDIR;
1658 if (nd->flags & LOOKUP_DIRECTORY) {
1659 if (!path->dentry->d_inode->i_op->lookup)
1660 goto exit_dput;
1661 }
1662 path_to_nameidata(path, nd);
1663 audit_inode(pathname, nd->path.dentry);
1664 goto ok;
1665 }
1727 1666
1728 error = -ENFILE; 1667 /* OK, it's O_CREAT */
1729 filp = get_empty_filp();
1730 if (filp == NULL)
1731 goto exit_parent;
1732 nd.intent.open.file = filp;
1733 nd.intent.open.flags = flag;
1734 nd.intent.open.create_mode = mode;
1735 dir = nd.path.dentry;
1736 nd.flags &= ~LOOKUP_PARENT;
1737 nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
1738 if (flag & O_EXCL)
1739 nd.flags |= LOOKUP_EXCL;
1740 mutex_lock(&dir->d_inode->i_mutex); 1668 mutex_lock(&dir->d_inode->i_mutex);
1741 path.dentry = lookup_hash(&nd);
1742 path.mnt = nd.path.mnt;
1743 1669
1744do_last: 1670 path->dentry = lookup_hash(nd);
1745 error = PTR_ERR(path.dentry); 1671 path->mnt = nd->path.mnt;
1746 if (IS_ERR(path.dentry)) { 1672
1673 error = PTR_ERR(path->dentry);
1674 if (IS_ERR(path->dentry)) {
1747 mutex_unlock(&dir->d_inode->i_mutex); 1675 mutex_unlock(&dir->d_inode->i_mutex);
1748 goto exit; 1676 goto exit;
1749 } 1677 }
1750 1678
1751 if (IS_ERR(nd.intent.open.file)) { 1679 if (IS_ERR(nd->intent.open.file)) {
1752 error = PTR_ERR(nd.intent.open.file); 1680 error = PTR_ERR(nd->intent.open.file);
1753 goto exit_mutex_unlock; 1681 goto exit_mutex_unlock;
1754 } 1682 }
1755 1683
1756 /* Negative dentry, just create the file */ 1684 /* Negative dentry, just create the file */
1757 if (!path.dentry->d_inode) { 1685 if (!path->dentry->d_inode) {
1758 /* 1686 /*
1759 * This write is needed to ensure that a 1687 * This write is needed to ensure that a
1760 * ro->rw transition does not occur between 1688 * ro->rw transition does not occur between
@@ -1762,22 +1690,23 @@ do_last:
 		 * a permanent write count is taken through
 		 * the 'struct file' in nameidata_to_filp().
 		 */
-		error = mnt_want_write(nd.path.mnt);
+		error = mnt_want_write(nd->path.mnt);
 		if (error)
 			goto exit_mutex_unlock;
-		error = __open_namei_create(&nd, &path, flag, mode);
+		error = __open_namei_create(nd, path, open_flag, mode);
 		if (error) {
-			mnt_drop_write(nd.path.mnt);
+			mnt_drop_write(nd->path.mnt);
 			goto exit;
 		}
-		filp = nameidata_to_filp(&nd, open_flag);
-		if (IS_ERR(filp))
-			ima_counts_put(&nd.path,
-					acc_mode & (MAY_READ | MAY_WRITE |
-						    MAY_EXEC));
-		mnt_drop_write(nd.path.mnt);
-		if (nd.root.mnt)
-			path_put(&nd.root);
+		filp = nameidata_to_filp(nd);
+		mnt_drop_write(nd->path.mnt);
+		if (!IS_ERR(filp)) {
+			error = ima_file_check(filp, acc_mode);
+			if (error) {
+				fput(filp);
+				filp = ERR_PTR(error);
+			}
+		}
 		return filp;
 	}
 
@@ -1785,129 +1714,182 @@ do_last:
 	 * It already exists.
 	 */
 	mutex_unlock(&dir->d_inode->i_mutex);
-	audit_inode(pathname, path.dentry);
+	audit_inode(pathname, path->dentry);
 
 	error = -EEXIST;
-	if (flag & O_EXCL)
+	if (open_flag & O_EXCL)
 		goto exit_dput;
 
-	if (__follow_mount(&path)) {
+	if (__follow_mount(path)) {
 		error = -ELOOP;
-		if (flag & O_NOFOLLOW)
+		if (open_flag & O_NOFOLLOW)
 			goto exit_dput;
 	}
 
 	error = -ENOENT;
-	if (!path.dentry->d_inode)
+	if (!path->dentry->d_inode)
 		goto exit_dput;
-	if (path.dentry->d_inode->i_op->follow_link)
-		goto do_link;
 
-	path_to_nameidata(&path, &nd);
+	if (path->dentry->d_inode->i_op->follow_link)
+		return NULL;
+
+	path_to_nameidata(path, nd);
 	error = -EISDIR;
-	if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
+	if (S_ISDIR(path->dentry->d_inode->i_mode))
 		goto exit;
 ok:
-	/*
-	 * Consider:
-	 * 1. may_open() truncates a file
-	 * 2. a rw->ro mount transition occurs
-	 * 3. nameidata_to_filp() fails due to
-	 *    the ro mount.
-	 * That would be inconsistent, and should
-	 * be avoided. Taking this mnt write here
-	 * ensures that (2) can not occur.
-	 */
-	will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
-	if (will_write) {
-		error = mnt_want_write(nd.path.mnt);
-		if (error)
-			goto exit;
-	}
-	error = may_open(&nd.path, acc_mode, flag);
-	if (error) {
-		if (will_write)
-			mnt_drop_write(nd.path.mnt);
-		goto exit;
-	}
-	filp = nameidata_to_filp(&nd, open_flag);
-	if (IS_ERR(filp))
-		ima_counts_put(&nd.path,
-			       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
-	/*
-	 * It is now safe to drop the mnt write
-	 * because the filp has had a write taken
-	 * on its behalf.
-	 */
-	if (will_write)
-		mnt_drop_write(nd.path.mnt);
-	if (nd.root.mnt)
-		path_put(&nd.root);
+	filp = finish_open(nd, open_flag, acc_mode);
 	return filp;
 
 exit_mutex_unlock:
 	mutex_unlock(&dir->d_inode->i_mutex);
 exit_dput:
-	path_put_conditional(&path, &nd);
+	path_put_conditional(path, nd);
 exit:
-	if (!IS_ERR(nd.intent.open.file))
-		release_open_intent(&nd);
-exit_parent:
-	if (nd.root.mnt)
-		path_put(&nd.root);
-	path_put(&nd.path);
+	if (!IS_ERR(nd->intent.open.file))
+		release_open_intent(nd);
+	path_put(&nd->path);
 	return ERR_PTR(error);
+}
+
+/*
+ * Note that the low bits of the passed in "open_flag"
+ * are not the same as in the local variable "flag". See
+ * open_to_namei_flags() for more details.
+ */
+struct file *do_filp_open(int dfd, const char *pathname,
+		int open_flag, int mode, int acc_mode)
+{
+	struct file *filp;
+	struct nameidata nd;
+	int error;
+	struct path path;
+	int count = 0;
+	int flag = open_to_namei_flags(open_flag);
+	int force_reval = 0;
+
+	if (!(open_flag & O_CREAT))
+		mode = 0;
 
-do_link:
-	error = -ELOOP;
-	if (flag & O_NOFOLLOW)
-		goto exit_dput;
 	/*
-	 * This is subtle. Instead of calling do_follow_link() we do the
-	 * thing by hands. The reason is that this way we have zero link_count
-	 * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
-	 * After that we have the parent and last component, i.e.
-	 * we are in the same situation as after the first path_walk().
-	 * Well, almost - if the last component is normal we get its copy
-	 * stored in nd->last.name and we will have to putname() it when we
-	 * are done. Procfs-like symlinks just set LAST_BIND.
+	 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
+	 * check for O_DSYNC if the need any syncing at all we enforce it's
+	 * always set instead of having to deal with possibly weird behaviour
+	 * for malicious applications setting only __O_SYNC.
 	 */
-	nd.flags |= LOOKUP_PARENT;
-	error = security_inode_follow_link(path.dentry, &nd);
+	if (open_flag & __O_SYNC)
+		open_flag |= O_DSYNC;
+
+	if (!acc_mode)
+		acc_mode = MAY_OPEN | ACC_MODE(open_flag);
+
+	/* O_TRUNC implies we need access checks for write permissions */
+	if (open_flag & O_TRUNC)
+		acc_mode |= MAY_WRITE;
+
+	/* Allow the LSM permission hook to distinguish append
+	   access from general write access. */
+	if (open_flag & O_APPEND)
+		acc_mode |= MAY_APPEND;
+
+	/* find the parent */
+reval:
+	error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
 	if (error)
-		goto exit_dput;
-	error = __do_follow_link(&path, &nd);
-	if (error) {
-		/* Does someone understand code flow here? Or it is only
-		 * me so stupid? Anathema to whoever designed this non-sense
-		 * with "intent.open".
-		 */
-		release_open_intent(&nd);
-		if (nd.root.mnt)
-			path_put(&nd.root);
 		return ERR_PTR(error);
+	if (force_reval)
+		nd.flags |= LOOKUP_REVAL;
+
+	current->total_link_count = 0;
+	error = link_path_walk(pathname, &nd);
+	if (error) {
+		filp = ERR_PTR(error);
+		goto out;
 	}
+	if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT))
+		audit_inode(pathname, nd.path.dentry);
+
+	/*
+	 * We have the parent and last component.
+	 */
+
+	error = -ENFILE;
+	filp = get_empty_filp();
+	if (filp == NULL)
+		goto exit_parent;
+	nd.intent.open.file = filp;
+	filp->f_flags = open_flag;
+	nd.intent.open.flags = flag;
+	nd.intent.open.create_mode = mode;
 	nd.flags &= ~LOOKUP_PARENT;
-	if (nd.last_type == LAST_BIND)
-		goto ok;
-	error = -EISDIR;
-	if (nd.last_type != LAST_NORM)
-		goto exit;
-	if (nd.last.name[nd.last.len]) {
-		__putname(nd.last.name);
-		goto exit;
+	nd.flags |= LOOKUP_OPEN;
+	if (open_flag & O_CREAT) {
+		nd.flags |= LOOKUP_CREATE;
+		if (open_flag & O_EXCL)
+			nd.flags |= LOOKUP_EXCL;
 	}
-	error = -ELOOP;
-	if (count++==32) {
-		__putname(nd.last.name);
-		goto exit;
+	if (open_flag & O_DIRECTORY)
+		nd.flags |= LOOKUP_DIRECTORY;
+	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
+	while (unlikely(!filp)) { /* trailing symlink */
+		struct path holder;
+		struct inode *inode = path.dentry->d_inode;
+		void *cookie;
+		error = -ELOOP;
+		/* S_ISDIR part is a temporary automount kludge */
+		if ((open_flag & O_NOFOLLOW) && !S_ISDIR(inode->i_mode))
+			goto exit_dput;
+		if (count++ == 32)
+			goto exit_dput;
+		/*
+		 * This is subtle. Instead of calling do_follow_link() we do
+		 * the thing by hands. The reason is that this way we have zero
+		 * link_count and path_walk() (called from ->follow_link)
+		 * honoring LOOKUP_PARENT. After that we have the parent and
+		 * last component, i.e. we are in the same situation as after
+		 * the first path_walk(). Well, almost - if the last component
+		 * is normal we get its copy stored in nd->last.name and we will
+		 * have to putname() it when we are done. Procfs-like symlinks
+		 * just set LAST_BIND.
+		 */
+		nd.flags |= LOOKUP_PARENT;
+		error = security_inode_follow_link(path.dentry, &nd);
+		if (error)
+			goto exit_dput;
+		error = __do_follow_link(&path, &nd, &cookie);
+		if (unlikely(error)) {
+			/* nd.path had been dropped */
+			if (!IS_ERR(cookie) && inode->i_op->put_link)
+				inode->i_op->put_link(path.dentry, &nd, cookie);
+			path_put(&path);
+			release_open_intent(&nd);
+			filp = ERR_PTR(error);
+			goto out;
+		}
+		holder = path;
+		nd.flags &= ~LOOKUP_PARENT;
+		filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
+		if (inode->i_op->put_link)
+			inode->i_op->put_link(holder.dentry, &nd, cookie);
+		path_put(&holder);
 	}
-	dir = nd.path.dentry;
-	mutex_lock(&dir->d_inode->i_mutex);
-	path.dentry = lookup_hash(&nd);
-	path.mnt = nd.path.mnt;
-	__putname(nd.last.name);
-	goto do_last;
+out:
+	if (nd.root.mnt)
+		path_put(&nd.root);
+	if (filp == ERR_PTR(-ESTALE) && !force_reval) {
+		force_reval = 1;
+		goto reval;
+	}
+	return filp;
+
+exit_dput:
+	path_put_conditional(&path, &nd);
+	if (!IS_ERR(nd.intent.open.file))
+		release_open_intent(&nd);
+exit_parent:
+	path_put(&nd.path);
+	filp = ERR_PTR(error);
+	goto out;
 }
 
 /**
@@ -2001,7 +1983,6 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
2001 if (error) 1983 if (error)
2002 return error; 1984 return error;
2003 1985
2004 vfs_dq_init(dir);
2005 error = dir->i_op->mknod(dir, dentry, mode, dev); 1986 error = dir->i_op->mknod(dir, dentry, mode, dev);
2006 if (!error) 1987 if (!error)
2007 fsnotify_create(dir, dentry); 1988 fsnotify_create(dir, dentry);
@@ -2100,7 +2081,6 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2100 if (error) 2081 if (error)
2101 return error; 2082 return error;
2102 2083
2103 vfs_dq_init(dir);
2104 error = dir->i_op->mkdir(dir, dentry, mode); 2084 error = dir->i_op->mkdir(dir, dentry, mode);
2105 if (!error) 2085 if (!error)
2106 fsnotify_mkdir(dir, dentry); 2086 fsnotify_mkdir(dir, dentry);
@@ -2186,8 +2166,6 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2186 if (!dir->i_op->rmdir) 2166 if (!dir->i_op->rmdir)
2187 return -EPERM; 2167 return -EPERM;
2188 2168
2189 vfs_dq_init(dir);
2190
2191 mutex_lock(&dentry->d_inode->i_mutex); 2169 mutex_lock(&dentry->d_inode->i_mutex);
2192 dentry_unhash(dentry); 2170 dentry_unhash(dentry);
2193 if (d_mountpoint(dentry)) 2171 if (d_mountpoint(dentry))
@@ -2273,15 +2251,16 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 	if (!dir->i_op->unlink)
 		return -EPERM;
 
-	vfs_dq_init(dir);
-
 	mutex_lock(&dentry->d_inode->i_mutex);
 	if (d_mountpoint(dentry))
 		error = -EBUSY;
 	else {
 		error = security_inode_unlink(dir, dentry);
-		if (!error)
+		if (!error) {
 			error = dir->i_op->unlink(dir, dentry);
+			if (!error)
+				dentry->d_inode->i_flags |= S_DEAD;
+		}
 	}
 	mutex_unlock(&dentry->d_inode->i_mutex);
 
@@ -2384,7 +2363,6 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
2384 if (error) 2363 if (error)
2385 return error; 2364 return error;
2386 2365
2387 vfs_dq_init(dir);
2388 error = dir->i_op->symlink(dir, dentry, oldname); 2366 error = dir->i_op->symlink(dir, dentry, oldname);
2389 if (!error) 2367 if (!error)
2390 fsnotify_create(dir, dentry); 2368 fsnotify_create(dir, dentry);
@@ -2468,7 +2446,6 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
2468 return error; 2446 return error;
2469 2447
2470 mutex_lock(&inode->i_mutex); 2448 mutex_lock(&inode->i_mutex);
2471 vfs_dq_init(dir);
2472 error = dir->i_op->link(old_dentry, dir, new_dentry); 2449 error = dir->i_op->link(old_dentry, dir, new_dentry);
2473 mutex_unlock(&inode->i_mutex); 2450 mutex_unlock(&inode->i_mutex);
2474 if (!error) 2451 if (!error)
@@ -2569,7 +2546,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
2569 * e) conversion from fhandle to dentry may come in the wrong moment - when 2546 * e) conversion from fhandle to dentry may come in the wrong moment - when
2570 * we are removing the target. Solution: we will have to grab ->i_mutex 2547 * we are removing the target. Solution: we will have to grab ->i_mutex
2571 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on 2548 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
2572 * ->i_mutex on parents, which works but leads to some truely excessive 2549 * ->i_mutex on parents, which works but leads to some truly excessive
2573 * locking]. 2550 * locking].
2574 */ 2551 */
2575static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, 2552static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
@@ -2634,6 +2611,8 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
2634 else 2611 else
2635 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 2612 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2636 if (!error) { 2613 if (!error) {
2614 if (target)
2615 target->i_flags |= S_DEAD;
2637 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 2616 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2638 d_move(old_dentry, new_dentry); 2617 d_move(old_dentry, new_dentry);
2639 } 2618 }
@@ -2667,20 +2646,15 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (!old_dir->i_op->rename)
 		return -EPERM;
 
-	vfs_dq_init(old_dir);
-	vfs_dq_init(new_dir);
-
 	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
 
 	if (is_dir)
 		error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
 	else
 		error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
-	if (!error) {
-		const char *new_name = old_dentry->d_name.name;
-		fsnotify_move(old_dir, new_dir, old_name, new_name, is_dir,
+	if (!error)
+		fsnotify_move(old_dir, new_dir, old_name, is_dir,
 			      new_dentry->d_inode, old_dentry);
-	}
 	fsnotify_oldname_free(old_name);
 
 	return error;
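
The reworked open path above computes the full access mode up front: O_SYNC is widened to include O_DSYNC, and acc_mode is derived from the open flags before any lookup starts. The following stand-alone user-space sketch mirrors that flag-to-access-mode mapping; the MAY_* constants and the four-entry ACC_MODE() table are redefined locally to match the kernel's intent for this era, so treat the exact values as illustrative rather than authoritative.

#include <fcntl.h>
#include <stdio.h>

#define MAY_WRITE  0x02
#define MAY_READ   0x04
#define MAY_APPEND 0x08
#define MAY_OPEN   0x20

/* O_RDONLY=0, O_WRONLY=1, O_RDWR=2 (and 3, historically "no access"):
 * index the low two bits into a table of read/write needs. */
#define ACC_MODE(x) ("\004\002\006\006"[(x) & O_ACCMODE])

static int open_acc_mode(int open_flag)
{
        int acc_mode = MAY_OPEN | ACC_MODE(open_flag);

        if (open_flag & O_TRUNC)   /* truncation writes to the file */
                acc_mode |= MAY_WRITE;
        if (open_flag & O_APPEND)  /* let an LSM tell append from write */
                acc_mode |= MAY_APPEND;
        return acc_mode;
}

int main(void)
{
        printf("O_WRONLY|O_APPEND -> %#x\n", open_acc_mode(O_WRONLY | O_APPEND));
        printf("O_RDONLY|O_TRUNC  -> %#x\n", open_acc_mode(O_RDONLY | O_TRUNC));
        return 0;
}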
diff --git a/fs/namespace.c b/fs/namespace.c
index bdc3cb4fd222..8174c8ab5c70 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -573,7 +573,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
573 mnt->mnt_master = old; 573 mnt->mnt_master = old;
574 CLEAR_MNT_SHARED(mnt); 574 CLEAR_MNT_SHARED(mnt);
575 } else if (!(flag & CL_PRIVATE)) { 575 } else if (!(flag & CL_PRIVATE)) {
576 if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old)) 576 if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
577 list_add(&mnt->mnt_share, &old->mnt_share); 577 list_add(&mnt->mnt_share, &old->mnt_share);
578 if (IS_MNT_SLAVE(old)) 578 if (IS_MNT_SLAVE(old))
579 list_add(&mnt->mnt_slave, &old->mnt_slave); 579 list_add(&mnt->mnt_slave, &old->mnt_slave);
@@ -737,6 +737,21 @@ static void m_stop(struct seq_file *m, void *v)
737 up_read(&namespace_sem); 737 up_read(&namespace_sem);
738} 738}
739 739
740int mnt_had_events(struct proc_mounts *p)
741{
742 struct mnt_namespace *ns = p->ns;
743 int res = 0;
744
745 spin_lock(&vfsmount_lock);
746 if (p->event != ns->event) {
747 p->event = ns->event;
748 res = 1;
749 }
750 spin_unlock(&vfsmount_lock);
751
752 return res;
753}
754
740struct proc_fs_info { 755struct proc_fs_info {
741 int flag; 756 int flag;
742 const char *str; 757 const char *str;
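
The new mnt_had_events() above is the poll plumbing for mount tables: the namespace keeps a bare generation counter that is bumped on every mount event, and each /proc/mounts reader remembers the last value it saw; any mismatch means "something changed since you last looked". Below is a stand-alone sketch of the same generation-counter idiom, with hypothetical reader/source types and a pthread mutex standing in for vfsmount_lock.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct event_source { pthread_mutex_t lock; unsigned event; };
struct reader       { unsigned event; };

static bool had_events(struct event_source *src, struct reader *r)
{
        bool res = false;

        pthread_mutex_lock(&src->lock);
        if (r->event != src->event) {
                r->event = src->event;  /* catch the reader up */
                res = true;
        }
        pthread_mutex_unlock(&src->lock);
        return res;
}

int main(void)
{
        struct event_source src = { PTHREAD_MUTEX_INITIALIZER, 0 };
        struct reader r = { 0 };

        src.event++;    /* a mount or umount happened */
        printf("%d %d\n", had_events(&src, &r), had_events(&src, &r)); /* 1 0 */
        return 0;
}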
@@ -965,10 +980,12 @@ EXPORT_SYMBOL(may_umount_tree);
965int may_umount(struct vfsmount *mnt) 980int may_umount(struct vfsmount *mnt)
966{ 981{
967 int ret = 1; 982 int ret = 1;
983 down_read(&namespace_sem);
968 spin_lock(&vfsmount_lock); 984 spin_lock(&vfsmount_lock);
969 if (propagate_mount_busy(mnt, 2)) 985 if (propagate_mount_busy(mnt, 2))
970 ret = 0; 986 ret = 0;
971 spin_unlock(&vfsmount_lock); 987 spin_unlock(&vfsmount_lock);
988 up_read(&namespace_sem);
972 return ret; 989 return ret;
973} 990}
974 991
@@ -1119,8 +1136,15 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
 {
 	struct path path;
 	int retval;
+	int lookup_flags = 0;
+
+	if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
+		return -EINVAL;
 
-	retval = user_path(name, &path);
+	if (!(flags & UMOUNT_NOFOLLOW))
+		lookup_flags |= LOOKUP_FOLLOW;
+
+	retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
 	if (retval)
 		goto out;
 	retval = -EINVAL;
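
From user space this hunk is visible through umount2(2): unknown flag bits now fail with EINVAL, and the new UMOUNT_NOFOLLOW flag asks the kernel not to follow a trailing symlink when resolving the mount point, which hardens setuid umount helpers against symlink swaps. A minimal caller, assuming a kernel and libc new enough to know the flag:

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/mount.h>

#ifndef UMOUNT_NOFOLLOW
#define UMOUNT_NOFOLLOW 0x00000008      /* matches the kernel's definition */
#endif

int main(int argc, char **argv)
{
        if (argc != 2) {
                fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
                return 1;
        }
        if (umount2(argv[1], UMOUNT_NOFOLLOW) < 0) {
                fprintf(stderr, "umount2(%s): %s\n", argv[1], strerror(errno));
                return 1;
        }
        return 0;
}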
@@ -1244,6 +1268,21 @@ void drop_collected_mounts(struct vfsmount *mnt)
1244 release_mounts(&umount_list); 1268 release_mounts(&umount_list);
1245} 1269}
1246 1270
1271int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
1272 struct vfsmount *root)
1273{
1274 struct vfsmount *mnt;
1275 int res = f(root, arg);
1276 if (res)
1277 return res;
1278 list_for_each_entry(mnt, &root->mnt_list, mnt_list) {
1279 res = f(mnt, arg);
1280 if (res)
1281 return res;
1282 }
1283 return 0;
1284}
1285
1247static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) 1286static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end)
1248{ 1287{
1249 struct vfsmount *p; 1288 struct vfsmount *p;
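
The iterate_mounts() helper added above visits the root mount and then every mount on its list, stopping as soon as the callback returns nonzero and propagating that value to the caller. The same visitor contract over a plain linked list, for illustration only (the node type and callback here are hypothetical):

#include <stdio.h>

struct node { int val; struct node *next; };

static int iterate(struct node *head, int (*f)(struct node *, void *), void *arg)
{
        for (struct node *n = head; n; n = n->next) {
                int res = f(n, arg);
                if (res)        /* nonzero short-circuits, like iterate_mounts() */
                        return res;
        }
        return 0;
}

static int find_val(struct node *n, void *arg)
{
        return n->val == *(int *)arg;   /* 1 == found, stop walking */
}

int main(void)
{
        struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
        int want = 2;
        printf("found: %d\n", iterate(&a, find_val, &want));
        return 0;
}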
@@ -1352,12 +1391,12 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
1352 if (err) 1391 if (err)
1353 goto out_cleanup_ids; 1392 goto out_cleanup_ids;
1354 1393
1394 spin_lock(&vfsmount_lock);
1395
1355 if (IS_MNT_SHARED(dest_mnt)) { 1396 if (IS_MNT_SHARED(dest_mnt)) {
1356 for (p = source_mnt; p; p = next_mnt(p, source_mnt)) 1397 for (p = source_mnt; p; p = next_mnt(p, source_mnt))
1357 set_mnt_shared(p); 1398 set_mnt_shared(p);
1358 } 1399 }
1359
1360 spin_lock(&vfsmount_lock);
1361 if (parent_path) { 1400 if (parent_path) {
1362 detach_mnt(source_mnt, parent_path); 1401 detach_mnt(source_mnt, parent_path);
1363 attach_mnt(source_mnt, path); 1402 attach_mnt(source_mnt, path);
@@ -1534,8 +1573,12 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1534 err = change_mount_flags(path->mnt, flags); 1573 err = change_mount_flags(path->mnt, flags);
1535 else 1574 else
1536 err = do_remount_sb(sb, flags, data, 0); 1575 err = do_remount_sb(sb, flags, data, 0);
1537 if (!err) 1576 if (!err) {
1577 spin_lock(&vfsmount_lock);
1578 mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
1538 path->mnt->mnt_flags = mnt_flags; 1579 path->mnt->mnt_flags = mnt_flags;
1580 spin_unlock(&vfsmount_lock);
1581 }
1539 up_write(&sb->s_umount); 1582 up_write(&sb->s_umount);
1540 if (!err) { 1583 if (!err) {
1541 security_sb_post_remount(path->mnt, flags, data); 1584 security_sb_post_remount(path->mnt, flags, data);
@@ -1665,6 +1708,8 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1665{ 1708{
1666 int err; 1709 int err;
1667 1710
1711 mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
1712
1668 down_write(&namespace_sem); 1713 down_write(&namespace_sem);
1669 /* Something was mounted here while we slept */ 1714 /* Something was mounted here while we slept */
1670 while (d_mountpoint(path->dentry) && 1715 while (d_mountpoint(path->dentry) &&
@@ -1921,6 +1966,16 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
1921 if (data_page) 1966 if (data_page)
1922 ((char *)data_page)[PAGE_SIZE - 1] = 0; 1967 ((char *)data_page)[PAGE_SIZE - 1] = 0;
1923 1968
1969 /* ... and get the mountpoint */
1970 retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
1971 if (retval)
1972 return retval;
1973
1974 retval = security_sb_mount(dev_name, &path,
1975 type_page, flags, data_page);
1976 if (retval)
1977 goto dput_out;
1978
1924 /* Default to relatime unless overriden */ 1979 /* Default to relatime unless overriden */
1925 if (!(flags & MS_NOATIME)) 1980 if (!(flags & MS_NOATIME))
1926 mnt_flags |= MNT_RELATIME; 1981 mnt_flags |= MNT_RELATIME;
@@ -1945,16 +2000,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
1945 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | 2000 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
1946 MS_STRICTATIME); 2001 MS_STRICTATIME);
1947 2002
1948 /* ... and get the mountpoint */
1949 retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
1950 if (retval)
1951 return retval;
1952
1953 retval = security_sb_mount(dev_name, &path,
1954 type_page, flags, data_page);
1955 if (retval)
1956 goto dput_out;
1957
1958 if (flags & MS_REMOUNT) 2003 if (flags & MS_REMOUNT)
1959 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, 2004 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
1960 data_page); 2005 data_page);
@@ -2306,17 +2351,13 @@ void __init mnt_init(void)
 
 void put_mnt_ns(struct mnt_namespace *ns)
 {
-	struct vfsmount *root;
 	LIST_HEAD(umount_list);
 
-	if (!atomic_dec_and_lock(&ns->count, &vfsmount_lock))
+	if (!atomic_dec_and_test(&ns->count))
 		return;
-	root = ns->root;
-	ns->root = NULL;
-	spin_unlock(&vfsmount_lock);
 	down_write(&namespace_sem);
 	spin_lock(&vfsmount_lock);
-	umount_tree(root, 0, &umount_list);
+	umount_tree(ns->root, 0, &umount_list);
 	spin_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
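
The put_mnt_ns() change swaps atomic_dec_and_lock() for a plain atomic_dec_and_test(): only the thread that drops the last reference proceeds to teardown, so the common put no longer touches vfsmount_lock at all. A C11 sketch of that last-reference pattern, with a trivial struct standing in for the namespace:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct ns { atomic_int count; /* ... owned resources ... */ };

static void put_ns(struct ns *ns)
{
        /* fetch_sub returns the old value; 1 means we dropped the last ref */
        if (atomic_fetch_sub(&ns->count, 1) != 1)
                return;
        /* last reference: only now take whatever locks teardown needs */
        printf("tearing down %p\n", (void *)ns);
        free(ns);
}

int main(void)
{
        struct ns *ns = malloc(sizeof(*ns));
        atomic_init(&ns->count, 2);
        put_ns(ns);     /* still referenced elsewhere: cheap, lock-free */
        put_ns(ns);     /* last put performs the teardown */
        return 0;
}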
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index b8b5b30d53f0..7edfcd4d5e52 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -15,7 +15,6 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/slab.h>
19#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
20#include <linux/mm.h> 19#include <linux/mm.h>
21#include <asm/uaccess.h> 20#include <asm/uaccess.h>
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 6a7d901f1936..1daabb90e0a5 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -15,7 +15,6 @@
15#include <linux/fcntl.h> 15#include <linux/fcntl.h>
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/slab.h>
19#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
20#include <linux/sched.h> 19#include <linux/sched.h>
21#include <linux/smp_lock.h> 20#include <linux/smp_lock.h>
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 0d58caf4a6e1..60a5e2864ea8 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -15,6 +15,7 @@
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/slab.h>
18#include <linux/highuid.h> 19#include <linux/highuid.h>
19#include <linux/smp_lock.h> 20#include <linux/smp_lock.h>
20#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
@@ -835,7 +836,7 @@ static int ncp_ioctl_need_write(unsigned int cmd)
835 case NCP_IOC_SETROOT: 836 case NCP_IOC_SETROOT:
836 return 0; 837 return 0;
837 default: 838 default:
838 /* unkown IOCTL command, assume write */ 839 /* unknown IOCTL command, assume write */
839 return 1; 840 return 1;
840 } 841 }
841} 842}
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 15458decdb8a..56f5b3a0e1ee 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -9,12 +9,12 @@
9#include <linux/stat.h> 9#include <linux/stat.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/gfp.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
13#include <linux/shm.h> 14#include <linux/shm.h>
14#include <linux/errno.h> 15#include <linux/errno.h>
15#include <linux/mman.h> 16#include <linux/mman.h>
16#include <linux/string.h> 17#include <linux/string.h>
17#include <linux/slab.h>
18#include <linux/fcntl.h> 18#include <linux/fcntl.h>
19#include <linux/ncp_fs.h> 19#include <linux/ncp_fs.h>
20 20
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index e37df8d5fe70..c7ff6c700a6e 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -21,6 +21,7 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/netdevice.h> 22#include <linux/netdevice.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/slab.h>
24#include <net/scm.h> 25#include <net/scm.h>
25#include <net/sock.h> 26#include <net/sock.h>
26#include <linux/ipx.h> 27#include <linux/ipx.h>
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index e3d26c1bd105..c634fd17b337 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -27,6 +27,7 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/ncp_fs.h> 28#include <linux/ncp_fs.h>
29#include <linux/time.h> 29#include <linux/time.h>
30#include <linux/slab.h>
30#include <linux/mm.h> 31#include <linux/mm.h>
31#include <linux/stat.h> 32#include <linux/stat.h>
32#include "ncplib_kernel.h" 33#include "ncplib_kernel.h"
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 2a77bc25d5af..a43d07e7b924 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -90,13 +90,12 @@ config ROOT_NFS
90 If you want your system to mount its root file system via NFS, 90 If you want your system to mount its root file system via NFS,
91 choose Y here. This is common practice for managing systems 91 choose Y here. This is common practice for managing systems
92 without local permanent storage. For details, read 92 without local permanent storage. For details, read
93 <file:Documentation/filesystems/nfsroot.txt>. 93 <file:Documentation/filesystems/nfs/nfsroot.txt>.
94 94
95 Most people say N here. 95 Most people say N here.
96 96
97config NFS_FSCACHE 97config NFS_FSCACHE
98 bool "Provide NFS client caching support (EXPERIMENTAL)" 98 bool "Provide NFS client caching support"
99 depends on EXPERIMENTAL
100 depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y 99 depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
101 help 100 help
102 Say Y here if you want NFS data to be cached locally on disc through 101 Say Y here if you want NFS data to be cached locally on disc through
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index b4ffd0146ea6..84690319e625 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -10,6 +10,7 @@
10#include <linux/moduleparam.h> 10#include <linux/moduleparam.h>
11#include <linux/mount.h> 11#include <linux/mount.h>
12#include <linux/namei.h> 12#include <linux/namei.h>
13#include <linux/slab.h>
13#include <linux/sunrpc/cache.h> 14#include <linux/sunrpc/cache.h>
14#include <linux/sunrpc/rpc_pipe_fs.h> 15#include <linux/sunrpc/rpc_pipe_fs.h>
15 16
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 293fa0528a6e..36dfdae95123 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -78,11 +78,6 @@ nfs4_callback_svc(void *vrqstp)
78 78
79 set_freezable(); 79 set_freezable();
80 80
81 /*
82 * FIXME: do we really need to run this under the BKL? If so, please
83 * add a comment about what it's intended to protect.
84 */
85 lock_kernel();
86 while (!kthread_should_stop()) { 81 while (!kthread_should_stop()) {
87 /* 82 /*
88 * Listen for a request on the socket 83 * Listen for a request on the socket
@@ -104,7 +99,6 @@ nfs4_callback_svc(void *vrqstp)
104 preverr = err; 99 preverr = err;
105 svc_process(rqstp); 100 svc_process(rqstp);
106 } 101 }
107 unlock_kernel();
108 return 0; 102 return 0;
109} 103}
110 104
@@ -124,7 +118,6 @@ nfs4_callback_up(struct svc_serv *serv)
124 dprintk("NFS: Callback listener port = %u (af %u)\n", 118 dprintk("NFS: Callback listener port = %u (af %u)\n",
125 nfs_callback_tcpport, PF_INET); 119 nfs_callback_tcpport, PF_INET);
126 120
127#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
128 ret = svc_create_xprt(serv, "tcp", PF_INET6, 121 ret = svc_create_xprt(serv, "tcp", PF_INET6,
129 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); 122 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
130 if (ret > 0) { 123 if (ret > 0) {
@@ -135,7 +128,6 @@ nfs4_callback_up(struct svc_serv *serv)
135 ret = 0; 128 ret = 0;
136 else 129 else
137 goto out_err; 130 goto out_err;
138#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
139 131
140 return svc_prepare_thread(serv, &serv->sv_pools[0]); 132 return svc_prepare_thread(serv, &serv->sv_pools[0]);
141 133
@@ -160,11 +152,6 @@ nfs41_callback_svc(void *vrqstp)
160 152
161 set_freezable(); 153 set_freezable();
162 154
163 /*
164 * FIXME: do we really need to run this under the BKL? If so, please
165 * add a comment about what it's intended to protect.
166 */
167 lock_kernel();
168 while (!kthread_should_stop()) { 155 while (!kthread_should_stop()) {
169 prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); 156 prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
170 spin_lock_bh(&serv->sv_cb_lock); 157 spin_lock_bh(&serv->sv_cb_lock);
@@ -183,7 +170,6 @@ nfs41_callback_svc(void *vrqstp)
183 } 170 }
184 finish_wait(&serv->sv_cb_waitq, &wq); 171 finish_wait(&serv->sv_cb_waitq, &wq);
185 } 172 }
186 unlock_kernel();
187 return 0; 173 return 0;
188} 174}
189 175
@@ -397,6 +383,7 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
397 */ 383 */
398static struct svc_version *nfs4_callback_version[] = { 384static struct svc_version *nfs4_callback_version[] = {
399 [1] = &nfs4_callback_version1, 385 [1] = &nfs4_callback_version1,
386 [4] = &nfs4_callback_version4,
400}; 387};
401 388
402static struct svc_stat nfs4_callback_stats; 389static struct svc_stat nfs4_callback_stats;
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 07baa8254ca1..85a7cfd1b8dd 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -106,6 +106,27 @@ struct cb_sequenceres {
106extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, 106extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
107 struct cb_sequenceres *res); 107 struct cb_sequenceres *res);
108 108
109extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
110 const nfs4_stateid *stateid);
111
112#define RCA4_TYPE_MASK_RDATA_DLG 0
113#define RCA4_TYPE_MASK_WDATA_DLG 1
114
115struct cb_recallanyargs {
116 struct sockaddr *craa_addr;
117 uint32_t craa_objs_to_keep;
118 uint32_t craa_type_mask;
119};
120
121extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy);
122
123struct cb_recallslotargs {
124 struct sockaddr *crsa_addr;
125 uint32_t crsa_target_max_slots;
126};
127extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args,
128 void *dummy);
129
109#endif /* CONFIG_NFS_V4_1 */ 130#endif /* CONFIG_NFS_V4_1 */
110 131
111extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); 132extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
@@ -114,8 +135,9 @@ extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
114#ifdef CONFIG_NFS_V4 135#ifdef CONFIG_NFS_V4
115extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); 136extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
116extern void nfs_callback_down(int minorversion); 137extern void nfs_callback_down(int minorversion);
138extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
139 const nfs4_stateid *stateid);
117#endif /* CONFIG_NFS_V4 */ 140#endif /* CONFIG_NFS_V4 */
118
119/* 141/*
120 * nfs41: Callbacks are expected to not cause substantial latency, 142 * nfs41: Callbacks are expected to not cause substantial latency,
121 * so we limit their concurrency to 1 by setting up the maximum number 143 * so we limit their concurrency to 1 by setting up the maximum number
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index b7da1f54da68..a08770a7e857 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -7,6 +7,7 @@
7 */ 7 */
8#include <linux/nfs4.h> 8#include <linux/nfs4.h>
9#include <linux/nfs_fs.h> 9#include <linux/nfs_fs.h>
10#include <linux/slab.h>
10#include "nfs4_fs.h" 11#include "nfs4_fs.h"
11#include "callback.h" 12#include "callback.h"
12#include "delegation.h" 13#include "delegation.h"
@@ -61,6 +62,16 @@ out:
61 return res->status; 62 return res->status;
62} 63}
63 64
65static int (*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *)
66{
67#if defined(CONFIG_NFS_V4_1)
68 if (clp->cl_minorversion > 0)
69 return nfs41_validate_delegation_stateid;
70#endif
71 return nfs4_validate_delegation_stateid;
72}
73
74
64__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) 75__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
65{ 76{
66 struct nfs_client *clp; 77 struct nfs_client *clp;
@@ -81,7 +92,8 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
81 inode = nfs_delegation_find_inode(clp, &args->fh); 92 inode = nfs_delegation_find_inode(clp, &args->fh);
82 if (inode != NULL) { 93 if (inode != NULL) {
83 /* Set up a helper thread to actually return the delegation */ 94 /* Set up a helper thread to actually return the delegation */
84 switch(nfs_async_inode_return_delegation(inode, &args->stateid)) { 95 switch (nfs_async_inode_return_delegation(inode, &args->stateid,
96 nfs_validate_delegation_stateid(clp))) {
85 case 0: 97 case 0:
86 res = 0; 98 res = 0;
87 break; 99 break;
@@ -102,51 +114,79 @@ out:
102 return res; 114 return res;
103} 115}
104 116
117int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
118{
119 if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data,
120 sizeof(delegation->stateid.data)) != 0)
121 return 0;
122 return 1;
123}
124
105#if defined(CONFIG_NFS_V4_1) 125#if defined(CONFIG_NFS_V4_1)
106 126
127int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
128{
129 if (delegation == NULL)
130 return 0;
131
132 /* seqid is 4-bytes long */
133 if (((u32 *) &stateid->data)[0] != 0)
134 return 0;
135 if (memcmp(&delegation->stateid.data[4], &stateid->data[4],
136 sizeof(stateid->data)-4))
137 return 0;
138
139 return 1;
140}
141
107/* 142/*
108 * Validate the sequenceID sent by the server. 143 * Validate the sequenceID sent by the server.
109 * Return success if the sequenceID is one more than what we last saw on 144 * Return success if the sequenceID is one more than what we last saw on
110 * this slot, accounting for wraparound. Increments the slot's sequence. 145 * this slot, accounting for wraparound. Increments the slot's sequence.
111 * 146 *
112 * We don't yet implement a duplicate request cache, so at this time 147 * We don't yet implement a duplicate request cache, instead we set the
113 * we will log replays, and process them as if we had not seen them before, 148 * back channel ca_maxresponsesize_cached to zero. This is OK for now
114 * but we don't bump the sequence in the slot. Not too worried about it,
115 * since we only currently implement idempotent callbacks anyway. 149 * since we only currently implement idempotent callbacks anyway.
116 * 150 *
117 * We have a single slot backchannel at this time, so we don't bother 151 * We have a single slot backchannel at this time, so we don't bother
118 * checking the used_slots bit array on the table. The lower layer guarantees 152 * checking the used_slots bit array on the table. The lower layer guarantees
119 * a single outstanding callback request at a time. 153 * a single outstanding callback request at a time.
120 */ 154 */
121static int 155static __be32
122validate_seqid(struct nfs4_slot_table *tbl, u32 slotid, u32 seqid) 156validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
123{ 157{
124 struct nfs4_slot *slot; 158 struct nfs4_slot *slot;
125 159
126 dprintk("%s enter. slotid %d seqid %d\n", 160 dprintk("%s enter. slotid %d seqid %d\n",
127 __func__, slotid, seqid); 161 __func__, args->csa_slotid, args->csa_sequenceid);
128 162
129 if (slotid > NFS41_BC_MAX_CALLBACKS) 163 if (args->csa_slotid > NFS41_BC_MAX_CALLBACKS)
130 return htonl(NFS4ERR_BADSLOT); 164 return htonl(NFS4ERR_BADSLOT);
131 165
132 slot = tbl->slots + slotid; 166 slot = tbl->slots + args->csa_slotid;
133 dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr); 167 dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr);
134 168
135 /* Normal */ 169 /* Normal */
136 if (likely(seqid == slot->seq_nr + 1)) { 170 if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
137 slot->seq_nr++; 171 slot->seq_nr++;
138 return htonl(NFS4_OK); 172 return htonl(NFS4_OK);
139 } 173 }
140 174
141 /* Replay */ 175 /* Replay */
142 if (seqid == slot->seq_nr) { 176 if (args->csa_sequenceid == slot->seq_nr) {
143 dprintk("%s seqid %d is a replay - no DRC available\n", 177 dprintk("%s seqid %d is a replay\n",
144 __func__, seqid); 178 __func__, args->csa_sequenceid);
145 return htonl(NFS4_OK); 179 /* Signal process_op to set this error on next op */
180 if (args->csa_cachethis == 0)
181 return htonl(NFS4ERR_RETRY_UNCACHED_REP);
182
183 /* The ca_maxresponsesize_cached is 0 with no DRC */
184 else if (args->csa_cachethis == 1)
185 return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
146 } 186 }
147 187
148 /* Wraparound */ 188 /* Wraparound */
149 if (seqid == 1 && (slot->seq_nr + 1) == 0) { 189 if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) {
150 slot->seq_nr = 1; 190 slot->seq_nr = 1;
151 return htonl(NFS4_OK); 191 return htonl(NFS4_OK);
152 } 192 }
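
The rewritten validate_seqid() above enforces three cases: seq_nr+1 is the normal next request; an equal sequence number is a replay, answered differently depending on whether the client asked for the reply to be cached (there is no duplicate request cache, so a cached-reply request cannot be honored); and 1 following a slot at UINT32_MAX is legal wraparound. A stand-alone rendering of those rules; the error codes are local stand-ins for the NFS4ERR_* values:

#include <stdint.h>
#include <stdio.h>

enum {
        SEQ_OK = 0,
        SEQ_RETRY_UNCACHED,     /* replay, reply was not cached */
        SEQ_TOO_BIG_TO_CACHE,   /* replay, caching was requested but DRC is 0 */
        SEQ_MISORDERED
};

static int validate_seq(uint32_t *slot_seq, uint32_t seq, int cachethis)
{
        if (seq == (uint32_t)(*slot_seq + 1)) { /* normal: advance the slot */
                (*slot_seq)++;
                return SEQ_OK;
        }
        if (seq == *slot_seq)                   /* replay of the last request */
                return cachethis ? SEQ_TOO_BIG_TO_CACHE : SEQ_RETRY_UNCACHED;
        if (seq == 1 && (uint32_t)(*slot_seq + 1) == 0) {
                *slot_seq = 1;                  /* wraparound past UINT32_MAX */
                return SEQ_OK;
        }
        return SEQ_MISORDERED;
}

int main(void)
{
        uint32_t slot = UINT32_MAX;
        printf("%d\n", validate_seq(&slot, 1, 0));  /* wraparound -> 0 */
        printf("%d\n", validate_seq(&slot, 1, 0));  /* replay     -> 1 */
        return 0;
}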
@@ -191,27 +231,87 @@ validate_seqid(struct nfs4_slot_table *tbl, u32 slotid, u32 seqid)
191 return NULL; 231 return NULL;
192} 232}
193 233
194/* FIXME: referring calls should be processed */ 234/*
195unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, 235 * For each referring call triple, check the session's slot table for
236 * a match. If the slot is in use and the sequence numbers match, the
237 * client is still waiting for a response to the original request.
238 */
239static bool referring_call_exists(struct nfs_client *clp,
240 uint32_t nrclists,
241 struct referring_call_list *rclists)
242{
243 bool status = 0;
244 int i, j;
245 struct nfs4_session *session;
246 struct nfs4_slot_table *tbl;
247 struct referring_call_list *rclist;
248 struct referring_call *ref;
249
250 /*
251 * XXX When client trunking is implemented, this becomes
252 * a session lookup from within the loop
253 */
254 session = clp->cl_session;
255 tbl = &session->fc_slot_table;
256
257 for (i = 0; i < nrclists; i++) {
258 rclist = &rclists[i];
259 if (memcmp(session->sess_id.data,
260 rclist->rcl_sessionid.data,
261 NFS4_MAX_SESSIONID_LEN) != 0)
262 continue;
263
264 for (j = 0; j < rclist->rcl_nrefcalls; j++) {
265 ref = &rclist->rcl_refcalls[j];
266
267 dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u "
268 "slotid %u\n", __func__,
269 ((u32 *)&rclist->rcl_sessionid.data)[0],
270 ((u32 *)&rclist->rcl_sessionid.data)[1],
271 ((u32 *)&rclist->rcl_sessionid.data)[2],
272 ((u32 *)&rclist->rcl_sessionid.data)[3],
273 ref->rc_sequenceid, ref->rc_slotid);
274
275 spin_lock(&tbl->slot_tbl_lock);
276 status = (test_bit(ref->rc_slotid, tbl->used_slots) &&
277 tbl->slots[ref->rc_slotid].seq_nr ==
278 ref->rc_sequenceid);
279 spin_unlock(&tbl->slot_tbl_lock);
280 if (status)
281 goto out;
282 }
283 }
284
285out:
286 return status;
287}
288
289__be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
196 struct cb_sequenceres *res) 290 struct cb_sequenceres *res)
197{ 291{
198 struct nfs_client *clp; 292 struct nfs_client *clp;
199 int i, status; 293 int i;
200 294 __be32 status;
201 for (i = 0; i < args->csa_nrclists; i++)
202 kfree(args->csa_rclists[i].rcl_refcalls);
203 kfree(args->csa_rclists);
204 295
205 status = htonl(NFS4ERR_BADSESSION); 296 status = htonl(NFS4ERR_BADSESSION);
206 clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid); 297 clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid);
207 if (clp == NULL) 298 if (clp == NULL)
208 goto out; 299 goto out;
209 300
210 status = validate_seqid(&clp->cl_session->bc_slot_table, 301 status = validate_seqid(&clp->cl_session->bc_slot_table, args);
211 args->csa_slotid, args->csa_sequenceid);
212 if (status) 302 if (status)
213 goto out_putclient; 303 goto out_putclient;
214 304
305 /*
306 * Check for pending referring calls. If a match is found, a
307 * related callback was received before the response to the original
308 * call.
309 */
310 if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
311 status = htonl(NFS4ERR_DELAY);
312 goto out_putclient;
313 }
314
215 memcpy(&res->csr_sessionid, &args->csa_sessionid, 315 memcpy(&res->csr_sessionid, &args->csa_sessionid,
216 sizeof(res->csr_sessionid)); 316 sizeof(res->csr_sessionid));
217 res->csr_sequenceid = args->csa_sequenceid; 317 res->csr_sequenceid = args->csa_sequenceid;
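
referring_call_exists() above answers one question: does any (sessionid, slotid, sequenceid) triple supplied by the server match a request this client still has in flight? If so, the callback arrived before the reply to the original call and must be delayed. Below is a reduced sketch of that nested scan; the types are hypothetical, and the real code also consults the slot table's used_slots bitmap under slot_tbl_lock.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define SESSIONID_LEN 16

struct ref_call { uint32_t slotid, seq; };
struct ref_list {
        unsigned char sessionid[SESSIONID_LEN];
        int nrefcalls;
        const struct ref_call *refcalls;
};

static bool referring_call_pending(const unsigned char *my_session,
                                   const uint32_t *slot_seq, uint32_t nslots,
                                   const struct ref_list *lists, int nlists)
{
        for (int i = 0; i < nlists; i++) {
                if (memcmp(my_session, lists[i].sessionid, SESSIONID_LEN))
                        continue;       /* another session: not our calls */
                for (int j = 0; j < lists[i].nrefcalls; j++) {
                        const struct ref_call *rc = &lists[i].refcalls[j];
                        if (rc->slotid < nslots &&
                            slot_seq[rc->slotid] == rc->seq)
                                return true;    /* original call still open */
                }
        }
        return false;
}

int main(void)
{
        unsigned char sid[SESSIONID_LEN] = { 1 };
        uint32_t slots[2] = { 7, 9 };
        struct ref_call rc = { .slotid = 1, .seq = 9 };
        struct ref_list rl = { { 1 }, 1, &rc };
        printf("%d\n", referring_call_pending(sid, slots, 2, &rl, 1));
        return 0;
}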
@@ -222,9 +322,81 @@ unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
222out_putclient: 322out_putclient:
223 nfs_put_client(clp); 323 nfs_put_client(clp);
224out: 324out:
325 for (i = 0; i < args->csa_nrclists; i++)
326 kfree(args->csa_rclists[i].rcl_refcalls);
327 kfree(args->csa_rclists);
328
329 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP))
330 res->csr_status = 0;
331 else
332 res->csr_status = status;
333 dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
334 ntohl(status), ntohl(res->csr_status));
335 return status;
336}
337
338__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy)
339{
340 struct nfs_client *clp;
341 __be32 status;
342 fmode_t flags = 0;
343
344 status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
345 clp = nfs_find_client(args->craa_addr, 4);
346 if (clp == NULL)
347 goto out;
348
349 dprintk("NFS: RECALL_ANY callback request from %s\n",
350 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
351
352 if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *)
353 &args->craa_type_mask))
354 flags = FMODE_READ;
355 if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *)
356 &args->craa_type_mask))
357 flags |= FMODE_WRITE;
358
359 if (flags)
360 nfs_expire_all_delegation_types(clp, flags);
361 status = htonl(NFS4_OK);
362out:
225 dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); 363 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
226 res->csr_status = status; 364 return status;
227 return res->csr_status;
228} 365}
229 366
367/* Reduce the fore channel's max_slots to the target value */
368__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy)
369{
370 struct nfs_client *clp;
371 struct nfs4_slot_table *fc_tbl;
372 __be32 status;
373
374 status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
375 clp = nfs_find_client(args->crsa_addr, 4);
376 if (clp == NULL)
377 goto out;
378
379 dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",
380 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
381 args->crsa_target_max_slots);
382
383 fc_tbl = &clp->cl_session->fc_slot_table;
384
385 status = htonl(NFS4ERR_BAD_HIGH_SLOT);
386 if (args->crsa_target_max_slots > fc_tbl->max_slots ||
387 args->crsa_target_max_slots < 1)
388 goto out_putclient;
389
390 status = htonl(NFS4_OK);
391 if (args->crsa_target_max_slots == fc_tbl->max_slots)
392 goto out_putclient;
393
394 fc_tbl->target_max_slots = args->crsa_target_max_slots;
395 nfs41_handle_recall_slot(clp);
396out_putclient:
397 nfs_put_client(clp); /* balance nfs_find_client */
398out:
399 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
400 return status;
401}
230#endif /* CONFIG_NFS_V4_1 */ 402#endif /* CONFIG_NFS_V4_1 */
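
CB_RECALL_ANY hands the client a bitmask of delegation classes to shed, and nfs4_callback_recallany() above translates the RCA4_TYPE_MASK_* bit positions into fmode flags before expiring delegations. The same translation in stand-alone C; the FMODE_* values are chosen to match the kernel's fmode_t bits of this era but are only illustrative here.

#include <stdio.h>

#define RCA4_TYPE_MASK_RDATA_DLG 0
#define RCA4_TYPE_MASK_WDATA_DLG 1
#define FMODE_READ  0x1
#define FMODE_WRITE 0x2

static unsigned recall_any_flags(unsigned type_mask)
{
        unsigned flags = 0;

        if (type_mask & (1u << RCA4_TYPE_MASK_RDATA_DLG))
                flags |= FMODE_READ;
        if (type_mask & (1u << RCA4_TYPE_MASK_WDATA_DLG))
                flags |= FMODE_WRITE;
        return flags;   /* nonzero -> expire delegations of these types */
}

int main(void)
{
        printf("%#x\n", recall_any_flags(0x3));   /* both classes -> 0x3 */
        return 0;
}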
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 76b0aa0f73bf..05af212f0edf 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -9,6 +9,7 @@
9#include <linux/sunrpc/svc.h> 9#include <linux/sunrpc/svc.h>
10#include <linux/nfs4.h> 10#include <linux/nfs4.h>
11#include <linux/nfs_fs.h> 11#include <linux/nfs_fs.h>
12#include <linux/slab.h>
12#include "nfs4_fs.h" 13#include "nfs4_fs.h"
13#include "callback.h" 14#include "callback.h"
14 15
@@ -23,10 +24,15 @@
23#if defined(CONFIG_NFS_V4_1) 24#if defined(CONFIG_NFS_V4_1)
24#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ 25#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
25 4 + 1 + 3) 26 4 + 1 + 3)
27#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
28#define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
26#endif /* CONFIG_NFS_V4_1 */ 29#endif /* CONFIG_NFS_V4_1 */
27 30
28#define NFSDBG_FACILITY NFSDBG_CALLBACK 31#define NFSDBG_FACILITY NFSDBG_CALLBACK
29 32
33/* Internal error code */
34#define NFS4ERR_RESOURCE_HDR 11050
35
30typedef __be32 (*callback_process_op_t)(void *, void *); 36typedef __be32 (*callback_process_op_t)(void *, void *);
31typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); 37typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
32typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); 38typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
@@ -172,7 +178,7 @@ static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
172 __be32 *p; 178 __be32 *p;
173 p = read_buf(xdr, 4); 179 p = read_buf(xdr, 4);
174 if (unlikely(p == NULL)) 180 if (unlikely(p == NULL))
175 return htonl(NFS4ERR_RESOURCE); 181 return htonl(NFS4ERR_RESOURCE_HDR);
176 *op = ntohl(*p); 182 *op = ntohl(*p);
177 return 0; 183 return 0;
178} 184}
@@ -214,10 +220,10 @@ out:
214 220
215#if defined(CONFIG_NFS_V4_1) 221#if defined(CONFIG_NFS_V4_1)
216 222
217static unsigned decode_sessionid(struct xdr_stream *xdr, 223static __be32 decode_sessionid(struct xdr_stream *xdr,
218 struct nfs4_sessionid *sid) 224 struct nfs4_sessionid *sid)
219{ 225{
220 uint32_t *p; 226 __be32 *p;
221 int len = NFS4_MAX_SESSIONID_LEN; 227 int len = NFS4_MAX_SESSIONID_LEN;
222 228
223 p = read_buf(xdr, len); 229 p = read_buf(xdr, len);
@@ -228,12 +234,12 @@ static unsigned decode_sessionid(struct xdr_stream *xdr,
228 return 0; 234 return 0;
229} 235}
230 236
231static unsigned decode_rc_list(struct xdr_stream *xdr, 237static __be32 decode_rc_list(struct xdr_stream *xdr,
232 struct referring_call_list *rc_list) 238 struct referring_call_list *rc_list)
233{ 239{
234 uint32_t *p; 240 __be32 *p;
235 int i; 241 int i;
236 unsigned status; 242 __be32 status;
237 243
238 status = decode_sessionid(xdr, &rc_list->rcl_sessionid); 244 status = decode_sessionid(xdr, &rc_list->rcl_sessionid);
239 if (status) 245 if (status)
@@ -266,13 +272,13 @@ out:
266 return status; 272 return status;
267} 273}
268 274
269static unsigned decode_cb_sequence_args(struct svc_rqst *rqstp, 275static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
270 struct xdr_stream *xdr, 276 struct xdr_stream *xdr,
271 struct cb_sequenceargs *args) 277 struct cb_sequenceargs *args)
272{ 278{
273 uint32_t *p; 279 __be32 *p;
274 int i; 280 int i;
275 unsigned status; 281 __be32 status;
276 282
277 status = decode_sessionid(xdr, &args->csa_sessionid); 283 status = decode_sessionid(xdr, &args->csa_sessionid);
278 if (status) 284 if (status)
@@ -326,6 +332,39 @@ out_free:
326 goto out; 332 goto out;
327} 333}
328 334
335static __be32 decode_recallany_args(struct svc_rqst *rqstp,
336 struct xdr_stream *xdr,
337 struct cb_recallanyargs *args)
338{
339 __be32 *p;
340
341 args->craa_addr = svc_addr(rqstp);
342 p = read_buf(xdr, 4);
343 if (unlikely(p == NULL))
344 return htonl(NFS4ERR_BADXDR);
345 args->craa_objs_to_keep = ntohl(*p++);
346 p = read_buf(xdr, 4);
347 if (unlikely(p == NULL))
348 return htonl(NFS4ERR_BADXDR);
349 args->craa_type_mask = ntohl(*p);
350
351 return 0;
352}
353
354static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
355 struct xdr_stream *xdr,
356 struct cb_recallslotargs *args)
357{
358 __be32 *p;
359
360 args->crsa_addr = svc_addr(rqstp);
361 p = read_buf(xdr, 4);
362 if (unlikely(p == NULL))
363 return htonl(NFS4ERR_BADXDR);
364 args->crsa_target_max_slots = ntohl(*p++);
365 return 0;
366}
367
329#endif /* CONFIG_NFS_V4_1 */ 368#endif /* CONFIG_NFS_V4_1 */
330 369
331static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) 370static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
@@ -445,7 +484,7 @@ static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res)
445 484
446 p = xdr_reserve_space(xdr, 8); 485 p = xdr_reserve_space(xdr, 8);
447 if (unlikely(p == NULL)) 486 if (unlikely(p == NULL))
448 return htonl(NFS4ERR_RESOURCE); 487 return htonl(NFS4ERR_RESOURCE_HDR);
449 *p++ = htonl(op); 488 *p++ = htonl(op);
450 *p = res; 489 *p = res;
451 return 0; 490 return 0;
@@ -479,10 +518,10 @@ out:
479 518
480#if defined(CONFIG_NFS_V4_1) 519#if defined(CONFIG_NFS_V4_1)
481 520
482static unsigned encode_sessionid(struct xdr_stream *xdr, 521static __be32 encode_sessionid(struct xdr_stream *xdr,
483 const struct nfs4_sessionid *sid) 522 const struct nfs4_sessionid *sid)
484{ 523{
485 uint32_t *p; 524 __be32 *p;
486 int len = NFS4_MAX_SESSIONID_LEN; 525 int len = NFS4_MAX_SESSIONID_LEN;
487 526
488 p = xdr_reserve_space(xdr, len); 527 p = xdr_reserve_space(xdr, len);
@@ -493,11 +532,11 @@ static unsigned encode_sessionid(struct xdr_stream *xdr,
493 return 0; 532 return 0;
494} 533}
495 534
496static unsigned encode_cb_sequence_res(struct svc_rqst *rqstp, 535static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp,
497 struct xdr_stream *xdr, 536 struct xdr_stream *xdr,
498 const struct cb_sequenceres *res) 537 const struct cb_sequenceres *res)
499{ 538{
500 uint32_t *p; 539 __be32 *p;
501 unsigned status = res->csr_status; 540 unsigned status = res->csr_status;
502 541
503 if (unlikely(status != 0)) 542 if (unlikely(status != 0))
@@ -533,6 +572,8 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
533 case OP_CB_GETATTR: 572 case OP_CB_GETATTR:
534 case OP_CB_RECALL: 573 case OP_CB_RECALL:
535 case OP_CB_SEQUENCE: 574 case OP_CB_SEQUENCE:
575 case OP_CB_RECALL_ANY:
576 case OP_CB_RECALL_SLOT:
536 *op = &callback_ops[op_nr]; 577 *op = &callback_ops[op_nr];
537 break; 578 break;
538 579
@@ -540,9 +581,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
540 case OP_CB_NOTIFY_DEVICEID: 581 case OP_CB_NOTIFY_DEVICEID:
541 case OP_CB_NOTIFY: 582 case OP_CB_NOTIFY:
542 case OP_CB_PUSH_DELEG: 583 case OP_CB_PUSH_DELEG:
543 case OP_CB_RECALL_ANY:
544 case OP_CB_RECALLABLE_OBJ_AVAIL: 584 case OP_CB_RECALLABLE_OBJ_AVAIL:
545 case OP_CB_RECALL_SLOT:
546 case OP_CB_WANTS_CANCELLED: 585 case OP_CB_WANTS_CANCELLED:
547 case OP_CB_NOTIFY_LOCK: 586 case OP_CB_NOTIFY_LOCK:
548 return htonl(NFS4ERR_NOTSUPP); 587 return htonl(NFS4ERR_NOTSUPP);
@@ -582,20 +621,18 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
582static __be32 process_op(uint32_t minorversion, int nop, 621static __be32 process_op(uint32_t minorversion, int nop,
583 struct svc_rqst *rqstp, 622 struct svc_rqst *rqstp,
584 struct xdr_stream *xdr_in, void *argp, 623 struct xdr_stream *xdr_in, void *argp,
585 struct xdr_stream *xdr_out, void *resp) 624 struct xdr_stream *xdr_out, void *resp, int* drc_status)
586{ 625{
587 struct callback_op *op = &callback_ops[0]; 626 struct callback_op *op = &callback_ops[0];
588 unsigned int op_nr = OP_CB_ILLEGAL; 627 unsigned int op_nr;
589 __be32 status; 628 __be32 status;
590 long maxlen; 629 long maxlen;
591 __be32 res; 630 __be32 res;
592 631
593 dprintk("%s: start\n", __func__); 632 dprintk("%s: start\n", __func__);
594 status = decode_op_hdr(xdr_in, &op_nr); 633 status = decode_op_hdr(xdr_in, &op_nr);
595 if (unlikely(status)) { 634 if (unlikely(status))
596 status = htonl(NFS4ERR_OP_ILLEGAL); 635 return status;
597 goto out;
598 }
599 636
600 dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", 637 dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
601 __func__, minorversion, nop, op_nr); 638 __func__, minorversion, nop, op_nr);
@@ -604,19 +641,32 @@ static __be32 process_op(uint32_t minorversion, int nop,
604 preprocess_nfs4_op(op_nr, &op); 641 preprocess_nfs4_op(op_nr, &op);
605 if (status == htonl(NFS4ERR_OP_ILLEGAL)) 642 if (status == htonl(NFS4ERR_OP_ILLEGAL))
606 op_nr = OP_CB_ILLEGAL; 643 op_nr = OP_CB_ILLEGAL;
607out: 644 if (status)
645 goto encode_hdr;
646
647 if (*drc_status) {
648 status = *drc_status;
649 goto encode_hdr;
650 }
651
608 maxlen = xdr_out->end - xdr_out->p; 652 maxlen = xdr_out->end - xdr_out->p;
609 if (maxlen > 0 && maxlen < PAGE_SIZE) { 653 if (maxlen > 0 && maxlen < PAGE_SIZE) {
610 if (likely(status == 0 && op->decode_args != NULL)) 654 status = op->decode_args(rqstp, xdr_in, argp);
611 status = op->decode_args(rqstp, xdr_in, argp); 655 if (likely(status == 0))
612 if (likely(status == 0 && op->process_op != NULL))
613 status = op->process_op(argp, resp); 656 status = op->process_op(argp, resp);
614 } else 657 } else
615 status = htonl(NFS4ERR_RESOURCE); 658 status = htonl(NFS4ERR_RESOURCE);
616 659
660 /* Only set by OP_CB_SEQUENCE processing */
661 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
662 *drc_status = status;
663 status = 0;
664 }
665
666encode_hdr:
617 res = encode_op_hdr(xdr_out, op_nr, status); 667 res = encode_op_hdr(xdr_out, op_nr, status);
618 if (status == 0) 668 if (unlikely(res))
619 status = res; 669 return res;
620 if (op->encode_res != NULL && status == 0) 670 if (op->encode_res != NULL && status == 0)
621 status = op->encode_res(rqstp, xdr_out, resp); 671 status = op->encode_res(rqstp, xdr_out, resp);
622 dprintk("%s: done, status = %d\n", __func__, ntohl(status)); 672 dprintk("%s: done, status = %d\n", __func__, ntohl(status));
@@ -632,7 +682,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
632 struct cb_compound_hdr_res hdr_res = { NULL }; 682 struct cb_compound_hdr_res hdr_res = { NULL };
633 struct xdr_stream xdr_in, xdr_out; 683 struct xdr_stream xdr_in, xdr_out;
634 __be32 *p; 684 __be32 *p;
635 __be32 status; 685 __be32 status, drc_status = 0;
636 unsigned int nops = 0; 686 unsigned int nops = 0;
637 687
638 dprintk("%s: start\n", __func__); 688 dprintk("%s: start\n", __func__);
@@ -652,11 +702,18 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
652 return rpc_system_err; 702 return rpc_system_err;
653 703
654 while (status == 0 && nops != hdr_arg.nops) { 704 while (status == 0 && nops != hdr_arg.nops) {
655 status = process_op(hdr_arg.minorversion, nops, 705 status = process_op(hdr_arg.minorversion, nops, rqstp,
656 rqstp, &xdr_in, argp, &xdr_out, resp); 706 &xdr_in, argp, &xdr_out, resp, &drc_status);
657 nops++; 707 nops++;
658 } 708 }
659 709
710 /* Buffer overflow in decode_ops_hdr or encode_ops_hdr. Return
711 * resource error in cb_compound status without returning op */
712 if (unlikely(status == htonl(NFS4ERR_RESOURCE_HDR))) {
713 status = htonl(NFS4ERR_RESOURCE);
714 nops--;
715 }
716
660 *hdr_res.status = status; 717 *hdr_res.status = status;
661 *hdr_res.nops = htonl(nops); 718 *hdr_res.nops = htonl(nops);
662 dprintk("%s: done, status = %u\n", __func__, ntohl(status)); 719 dprintk("%s: done, status = %u\n", __func__, ntohl(status));
@@ -688,6 +745,16 @@ static struct callback_op callback_ops[] = {
688 .encode_res = (callback_encode_res_t)encode_cb_sequence_res, 745 .encode_res = (callback_encode_res_t)encode_cb_sequence_res,
689 .res_maxsize = CB_OP_SEQUENCE_RES_MAXSZ, 746 .res_maxsize = CB_OP_SEQUENCE_RES_MAXSZ,
690 }, 747 },
748 [OP_CB_RECALL_ANY] = {
749 .process_op = (callback_process_op_t)nfs4_callback_recallany,
750 .decode_args = (callback_decode_arg_t)decode_recallany_args,
751 .res_maxsize = CB_OP_RECALLANY_RES_MAXSZ,
752 },
753 [OP_CB_RECALL_SLOT] = {
754 .process_op = (callback_process_op_t)nfs4_callback_recallslot,
755 .decode_args = (callback_decode_arg_t)decode_recallslot_args,
756 .res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ,
757 },
691#endif /* CONFIG_NFS_V4_1 */ 758#endif /* CONFIG_NFS_V4_1 */
692}; 759};
693 760
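
The OP_CB_RECALL_ANY and OP_CB_RECALL_SLOT entries extend an opcode-indexed dispatch table: each slot bundles the decode, process, and encode handlers plus a maximum reply size, and sparse designated initializers keep unimplemented opcodes NULL. A compilable sketch of the same pattern, with all names invented for illustration:

    #include <stdio.h>

    typedef int (*decode_fn)(const char *in);
    typedef int (*process_fn)(void);

    struct op_entry {
        decode_fn  decode_args;
        process_fn process_op;
        unsigned   res_maxsize;
    };

    static int decode_recallany(const char *in) { printf("decode %s\n", in); return 0; }
    static int do_recallany(void)               { printf("recall any\n");   return 0; }

    enum { OP_RECALL_ANY = 4, OP_MAX };

    /* Designated initializers keep the table sparse and self-documenting,
     * the same style the patch uses for the two new callback ops. */
    static const struct op_entry ops[OP_MAX] = {
        [OP_RECALL_ANY] = {
            .decode_args = decode_recallany,
            .process_op  = do_recallany,
            .res_maxsize = 16,
        },
    };

    int main(void)
    {
        const struct op_entry *op = &ops[OP_RECALL_ANY];
        if (op->decode_args && op->decode_args("args") == 0 && op->process_op)
            op->process_op();
        return 0;
    }
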
@@ -716,5 +783,13 @@ struct svc_version nfs4_callback_version1 = {
716 .vs_proc = nfs4_callback_procedures1, 783 .vs_proc = nfs4_callback_procedures1,
717 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, 784 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
718 .vs_dispatch = NULL, 785 .vs_dispatch = NULL,
786 .vs_hidden = 1,
719}; 787};
720 788
789struct svc_version nfs4_callback_version4 = {
790 .vs_vers = 4,
791 .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1),
792 .vs_proc = nfs4_callback_procedures1,
793 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
794 .vs_dispatch = NULL,
795};
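
The new nfs4_callback_version4 reuses the version-1 procedure table under a different wire version number, and version 1 additionally gains vs_hidden (which, as I understand it, keeps the service from being advertised via rpcbind/portmap). A pared-down stand-in for the sunrpc struct shows the shape of the registration:

    #include <stdio.h>
    #include <stddef.h>

    struct svc_version {
        unsigned vs_vers;     /* RPC version number on the wire */
        unsigned vs_nproc;    /* size of the procedure table */
        void    *vs_proc;     /* procedure table, shared between versions here */
        int      vs_hidden;   /* don't advertise via rpcbind/portmap */
    };

    static int procedures[3]; /* stand-in for nfs4_callback_procedures1[] */

    static struct svc_version callback_v1 = {
        .vs_vers = 1, .vs_nproc = 3, .vs_proc = procedures, .vs_hidden = 1,
    };
    static struct svc_version callback_v4 = {
        .vs_vers = 4, .vs_nproc = 3, .vs_proc = procedures,
    };

    int main(void)
    {
        /* Both versions dispatch into the same table; only the wire
         * version number differs, mirroring the diff above. */
        struct svc_version *vers[] = { &callback_v1, &callback_v4 };
        for (size_t i = 0; i < 2; i++)
            printf("version %u, hidden=%d\n", vers[i]->vs_vers, vers[i]->vs_hidden);
        return 0;
    }
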
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 99ea196f071f..2a3d352c0bff 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -35,6 +35,7 @@
35#include <linux/vfs.h> 35#include <linux/vfs.h>
36#include <linux/inet.h> 36#include <linux/inet.h>
37#include <linux/in6.h> 37#include <linux/in6.h>
38#include <linux/slab.h>
38#include <net/ipv6.h> 39#include <net/ipv6.h>
39#include <linux/nfs_xdr.h> 40#include <linux/nfs_xdr.h>
40#include <linux/sunrpc/bc_xprt.h> 41#include <linux/sunrpc/bc_xprt.h>
@@ -164,30 +165,7 @@ error_0:
164 return ERR_PTR(err); 165 return ERR_PTR(err);
165} 166}
166 167
167static void nfs4_shutdown_client(struct nfs_client *clp)
168{
169#ifdef CONFIG_NFS_V4
170 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
171 nfs4_kill_renewd(clp);
172 BUG_ON(!RB_EMPTY_ROOT(&clp->cl_state_owners));
173 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
174 nfs_idmap_delete(clp);
175
176 rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
177#endif
178}
179
180/*
181 * Destroy the NFS4 callback service
182 */
183static void nfs4_destroy_callback(struct nfs_client *clp)
184{
185#ifdef CONFIG_NFS_V4 168#ifdef CONFIG_NFS_V4
186 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
187 nfs_callback_down(clp->cl_minorversion);
188#endif /* CONFIG_NFS_V4 */
189}
190
191/* 169/*
192 * Clears/puts all minor version specific parts from an nfs_client struct 170 * Clears/puts all minor version specific parts from an nfs_client struct
193 * reverting it to minorversion 0. 171 * reverting it to minorversion 0.
@@ -202,9 +180,33 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
202 180
203 clp->cl_call_sync = _nfs4_call_sync; 181 clp->cl_call_sync = _nfs4_call_sync;
204#endif /* CONFIG_NFS_V4_1 */ 182#endif /* CONFIG_NFS_V4_1 */
183}
184
185/*
186 * Destroy the NFS4 callback service
187 */
188static void nfs4_destroy_callback(struct nfs_client *clp)
189{
190 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
191 nfs_callback_down(clp->cl_minorversion);
192}
205 193
194static void nfs4_shutdown_client(struct nfs_client *clp)
195{
196 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
197 nfs4_kill_renewd(clp);
198 nfs4_clear_client_minor_version(clp);
206 nfs4_destroy_callback(clp); 199 nfs4_destroy_callback(clp);
200 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
201 nfs_idmap_delete(clp);
202
203 rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
204}
205#else
206static void nfs4_shutdown_client(struct nfs_client *clp)
207{
207} 208}
209#endif /* CONFIG_NFS_V4 */
208 210
209/* 211/*
210 * Destroy a shared client record 212 * Destroy a shared client record
@@ -213,7 +215,6 @@ static void nfs_free_client(struct nfs_client *clp)
213{ 215{
214 dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version); 216 dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version);
215 217
216 nfs4_clear_client_minor_version(clp);
217 nfs4_shutdown_client(clp); 218 nfs4_shutdown_client(clp);
218 219
219 nfs_fscache_release_client_cookie(clp); 220 nfs_fscache_release_client_cookie(clp);
@@ -1260,10 +1261,20 @@ error:
1260static void nfs4_session_set_rwsize(struct nfs_server *server) 1261static void nfs4_session_set_rwsize(struct nfs_server *server)
1261{ 1262{
1262#ifdef CONFIG_NFS_V4_1 1263#ifdef CONFIG_NFS_V4_1
1264 struct nfs4_session *sess;
1265 u32 server_resp_sz;
1266 u32 server_rqst_sz;
1267
1263 if (!nfs4_has_session(server->nfs_client)) 1268 if (!nfs4_has_session(server->nfs_client))
1264 return; 1269 return;
1265 server->rsize = server->nfs_client->cl_session->fc_attrs.max_resp_sz; 1270 sess = server->nfs_client->cl_session;
1266 server->wsize = server->nfs_client->cl_session->fc_attrs.max_rqst_sz; 1271 server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead;
1272 server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead;
1273
1274 if (server->rsize > server_resp_sz)
1275 server->rsize = server_resp_sz;
1276 if (server->wsize > server_rqst_sz)
1277 server->wsize = server_rqst_sz;
1267#endif /* CONFIG_NFS_V4_1 */ 1278#endif /* CONFIG_NFS_V4_1 */
1268} 1279}
1269 1280
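
The old code overwrote rsize/wsize with the raw session maxima; the replacement subtracts per-op XDR overhead (the nfs41_maxread_overhead/nfs41_maxwrite_overhead constants exported later in this series) and then only clamps the mount's values downward, never inflating them. A sketch of that logic with invented overhead values:

    #include <stdio.h>
    #include <stdint.h>

    #define MAXREAD_OVERHEAD  512u  /* stand-in for nfs41_maxread_overhead */
    #define MAXWRITE_OVERHEAD 768u  /* stand-in for nfs41_maxwrite_overhead */

    static void set_rwsize(uint32_t *rsize, uint32_t *wsize,
                           uint32_t max_resp_sz, uint32_t max_rqst_sz)
    {
        uint32_t resp = max_resp_sz - MAXREAD_OVERHEAD;
        uint32_t rqst = max_rqst_sz - MAXWRITE_OVERHEAD;

        /* Never grow beyond what the mount asked for; only clamp down. */
        if (*rsize > resp)
            *rsize = resp;
        if (*wsize > rqst)
            *wsize = rqst;
    }

    int main(void)
    {
        uint32_t rsize = 1048576, wsize = 32768;
        set_rwsize(&rsize, &wsize, 65536, 65536);
        printf("rsize=%u wsize=%u\n", rsize, wsize); /* rsize clamped, wsize kept */
        return 0;
    }
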
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 6dd48a4405b4..15671245c6ee 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -10,6 +10,7 @@
10#include <linux/kthread.h> 10#include <linux/kthread.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/slab.h>
13#include <linux/smp_lock.h> 14#include <linux/smp_lock.h>
14#include <linux/spinlock.h> 15#include <linux/spinlock.h>
15 16
@@ -92,7 +93,7 @@ out:
92 return status; 93 return status;
93} 94}
94 95
95static void nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid) 96static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid)
96{ 97{
97 struct nfs_inode *nfsi = NFS_I(inode); 98 struct nfs_inode *nfsi = NFS_I(inode);
98 struct nfs_open_context *ctx; 99 struct nfs_open_context *ctx;
@@ -116,10 +117,11 @@ again:
116 err = nfs_delegation_claim_locks(ctx, state); 117 err = nfs_delegation_claim_locks(ctx, state);
117 put_nfs_open_context(ctx); 118 put_nfs_open_context(ctx);
118 if (err != 0) 119 if (err != 0)
119 return; 120 return err;
120 goto again; 121 goto again;
121 } 122 }
122 spin_unlock(&inode->i_lock); 123 spin_unlock(&inode->i_lock);
124 return 0;
123} 125}
124 126
125/* 127/*
@@ -261,30 +263,34 @@ static void nfs_msync_inode(struct inode *inode)
261/* 263/*
262 * Basic procedure for returning a delegation to the server 264 * Basic procedure for returning a delegation to the server
263 */ 265 */
264static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation) 266static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
265{ 267{
266 struct nfs_inode *nfsi = NFS_I(inode); 268 struct nfs_inode *nfsi = NFS_I(inode);
269 int err;
267 270
268 nfs_msync_inode(inode);
269 /* 271 /*
270 * Guard against new delegated open/lock/unlock calls and against 272 * Guard against new delegated open/lock/unlock calls and against
271 * state recovery 273 * state recovery
272 */ 274 */
273 down_write(&nfsi->rwsem); 275 down_write(&nfsi->rwsem);
274 nfs_delegation_claim_opens(inode, &delegation->stateid); 276 err = nfs_delegation_claim_opens(inode, &delegation->stateid);
275 up_write(&nfsi->rwsem); 277 up_write(&nfsi->rwsem);
276 nfs_msync_inode(inode); 278 if (err)
279 goto out;
277 280
278 return nfs_do_return_delegation(inode, delegation, 1); 281 err = nfs_do_return_delegation(inode, delegation, issync);
282out:
283 return err;
279} 284}
280 285
281/* 286/*
282 * Return all delegations that have been marked for return 287 * Return all delegations that have been marked for return
283 */ 288 */
284void nfs_client_return_marked_delegations(struct nfs_client *clp) 289int nfs_client_return_marked_delegations(struct nfs_client *clp)
285{ 290{
286 struct nfs_delegation *delegation; 291 struct nfs_delegation *delegation;
287 struct inode *inode; 292 struct inode *inode;
293 int err = 0;
288 294
289restart: 295restart:
290 rcu_read_lock(); 296 rcu_read_lock();
@@ -298,12 +304,18 @@ restart:
298 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); 304 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
299 spin_unlock(&clp->cl_lock); 305 spin_unlock(&clp->cl_lock);
300 rcu_read_unlock(); 306 rcu_read_unlock();
301 if (delegation != NULL) 307 if (delegation != NULL) {
302 __nfs_inode_return_delegation(inode, delegation); 308 filemap_flush(inode->i_mapping);
309 err = __nfs_inode_return_delegation(inode, delegation, 0);
310 }
303 iput(inode); 311 iput(inode);
304 goto restart; 312 if (!err)
313 goto restart;
314 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
315 return err;
305 } 316 }
306 rcu_read_unlock(); 317 rcu_read_unlock();
318 return 0;
307} 319}
308 320
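
nfs_client_return_marked_delegations() now propagates failure: each successful return restarts the scan (the list may have changed while locks were dropped), while a failure sets NFS4CLNT_DELEGRETURN so the state manager retries later. A simplified sketch of the restart-or-bail loop, with the list and locking machinery mocked away:

    #include <stdio.h>

    #define NENTRIES 4

    static int marked[NENTRIES] = { 1, 0, 1, 1 };
    static int fail_at = 2;   /* pretend returning entry 2 hits an error */

    static int return_one(int i) { return i == fail_at ? -1 : 0; }

    static int return_marked(void)
    {
    restart:
        for (int i = 0; i < NENTRIES; i++) {
            if (!marked[i])
                continue;
            marked[i] = 0;                /* "detach" before dropping the lock */
            if (return_one(i) == 0)
                goto restart;             /* list may have changed; rescan */
            printf("error on %d: flag DELEGRETURN for the state manager\n", i);
            return -1;
        }
        return 0;
    }

    int main(void) { return return_marked() ? 1 : 0; }
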
309/* 321/*
@@ -338,8 +350,10 @@ int nfs_inode_return_delegation(struct inode *inode)
338 spin_lock(&clp->cl_lock); 350 spin_lock(&clp->cl_lock);
339 delegation = nfs_detach_delegation_locked(nfsi, NULL); 351 delegation = nfs_detach_delegation_locked(nfsi, NULL);
340 spin_unlock(&clp->cl_lock); 352 spin_unlock(&clp->cl_lock);
341 if (delegation != NULL) 353 if (delegation != NULL) {
342 err = __nfs_inode_return_delegation(inode, delegation); 354 nfs_msync_inode(inode);
355 err = __nfs_inode_return_delegation(inode, delegation, 1);
356 }
343 } 357 }
344 return err; 358 return err;
345} 359}
@@ -368,33 +382,47 @@ void nfs_super_return_all_delegations(struct super_block *sb)
368 spin_unlock(&delegation->lock); 382 spin_unlock(&delegation->lock);
369 } 383 }
370 rcu_read_unlock(); 384 rcu_read_unlock();
371 nfs_client_return_marked_delegations(clp); 385 if (nfs_client_return_marked_delegations(clp) != 0)
386 nfs4_schedule_state_manager(clp);
372} 387}
373 388
374static void nfs_client_mark_return_all_delegations(struct nfs_client *clp) 389static
390void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, fmode_t flags)
375{ 391{
376 struct nfs_delegation *delegation; 392 struct nfs_delegation *delegation;
377 393
378 rcu_read_lock(); 394 rcu_read_lock();
379 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 395 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
380 set_bit(NFS_DELEGATION_RETURN, &delegation->flags); 396 if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE))
381 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); 397 continue;
398 if (delegation->type & flags)
399 nfs_mark_return_delegation(clp, delegation);
382 } 400 }
383 rcu_read_unlock(); 401 rcu_read_unlock();
384} 402}
385 403
404static void nfs_client_mark_return_all_delegations(struct nfs_client *clp)
405{
406 nfs_client_mark_return_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
407}
408
386static void nfs_delegation_run_state_manager(struct nfs_client *clp) 409static void nfs_delegation_run_state_manager(struct nfs_client *clp)
387{ 410{
388 if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) 411 if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state))
389 nfs4_schedule_state_manager(clp); 412 nfs4_schedule_state_manager(clp);
390} 413}
391 414
392void nfs_expire_all_delegations(struct nfs_client *clp) 415void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags)
393{ 416{
394 nfs_client_mark_return_all_delegations(clp); 417 nfs_client_mark_return_all_delegation_types(clp, flags);
395 nfs_delegation_run_state_manager(clp); 418 nfs_delegation_run_state_manager(clp);
396} 419}
397 420
421void nfs_expire_all_delegations(struct nfs_client *clp)
422{
423 nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
424}
425
398/* 426/*
399 * Return all delegations following an NFS4ERR_CB_PATH_DOWN error. 427 * Return all delegations following an NFS4ERR_CB_PATH_DOWN error.
400 */ 428 */
@@ -413,8 +441,7 @@ static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *c
413 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 441 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
414 if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) 442 if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
415 continue; 443 continue;
416 set_bit(NFS_DELEGATION_RETURN, &delegation->flags); 444 nfs_mark_return_delegation(clp, delegation);
417 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
418 } 445 }
419 rcu_read_unlock(); 446 rcu_read_unlock();
420} 447}
@@ -428,18 +455,21 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
428/* 455/*
429 * Asynchronous delegation recall! 456 * Asynchronous delegation recall!
430 */ 457 */
431int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) 458int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid,
459 int (*validate_stateid)(struct nfs_delegation *delegation,
460 const nfs4_stateid *stateid))
432{ 461{
433 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 462 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
434 struct nfs_delegation *delegation; 463 struct nfs_delegation *delegation;
435 464
436 rcu_read_lock(); 465 rcu_read_lock();
437 delegation = rcu_dereference(NFS_I(inode)->delegation); 466 delegation = rcu_dereference(NFS_I(inode)->delegation);
438 if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, 467
439 sizeof(delegation->stateid.data)) != 0) { 468 if (!validate_stateid(delegation, stateid)) {
440 rcu_read_unlock(); 469 rcu_read_unlock();
441 return -ENOENT; 470 return -ENOENT;
442 } 471 }
472
443 nfs_mark_return_delegation(clp, delegation); 473 nfs_mark_return_delegation(clp, delegation);
444 rcu_read_unlock(); 474 rcu_read_unlock();
445 nfs_delegation_run_state_manager(clp); 475 nfs_delegation_run_state_manager(clp);
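
The open-coded memcmp of stateids moves behind a caller-supplied validate_stateid() hook, presumably so minorversion 0 and 1 can apply different rules. A sketch of the shape, with mocked types and a v0-style bitwise-equality validator:

    #include <stdio.h>
    #include <string.h>

    struct stateid { unsigned char data[16]; };
    struct delegation { struct stateid stateid; };

    /* What a minorversion-0 validator might look like: exact match. */
    static int validate_v0(const struct delegation *d, const struct stateid *s)
    {
        return d && memcmp(d->stateid.data, s->data, sizeof(s->data)) == 0;
    }

    static int async_return(const struct delegation *d, const struct stateid *s,
                            int (*validate)(const struct delegation *,
                                            const struct stateid *))
    {
        if (!validate(d, s))
            return -2;  /* -ENOENT in the kernel */
        printf("delegation marked for return\n");
        return 0;
    }

    int main(void)
    {
        struct delegation d = { .stateid = { .data = "abc" } };
        struct stateid ok = d.stateid, bad = { .data = "xyz" };
        printf("%d %d\n", async_return(&d, &ok, validate_v0),
                          async_return(&d, &bad, validate_v0));
        return 0;
    }
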
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 09f383795174..69e7b8140122 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -34,15 +34,18 @@ enum {
34int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); 34int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
35void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); 35void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
36int nfs_inode_return_delegation(struct inode *inode); 36int nfs_inode_return_delegation(struct inode *inode);
37int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); 37int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid,
38 int (*validate_stateid)(struct nfs_delegation *delegation,
39 const nfs4_stateid *stateid));
38void nfs_inode_return_delegation_noreclaim(struct inode *inode); 40void nfs_inode_return_delegation_noreclaim(struct inode *inode);
39 41
40struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); 42struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
41void nfs_super_return_all_delegations(struct super_block *sb); 43void nfs_super_return_all_delegations(struct super_block *sb);
42void nfs_expire_all_delegations(struct nfs_client *clp); 44void nfs_expire_all_delegations(struct nfs_client *clp);
45void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags);
43void nfs_expire_unreferenced_delegations(struct nfs_client *clp); 46void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
44void nfs_handle_cb_pathdown(struct nfs_client *clp); 47void nfs_handle_cb_pathdown(struct nfs_client *clp);
45void nfs_client_return_marked_delegations(struct nfs_client *clp); 48int nfs_client_return_marked_delegations(struct nfs_client *clp);
46 49
47void nfs_delegation_mark_reclaim(struct nfs_client *clp); 50void nfs_delegation_mark_reclaim(struct nfs_client *clp);
48void nfs_delegation_reap_unclaimed(struct nfs_client *clp); 51void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
@@ -68,4 +71,10 @@ static inline int nfs_inode_return_delegation(struct inode *inode)
68} 71}
69#endif 72#endif
70 73
74static inline int nfs_have_delegated_attributes(struct inode *inode)
75{
76 return nfs_have_delegation(inode, FMODE_READ) &&
77 !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED);
78}
79
71#endif 80#endif
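
The new inline says cached attributes are authoritative only while a read delegation is held and no forced revalidation is pending; the dir.c and inode.c hunks below switch their checks to it. A small sketch of the predicate with mocked flags and an invented bit value:

    #include <stdio.h>

    #define FMODE_READ           0x1
    #define INO_REVAL_FORCED     0x8  /* illustrative bit, not the kernel's */

    struct mock_inode { int deleg_type; unsigned cache_validity; };

    static int have_delegation(struct mock_inode *i, int mode)
    {
        return (i->deleg_type & mode) == mode;
    }

    static int have_delegated_attributes(struct mock_inode *i)
    {
        return have_delegation(i, FMODE_READ) &&
               !(i->cache_validity & INO_REVAL_FORCED);
    }

    int main(void)
    {
        struct mock_inode a = { FMODE_READ, 0 };
        struct mock_inode b = { FMODE_READ, INO_REVAL_FORCED };
        /* b must revalidate with the server despite holding a delegation. */
        printf("%d %d\n", have_delegated_attributes(&a),
                          have_delegated_attributes(&b));
        return 0;
    }
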
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7cb298525eef..c6f2750648f4 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -560,7 +560,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
560 desc->entry = &my_entry; 560 desc->entry = &my_entry;
561 561
562 nfs_block_sillyrename(dentry); 562 nfs_block_sillyrename(dentry);
563 res = nfs_revalidate_mapping_nolock(inode, filp->f_mapping); 563 res = nfs_revalidate_mapping(inode, filp->f_mapping);
564 if (res < 0) 564 if (res < 0)
565 goto out; 565 goto out;
566 566
@@ -1579,55 +1579,47 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1579 struct dentry *dentry = NULL, *rehash = NULL; 1579 struct dentry *dentry = NULL, *rehash = NULL;
1580 int error = -EBUSY; 1580 int error = -EBUSY;
1581 1581
1582 /*
1583 * To prevent any new references to the target during the rename,
1584 * we unhash the dentry and free the inode in advance.
1585 */
1586 if (!d_unhashed(new_dentry)) {
1587 d_drop(new_dentry);
1588 rehash = new_dentry;
1589 }
1590
1591 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", 1582 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
1592 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1583 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1593 new_dentry->d_parent->d_name.name, new_dentry->d_name.name, 1584 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
1594 atomic_read(&new_dentry->d_count)); 1585 atomic_read(&new_dentry->d_count));
1595 1586
1596 /* 1587 /*
1597 * First check whether the target is busy ... we can't 1588 * For non-directories, check whether the target is busy and if so,
1598 * safely do _any_ rename if the target is in use. 1589 * make a copy of the dentry and then do a silly-rename. If the
1599 * 1590 * silly-rename succeeds, the copied dentry is hashed and becomes
1600 * For files, make a copy of the dentry and then do a 1591 * the new target.
1601 * silly-rename. If the silly-rename succeeds, the
1602 * copied dentry is hashed and becomes the new target.
1603 */ 1592 */
1604 if (!new_inode) 1593 if (new_inode && !S_ISDIR(new_inode->i_mode)) {
1605 goto go_ahead; 1594 /*
1606 if (S_ISDIR(new_inode->i_mode)) { 1595 * To prevent any new references to the target during the
1607 error = -EISDIR; 1596 * rename, we unhash the dentry in advance.
1608 if (!S_ISDIR(old_inode->i_mode)) 1597 */
1609 goto out; 1598 if (!d_unhashed(new_dentry)) {
1610 } else if (atomic_read(&new_dentry->d_count) > 2) { 1599 d_drop(new_dentry);
1611 int err; 1600 rehash = new_dentry;
1612 /* copy the target dentry's name */ 1601 }
1613 dentry = d_alloc(new_dentry->d_parent, 1602
1614 &new_dentry->d_name); 1603 if (atomic_read(&new_dentry->d_count) > 2) {
1615 if (!dentry) 1604 int err;
1616 goto out; 1605
1606 /* copy the target dentry's name */
1607 dentry = d_alloc(new_dentry->d_parent,
1608 &new_dentry->d_name);
1609 if (!dentry)
1610 goto out;
1617 1611
1618 /* silly-rename the existing target ... */ 1612 /* silly-rename the existing target ... */
1619 err = nfs_sillyrename(new_dir, new_dentry); 1613 err = nfs_sillyrename(new_dir, new_dentry);
1620 if (!err) { 1614 if (err)
1621 new_dentry = rehash = dentry; 1615 goto out;
1616
1617 new_dentry = dentry;
1618 rehash = NULL;
1622 new_inode = NULL; 1619 new_inode = NULL;
1623 /* instantiate the replacement target */ 1620 }
1624 d_instantiate(new_dentry, NULL);
1625 } else if (atomic_read(&new_dentry->d_count) > 1)
1626 /* dentry still busy? */
1627 goto out;
1628 } 1621 }
1629 1622
1630go_ahead:
1631 /* 1623 /*
1632 * ... prune child dentries and writebacks if needed. 1624 * ... prune child dentries and writebacks if needed.
1633 */ 1625 */
@@ -1797,7 +1789,7 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
1797 cache = nfs_access_search_rbtree(inode, cred); 1789 cache = nfs_access_search_rbtree(inode, cred);
1798 if (cache == NULL) 1790 if (cache == NULL)
1799 goto out; 1791 goto out;
1800 if (!nfs_have_delegation(inode, FMODE_READ) && 1792 if (!nfs_have_delegated_attributes(inode) &&
1801 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) 1793 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
1802 goto out_stale; 1794 goto out_stale;
1803 res->jiffies = cache->jiffies; 1795 res->jiffies = cache->jiffies;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e1d415e97849..ad4cd31d6050 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -44,6 +44,7 @@
44#include <linux/file.h> 44#include <linux/file.h>
45#include <linux/pagemap.h> 45#include <linux/pagemap.h>
46#include <linux/kref.h> 46#include <linux/kref.h>
47#include <linux/slab.h>
47 48
48#include <linux/nfs_fs.h> 49#include <linux/nfs_fs.h>
49#include <linux/nfs_page.h> 50#include <linux/nfs_page.h>
@@ -342,6 +343,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
342 data->res.fattr = &data->fattr; 343 data->res.fattr = &data->fattr;
343 data->res.eof = 0; 344 data->res.eof = 0;
344 data->res.count = bytes; 345 data->res.count = bytes;
346 nfs_fattr_init(&data->fattr);
345 msg.rpc_argp = &data->args; 347 msg.rpc_argp = &data->args;
346 msg.rpc_resp = &data->res; 348 msg.rpc_resp = &data->res;
347 349
@@ -575,6 +577,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
575 data->res.count = 0; 577 data->res.count = 0;
576 data->res.fattr = &data->fattr; 578 data->res.fattr = &data->fattr;
577 data->res.verf = &data->verf; 579 data->res.verf = &data->verf;
580 nfs_fattr_init(&data->fattr);
578 581
579 NFS_PROTO(data->inode)->commit_setup(data, &msg); 582 NFS_PROTO(data->inode)->commit_setup(data, &msg);
580 583
@@ -766,6 +769,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
766 data->res.fattr = &data->fattr; 769 data->res.fattr = &data->fattr;
767 data->res.count = bytes; 770 data->res.count = bytes;
768 data->res.verf = &data->verf; 771 data->res.verf = &data->verf;
772 nfs_fattr_init(&data->fattr);
769 773
770 task_setup_data.task = &data->task; 774 task_setup_data.task = &data->task;
771 task_setup_data.callback_data = data; 775 task_setup_data.callback_data = data;
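
All three direct-I/O paths gain an nfs_fattr_init() call before issuing the RPC, which (as I read it) clears the attribute result so the completion handler only applies attributes the reply actually carried, never stale values left over from a reused buffer. A sketch of the invalid-by-default pattern with simplified types:

    #include <stdio.h>

    struct fattr { unsigned valid; long size; };

    static void fattr_init(struct fattr *f) { f->valid = 0; }

    static void apply_reply(struct fattr *f)
    {
        if (f->valid)      /* only trust attributes this reply filled in */
            printf("update inode size to %ld\n", f->size);
        else
            printf("no attributes in reply; leave inode alone\n");
    }

    int main(void)
    {
        struct fattr f = { .valid = 1, .size = 4096 }; /* leftover from reuse */
        fattr_init(&f);    /* the added per-request initialization */
        apply_reply(&f);
        return 0;
    }
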
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index f4d54ba97cc6..76fd235d0024 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -9,6 +9,7 @@
9#include <linux/hash.h> 9#include <linux/hash.h>
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/kmod.h> 11#include <linux/kmod.h>
12#include <linux/slab.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/socket.h> 14#include <linux/socket.h>
14#include <linux/seq_file.h> 15#include <linux/seq_file.h>
@@ -36,6 +37,19 @@ struct nfs_dns_ent {
36}; 37};
37 38
38 39
40static void nfs_dns_ent_update(struct cache_head *cnew,
41 struct cache_head *ckey)
42{
43 struct nfs_dns_ent *new;
44 struct nfs_dns_ent *key;
45
46 new = container_of(cnew, struct nfs_dns_ent, h);
47 key = container_of(ckey, struct nfs_dns_ent, h);
48
49 memcpy(&new->addr, &key->addr, key->addrlen);
50 new->addrlen = key->addrlen;
51}
52
39static void nfs_dns_ent_init(struct cache_head *cnew, 53static void nfs_dns_ent_init(struct cache_head *cnew,
40 struct cache_head *ckey) 54 struct cache_head *ckey)
41{ 55{
@@ -49,8 +63,7 @@ static void nfs_dns_ent_init(struct cache_head *cnew,
49 new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL); 63 new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL);
50 if (new->hostname) { 64 if (new->hostname) {
51 new->namelen = key->namelen; 65 new->namelen = key->namelen;
52 memcpy(&new->addr, &key->addr, key->addrlen); 66 nfs_dns_ent_update(cnew, ckey);
53 new->addrlen = key->addrlen;
54 } else { 67 } else {
55 new->namelen = 0; 68 new->namelen = 0;
56 new->addrlen = 0; 69 new->addrlen = 0;
@@ -146,7 +159,7 @@ static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
146 return 0; 159 return 0;
147} 160}
148 161
149struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, 162static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
150 struct nfs_dns_ent *key) 163 struct nfs_dns_ent *key)
151{ 164{
152 struct cache_head *ch; 165 struct cache_head *ch;
@@ -159,7 +172,7 @@ struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
159 return container_of(ch, struct nfs_dns_ent, h); 172 return container_of(ch, struct nfs_dns_ent, h);
160} 173}
161 174
162struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd, 175static struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd,
163 struct nfs_dns_ent *new, 176 struct nfs_dns_ent *new,
164 struct nfs_dns_ent *key) 177 struct nfs_dns_ent *key)
165{ 178{
@@ -234,7 +247,7 @@ static struct cache_detail nfs_dns_resolve = {
234 .cache_show = nfs_dns_show, 247 .cache_show = nfs_dns_show,
235 .match = nfs_dns_match, 248 .match = nfs_dns_match,
236 .init = nfs_dns_ent_init, 249 .init = nfs_dns_ent_init,
237 .update = nfs_dns_ent_init, 250 .update = nfs_dns_ent_update,
238 .alloc = nfs_dns_ent_alloc, 251 .alloc = nfs_dns_ent_alloc,
239}; 252};
240 253
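
The DNS cache previously wired the same function into both .init and .update. Splitting out nfs_dns_ent_update matters because update runs on an entry whose hostname is already owned; only the address payload should be refreshed, while init copies the key exactly once. A user-space sketch of the split, with mocked types:

    #include <stdio.h>
    #include <string.h>
    #include <stdlib.h>

    struct dns_ent { char *hostname; char addr[16]; };

    static void ent_update(struct dns_ent *dst, const struct dns_ent *src)
    {
        memcpy(dst->addr, src->addr, sizeof(dst->addr)); /* payload only */
    }

    static void ent_init(struct dns_ent *dst, const struct dns_ent *src)
    {
        dst->hostname = strdup(src->hostname); /* key copied exactly once */
        if (dst->hostname)
            ent_update(dst, src);
    }

    int main(void)
    {
        struct dns_ent key = { "fileserver", "10.0.0.1" }, cached = { 0 };
        ent_init(&cached, &key);
        strcpy(key.addr, "10.0.0.2");
        ent_update(&cached, &key); /* refresh the address, keep the name */
        printf("%s -> %s\n", cached.hostname, cached.addr);
        free(cached.hostname);
        return 0;
    }
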
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f5fdd39e037a..8d965bddb87e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -24,9 +24,9 @@
24#include <linux/nfs_fs.h> 24#include <linux/nfs_fs.h>
25#include <linux/nfs_mount.h> 25#include <linux/nfs_mount.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/slab.h>
28#include <linux/pagemap.h> 27#include <linux/pagemap.h>
29#include <linux/aio.h> 28#include <linux/aio.h>
29#include <linux/gfp.h>
30 30
31#include <asm/uaccess.h> 31#include <asm/uaccess.h>
32#include <asm/system.h> 32#include <asm/system.h>
@@ -123,11 +123,11 @@ nfs_file_open(struct inode *inode, struct file *filp)
123 filp->f_path.dentry->d_parent->d_name.name, 123 filp->f_path.dentry->d_parent->d_name.name,
124 filp->f_path.dentry->d_name.name); 124 filp->f_path.dentry->d_name.name);
125 125
126 nfs_inc_stats(inode, NFSIOS_VFSOPEN);
126 res = nfs_check_flags(filp->f_flags); 127 res = nfs_check_flags(filp->f_flags);
127 if (res) 128 if (res)
128 return res; 129 return res;
129 130
130 nfs_inc_stats(inode, NFSIOS_VFSOPEN);
131 res = nfs_open(inode, filp); 131 res = nfs_open(inode, filp);
132 return res; 132 return res;
133} 133}
@@ -237,9 +237,9 @@ nfs_file_flush(struct file *file, fl_owner_t id)
237 dentry->d_parent->d_name.name, 237 dentry->d_parent->d_name.name,
238 dentry->d_name.name); 238 dentry->d_name.name);
239 239
240 nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
240 if ((file->f_mode & FMODE_WRITE) == 0) 241 if ((file->f_mode & FMODE_WRITE) == 0)
241 return 0; 242 return 0;
242 nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
243 243
244 /* Flush writes to the server and return any errors */ 244 /* Flush writes to the server and return any errors */
245 return nfs_do_fsync(ctx, inode); 245 return nfs_do_fsync(ctx, inode);
@@ -262,9 +262,11 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
262 (unsigned long) count, (unsigned long) pos); 262 (unsigned long) count, (unsigned long) pos);
263 263
264 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); 264 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
265 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count); 265 if (!result) {
266 if (!result)
267 result = generic_file_aio_read(iocb, iov, nr_segs, pos); 266 result = generic_file_aio_read(iocb, iov, nr_segs, pos);
267 if (result > 0)
268 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
269 }
268 return result; 270 return result;
269} 271}
270 272
@@ -282,8 +284,11 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
282 (unsigned long) count, (unsigned long long) *ppos); 284 (unsigned long) count, (unsigned long long) *ppos);
283 285
284 res = nfs_revalidate_mapping(inode, filp->f_mapping); 286 res = nfs_revalidate_mapping(inode, filp->f_mapping);
285 if (!res) 287 if (!res) {
286 res = generic_file_splice_read(filp, ppos, pipe, count, flags); 288 res = generic_file_splice_read(filp, ppos, pipe, count, flags);
289 if (res > 0)
290 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
291 }
287 return res; 292 return res;
288} 293}
289 294
@@ -486,6 +491,9 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
486{ 491{
487 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 492 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
488 493
494 /* Only do I/O if gfp is a superset of GFP_KERNEL */
495 if ((gfp & GFP_KERNEL) == GFP_KERNEL)
496 nfs_wb_page(page->mapping->host, page);
489 /* If PagePrivate() is set, then the page is not freeable */ 497 /* If PagePrivate() is set, then the page is not freeable */
490 if (PagePrivate(page)) 498 if (PagePrivate(page))
491 return 0; 499 return 0;
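
The comment in the hunk above is precise: releasepage can be called from contexts that must not sleep, so writeback is attempted only when the caller's allocation mask contains every bit of GFP_KERNEL, a superset test rather than a mere overlap test. A tiny demo with illustrative bit values:

    #include <stdio.h>

    #define GFP_WAIT   0x1
    #define GFP_IO     0x2
    #define GFP_FS     0x4
    #define GFP_KERNEL (GFP_WAIT | GFP_IO | GFP_FS)

    static int may_do_io(unsigned gfp)
    {
        return (gfp & GFP_KERNEL) == GFP_KERNEL; /* superset, not overlap */
    }

    int main(void)
    {
        printf("%d\n", may_do_io(GFP_KERNEL));        /* 1: full mask      */
        printf("%d\n", may_do_io(GFP_WAIT | GFP_IO)); /* 0: FS bit missing */
        printf("%d\n", may_do_io(GFP_KERNEL | 0x10)); /* 1: extra bits ok  */
        return 0;
    }
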
@@ -581,7 +589,7 @@ static int nfs_need_sync_write(struct file *filp, struct inode *inode)
581{ 589{
582 struct nfs_open_context *ctx; 590 struct nfs_open_context *ctx;
583 591
584 if (IS_SYNC(inode) || (filp->f_flags & O_SYNC)) 592 if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC))
585 return 1; 593 return 1;
586 ctx = nfs_file_open_context(filp); 594 ctx = nfs_file_open_context(filp);
587 if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) 595 if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags))
@@ -594,6 +602,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
594{ 602{
595 struct dentry * dentry = iocb->ki_filp->f_path.dentry; 603 struct dentry * dentry = iocb->ki_filp->f_path.dentry;
596 struct inode * inode = dentry->d_inode; 604 struct inode * inode = dentry->d_inode;
605 unsigned long written = 0;
597 ssize_t result; 606 ssize_t result;
598 size_t count = iov_length(iov, nr_segs); 607 size_t count = iov_length(iov, nr_segs);
599 608
@@ -620,14 +629,18 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
620 if (!count) 629 if (!count)
621 goto out; 630 goto out;
622 631
623 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
624 result = generic_file_aio_write(iocb, iov, nr_segs, pos); 632 result = generic_file_aio_write(iocb, iov, nr_segs, pos);
625 /* Return error values for O_SYNC and IS_SYNC() */ 633 if (result > 0)
634 written = result;
635
636 /* Return error values for O_DSYNC and IS_SYNC() */
626 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { 637 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
627 int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode); 638 int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
628 if (err < 0) 639 if (err < 0)
629 result = err; 640 result = err;
630 } 641 }
642 if (result > 0)
643 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
631out: 644out:
632 return result; 645 return result;
633 646
@@ -642,6 +655,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
642{ 655{
643 struct dentry *dentry = filp->f_path.dentry; 656 struct dentry *dentry = filp->f_path.dentry;
644 struct inode *inode = dentry->d_inode; 657 struct inode *inode = dentry->d_inode;
658 unsigned long written = 0;
645 ssize_t ret; 659 ssize_t ret;
646 660
647 dprintk("NFS splice_write(%s/%s, %lu@%llu)\n", 661 dprintk("NFS splice_write(%s/%s, %lu@%llu)\n",
@@ -652,14 +666,17 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
652 * The combination of splice and an O_APPEND destination is disallowed. 666 * The combination of splice and an O_APPEND destination is disallowed.
653 */ 667 */
654 668
655 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
656
657 ret = generic_file_splice_write(pipe, filp, ppos, count, flags); 669 ret = generic_file_splice_write(pipe, filp, ppos, count, flags);
670 if (ret > 0)
671 written = ret;
672
658 if (ret >= 0 && nfs_need_sync_write(filp, inode)) { 673 if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
659 int err = nfs_do_fsync(nfs_file_open_context(filp), inode); 674 int err = nfs_do_fsync(nfs_file_open_context(filp), inode);
660 if (err < 0) 675 if (err < 0)
661 ret = err; 676 ret = err;
662 } 677 }
678 if (ret > 0)
679 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
663 return ret; 680 return ret;
664} 681}
665 682
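
Across the read, splice-read, write, and splice-write paths above, the NFSIOS byte counters move from counting the requested size before the transfer to counting the bytes the generic layer actually moved, and only on success. A sketch of the write-side ordering, with a deliberately short write; names are stand-ins:

    #include <stdio.h>

    static long total_written;

    static long do_write(long want) { return want > 100 ? 100 : want; }
    static int  do_fsync(void)      { return 0; }

    static long file_write(long count)
    {
        long result = do_write(count);
        unsigned long written = result > 0 ? (unsigned long)result : 0;

        if (result >= 0 && do_fsync() < 0)
            result = -1;              /* a failed sync fails the whole write */
        if (result > 0)
            total_written += written; /* account only what actually went out */
        return result;
    }

    int main(void)
    {
        file_write(250);
        printf("counted %ld of 250 requested\n", total_written); /* 100 */
        return 0;
    }
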
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index fa588006588d..a6b16ed93229 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -17,6 +17,7 @@
17#include <linux/nfs_fs_sb.h> 17#include <linux/nfs_fs_sb.h>
18#include <linux/in6.h> 18#include <linux/in6.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/slab.h>
20 21
21#include "internal.h" 22#include "internal.h"
22#include "iostat.h" 23#include "iostat.h"
@@ -354,12 +355,11 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode)
354 */ 355 */
355int nfs_fscache_release_page(struct page *page, gfp_t gfp) 356int nfs_fscache_release_page(struct page *page, gfp_t gfp)
356{ 357{
357 struct nfs_inode *nfsi = NFS_I(page->mapping->host);
358 struct fscache_cookie *cookie = nfsi->fscache;
359
360 BUG_ON(!cookie);
361
362 if (PageFsCache(page)) { 358 if (PageFsCache(page)) {
359 struct nfs_inode *nfsi = NFS_I(page->mapping->host);
360 struct fscache_cookie *cookie = nfsi->fscache;
361
362 BUG_ON(!cookie);
363 dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", 363 dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
364 cookie, page, nfsi); 364 cookie, page, nfsi);
365 365
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index faa091865ad0..737128f777f3 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -36,6 +36,7 @@
36#include <linux/vfs.h> 36#include <linux/vfs.h>
37#include <linux/inet.h> 37#include <linux/inet.h>
38#include <linux/nfs_xdr.h> 38#include <linux/nfs_xdr.h>
39#include <linux/slab.h>
39 40
40#include <asm/system.h> 41#include <asm/system.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
@@ -97,22 +98,6 @@ u64 nfs_compat_user_ino64(u64 fileid)
97 return ino; 98 return ino;
98} 99}
99 100
100int nfs_write_inode(struct inode *inode, int sync)
101{
102 int ret;
103
104 if (sync) {
105 ret = filemap_fdatawait(inode->i_mapping);
106 if (ret == 0)
107 ret = nfs_commit_inode(inode, FLUSH_SYNC);
108 } else
109 ret = nfs_commit_inode(inode, 0);
110 if (ret >= 0)
111 return 0;
112 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
113 return ret;
114}
115
116void nfs_clear_inode(struct inode *inode) 101void nfs_clear_inode(struct inode *inode)
117{ 102{
118 /* 103 /*
@@ -130,16 +115,12 @@ void nfs_clear_inode(struct inode *inode)
130 */ 115 */
131int nfs_sync_mapping(struct address_space *mapping) 116int nfs_sync_mapping(struct address_space *mapping)
132{ 117{
133 int ret; 118 int ret = 0;
134 119
135 if (mapping->nrpages == 0) 120 if (mapping->nrpages != 0) {
136 return 0; 121 unmap_mapping_range(mapping, 0, 0, 0);
137 unmap_mapping_range(mapping, 0, 0, 0); 122 ret = nfs_wb_all(mapping->host);
138 ret = filemap_write_and_wait(mapping); 123 }
139 if (ret != 0)
140 goto out;
141 ret = nfs_wb_all(mapping->host);
142out:
143 return ret; 124 return ret;
144} 125}
145 126
@@ -511,17 +492,11 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
511 int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; 492 int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
512 int err; 493 int err;
513 494
514 /* 495 /* Flush out writes to the server in order to update c/mtime. */
515 * Flush out writes to the server in order to update c/mtime.
516 *
517 * Hold the i_mutex to suspend application writes temporarily;
518 * this prevents long-running writing applications from blocking
519 * nfs_wb_nocommit.
520 */
521 if (S_ISREG(inode->i_mode)) { 496 if (S_ISREG(inode->i_mode)) {
522 mutex_lock(&inode->i_mutex); 497 err = filemap_write_and_wait(inode->i_mapping);
523 nfs_wb_nocommit(inode); 498 if (err)
524 mutex_unlock(&inode->i_mutex); 499 goto out;
525 } 500 }
526 501
527 /* 502 /*
@@ -545,6 +520,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
545 generic_fillattr(inode, stat); 520 generic_fillattr(inode, stat);
546 stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); 521 stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
547 } 522 }
523out:
548 return err; 524 return err;
549} 525}
550 526
@@ -574,14 +550,14 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
574 nfs_revalidate_inode(server, inode); 550 nfs_revalidate_inode(server, inode);
575} 551}
576 552
577static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred) 553static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred)
578{ 554{
579 struct nfs_open_context *ctx; 555 struct nfs_open_context *ctx;
580 556
581 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 557 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
582 if (ctx != NULL) { 558 if (ctx != NULL) {
583 ctx->path.dentry = dget(dentry); 559 ctx->path = *path;
584 ctx->path.mnt = mntget(mnt); 560 path_get(&ctx->path);
585 ctx->cred = get_rpccred(cred); 561 ctx->cred = get_rpccred(cred);
586 ctx->state = NULL; 562 ctx->state = NULL;
587 ctx->lockowner = current->files; 563 ctx->lockowner = current->files;
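
alloc_nfs_open_context() now takes a struct path and pins both the dentry and the vfsmount with a single path_get(), replacing the separate dget()/mntget() pair. A mock refcounting sketch of the idea:

    #include <stdio.h>

    struct dentry   { int count; };
    struct vfsmount { int count; };
    struct path     { struct vfsmount *mnt; struct dentry *dentry; };

    static void path_get(struct path *p) { p->mnt->count++; p->dentry->count++; }
    static void path_put(struct path *p) { p->mnt->count--; p->dentry->count--; }

    struct open_ctx { struct path path; };

    static void ctx_init(struct open_ctx *ctx, struct path *p)
    {
        ctx->path = *p;       /* copy the pair ...                   */
        path_get(&ctx->path); /* ... and pin both objects at once    */
    }

    int main(void)
    {
        struct dentry d = { 1 };
        struct vfsmount m = { 1 };
        struct path p = { &m, &d };
        struct open_ctx ctx;

        ctx_init(&ctx, &p);
        printf("dentry=%d mnt=%d\n", d.count, m.count); /* both 2 */
        path_put(&ctx.path);
        return 0;
    }
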
@@ -620,11 +596,6 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
620 __put_nfs_open_context(ctx, 0); 596 __put_nfs_open_context(ctx, 0);
621} 597}
622 598
623static void put_nfs_open_context_sync(struct nfs_open_context *ctx)
624{
625 __put_nfs_open_context(ctx, 1);
626}
627
628/* 599/*
629 * Ensure that mmap has a recent RPC credential for use when writing out 600 * Ensure that mmap has a recent RPC credential for use when writing out
630 * shared pages 601 * shared pages
@@ -671,7 +642,7 @@ static void nfs_file_clear_open_context(struct file *filp)
671 spin_lock(&inode->i_lock); 642 spin_lock(&inode->i_lock);
672 list_move_tail(&ctx->list, &NFS_I(inode)->open_files); 643 list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
673 spin_unlock(&inode->i_lock); 644 spin_unlock(&inode->i_lock);
674 put_nfs_open_context_sync(ctx); 645 __put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1);
675 } 646 }
676} 647}
677 648
@@ -686,7 +657,7 @@ int nfs_open(struct inode *inode, struct file *filp)
686 cred = rpc_lookup_cred(); 657 cred = rpc_lookup_cred();
687 if (IS_ERR(cred)) 658 if (IS_ERR(cred))
688 return PTR_ERR(cred); 659 return PTR_ERR(cred);
689 ctx = alloc_nfs_open_context(filp->f_path.mnt, filp->f_path.dentry, cred); 660 ctx = alloc_nfs_open_context(&filp->f_path, cred);
690 put_rpccred(cred); 661 put_rpccred(cred);
691 if (ctx == NULL) 662 if (ctx == NULL)
692 return -ENOMEM; 663 return -ENOMEM;
@@ -759,7 +730,7 @@ int nfs_attribute_timeout(struct inode *inode)
759{ 730{
760 struct nfs_inode *nfsi = NFS_I(inode); 731 struct nfs_inode *nfsi = NFS_I(inode);
761 732
762 if (nfs_have_delegation(inode, FMODE_READ)) 733 if (nfs_have_delegated_attributes(inode))
763 return 0; 734 return 0;
764 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); 735 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
765} 736}
@@ -779,7 +750,7 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
779 return __nfs_revalidate_inode(server, inode); 750 return __nfs_revalidate_inode(server, inode);
780} 751}
781 752
782static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_space *mapping) 753static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
783{ 754{
784 struct nfs_inode *nfsi = NFS_I(inode); 755 struct nfs_inode *nfsi = NFS_I(inode);
785 756
@@ -800,49 +771,10 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa
800 return 0; 771 return 0;
801} 772}
802 773
803static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
804{
805 int ret = 0;
806
807 mutex_lock(&inode->i_mutex);
808 if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_DATA) {
809 ret = nfs_sync_mapping(mapping);
810 if (ret == 0)
811 ret = nfs_invalidate_mapping_nolock(inode, mapping);
812 }
813 mutex_unlock(&inode->i_mutex);
814 return ret;
815}
816
817/**
818 * nfs_revalidate_mapping_nolock - Revalidate the pagecache
819 * @inode - pointer to host inode
820 * @mapping - pointer to mapping
821 */
822int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *mapping)
823{
824 struct nfs_inode *nfsi = NFS_I(inode);
825 int ret = 0;
826
827 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
828 || nfs_attribute_timeout(inode) || NFS_STALE(inode)) {
829 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
830 if (ret < 0)
831 goto out;
832 }
833 if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
834 ret = nfs_invalidate_mapping_nolock(inode, mapping);
835out:
836 return ret;
837}
838
839/** 774/**
840 * nfs_revalidate_mapping - Revalidate the pagecache 775 * nfs_revalidate_mapping - Revalidate the pagecache
841 * @inode - pointer to host inode 776 * @inode - pointer to host inode
842 * @mapping - pointer to mapping 777 * @mapping - pointer to mapping
843 *
844 * This version of the function will take the inode->i_mutex and attempt to
845 * flush out all dirty data if it needs to invalidate the page cache.
846 */ 778 */
847int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) 779int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
848{ 780{
@@ -1261,8 +1193,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1261 1193
1262 if (fattr->valid & NFS_ATTR_FATTR_MODE) { 1194 if (fattr->valid & NFS_ATTR_FATTR_MODE) {
1263 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { 1195 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
1196 umode_t newmode = inode->i_mode & S_IFMT;
1197 newmode |= fattr->mode & S_IALLUGO;
1198 inode->i_mode = newmode;
1264 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1199 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1265 inode->i_mode = fattr->mode;
1266 } 1200 }
1267 } else if (server->caps & NFS_CAP_MODE) 1201 } else if (server->caps & NFS_CAP_MODE)
1268 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR 1202 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
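
The mode-update hunk above stops copying the server's fattr->mode wholesale into i_mode: the local file-type bits (S_IFMT) are preserved and only the permission bits (S_IALLUGO) are taken from the reply. A compact demonstration of the masked merge:

    #include <stdio.h>

    #define S_IFMT    0170000
    #define S_IFREG   0100000
    #define S_IALLUGO 0007777

    static unsigned merge_mode(unsigned i_mode, unsigned server_mode)
    {
        unsigned newmode = i_mode & S_IFMT;   /* trust the local type ... */
        newmode |= server_mode & S_IALLUGO;   /* ... and the remote perms */
        return newmode;
    }

    int main(void)
    {
        /* A reply with garbage type bits can no longer corrupt i_mode. */
        printf("%o\n", merge_mode(S_IFREG | 0644, 0020755)); /* 100755 */
        return 0;
    }
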
@@ -1418,6 +1352,7 @@ static void init_once(void *foo)
1418 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); 1352 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
1419 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); 1353 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
1420 nfsi->npages = 0; 1354 nfsi->npages = 0;
1355 nfsi->ncommit = 0;
1421 atomic_set(&nfsi->silly_count, 1); 1356 atomic_set(&nfsi->silly_count, 1);
1422 INIT_HLIST_HEAD(&nfsi->silly_list); 1357 INIT_HLIST_HEAD(&nfsi->silly_list);
1423 init_waitqueue_head(&nfsi->waitqueue); 1358 init_waitqueue_head(&nfsi->waitqueue);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e21b1bb9972f..11f82f03c5de 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -30,6 +30,15 @@ static inline int nfs4_has_session(const struct nfs_client *clp)
30 return 0; 30 return 0;
31} 31}
32 32
33static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
34{
35#ifdef CONFIG_NFS_V4_1
36 if (nfs4_has_session(clp))
37 return (clp->cl_session->flags & SESSION4_PERSIST);
38#endif /* CONFIG_NFS_V4_1 */
39 return 0;
40}
41
33struct nfs_clone_mount { 42struct nfs_clone_mount {
34 const struct super_block *sb; 43 const struct super_block *sb;
35 const struct dentry *dentry; 44 const struct dentry *dentry;
@@ -156,6 +165,7 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr
156 165
157/* callback_xdr.c */ 166/* callback_xdr.c */
158extern struct svc_version nfs4_callback_version1; 167extern struct svc_version nfs4_callback_version1;
168extern struct svc_version nfs4_callback_version4;
159 169
160/* pagelist.c */ 170/* pagelist.c */
161extern int __init nfs_init_nfspagecache(void); 171extern int __init nfs_init_nfspagecache(void);
@@ -177,24 +187,14 @@ extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int);
177extern struct rpc_procinfo nfs3_procedures[]; 187extern struct rpc_procinfo nfs3_procedures[];
178extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int); 188extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int);
179 189
180/* nfs4proc.c */
181static inline void nfs4_restart_rpc(struct rpc_task *task,
182 const struct nfs_client *clp)
183{
184#ifdef CONFIG_NFS_V4_1
185 if (nfs4_has_session(clp) &&
186 test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) {
187 rpc_restart_call_prepare(task);
188 return;
189 }
190#endif /* CONFIG_NFS_V4_1 */
191 rpc_restart_call(task);
192}
193
194/* nfs4xdr.c */ 190/* nfs4xdr.c */
195#ifdef CONFIG_NFS_V4 191#ifdef CONFIG_NFS_V4
196extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); 192extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
197#endif 193#endif
194#ifdef CONFIG_NFS_V4_1
195extern const u32 nfs41_maxread_overhead;
196extern const u32 nfs41_maxwrite_overhead;
197#endif
198 198
199/* nfs4proc.c */ 199/* nfs4proc.c */
200#ifdef CONFIG_NFS_V4 200#ifdef CONFIG_NFS_V4
@@ -211,7 +211,7 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
211extern struct workqueue_struct *nfsiod_workqueue; 211extern struct workqueue_struct *nfsiod_workqueue;
212extern struct inode *nfs_alloc_inode(struct super_block *sb); 212extern struct inode *nfs_alloc_inode(struct super_block *sb);
213extern void nfs_destroy_inode(struct inode *); 213extern void nfs_destroy_inode(struct inode *);
214extern int nfs_write_inode(struct inode *,int); 214extern int nfs_write_inode(struct inode *, struct writeback_control *);
215extern void nfs_clear_inode(struct inode *); 215extern void nfs_clear_inode(struct inode *);
216#ifdef CONFIG_NFS_V4 216#ifdef CONFIG_NFS_V4
217extern void nfs4_clear_inode(struct inode *); 217extern void nfs4_clear_inode(struct inode *);
@@ -273,20 +273,6 @@ extern int _nfs4_call_sync_session(struct nfs_server *server,
273 struct nfs4_sequence_res *res, 273 struct nfs4_sequence_res *res,
274 int cache_reply); 274 int cache_reply);
275 275
276#ifdef CONFIG_NFS_V4_1
277extern void nfs41_sequence_free_slot(const struct nfs_client *,
278 struct nfs4_sequence_res *res);
279#endif /* CONFIG_NFS_V4_1 */
280
281static inline void nfs4_sequence_free_slot(const struct nfs_client *clp,
282 struct nfs4_sequence_res *res)
283{
284#ifdef CONFIG_NFS_V4_1
285 if (nfs4_has_session(clp))
286 nfs41_sequence_free_slot(clp, res);
287#endif /* CONFIG_NFS_V4_1 */
288}
289
290/* 276/*
291 * Determine the device name as a string 277 * Determine the device name as a string
292 */ 278 */
@@ -380,3 +366,15 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
380 return ((unsigned long)len + (unsigned long)base + 366 return ((unsigned long)len + (unsigned long)base +
381 PAGE_SIZE - 1) >> PAGE_SHIFT; 367 PAGE_SIZE - 1) >> PAGE_SHIFT;
382} 368}
369
370/*
371 * Helper for restarting RPC calls in the possible presence of NFSv4.1
372 * sessions.
373 */
374static inline void nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp)
375{
376 if (nfs4_has_session(clp))
377 rpc_restart_call_prepare(task);
378 else
379 rpc_restart_call(task);
380}
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index ceda50aad73c..1d8d5c813b01 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -25,13 +25,7 @@ struct nfs_iostats {
25static inline void nfs_inc_server_stats(const struct nfs_server *server, 25static inline void nfs_inc_server_stats(const struct nfs_server *server,
26 enum nfs_stat_eventcounters stat) 26 enum nfs_stat_eventcounters stat)
27{ 27{
28 struct nfs_iostats *iostats; 28 this_cpu_inc(server->io_stats->events[stat]);
29 int cpu;
30
31 cpu = get_cpu();
32 iostats = per_cpu_ptr(server->io_stats, cpu);
33 iostats->events[stat]++;
34 put_cpu();
35} 29}
36 30
37static inline void nfs_inc_stats(const struct inode *inode, 31static inline void nfs_inc_stats(const struct inode *inode,
@@ -44,13 +38,7 @@ static inline void nfs_add_server_stats(const struct nfs_server *server,
44 enum nfs_stat_bytecounters stat, 38 enum nfs_stat_bytecounters stat,
45 unsigned long addend) 39 unsigned long addend)
46{ 40{
47 struct nfs_iostats *iostats; 41 this_cpu_add(server->io_stats->bytes[stat], addend);
48 int cpu;
49
50 cpu = get_cpu();
51 iostats = per_cpu_ptr(server->io_stats, cpu);
52 iostats->bytes[stat] += addend;
53 put_cpu();
54} 42}
55 43
56static inline void nfs_add_stats(const struct inode *inode, 44static inline void nfs_add_stats(const struct inode *inode,
@@ -65,22 +53,16 @@ static inline void nfs_add_fscache_stats(struct inode *inode,
65 enum nfs_stat_fscachecounters stat, 53 enum nfs_stat_fscachecounters stat,
66 unsigned long addend) 54 unsigned long addend)
67{ 55{
68 struct nfs_iostats *iostats; 56 this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend);
69 int cpu;
70
71 cpu = get_cpu();
72 iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu);
73 iostats->fscache[stat] += addend;
74 put_cpu();
75} 57}
76#endif 58#endif
77 59
78static inline struct nfs_iostats *nfs_alloc_iostats(void) 60static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void)
79{ 61{
80 return alloc_percpu(struct nfs_iostats); 62 return alloc_percpu(struct nfs_iostats);
81} 63}
82 64
83static inline void nfs_free_iostats(struct nfs_iostats *stats) 65static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats)
84{ 66{
85 if (stats != NULL) 67 if (stats != NULL)
86 free_percpu(stats); 68 free_percpu(stats);
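
The iostat.h hunks collapse the explicit get_cpu()/per_cpu_ptr()/put_cpu() sequence into this_cpu_add()/this_cpu_inc(), which on many architectures compile to a single preemption-safe per-CPU instruction. A loose single-"CPU" user-space analogue of the before/after shapes; the macro below only fakes the semantics:

    #include <stdio.h>

    struct iostats { unsigned long bytes[2]; };
    static struct iostats percpu[4]; /* one slot per "CPU" */

    static int  get_cpu(void) { return 0; } /* would also disable preemption */
    static void put_cpu(void) { }           /* would re-enable preemption */

    /* Old shape: pin the CPU, index the array, bump, unpin. */
    static void add_old(int stat, unsigned long n)
    {
        int cpu = get_cpu();
        percpu[cpu].bytes[stat] += n;
        put_cpu();
    }

    /* New shape: one macro hides all of the above. */
    #define this_cpu_add(field, n) (percpu[0].field += (n))

    int main(void)
    {
        add_old(0, 512);
        this_cpu_add(bytes[0], 512);
        printf("%lu\n", percpu[0].bytes[0]); /* 1024 */
        return 0;
    }
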
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 0adefc40cc89..59047f8d7d72 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -120,7 +120,7 @@ static struct {
120 { .status = MNT3ERR_INVAL, .errno = -EINVAL, }, 120 { .status = MNT3ERR_INVAL, .errno = -EINVAL, },
121 { .status = MNT3ERR_NAMETOOLONG, .errno = -ENAMETOOLONG, }, 121 { .status = MNT3ERR_NAMETOOLONG, .errno = -ENAMETOOLONG, },
122 { .status = MNT3ERR_NOTSUPP, .errno = -ENOTSUPP, }, 122 { .status = MNT3ERR_NOTSUPP, .errno = -ENOTSUPP, },
123 { .status = MNT3ERR_SERVERFAULT, .errno = -ESERVERFAULT, }, 123 { .status = MNT3ERR_SERVERFAULT, .errno = -EREMOTEIO, },
124}; 124};
125 125
126struct mountres { 126struct mountres {
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 40c766782891..7888cf36022d 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -8,6 +8,7 @@
8 */ 8 */
9 9
10#include <linux/dcache.h> 10#include <linux/dcache.h>
11#include <linux/gfp.h>
11#include <linux/mount.h> 12#include <linux/mount.h>
12#include <linux/namei.h> 13#include <linux/namei.h>
13#include <linux/nfs_fs.h> 14#include <linux/nfs_fs.h>
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 5e078b222b4e..81cf14257916 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -12,7 +12,6 @@
12#include <linux/param.h> 12#include <linux/param.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/slab.h>
16#include <linux/errno.h> 15#include <linux/errno.h>
17#include <linux/string.h> 16#include <linux/string.h>
18#include <linux/in.h> 17#include <linux/in.h>
@@ -699,7 +698,7 @@ static struct {
699 { NFSERR_BAD_COOKIE, -EBADCOOKIE }, 698 { NFSERR_BAD_COOKIE, -EBADCOOKIE },
700 { NFSERR_NOTSUPP, -ENOTSUPP }, 699 { NFSERR_NOTSUPP, -ENOTSUPP },
701 { NFSERR_TOOSMALL, -ETOOSMALL }, 700 { NFSERR_TOOSMALL, -ETOOSMALL },
702 { NFSERR_SERVERFAULT, -ESERVERFAULT }, 701 { NFSERR_SERVERFAULT, -EREMOTEIO },
703 { NFSERR_BADTYPE, -EBADTYPE }, 702 { NFSERR_BADTYPE, -EBADTYPE },
704 { NFSERR_JUKEBOX, -EJUKEBOX }, 703 { NFSERR_JUKEBOX, -EJUKEBOX },
705 { -1, -EIO } 704 { -1, -EIO }
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index bac60515a4b3..d150ae0c5ecd 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -1,4 +1,5 @@
1#include <linux/fs.h> 1#include <linux/fs.h>
2#include <linux/gfp.h>
2#include <linux/nfs.h> 3#include <linux/nfs.h>
3#include <linux/nfs3.h> 4#include <linux/nfs3.h>
4#include <linux/nfs_fs.h> 5#include <linux/nfs_fs.h>
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 3f8881d1a050..e701002694e5 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -10,6 +10,7 @@
10#include <linux/errno.h> 10#include <linux/errno.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/sunrpc/clnt.h> 12#include <linux/sunrpc/clnt.h>
13#include <linux/slab.h>
13#include <linux/nfs.h> 14#include <linux/nfs.h>
14#include <linux/nfs3.h> 15#include <linux/nfs3.h>
15#include <linux/nfs_fs.h> 16#include <linux/nfs_fs.h>
@@ -22,14 +23,14 @@
22 23
23#define NFSDBG_FACILITY NFSDBG_PROC 24#define NFSDBG_FACILITY NFSDBG_PROC
24 25
25/* A wrapper to handle the EJUKEBOX error message */ 26/* A wrapper to handle the EJUKEBOX and EKEYEXPIRED error messages */
26static int 27static int
27nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) 28nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
28{ 29{
29 int res; 30 int res;
30 do { 31 do {
31 res = rpc_call_sync(clnt, msg, flags); 32 res = rpc_call_sync(clnt, msg, flags);
32 if (res != -EJUKEBOX) 33 if (res != -EJUKEBOX && res != -EKEYEXPIRED)
33 break; 34 break;
34 schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); 35 schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
35 res = -ERESTARTSYS; 36 res = -ERESTARTSYS;
@@ -42,9 +43,10 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
42static int 43static int
43nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode) 44nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode)
44{ 45{
45 if (task->tk_status != -EJUKEBOX) 46 if (task->tk_status != -EJUKEBOX && task->tk_status != -EKEYEXPIRED)
46 return 0; 47 return 0;
47 nfs_inc_stats(inode, NFSIOS_DELAY); 48 if (task->tk_status == -EJUKEBOX)
49 nfs_inc_stats(inode, NFSIOS_DELAY);
48 task->tk_status = 0; 50 task->tk_status = 0;
49 rpc_restart_call(task); 51 rpc_restart_call(task);
50 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); 52 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
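
EKEYEXPIRED (likely an expired credential, e.g. a Kerberos ticket) now gets the same retry-with-delay treatment as EJUKEBOX in both the sync wrapper and the async handler, but only the jukebox case still bumps the NFSIOS_DELAY counter. A sketch of the widened retry condition with invented error values:

    #include <stdio.h>

    #define EJUKEBOX    101
    #define EKEYEXPIRED 102

    static int calls;
    static int rpc_call(void) { return ++calls < 3 ? -EKEYEXPIRED : 0; }

    static int rpc_wrapper(void)
    {
        int res, delays = 0;
        do {
            res = rpc_call();
            if (res != -EJUKEBOX && res != -EKEYEXPIRED)
                break;
            if (res == -EJUKEBOX)
                delays++;   /* NFSIOS_DELAY only for the jukebox case */
            /* schedule_timeout_killable(...) would sleep here */
        } while (1);
        printf("res=%d jukebox-delays=%d after %d calls\n", res, delays, calls);
        return res;
    }

    int main(void) { return rpc_wrapper(); }
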
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 5fe5492fbd29..56a86f6ac8b5 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -9,7 +9,6 @@
9#include <linux/param.h> 9#include <linux/param.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/slab.h>
13#include <linux/errno.h> 12#include <linux/errno.h>
14#include <linux/string.h> 13#include <linux/string.h>
15#include <linux/in.h> 14#include <linux/in.h>
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 6ea07a3c75d4..a187200a7aac 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,7 +44,9 @@ enum nfs4_client_state {
44 NFS4CLNT_RECLAIM_REBOOT, 44 NFS4CLNT_RECLAIM_REBOOT,
45 NFS4CLNT_RECLAIM_NOGRACE, 45 NFS4CLNT_RECLAIM_NOGRACE,
46 NFS4CLNT_DELEGRETURN, 46 NFS4CLNT_DELEGRETURN,
47 NFS4CLNT_SESSION_SETUP, 47 NFS4CLNT_SESSION_RESET,
48 NFS4CLNT_SESSION_DRAINING,
49 NFS4CLNT_RECALL_SLOT,
48}; 50};
49 51
50/* 52/*
@@ -107,6 +109,10 @@ enum {
107 NFS_OWNER_RECLAIM_NOGRACE 109 NFS_OWNER_RECLAIM_NOGRACE
108}; 110};
109 111
112#define NFS_LOCK_NEW 0
113#define NFS_LOCK_RECLAIM 1
114#define NFS_LOCK_EXPIRED 2
115
110/* 116/*
111 * struct nfs4_state maintains the client-side state for a given 117 * struct nfs4_state maintains the client-side state for a given
112 * (state_owner,inode) tuple (OPEN) or state_owner (LOCK). 118 * (state_owner,inode) tuple (OPEN) or state_owner (LOCK).
@@ -141,6 +147,7 @@ enum {
141 NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */ 147 NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */
142 NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */ 148 NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */
143 NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */ 149 NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */
150 NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */
144}; 151};
145 152
146struct nfs4_state { 153struct nfs4_state {
@@ -180,6 +187,7 @@ struct nfs4_state_recovery_ops {
 	int (*recover_lock)(struct nfs4_state *, struct file_lock *);
 	int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
 	struct rpc_cred * (*get_clid_cred)(struct nfs_client *);
+	int (*reclaim_complete)(struct nfs_client *);
 };
 
 struct nfs4_state_maintenance_ops {
@@ -200,9 +208,11 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
 /* nfs4proc.c */
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *);
 extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *);
+extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
 extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
+extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait);
 extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
 extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
@@ -218,9 +228,11 @@ extern int nfs4_setup_sequence(struct nfs_client *clp,
 		int cache_reply, struct rpc_task *task);
 extern void nfs4_destroy_session(struct nfs4_session *session);
 extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
-extern int nfs4_proc_create_session(struct nfs_client *, int reset);
+extern int nfs4_proc_create_session(struct nfs_client *);
 extern int nfs4_proc_destroy_session(struct nfs4_session *);
 extern int nfs4_init_session(struct nfs_server *server);
+extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
+		struct nfs_fsinfo *fsinfo);
 #else /* CONFIG_NFS_v4_1 */
 static inline int nfs4_setup_sequence(struct nfs_client *clp,
 		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
@@ -267,6 +279,9 @@ extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
 extern void nfs4_schedule_state_recovery(struct nfs_client *);
 extern void nfs4_schedule_state_manager(struct nfs_client *);
 extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state);
+extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state);
+extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
+extern void nfs41_handle_recall_slot(struct nfs_client *clp);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
 extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
 extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
@@ -275,6 +290,7 @@ extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter);
 extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
 extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
 extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
+extern void nfs_release_seqid(struct nfs_seqid *seqid);
 extern void nfs_free_seqid(struct nfs_seqid *seqid);
 
 extern const nfs4_stateid zero_stateid;
@@ -287,6 +303,7 @@ struct nfs4_mount_data;
 
 /* callback_xdr.c */
 extern struct svc_version nfs4_callback_version1;
+extern struct svc_version nfs4_callback_version4;
 
 #else
 
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index fa3408f20112..f071d12c613b 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -11,6 +11,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/nfs_fs.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/vfs.h>
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 741a562177fc..d79a7b37e56c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -39,6 +39,7 @@
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/nfs.h>
 #include <linux/nfs4.h>
@@ -64,6 +65,7 @@
 
 struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
+static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
 static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
 static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
@@ -248,19 +250,15 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 		if (state == NULL)
 			break;
 		nfs4_state_mark_reclaim_nograce(clp, state);
-	case -NFS4ERR_STALE_CLIENTID:
+		goto do_state_recovery;
 	case -NFS4ERR_STALE_STATEID:
-	case -NFS4ERR_EXPIRED:
-		nfs4_schedule_state_recovery(clp);
-		ret = nfs4_wait_clnt_recover(clp);
-		if (ret == 0)
-			exception->retry = 1;
-#if !defined(CONFIG_NFS_V4_1)
-		break;
-#else /* !defined(CONFIG_NFS_V4_1) */
-		if (!nfs4_has_session(server->nfs_client))
+		if (state == NULL)
 			break;
-		/* FALLTHROUGH */
+		nfs4_state_mark_reclaim_reboot(clp, state);
+	case -NFS4ERR_STALE_CLIENTID:
+	case -NFS4ERR_EXPIRED:
+		goto do_state_recovery;
+#if defined(CONFIG_NFS_V4_1)
 	case -NFS4ERR_BADSESSION:
 	case -NFS4ERR_BADSLOT:
 	case -NFS4ERR_BAD_HIGH_SLOT:
@@ -270,13 +268,21 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 	case -NFS4ERR_SEQ_MISORDERED:
 		dprintk("%s ERROR: %d Reset session\n", __func__,
 			errorcode);
-		set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
+		nfs4_schedule_state_recovery(clp);
 		exception->retry = 1;
-		/* FALLTHROUGH */
-#endif /* !defined(CONFIG_NFS_V4_1) */
+		break;
+#endif /* defined(CONFIG_NFS_V4_1) */
 	case -NFS4ERR_FILE_OPEN:
+		if (exception->timeout > HZ) {
+			/* We have retried a decent amount, time to
+			 * fail
+			 */
+			ret = -EBUSY;
+			break;
+		}
 	case -NFS4ERR_GRACE:
 	case -NFS4ERR_DELAY:
+	case -EKEYEXPIRED:
 		ret = nfs4_delay(server->client, &exception->timeout);
 		if (ret != 0)
 			break;
@@ -285,6 +291,12 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 	}
 	/* We failed to handle the error */
 	return nfs4_map_errors(ret);
+do_state_recovery:
+	nfs4_schedule_state_recovery(clp);
+	ret = nfs4_wait_clnt_recover(clp);
+	if (ret == 0)
+		exception->retry = 1;
+	return ret;
 }
 
 
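The rework above replaces per-case copies of the schedule-recovery-and-wait sequence with one shared do_state_recovery tail reached by goto. A standalone sketch of that control-flow shape, assuming hypothetical helpers handle_error(), recover_state() and wait_for_recovery() in place of the kernel's nfs4_* functions:

#include <stdio.h>

enum { ERR_BAD_STATEID = 1, ERR_STALE_CLIENTID, ERR_EXPIRED, ERR_OTHER };

static void recover_state(void)     { puts("recovering state"); }
static int  wait_for_recovery(void) { return 0; }  /* 0 = recovered */

static int handle_error(int err, int *retry)
{
	int ret = err;

	switch (err) {
	case ERR_BAD_STATEID:
	case ERR_STALE_CLIENTID:
	case ERR_EXPIRED:
		goto do_state_recovery;  /* one shared tail, no duplication */
	default:
		break;                   /* unhandled: caller sees the error */
	}
	return ret;
do_state_recovery:
	recover_state();
	ret = wait_for_recovery();
	if (ret == 0)
		*retry = 1;              /* like exception->retry = 1 */
	return ret;
}

int main(void)
{
	int retry = 0;
	int ret = handle_error(ERR_EXPIRED, &retry);

	printf("ret=%d retry=%d\n", ret, retry);
	return 0;
}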
@@ -311,48 +323,67 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
  * so we need to scan down from highest_used_slotid to 0 looking for the now
  * highest slotid in use.
  * If none found, highest_used_slotid is set to -1.
+ *
+ * Must be called while holding tbl->slot_tbl_lock
  */
 static void
 nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
 {
 	int slotid = free_slotid;
 
-	spin_lock(&tbl->slot_tbl_lock);
 	/* clear used bit in bitmap */
 	__clear_bit(slotid, tbl->used_slots);
 
 	/* update highest_used_slotid when it is freed */
 	if (slotid == tbl->highest_used_slotid) {
 		slotid = find_last_bit(tbl->used_slots, tbl->max_slots);
-		if (slotid >= 0 && slotid < tbl->max_slots)
+		if (slotid < tbl->max_slots)
 			tbl->highest_used_slotid = slotid;
 		else
 			tbl->highest_used_slotid = -1;
 	}
-	rpc_wake_up_next(&tbl->slot_tbl_waitq);
-	spin_unlock(&tbl->slot_tbl_lock);
 	dprintk("%s: free_slotid %u highest_used_slotid %d\n", __func__,
 		free_slotid, tbl->highest_used_slotid);
 }
 
-void nfs41_sequence_free_slot(const struct nfs_client *clp,
-			      struct nfs4_sequence_res *res)
+/*
+ * Signal state manager thread if session is drained
+ */
+static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
 {
-	struct nfs4_slot_table *tbl;
+	struct rpc_task *task;
 
-	if (!nfs4_has_session(clp)) {
-		dprintk("%s: No session\n", __func__);
+	if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) {
+		task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq);
+		if (task)
+			rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
 		return;
 	}
+
+	if (ses->fc_slot_table.highest_used_slotid != -1)
+		return;
+
+	dprintk("%s COMPLETE: Session Drained\n", __func__);
+	complete(&ses->complete);
+}
+
+static void nfs41_sequence_free_slot(const struct nfs_client *clp,
+			struct nfs4_sequence_res *res)
+{
+	struct nfs4_slot_table *tbl;
+
 	tbl = &clp->cl_session->fc_slot_table;
 	if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) {
-		dprintk("%s: No slot\n", __func__);
 		/* just wake up the next guy waiting since
 		 * we may have not consumed a slot after all */
-		rpc_wake_up_next(&tbl->slot_tbl_waitq);
+		dprintk("%s: No slot\n", __func__);
 		return;
 	}
+
+	spin_lock(&tbl->slot_tbl_lock);
 	nfs4_free_slot(tbl, res->sr_slotid);
+	nfs41_check_drain_session_complete(clp->cl_session);
+	spin_unlock(&tbl->slot_tbl_lock);
 	res->sr_slotid = NFS4_MAX_SLOT_TABLE;
 }
 
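With the locking hoisted to the caller, nfs4_free_slot() is pure bookkeeping: clear the slot's bit and, if it was the highest slot in use, scan down for the new highest; nfs41_check_drain_session_complete() then completes the drain once highest_used_slotid reaches -1. A self-contained model of that bookkeeping, where a single 64-bit word stands in for tbl->used_slots and the downward scan plays the role of find_last_bit():

#include <stdint.h>
#include <stdio.h>

struct slot_table {
	uint64_t used;        /* bit n set => slot n busy */
	int highest_used;     /* -1 when the table is drained */
};

static void free_slot(struct slot_table *tbl, int slotid)
{
	tbl->used &= ~(UINT64_C(1) << slotid);
	if (slotid == tbl->highest_used) {
		int i;

		for (i = slotid; i >= 0; i--)   /* like find_last_bit() */
			if (tbl->used & (UINT64_C(1) << i))
				break;
		tbl->highest_used = i;          /* -1 if none left */
	}
}

int main(void)
{
	struct slot_table tbl = { .used = 0x0b, .highest_used = 3 }; /* 0,1,3 */

	free_slot(&tbl, 3);
	printf("highest_used=%d\n", tbl.highest_used);  /* 1 */
	free_slot(&tbl, 1);
	free_slot(&tbl, 0);
	printf("highest_used=%d\n", tbl.highest_used);  /* -1: drained */
	return 0;
}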
@@ -377,10 +408,10 @@ static void nfs41_sequence_done(struct nfs_client *clp,
 	if (res->sr_slotid == NFS4_MAX_SLOT_TABLE)
 		goto out;
 
-	tbl = &clp->cl_session->fc_slot_table;
-	slot = tbl->slots + res->sr_slotid;
-
+	/* Check the SEQUENCE operation status */
 	if (res->sr_status == 0) {
+		tbl = &clp->cl_session->fc_slot_table;
+		slot = tbl->slots + res->sr_slotid;
 		/* Update the slot's sequence and clientid lease timer */
 		++slot->seq_nr;
 		timestamp = res->sr_renewal_time;
@@ -388,7 +419,9 @@ static void nfs41_sequence_done(struct nfs_client *clp,
 		if (time_before(clp->cl_last_renewal, timestamp))
 			clp->cl_last_renewal = timestamp;
 		spin_unlock(&clp->cl_lock);
-		return;
+		/* Check sequence flags */
+		if (atomic_read(&clp->cl_count) > 1)
+			nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
 	}
 out:
 	/* The session may be reset by one of the error handlers. */
@@ -407,7 +440,7 @@ out:
  * Note: must be called with under the slot_tbl_lock.
  */
 static u8
-nfs4_find_slot(struct nfs4_slot_table *tbl, struct rpc_task *task)
+nfs4_find_slot(struct nfs4_slot_table *tbl)
 {
 	int slotid;
 	u8 ret_id = NFS4_MAX_SLOT_TABLE;
@@ -429,24 +462,6 @@ out:
 	return ret_id;
 }
 
-static int nfs4_recover_session(struct nfs4_session *session)
-{
-	struct nfs_client *clp = session->clp;
-	unsigned int loop;
-	int ret;
-
-	for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
-		ret = nfs4_wait_clnt_recover(clp);
-		if (ret != 0)
-			break;
-		if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state))
-			break;
-		nfs4_schedule_state_manager(clp);
-		ret = -EIO;
-	}
-	return ret;
-}
-
 static int nfs41_setup_sequence(struct nfs4_session *session,
 				struct nfs4_sequence_args *args,
 				struct nfs4_sequence_res *res,
@@ -455,7 +470,6 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
 {
 	struct nfs4_slot *slot;
 	struct nfs4_slot_table *tbl;
-	int status = 0;
 	u8 slotid;
 
 	dprintk("--> %s\n", __func__);
@@ -468,24 +482,27 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
 	tbl = &session->fc_slot_table;
 
 	spin_lock(&tbl->slot_tbl_lock);
-	if (test_bit(NFS4CLNT_SESSION_SETUP, &session->clp->cl_state)) {
-		if (tbl->highest_used_slotid != -1) {
-			rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
-			spin_unlock(&tbl->slot_tbl_lock);
-			dprintk("<-- %s: Session reset: draining\n", __func__);
-			return -EAGAIN;
-		}
+	if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) &&
+	    !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
+		/*
+		 * The state manager will wait until the slot table is empty.
+		 * Schedule the reset thread
+		 */
+		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
+		spin_unlock(&tbl->slot_tbl_lock);
+		dprintk("%s Schedule Session Reset\n", __func__);
+		return -EAGAIN;
+	}
 
-	/* The slot table is empty; start the reset thread */
-	dprintk("%s Session Reset\n", __func__);
+	if (!rpc_queue_empty(&tbl->slot_tbl_waitq) &&
+	    !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
+		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
 		spin_unlock(&tbl->slot_tbl_lock);
-		status = nfs4_recover_session(session);
-		if (status)
-			return status;
-		spin_lock(&tbl->slot_tbl_lock);
+		dprintk("%s enforce FIFO order\n", __func__);
+		return -EAGAIN;
 	}
 
-	slotid = nfs4_find_slot(tbl, task);
+	slotid = nfs4_find_slot(tbl);
 	if (slotid == NFS4_MAX_SLOT_TABLE) {
 		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
 		spin_unlock(&tbl->slot_tbl_lock);
@@ -494,6 +511,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
 	}
 	spin_unlock(&tbl->slot_tbl_lock);
 
+	rpc_task_set_priority(task, RPC_PRIORITY_NORMAL);
 	slot = tbl->slots + slotid;
 	args->sa_session = session;
 	args->sa_slotid = slotid;
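nfs41_setup_sequence() now has two gates in front of slot allocation: non-privileged requests are parked while the session drains, and also whenever earlier waiters are already queued, so slots are handed out in FIFO order; only privileged (recovery) tasks bypass both. A toy predicate capturing that decision, with plain booleans standing in for the cl_state bit, the wait queue and the RPC task priority:

#include <stdbool.h>
#include <stdio.h>

/* Decide whether a request may try to grab a slot right now. */
static bool may_take_slot(bool session_draining, bool waiters_queued,
			  bool privileged)
{
	if (session_draining && !privileged)
		return false;   /* drain gate: let the table empty out */
	if (waiters_queued && !privileged)
		return false;   /* FIFO gate: don't jump the queue */
	return true;
}

int main(void)
{
	printf("%d\n", may_take_slot(true,  false, false)); /* 0: draining */
	printf("%d\n", may_take_slot(true,  false, true));  /* 1: recovery op */
	printf("%d\n", may_take_slot(false, true,  false)); /* 0: FIFO order */
	printf("%d\n", may_take_slot(false, false, false)); /* 1 */
	return 0;
}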
@@ -527,7 +545,7 @@ int nfs4_setup_sequence(struct nfs_client *clp,
 		goto out;
 	ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply,
 				   task);
-	if (ret != -EAGAIN) {
+	if (ret && ret != -EAGAIN) {
 		/* terminate rpc task */
 		task->tk_status = ret;
 		task->tk_action = NULL;
@@ -556,12 +574,17 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
 	rpc_call_start(task);
 }
 
+static void nfs41_call_priv_sync_prepare(struct rpc_task *task, void *calldata)
+{
+	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
+	nfs41_call_sync_prepare(task, calldata);
+}
+
 static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs41_call_sync_data *data = calldata;
 
 	nfs41_sequence_done(data->clp, data->seq_res, task->tk_status);
-	nfs41_sequence_free_slot(data->clp, data->seq_res);
 }
 
 struct rpc_call_ops nfs41_call_sync_ops = {
@@ -569,12 +592,18 @@ struct rpc_call_ops nfs41_call_sync_ops = {
 	.rpc_call_done = nfs41_call_sync_done,
 };
 
+struct rpc_call_ops nfs41_call_priv_sync_ops = {
+	.rpc_call_prepare = nfs41_call_priv_sync_prepare,
+	.rpc_call_done = nfs41_call_sync_done,
+};
+
 static int nfs4_call_sync_sequence(struct nfs_client *clp,
 				   struct rpc_clnt *clnt,
 				   struct rpc_message *msg,
 				   struct nfs4_sequence_args *args,
 				   struct nfs4_sequence_res *res,
-				   int cache_reply)
+				   int cache_reply,
+				   int privileged)
 {
 	int ret;
 	struct rpc_task *task;
@@ -592,6 +621,8 @@ static int nfs4_call_sync_sequence(struct nfs_client *clp,
 	};
 
 	res->sr_slotid = NFS4_MAX_SLOT_TABLE;
+	if (privileged)
+		task_setup.callback_ops = &nfs41_call_priv_sync_ops;
 	task = rpc_run_task(&task_setup);
 	if (IS_ERR(task))
 		ret = PTR_ERR(task);
@@ -609,7 +640,7 @@ int _nfs4_call_sync_session(struct nfs_server *server,
 			   int cache_reply)
 {
 	return nfs4_call_sync_sequence(server->nfs_client, server->client,
-				       msg, args, res, cache_reply);
+				       msg, args, res, cache_reply, 0);
 }
 
 #endif /* CONFIG_NFS_V4_1 */
@@ -637,15 +668,6 @@ static void nfs4_sequence_done(const struct nfs_server *server,
 #endif /* CONFIG_NFS_V4_1 */
 }
 
-/* no restart, therefore free slot here */
-static void nfs4_sequence_done_free_slot(const struct nfs_server *server,
-					 struct nfs4_sequence_res *res,
-					 int rpc_status)
-{
-	nfs4_sequence_done(server, res, rpc_status);
-	nfs4_sequence_free_slot(server->nfs_client, res);
-}
-
 static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 {
 	struct nfs_inode *nfsi = NFS_I(dir);
@@ -705,8 +727,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
 	p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid);
 	if (p->o_arg.seqid == NULL)
 		goto err_free;
-	p->path.mnt = mntget(path->mnt);
-	p->path.dentry = dget(path->dentry);
+	path_get(path);
+	p->path = *path;
 	p->dir = parent;
 	p->owner = sp;
 	atomic_inc(&sp->so_count);
@@ -720,9 +742,15 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
 	p->o_arg.bitmask = server->attr_bitmask;
 	p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
 	if (flags & O_EXCL) {
-		u32 *s = (u32 *) p->o_arg.u.verifier.data;
-		s[0] = jiffies;
-		s[1] = current->pid;
+		if (nfs4_has_persistent_session(server->nfs_client)) {
+			/* GUARDED */
+			p->o_arg.u.attrs = &p->attrs;
+			memcpy(&p->attrs, attrs, sizeof(p->attrs));
+		} else { /* EXCLUSIVE4_1 */
+			u32 *s = (u32 *) p->o_arg.u.verifier.data;
+			s[0] = jiffies;
+			s[1] = current->pid;
+		}
 	} else if (flags & O_CREAT) {
 		p->o_arg.u.attrs = &p->attrs;
 		memcpy(&p->attrs, attrs, sizeof(p->attrs));
@@ -776,13 +804,16 @@ static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode
 		goto out;
 	switch (mode & (FMODE_READ|FMODE_WRITE)) {
 	case FMODE_READ:
-		ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0;
+		ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0
+			&& state->n_rdonly != 0;
 		break;
 	case FMODE_WRITE:
-		ret |= test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0;
+		ret |= test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0
+			&& state->n_wronly != 0;
 		break;
 	case FMODE_READ|FMODE_WRITE:
-		ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0;
+		ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0
+			&& state->n_rdwr != 0;
 	}
 out:
 	return ret;
@@ -1047,7 +1078,7 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod
 	memset(&opendata->o_res, 0, sizeof(opendata->o_res));
 	memset(&opendata->c_res, 0, sizeof(opendata->c_res));
 	nfs4_init_opendata_res(opendata);
-	ret = _nfs4_proc_open(opendata);
+	ret = _nfs4_recover_proc_open(opendata);
 	if (ret != 0)
 		return ret;
 	newstate = nfs4_opendata_to_nfs4_state(opendata);
@@ -1135,7 +1166,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
 	int err;
 	do {
 		err = _nfs4_do_open_reclaim(ctx, state);
-		if (err != -NFS4ERR_DELAY)
+		if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED)
 			break;
 		nfs4_handle_exception(server, err, &exception);
 	} while (exception.retry);
@@ -1183,6 +1214,14 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
 			case -ENOENT:
 			case -ESTALE:
 				goto out;
+			case -NFS4ERR_BADSESSION:
+			case -NFS4ERR_BADSLOT:
+			case -NFS4ERR_BAD_HIGH_SLOT:
+			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+			case -NFS4ERR_DEADSESSION:
+				nfs4_schedule_state_recovery(
+					server->nfs_client);
+				goto out;
 			case -NFS4ERR_STALE_CLIENTID:
 			case -NFS4ERR_STALE_STATEID:
 			case -NFS4ERR_EXPIRED:
@@ -1330,14 +1369,20 @@ out_no_action:
 
 }
 
+static void nfs4_recover_open_prepare(struct rpc_task *task, void *calldata)
+{
+	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
+	nfs4_open_prepare(task, calldata);
+}
+
 static void nfs4_open_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_opendata *data = calldata;
 
 	data->rpc_status = task->tk_status;
 
-	nfs4_sequence_done_free_slot(data->o_arg.server, &data->o_res.seq_res,
+	nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res,
 			task->tk_status);
 
 	if (RPC_ASSASSINATED(task))
 		return;
@@ -1388,10 +1433,13 @@ static const struct rpc_call_ops nfs4_open_ops = {
 	.rpc_release = nfs4_open_release,
 };
 
-/*
- * Note: On error, nfs4_proc_open will free the struct nfs4_opendata
- */
-static int _nfs4_proc_open(struct nfs4_opendata *data)
+static const struct rpc_call_ops nfs4_recover_open_ops = {
+	.rpc_call_prepare = nfs4_recover_open_prepare,
+	.rpc_call_done = nfs4_open_done,
+	.rpc_release = nfs4_open_release,
+};
+
+static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
 {
 	struct inode *dir = data->dir->d_inode;
 	struct nfs_server *server = NFS_SERVER(dir);
@@ -1418,21 +1466,57 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 	data->rpc_done = 0;
 	data->rpc_status = 0;
 	data->cancelled = 0;
+	if (isrecover)
+		task_setup_data.callback_ops = &nfs4_recover_open_ops;
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
 	status = nfs4_wait_for_completion_rpc_task(task);
 	if (status != 0) {
 		data->cancelled = 1;
 		smp_wmb();
 	} else
 		status = data->rpc_status;
 	rpc_put_task(task);
+
+	return status;
+}
+
+static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
+{
+	struct inode *dir = data->dir->d_inode;
+	struct nfs_openres *o_res = &data->o_res;
+	int status;
+
+	status = nfs4_run_open_task(data, 1);
 	if (status != 0 || !data->rpc_done)
 		return status;
 
-	if (o_res->fh.size == 0)
-		_nfs4_proc_lookup(dir, o_arg->name, &o_res->fh, o_res->f_attr);
+	nfs_refresh_inode(dir, o_res->dir_attr);
+
+	if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
+		status = _nfs4_proc_open_confirm(data);
+		if (status != 0)
+			return status;
+	}
+
+	return status;
+}
+
+/*
+ * Note: On error, nfs4_proc_open will free the struct nfs4_opendata
+ */
+static int _nfs4_proc_open(struct nfs4_opendata *data)
+{
+	struct inode *dir = data->dir->d_inode;
+	struct nfs_server *server = NFS_SERVER(dir);
+	struct nfs_openargs *o_arg = &data->o_arg;
+	struct nfs_openres *o_res = &data->o_res;
+	int status;
+
+	status = nfs4_run_open_task(data, 0);
+	if (status != 0 || !data->rpc_done)
+		return status;
 
 	if (o_arg->open_flags & O_CREAT) {
 		update_changeattr(dir, &o_res->cinfo);
@@ -1488,7 +1572,7 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s
 	return ret;
 }
 
-static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state)
+static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
 	struct nfs4_exception exception = { };
@@ -1496,10 +1580,17 @@ static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4
 
 	do {
 		err = _nfs4_open_expired(ctx, state);
-		if (err != -NFS4ERR_DELAY)
-			break;
-		nfs4_handle_exception(server, err, &exception);
+		switch (err) {
+		default:
+			goto out;
+		case -NFS4ERR_GRACE:
+		case -NFS4ERR_DELAY:
+		case -EKEYEXPIRED:
+			nfs4_handle_exception(server, err, &exception);
+			err = 0;
+		}
 	} while (exception.retry);
+out:
 	return err;
 }
 
@@ -1573,6 +1664,8 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
 	status = PTR_ERR(state);
 	if (IS_ERR(state))
 		goto err_opendata_put;
+	if ((opendata->o_res.rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) != 0)
+		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
 	nfs4_opendata_put(opendata);
 	nfs4_put_state_owner(sp);
 	*res = state;
@@ -1712,6 +1805,18 @@ static void nfs4_free_closedata(void *data)
 	kfree(calldata);
 }
 
+static void nfs4_close_clear_stateid_flags(struct nfs4_state *state,
+		fmode_t fmode)
+{
+	spin_lock(&state->owner->so_lock);
+	if (!(fmode & FMODE_READ))
+		clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+	if (!(fmode & FMODE_WRITE))
+		clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+	clear_bit(NFS_O_RDWR_STATE, &state->flags);
+	spin_unlock(&state->owner->so_lock);
+}
+
 static void nfs4_close_done(struct rpc_task *task, void *data)
 {
 	struct nfs4_closedata *calldata = data;
@@ -1728,6 +1833,8 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 		case 0:
 			nfs_set_open_stateid(state, &calldata->res.stateid, 0);
 			renew_lease(server, calldata->timestamp);
+			nfs4_close_clear_stateid_flags(state,
+					calldata->arg.fmode);
 			break;
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_OLD_STATEID:
@@ -1736,12 +1843,10 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 			if (calldata->arg.fmode == 0)
 				break;
 		default:
-			if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
-				nfs4_restart_rpc(task, server->nfs_client);
-				return;
-			}
+			if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
+				rpc_restart_call_prepare(task);
 	}
-	nfs4_sequence_free_slot(server->nfs_client, &calldata->res.seq_res);
+	nfs_release_seqid(calldata->arg.seqid);
 	nfs_refresh_inode(calldata->inode, calldata->res.fattr);
 }
 
@@ -1749,38 +1854,39 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 {
 	struct nfs4_closedata *calldata = data;
 	struct nfs4_state *state = calldata->state;
-	int clear_rd, clear_wr, clear_rdwr;
+	int call_close = 0;
 
 	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
 		return;
 
-	clear_rd = clear_wr = clear_rdwr = 0;
+	task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
+	calldata->arg.fmode = FMODE_READ|FMODE_WRITE;
 	spin_lock(&state->owner->so_lock);
 	/* Calculate the change in open mode */
 	if (state->n_rdwr == 0) {
 		if (state->n_rdonly == 0) {
-			clear_rd |= test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags);
-			clear_rdwr |= test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags);
+			call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags);
+			call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags);
+			calldata->arg.fmode &= ~FMODE_READ;
 		}
 		if (state->n_wronly == 0) {
-			clear_wr |= test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags);
-			clear_rdwr |= test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags);
+			call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags);
+			call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags);
+			calldata->arg.fmode &= ~FMODE_WRITE;
 		}
 	}
 	spin_unlock(&state->owner->so_lock);
-	if (!clear_rd && !clear_wr && !clear_rdwr) {
+
+	if (!call_close) {
 		/* Note: exit _without_ calling nfs4_close_done */
 		task->tk_action = NULL;
 		return;
 	}
+
+	if (calldata->arg.fmode == 0)
+		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
+
 	nfs_fattr_init(calldata->res.fattr);
-	if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) {
-		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
-		calldata->arg.fmode = FMODE_READ;
-	} else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) {
-		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
-		calldata->arg.fmode = FMODE_WRITE;
-	}
 	calldata->timestamp = jiffies;
 	if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client,
 				&calldata->arg.seq_args, &calldata->res.seq_res,
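The rewritten nfs4_close_prepare() starts from FMODE_READ|FMODE_WRITE and subtracts whatever no longer has open references; if any referenced mode still has its state bit set the task proceeds, and an fmode of zero turns the OPEN_DOWNGRADE into a full CLOSE. A compact model of that decision, with pick_op() as an illustrative stand-in rather than kernel code:

#include <stdio.h>

#define FMODE_READ  0x1
#define FMODE_WRITE 0x2

/* Returns the RPC op to send and fills *fmode with the remaining mode. */
static const char *pick_op(int n_rdonly, int n_wronly, int n_rdwr, int *fmode)
{
	*fmode = FMODE_READ | FMODE_WRITE;
	if (n_rdwr == 0) {
		if (n_rdonly == 0)
			*fmode &= ~FMODE_READ;   /* no readers left */
		if (n_wronly == 0)
			*fmode &= ~FMODE_WRITE;  /* no writers left */
	}
	return *fmode == 0 ? "CLOSE" : "OPEN_DOWNGRADE";
}

int main(void)
{
	int fmode;
	const char *op;

	op = pick_op(0, 0, 0, &fmode);          /* nothing referenced */
	printf("%s (fmode=%d)\n", op, fmode);   /* CLOSE (fmode=0) */

	op = pick_op(2, 0, 0, &fmode);          /* only readers remain */
	printf("%s (fmode=%d)\n", op, fmode);   /* OPEN_DOWNGRADE (fmode=1) */
	return 0;
}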
@@ -1832,8 +1938,6 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 	calldata->state = state;
 	calldata->arg.fh = NFS_FH(state->inode);
 	calldata->arg.stateid = &state->open_stateid;
-	if (nfs4_has_session(server->nfs_client))
-		memset(calldata->arg.stateid->data, 0, 4);    /* clear seqid */
 	/* Serialization for the sequence id */
 	calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid);
 	if (calldata->arg.seqid == NULL)
@@ -1844,8 +1948,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 	calldata->res.seqid = calldata->arg.seqid;
 	calldata->res.server = server;
 	calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
-	calldata->path.mnt = mntget(path->mnt);
-	calldata->path.dentry = dget(path->dentry);
+	path_get(path);
+	calldata->path = *path;
 
 	msg.rpc_argp = &calldata->arg,
 	msg.rpc_resp = &calldata->res,
@@ -1981,7 +2085,7 @@ out_drop:
 	return 0;
 }
 
-void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
+static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
 {
 	if (ctx->state == NULL)
 		return;
@@ -2532,7 +2636,6 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 	nfs4_sequence_done(res->server, &res->seq_res, task->tk_status);
 	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
 		return 0;
-	nfs4_sequence_free_slot(res->server->nfs_client, &res->seq_res);
 	update_changeattr(dir, &res->cinfo);
 	nfs_post_op_update_inode(dir, &res->dir_attr);
 	return 1;
@@ -2971,11 +3074,10 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
 
 	dprintk("--> %s\n", __func__);
 
-	/* nfs4_sequence_free_slot called in the read rpc_call_done */
 	nfs4_sequence_done(server, &data->res.seq_res, task->tk_status);
 
 	if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
-		nfs4_restart_rpc(task, server->nfs_client);
+		nfs_restart_rpc(task, server->nfs_client);
 		return -EAGAIN;
 	}
 
@@ -2995,12 +3097,11 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	struct inode *inode = data->inode;
 
-	/* slot is freed in nfs_writeback_done */
 	nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
 			   task->tk_status);
 
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
-		nfs4_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
+		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
 		return -EAGAIN;
 	}
 	if (task->tk_status >= 0) {
@@ -3028,11 +3129,9 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
 	nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
 			   task->tk_status);
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
-		nfs4_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
+		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
 		return -EAGAIN;
 	}
-	nfs4_sequence_free_slot(NFS_SERVER(inode)->nfs_client,
-				&data->res.seq_res);
 	nfs_refresh_inode(inode, data->res.fattr);
 	return 0;
 }
@@ -3050,10 +3149,19 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
  * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special
  * standalone procedure for queueing an asynchronous RENEW.
  */
+static void nfs4_renew_release(void *data)
+{
+	struct nfs_client *clp = data;
+
+	if (atomic_read(&clp->cl_count) > 1)
+		nfs4_schedule_state_renewal(clp);
+	nfs_put_client(clp);
+}
+
 static void nfs4_renew_done(struct rpc_task *task, void *data)
 {
-	struct nfs_client *clp = (struct nfs_client *)task->tk_msg.rpc_argp;
-	unsigned long timestamp = (unsigned long)data;
+	struct nfs_client *clp = data;
+	unsigned long timestamp = task->tk_start;
 
 	if (task->tk_status < 0) {
 		/* Unless we're shutting down, schedule state recovery! */
@@ -3069,6 +3177,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *data)
 
 static const struct rpc_call_ops nfs4_renew_ops = {
 	.rpc_call_done = nfs4_renew_done,
+	.rpc_release = nfs4_renew_release,
 };
 
 int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
@@ -3079,8 +3188,10 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
 		.rpc_cred = cred,
 	};
 
+	if (!atomic_inc_not_zero(&clp->cl_count))
+		return -EIO;
 	return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
-			      &nfs4_renew_ops, (void *)jiffies);
+			      &nfs4_renew_ops, clp);
 }
 
 int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
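The RENEW changes close a lifetime window: the async call now takes a client reference only if cl_count is still non-zero, and the new rpc_release callback drops it when the RPC finishes. The same idiom expressed in portable C11 atomics; renew_async_start() and renew_release() are hypothetical stand-ins for atomic_inc_not_zero() plus nfs_put_client(), not the kernel API:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int cl_count = 1;   /* one reference held by the owner */

static int renew_async_start(void)
{
	int old = atomic_load(&cl_count);

	do {                              /* atomic_inc_not_zero() shape */
		if (old == 0)
			return -1;        /* client already being torn down */
	} while (!atomic_compare_exchange_weak(&cl_count, &old, old + 1));
	return 0;
}

static void renew_release(void)
{
	if (atomic_fetch_sub(&cl_count, 1) == 1)
		puts("last reference dropped: free client");
}

int main(void)
{
	if (renew_async_start() == 0) {
		puts("RENEW in flight with client pinned");
		renew_release();          /* runs when the RPC completes */
	}
	return 0;
}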
@@ -3331,15 +3442,14 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 		if (state == NULL)
 			break;
 		nfs4_state_mark_reclaim_nograce(clp, state);
-	case -NFS4ERR_STALE_CLIENTID:
+		goto do_state_recovery;
 	case -NFS4ERR_STALE_STATEID:
+		if (state == NULL)
+			break;
+		nfs4_state_mark_reclaim_reboot(clp, state);
+	case -NFS4ERR_STALE_CLIENTID:
 	case -NFS4ERR_EXPIRED:
-		rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
-		nfs4_schedule_state_recovery(clp);
-		if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
-			rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
-		task->tk_status = 0;
-		return -EAGAIN;
+		goto do_state_recovery;
 #if defined(CONFIG_NFS_V4_1)
 	case -NFS4ERR_BADSESSION:
 	case -NFS4ERR_BADSLOT:
@@ -3350,7 +3460,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 	case -NFS4ERR_SEQ_MISORDERED:
 		dprintk("%s ERROR %d, Reset session\n", __func__,
 			task->tk_status);
-		set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
+		nfs4_schedule_state_recovery(clp);
 		task->tk_status = 0;
 		return -EAGAIN;
 #endif /* CONFIG_NFS_V4_1 */
@@ -3358,6 +3468,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 		if (server)
 			nfs_inc_server_stats(server, NFSIOS_DELAY);
 	case -NFS4ERR_GRACE:
+	case -EKEYEXPIRED:
 		rpc_delay(task, NFS4_POLL_RETRY_MAX);
 		task->tk_status = 0;
 		return -EAGAIN;
@@ -3367,6 +3478,13 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 	}
 	task->tk_status = nfs4_map_errors(task->tk_status);
 	return 0;
+do_state_recovery:
+	rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
+	nfs4_schedule_state_recovery(clp);
+	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
+		rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
+	task->tk_status = 0;
+	return -EAGAIN;
 }
 
 static int
@@ -3463,6 +3581,7 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
 			case -NFS4ERR_RESOURCE:
 				/* The IBM lawyers misread another document! */
 			case -NFS4ERR_DELAY:
+			case -EKEYEXPIRED:
 				err = nfs4_delay(clp->cl_rpcclient, &timeout);
 		}
 	} while (err == 0);
@@ -3483,12 +3602,23 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_delegreturndata *data = calldata;
 
-	nfs4_sequence_done_free_slot(data->res.server, &data->res.seq_res,
+	nfs4_sequence_done(data->res.server, &data->res.seq_res,
 			task->tk_status);
 
-	data->rpc_status = task->tk_status;
-	if (data->rpc_status == 0)
+	switch (task->tk_status) {
+	case -NFS4ERR_STALE_STATEID:
+	case -NFS4ERR_EXPIRED:
+	case 0:
 		renew_lease(data->res.server, data->timestamp);
+		break;
+	default:
+		if (nfs4_async_handle_error(task, data->res.server, NULL) ==
+				-EAGAIN) {
+			nfs_restart_rpc(task, data->res.server->nfs_client);
+			return;
+		}
+	}
+	data->rpc_status = task->tk_status;
 }
 
 static void nfs4_delegreturn_release(void *calldata)
@@ -3741,11 +3871,9 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 		break;
 	default:
 		if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
-			nfs4_restart_rpc(task,
+			nfs_restart_rpc(task,
 				calldata->server->nfs_client);
 	}
-	nfs4_sequence_free_slot(calldata->server->nfs_client,
-				&calldata->res.seq_res);
 }
 
 static void nfs4_locku_prepare(struct rpc_task *task, void *data)
@@ -3921,14 +4049,20 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 	dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
 }
 
+static void nfs4_recover_lock_prepare(struct rpc_task *task, void *calldata)
+{
+	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
+	nfs4_lock_prepare(task, calldata);
+}
+
 static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_lockdata *data = calldata;
 
 	dprintk("%s: begin!\n", __func__);
 
-	nfs4_sequence_done_free_slot(data->server, &data->res.seq_res,
+	nfs4_sequence_done(data->server, &data->res.seq_res,
 			task->tk_status);
 
 	data->rpc_status = task->tk_status;
 	if (RPC_ASSASSINATED(task))
@@ -3976,7 +4110,35 @@ static const struct rpc_call_ops nfs4_lock_ops = {
 	.rpc_release = nfs4_lock_release,
 };
 
-static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int reclaim)
+static const struct rpc_call_ops nfs4_recover_lock_ops = {
+	.rpc_call_prepare = nfs4_recover_lock_prepare,
+	.rpc_call_done = nfs4_lock_done,
+	.rpc_release = nfs4_lock_release,
+};
+
+static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs4_state *state = lsp->ls_state;
+
+	switch (error) {
+	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_BAD_STATEID:
+	case -NFS4ERR_EXPIRED:
+		if (new_lock_owner != 0 ||
+		    (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
+			nfs4_state_mark_reclaim_nograce(clp, state);
+		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
+		break;
+	case -NFS4ERR_STALE_STATEID:
+		if (new_lock_owner != 0 ||
+		    (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
+			nfs4_state_mark_reclaim_reboot(clp, state);
+		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
+	};
+}
+
+static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int recovery_type)
 {
 	struct nfs4_lockdata *data;
 	struct rpc_task *task;
@@ -4000,8 +4162,11 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 		return -ENOMEM;
 	if (IS_SETLKW(cmd))
 		data->arg.block = 1;
-	if (reclaim != 0)
-		data->arg.reclaim = 1;
+	if (recovery_type > NFS_LOCK_NEW) {
+		if (recovery_type == NFS_LOCK_RECLAIM)
+			data->arg.reclaim = NFS_LOCK_RECLAIM;
+		task_setup_data.callback_ops = &nfs4_recover_lock_ops;
+	}
 	msg.rpc_argp = &data->arg,
 	msg.rpc_resp = &data->res,
 	task_setup_data.callback_data = data;
@@ -4011,6 +4176,9 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 	ret = nfs4_wait_for_completion_rpc_task(task);
 	if (ret == 0) {
 		ret = data->rpc_status;
+		if (ret)
+			nfs4_handle_setlk_error(data->server, data->lsp,
+					data->arg.new_lock_owner, ret);
 	} else
 		data->cancelled = 1;
 	rpc_put_task(task);
@@ -4028,8 +4196,8 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
 		/* Cache the lock if possible... */
 		if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
 			return 0;
-		err = _nfs4_do_setlk(state, F_SETLK, request, 1);
-		if (err != -NFS4ERR_DELAY)
+		err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
+		if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED)
 			break;
 		nfs4_handle_exception(server, err, &exception);
 	} while (exception.retry);
@@ -4048,11 +4216,18 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
 	do {
 		if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
 			return 0;
-		err = _nfs4_do_setlk(state, F_SETLK, request, 0);
-		if (err != -NFS4ERR_DELAY)
-			break;
-		nfs4_handle_exception(server, err, &exception);
+		err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED);
+		switch (err) {
+		default:
+			goto out;
+		case -NFS4ERR_GRACE:
+		case -NFS4ERR_DELAY:
+		case -EKEYEXPIRED:
+			nfs4_handle_exception(server, err, &exception);
+			err = 0;
+		}
 	} while (exception.retry);
+out:
 	return err;
 }
 
@@ -4060,8 +4235,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 {
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	unsigned char fl_flags = request->fl_flags;
-	int status;
+	int status = -ENOLCK;
 
+	if ((fl_flags & FL_POSIX) &&
+	    !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
+		goto out;
 	/* Is this a delegated open? */
 	status = nfs4_set_lock_state(state, request);
 	if (status != 0)
@@ -4078,7 +4256,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 		status = do_vfs_lock(request->fl_file, request);
 		goto out_unlock;
 	}
-	status = _nfs4_do_setlk(state, cmd, request, 0);
+	status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
 	if (status != 0)
 		goto out_unlock;
 	/* Note: we always want to sleep here! */
@@ -4161,7 +4339,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4161 if (err != 0) 4339 if (err != 0)
4162 goto out; 4340 goto out;
4163 do { 4341 do {
4164 err = _nfs4_do_setlk(state, F_SETLK, fl, 0); 4342 err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
4165 switch (err) { 4343 switch (err) {
4166 default: 4344 default:
4167 printk(KERN_ERR "%s: unhandled error %d.\n", 4345 printk(KERN_ERR "%s: unhandled error %d.\n",
@@ -4172,6 +4350,11 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4172 case -NFS4ERR_EXPIRED: 4350 case -NFS4ERR_EXPIRED:
4173 case -NFS4ERR_STALE_CLIENTID: 4351 case -NFS4ERR_STALE_CLIENTID:
4174 case -NFS4ERR_STALE_STATEID: 4352 case -NFS4ERR_STALE_STATEID:
4353 case -NFS4ERR_BADSESSION:
4354 case -NFS4ERR_BADSLOT:
4355 case -NFS4ERR_BAD_HIGH_SLOT:
4356 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
4357 case -NFS4ERR_DEADSESSION:
4175 nfs4_schedule_state_recovery(server->nfs_client); 4358 nfs4_schedule_state_recovery(server->nfs_client);
4176 goto out; 4359 goto out;
4177 case -ERESTARTSYS: 4360 case -ERESTARTSYS:
@@ -4191,6 +4374,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4191 err = 0; 4374 err = 0;
4192 goto out; 4375 goto out;
4193 case -NFS4ERR_DELAY: 4376 case -NFS4ERR_DELAY:
4377 case -EKEYEXPIRED:
4194 break; 4378 break;
4195 } 4379 }
4196 err = nfs4_handle_exception(server, err, &exception); 4380 err = nfs4_handle_exception(server, err, &exception);
@@ -4296,7 +4480,7 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
4296 * NFS4ERR_BADSESSION in the sequence operation, and will therefore 4480 * NFS4ERR_BADSESSION in the sequence operation, and will therefore
4297 * be in some phase of session reset. 4481 * be in some phase of session reset.
4298 */ 4482 */
4299static int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) 4483int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4300{ 4484{
4301 nfs4_verifier verifier; 4485 nfs4_verifier verifier;
4302 struct nfs41_exchange_id_args args = { 4486 struct nfs41_exchange_id_args args = {
@@ -4318,6 +4502,9 @@ static int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4318 dprintk("--> %s\n", __func__); 4502 dprintk("--> %s\n", __func__);
4319 BUG_ON(clp == NULL); 4503 BUG_ON(clp == NULL);
4320 4504
4505 /* Remove server-only flags */
4506 args.flags &= ~EXCHGID4_FLAG_CONFIRMED_R;
4507
4321 p = (u32 *)verifier.data; 4508 p = (u32 *)verifier.data;
4322 *p++ = htonl((u32)clp->cl_boot_time.tv_sec); 4509 *p++ = htonl((u32)clp->cl_boot_time.tv_sec);
4323 *p = htonl((u32)clp->cl_boot_time.tv_nsec); 4510 *p = htonl((u32)clp->cl_boot_time.tv_nsec);
@@ -4333,7 +4520,7 @@ static int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4333 4520
4334 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); 4521 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
4335 4522
4336 if (status != NFS4ERR_CLID_INUSE) 4523 if (status != -NFS4ERR_CLID_INUSE)
4337 break; 4524 break;
4338 4525
4339 if (signalled()) 4526 if (signalled())
@@ -4361,11 +4548,12 @@ static void nfs4_get_lease_time_prepare(struct rpc_task *task,
4361 (struct nfs4_get_lease_time_data *)calldata; 4548 (struct nfs4_get_lease_time_data *)calldata;
4362 4549
4363 dprintk("--> %s\n", __func__); 4550 dprintk("--> %s\n", __func__);
4551 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
4364 /* just setup sequence, do not trigger session recovery 4552 /* just setup sequence, do not trigger session recovery
4365 since we're invoked within one */ 4553 since we're invoked within one */
4366 ret = nfs41_setup_sequence(data->clp->cl_session, 4554 ret = nfs41_setup_sequence(data->clp->cl_session,
4367 &data->args->la_seq_args, 4555 &data->args->la_seq_args,
4368 &data->res->lr_seq_res, 0, task); 4556 &data->res->lr_seq_res, 0, task);
4369 4557
4370 BUG_ON(ret == -EAGAIN); 4558 BUG_ON(ret == -EAGAIN);
4371 rpc_call_start(task); 4559 rpc_call_start(task);
@@ -4386,13 +4574,13 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
4386 switch (task->tk_status) { 4574 switch (task->tk_status) {
4387 case -NFS4ERR_DELAY: 4575 case -NFS4ERR_DELAY:
4388 case -NFS4ERR_GRACE: 4576 case -NFS4ERR_GRACE:
4577 case -EKEYEXPIRED:
4389 dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); 4578 dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
4390 rpc_delay(task, NFS4_POLL_RETRY_MIN); 4579 rpc_delay(task, NFS4_POLL_RETRY_MIN);
4391 task->tk_status = 0; 4580 task->tk_status = 0;
4392 nfs4_restart_rpc(task, data->clp); 4581 nfs_restart_rpc(task, data->clp);
4393 return; 4582 return;
4394 } 4583 }
4395 nfs41_sequence_free_slot(data->clp, &data->res->lr_seq_res);
4396 dprintk("<-- %s\n", __func__); 4584 dprintk("<-- %s\n", __func__);
4397} 4585}
4398 4586
@@ -4444,28 +4632,33 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
4444/* 4632/*
4445 * Reset a slot table 4633 * Reset a slot table
4446 */ 4634 */
4447static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, int max_slots, 4635static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
4448 int old_max_slots, int ivalue) 4636 int ivalue)
4449{ 4637{
4638 struct nfs4_slot *new = NULL;
4450 int i; 4639 int i;
4451 int ret = 0; 4640 int ret = 0;
4452 4641
4453 dprintk("--> %s: max_reqs=%u, tbl %p\n", __func__, max_slots, tbl); 4642 dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__,
4643 max_reqs, tbl->max_slots);
4454 4644
4455 /* 4645 /* Does the newly negotiated max_reqs match the existing slot table? */
4456 * Until we have dynamic slot table adjustment, insist 4646 if (max_reqs != tbl->max_slots) {
4457 * upon the same slot table size 4647 ret = -ENOMEM;
4458 */ 4648 new = kmalloc(max_reqs * sizeof(struct nfs4_slot),
4459 if (max_slots != old_max_slots) { 4649 GFP_KERNEL);
4460 dprintk("%s reset slot table does't match old\n", 4650 if (!new)
4461 __func__); 4651 goto out;
4462 ret = -EINVAL; /*XXX NFS4ERR_REQ_TOO_BIG ? */ 4652 ret = 0;
4463 goto out; 4653 kfree(tbl->slots);
4464 } 4654 }
4465 spin_lock(&tbl->slot_tbl_lock); 4655 spin_lock(&tbl->slot_tbl_lock);
4466 for (i = 0; i < max_slots; ++i) 4656 if (new) {
4657 tbl->slots = new;
4658 tbl->max_slots = max_reqs;
4659 }
4660 for (i = 0; i < tbl->max_slots; ++i)
4467 tbl->slots[i].seq_nr = ivalue; 4661 tbl->slots[i].seq_nr = ivalue;
4468 tbl->highest_used_slotid = -1;
4469 spin_unlock(&tbl->slot_tbl_lock); 4662 spin_unlock(&tbl->slot_tbl_lock);
4470 dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, 4663 dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
4471 tbl, tbl->slots, tbl->max_slots); 4664 tbl, tbl->slots, tbl->max_slots);
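
nfs4_reset_slot_table() above now resizes instead of insisting on an identical table size: the replacement array is allocated before the lock is taken, published under the lock, and every seq_nr is reinitialised. A simplified sketch of that allocate-then-swap pattern, with a pthread mutex standing in for the slot-table spinlock (the kernel can free the old array even before locking, because the session is drained at this point):

#include <pthread.h>
#include <stdlib.h>

struct slot { unsigned seq_nr; };

struct slot_table {
        pthread_mutex_t lock;           /* stands in for slot_tbl_lock */
        struct slot *slots;
        unsigned max_slots;
};

static int reset_slot_table(struct slot_table *tbl, unsigned max_reqs,
                            unsigned ivalue)
{
        struct slot *new = NULL;
        unsigned i;

        /* allocate outside the lock, as the kernel code does */
        if (max_reqs != tbl->max_slots) {
                new = malloc(max_reqs * sizeof(*new));
                if (new == NULL)
                        return -1;      /* -ENOMEM in the kernel */
        }

        pthread_mutex_lock(&tbl->lock);
        if (new != NULL) {
                free(tbl->slots);       /* old table no longer reachable */
                tbl->slots = new;
                tbl->max_slots = max_reqs;
        }
        for (i = 0; i < tbl->max_slots; i++)
                tbl->slots[i].seq_nr = ivalue;
        pthread_mutex_unlock(&tbl->lock);
        return 0;
}

int main(void)
{
        struct slot_table tbl = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .slots = NULL,
                .max_slots = 0,
        };

        return reset_slot_table(&tbl, 16, 1);   /* grow from 0 to 16 slots */
}
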
@@ -4482,16 +4675,12 @@ static int nfs4_reset_slot_tables(struct nfs4_session *session)
4482 int status; 4675 int status;
4483 4676
4484 status = nfs4_reset_slot_table(&session->fc_slot_table, 4677 status = nfs4_reset_slot_table(&session->fc_slot_table,
4485 session->fc_attrs.max_reqs, 4678 session->fc_attrs.max_reqs, 1);
4486 session->fc_slot_table.max_slots,
4487 1);
4488 if (status) 4679 if (status)
4489 return status; 4680 return status;
4490 4681
4491 status = nfs4_reset_slot_table(&session->bc_slot_table, 4682 status = nfs4_reset_slot_table(&session->bc_slot_table,
4492 session->bc_attrs.max_reqs, 4683 session->bc_attrs.max_reqs, 0);
4493 session->bc_slot_table.max_slots,
4494 0);
4495 return status; 4684 return status;
4496} 4685}
4497 4686
@@ -4515,7 +4704,6 @@ static void nfs4_destroy_slot_tables(struct nfs4_session *session)
4515static int nfs4_init_slot_table(struct nfs4_slot_table *tbl, 4704static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
4516 int max_slots, int ivalue) 4705 int max_slots, int ivalue)
4517{ 4706{
4518 int i;
4519 struct nfs4_slot *slot; 4707 struct nfs4_slot *slot;
4520 int ret = -ENOMEM; 4708 int ret = -ENOMEM;
4521 4709
@@ -4526,18 +4714,9 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
4526 slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL); 4714 slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL);
4527 if (!slot) 4715 if (!slot)
4528 goto out; 4716 goto out;
4529 for (i = 0; i < max_slots; ++i)
4530 slot[i].seq_nr = ivalue;
4531 ret = 0; 4717 ret = 0;
4532 4718
4533 spin_lock(&tbl->slot_tbl_lock); 4719 spin_lock(&tbl->slot_tbl_lock);
4534 if (tbl->slots != NULL) {
4535 spin_unlock(&tbl->slot_tbl_lock);
4536 dprintk("%s: slot table already initialized. tbl=%p slots=%p\n",
4537 __func__, tbl, tbl->slots);
4538 WARN_ON(1);
4539 goto out_free;
4540 }
4541 tbl->max_slots = max_slots; 4720 tbl->max_slots = max_slots;
4542 tbl->slots = slot; 4721 tbl->slots = slot;
4543 tbl->highest_used_slotid = -1; /* no slot is currently used */ 4722 tbl->highest_used_slotid = -1; /* no slot is currently used */
@@ -4547,10 +4726,6 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
4547out: 4726out:
4548 dprintk("<-- %s: return %d\n", __func__, ret); 4727 dprintk("<-- %s: return %d\n", __func__, ret);
4549 return ret; 4728 return ret;
4550
4551out_free:
4552 kfree(slot);
4553 goto out;
4554} 4729}
4555 4730
4556/* 4731/*
@@ -4558,17 +4733,24 @@ out_free:
4558 */ 4733 */
4559static int nfs4_init_slot_tables(struct nfs4_session *session) 4734static int nfs4_init_slot_tables(struct nfs4_session *session)
4560{ 4735{
4561 int status; 4736 struct nfs4_slot_table *tbl;
4737 int status = 0;
4562 4738
4563 status = nfs4_init_slot_table(&session->fc_slot_table, 4739 tbl = &session->fc_slot_table;
4564 session->fc_attrs.max_reqs, 1); 4740 if (tbl->slots == NULL) {
4565 if (status) 4741 status = nfs4_init_slot_table(tbl,
4566 return status; 4742 session->fc_attrs.max_reqs, 1);
4743 if (status)
4744 return status;
4745 }
4567 4746
4568 status = nfs4_init_slot_table(&session->bc_slot_table, 4747 tbl = &session->bc_slot_table;
4569 session->bc_attrs.max_reqs, 0); 4748 if (tbl->slots == NULL) {
4570 if (status) 4749 status = nfs4_init_slot_table(tbl,
4571 nfs4_destroy_slot_tables(session); 4750 session->bc_attrs.max_reqs, 0);
4751 if (status)
4752 nfs4_destroy_slot_tables(session);
4753 }
4572 4754
4573 return status; 4755 return status;
4574} 4756}
@@ -4582,7 +4764,6 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
4582 if (!session) 4764 if (!session)
4583 return NULL; 4765 return NULL;
4584 4766
4585 set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
4586 /* 4767 /*
4587 * The create session reply races with the server back 4768 * The create session reply races with the server back
4588 * channel probe. Mark the client NFS_CS_SESSION_INITING 4769 * channel probe. Mark the client NFS_CS_SESSION_INITING
@@ -4590,12 +4771,15 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
4590 * nfs_client struct 4771 * nfs_client struct
4591 */ 4772 */
4592 clp->cl_cons_state = NFS_CS_SESSION_INITING; 4773 clp->cl_cons_state = NFS_CS_SESSION_INITING;
4774 init_completion(&session->complete);
4593 4775
4594 tbl = &session->fc_slot_table; 4776 tbl = &session->fc_slot_table;
4777 tbl->highest_used_slotid = -1;
4595 spin_lock_init(&tbl->slot_tbl_lock); 4778 spin_lock_init(&tbl->slot_tbl_lock);
4596 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); 4779 rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
4597 4780
4598 tbl = &session->bc_slot_table; 4781 tbl = &session->bc_slot_table;
4782 tbl->highest_used_slotid = -1;
4599 spin_lock_init(&tbl->slot_tbl_lock); 4783 spin_lock_init(&tbl->slot_tbl_lock);
4600 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); 4784 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
4601 4785
@@ -4637,16 +4821,14 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
4637 args->fc_attrs.headerpadsz = 0; 4821 args->fc_attrs.headerpadsz = 0;
4638 args->fc_attrs.max_rqst_sz = mxrqst_sz; 4822 args->fc_attrs.max_rqst_sz = mxrqst_sz;
4639 args->fc_attrs.max_resp_sz = mxresp_sz; 4823 args->fc_attrs.max_resp_sz = mxresp_sz;
4640 args->fc_attrs.max_resp_sz_cached = mxresp_sz;
4641 args->fc_attrs.max_ops = NFS4_MAX_OPS; 4824 args->fc_attrs.max_ops = NFS4_MAX_OPS;
4642 args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs; 4825 args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs;
4643 4826
4644 dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u " 4827 dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u "
4645 "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", 4828 "max_ops=%u max_reqs=%u\n",
4646 __func__, 4829 __func__,
4647 args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz, 4830 args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz,
4648 args->fc_attrs.max_resp_sz_cached, args->fc_attrs.max_ops, 4831 args->fc_attrs.max_ops, args->fc_attrs.max_reqs);
4649 args->fc_attrs.max_reqs);
4650 4832
4651 /* Back channel attributes */ 4833 /* Back channel attributes */
4652 args->bc_attrs.headerpadsz = 0; 4834 args->bc_attrs.headerpadsz = 0;
@@ -4747,11 +4929,10 @@ static int _nfs4_proc_create_session(struct nfs_client *clp)
4747 * It is the responsibility of the caller to verify the session is 4929 * It is the responsibility of the caller to verify the session is
4748 * expired before calling this routine. 4930 * expired before calling this routine.
4749 */ 4931 */
4750int nfs4_proc_create_session(struct nfs_client *clp, int reset) 4932int nfs4_proc_create_session(struct nfs_client *clp)
4751{ 4933{
4752 int status; 4934 int status;
4753 unsigned *ptr; 4935 unsigned *ptr;
4754 struct nfs_fsinfo fsinfo;
4755 struct nfs4_session *session = clp->cl_session; 4936 struct nfs4_session *session = clp->cl_session;
4756 4937
4757 dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); 4938 dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
@@ -4760,35 +4941,19 @@ int nfs4_proc_create_session(struct nfs_client *clp, int reset)
4760 if (status) 4941 if (status)
4761 goto out; 4942 goto out;
4762 4943
4763 /* Init or reset the fore channel */ 4944 /* Init and reset the fore channel */
4764 if (reset) 4945 status = nfs4_init_slot_tables(session);
4765 status = nfs4_reset_slot_tables(session); 4946 dprintk("slot table initialization returned %d\n", status);
4766 else 4947 if (status)
4767 status = nfs4_init_slot_tables(session); 4948 goto out;
4768 dprintk("fore channel slot table initialization returned %d\n", status); 4949 status = nfs4_reset_slot_tables(session);
4950 dprintk("slot table reset returned %d\n", status);
4769 if (status) 4951 if (status)
4770 goto out; 4952 goto out;
4771 4953
4772 ptr = (unsigned *)&session->sess_id.data[0]; 4954 ptr = (unsigned *)&session->sess_id.data[0];
4773 dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__, 4955 dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__,
4774 clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]); 4956 clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]);
4775
4776 if (reset)
4777 /* Lease time is aleady set */
4778 goto out;
4779
4780 /* Get the lease time */
4781 status = nfs4_proc_get_lease_time(clp, &fsinfo);
4782 if (status == 0) {
4783 /* Update lease time and schedule renewal */
4784 spin_lock(&clp->cl_lock);
4785 clp->cl_lease_time = fsinfo.lease_time * HZ;
4786 clp->cl_last_renewal = jiffies;
4787 clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
4788 spin_unlock(&clp->cl_lock);
4789
4790 nfs4_schedule_state_renewal(clp);
4791 }
4792out: 4957out:
4793 dprintk("<-- %s\n", __func__); 4958 dprintk("<-- %s\n", __func__);
4794 return status; 4959 return status;
@@ -4827,13 +4992,24 @@ int nfs4_proc_destroy_session(struct nfs4_session *session)
4827int nfs4_init_session(struct nfs_server *server) 4992int nfs4_init_session(struct nfs_server *server)
4828{ 4993{
4829 struct nfs_client *clp = server->nfs_client; 4994 struct nfs_client *clp = server->nfs_client;
4995 struct nfs4_session *session;
4996 unsigned int rsize, wsize;
4830 int ret; 4997 int ret;
4831 4998
4832 if (!nfs4_has_session(clp)) 4999 if (!nfs4_has_session(clp))
4833 return 0; 5000 return 0;
4834 5001
4835 clp->cl_session->fc_attrs.max_rqst_sz = server->wsize; 5002 rsize = server->rsize;
4836 clp->cl_session->fc_attrs.max_resp_sz = server->rsize; 5003 if (rsize == 0)
5004 rsize = NFS_MAX_FILE_IO_SIZE;
5005 wsize = server->wsize;
5006 if (wsize == 0)
5007 wsize = NFS_MAX_FILE_IO_SIZE;
5008
5009 session = clp->cl_session;
5010 session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
5011 session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
5012
4837 ret = nfs4_recover_expired_lease(server); 5013 ret = nfs4_recover_expired_lease(server);
4838 if (!ret) 5014 if (!ret)
4839 ret = nfs4_check_client_ready(clp); 5015 ret = nfs4_check_client_ready(clp);
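
nfs4_init_session() above stops using the raw rsize/wsize as channel limits; the negotiated sizes must also cover RPC and compound header overhead, which is what nfs41_maxread_overhead and nfs41_maxwrite_overhead (defined later in this patch, in nfs4xdr.c) add on top. A sketch of the arithmetic with illustrative, non-authoritative numbers:

#include <stdio.h>

#define NFS_MAX_FILE_IO_SIZE (1u << 20) /* assumed cap, for illustration */

int main(void)
{
        unsigned rsize = 0;             /* e.g. from mount options; 0 = unset */
        unsigned wsize = 32768;
        unsigned maxread_overhead  = 388;  /* invented stand-in, in bytes */
        unsigned maxwrite_overhead = 440;  /* invented stand-in, in bytes */

        if (rsize == 0)
                rsize = NFS_MAX_FILE_IO_SIZE;
        if (wsize == 0)
                wsize = NFS_MAX_FILE_IO_SIZE;

        printf("fc max_rqst_sz = %u\n", wsize + maxwrite_overhead);
        printf("fc max_resp_sz = %u\n", rsize + maxread_overhead);
        return 0;
}
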
@@ -4858,10 +5034,19 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
4858 args.sa_cache_this = 0; 5034 args.sa_cache_this = 0;
4859 5035
4860 return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args, 5036 return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args,
4861 &res, 0); 5037 &res, args.sa_cache_this, 1);
5038}
5039
5040static void nfs41_sequence_release(void *data)
5041{
5042 struct nfs_client *clp = (struct nfs_client *)data;
5043
5044 if (atomic_read(&clp->cl_count) > 1)
5045 nfs4_schedule_state_renewal(clp);
5046 nfs_put_client(clp);
4862} 5047}
4863 5048
4864void nfs41_sequence_call_done(struct rpc_task *task, void *data) 5049static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
4865{ 5050{
4866 struct nfs_client *clp = (struct nfs_client *)data; 5051 struct nfs_client *clp = (struct nfs_client *)data;
4867 5052
@@ -4869,16 +5054,17 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data)
4869 5054
4870 if (task->tk_status < 0) { 5055 if (task->tk_status < 0) {
4871 dprintk("%s ERROR %d\n", __func__, task->tk_status); 5056 dprintk("%s ERROR %d\n", __func__, task->tk_status);
5057 if (atomic_read(&clp->cl_count) == 1)
5058 goto out;
4872 5059
4873 if (_nfs4_async_handle_error(task, NULL, clp, NULL) 5060 if (_nfs4_async_handle_error(task, NULL, clp, NULL)
4874 == -EAGAIN) { 5061 == -EAGAIN) {
4875 nfs4_restart_rpc(task, clp); 5062 nfs_restart_rpc(task, clp);
4876 return; 5063 return;
4877 } 5064 }
4878 } 5065 }
4879 nfs41_sequence_free_slot(clp, task->tk_msg.rpc_resp);
4880 dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); 5066 dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
4881 5067out:
4882 kfree(task->tk_msg.rpc_argp); 5068 kfree(task->tk_msg.rpc_argp);
4883 kfree(task->tk_msg.rpc_resp); 5069 kfree(task->tk_msg.rpc_resp);
4884 5070
@@ -4903,6 +5089,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
4903static const struct rpc_call_ops nfs41_sequence_ops = { 5089static const struct rpc_call_ops nfs41_sequence_ops = {
4904 .rpc_call_done = nfs41_sequence_call_done, 5090 .rpc_call_done = nfs41_sequence_call_done,
4905 .rpc_call_prepare = nfs41_sequence_prepare, 5091 .rpc_call_prepare = nfs41_sequence_prepare,
5092 .rpc_release = nfs41_sequence_release,
4906}; 5093};
4907 5094
4908static int nfs41_proc_async_sequence(struct nfs_client *clp, 5095static int nfs41_proc_async_sequence(struct nfs_client *clp,
@@ -4915,12 +5102,14 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp,
4915 .rpc_cred = cred, 5102 .rpc_cred = cred,
4916 }; 5103 };
4917 5104
5105 if (!atomic_inc_not_zero(&clp->cl_count))
5106 return -EIO;
4918 args = kzalloc(sizeof(*args), GFP_KERNEL); 5107 args = kzalloc(sizeof(*args), GFP_KERNEL);
4919 if (!args)
4920 return -ENOMEM;
4921 res = kzalloc(sizeof(*res), GFP_KERNEL); 5108 res = kzalloc(sizeof(*res), GFP_KERNEL);
4922 if (!res) { 5109 if (!args || !res) {
4923 kfree(args); 5110 kfree(args);
5111 kfree(res);
5112 nfs_put_client(clp);
4924 return -ENOMEM; 5113 return -ENOMEM;
4925 } 5114 }
4926 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 5115 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
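
The allocation hunk above pairs atomic_inc_not_zero() on cl_count with an nfs_put_client() in every failure path and in the new rpc_release callback, so an async RENEW can never outlive its nfs_client. A userspace sketch of that guard, using C11 atomics in place of the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>

struct client { atomic_int count; };    /* stands in for nfs_client.cl_count */

/* equivalent of atomic_inc_not_zero(): take a reference only if alive */
static int get_client_if_live(struct client *clp)
{
        int old = atomic_load(&clp->count);

        while (old != 0)
                if (atomic_compare_exchange_weak(&clp->count, &old, old + 1))
                        return 1;
        return 0;
}

/* stands in for nfs_put_client() */
static void put_client(struct client *clp)
{
        if (atomic_fetch_sub(&clp->count, 1) == 1)
                printf("last reference gone, client freed\n");
}

int main(void)
{
        struct client clp = { .count = 1 };

        if (!get_client_if_live(&clp))
                return 1;               /* the kernel path returns -EIO */
        /* ... async SEQUENCE (RENEW) would run here ... */
        put_client(&clp);               /* done in .rpc_release */
        put_client(&clp);               /* drop the original reference */
        return 0;
}
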
@@ -4931,6 +5120,110 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp,
4931 &nfs41_sequence_ops, (void *)clp); 5120 &nfs41_sequence_ops, (void *)clp);
4932} 5121}
4933 5122
5123struct nfs4_reclaim_complete_data {
5124 struct nfs_client *clp;
5125 struct nfs41_reclaim_complete_args arg;
5126 struct nfs41_reclaim_complete_res res;
5127};
5128
5129static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)
5130{
5131 struct nfs4_reclaim_complete_data *calldata = data;
5132
5133 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
5134 if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args,
5135 &calldata->res.seq_res, 0, task))
5136 return;
5137
5138 rpc_call_start(task);
5139}
5140
5141static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
5142{
5143 struct nfs4_reclaim_complete_data *calldata = data;
5144 struct nfs_client *clp = calldata->clp;
5145 struct nfs4_sequence_res *res = &calldata->res.seq_res;
5146
5147 dprintk("--> %s\n", __func__);
5148 nfs41_sequence_done(clp, res, task->tk_status);
5149 switch (task->tk_status) {
5150 case 0:
5151 case -NFS4ERR_COMPLETE_ALREADY:
5152 break;
5153 case -NFS4ERR_BADSESSION:
5154 case -NFS4ERR_DEADSESSION:
5155 /*
5156 * Handle the session error, but do not retry the operation, as
5157 * we have no way of telling whether the clientid had to be
5158 * reset before we got our reply. If reset, a new wave of
5159 * reclaim operations will follow, containing their own reclaim
5160 * complete. We don't want our retry to get in the way of
5161 * recovery by incorrectly indicating to the server that we're
5162 * done reclaiming state since the process had to be restarted.
5163 */
5164 _nfs4_async_handle_error(task, NULL, clp, NULL);
5165 break;
5166 default:
5167 if (_nfs4_async_handle_error(
5168 task, NULL, clp, NULL) == -EAGAIN) {
5169 rpc_restart_call_prepare(task);
5170 return;
5171 }
5172 }
5173
5174 dprintk("<-- %s\n", __func__);
5175}
5176
5177static void nfs4_free_reclaim_complete_data(void *data)
5178{
5179 struct nfs4_reclaim_complete_data *calldata = data;
5180
5181 kfree(calldata);
5182}
5183
5184static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = {
5185 .rpc_call_prepare = nfs4_reclaim_complete_prepare,
5186 .rpc_call_done = nfs4_reclaim_complete_done,
5187 .rpc_release = nfs4_free_reclaim_complete_data,
5188};
5189
5190/*
5191 * Issue a global reclaim complete.
5192 */
5193static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
5194{
5195 struct nfs4_reclaim_complete_data *calldata;
5196 struct rpc_task *task;
5197 struct rpc_message msg = {
5198 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE],
5199 };
5200 struct rpc_task_setup task_setup_data = {
5201 .rpc_client = clp->cl_rpcclient,
5202 .rpc_message = &msg,
5203 .callback_ops = &nfs4_reclaim_complete_call_ops,
5204 .flags = RPC_TASK_ASYNC,
5205 };
5206 int status = -ENOMEM;
5207
5208 dprintk("--> %s\n", __func__);
5209 calldata = kzalloc(sizeof(*calldata), GFP_KERNEL);
5210 if (calldata == NULL)
5211 goto out;
5212 calldata->clp = clp;
5213 calldata->arg.one_fs = 0;
5214 calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
5215
5216 msg.rpc_argp = &calldata->arg;
5217 msg.rpc_resp = &calldata->res;
5218 task_setup_data.callback_data = calldata;
5219 task = rpc_run_task(&task_setup_data);
5220 if (IS_ERR(task))
5221 status = PTR_ERR(task);
5222 rpc_put_task(task);
5223out:
5224 dprintk("<-- %s status=%d\n", __func__, status);
5225 return status;
5226}
4934#endif /* CONFIG_NFS_V4_1 */ 5227#endif /* CONFIG_NFS_V4_1 */
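
nfs41_proc_reclaim_complete() above hands ownership of its calldata to the async task: the rpc_release callback frees it, so the submitting function only cleans up when the task could never be started. A sketch of that ownership handoff, with a joined pthread standing in for rpc_run_task()/rpc_put_task():

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct calldata { int one_fs; };        /* simplified stand-in struct */

static void release(void *data)         /* .rpc_release counterpart */
{
        free(data);
}

static void *task(void *data)           /* the async RPC task body */
{
        struct calldata *c = data;

        printf("sending RECLAIM_COMPLETE, one_fs=%d\n", c->one_fs);
        release(c);                     /* the task owns and frees calldata */
        return NULL;
}

int main(void)
{
        struct calldata *c = calloc(1, sizeof(*c));
        pthread_t t;

        if (c == NULL)
                return 1;               /* -ENOMEM */
        c->one_fs = 0;                  /* global reclaim, all filesystems */

        if (pthread_create(&t, NULL, task, c) != 0) {
                free(c);                /* freed here only if never started */
                return 1;
        }
        pthread_join(t, NULL);
        return 0;
}
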
4935 5228
4936struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { 5229struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
@@ -4948,8 +5241,9 @@ struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
4948 .state_flag_bit = NFS_STATE_RECLAIM_REBOOT, 5241 .state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
4949 .recover_open = nfs4_open_reclaim, 5242 .recover_open = nfs4_open_reclaim,
4950 .recover_lock = nfs4_lock_reclaim, 5243 .recover_lock = nfs4_lock_reclaim,
4951 .establish_clid = nfs4_proc_exchange_id, 5244 .establish_clid = nfs41_init_clientid,
4952 .get_clid_cred = nfs4_get_exchange_id_cred, 5245 .get_clid_cred = nfs4_get_exchange_id_cred,
5246 .reclaim_complete = nfs41_proc_reclaim_complete,
4953}; 5247};
4954#endif /* CONFIG_NFS_V4_1 */ 5248#endif /* CONFIG_NFS_V4_1 */
4955 5249
@@ -4968,7 +5262,7 @@ struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
4968 .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, 5262 .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
4969 .recover_open = nfs4_open_expired, 5263 .recover_open = nfs4_open_expired,
4970 .recover_lock = nfs4_lock_expired, 5264 .recover_lock = nfs4_lock_expired,
4971 .establish_clid = nfs4_proc_exchange_id, 5265 .establish_clid = nfs41_init_clientid,
4972 .get_clid_cred = nfs4_get_exchange_id_cred, 5266 .get_clid_cred = nfs4_get_exchange_id_cred,
4973}; 5267};
4974#endif /* CONFIG_NFS_V4_1 */ 5268#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 0156c01c212c..d87f10327b72 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -36,11 +36,6 @@
36 * as an rpc_task, not a real kernel thread, so it always runs in rpciod's 36 * as an rpc_task, not a real kernel thread, so it always runs in rpciod's
37 * context. There is one renewd per nfs_server. 37 * context. There is one renewd per nfs_server.
38 * 38 *
39 * TODO: If the send queue gets backlogged (e.g., if the server goes down),
40 * we will keep filling the queue with periodic RENEW requests. We need a
41 * mechanism for ensuring that if renewd successfully sends off a request,
42 * then it only wakes up when the request is finished. Maybe use the
43 * child task framework of the RPC layer?
44 */ 39 */
45 40
46#include <linux/mm.h> 41#include <linux/mm.h>
@@ -63,7 +58,7 @@ nfs4_renew_state(struct work_struct *work)
63 struct nfs_client *clp = 58 struct nfs_client *clp =
64 container_of(work, struct nfs_client, cl_renewd.work); 59 container_of(work, struct nfs_client, cl_renewd.work);
65 struct rpc_cred *cred; 60 struct rpc_cred *cred;
66 long lease, timeout; 61 long lease;
67 unsigned long last, now; 62 unsigned long last, now;
68 63
69 ops = nfs4_state_renewal_ops[clp->cl_minorversion]; 64 ops = nfs4_state_renewal_ops[clp->cl_minorversion];
@@ -75,7 +70,6 @@ nfs4_renew_state(struct work_struct *work)
75 lease = clp->cl_lease_time; 70 lease = clp->cl_lease_time;
76 last = clp->cl_last_renewal; 71 last = clp->cl_last_renewal;
77 now = jiffies; 72 now = jiffies;
78 timeout = (2 * lease) / 3 + (long)last - (long)now;
79 /* Are we close to a lease timeout? */ 73 /* Are we close to a lease timeout? */
80 if (time_after(now, last + lease/3)) { 74 if (time_after(now, last + lease/3)) {
81 cred = ops->get_state_renewal_cred_locked(clp); 75 cred = ops->get_state_renewal_cred_locked(clp);
@@ -90,19 +84,15 @@ nfs4_renew_state(struct work_struct *work)
90 /* Queue an asynchronous RENEW. */ 84 /* Queue an asynchronous RENEW. */
91 ops->sched_state_renewal(clp, cred); 85 ops->sched_state_renewal(clp, cred);
92 put_rpccred(cred); 86 put_rpccred(cred);
87 goto out_exp;
93 } 88 }
94 timeout = (2 * lease) / 3; 89 } else {
95 spin_lock(&clp->cl_lock);
96 } else
97 dprintk("%s: failed to call renewd. Reason: lease not expired \n", 90 dprintk("%s: failed to call renewd. Reason: lease not expired \n",
98 __func__); 91 __func__);
99 if (timeout < 5 * HZ) /* safeguard */ 92 spin_unlock(&clp->cl_lock);
100 timeout = 5 * HZ; 93 }
101 dprintk("%s: requeueing work. Lease period = %ld\n", 94 nfs4_schedule_state_renewal(clp);
102 __func__, (timeout + HZ - 1) / HZ); 95out_exp:
103 cancel_delayed_work(&clp->cl_renewd);
104 schedule_delayed_work(&clp->cl_renewd, timeout);
105 spin_unlock(&clp->cl_lock);
106 nfs_expire_unreferenced_delegations(clp); 96 nfs_expire_unreferenced_delegations(clp);
107out: 97out:
108 dprintk("%s: done\n", __func__); 98 dprintk("%s: done\n", __func__);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2ef4fecf3984..6c5ed51f105e 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -116,6 +116,79 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
116 116
117#if defined(CONFIG_NFS_V4_1) 117#if defined(CONFIG_NFS_V4_1)
118 118
119static int nfs41_setup_state_renewal(struct nfs_client *clp)
120{
121 int status;
122 struct nfs_fsinfo fsinfo;
123
124 status = nfs4_proc_get_lease_time(clp, &fsinfo);
125 if (status == 0) {
126 /* Update lease time and schedule renewal */
127 spin_lock(&clp->cl_lock);
128 clp->cl_lease_time = fsinfo.lease_time * HZ;
129 clp->cl_last_renewal = jiffies;
130 spin_unlock(&clp->cl_lock);
131
132 nfs4_schedule_state_renewal(clp);
133 }
134
135 return status;
136}
137
138static void nfs4_end_drain_session(struct nfs_client *clp)
139{
140 struct nfs4_session *ses = clp->cl_session;
141 int max_slots;
142
143 if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) {
144 spin_lock(&ses->fc_slot_table.slot_tbl_lock);
145 max_slots = ses->fc_slot_table.max_slots;
146 while (max_slots--) {
147 struct rpc_task *task;
148
149 task = rpc_wake_up_next(&ses->fc_slot_table.
150 slot_tbl_waitq);
151 if (!task)
152 break;
153 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
154 }
155 spin_unlock(&ses->fc_slot_table.slot_tbl_lock);
156 }
157}
158
159static int nfs4_begin_drain_session(struct nfs_client *clp)
160{
161 struct nfs4_session *ses = clp->cl_session;
162 struct nfs4_slot_table *tbl = &ses->fc_slot_table;
163
164 spin_lock(&tbl->slot_tbl_lock);
165 set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state);
166 if (tbl->highest_used_slotid != -1) {
167 INIT_COMPLETION(ses->complete);
168 spin_unlock(&tbl->slot_tbl_lock);
169 return wait_for_completion_interruptible(&ses->complete);
170 }
171 spin_unlock(&tbl->slot_tbl_lock);
172 return 0;
173}
174
175int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
176{
177 int status;
178
179 nfs4_begin_drain_session(clp);
180 status = nfs4_proc_exchange_id(clp, cred);
181 if (status != 0)
182 goto out;
183 status = nfs4_proc_create_session(clp);
184 if (status != 0)
185 goto out;
186 nfs41_setup_state_renewal(clp);
187 nfs_mark_client_ready(clp, NFS_CS_READY);
188out:
189 return status;
190}
191
119struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp) 192struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp)
120{ 193{
121 struct rpc_cred *cred; 194 struct rpc_cred *cred;
@@ -693,16 +766,21 @@ struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter)
693 return new; 766 return new;
694} 767}
695 768
696void nfs_free_seqid(struct nfs_seqid *seqid) 769void nfs_release_seqid(struct nfs_seqid *seqid)
697{ 770{
698 if (!list_empty(&seqid->list)) { 771 if (!list_empty(&seqid->list)) {
699 struct rpc_sequence *sequence = seqid->sequence->sequence; 772 struct rpc_sequence *sequence = seqid->sequence->sequence;
700 773
701 spin_lock(&sequence->lock); 774 spin_lock(&sequence->lock);
702 list_del(&seqid->list); 775 list_del_init(&seqid->list);
703 spin_unlock(&sequence->lock); 776 spin_unlock(&sequence->lock);
704 rpc_wake_up(&sequence->wait); 777 rpc_wake_up(&sequence->wait);
705 } 778 }
779}
780
781void nfs_free_seqid(struct nfs_seqid *seqid)
782{
783 nfs_release_seqid(seqid);
706 kfree(seqid); 784 kfree(seqid);
707} 785}
708 786
@@ -823,7 +901,7 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp)
823 nfs4_schedule_state_manager(clp); 901 nfs4_schedule_state_manager(clp);
824} 902}
825 903
826static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) 904int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
827{ 905{
828 906
829 set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); 907 set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -877,6 +955,10 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
877 case -NFS4ERR_EXPIRED: 955 case -NFS4ERR_EXPIRED:
878 case -NFS4ERR_NO_GRACE: 956 case -NFS4ERR_NO_GRACE:
879 case -NFS4ERR_STALE_CLIENTID: 957 case -NFS4ERR_STALE_CLIENTID:
958 case -NFS4ERR_BADSESSION:
959 case -NFS4ERR_BADSLOT:
960 case -NFS4ERR_BAD_HIGH_SLOT:
961 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
880 goto out; 962 goto out;
881 default: 963 default:
882 printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", 964 printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
@@ -959,6 +1041,10 @@ restart:
959 case -NFS4ERR_NO_GRACE: 1041 case -NFS4ERR_NO_GRACE:
960 nfs4_state_mark_reclaim_nograce(sp->so_client, state); 1042 nfs4_state_mark_reclaim_nograce(sp->so_client, state);
961 case -NFS4ERR_STALE_CLIENTID: 1043 case -NFS4ERR_STALE_CLIENTID:
1044 case -NFS4ERR_BADSESSION:
1045 case -NFS4ERR_BADSLOT:
1046 case -NFS4ERR_BAD_HIGH_SLOT:
1047 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
962 goto out_err; 1048 goto out_err;
963 } 1049 }
964 nfs4_put_open_state(state); 1050 nfs4_put_open_state(state);
@@ -1011,6 +1097,14 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
1011 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot); 1097 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot);
1012} 1098}
1013 1099
1100static void nfs4_reclaim_complete(struct nfs_client *clp,
1101 const struct nfs4_state_recovery_ops *ops)
1102{
1103 /* Notify the server we're done reclaiming our state */
1104 if (ops->reclaim_complete)
1105 (void)ops->reclaim_complete(clp);
1106}
1107
1014static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) 1108static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
1015{ 1109{
1016 struct nfs4_state_owner *sp; 1110 struct nfs4_state_owner *sp;
@@ -1020,6 +1114,9 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
1020 if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) 1114 if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
1021 return; 1115 return;
1022 1116
1117 nfs4_reclaim_complete(clp,
1118 nfs4_reboot_recovery_ops[clp->cl_minorversion]);
1119
1023 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 1120 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
1024 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 1121 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
1025 spin_lock(&sp->so_lock); 1122 spin_lock(&sp->so_lock);
@@ -1046,25 +1143,25 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
1046 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); 1143 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
1047} 1144}
1048 1145
1049static void nfs4_state_end_reclaim_nograce(struct nfs_client *clp) 1146static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1050{
1051 clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
1052}
1053
1054static void nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1055{ 1147{
1056 switch (error) { 1148 switch (error) {
1057 case -NFS4ERR_CB_PATH_DOWN: 1149 case -NFS4ERR_CB_PATH_DOWN:
1058 nfs_handle_cb_pathdown(clp); 1150 nfs_handle_cb_pathdown(clp);
1059 break; 1151 return 0;
1152 case -NFS4ERR_NO_GRACE:
1153 nfs4_state_end_reclaim_reboot(clp);
1154 return 0;
1060 case -NFS4ERR_STALE_CLIENTID: 1155 case -NFS4ERR_STALE_CLIENTID:
1061 case -NFS4ERR_LEASE_MOVED: 1156 case -NFS4ERR_LEASE_MOVED:
1062 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 1157 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1158 nfs4_state_end_reclaim_reboot(clp);
1063 nfs4_state_start_reclaim_reboot(clp); 1159 nfs4_state_start_reclaim_reboot(clp);
1064 break; 1160 break;
1065 case -NFS4ERR_EXPIRED: 1161 case -NFS4ERR_EXPIRED:
1066 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 1162 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1067 nfs4_state_start_reclaim_nograce(clp); 1163 nfs4_state_start_reclaim_nograce(clp);
1164 break;
1068 case -NFS4ERR_BADSESSION: 1165 case -NFS4ERR_BADSESSION:
1069 case -NFS4ERR_BADSLOT: 1166 case -NFS4ERR_BADSLOT:
1070 case -NFS4ERR_BAD_HIGH_SLOT: 1167 case -NFS4ERR_BAD_HIGH_SLOT:
@@ -1072,8 +1169,11 @@ static void nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1072 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: 1169 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
1073 case -NFS4ERR_SEQ_FALSE_RETRY: 1170 case -NFS4ERR_SEQ_FALSE_RETRY:
1074 case -NFS4ERR_SEQ_MISORDERED: 1171 case -NFS4ERR_SEQ_MISORDERED:
1075 set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state); 1172 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
1173 /* Zero session reset errors */
1174 return 0;
1076 } 1175 }
1176 return error;
1077} 1177}
1078 1178
1079static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops) 1179static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)
@@ -1093,8 +1193,7 @@ restart:
1093 if (status < 0) { 1193 if (status < 0) {
1094 set_bit(ops->owner_flag_bit, &sp->so_flags); 1194 set_bit(ops->owner_flag_bit, &sp->so_flags);
1095 nfs4_put_state_owner(sp); 1195 nfs4_put_state_owner(sp);
1096 nfs4_recovery_handle_error(clp, status); 1196 return nfs4_recovery_handle_error(clp, status);
1097 return status;
1098 } 1197 }
1099 nfs4_put_state_owner(sp); 1198 nfs4_put_state_owner(sp);
1100 goto restart; 1199 goto restart;
@@ -1124,8 +1223,7 @@ static int nfs4_check_lease(struct nfs_client *clp)
1124 status = ops->renew_lease(clp, cred); 1223 status = ops->renew_lease(clp, cred);
1125 put_rpccred(cred); 1224 put_rpccred(cred);
1126out: 1225out:
1127 nfs4_recovery_handle_error(clp, status); 1226 return nfs4_recovery_handle_error(clp, status);
1128 return status;
1129} 1227}
1130 1228
1131static int nfs4_reclaim_lease(struct nfs_client *clp) 1229static int nfs4_reclaim_lease(struct nfs_client *clp)
@@ -1151,55 +1249,127 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
1151} 1249}
1152 1250
1153#ifdef CONFIG_NFS_V4_1 1251#ifdef CONFIG_NFS_V4_1
1154static void nfs4_session_recovery_handle_error(struct nfs_client *clp, int err) 1252void nfs41_handle_recall_slot(struct nfs_client *clp)
1155{ 1253{
1156 switch (err) { 1254 set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
1157 case -NFS4ERR_STALE_CLIENTID: 1255 nfs4_schedule_state_recovery(clp);
1158 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 1256}
1159 set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state); 1257
1258static void nfs4_reset_all_state(struct nfs_client *clp)
1259{
1260 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
1261 clp->cl_boot_time = CURRENT_TIME;
1262 nfs4_state_start_reclaim_nograce(clp);
1263 nfs4_schedule_state_recovery(clp);
1160 } 1264 }
1161} 1265}
1162 1266
1267static void nfs41_handle_server_reboot(struct nfs_client *clp)
1268{
1269 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
1270 nfs4_state_start_reclaim_reboot(clp);
1271 nfs4_schedule_state_recovery(clp);
1272 }
1273}
1274
1275static void nfs41_handle_state_revoked(struct nfs_client *clp)
1276{
1277 /* Temporary */
1278 nfs4_reset_all_state(clp);
1279}
1280
1281static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp)
1282{
1283 /* This will need to handle layouts too */
1284 nfs_expire_all_delegations(clp);
1285}
1286
1287static void nfs41_handle_cb_path_down(struct nfs_client *clp)
1288{
1289 nfs_expire_all_delegations(clp);
1290 if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0)
1291 nfs4_schedule_state_recovery(clp);
1292}
1293
1294void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
1295{
1296 if (!flags)
1297 return;
1298 else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)
1299 nfs41_handle_server_reboot(clp);
1300 else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
1301 SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED |
1302 SEQ4_STATUS_ADMIN_STATE_REVOKED |
1303 SEQ4_STATUS_LEASE_MOVED))
1304 nfs41_handle_state_revoked(clp);
1305 else if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
1306 nfs41_handle_recallable_state_revoked(clp);
1307 else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
1308 SEQ4_STATUS_BACKCHANNEL_FAULT |
1309 SEQ4_STATUS_CB_PATH_DOWN_SESSION))
1310 nfs41_handle_cb_path_down(clp);
1311}
1312
1163static int nfs4_reset_session(struct nfs_client *clp) 1313static int nfs4_reset_session(struct nfs_client *clp)
1164{ 1314{
1165 int status; 1315 int status;
1166 1316
1317 nfs4_begin_drain_session(clp);
1167 status = nfs4_proc_destroy_session(clp->cl_session); 1318 status = nfs4_proc_destroy_session(clp->cl_session);
1168 if (status && status != -NFS4ERR_BADSESSION && 1319 if (status && status != -NFS4ERR_BADSESSION &&
1169 status != -NFS4ERR_DEADSESSION) { 1320 status != -NFS4ERR_DEADSESSION) {
1170 nfs4_session_recovery_handle_error(clp, status); 1321 status = nfs4_recovery_handle_error(clp, status);
1171 goto out; 1322 goto out;
1172 } 1323 }
1173 1324
1174 memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN); 1325 memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN);
1175 status = nfs4_proc_create_session(clp, 1); 1326 status = nfs4_proc_create_session(clp);
1176 if (status) 1327 if (status) {
1177 nfs4_session_recovery_handle_error(clp, status); 1328 status = nfs4_recovery_handle_error(clp, status);
1178 /* fall through*/ 1329 goto out;
1330 }
1331 /* create_session negotiated new slot table */
1332 clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
1333
1334 /* Let the state manager reestablish state */
1335 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
1336 nfs41_setup_state_renewal(clp);
1179out: 1337out:
1180 /* Wake up the next rpc task even on error */
1181 rpc_wake_up_next(&clp->cl_session->fc_slot_table.slot_tbl_waitq);
1182 return status; 1338 return status;
1183} 1339}
1184 1340
1185static int nfs4_initialize_session(struct nfs_client *clp) 1341static int nfs4_recall_slot(struct nfs_client *clp)
1186{ 1342{
1187 int status; 1343 struct nfs4_slot_table *fc_tbl = &clp->cl_session->fc_slot_table;
1344 struct nfs4_channel_attrs *fc_attrs = &clp->cl_session->fc_attrs;
1345 struct nfs4_slot *new, *old;
1346 int i;
1347
1348 nfs4_begin_drain_session(clp);
1349 new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot),
1350 GFP_KERNEL);
1351 if (!new)
1352 return -ENOMEM;
1188 1353
1189 status = nfs4_proc_create_session(clp, 0); 1354 spin_lock(&fc_tbl->slot_tbl_lock);
1190 if (!status) { 1355 for (i = 0; i < fc_tbl->target_max_slots; i++)
1191 nfs_mark_client_ready(clp, NFS_CS_READY); 1356 new[i].seq_nr = fc_tbl->slots[i].seq_nr;
1192 } else if (status == -NFS4ERR_STALE_CLIENTID) { 1357 old = fc_tbl->slots;
1193 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 1358 fc_tbl->slots = new;
1194 set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state); 1359 fc_tbl->max_slots = fc_tbl->target_max_slots;
1195 } else { 1360 fc_tbl->target_max_slots = 0;
1196 nfs_mark_client_ready(clp, status); 1361 fc_attrs->max_reqs = fc_tbl->max_slots;
1197 } 1362 spin_unlock(&fc_tbl->slot_tbl_lock);
1198 return status; 1363
1364 kfree(old);
1365 nfs4_end_drain_session(clp);
1366 return 0;
1199} 1367}
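
nfs4_recall_slot() above shrinks the fore-channel table to target_max_slots by copying the surviving per-slot sequence numbers into a smaller array, swapping pointers under the slot-table lock, and only then freeing the old array. A sketch of that copy-and-swap, with the locking reduced to comments:

#include <stdlib.h>

struct slot { unsigned seq_nr; };

struct slot_table {
        struct slot *slots;
        unsigned max_slots;
        unsigned target_max_slots;
};

static int recall_slots(struct slot_table *tbl)
{
        struct slot *new, *old;
        unsigned i;

        new = malloc(tbl->target_max_slots * sizeof(*new));
        if (new == NULL)
                return -1;              /* -ENOMEM in the kernel */

        /* slot_tbl_lock would be taken here */
        for (i = 0; i < tbl->target_max_slots; i++)
                new[i].seq_nr = tbl->slots[i].seq_nr;
        old = tbl->slots;
        tbl->slots = new;
        tbl->max_slots = tbl->target_max_slots;
        tbl->target_max_slots = 0;
        /* slot_tbl_lock would be dropped here */

        free(old);                      /* old array freed outside the lock */
        return 0;
}

int main(void)
{
        struct slot_table tbl = { .max_slots = 8, .target_max_slots = 4 };
        unsigned i;

        tbl.slots = calloc(tbl.max_slots, sizeof(*tbl.slots));
        if (tbl.slots == NULL)
                return 1;
        for (i = 0; i < tbl.max_slots; i++)
                tbl.slots[i].seq_nr = i + 1;

        recall_slots(&tbl);             /* shrink 8 -> 4, seq_nrs preserved */
        free(tbl.slots);
        return 0;
}
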
1368
1200#else /* CONFIG_NFS_V4_1 */ 1369#else /* CONFIG_NFS_V4_1 */
1201static int nfs4_reset_session(struct nfs_client *clp) { return 0; } 1370static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
1202static int nfs4_initialize_session(struct nfs_client *clp) { return 0; } 1371static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; }
1372static int nfs4_recall_slot(struct nfs_client *clp) { return 0; }
1203#endif /* CONFIG_NFS_V4_1 */ 1373#endif /* CONFIG_NFS_V4_1 */
1204 1374
1205/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors 1375/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors
@@ -1212,6 +1382,7 @@ static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
1212 case -NFS4ERR_DELAY: 1382 case -NFS4ERR_DELAY:
1213 case -NFS4ERR_CLID_INUSE: 1383 case -NFS4ERR_CLID_INUSE:
1214 case -EAGAIN: 1384 case -EAGAIN:
1385 case -EKEYEXPIRED:
1215 break; 1386 break;
1216 1387
1217 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery 1388 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
@@ -1234,7 +1405,8 @@ static void nfs4_state_manager(struct nfs_client *clp)
1234 status = nfs4_reclaim_lease(clp); 1405 status = nfs4_reclaim_lease(clp);
1235 if (status) { 1406 if (status) {
1236 nfs4_set_lease_expired(clp, status); 1407 nfs4_set_lease_expired(clp, status);
1237 if (status == -EAGAIN) 1408 if (test_bit(NFS4CLNT_LEASE_EXPIRED,
1409 &clp->cl_state))
1238 continue; 1410 continue;
1239 if (clp->cl_cons_state == 1411 if (clp->cl_cons_state ==
1240 NFS_CS_SESSION_INITING) 1412 NFS_CS_SESSION_INITING)
@@ -1242,61 +1414,67 @@ static void nfs4_state_manager(struct nfs_client *clp)
1242 goto out_error; 1414 goto out_error;
1243 } 1415 }
1244 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); 1416 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
1417 set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
1245 } 1418 }
1246 1419
1247 if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { 1420 if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
1248 status = nfs4_check_lease(clp); 1421 status = nfs4_check_lease(clp);
1249 if (status != 0) 1422 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
1250 continue; 1423 continue;
1424 if (status < 0 && status != -NFS4ERR_CB_PATH_DOWN)
1425 goto out_error;
1251 } 1426 }
1427
1252 /* Initialize or reset the session */ 1428 /* Initialize or reset the session */
1253 if (test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state) 1429 if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)
1254 && nfs4_has_session(clp)) { 1430 && nfs4_has_session(clp)) {
1255 if (clp->cl_cons_state == NFS_CS_SESSION_INITING) 1431 status = nfs4_reset_session(clp);
1256 status = nfs4_initialize_session(clp); 1432 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
1257 else 1433 continue;
1258 status = nfs4_reset_session(clp); 1434 if (status < 0)
1259 if (status) {
1260 if (status == -NFS4ERR_STALE_CLIENTID)
1261 continue;
1262 goto out_error; 1435 goto out_error;
1263 }
1264 } 1436 }
1437
1265 /* First recover reboot state... */ 1438 /* First recover reboot state... */
1266 if (test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { 1439 if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
1267 status = nfs4_do_reclaim(clp, 1440 status = nfs4_do_reclaim(clp,
1268 nfs4_reboot_recovery_ops[clp->cl_minorversion]); 1441 nfs4_reboot_recovery_ops[clp->cl_minorversion]);
1269 if (status == -NFS4ERR_STALE_CLIENTID) 1442 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
1270 continue; 1443 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
1271 if (test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state))
1272 continue; 1444 continue;
1273 nfs4_state_end_reclaim_reboot(clp); 1445 nfs4_state_end_reclaim_reboot(clp);
1274 continue; 1446 if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state))
1447 continue;
1448 if (status < 0)
1449 goto out_error;
1275 } 1450 }
1276 1451
1277 /* Now recover expired state... */ 1452 /* Now recover expired state... */
1278 if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { 1453 if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
1279 status = nfs4_do_reclaim(clp, 1454 status = nfs4_do_reclaim(clp,
1280 nfs4_nograce_recovery_ops[clp->cl_minorversion]); 1455 nfs4_nograce_recovery_ops[clp->cl_minorversion]);
1281 if (status < 0) { 1456 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
1282 set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); 1457 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) ||
1283 if (status == -NFS4ERR_STALE_CLIENTID) 1458 test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
1284 continue; 1459 continue;
1285 if (status == -NFS4ERR_EXPIRED) 1460 if (status < 0)
1286 continue;
1287 if (test_bit(NFS4CLNT_SESSION_SETUP,
1288 &clp->cl_state))
1289 continue;
1290 goto out_error; 1461 goto out_error;
1291 } else
1292 nfs4_state_end_reclaim_nograce(clp);
1293 continue;
1294 } 1462 }
1295 1463
1464 nfs4_end_drain_session(clp);
1296 if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { 1465 if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
1297 nfs_client_return_marked_delegations(clp); 1466 nfs_client_return_marked_delegations(clp);
1298 continue; 1467 continue;
1299 } 1468 }
1469 /* Recall session slots */
1470 if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state)
1471 && nfs4_has_session(clp)) {
1472 status = nfs4_recall_slot(clp);
1473 if (status < 0)
1474 goto out_error;
1475 continue;
1476 }
1477
1300 1478
1301 nfs4_clear_state_manager_bit(clp); 1479 nfs4_clear_state_manager_bit(clp);
1302 /* Did we race with an attempt to give us more work? */ 1480 /* Did we race with an attempt to give us more work? */
@@ -1309,8 +1487,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
1309out_error: 1487out_error:
1310 printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s" 1488 printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s"
1311 " with error %d\n", clp->cl_hostname, -status); 1489 " with error %d\n", clp->cl_hostname, -status);
1312 if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) 1490 nfs4_end_drain_session(clp);
1313 nfs4_state_end_reclaim_reboot(clp);
1314 nfs4_clear_state_manager_bit(clp); 1491 nfs4_clear_state_manager_bit(clp);
1315} 1492}
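
The reworked state manager above is a flag-driven loop: each pass handles at most one class of work, in strict order (lease, then session, then reboot reclaim, then nograce reclaim, then delegations and slot recall), and any step that raises an earlier flag simply lets the loop restart. A toy model of that ordering, with invented flag names rather than the real NFS4CLNT_* bits:

#include <stdio.h>

enum {                                  /* invented flag names */
        LEASE_EXPIRED  = 1 << 0,
        SESSION_RESET  = 1 << 1,
        RECLAIM_REBOOT = 1 << 2,
        RECALL_SLOT    = 1 << 3,
};

static int test_and_clear(unsigned *state, unsigned bit)
{
        int was_set = (*state & bit) != 0;

        *state &= ~bit;
        return was_set;
}

int main(void)
{
        unsigned state = SESSION_RESET | RECLAIM_REBOOT;

        while (state != 0) {
                if (test_and_clear(&state, LEASE_EXPIRED)) {
                        puts("reclaim lease");
                        state |= RECLAIM_REBOOT;   /* reboot reclaim follows */
                        continue;                  /* restart from the top */
                }
                if (test_and_clear(&state, SESSION_RESET)) {
                        puts("reset session");
                        continue;
                }
                if (test_and_clear(&state, RECLAIM_REBOOT)) {
                        puts("reclaim reboot state");
                        continue;
                }
                if (test_and_clear(&state, RECALL_SLOT)) {
                        puts("shrink slot table");
                        continue;
                }
        }
        return 0;
}
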
1316 1493
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 20b4e30e6c82..38f3b582e7c2 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -38,7 +38,6 @@
38#include <linux/param.h> 38#include <linux/param.h>
39#include <linux/time.h> 39#include <linux/time.h>
40#include <linux/mm.h> 40#include <linux/mm.h>
41#include <linux/slab.h>
42#include <linux/errno.h> 41#include <linux/errno.h>
43#include <linux/string.h> 42#include <linux/string.h>
44#include <linux/in.h> 43#include <linux/in.h>
@@ -46,11 +45,13 @@
46#include <linux/proc_fs.h> 45#include <linux/proc_fs.h>
47#include <linux/kdev_t.h> 46#include <linux/kdev_t.h>
48#include <linux/sunrpc/clnt.h> 47#include <linux/sunrpc/clnt.h>
48#include <linux/sunrpc/msg_prot.h>
49#include <linux/nfs.h> 49#include <linux/nfs.h>
50#include <linux/nfs4.h> 50#include <linux/nfs4.h>
51#include <linux/nfs_fs.h> 51#include <linux/nfs_fs.h>
52#include <linux/nfs_idmap.h> 52#include <linux/nfs_idmap.h>
53#include "nfs4_fs.h" 53#include "nfs4_fs.h"
54#include "internal.h"
54 55
55#define NFSDBG_FACILITY NFSDBG_XDR 56#define NFSDBG_FACILITY NFSDBG_XDR
56 57
@@ -134,7 +135,7 @@ static int nfs4_stat_to_errno(int);
134#define decode_lookup_maxsz (op_decode_hdr_maxsz) 135#define decode_lookup_maxsz (op_decode_hdr_maxsz)
135#define encode_share_access_maxsz \ 136#define encode_share_access_maxsz \
136 (2) 137 (2)
137#define encode_createmode_maxsz (1 + encode_attrs_maxsz) 138#define encode_createmode_maxsz (1 + encode_attrs_maxsz + encode_verifier_maxsz)
138#define encode_opentype_maxsz (1 + encode_createmode_maxsz) 139#define encode_opentype_maxsz (1 + encode_createmode_maxsz)
139#define encode_claim_null_maxsz (1 + nfs4_name_maxsz) 140#define encode_claim_null_maxsz (1 + nfs4_name_maxsz)
140#define encode_open_maxsz (op_encode_hdr_maxsz + \ 141#define encode_open_maxsz (op_encode_hdr_maxsz + \
@@ -299,6 +300,8 @@ static int nfs4_stat_to_errno(int);
299 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4) 300 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4)
300#define decode_sequence_maxsz (op_decode_hdr_maxsz + \ 301#define decode_sequence_maxsz (op_decode_hdr_maxsz + \
301 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) 302 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
303#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
304#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
302#else /* CONFIG_NFS_V4_1 */ 305#else /* CONFIG_NFS_V4_1 */
303#define encode_sequence_maxsz 0 306#define encode_sequence_maxsz 0
304#define decode_sequence_maxsz 0 307#define decode_sequence_maxsz 0
@@ -676,6 +679,25 @@ static int nfs4_stat_to_errno(int);
676 decode_sequence_maxsz + \ 679 decode_sequence_maxsz + \
677 decode_putrootfh_maxsz + \ 680 decode_putrootfh_maxsz + \
678 decode_fsinfo_maxsz) 681 decode_fsinfo_maxsz)
682#define NFS4_enc_reclaim_complete_sz (compound_encode_hdr_maxsz + \
683 encode_sequence_maxsz + \
684 encode_reclaim_complete_maxsz)
685#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
686 decode_sequence_maxsz + \
687 decode_reclaim_complete_maxsz)
688
689const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
690 compound_encode_hdr_maxsz +
691 encode_sequence_maxsz +
692 encode_putfh_maxsz +
693 encode_getattr_maxsz) *
694 XDR_UNIT);
695
696const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
697 compound_decode_hdr_maxsz +
698 decode_sequence_maxsz +
699 decode_putfh_maxsz) *
700 XDR_UNIT);
679#endif /* CONFIG_NFS_V4_1 */ 701#endif /* CONFIG_NFS_V4_1 */
680 702
681static const umode_t nfs_type2fmt[] = { 703static const umode_t nfs_type2fmt[] = {
@@ -1140,6 +1162,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
1140static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) 1162static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
1141{ 1163{
1142 __be32 *p; 1164 __be32 *p;
1165 struct nfs_client *clp;
1143 1166
1144 p = reserve_space(xdr, 4); 1167 p = reserve_space(xdr, 4);
1145 switch(arg->open_flags & O_EXCL) { 1168 switch(arg->open_flags & O_EXCL) {
@@ -1148,8 +1171,23 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
1148 encode_attrs(xdr, arg->u.attrs, arg->server); 1171 encode_attrs(xdr, arg->u.attrs, arg->server);
1149 break; 1172 break;
1150 default: 1173 default:
1151 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); 1174 clp = arg->server->nfs_client;
1152 encode_nfs4_verifier(xdr, &arg->u.verifier); 1175 if (clp->cl_minorversion > 0) {
1176 if (nfs4_has_persistent_session(clp)) {
1177 *p = cpu_to_be32(NFS4_CREATE_GUARDED);
1178 encode_attrs(xdr, arg->u.attrs, arg->server);
1179 } else {
1180 struct iattr dummy;
1181
1182 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
1183 encode_nfs4_verifier(xdr, &arg->u.verifier);
1184 dummy.ia_valid = 0;
1185 encode_attrs(xdr, &dummy, arg->server);
1186 }
1187 } else {
1188 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
1189 encode_nfs4_verifier(xdr, &arg->u.verifier);
1190 }
1153 } 1191 }
1154} 1192}
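
encode_createmode() now keys off the minor version: v4.0 keeps EXCLUSIVE4, while v4.1 uses GUARDED4 when the session is persistent (the reply cache survives, so exclusive-create verifiers are unnecessary) and otherwise EXCLUSIVE4_1, which carries a verifier plus a possibly empty attribute list. A sketch of just the selection logic, using the RFC 5661 createmode4 values:

#include <stdio.h>

/* createmode4 values from RFC 5661 */
enum createmode {
        UNCHECKED4   = 0,
        GUARDED4     = 1,
        EXCLUSIVE4   = 2,
        EXCLUSIVE4_1 = 3,
};

static enum createmode choose_createmode(unsigned minorversion,
                                         int persistent_session)
{
        if (minorversion == 0)
                return EXCLUSIVE4;      /* verifier only, no attributes */
        if (persistent_session)
                return GUARDED4;        /* reply cache survives reboots */
        return EXCLUSIVE4_1;            /* verifier plus attribute list */
}

int main(void)
{
        printf("v4.0:             %d\n", choose_createmode(0, 0));
        printf("v4.1, persistent: %d\n", choose_createmode(1, 1));
        printf("v4.1, volatile:   %d\n", choose_createmode(1, 0));
        return 0;
}
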
1155 1193
@@ -1539,6 +1577,14 @@ static void encode_create_session(struct xdr_stream *xdr,
1539 char machine_name[NFS4_MAX_MACHINE_NAME_LEN]; 1577 char machine_name[NFS4_MAX_MACHINE_NAME_LEN];
1540 uint32_t len; 1578 uint32_t len;
1541 struct nfs_client *clp = args->client; 1579 struct nfs_client *clp = args->client;
1580 u32 max_resp_sz_cached;
1581
1582 /*
1583 * Assumes OPEN is the biggest non-idempotent compound.
1584 * 2 is the verifier.
1585 */
1586 max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE +
1587 RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT;
1542 1588
1543 len = scnprintf(machine_name, sizeof(machine_name), "%s", 1589 len = scnprintf(machine_name, sizeof(machine_name), "%s",
1544 clp->cl_ipaddr); 1590 clp->cl_ipaddr);
@@ -1553,7 +1599,7 @@ static void encode_create_session(struct xdr_stream *xdr,
1553 *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ 1599 *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */
1554 *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */ 1600 *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */
1555 *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */ 1601 *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */
1556 *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ 1602 *p++ = cpu_to_be32(max_resp_sz_cached); /* Max resp sz cached */
1557 *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */ 1603 *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */
1558 *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */ 1604 *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */
1559 *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ 1605 *p++ = cpu_to_be32(0); /* rdmachannel_attrs */
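
The max_resp_sz_cached computed above bounds what the server must keep in its reply cache: only non-idempotent compounds need caching, OPEN is assumed to be the largest of those, and the 2 accounts for the verifier. A sketch of the formula with stand-in sizes (the real kernel constants differ):

#include <stdio.h>

#define XDR_UNIT          4     /* bytes per XDR word */
#define RPC_REPHDRSIZE    7     /* invented stand-in */
#define RPC_MAX_AUTH_SIZE 100   /* invented stand-in */
#define NFS4_DEC_OPEN_SZ  300   /* invented stand-in for NFS4_dec_open_sz */

int main(void)
{
        unsigned max_resp_sz_cached =
                (NFS4_DEC_OPEN_SZ + RPC_REPHDRSIZE +
                 RPC_MAX_AUTH_SIZE + 2 /* verifier */) * XDR_UNIT;

        printf("max_resp_sz_cached = %u bytes\n", max_resp_sz_cached);
        return 0;
}
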
@@ -1592,6 +1638,19 @@ static void encode_destroy_session(struct xdr_stream *xdr,
1592 hdr->nops++; 1638 hdr->nops++;
1593 hdr->replen += decode_destroy_session_maxsz; 1639 hdr->replen += decode_destroy_session_maxsz;
1594} 1640}
1641
1642static void encode_reclaim_complete(struct xdr_stream *xdr,
1643 struct nfs41_reclaim_complete_args *args,
1644 struct compound_hdr *hdr)
1645{
1646 __be32 *p;
1647
1648 p = reserve_space(xdr, 8);
1649 *p++ = cpu_to_be32(OP_RECLAIM_COMPLETE);
1650 *p++ = cpu_to_be32(args->one_fs);
1651 hdr->nops++;
1652 hdr->replen += decode_reclaim_complete_maxsz;
1653}
1595#endif /* CONFIG_NFS_V4_1 */ 1654#endif /* CONFIG_NFS_V4_1 */
1596 1655
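[Editor's note: encode_reclaim_complete() above follows the usual XDR pattern: reserve 8 bytes, then emit two big-endian words, the opcode and the rca_one_fs flag. A self-contained user-space sketch of just that wire layout; OP_RECLAIM_COMPLETE is 58 per RFC 5661, and plain buffer handling replaces the kernel's xdr_stream.]

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

#define OP_RECLAIM_COMPLETE 58	/* RFC 5661 operation number */

/* Emit the operation as two big-endian 32-bit words, mirroring
 * reserve_space(xdr, 8) followed by two cpu_to_be32() stores. */
static size_t encode_reclaim_complete(uint8_t *buf, uint32_t one_fs)
{
	uint32_t words[2] = { htonl(OP_RECLAIM_COMPLETE), htonl(one_fs) };

	memcpy(buf, words, sizeof(words));
	return sizeof(words);
}

int main(void)
{
	uint8_t buf[8];
	size_t i, n = encode_reclaim_complete(buf, 0);

	for (i = 0; i < n; i++)
		printf("%02x", buf[i]);
	printf("\n");	/* prints 0000003a00000000 */
	return 0;
}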
1597static void encode_sequence(struct xdr_stream *xdr, 1656static void encode_sequence(struct xdr_stream *xdr,
@@ -2096,7 +2155,7 @@ nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
2096 encode_compound_hdr(&xdr, req, &hdr); 2155 encode_compound_hdr(&xdr, req, &hdr);
2097 encode_sequence(&xdr, &args->seq_args, &hdr); 2156 encode_sequence(&xdr, &args->seq_args, &hdr);
2098 encode_putfh(&xdr, args->fh, &hdr); 2157 encode_putfh(&xdr, args->fh, &hdr);
2099 replen = hdr.replen + nfs4_fattr_bitmap_maxsz + 1; 2158 replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1;
2100 encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr); 2159 encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr);
2101 2160
2102 xdr_inline_pages(&req->rq_rcv_buf, replen << 2, 2161 xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
@@ -2420,6 +2479,26 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
2420 encode_nops(&hdr); 2479 encode_nops(&hdr);
2421 return 0; 2480 return 0;
2422} 2481}
2482
2483/*
2484 * a RECLAIM_COMPLETE request
2485 */
2486static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p,
2487 struct nfs41_reclaim_complete_args *args)
2488{
2489 struct xdr_stream xdr;
2490 struct compound_hdr hdr = {
2491 .minorversion = nfs4_xdr_minorversion(&args->seq_args)
2492 };
2493
2494 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2495 encode_compound_hdr(&xdr, req, &hdr);
2496 encode_sequence(&xdr, &args->seq_args, &hdr);
2497 encode_reclaim_complete(&xdr, args, &hdr);
2498 encode_nops(&hdr);
2499 return 0;
2500}
2501
2423#endif /* CONFIG_NFS_V4_1 */ 2502#endif /* CONFIG_NFS_V4_1 */
2424 2503
2425static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) 2504static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -4528,6 +4607,11 @@ static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
4528{ 4607{
4529 return decode_op_hdr(xdr, OP_DESTROY_SESSION); 4608 return decode_op_hdr(xdr, OP_DESTROY_SESSION);
4530} 4609}
4610
4611static int decode_reclaim_complete(struct xdr_stream *xdr, void *dummy)
4612{
4613 return decode_op_hdr(xdr, OP_RECLAIM_COMPLETE);
4614}
4531#endif /* CONFIG_NFS_V4_1 */ 4615#endif /* CONFIG_NFS_V4_1 */
4532 4616
4533static int decode_sequence(struct xdr_stream *xdr, 4617static int decode_sequence(struct xdr_stream *xdr,
@@ -4554,7 +4638,7 @@ static int decode_sequence(struct xdr_stream *xdr,
4554 * If the server returns different values for sessionID, slotID or 4638 * If the server returns different values for sessionID, slotID or
4555 * sequence number, the server is looney tunes. 4639 * sequence number, the server is looney tunes.
4556 */ 4640 */
4557 status = -ESERVERFAULT; 4641 status = -EREMOTEIO;
4558 4642
4559 if (memcmp(id.data, res->sr_session->sess_id.data, 4643 if (memcmp(id.data, res->sr_session->sess_id.data,
4560 NFS4_MAX_SESSIONID_LEN)) { 4644 NFS4_MAX_SESSIONID_LEN)) {
@@ -4583,8 +4667,8 @@ static int decode_sequence(struct xdr_stream *xdr,
4583 dummy = be32_to_cpup(p++); 4667 dummy = be32_to_cpup(p++);
4584 /* target highest slot id - currently not processed */ 4668 /* target highest slot id - currently not processed */
4585 dummy = be32_to_cpup(p++); 4669 dummy = be32_to_cpup(p++);
4586 /* result flags - currently not processed */ 4670 /* result flags */
4587 dummy = be32_to_cpup(p); 4671 res->sr_status_flags = be32_to_cpup(p);
4588 status = 0; 4672 status = 0;
4589out_err: 4673out_err:
4590 res->sr_status = status; 4674 res->sr_status = status;
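[Editor's note: the checks above treat any mismatch between the reply's session id, slot id or sequence number and the client's slot table as a server bug, now surfaced as -EREMOTEIO instead of -ESERVERFAULT. A user-space sketch of the session-id comparison; 16 is NFS4_MAX_SESSIONID_LEN.]

#include <stdio.h>
#include <string.h>
#include <errno.h>

#define SESSIONID_LEN 16	/* NFS4_MAX_SESSIONID_LEN */

static int check_sequence_reply(const unsigned char *got,
				const unsigned char *want)
{
	if (memcmp(got, want, SESSIONID_LEN))
		return -EREMOTEIO;	/* "the server is looney tunes" */
	return 0;
}

int main(void)
{
	unsigned char a[SESSIONID_LEN] = { 1 }, b[SESSIONID_LEN] = { 2 };

	printf("match=%d mismatch=%d\n",
	       check_sequence_reply(a, a), check_sequence_reply(a, b));
	return 0;
}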
@@ -5309,7 +5393,7 @@ out:
5309} 5393}
5310 5394
5311/* 5395/*
5312 * FSINFO request 5396 * Decode FSINFO response
5313 */ 5397 */
5314static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, 5398static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p,
5315 struct nfs4_fsinfo_res *res) 5399 struct nfs4_fsinfo_res *res)
@@ -5330,7 +5414,7 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p,
5330} 5414}
5331 5415
5332/* 5416/*
5333 * PATHCONF request 5417 * Decode PATHCONF response
5334 */ 5418 */
5335static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, 5419static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p,
5336 struct nfs4_pathconf_res *res) 5420 struct nfs4_pathconf_res *res)
@@ -5351,7 +5435,7 @@ static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p,
5351} 5435}
5352 5436
5353/* 5437/*
5354 * STATFS request 5438 * Decode STATFS response
5355 */ 5439 */
5356static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, 5440static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p,
5357 struct nfs4_statfs_res *res) 5441 struct nfs4_statfs_res *res)
@@ -5372,7 +5456,7 @@ static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p,
5372} 5456}
5373 5457
5374/* 5458/*
5375 * GETATTR_BITMAP request 5459 * Decode GETATTR_BITMAP response
5376 */ 5460 */
5377static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res) 5461static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res)
5378{ 5462{
@@ -5411,7 +5495,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
5411} 5495}
5412 5496
5413/* 5497/*
5414 * a SETCLIENTID request 5498 * Decode SETCLIENTID response
5415 */ 5499 */
5416static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, 5500static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
5417 struct nfs_client *clp) 5501 struct nfs_client *clp)
@@ -5428,7 +5512,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
5428} 5512}
5429 5513
5430/* 5514/*
5431 * a SETCLIENTID_CONFIRM request 5515 * Decode SETCLIENTID_CONFIRM response
5432 */ 5516 */
5433static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo) 5517static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo)
5434{ 5518{
@@ -5448,7 +5532,7 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
5448} 5532}
5449 5533
5450/* 5534/*
5451 * DELEGRETURN request 5535 * Decode DELEGRETURN response
5452 */ 5536 */
5453static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res) 5537static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res)
5454{ 5538{
@@ -5467,6 +5551,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
5467 if (status != 0) 5551 if (status != 0)
5468 goto out; 5552 goto out;
5469 status = decode_delegreturn(&xdr); 5553 status = decode_delegreturn(&xdr);
5554 if (status != 0)
5555 goto out;
5470 decode_getfattr(&xdr, res->fattr, res->server, 5556 decode_getfattr(&xdr, res->fattr, res->server,
5471 !RPC_IS_ASYNC(rqstp->rq_task)); 5557 !RPC_IS_ASYNC(rqstp->rq_task));
5472out: 5558out:
@@ -5474,7 +5560,7 @@ out:
5474} 5560}
5475 5561
5476/* 5562/*
5477 * FS_LOCATIONS request 5563 * Decode FS_LOCATIONS response
5478 */ 5564 */
5479static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, 5565static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p,
5480 struct nfs4_fs_locations_res *res) 5566 struct nfs4_fs_locations_res *res)
@@ -5504,7 +5590,7 @@ out:
5504 5590
5505#if defined(CONFIG_NFS_V4_1) 5591#if defined(CONFIG_NFS_V4_1)
5506/* 5592/*
5507 * EXCHANGE_ID request 5593 * Decode EXCHANGE_ID response
5508 */ 5594 */
5509static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p, 5595static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p,
5510 void *res) 5596 void *res)
@@ -5521,7 +5607,7 @@ static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p,
5521} 5607}
5522 5608
5523/* 5609/*
5524 * a CREATE_SESSION request 5610 * Decode CREATE_SESSION response
5525 */ 5611 */
5526static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p, 5612static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p,
5527 struct nfs41_create_session_res *res) 5613 struct nfs41_create_session_res *res)
@@ -5538,7 +5624,7 @@ static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p,
5538} 5624}
5539 5625
5540/* 5626/*
5541 * a DESTROY_SESSION request 5627 * Decode DESTROY_SESSION response
5542 */ 5628 */
5543static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p, 5629static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p,
5544 void *dummy) 5630 void *dummy)
@@ -5555,7 +5641,7 @@ static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p,
5555} 5641}
5556 5642
5557/* 5643/*
5558 * a SEQUENCE request 5644 * Decode SEQUENCE response
5559 */ 5645 */
5560static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p, 5646static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p,
5561 struct nfs4_sequence_res *res) 5647 struct nfs4_sequence_res *res)
@@ -5572,7 +5658,7 @@ static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p,
5572} 5658}
5573 5659
5574/* 5660/*
5575 * a GET_LEASE_TIME request 5661 * Decode GET_LEASE_TIME response
5576 */ 5662 */
5577static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p, 5663static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p,
5578 struct nfs4_get_lease_time_res *res) 5664 struct nfs4_get_lease_time_res *res)
@@ -5591,6 +5677,25 @@ static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p,
5591 status = decode_fsinfo(&xdr, res->lr_fsinfo); 5677 status = decode_fsinfo(&xdr, res->lr_fsinfo);
5592 return status; 5678 return status;
5593} 5679}
5680
5681/*
5682 * Decode RECLAIM_COMPLETE response
5683 */
5684static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p,
5685 struct nfs41_reclaim_complete_res *res)
5686{
5687 struct xdr_stream xdr;
5688 struct compound_hdr hdr;
5689 int status;
5690
5691 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
5692 status = decode_compound_hdr(&xdr, &hdr);
5693 if (!status)
5694 status = decode_sequence(&xdr, &res->seq_res, rqstp);
5695 if (!status)
5696 status = decode_reclaim_complete(&xdr, (void *)NULL);
5697 return status;
5698}
5594#endif /* CONFIG_NFS_V4_1 */ 5699#endif /* CONFIG_NFS_V4_1 */
5595 5700
5596__be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) 5701__be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
@@ -5678,7 +5783,7 @@ static struct {
5678 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, 5783 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
5679 { NFS4ERR_NOTSUPP, -ENOTSUPP }, 5784 { NFS4ERR_NOTSUPP, -ENOTSUPP },
5680 { NFS4ERR_TOOSMALL, -ETOOSMALL }, 5785 { NFS4ERR_TOOSMALL, -ETOOSMALL },
5681 { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, 5786 { NFS4ERR_SERVERFAULT, -EREMOTEIO },
5682 { NFS4ERR_BADTYPE, -EBADTYPE }, 5787 { NFS4ERR_BADTYPE, -EBADTYPE },
5683 { NFS4ERR_LOCKED, -EAGAIN }, 5788 { NFS4ERR_LOCKED, -EAGAIN },
5684 { NFS4ERR_SYMLINK, -ELOOP }, 5789 { NFS4ERR_SYMLINK, -ELOOP },
@@ -5705,7 +5810,7 @@ nfs4_stat_to_errno(int stat)
5705 } 5810 }
5706 if (stat <= 10000 || stat > 10100) { 5811 if (stat <= 10000 || stat > 10100) {
5707 /* The server is looney tunes. */ 5812 /* The server is looney tunes. */
5708 return -ESERVERFAULT; 5813 return -EREMOTEIO;
5709 } 5814 }
5710 /* If we cannot translate the error, the recovery routines should 5815 /* If we cannot translate the error, the recovery routines should
5711 * handle it. 5816 * handle it.
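[Editor's note: abridged sketch of the fallback logic above. In the kernel the translation table is consulted first, so small status values never reach this point; only the range check and the two fallbacks are reproduced here.]

#include <stdio.h>
#include <errno.h>

static int nfs4_stat_to_errno_fallback(int stat)
{
	if (stat <= 10000 || stat > 10100)
		return -EREMOTEIO;	/* out-of-range: broken server */
	return -stat;			/* in-range but untranslated: pass
					 * through for the recovery code */
}

int main(void)
{
	printf("%d %d\n", nfs4_stat_to_errno_fallback(99999),
	       nfs4_stat_to_errno_fallback(10050));
	return 0;
}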
@@ -5767,6 +5872,7 @@ struct rpc_procinfo nfs4_procedures[] = {
5767 PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), 5872 PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session),
5768 PROC(SEQUENCE, enc_sequence, dec_sequence), 5873 PROC(SEQUENCE, enc_sequence, dec_sequence),
5769 PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), 5874 PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time),
5875 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete),
5770#endif /* CONFIG_NFS_V4_1 */ 5876#endif /* CONFIG_NFS_V4_1 */
5771}; 5877};
5772 5878
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index e2975939126a..29d9d36cd5f4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -112,12 +112,10 @@ void nfs_unlock_request(struct nfs_page *req)
112 */ 112 */
113int nfs_set_page_tag_locked(struct nfs_page *req) 113int nfs_set_page_tag_locked(struct nfs_page *req)
114{ 114{
115 struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
116
117 if (!nfs_lock_request_dontget(req)) 115 if (!nfs_lock_request_dontget(req))
118 return 0; 116 return 0;
119 if (req->wb_page != NULL) 117 if (req->wb_page != NULL)
120 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); 118 radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
121 return 1; 119 return 1;
122} 120}
123 121
@@ -126,10 +124,10 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
126 */ 124 */
127void nfs_clear_page_tag_locked(struct nfs_page *req) 125void nfs_clear_page_tag_locked(struct nfs_page *req)
128{ 126{
129 struct inode *inode = req->wb_context->path.dentry->d_inode;
130 struct nfs_inode *nfsi = NFS_I(inode);
131
132 if (req->wb_page != NULL) { 127 if (req->wb_page != NULL) {
128 struct inode *inode = req->wb_context->path.dentry->d_inode;
129 struct nfs_inode *nfsi = NFS_I(inode);
130
133 spin_lock(&inode->i_lock); 131 spin_lock(&inode->i_lock);
134 radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); 132 radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
135 nfs_unlock_request(req); 133 nfs_unlock_request(req);
@@ -142,16 +140,22 @@ void nfs_clear_page_tag_locked(struct nfs_page *req)
142 * nfs_clear_request - Free up all resources allocated to the request 140 * nfs_clear_request - Free up all resources allocated to the request
143 * @req: 141 * @req:
144 * 142 *
145 * Release page resources associated with a write request after it 143 * Release page and open context resources associated with a read/write
146 * has completed. 144 * request after it has completed.
147 */ 145 */
148void nfs_clear_request(struct nfs_page *req) 146void nfs_clear_request(struct nfs_page *req)
149{ 147{
150 struct page *page = req->wb_page; 148 struct page *page = req->wb_page;
149 struct nfs_open_context *ctx = req->wb_context;
150
151 if (page != NULL) { 151 if (page != NULL) {
152 page_cache_release(page); 152 page_cache_release(page);
153 req->wb_page = NULL; 153 req->wb_page = NULL;
154 } 154 }
155 if (ctx != NULL) {
156 put_nfs_open_context(ctx);
157 req->wb_context = NULL;
158 }
155} 159}
156 160
157 161
@@ -165,9 +169,8 @@ static void nfs_free_request(struct kref *kref)
165{ 169{
166 struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); 170 struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
167 171
168 /* Release struct file or cached credential */ 172 /* Release struct file and open context */
169 nfs_clear_request(req); 173 nfs_clear_request(req);
170 put_nfs_open_context(req->wb_context);
171 nfs_page_free(req); 174 nfs_page_free(req);
172} 175}
173 176
@@ -176,6 +179,12 @@ void nfs_release_request(struct nfs_page *req)
176 kref_put(&req->wb_kref, nfs_free_request); 179 kref_put(&req->wb_kref, nfs_free_request);
177} 180}
178 181
182static int nfs_wait_bit_uninterruptible(void *word)
183{
184 io_schedule();
185 return 0;
186}
187
179/** 188/**
180 * nfs_wait_on_request - Wait for a request to complete. 189 * nfs_wait_on_request - Wait for a request to complete.
181 * @req: request to wait upon. 190 * @req: request to wait upon.
@@ -186,14 +195,9 @@ void nfs_release_request(struct nfs_page *req)
186int 195int
187nfs_wait_on_request(struct nfs_page *req) 196nfs_wait_on_request(struct nfs_page *req)
188{ 197{
189 int ret = 0; 198 return wait_on_bit(&req->wb_flags, PG_BUSY,
190 199 nfs_wait_bit_uninterruptible,
191 if (!test_bit(PG_BUSY, &req->wb_flags)) 200 TASK_UNINTERRUPTIBLE);
192 goto out;
193 ret = out_of_line_wait_on_bit(&req->wb_flags, PG_BUSY,
194 nfs_wait_bit_killable, TASK_KILLABLE);
195out:
196 return ret;
197} 201}
198 202
199/** 203/**
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ef583854d8d0..0288be80444f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -29,7 +29,6 @@
29 29
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/param.h> 31#include <linux/param.h>
32#include <linux/slab.h>
33#include <linux/time.h> 32#include <linux/time.h>
34#include <linux/mm.h> 33#include <linux/mm.h>
35#include <linux/errno.h> 34#include <linux/errno.h>
@@ -47,6 +46,39 @@
47#define NFSDBG_FACILITY NFSDBG_PROC 46#define NFSDBG_FACILITY NFSDBG_PROC
48 47
49/* 48/*
 49 * wrapper to handle the -EKEYEXPIRED error code. This should generally
50 * only happen if using krb5 auth and a user's TGT expires. NFSv2 doesn't
51 * support the NFSERR_JUKEBOX error code, but we handle this situation in the
52 * same way that we handle that error with NFSv3.
53 */
54static int
55nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
56{
57 int res;
58 do {
59 res = rpc_call_sync(clnt, msg, flags);
60 if (res != -EKEYEXPIRED)
61 break;
62 schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
63 res = -ERESTARTSYS;
64 } while (!fatal_signal_pending(current));
65 return res;
66}
67
68#define rpc_call_sync(clnt, msg, flags) nfs_rpc_wrapper(clnt, msg, flags)
69
70static int
71nfs_async_handle_expired_key(struct rpc_task *task)
72{
73 if (task->tk_status != -EKEYEXPIRED)
74 return 0;
75 task->tk_status = 0;
76 rpc_restart_call(task);
77 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
78 return 1;
79}
80
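[Editor's note: the synchronous wrapper above retries with a delay while the RPC keeps failing with -EKEYEXPIRED (e.g. an expired krb5 TGT), giving up only on a fatal signal. A user-space sketch of the same loop; fake_call() and the 5-second interval are stand-ins for rpc_call_sync() and NFS_JUKEBOX_RETRY_TIME, and the fatal-signal check is omitted.]

#include <stdio.h>
#include <errno.h>
#include <unistd.h>

#define RETRY_SECONDS 5		/* stand-in for NFS_JUKEBOX_RETRY_TIME */

/* Pretend the key becomes valid again on the third attempt. */
static int fake_call(void)
{
	static int calls;
	return ++calls < 3 ? -EKEYEXPIRED : 0;
}

static int call_with_key_retry(int (*call)(void))
{
	int res;

	for (;;) {
		res = call();
		if (res != -EKEYEXPIRED)
			break;
		sleep(RETRY_SECONDS);	/* kernel: schedule_timeout_killable() */
	}
	return res;
}

int main(void)
{
	printf("result=%d\n", call_with_key_retry(fake_call));
	return 0;
}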
81/*
50 * Bare-bones access to getattr: this is for nfs_read_super. 82 * Bare-bones access to getattr: this is for nfs_read_super.
51 */ 83 */
52static int 84static int
@@ -307,6 +339,8 @@ nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
307 339
308static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) 340static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
309{ 341{
342 if (nfs_async_handle_expired_key(task))
343 return 0;
310 nfs_mark_for_revalidate(dir); 344 nfs_mark_for_revalidate(dir);
311 return 1; 345 return 1;
312} 346}
@@ -560,6 +594,9 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
560 594
561static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) 595static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
562{ 596{
597 if (nfs_async_handle_expired_key(task))
598 return -EAGAIN;
599
563 nfs_invalidate_atime(data->inode); 600 nfs_invalidate_atime(data->inode);
564 if (task->tk_status >= 0) { 601 if (task->tk_status >= 0) {
565 nfs_refresh_inode(data->inode, data->res.fattr); 602 nfs_refresh_inode(data->inode, data->res.fattr);
@@ -579,6 +616,9 @@ static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *
579 616
580static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) 617static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
581{ 618{
619 if (nfs_async_handle_expired_key(task))
620 return -EAGAIN;
621
582 if (task->tk_status >= 0) 622 if (task->tk_status >= 0)
583 nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr); 623 nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);
584 return 0; 624 return 0;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 12c9e66d3f1d..db9b360ae19d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -356,25 +356,19 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
356 struct nfs_readres *resp = &data->res; 356 struct nfs_readres *resp = &data->res;
357 357
358 if (resp->eof || resp->count == argp->count) 358 if (resp->eof || resp->count == argp->count)
359 goto out; 359 return;
360 360
361 /* This is a short read! */ 361 /* This is a short read! */
362 nfs_inc_stats(data->inode, NFSIOS_SHORTREAD); 362 nfs_inc_stats(data->inode, NFSIOS_SHORTREAD);
363 /* Has the server at least made some progress? */ 363 /* Has the server at least made some progress? */
364 if (resp->count == 0) 364 if (resp->count == 0)
365 goto out; 365 return;
366 366
367 /* Yes, so retry the read at the end of the data */ 367 /* Yes, so retry the read at the end of the data */
368 argp->offset += resp->count; 368 argp->offset += resp->count;
369 argp->pgbase += resp->count; 369 argp->pgbase += resp->count;
370 argp->count -= resp->count; 370 argp->count -= resp->count;
371 nfs4_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); 371 nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client);
372 return;
373out:
374 nfs4_sequence_free_slot(NFS_SERVER(data->inode)->nfs_client,
375 &data->res.seq_res);
376 return;
377
378} 372}
379 373
380/* 374/*
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 90be551b80c1..e01637240eeb 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -48,6 +48,7 @@
48#include <linux/vfs.h> 48#include <linux/vfs.h>
49#include <linux/inet.h> 49#include <linux/inet.h>
50#include <linux/in6.h> 50#include <linux/in6.h>
51#include <linux/slab.h>
51#include <net/ipv6.h> 52#include <net/ipv6.h>
52#include <linux/netdevice.h> 53#include <linux/netdevice.h>
53#include <linux/nfs_xdr.h> 54#include <linux/nfs_xdr.h>
@@ -175,14 +176,16 @@ static const match_table_t nfs_mount_option_tokens = {
175}; 176};
176 177
177enum { 178enum {
178 Opt_xprt_udp, Opt_xprt_tcp, Opt_xprt_rdma, 179 Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma,
179 180
180 Opt_xprt_err 181 Opt_xprt_err
181}; 182};
182 183
183static const match_table_t nfs_xprt_protocol_tokens = { 184static const match_table_t nfs_xprt_protocol_tokens = {
184 { Opt_xprt_udp, "udp" }, 185 { Opt_xprt_udp, "udp" },
186 { Opt_xprt_udp6, "udp6" },
185 { Opt_xprt_tcp, "tcp" }, 187 { Opt_xprt_tcp, "tcp" },
188 { Opt_xprt_tcp6, "tcp6" },
186 { Opt_xprt_rdma, "rdma" }, 189 { Opt_xprt_rdma, "rdma" },
187 190
188 { Opt_xprt_err, NULL } 191 { Opt_xprt_err, NULL }
@@ -241,6 +244,7 @@ static int nfs_show_stats(struct seq_file *, struct vfsmount *);
241static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *); 244static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
242static int nfs_xdev_get_sb(struct file_system_type *fs_type, 245static int nfs_xdev_get_sb(struct file_system_type *fs_type,
243 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 246 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
247static void nfs_put_super(struct super_block *);
244static void nfs_kill_super(struct super_block *); 248static void nfs_kill_super(struct super_block *);
245static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); 249static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
246 250
@@ -264,6 +268,7 @@ static const struct super_operations nfs_sops = {
264 .alloc_inode = nfs_alloc_inode, 268 .alloc_inode = nfs_alloc_inode,
265 .destroy_inode = nfs_destroy_inode, 269 .destroy_inode = nfs_destroy_inode,
266 .write_inode = nfs_write_inode, 270 .write_inode = nfs_write_inode,
271 .put_super = nfs_put_super,
267 .statfs = nfs_statfs, 272 .statfs = nfs_statfs,
268 .clear_inode = nfs_clear_inode, 273 .clear_inode = nfs_clear_inode,
269 .umount_begin = nfs_umount_begin, 274 .umount_begin = nfs_umount_begin,
@@ -333,6 +338,7 @@ static const struct super_operations nfs4_sops = {
333 .alloc_inode = nfs_alloc_inode, 338 .alloc_inode = nfs_alloc_inode,
334 .destroy_inode = nfs_destroy_inode, 339 .destroy_inode = nfs_destroy_inode,
335 .write_inode = nfs_write_inode, 340 .write_inode = nfs_write_inode,
341 .put_super = nfs_put_super,
336 .statfs = nfs_statfs, 342 .statfs = nfs_statfs,
337 .clear_inode = nfs4_clear_inode, 343 .clear_inode = nfs4_clear_inode,
338 .umount_begin = nfs_umount_begin, 344 .umount_begin = nfs_umount_begin,
@@ -492,6 +498,45 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
492 return sec_flavours[i].str; 498 return sec_flavours[i].str;
493} 499}
494 500
501static void nfs_show_mountd_netid(struct seq_file *m, struct nfs_server *nfss,
502 int showdefaults)
503{
504 struct sockaddr *sap = (struct sockaddr *) &nfss->mountd_address;
505
506 seq_printf(m, ",mountproto=");
507 switch (sap->sa_family) {
508 case AF_INET:
509 switch (nfss->mountd_protocol) {
510 case IPPROTO_UDP:
511 seq_printf(m, RPCBIND_NETID_UDP);
512 break;
513 case IPPROTO_TCP:
514 seq_printf(m, RPCBIND_NETID_TCP);
515 break;
516 default:
517 if (showdefaults)
518 seq_printf(m, "auto");
519 }
520 break;
521 case AF_INET6:
522 switch (nfss->mountd_protocol) {
523 case IPPROTO_UDP:
524 seq_printf(m, RPCBIND_NETID_UDP6);
525 break;
526 case IPPROTO_TCP:
527 seq_printf(m, RPCBIND_NETID_TCP6);
528 break;
529 default:
530 if (showdefaults)
531 seq_printf(m, "auto");
532 }
533 break;
534 default:
535 if (showdefaults)
536 seq_printf(m, "auto");
537 }
538}
539
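[Editor's note: nfs_show_mountd_netid() above reduces to a (family, protocol) -> netid mapping; the strings are the standard rpcbind netids ("udp"/"tcp"/"udp6"/"tcp6"), with "auto" printed only when defaults are requested. A compact user-space sketch:]

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

static const char *mountd_netid(int family, int protocol)
{
	switch (family) {
	case AF_INET:
		if (protocol == IPPROTO_UDP) return "udp";
		if (protocol == IPPROTO_TCP) return "tcp";
		break;
	case AF_INET6:
		if (protocol == IPPROTO_UDP) return "udp6";
		if (protocol == IPPROTO_TCP) return "tcp6";
		break;
	}
	return "auto";	/* unknown combination; shown only with showdefaults */
}

int main(void)
{
	printf("mountproto=%s\n", mountd_netid(AF_INET6, IPPROTO_TCP));
	return 0;
}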
495static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, 540static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
496 int showdefaults) 541 int showdefaults)
497{ 542{
@@ -505,7 +550,7 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
505 } 550 }
506 case AF_INET6: { 551 case AF_INET6: {
507 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; 552 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
508 seq_printf(m, ",mountaddr=%pI6", &sin6->sin6_addr); 553 seq_printf(m, ",mountaddr=%pI6c", &sin6->sin6_addr);
509 break; 554 break;
510 } 555 }
511 default: 556 default:
@@ -518,17 +563,7 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
518 if (nfss->mountd_port || showdefaults) 563 if (nfss->mountd_port || showdefaults)
519 seq_printf(m, ",mountport=%u", nfss->mountd_port); 564 seq_printf(m, ",mountport=%u", nfss->mountd_port);
520 565
521 switch (nfss->mountd_protocol) { 566 nfs_show_mountd_netid(m, nfss, showdefaults);
522 case IPPROTO_UDP:
523 seq_printf(m, ",mountproto=udp");
524 break;
525 case IPPROTO_TCP:
526 seq_printf(m, ",mountproto=tcp");
527 break;
528 default:
529 if (showdefaults)
530 seq_printf(m, ",mountproto=auto");
531 }
532} 567}
533 568
534/* 569/*
@@ -578,7 +613,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
578 seq_puts(m, nfs_infop->nostr); 613 seq_puts(m, nfs_infop->nostr);
579 } 614 }
580 seq_printf(m, ",proto=%s", 615 seq_printf(m, ",proto=%s",
581 rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO)); 616 rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID));
582 if (version == 4) { 617 if (version == 4) {
583 if (nfss->port != NFS_PORT) 618 if (nfss->port != NFS_PORT)
584 seq_printf(m, ",port=%u", nfss->port); 619 seq_printf(m, ",port=%u", nfss->port);
@@ -714,8 +749,6 @@ static void nfs_umount_begin(struct super_block *sb)
714 struct nfs_server *server; 749 struct nfs_server *server;
715 struct rpc_clnt *rpc; 750 struct rpc_clnt *rpc;
716 751
717 lock_kernel();
718
719 server = NFS_SB(sb); 752 server = NFS_SB(sb);
720 /* -EIO all pending I/O */ 753 /* -EIO all pending I/O */
721 rpc = server->client_acl; 754 rpc = server->client_acl;
@@ -724,8 +757,6 @@ static void nfs_umount_begin(struct super_block *sb)
724 rpc = server->client; 757 rpc = server->client;
725 if (!IS_ERR(rpc)) 758 if (!IS_ERR(rpc))
726 rpc_killall_tasks(rpc); 759 rpc_killall_tasks(rpc);
727
728 unlock_kernel();
729} 760}
730 761
731static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version) 762static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version)
@@ -734,8 +765,6 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve
734 765
735 data = kzalloc(sizeof(*data), GFP_KERNEL); 766 data = kzalloc(sizeof(*data), GFP_KERNEL);
736 if (data) { 767 if (data) {
737 data->rsize = NFS_MAX_FILE_IO_SIZE;
738 data->wsize = NFS_MAX_FILE_IO_SIZE;
739 data->acregmin = NFS_DEF_ACREGMIN; 768 data->acregmin = NFS_DEF_ACREGMIN;
740 data->acregmax = NFS_DEF_ACREGMAX; 769 data->acregmax = NFS_DEF_ACREGMAX;
741 data->acdirmin = NFS_DEF_ACDIRMIN; 770 data->acdirmin = NFS_DEF_ACDIRMIN;
@@ -887,6 +916,8 @@ static int nfs_parse_mount_options(char *raw,
887{ 916{
888 char *p, *string, *secdata; 917 char *p, *string, *secdata;
889 int rc, sloppy = 0, invalid_option = 0; 918 int rc, sloppy = 0, invalid_option = 0;
919 unsigned short protofamily = AF_UNSPEC;
920 unsigned short mountfamily = AF_UNSPEC;
890 921
891 if (!raw) { 922 if (!raw) {
892 dfprintk(MOUNT, "NFS: mount options string was NULL.\n"); 923 dfprintk(MOUNT, "NFS: mount options string was NULL.\n");
@@ -1232,12 +1263,17 @@ static int nfs_parse_mount_options(char *raw,
1232 token = match_token(string, 1263 token = match_token(string,
1233 nfs_xprt_protocol_tokens, args); 1264 nfs_xprt_protocol_tokens, args);
1234 1265
1266 protofamily = AF_INET;
1235 switch (token) { 1267 switch (token) {
1268 case Opt_xprt_udp6:
1269 protofamily = AF_INET6;
1236 case Opt_xprt_udp: 1270 case Opt_xprt_udp:
1237 mnt->flags &= ~NFS_MOUNT_TCP; 1271 mnt->flags &= ~NFS_MOUNT_TCP;
1238 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1272 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
1239 kfree(string); 1273 kfree(string);
1240 break; 1274 break;
1275 case Opt_xprt_tcp6:
1276 protofamily = AF_INET6;
1241 case Opt_xprt_tcp: 1277 case Opt_xprt_tcp:
1242 mnt->flags |= NFS_MOUNT_TCP; 1278 mnt->flags |= NFS_MOUNT_TCP;
1243 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1279 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
@@ -1265,10 +1301,15 @@ static int nfs_parse_mount_options(char *raw,
1265 nfs_xprt_protocol_tokens, args); 1301 nfs_xprt_protocol_tokens, args);
1266 kfree(string); 1302 kfree(string);
1267 1303
1304 mountfamily = AF_INET;
1268 switch (token) { 1305 switch (token) {
1306 case Opt_xprt_udp6:
1307 mountfamily = AF_INET6;
1269 case Opt_xprt_udp: 1308 case Opt_xprt_udp:
1270 mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; 1309 mnt->mount_server.protocol = XPRT_TRANSPORT_UDP;
1271 break; 1310 break;
1311 case Opt_xprt_tcp6:
1312 mountfamily = AF_INET6;
1272 case Opt_xprt_tcp: 1313 case Opt_xprt_tcp:
1273 mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; 1314 mnt->mount_server.protocol = XPRT_TRANSPORT_TCP;
1274 break; 1315 break;
@@ -1367,8 +1408,33 @@ static int nfs_parse_mount_options(char *raw,
1367 if (!sloppy && invalid_option) 1408 if (!sloppy && invalid_option)
1368 return 0; 1409 return 0;
1369 1410
1411 /*
1412 * verify that any proto=/mountproto= options match the address
1413 * families in the addr=/mountaddr= options.
1414 */
1415 if (protofamily != AF_UNSPEC &&
1416 protofamily != mnt->nfs_server.address.ss_family)
1417 goto out_proto_mismatch;
1418
1419 if (mountfamily != AF_UNSPEC) {
1420 if (mnt->mount_server.addrlen) {
1421 if (mountfamily != mnt->mount_server.address.ss_family)
1422 goto out_mountproto_mismatch;
1423 } else {
1424 if (mountfamily != mnt->nfs_server.address.ss_family)
1425 goto out_mountproto_mismatch;
1426 }
1427 }
1428
1370 return 1; 1429 return 1;
1371 1430
1431out_mountproto_mismatch:
1432 printk(KERN_INFO "NFS: mount server address does not match mountproto= "
1433 "option\n");
1434 return 0;
1435out_proto_mismatch:
1436 printk(KERN_INFO "NFS: server address does not match proto= option\n");
1437 return 0;
1372out_invalid_address: 1438out_invalid_address:
1373 printk(KERN_INFO "NFS: bad IP address specified: %s\n", p); 1439 printk(KERN_INFO "NFS: bad IP address specified: %s\n", p);
1374 return 0; 1440 return 0;
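[Editor's note: the new validation above enforces that proto= agrees with the addr= family, and mountproto= with mountaddr= (falling back to addr= when no separate mount address was parsed). A sketch of the rule; using AF_UNSPEC for "not specified" approximates the kernel's mount_server.addrlen check.]

#include <stdio.h>
#include <sys/socket.h>

static int families_consistent(int protofamily, int addrfamily,
			       int mountfamily, int mountaddrfamily)
{
	if (protofamily != AF_UNSPEC && protofamily != addrfamily)
		return 0;			/* proto= vs addr= mismatch */
	if (mountfamily != AF_UNSPEC) {
		int want = (mountaddrfamily != AF_UNSPEC) ? mountaddrfamily
							  : addrfamily;
		if (mountfamily != want)
			return 0;		/* mountproto= mismatch */
	}
	return 1;
}

int main(void)
{
	/* proto=tcp6 with an IPv4 addr= must be rejected. */
	printf("%d\n", families_consistent(AF_INET6, AF_INET,
					   AF_UNSPEC, AF_UNSPEC));
	return 0;
}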
@@ -1881,7 +1947,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
1881 if (data == NULL) 1947 if (data == NULL)
1882 return -ENOMEM; 1948 return -ENOMEM;
1883 1949
1884 lock_kernel();
1885 /* fill out struct with values from existing mount */ 1950 /* fill out struct with values from existing mount */
1886 data->flags = nfss->flags; 1951 data->flags = nfss->flags;
1887 data->rsize = nfss->rsize; 1952 data->rsize = nfss->rsize;
@@ -1907,7 +1972,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
1907 error = nfs_compare_remount_data(nfss, data); 1972 error = nfs_compare_remount_data(nfss, data);
1908out: 1973out:
1909 kfree(data); 1974 kfree(data);
1910 unlock_kernel();
1911 return error; 1975 return error;
1912} 1976}
1913 1977
@@ -2151,7 +2215,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2151 } else { 2215 } else {
2152 error = nfs_bdi_register(server); 2216 error = nfs_bdi_register(server);
2153 if (error) 2217 if (error)
2154 goto error_splat_super; 2218 goto error_splat_bdi;
2155 } 2219 }
2156 2220
2157 if (!s->s_root) { 2221 if (!s->s_root) {
@@ -2193,11 +2257,25 @@ out_err_nosb:
2193error_splat_root: 2257error_splat_root:
2194 dput(mntroot); 2258 dput(mntroot);
2195error_splat_super: 2259error_splat_super:
2260 if (server && !s->s_root)
2261 bdi_unregister(&server->backing_dev_info);
2262error_splat_bdi:
2196 deactivate_locked_super(s); 2263 deactivate_locked_super(s);
2197 goto out; 2264 goto out;
2198} 2265}
2199 2266
2200/* 2267/*
2268 * Ensure that we unregister the bdi before kill_anon_super
2269 * releases the device name
2270 */
2271static void nfs_put_super(struct super_block *s)
2272{
2273 struct nfs_server *server = NFS_SB(s);
2274
2275 bdi_unregister(&server->backing_dev_info);
2276}
2277
2278/*
2201 * Destroy an NFS2/3 superblock 2279 * Destroy an NFS2/3 superblock
2202 */ 2280 */
2203static void nfs_kill_super(struct super_block *s) 2281static void nfs_kill_super(struct super_block *s)
@@ -2205,7 +2283,6 @@ static void nfs_kill_super(struct super_block *s)
2205 struct nfs_server *server = NFS_SB(s); 2283 struct nfs_server *server = NFS_SB(s);
2206 2284
2207 kill_anon_super(s); 2285 kill_anon_super(s);
2208 bdi_unregister(&server->backing_dev_info);
2209 nfs_fscache_release_super_cookie(s); 2286 nfs_fscache_release_super_cookie(s);
2210 nfs_free_server(server); 2287 nfs_free_server(server);
2211} 2288}
@@ -2253,7 +2330,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
2253 } else { 2330 } else {
2254 error = nfs_bdi_register(server); 2331 error = nfs_bdi_register(server);
2255 if (error) 2332 if (error)
2256 goto error_splat_super; 2333 goto error_splat_bdi;
2257 } 2334 }
2258 2335
2259 if (!s->s_root) { 2336 if (!s->s_root) {
@@ -2290,6 +2367,9 @@ out_err_noserver:
2290 return error; 2367 return error;
2291 2368
2292error_splat_super: 2369error_splat_super:
2370 if (server && !s->s_root)
2371 bdi_unregister(&server->backing_dev_info);
2372error_splat_bdi:
2293 deactivate_locked_super(s); 2373 deactivate_locked_super(s);
2294 dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error); 2374 dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
2295 return error; 2375 return error;
@@ -2505,7 +2585,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
2505 } else { 2585 } else {
2506 error = nfs_bdi_register(server); 2586 error = nfs_bdi_register(server);
2507 if (error) 2587 if (error)
2508 goto error_splat_super; 2588 goto error_splat_bdi;
2509 } 2589 }
2510 2590
2511 if (!s->s_root) { 2591 if (!s->s_root) {
@@ -2543,6 +2623,9 @@ out_free:
2543error_splat_root: 2623error_splat_root:
2544 dput(mntroot); 2624 dput(mntroot);
2545error_splat_super: 2625error_splat_super:
2626 if (server && !s->s_root)
2627 bdi_unregister(&server->backing_dev_info);
2628error_splat_bdi:
2546 deactivate_locked_super(s); 2629 deactivate_locked_super(s);
2547 goto out; 2630 goto out;
2548} 2631}
@@ -2738,7 +2821,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
2738 } else { 2821 } else {
2739 error = nfs_bdi_register(server); 2822 error = nfs_bdi_register(server);
2740 if (error) 2823 if (error)
2741 goto error_splat_super; 2824 goto error_splat_bdi;
2742 } 2825 }
2743 2826
2744 if (!s->s_root) { 2827 if (!s->s_root) {
@@ -2774,6 +2857,9 @@ out_err_noserver:
2774 return error; 2857 return error;
2775 2858
2776error_splat_super: 2859error_splat_super:
2860 if (server && !s->s_root)
2861 bdi_unregister(&server->backing_dev_info);
2862error_splat_bdi:
2777 deactivate_locked_super(s); 2863 deactivate_locked_super(s);
2778 dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error); 2864 dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
2779 return error; 2865 return error;
@@ -2820,7 +2906,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
2820 } else { 2906 } else {
2821 error = nfs_bdi_register(server); 2907 error = nfs_bdi_register(server);
2822 if (error) 2908 if (error)
2823 goto error_splat_super; 2909 goto error_splat_bdi;
2824 } 2910 }
2825 2911
2826 if (!s->s_root) { 2912 if (!s->s_root) {
@@ -2856,6 +2942,9 @@ out_err_noserver:
2856 return error; 2942 return error;
2857 2943
2858error_splat_super: 2944error_splat_super:
2945 if (server && !s->s_root)
2946 bdi_unregister(&server->backing_dev_info);
2947error_splat_bdi:
2859 deactivate_locked_super(s); 2948 deactivate_locked_super(s);
2860 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); 2949 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
2861 return error; 2950 return error;
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 412738dbfbc7..05c9e02f4153 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -19,7 +19,6 @@
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/stat.h> 20#include <linux/stat.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/slab.h>
23#include <linux/string.h> 22#include <linux/string.h>
24#include <linux/namei.h> 23#include <linux/namei.h>
25 24
@@ -50,7 +49,7 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
50 struct page *page; 49 struct page *page;
51 void *err; 50 void *err;
52 51
53 err = ERR_PTR(nfs_revalidate_mapping_nolock(inode, inode->i_mapping)); 52 err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
54 if (err) 53 if (err)
55 goto read_failed; 54 goto read_failed;
56 page = read_cache_page(&inode->i_data, 0, 55 page = read_cache_page(&inode->i_data, 0,
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index b62481dabae9..ad4d2e787b20 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -15,70 +15,64 @@
15 15
16#include "callback.h" 16#include "callback.h"
17 17
18#ifdef CONFIG_NFS_V4
18static const int nfs_set_port_min = 0; 19static const int nfs_set_port_min = 0;
19static const int nfs_set_port_max = 65535; 20static const int nfs_set_port_max = 65535;
21#endif
20static struct ctl_table_header *nfs_callback_sysctl_table; 22static struct ctl_table_header *nfs_callback_sysctl_table;
21 23
22static ctl_table nfs_cb_sysctls[] = { 24static ctl_table nfs_cb_sysctls[] = {
23#ifdef CONFIG_NFS_V4 25#ifdef CONFIG_NFS_V4
24 { 26 {
25 .ctl_name = CTL_UNNUMBERED,
26 .procname = "nfs_callback_tcpport", 27 .procname = "nfs_callback_tcpport",
27 .data = &nfs_callback_set_tcpport, 28 .data = &nfs_callback_set_tcpport,
28 .maxlen = sizeof(int), 29 .maxlen = sizeof(int),
29 .mode = 0644, 30 .mode = 0644,
30 .proc_handler = &proc_dointvec_minmax, 31 .proc_handler = proc_dointvec_minmax,
31 .extra1 = (int *)&nfs_set_port_min, 32 .extra1 = (int *)&nfs_set_port_min,
32 .extra2 = (int *)&nfs_set_port_max, 33 .extra2 = (int *)&nfs_set_port_max,
33 }, 34 },
34 { 35 {
35 .ctl_name = CTL_UNNUMBERED,
36 .procname = "idmap_cache_timeout", 36 .procname = "idmap_cache_timeout",
37 .data = &nfs_idmap_cache_timeout, 37 .data = &nfs_idmap_cache_timeout,
38 .maxlen = sizeof(int), 38 .maxlen = sizeof(int),
39 .mode = 0644, 39 .mode = 0644,
40 .proc_handler = &proc_dointvec_jiffies, 40 .proc_handler = proc_dointvec_jiffies,
41 .strategy = &sysctl_jiffies,
42 }, 41 },
43#endif 42#endif
44 { 43 {
45 .ctl_name = CTL_UNNUMBERED,
46 .procname = "nfs_mountpoint_timeout", 44 .procname = "nfs_mountpoint_timeout",
47 .data = &nfs_mountpoint_expiry_timeout, 45 .data = &nfs_mountpoint_expiry_timeout,
48 .maxlen = sizeof(nfs_mountpoint_expiry_timeout), 46 .maxlen = sizeof(nfs_mountpoint_expiry_timeout),
49 .mode = 0644, 47 .mode = 0644,
50 .proc_handler = &proc_dointvec_jiffies, 48 .proc_handler = proc_dointvec_jiffies,
51 .strategy = &sysctl_jiffies,
52 }, 49 },
53 { 50 {
54 .ctl_name = CTL_UNNUMBERED,
55 .procname = "nfs_congestion_kb", 51 .procname = "nfs_congestion_kb",
56 .data = &nfs_congestion_kb, 52 .data = &nfs_congestion_kb,
57 .maxlen = sizeof(nfs_congestion_kb), 53 .maxlen = sizeof(nfs_congestion_kb),
58 .mode = 0644, 54 .mode = 0644,
59 .proc_handler = &proc_dointvec, 55 .proc_handler = proc_dointvec,
60 }, 56 },
61 { .ctl_name = 0 } 57 { }
62}; 58};
63 59
64static ctl_table nfs_cb_sysctl_dir[] = { 60static ctl_table nfs_cb_sysctl_dir[] = {
65 { 61 {
66 .ctl_name = CTL_UNNUMBERED,
67 .procname = "nfs", 62 .procname = "nfs",
68 .mode = 0555, 63 .mode = 0555,
69 .child = nfs_cb_sysctls, 64 .child = nfs_cb_sysctls,
70 }, 65 },
71 { .ctl_name = 0 } 66 { }
72}; 67};
73 68
74static ctl_table nfs_cb_sysctl_root[] = { 69static ctl_table nfs_cb_sysctl_root[] = {
75 { 70 {
76 .ctl_name = CTL_FS,
77 .procname = "fs", 71 .procname = "fs",
78 .mode = 0555, 72 .mode = 0555,
79 .child = nfs_cb_sysctl_dir, 73 .child = nfs_cb_sysctl_dir,
80 }, 74 },
81 { .ctl_name = 0 } 75 { }
82}; 76};
83 77
84int nfs_register_sysctl(void) 78int nfs_register_sysctl(void)
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 1064c91ae810..6da3d3ff6edd 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -83,7 +83,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
83 struct inode *dir = data->dir; 83 struct inode *dir = data->dir;
84 84
85 if (!NFS_PROTO(dir)->unlink_done(task, dir)) 85 if (!NFS_PROTO(dir)->unlink_done(task, dir))
86 nfs4_restart_rpc(task, NFS_SERVER(dir)->nfs_client); 86 nfs_restart_rpc(task, NFS_SERVER(dir)->nfs_client);
87} 87}
88 88
89/** 89/**
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 53eb26c16b50..53ff70e23993 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -178,7 +178,7 @@ static int wb_priority(struct writeback_control *wbc)
178{ 178{
179 if (wbc->for_reclaim) 179 if (wbc->for_reclaim)
180 return FLUSH_HIGHPRI | FLUSH_STABLE; 180 return FLUSH_HIGHPRI | FLUSH_STABLE;
181 if (wbc->for_kupdate) 181 if (wbc->for_kupdate || wbc->for_background)
182 return FLUSH_LOWPRI; 182 return FLUSH_LOWPRI;
183 return 0; 183 return 0;
184} 184}
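[Editor's note: the one-line change above makes background writeback use the same low-priority flush as kupdate-style writeback. Sketch of the resulting priority mapping; the FLUSH_* bit values here are illustrative, not the kernel's.]

#include <stdio.h>

#define FLUSH_STABLE	(1 << 0)	/* illustrative bit assignments */
#define FLUSH_LOWPRI	(1 << 1)
#define FLUSH_HIGHPRI	(1 << 2)

struct writeback_control { int for_reclaim, for_kupdate, for_background; };

static int wb_priority(const struct writeback_control *wbc)
{
	if (wbc->for_reclaim)
		return FLUSH_HIGHPRI | FLUSH_STABLE;
	if (wbc->for_kupdate || wbc->for_background)
		return FLUSH_LOWPRI;
	return 0;
}

int main(void)
{
	struct writeback_control background = { 0, 0, 1 };

	printf("background flush priority: %d\n", wb_priority(&background));
	return 0;
}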
@@ -438,6 +438,7 @@ nfs_mark_request_commit(struct nfs_page *req)
438 radix_tree_tag_set(&nfsi->nfs_page_tree, 438 radix_tree_tag_set(&nfsi->nfs_page_tree,
439 req->wb_index, 439 req->wb_index,
440 NFS_PAGE_TAG_COMMIT); 440 NFS_PAGE_TAG_COMMIT);
441 nfsi->ncommit++;
441 spin_unlock(&inode->i_lock); 442 spin_unlock(&inode->i_lock);
442 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 443 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
443 inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); 444 inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
@@ -501,57 +502,6 @@ int nfs_reschedule_unstable_write(struct nfs_page *req)
501} 502}
502#endif 503#endif
503 504
504/*
505 * Wait for a request to complete.
506 *
507 * Interruptible by fatal signals only.
508 */
509static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, unsigned int npages)
510{
511 struct nfs_inode *nfsi = NFS_I(inode);
512 struct nfs_page *req;
513 pgoff_t idx_end, next;
514 unsigned int res = 0;
515 int error;
516
517 if (npages == 0)
518 idx_end = ~0;
519 else
520 idx_end = idx_start + npages - 1;
521
522 next = idx_start;
523 while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_LOCKED)) {
524 if (req->wb_index > idx_end)
525 break;
526
527 next = req->wb_index + 1;
528 BUG_ON(!NFS_WBACK_BUSY(req));
529
530 kref_get(&req->wb_kref);
531 spin_unlock(&inode->i_lock);
532 error = nfs_wait_on_request(req);
533 nfs_release_request(req);
534 spin_lock(&inode->i_lock);
535 if (error < 0)
536 return error;
537 res++;
538 }
539 return res;
540}
541
542static void nfs_cancel_commit_list(struct list_head *head)
543{
544 struct nfs_page *req;
545
546 while(!list_empty(head)) {
547 req = nfs_list_entry(head->next);
548 nfs_list_remove_request(req);
549 nfs_clear_request_commit(req);
550 nfs_inode_remove_request(req);
551 nfs_unlock_request(req);
552 }
553}
554
555#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 505#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
556static int 506static int
557nfs_need_commit(struct nfs_inode *nfsi) 507nfs_need_commit(struct nfs_inode *nfsi)
@@ -573,11 +523,17 @@ static int
573nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) 523nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
574{ 524{
575 struct nfs_inode *nfsi = NFS_I(inode); 525 struct nfs_inode *nfsi = NFS_I(inode);
526 int ret;
576 527
577 if (!nfs_need_commit(nfsi)) 528 if (!nfs_need_commit(nfsi))
578 return 0; 529 return 0;
579 530
580 return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); 531 ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
532 if (ret > 0)
533 nfsi->ncommit -= ret;
534 if (nfs_need_commit(NFS_I(inode)))
535 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
536 return ret;
581} 537}
582#else 538#else
583static inline int nfs_need_commit(struct nfs_inode *nfsi) 539static inline int nfs_need_commit(struct nfs_inode *nfsi)
@@ -642,9 +598,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
642 spin_lock(&inode->i_lock); 598 spin_lock(&inode->i_lock);
643 } 599 }
644 600
645 if (nfs_clear_request_commit(req)) 601 if (nfs_clear_request_commit(req) &&
646 radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, 602 radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
647 req->wb_index, NFS_PAGE_TAG_COMMIT); 603 req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL)
604 NFS_I(inode)->ncommit--;
648 605
649 /* Okay, the request matches. Update the region */ 606 /* Okay, the request matches. Update the region */
650 if (offset < req->wb_offset) { 607 if (offset < req->wb_offset) {
@@ -774,7 +731,7 @@ int nfs_updatepage(struct file *file, struct page *page,
774 */ 731 */
775 if (nfs_write_pageuptodate(page, inode) && 732 if (nfs_write_pageuptodate(page, inode) &&
776 inode->i_flock == NULL && 733 inode->i_flock == NULL &&
777 !(file->f_flags & O_SYNC)) { 734 !(file->f_flags & O_DSYNC)) {
778 count = max(count + offset, nfs_page_length(page)); 735 count = max(count + offset, nfs_page_length(page));
779 offset = 0; 736 offset = 0;
780 } 737 }
@@ -1216,7 +1173,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1216 */ 1173 */
1217 argp->stable = NFS_FILE_SYNC; 1174 argp->stable = NFS_FILE_SYNC;
1218 } 1175 }
1219 nfs4_restart_rpc(task, server->nfs_client); 1176 nfs_restart_rpc(task, server->nfs_client);
1220 return -EAGAIN; 1177 return -EAGAIN;
1221 } 1178 }
1222 if (time_before(complain, jiffies)) { 1179 if (time_before(complain, jiffies)) {
@@ -1228,13 +1185,12 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1228 /* Can't do anything about it except throw an error. */ 1185 /* Can't do anything about it except throw an error. */
1229 task->tk_status = -EIO; 1186 task->tk_status = -EIO;
1230 } 1187 }
1231 nfs4_sequence_free_slot(server->nfs_client, &data->res.seq_res);
1232 return 0; 1188 return 0;
1233} 1189}
1234 1190
1235 1191
1236#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1192#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
1237void nfs_commitdata_release(void *data) 1193static void nfs_commitdata_release(void *data)
1238{ 1194{
1239 struct nfs_write_data *wdata = data; 1195 struct nfs_write_data *wdata = data;
1240 1196
@@ -1392,7 +1348,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
1392 .rpc_release = nfs_commit_release, 1348 .rpc_release = nfs_commit_release,
1393}; 1349};
1394 1350
1395int nfs_commit_inode(struct inode *inode, int how) 1351static int nfs_commit_inode(struct inode *inode, int how)
1396{ 1352{
1397 LIST_HEAD(head); 1353 LIST_HEAD(head);
1398 int res; 1354 int res;
@@ -1407,92 +1363,51 @@ int nfs_commit_inode(struct inode *inode, int how)
1407 } 1363 }
1408 return res; 1364 return res;
1409} 1365}
1410#else
1411static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1412{
1413 return 0;
1414}
1415#endif
1416 1366
1417long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) 1367static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1418{ 1368{
1419 struct inode *inode = mapping->host; 1369 struct nfs_inode *nfsi = NFS_I(inode);
1420 pgoff_t idx_start, idx_end; 1370 int flags = FLUSH_SYNC;
1421 unsigned int npages = 0; 1371 int ret = 0;
1422 LIST_HEAD(head); 1372
1423 int nocommit = how & FLUSH_NOCOMMIT; 1373 /* Don't commit yet if this is a non-blocking flush and there are
1424 long pages, ret; 1374 * lots of outstanding writes for this mapping.
1425 1375 */
1426 /* FIXME */ 1376 if (wbc->sync_mode == WB_SYNC_NONE &&
1427 if (wbc->range_cyclic) 1377 nfsi->ncommit <= (nfsi->npages >> 1))
1428 idx_start = 0; 1378 goto out_mark_dirty;
1429 else { 1379
1430 idx_start = wbc->range_start >> PAGE_CACHE_SHIFT; 1380 if (wbc->nonblocking || wbc->for_background)
1431 idx_end = wbc->range_end >> PAGE_CACHE_SHIFT; 1381 flags = 0;
1432 if (idx_end > idx_start) { 1382 ret = nfs_commit_inode(inode, flags);
1433 pgoff_t l_npages = 1 + idx_end - idx_start; 1383 if (ret >= 0) {
1434 npages = l_npages; 1384 if (wbc->sync_mode == WB_SYNC_NONE) {
1435 if (sizeof(npages) != sizeof(l_npages) && 1385 if (ret < wbc->nr_to_write)
1436 (pgoff_t)npages != l_npages) 1386 wbc->nr_to_write -= ret;
1437 npages = 0; 1387 else
1388 wbc->nr_to_write = 0;
1438 } 1389 }
1390 return 0;
1439 } 1391 }
1440 how &= ~FLUSH_NOCOMMIT; 1392out_mark_dirty:
1441 spin_lock(&inode->i_lock); 1393 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1442 do {
1443 ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
1444 if (ret != 0)
1445 continue;
1446 if (nocommit)
1447 break;
1448 pages = nfs_scan_commit(inode, &head, idx_start, npages);
1449 if (pages == 0)
1450 break;
1451 if (how & FLUSH_INVALIDATE) {
1452 spin_unlock(&inode->i_lock);
1453 nfs_cancel_commit_list(&head);
1454 ret = pages;
1455 spin_lock(&inode->i_lock);
1456 continue;
1457 }
1458 pages += nfs_scan_commit(inode, &head, 0, 0);
1459 spin_unlock(&inode->i_lock);
1460 ret = nfs_commit_list(inode, &head, how);
1461 spin_lock(&inode->i_lock);
1462
1463 } while (ret >= 0);
1464 spin_unlock(&inode->i_lock);
1465 return ret; 1394 return ret;
1466} 1395}
1467 1396#else
1468static int __nfs_write_mapping(struct address_space *mapping, struct writeback_control *wbc, int how) 1397static int nfs_commit_inode(struct inode *inode, int how)
1469{ 1398{
1470 int ret;
1471
1472 ret = nfs_writepages(mapping, wbc);
1473 if (ret < 0)
1474 goto out;
1475 ret = nfs_sync_mapping_wait(mapping, wbc, how);
1476 if (ret < 0)
1477 goto out;
1478 return 0; 1399 return 0;
1479out:
1480 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1481 return ret;
1482} 1400}
1483 1401
1484/* Two pass sync: first using WB_SYNC_NONE, then WB_SYNC_ALL */ 1402static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1485static int nfs_write_mapping(struct address_space *mapping, int how)
1486{ 1403{
1487 struct writeback_control wbc = { 1404 return 0;
1488 .bdi = mapping->backing_dev_info, 1405}
1489 .sync_mode = WB_SYNC_ALL, 1406#endif
1490 .nr_to_write = LONG_MAX,
1491 .range_start = 0,
1492 .range_end = LLONG_MAX,
1493 };
1494 1407
1495 return __nfs_write_mapping(mapping, &wbc, how); 1408int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1409{
1410 return nfs_commit_unstable_pages(inode, wbc);
1496} 1411}
1497 1412
1498/* 1413/*
@@ -1500,37 +1415,26 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
1500 */ 1415 */
1501int nfs_wb_all(struct inode *inode) 1416int nfs_wb_all(struct inode *inode)
1502{ 1417{
1503 return nfs_write_mapping(inode->i_mapping, 0); 1418 struct writeback_control wbc = {
1504} 1419 .sync_mode = WB_SYNC_ALL,
1420 .nr_to_write = LONG_MAX,
1421 .range_start = 0,
1422 .range_end = LLONG_MAX,
1423 };
1505 1424
1506int nfs_wb_nocommit(struct inode *inode) 1425 return sync_inode(inode, &wbc);
1507{
1508 return nfs_write_mapping(inode->i_mapping, FLUSH_NOCOMMIT);
1509} 1426}
1510 1427
1511int nfs_wb_page_cancel(struct inode *inode, struct page *page) 1428int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1512{ 1429{
1513 struct nfs_page *req; 1430 struct nfs_page *req;
1514 loff_t range_start = page_offset(page);
1515 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
1516 struct writeback_control wbc = {
1517 .bdi = page->mapping->backing_dev_info,
1518 .sync_mode = WB_SYNC_ALL,
1519 .nr_to_write = LONG_MAX,
1520 .range_start = range_start,
1521 .range_end = range_end,
1522 };
1523 int ret = 0; 1431 int ret = 0;
1524 1432
1525 BUG_ON(!PageLocked(page)); 1433 BUG_ON(!PageLocked(page));
1526 for (;;) { 1434 for (;;) {
1527 req = nfs_page_find_request(page); 1435 req = nfs_page_find_request(page);
1528 if (req == NULL) 1436 if (req == NULL)
1529 goto out;
1530 if (test_bit(PG_CLEAN, &req->wb_flags)) {
1531 nfs_release_request(req);
1532 break; 1437 break;
1533 }
1534 if (nfs_lock_request_dontget(req)) { 1438 if (nfs_lock_request_dontget(req)) {
1535 nfs_inode_remove_request(req); 1439 nfs_inode_remove_request(req);
1536 /* 1440 /*
@@ -1542,55 +1446,56 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1542 break; 1446 break;
1543 } 1447 }
1544 ret = nfs_wait_on_request(req); 1448 ret = nfs_wait_on_request(req);
1449 nfs_release_request(req);
1545 if (ret < 0) 1450 if (ret < 0)
1546 goto out; 1451 break;
1547 } 1452 }
1548 if (!PagePrivate(page))
1549 return 0;
1550 ret = nfs_sync_mapping_wait(page->mapping, &wbc, FLUSH_INVALIDATE);
1551out:
1552 return ret; 1453 return ret;
1553} 1454}
1554 1455
1555static int nfs_wb_page_priority(struct inode *inode, struct page *page, 1456/*
1556 int how) 1457 * Write back all requests on one page - we do this before reading it.
1458 */
1459int nfs_wb_page(struct inode *inode, struct page *page)
1557{ 1460{
1558 loff_t range_start = page_offset(page); 1461 loff_t range_start = page_offset(page);
1559 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); 1462 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
1560 struct writeback_control wbc = { 1463 struct writeback_control wbc = {
1561 .bdi = page->mapping->backing_dev_info,
1562 .sync_mode = WB_SYNC_ALL, 1464 .sync_mode = WB_SYNC_ALL,
1563 .nr_to_write = LONG_MAX, 1465 .nr_to_write = 0,
1564 .range_start = range_start, 1466 .range_start = range_start,
1565 .range_end = range_end, 1467 .range_end = range_end,
1566 }; 1468 };
1469 struct nfs_page *req;
1470 int need_commit;
1567 int ret; 1471 int ret;
1568 1472
1569 do { 1473 while(PagePrivate(page)) {
1570 if (clear_page_dirty_for_io(page)) { 1474 if (clear_page_dirty_for_io(page)) {
1571 ret = nfs_writepage_locked(page, &wbc); 1475 ret = nfs_writepage_locked(page, &wbc);
1572 if (ret < 0) 1476 if (ret < 0)
1573 goto out_error; 1477 goto out_error;
1574 } else if (!PagePrivate(page)) 1478 }
1479 req = nfs_find_and_lock_request(page);
1480 if (!req)
1575 break; 1481 break;
1576 ret = nfs_sync_mapping_wait(page->mapping, &wbc, how); 1482 if (IS_ERR(req)) {
1577 if (ret < 0) 1483 ret = PTR_ERR(req);
1578 goto out_error; 1484 goto out_error;
1579 } while (PagePrivate(page)); 1485 }
1486 need_commit = test_bit(PG_CLEAN, &req->wb_flags);
1487 nfs_clear_page_tag_locked(req);
1488 if (need_commit) {
1489 ret = nfs_commit_inode(inode, FLUSH_SYNC);
1490 if (ret < 0)
1491 goto out_error;
1492 }
1493 }
1580 return 0; 1494 return 0;
1581out_error: 1495out_error:
1582 __mark_inode_dirty(inode, I_DIRTY_PAGES);
1583 return ret; 1496 return ret;
1584} 1497}
1585 1498
1586/*
1587 * Write back all requests on one page - we do this before reading it.
1588 */
1589int nfs_wb_page(struct inode *inode, struct page* page)
1590{
1591 return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
1592}
1593
1594#ifdef CONFIG_MIGRATION 1499#ifdef CONFIG_MIGRATION
1595int nfs_migrate_page(struct address_space *mapping, struct page *newpage, 1500int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1596 struct page *page) 1501 struct page *page)
@@ -1598,8 +1503,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1598 struct nfs_page *req; 1503 struct nfs_page *req;
1599 int ret; 1504 int ret;
1600 1505
1601 if (PageFsCache(page)) 1506 nfs_fscache_release_page(page, GFP_KERNEL);
1602 nfs_fscache_release_page(page, GFP_KERNEL);
1603 1507
1604 req = nfs_find_and_lock_request(page); 1508 req = nfs_find_and_lock_request(page);
1605 ret = PTR_ERR(req); 1509 ret = PTR_ERR(req);
@@ -1612,15 +1516,16 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1612 if (ret) 1516 if (ret)
1613 goto out_unlock; 1517 goto out_unlock;
1614 page_cache_get(newpage); 1518 page_cache_get(newpage);
1519 spin_lock(&mapping->host->i_lock);
1615 req->wb_page = newpage; 1520 req->wb_page = newpage;
1616 SetPagePrivate(newpage); 1521 SetPagePrivate(newpage);
1617 set_page_private(newpage, page_private(page)); 1522 set_page_private(newpage, (unsigned long)req);
1618 ClearPagePrivate(page); 1523 ClearPagePrivate(page);
1619 set_page_private(page, 0); 1524 set_page_private(page, 0);
1525 spin_unlock(&mapping->host->i_lock);
1620 page_cache_release(page); 1526 page_cache_release(page);
1621out_unlock: 1527out_unlock:
1622 nfs_clear_page_tag_locked(req); 1528 nfs_clear_page_tag_locked(req);
1623 nfs_release_request(req);
1624out: 1529out:
1625 return ret; 1530 return ret;
1626} 1531}
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 04133aacb1e5..fc1c52571c03 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/fs.h> 24#include <linux/fs.h>
25#include <linux/gfp.h>
25#include <linux/sunrpc/xdr.h> 26#include <linux/sunrpc/xdr.h>
26#include <linux/nfsacl.h> 27#include <linux/nfsacl.h>
27#include <linux/nfs3.h> 28#include <linux/nfs3.h>
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index 8f9a20556f79..bf9cbd242ddd 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -7,8 +7,6 @@
7#include <linux/types.h> 7#include <linux/types.h>
8#include <linux/file.h> 8#include <linux/file.h>
9#include <linux/fs.h> 9#include <linux/fs.h>
10#include <linux/sunrpc/svc.h>
11#include <linux/nfsd/nfsd.h>
12#include <linux/nfsd/syscall.h> 10#include <linux/nfsd/syscall.h>
13#include <linux/cred.h> 11#include <linux/cred.h>
14#include <linux/sched.h> 12#include <linux/sched.h>
@@ -38,10 +36,9 @@ static struct file *do_open(char *name, int flags)
38 return ERR_PTR(error); 36 return ERR_PTR(error);
39 37
40 if (flags == O_RDWR) 38 if (flags == O_RDWR)
41 error = may_open(&nd.path, MAY_READ|MAY_WRITE, 39 error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags);
42 FMODE_READ|FMODE_WRITE);
43 else 40 else
44 error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE); 41 error = may_open(&nd.path, MAY_WRITE, flags);
45 42
46 if (!error) 43 if (!error)
47 return dentry_open(nd.path.dentry, nd.path.mnt, flags, 44 return dentry_open(nd.path.dentry, nd.path.mnt, flags,
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 36fcabbf5186..79717a40daba 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -1,15 +1,7 @@
1/* 1/* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */
2 * linux/fs/nfsd/auth.c
3 *
4 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
5 */
6 2
7#include <linux/types.h>
8#include <linux/sched.h> 3#include <linux/sched.h>
9#include <linux/sunrpc/svc.h> 4#include "nfsd.h"
10#include <linux/sunrpc/svcauth.h>
11#include <linux/nfsd/nfsd.h>
12#include <linux/nfsd/export.h>
13#include "auth.h" 5#include "auth.h"
14 6
15int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp) 7int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
new file mode 100644
index 000000000000..d892be61016c
--- /dev/null
+++ b/fs/nfsd/cache.h
@@ -0,0 +1,83 @@
1/*
2 * Request reply cache. This was heavily inspired by the
3 * implementation in 4.3BSD/4.4BSD.
4 *
5 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
6 */
7
8#ifndef NFSCACHE_H
9#define NFSCACHE_H
10
11#include <linux/sunrpc/svc.h>
12
13/*
14 * Representation of a reply cache entry.
15 */
16struct svc_cacherep {
17 struct hlist_node c_hash;
18 struct list_head c_lru;
19
20 unsigned char c_state, /* unused, inprog, done */
21 c_type, /* status, buffer */
22 c_secure : 1; /* req came from port < 1024 */
23 struct sockaddr_in c_addr;
24 __be32 c_xid;
25 u32 c_prot;
26 u32 c_proc;
27 u32 c_vers;
28 unsigned long c_timestamp;
29 union {
30 struct kvec u_vec;
31 __be32 u_status;
32 } c_u;
33};
34
35#define c_replvec c_u.u_vec
36#define c_replstat c_u.u_status
37
38/* cache entry states */
39enum {
40 RC_UNUSED,
41 RC_INPROG,
42 RC_DONE
43};
44
45/* return values */
46enum {
47 RC_DROPIT,
48 RC_REPLY,
49 RC_DOIT,
50 RC_INTR
51};
52
53/*
54 * Cache types.
55 * We may want to add more types one day, e.g. for diropres and
56 * attrstat replies. Using cache entries with fixed length instead
57 * of buffer pointers may be more efficient.
58 */
59enum {
60 RC_NOCACHE,
61 RC_REPLSTAT,
62 RC_REPLBUFF,
63};
64
65/*
66 * If requests are retransmitted within this interval, they're dropped.
67 */
68#define RC_DELAY (HZ/5)
69
70int nfsd_reply_cache_init(void);
71void nfsd_reply_cache_shutdown(void);
72int nfsd_cache_lookup(struct svc_rqst *, int);
73void nfsd_cache_update(struct svc_rqst *, int, __be32 *);
74
75#ifdef CONFIG_NFSD_V4
76void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp);
77#else /* CONFIG_NFSD_V4 */
78static inline void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
79{
80}
81#endif /* CONFIG_NFSD_V4 */
82
83#endif /* NFSCACHE_H */
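The header above only declares the reply-cache entry points. A minimal sketch of how a dispatcher is typically expected to drive them follows; this is a hypothetical caller inferred from the declared signatures and return codes, not code from this diff.

/* cachetype is one of RC_NOCACHE, RC_REPLSTAT, RC_REPLBUFF */
static int handle_request(struct svc_rqst *rqstp, int cachetype,
			  __be32 *statp)
{
	switch (nfsd_cache_lookup(rqstp, cachetype)) {
	case RC_DOIT:		/* genuinely new request: execute it */
		break;
	case RC_REPLY:		/* duplicate: cached reply already sent */
		return 1;
	default:		/* RC_DROPIT / RC_INTR: drop silently */
		return 0;
	}

	/* ... run the NFS procedure, encoding its reply after *statp ... */

	/* record the reply so a retransmission hits the cache */
	nfsd_cache_update(rqstp, cachetype, statp + 1);
	return 1;
}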
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c1c9e035d4a4..872a5ef550c7 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1,7 +1,5 @@
1#define MSNFS /* HACK HACK */ 1#define MSNFS /* HACK HACK */
2/* 2/*
3 * linux/fs/nfsd/export.c
4 *
5 * NFS exporting and validation. 3 * NFS exporting and validation.
6 * 4 *
7 * We maintain a list of clients, each of which has a list of 5 * We maintain a list of clients, each of which has a list of
@@ -14,29 +12,17 @@
14 * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de> 12 * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de>
15 */ 13 */
16 14
17#include <linux/unistd.h>
18#include <linux/slab.h> 15#include <linux/slab.h>
19#include <linux/stat.h>
20#include <linux/in.h>
21#include <linux/seq_file.h>
22#include <linux/syscalls.h>
23#include <linux/rwsem.h>
24#include <linux/dcache.h>
25#include <linux/namei.h> 16#include <linux/namei.h>
26#include <linux/mount.h>
27#include <linux/hash.h>
28#include <linux/module.h> 17#include <linux/module.h>
29#include <linux/exportfs.h> 18#include <linux/exportfs.h>
30 19
31#include <linux/sunrpc/svc.h>
32#include <linux/nfsd/nfsd.h>
33#include <linux/nfsd/nfsfh.h>
34#include <linux/nfsd/syscall.h> 20#include <linux/nfsd/syscall.h>
35#include <linux/lockd/bind.h>
36#include <linux/sunrpc/msg_prot.h>
37#include <linux/sunrpc/gss_api.h>
38#include <net/ipv6.h> 21#include <net/ipv6.h>
39 22
23#include "nfsd.h"
24#include "nfsfh.h"
25
40#define NFSDDBG_FACILITY NFSDDBG_EXPORT 26#define NFSDDBG_FACILITY NFSDDBG_EXPORT
41 27
42typedef struct auth_domain svc_client; 28typedef struct auth_domain svc_client;
@@ -369,16 +355,25 @@ static struct svc_export *svc_export_update(struct svc_export *new,
369 struct svc_export *old); 355 struct svc_export *old);
370static struct svc_export *svc_export_lookup(struct svc_export *); 356static struct svc_export *svc_export_lookup(struct svc_export *);
371 357
372static int check_export(struct inode *inode, int flags, unsigned char *uuid) 358static int check_export(struct inode *inode, int *flags, unsigned char *uuid)
373{ 359{
374 360
375 /* We currently export only dirs and regular files. 361 /*
376 * This is what umountd does. 362 * We currently export only dirs, regular files, and (for v4
363 * pseudoroot) symlinks.
377 */ 364 */
378 if (!S_ISDIR(inode->i_mode) && 365 if (!S_ISDIR(inode->i_mode) &&
366 !S_ISLNK(inode->i_mode) &&
379 !S_ISREG(inode->i_mode)) 367 !S_ISREG(inode->i_mode))
380 return -ENOTDIR; 368 return -ENOTDIR;
381 369
370 /*
371 * Mountd should never pass down a writeable V4ROOT export, but,
372 * just to make sure:
373 */
374 if (*flags & NFSEXP_V4ROOT)
375 *flags |= NFSEXP_READONLY;
376
382 /* There are two requirements on a filesystem to be exportable. 377 /* There are two requirements on a filesystem to be exportable.
383 * 1: We must be able to identify the filesystem from a number. 378 * 1: We must be able to identify the filesystem from a number.
384 * either a device number (so FS_REQUIRES_DEV needed) 379 * either a device number (so FS_REQUIRES_DEV needed)
@@ -387,7 +382,7 @@ static int check_export(struct inode *inode, int flags, unsigned char *uuid)
387 * This means that s_export_op must be set. 382 * This means that s_export_op must be set.
388 */ 383 */
389 if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) && 384 if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) &&
390 !(flags & NFSEXP_FSID) && 385 !(*flags & NFSEXP_FSID) &&
391 uuid == NULL) { 386 uuid == NULL) {
392 dprintk("exp_export: export of non-dev fs without fsid\n"); 387 dprintk("exp_export: export of non-dev fs without fsid\n");
393 return -EINVAL; 388 return -EINVAL;
@@ -602,7 +597,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
602 goto out4; 597 goto out4;
603 } 598 }
604 599
605 err = check_export(exp.ex_path.dentry->d_inode, exp.ex_flags, 600 err = check_export(exp.ex_path.dentry->d_inode, &exp.ex_flags,
606 exp.ex_uuid); 601 exp.ex_uuid);
607 if (err) 602 if (err)
608 goto out4; 603 goto out4;
@@ -1041,7 +1036,7 @@ exp_export(struct nfsctl_export *nxp)
1041 goto finish; 1036 goto finish;
1042 } 1037 }
1043 1038
1044 err = check_export(path.dentry->d_inode, nxp->ex_flags, NULL); 1039 err = check_export(path.dentry->d_inode, &nxp->ex_flags, NULL);
1045 if (err) goto finish; 1040 if (err) goto finish;
1046 1041
1047 err = -ENOMEM; 1042 err = -ENOMEM;
@@ -1320,6 +1315,15 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct path *path)
1320 return exp; 1315 return exp;
1321} 1316}
1322 1317
1318static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp)
1319{
1320 u32 fsidv[2];
1321
1322 mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
1323
1324 return rqst_exp_find(rqstp, FSID_NUM, fsidv);
1325}
1326
1323/* 1327/*
1324 * Called when we need the filehandle for the root of the pseudofs, 1328 * Called when we need the filehandle for the root of the pseudofs,
1325 * for a given NFSv4 client. The root is defined to be the 1329 * for a given NFSv4 client. The root is defined to be the
@@ -1330,11 +1334,8 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
1330{ 1334{
1331 struct svc_export *exp; 1335 struct svc_export *exp;
1332 __be32 rv; 1336 __be32 rv;
1333 u32 fsidv[2];
1334
1335 mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
1336 1337
1337 exp = rqst_exp_find(rqstp, FSID_NUM, fsidv); 1338 exp = find_fsidzero_export(rqstp);
1338 if (IS_ERR(exp)) 1339 if (IS_ERR(exp))
1339 return nfserrno(PTR_ERR(exp)); 1340 return nfserrno(PTR_ERR(exp));
1340 rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); 1341 rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL);
@@ -1425,6 +1426,7 @@ static struct flags {
1425 { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, 1426 { NFSEXP_CROSSMOUNT, {"crossmnt", ""}},
1426 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, 1427 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
1427 { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, 1428 { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
1429 { NFSEXP_V4ROOT, {"v4root", ""}},
1428#ifdef MSNFS 1430#ifdef MSNFS
1429 { NFSEXP_MSNFS, {"msnfs", ""}}, 1431 { NFSEXP_MSNFS, {"msnfs", ""}},
1430#endif 1432#endif
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index b2786a5f9afe..0c6d81670137 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * linux/fs/nfsd/lockd.c
3 *
4 * This file contains all the stubs needed when communicating with lockd. 2 * This file contains all the stubs needed when communicating with lockd.
5 * This level of indirection is necessary so we can run nfsd+lockd without 3 * This level of indirection is necessary so we can run nfsd+lockd without
6 * requiring the nfs client to be compiled in/loaded, and vice versa. 4 * requiring the nfs client to be compiled in/loaded, and vice versa.
@@ -8,14 +6,10 @@
8 * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> 6 * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
9 */ 7 */
10 8
11#include <linux/types.h>
12#include <linux/fs.h>
13#include <linux/file.h> 9#include <linux/file.h>
14#include <linux/mount.h>
15#include <linux/sunrpc/clnt.h>
16#include <linux/sunrpc/svc.h>
17#include <linux/nfsd/nfsd.h>
18#include <linux/lockd/bind.h> 10#include <linux/lockd/bind.h>
11#include "nfsd.h"
12#include "vfs.h"
19 13
20#define NFSDDBG_FACILITY NFSDDBG_LOCKD 14#define NFSDDBG_FACILITY NFSDDBG_LOCKD
21 15
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 4e3219e84116..6aa5590c3679 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -1,19 +1,16 @@
1/* 1/*
2 * linux/fs/nfsd/nfs2acl.c
3 *
4 * Process version 2 NFSACL requests. 2 * Process version 2 NFSACL requests.
5 * 3 *
6 * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de> 4 * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de>
7 */ 5 */
8 6
9#include <linux/sunrpc/svc.h> 7#include "nfsd.h"
10#include <linux/nfs.h> 8/* FIXME: nfsacl.h is a broken header */
11#include <linux/nfsd/nfsd.h>
12#include <linux/nfsd/cache.h>
13#include <linux/nfsd/xdr.h>
14#include <linux/nfsd/xdr3.h>
15#include <linux/posix_acl.h>
16#include <linux/nfsacl.h> 9#include <linux/nfsacl.h>
10#include <linux/gfp.h>
11#include "cache.h"
12#include "xdr3.h"
13#include "vfs.h"
17 14
18#define NFSDDBG_FACILITY NFSDDBG_PROC 15#define NFSDDBG_FACILITY NFSDDBG_PROC
19#define RETURN_STATUS(st) { resp->status = (st); return (st); } 16#define RETURN_STATUS(st) { resp->status = (st); return (st); }
@@ -217,6 +214,16 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
217 * XDR encode functions 214 * XDR encode functions
218 */ 215 */
219 216
217/*
218 * There must be an encoding function for void results so svc_process
219 * will work properly.
220 */
221int
222nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
223{
224 return xdr_ressize_check(rqstp, p);
225}
226
220/* GETACL */ 227/* GETACL */
221static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, 228static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
222 struct nfsd3_getaclres *resp) 229 struct nfsd3_getaclres *resp)
@@ -308,7 +315,6 @@ static int nfsaclsvc_release_access(struct svc_rqst *rqstp, __be32 *p,
308} 315}
309 316
310#define nfsaclsvc_decode_voidargs NULL 317#define nfsaclsvc_decode_voidargs NULL
311#define nfsaclsvc_encode_voidres NULL
312#define nfsaclsvc_release_void NULL 318#define nfsaclsvc_release_void NULL
313#define nfsd3_fhandleargs nfsd_fhandle 319#define nfsd3_fhandleargs nfsd_fhandle
314#define nfsd3_attrstatres nfsd_attrstat 320#define nfsd3_attrstatres nfsd_attrstat
@@ -346,5 +352,5 @@ struct svc_version nfsd_acl_version2 = {
346 .vs_proc = nfsd_acl_procedures2, 352 .vs_proc = nfsd_acl_procedures2,
347 .vs_dispatch = nfsd_dispatch, 353 .vs_dispatch = nfsd_dispatch,
348 .vs_xdrsize = NFS3_SVC_XDRSIZE, 354 .vs_xdrsize = NFS3_SVC_XDRSIZE,
349 .vs_hidden = 1, 355 .vs_hidden = 0,
350}; 356};
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 9981dbb377a3..a596e9d987e4 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -1,18 +1,16 @@
1/* 1/*
2 * linux/fs/nfsd/nfs3acl.c
3 *
4 * Process version 3 NFSACL requests. 2 * Process version 3 NFSACL requests.
5 * 3 *
6 * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de> 4 * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de>
7 */ 5 */
8 6
9#include <linux/sunrpc/svc.h> 7#include "nfsd.h"
10#include <linux/nfs3.h> 8/* FIXME: nfsacl.h is a broken header */
11#include <linux/nfsd/nfsd.h>
12#include <linux/nfsd/cache.h>
13#include <linux/nfsd/xdr3.h>
14#include <linux/posix_acl.h>
15#include <linux/nfsacl.h> 9#include <linux/nfsacl.h>
10#include <linux/gfp.h>
11#include "cache.h"
12#include "xdr3.h"
13#include "vfs.h"
16 14
17#define RETURN_STATUS(st) { resp->status = (st); return (st); } 15#define RETURN_STATUS(st) { resp->status = (st); return (st); }
18 16
@@ -264,6 +262,6 @@ struct svc_version nfsd_acl_version3 = {
264 .vs_proc = nfsd_acl_procedures3, 262 .vs_proc = nfsd_acl_procedures3,
265 .vs_dispatch = nfsd_dispatch, 263 .vs_dispatch = nfsd_dispatch,
266 .vs_xdrsize = NFS3_SVC_XDRSIZE, 264 .vs_xdrsize = NFS3_SVC_XDRSIZE,
267 .vs_hidden = 1, 265 .vs_hidden = 0,
268}; 266};
269 267
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index a713c418a922..3d68f45a37b9 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -1,30 +1,16 @@
1/* 1/*
2 * linux/fs/nfsd/nfs3proc.c
3 *
4 * Process version 3 NFS requests. 2 * Process version 3 NFS requests.
5 * 3 *
6 * Copyright (C) 1996, 1997, 1998 Olaf Kirch <okir@monad.swb.de> 4 * Copyright (C) 1996, 1997, 1998 Olaf Kirch <okir@monad.swb.de>
7 */ 5 */
8 6
9#include <linux/linkage.h>
10#include <linux/time.h>
11#include <linux/errno.h>
12#include <linux/fs.h> 7#include <linux/fs.h>
13#include <linux/ext2_fs.h> 8#include <linux/ext2_fs.h>
14#include <linux/stat.h>
15#include <linux/fcntl.h>
16#include <linux/net.h>
17#include <linux/in.h>
18#include <linux/unistd.h>
19#include <linux/slab.h>
20#include <linux/major.h>
21#include <linux/magic.h> 9#include <linux/magic.h>
22 10
23#include <linux/sunrpc/svc.h> 11#include "cache.h"
24#include <linux/nfsd/nfsd.h> 12#include "xdr3.h"
25#include <linux/nfsd/cache.h> 13#include "vfs.h"
26#include <linux/nfsd/xdr3.h>
27#include <linux/nfs3.h>
28 14
29#define NFSDDBG_FACILITY NFSDDBG_PROC 15#define NFSDDBG_FACILITY NFSDDBG_PROC
30 16
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index d0a2ce1b4324..2a533a0af2a9 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * linux/fs/nfsd/nfs3xdr.c
3 *
4 * XDR support for nfsd/protocol version 3. 2 * XDR support for nfsd/protocol version 3.
5 * 3 *
6 * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> 4 * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
@@ -8,19 +6,8 @@
8 * 2003-08-09 Jamie Lokier: Use htonl() for nanoseconds, not htons()! 6 * 2003-08-09 Jamie Lokier: Use htonl() for nanoseconds, not htons()!
9 */ 7 */
10 8
11#include <linux/types.h>
12#include <linux/time.h>
13#include <linux/nfs3.h>
14#include <linux/list.h>
15#include <linux/spinlock.h>
16#include <linux/dcache.h>
17#include <linux/namei.h> 9#include <linux/namei.h>
18#include <linux/mm.h> 10#include "xdr3.h"
19#include <linux/vfs.h>
20#include <linux/sunrpc/xdr.h>
21#include <linux/sunrpc/svc.h>
22#include <linux/nfsd/nfsd.h>
23#include <linux/nfsd/xdr3.h>
24#include "auth.h" 11#include "auth.h"
25 12
26#define NFSDDBG_FACILITY NFSDDBG_XDR 13#define NFSDDBG_FACILITY NFSDDBG_XDR
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 725d02f210e2..e48052615159 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * fs/nfs4acl/acl.c
3 *
4 * Common NFSv4 ACL handling code. 2 * Common NFSv4 ACL handling code.
5 * 3 *
6 * Copyright (c) 2002, 2003 The Regents of the University of Michigan. 4 * Copyright (c) 2002, 2003 The Regents of the University of Michigan.
@@ -36,15 +34,8 @@
36 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 */ 35 */
38 36
39#include <linux/string.h>
40#include <linux/slab.h> 37#include <linux/slab.h>
41#include <linux/list.h>
42#include <linux/types.h>
43#include <linux/fs.h>
44#include <linux/module.h>
45#include <linux/nfs_fs.h> 38#include <linux/nfs_fs.h>
46#include <linux/posix_acl.h>
47#include <linux/nfs4.h>
48#include <linux/nfs4_acl.h> 39#include <linux/nfs4_acl.h>
49 40
50 41
@@ -389,7 +380,7 @@ sort_pacl(struct posix_acl *pacl)
389 sort_pacl_range(pacl, 1, i-1); 380 sort_pacl_range(pacl, 1, i-1);
390 381
391 BUG_ON(pacl->a_entries[i].e_tag != ACL_GROUP_OBJ); 382 BUG_ON(pacl->a_entries[i].e_tag != ACL_GROUP_OBJ);
392 j = i++; 383 j = ++i;
393 while (pacl->a_entries[j].e_tag == ACL_GROUP) 384 while (pacl->a_entries[j].e_tag == ACL_GROUP)
394 j++; 385 j++;
395 sort_pacl_range(pacl, i, j-1); 386 sort_pacl_range(pacl, i, j-1);
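The one-character change above fixes an off-by-one. With i indexing the ACL_GROUP_OBJ entry before the statement:

/*
 * old:	j = i++;   =>  j == i_old, i == i_old + 1
 *	The ACL_GROUP scan starts at the ACL_GROUP_OBJ entry itself,
 *	whose tag is not ACL_GROUP, so the while loop never advances j;
 *	sort_pacl_range(pacl, i, j - 1) then gets an inverted range and
 *	the ACL_GROUP entries are never sorted.
 *
 * new:	j = ++i;   =>  j == i == i_old + 1
 *	The scan starts at the first candidate ACL_GROUP entry, j runs
 *	past the end of that run, and [i, j - 1] covers exactly the
 *	entries that need sorting.
 */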
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 24e8d78f8dde..7e32bd394e86 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * linux/fs/nfsd/nfs4callback.c
3 *
4 * Copyright (c) 2001 The Regents of the University of Michigan. 2 * Copyright (c) 2001 The Regents of the University of Michigan.
5 * All rights reserved. 3 * All rights reserved.
6 * 4 *
@@ -33,22 +31,10 @@
33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 */ 32 */
35 33
36#include <linux/module.h>
37#include <linux/list.h>
38#include <linux/inet.h>
39#include <linux/errno.h>
40#include <linux/delay.h>
41#include <linux/sched.h>
42#include <linux/kthread.h>
43#include <linux/sunrpc/xdr.h>
44#include <linux/sunrpc/svc.h>
45#include <linux/sunrpc/clnt.h> 34#include <linux/sunrpc/clnt.h>
46#include <linux/sunrpc/svcsock.h> 35#include <linux/slab.h>
47#include <linux/nfsd/nfsd.h> 36#include "nfsd.h"
48#include <linux/nfsd/state.h> 37#include "state.h"
49#include <linux/sunrpc/sched.h>
50#include <linux/nfs4.h>
51#include <linux/sunrpc/xprtsock.h>
52 38
53#define NFSDDBG_FACILITY NFSDDBG_PROC 39#define NFSDDBG_FACILITY NFSDDBG_PROC
54 40
@@ -540,6 +526,8 @@ static struct rpc_cred *callback_cred;
540 526
541int set_callback_cred(void) 527int set_callback_cred(void)
542{ 528{
529 if (callback_cred)
530 return 0;
543 callback_cred = rpc_lookup_machine_cred(); 531 callback_cred = rpc_lookup_machine_cred();
544 if (!callback_cred) 532 if (!callback_cred)
545 return -ENOMEM; 533 return -ENOMEM;
@@ -557,7 +545,8 @@ void do_probe_callback(struct nfs4_client *clp)
557 }; 545 };
558 int status; 546 int status;
559 547
560 status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT, 548 status = rpc_call_async(cb->cb_client, &msg,
549 RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
561 &nfsd4_cb_probe_ops, (void *)clp); 550 &nfsd4_cb_probe_ops, (void *)clp);
562 if (status) { 551 if (status) {
563 warn_no_callback_path(clp, status); 552 warn_no_callback_path(clp, status);
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index ba2c199592fd..c78dbf493424 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * fs/nfsd/nfs4idmap.c
3 *
4 * Mapping of UID/GIDs to name and vice versa. 2 * Mapping of UID/GIDs to name and vice versa.
5 * 3 *
6 * Copyright (c) 2002, 2003 The Regents of the University of 4 * Copyright (c) 2002, 2003 The Regents of the University of
@@ -35,22 +33,10 @@
35 */ 33 */
36 34
37#include <linux/module.h> 35#include <linux/module.h>
38#include <linux/init.h>
39
40#include <linux/mm.h>
41#include <linux/errno.h>
42#include <linux/string.h>
43#include <linux/sunrpc/clnt.h>
44#include <linux/nfs.h>
45#include <linux/nfs4.h>
46#include <linux/nfs_fs.h>
47#include <linux/nfs_page.h>
48#include <linux/sunrpc/cache.h>
49#include <linux/nfsd_idmap.h> 36#include <linux/nfsd_idmap.h>
50#include <linux/list.h>
51#include <linux/time.h>
52#include <linux/seq_file.h> 37#include <linux/seq_file.h>
53#include <linux/sunrpc/svcauth.h> 38#include <linux/sched.h>
39#include <linux/slab.h>
54 40
55/* 41/*
56 * Cache entry 42 * Cache entry
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index bebc0c2e1b0a..2ab9e8501bfe 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * fs/nfsd/nfs4proc.c
3 *
4 * Server-side procedures for NFSv4. 2 * Server-side procedures for NFSv4.
5 * 3 *
6 * Copyright (c) 2002 The Regents of the University of Michigan. 4 * Copyright (c) 2002 The Regents of the University of Michigan.
@@ -34,20 +32,12 @@
34 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 32 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 */ 34 */
37
38#include <linux/param.h>
39#include <linux/major.h>
40#include <linux/slab.h>
41#include <linux/file.h> 35#include <linux/file.h>
36#include <linux/slab.h>
42 37
43#include <linux/sunrpc/svc.h> 38#include "cache.h"
44#include <linux/nfsd/nfsd.h> 39#include "xdr4.h"
45#include <linux/nfsd/cache.h> 40#include "vfs.h"
46#include <linux/nfs4.h>
47#include <linux/nfsd/state.h>
48#include <linux/nfsd/xdr4.h>
49#include <linux/nfs4_acl.h>
50#include <linux/sunrpc/gss_api.h>
51 41
52#define NFSDDBG_FACILITY NFSDDBG_PROC 42#define NFSDDBG_FACILITY NFSDDBG_PROC
53 43
@@ -170,7 +160,7 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
170 accmode |= NFSD_MAY_READ; 160 accmode |= NFSD_MAY_READ;
171 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) 161 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
172 accmode |= (NFSD_MAY_WRITE | NFSD_MAY_TRUNC); 162 accmode |= (NFSD_MAY_WRITE | NFSD_MAY_TRUNC);
173 if (open->op_share_deny & NFS4_SHARE_DENY_WRITE) 163 if (open->op_share_deny & NFS4_SHARE_DENY_READ)
174 accmode |= NFSD_MAY_WRITE; 164 accmode |= NFSD_MAY_WRITE;
175 165
176 status = fh_verify(rqstp, current_fh, S_IFREG, accmode); 166 status = fh_verify(rqstp, current_fh, S_IFREG, accmode);
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index b5348405046b..7a9ae3254a4b 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -1,6 +1,4 @@
1/* 1/*
2* linux/fs/nfsd/nfs4recover.c
3*
4* Copyright (c) 2004 The Regents of the University of Michigan. 2* Copyright (c) 2004 The Regents of the University of Michigan.
5* All rights reserved. 3* All rights reserved.
6* 4*
@@ -33,20 +31,15 @@
33* 31*
34*/ 32*/
35 33
36#include <linux/err.h>
37#include <linux/sunrpc/svc.h>
38#include <linux/nfsd/nfsd.h>
39#include <linux/nfs4.h>
40#include <linux/nfsd/state.h>
41#include <linux/nfsd/xdr4.h>
42#include <linux/param.h>
43#include <linux/file.h> 34#include <linux/file.h>
35#include <linux/slab.h>
44#include <linux/namei.h> 36#include <linux/namei.h>
45#include <asm/uaccess.h>
46#include <linux/scatterlist.h>
47#include <linux/crypto.h> 37#include <linux/crypto.h>
48#include <linux/sched.h> 38#include <linux/sched.h>
49#include <linux/mount.h> 39
40#include "nfsd.h"
41#include "state.h"
42#include "vfs.h"
50 43
51#define NFSDDBG_FACILITY NFSDDBG_PROC 44#define NFSDDBG_FACILITY NFSDDBG_PROC
52 45
@@ -127,9 +120,7 @@ out_no_tfm:
127static void 120static void
128nfsd4_sync_rec_dir(void) 121nfsd4_sync_rec_dir(void)
129{ 122{
130 mutex_lock(&rec_dir.dentry->d_inode->i_mutex); 123 vfs_fsync(NULL, rec_dir.dentry, 0);
131 nfsd_sync_dir(rec_dir.dentry);
132 mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
133} 124}
134 125
135int 126int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2153f9bdbebd..6a8fedaa4f55 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1,6 +1,4 @@
1/* 1/*
2* linux/fs/nfsd/nfs4state.c
3*
4* Copyright (c) 2001 The Regents of the University of Michigan. 2* Copyright (c) 2001 The Regents of the University of Michigan.
5* All rights reserved. 3* All rights reserved.
6* 4*
@@ -34,28 +32,15 @@
34* 32*
35*/ 33*/
36 34
37#include <linux/param.h>
38#include <linux/major.h>
39#include <linux/slab.h>
40
41#include <linux/sunrpc/svc.h>
42#include <linux/nfsd/nfsd.h>
43#include <linux/nfsd/cache.h>
44#include <linux/file.h> 35#include <linux/file.h>
45#include <linux/mount.h>
46#include <linux/workqueue.h>
47#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
48#include <linux/kthread.h> 37#include <linux/slab.h>
49#include <linux/nfs4.h>
50#include <linux/nfsd/state.h>
51#include <linux/nfsd/xdr4.h>
52#include <linux/namei.h> 38#include <linux/namei.h>
53#include <linux/swap.h> 39#include <linux/swap.h>
54#include <linux/mutex.h>
55#include <linux/lockd/bind.h>
56#include <linux/module.h>
57#include <linux/sunrpc/svcauth_gss.h> 40#include <linux/sunrpc/svcauth_gss.h>
58#include <linux/sunrpc/clnt.h> 41#include <linux/sunrpc/clnt.h>
42#include "xdr4.h"
43#include "vfs.h"
59 44
60#define NFSDDBG_FACILITY NFSDDBG_PROC 45#define NFSDDBG_FACILITY NFSDDBG_PROC
61 46
@@ -477,13 +462,14 @@ static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan)
477 462
478/* 463/*
479 * fchan holds the client values on input, and the server values on output 464 * fchan holds the client values on input, and the server values on output
465 * sv_max_mesg is the maximum payload plus one page for overhead.
480 */ 466 */
481static int init_forechannel_attrs(struct svc_rqst *rqstp, 467static int init_forechannel_attrs(struct svc_rqst *rqstp,
482 struct nfsd4_channel_attrs *session_fchan, 468 struct nfsd4_channel_attrs *session_fchan,
483 struct nfsd4_channel_attrs *fchan) 469 struct nfsd4_channel_attrs *fchan)
484{ 470{
485 int status = 0; 471 int status = 0;
486 __u32 maxcount = svc_max_payload(rqstp); 472 __u32 maxcount = nfsd_serv->sv_max_mesg;
487 473
488 /* headerpadsz set to zero in encode routine */ 474 /* headerpadsz set to zero in encode routine */
489 475
@@ -523,6 +509,15 @@ free_session_slots(struct nfsd4_session *ses)
523 kfree(ses->se_slots[i]); 509 kfree(ses->se_slots[i]);
524} 510}
525 511
512/*
513 * We don't actually need to cache the rpc and session headers, so we
514 * can allocate a little less for each slot:
515 */
516static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
517{
518 return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
519}
520
526static int 521static int
527alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, 522alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
528 struct nfsd4_create_session *cses) 523 struct nfsd4_create_session *cses)
@@ -554,7 +549,7 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
554 memcpy(new, &tmp, sizeof(*new)); 549 memcpy(new, &tmp, sizeof(*new));
555 550
556 /* allocate each struct nfsd4_slot and data cache in one piece */ 551 /* allocate each struct nfsd4_slot and data cache in one piece */
557 cachesize = new->se_fchannel.maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; 552 cachesize = slot_bytes(&new->se_fchannel);
558 for (i = 0; i < new->se_fchannel.maxreqs; i++) { 553 for (i = 0; i < new->se_fchannel.maxreqs; i++) {
559 sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL); 554 sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL);
560 if (!sp) 555 if (!sp)
@@ -628,10 +623,12 @@ void
628free_session(struct kref *kref) 623free_session(struct kref *kref)
629{ 624{
630 struct nfsd4_session *ses; 625 struct nfsd4_session *ses;
626 int mem;
631 627
632 ses = container_of(kref, struct nfsd4_session, se_ref); 628 ses = container_of(kref, struct nfsd4_session, se_ref);
633 spin_lock(&nfsd_drc_lock); 629 spin_lock(&nfsd_drc_lock);
634 nfsd_drc_mem_used -= ses->se_fchannel.maxreqs * NFSD_SLOT_CACHE_SIZE; 630 mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
631 nfsd_drc_mem_used -= mem;
635 spin_unlock(&nfsd_drc_lock); 632 spin_unlock(&nfsd_drc_lock);
636 free_session_slots(ses); 633 free_session_slots(ses);
637 kfree(ses); 634 kfree(ses);
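slot_bytes() gives slot allocation and the global DRC accounting a single source of truth. The invariant, as visible in the two hunks above (the reservation side is presumably updated the same way elsewhere in the patch, outside this excerpt):

/*
 * Each slot is allocated as
 *	kzalloc(sizeof(struct nfsd4_slot) + slot_bytes(&fchan), GFP_KERNEL)
 * and free_session() releases
 *	fchan.maxreqs * slot_bytes(&fchan)
 * from nfsd_drc_mem_used, where slot_bytes() excludes the
 * NFSD_MIN_HDR_SEQ_SZ bytes of RPC and SEQUENCE header that are
 * never cached.
 */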
@@ -2002,7 +1999,9 @@ nfs4_file_downgrade(struct file *filp, unsigned int share_access)
2002{ 1999{
2003 if (share_access & NFS4_SHARE_ACCESS_WRITE) { 2000 if (share_access & NFS4_SHARE_ACCESS_WRITE) {
2004 drop_file_write_access(filp); 2001 drop_file_write_access(filp);
2002 spin_lock(&filp->f_lock);
2005 filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE; 2003 filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE;
2004 spin_unlock(&filp->f_lock);
2006 } 2005 }
2007} 2006}
2008 2007
@@ -2404,11 +2403,8 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2404 2403
2405 memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); 2404 memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid));
2406 2405
2407 dprintk("NFSD: delegation stateid=(%08x/%08x/%08x/%08x)\n\n", 2406 dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
2408 dp->dl_stateid.si_boot, 2407 STATEID_VAL(&dp->dl_stateid));
2409 dp->dl_stateid.si_stateownerid,
2410 dp->dl_stateid.si_fileid,
2411 dp->dl_stateid.si_generation);
2412out: 2408out:
2413 if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS 2409 if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS
2414 && flag == NFS4_OPEN_DELEGATE_NONE 2410 && flag == NFS4_OPEN_DELEGATE_NONE
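STATEID_FMT and STATEID_VAL are not defined anywhere in this excerpt; judging by the four %08x fields they replace throughout these hunks, the new private header presumably carries something like:

/* Assumed definitions (not shown in this diff): */
#define STATEID_FMT	"(%08x/%08x/%08x/%08x)"
#define STATEID_VAL(s)	(s)->si_boot, (s)->si_stateownerid, \
			(s)->si_fileid, (s)->si_generation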
@@ -2487,8 +2483,10 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
2487 } 2483 }
2488 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); 2484 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
2489 2485
2490 if (nfsd4_has_session(&resp->cstate)) 2486 if (nfsd4_has_session(&resp->cstate)) {
2491 open->op_stateowner->so_confirmed = 1; 2487 open->op_stateowner->so_confirmed = 1;
2488 nfsd4_create_clid_dir(open->op_stateowner->so_client);
2489 }
2492 2490
2493 /* 2491 /*
2494 * Attempt to hand out a delegation. No error return, because the 2492 * Attempt to hand out a delegation. No error return, because the
@@ -2498,9 +2496,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
2498 2496
2499 status = nfs_ok; 2497 status = nfs_ok;
2500 2498
2501 dprintk("nfs4_process_open2: stateid=(%08x/%08x/%08x/%08x)\n", 2499 dprintk("%s: stateid=" STATEID_FMT "\n", __func__,
2502 stp->st_stateid.si_boot, stp->st_stateid.si_stateownerid, 2500 STATEID_VAL(&stp->st_stateid));
2503 stp->st_stateid.si_fileid, stp->st_stateid.si_generation);
2504out: 2501out:
2505 if (fp) 2502 if (fp)
2506 put_nfs4_file(fp); 2503 put_nfs4_file(fp);
@@ -2666,9 +2663,8 @@ STALE_STATEID(stateid_t *stateid)
2666{ 2663{
2667 if (time_after((unsigned long)boot_time, 2664 if (time_after((unsigned long)boot_time,
2668 (unsigned long)stateid->si_boot)) { 2665 (unsigned long)stateid->si_boot)) {
2669 dprintk("NFSD: stale stateid (%08x/%08x/%08x/%08x)!\n", 2666 dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
2670 stateid->si_boot, stateid->si_stateownerid, 2667 STATEID_VAL(stateid));
2671 stateid->si_fileid, stateid->si_generation);
2672 return 1; 2668 return 1;
2673 } 2669 }
2674 return 0; 2670 return 0;
@@ -2680,9 +2676,8 @@ EXPIRED_STATEID(stateid_t *stateid)
2680 if (time_before((unsigned long)boot_time, 2676 if (time_before((unsigned long)boot_time,
2681 ((unsigned long)stateid->si_boot)) && 2677 ((unsigned long)stateid->si_boot)) &&
2682 time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) { 2678 time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) {
2683 dprintk("NFSD: expired stateid (%08x/%08x/%08x/%08x)!\n", 2679 dprintk("NFSD: expired stateid " STATEID_FMT "!\n",
2684 stateid->si_boot, stateid->si_stateownerid, 2680 STATEID_VAL(stateid));
2685 stateid->si_fileid, stateid->si_generation);
2686 return 1; 2681 return 1;
2687 } 2682 }
2688 return 0; 2683 return 0;
@@ -2696,9 +2691,8 @@ stateid_error_map(stateid_t *stateid)
2696 if (EXPIRED_STATEID(stateid)) 2691 if (EXPIRED_STATEID(stateid))
2697 return nfserr_expired; 2692 return nfserr_expired;
2698 2693
2699 dprintk("NFSD: bad stateid (%08x/%08x/%08x/%08x)!\n", 2694 dprintk("NFSD: bad stateid " STATEID_FMT "!\n",
2700 stateid->si_boot, stateid->si_stateownerid, 2695 STATEID_VAL(stateid));
2701 stateid->si_fileid, stateid->si_generation);
2702 return nfserr_bad_stateid; 2696 return nfserr_bad_stateid;
2703} 2697}
2704 2698
@@ -2884,10 +2878,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
2884 struct svc_fh *current_fh = &cstate->current_fh; 2878 struct svc_fh *current_fh = &cstate->current_fh;
2885 __be32 status; 2879 __be32 status;
2886 2880
2887 dprintk("NFSD: preprocess_seqid_op: seqid=%d " 2881 dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__,
2888 "stateid = (%08x/%08x/%08x/%08x)\n", seqid, 2882 seqid, STATEID_VAL(stateid));
2889 stateid->si_boot, stateid->si_stateownerid, stateid->si_fileid,
2890 stateid->si_generation);
2891 2883
2892 *stpp = NULL; 2884 *stpp = NULL;
2893 *sopp = NULL; 2885 *sopp = NULL;
@@ -3019,12 +3011,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3019 sop->so_confirmed = 1; 3011 sop->so_confirmed = 1;
3020 update_stateid(&stp->st_stateid); 3012 update_stateid(&stp->st_stateid);
3021 memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t)); 3013 memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t));
3022 dprintk("NFSD: nfsd4_open_confirm: success, seqid=%d " 3014 dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
3023 "stateid=(%08x/%08x/%08x/%08x)\n", oc->oc_seqid, 3015 __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid));
3024 stp->st_stateid.si_boot,
3025 stp->st_stateid.si_stateownerid,
3026 stp->st_stateid.si_fileid,
3027 stp->st_stateid.si_generation);
3028 3016
3029 nfsd4_create_clid_dir(sop->so_client); 3017 nfsd4_create_clid_dir(sop->so_client);
3030out: 3018out:
@@ -3283,9 +3271,8 @@ find_delegation_stateid(struct inode *ino, stateid_t *stid)
3283 struct nfs4_file *fp; 3271 struct nfs4_file *fp;
3284 struct nfs4_delegation *dl; 3272 struct nfs4_delegation *dl;
3285 3273
3286 dprintk("NFSD:find_delegation_stateid stateid=(%08x/%08x/%08x/%08x)\n", 3274 dprintk("NFSD: %s: stateid=" STATEID_FMT "\n", __func__,
3287 stid->si_boot, stid->si_stateownerid, 3275 STATEID_VAL(stid));
3288 stid->si_fileid, stid->si_generation);
3289 3276
3290 fp = find_file(ino); 3277 fp = find_file(ino);
3291 if (!fp) 3278 if (!fp)
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 0fbd50cee1f6..e1703175ee28 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -40,24 +40,17 @@
40 * at the end of nfs4svc_decode_compoundargs. 40 * at the end of nfs4svc_decode_compoundargs.
41 */ 41 */
42 42
43#include <linux/param.h> 43#include <linux/slab.h>
44#include <linux/smp.h>
45#include <linux/fs.h>
46#include <linux/namei.h> 44#include <linux/namei.h>
47#include <linux/vfs.h> 45#include <linux/statfs.h>
48#include <linux/utsname.h> 46#include <linux/utsname.h>
49#include <linux/sunrpc/xdr.h>
50#include <linux/sunrpc/svc.h>
51#include <linux/sunrpc/clnt.h>
52#include <linux/nfsd/nfsd.h>
53#include <linux/nfsd/state.h>
54#include <linux/nfsd/xdr4.h>
55#include <linux/nfsd_idmap.h> 47#include <linux/nfsd_idmap.h>
56#include <linux/nfs4.h>
57#include <linux/nfs4_acl.h> 48#include <linux/nfs4_acl.h>
58#include <linux/sunrpc/gss_api.h>
59#include <linux/sunrpc/svcauth_gss.h> 49#include <linux/sunrpc/svcauth_gss.h>
60 50
51#include "xdr4.h"
52#include "vfs.h"
53
61#define NFSDDBG_FACILITY NFSDDBG_XDR 54#define NFSDDBG_FACILITY NFSDDBG_XDR
62 55
63/* 56/*
@@ -1442,7 +1435,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1442 } 1435 }
1443 op->opnum = ntohl(*argp->p++); 1436 op->opnum = ntohl(*argp->p++);
1444 1437
1445 if (op->opnum >= OP_ACCESS && op->opnum < ops->nops) 1438 if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP)
1446 op->status = ops->decoders[op->opnum](argp, &op->u); 1439 op->status = ops->decoders[op->opnum](argp, &op->u);
1447 else { 1440 else {
1448 op->opnum = OP_ILLEGAL; 1441 op->opnum = OP_ILLEGAL;
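On the opnum check above: the bound is now the protocol's own opcode range rather than the nops count of the per-minorversion decoder table.

/*
 * Assumed constants (from the NFSv4 opcode enum in nfs4.h):
 *	FIRST_NFS4_OP	OP_ACCESS, the lowest COMPOUND opcode
 *	LAST_NFS4_OP	the highest opcode this server knows about
 * Anything outside [FIRST_NFS4_OP, LAST_NFS4_OP] is rewritten to
 * OP_ILLEGAL before decoders[] is indexed.
 */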
@@ -1536,7 +1529,7 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
1536 } } while (0); 1529 } } while (0);
1537 1530
1538/* Encode as an array of strings the string given with components 1531/* Encode as an array of strings the string given with components
1539 * seperated @sep. 1532 * separated @sep.
1540 */ 1533 */
1541static __be32 nfsd4_encode_components(char sep, char *components, 1534static __be32 nfsd4_encode_components(char sep, char *components,
1542 __be32 **pp, int *buflen) 1535 __be32 **pp, int *buflen)
@@ -2129,9 +2122,15 @@ out_acl:
2129 * and this is the root of a cross-mounted filesystem. 2122 * and this is the root of a cross-mounted filesystem.
2130 */ 2123 */
2131 if (ignore_crossmnt == 0 && 2124 if (ignore_crossmnt == 0 &&
2132 exp->ex_path.mnt->mnt_root->d_inode == dentry->d_inode) { 2125 dentry == exp->ex_path.mnt->mnt_root) {
2133 err = vfs_getattr(exp->ex_path.mnt->mnt_parent, 2126 struct path path = exp->ex_path;
2134 exp->ex_path.mnt->mnt_mountpoint, &stat); 2127 path_get(&path);
2128 while (follow_up(&path)) {
2129 if (path.dentry != path.mnt->mnt_root)
2130 break;
2131 }
2132 err = vfs_getattr(path.mnt, path.dentry, &stat);
2133 path_put(&path);
2135 if (err) 2134 if (err)
2136 goto out_nfserr; 2135 goto out_nfserr;
2137 } 2136 }
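The replacement for the removed mnt_parent/mnt_mountpoint dereference climbs the mount stack through the public API: follow_up() moves a struct path from a mount's root to the mountpoint covering it in the parent mount, and the loop keeps climbing while that mountpoint is itself a mount root (stacked mounts).

/*
 * Worked example (assumed layout): mount B stacked on top of
 * mount A, which covers /export. Starting at B's root:
 *
 *	follow_up(&path)  ->  A's root    (a mnt_root: keep climbing)
 *	follow_up(&path)  ->  /export     (not a mnt_root: stop)
 *
 * vfs_getattr() then reports the covering mountpoint's attributes
 * without poking at mnt_parent/mnt_mountpoint directly.
 */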
@@ -2204,11 +2203,14 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
2204 * we will not follow the cross mount and will fill the attributes 2203
2205 * directly from the mountpoint dentry. 2204 * directly from the mountpoint dentry.
2206 */ 2205 */
2207 if (d_mountpoint(dentry) && !attributes_need_mount(cd->rd_bmval)) 2206 if (nfsd_mountpoint(dentry, exp)) {
2208 ignore_crossmnt = 1;
2209 else if (d_mountpoint(dentry)) {
2210 int err; 2207 int err;
2211 2208
2209 if (!(exp->ex_flags & NFSEXP_V4ROOT)
2210 && !attributes_need_mount(cd->rd_bmval)) {
2211 ignore_crossmnt = 1;
2212 goto out_encode;
2213 }
2212 /* 2214 /*
2213 * Why the heck aren't we just using nfsd_lookup?? 2215 * Why the heck aren't we just using nfsd_lookup??
2214 * Different "."/".." handling? Something else? 2216 * Different "."/".." handling? Something else?
@@ -2224,6 +2226,7 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
2224 goto out_put; 2226 goto out_put;
2225 2227
2226 } 2228 }
2229out_encode:
2227 nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval, 2230 nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval,
2228 cd->rd_rqstp, ignore_crossmnt); 2231 cd->rd_rqstp, ignore_crossmnt);
2229out_put: 2232out_put:
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 4638635c5d87..4666a209678a 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * linux/fs/nfsd/nfscache.c
3 *
4 * Request reply cache. This is currently a global cache, but this may 2 * Request reply cache. This is currently a global cache, but this may
5 * change in the future and be a per-client cache. 3 * change in the future and be a per-client cache.
6 * 4 *
@@ -10,16 +8,10 @@
10 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 8 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
11 */ 9 */
12 10
13#include <linux/kernel.h>
14#include <linux/time.h>
15#include <linux/slab.h> 11#include <linux/slab.h>
16#include <linux/string.h>
17#include <linux/spinlock.h>
18#include <linux/list.h>
19 12
20#include <linux/sunrpc/svc.h> 13#include "nfsd.h"
21#include <linux/nfsd/nfsd.h> 14#include "cache.h"
22#include <linux/nfsd/cache.h>
23 15
24/* Size of reply cache. Common values are: 16/* Size of reply cache. Common values are:
25 * 4.3BSD: 128 17 * 4.3BSD: 128
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 5c01fc148ce8..e3591073098f 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1,46 +1,21 @@
1/* 1/*
2 * linux/fs/nfsd/nfsctl.c
3 *
4 * Syscall interface to knfsd. 2 * Syscall interface to knfsd.
5 * 3 *
6 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 4 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
7 */ 5 */
8 6
9#include <linux/module.h>
10
11#include <linux/linkage.h>
12#include <linux/time.h>
13#include <linux/errno.h>
14#include <linux/fs.h>
15#include <linux/namei.h>
16#include <linux/fcntl.h>
17#include <linux/net.h>
18#include <linux/in.h>
19#include <linux/syscalls.h>
20#include <linux/unistd.h>
21#include <linux/slab.h> 7#include <linux/slab.h>
22#include <linux/proc_fs.h> 8#include <linux/namei.h>
23#include <linux/seq_file.h>
24#include <linux/pagemap.h>
25#include <linux/init.h>
26#include <linux/inet.h>
27#include <linux/string.h>
28#include <linux/ctype.h> 9#include <linux/ctype.h>
29 10
30#include <linux/nfs.h>
31#include <linux/nfsd_idmap.h> 11#include <linux/nfsd_idmap.h>
32#include <linux/lockd/bind.h>
33#include <linux/sunrpc/svc.h>
34#include <linux/sunrpc/svcsock.h> 12#include <linux/sunrpc/svcsock.h>
35#include <linux/nfsd/nfsd.h>
36#include <linux/nfsd/cache.h>
37#include <linux/nfsd/xdr.h>
38#include <linux/nfsd/syscall.h> 13#include <linux/nfsd/syscall.h>
39#include <linux/lockd/lockd.h> 14#include <linux/lockd/lockd.h>
40#include <linux/sunrpc/clnt.h> 15#include <linux/sunrpc/clnt.h>
41 16
42#include <asm/uaccess.h> 17#include "nfsd.h"
43#include <net/ipv6.h> 18#include "cache.h"
44 19
45/* 20/*
46 * We have a single directory with 9 nodes in it. 21 * We have a single directory with 9 nodes in it.
@@ -55,6 +30,7 @@ enum {
55 NFSD_Getfd, 30 NFSD_Getfd,
56 NFSD_Getfs, 31 NFSD_Getfs,
57 NFSD_List, 32 NFSD_List,
33 NFSD_Export_features,
58 NFSD_Fh, 34 NFSD_Fh,
59 NFSD_FO_UnlockIP, 35 NFSD_FO_UnlockIP,
60 NFSD_FO_UnlockFS, 36 NFSD_FO_UnlockFS,
@@ -173,6 +149,24 @@ static const struct file_operations exports_operations = {
173 .owner = THIS_MODULE, 149 .owner = THIS_MODULE,
174}; 150};
175 151
152static int export_features_show(struct seq_file *m, void *v)
153{
154 seq_printf(m, "0x%x 0x%x\n", NFSEXP_ALLFLAGS, NFSEXP_SECINFO_FLAGS);
155 return 0;
156}
157
158static int export_features_open(struct inode *inode, struct file *file)
159{
160 return single_open(file, export_features_show, NULL);
161}
162
163static struct file_operations export_features_operations = {
164 .open = export_features_open,
165 .read = seq_read,
166 .llseek = seq_lseek,
167 .release = single_release,
168};
169
176extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); 170extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
177extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); 171extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
178 172
@@ -995,6 +989,7 @@ static ssize_t __write_ports_delfd(char *buf)
995static ssize_t __write_ports_addxprt(char *buf) 989static ssize_t __write_ports_addxprt(char *buf)
996{ 990{
997 char transport[16]; 991 char transport[16];
992 struct svc_xprt *xprt;
998 int port, err; 993 int port, err;
999 994
1000 if (sscanf(buf, "%15s %4u", transport, &port) != 2) 995 if (sscanf(buf, "%15s %4u", transport, &port) != 2)
@@ -1009,13 +1004,24 @@ static ssize_t __write_ports_addxprt(char *buf)
1009 1004
1010 err = svc_create_xprt(nfsd_serv, transport, 1005 err = svc_create_xprt(nfsd_serv, transport,
1011 PF_INET, port, SVC_SOCK_ANONYMOUS); 1006 PF_INET, port, SVC_SOCK_ANONYMOUS);
1012 if (err < 0) { 1007 if (err < 0)
1013 /* Give a reasonable perror msg for bad transport string */ 1008 goto out_err;
1014 if (err == -ENOENT) 1009
1015 err = -EPROTONOSUPPORT; 1010 err = svc_create_xprt(nfsd_serv, transport,
1016 return err; 1011 PF_INET6, port, SVC_SOCK_ANONYMOUS);
1017 } 1012 if (err < 0 && err != -EAFNOSUPPORT)
1013 goto out_close;
1018 return 0; 1014 return 0;
1015out_close:
1016 xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port);
1017 if (xprt != NULL) {
1018 svc_close_xprt(xprt);
1019 svc_xprt_put(xprt);
1020 }
1021out_err:
1022 /* Decrease the count, but don't shut down the service */
1023 nfsd_serv->sv_nrthreads--;
1024 return err;
1019} 1025}
1020 1026
1021/* 1027/*
@@ -1330,6 +1336,8 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1330 [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR}, 1336 [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR},
1331 [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR}, 1337 [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR},
1332 [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, 1338 [NFSD_List] = {"exports", &exports_operations, S_IRUGO},
1339 [NFSD_Export_features] = {"export_features",
1340 &export_features_operations, S_IRUGO},
1333 [NFSD_FO_UnlockIP] = {"unlock_ip", 1341 [NFSD_FO_UnlockIP] = {"unlock_ip",
1334 &transaction_ops, S_IWUSR|S_IRUSR}, 1342 &transaction_ops, S_IWUSR|S_IRUSR},
1335 [NFSD_FO_UnlockFS] = {"unlock_filesystem", 1343 [NFSD_FO_UnlockFS] = {"unlock_filesystem",
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
new file mode 100644
index 000000000000..e942a1aaac92
--- /dev/null
+++ b/fs/nfsd/nfsd.h
@@ -0,0 +1,338 @@
1/*
2 * Hodge-podge collection of knfsd-related stuff.
3 * I will sort this out later.
4 *
5 * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
6 */
7
8#ifndef LINUX_NFSD_NFSD_H
9#define LINUX_NFSD_NFSD_H
10
11#include <linux/types.h>
12#include <linux/mount.h>
13
14#include <linux/nfsd/debug.h>
15#include <linux/nfsd/export.h>
16#include <linux/nfsd/stats.h>
17/*
18 * nfsd version
19 */
20#define NFSD_SUPPORTED_MINOR_VERSION 1
21
22struct readdir_cd {
23 __be32 err; /* 0, nfserr, or nfserr_eof */
24};
25
26
27extern struct svc_program nfsd_program;
28extern struct svc_version nfsd_version2, nfsd_version3,
29 nfsd_version4;
30extern u32 nfsd_supported_minorversion;
31extern struct mutex nfsd_mutex;
32extern struct svc_serv *nfsd_serv;
33extern spinlock_t nfsd_drc_lock;
34extern unsigned int nfsd_drc_max_mem;
35extern unsigned int nfsd_drc_mem_used;
36
37extern const struct seq_operations nfs_exports_op;
38
39/*
40 * Function prototypes.
41 */
42int nfsd_svc(unsigned short port, int nrservs);
43int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp);
44
45int nfsd_nrthreads(void);
46int nfsd_nrpools(void);
47int nfsd_get_nrthreads(int n, int *);
48int nfsd_set_nrthreads(int n, int *);
49
50#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
51#ifdef CONFIG_NFSD_V2_ACL
52extern struct svc_version nfsd_acl_version2;
53#else
54#define nfsd_acl_version2 NULL
55#endif
56#ifdef CONFIG_NFSD_V3_ACL
57extern struct svc_version nfsd_acl_version3;
58#else
59#define nfsd_acl_version3 NULL
60#endif
61#endif
62
63enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL };
64int nfsd_vers(int vers, enum vers_op change);
65int nfsd_minorversion(u32 minorversion, enum vers_op change);
66void nfsd_reset_versions(void);
67int nfsd_create_serv(void);
68
69extern int nfsd_max_blksize;
70
71static inline int nfsd_v4client(struct svc_rqst *rq)
72{
73 return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4;
74}
75
76/*
77 * NFSv4 State
78 */
79#ifdef CONFIG_NFSD_V4
80extern unsigned int max_delegations;
81int nfs4_state_init(void);
82void nfsd4_free_slabs(void);
83int nfs4_state_start(void);
84void nfs4_state_shutdown(void);
85time_t nfs4_lease_time(void);
86void nfs4_reset_lease(time_t leasetime);
87int nfs4_reset_recoverydir(char *recdir);
88#else
89static inline int nfs4_state_init(void) { return 0; }
90static inline void nfsd4_free_slabs(void) { }
91static inline int nfs4_state_start(void) { return 0; }
92static inline void nfs4_state_shutdown(void) { }
93static inline time_t nfs4_lease_time(void) { return 0; }
94static inline void nfs4_reset_lease(time_t leasetime) { }
95static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
96#endif
97
98/*
99 * lockd binding
100 */
101void nfsd_lockd_init(void);
102void nfsd_lockd_shutdown(void);
103
104
105/*
106 * These macros provide pre-xdr'ed values for faster operation.
107 */
108#define nfs_ok cpu_to_be32(NFS_OK)
109#define nfserr_perm cpu_to_be32(NFSERR_PERM)
110#define nfserr_noent cpu_to_be32(NFSERR_NOENT)
111#define nfserr_io cpu_to_be32(NFSERR_IO)
112#define nfserr_nxio cpu_to_be32(NFSERR_NXIO)
113#define nfserr_eagain cpu_to_be32(NFSERR_EAGAIN)
114#define nfserr_acces cpu_to_be32(NFSERR_ACCES)
115#define nfserr_exist cpu_to_be32(NFSERR_EXIST)
116#define nfserr_xdev cpu_to_be32(NFSERR_XDEV)
117#define nfserr_nodev cpu_to_be32(NFSERR_NODEV)
118#define nfserr_notdir cpu_to_be32(NFSERR_NOTDIR)
119#define nfserr_isdir cpu_to_be32(NFSERR_ISDIR)
120#define nfserr_inval cpu_to_be32(NFSERR_INVAL)
121#define nfserr_fbig cpu_to_be32(NFSERR_FBIG)
122#define nfserr_nospc cpu_to_be32(NFSERR_NOSPC)
123#define nfserr_rofs cpu_to_be32(NFSERR_ROFS)
124#define nfserr_mlink cpu_to_be32(NFSERR_MLINK)
125#define nfserr_opnotsupp cpu_to_be32(NFSERR_OPNOTSUPP)
126#define nfserr_nametoolong cpu_to_be32(NFSERR_NAMETOOLONG)
127#define nfserr_notempty cpu_to_be32(NFSERR_NOTEMPTY)
128#define nfserr_dquot cpu_to_be32(NFSERR_DQUOT)
129#define nfserr_stale cpu_to_be32(NFSERR_STALE)
130#define nfserr_remote cpu_to_be32(NFSERR_REMOTE)
131#define nfserr_wflush cpu_to_be32(NFSERR_WFLUSH)
132#define nfserr_badhandle cpu_to_be32(NFSERR_BADHANDLE)
133#define nfserr_notsync cpu_to_be32(NFSERR_NOT_SYNC)
134#define nfserr_badcookie cpu_to_be32(NFSERR_BAD_COOKIE)
135#define nfserr_notsupp cpu_to_be32(NFSERR_NOTSUPP)
136#define nfserr_toosmall cpu_to_be32(NFSERR_TOOSMALL)
137#define nfserr_serverfault cpu_to_be32(NFSERR_SERVERFAULT)
138#define nfserr_badtype cpu_to_be32(NFSERR_BADTYPE)
139#define nfserr_jukebox cpu_to_be32(NFSERR_JUKEBOX)
140#define nfserr_denied cpu_to_be32(NFSERR_DENIED)
141#define nfserr_deadlock cpu_to_be32(NFSERR_DEADLOCK)
142#define nfserr_expired cpu_to_be32(NFSERR_EXPIRED)
143#define nfserr_bad_cookie cpu_to_be32(NFSERR_BAD_COOKIE)
144#define nfserr_same cpu_to_be32(NFSERR_SAME)
145#define nfserr_clid_inuse cpu_to_be32(NFSERR_CLID_INUSE)
146#define nfserr_stale_clientid cpu_to_be32(NFSERR_STALE_CLIENTID)
147#define nfserr_resource cpu_to_be32(NFSERR_RESOURCE)
148#define nfserr_moved cpu_to_be32(NFSERR_MOVED)
149#define nfserr_nofilehandle cpu_to_be32(NFSERR_NOFILEHANDLE)
150#define nfserr_minor_vers_mismatch cpu_to_be32(NFSERR_MINOR_VERS_MISMATCH)
151#define nfserr_share_denied cpu_to_be32(NFSERR_SHARE_DENIED)
152#define nfserr_stale_stateid cpu_to_be32(NFSERR_STALE_STATEID)
153#define nfserr_old_stateid cpu_to_be32(NFSERR_OLD_STATEID)
154#define nfserr_bad_stateid cpu_to_be32(NFSERR_BAD_STATEID)
155#define nfserr_bad_seqid cpu_to_be32(NFSERR_BAD_SEQID)
156#define nfserr_symlink cpu_to_be32(NFSERR_SYMLINK)
157#define nfserr_not_same cpu_to_be32(NFSERR_NOT_SAME)
158#define nfserr_restorefh cpu_to_be32(NFSERR_RESTOREFH)
159#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP)
160#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR)
161#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE)
162#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD)
163#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL)
164#define nfserr_grace cpu_to_be32(NFSERR_GRACE)
165#define nfserr_no_grace cpu_to_be32(NFSERR_NO_GRACE)
166#define nfserr_reclaim_bad cpu_to_be32(NFSERR_RECLAIM_BAD)
167#define nfserr_badname cpu_to_be32(NFSERR_BADNAME)
168#define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN)
169#define nfserr_locked cpu_to_be32(NFSERR_LOCKED)
170#define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC)
171#define nfserr_badiomode cpu_to_be32(NFS4ERR_BADIOMODE)
172#define nfserr_badlayout cpu_to_be32(NFS4ERR_BADLAYOUT)
173#define nfserr_bad_session_digest cpu_to_be32(NFS4ERR_BAD_SESSION_DIGEST)
174#define nfserr_badsession cpu_to_be32(NFS4ERR_BADSESSION)
175#define nfserr_badslot cpu_to_be32(NFS4ERR_BADSLOT)
176#define nfserr_complete_already cpu_to_be32(NFS4ERR_COMPLETE_ALREADY)
177#define nfserr_conn_not_bound_to_session cpu_to_be32(NFS4ERR_CONN_NOT_BOUND_TO_SESSION)
178#define nfserr_deleg_already_wanted cpu_to_be32(NFS4ERR_DELEG_ALREADY_WANTED)
179#define nfserr_back_chan_busy cpu_to_be32(NFS4ERR_BACK_CHAN_BUSY)
180#define nfserr_layouttrylater cpu_to_be32(NFS4ERR_LAYOUTTRYLATER)
181#define nfserr_layoutunavailable cpu_to_be32(NFS4ERR_LAYOUTUNAVAILABLE)
182#define nfserr_nomatching_layout cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT)
183#define nfserr_recallconflict cpu_to_be32(NFS4ERR_RECALLCONFLICT)
184#define nfserr_unknown_layouttype cpu_to_be32(NFS4ERR_UNKNOWN_LAYOUTTYPE)
185#define nfserr_seq_misordered cpu_to_be32(NFS4ERR_SEQ_MISORDERED)
186#define nfserr_sequence_pos cpu_to_be32(NFS4ERR_SEQUENCE_POS)
187#define nfserr_req_too_big cpu_to_be32(NFS4ERR_REQ_TOO_BIG)
188#define nfserr_rep_too_big cpu_to_be32(NFS4ERR_REP_TOO_BIG)
189#define nfserr_rep_too_big_to_cache cpu_to_be32(NFS4ERR_REP_TOO_BIG_TO_CACHE)
190#define nfserr_retry_uncached_rep cpu_to_be32(NFS4ERR_RETRY_UNCACHED_REP)
191#define nfserr_unsafe_compound cpu_to_be32(NFS4ERR_UNSAFE_COMPOUND)
192#define nfserr_too_many_ops cpu_to_be32(NFS4ERR_TOO_MANY_OPS)
193#define nfserr_op_not_in_session cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION)
194#define nfserr_hash_alg_unsupp cpu_to_be32(NFS4ERR_HASH_ALG_UNSUPP)
195#define nfserr_clientid_busy cpu_to_be32(NFS4ERR_CLIENTID_BUSY)
196#define nfserr_pnfs_io_hole cpu_to_be32(NFS4ERR_PNFS_IO_HOLE)
197#define nfserr_seq_false_retry cpu_to_be32(NFS4ERR_SEQ_FALSE_RETRY)
198#define nfserr_bad_high_slot cpu_to_be32(NFS4ERR_BAD_HIGH_SLOT)
199#define nfserr_deadsession cpu_to_be32(NFS4ERR_DEADSESSION)
200#define nfserr_encr_alg_unsupp cpu_to_be32(NFS4ERR_ENCR_ALG_UNSUPP)
201#define nfserr_pnfs_no_layout cpu_to_be32(NFS4ERR_PNFS_NO_LAYOUT)
202#define nfserr_not_only_op cpu_to_be32(NFS4ERR_NOT_ONLY_OP)
203#define nfserr_wrong_cred cpu_to_be32(NFS4ERR_WRONG_CRED)
204#define nfserr_wrong_type cpu_to_be32(NFS4ERR_WRONG_TYPE)
205#define nfserr_dirdeleg_unavail cpu_to_be32(NFS4ERR_DIRDELEG_UNAVAIL)
206#define nfserr_reject_deleg cpu_to_be32(NFS4ERR_REJECT_DELEG)
207#define nfserr_returnconflict cpu_to_be32(NFS4ERR_RETURNCONFLICT)
208#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED)
209
210/* error codes for internal use */
211/* If a request fails due to kmalloc failure, it gets dropped.
212 * The client should resend eventually.
213 */
214#define nfserr_dropit cpu_to_be32(30000)
215/* end-of-file indicator in readdir */
216#define nfserr_eof cpu_to_be32(30001)
217/* replay detected */
218#define nfserr_replay_me cpu_to_be32(11001)
219/* nfs41 replay detected */
220#define nfserr_replay_cache cpu_to_be32(11002)
221
222/* Check for dir entries '.' and '..' */
223#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.'))
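/* Editor's illustration (not part of the original header): how the macro
 * evaluates for a few directory-entry names of length l:
 *	isdotent(".", 1)	-> true
 *	isdotent("..", 2)	-> true
 *	isdotent(".a", 2)	-> false (second character is not '.')
 *	isdotent("...", 3)	-> false (length is not below 3)
 */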
224
225/*
226 * Time of server startup
227 */
228extern struct timeval nfssvc_boot;
229
230#ifdef CONFIG_NFSD_V4
231
232/* Before processing a COMPOUND operation, we have to check that there
233 * is enough space in the buffer for XDR encode to succeed. Otherwise,
234 * we might process an operation with side effects, and be unable to
235 * tell the client that the operation succeeded.
236 *
237 * COMPOUND_SLACK_SPACE - the minimum number of bytes of buffer space
238 * needed to encode an "ordinary" _successful_ operation. (GETATTR,
239 * READ, READDIR, and READLINK have their own buffer checks.) If we
240 * fall below this level, we fail the next operation with NFS4ERR_RESOURCE.
241 *
242 * COMPOUND_ERR_SLACK_SPACE - the minimum number of bytes of buffer space
243 * needed to encode an operation which has failed with NFS4ERR_RESOURCE.
244 * Care is taken to ensure that we never fall below this level for any
245 * reason.
246 */
247#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */
248#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */
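/*
 * Editor's sketch of the rule described above (hypothetical helper, not
 * the kernel's actual encoder interface; 'buf_end' and 'p' stand for the
 * reply-buffer limit and the current encode position): refuse to start
 * an operation once fewer than COMPOUND_SLACK_SPACE bytes remain, so
 * that an NFS4ERR_RESOURCE reply (COMPOUND_ERR_SLACK_SPACE bytes) is
 * always still encodable.
 */
static inline __be32 check_compound_slack(char *buf_end, char *p)
{
	if (buf_end - p < COMPOUND_SLACK_SPACE)
		return nfserr_resource;
	return nfs_ok;
}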
249
250#define NFSD_LEASE_TIME (nfs4_lease_time())
251#define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */
252
253/*
254 * The following attributes are currently not supported by the NFSv4 server:
255 * ARCHIVE (deprecated anyway)
256 * HIDDEN (unlikely to be supported any time soon)
257 * MIMETYPE (unlikely to be supported any time soon)
258 * QUOTA_* (will be supported in a forthcoming patch)
259 * SYSTEM (unlikely to be supported any time soon)
260 * TIME_BACKUP (unlikely to be supported any time soon)
261 * TIME_CREATE (unlikely to be supported any time soon)
262 */
263#define NFSD4_SUPPORTED_ATTRS_WORD0 \
264(FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_TYPE | FATTR4_WORD0_FH_EXPIRE_TYPE \
265 | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_LINK_SUPPORT \
266 | FATTR4_WORD0_SYMLINK_SUPPORT | FATTR4_WORD0_NAMED_ATTR | FATTR4_WORD0_FSID \
267 | FATTR4_WORD0_UNIQUE_HANDLES | FATTR4_WORD0_LEASE_TIME | FATTR4_WORD0_RDATTR_ERROR \
268 | FATTR4_WORD0_ACLSUPPORT | FATTR4_WORD0_CANSETTIME | FATTR4_WORD0_CASE_INSENSITIVE \
269 | FATTR4_WORD0_CASE_PRESERVING | FATTR4_WORD0_CHOWN_RESTRICTED \
270 | FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FILEID | FATTR4_WORD0_FILES_AVAIL \
271 | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_HOMOGENEOUS \
272 | FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME \
273 | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_ACL)
274
275#define NFSD4_SUPPORTED_ATTRS_WORD1 \
276(FATTR4_WORD1_MODE | FATTR4_WORD1_NO_TRUNC | FATTR4_WORD1_NUMLINKS \
277 | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \
278 | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \
279 | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \
280 | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \
281 | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID)
282
283#define NFSD4_SUPPORTED_ATTRS_WORD2 0
284
285#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
286 NFSD4_SUPPORTED_ATTRS_WORD0
287
288#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
289 NFSD4_SUPPORTED_ATTRS_WORD1
290
291#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
292 (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
293
294static inline u32 nfsd_suppattrs0(u32 minorversion)
295{
296 return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0
297 : NFSD4_SUPPORTED_ATTRS_WORD0;
298}
299
300static inline u32 nfsd_suppattrs1(u32 minorversion)
301{
302 return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD1
303 : NFSD4_SUPPORTED_ATTRS_WORD1;
304}
305
306static inline u32 nfsd_suppattrs2(u32 minorversion)
307{
308 return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD2
309 : NFSD4_SUPPORTED_ATTRS_WORD2;
310}
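/*
 * Editor's sketch (the helper name is illustrative): a client-supplied
 * attribute bitmap is acceptable only if it requests no bits outside
 * the words advertised above for the request's minorversion.
 */
static inline int nfsd_attrs_supported(u32 minorversion, u32 w0, u32 w1, u32 w2)
{
	return !(w0 & ~nfsd_suppattrs0(minorversion)) &&
	       !(w1 & ~nfsd_suppattrs1(minorversion)) &&
	       !(w2 & ~nfsd_suppattrs2(minorversion));
}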
311
312/* These will return ERR_INVAL if specified in GETATTR or READDIR. */
313#define NFSD_WRITEONLY_ATTRS_WORD1 \
314(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
315
316/* These are the only attrs allowed in CREATE/OPEN/SETATTR. */
317#define NFSD_WRITEABLE_ATTRS_WORD0 \
318(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL )
319#define NFSD_WRITEABLE_ATTRS_WORD1 \
320(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
321 | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
322#define NFSD_WRITEABLE_ATTRS_WORD2 0
323
324#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \
325 NFSD_WRITEABLE_ATTRS_WORD0
326/*
327 * we currently store the exclusive create verifier in the v_{a,m}time
328 * attributes so the client can't set these at create time using EXCLUSIVE4_1
329 */
330#define NFSD_SUPPATTR_EXCLCREAT_WORD1 \
331 (NFSD_WRITEABLE_ATTRS_WORD1 & \
332 ~(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET))
333#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \
334 NFSD_WRITEABLE_ATTRS_WORD2
335
336#endif /* CONFIG_NFSD_V4 */
337
338#endif /* LINUX_NFSD_NFSD_H */
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 01965b2f3a76..55c8e63af0be 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * linux/fs/nfsd/nfsfh.c
3 *
4 * NFS server file handle treatment. 2 * NFS server file handle treatment.
5 * 3 *
6 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 4 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
@@ -9,19 +7,11 @@
9 * ... and again Southern-Winter 2001 to support export_operations 7 * ... and again Southern-Winter 2001 to support export_operations
10 */ 8 */
11 9
12#include <linux/slab.h>
13#include <linux/fs.h>
14#include <linux/unistd.h>
15#include <linux/string.h>
16#include <linux/stat.h>
17#include <linux/dcache.h>
18#include <linux/exportfs.h> 10#include <linux/exportfs.h>
19#include <linux/mount.h>
20 11
21#include <linux/sunrpc/clnt.h>
22#include <linux/sunrpc/svc.h>
23#include <linux/sunrpc/svcauth_gss.h> 12#include <linux/sunrpc/svcauth_gss.h>
24#include <linux/nfsd/nfsd.h> 13#include "nfsd.h"
14#include "vfs.h"
25#include "auth.h" 15#include "auth.h"
26 16
27#define NFSDDBG_FACILITY NFSDDBG_FH 17#define NFSDDBG_FACILITY NFSDDBG_FH
@@ -96,8 +86,10 @@ nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type)
96static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, 86static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
97 struct svc_export *exp) 87 struct svc_export *exp)
98{ 88{
89 int flags = nfsexp_flags(rqstp, exp);
90
99 /* Check if the request originated from a secure port. */ 91 /* Check if the request originated from a secure port. */
100 if (!rqstp->rq_secure && EX_SECURE(exp)) { 92 if (!rqstp->rq_secure && !(flags & NFSEXP_INSECURE_PORT)) {
101 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); 93 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
102 dprintk(KERN_WARNING 94 dprintk(KERN_WARNING
103 "nfsd: request from insecure port %s!\n", 95 "nfsd: request from insecure port %s!\n",
@@ -109,6 +101,36 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
109 return nfserrno(nfsd_setuser(rqstp, exp)); 101 return nfserrno(nfsd_setuser(rqstp, exp));
110} 102}
111 103
104static inline __be32 check_pseudo_root(struct svc_rqst *rqstp,
105 struct dentry *dentry, struct svc_export *exp)
106{
107 if (!(exp->ex_flags & NFSEXP_V4ROOT))
108 return nfs_ok;
109 /*
110 * v2/v3 clients have no need for the V4ROOT export--they use
111 * the mount protocol instead; also, further V4ROOT checks may be
112 * in v4-specific code, in which case v2/v3 clients could bypass
113 * them.
114 */
115 if (!nfsd_v4client(rqstp))
116 return nfserr_stale;
117 /*
118 * We're exposing only the directories and symlinks that have to be
119 * traversed on the way to real exports:
120 */
121 if (unlikely(!S_ISDIR(dentry->d_inode->i_mode) &&
122 !S_ISLNK(dentry->d_inode->i_mode)))
123 return nfserr_stale;
124 /*
125 * A pseudoroot export gives permission to access only one
126 * single directory; the kernel has to make another upcall
127 * before granting access to anything else under it:
128 */
129 if (unlikely(dentry != exp->ex_path.dentry))
130 return nfserr_stale;
131 return nfs_ok;
132}
133
112/* 134/*
113 * Use the given filehandle to look up the corresponding export and 135 * Use the given filehandle to look up the corresponding export and
114 * dentry. On success, the results are used to set fh_export and 136 * dentry. On success, the results are used to set fh_export and
@@ -232,14 +254,6 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
232 goto out; 254 goto out;
233 } 255 }
234 256
235 if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) {
236 error = nfsd_setuser_and_check_port(rqstp, exp);
237 if (error) {
238 dput(dentry);
239 goto out;
240 }
241 }
242
243 if (S_ISDIR(dentry->d_inode->i_mode) && 257 if (S_ISDIR(dentry->d_inode->i_mode) &&
244 (dentry->d_flags & DCACHE_DISCONNECTED)) { 258 (dentry->d_flags & DCACHE_DISCONNECTED)) {
245 printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n", 259 printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n",
@@ -294,28 +308,32 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
294 error = nfsd_set_fh_dentry(rqstp, fhp); 308 error = nfsd_set_fh_dentry(rqstp, fhp);
295 if (error) 309 if (error)
296 goto out; 310 goto out;
297 dentry = fhp->fh_dentry;
298 exp = fhp->fh_export;
299 } else {
300 /*
301 * just rechecking permissions
302 * (e.g. nfsproc_create calls fh_verify, then nfsd_create
303 * does as well)
304 */
305 dprintk("nfsd: fh_verify - just checking\n");
306 dentry = fhp->fh_dentry;
307 exp = fhp->fh_export;
308 /*
309 * Set user creds for this exportpoint; necessary even
310 * in the "just checking" case because this may be a
311 * filehandle that was created by fh_compose, and that
312 * is about to be used in another nfsv4 compound
313 * operation.
314 */
315 error = nfsd_setuser_and_check_port(rqstp, exp);
316 if (error)
317 goto out;
318 } 311 }
312 dentry = fhp->fh_dentry;
313 exp = fhp->fh_export;
314 /*
315 * We still have to do all these permission checks, even when
316 * fh_dentry is already set:
317 * - fh_verify may be called multiple times with different
318 * "access" arguments (e.g. nfsd_proc_create calls
319 * fh_verify(...,NFSD_MAY_EXEC) first, then later (in
320 * nfsd_create) calls fh_verify(...,NFSD_MAY_CREATE).
321 * - in the NFSv4 case, the filehandle may have been filled
322 * in by fh_compose, and given a dentry, but further
323 * compound operations performed with that filehandle
324 * still need permissions checks. In the worst case, a
325 * mountpoint crossing may have changed the export
326 * options, and we may now need to use a different uid
327 * (for example, if different id-squashing options are in
328 * effect on the new filesystem).
329 */
330 error = check_pseudo_root(rqstp, dentry, exp);
331 if (error)
332 goto out;
333
334 error = nfsd_setuser_and_check_port(rqstp, exp);
335 if (error)
336 goto out;
319 337
320 error = nfsd_mode_check(rqstp, dentry->d_inode->i_mode, type); 338 error = nfsd_mode_check(rqstp, dentry->d_inode->i_mode, type);
321 if (error) 339 if (error)
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
new file mode 100644
index 000000000000..cdfb8c6a4206
--- /dev/null
+++ b/fs/nfsd/nfsfh.h
@@ -0,0 +1,208 @@
1/* Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */
2
3#ifndef _LINUX_NFSD_FH_INT_H
4#define _LINUX_NFSD_FH_INT_H
5
6#include <linux/nfsd/nfsfh.h>
7
8enum nfsd_fsid {
9 FSID_DEV = 0,
10 FSID_NUM,
11 FSID_MAJOR_MINOR,
12 FSID_ENCODE_DEV,
13 FSID_UUID4_INUM,
14 FSID_UUID8,
15 FSID_UUID16,
16 FSID_UUID16_INUM,
17};
18
19enum fsid_source {
20 FSIDSOURCE_DEV,
21 FSIDSOURCE_FSID,
22 FSIDSOURCE_UUID,
23};
24extern enum fsid_source fsid_source(struct svc_fh *fhp);
25
26
27/* This might look a little large to "inline" but in all calls except
28 * one, 'vers' is constant so most of the function disappears.
29 */
30static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino,
31 u32 fsid, unsigned char *uuid)
32{
33 u32 *up;
34 switch(vers) {
35 case FSID_DEV:
36 fsidv[0] = htonl((MAJOR(dev)<<16) |
37 MINOR(dev));
38 fsidv[1] = ino_t_to_u32(ino);
39 break;
40 case FSID_NUM:
41 fsidv[0] = fsid;
42 break;
43 case FSID_MAJOR_MINOR:
44 fsidv[0] = htonl(MAJOR(dev));
45 fsidv[1] = htonl(MINOR(dev));
46 fsidv[2] = ino_t_to_u32(ino);
47 break;
48
49 case FSID_ENCODE_DEV:
50 fsidv[0] = new_encode_dev(dev);
51 fsidv[1] = ino_t_to_u32(ino);
52 break;
53
54 case FSID_UUID4_INUM:
55 /* 4 byte fsid and inode number */
56 up = (u32*)uuid;
57 fsidv[0] = ino_t_to_u32(ino);
58 fsidv[1] = up[0] ^ up[1] ^ up[2] ^ up[3];
59 break;
60
61 case FSID_UUID8:
62 /* 8 byte fsid */
63 up = (u32*)uuid;
64 fsidv[0] = up[0] ^ up[2];
65 fsidv[1] = up[1] ^ up[3];
66 break;
67
68 case FSID_UUID16:
69 /* 16 byte fsid - NFSv3+ only */
70 memcpy(fsidv, uuid, 16);
71 break;
72
73 case FSID_UUID16_INUM:
74 /* 8 byte inode and 16 byte fsid */
75 *(u64*)fsidv = (u64)ino;
76 memcpy(fsidv+2, uuid, 16);
77 break;
78 default: BUG();
79 }
80}
81
82static inline int key_len(int type)
83{
84 switch(type) {
85 case FSID_DEV: return 8;
86 case FSID_NUM: return 4;
87 case FSID_MAJOR_MINOR: return 12;
88 case FSID_ENCODE_DEV: return 8;
89 case FSID_UUID4_INUM: return 8;
90 case FSID_UUID8: return 8;
91 case FSID_UUID16: return 16;
92 case FSID_UUID16_INUM: return 24;
93 default: return 0;
94 }
95}
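/*
 * Editor's sketch of how mk_fsid() and key_len() pair up (hypothetical
 * helper; 'fh_auth' stands for the filehandle area being filled, and
 * uuid is omitted here, so only the non-UUID fsid types are valid):
 * build the fsid, then copy exactly the number of bytes that type
 * occupies.
 */
static inline void fh_fill_fsid(u32 *fh_auth, int fsid_type, dev_t dev, ino_t ino)
{
	u32 fsidv[6];	/* large enough for FSID_UUID16_INUM (24 bytes) */

	mk_fsid(fsid_type, fsidv, dev, ino, 0, NULL);
	memcpy(fh_auth, fsidv, key_len(fsid_type));
}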
96
97/*
98 * Shorthand for dprintk()'s
99 */
100extern char * SVCFH_fmt(struct svc_fh *fhp);
101
102/*
103 * Function prototypes
104 */
105__be32 fh_verify(struct svc_rqst *, struct svc_fh *, int, int);
106__be32 fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *);
107__be32 fh_update(struct svc_fh *);
108void fh_put(struct svc_fh *);
109
110static __inline__ struct svc_fh *
111fh_copy(struct svc_fh *dst, struct svc_fh *src)
112{
113 WARN_ON(src->fh_dentry || src->fh_locked);
114
115 *dst = *src;
116 return dst;
117}
118
119static inline void
120fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src)
121{
122 dst->fh_size = src->fh_size;
123 memcpy(&dst->fh_base, &src->fh_base, src->fh_size);
124}
125
126static __inline__ struct svc_fh *
127fh_init(struct svc_fh *fhp, int maxsize)
128{
129 memset(fhp, 0, sizeof(*fhp));
130 fhp->fh_maxsize = maxsize;
131 return fhp;
132}
133
134#ifdef CONFIG_NFSD_V3
135/*
136 * Fill in the pre_op attr for the wcc data
137 */
138static inline void
139fill_pre_wcc(struct svc_fh *fhp)
140{
141 struct inode *inode;
142
143 inode = fhp->fh_dentry->d_inode;
144 if (!fhp->fh_pre_saved) {
145 fhp->fh_pre_mtime = inode->i_mtime;
146 fhp->fh_pre_ctime = inode->i_ctime;
147 fhp->fh_pre_size = inode->i_size;
148 fhp->fh_pre_change = inode->i_version;
149 fhp->fh_pre_saved = 1;
150 }
151}
152
153extern void fill_post_wcc(struct svc_fh *);
154#else
155#define fill_pre_wcc(ignored)
156#define fill_post_wcc(notused)
157#endif /* CONFIG_NFSD_V3 */
158
159
160/*
161 * Lock a file handle/inode
162 * NOTE: both fh_lock and fh_unlock are done "by hand" in
163 * vfs.c:nfsd_rename, as it needs to grab two i_mutexes at once,
164 * so any changes here should be reflected there.
165 */
166
167static inline void
168fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
169{
170 struct dentry *dentry = fhp->fh_dentry;
171 struct inode *inode;
172
173 BUG_ON(!dentry);
174
175 if (fhp->fh_locked) {
176 printk(KERN_WARNING "fh_lock: %s/%s already locked!\n",
177 dentry->d_parent->d_name.name, dentry->d_name.name);
178 return;
179 }
180
181 inode = dentry->d_inode;
182 mutex_lock_nested(&inode->i_mutex, subclass);
183 fill_pre_wcc(fhp);
184 fhp->fh_locked = 1;
185}
186
187static inline void
188fh_lock(struct svc_fh *fhp)
189{
190 fh_lock_nested(fhp, I_MUTEX_NORMAL);
191}
192
193/*
194 * Unlock a file handle/inode
195 */
196static inline void
197fh_unlock(struct svc_fh *fhp)
198{
199 BUG_ON(!fhp->fh_dentry);
200
201 if (fhp->fh_locked) {
202 fill_post_wcc(fhp);
203 mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex);
204 fhp->fh_locked = 0;
205 }
206}
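/*
 * Editor's usage note: directory-modifying callers typically bracket the
 * operation like this, so that fh_lock() snapshots the pre-op attributes
 * and fh_unlock() fills the post-op attributes for NFSv3 WCC data:
 *
 *	fh_lock(fhp);
 *	... create/unlink/etc. under fhp->fh_dentry ...
 *	fh_unlock(fhp);
 */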
207
208#endif /* _LINUX_NFSD_FH_INT_H */
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 0eb9c820b7a6..a047ad6111ef 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -1,29 +1,14 @@
1/* 1/*
2 * nfsproc2.c Process version 2 NFS requests.
3 * linux/fs/nfsd/nfs2proc.c
4 *
5 * Process version 2 NFS requests. 2 * Process version 2 NFS requests.
6 * 3 *
7 * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> 4 * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
8 */ 5 */
9 6
10#include <linux/linkage.h>
11#include <linux/time.h>
12#include <linux/errno.h>
13#include <linux/fs.h>
14#include <linux/stat.h>
15#include <linux/fcntl.h>
16#include <linux/net.h>
17#include <linux/in.h>
18#include <linux/namei.h> 7#include <linux/namei.h>
19#include <linux/unistd.h>
20#include <linux/slab.h>
21 8
22#include <linux/sunrpc/clnt.h> 9#include "cache.h"
23#include <linux/sunrpc/svc.h> 10#include "xdr.h"
24#include <linux/nfsd/nfsd.h> 11#include "vfs.h"
25#include <linux/nfsd/cache.h>
26#include <linux/nfsd/xdr.h>
27 12
28typedef struct svc_rqst svc_rqst; 13typedef struct svc_rqst svc_rqst;
29typedef struct svc_buf svc_buf; 14typedef struct svc_buf svc_buf;
@@ -758,6 +743,7 @@ nfserrno (int errno)
758 { nfserr_io, -ETXTBSY }, 743 { nfserr_io, -ETXTBSY },
759 { nfserr_notsupp, -EOPNOTSUPP }, 744 { nfserr_notsupp, -EOPNOTSUPP },
760 { nfserr_toosmall, -ETOOSMALL }, 745 { nfserr_toosmall, -ETOOSMALL },
746 { nfserr_serverfault, -ESERVERFAULT },
761 }; 747 };
762 int i; 748 int i;
763 749
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 67ea83eedd43..171699eb07c8 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * linux/fs/nfsd/nfssvc.c
3 *
4 * Central processing for nfsd. 2 * Central processing for nfsd.
5 * 3 *
6 * Authors: Olaf Kirch (okir@monad.swb.de) 4 * Authors: Olaf Kirch (okir@monad.swb.de)
@@ -8,33 +6,19 @@
8 * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> 6 * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
9 */ 7 */
10 8
11#include <linux/module.h>
12#include <linux/sched.h> 9#include <linux/sched.h>
13#include <linux/time.h>
14#include <linux/errno.h>
15#include <linux/nfs.h>
16#include <linux/in.h>
17#include <linux/uio.h>
18#include <linux/unistd.h>
19#include <linux/slab.h>
20#include <linux/smp.h>
21#include <linux/freezer.h> 10#include <linux/freezer.h>
22#include <linux/fs_struct.h> 11#include <linux/fs_struct.h>
23#include <linux/kthread.h>
24#include <linux/swap.h> 12#include <linux/swap.h>
25 13
26#include <linux/sunrpc/types.h>
27#include <linux/sunrpc/stats.h> 14#include <linux/sunrpc/stats.h>
28#include <linux/sunrpc/svc.h>
29#include <linux/sunrpc/svcsock.h> 15#include <linux/sunrpc/svcsock.h>
30#include <linux/sunrpc/cache.h>
31#include <linux/nfsd/nfsd.h>
32#include <linux/nfsd/stats.h>
33#include <linux/nfsd/cache.h>
34#include <linux/nfsd/syscall.h>
35#include <linux/lockd/bind.h> 16#include <linux/lockd/bind.h>
36#include <linux/nfsacl.h> 17#include <linux/nfsacl.h>
37#include <linux/seq_file.h> 18#include <linux/seq_file.h>
19#include "nfsd.h"
20#include "cache.h"
21#include "vfs.h"
38 22
39#define NFSDDBG_FACILITY NFSDDBG_SVC 23#define NFSDDBG_FACILITY NFSDDBG_SVC
40 24
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index afd08e2c90a5..4ce005dbf3e6 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -1,20 +1,10 @@
1/* 1/*
2 * linux/fs/nfsd/nfsxdr.c
3 *
4 * XDR support for nfsd 2 * XDR support for nfsd
5 * 3 *
6 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 4 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
7 */ 5 */
8 6
9#include <linux/types.h> 7#include "xdr.h"
10#include <linux/time.h>
11#include <linux/nfs.h>
12#include <linux/vfs.h>
13#include <linux/sunrpc/xdr.h>
14#include <linux/sunrpc/svc.h>
15#include <linux/nfsd/nfsd.h>
16#include <linux/nfsd/xdr.h>
17#include <linux/mm.h>
18#include "auth.h" 8#include "auth.h"
19 9
20#define NFSDDBG_FACILITY NFSDDBG_XDR 10#define NFSDDBG_FACILITY NFSDDBG_XDR
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
new file mode 100644
index 000000000000..fefeae27f25e
--- /dev/null
+++ b/fs/nfsd/state.h
@@ -0,0 +1,408 @@
1/*
2 * Copyright (c) 2001 The Regents of the University of Michigan.
3 * All rights reserved.
4 *
5 * Kendrick Smith <kmsmith@umich.edu>
6 * Andy Adamson <andros@umich.edu>
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. Neither the name of the University nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
22 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
23 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
28 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 *
33 */
34
35#ifndef _NFSD4_STATE_H
36#define _NFSD4_STATE_H
37
38#include <linux/nfsd/nfsfh.h>
39#include "nfsfh.h"
40
41typedef struct {
42 u32 cl_boot;
43 u32 cl_id;
44} clientid_t;
45
46typedef struct {
47 u32 so_boot;
48 u32 so_stateownerid;
49 u32 so_fileid;
50} stateid_opaque_t;
51
52typedef struct {
53 u32 si_generation;
54 stateid_opaque_t si_opaque;
55} stateid_t;
56#define si_boot si_opaque.so_boot
57#define si_stateownerid si_opaque.so_stateownerid
58#define si_fileid si_opaque.so_fileid
59
60#define STATEID_FMT "(%08x/%08x/%08x/%08x)"
61#define STATEID_VAL(s) \
62 (s)->si_boot, \
63 (s)->si_stateownerid, \
64 (s)->si_fileid, \
65 (s)->si_generation
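/* Editor's usage note: the pair above is meant for debug output, e.g.
 *	dprintk("stateid = " STATEID_FMT "\n", STATEID_VAL(&stateid));
 */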
66
67struct nfsd4_cb_sequence {
68 /* args/res */
69 u32 cbs_minorversion;
70 struct nfs4_client *cbs_clp;
71};
72
73struct nfs4_delegation {
74 struct list_head dl_perfile;
75 struct list_head dl_perclnt;
76 struct list_head dl_recall_lru; /* delegation recalled */
77 atomic_t dl_count; /* ref count */
78 struct nfs4_client *dl_client;
79 struct nfs4_file *dl_file;
80 struct file_lock *dl_flock;
81 struct file *dl_vfs_file;
82 u32 dl_type;
83 time_t dl_time;
84/* For recall: */
85 u32 dl_ident;
86 stateid_t dl_stateid;
87 struct knfsd_fh dl_fh;
88 int dl_retries;
89};
90
91/* client delegation callback info */
92struct nfs4_cb_conn {
93 /* SETCLIENTID info */
94 struct sockaddr_storage cb_addr;
95 size_t cb_addrlen;
96 u32 cb_prog;
97 u32 cb_minorversion;
98 u32 cb_ident; /* minorversion 0 only */
99 /* RPC client info */
100 atomic_t cb_set; /* successful CB_NULL call */
101 struct rpc_clnt * cb_client;
102};
103
104/* Maximum number of slots per session. 160 is useful for long haul TCP */
105#define NFSD_MAX_SLOTS_PER_SESSION 160
106/* Maximum number of operations per session compound */
107#define NFSD_MAX_OPS_PER_COMPOUND 16
108/* Maximum cached-reply size, in bytes, per session slot */
109#define NFSD_SLOT_CACHE_SIZE 1024
110/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
111#define NFSD_CACHE_SIZE_SLOTS_PER_SESSION 32
112#define NFSD_MAX_MEM_PER_SESSION \
113 (NFSD_CACHE_SIZE_SLOTS_PER_SESSION * NFSD_SLOT_CACHE_SIZE)
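/* Editor's note: with the defaults above this works out to
 * 32 * 1024 = 32768 bytes (32 KiB) of cached reply data per session.
 */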
114
115struct nfsd4_slot {
116 bool sl_inuse;
117 bool sl_cachethis;
118 u16 sl_opcnt;
119 u32 sl_seqid;
120 __be32 sl_status;
121 u32 sl_datalen;
122 char sl_data[];
123};
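/*
 * Editor's sketch of the replay decision these fields support (logic
 * simplified; the real check must also handle misordered and in-use
 * slots): a retransmitted request reuses its slot's sequence id, so a
 * matching seqid means the reply cached in sl_data can be returned
 * instead of re-executing the compound, provided sl_cachethis was set;
 * otherwise the server answers with nfserr_retry_uncached_rep.
 */
static inline int nfsd4_slot_is_replay(struct nfsd4_slot *slot, u32 seqid)
{
	return seqid == slot->sl_seqid;
}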
124
125struct nfsd4_channel_attrs {
126 u32 headerpadsz;
127 u32 maxreq_sz;
128 u32 maxresp_sz;
129 u32 maxresp_cached;
130 u32 maxops;
131 u32 maxreqs;
132 u32 nr_rdma_attrs;
133 u32 rdma_attrs;
134};
135
136struct nfsd4_create_session {
137 clientid_t clientid;
138 struct nfs4_sessionid sessionid;
139 u32 seqid;
140 u32 flags;
141 struct nfsd4_channel_attrs fore_channel;
142 struct nfsd4_channel_attrs back_channel;
143 u32 callback_prog;
144 u32 uid;
145 u32 gid;
146};
147
148/* The single slot clientid cache structure */
149struct nfsd4_clid_slot {
150 u32 sl_seqid;
151 __be32 sl_status;
152 struct nfsd4_create_session sl_cr_ses;
153};
154
155struct nfsd4_session {
156 struct kref se_ref;
157 struct list_head se_hash; /* hash by sessionid */
158 struct list_head se_perclnt;
159 u32 se_flags;
160 struct nfs4_client *se_client; /* for expire_client */
161 struct nfs4_sessionid se_sessionid;
162 struct nfsd4_channel_attrs se_fchannel;
163 struct nfsd4_channel_attrs se_bchannel;
164 struct nfsd4_slot *se_slots[]; /* forward channel slots */
165};
166
167static inline void
168nfsd4_put_session(struct nfsd4_session *ses)
169{
170 extern void free_session(struct kref *kref);
171 kref_put(&ses->se_ref, free_session);
172}
173
174static inline void
175nfsd4_get_session(struct nfsd4_session *ses)
176{
177 kref_get(&ses->se_ref);
178}
179
180/* formatted contents of nfs4_sessionid */
181struct nfsd4_sessionid {
182 clientid_t clientid;
183 u32 sequence;
184 u32 reserved;
185};
186
187#define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */
188
189/*
190 * struct nfs4_client - one per client. Clientids live here.
191 * o Each nfs4_client is hashed by clientid.
192 *
193 * o Each nfs4_client is also hashed by name
194 * (the opaque quantity initially sent by the client to identify itself).
195 *
196 * o cl_perclient list is used to ensure no dangling stateowner references
197 * when we expire the nfs4_client
198 */
199struct nfs4_client {
200 struct list_head cl_idhash; /* hash by cl_clientid.id */
201 struct list_head cl_strhash; /* hash by cl_name */
202 struct list_head cl_openowners;
203 struct list_head cl_delegations;
204 struct list_head cl_lru; /* tail queue */
205 struct xdr_netobj cl_name; /* id generated by client */
206 char cl_recdir[HEXDIR_LEN]; /* recovery dir */
207 nfs4_verifier cl_verifier; /* generated by client */
208 time_t cl_time; /* time of last lease renewal */
209 struct sockaddr_storage cl_addr; /* client ipaddress */
210 u32 cl_flavor; /* setclientid pseudoflavor */
211 char *cl_principal; /* setclientid principal name */
212 struct svc_cred cl_cred; /* setclientid principal */
213 clientid_t cl_clientid; /* generated by server */
214 nfs4_verifier cl_confirm; /* generated by server */
215 struct nfs4_cb_conn cl_cb_conn; /* callback info */
216 atomic_t cl_count; /* ref count */
217 u32 cl_firststate; /* recovery dir creation */
218
219 /* for nfs41 */
220 struct list_head cl_sessions;
221 struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */
222 u32 cl_exchange_flags;
223 struct nfs4_sessionid cl_sessionid;
224
225 /* for nfs41 callbacks */
226 /* We currently support a single back channel with a single slot */
227 unsigned long cl_cb_slot_busy;
228 u32 cl_cb_seq_nr;
229 struct svc_xprt *cl_cb_xprt; /* 4.1 callback transport */
230 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
231 /* wait here for slots */
232};
233
234/* struct nfs4_client_reclaim
235 * one per old client. Populates reclaim_str_hashtbl. Filled from conf_id_hashtbl
236 * upon lease reset, or from upcall to state_daemon (to read in state
237 * from non-volatile storage) upon reboot.
238 */
239struct nfs4_client_reclaim {
240 struct list_head cr_strhash; /* hash by cr_name */
241 char cr_recdir[HEXDIR_LEN]; /* recover dir */
242};
243
244static inline void
245update_stateid(stateid_t *stateid)
246{
247 stateid->si_generation++;
248}
249
250/* A reasonable value for REPLAY_ISIZE was estimated as follows:
251 * The OPEN response, typically the largest, requires
252 * 4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) + 8(verifier) +
253 * 4(deleg. type) + 8(deleg. stateid) + 4(deleg. recall flag) +
254 * 20(deleg. space limit) + ~32(deleg. ace) = 112 bytes
255 */
256
257#define NFSD4_REPLAY_ISIZE 112
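/* Editor's check of the estimate above:
 * 4 + 8 + 20 + 4 + 8 + 4 + 8 + 4 + 20 + 32 = 112, matching the value
 * chosen here and the size of rp_ibuf below.
 */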
258
259/*
260 * Replay buffer, where the result of the last seqid-mutating operation
261 * is cached.
262 */
263struct nfs4_replay {
264 __be32 rp_status;
265 unsigned int rp_buflen;
266 char *rp_buf;
267 unsigned int rp_allocated;
268 struct knfsd_fh rp_openfh;
269 char rp_ibuf[NFSD4_REPLAY_ISIZE];
270};
271
272/*
273* nfs4_stateowner can either be an open_owner, or a lock_owner
274*
275* so_idhash: stateid_hashtbl[] for open owner, lockstateid_hashtbl[]
276* for lock_owner
277* so_strhash: ownerstr_hashtbl[] for open_owner, lock_ownerstr_hashtbl[]
278* for lock_owner
279* so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client
280* struct is reaped.
281* so_perfilestate: heads the list of nfs4_stateid (either open or lock)
282* and is used to ensure no dangling nfs4_stateid references when we
283* release a stateowner.
284* so_perlockowner: (open) nfs4_stateid->st_perlockowner entry - used when
285* close is called to reap associated byte-range locks
286* so_close_lru: (open) stateowner is placed on this list instead of being
287* reaped (when so_perfilestate is empty) to hold the last close replay.
288* reaped by the laundromat thread after the lease period.
289*/
290struct nfs4_stateowner {
291 struct kref so_ref;
292 struct list_head so_idhash; /* hash by so_id */
293 struct list_head so_strhash; /* hash by op_name */
294 struct list_head so_perclient;
295 struct list_head so_stateids;
296 struct list_head so_perstateid; /* for lockowners only */
297 struct list_head so_close_lru; /* tail queue */
298 time_t so_time; /* time of placement on so_close_lru */
299 int so_is_open_owner; /* 1=openowner,0=lockowner */
300 u32 so_id;
301 struct nfs4_client * so_client;
302 /* after increment in ENCODE_SEQID_OP_TAIL, represents the next
303 * sequence id expected from the client: */
304 u32 so_seqid;
305 struct xdr_netobj so_owner; /* open owner name */
306 int so_confirmed; /* successful OPEN_CONFIRM? */
307 struct nfs4_replay so_replay;
308};
309
310/*
311* nfs4_file: a file opened by some number of (open) nfs4_stateowners.
312* o fi_perfile list is used to search for conflicting
313* share_access, share_deny on the file.
314*/
315struct nfs4_file {
316 atomic_t fi_ref;
317 struct list_head fi_hash; /* hash by "struct inode *" */
318 struct list_head fi_stateids;
319 struct list_head fi_delegations;
320 struct inode *fi_inode;
321 u32 fi_id; /* used with stateowner->so_id
322 * for stateid_hashtbl hash */
323 bool fi_had_conflict;
324};
325
326/*
327* nfs4_stateid can either be an open stateid or (eventually) a lock stateid
328*
329* (open)nfs4_stateid: one per (open)nfs4_stateowner, nfs4_file
330*
331* st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry
332* st_perfile: file_hashtbl[] entry.
333* st_perfile_state: nfs4_stateowner->so_perfilestate
334* st_perlockowner: (open stateid) list of lock nfs4_stateowners
335* st_access_bmap: used only for open stateid
336* st_deny_bmap: used only for open stateid
337* st_openstp: open stateid lock stateid was derived from
338*
339* XXX: open stateids and lock stateids have diverged sufficiently that
340* we should consider defining separate structs for the two cases.
341*/
342
343struct nfs4_stateid {
344 struct list_head st_hash;
345 struct list_head st_perfile;
346 struct list_head st_perstateowner;
347 struct list_head st_lockowners;
348 struct nfs4_stateowner * st_stateowner;
349 struct nfs4_file * st_file;
350 stateid_t st_stateid;
351 struct file * st_vfs_file;
352 unsigned long st_access_bmap;
353 unsigned long st_deny_bmap;
354 struct nfs4_stateid * st_openstp;
355};
356
357/* flags for preprocess_seqid_op() */
358#define HAS_SESSION 0x00000001
359#define CONFIRM 0x00000002
360#define OPEN_STATE 0x00000004
361#define LOCK_STATE 0x00000008
362#define RD_STATE 0x00000010
363#define WR_STATE 0x00000020
364#define CLOSE_STATE 0x00000040
365
366#define seqid_mutating_err(err) \
367 (((err) != nfserr_stale_clientid) && \
368 ((err) != nfserr_bad_seqid) && \
369 ((err) != nfserr_stale_stateid) && \
370 ((err) != nfserr_bad_stateid))
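/*
 * Editor's sketch of how the predicate is used (hypothetical helper;
 * compare the ENCODE_SEQID_OP_TAIL comment in struct nfs4_stateowner
 * above): most replies advance the owner's expected sequence id and are
 * cached for replay, but the four errors excluded above leave so_seqid
 * untouched.
 */
static inline void bump_seqid_on(struct nfs4_stateowner *sop, __be32 err)
{
	if (seqid_mutating_err(err))
		sop->so_seqid++;
}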
371
372struct nfsd4_compound_state;
373
374extern __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
375 stateid_t *stateid, int flags, struct file **filp);
376extern void nfs4_lock_state(void);
377extern void nfs4_unlock_state(void);
378extern int nfs4_in_grace(void);
379extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
380extern void put_nfs4_client(struct nfs4_client *clp);
381extern void nfs4_free_stateowner(struct kref *kref);
382extern int set_callback_cred(void);
383extern void nfsd4_probe_callback(struct nfs4_client *clp);
384extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
385extern void nfs4_put_delegation(struct nfs4_delegation *dp);
386extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
387extern void nfsd4_init_recdir(char *recdir_name);
388extern int nfsd4_recdir_load(void);
389extern void nfsd4_shutdown_recdir(void);
390extern int nfs4_client_to_reclaim(const char *name);
391extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id);
392extern void nfsd4_recdir_purge_old(void);
393extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
394extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
395
396static inline void
397nfs4_put_stateowner(struct nfs4_stateowner *so)
398{
399 kref_put(&so->so_ref, nfs4_free_stateowner);
400}
401
402static inline void
403nfs4_get_stateowner(struct nfs4_stateowner *so)
404{
405 kref_get(&so->so_ref);
406}
407
408#endif /* NFSD4_STATE_H */
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 71944cddf680..5232d3e8fb2f 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * linux/fs/nfsd/stats.c
3 *
4 * procfs-based user access to knfsd statistics 2 * procfs-based user access to knfsd statistics
5 * 3 *
6 * /proc/net/rpc/nfsd 4 * /proc/net/rpc/nfsd
@@ -23,18 +21,13 @@
23 * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> 21 * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
24 */ 22 */
25 23
26#include <linux/kernel.h>
27#include <linux/time.h>
28#include <linux/proc_fs.h>
29#include <linux/seq_file.h> 24#include <linux/seq_file.h>
30#include <linux/stat.h>
31#include <linux/module.h> 25#include <linux/module.h>
32
33#include <linux/sunrpc/svc.h>
34#include <linux/sunrpc/stats.h> 26#include <linux/sunrpc/stats.h>
35#include <linux/nfsd/nfsd.h>
36#include <linux/nfsd/stats.h> 27#include <linux/nfsd/stats.h>
37 28
29#include "nfsd.h"
30
38struct nfsd_stats nfsdstats; 31struct nfsd_stats nfsdstats;
39struct svc_stat nfsd_svcstats = { 32struct svc_stat nfsd_svcstats = {
40 .program = &nfsd_program, 33 .program = &nfsd_program,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a293f0273263..6dd5f1970e01 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1,7 +1,5 @@
1#define MSNFS /* HACK HACK */ 1#define MSNFS /* HACK HACK */
2/* 2/*
3 * linux/fs/nfsd/vfs.c
4 *
5 * File operations used by nfsd. Some of these have been ripped from 3 * File operations used by nfsd. Some of these have been ripped from
6 * other parts of the kernel because they weren't exported, others 4 * other parts of the kernel because they weren't exported, others
7 * are partial duplicates with added or changed functionality. 5 * are partial duplicates with added or changed functionality.
@@ -16,48 +14,33 @@
16 * Zerocopy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp> 14 * Zerocopy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp>
17 */ 15 */
18 16
19#include <linux/string.h>
20#include <linux/time.h>
21#include <linux/errno.h>
22#include <linux/fs.h> 17#include <linux/fs.h>
23#include <linux/file.h> 18#include <linux/file.h>
24#include <linux/mount.h>
25#include <linux/major.h>
26#include <linux/splice.h> 19#include <linux/splice.h>
27#include <linux/proc_fs.h>
28#include <linux/stat.h>
29#include <linux/fcntl.h> 20#include <linux/fcntl.h>
30#include <linux/net.h>
31#include <linux/unistd.h>
32#include <linux/slab.h>
33#include <linux/pagemap.h>
34#include <linux/in.h>
35#include <linux/module.h>
36#include <linux/namei.h> 21#include <linux/namei.h>
37#include <linux/vfs.h>
38#include <linux/delay.h> 22#include <linux/delay.h>
39#include <linux/sunrpc/svc.h>
40#include <linux/nfsd/nfsd.h>
41#ifdef CONFIG_NFSD_V3
42#include <linux/nfs3.h>
43#include <linux/nfsd/xdr3.h>
44#endif /* CONFIG_NFSD_V3 */
45#include <linux/nfsd/nfsfh.h>
46#include <linux/quotaops.h>
47#include <linux/fsnotify.h> 23#include <linux/fsnotify.h>
48#include <linux/posix_acl.h>
49#include <linux/posix_acl_xattr.h> 24#include <linux/posix_acl_xattr.h>
50#include <linux/xattr.h> 25#include <linux/xattr.h>
26#include <linux/jhash.h>
27#include <linux/ima.h>
28#include <linux/slab.h>
29#include <asm/uaccess.h>
30#include <linux/exportfs.h>
31#include <linux/writeback.h>
32
33#ifdef CONFIG_NFSD_V3
34#include "xdr3.h"
35#endif /* CONFIG_NFSD_V3 */
36
51#ifdef CONFIG_NFSD_V4 37#ifdef CONFIG_NFSD_V4
52#include <linux/nfs4.h>
53#include <linux/nfs4_acl.h> 38#include <linux/nfs4_acl.h>
54#include <linux/nfsd_idmap.h> 39#include <linux/nfsd_idmap.h>
55#include <linux/security.h>
56#endif /* CONFIG_NFSD_V4 */ 40#endif /* CONFIG_NFSD_V4 */
57#include <linux/jhash.h>
58#include <linux/ima.h>
59 41
60#include <asm/uaccess.h> 42#include "nfsd.h"
43#include "vfs.h"
61 44
62#define NFSDDBG_FACILITY NFSDDBG_FILEOP 45#define NFSDDBG_FACILITY NFSDDBG_FILEOP
63 46
@@ -89,12 +72,6 @@ struct raparm_hbucket {
89#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) 72#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1)
90static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE]; 73static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE];
91 74
92static inline int
93nfsd_v4client(struct svc_rqst *rq)
94{
95 return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4;
96}
97
98/* 75/*
99 * Called from nfsd_lookup and encode_dirent. Check if we have crossed 76 * Called from nfsd_lookup and encode_dirent. Check if we have crossed
100 * a mount point. 77 * a mount point.
@@ -116,8 +93,16 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
116 93
117 exp2 = rqst_exp_get_by_name(rqstp, &path); 94 exp2 = rqst_exp_get_by_name(rqstp, &path);
118 if (IS_ERR(exp2)) { 95 if (IS_ERR(exp2)) {
119 if (PTR_ERR(exp2) != -ENOENT) 96 err = PTR_ERR(exp2);
120 err = PTR_ERR(exp2); 97 /*
98 * We normally allow NFS clients to continue
99 * "underneath" a mountpoint that is not exported.
100 * The exception is V4ROOT, where no traversal is ever
101 * allowed without an explicit export of the new
102 * directory.
103 */
104 if (err == -ENOENT && !(exp->ex_flags & NFSEXP_V4ROOT))
105 err = 0;
121 path_put(&path); 106 path_put(&path);
122 goto out; 107 goto out;
123 } 108 }
@@ -141,6 +126,53 @@ out:
141 return err; 126 return err;
142} 127}
143 128
129static void follow_to_parent(struct path *path)
130{
131 struct dentry *dp;
132
133 while (path->dentry == path->mnt->mnt_root && follow_up(path))
134 ;
135 dp = dget_parent(path->dentry);
136 dput(path->dentry);
137 path->dentry = dp;
138}
139
140static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, struct svc_export **exp, struct dentry **dentryp)
141{
142 struct svc_export *exp2;
143 struct path path = {.mnt = mntget((*exp)->ex_path.mnt),
144 .dentry = dget(dparent)};
145
146 follow_to_parent(&path);
147
148 exp2 = rqst_exp_parent(rqstp, &path);
149 if (PTR_ERR(exp2) == -ENOENT) {
150 *dentryp = dget(dparent);
151 } else if (IS_ERR(exp2)) {
152 path_put(&path);
153 return PTR_ERR(exp2);
154 } else {
155 *dentryp = dget(path.dentry);
156 exp_put(*exp);
157 *exp = exp2;
158 }
159 path_put(&path);
160 return 0;
161}
162
163/*
164 * For nfsd purposes, we treat V4ROOT exports as though there was an
165 * export at *every* directory.
166 */
167int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
168{
169 if (d_mountpoint(dentry))
170 return 1;
171 if (!(exp->ex_flags & NFSEXP_V4ROOT))
172 return 0;
173 return dentry->d_inode != NULL;
174}
175
144__be32 176__be32
145nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, 177nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
146 const char *name, unsigned int len, 178 const char *name, unsigned int len,
@@ -169,35 +201,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
169 dentry = dget(dparent); 201 dentry = dget(dparent);
170 else if (dparent != exp->ex_path.dentry) 202 else if (dparent != exp->ex_path.dentry)
171 dentry = dget_parent(dparent); 203 dentry = dget_parent(dparent);
172 else if (!EX_NOHIDE(exp)) 204 else if (!EX_NOHIDE(exp) && !nfsd_v4client(rqstp))
173 dentry = dget(dparent); /* .. == . just like at / */ 205 dentry = dget(dparent); /* .. == . just like at / */
174 else { 206 else {
175 /* checking mountpoint crossing is very different when stepping up */ 207 /* checking mountpoint crossing is very different when stepping up */
176 struct svc_export *exp2 = NULL; 208 host_err = nfsd_lookup_parent(rqstp, dparent, &exp, &dentry);
177 struct dentry *dp; 209 if (host_err)
178 struct path path = {.mnt = mntget(exp->ex_path.mnt),
179 .dentry = dget(dparent)};
180
181 while (path.dentry == path.mnt->mnt_root &&
182 follow_up(&path))
183 ;
184 dp = dget_parent(path.dentry);
185 dput(path.dentry);
186 path.dentry = dp;
187
188 exp2 = rqst_exp_parent(rqstp, &path);
189 if (PTR_ERR(exp2) == -ENOENT) {
190 dentry = dget(dparent);
191 } else if (IS_ERR(exp2)) {
192 host_err = PTR_ERR(exp2);
193 path_put(&path);
194 goto out_nfserr; 210 goto out_nfserr;
195 } else {
196 dentry = dget(path.dentry);
197 exp_put(exp);
198 exp = exp2;
199 }
200 path_put(&path);
201 } 211 }
202 } else { 212 } else {
203 fh_lock(fhp); 213 fh_lock(fhp);
@@ -208,7 +218,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
208 /* 218 /*
209 * check if we have crossed a mount point ... 219 * check if we have crossed a mount point ...
210 */ 220 */
211 if (d_mountpoint(dentry)) { 221 if (nfsd_mountpoint(dentry, exp)) {
212 if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { 222 if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
213 dput(dentry); 223 dput(dentry);
214 goto out_nfserr; 224 goto out_nfserr;
@@ -263,6 +273,32 @@ out:
263 return err; 273 return err;
264} 274}
265 275
276/*
277 * Commit metadata changes to stable storage.
278 */
279static int
280commit_metadata(struct svc_fh *fhp)
281{
282 struct inode *inode = fhp->fh_dentry->d_inode;
283 const struct export_operations *export_ops = inode->i_sb->s_export_op;
284 int error = 0;
285
286 if (!EX_ISSYNC(fhp->fh_export))
287 return 0;
288
289 if (export_ops->commit_metadata) {
290 error = export_ops->commit_metadata(inode);
291 } else {
292 struct writeback_control wbc = {
293 .sync_mode = WB_SYNC_ALL,
294 .nr_to_write = 0, /* metadata only */
295 };
296
297 error = sync_inode(inode, &wbc);
298 }
299
300 return error;
301}
266 302
267/* 303/*
268 * Set various file attributes. 304 * Set various file attributes.
@@ -353,7 +389,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
353 * If we are changing the size of the file, then 389 * If we are changing the size of the file, then
354 * we need to break all leases. 390 * we need to break all leases.
355 */ 391 */
356 host_err = break_lease(inode, FMODE_WRITE | O_NONBLOCK); 392 host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
357 if (host_err == -EWOULDBLOCK) 393 if (host_err == -EWOULDBLOCK)
358 host_err = -ETIMEDOUT; 394 host_err = -ETIMEDOUT;
359 if (host_err) /* ENOMEM or EWOULDBLOCK */ 395 if (host_err) /* ENOMEM or EWOULDBLOCK */
@@ -369,7 +405,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
369 put_write_access(inode); 405 put_write_access(inode);
370 goto out_nfserr; 406 goto out_nfserr;
371 } 407 }
372 vfs_dq_init(inode);
373 } 408 }
374 409
375 /* sanitize the mode change */ 410 /* sanitize the mode change */
@@ -726,7 +761,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
726 * Check to see if there are any leases on this file. 761 * Check to see if there are any leases on this file.
727 * This may block while leases are broken. 762 * This may block while leases are broken.
728 */ 763 */
729 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? FMODE_WRITE : 0)); 764 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
730 if (host_err == -EWOULDBLOCK) 765 if (host_err == -EWOULDBLOCK)
731 host_err = -ETIMEDOUT; 766 host_err = -ETIMEDOUT;
732 if (host_err) /* NOMEM or WOULDBLOCK */ 767 if (host_err) /* NOMEM or WOULDBLOCK */
@@ -737,15 +772,13 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
737 flags = O_RDWR|O_LARGEFILE; 772 flags = O_RDWR|O_LARGEFILE;
738 else 773 else
739 flags = O_WRONLY|O_LARGEFILE; 774 flags = O_WRONLY|O_LARGEFILE;
740
741 vfs_dq_init(inode);
742 } 775 }
743 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt), 776 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt),
744 flags, current_cred()); 777 flags, current_cred());
745 if (IS_ERR(*filp)) 778 if (IS_ERR(*filp))
746 host_err = PTR_ERR(*filp); 779 host_err = PTR_ERR(*filp);
747 else 780 else
748 ima_counts_get(*filp); 781 host_err = ima_file_check(*filp, access);
749out_nfserr: 782out_nfserr:
750 err = nfserrno(host_err); 783 err = nfserrno(host_err);
751out: 784out:
@@ -763,46 +796,6 @@ nfsd_close(struct file *filp)
763} 796}
764 797
765/* 798/*
766 * Sync a file
767 * As this calls fsync (not fdatasync) there is no need for a write_inode
768 * after it.
769 */
770static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
771 const struct file_operations *fop)
772{
773 struct inode *inode = dp->d_inode;
774 int (*fsync) (struct file *, struct dentry *, int);
775 int err;
776
777 err = filemap_fdatawrite(inode->i_mapping);
778 if (err == 0 && fop && (fsync = fop->fsync))
779 err = fsync(filp, dp, 0);
780 if (err == 0)
781 err = filemap_fdatawait(inode->i_mapping);
782
783 return err;
784}
785
786static int
787nfsd_sync(struct file *filp)
788{
789 int err;
790 struct inode *inode = filp->f_path.dentry->d_inode;
791 dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name);
792 mutex_lock(&inode->i_mutex);
793 err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op);
794 mutex_unlock(&inode->i_mutex);
795
796 return err;
797}
798
799int
800nfsd_sync_dir(struct dentry *dp)
801{
802 return nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
803}
804
805/*
806 * Obtain the readahead parameters for the file 799 * Obtain the readahead parameters for the file
807 * specified by (dev, ino). 800 * specified by (dev, ino).
808 */ 801 */
@@ -1005,7 +998,7 @@ static int wait_for_concurrent_writes(struct file *file)
1005 998
1006 if (inode->i_state & I_DIRTY) { 999 if (inode->i_state & I_DIRTY) {
1007 dprintk("nfsd: write sync %d\n", task_pid_nr(current)); 1000 dprintk("nfsd: write sync %d\n", task_pid_nr(current));
1008 err = nfsd_sync(file); 1001 err = vfs_fsync(file, file->f_path.dentry, 0);
1009 } 1002 }
1010 last_ino = inode->i_ino; 1003 last_ino = inode->i_ino;
1011 last_dev = inode->i_sb->s_dev; 1004 last_dev = inode->i_sb->s_dev;
@@ -1153,8 +1146,9 @@ out:
1153#ifdef CONFIG_NFSD_V3 1146#ifdef CONFIG_NFSD_V3
1154/* 1147/*
1155 * Commit all pending writes to stable storage. 1148 * Commit all pending writes to stable storage.
1156 * Strictly speaking, we could sync just the indicated file region here, 1149 *
1157 * but there's currently no way we can ask the VFS to do so. 1150 * Note: we only guarantee that data that lies within the range specified
1151 * by the 'offset' and 'count' parameters will be synced.
1158 * 1152 *
1159 * Unfortunately we cannot lock the file to make sure we return full WCC 1153 * Unfortunately we cannot lock the file to make sure we return full WCC
1160 * data to the client, as locking happens lower down in the filesystem. 1154 * data to the client, as locking happens lower down in the filesystem.
@@ -1164,23 +1158,32 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
1164 loff_t offset, unsigned long count) 1158 loff_t offset, unsigned long count)
1165{ 1159{
1166 struct file *file; 1160 struct file *file;
1167 __be32 err; 1161 loff_t end = LLONG_MAX;
1162 __be32 err = nfserr_inval;
1168 1163
1169 if ((u64)count > ~(u64)offset) 1164 if (offset < 0)
1170 return nfserr_inval; 1165 goto out;
1166 if (count != 0) {
1167 end = offset + (loff_t)count - 1;
1168 if (end < offset)
1169 goto out;
1170 }
1171 1171
1172 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); 1172 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
1173 if (err) 1173 if (err)
1174 return err; 1174 goto out;
1175 if (EX_ISSYNC(fhp->fh_export)) { 1175 if (EX_ISSYNC(fhp->fh_export)) {
1176 if (file->f_op && file->f_op->fsync) { 1176 int err2 = vfs_fsync_range(file, file->f_path.dentry,
1177 err = nfserrno(nfsd_sync(file)); 1177 offset, end, 0);
1178 } else { 1178
1179 if (err2 != -EINVAL)
1180 err = nfserrno(err2);
1181 else
1179 err = nfserr_notsupp; 1182 err = nfserr_notsupp;
1180 }
1181 } 1183 }
1182 1184
1183 nfsd_close(file); 1185 nfsd_close(file);
1186out:
1184 return err; 1187 return err;
1185} 1188}
1186#endif /* CONFIG_NFSD_V3 */ 1189#endif /* CONFIG_NFSD_V3 */
@@ -1333,12 +1336,14 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1333 goto out_nfserr; 1336 goto out_nfserr;
1334 } 1337 }
1335 1338
1336 if (EX_ISSYNC(fhp->fh_export)) { 1339 err = nfsd_create_setattr(rqstp, resfhp, iap);
1337 err = nfserrno(nfsd_sync_dir(dentry));
1338 write_inode_now(dchild->d_inode, 1);
1339 }
1340 1340
1341 err2 = nfsd_create_setattr(rqstp, resfhp, iap); 1341 /*
1342 * nfsd_setattr already committed the child. Transactional filesystems
1343 * had a chance to commit changes for both parent and child
1344 * simultaneously, making the following commit_metadata a noop.
1345 */
1346 err2 = nfserrno(commit_metadata(fhp));
1342 if (err2) 1347 if (err2)
1343 err = err2; 1348 err = err2;
1344 mnt_drop_write(fhp->fh_export->ex_path.mnt); 1349 mnt_drop_write(fhp->fh_export->ex_path.mnt);
@@ -1370,7 +1375,6 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1370 struct dentry *dentry, *dchild = NULL; 1375 struct dentry *dentry, *dchild = NULL;
1371 struct inode *dirp; 1376 struct inode *dirp;
1372 __be32 err; 1377 __be32 err;
1373 __be32 err2;
1374 int host_err; 1378 int host_err;
1375 __u32 v_mtime=0, v_atime=0; 1379 __u32 v_mtime=0, v_atime=0;
1376 1380
@@ -1465,11 +1469,6 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1465 if (created) 1469 if (created)
1466 *created = 1; 1470 *created = 1;
1467 1471
1468 if (EX_ISSYNC(fhp->fh_export)) {
1469 err = nfserrno(nfsd_sync_dir(dentry));
1470 /* setattr will sync the child (or not) */
1471 }
1472
1473 nfsd_check_ignore_resizing(iap); 1472 nfsd_check_ignore_resizing(iap);
1474 1473
1475 if (createmode == NFS3_CREATE_EXCLUSIVE) { 1474 if (createmode == NFS3_CREATE_EXCLUSIVE) {
@@ -1484,9 +1483,13 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1484 } 1483 }
1485 1484
1486 set_attr: 1485 set_attr:
1487 err2 = nfsd_create_setattr(rqstp, resfhp, iap); 1486 err = nfsd_create_setattr(rqstp, resfhp, iap);
1488 if (err2) 1487
1489 err = err2; 1488 /*
1489 * nfsd_setattr already committed the child (and possibly also the parent).
1490 */
1491 if (!err)
1492 err = nfserrno(commit_metadata(fhp));
1490 1493
1491 mnt_drop_write(fhp->fh_export->ex_path.mnt); 1494 mnt_drop_write(fhp->fh_export->ex_path.mnt);
1492 /* 1495 /*
@@ -1601,12 +1604,9 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1601 } 1604 }
1602 } else 1605 } else
1603 host_err = vfs_symlink(dentry->d_inode, dnew, path); 1606 host_err = vfs_symlink(dentry->d_inode, dnew, path);
1604
1605 if (!host_err) {
1606 if (EX_ISSYNC(fhp->fh_export))
1607 host_err = nfsd_sync_dir(dentry);
1608 }
1609 err = nfserrno(host_err); 1607 err = nfserrno(host_err);
1608 if (!err)
1609 err = nfserrno(commit_metadata(fhp));
1610 fh_unlock(fhp); 1610 fh_unlock(fhp);
1611 1611
1612 mnt_drop_write(fhp->fh_export->ex_path.mnt); 1612 mnt_drop_write(fhp->fh_export->ex_path.mnt);
@@ -1668,11 +1668,9 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1668 } 1668 }
1669 host_err = vfs_link(dold, dirp, dnew); 1669 host_err = vfs_link(dold, dirp, dnew);
1670 if (!host_err) { 1670 if (!host_err) {
1671 if (EX_ISSYNC(ffhp->fh_export)) { 1671 err = nfserrno(commit_metadata(ffhp));
1672 err = nfserrno(nfsd_sync_dir(ddir)); 1672 if (!err)
1673 write_inode_now(dest, 1); 1673 err = nfserrno(commit_metadata(tfhp));
1674 }
1675 err = 0;
1676 } else { 1674 } else {
1677 if (host_err == -EXDEV && rqstp->rq_vers == 2) 1675 if (host_err == -EXDEV && rqstp->rq_vers == 2)
1678 err = nfserr_acces; 1676 err = nfserr_acces;
@@ -1768,10 +1766,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1768 goto out_dput_new; 1766 goto out_dput_new;
1769 1767
1770 host_err = vfs_rename(fdir, odentry, tdir, ndentry); 1768 host_err = vfs_rename(fdir, odentry, tdir, ndentry);
1771 if (!host_err && EX_ISSYNC(tfhp->fh_export)) { 1769 if (!host_err) {
1772 host_err = nfsd_sync_dir(tdentry); 1770 host_err = commit_metadata(tfhp);
1773 if (!host_err) 1771 if (!host_err)
1774 host_err = nfsd_sync_dir(fdentry); 1772 host_err = commit_metadata(ffhp);
1775 } 1773 }
1776 1774
1777 mnt_drop_write(ffhp->fh_export->ex_path.mnt); 1775 mnt_drop_write(ffhp->fh_export->ex_path.mnt);
@@ -1852,12 +1850,9 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1852 1850
1853 dput(rdentry); 1851 dput(rdentry);
1854 1852
1855 if (host_err) 1853 if (!host_err)
1856 goto out_drop; 1854 host_err = commit_metadata(fhp);
1857 if (EX_ISSYNC(fhp->fh_export))
1858 host_err = nfsd_sync_dir(dentry);
1859 1855
1860out_drop:
1861 mnt_drop_write(fhp->fh_export->ex_path.mnt); 1856 mnt_drop_write(fhp->fh_export->ex_path.mnt);
1862out_nfserr: 1857out_nfserr:
1863 err = nfserrno(host_err); 1858 err = nfserrno(host_err);
@@ -2124,8 +2119,6 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
2124 */ 2119 */
2125 path.mnt = exp->ex_path.mnt; 2120 path.mnt = exp->ex_path.mnt;
2126 path.dentry = dentry; 2121 path.dentry = dentry;
2127 err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC),
2128 IMA_COUNT_LEAVE);
2129nfsd_out: 2122nfsd_out:
2130 return err? nfserrno(err) : 0; 2123 return err? nfserrno(err) : 0;
2131} 2124}
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
new file mode 100644
index 000000000000..4b1de0a9ea75
--- /dev/null
+++ b/fs/nfsd/vfs.h
@@ -0,0 +1,101 @@
1/*
2 * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
3 */
4
5#ifndef LINUX_NFSD_VFS_H
6#define LINUX_NFSD_VFS_H
7
8#include "nfsfh.h"
9
10/*
11 * Flags for nfsd_permission
12 */
13#define NFSD_MAY_NOP 0
14#define NFSD_MAY_EXEC 1 /* == MAY_EXEC */
15#define NFSD_MAY_WRITE 2 /* == MAY_WRITE */
16#define NFSD_MAY_READ 4 /* == MAY_READ */
17#define NFSD_MAY_SATTR 8
18#define NFSD_MAY_TRUNC 16
19#define NFSD_MAY_LOCK 32
20#define NFSD_MAY_OWNER_OVERRIDE 64
21#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file */
22#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
23
24#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
25#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
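/* Editor's usage note: the flags are single bits and combine by OR; for
 * example, a read open that should succeed for the file's owner even
 * without read permission would pass
 *	NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE
 * to nfsd_permission().
 */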
26
27/*
28 * Callback function for readdir
29 */
30typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int);
31
32/* nfsd/vfs.c */
33int fh_lock_parent(struct svc_fh *, struct dentry *);
34int nfsd_racache_init(int);
35void nfsd_racache_shutdown(void);
36int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
37 struct svc_export **expp);
38__be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *,
39 const char *, unsigned int, struct svc_fh *);
40__be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *,
41 const char *, unsigned int,
42 struct svc_export **, struct dentry **);
43__be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *,
44 struct iattr *, int, time_t);
45int nfsd_mountpoint(struct dentry *, struct svc_export *);
46#ifdef CONFIG_NFSD_V4
47__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
48 struct nfs4_acl *);
49int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
50#endif /* CONFIG_NFSD_V4 */
51__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
52 char *name, int len, struct iattr *attrs,
53 int type, dev_t rdev, struct svc_fh *res);
54#ifdef CONFIG_NFSD_V3
55__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
56__be32 nfsd_create_v3(struct svc_rqst *, struct svc_fh *,
57 char *name, int len, struct iattr *attrs,
58 struct svc_fh *res, int createmode,
59 u32 *verifier, int *truncp, int *created);
60__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *,
61 loff_t, unsigned long);
62#endif /* CONFIG_NFSD_V3 */
63__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, int,
64 int, struct file **);
65void nfsd_close(struct file *);
66__be32 nfsd_read(struct svc_rqst *, struct svc_fh *, struct file *,
67 loff_t, struct kvec *, int, unsigned long *);
68__be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
69 loff_t, struct kvec *,int, unsigned long *, int *);
70__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *,
71 char *, int *);
72__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *,
73 char *name, int len, char *path, int plen,
74 struct svc_fh *res, struct iattr *);
75__be32 nfsd_link(struct svc_rqst *, struct svc_fh *,
76 char *, int, struct svc_fh *);
77__be32 nfsd_rename(struct svc_rqst *,
78 struct svc_fh *, char *, int,
79 struct svc_fh *, char *, int);
80__be32 nfsd_remove(struct svc_rqst *,
81 struct svc_fh *, char *, int);
82__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type,
83 char *name, int len);
84int nfsd_truncate(struct svc_rqst *, struct svc_fh *,
85 unsigned long size);
86__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *,
87 loff_t *, struct readdir_cd *, filldir_t);
88__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
89 struct kstatfs *, int access);
90
91int nfsd_notify_change(struct inode *, struct iattr *);
92__be32 nfsd_permission(struct svc_rqst *, struct svc_export *,
93 struct dentry *, int);
94int nfsd_sync_dir(struct dentry *dp);
95
96#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
97struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int);
98int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *);
99#endif
100
101#endif /* LINUX_NFSD_VFS_H */
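NFSD_MAY_CREATE and NFSD_MAY_REMOVE above are just OR-ed combinations of the primitive NFSD_MAY_* bits: creating needs search (EXEC) plus modify (WRITE) on the parent, removal additionally allows truncation. A standalone toy sketch of how such a bitmask permission check composes (all names here are invented stand-ins, not the nfsd implementation):

#include <stdio.h>

#define MAY_EXEC	1
#define MAY_WRITE	2
#define MAY_READ	4
#define MAY_TRUNC	16

#define MAY_CREATE	(MAY_EXEC | MAY_WRITE)
#define MAY_REMOVE	(MAY_EXEC | MAY_WRITE | MAY_TRUNC)

/* A request passes only if every bit it asks for has been granted. */
static int check(unsigned granted, unsigned wanted)
{
	return (granted & wanted) == wanted;
}

int main(void)
{
	unsigned granted = MAY_EXEC | MAY_WRITE | MAY_READ;

	printf("create: %s\n", check(granted, MAY_CREATE) ? "ok" : "denied");
	printf("remove: %s\n", check(granted, MAY_REMOVE) ? "ok" : "denied");
	return 0;
}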
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
new file mode 100644
index 000000000000..53b1863dd8f6
--- /dev/null
+++ b/fs/nfsd/xdr.h
@@ -0,0 +1,173 @@
1/* XDR types for nfsd. This is mainly a typing exercise. */
2
3#ifndef LINUX_NFSD_H
4#define LINUX_NFSD_H
5
6#include <linux/vfs.h>
7#include "nfsd.h"
8#include "nfsfh.h"
9
10struct nfsd_fhandle {
11 struct svc_fh fh;
12};
13
14struct nfsd_sattrargs {
15 struct svc_fh fh;
16 struct iattr attrs;
17};
18
19struct nfsd_diropargs {
20 struct svc_fh fh;
21 char * name;
22 unsigned int len;
23};
24
25struct nfsd_readargs {
26 struct svc_fh fh;
27 __u32 offset;
28 __u32 count;
29 int vlen;
30};
31
32struct nfsd_writeargs {
33 svc_fh fh;
34 __u32 offset;
35 int len;
36 int vlen;
37};
38
39struct nfsd_createargs {
40 struct svc_fh fh;
41 char * name;
42 unsigned int len;
43 struct iattr attrs;
44};
45
46struct nfsd_renameargs {
47 struct svc_fh ffh;
48 char * fname;
49 unsigned int flen;
50 struct svc_fh tfh;
51 char * tname;
52 unsigned int tlen;
53};
54
55struct nfsd_readlinkargs {
56 struct svc_fh fh;
57 char * buffer;
58};
59
60struct nfsd_linkargs {
61 struct svc_fh ffh;
62 struct svc_fh tfh;
63 char * tname;
64 unsigned int tlen;
65};
66
67struct nfsd_symlinkargs {
68 struct svc_fh ffh;
69 char * fname;
70 unsigned int flen;
71 char * tname;
72 unsigned int tlen;
73 struct iattr attrs;
74};
75
76struct nfsd_readdirargs {
77 struct svc_fh fh;
78 __u32 cookie;
79 __u32 count;
80 __be32 * buffer;
81};
82
83struct nfsd_attrstat {
84 struct svc_fh fh;
85 struct kstat stat;
86};
87
88struct nfsd_diropres {
89 struct svc_fh fh;
90 struct kstat stat;
91};
92
93struct nfsd_readlinkres {
94 int len;
95};
96
97struct nfsd_readres {
98 struct svc_fh fh;
99 unsigned long count;
100 struct kstat stat;
101};
102
103struct nfsd_readdirres {
104 int count;
105
106 struct readdir_cd common;
107 __be32 * buffer;
108 int buflen;
109 __be32 * offset;
110};
111
112struct nfsd_statfsres {
113 struct kstatfs stats;
114};
115
116/*
117 * Storage requirements for XDR arguments and results.
118 */
119union nfsd_xdrstore {
120 struct nfsd_sattrargs sattr;
121 struct nfsd_diropargs dirop;
122 struct nfsd_readargs read;
123 struct nfsd_writeargs write;
124 struct nfsd_createargs create;
125 struct nfsd_renameargs rename;
126 struct nfsd_linkargs link;
127 struct nfsd_symlinkargs symlink;
128 struct nfsd_readdirargs readdir;
129};
130
131#define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore)
132
133
134int nfssvc_decode_void(struct svc_rqst *, __be32 *, void *);
135int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *);
136int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *,
137 struct nfsd_sattrargs *);
138int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *,
139 struct nfsd_diropargs *);
140int nfssvc_decode_readargs(struct svc_rqst *, __be32 *,
141 struct nfsd_readargs *);
142int nfssvc_decode_writeargs(struct svc_rqst *, __be32 *,
143 struct nfsd_writeargs *);
144int nfssvc_decode_createargs(struct svc_rqst *, __be32 *,
145 struct nfsd_createargs *);
146int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *,
147 struct nfsd_renameargs *);
148int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *,
149 struct nfsd_readlinkargs *);
150int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *,
151 struct nfsd_linkargs *);
152int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *,
153 struct nfsd_symlinkargs *);
154int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *,
155 struct nfsd_readdirargs *);
156int nfssvc_encode_void(struct svc_rqst *, __be32 *, void *);
157int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *, struct nfsd_attrstat *);
158int nfssvc_encode_diropres(struct svc_rqst *, __be32 *, struct nfsd_diropres *);
159int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *, struct nfsd_readlinkres *);
160int nfssvc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd_readres *);
161int nfssvc_encode_statfsres(struct svc_rqst *, __be32 *, struct nfsd_statfsres *);
162int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *, struct nfsd_readdirres *);
163
164int nfssvc_encode_entry(void *, const char *name,
165 int namlen, loff_t offset, u64 ino, unsigned int);
166
167int nfssvc_release_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *);
168
169/* Helper functions for NFSv2 ACL code */
170__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp);
171__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp);
172
173#endif /* LINUX_NFSD_H */
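NFS2_SVC_XDRSIZE sizes the per-request decode scratch area as a union over every argument struct, so one allocation fits any NFSv2 procedure. A standalone sketch of why a union gives the size of its largest member (toy struct names, not the real layouts):

#include <stdio.h>

/* Toy stand-ins for two argument structs of different sizes. */
struct read_args  { unsigned offset, count; };
struct write_args { unsigned offset; int len, vlen; char pad[64]; };

/* One scratch union serves every procedure: its size is that of the
 * largest member (possibly rounded up for alignment). */
union xdrstore {
	struct read_args  read;
	struct write_args write;
};

int main(void)
{
	printf("union=%zu read=%zu write=%zu\n", sizeof(union xdrstore),
	       sizeof(struct read_args), sizeof(struct write_args));
	return 0;
}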
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
new file mode 100644
index 000000000000..7df980eb0562
--- /dev/null
+++ b/fs/nfsd/xdr3.h
@@ -0,0 +1,344 @@
1/*
2 * XDR types for NFSv3 in nfsd.
3 *
4 * Copyright (C) 1996-1998, Olaf Kirch <okir@monad.swb.de>
5 */
6
7#ifndef _LINUX_NFSD_XDR3_H
8#define _LINUX_NFSD_XDR3_H
9
10#include "xdr.h"
11
12struct nfsd3_sattrargs {
13 struct svc_fh fh;
14 struct iattr attrs;
15 int check_guard;
16 time_t guardtime;
17};
18
19struct nfsd3_diropargs {
20 struct svc_fh fh;
21 char * name;
22 unsigned int len;
23};
24
25struct nfsd3_accessargs {
26 struct svc_fh fh;
27 unsigned int access;
28};
29
30struct nfsd3_readargs {
31 struct svc_fh fh;
32 __u64 offset;
33 __u32 count;
34 int vlen;
35};
36
37struct nfsd3_writeargs {
38 svc_fh fh;
39 __u64 offset;
40 __u32 count;
41 int stable;
42 __u32 len;
43 int vlen;
44};
45
46struct nfsd3_createargs {
47 struct svc_fh fh;
48 char * name;
49 unsigned int len;
50 int createmode;
51 struct iattr attrs;
52 __be32 * verf;
53};
54
55struct nfsd3_mknodargs {
56 struct svc_fh fh;
57 char * name;
58 unsigned int len;
59 __u32 ftype;
60 __u32 major, minor;
61 struct iattr attrs;
62};
63
64struct nfsd3_renameargs {
65 struct svc_fh ffh;
66 char * fname;
67 unsigned int flen;
68 struct svc_fh tfh;
69 char * tname;
70 unsigned int tlen;
71};
72
73struct nfsd3_readlinkargs {
74 struct svc_fh fh;
75 char * buffer;
76};
77
78struct nfsd3_linkargs {
79 struct svc_fh ffh;
80 struct svc_fh tfh;
81 char * tname;
82 unsigned int tlen;
83};
84
85struct nfsd3_symlinkargs {
86 struct svc_fh ffh;
87 char * fname;
88 unsigned int flen;
89 char * tname;
90 unsigned int tlen;
91 struct iattr attrs;
92};
93
94struct nfsd3_readdirargs {
95 struct svc_fh fh;
96 __u64 cookie;
97 __u32 dircount;
98 __u32 count;
99 __be32 * verf;
100 __be32 * buffer;
101};
102
103struct nfsd3_commitargs {
104 struct svc_fh fh;
105 __u64 offset;
106 __u32 count;
107};
108
109struct nfsd3_getaclargs {
110 struct svc_fh fh;
111 int mask;
112};
113
114struct posix_acl;
115struct nfsd3_setaclargs {
116 struct svc_fh fh;
117 int mask;
118 struct posix_acl *acl_access;
119 struct posix_acl *acl_default;
120};
121
122struct nfsd3_attrstat {
123 __be32 status;
124 struct svc_fh fh;
125 struct kstat stat;
126};
127
128/* LOOKUP, CREATE, MKDIR, SYMLINK, MKNOD */
129struct nfsd3_diropres {
130 __be32 status;
131 struct svc_fh dirfh;
132 struct svc_fh fh;
133};
134
135struct nfsd3_accessres {
136 __be32 status;
137 struct svc_fh fh;
138 __u32 access;
139};
140
141struct nfsd3_readlinkres {
142 __be32 status;
143 struct svc_fh fh;
144 __u32 len;
145};
146
147struct nfsd3_readres {
148 __be32 status;
149 struct svc_fh fh;
150 unsigned long count;
151 int eof;
152};
153
154struct nfsd3_writeres {
155 __be32 status;
156 struct svc_fh fh;
157 unsigned long count;
158 int committed;
159};
160
161struct nfsd3_renameres {
162 __be32 status;
163 struct svc_fh ffh;
164 struct svc_fh tfh;
165};
166
167struct nfsd3_linkres {
168 __be32 status;
169 struct svc_fh tfh;
170 struct svc_fh fh;
171};
172
173struct nfsd3_readdirres {
174 __be32 status;
175 struct svc_fh fh;
176 int count;
177 __be32 verf[2];
178
179 struct readdir_cd common;
180 __be32 * buffer;
181 int buflen;
182 __be32 * offset;
183 __be32 * offset1;
184 struct svc_rqst * rqstp;
185
186};
187
188struct nfsd3_fsstatres {
189 __be32 status;
190 struct kstatfs stats;
191 __u32 invarsec;
192};
193
194struct nfsd3_fsinfores {
195 __be32 status;
196 __u32 f_rtmax;
197 __u32 f_rtpref;
198 __u32 f_rtmult;
199 __u32 f_wtmax;
200 __u32 f_wtpref;
201 __u32 f_wtmult;
202 __u32 f_dtpref;
203 __u64 f_maxfilesize;
204 __u32 f_properties;
205};
206
207struct nfsd3_pathconfres {
208 __be32 status;
209 __u32 p_link_max;
210 __u32 p_name_max;
211 __u32 p_no_trunc;
212 __u32 p_chown_restricted;
213 __u32 p_case_insensitive;
214 __u32 p_case_preserving;
215};
216
217struct nfsd3_commitres {
218 __be32 status;
219 struct svc_fh fh;
220};
221
222struct nfsd3_getaclres {
223 __be32 status;
224 struct svc_fh fh;
225 int mask;
226 struct posix_acl *acl_access;
227 struct posix_acl *acl_default;
228};
229
230/* dummy type for release */
231struct nfsd3_fhandle_pair {
232 __u32 dummy;
233 struct svc_fh fh1;
234 struct svc_fh fh2;
235};
236
237/*
238 * Storage requirements for XDR arguments and results.
239 */
240union nfsd3_xdrstore {
241 struct nfsd3_sattrargs sattrargs;
242 struct nfsd3_diropargs diropargs;
243 struct nfsd3_readargs readargs;
244 struct nfsd3_writeargs writeargs;
245 struct nfsd3_createargs createargs;
246 struct nfsd3_renameargs renameargs;
247 struct nfsd3_linkargs linkargs;
248 struct nfsd3_symlinkargs symlinkargs;
249 struct nfsd3_readdirargs readdirargs;
250 struct nfsd3_diropres diropres;
251 struct nfsd3_accessres accessres;
252 struct nfsd3_readlinkres readlinkres;
253 struct nfsd3_readres readres;
254 struct nfsd3_writeres writeres;
255 struct nfsd3_renameres renameres;
256 struct nfsd3_linkres linkres;
257 struct nfsd3_readdirres readdirres;
258 struct nfsd3_fsstatres fsstatres;
259 struct nfsd3_fsinfores fsinfores;
260 struct nfsd3_pathconfres pathconfres;
261 struct nfsd3_commitres commitres;
262 struct nfsd3_getaclres getaclres;
263};
264
265#define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore)
266
267int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *);
268int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *,
269 struct nfsd3_sattrargs *);
270int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *,
271 struct nfsd3_diropargs *);
272int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *,
273 struct nfsd3_accessargs *);
274int nfs3svc_decode_readargs(struct svc_rqst *, __be32 *,
275 struct nfsd3_readargs *);
276int nfs3svc_decode_writeargs(struct svc_rqst *, __be32 *,
277 struct nfsd3_writeargs *);
278int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *,
279 struct nfsd3_createargs *);
280int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *,
281 struct nfsd3_createargs *);
282int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *,
283 struct nfsd3_mknodargs *);
284int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *,
285 struct nfsd3_renameargs *);
286int nfs3svc_decode_readlinkargs(struct svc_rqst *, __be32 *,
287 struct nfsd3_readlinkargs *);
288int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *,
289 struct nfsd3_linkargs *);
290int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *,
291 struct nfsd3_symlinkargs *);
292int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *,
293 struct nfsd3_readdirargs *);
294int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *,
295 struct nfsd3_readdirargs *);
296int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *,
297 struct nfsd3_commitargs *);
298int nfs3svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
299int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *,
300 struct nfsd3_attrstat *);
301int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *,
302 struct nfsd3_attrstat *);
303int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *,
304 struct nfsd3_diropres *);
305int nfs3svc_encode_accessres(struct svc_rqst *, __be32 *,
306 struct nfsd3_accessres *);
307int nfs3svc_encode_readlinkres(struct svc_rqst *, __be32 *,
308 struct nfsd3_readlinkres *);
309int nfs3svc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd3_readres *);
310int nfs3svc_encode_writeres(struct svc_rqst *, __be32 *, struct nfsd3_writeres *);
311int nfs3svc_encode_createres(struct svc_rqst *, __be32 *,
312 struct nfsd3_diropres *);
313int nfs3svc_encode_renameres(struct svc_rqst *, __be32 *,
314 struct nfsd3_renameres *);
315int nfs3svc_encode_linkres(struct svc_rqst *, __be32 *,
316 struct nfsd3_linkres *);
317int nfs3svc_encode_readdirres(struct svc_rqst *, __be32 *,
318 struct nfsd3_readdirres *);
319int nfs3svc_encode_fsstatres(struct svc_rqst *, __be32 *,
320 struct nfsd3_fsstatres *);
321int nfs3svc_encode_fsinfores(struct svc_rqst *, __be32 *,
322 struct nfsd3_fsinfores *);
323int nfs3svc_encode_pathconfres(struct svc_rqst *, __be32 *,
324 struct nfsd3_pathconfres *);
325int nfs3svc_encode_commitres(struct svc_rqst *, __be32 *,
326 struct nfsd3_commitres *);
327
328int nfs3svc_release_fhandle(struct svc_rqst *, __be32 *,
329 struct nfsd3_attrstat *);
330int nfs3svc_release_fhandle2(struct svc_rqst *, __be32 *,
331 struct nfsd3_fhandle_pair *);
332int nfs3svc_encode_entry(void *, const char *name,
333 int namlen, loff_t offset, u64 ino,
334 unsigned int);
335int nfs3svc_encode_entry_plus(void *, const char *name,
336 int namlen, loff_t offset, u64 ino,
337 unsigned int);
338/* Helper functions for NFSv3 ACL code */
339__be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p,
340 struct svc_fh *fhp);
341__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp);
342
343
344#endif /* _LINUX_NFSD_XDR3_H */
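nfsd3_readdirargs above carries two limits, dircount and count, because READDIRPLUS bounds the directory entry data and the total encoded reply separately. A toy model of encoding entries under both budgets (all sizes and the XDR rounding are rough stand-ins, invented for illustration):

#include <stdio.h>
#include <string.h>

struct budget { unsigned dircount, count; };

/* Charge one entry against both budgets; stop once either runs out. */
static int emit_entry(struct budget *b, const char *name, unsigned attr_bytes)
{
	unsigned dirbytes = 24 + (((unsigned)strlen(name) + 3) & ~3u);

	if (dirbytes > b->dircount || dirbytes + attr_bytes > b->count)
		return 0;
	b->dircount -= dirbytes;
	b->count -= dirbytes + attr_bytes;
	return 1;
}

int main(void)
{
	struct budget b = { .dircount = 64, .count = 200 };
	const char *names[] = { "alpha", "beta", "gamma", "delta" };

	for (unsigned i = 0; i < 4; i++)
		printf("%s: %s\n", names[i],
		       emit_entry(&b, names[i], 88) ? "sent" : "deferred");
	return 0;
}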
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
new file mode 100644
index 000000000000..efa337739534
--- /dev/null
+++ b/fs/nfsd/xdr4.h
@@ -0,0 +1,562 @@
1/*
2 * Server-side types for NFSv4.
3 *
4 * Copyright (c) 2002 The Regents of the University of Michigan.
5 * All rights reserved.
6 *
7 * Kendrick Smith <kmsmith@umich.edu>
8 * Andy Adamson <andros@umich.edu>
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. Neither the name of the University nor the names of its
20 * contributors may be used to endorse or promote products derived
21 * from this software without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
24 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
25 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
26 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
30 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 *
35 */
36
37#ifndef _LINUX_NFSD_XDR4_H
38#define _LINUX_NFSD_XDR4_H
39
40#include "state.h"
41#include "nfsd.h"
42
43#define NFSD4_MAX_TAGLEN 128
44#define XDR_LEN(n) (((n) + 3) & ~3)
45
46struct nfsd4_compound_state {
47 struct svc_fh current_fh;
48 struct svc_fh save_fh;
49 struct nfs4_stateowner *replay_owner;
50 /* For sessions DRC */
51 struct nfsd4_session *session;
52 struct nfsd4_slot *slot;
53 __be32 *datap;
54 size_t iovlen;
55 u32 minorversion;
56 u32 status;
57};
58
59static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs)
60{
61 return cs->slot != NULL;
62}
63
64struct nfsd4_change_info {
65 u32 atomic;
66 bool change_supported;
67 u32 before_ctime_sec;
68 u32 before_ctime_nsec;
69 u64 before_change;
70 u32 after_ctime_sec;
71 u32 after_ctime_nsec;
72 u64 after_change;
73};
74
75struct nfsd4_access {
76 u32 ac_req_access; /* request */
77 u32 ac_supported; /* response */
78 u32 ac_resp_access; /* response */
79};
80
81struct nfsd4_close {
82 u32 cl_seqid; /* request */
83 stateid_t cl_stateid; /* request+response */
84 struct nfs4_stateowner * cl_stateowner; /* response */
85};
86
87struct nfsd4_commit {
88 u64 co_offset; /* request */
89 u32 co_count; /* request */
90 nfs4_verifier co_verf; /* response */
91};
92
93struct nfsd4_create {
94 u32 cr_namelen; /* request */
95 char * cr_name; /* request */
96 u32 cr_type; /* request */
97 union { /* request */
98 struct {
99 u32 namelen;
100 char *name;
101 } link; /* NF4LNK */
102 struct {
103 u32 specdata1;
104 u32 specdata2;
105 } dev; /* NF4BLK, NF4CHR */
106 } u;
107 u32 cr_bmval[3]; /* request */
108 struct iattr cr_iattr; /* request */
109 struct nfsd4_change_info cr_cinfo; /* response */
110 struct nfs4_acl *cr_acl;
111};
112#define cr_linklen u.link.namelen
113#define cr_linkname u.link.name
114#define cr_specdata1 u.dev.specdata1
115#define cr_specdata2 u.dev.specdata2
116
117struct nfsd4_delegreturn {
118 stateid_t dr_stateid;
119};
120
121struct nfsd4_getattr {
122 u32 ga_bmval[3]; /* request */
123 struct svc_fh *ga_fhp; /* response */
124};
125
126struct nfsd4_link {
127 u32 li_namelen; /* request */
128 char * li_name; /* request */
129 struct nfsd4_change_info li_cinfo; /* response */
130};
131
132struct nfsd4_lock_denied {
133 clientid_t ld_clientid;
134 struct nfs4_stateowner *ld_sop;
135 u64 ld_start;
136 u64 ld_length;
137 u32 ld_type;
138};
139
140struct nfsd4_lock {
141 /* request */
142 u32 lk_type;
143 u32 lk_reclaim; /* boolean */
144 u64 lk_offset;
145 u64 lk_length;
146 u32 lk_is_new;
147 union {
148 struct {
149 u32 open_seqid;
150 stateid_t open_stateid;
151 u32 lock_seqid;
152 clientid_t clientid;
153 struct xdr_netobj owner;
154 } new;
155 struct {
156 stateid_t lock_stateid;
157 u32 lock_seqid;
158 } old;
159 } v;
160
161 /* response */
162 union {
163 struct {
164 stateid_t stateid;
165 } ok;
166 struct nfsd4_lock_denied denied;
167 } u;
168 /* The lk_replay_owner is the open owner in the open_to_lock_owner
169 * case and the lock owner otherwise: */
170 struct nfs4_stateowner *lk_replay_owner;
171};
172#define lk_new_open_seqid v.new.open_seqid
173#define lk_new_open_stateid v.new.open_stateid
174#define lk_new_lock_seqid v.new.lock_seqid
175#define lk_new_clientid v.new.clientid
176#define lk_new_owner v.new.owner
177#define lk_old_lock_stateid v.old.lock_stateid
178#define lk_old_lock_seqid v.old.lock_seqid
179
180#define lk_rflags u.ok.rflags
181#define lk_resp_stateid u.ok.stateid
182#define lk_denied u.denied
183
184
185struct nfsd4_lockt {
186 u32 lt_type;
187 clientid_t lt_clientid;
188 struct xdr_netobj lt_owner;
189 u64 lt_offset;
190 u64 lt_length;
191 struct nfs4_stateowner * lt_stateowner;
192 struct nfsd4_lock_denied lt_denied;
193};
194
195
196struct nfsd4_locku {
197 u32 lu_type;
198 u32 lu_seqid;
199 stateid_t lu_stateid;
200 u64 lu_offset;
201 u64 lu_length;
202 struct nfs4_stateowner *lu_stateowner;
203};
204
205
206struct nfsd4_lookup {
207 u32 lo_len; /* request */
208 char * lo_name; /* request */
209};
210
211struct nfsd4_putfh {
212 u32 pf_fhlen; /* request */
213 char *pf_fhval; /* request */
214};
215
216struct nfsd4_open {
217 u32 op_claim_type; /* request */
218 struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */
219 u32 op_delegate_type; /* request - CLAIM_PREV only */
220 stateid_t op_delegate_stateid; /* request - response */
221 u32 op_create; /* request */
222 u32 op_createmode; /* request */
223 u32 op_bmval[3]; /* request */
224 struct iattr iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */
225 nfs4_verifier verf; /* EXCLUSIVE4 */
226 clientid_t op_clientid; /* request */
227 struct xdr_netobj op_owner; /* request */
228 u32 op_seqid; /* request */
229 u32 op_share_access; /* request */
230 u32 op_share_deny; /* request */
231 stateid_t op_stateid; /* response */
232 u32 op_recall; /* recall */
233 struct nfsd4_change_info op_cinfo; /* response */
234 u32 op_rflags; /* response */
235 int op_truncate; /* used during processing */
236 struct nfs4_stateowner *op_stateowner; /* used during processing */
237 struct nfs4_acl *op_acl;
238};
239#define op_iattr iattr
240#define op_verf verf
241
242struct nfsd4_open_confirm {
243 stateid_t oc_req_stateid /* request */;
244 u32 oc_seqid /* request */;
245 stateid_t oc_resp_stateid /* response */;
246 struct nfs4_stateowner * oc_stateowner; /* response */
247};
248
249struct nfsd4_open_downgrade {
250 stateid_t od_stateid;
251 u32 od_seqid;
252 u32 od_share_access;
253 u32 od_share_deny;
254 struct nfs4_stateowner *od_stateowner;
255};
256
257
258struct nfsd4_read {
259 stateid_t rd_stateid; /* request */
260 u64 rd_offset; /* request */
261 u32 rd_length; /* request */
262 int rd_vlen;
263 struct file *rd_filp;
264
265 struct svc_rqst *rd_rqstp; /* response */
266 struct svc_fh * rd_fhp; /* response */
267};
268
269struct nfsd4_readdir {
270 u64 rd_cookie; /* request */
271 nfs4_verifier rd_verf; /* request */
272 u32 rd_dircount; /* request */
273 u32 rd_maxcount; /* request */
274 u32 rd_bmval[3]; /* request */
275 struct svc_rqst *rd_rqstp; /* response */
276 struct svc_fh * rd_fhp; /* response */
277
278 struct readdir_cd common;
279 __be32 * buffer;
280 int buflen;
281 __be32 * offset;
282};
283
284struct nfsd4_release_lockowner {
285 clientid_t rl_clientid;
286 struct xdr_netobj rl_owner;
287};
288struct nfsd4_readlink {
289 struct svc_rqst *rl_rqstp; /* request */
290 struct svc_fh * rl_fhp; /* request */
291};
292
293struct nfsd4_remove {
294 u32 rm_namelen; /* request */
295 char * rm_name; /* request */
296 struct nfsd4_change_info rm_cinfo; /* response */
297};
298
299struct nfsd4_rename {
300 u32 rn_snamelen; /* request */
301 char * rn_sname; /* request */
302 u32 rn_tnamelen; /* request */
303 char * rn_tname; /* request */
304 struct nfsd4_change_info rn_sinfo; /* response */
305 struct nfsd4_change_info rn_tinfo; /* response */
306};
307
308struct nfsd4_secinfo {
309 u32 si_namelen; /* request */
310 char *si_name; /* request */
311 struct svc_export *si_exp; /* response */
312};
313
314struct nfsd4_setattr {
315 stateid_t sa_stateid; /* request */
316 u32 sa_bmval[3]; /* request */
317 struct iattr sa_iattr; /* request */
318 struct nfs4_acl *sa_acl;
319};
320
321struct nfsd4_setclientid {
322 nfs4_verifier se_verf; /* request */
323 u32 se_namelen; /* request */
324 char * se_name; /* request */
325 u32 se_callback_prog; /* request */
326 u32 se_callback_netid_len; /* request */
327 char * se_callback_netid_val; /* request */
328 u32 se_callback_addr_len; /* request */
329 char * se_callback_addr_val; /* request */
330 u32 se_callback_ident; /* request */
331 clientid_t se_clientid; /* response */
332 nfs4_verifier se_confirm; /* response */
333};
334
335struct nfsd4_setclientid_confirm {
336 clientid_t sc_clientid;
337 nfs4_verifier sc_confirm;
338};
339
340/* also used for NVERIFY */
341struct nfsd4_verify {
342 u32 ve_bmval[3]; /* request */
343 u32 ve_attrlen; /* request */
344 char * ve_attrval; /* request */
345};
346
347struct nfsd4_write {
348 stateid_t wr_stateid; /* request */
349 u64 wr_offset; /* request */
350 u32 wr_stable_how; /* request */
351 u32 wr_buflen; /* request */
352 int wr_vlen;
353
354 u32 wr_bytes_written; /* response */
355 u32 wr_how_written; /* response */
356 nfs4_verifier wr_verifier; /* response */
357};
358
359struct nfsd4_exchange_id {
360 nfs4_verifier verifier;
361 struct xdr_netobj clname;
362 u32 flags;
363 clientid_t clientid;
364 u32 seqid;
365 int spa_how;
366};
367
368struct nfsd4_sequence {
369 struct nfs4_sessionid sessionid; /* request/response */
370 u32 seqid; /* request/response */
371 u32 slotid; /* request/response */
372 u32 maxslots; /* request/response */
373 u32 cachethis; /* request */
374#if 0
375 u32 target_maxslots; /* response */
376 u32 status_flags; /* response */
377#endif /* not yet */
378};
379
380struct nfsd4_destroy_session {
381 struct nfs4_sessionid sessionid;
382};
383
384struct nfsd4_op {
385 int opnum;
386 __be32 status;
387 union {
388 struct nfsd4_access access;
389 struct nfsd4_close close;
390 struct nfsd4_commit commit;
391 struct nfsd4_create create;
392 struct nfsd4_delegreturn delegreturn;
393 struct nfsd4_getattr getattr;
394 struct svc_fh * getfh;
395 struct nfsd4_link link;
396 struct nfsd4_lock lock;
397 struct nfsd4_lockt lockt;
398 struct nfsd4_locku locku;
399 struct nfsd4_lookup lookup;
400 struct nfsd4_verify nverify;
401 struct nfsd4_open open;
402 struct nfsd4_open_confirm open_confirm;
403 struct nfsd4_open_downgrade open_downgrade;
404 struct nfsd4_putfh putfh;
405 struct nfsd4_read read;
406 struct nfsd4_readdir readdir;
407 struct nfsd4_readlink readlink;
408 struct nfsd4_remove remove;
409 struct nfsd4_rename rename;
410 clientid_t renew;
411 struct nfsd4_secinfo secinfo;
412 struct nfsd4_setattr setattr;
413 struct nfsd4_setclientid setclientid;
414 struct nfsd4_setclientid_confirm setclientid_confirm;
415 struct nfsd4_verify verify;
416 struct nfsd4_write write;
417 struct nfsd4_release_lockowner release_lockowner;
418
419 /* NFSv4.1 */
420 struct nfsd4_exchange_id exchange_id;
421 struct nfsd4_create_session create_session;
422 struct nfsd4_destroy_session destroy_session;
423 struct nfsd4_sequence sequence;
424 } u;
425 struct nfs4_replay * replay;
426};
427
428struct nfsd4_compoundargs {
429 /* scratch variables for XDR decode */
430 __be32 * p;
431 __be32 * end;
432 struct page ** pagelist;
433 int pagelen;
434 __be32 tmp[8];
435 __be32 * tmpp;
436 struct tmpbuf {
437 struct tmpbuf *next;
438 void (*release)(const void *);
439 void *buf;
440 } *to_free;
441
442 struct svc_rqst *rqstp;
443
444 u32 taglen;
445 char * tag;
446 u32 minorversion;
447 u32 opcnt;
448 struct nfsd4_op *ops;
449 struct nfsd4_op iops[8];
450};
451
452struct nfsd4_compoundres {
453 /* scratch variables for XDR encode */
454 __be32 * p;
455 __be32 * end;
456 struct xdr_buf * xbuf;
457 struct svc_rqst * rqstp;
458
459 u32 taglen;
460 char * tag;
461 u32 opcnt;
462 __be32 * tagp; /* tag, opcount encode location */
463 struct nfsd4_compound_state cstate;
464};
465
466static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp)
467{
468 struct nfsd4_compoundargs *args = resp->rqstp->rq_argp;
469 return resp->opcnt == 1 && args->ops[0].opnum == OP_SEQUENCE;
470}
471
472static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
473{
474 return !resp->cstate.slot->sl_cachethis || nfsd4_is_solo_sequence(resp);
475}
476
477#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs)
478
479static inline void
480set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
481{
482 BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved);
483 cinfo->atomic = 1;
484 cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode);
485 if (cinfo->change_supported) {
486 cinfo->before_change = fhp->fh_pre_change;
487 cinfo->after_change = fhp->fh_post_change;
488 } else {
489 cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
490 cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
491 cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
492 cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
493 }
494}
495
496int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
497int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *,
498 struct nfsd4_compoundargs *);
499int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *,
500 struct nfsd4_compoundres *);
501void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
502void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);
503__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
504 struct dentry *dentry, __be32 *buffer, int *countp,
505 u32 *bmval, struct svc_rqst *, int ignore_crossmnt);
506extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp,
507 struct nfsd4_compound_state *,
508 struct nfsd4_setclientid *setclid);
509extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
510 struct nfsd4_compound_state *,
511 struct nfsd4_setclientid_confirm *setclientid_confirm);
512extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp);
513extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
514 struct nfsd4_sequence *seq);
515extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
516 struct nfsd4_compound_state *,
517		struct nfsd4_exchange_id *);
518extern __be32 nfsd4_create_session(struct svc_rqst *,
519 struct nfsd4_compound_state *,
520 struct nfsd4_create_session *);
521extern __be32 nfsd4_sequence(struct svc_rqst *,
522 struct nfsd4_compound_state *,
523 struct nfsd4_sequence *);
524extern __be32 nfsd4_destroy_session(struct svc_rqst *,
525 struct nfsd4_compound_state *,
526 struct nfsd4_destroy_session *);
527extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
528 struct nfsd4_open *open);
529extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
530 struct svc_fh *current_fh, struct nfsd4_open *open);
531extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp,
532 struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc);
533extern __be32 nfsd4_close(struct svc_rqst *rqstp,
534 struct nfsd4_compound_state *,
535 struct nfsd4_close *close);
536extern __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp,
537 struct nfsd4_compound_state *,
538 struct nfsd4_open_downgrade *od);
539extern __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
540 struct nfsd4_lock *lock);
541extern __be32 nfsd4_lockt(struct svc_rqst *rqstp,
542 struct nfsd4_compound_state *,
543 struct nfsd4_lockt *lockt);
544extern __be32 nfsd4_locku(struct svc_rqst *rqstp,
545 struct nfsd4_compound_state *,
546 struct nfsd4_locku *locku);
547extern __be32
548nfsd4_release_lockowner(struct svc_rqst *rqstp,
549 struct nfsd4_compound_state *,
550 struct nfsd4_release_lockowner *rlockowner);
551extern void nfsd4_release_compoundargs(struct nfsd4_compoundargs *);
552extern __be32 nfsd4_delegreturn(struct svc_rqst *rqstp,
553 struct nfsd4_compound_state *, struct nfsd4_delegreturn *dr);
554extern __be32 nfsd4_renew(struct svc_rqst *rqstp,
555 struct nfsd4_compound_state *, clientid_t *clid);
556#endif
557
558/*
559 * Local variables:
560 * c-basic-offset: 8
561 * End:
562 */
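set_change_info() above fills the NFSv4 change_info4 pair: it prefers the inode's change counter when IS_I_VERSION() holds and falls back to ctime otherwise, with atomic set because the pre- and post-op values are sampled around a single locked operation. A standalone toy model of that selection (the sec<<32|nsec packing is illustrative only, not the on-the-wire encoding):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct change_info {
	bool atomic;			/* pre/post sampled around one op */
	bool change_supported;		/* a real change counter exists */
	uint64_t before_change, after_change;
};

static void fill_change_info(struct change_info *ci, const uint64_t *ctr_pre,
			     const uint64_t *ctr_post, struct timespec pre,
			     struct timespec post)
{
	ci->atomic = true;
	ci->change_supported = ctr_pre && ctr_post;
	if (ci->change_supported) {
		ci->before_change = *ctr_pre;
		ci->after_change = *ctr_post;
	} else {
		/* fall back to ctime, packed as sec<<32 | nsec */
		ci->before_change =
			((uint64_t)pre.tv_sec << 32) | (uint32_t)pre.tv_nsec;
		ci->after_change =
			((uint64_t)post.tv_sec << 32) | (uint32_t)post.tv_nsec;
	}
}

int main(void)
{
	uint64_t pre = 41, post = 42;
	struct timespec t0 = {0}, t1 = {0};
	struct change_info ci;

	fill_change_info(&ci, &pre, &post, t0, t1);
	printf("supported=%d before=%llu after=%llu\n", ci.change_supported,
	       (unsigned long long)ci.before_change,
	       (unsigned long long)ci.after_change);
	return 0;
}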
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index d69e6ae59251..8d6356a804f3 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -26,6 +26,7 @@
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
 #include <linux/bitops.h>
+#include <linux/slab.h>
 #include "mdt.h"
 #include "alloc.h"
 
@@ -142,29 +143,75 @@ static void nilfs_palloc_desc_block_init(struct inode *inode,
 	}
 }
 
+static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
+				  int create,
+				  void (*init_block)(struct inode *,
+						     struct buffer_head *,
+						     void *),
+				  struct buffer_head **bhp,
+				  struct nilfs_bh_assoc *prev,
+				  spinlock_t *lock)
+{
+	int ret;
+
+	spin_lock(lock);
+	if (prev->bh && blkoff == prev->blkoff) {
+		get_bh(prev->bh);
+		*bhp = prev->bh;
+		spin_unlock(lock);
+		return 0;
+	}
+	spin_unlock(lock);
+
+	ret = nilfs_mdt_get_block(inode, blkoff, create, init_block, bhp);
+	if (!ret) {
+		spin_lock(lock);
+		/*
+		 * The following code must be safe for change of the
+		 * cache contents during the get block call.
+		 */
+		brelse(prev->bh);
+		get_bh(*bhp);
+		prev->bh = *bhp;
+		prev->blkoff = blkoff;
+		spin_unlock(lock);
+	}
+	return ret;
+}
+
 static int nilfs_palloc_get_desc_block(struct inode *inode,
 				       unsigned long group,
 				       int create, struct buffer_head **bhp)
 {
-	return nilfs_mdt_get_block(inode,
-				   nilfs_palloc_desc_blkoff(inode, group),
-				   create, nilfs_palloc_desc_block_init, bhp);
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	return nilfs_palloc_get_block(inode,
+				      nilfs_palloc_desc_blkoff(inode, group),
+				      create, nilfs_palloc_desc_block_init,
+				      bhp, &cache->prev_desc, &cache->lock);
 }
 
 static int nilfs_palloc_get_bitmap_block(struct inode *inode,
 					 unsigned long group,
 					 int create, struct buffer_head **bhp)
 {
-	return nilfs_mdt_get_block(inode,
-				   nilfs_palloc_bitmap_blkoff(inode, group),
-				   create, NULL, bhp);
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	return nilfs_palloc_get_block(inode,
+				      nilfs_palloc_bitmap_blkoff(inode, group),
+				      create, NULL, bhp,
+				      &cache->prev_bitmap, &cache->lock);
 }
 
 int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
 				 int create, struct buffer_head **bhp)
 {
-	return nilfs_mdt_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr),
-				   create, NULL, bhp);
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	return nilfs_palloc_get_block(inode,
+				      nilfs_palloc_entry_blkoff(inode, nr),
+				      create, NULL, bhp,
+				      &cache->prev_entry, &cache->lock);
 }
 
 static struct nilfs_palloc_group_desc *
@@ -176,13 +223,6 @@ nilfs_palloc_block_get_group_desc(const struct inode *inode,
 	       group % nilfs_palloc_groups_per_desc_block(inode);
 }
 
-static unsigned char *
-nilfs_palloc_block_get_bitmap(const struct inode *inode,
-			      const struct buffer_head *bh, void *kaddr)
-{
-	return (unsigned char *)(kaddr + bh_offset(bh));
-}
-
 void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
 				   const struct buffer_head *bh, void *kaddr)
 {
@@ -289,8 +329,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
 		if (ret < 0)
 			goto out_desc;
 		bitmap_kaddr = kmap(bitmap_bh->b_page);
-		bitmap = nilfs_palloc_block_get_bitmap(
-			inode, bitmap_bh, bitmap_kaddr);
+		bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
 		pos = nilfs_palloc_find_available_slot(
 			inode, group, group_offset, bitmap,
 			entries_per_group);
@@ -351,8 +390,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
 	desc = nilfs_palloc_block_get_group_desc(inode, group,
 						 req->pr_desc_bh, desc_kaddr);
 	bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
-	bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
-					       bitmap_kaddr);
+	bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
 
 	if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
 				    group_offset, bitmap))
@@ -385,8 +423,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
 	desc = nilfs_palloc_block_get_group_desc(inode, group,
 						 req->pr_desc_bh, desc_kaddr);
 	bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
-	bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
-					       bitmap_kaddr);
+	bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
 	if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
 				    group_offset, bitmap))
 		printk(KERN_WARNING "%s: entry numer %llu already freed\n",
@@ -472,8 +509,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 		desc = nilfs_palloc_block_get_group_desc(
 			inode, group, desc_bh, desc_kaddr);
 		bitmap_kaddr = kmap(bitmap_bh->b_page);
-		bitmap = nilfs_palloc_block_get_bitmap(
-			inode, bitmap_bh, bitmap_kaddr);
+		bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
 		for (j = i, n = 0;
 		     (j < nitems) && nilfs_palloc_group_is_in(inode, group,
 							      entry_nrs[j]);
@@ -502,3 +538,30 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 	}
 	return 0;
 }
+
+void nilfs_palloc_setup_cache(struct inode *inode,
+			      struct nilfs_palloc_cache *cache)
+{
+	NILFS_MDT(inode)->mi_palloc_cache = cache;
+	spin_lock_init(&cache->lock);
+}
+
+void nilfs_palloc_clear_cache(struct inode *inode)
+{
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	spin_lock(&cache->lock);
+	brelse(cache->prev_desc.bh);
+	brelse(cache->prev_bitmap.bh);
+	brelse(cache->prev_entry.bh);
+	cache->prev_desc.bh = NULL;
+	cache->prev_bitmap.bh = NULL;
+	cache->prev_entry.bh = NULL;
+	spin_unlock(&cache->lock);
+}
+
+void nilfs_palloc_destroy_cache(struct inode *inode)
+{
+	nilfs_palloc_clear_cache(inode);
+	NILFS_MDT(inode)->mi_palloc_cache = NULL;
+}
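The alloc.c changes above add a single-slot lookup cache per metadata file: nilfs_palloc_get_block() first checks the remembered (blkoff, bh) pair under a spinlock and only falls back to nilfs_mdt_get_block() on a miss, re-taking the lock to install the result. A user-space sketch of the same pattern, with a mutex and manual refcounts standing in for get_bh()/brelse() (all names invented for illustration):

#include <pthread.h>
#include <stdlib.h>

struct obj { unsigned long key; int refs; };

struct slot_cache {
	pthread_mutex_t lock;
	struct obj *prev;	/* the cache holds one reference */
};

/* Stands in for nilfs_mdt_get_block(): expensive, may block. */
static struct obj *slow_lookup(unsigned long key)
{
	struct obj *o = malloc(sizeof(*o));

	o->key = key;
	o->refs = 1;		/* caller's reference */
	return o;
}

static struct obj *cached_lookup(struct slot_cache *c, unsigned long key)
{
	struct obj *o;

	pthread_mutex_lock(&c->lock);
	if (c->prev && c->prev->key == key) {
		c->prev->refs++;		/* fast path: cache hit */
		o = c->prev;
		pthread_mutex_unlock(&c->lock);
		return o;
	}
	pthread_mutex_unlock(&c->lock);

	o = slow_lookup(key);			/* lock dropped: may sleep */

	pthread_mutex_lock(&c->lock);
	if (c->prev && --c->prev->refs == 0)
		free(c->prev);			/* drop old cached object */
	o->refs++;				/* cache takes its own ref */
	c->prev = o;
	pthread_mutex_unlock(&c->lock);
	return o;
}

int main(void)
{
	struct slot_cache c = { PTHREAD_MUTEX_INITIALIZER, NULL };
	struct obj *a = cached_lookup(&c, 7);	/* miss: slow path */
	struct obj *b = cached_lookup(&c, 7);	/* hit: fast path */

	return (a == b) ? 0 : 1;
}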
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 4ace5475c2c7..5cccf874d692 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -42,7 +42,7 @@ void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
 				   const struct buffer_head *, void *);
 
 /**
- * nilfs_palloc_req - persistent alloctor request and reply
+ * nilfs_palloc_req - persistent allocator request and reply
  * @pr_entry_nr: entry number (vblocknr or inode number)
  * @pr_desc_bh: buffer head of the buffer containing block group descriptors
  * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap
@@ -69,4 +69,25 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
 #define nilfs_clear_bit_atomic ext2_clear_bit_atomic
 #define nilfs_find_next_zero_bit ext2_find_next_zero_bit
 
+/*
+ * persistent object allocator cache
+ */
+
+struct nilfs_bh_assoc {
+	unsigned long blkoff;
+	struct buffer_head *bh;
+};
+
+struct nilfs_palloc_cache {
+	spinlock_t lock;
+	struct nilfs_bh_assoc prev_desc;
+	struct nilfs_bh_assoc prev_bitmap;
+	struct nilfs_bh_assoc prev_entry;
+};
+
+void nilfs_palloc_setup_cache(struct inode *inode,
+			      struct nilfs_palloc_cache *cache);
+void nilfs_palloc_clear_cache(struct inode *inode);
+void nilfs_palloc_destroy_cache(struct inode *inode);
+
 #endif /* _NILFS_ALLOC_H */
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 08834df6ec68..effdbdbe6c11 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -402,19 +402,11 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap)
 void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n)
 {
 	inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
-	if (NILFS_MDT(bmap->b_inode))
-		nilfs_mdt_mark_dirty(bmap->b_inode);
-	else
-		mark_inode_dirty(bmap->b_inode);
 }
 
 void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
 {
 	inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
-	if (NILFS_MDT(bmap->b_inode))
-		nilfs_mdt_mark_dirty(bmap->b_inode);
-	else
-		mark_inode_dirty(bmap->b_inode);
 }
 
 __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
@@ -425,8 +417,8 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
 
 	key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT -
 					 bmap->b_inode->i_blkbits);
-	for (pbh = page_buffers(bh->b_page); pbh != bh;
-	     pbh = pbh->b_this_page, key++);
+	for (pbh = page_buffers(bh->b_page); pbh != bh; pbh = pbh->b_this_page)
+		key++;
 
 	return key;
 }
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 84c25382f8e3..447ce47a3306 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -27,6 +27,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mm.h>
 #include <linux/backing-dev.h>
+#include <linux/gfp.h>
 #include "nilfs.h"
 #include "mdt.h"
 #include "dat.h"
@@ -68,9 +69,34 @@ void nilfs_btnode_cache_clear(struct address_space *btnc)
 	truncate_inode_pages(btnc, 0);
 }
 
+struct buffer_head *
+nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
+{
+	struct inode *inode = NILFS_BTNC_I(btnc);
+	struct buffer_head *bh;
+
+	bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
+	if (unlikely(!bh))
+		return NULL;
+
+	if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
+		     buffer_dirty(bh))) {
+		brelse(bh);
+		BUG();
+	}
+	memset(bh->b_data, 0, 1 << inode->i_blkbits);
+	bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
+	bh->b_blocknr = blocknr;
+	set_buffer_mapped(bh);
+	set_buffer_uptodate(bh);
+
+	unlock_page(bh->b_page);
+	page_cache_release(bh->b_page);
+	return bh;
+}
+
 int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
-			      sector_t pblocknr, struct buffer_head **pbh,
-			      int newblk)
+			      sector_t pblocknr, struct buffer_head **pbh)
 {
 	struct buffer_head *bh;
 	struct inode *inode = NILFS_BTNC_I(btnc);
@@ -81,19 +107,6 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
 		return -ENOMEM;
 
 	err = -EEXIST; /* internal code */
-	if (newblk) {
-		if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
-			     buffer_dirty(bh))) {
-			brelse(bh);
-			BUG();
-		}
-		memset(bh->b_data, 0, 1 << inode->i_blkbits);
-		bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
-		bh->b_blocknr = blocknr;
-		set_buffer_mapped(bh);
-		set_buffer_uptodate(bh);
-		goto found;
-	}
 
 	if (buffer_uptodate(bh) || buffer_dirty(bh))
 		goto found;
@@ -135,27 +148,6 @@ out_locked:
 	return err;
 }
 
-int nilfs_btnode_get(struct address_space *btnc, __u64 blocknr,
-		     sector_t pblocknr, struct buffer_head **pbh, int newblk)
-{
-	struct buffer_head *bh;
-	int err;
-
-	err = nilfs_btnode_submit_block(btnc, blocknr, pblocknr, pbh, newblk);
-	if (err == -EEXIST) /* internal code (cache hit) */
-		return 0;
-	if (unlikely(err))
-		return err;
-
-	bh = *pbh;
-	wait_on_buffer(bh);
-	if (!buffer_uptodate(bh)) {
-		brelse(bh);
-		return -EIO;
-	}
-	return 0;
-}
-
 /**
  * nilfs_btnode_delete - delete B-tree node buffer
  * @bh: buffer to be deleted
@@ -244,12 +236,13 @@ retry:
 		unlock_page(obh->b_page);
 	}
 
-	err = nilfs_btnode_get(btnc, newkey, 0, &nbh, 1);
-	if (likely(!err)) {
-		BUG_ON(nbh == obh);
-		ctxt->newbh = nbh;
-	}
-	return err;
+	nbh = nilfs_btnode_create_block(btnc, newkey);
+	if (!nbh)
+		return -ENOMEM;
+
+	BUG_ON(nbh == obh);
+	ctxt->newbh = nbh;
+	return 0;
 
  failed_unlock:
 	unlock_page(obh->b_page);
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 3e2275172ed6..07da83f07712 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -40,10 +40,10 @@ struct nilfs_btnode_chkey_ctxt {
 void nilfs_btnode_cache_init_once(struct address_space *);
 void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
+struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
+					      __u64 blocknr);
 int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
-			      struct buffer_head **, int);
-int nilfs_btnode_get(struct address_space *, __u64, sector_t,
-		     struct buffer_head **, int);
+			      struct buffer_head **);
 void nilfs_btnode_delete(struct buffer_head *);
 int nilfs_btnode_prepare_change_key(struct address_space *,
 				    struct nilfs_btnode_chkey_ctxt *);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index e25b507a474f..7cdd98b8d514 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -114,7 +114,18 @@ static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr,
 {
 	struct address_space *btnc =
 		&NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
-	return nilfs_btnode_get(btnc, ptr, 0, bhp, 0);
+	int err;
+
+	err = nilfs_btnode_submit_block(btnc, ptr, 0, bhp);
+	if (err)
+		return err == -EEXIST ? 0 : err;
+
+	wait_on_buffer(*bhp);
+	if (!buffer_uptodate(*bhp)) {
+		brelse(*bhp);
+		return -EIO;
+	}
+	return 0;
 }
 
 static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
@@ -122,12 +133,15 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
 {
 	struct address_space *btnc =
 		&NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
-	int ret;
+	struct buffer_head *bh;
 
-	ret = nilfs_btnode_get(btnc, ptr, 0, bhp, 1);
-	if (!ret)
-		set_buffer_nilfs_volatile(*bhp);
-	return ret;
+	bh = nilfs_btnode_create_block(btnc, ptr);
+	if (!bh)
+		return -ENOMEM;
+
+	set_buffer_nilfs_volatile(bh);
+	*bhp = bh;
+	return 0;
 }
 
 static inline int
@@ -444,6 +458,18 @@ nilfs_btree_get_node(const struct nilfs_btree *btree,
 		nilfs_btree_get_nonroot_node(path, level);
 }
 
+static inline int
+nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
+{
+	if (unlikely(nilfs_btree_node_get_level(node) != level)) {
+		dump_stack();
+		printk(KERN_CRIT "NILFS: btree level mismatch: %d != %d\n",
+		       nilfs_btree_node_get_level(node), level);
+		return 1;
+	}
+	return 0;
+}
+
 static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 				 struct nilfs_btree_path *path,
 				 __u64 key, __u64 *ptrp, int minlevel)
@@ -467,7 +493,8 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 		if (ret < 0)
 			return ret;
 		node = nilfs_btree_get_nonroot_node(path, level);
-		BUG_ON(level != nilfs_btree_node_get_level(node));
+		if (nilfs_btree_bad_node(node, level))
+			return -EINVAL;
 		if (!found)
 			found = nilfs_btree_node_lookup(node, key, &index);
 		else
@@ -512,7 +539,8 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
 		if (ret < 0)
 			return ret;
 		node = nilfs_btree_get_nonroot_node(path, level);
-		BUG_ON(level != nilfs_btree_node_get_level(node));
+		if (nilfs_btree_bad_node(node, level))
+			return -EINVAL;
 		index = nilfs_btree_node_get_nchildren(node) - 1;
 		ptr = nilfs_btree_node_get_ptr(btree, node, index);
 		path[level].bp_index = index;
@@ -638,13 +666,11 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
 {
 	if (level < nilfs_btree_height(btree) - 1) {
 		do {
-			lock_buffer(path[level].bp_bh);
 			nilfs_btree_node_set_key(
 				nilfs_btree_get_nonroot_node(path, level),
 				path[level].bp_index, key);
 			if (!buffer_dirty(path[level].bp_bh))
 				nilfs_btnode_mark_dirty(path[level].bp_bh);
-			unlock_buffer(path[level].bp_bh);
 		} while ((path[level].bp_index == 0) &&
 			 (++level < nilfs_btree_height(btree) - 1));
 	}
@@ -663,13 +689,11 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
 	struct nilfs_btree_node *node;
 
 	if (level < nilfs_btree_height(btree) - 1) {
-		lock_buffer(path[level].bp_bh);
 		node = nilfs_btree_get_nonroot_node(path, level);
 		nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
 					path[level].bp_index);
 		if (!buffer_dirty(path[level].bp_bh))
 			nilfs_btnode_mark_dirty(path[level].bp_bh);
-		unlock_buffer(path[level].bp_bh);
 
 		if (path[level].bp_index == 0)
 			nilfs_btree_promote_key(btree, path, level + 1,
@@ -689,9 +713,6 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 	struct nilfs_btree_node *node, *left;
 	int nchildren, lnchildren, n, move;
 
-	lock_buffer(path[level].bp_bh);
-	lock_buffer(path[level].bp_sib_bh);
-
 	node = nilfs_btree_get_nonroot_node(path, level);
 	left = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
@@ -712,9 +733,6 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
 
-	unlock_buffer(path[level].bp_bh);
-	unlock_buffer(path[level].bp_sib_bh);
-
 	nilfs_btree_promote_key(btree, path, level + 1,
 				nilfs_btree_node_get_key(node, 0));
 
@@ -740,9 +758,6 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
 	struct nilfs_btree_node *node, *right;
 	int nchildren, rnchildren, n, move;
 
-	lock_buffer(path[level].bp_bh);
-	lock_buffer(path[level].bp_sib_bh);
-
 	node = nilfs_btree_get_nonroot_node(path, level);
 	right = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
@@ -763,9 +778,6 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
763 if (!buffer_dirty(path[level].bp_sib_bh)) 778 if (!buffer_dirty(path[level].bp_sib_bh))
764 nilfs_btnode_mark_dirty(path[level].bp_sib_bh); 779 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
765 780
766 unlock_buffer(path[level].bp_bh);
767 unlock_buffer(path[level].bp_sib_bh);
768
769 path[level + 1].bp_index++; 781 path[level + 1].bp_index++;
770 nilfs_btree_promote_key(btree, path, level + 1, 782 nilfs_btree_promote_key(btree, path, level + 1,
771 nilfs_btree_node_get_key(right, 0)); 783 nilfs_btree_node_get_key(right, 0));
@@ -794,9 +806,6 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
794 __u64 newptr; 806 __u64 newptr;
795 int nchildren, n, move; 807 int nchildren, n, move;
796 808
797 lock_buffer(path[level].bp_bh);
798 lock_buffer(path[level].bp_sib_bh);
799
800 node = nilfs_btree_get_nonroot_node(path, level); 809 node = nilfs_btree_get_nonroot_node(path, level);
801 right = nilfs_btree_get_sib_node(path, level); 810 right = nilfs_btree_get_sib_node(path, level);
802 nchildren = nilfs_btree_node_get_nchildren(node); 811 nchildren = nilfs_btree_node_get_nchildren(node);
@@ -815,9 +824,6 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
815 if (!buffer_dirty(path[level].bp_sib_bh)) 824 if (!buffer_dirty(path[level].bp_sib_bh))
816 nilfs_btnode_mark_dirty(path[level].bp_sib_bh); 825 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
817 826
818 unlock_buffer(path[level].bp_bh);
819 unlock_buffer(path[level].bp_sib_bh);
820
821 newkey = nilfs_btree_node_get_key(right, 0); 827 newkey = nilfs_btree_node_get_key(right, 0);
822 newptr = path[level].bp_newreq.bpr_ptr; 828 newptr = path[level].bp_newreq.bpr_ptr;
823 829
@@ -852,8 +858,6 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
852 struct nilfs_btree_node *root, *child; 858 struct nilfs_btree_node *root, *child;
853 int n; 859 int n;
854 860
855 lock_buffer(path[level].bp_sib_bh);
856
857 root = nilfs_btree_get_root(btree); 861 root = nilfs_btree_get_root(btree);
858 child = nilfs_btree_get_sib_node(path, level); 862 child = nilfs_btree_get_sib_node(path, level);
859 863
@@ -865,8 +869,6 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
865 if (!buffer_dirty(path[level].bp_sib_bh)) 869 if (!buffer_dirty(path[level].bp_sib_bh))
866 nilfs_btnode_mark_dirty(path[level].bp_sib_bh); 870 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
867 871
868 unlock_buffer(path[level].bp_sib_bh);
869
870 path[level].bp_bh = path[level].bp_sib_bh; 872 path[level].bp_bh = path[level].bp_sib_bh;
871 path[level].bp_sib_bh = NULL; 873 path[level].bp_sib_bh = NULL;
872 874
@@ -1023,11 +1025,9 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1023 1025
1024 stats->bs_nblocks++; 1026 stats->bs_nblocks++;
1025 1027
1026 lock_buffer(bh);
1027 nilfs_btree_node_init(btree, 1028 nilfs_btree_node_init(btree,
1028 (struct nilfs_btree_node *)bh->b_data, 1029 (struct nilfs_btree_node *)bh->b_data,
1029 0, level, 0, NULL, NULL); 1030 0, level, 0, NULL, NULL);
1030 unlock_buffer(bh);
1031 path[level].bp_sib_bh = bh; 1031 path[level].bp_sib_bh = bh;
1032 path[level].bp_op = nilfs_btree_split; 1032 path[level].bp_op = nilfs_btree_split;
1033 } 1033 }
@@ -1052,10 +1052,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1052 if (ret < 0) 1052 if (ret < 0)
1053 goto err_out_curr_node; 1053 goto err_out_curr_node;
1054 1054
1055 lock_buffer(bh);
1056 nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data, 1055 nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data,
1057 0, level, 0, NULL, NULL); 1056 0, level, 0, NULL, NULL);
1058 unlock_buffer(bh);
1059 path[level].bp_sib_bh = bh; 1057 path[level].bp_sib_bh = bh;
1060 path[level].bp_op = nilfs_btree_grow; 1058 path[level].bp_op = nilfs_btree_grow;
1061 1059
@@ -1154,13 +1152,11 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
1154 struct nilfs_btree_node *node; 1152 struct nilfs_btree_node *node;
1155 1153
1156 if (level < nilfs_btree_height(btree) - 1) { 1154 if (level < nilfs_btree_height(btree) - 1) {
1157 lock_buffer(path[level].bp_bh);
1158 node = nilfs_btree_get_nonroot_node(path, level); 1155 node = nilfs_btree_get_nonroot_node(path, level);
1159 nilfs_btree_node_delete(btree, node, keyp, ptrp, 1156 nilfs_btree_node_delete(btree, node, keyp, ptrp,
1160 path[level].bp_index); 1157 path[level].bp_index);
1161 if (!buffer_dirty(path[level].bp_bh)) 1158 if (!buffer_dirty(path[level].bp_bh))
1162 nilfs_btnode_mark_dirty(path[level].bp_bh); 1159 nilfs_btnode_mark_dirty(path[level].bp_bh);
1163 unlock_buffer(path[level].bp_bh);
1164 if (path[level].bp_index == 0) 1160 if (path[level].bp_index == 0)
1165 nilfs_btree_promote_key(btree, path, level + 1, 1161 nilfs_btree_promote_key(btree, path, level + 1,
1166 nilfs_btree_node_get_key(node, 0)); 1162 nilfs_btree_node_get_key(node, 0));
@@ -1180,9 +1176,6 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1180 1176
1181 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1177 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1182 1178
1183 lock_buffer(path[level].bp_bh);
1184 lock_buffer(path[level].bp_sib_bh);
1185
1186 node = nilfs_btree_get_nonroot_node(path, level); 1179 node = nilfs_btree_get_nonroot_node(path, level);
1187 left = nilfs_btree_get_sib_node(path, level); 1180 left = nilfs_btree_get_sib_node(path, level);
1188 nchildren = nilfs_btree_node_get_nchildren(node); 1181 nchildren = nilfs_btree_node_get_nchildren(node);
@@ -1197,9 +1190,6 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1197 if (!buffer_dirty(path[level].bp_sib_bh)) 1190 if (!buffer_dirty(path[level].bp_sib_bh))
1198 nilfs_btnode_mark_dirty(path[level].bp_sib_bh); 1191 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1199 1192
1200 unlock_buffer(path[level].bp_bh);
1201 unlock_buffer(path[level].bp_sib_bh);
1202
1203 nilfs_btree_promote_key(btree, path, level + 1, 1193 nilfs_btree_promote_key(btree, path, level + 1,
1204 nilfs_btree_node_get_key(node, 0)); 1194 nilfs_btree_node_get_key(node, 0));
1205 1195
@@ -1217,9 +1207,6 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1217 1207
1218 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1208 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1219 1209
1220 lock_buffer(path[level].bp_bh);
1221 lock_buffer(path[level].bp_sib_bh);
1222
1223 node = nilfs_btree_get_nonroot_node(path, level); 1210 node = nilfs_btree_get_nonroot_node(path, level);
1224 right = nilfs_btree_get_sib_node(path, level); 1211 right = nilfs_btree_get_sib_node(path, level);
1225 nchildren = nilfs_btree_node_get_nchildren(node); 1212 nchildren = nilfs_btree_node_get_nchildren(node);
@@ -1234,9 +1221,6 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1234 if (!buffer_dirty(path[level].bp_sib_bh)) 1221 if (!buffer_dirty(path[level].bp_sib_bh))
1235 nilfs_btnode_mark_dirty(path[level].bp_sib_bh); 1222 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1236 1223
1237 unlock_buffer(path[level].bp_bh);
1238 unlock_buffer(path[level].bp_sib_bh);
1239
1240 path[level + 1].bp_index++; 1224 path[level + 1].bp_index++;
1241 nilfs_btree_promote_key(btree, path, level + 1, 1225 nilfs_btree_promote_key(btree, path, level + 1,
1242 nilfs_btree_node_get_key(right, 0)); 1226 nilfs_btree_node_get_key(right, 0));
@@ -1255,9 +1239,6 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
1255 1239
1256 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1240 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1257 1241
1258 lock_buffer(path[level].bp_bh);
1259 lock_buffer(path[level].bp_sib_bh);
1260
1261 node = nilfs_btree_get_nonroot_node(path, level); 1242 node = nilfs_btree_get_nonroot_node(path, level);
1262 left = nilfs_btree_get_sib_node(path, level); 1243 left = nilfs_btree_get_sib_node(path, level);
1263 1244
@@ -1268,9 +1249,6 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
1268 if (!buffer_dirty(path[level].bp_sib_bh)) 1249 if (!buffer_dirty(path[level].bp_sib_bh))
1269 nilfs_btnode_mark_dirty(path[level].bp_sib_bh); 1250 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1270 1251
1271 unlock_buffer(path[level].bp_bh);
1272 unlock_buffer(path[level].bp_sib_bh);
1273
1274 nilfs_btnode_delete(path[level].bp_bh); 1252 nilfs_btnode_delete(path[level].bp_bh);
1275 path[level].bp_bh = path[level].bp_sib_bh; 1253 path[level].bp_bh = path[level].bp_sib_bh;
1276 path[level].bp_sib_bh = NULL; 1254 path[level].bp_sib_bh = NULL;
@@ -1286,9 +1264,6 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
1286 1264
1287 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1265 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1288 1266
1289 lock_buffer(path[level].bp_bh);
1290 lock_buffer(path[level].bp_sib_bh);
1291
1292 node = nilfs_btree_get_nonroot_node(path, level); 1267 node = nilfs_btree_get_nonroot_node(path, level);
1293 right = nilfs_btree_get_sib_node(path, level); 1268 right = nilfs_btree_get_sib_node(path, level);
1294 1269
@@ -1299,9 +1274,6 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
1299 if (!buffer_dirty(path[level].bp_bh)) 1274 if (!buffer_dirty(path[level].bp_bh))
1300 nilfs_btnode_mark_dirty(path[level].bp_bh); 1275 nilfs_btnode_mark_dirty(path[level].bp_bh);
1301 1276
1302 unlock_buffer(path[level].bp_bh);
1303 unlock_buffer(path[level].bp_sib_bh);
1304
1305 nilfs_btnode_delete(path[level].bp_sib_bh); 1277 nilfs_btnode_delete(path[level].bp_sib_bh);
1306 path[level].bp_sib_bh = NULL; 1278 path[level].bp_sib_bh = NULL;
1307 path[level + 1].bp_index++; 1279 path[level + 1].bp_index++;
@@ -1316,7 +1288,6 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
1316 1288
1317 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1289 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1318 1290
1319 lock_buffer(path[level].bp_bh);
1320 root = nilfs_btree_get_root(btree); 1291 root = nilfs_btree_get_root(btree);
1321 child = nilfs_btree_get_nonroot_node(path, level); 1292 child = nilfs_btree_get_nonroot_node(path, level);
1322 1293
@@ -1324,7 +1295,6 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
1324 nilfs_btree_node_set_level(root, level); 1295 nilfs_btree_node_set_level(root, level);
1325 n = nilfs_btree_node_get_nchildren(child); 1296 n = nilfs_btree_node_get_nchildren(child);
1326 nilfs_btree_node_move_left(btree, root, child, n); 1297 nilfs_btree_node_move_left(btree, root, child, n);
1327 unlock_buffer(path[level].bp_bh);
1328 1298
1329 nilfs_btnode_delete(path[level].bp_bh); 1299 nilfs_btnode_delete(path[level].bp_bh);
1330 path[level].bp_bh = NULL; 1300 path[level].bp_bh = NULL;
@@ -1699,7 +1669,6 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1699 nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat); 1669 nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat);
1700 1670
1701 /* create child node at level 1 */ 1671 /* create child node at level 1 */
1702 lock_buffer(bh);
1703 node = (struct nilfs_btree_node *)bh->b_data; 1672 node = (struct nilfs_btree_node *)bh->b_data;
1704 nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs); 1673 nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs);
1705 nilfs_btree_node_insert(btree, node, 1674 nilfs_btree_node_insert(btree, node,
@@ -1709,7 +1678,6 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1709 if (!nilfs_bmap_dirty(bmap)) 1678 if (!nilfs_bmap_dirty(bmap))
1710 nilfs_bmap_set_dirty(bmap); 1679 nilfs_bmap_set_dirty(bmap);
1711 1680
1712 unlock_buffer(bh);
1713 brelse(bh); 1681 brelse(bh);
1714 1682
1715 /* create root node at level 2 */ 1683 /* create root node at level 2 */
@@ -2050,7 +2018,7 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap,
2050 for (level = NILFS_BTREE_LEVEL_NODE_MIN; 2018 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
2051 level < NILFS_BTREE_LEVEL_MAX; 2019 level < NILFS_BTREE_LEVEL_MAX;
2052 level++) 2020 level++)
2053 list_splice(&lists[level], listp->prev); 2021 list_splice_tail(&lists[level], listp);
2054} 2022}
2055 2023
2056static int nilfs_btree_assign_p(struct nilfs_btree *btree, 2024static int nilfs_btree_assign_p(struct nilfs_btree *btree,
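Besides dropping the buffer locks, the btree.c hunks replace list_splice(&lists[level], listp->prev) with list_splice_tail(&lists[level], listp); the two are equivalent, since splicing right after listp->prev is exactly splicing at the tail of the list headed by listp. A minimal userspace sketch of that equivalence, with a hand-rolled stand-in for the kernel's struct list_head (all names below are illustrative, not from the patch):

#include <stdio.h>

/* Toy doubly linked list mimicking the kernel's struct list_head. */
struct list_head { struct list_head *next, *prev; };

static void init_list(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *n, struct list_head *head)
{
    n->prev = head->prev;
    n->next = head;
    head->prev->next = n;
    head->prev = n;
}

/* Insert all nodes of @src between @prev and @next. */
static void splice(struct list_head *src, struct list_head *prev,
                   struct list_head *next)
{
    if (src->next == src)
        return;                 /* empty source list */
    src->next->prev = prev;
    prev->next = src->next;
    src->prev->next = next;
    next->prev = src->prev;
}

/* list_splice(src, pos): insert right after @pos. */
static void my_list_splice(struct list_head *src, struct list_head *pos)
{
    splice(src, pos, pos->next);
}

/* list_splice_tail(src, head): insert just before @head, i.e. at the tail. */
static void my_list_splice_tail(struct list_head *src, struct list_head *head)
{
    splice(src, head->prev, head);
}

struct item { struct list_head link; int v; };

int main(void)
{
    struct item a = { .v = 1 }, b = { .v = 2 }, c = { .v = 3 };
    struct list_head head, src, *p;

    init_list(&head);
    init_list(&src);
    list_add_tail(&a.link, &head);      /* head: [1]    */
    list_add_tail(&b.link, &src);
    list_add_tail(&c.link, &src);       /* src:  [2, 3] */

    /* Old form: my_list_splice(&src, head.prev);
     * New form below; both yield head: [1, 2, 3]. */
    my_list_splice_tail(&src, &head);

    for (p = head.next; p != &head; p = p->next)
        printf("%d ", ((struct item *)p)->v);   /* prints: 1 2 3 */
    printf("\n");
    return 0;
}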
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 0e72bbbc6b64..4b82d84ade75 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -34,28 +34,6 @@ struct nilfs_btree;
 struct nilfs_btree_path;

 /**
- * struct nilfs_btree_node - B-tree node
- * @bn_flags: flags
- * @bn_level: level
- * @bn_nchildren: number of children
- * @bn_pad: padding
- */
-struct nilfs_btree_node {
-    __u8 bn_flags;
-    __u8 bn_level;
-    __le16 bn_nchildren;
-    __le32 bn_pad;
-};
-
-/* flags */
-#define NILFS_BTREE_NODE_ROOT 0x01
-
-/* level */
-#define NILFS_BTREE_LEVEL_DATA 0
-#define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1)
-#define NILFS_BTREE_LEVEL_MAX 14
-
-/**
  * struct nilfs_btree - B-tree structure
  * @bt_bmap: bmap base structure
  */
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 3f5d5d06f53c..18737818db63 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -328,19 +328,24 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
             tnicps += nicps;
             nilfs_mdt_mark_buffer_dirty(cp_bh);
             nilfs_mdt_mark_dirty(cpfile);
-            if (!nilfs_cpfile_is_in_first(cpfile, cno) &&
-                (count = nilfs_cpfile_block_sub_valid_checkpoints(
-                    cpfile, cp_bh, kaddr, nicps)) == 0) {
-                /* make hole */
-                kunmap_atomic(kaddr, KM_USER0);
-                brelse(cp_bh);
-                ret = nilfs_cpfile_delete_checkpoint_block(
-                    cpfile, cno);
-                if (ret == 0)
-                    continue;
-                printk(KERN_ERR "%s: cannot delete block\n",
-                       __func__);
-                break;
+            if (!nilfs_cpfile_is_in_first(cpfile, cno)) {
+                count =
+                    nilfs_cpfile_block_sub_valid_checkpoints(
+                        cpfile, cp_bh, kaddr, nicps);
+                if (count == 0) {
+                    /* make hole */
+                    kunmap_atomic(kaddr, KM_USER0);
+                    brelse(cp_bh);
+                    ret =
+                        nilfs_cpfile_delete_checkpoint_block(
+                            cpfile, cno);
+                    if (ret == 0)
+                        continue;
+                    printk(KERN_ERR
+                           "%s: cannot delete block\n",
+                           __func__);
+                    break;
+                }
             }
         }

@@ -926,3 +931,29 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
     up_read(&NILFS_MDT(cpfile)->mi_sem);
     return ret;
 }
+
+/**
+ * nilfs_cpfile_read - read cpfile inode
+ * @cpfile: cpfile inode
+ * @raw_inode: on-disk cpfile inode
+ */
+int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode)
+{
+    return nilfs_read_inode_common(cpfile, raw_inode);
+}
+
+/**
+ * nilfs_cpfile_new - create cpfile
+ * @nilfs: nilfs object
+ * @cpsize: size of a checkpoint entry
+ */
+struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize)
+{
+    struct inode *cpfile;
+
+    cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO, 0);
+    if (cpfile)
+        nilfs_mdt_set_entry_size(cpfile, cpsize,
+                     sizeof(struct nilfs_cpfile_header));
+    return cpfile;
+}
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index debea896e701..bc0809e0ab43 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -40,4 +40,7 @@ int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
 ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
                 size_t);

+int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode);
+struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize);
+
 #endif	/* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 1ff8e15bd36b..013146755683 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -33,6 +33,16 @@
 #define NILFS_CNO_MIN	((__u64)1)
 #define NILFS_CNO_MAX	(~(__u64)0)

+struct nilfs_dat_info {
+    struct nilfs_mdt_info mi;
+    struct nilfs_palloc_cache palloc_cache;
+};
+
+static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat)
+{
+    return (struct nilfs_dat_info *)NILFS_MDT(dat);
+}
+
 static int nilfs_dat_prepare_entry(struct inode *dat,
                    struct nilfs_palloc_req *req, int create)
 {
@@ -278,7 +288,7 @@ int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr)
  * @vblocknrs and @nitems.
  *
  * Return Value: On success, 0 is returned. On error, one of the following
- * nagative error codes is returned.
+ * negative error codes is returned.
  *
  * %-EIO - I/O error.
  *
@@ -378,8 +388,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
         ret = -ENOENT;
         goto out;
     }
-    if (blocknrp != NULL)
-        *blocknrp = blocknr;
+    *blocknrp = blocknr;

  out:
     kunmap_atomic(kaddr, KM_USER0);
@@ -425,3 +434,40 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,

     return nvi;
 }
+
+/**
+ * nilfs_dat_read - read dat inode
+ * @dat: dat inode
+ * @raw_inode: on-disk dat inode
+ */
+int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode)
+{
+    return nilfs_read_inode_common(dat, raw_inode);
+}
+
+/**
+ * nilfs_dat_new - create dat file
+ * @nilfs: nilfs object
+ * @entry_size: size of a dat entry
+ */
+struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size)
+{
+    static struct lock_class_key dat_lock_key;
+    struct inode *dat;
+    struct nilfs_dat_info *di;
+    int err;
+
+    dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO, sizeof(*di));
+    if (dat) {
+        err = nilfs_palloc_init_blockgroup(dat, entry_size);
+        if (unlikely(err)) {
+            nilfs_mdt_destroy(dat);
+            return NULL;
+        }
+
+        di = NILFS_DAT_I(dat);
+        lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
+        nilfs_palloc_setup_cache(dat, &di->palloc_cache);
+    }
+    return dat;
+}
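nilfs_dat_new() above relies on a container idiom: struct nilfs_dat_info embeds the generic struct nilfs_mdt_info as its first member, nilfs_mdt_new() is asked to allocate sizeof(*di) bytes through its new size argument, and NILFS_DAT_I() recovers the private struct by casting the base pointer. A userspace sketch of that idiom (the type and function names below are stand-ins, not the nilfs API):

#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for the nilfs types; the real ones live in mdt.h and dat.c. */
struct mdt_info { unsigned entry_size; };

struct dat_info {
    struct mdt_info mi;     /* base object, must stay the first member */
    int palloc_cache;       /* private extension, as in nilfs_dat_info */
};

/* The constructor only knows the base type but is told the full size,
 * mirroring nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO, sizeof(*di)). */
static struct mdt_info *mdt_new(size_t objsz)
{
    if (objsz < sizeof(struct mdt_info))
        objsz = sizeof(struct mdt_info);    /* like kzalloc(max(...)) */
    return calloc(1, objsz);
}

/* Mirrors NILFS_DAT_I(): the downcast is safe because mi is first. */
static struct dat_info *DAT_I(struct mdt_info *mi)
{
    return (struct dat_info *)mi;
}

int main(void)
{
    struct mdt_info *mi = mdt_new(sizeof(struct dat_info));
    struct dat_info *di;

    if (!mi)
        return 1;
    di = DAT_I(mi);
    di->palloc_cache = 42;
    printf("%u %d\n", di->mi.entry_size, di->palloc_cache);  /* 0 42 */
    free(mi);
    return 0;
}

The ifile hunks below use the same layout rule (struct nilfs_ifile_info / NILFS_IFILE_I), and the mdt.c hunks supply the allocation half with kzalloc(max(sizeof(*mi), objsz), GFP_NOFS).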
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index 406070d3ff49..d31c3aab0efe 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -53,4 +53,7 @@ int nilfs_dat_freev(struct inode *, __u64 *, size_t);
 int nilfs_dat_move(struct inode *, __u64, sector_t);
 ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);

+int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode);
+struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size);
+
 #endif	/* _NILFS_DAT_H */
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index e097099bfc8f..85c89dfc71f0 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -99,9 +99,9 @@ static int nilfs_prepare_chunk(struct page *page,
                   NULL, nilfs_get_block);
 }

-static int nilfs_commit_chunk(struct page *page,
-                  struct address_space *mapping,
-                  unsigned from, unsigned to)
+static void nilfs_commit_chunk(struct page *page,
+                   struct address_space *mapping,
+                   unsigned from, unsigned to)
 {
     struct inode *dir = mapping->host;
     struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb);
@@ -112,15 +112,13 @@ static int nilfs_commit_chunk(struct page *page,

     nr_dirty = nilfs_page_count_clean_buffers(page, from, to);
     copied = block_write_end(NULL, mapping, pos, len, len, page, NULL);
-    if (pos + copied > dir->i_size) {
+    if (pos + copied > dir->i_size)
         i_size_write(dir, pos + copied);
-        mark_inode_dirty(dir);
-    }
     if (IS_DIRSYNC(dir))
         nilfs_set_transaction_flag(NILFS_TI_SYNC);
     err = nilfs_set_file_dirty(sbi, dir, nr_dirty);
+    WARN_ON(err); /* do not happen */
     unlock_page(page);
-    return err;
 }

 static void nilfs_check_page(struct page *page)
@@ -226,7 +224,7 @@ fail:
  * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller.
  */
 static int
-nilfs_match(int len, const char * const name, struct nilfs_dir_entry *de)
+nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de)
 {
     if (len != de->name_len)
         return 0;
@@ -351,11 +349,11 @@ done:
  * Entry is guaranteed to be valid.
  */
 struct nilfs_dir_entry *
-nilfs_find_entry(struct inode *dir, struct dentry *dentry,
+nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
          struct page **res_page)
 {
-    const char *name = dentry->d_name.name;
-    int namelen = dentry->d_name.len;
+    const unsigned char *name = qstr->name;
+    int namelen = qstr->len;
     unsigned reclen = NILFS_DIR_REC_LEN(namelen);
     unsigned long start, n;
     unsigned long npages = dir_pages(dir);
@@ -398,7 +396,7 @@ nilfs_find_entry(struct inode *dir, struct dentry *dentry,
     /* next page is past the blocks we've got */
     if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
         nilfs_error(dir->i_sb, __func__,
-                "dir %lu size %lld exceeds block cout %llu",
+                "dir %lu size %lld exceeds block count %llu",
                 dir->i_ino, dir->i_size,
                 (unsigned long long)dir->i_blocks);
         goto out;
@@ -426,13 +424,13 @@ struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p)
     return de;
 }

-ino_t nilfs_inode_by_name(struct inode *dir, struct dentry *dentry)
+ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
 {
     ino_t res = 0;
     struct nilfs_dir_entry *de;
     struct page *page;

-    de = nilfs_find_entry(dir, dentry, &page);
+    de = nilfs_find_entry(dir, qstr, &page);
     if (de) {
         res = le64_to_cpu(de->inode);
         kunmap(page);
@@ -455,11 +453,10 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
     BUG_ON(err);
     de->inode = cpu_to_le64(inode->i_ino);
     nilfs_set_de_type(de, inode);
-    err = nilfs_commit_chunk(page, mapping, from, to);
+    nilfs_commit_chunk(page, mapping, from, to);
     nilfs_put_page(page);
     dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 /*	NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
-    mark_inode_dirty(dir);
 }

 /*
@@ -468,7 +465,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
 int nilfs_add_link(struct dentry *dentry, struct inode *inode)
 {
     struct inode *dir = dentry->d_parent->d_inode;
-    const char *name = dentry->d_name.name;
+    const unsigned char *name = dentry->d_name.name;
     int namelen = dentry->d_name.len;
     unsigned chunk_size = nilfs_chunk_size(dir);
     unsigned reclen = NILFS_DIR_REC_LEN(namelen);
@@ -548,10 +545,10 @@ got_it:
     memcpy(de->name, name, namelen);
     de->inode = cpu_to_le64(inode->i_ino);
     nilfs_set_de_type(de, inode);
-    err = nilfs_commit_chunk(page, page->mapping, from, to);
+    nilfs_commit_chunk(page, page->mapping, from, to);
     dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 /*	NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
-    mark_inode_dirty(dir);
+    nilfs_mark_inode_dirty(dir);
     /* OFFSET_CACHE */
 out_put:
     nilfs_put_page(page);
@@ -595,10 +592,9 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
     if (pde)
         pde->rec_len = cpu_to_le16(to - from);
     dir->inode = 0;
-    err = nilfs_commit_chunk(page, mapping, from, to);
+    nilfs_commit_chunk(page, mapping, from, to);
     inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 /*	NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */
-    mark_inode_dirty(inode);
 out:
     nilfs_put_page(page);
     return err;
@@ -640,7 +636,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
     memcpy(de->name, "..\0", 4);
     nilfs_set_de_type(de, inode);
     kunmap_atomic(kaddr, KM_USER0);
-    err = nilfs_commit_chunk(page, mapping, 0, chunk_size);
+    nilfs_commit_chunk(page, mapping, 0, chunk_size);
 fail:
     page_cache_release(page);
     return err;
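The dir.c hunks above narrow nilfs_find_entry()/nilfs_inode_by_name() to take a const struct qstr * (a counted name) instead of a whole dentry, which lets a caller fabricate a name on the stack, as the namei.c hunks later do for "..". An illustrative userspace sketch of the idea (struct qstr mirrors the kernel structure; everything else below is a stand-in):

#include <stdio.h>
#include <string.h>

/* Stand-in for the kernel's struct qstr: a counted, not NUL-delimited name. */
struct qstr {
    const unsigned char *name;
    unsigned int len;
};

/* Shape of the reworked nilfs_match(): compare a counted name against a
 * directory entry's name/len pair. */
static int match(const struct qstr *q, const unsigned char *de_name,
                 unsigned int de_len)
{
    return q->len == de_len && memcmp(q->name, de_name, de_len) == 0;
}

int main(void)
{
    /* Mirrors nilfs_get_parent() after the patch: a stack qstr replaces
     * the old dummy struct dentry that carried only a name. */
    struct qstr dotdot = { .name = (const unsigned char *)"..", .len = 2 };

    printf("%d\n", match(&dotdot, (const unsigned char *)"..", 2)); /* 1 */
    printf("%d\n", match(&dotdot, (const unsigned char *)".", 1));  /* 0 */
    return 0;
}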
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index d369ac718277..236753df5cdf 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -51,11 +51,11 @@ static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
     struct nilfs_direct *direct;
     __u64 ptr;

-    direct = (struct nilfs_direct *)bmap;
-    if ((key > NILFS_DIRECT_KEY_MAX) ||
-        (level != 1) ||	/* XXX: use macro for level 1 */
-        ((ptr = nilfs_direct_get_ptr(direct, key)) ==
-         NILFS_BMAP_INVALID_PTR))
+    direct = (struct nilfs_direct *)bmap;  /* XXX: use macro for level 1 */
+    if (key > NILFS_DIRECT_KEY_MAX || level != 1)
+        return -ENOENT;
+    ptr = nilfs_direct_get_ptr(direct, key);
+    if (ptr == NILFS_BMAP_INVALID_PTR)
         return -ENOENT;

     if (ptrp != NULL)
@@ -73,9 +73,10 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
     sector_t blocknr;
     int ret, cnt;

-    if (key > NILFS_DIRECT_KEY_MAX ||
-        (ptr = nilfs_direct_get_ptr(direct, key)) ==
-        NILFS_BMAP_INVALID_PTR)
+    if (key > NILFS_DIRECT_KEY_MAX)
+        return -ENOENT;
+    ptr = nilfs_direct_get_ptr(direct, key);
+    if (ptr == NILFS_BMAP_INVALID_PTR)
         return -ENOENT;

     if (NILFS_BMAP_USE_VBN(bmap)) {
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
index 93383c5cee90..dd5f7e0a95f6 100644
--- a/fs/nilfs2/gcdat.c
+++ b/fs/nilfs2/gcdat.c
@@ -61,6 +61,8 @@ void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs)

     nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);

+    nilfs_palloc_clear_cache(dat);
+    nilfs_palloc_clear_cache(gcdat);
     nilfs_clear_dirty_pages(mapping);
     nilfs_copy_back_pages(mapping, gmapping);
     /* note: mdt dirty flags should be cleared by segctor. */
@@ -79,6 +81,7 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
     gcdat->i_state = I_CLEAR;
     gii->i_flags = 0;

+    nilfs_palloc_clear_cache(gcdat);
     truncate_inode_pages(gcdat->i_mapping, 0);
     truncate_inode_pages(&gii->i_btnode_cache, 0);
 }
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index e6de0a27ab5d..145f03cd7d3e 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -28,10 +28,10 @@
  * gcinodes), and this file provides lookup function of the dummy
  * inodes and their buffer read function.
  *
- * Since NILFS2 keeps up multiple checkpoints/snapshots accross GC, it
+ * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it
  * has to treat blocks that belong to a same file but have different
  * checkpoint numbers.  To avoid interference among generations, dummy
- * inodes are managed separatly from actual inodes, and their lookup
+ * inodes are managed separately from actual inodes, and their lookup
  * function (nilfs_gc_iget) is designed to be specified with a
  * checkpoint number argument as well as an inode number.
  *
@@ -45,6 +45,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
 #include <linux/hash.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 #include "nilfs.h"
 #include "page.h"
@@ -149,7 +150,7 @@ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
             __u64 vbn, struct buffer_head **out_bh)
 {
     int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
-                        vbn ? : pbn, pbn, out_bh, 0);
+                        vbn ? : pbn, pbn, out_bh);
     if (ret == -EEXIST) /* internal code (cache hit) */
         ret = 0;
     return ret;
@@ -212,9 +213,10 @@ void nilfs_destroy_gccache(struct the_nilfs *nilfs)
 static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
                    __u64 cno)
 {
-    struct inode *inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS);
+    struct inode *inode;
     struct nilfs_inode_info *ii;

+    inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS, 0);
     if (!inode)
         return NULL;

@@ -265,7 +267,6 @@ struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno)
  */
 void nilfs_clear_gcinode(struct inode *inode)
 {
-    nilfs_mdt_clear(inode);
     nilfs_mdt_destroy(inode);
 }

diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index de86401f209f..922d9dd42c8f 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -29,6 +29,17 @@
 #include "alloc.h"
 #include "ifile.h"

+
+struct nilfs_ifile_info {
+    struct nilfs_mdt_info mi;
+    struct nilfs_palloc_cache palloc_cache;
+};
+
+static inline struct nilfs_ifile_info *NILFS_IFILE_I(struct inode *ifile)
+{
+    return (struct nilfs_ifile_info *)NILFS_MDT(ifile);
+}
+
 /**
  * nilfs_ifile_create_inode - create a new disk inode
  * @ifile: ifile inode
@@ -148,3 +159,27 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
     }
     return err;
 }
+
+/**
+ * nilfs_ifile_new - create inode file
+ * @sbi: nilfs_sb_info struct
+ * @inode_size: size of an inode
+ */
+struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size)
+{
+    struct inode *ifile;
+    int err;
+
+    ifile = nilfs_mdt_new(sbi->s_nilfs, sbi->s_super, NILFS_IFILE_INO,
+                  sizeof(struct nilfs_ifile_info));
+    if (ifile) {
+        err = nilfs_palloc_init_blockgroup(ifile, inode_size);
+        if (unlikely(err)) {
+            nilfs_mdt_destroy(ifile);
+            return NULL;
+        }
+        nilfs_palloc_setup_cache(ifile,
+                     &NILFS_IFILE_I(ifile)->palloc_cache);
+    }
+    return ifile;
+}
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index ecc3ba76db47..cbca32e498f2 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,4 +49,6 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
 int nilfs_ifile_delete_inode(struct inode *, ino_t);
 int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);

+struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size);
+
 #endif	/* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2a0a5a3ac134..0957b58f909d 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -22,6 +22,7 @@
  */

 #include <linux/buffer_head.h>
+#include <linux/gfp.h>
 #include <linux/mpage.h>
 #include <linux/writeback.h>
 #include <linux/uio.h>
@@ -97,6 +98,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
             nilfs_transaction_abort(inode->i_sb);
             goto out;
         }
+        nilfs_mark_inode_dirty(inode);
         nilfs_transaction_commit(inode->i_sb); /* never fails */
         /* Error handling should be detailed */
         set_buffer_new(bh_result);
@@ -322,7 +324,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
        nilfs_init_acl(), proper cancellation of
        above jobs should be considered */

-    mark_inode_dirty(inode);
     return inode;

  failed_acl:
@@ -525,7 +526,6 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)

     raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh);

-    /* The buffer is guarded with lock_buffer() by the caller */
     if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
         memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size);
     set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
@@ -599,6 +599,7 @@ void nilfs_truncate(struct inode *inode)
     if (IS_SYNC(inode))
         nilfs_set_transaction_flag(NILFS_TI_SYNC);

+    nilfs_mark_inode_dirty(inode);
     nilfs_set_file_dirty(NILFS_SB(sb), inode, 0);
     nilfs_transaction_commit(sb);
     /* May construct a logical segment and may fail in sync mode.
@@ -623,6 +624,7 @@ void nilfs_delete_inode(struct inode *inode)
     truncate_inode_pages(&inode->i_data, 0);

     nilfs_truncate_bmap(ii, 0);
+    nilfs_mark_inode_dirty(inode);
     nilfs_free_inode(inode);
     /* nilfs_free_inode() marks inode buffer dirty */
     if (IS_SYNC(inode))
@@ -745,9 +747,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
                 "failed to reget inode block.\n");
         return err;
     }
-    lock_buffer(ibh);
     nilfs_update_inode(inode, ibh);
-    unlock_buffer(ibh);
     nilfs_mdt_mark_buffer_dirty(ibh);
     nilfs_mdt_mark_dirty(sbi->s_ifile);
     brelse(ibh);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index f6af76042d80..c2ff1b306012 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -23,9 +23,11 @@
 #include <linux/fs.h>
 #include <linux/wait.h>
 #include <linux/smp_lock.h>	/* lock_kernel(), unlock_kernel() */
+#include <linux/slab.h>
 #include <linux/capability.h>	/* capable() */
 #include <linux/uaccess.h>	/* copy_from_user(), copy_to_user() */
 #include <linux/vmalloc.h>
+#include <linux/mount.h>	/* mnt_want_write(), mnt_drop_write() */
 #include <linux/nilfs2_fs.h>
 #include "nilfs.h"
 #include "segment.h"
@@ -107,20 +109,28 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,

     if (!capable(CAP_SYS_ADMIN))
         return -EPERM;
+
+    ret = mnt_want_write(filp->f_path.mnt);
+    if (ret)
+        return ret;
+
+    ret = -EFAULT;
     if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
-        return -EFAULT;
+        goto out;

     mutex_lock(&nilfs->ns_mount_mutex);
+
     nilfs_transaction_begin(inode->i_sb, &ti, 0);
     ret = nilfs_cpfile_change_cpmode(
         cpfile, cpmode.cm_cno, cpmode.cm_mode);
-    if (unlikely(ret < 0)) {
+    if (unlikely(ret < 0))
         nilfs_transaction_abort(inode->i_sb);
-        mutex_unlock(&nilfs->ns_mount_mutex);
-        return ret;
-    }
-    nilfs_transaction_commit(inode->i_sb); /* never fails */
+    else
+        nilfs_transaction_commit(inode->i_sb); /* never fails */
+
     mutex_unlock(&nilfs->ns_mount_mutex);
+out:
+    mnt_drop_write(filp->f_path.mnt);
     return ret;
 }

@@ -135,16 +145,23 @@ nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,

     if (!capable(CAP_SYS_ADMIN))
         return -EPERM;
+
+    ret = mnt_want_write(filp->f_path.mnt);
+    if (ret)
+        return ret;
+
+    ret = -EFAULT;
     if (copy_from_user(&cno, argp, sizeof(cno)))
-        return -EFAULT;
+        goto out;

     nilfs_transaction_begin(inode->i_sb, &ti, 0);
     ret = nilfs_cpfile_delete_checkpoint(cpfile, cno);
-    if (unlikely(ret < 0)) {
+    if (unlikely(ret < 0))
         nilfs_transaction_abort(inode->i_sb);
-        return ret;
-    }
-    nilfs_transaction_commit(inode->i_sb); /* never fails */
+    else
+        nilfs_transaction_commit(inode->i_sb); /* never fails */
+out:
+    mnt_drop_write(filp->f_path.mnt);
     return ret;
 }

@@ -480,7 +497,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
                       unsigned int cmd, void __user *argp)
 {
     struct nilfs_argv argv[5];
-    const static size_t argsz[5] = {
+    static const size_t argsz[5] = {
         sizeof(struct nilfs_vdesc),
         sizeof(struct nilfs_period),
         sizeof(__u64),
@@ -496,12 +513,19 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
     if (!capable(CAP_SYS_ADMIN))
         return -EPERM;

+    ret = mnt_want_write(filp->f_path.mnt);
+    if (ret)
+        return ret;
+
+    ret = -EFAULT;
     if (copy_from_user(argv, argp, sizeof(argv)))
-        return -EFAULT;
+        goto out;

+    ret = -EINVAL;
     nsegs = argv[4].v_nmembs;
     if (argv[4].v_size != argsz[4])
-        return -EINVAL;
+        goto out;
+
     /*
      * argv[4] points to segment numbers this ioctl cleans.  We
      * use kmalloc() for its buffer because memory used for the
@@ -509,9 +533,10 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
      */
     kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base,
                    nsegs * sizeof(__u64));
-    if (IS_ERR(kbufs[4]))
-        return PTR_ERR(kbufs[4]);
-
+    if (IS_ERR(kbufs[4])) {
+        ret = PTR_ERR(kbufs[4]);
+        goto out;
+    }
     nilfs = NILFS_SB(inode->i_sb)->s_nilfs;

     for (n = 0; n < 4; n++) {
@@ -563,10 +588,12 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
     nilfs_remove_all_gcinode(nilfs);
     clear_nilfs_gc_running(nilfs);

- out_free:
+out_free:
     while (--n >= 0)
         vfree(kbufs[n]);
     kfree(kbufs[4]);
+out:
+    mnt_drop_write(filp->f_path.mnt);
     return ret;
 }

@@ -575,13 +602,17 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
 {
     __u64 cno;
     int ret;
+    struct the_nilfs *nilfs;

     ret = nilfs_construct_segment(inode->i_sb);
     if (ret < 0)
         return ret;

     if (argp != NULL) {
-        cno = NILFS_SB(inode->i_sb)->s_nilfs->ns_cno - 1;
+        nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+        down_read(&nilfs->ns_segctor_sem);
+        cno = nilfs->ns_cno - 1;
+        up_read(&nilfs->ns_segctor_sem);
         if (copy_to_user(argp, &cno, sizeof(cno)))
             return -EFAULT;
     }
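Each modifying ioctl above now brackets its work with mnt_want_write()/mnt_drop_write() and funnels every failure through a single out label, so the write reference on the mount cannot leak on an error path. A compile-and-run sketch of that control-flow shape, with userspace stubs standing in for the kernel helpers (only the helper names come from the patch; the bodies below are fakes):

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Userspace stand-ins for the kernel helpers named in the patch; the real
 * mnt_want_write()/mnt_drop_write() pin the mount writable. */
static int mnt_want_write(void *mnt) { (void)mnt; return 0; }
static void mnt_drop_write(void *mnt) { (void)mnt; }

/* Returns nonzero on "fault", like the kernel's copy_from_user(). */
static unsigned long copy_from_user(void *dst, const void *src, size_t n)
{
    if (!src)
        return n;       /* simulate a faulting user pointer */
    memcpy(dst, src, n);
    return 0;
}

/* Shape of nilfs_ioctl_delete_checkpoint() after the patch: take the write
 * reference first, then route every failure through "out" so the reference
 * is always dropped exactly once. */
static int ioctl_delete_checkpoint(void *mnt, const void *argp)
{
    unsigned long long cno;
    int ret;

    ret = mnt_want_write(mnt);
    if (ret)
        return ret;

    ret = -EFAULT;
    if (copy_from_user(&cno, argp, sizeof(cno)))
        goto out;

    printf("deleting checkpoint %llu\n", cno); /* transaction body */
    ret = 0;
out:
    mnt_drop_write(mnt);
    return ret;
}

int main(void)
{
    unsigned long long cno = 7;

    printf("ok path -> %d\n", ioctl_delete_checkpoint(NULL, &cno));
    printf("fault path -> %d\n", ioctl_delete_checkpoint(NULL, NULL));
    return 0;
}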
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index f6326112d647..024be8c35bb6 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -26,6 +26,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/swap.h>
+#include <linux/slab.h>
 #include "nilfs.h"
 #include "segment.h"
 #include "page.h"
@@ -186,7 +187,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
 }

 static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
-                struct buffer_head **out_bh)
+                int readahead, struct buffer_head **out_bh)
 {
     struct buffer_head *first_bh, *bh;
     unsigned long blkoff;
@@ -200,16 +201,18 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
     if (unlikely(err))
         goto failed;

-    blkoff = block + 1;
-    for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
-        err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
-        if (likely(!err || err == -EEXIST))
-            brelse(bh);
-        else if (err != -EBUSY)
-            break; /* abort readahead if bmap lookup failed */
-
-        if (!buffer_locked(first_bh))
-            goto out_no_wait;
+    if (readahead) {
+        blkoff = block + 1;
+        for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
+            err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
+            if (likely(!err || err == -EEXIST))
+                brelse(bh);
+            else if (err != -EBUSY)
+                break;
+                /* abort readahead if bmap lookup failed */
+            if (!buffer_locked(first_bh))
+                goto out_no_wait;
+        }
     }

     wait_on_buffer(first_bh);
@@ -263,7 +266,7 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,

     /* Should be rewritten with merging nilfs_mdt_read_block() */
  retry:
-    ret = nilfs_mdt_read_block(inode, blkoff, out_bh);
+    ret = nilfs_mdt_read_block(inode, blkoff, !create, out_bh);
     if (!create || ret != -ENOENT)
         return ret;

@@ -371,7 +374,7 @@ int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
     struct buffer_head *bh;
     int err;

-    err = nilfs_mdt_read_block(inode, block, &bh);
+    err = nilfs_mdt_read_block(inode, block, 0, &bh);
     if (unlikely(err))
         return err;
     nilfs_mark_buffer_dirty(bh);
@@ -445,9 +448,17 @@ static const struct file_operations def_mdt_fops;
  * longer than those of the super block structs; they may continue for
  * several consecutive mounts/umounts.  This would need discussions.
  */
+/**
+ * nilfs_mdt_new_common - allocate a pseudo inode for metadata file
+ * @nilfs: nilfs object
+ * @sb: super block instance the metadata file belongs to
+ * @ino: inode number
+ * @gfp_mask: gfp mask for data pages
+ * @objsz: size of the private object attached to inode->i_private
+ */
 struct inode *
 nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
-             ino_t ino, gfp_t gfp_mask)
+             ino_t ino, gfp_t gfp_mask, size_t objsz)
 {
     struct inode *inode = nilfs_alloc_inode_common(nilfs);

@@ -455,8 +466,9 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
         return NULL;
     else {
         struct address_space * const mapping = &inode->i_data;
-        struct nilfs_mdt_info *mi = kzalloc(sizeof(*mi), GFP_NOFS);
+        struct nilfs_mdt_info *mi;

+        mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
         if (!mi) {
             nilfs_destroy_inode(inode);
             return NULL;
@@ -513,11 +525,11 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
 }

 struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
-                ino_t ino)
+                ino_t ino, size_t objsz)
 {
-    struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino,
-                           NILFS_MDT_GFP);
+    struct inode *inode;

+    inode = nilfs_mdt_new_common(nilfs, sb, ino, NILFS_MDT_GFP, objsz);
     if (!inode)
         return NULL;

@@ -544,14 +556,15 @@ void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow)
         &NILFS_I(orig)->i_btnode_cache;
 }

-void nilfs_mdt_clear(struct inode *inode)
+static void nilfs_mdt_clear(struct inode *inode)
 {
     struct nilfs_inode_info *ii = NILFS_I(inode);

     invalidate_mapping_pages(inode->i_mapping, 0, -1);
     truncate_inode_pages(inode->i_mapping, 0);

-    nilfs_bmap_clear(ii->i_bmap);
+    if (test_bit(NILFS_I_BMAP, &ii->i_state))
+        nilfs_bmap_clear(ii->i_bmap);
     nilfs_btnode_cache_clear(&ii->i_btnode_cache);
 }

@@ -559,6 +572,10 @@ void nilfs_mdt_destroy(struct inode *inode)
 {
     struct nilfs_mdt_info *mdi = NILFS_MDT(inode);

+    if (mdi->mi_palloc_cache)
+        nilfs_palloc_destroy_cache(inode);
+    nilfs_mdt_clear(inode);
+
     kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
     kfree(mdi);
     nilfs_destroy_inode(inode);
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 431599733c9b..6c4bbb0470fc 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -36,6 +36,7 @@
  * @mi_entry_size: size of an entry
  * @mi_first_entry_offset: offset to the first entry
  * @mi_entries_per_block: number of entries in a block
+ * @mi_palloc_cache: persistent object allocator cache
  * @mi_blocks_per_group: number of blocks in a group
  * @mi_blocks_per_desc_block: number of blocks per descriptor block
  */
@@ -46,6 +47,7 @@ struct nilfs_mdt_info {
     unsigned		mi_entry_size;
     unsigned		mi_first_entry_offset;
     unsigned long		mi_entries_per_block;
+    struct nilfs_palloc_cache *mi_palloc_cache;
     unsigned long		mi_blocks_per_group;
     unsigned long		mi_blocks_per_desc_block;
 };
@@ -74,11 +76,11 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long);
 int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
 int nilfs_mdt_fetch_dirty(struct inode *);

-struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t);
+struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
+                size_t);
 struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
-                   ino_t, gfp_t);
+                   ino_t, gfp_t, size_t);
 void nilfs_mdt_destroy(struct inode *);
-void nilfs_mdt_clear(struct inode *);
 void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
 void nilfs_mdt_set_shadow(struct inode *, struct inode *);

@@ -104,21 +106,4 @@ static inline __u64 nilfs_mdt_cno(struct inode *inode)
 #define nilfs_mdt_bgl_lock(inode, bg) \
     (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock)

-
-static inline int
-nilfs_mdt_read_inode_direct(struct inode *inode, struct buffer_head *bh,
-                unsigned n)
-{
-    return nilfs_read_inode_common(
-        inode, (struct nilfs_inode *)(bh->b_data + n));
-}
-
-static inline void
-nilfs_mdt_write_inode_direct(struct inode *inode, struct buffer_head *bh,
-                 unsigned n)
-{
-    nilfs_write_inode_common(
-        inode, (struct nilfs_inode *)(bh->b_data + n), 1);
-}
-
 #endif	/* _NILFS_MDT_H */
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index ed02e886fa79..ad6ed2cf19b4 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -67,7 +67,7 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
     if (dentry->d_name.len > NILFS_NAME_LEN)
         return ERR_PTR(-ENAMETOOLONG);

-    ino = nilfs_inode_by_name(dir, dentry);
+    ino = nilfs_inode_by_name(dir, &dentry->d_name);
     inode = NULL;
     if (ino) {
         inode = nilfs_iget(dir->i_sb, ino);
@@ -81,10 +81,7 @@ struct dentry *nilfs_get_parent(struct dentry *child)
 {
     unsigned long ino;
     struct inode *inode;
-    struct dentry dotdot;
-
-    dotdot.d_name.name = "..";
-    dotdot.d_name.len = 2;
+    struct qstr dotdot = {.name = "..", .len = 2};

     ino = nilfs_inode_by_name(child->d_inode, &dotdot);
     if (!ino)
@@ -120,7 +117,7 @@ static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode,
         inode->i_op = &nilfs_file_inode_operations;
         inode->i_fop = &nilfs_file_operations;
         inode->i_mapping->a_ops = &nilfs_aops;
-        mark_inode_dirty(inode);
+        nilfs_mark_inode_dirty(inode);
         err = nilfs_add_nondir(dentry, inode);
     }
     if (!err)
@@ -148,7 +145,7 @@ nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
     err = PTR_ERR(inode);
     if (!IS_ERR(inode)) {
         init_special_inode(inode, inode->i_mode, rdev);
-        mark_inode_dirty(inode);
+        nilfs_mark_inode_dirty(inode);
         err = nilfs_add_nondir(dentry, inode);
     }
     if (!err)
@@ -188,7 +185,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
         goto out_fail;

     /* mark_inode_dirty(inode); */
-    /* nilfs_new_inode() and page_symlink() do this */
+    /* page_symlink() do this */

     err = nilfs_add_nondir(dentry, inode);
 out:
@@ -200,7 +197,8 @@ out:
     return err;

 out_fail:
-    inode_dec_link_count(inode);
+    drop_nlink(inode);
+    nilfs_mark_inode_dirty(inode);
     iput(inode);
     goto out;
 }
@@ -245,7 +243,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
     if (err)
         return err;

-    inode_inc_link_count(dir);
+    inc_nlink(dir);

     inode = nilfs_new_inode(dir, S_IFDIR | mode);
     err = PTR_ERR(inode);
@@ -256,7 +254,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
     inode->i_fop = &nilfs_dir_operations;
     inode->i_mapping->a_ops = &nilfs_aops;

-    inode_inc_link_count(inode);
+    inc_nlink(inode);

     err = nilfs_make_empty(inode, dir);
     if (err)
@@ -266,6 +264,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
     if (err)
         goto out_fail;

+    nilfs_mark_inode_dirty(inode);
     d_instantiate(dentry, inode);
 out:
     if (!err)
@@ -276,28 +275,25 @@ out:
     return err;

 out_fail:
-    inode_dec_link_count(inode);
-    inode_dec_link_count(inode);
+    drop_nlink(inode);
+    drop_nlink(inode);
+    nilfs_mark_inode_dirty(inode);
     iput(inode);
 out_dir:
-    inode_dec_link_count(dir);
+    drop_nlink(dir);
+    nilfs_mark_inode_dirty(dir);
     goto out;
 }

-static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
+static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
 {
     struct inode *inode;
     struct nilfs_dir_entry *de;
     struct page *page;
-    struct nilfs_transaction_info ti;
     int err;

-    err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
-    if (err)
-        return err;
-
     err = -ENOENT;
-    de = nilfs_find_entry(dir, dentry, &page);
+    de = nilfs_find_entry(dir, &dentry->d_name, &page);
     if (!de)
         goto out;

@@ -317,12 +313,28 @@ static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
         goto out;

     inode->i_ctime = dir->i_ctime;
-    inode_dec_link_count(inode);
+    drop_nlink(inode);
     err = 0;
 out:
-    if (!err)
+    return err;
+}
+
+static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+    struct nilfs_transaction_info ti;
+    int err;
+
+    err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
+    if (err)
+        return err;
+
+    err = nilfs_do_unlink(dir, dentry);
+
+    if (!err) {
+        nilfs_mark_inode_dirty(dir);
+        nilfs_mark_inode_dirty(dentry->d_inode);
         err = nilfs_transaction_commit(dir->i_sb);
-    else
+    } else
         nilfs_transaction_abort(dir->i_sb);

     return err;
@@ -340,11 +352,13 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)

     err = -ENOTEMPTY;
     if (nilfs_empty_dir(inode)) {
-        err = nilfs_unlink(dir, dentry);
+        err = nilfs_do_unlink(dir, dentry);
         if (!err) {
             inode->i_size = 0;
-            inode_dec_link_count(inode);
-            inode_dec_link_count(dir);
+            drop_nlink(inode);
+            nilfs_mark_inode_dirty(inode);
+            drop_nlink(dir);
+            nilfs_mark_inode_dirty(dir);
         }
     }
     if (!err)
@@ -372,7 +386,7 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
372 return err; 386 return err;
373 387
374 err = -ENOENT; 388 err = -ENOENT;
375 old_de = nilfs_find_entry(old_dir, old_dentry, &old_page); 389 old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_page);
376 if (!old_de) 390 if (!old_de)
377 goto out; 391 goto out;
378 392
@@ -392,45 +406,51 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
392 goto out_dir; 406 goto out_dir;
393 407
394 err = -ENOENT; 408 err = -ENOENT;
395 new_de = nilfs_find_entry(new_dir, new_dentry, &new_page); 409 new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page);
396 if (!new_de) 410 if (!new_de)
397 goto out_dir; 411 goto out_dir;
398 inode_inc_link_count(old_inode); 412 inc_nlink(old_inode);
399 nilfs_set_link(new_dir, new_de, new_page, old_inode); 413 nilfs_set_link(new_dir, new_de, new_page, old_inode);
414 nilfs_mark_inode_dirty(new_dir);
400 new_inode->i_ctime = CURRENT_TIME; 415 new_inode->i_ctime = CURRENT_TIME;
401 if (dir_de) 416 if (dir_de)
402 drop_nlink(new_inode); 417 drop_nlink(new_inode);
403 inode_dec_link_count(new_inode); 418 drop_nlink(new_inode);
419 nilfs_mark_inode_dirty(new_inode);
404 } else { 420 } else {
405 if (dir_de) { 421 if (dir_de) {
406 err = -EMLINK; 422 err = -EMLINK;
407 if (new_dir->i_nlink >= NILFS_LINK_MAX) 423 if (new_dir->i_nlink >= NILFS_LINK_MAX)
408 goto out_dir; 424 goto out_dir;
409 } 425 }
410 inode_inc_link_count(old_inode); 426 inc_nlink(old_inode);
411 err = nilfs_add_link(new_dentry, old_inode); 427 err = nilfs_add_link(new_dentry, old_inode);
412 if (err) { 428 if (err) {
413 inode_dec_link_count(old_inode); 429 drop_nlink(old_inode);
430 nilfs_mark_inode_dirty(old_inode);
414 goto out_dir; 431 goto out_dir;
415 } 432 }
416 if (dir_de) 433 if (dir_de) {
417 inode_inc_link_count(new_dir); 434 inc_nlink(new_dir);
435 nilfs_mark_inode_dirty(new_dir);
436 }
418 } 437 }
419 438
420 /* 439 /*
421 * Like most other Unix systems, set the ctime for inodes on a 440 * Like most other Unix systems, set the ctime for inodes on a
422 * rename. 441 * rename.
423 * inode_dec_link_count() will mark the inode dirty.
424 */ 442 */
425 old_inode->i_ctime = CURRENT_TIME; 443 old_inode->i_ctime = CURRENT_TIME;
426 444
427 nilfs_delete_entry(old_de, old_page); 445 nilfs_delete_entry(old_de, old_page);
428 inode_dec_link_count(old_inode); 446 drop_nlink(old_inode);
429 447
430 if (dir_de) { 448 if (dir_de) {
431 nilfs_set_link(old_inode, dir_de, dir_page, new_dir); 449 nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
432 inode_dec_link_count(old_dir); 450 drop_nlink(old_dir);
433 } 451 }
452 nilfs_mark_inode_dirty(old_dir);
453 nilfs_mark_inode_dirty(old_inode);
434 454
435 err = nilfs_transaction_commit(old_dir->i_sb); 455 err = nilfs_transaction_commit(old_dir->i_sb);
436 return err; 456 return err;
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 4da6f67e9a91..8723e5bfd071 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -217,10 +217,10 @@ static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
217 217
218/* dir.c */ 218/* dir.c */
219extern int nilfs_add_link(struct dentry *, struct inode *); 219extern int nilfs_add_link(struct dentry *, struct inode *);
220extern ino_t nilfs_inode_by_name(struct inode *, struct dentry *); 220extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *);
221extern int nilfs_make_empty(struct inode *, struct inode *); 221extern int nilfs_make_empty(struct inode *, struct inode *);
222extern struct nilfs_dir_entry * 222extern struct nilfs_dir_entry *
223nilfs_find_entry(struct inode *, struct dentry *, struct page **); 223nilfs_find_entry(struct inode *, const struct qstr *, struct page **);
224extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *); 224extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *);
225extern int nilfs_empty_dir(struct inode *); 225extern int nilfs_empty_dir(struct inode *);
226extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **); 226extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index a2692bbc7b50..8de3e1e48130 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -29,6 +29,7 @@
29#include <linux/list.h> 29#include <linux/list.h>
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/pagevec.h> 31#include <linux/pagevec.h>
32#include <linux/gfp.h>
32#include "nilfs.h" 33#include "nilfs.h"
33#include "page.h" 34#include "page.h"
34#include "mdt.h" 35#include "mdt.h"
@@ -292,7 +293,7 @@ void nilfs_free_private_page(struct page *page)
292 * @src: source page 293 * @src: source page
293 * @copy_dirty: flag whether to copy dirty states on the page's buffer heads. 294 * @copy_dirty: flag whether to copy dirty states on the page's buffer heads.
294 * 295 *
295 * This fuction is for both data pages and btnode pages. The dirty flag 296 * This function is for both data pages and btnode pages. The dirty flag
296 * should be treated by caller. The page must not be under i/o. 297 * should be treated by caller. The page must not be under i/o.
297 * Both src and dst page must be locked 298 * Both src and dst page must be locked
298 */ 299 */
@@ -388,7 +389,7 @@ repeat:
388} 389}
389 390
390/** 391/**
391 * nilfs_copy_back_pages -- copy back pages to orignal cache from shadow cache 392 * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache
392 * @dmap: destination page cache 393 * @dmap: destination page cache
393 * @smap: source page cache 394 * @smap: source page cache
394 * 395 *
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 6dc83591d118..ba43146f3c30 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -23,6 +23,7 @@
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/blkdev.h> 24#include <linux/blkdev.h>
25#include <linux/swap.h> 25#include <linux/swap.h>
26#include <linux/slab.h>
26#include <linux/crc32.h> 27#include <linux/crc32.h>
27#include "nilfs.h" 28#include "nilfs.h"
28#include "segment.h" 29#include "segment.h"
@@ -39,7 +40,6 @@ enum {
39 NILFS_SEG_FAIL_IO, 40 NILFS_SEG_FAIL_IO,
40 NILFS_SEG_FAIL_MAGIC, 41 NILFS_SEG_FAIL_MAGIC,
41 NILFS_SEG_FAIL_SEQ, 42 NILFS_SEG_FAIL_SEQ,
42 NILFS_SEG_FAIL_CHECKSUM_SEGSUM,
43 NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT, 43 NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT,
44 NILFS_SEG_FAIL_CHECKSUM_FULL, 44 NILFS_SEG_FAIL_CHECKSUM_FULL,
45 NILFS_SEG_FAIL_CONSISTENCY, 45 NILFS_SEG_FAIL_CONSISTENCY,
@@ -71,10 +71,6 @@ static int nilfs_warn_segment_error(int err)
71 printk(KERN_WARNING 71 printk(KERN_WARNING
72 "NILFS warning: Sequence number mismatch\n"); 72 "NILFS warning: Sequence number mismatch\n");
73 break; 73 break;
74 case NILFS_SEG_FAIL_CHECKSUM_SEGSUM:
75 printk(KERN_WARNING
76 "NILFS warning: Checksum error in segment summary\n");
77 break;
78 case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT: 74 case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT:
79 printk(KERN_WARNING 75 printk(KERN_WARNING
80 "NILFS warning: Checksum error in super root\n"); 76 "NILFS warning: Checksum error in super root\n");
@@ -206,19 +202,15 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
206 * @pseg_start: start disk block number of partial segment 202 * @pseg_start: start disk block number of partial segment
207 * @seg_seq: sequence number requested 203 * @seg_seq: sequence number requested
208 * @ssi: pointer to nilfs_segsum_info struct to store information 204 * @ssi: pointer to nilfs_segsum_info struct to store information
209 * @full_check: full check flag
210 * (0: only checks segment summary CRC, 1: data CRC)
211 */ 205 */
212static int 206static int
213load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start, 207load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
214 u64 seg_seq, struct nilfs_segsum_info *ssi, 208 u64 seg_seq, struct nilfs_segsum_info *ssi)
215 int full_check)
216{ 209{
217 struct buffer_head *bh_sum; 210 struct buffer_head *bh_sum;
218 struct nilfs_segment_summary *sum; 211 struct nilfs_segment_summary *sum;
219 unsigned long offset, nblock; 212 unsigned long nblock;
220 u64 check_bytes; 213 u32 crc;
221 u32 crc, crc_sum;
222 int ret = NILFS_SEG_FAIL_IO; 214 int ret = NILFS_SEG_FAIL_IO;
223 215
224 bh_sum = sb_bread(sbi->s_super, pseg_start); 216 bh_sum = sb_bread(sbi->s_super, pseg_start);
@@ -237,34 +229,24 @@ load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
237 ret = NILFS_SEG_FAIL_SEQ; 229 ret = NILFS_SEG_FAIL_SEQ;
238 goto failed; 230 goto failed;
239 } 231 }
240 if (full_check) {
241 offset = sizeof(sum->ss_datasum);
242 check_bytes =
243 ((u64)ssi->nblocks << sbi->s_super->s_blocksize_bits);
244 nblock = ssi->nblocks;
245 crc_sum = le32_to_cpu(sum->ss_datasum);
246 ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
247 } else { /* only checks segment summary */
248 offset = sizeof(sum->ss_datasum) + sizeof(sum->ss_sumsum);
249 check_bytes = ssi->sumbytes;
250 nblock = ssi->nsumblk;
251 crc_sum = le32_to_cpu(sum->ss_sumsum);
252 ret = NILFS_SEG_FAIL_CHECKSUM_SEGSUM;
253 }
254 232
233 nblock = ssi->nblocks;
255 if (unlikely(nblock == 0 || 234 if (unlikely(nblock == 0 ||
256 nblock > sbi->s_nilfs->ns_blocks_per_segment)) { 235 nblock > sbi->s_nilfs->ns_blocks_per_segment)) {
257 /* This limits the number of blocks read in the CRC check */ 236 /* This limits the number of blocks read in the CRC check */
258 ret = NILFS_SEG_FAIL_CONSISTENCY; 237 ret = NILFS_SEG_FAIL_CONSISTENCY;
259 goto failed; 238 goto failed;
260 } 239 }
261 if (calc_crc_cont(sbi, bh_sum, &crc, offset, check_bytes, 240 if (calc_crc_cont(sbi, bh_sum, &crc, sizeof(sum->ss_datasum),
241 ((u64)nblock << sbi->s_super->s_blocksize_bits),
262 pseg_start, nblock)) { 242 pseg_start, nblock)) {
263 ret = NILFS_SEG_FAIL_IO; 243 ret = NILFS_SEG_FAIL_IO;
264 goto failed; 244 goto failed;
265 } 245 }
266 if (crc == crc_sum) 246 if (crc == le32_to_cpu(sum->ss_datasum))
267 ret = 0; 247 ret = 0;
248 else
249 ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
268 failed: 250 failed:
269 brelse(bh_sum); 251 brelse(bh_sum);
270 out: 252 out:
@@ -598,7 +580,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
598 580
599 while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { 581 while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
600 582
601 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); 583 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
602 if (ret) { 584 if (ret) {
603 if (ret == NILFS_SEG_FAIL_IO) { 585 if (ret == NILFS_SEG_FAIL_IO) {
604 err = -EIO; 586 err = -EIO;
@@ -770,14 +752,8 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
770 nilfs_finish_roll_forward(nilfs, sbi, ri); 752 nilfs_finish_roll_forward(nilfs, sbi, ri);
771 } 753 }
772 754
773 nilfs_detach_checkpoint(sbi);
774 return 0;
775
776 failed: 755 failed:
777 nilfs_detach_checkpoint(sbi); 756 nilfs_detach_checkpoint(sbi);
778 nilfs_mdt_clear(nilfs->ns_cpfile);
779 nilfs_mdt_clear(nilfs->ns_sufile);
780 nilfs_mdt_clear(nilfs->ns_dat);
781 return err; 757 return err;
782} 758}
783 759
@@ -804,6 +780,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
804 struct nilfs_segsum_info ssi; 780 struct nilfs_segsum_info ssi;
805 sector_t pseg_start, pseg_end, sr_pseg_start = 0; 781 sector_t pseg_start, pseg_end, sr_pseg_start = 0;
806 sector_t seg_start, seg_end; /* range of full segment (block number) */ 782 sector_t seg_start, seg_end; /* range of full segment (block number) */
783 sector_t b, end;
807 u64 seg_seq; 784 u64 seg_seq;
808 __u64 segnum, nextnum = 0; 785 __u64 segnum, nextnum = 0;
809 __u64 cno; 786 __u64 cno;
@@ -819,9 +796,14 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
819 /* Calculate range of segment */ 796 /* Calculate range of segment */
820 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); 797 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
821 798
799 /* Read ahead segment */
800 b = seg_start;
801 while (b <= seg_end)
802 sb_breadahead(sbi->s_super, b++);
803
822 for (;;) { 804 for (;;) {
823 /* Load segment summary */ 805 /* Load segment summary */
824 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); 806 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
825 if (ret) { 807 if (ret) {
826 if (ret == NILFS_SEG_FAIL_IO) 808 if (ret == NILFS_SEG_FAIL_IO)
827 goto failed; 809 goto failed;
@@ -841,14 +823,20 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
841 ri->ri_nextnum = nextnum; 823 ri->ri_nextnum = nextnum;
842 empty_seg = 0; 824 empty_seg = 0;
843 825
826 if (!NILFS_SEG_HAS_SR(&ssi) && !scan_newer) {
827 /* This will never happen because a superblock
828 (last_segment) always points to a pseg
829 having a super root. */
830 ret = NILFS_SEG_FAIL_CONSISTENCY;
831 goto failed;
832 }
833
834 if (pseg_start == seg_start) {
835 nilfs_get_segment_range(nilfs, nextnum, &b, &end);
836 while (b <= end)
837 sb_breadahead(sbi->s_super, b++);
838 }
844 if (!NILFS_SEG_HAS_SR(&ssi)) { 839 if (!NILFS_SEG_HAS_SR(&ssi)) {
845 if (!scan_newer) {
846 /* This will never happen because a superblock
847 (last_segment) always points to a pseg
848 having a super root. */
849 ret = NILFS_SEG_FAIL_CONSISTENCY;
850 goto failed;
851 }
852 if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) { 840 if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) {
853 ri->ri_lsegs_start = pseg_start; 841 ri->ri_lsegs_start = pseg_start;
854 ri->ri_lsegs_start_seq = seg_seq; 842 ri->ri_lsegs_start_seq = seg_seq;
@@ -919,7 +907,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
919 907
920 super_root_found: 908 super_root_found:
921 /* Updating pointers relating to the latest checkpoint */ 909 /* Updating pointers relating to the latest checkpoint */
922 list_splice(&segments, ri->ri_used_segments.prev); 910 list_splice_tail(&segments, &ri->ri_used_segments);
923 nilfs->ns_last_pseg = sr_pseg_start; 911 nilfs->ns_last_pseg = sr_pseg_start;
924 nilfs->ns_last_seq = nilfs->ns_seg_seq; 912 nilfs->ns_last_seq = nilfs->ns_seg_seq;
925 nilfs->ns_last_cno = ri->ri_cno; 913 nilfs->ns_last_cno = ri->ri_cno;
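
The recovery.c hunks above add explicit read-ahead so that scanning a segment's partial segments does not degrade into synchronous single-block reads. A sketch of the same pattern in isolation, assuming only <linux/buffer_head.h>; example_readahead_segment() is hypothetical:

#include <linux/buffer_head.h>

static void example_readahead_segment(struct super_block *sb,
				      sector_t seg_start, sector_t seg_end)
{
	sector_t b;

	/* Queue asynchronous reads for every block of the segment;
	 * the subsequent sb_bread() calls then mostly hit the page
	 * cache instead of blocking on the device. */
	for (b = seg_start; b <= seg_end; b++)
		sb_breadahead(sb, b);
}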
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index e6d9e37fa241..17851f77f739 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -24,10 +24,28 @@
24#include <linux/buffer_head.h> 24#include <linux/buffer_head.h>
25#include <linux/writeback.h> 25#include <linux/writeback.h>
26#include <linux/crc32.h> 26#include <linux/crc32.h>
27#include <linux/backing-dev.h>
28#include <linux/slab.h>
27#include "page.h" 29#include "page.h"
28#include "segbuf.h" 30#include "segbuf.h"
29 31
30 32
33struct nilfs_write_info {
34 struct the_nilfs *nilfs;
35 struct bio *bio;
36 int start, end; /* The region to be submitted */
37 int rest_blocks;
38 int max_pages;
39 int nr_vecs;
40 sector_t blocknr;
41};
42
43
44static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
45 struct the_nilfs *nilfs);
46static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
47
48
31static struct kmem_cache *nilfs_segbuf_cachep; 49static struct kmem_cache *nilfs_segbuf_cachep;
32 50
33static void nilfs_segbuf_init_once(void *obj) 51static void nilfs_segbuf_init_once(void *obj)
@@ -63,6 +81,11 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
63 INIT_LIST_HEAD(&segbuf->sb_list); 81 INIT_LIST_HEAD(&segbuf->sb_list);
64 INIT_LIST_HEAD(&segbuf->sb_segsum_buffers); 82 INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
65 INIT_LIST_HEAD(&segbuf->sb_payload_buffers); 83 INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
84
85 init_completion(&segbuf->sb_bio_event);
86 atomic_set(&segbuf->sb_err, 0);
87 segbuf->sb_nbio = 0;
88
66 return segbuf; 89 return segbuf;
67} 90}
68 91
@@ -83,6 +106,22 @@ void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum,
83 segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1; 106 segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1;
84} 107}
85 108
109/**
110 * nilfs_segbuf_map_cont - map a new log behind a given log
111 * @segbuf: new segment buffer
112 * @prev: segment buffer containing a log to be continued
113 */
114void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
115 struct nilfs_segment_buffer *prev)
116{
117 segbuf->sb_segnum = prev->sb_segnum;
118 segbuf->sb_fseg_start = prev->sb_fseg_start;
119 segbuf->sb_fseg_end = prev->sb_fseg_end;
120 segbuf->sb_pseg_start = prev->sb_pseg_start + prev->sb_sum.nblocks;
121 segbuf->sb_rest_blocks =
122 segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1;
123}
124
86void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf, 125void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf,
87 __u64 nextnum, struct the_nilfs *nilfs) 126 __u64 nextnum, struct the_nilfs *nilfs)
88{ 127{
@@ -132,13 +171,11 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
132 segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary); 171 segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
133 segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0; 172 segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
134 segbuf->sb_sum.ctime = ctime; 173 segbuf->sb_sum.ctime = ctime;
135
136 segbuf->sb_io_error = 0;
137 return 0; 174 return 0;
138} 175}
139 176
140/* 177/*
141 * Setup segument summary 178 * Setup segment summary
142 */ 179 */
143void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf) 180void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
144{ 181{
@@ -219,7 +256,7 @@ void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
219 raw_sum->ss_datasum = cpu_to_le32(crc); 256 raw_sum->ss_datasum = cpu_to_le32(crc);
220} 257}
221 258
222void nilfs_release_buffers(struct list_head *list) 259static void nilfs_release_buffers(struct list_head *list)
223{ 260{
224 struct buffer_head *bh, *n; 261 struct buffer_head *bh, *n;
225 262
@@ -241,13 +278,69 @@ void nilfs_release_buffers(struct list_head *list)
241 } 278 }
242} 279}
243 280
281static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
282{
283 nilfs_release_buffers(&segbuf->sb_segsum_buffers);
284 nilfs_release_buffers(&segbuf->sb_payload_buffers);
285}
286
287/*
288 * Iterators for segment buffers
289 */
290void nilfs_clear_logs(struct list_head *logs)
291{
292 struct nilfs_segment_buffer *segbuf;
293
294 list_for_each_entry(segbuf, logs, sb_list)
295 nilfs_segbuf_clear(segbuf);
296}
297
298void nilfs_truncate_logs(struct list_head *logs,
299 struct nilfs_segment_buffer *last)
300{
301 struct nilfs_segment_buffer *n, *segbuf;
302
303 segbuf = list_prepare_entry(last, logs, sb_list);
304 list_for_each_entry_safe_continue(segbuf, n, logs, sb_list) {
305 list_del_init(&segbuf->sb_list);
306 nilfs_segbuf_clear(segbuf);
307 nilfs_segbuf_free(segbuf);
308 }
309}
310
311int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs)
312{
313 struct nilfs_segment_buffer *segbuf;
314 int ret = 0;
315
316 list_for_each_entry(segbuf, logs, sb_list) {
317 ret = nilfs_segbuf_write(segbuf, nilfs);
318 if (ret)
319 break;
320 }
321 return ret;
322}
323
324int nilfs_wait_on_logs(struct list_head *logs)
325{
326 struct nilfs_segment_buffer *segbuf;
327 int err, ret = 0;
328
329 list_for_each_entry(segbuf, logs, sb_list) {
330 err = nilfs_segbuf_wait(segbuf);
331 if (err && !ret)
332 ret = err;
333 }
334 return ret;
335}
336
244/* 337/*
245 * BIO operations 338 * BIO operations
246 */ 339 */
247static void nilfs_end_bio_write(struct bio *bio, int err) 340static void nilfs_end_bio_write(struct bio *bio, int err)
248{ 341{
249 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 342 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
250 struct nilfs_write_info *wi = bio->bi_private; 343 struct nilfs_segment_buffer *segbuf = bio->bi_private;
251 344
252 if (err == -EOPNOTSUPP) { 345 if (err == -EOPNOTSUPP) {
253 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); 346 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
@@ -256,21 +349,22 @@ static void nilfs_end_bio_write(struct bio *bio, int err)
256 } 349 }
257 350
258 if (!uptodate) 351 if (!uptodate)
259 atomic_inc(&wi->err); 352 atomic_inc(&segbuf->sb_err);
260 353
261 bio_put(bio); 354 bio_put(bio);
262 complete(&wi->bio_event); 355 complete(&segbuf->sb_bio_event);
263} 356}
264 357
265static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode) 358static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
359 struct nilfs_write_info *wi, int mode)
266{ 360{
267 struct bio *bio = wi->bio; 361 struct bio *bio = wi->bio;
268 int err; 362 int err;
269 363
270 if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) { 364 if (segbuf->sb_nbio > 0 && bdi_write_congested(wi->nilfs->ns_bdi)) {
271 wait_for_completion(&wi->bio_event); 365 wait_for_completion(&segbuf->sb_bio_event);
272 wi->nbio--; 366 segbuf->sb_nbio--;
273 if (unlikely(atomic_read(&wi->err))) { 367 if (unlikely(atomic_read(&segbuf->sb_err))) {
274 bio_put(bio); 368 bio_put(bio);
275 err = -EIO; 369 err = -EIO;
276 goto failed; 370 goto failed;
@@ -278,7 +372,7 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
278 } 372 }
279 373
280 bio->bi_end_io = nilfs_end_bio_write; 374 bio->bi_end_io = nilfs_end_bio_write;
281 bio->bi_private = wi; 375 bio->bi_private = segbuf;
282 bio_get(bio); 376 bio_get(bio);
283 submit_bio(mode, bio); 377 submit_bio(mode, bio);
284 if (bio_flagged(bio, BIO_EOPNOTSUPP)) { 378 if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
@@ -286,7 +380,7 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
286 err = -EOPNOTSUPP; 380 err = -EOPNOTSUPP;
287 goto failed; 381 goto failed;
288 } 382 }
289 wi->nbio++; 383 segbuf->sb_nbio++;
290 bio_put(bio); 384 bio_put(bio);
291 385
292 wi->bio = NULL; 386 wi->bio = NULL;
@@ -301,17 +395,15 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
301} 395}
302 396
303/** 397/**
304 * nilfs_alloc_seg_bio - allocate a bio for writing segment. 398 * nilfs_alloc_seg_bio - allocate a new bio for writing log
305 * @sb: super block 399 * @nilfs: nilfs object
306 * @start: beginning disk block number of this BIO. 400 * @start: start block number of the bio
307 * @nr_vecs: request size of page vector. 401 * @nr_vecs: request size of page vector.
308 * 402 *
309 * alloc_seg_bio() allocates a new BIO structure and initialize it.
310 *
311 * Return Value: On success, pointer to the struct bio is returned. 403 * Return Value: On success, pointer to the struct bio is returned.
312 * On error, NULL is returned. 404 * On error, NULL is returned.
313 */ 405 */
314static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start, 406static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start,
315 int nr_vecs) 407 int nr_vecs)
316{ 408{
317 struct bio *bio; 409 struct bio *bio;
@@ -322,36 +414,33 @@ static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
322 bio = bio_alloc(GFP_NOIO, nr_vecs); 414 bio = bio_alloc(GFP_NOIO, nr_vecs);
323 } 415 }
324 if (likely(bio)) { 416 if (likely(bio)) {
325 bio->bi_bdev = sb->s_bdev; 417 bio->bi_bdev = nilfs->ns_bdev;
326 bio->bi_sector = (sector_t)start << (sb->s_blocksize_bits - 9); 418 bio->bi_sector = start << (nilfs->ns_blocksize_bits - 9);
327 } 419 }
328 return bio; 420 return bio;
329} 421}
330 422
331void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, 423static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
332 struct nilfs_write_info *wi) 424 struct nilfs_write_info *wi)
333{ 425{
334 wi->bio = NULL; 426 wi->bio = NULL;
335 wi->rest_blocks = segbuf->sb_sum.nblocks; 427 wi->rest_blocks = segbuf->sb_sum.nblocks;
336 wi->max_pages = bio_get_nr_vecs(wi->sb->s_bdev); 428 wi->max_pages = bio_get_nr_vecs(wi->nilfs->ns_bdev);
337 wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); 429 wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
338 wi->start = wi->end = 0; 430 wi->start = wi->end = 0;
339 wi->nbio = 0;
340 wi->blocknr = segbuf->sb_pseg_start; 431 wi->blocknr = segbuf->sb_pseg_start;
341
342 atomic_set(&wi->err, 0);
343 init_completion(&wi->bio_event);
344} 432}
345 433
346static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh, 434static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
347 int mode) 435 struct nilfs_write_info *wi,
436 struct buffer_head *bh, int mode)
348{ 437{
349 int len, err; 438 int len, err;
350 439
351 BUG_ON(wi->nr_vecs <= 0); 440 BUG_ON(wi->nr_vecs <= 0);
352 repeat: 441 repeat:
353 if (!wi->bio) { 442 if (!wi->bio) {
354 wi->bio = nilfs_alloc_seg_bio(wi->sb, wi->blocknr + wi->end, 443 wi->bio = nilfs_alloc_seg_bio(wi->nilfs, wi->blocknr + wi->end,
355 wi->nr_vecs); 444 wi->nr_vecs);
356 if (unlikely(!wi->bio)) 445 if (unlikely(!wi->bio))
357 return -ENOMEM; 446 return -ENOMEM;
@@ -363,76 +452,83 @@ static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh,
363 return 0; 452 return 0;
364 } 453 }
365 /* bio is FULL */ 454 /* bio is FULL */
366 err = nilfs_submit_seg_bio(wi, mode); 455 err = nilfs_segbuf_submit_bio(segbuf, wi, mode);
367 /* never submit current bh */ 456 /* never submit current bh */
368 if (likely(!err)) 457 if (likely(!err))
369 goto repeat; 458 goto repeat;
370 return err; 459 return err;
371} 460}
372 461
373int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, 462/**
374 struct nilfs_write_info *wi) 463 * nilfs_segbuf_write - submit write requests of a log
464 * @segbuf: buffer storing a log to be written
465 * @nilfs: nilfs object
466 *
467 * Return Value: On Success, 0 is returned. On Error, one of the following
468 * negative error code is returned.
469 *
470 * %-EIO - I/O error
471 *
472 * %-ENOMEM - Insufficient memory available.
473 */
474static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
475 struct the_nilfs *nilfs)
375{ 476{
477 struct nilfs_write_info wi;
376 struct buffer_head *bh; 478 struct buffer_head *bh;
377 int res, rw = WRITE; 479 int res = 0, rw = WRITE;
480
481 wi.nilfs = nilfs;
482 nilfs_segbuf_prepare_write(segbuf, &wi);
378 483
379 list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { 484 list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) {
380 res = nilfs_submit_bh(wi, bh, rw); 485 res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw);
381 if (unlikely(res)) 486 if (unlikely(res))
382 goto failed_bio; 487 goto failed_bio;
383 } 488 }
384 489
385 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { 490 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
386 res = nilfs_submit_bh(wi, bh, rw); 491 res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw);
387 if (unlikely(res)) 492 if (unlikely(res))
388 goto failed_bio; 493 goto failed_bio;
389 } 494 }
390 495
391 if (wi->bio) { 496 if (wi.bio) {
392 /* 497 /*
393 * Last BIO is always sent through the following 498 * Last BIO is always sent through the following
394 * submission. 499 * submission.
395 */ 500 */
396 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); 501 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
397 res = nilfs_submit_seg_bio(wi, rw); 502 res = nilfs_segbuf_submit_bio(segbuf, &wi, rw);
398 if (unlikely(res))
399 goto failed_bio;
400 } 503 }
401 504
402 res = 0;
403 out:
404 return res;
405
406 failed_bio: 505 failed_bio:
407 atomic_inc(&wi->err); 506 return res;
408 goto out;
409} 507}
410 508
411/** 509/**
412 * nilfs_segbuf_wait - wait for completion of requested BIOs 510 * nilfs_segbuf_wait - wait for completion of requested BIOs
413 * @wi: nilfs_write_info 511 * @segbuf: segment buffer
414 * 512 *
415 * Return Value: On Success, 0 is returned. On Error, one of the following 513 * Return Value: On Success, 0 is returned. On Error, one of the following
416 * negative error code is returned. 514 * negative error code is returned.
417 * 515 *
418 * %-EIO - I/O error 516 * %-EIO - I/O error
419 */ 517 */
420int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf, 518static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
421 struct nilfs_write_info *wi)
422{ 519{
423 int err = 0; 520 int err = 0;
424 521
425 if (!wi->nbio) 522 if (!segbuf->sb_nbio)
426 return 0; 523 return 0;
427 524
428 do { 525 do {
429 wait_for_completion(&wi->bio_event); 526 wait_for_completion(&segbuf->sb_bio_event);
430 } while (--wi->nbio > 0); 527 } while (--segbuf->sb_nbio > 0);
431 528
432 if (unlikely(atomic_read(&wi->err) > 0)) { 529 if (unlikely(atomic_read(&segbuf->sb_err) > 0)) {
433 printk(KERN_ERR "NILFS: IO error writing segment\n"); 530 printk(KERN_ERR "NILFS: IO error writing segment\n");
434 err = -EIO; 531 err = -EIO;
435 segbuf->sb_io_error = 1;
436 } 532 }
437 return err; 533 return err;
438} 534}
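
The segbuf.c rework above moves bio accounting from the transient nilfs_write_info onto the segment buffer itself: sb_nbio counts in-flight bios, each completion signals sb_bio_event, and errors accumulate in sb_err. A condensed sketch of that wait pattern; example_wait_for_bios() is hypothetical, the real code being nilfs_segbuf_wait() above:

static int example_wait_for_bios(struct nilfs_segment_buffer *segbuf)
{
	/* Drain one completion per submitted bio, then collapse any
	 * per-bio failures into a single -EIO. */
	while (segbuf->sb_nbio > 0) {
		wait_for_completion(&segbuf->sb_bio_event);
		segbuf->sb_nbio--;
	}
	return atomic_read(&segbuf->sb_err) ? -EIO : 0;
}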
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 0c3076f4e592..94dfd3517bc0 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -27,7 +27,6 @@
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/bio.h> 28#include <linux/bio.h>
29#include <linux/completion.h> 29#include <linux/completion.h>
30#include <linux/backing-dev.h>
31 30
32/** 31/**
33 * struct nilfs_segsum_info - On-memory segment summary 32 * struct nilfs_segsum_info - On-memory segment summary
@@ -77,7 +76,9 @@ struct nilfs_segsum_info {
77 * @sb_rest_blocks: Number of residual blocks in the current segment 76 * @sb_rest_blocks: Number of residual blocks in the current segment
78 * @sb_segsum_buffers: List of buffers for segment summaries 77 * @sb_segsum_buffers: List of buffers for segment summaries
79 * @sb_payload_buffers: List of buffers for segment payload 78 * @sb_payload_buffers: List of buffers for segment payload
80 * @sb_io_error: I/O error status 79 * @sb_nbio: Number of flying bio requests
80 * @sb_err: I/O error status
81 * @sb_bio_event: Completion event of log writing
81 */ 82 */
82struct nilfs_segment_buffer { 83struct nilfs_segment_buffer {
83 struct super_block *sb_super; 84 struct super_block *sb_super;
@@ -96,7 +97,9 @@ struct nilfs_segment_buffer {
96 struct list_head sb_payload_buffers; /* including super root */ 97 struct list_head sb_payload_buffers; /* including super root */
97 98
98 /* io status */ 99 /* io status */
99 int sb_io_error; 100 int sb_nbio;
101 atomic_t sb_err;
102 struct completion sb_bio_event;
100}; 103};
101 104
102#define NILFS_LIST_SEGBUF(head) \ 105#define NILFS_LIST_SEGBUF(head) \
@@ -125,6 +128,8 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
125void nilfs_segbuf_free(struct nilfs_segment_buffer *); 128void nilfs_segbuf_free(struct nilfs_segment_buffer *);
126void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long, 129void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
127 struct the_nilfs *); 130 struct the_nilfs *);
131void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
132 struct nilfs_segment_buffer *prev);
128void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, 133void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
129 struct the_nilfs *); 134 struct the_nilfs *);
130int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t); 135int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t);
@@ -161,41 +166,15 @@ nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf,
161 segbuf->sb_sum.nfileblk++; 166 segbuf->sb_sum.nfileblk++;
162} 167}
163 168
164void nilfs_release_buffers(struct list_head *); 169void nilfs_clear_logs(struct list_head *logs);
170void nilfs_truncate_logs(struct list_head *logs,
171 struct nilfs_segment_buffer *last);
172int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs);
173int nilfs_wait_on_logs(struct list_head *logs);
165 174
166static inline void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf) 175static inline void nilfs_destroy_logs(struct list_head *logs)
167{ 176{
168 nilfs_release_buffers(&segbuf->sb_segsum_buffers); 177 nilfs_truncate_logs(logs, NULL);
169 nilfs_release_buffers(&segbuf->sb_payload_buffers);
170} 178}
171 179
172struct nilfs_write_info {
173 struct bio *bio;
174 int start, end; /* The region to be submitted */
175 int rest_blocks;
176 int max_pages;
177 int nr_vecs;
178 sector_t blocknr;
179
180 int nbio;
181 atomic_t err;
182 struct completion bio_event;
183 /* completion event of segment write */
184
185 /*
186 * The following fields must be set explicitly
187 */
188 struct super_block *sb;
189 struct backing_dev_info *bdi; /* backing dev info */
190 struct buffer_head *bh_sr;
191};
192
193
194void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *,
195 struct nilfs_write_info *);
196int nilfs_segbuf_write(struct nilfs_segment_buffer *,
197 struct nilfs_write_info *);
198int nilfs_segbuf_wait(struct nilfs_segment_buffer *,
199 struct nilfs_write_info *);
200
201#endif /* _NILFS_SEGBUF_H */ 180#endif /* _NILFS_SEGBUF_H */
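
segbuf.h now exports list-level helpers, so callers operate on a whole list of logs rather than driving a nilfs_write_info by hand. A hypothetical usage sketch under that API; example_flush_logs() is not part of the patch:

static int example_flush_logs(struct list_head *logs,
			      struct the_nilfs *nilfs)
{
	/* Submit every log on the list, wait for the bios to finish,
	 * and tear the list down if anything failed. */
	int err = nilfs_write_logs(logs, nilfs);

	if (!err)
		err = nilfs_wait_on_logs(logs);
	if (err)
		nilfs_destroy_logs(logs);
	return err;
}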
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 6eff66a070d5..6a7dbd8451db 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -32,6 +32,7 @@
32#include <linux/kthread.h> 32#include <linux/kthread.h>
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <linux/pagevec.h> 34#include <linux/pagevec.h>
35#include <linux/slab.h>
35#include "nilfs.h" 36#include "nilfs.h"
36#include "btnode.h" 37#include "btnode.h"
37#include "page.h" 38#include "page.h"
@@ -141,7 +142,7 @@ int nilfs_init_transaction_cache(void)
141} 142}
142 143
143/** 144/**
144 * nilfs_detroy_transaction_cache - destroy the cache for transaction info 145 * nilfs_destroy_transaction_cache - destroy the cache for transaction info
145 * 146 *
146 * nilfs_destroy_transaction_cache() frees the slab cache for the struct 147 * nilfs_destroy_transaction_cache() frees the slab cache for the struct
147 * nilfs_transaction_info. 148 * nilfs_transaction_info.
@@ -201,7 +202,7 @@ static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
201 * This function allocates a nilfs_transaction_info struct to keep context 202 * This function allocates a nilfs_transaction_info struct to keep context
202 * information on it. It is initialized and hooked onto the current task in 203 * information on it. It is initialized and hooked onto the current task in
203 * the outermost call. If a pre-allocated struct is given to @ti, it is used 204 * the outermost call. If a pre-allocated struct is given to @ti, it is used
204 * instead; othewise a new struct is assigned from a slab. 205 * instead; otherwise a new struct is assigned from a slab.
205 * 206 *
206 * When @vacancy_check flag is set, this function will check the amount of 207 * When @vacancy_check flag is set, this function will check the amount of
207 * free space, and will wait for the GC to reclaim disk space if low capacity. 208 * free space, and will wait for the GC to reclaim disk space if low capacity.
@@ -974,12 +975,12 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
974 nilfs->ns_nongc_ctime : sci->sc_seg_ctime); 975 nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
975 raw_sr->sr_flags = 0; 976 raw_sr->sr_flags = 0;
976 977
977 nilfs_mdt_write_inode_direct( 978 nilfs_write_inode_common(nilfs_dat_inode(nilfs), (void *)raw_sr +
978 nilfs_dat_inode(nilfs), bh_sr, NILFS_SR_DAT_OFFSET(isz)); 979 NILFS_SR_DAT_OFFSET(isz), 1);
979 nilfs_mdt_write_inode_direct( 980 nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr +
980 nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(isz)); 981 NILFS_SR_CPFILE_OFFSET(isz), 1);
981 nilfs_mdt_write_inode_direct( 982 nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr +
982 nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(isz)); 983 NILFS_SR_SUFILE_OFFSET(isz), 1);
983} 984}
984 985
985static void nilfs_redirty_inodes(struct list_head *head) 986static void nilfs_redirty_inodes(struct list_head *head)
@@ -1273,73 +1274,75 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1273 return err; 1274 return err;
1274} 1275}
1275 1276
1276static int nilfs_touch_segusage(struct inode *sufile, __u64 segnum) 1277/**
1277{ 1278 * nilfs_segctor_begin_construction - setup segment buffer to make a new log
1278 struct buffer_head *bh_su; 1279 * @sci: nilfs_sc_info
1279 struct nilfs_segment_usage *raw_su; 1280 * @nilfs: nilfs object
1280 int err; 1281 */
1281
1282 err = nilfs_sufile_get_segment_usage(sufile, segnum, &raw_su, &bh_su);
1283 if (unlikely(err))
1284 return err;
1285 nilfs_mdt_mark_buffer_dirty(bh_su);
1286 nilfs_mdt_mark_dirty(sufile);
1287 nilfs_sufile_put_segment_usage(sufile, segnum, bh_su);
1288 return 0;
1289}
1290
1291static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci, 1282static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci,
1292 struct the_nilfs *nilfs) 1283 struct the_nilfs *nilfs)
1293{ 1284{
1294 struct nilfs_segment_buffer *segbuf, *n; 1285 struct nilfs_segment_buffer *segbuf, *prev;
1295 __u64 nextnum; 1286 __u64 nextnum;
1296 int err; 1287 int err, alloc = 0;
1297 1288
1298 if (list_empty(&sci->sc_segbufs)) { 1289 segbuf = nilfs_segbuf_new(sci->sc_super);
1299 segbuf = nilfs_segbuf_new(sci->sc_super); 1290 if (unlikely(!segbuf))
1300 if (unlikely(!segbuf)) 1291 return -ENOMEM;
1301 return -ENOMEM;
1302 list_add(&segbuf->sb_list, &sci->sc_segbufs);
1303 } else
1304 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1305 1292
1306 nilfs_segbuf_map(segbuf, nilfs->ns_segnum, nilfs->ns_pseg_offset, 1293 if (list_empty(&sci->sc_write_logs)) {
1307 nilfs); 1294 nilfs_segbuf_map(segbuf, nilfs->ns_segnum,
1295 nilfs->ns_pseg_offset, nilfs);
1296 if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
1297 nilfs_shift_to_next_segment(nilfs);
1298 nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs);
1299 }
1300
1301 segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq;
1302 nextnum = nilfs->ns_nextnum;
1308 1303
1309 if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { 1304 if (nilfs->ns_segnum == nilfs->ns_nextnum)
1310 nilfs_shift_to_next_segment(nilfs); 1305 /* Start from the head of a new full segment */
1311 nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs); 1306 alloc++;
1307 } else {
1308 /* Continue logs */
1309 prev = NILFS_LAST_SEGBUF(&sci->sc_write_logs);
1310 nilfs_segbuf_map_cont(segbuf, prev);
1311 segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq;
1312 nextnum = prev->sb_nextnum;
1313
1314 if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
1315 nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs);
1316 segbuf->sb_sum.seg_seq++;
1317 alloc++;
1318 }
1312 } 1319 }
1313 sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks;
1314 1320
1315 err = nilfs_touch_segusage(nilfs->ns_sufile, segbuf->sb_segnum); 1321 err = nilfs_sufile_mark_dirty(nilfs->ns_sufile, segbuf->sb_segnum);
1316 if (unlikely(err)) 1322 if (err)
1317 return err; 1323 goto failed;
1318 1324
1319 if (nilfs->ns_segnum == nilfs->ns_nextnum) { 1325 if (alloc) {
1320 /* Start from the head of a new full segment */
1321 err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum); 1326 err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum);
1322 if (unlikely(err)) 1327 if (err)
1323 return err; 1328 goto failed;
1324 } else 1329 }
1325 nextnum = nilfs->ns_nextnum;
1326
1327 segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq;
1328 nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs); 1330 nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs);
1329 1331
1330 /* truncating segment buffers */ 1332 BUG_ON(!list_empty(&sci->sc_segbufs));
1331 list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs, 1333 list_add_tail(&segbuf->sb_list, &sci->sc_segbufs);
1332 sb_list) { 1334 sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks;
1333 list_del_init(&segbuf->sb_list);
1334 nilfs_segbuf_free(segbuf);
1335 }
1336 return 0; 1335 return 0;
1336
1337 failed:
1338 nilfs_segbuf_free(segbuf);
1339 return err;
1337} 1340}
1338 1341
1339static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci, 1342static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
1340 struct the_nilfs *nilfs, int nadd) 1343 struct the_nilfs *nilfs, int nadd)
1341{ 1344{
1342 struct nilfs_segment_buffer *segbuf, *prev, *n; 1345 struct nilfs_segment_buffer *segbuf, *prev;
1343 struct inode *sufile = nilfs->ns_sufile; 1346 struct inode *sufile = nilfs->ns_sufile;
1344 __u64 nextnextnum; 1347 __u64 nextnextnum;
1345 LIST_HEAD(list); 1348 LIST_HEAD(list);
@@ -1352,7 +1355,7 @@ static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
1352 * not be dirty. The following call ensures that the buffer is dirty 1355 * not be dirty. The following call ensures that the buffer is dirty
1353 * and will pin the buffer on memory until the sufile is written. 1356 * and will pin the buffer on memory until the sufile is written.
1354 */ 1357 */
1355 err = nilfs_touch_segusage(sufile, prev->sb_nextnum); 1358 err = nilfs_sufile_mark_dirty(sufile, prev->sb_nextnum);
1356 if (unlikely(err)) 1359 if (unlikely(err))
1357 return err; 1360 return err;
1358 1361
@@ -1378,33 +1381,33 @@ static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
1378 list_add_tail(&segbuf->sb_list, &list); 1381 list_add_tail(&segbuf->sb_list, &list);
1379 prev = segbuf; 1382 prev = segbuf;
1380 } 1383 }
1381 list_splice(&list, sci->sc_segbufs.prev); 1384 list_splice_tail(&list, &sci->sc_segbufs);
1382 return 0; 1385 return 0;
1383 1386
1384 failed_segbuf: 1387 failed_segbuf:
1385 nilfs_segbuf_free(segbuf); 1388 nilfs_segbuf_free(segbuf);
1386 failed: 1389 failed:
1387 list_for_each_entry_safe(segbuf, n, &list, sb_list) { 1390 list_for_each_entry(segbuf, &list, sb_list) {
1388 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); 1391 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
1389 WARN_ON(ret); /* never fails */ 1392 WARN_ON(ret); /* never fails */
1390 list_del_init(&segbuf->sb_list);
1391 nilfs_segbuf_free(segbuf);
1392 } 1393 }
1394 nilfs_destroy_logs(&list);
1393 return err; 1395 return err;
1394} 1396}
1395 1397
1396static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci, 1398static void nilfs_free_incomplete_logs(struct list_head *logs,
1397 struct the_nilfs *nilfs) 1399 struct the_nilfs *nilfs)
1398{ 1400{
1399 struct nilfs_segment_buffer *segbuf; 1401 struct nilfs_segment_buffer *segbuf, *prev;
1400 int ret, done = 0; 1402 struct inode *sufile = nilfs->ns_sufile;
1403 int ret;
1401 1404
1402 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); 1405 segbuf = NILFS_FIRST_SEGBUF(logs);
1403 if (nilfs->ns_nextnum != segbuf->sb_nextnum) { 1406 if (nilfs->ns_nextnum != segbuf->sb_nextnum) {
1404 ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum); 1407 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
1405 WARN_ON(ret); /* never fails */ 1408 WARN_ON(ret); /* never fails */
1406 } 1409 }
1407 if (segbuf->sb_io_error) { 1410 if (atomic_read(&segbuf->sb_err)) {
1408 /* Case 1: The first segment failed */ 1411 /* Case 1: The first segment failed */
1409 if (segbuf->sb_pseg_start != segbuf->sb_fseg_start) 1412 if (segbuf->sb_pseg_start != segbuf->sb_fseg_start)
1410 /* Case 1a: Partial segment appended into an existing 1413 /* Case 1a: Partial segment appended into an existing
@@ -1413,106 +1416,54 @@ static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci,
1413 segbuf->sb_fseg_end); 1416 segbuf->sb_fseg_end);
1414 else /* Case 1b: New full segment */ 1417 else /* Case 1b: New full segment */
1415 set_nilfs_discontinued(nilfs); 1418 set_nilfs_discontinued(nilfs);
1416 done++;
1417 } 1419 }
1418 1420
1419 list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { 1421 prev = segbuf;
1420 ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum); 1422 list_for_each_entry_continue(segbuf, logs, sb_list) {
1421 WARN_ON(ret); /* never fails */ 1423 if (prev->sb_nextnum != segbuf->sb_nextnum) {
1422 if (!done && segbuf->sb_io_error) { 1424 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
1423 if (segbuf->sb_segnum != nilfs->ns_nextnum) 1425 WARN_ON(ret); /* never fails */
1424 /* Case 2: extended segment (!= next) failed */
1425 nilfs_sufile_set_error(nilfs->ns_sufile,
1426 segbuf->sb_segnum);
1427 done++;
1428 }
1429 }
1430}
1431
1432static void nilfs_segctor_clear_segment_buffers(struct nilfs_sc_info *sci)
1433{
1434 struct nilfs_segment_buffer *segbuf;
1435
1436 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list)
1437 nilfs_segbuf_clear(segbuf);
1438 sci->sc_super_root = NULL;
1439}
1440
1441static void nilfs_segctor_destroy_segment_buffers(struct nilfs_sc_info *sci)
1442{
1443 struct nilfs_segment_buffer *segbuf;
1444
1445 while (!list_empty(&sci->sc_segbufs)) {
1446 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1447 list_del_init(&segbuf->sb_list);
1448 nilfs_segbuf_free(segbuf);
1449 }
1450 /* sci->sc_curseg = NULL; */
1451}
1452
1453static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci,
1454 struct the_nilfs *nilfs, int err)
1455{
1456 if (unlikely(err)) {
1457 nilfs_segctor_free_incomplete_segments(sci, nilfs);
1458 if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
1459 int ret;
1460
1461 ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
1462 sci->sc_freesegs,
1463 sci->sc_nfreesegs,
1464 NULL);
1465 WARN_ON(ret); /* do not happen */
1466 } 1426 }
1427 if (atomic_read(&segbuf->sb_err) &&
1428 segbuf->sb_segnum != nilfs->ns_nextnum)
1429 /* Case 2: extended segment (!= next) failed */
1430 nilfs_sufile_set_error(sufile, segbuf->sb_segnum);
1431 prev = segbuf;
1467 } 1432 }
1468 nilfs_segctor_clear_segment_buffers(sci);
1469} 1433}
1470 1434
1471static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci, 1435static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci,
1472 struct inode *sufile) 1436 struct inode *sufile)
1473{ 1437{
1474 struct nilfs_segment_buffer *segbuf; 1438 struct nilfs_segment_buffer *segbuf;
1475 struct buffer_head *bh_su;
1476 struct nilfs_segment_usage *raw_su;
1477 unsigned long live_blocks; 1439 unsigned long live_blocks;
1478 int ret; 1440 int ret;
1479 1441
1480 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { 1442 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1481 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
1482 &raw_su, &bh_su);
1483 WARN_ON(ret); /* always succeed because bh_su is dirty */
1484 live_blocks = segbuf->sb_sum.nblocks + 1443 live_blocks = segbuf->sb_sum.nblocks +
1485 (segbuf->sb_pseg_start - segbuf->sb_fseg_start); 1444 (segbuf->sb_pseg_start - segbuf->sb_fseg_start);
1486 raw_su->su_lastmod = cpu_to_le64(sci->sc_seg_ctime); 1445 ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
1487 raw_su->su_nblocks = cpu_to_le32(live_blocks); 1446 live_blocks,
1488 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, 1447 sci->sc_seg_ctime);
1489 bh_su); 1448 WARN_ON(ret); /* always succeed because the segusage is dirty */
1490 } 1449 }
1491} 1450}
1492 1451
1493static void nilfs_segctor_cancel_segusage(struct nilfs_sc_info *sci, 1452static void nilfs_cancel_segusage(struct list_head *logs, struct inode *sufile)
1494 struct inode *sufile)
1495{ 1453{
1496 struct nilfs_segment_buffer *segbuf; 1454 struct nilfs_segment_buffer *segbuf;
1497 struct buffer_head *bh_su;
1498 struct nilfs_segment_usage *raw_su;
1499 int ret; 1455 int ret;
1500 1456
1501 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); 1457 segbuf = NILFS_FIRST_SEGBUF(logs);
1502 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum, 1458 ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
1503 &raw_su, &bh_su); 1459 segbuf->sb_pseg_start -
1504 WARN_ON(ret); /* always succeed because bh_su is dirty */ 1460 segbuf->sb_fseg_start, 0);
1505 raw_su->su_nblocks = cpu_to_le32(segbuf->sb_pseg_start - 1461 WARN_ON(ret); /* always succeed because the segusage is dirty */
1506 segbuf->sb_fseg_start);
1507 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, bh_su);
1508 1462
1509 list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { 1463 list_for_each_entry_continue(segbuf, logs, sb_list) {
1510 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum, 1464 ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
1511 &raw_su, &bh_su); 1465 0, 0);
1512 WARN_ON(ret); /* always succeed */ 1466 WARN_ON(ret); /* always succeed */
1513 raw_su->su_nblocks = 0;
1514 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
1515 bh_su);
1516 } 1467 }
1517} 1468}
1518 1469
@@ -1520,17 +1471,15 @@ static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci,
1520 struct nilfs_segment_buffer *last, 1471 struct nilfs_segment_buffer *last,
1521 struct inode *sufile) 1472 struct inode *sufile)
1522{ 1473{
1523 struct nilfs_segment_buffer *segbuf = last, *n; 1474 struct nilfs_segment_buffer *segbuf = last;
1524 int ret; 1475 int ret;
1525 1476
1526 list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs, 1477 list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
1527 sb_list) {
1528 list_del_init(&segbuf->sb_list);
1529 sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks; 1478 sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks;
1530 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); 1479 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
1531 WARN_ON(ret); 1480 WARN_ON(ret);
1532 nilfs_segbuf_free(segbuf);
1533 } 1481 }
1482 nilfs_truncate_logs(&sci->sc_segbufs, last);
1534} 1483}
1535 1484
1536 1485
@@ -1562,6 +1511,12 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1562 if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE) 1511 if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
1563 break; 1512 break;
1564 1513
1514 nilfs_clear_logs(&sci->sc_segbufs);
1515
1516 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1517 if (unlikely(err))
1518 return err;
1519
1565 if (sci->sc_stage.flags & NILFS_CF_SUFREED) { 1520 if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
1566 err = nilfs_sufile_cancel_freev(nilfs->ns_sufile, 1521 err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
1567 sci->sc_freesegs, 1522 sci->sc_freesegs,
@@ -1569,12 +1524,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1569 NULL); 1524 NULL);
1570 WARN_ON(err); /* do not happen */ 1525 WARN_ON(err); /* do not happen */
1571 } 1526 }
1572 nilfs_segctor_clear_segment_buffers(sci);
1573
1574 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1575 if (unlikely(err))
1576 return err;
1577
1578 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); 1527 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
1579 sci->sc_stage = prev_stage; 1528 sci->sc_stage = prev_stage;
1580 } 1529 }
@@ -1814,26 +1763,13 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
1814} 1763}
1815 1764
1816static int nilfs_segctor_write(struct nilfs_sc_info *sci, 1765static int nilfs_segctor_write(struct nilfs_sc_info *sci,
1817 struct backing_dev_info *bdi) 1766 struct the_nilfs *nilfs)
1818{ 1767{
1819 struct nilfs_segment_buffer *segbuf; 1768 int ret;
1820 struct nilfs_write_info wi;
1821 int err, res;
1822
1823 wi.sb = sci->sc_super;
1824 wi.bh_sr = sci->sc_super_root;
1825 wi.bdi = bdi;
1826
1827 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1828 nilfs_segbuf_prepare_write(segbuf, &wi);
1829 err = nilfs_segbuf_write(segbuf, &wi);
1830 1769
1831 res = nilfs_segbuf_wait(segbuf, &wi); 1770 ret = nilfs_write_logs(&sci->sc_segbufs, nilfs);
1832 err = err ? : res; 1771 list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs);
1833 if (err) 1772 return ret;
1834 return err;
1835 }
1836 return 0;
1837} 1773}
1838 1774
1839static void __nilfs_end_page_io(struct page *page, int err) 1775static void __nilfs_end_page_io(struct page *page, int err)
@@ -1911,15 +1847,17 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
1911 } 1847 }
1912} 1848}
1913 1849
1914static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci, 1850static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
1915 struct page *failed_page, int err) 1851 struct buffer_head *bh_sr, int err)
1916{ 1852{
1917 struct nilfs_segment_buffer *segbuf; 1853 struct nilfs_segment_buffer *segbuf;
1918 struct page *bd_page = NULL, *fs_page = NULL; 1854 struct page *bd_page = NULL, *fs_page = NULL;
1855 struct buffer_head *bh;
1919 1856
1920 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { 1857 if (list_empty(logs))
1921 struct buffer_head *bh; 1858 return;
1922 1859
1860 list_for_each_entry(segbuf, logs, sb_list) {
1923 list_for_each_entry(bh, &segbuf->sb_segsum_buffers, 1861 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1924 b_assoc_buffers) { 1862 b_assoc_buffers) {
1925 if (bh->b_page != bd_page) { 1863 if (bh->b_page != bd_page) {
@@ -1931,7 +1869,7 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
1931 1869
1932 list_for_each_entry(bh, &segbuf->sb_payload_buffers, 1870 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1933 b_assoc_buffers) { 1871 b_assoc_buffers) {
1934 if (bh == sci->sc_super_root) { 1872 if (bh == bh_sr) {
1935 if (bh->b_page != bd_page) { 1873 if (bh->b_page != bd_page) {
1936 end_page_writeback(bd_page); 1874 end_page_writeback(bd_page);
1937 bd_page = bh->b_page; 1875 bd_page = bh->b_page;
@@ -1941,7 +1879,7 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
1941 if (bh->b_page != fs_page) { 1879 if (bh->b_page != fs_page) {
1942 nilfs_end_page_io(fs_page, err); 1880 nilfs_end_page_io(fs_page, err);
1943 if (fs_page && fs_page == failed_page) 1881 if (fs_page && fs_page == failed_page)
1944 goto done; 1882 return;
1945 fs_page = bh->b_page; 1883 fs_page = bh->b_page;
1946 } 1884 }
1947 } 1885 }
@@ -1950,8 +1888,33 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
1950 end_page_writeback(bd_page); 1888 end_page_writeback(bd_page);
1951 1889
1952 nilfs_end_page_io(fs_page, err); 1890 nilfs_end_page_io(fs_page, err);
1953 done: 1891}
1892
1893static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
1894 struct the_nilfs *nilfs, int err)
1895{
1896 LIST_HEAD(logs);
1897 int ret;
1898
1899 list_splice_tail_init(&sci->sc_write_logs, &logs);
1900 ret = nilfs_wait_on_logs(&logs);
1901 nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret ? : err);
1902
1903 list_splice_tail_init(&sci->sc_segbufs, &logs);
1904 nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
1905 nilfs_free_incomplete_logs(&logs, nilfs);
1954 nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err); 1906 nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err);
1907
1908 if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
1909 ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
1910 sci->sc_freesegs,
1911 sci->sc_nfreesegs,
1912 NULL);
1913 WARN_ON(ret); /* do not happen */
1914 }
1915
1916 nilfs_destroy_logs(&logs);
1917 sci->sc_super_root = NULL;
1955} 1918}
1956 1919
1957static void nilfs_set_next_segment(struct the_nilfs *nilfs, 1920static void nilfs_set_next_segment(struct the_nilfs *nilfs,
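The new nilfs_segctor_abort_construction() above drains both log lists onto a private list head before tearing anything down, so the cleanup helpers operate on logs the segment constructor no longer owns. A condensed sketch of that splice-then-destroy shape, with error handling trimmed; all helpers named here appear in this diff:

static void abort_construction_sketch(struct nilfs_sc_info *sci,
                                      struct the_nilfs *nilfs, int err)
{
        LIST_HEAD(logs);        /* private list owned by this function */
        int ret;

        /* take over logs already submitted to disk, wait for them,
         * and roll their pages back with the failure code */
        list_splice_tail_init(&sci->sc_write_logs, &logs);
        ret = nilfs_wait_on_logs(&logs);
        nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret ? ret : err);

        /* take over logs that were never written and undo their
         * segment-usage reservations */
        list_splice_tail_init(&sci->sc_segbufs, &logs);
        nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
        nilfs_free_incomplete_logs(&logs, nilfs);

        /* everything now lives on "logs"; free it in one pass */
        nilfs_destroy_logs(&logs);
}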
@@ -1969,11 +1932,10 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1969{ 1932{
1970 struct nilfs_segment_buffer *segbuf; 1933 struct nilfs_segment_buffer *segbuf;
1971 struct page *bd_page = NULL, *fs_page = NULL; 1934 struct page *bd_page = NULL, *fs_page = NULL;
1972 struct nilfs_sb_info *sbi = sci->sc_sbi; 1935 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
1973 struct the_nilfs *nilfs = sbi->s_nilfs;
1974 int update_sr = (sci->sc_super_root != NULL); 1936 int update_sr = (sci->sc_super_root != NULL);
1975 1937
1976 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { 1938 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
1977 struct buffer_head *bh; 1939 struct buffer_head *bh;
1978 1940
1979 list_for_each_entry(bh, &segbuf->sb_segsum_buffers, 1941 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
@@ -2046,21 +2008,34 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
2046 2008
2047 sci->sc_nblk_inc += sci->sc_nblk_this_inc; 2009 sci->sc_nblk_inc += sci->sc_nblk_this_inc;
2048 2010
2049 segbuf = NILFS_LAST_SEGBUF(&sci->sc_segbufs); 2011 segbuf = NILFS_LAST_SEGBUF(&sci->sc_write_logs);
2050 nilfs_set_next_segment(nilfs, segbuf); 2012 nilfs_set_next_segment(nilfs, segbuf);
2051 2013
2052 if (update_sr) { 2014 if (update_sr) {
2053 nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, 2015 nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
2054 segbuf->sb_sum.seg_seq, nilfs->ns_cno++); 2016 segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
2055 sbi->s_super->s_dirt = 1; 2017 set_nilfs_sb_dirty(nilfs);
2056 2018
2057 clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); 2019 clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
2058 clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); 2020 clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
2059 set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); 2021 set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
2022 nilfs_segctor_clear_metadata_dirty(sci);
2060 } else 2023 } else
2061 clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); 2024 clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
2062} 2025}
2063 2026
2027static int nilfs_segctor_wait(struct nilfs_sc_info *sci)
2028{
2029 int ret;
2030
2031 ret = nilfs_wait_on_logs(&sci->sc_write_logs);
2032 if (!ret) {
2033 nilfs_segctor_complete_write(sci);
2034 nilfs_destroy_logs(&sci->sc_write_logs);
2035 }
2036 return ret;
2037}
2038
2064static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci, 2039static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
2065 struct nilfs_sb_info *sbi) 2040 struct nilfs_sb_info *sbi)
2066{ 2041{
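Taken together, the reworked nilfs_segctor_write() and the new nilfs_segctor_wait() split one construction pass into a submit phase and a completion phase: logs are built on sc_segbufs, spliced onto sc_write_logs while in flight, and destroyed only once nilfs_wait_on_logs() confirms they reached disk. A sketch of a single pass under that scheme, not a literal copy of nilfs_segctor_do_construct():

static int construct_pass_sketch(struct nilfs_sc_info *sci,
                                 struct the_nilfs *nilfs)
{
        int err;

        /* submit: logs collected on sci->sc_segbufs go to disk and are
         * spliced onto sci->sc_write_logs, freeing sc_segbufs for the
         * next collection pass */
        err = nilfs_segctor_write(sci, nilfs);
        if (err)
                return err;

        /* complete: block until the in-flight logs are on disk, end
         * page writeback, and release the segment buffers */
        return nilfs_segctor_wait(sci);
}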
@@ -2173,7 +2148,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2173 /* Avoid empty segment */ 2148 /* Avoid empty segment */
2174 if (sci->sc_stage.scnt == NILFS_ST_DONE && 2149 if (sci->sc_stage.scnt == NILFS_ST_DONE &&
2175 NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) { 2150 NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
2176 nilfs_segctor_end_construction(sci, nilfs, 1); 2151 nilfs_segctor_abort_construction(sci, nilfs, 1);
2177 goto out; 2152 goto out;
2178 } 2153 }
2179 2154
@@ -2187,7 +2162,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2187 if (has_sr) { 2162 if (has_sr) {
2188 err = nilfs_segctor_fill_in_checkpoint(sci); 2163 err = nilfs_segctor_fill_in_checkpoint(sci);
2189 if (unlikely(err)) 2164 if (unlikely(err))
2190 goto failed_to_make_up; 2165 goto failed_to_write;
2191 2166
2192 nilfs_segctor_fill_in_super_root(sci, nilfs); 2167 nilfs_segctor_fill_in_super_root(sci, nilfs);
2193 } 2168 }
@@ -2195,47 +2170,51 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2195 2170
2196 /* Write partial segments */ 2171 /* Write partial segments */
2197 err = nilfs_segctor_prepare_write(sci, &failed_page); 2172 err = nilfs_segctor_prepare_write(sci, &failed_page);
2198 if (unlikely(err)) 2173 if (err) {
2174 nilfs_abort_logs(&sci->sc_segbufs, failed_page,
2175 sci->sc_super_root, err);
2199 goto failed_to_write; 2176 goto failed_to_write;
2200 2177 }
2201 nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed); 2178 nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed);
2202 2179
2203 err = nilfs_segctor_write(sci, nilfs->ns_bdi); 2180 err = nilfs_segctor_write(sci, nilfs);
2204 if (unlikely(err)) 2181 if (unlikely(err))
2205 goto failed_to_write; 2182 goto failed_to_write;
2206 2183
2207 nilfs_segctor_complete_write(sci); 2184 if (sci->sc_stage.scnt == NILFS_ST_DONE ||
2208 2185 nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) {
2209 /* Commit segments */ 2186 /*
2210 if (has_sr) 2187 * At this point, we avoid double buffering
2211 nilfs_segctor_clear_metadata_dirty(sci); 2188 * for blocksize < pagesize because page dirty
2212 2189 * flag is turned off during write and dirty
2213 nilfs_segctor_end_construction(sci, nilfs, 0); 2190 * buffers are not properly collected for
2214 2191 * pages crossing over segments.
2192 */
2193 err = nilfs_segctor_wait(sci);
2194 if (err)
2195 goto failed_to_write;
2196 }
2215 } while (sci->sc_stage.scnt != NILFS_ST_DONE); 2197 } while (sci->sc_stage.scnt != NILFS_ST_DONE);
2216 2198
2199 sci->sc_super_root = NULL;
2200
2217 out: 2201 out:
2218 nilfs_segctor_destroy_segment_buffers(sci);
2219 nilfs_segctor_check_out_files(sci, sbi); 2202 nilfs_segctor_check_out_files(sci, sbi);
2220 return err; 2203 return err;
2221 2204
2222 failed_to_write: 2205 failed_to_write:
2223 nilfs_segctor_abort_write(sci, failed_page, err);
2224 nilfs_segctor_cancel_segusage(sci, nilfs->ns_sufile);
2225
2226 failed_to_make_up:
2227 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) 2206 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2228 nilfs_redirty_inodes(&sci->sc_dirty_files); 2207 nilfs_redirty_inodes(&sci->sc_dirty_files);
2229 2208
2230 failed: 2209 failed:
2231 if (nilfs_doing_gc()) 2210 if (nilfs_doing_gc())
2232 nilfs_redirty_inodes(&sci->sc_gc_inodes); 2211 nilfs_redirty_inodes(&sci->sc_gc_inodes);
2233 nilfs_segctor_end_construction(sci, nilfs, err); 2212 nilfs_segctor_abort_construction(sci, nilfs, err);
2234 goto out; 2213 goto out;
2235} 2214}
2236 2215
2237/** 2216/**
2238 * nilfs_secgtor_start_timer - set timer of background write 2217 * nilfs_segctor_start_timer - set timer of background write
2239 * @sci: nilfs_sc_info 2218 * @sci: nilfs_sc_info
2240 * 2219 *
2241 * If the timer has already been set, it ignores the new request. 2220 * If the timer has already been set, it ignores the new request.
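The conditional wait added to the construction loop (see the in-line comment in the hunk above) means the submit and completion phases only overlap on filesystems whose block size equals the page size; otherwise every pass drains the write queue before collecting again. Worked out for the common 4 KiB page case, assuming PAGE_CACHE_SHIFT == 12:

/*
 * With 4 KiB pages (PAGE_CACHE_SHIFT == 12):
 *
 *   blocksize 4096 -> ns_blocksize_bits == 12
 *      -> nilfs_segctor_wait() runs only once the stage reaches
 *         NILFS_ST_DONE, so intermediate passes pipeline their writes
 *
 *   blocksize 1024 -> ns_blocksize_bits == 10
 *      -> nilfs_segctor_wait() runs after every pass; no double
 *         buffering, avoiding the dirty-page collection problem the
 *         comment above describes for pages spanning segments
 */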
@@ -2440,43 +2419,43 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
2440 return err; 2419 return err;
2441} 2420}
2442 2421
2443struct nilfs_segctor_req {
2444 int mode;
2445 __u32 seq_accepted;
2446 int sc_err; /* construction failure */
2447 int sb_err; /* super block writeback failure */
2448};
2449
2450#define FLUSH_FILE_BIT (0x1) /* data file only */ 2422#define FLUSH_FILE_BIT (0x1) /* data file only */
2451#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */ 2423#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */
2452 2424
2453static void nilfs_segctor_accept(struct nilfs_sc_info *sci, 2425/**
2454 struct nilfs_segctor_req *req) 2426 * nilfs_segctor_accept - record accepted sequence count of log-write requests
2427 * @sci: segment constructor object
2428 */
2429static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
2455{ 2430{
2456 req->sc_err = req->sb_err = 0;
2457 spin_lock(&sci->sc_state_lock); 2431 spin_lock(&sci->sc_state_lock);
2458 req->seq_accepted = sci->sc_seq_request; 2432 sci->sc_seq_accepted = sci->sc_seq_request;
2459 spin_unlock(&sci->sc_state_lock); 2433 spin_unlock(&sci->sc_state_lock);
2460 2434
2461 if (sci->sc_timer) 2435 if (sci->sc_timer)
2462 del_timer_sync(sci->sc_timer); 2436 del_timer_sync(sci->sc_timer);
2463} 2437}
2464 2438
2465static void nilfs_segctor_notify(struct nilfs_sc_info *sci, 2439/**
2466 struct nilfs_segctor_req *req) 2440 * nilfs_segctor_notify - notify the result of request to caller threads
2441 * @sci: segment constructor object
2442 * @mode: mode of log forming
2443 * @err: error code to be notified
2444 */
2445static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
2467{ 2446{
2468 /* Clear requests (even when the construction failed) */ 2447 /* Clear requests (even when the construction failed) */
2469 spin_lock(&sci->sc_state_lock); 2448 spin_lock(&sci->sc_state_lock);
2470 2449
2471 if (req->mode == SC_LSEG_SR) { 2450 if (mode == SC_LSEG_SR) {
2472 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; 2451 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
2473 sci->sc_seq_done = req->seq_accepted; 2452 sci->sc_seq_done = sci->sc_seq_accepted;
2474 nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err); 2453 nilfs_segctor_wakeup(sci, err);
2475 sci->sc_flush_request = 0; 2454 sci->sc_flush_request = 0;
2476 } else { 2455 } else {
2477 if (req->mode == SC_FLUSH_FILE) 2456 if (mode == SC_FLUSH_FILE)
2478 sci->sc_flush_request &= ~FLUSH_FILE_BIT; 2457 sci->sc_flush_request &= ~FLUSH_FILE_BIT;
2479 else if (req->mode == SC_FLUSH_DAT) 2458 else if (mode == SC_FLUSH_DAT)
2480 sci->sc_flush_request &= ~FLUSH_DAT_BIT; 2459 sci->sc_flush_request &= ~FLUSH_DAT_BIT;
2481 2460
2482 /* re-enable timer if checkpoint creation was not done */ 2461 /* re-enable timer if checkpoint creation was not done */
@@ -2487,30 +2466,37 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci,
2487 spin_unlock(&sci->sc_state_lock); 2466 spin_unlock(&sci->sc_state_lock);
2488} 2467}
2489 2468
2490static int nilfs_segctor_construct(struct nilfs_sc_info *sci, 2469/**
2491 struct nilfs_segctor_req *req) 2470 * nilfs_segctor_construct - form logs and write them to disk
2471 * @sci: segment constructor object
2472 * @mode: mode of log forming
2473 */
2474static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
2492{ 2475{
2493 struct nilfs_sb_info *sbi = sci->sc_sbi; 2476 struct nilfs_sb_info *sbi = sci->sc_sbi;
2494 struct the_nilfs *nilfs = sbi->s_nilfs; 2477 struct the_nilfs *nilfs = sbi->s_nilfs;
2495 int err = 0; 2478 int err = 0;
2496 2479
2480 nilfs_segctor_accept(sci);
2481
2497 if (nilfs_discontinued(nilfs)) 2482 if (nilfs_discontinued(nilfs))
2498 req->mode = SC_LSEG_SR; 2483 mode = SC_LSEG_SR;
2499 if (!nilfs_segctor_confirm(sci)) { 2484 if (!nilfs_segctor_confirm(sci))
2500 err = nilfs_segctor_do_construct(sci, req->mode); 2485 err = nilfs_segctor_do_construct(sci, mode);
2501 req->sc_err = err; 2486
2502 }
2503 if (likely(!err)) { 2487 if (likely(!err)) {
2504 if (req->mode != SC_FLUSH_DAT) 2488 if (mode != SC_FLUSH_DAT)
2505 atomic_set(&nilfs->ns_ndirtyblks, 0); 2489 atomic_set(&nilfs->ns_ndirtyblks, 0);
2506 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && 2490 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
2507 nilfs_discontinued(nilfs)) { 2491 nilfs_discontinued(nilfs)) {
2508 down_write(&nilfs->ns_sem); 2492 down_write(&nilfs->ns_sem);
2509 req->sb_err = nilfs_commit_super(sbi, 2493 err = nilfs_commit_super(
2510 nilfs_altsb_need_update(nilfs)); 2494 sbi, nilfs_altsb_need_update(nilfs));
2511 up_write(&nilfs->ns_sem); 2495 up_write(&nilfs->ns_sem);
2512 } 2496 }
2513 } 2497 }
2498
2499 nilfs_segctor_notify(sci, mode, err);
2514 return err; 2500 return err;
2515} 2501}
2516 2502
@@ -2541,7 +2527,6 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2541 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2527 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2542 struct the_nilfs *nilfs = sbi->s_nilfs; 2528 struct the_nilfs *nilfs = sbi->s_nilfs;
2543 struct nilfs_transaction_info ti; 2529 struct nilfs_transaction_info ti;
2544 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2545 int err; 2530 int err;
2546 2531
2547 if (unlikely(!sci)) 2532 if (unlikely(!sci))
@@ -2559,13 +2544,11 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2559 2544
2560 sci->sc_freesegs = kbufs[4]; 2545 sci->sc_freesegs = kbufs[4];
2561 sci->sc_nfreesegs = argv[4].v_nmembs; 2546 sci->sc_nfreesegs = argv[4].v_nmembs;
2562 list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev); 2547 list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes);
2563 2548
2564 for (;;) { 2549 for (;;) {
2565 nilfs_segctor_accept(sci, &req); 2550 err = nilfs_segctor_construct(sci, SC_LSEG_SR);
2566 err = nilfs_segctor_construct(sci, &req);
2567 nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes); 2551 nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes);
2568 nilfs_segctor_notify(sci, &req);
2569 2552
2570 if (likely(!err)) 2553 if (likely(!err))
2571 break; 2554 break;
@@ -2575,6 +2558,16 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2575 set_current_state(TASK_INTERRUPTIBLE); 2558 set_current_state(TASK_INTERRUPTIBLE);
2576 schedule_timeout(sci->sc_interval); 2559 schedule_timeout(sci->sc_interval);
2577 } 2560 }
2561 if (nilfs_test_opt(sbi, DISCARD)) {
2562 int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
2563 sci->sc_nfreesegs);
2564 if (ret) {
2565 printk(KERN_WARNING
2566 "NILFS warning: error %d on discard request, "
2567 "turning discards off for the device\n", ret);
2568 nilfs_clear_opt(sbi, DISCARD);
2569 }
2570 }
2578 2571
2579 out_unlock: 2572 out_unlock:
2580 sci->sc_freesegs = NULL; 2573 sci->sc_freesegs = NULL;
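The block added above wires the cleaner into the new discard support: after segments are freed, nilfs_clean_segments() forwards them to the device, and the first failure clears the DISCARD flag so this mount stops issuing requests. A condensed sketch of the disable-on-error shape; note that nilfs_clear_opt() only touches the in-memory mount options, so the effect is per-mount, not persistent:

if (nilfs_test_opt(sbi, DISCARD)) {
        int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
                                         sci->sc_nfreesegs);
        if (ret)                        /* e.g. the device rejected it */
                nilfs_clear_opt(sbi, DISCARD);  /* in-memory only; a
                                                 * remount can retry */
}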
@@ -2588,13 +2581,9 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
2588{ 2581{
2589 struct nilfs_sb_info *sbi = sci->sc_sbi; 2582 struct nilfs_sb_info *sbi = sci->sc_sbi;
2590 struct nilfs_transaction_info ti; 2583 struct nilfs_transaction_info ti;
2591 struct nilfs_segctor_req req = { .mode = mode };
2592 2584
2593 nilfs_transaction_lock(sbi, &ti, 0); 2585 nilfs_transaction_lock(sbi, &ti, 0);
2594 2586 nilfs_segctor_construct(sci, mode);
2595 nilfs_segctor_accept(sci, &req);
2596 nilfs_segctor_construct(sci, &req);
2597 nilfs_segctor_notify(sci, &req);
2598 2587
2599 /* 2588 /*
2600 * Unclosed segment should be retried. We do this using sc_timer. 2589 * Unclosed segment should be retried. We do this using sc_timer.
@@ -2650,6 +2639,7 @@ static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
2650static int nilfs_segctor_thread(void *arg) 2639static int nilfs_segctor_thread(void *arg)
2651{ 2640{
2652 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; 2641 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2642 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
2653 struct timer_list timer; 2643 struct timer_list timer;
2654 int timeout = 0; 2644 int timeout = 0;
2655 2645
@@ -2695,7 +2685,6 @@ static int nilfs_segctor_thread(void *arg)
2695 } else { 2685 } else {
2696 DEFINE_WAIT(wait); 2686 DEFINE_WAIT(wait);
2697 int should_sleep = 1; 2687 int should_sleep = 1;
2698 struct the_nilfs *nilfs;
2699 2688
2700 prepare_to_wait(&sci->sc_wait_daemon, &wait, 2689 prepare_to_wait(&sci->sc_wait_daemon, &wait,
2701 TASK_INTERRUPTIBLE); 2690 TASK_INTERRUPTIBLE);
@@ -2716,8 +2705,8 @@ static int nilfs_segctor_thread(void *arg)
2716 finish_wait(&sci->sc_wait_daemon, &wait); 2705 finish_wait(&sci->sc_wait_daemon, &wait);
2717 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && 2706 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2718 time_after_eq(jiffies, sci->sc_timer->expires)); 2707 time_after_eq(jiffies, sci->sc_timer->expires));
2719 nilfs = sci->sc_sbi->s_nilfs; 2708
2720 if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs)) 2709 if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs))
2721 set_nilfs_discontinued(nilfs); 2710 set_nilfs_discontinued(nilfs);
2722 } 2711 }
2723 goto loop; 2712 goto loop;
@@ -2788,6 +2777,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
2788 spin_lock_init(&sci->sc_state_lock); 2777 spin_lock_init(&sci->sc_state_lock);
2789 INIT_LIST_HEAD(&sci->sc_dirty_files); 2778 INIT_LIST_HEAD(&sci->sc_dirty_files);
2790 INIT_LIST_HEAD(&sci->sc_segbufs); 2779 INIT_LIST_HEAD(&sci->sc_segbufs);
2780 INIT_LIST_HEAD(&sci->sc_write_logs);
2791 INIT_LIST_HEAD(&sci->sc_gc_inodes); 2781 INIT_LIST_HEAD(&sci->sc_gc_inodes);
2792 INIT_LIST_HEAD(&sci->sc_copied_buffers); 2782 INIT_LIST_HEAD(&sci->sc_copied_buffers);
2793 2783
@@ -2811,12 +2801,9 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2811 do { 2801 do {
2812 struct nilfs_sb_info *sbi = sci->sc_sbi; 2802 struct nilfs_sb_info *sbi = sci->sc_sbi;
2813 struct nilfs_transaction_info ti; 2803 struct nilfs_transaction_info ti;
2814 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2815 2804
2816 nilfs_transaction_lock(sbi, &ti, 0); 2805 nilfs_transaction_lock(sbi, &ti, 0);
2817 nilfs_segctor_accept(sci, &req); 2806 ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
2818 ret = nilfs_segctor_construct(sci, &req);
2819 nilfs_segctor_notify(sci, &req);
2820 nilfs_transaction_unlock(sbi); 2807 nilfs_transaction_unlock(sbi);
2821 2808
2822 } while (ret && retrycount-- > 0); 2809 } while (ret && retrycount-- > 0);
@@ -2843,7 +2830,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2843 || sci->sc_seq_request != sci->sc_seq_done); 2830 || sci->sc_seq_request != sci->sc_seq_done);
2844 spin_unlock(&sci->sc_state_lock); 2831 spin_unlock(&sci->sc_state_lock);
2845 2832
2846 if (flag || nilfs_segctor_confirm(sci)) 2833 if (flag || !nilfs_segctor_confirm(sci))
2847 nilfs_segctor_write_out(sci); 2834 nilfs_segctor_write_out(sci);
2848 2835
2849 WARN_ON(!list_empty(&sci->sc_copied_buffers)); 2836 WARN_ON(!list_empty(&sci->sc_copied_buffers));
@@ -2855,6 +2842,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2855 } 2842 }
2856 2843
2857 WARN_ON(!list_empty(&sci->sc_segbufs)); 2844 WARN_ON(!list_empty(&sci->sc_segbufs));
2845 WARN_ON(!list_empty(&sci->sc_write_logs));
2858 2846
2859 down_write(&sbi->s_nilfs->ns_segctor_sem); 2847 down_write(&sbi->s_nilfs->ns_segctor_sem);
2860 2848
@@ -2866,7 +2854,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2866 * @sbi: nilfs_sb_info 2854 * @sbi: nilfs_sb_info
2867 * 2855 *
2868 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info, 2856 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
2869 * initilizes it, and starts the segment constructor. 2857 * initializes it, and starts the segment constructor.
2870 * 2858 *
2871 * Return Value: On success, 0 is returned. On error, one of the following 2859 * Return Value: On success, 0 is returned. On error, one of the following
2872 * negative error code is returned. 2860 * negative error code is returned.
@@ -2878,8 +2866,15 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2878 struct the_nilfs *nilfs = sbi->s_nilfs; 2866 struct the_nilfs *nilfs = sbi->s_nilfs;
2879 int err; 2867 int err;
2880 2868
2881 /* Each field of nilfs_segctor is cleared through the initialization 2869 if (NILFS_SC(sbi)) {
2882 of super-block info */ 2870 /*
2871 * This happens if the filesystem was remounted
2872 * read/write after nilfs_error degenerated it into a
2873 * read-only mount.
2874 */
2875 nilfs_detach_segment_constructor(sbi);
2876 }
2877
2883 sbi->s_sc_info = nilfs_segctor_new(sbi); 2878 sbi->s_sc_info = nilfs_segctor_new(sbi);
2884 if (!sbi->s_sc_info) 2879 if (!sbi->s_sc_info)
2885 return -ENOMEM; 2880 return -ENOMEM;
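The guard added to nilfs_attach_segment_constructor() pairs with the super.c hunk further down, where nilfs_error() no longer detaches the constructor itself. After an error degrades the mount to read-only, a later read/write remount therefore finds a stale constructor and must replace it; condensed:

/* sketch of the remount-after-error path this guard handles */
if (NILFS_SC(sbi))              /* constructor left over from the
                                 * degraded (read-only) period */
        nilfs_detach_segment_constructor(sbi);

sbi->s_sc_info = nilfs_segctor_new(sbi);        /* start a fresh one */
if (!sbi->s_sc_info)
        return -ENOMEM;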
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 0d2a475a741b..82dfd6a686b9 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -30,7 +30,7 @@
30#include "sb.h" 30#include "sb.h"
31 31
32/** 32/**
33 * struct nilfs_recovery_info - Recovery infomation 33 * struct nilfs_recovery_info - Recovery information
34 * @ri_need_recovery: Recovery status 34 * @ri_need_recovery: Recovery status
35 * @ri_super_root: Block number of the last super root 35 * @ri_super_root: Block number of the last super root
36 * @ri_cno: Number of the last checkpoint 36 * @ri_cno: Number of the last checkpoint
@@ -71,7 +71,7 @@ struct nilfs_recovery_info {
71 */ 71 */
72struct nilfs_cstage { 72struct nilfs_cstage {
73 int scnt; 73 int scnt;
74 unsigned flags; 74 unsigned flags;
75 struct nilfs_inode_info *dirty_file_ptr; 75 struct nilfs_inode_info *dirty_file_ptr;
76 struct nilfs_inode_info *gc_inode_ptr; 76 struct nilfs_inode_info *gc_inode_ptr;
77}; 77};
@@ -97,6 +97,7 @@ struct nilfs_segsum_pointer {
97 * @sc_dsync_start: start byte offset of data pages 97 * @sc_dsync_start: start byte offset of data pages
98 * @sc_dsync_end: end byte offset of data pages (inclusive) 98 * @sc_dsync_end: end byte offset of data pages (inclusive)
99 * @sc_segbufs: List of segment buffers 99 * @sc_segbufs: List of segment buffers
100 * @sc_write_logs: List of segment buffers to hold logs under writing
100 * @sc_segbuf_nblocks: Number of available blocks in segment buffers. 101 * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
101 * @sc_curseg: Current segment buffer 102 * @sc_curseg: Current segment buffer
102 * @sc_super_root: Pointer to the super root buffer 103 * @sc_super_root: Pointer to the super root buffer
@@ -115,6 +116,7 @@ struct nilfs_segsum_pointer {
115 * @sc_wait_daemon: Daemon wait queue 116 * @sc_wait_daemon: Daemon wait queue
116 * @sc_wait_task: Start/end wait queue to control segctord task 117 * @sc_wait_task: Start/end wait queue to control segctord task
117 * @sc_seq_request: Request counter 118 * @sc_seq_request: Request counter
119 * @sc_seq_accepted: Accepted request count
118 * @sc_seq_done: Completion counter 120 * @sc_seq_done: Completion counter
119 * @sc_sync: Request of explicit sync operation 121 * @sc_sync: Request of explicit sync operation
120 * @sc_interval: Timeout value of background construction 122 * @sc_interval: Timeout value of background construction
@@ -143,6 +145,7 @@ struct nilfs_sc_info {
143 145
144 /* Segment buffers */ 146 /* Segment buffers */
145 struct list_head sc_segbufs; 147 struct list_head sc_segbufs;
148 struct list_head sc_write_logs;
146 unsigned long sc_segbuf_nblocks; 149 unsigned long sc_segbuf_nblocks;
147 struct nilfs_segment_buffer *sc_curseg; 150 struct nilfs_segment_buffer *sc_curseg;
148 struct buffer_head *sc_super_root; 151 struct buffer_head *sc_super_root;
@@ -167,6 +170,7 @@ struct nilfs_sc_info {
167 wait_queue_head_t sc_wait_task; 170 wait_queue_head_t sc_wait_task;
168 171
169 __u32 sc_seq_request; 172 __u32 sc_seq_request;
173 __u32 sc_seq_accepted;
170 __u32 sc_seq_done; 174 __u32 sc_seq_done;
171 175
172 int sc_sync; 176 int sc_sync;
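The new sc_seq_accepted field completes a three-counter handshake between requesters and the segctor thread; all three counters are updated under sc_state_lock. Schematically, with names as in the struct above:

/*
 *  requester                      segctor thread
 *  ---------                      --------------
 *  sc_seq_request++               sc_seq_accepted = sc_seq_request
 *  kick the thread                   (nilfs_segctor_accept)
 *  sleep until sc_seq_done        ... build and write logs ...
 *  catches up with the number     sc_seq_done = sc_seq_accepted,
 *  it requested                   wake requesters
 *                                    (nilfs_segctor_notify, SC_LSEG_SR)
 */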
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 37994d4a59cc..3c6cc6005c2e 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -18,7 +18,7 @@
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 * 19 *
20 * Written by Koji Sato <koji@osrg.net>. 20 * Written by Koji Sato <koji@osrg.net>.
21 * Rivised by Ryusuke Konishi <ryusuke@osrg.net>. 21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
22 */ 22 */
23 23
24#include <linux/kernel.h> 24#include <linux/kernel.h>
@@ -31,6 +31,16 @@
31#include "sufile.h" 31#include "sufile.h"
32 32
33 33
34struct nilfs_sufile_info {
35 struct nilfs_mdt_info mi;
36 unsigned long ncleansegs;
37};
38
39static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile)
40{
41 return (struct nilfs_sufile_info *)NILFS_MDT(sufile);
42}
43
34static inline unsigned long 44static inline unsigned long
35nilfs_sufile_segment_usages_per_block(const struct inode *sufile) 45nilfs_sufile_segment_usages_per_block(const struct inode *sufile)
36{ 46{
@@ -62,14 +72,6 @@ nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr,
62 max - curr + 1); 72 max - curr + 1);
63} 73}
64 74
65static inline struct nilfs_sufile_header *
66nilfs_sufile_block_get_header(const struct inode *sufile,
67 struct buffer_head *bh,
68 void *kaddr)
69{
70 return kaddr + bh_offset(bh);
71}
72
73static struct nilfs_segment_usage * 75static struct nilfs_segment_usage *
74nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum, 76nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum,
75 struct buffer_head *bh, void *kaddr) 77 struct buffer_head *bh, void *kaddr)
@@ -110,6 +112,15 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
110} 112}
111 113
112/** 114/**
115 * nilfs_sufile_get_ncleansegs - return the number of clean segments
116 * @sufile: inode of segment usage file
117 */
118unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile)
119{
120 return NILFS_SUI(sufile)->ncleansegs;
121}
122
123/**
113 * nilfs_sufile_updatev - modify multiple segment usages at a time 124 * nilfs_sufile_updatev - modify multiple segment usages at a time
114 * @sufile: inode of segment usage file 125 * @sufile: inode of segment usage file
115 * @segnumv: array of segment numbers 126 * @segnumv: array of segment numbers
@@ -270,7 +281,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
270 if (ret < 0) 281 if (ret < 0)
271 goto out_sem; 282 goto out_sem;
272 kaddr = kmap_atomic(header_bh->b_page, KM_USER0); 283 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
273 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); 284 header = kaddr + bh_offset(header_bh);
274 ncleansegs = le64_to_cpu(header->sh_ncleansegs); 285 ncleansegs = le64_to_cpu(header->sh_ncleansegs);
275 last_alloc = le64_to_cpu(header->sh_last_alloc); 286 last_alloc = le64_to_cpu(header->sh_last_alloc);
276 kunmap_atomic(kaddr, KM_USER0); 287 kunmap_atomic(kaddr, KM_USER0);
@@ -302,13 +313,13 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
302 kunmap_atomic(kaddr, KM_USER0); 313 kunmap_atomic(kaddr, KM_USER0);
303 314
304 kaddr = kmap_atomic(header_bh->b_page, KM_USER0); 315 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
305 header = nilfs_sufile_block_get_header( 316 header = kaddr + bh_offset(header_bh);
306 sufile, header_bh, kaddr);
307 le64_add_cpu(&header->sh_ncleansegs, -1); 317 le64_add_cpu(&header->sh_ncleansegs, -1);
308 le64_add_cpu(&header->sh_ndirtysegs, 1); 318 le64_add_cpu(&header->sh_ndirtysegs, 1);
309 header->sh_last_alloc = cpu_to_le64(segnum); 319 header->sh_last_alloc = cpu_to_le64(segnum);
310 kunmap_atomic(kaddr, KM_USER0); 320 kunmap_atomic(kaddr, KM_USER0);
311 321
322 NILFS_SUI(sufile)->ncleansegs--;
312 nilfs_mdt_mark_buffer_dirty(header_bh); 323 nilfs_mdt_mark_buffer_dirty(header_bh);
313 nilfs_mdt_mark_buffer_dirty(su_bh); 324 nilfs_mdt_mark_buffer_dirty(su_bh);
314 nilfs_mdt_mark_dirty(sufile); 325 nilfs_mdt_mark_dirty(sufile);
@@ -351,6 +362,8 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
351 kunmap_atomic(kaddr, KM_USER0); 362 kunmap_atomic(kaddr, KM_USER0);
352 363
353 nilfs_sufile_mod_counter(header_bh, -1, 1); 364 nilfs_sufile_mod_counter(header_bh, -1, 1);
365 NILFS_SUI(sufile)->ncleansegs--;
366
354 nilfs_mdt_mark_buffer_dirty(su_bh); 367 nilfs_mdt_mark_buffer_dirty(su_bh);
355 nilfs_mdt_mark_dirty(sufile); 368 nilfs_mdt_mark_dirty(sufile);
356} 369}
@@ -380,6 +393,8 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
380 kunmap_atomic(kaddr, KM_USER0); 393 kunmap_atomic(kaddr, KM_USER0);
381 394
382 nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1); 395 nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
396 NILFS_SUI(sufile)->ncleansegs -= clean;
397
383 nilfs_mdt_mark_buffer_dirty(su_bh); 398 nilfs_mdt_mark_buffer_dirty(su_bh);
384 nilfs_mdt_mark_dirty(sufile); 399 nilfs_mdt_mark_dirty(sufile);
385} 400}
@@ -409,79 +424,65 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
409 nilfs_mdt_mark_buffer_dirty(su_bh); 424 nilfs_mdt_mark_buffer_dirty(su_bh);
410 425
411 nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0); 426 nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
427 NILFS_SUI(sufile)->ncleansegs++;
428
412 nilfs_mdt_mark_dirty(sufile); 429 nilfs_mdt_mark_dirty(sufile);
413} 430}
414 431
415/** 432/**
416 * nilfs_sufile_get_segment_usage - get a segment usage 433 * nilfs_sufile_mark_dirty - mark the buffer having a segment usage dirty
417 * @sufile: inode of segment usage file 434 * @sufile: inode of segment usage file
418 * @segnum: segment number 435 * @segnum: segment number
419 * @sup: pointer to segment usage
420 * @bhp: pointer to buffer head
421 *
422 * Description: nilfs_sufile_get_segment_usage() acquires the segment usage
423 * specified by @segnum.
424 *
425 * Return Value: On success, 0 is returned, and the segment usage and the
426 * buffer head of the buffer on which the segment usage is located are stored
427 * in the place pointed by @sup and @bhp, respectively. On error, one of the
428 * following negative error codes is returned.
429 *
430 * %-EIO - I/O error.
431 *
432 * %-ENOMEM - Insufficient amount of memory available.
433 *
434 * %-EINVAL - Invalid segment usage number.
435 */ 436 */
436int nilfs_sufile_get_segment_usage(struct inode *sufile, __u64 segnum, 437int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
437 struct nilfs_segment_usage **sup,
438 struct buffer_head **bhp)
439{ 438{
440 struct buffer_head *bh; 439 struct buffer_head *bh;
441 struct nilfs_segment_usage *su;
442 void *kaddr;
443 int ret; 440 int ret;
444 441
445 /* segnum is 0 origin */ 442 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
446 if (segnum >= nilfs_sufile_get_nsegments(sufile)) 443 if (!ret) {
447 return -EINVAL; 444 nilfs_mdt_mark_buffer_dirty(bh);
448 down_write(&NILFS_MDT(sufile)->mi_sem); 445 nilfs_mdt_mark_dirty(sufile);
449 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &bh);
450 if (ret < 0)
451 goto out_sem;
452 kaddr = kmap(bh->b_page);
453 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
454 if (nilfs_segment_usage_error(su)) {
455 kunmap(bh->b_page);
456 brelse(bh); 446 brelse(bh);
457 ret = -EINVAL;
458 goto out_sem;
459 } 447 }
460
461 if (sup != NULL)
462 *sup = su;
463 *bhp = bh;
464
465 out_sem:
466 up_write(&NILFS_MDT(sufile)->mi_sem);
467 return ret; 448 return ret;
468} 449}
469 450
470/** 451/**
471 * nilfs_sufile_put_segment_usage - put a segment usage 452 * nilfs_sufile_set_segment_usage - set usage of a segment
472 * @sufile: inode of segment usage file 453 * @sufile: inode of segment usage file
473 * @segnum: segment number 454 * @segnum: segment number
474 * @bh: buffer head 455 * @nblocks: number of live blocks in the segment
475 * 456 * @modtime: modification time (optional)
476 * Description: nilfs_sufile_put_segment_usage() releases the segment usage
477 * specified by @segnum. @bh must be the buffer head which have been returned
478 * by a previous call to nilfs_sufile_get_segment_usage() with @segnum.
479 */ 457 */
480void nilfs_sufile_put_segment_usage(struct inode *sufile, __u64 segnum, 458int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
481 struct buffer_head *bh) 459 unsigned long nblocks, time_t modtime)
482{ 460{
483 kunmap(bh->b_page); 461 struct buffer_head *bh;
462 struct nilfs_segment_usage *su;
463 void *kaddr;
464 int ret;
465
466 down_write(&NILFS_MDT(sufile)->mi_sem);
467 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
468 if (ret < 0)
469 goto out_sem;
470
471 kaddr = kmap_atomic(bh->b_page, KM_USER0);
472 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
473 WARN_ON(nilfs_segment_usage_error(su));
474 if (modtime)
475 su->su_lastmod = cpu_to_le64(modtime);
476 su->su_nblocks = cpu_to_le32(nblocks);
477 kunmap_atomic(kaddr, KM_USER0);
478
479 nilfs_mdt_mark_buffer_dirty(bh);
480 nilfs_mdt_mark_dirty(sufile);
484 brelse(bh); 481 brelse(bh);
482
483 out_sem:
484 up_write(&NILFS_MDT(sufile)->mi_sem);
485 return ret;
485} 486}
486 487
487/** 488/**
@@ -515,7 +516,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
515 goto out_sem; 516 goto out_sem;
516 517
517 kaddr = kmap_atomic(header_bh->b_page, KM_USER0); 518 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
518 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); 519 header = kaddr + bh_offset(header_bh);
519 sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile); 520 sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile);
520 sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); 521 sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs);
521 sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs); 522 sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs);
@@ -532,33 +533,6 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
532 return ret; 533 return ret;
533} 534}
534 535
535/**
536 * nilfs_sufile_get_ncleansegs - get the number of clean segments
537 * @sufile: inode of segment usage file
538 * @nsegsp: pointer to the number of clean segments
539 *
540 * Description: nilfs_sufile_get_ncleansegs() acquires the number of clean
541 * segments.
542 *
543 * Return Value: On success, 0 is returned and the number of clean segments is
544 * stored in the place pointed by @nsegsp. On error, one of the following
545 * negative error codes is returned.
546 *
547 * %-EIO - I/O error.
548 *
549 * %-ENOMEM - Insufficient amount of memory available.
550 */
551int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp)
552{
553 struct nilfs_sustat sustat;
554 int ret;
555
556 ret = nilfs_sufile_get_stat(sufile, &sustat);
557 if (ret == 0)
558 *nsegsp = sustat.ss_ncleansegs;
559 return ret;
560}
561
562void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, 536void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
563 struct buffer_head *header_bh, 537 struct buffer_head *header_bh,
564 struct buffer_head *su_bh) 538 struct buffer_head *su_bh)
@@ -577,8 +551,10 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
577 nilfs_segment_usage_set_error(su); 551 nilfs_segment_usage_set_error(su);
578 kunmap_atomic(kaddr, KM_USER0); 552 kunmap_atomic(kaddr, KM_USER0);
579 553
580 if (suclean) 554 if (suclean) {
581 nilfs_sufile_mod_counter(header_bh, -1, 0); 555 nilfs_sufile_mod_counter(header_bh, -1, 0);
556 NILFS_SUI(sufile)->ncleansegs--;
557 }
582 nilfs_mdt_mark_buffer_dirty(su_bh); 558 nilfs_mdt_mark_buffer_dirty(su_bh);
583 nilfs_mdt_mark_dirty(sufile); 559 nilfs_mdt_mark_dirty(sufile);
584} 560}
@@ -657,3 +633,48 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
657 up_read(&NILFS_MDT(sufile)->mi_sem); 633 up_read(&NILFS_MDT(sufile)->mi_sem);
658 return ret; 634 return ret;
659} 635}
636
637/**
638 * nilfs_sufile_read - read sufile inode
639 * @sufile: sufile inode
640 * @raw_inode: on-disk sufile inode
641 */
642int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode)
643{
644 struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
645 struct buffer_head *header_bh;
646 struct nilfs_sufile_header *header;
647 void *kaddr;
648 int ret;
649
650 ret = nilfs_read_inode_common(sufile, raw_inode);
651 if (ret < 0)
652 return ret;
653
654 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
655 if (!ret) {
656 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
657 header = kaddr + bh_offset(header_bh);
658 sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
659 kunmap_atomic(kaddr, KM_USER0);
660 brelse(header_bh);
661 }
662 return ret;
663}
664
665/**
666 * nilfs_sufile_new - create sufile
667 * @nilfs: nilfs object
668 * @susize: size of a segment usage entry
669 */
670struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize)
671{
672 struct inode *sufile;
673
674 sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO,
675 sizeof(struct nilfs_sufile_info));
676 if (sufile)
677 nilfs_mdt_set_entry_size(sufile, susize,
678 sizeof(struct nilfs_sufile_header));
679 return sufile;
680}
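The sufile interface change is easiest to see from the caller's side: the old get/put pair handed out a kmapped entry for the caller to edit, while the new calls keep mapping, dirtying and locking inside sufile.c, and the clean-segment count is served from the counter cached by nilfs_sufile_read(). A hypothetical caller, before and after; sketch only, with the surrounding error handling elided:

struct nilfs_segment_usage *su;
struct buffer_head *bh;
unsigned long nclean;
int err;

/* before: the caller edits the mapped segment usage entry itself */
err = nilfs_sufile_get_segment_usage(sufile, segnum, &su, &bh);
if (!err) {
        su->su_lastmod = cpu_to_le64(modtime);
        su->su_nblocks = cpu_to_le32(nblocks);
        nilfs_sufile_put_segment_usage(sufile, segnum, bh);
}

/* after: one call, no exposed mapping */
err = nilfs_sufile_set_segment_usage(sufile, segnum, nblocks, modtime);

/* and the clean-segment count no longer costs buffer I/O */
nclean = nilfs_sufile_get_ncleansegs(sufile);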
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index 0e99e5c0bd0f..15163b8aff7d 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -34,14 +34,13 @@ static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
34 return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments; 34 return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments;
35} 35}
36 36
37unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile);
38
37int nilfs_sufile_alloc(struct inode *, __u64 *); 39int nilfs_sufile_alloc(struct inode *, __u64 *);
38int nilfs_sufile_get_segment_usage(struct inode *, __u64, 40int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum);
39 struct nilfs_segment_usage **, 41int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
40 struct buffer_head **); 42 unsigned long nblocks, time_t modtime);
41void nilfs_sufile_put_segment_usage(struct inode *, __u64,
42 struct buffer_head *);
43int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); 43int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
44int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *);
45ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned, 44ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
46 size_t); 45 size_t);
47 46
@@ -62,6 +61,9 @@ void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
62void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, 61void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
63 struct buffer_head *); 62 struct buffer_head *);
64 63
64int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode);
65struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize);
66
65/** 67/**
66 * nilfs_sufile_scrap - make a segment garbage 68 * nilfs_sufile_scrap - make a segment garbage
67 * @sufile: inode of segment usage file 69 * @sufile: inode of segment usage file
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 644e66727dd0..0cdbc5e7655a 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -96,9 +96,6 @@ void nilfs_error(struct super_block *sb, const char *function,
96 if (!(sb->s_flags & MS_RDONLY)) { 96 if (!(sb->s_flags & MS_RDONLY)) {
97 struct the_nilfs *nilfs = sbi->s_nilfs; 97 struct the_nilfs *nilfs = sbi->s_nilfs;
98 98
99 if (!nilfs_test_opt(sbi, ERRORS_CONT))
100 nilfs_detach_segment_constructor(sbi);
101
102 down_write(&nilfs->ns_sem); 99 down_write(&nilfs->ns_sem);
103 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { 100 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
104 nilfs->ns_mount_state |= NILFS_ERROR_FS; 101 nilfs->ns_mount_state |= NILFS_ERROR_FS;
@@ -301,7 +298,7 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
301 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); 298 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
302 nilfs->ns_sbwtime[1] = t; 299 nilfs->ns_sbwtime[1] = t;
303 } 300 }
304 sbi->s_super->s_dirt = 0; 301 clear_nilfs_sb_dirty(nilfs);
305 return nilfs_sync_super(sbi, dupsb); 302 return nilfs_sync_super(sbi, dupsb);
306} 303}
307 304
@@ -345,7 +342,7 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
345 err = nilfs_construct_segment(sb); 342 err = nilfs_construct_segment(sb);
346 343
347 down_write(&nilfs->ns_sem); 344 down_write(&nilfs->ns_sem);
348 if (sb->s_dirt) 345 if (nilfs_sb_dirty(nilfs))
349 nilfs_commit_super(sbi, 1); 346 nilfs_commit_super(sbi, 1);
350 up_write(&nilfs->ns_sem); 347 up_write(&nilfs->ns_sem);
351 348
@@ -363,14 +360,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
363 list_add(&sbi->s_list, &nilfs->ns_supers); 360 list_add(&sbi->s_list, &nilfs->ns_supers);
364 up_write(&nilfs->ns_super_sem); 361 up_write(&nilfs->ns_super_sem);
365 362
366 sbi->s_ifile = nilfs_mdt_new(nilfs, sbi->s_super, NILFS_IFILE_INO); 363 sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size);
367 if (!sbi->s_ifile) 364 if (!sbi->s_ifile)
368 return -ENOMEM; 365 return -ENOMEM;
369 366
370 err = nilfs_palloc_init_blockgroup(sbi->s_ifile, nilfs->ns_inode_size);
371 if (unlikely(err))
372 goto failed;
373
374 down_read(&nilfs->ns_segctor_sem); 367 down_read(&nilfs->ns_segctor_sem);
375 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, 368 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
376 &bh_cp); 369 &bh_cp);
@@ -411,7 +404,6 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
411{ 404{
412 struct the_nilfs *nilfs = sbi->s_nilfs; 405 struct the_nilfs *nilfs = sbi->s_nilfs;
413 406
414 nilfs_mdt_clear(sbi->s_ifile);
415 nilfs_mdt_destroy(sbi->s_ifile); 407 nilfs_mdt_destroy(sbi->s_ifile);
416 sbi->s_ifile = NULL; 408 sbi->s_ifile = NULL;
417 down_write(&nilfs->ns_super_sem); 409 down_write(&nilfs->ns_super_sem);
@@ -419,22 +411,6 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
419 up_write(&nilfs->ns_super_sem); 411 up_write(&nilfs->ns_super_sem);
420} 412}
421 413
422static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi)
423{
424 struct the_nilfs *nilfs = sbi->s_nilfs;
425 int err = 0;
426
427 down_write(&nilfs->ns_sem);
428 if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
429 nilfs->ns_mount_state |= NILFS_VALID_FS;
430 err = nilfs_commit_super(sbi, 1);
431 if (likely(!err))
432 printk(KERN_INFO "NILFS: recovery complete.\n");
433 }
434 up_write(&nilfs->ns_sem);
435 return err;
436}
437
438static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) 414static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
439{ 415{
440 struct super_block *sb = dentry->d_sb; 416 struct super_block *sb = dentry->d_sb;
@@ -460,7 +436,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
460 /* 436 /*
461 * Compute the overhead 437 * Compute the overhead
462 * 438 *
463 * When distributing meta data blocks outside semgent structure, 439 * When distributing meta data blocks outside segment structure,
464 * We must count them as the overhead. 440 * We must count them as the overhead.
465 */ 441 */
466 overhead = 0; 442 overhead = 0;
@@ -490,7 +466,7 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
490 struct nilfs_sb_info *sbi = NILFS_SB(sb); 466 struct nilfs_sb_info *sbi = NILFS_SB(sb);
491 467
492 if (!nilfs_test_opt(sbi, BARRIER)) 468 if (!nilfs_test_opt(sbi, BARRIER))
493 seq_printf(seq, ",barrier=off"); 469 seq_printf(seq, ",nobarrier");
494 if (nilfs_test_opt(sbi, SNAPSHOT)) 470 if (nilfs_test_opt(sbi, SNAPSHOT))
495 seq_printf(seq, ",cp=%llu", 471 seq_printf(seq, ",cp=%llu",
496 (unsigned long long int)sbi->s_snapshot_cno); 472 (unsigned long long int)sbi->s_snapshot_cno);
@@ -500,6 +476,10 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
500 seq_printf(seq, ",errors=panic"); 476 seq_printf(seq, ",errors=panic");
501 if (nilfs_test_opt(sbi, STRICT_ORDER)) 477 if (nilfs_test_opt(sbi, STRICT_ORDER))
502 seq_printf(seq, ",order=strict"); 478 seq_printf(seq, ",order=strict");
479 if (nilfs_test_opt(sbi, NORECOVERY))
480 seq_printf(seq, ",norecovery");
481 if (nilfs_test_opt(sbi, DISCARD))
482 seq_printf(seq, ",discard");
503 483
504 return 0; 484 return 0;
505} 485}
@@ -568,33 +548,22 @@ static const struct export_operations nilfs_export_ops = {
568 548
569enum { 549enum {
570 Opt_err_cont, Opt_err_panic, Opt_err_ro, 550 Opt_err_cont, Opt_err_panic, Opt_err_ro,
571 Opt_barrier, Opt_snapshot, Opt_order, 551 Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
572 Opt_err, 552 Opt_discard, Opt_err,
573}; 553};
574 554
575static match_table_t tokens = { 555static match_table_t tokens = {
576 {Opt_err_cont, "errors=continue"}, 556 {Opt_err_cont, "errors=continue"},
577 {Opt_err_panic, "errors=panic"}, 557 {Opt_err_panic, "errors=panic"},
578 {Opt_err_ro, "errors=remount-ro"}, 558 {Opt_err_ro, "errors=remount-ro"},
579 {Opt_barrier, "barrier=%s"}, 559 {Opt_nobarrier, "nobarrier"},
580 {Opt_snapshot, "cp=%u"}, 560 {Opt_snapshot, "cp=%u"},
581 {Opt_order, "order=%s"}, 561 {Opt_order, "order=%s"},
562 {Opt_norecovery, "norecovery"},
563 {Opt_discard, "discard"},
582 {Opt_err, NULL} 564 {Opt_err, NULL}
583}; 565};
584 566
585static int match_bool(substring_t *s, int *result)
586{
587 int len = s->to - s->from;
588
589 if (strncmp(s->from, "on", len) == 0)
590 *result = 1;
591 else if (strncmp(s->from, "off", len) == 0)
592 *result = 0;
593 else
594 return 1;
595 return 0;
596}
597
598static int parse_options(char *options, struct super_block *sb) 567static int parse_options(char *options, struct super_block *sb)
599{ 568{
600 struct nilfs_sb_info *sbi = NILFS_SB(sb); 569 struct nilfs_sb_info *sbi = NILFS_SB(sb);
@@ -612,13 +581,8 @@ static int parse_options(char *options, struct super_block *sb)
612 581
613 token = match_token(p, tokens, args); 582 token = match_token(p, tokens, args);
614 switch (token) { 583 switch (token) {
615 case Opt_barrier: 584 case Opt_nobarrier:
616 if (match_bool(&args[0], &option)) 585 nilfs_clear_opt(sbi, BARRIER);
617 return 0;
618 if (option)
619 nilfs_set_opt(sbi, BARRIER);
620 else
621 nilfs_clear_opt(sbi, BARRIER);
622 break; 586 break;
623 case Opt_order: 587 case Opt_order:
624 if (strcmp(args[0].from, "relaxed") == 0) 588 if (strcmp(args[0].from, "relaxed") == 0)
@@ -647,6 +611,12 @@ static int parse_options(char *options, struct super_block *sb)
647 sbi->s_snapshot_cno = option; 611 sbi->s_snapshot_cno = option;
648 nilfs_set_opt(sbi, SNAPSHOT); 612 nilfs_set_opt(sbi, SNAPSHOT);
649 break; 613 break;
614 case Opt_norecovery:
615 nilfs_set_opt(sbi, NORECOVERY);
616 break;
617 case Opt_discard:
618 nilfs_set_opt(sbi, DISCARD);
619 break;
650 default: 620 default:
651 printk(KERN_ERR 621 printk(KERN_ERR
652 "NILFS: Unrecognized mount option \"%s\"\n", p); 622 "NILFS: Unrecognized mount option \"%s\"\n", p);
@@ -672,9 +642,7 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
672 int mnt_count = le16_to_cpu(sbp->s_mnt_count); 642 int mnt_count = le16_to_cpu(sbp->s_mnt_count);
673 643
674 /* nilfs->sem must be locked by the caller. */ 644 /* nilfs->sem must be locked by the caller. */
675 if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) { 645 if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
676 printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
677 } else if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
678 printk(KERN_WARNING 646 printk(KERN_WARNING
679 "NILFS warning: mounting fs with errors\n"); 647 "NILFS warning: mounting fs with errors\n");
680#if 0 648#if 0
@@ -782,11 +750,10 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
782 sb->s_root = NULL; 750 sb->s_root = NULL;
783 sb->s_time_gran = 1; 751 sb->s_time_gran = 1;
784 752
785 if (!nilfs_loaded(nilfs)) { 753 err = load_nilfs(nilfs, sbi);
786 err = load_nilfs(nilfs, sbi); 754 if (err)
787 if (err) 755 goto failed_sbi;
788 goto failed_sbi; 756
789 }
790 cno = nilfs_last_cno(nilfs); 757 cno = nilfs_last_cno(nilfs);
791 758
792 if (sb->s_flags & MS_RDONLY) { 759 if (sb->s_flags & MS_RDONLY) {
@@ -854,12 +821,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
854 up_write(&nilfs->ns_sem); 821 up_write(&nilfs->ns_sem);
855 } 822 }
856 823
857 err = nilfs_mark_recovery_complete(sbi);
858 if (unlikely(err)) {
859 printk(KERN_ERR "NILFS: recovery failed.\n");
860 goto failed_root;
861 }
862
863 down_write(&nilfs->ns_super_sem); 824 down_write(&nilfs->ns_super_sem);
864 if (!nilfs_test_opt(sbi, SNAPSHOT)) 825 if (!nilfs_test_opt(sbi, SNAPSHOT))
865 nilfs->ns_current = sbi; 826 nilfs->ns_current = sbi;
@@ -867,10 +828,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
867 828
868 return 0; 829 return 0;
869 830
870 failed_root:
871 dput(sb->s_root);
872 sb->s_root = NULL;
873
874 failed_segctor: 831 failed_segctor:
875 nilfs_detach_segment_constructor(sbi); 832 nilfs_detach_segment_constructor(sbi);
876 833
@@ -909,12 +866,20 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
909 if ((*flags & MS_RDONLY) && 866 if ((*flags & MS_RDONLY) &&
910 sbi->s_snapshot_cno != old_opts.snapshot_cno) { 867 sbi->s_snapshot_cno != old_opts.snapshot_cno) {
911 printk(KERN_WARNING "NILFS (device %s): couldn't " 868 printk(KERN_WARNING "NILFS (device %s): couldn't "
912 "remount to a different snapshot. \n", 869 "remount to a different snapshot.\n",
913 sb->s_id); 870 sb->s_id);
914 err = -EINVAL; 871 err = -EINVAL;
915 goto restore_opts; 872 goto restore_opts;
916 } 873 }
917 874
875 if (!nilfs_valid_fs(nilfs)) {
876 printk(KERN_WARNING "NILFS (device %s): couldn't "
877 "remount because the filesystem is in an "
878 "incomplete recovery state.\n", sb->s_id);
879 err = -EINVAL;
880 goto restore_opts;
881 }
882
918 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 883 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
919 goto out; 884 goto out;
920 if (*flags & MS_RDONLY) { 885 if (*flags & MS_RDONLY) {
@@ -1156,8 +1121,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1156 /* Abandoning the newly allocated superblock */ 1121 /* Abandoning the newly allocated superblock */
1157 mutex_unlock(&nilfs->ns_mount_mutex); 1122 mutex_unlock(&nilfs->ns_mount_mutex);
1158 put_nilfs(nilfs); 1123 put_nilfs(nilfs);
1159 up_write(&s->s_umount); 1124 deactivate_locked_super(s);
1160 deactivate_super(s);
1161 /* 1125 /*
1162 * deactivate_super() invokes close_bdev_exclusive(). 1126 * deactivate_super() invokes close_bdev_exclusive().
1163 * We must finish all post-cleaning before this call; 1127 * We must finish all post-cleaning before this call;
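The the_nilfs.c changes below move the whole recovery decision into load_nilfs(), including the VALID_FS commit that used to live in nilfs_mark_recovery_complete(). Its resulting decision tree, condensed from the hunks that follow:

/*
 * load_nilfs() after this series:
 *
 *   already loaded      -> ok if VALID_FS, or read-only + norecovery;
 *                          otherwise -EINVAL (half-recovered state)
 *   VALID_FS set        -> skip recovery entirely
 *   read-only mount     -> norecovery: mount as-is, skip roll-forward
 *                          device read-only: -EROFS (recovery writes)
 *                          else: drop MS_RDONLY, recover, restore flags
 *   read/write mount    -> norecovery: -EINVAL (refused)
 *                          else: recover, set VALID_FS, commit super
 */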
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index ad391a8c3e7e..33871f7e4f01 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -146,13 +146,9 @@ void put_nilfs(struct the_nilfs *nilfs)
146 146
147 might_sleep(); 147 might_sleep();
148 if (nilfs_loaded(nilfs)) { 148 if (nilfs_loaded(nilfs)) {
149 nilfs_mdt_clear(nilfs->ns_sufile);
150 nilfs_mdt_destroy(nilfs->ns_sufile); 149 nilfs_mdt_destroy(nilfs->ns_sufile);
151 nilfs_mdt_clear(nilfs->ns_cpfile);
152 nilfs_mdt_destroy(nilfs->ns_cpfile); 150 nilfs_mdt_destroy(nilfs->ns_cpfile);
153 nilfs_mdt_clear(nilfs->ns_dat);
154 nilfs_mdt_destroy(nilfs->ns_dat); 151 nilfs_mdt_destroy(nilfs->ns_dat);
155 /* XXX: how and when to clear nilfs->ns_gc_dat? */
156 nilfs_mdt_destroy(nilfs->ns_gc_dat); 152 nilfs_mdt_destroy(nilfs->ns_gc_dat);
157 } 153 }
158 if (nilfs_init(nilfs)) { 154 if (nilfs_init(nilfs)) {
@@ -166,7 +162,6 @@ void put_nilfs(struct the_nilfs *nilfs)
166static int nilfs_load_super_root(struct the_nilfs *nilfs, 162static int nilfs_load_super_root(struct the_nilfs *nilfs,
167 struct nilfs_sb_info *sbi, sector_t sr_block) 163 struct nilfs_sb_info *sbi, sector_t sr_block)
168{ 164{
169 static struct lock_class_key dat_lock_key;
170 struct buffer_head *bh_sr; 165 struct buffer_head *bh_sr;
171 struct nilfs_super_root *raw_sr; 166 struct nilfs_super_root *raw_sr;
172 struct nilfs_super_block **sbp = nilfs->ns_sbp; 167 struct nilfs_super_block **sbp = nilfs->ns_sbp;
@@ -187,51 +182,36 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
187 inode_size = nilfs->ns_inode_size; 182 inode_size = nilfs->ns_inode_size;
188 183
189 err = -ENOMEM; 184 err = -ENOMEM;
190 nilfs->ns_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO); 185 nilfs->ns_dat = nilfs_dat_new(nilfs, dat_entry_size);
191 if (unlikely(!nilfs->ns_dat)) 186 if (unlikely(!nilfs->ns_dat))
192 goto failed; 187 goto failed;
193 188
194 nilfs->ns_gc_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO); 189 nilfs->ns_gc_dat = nilfs_dat_new(nilfs, dat_entry_size);
195 if (unlikely(!nilfs->ns_gc_dat)) 190 if (unlikely(!nilfs->ns_gc_dat))
196 goto failed_dat; 191 goto failed_dat;
197 192
198 nilfs->ns_cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO); 193 nilfs->ns_cpfile = nilfs_cpfile_new(nilfs, checkpoint_size);
199 if (unlikely(!nilfs->ns_cpfile)) 194 if (unlikely(!nilfs->ns_cpfile))
200 goto failed_gc_dat; 195 goto failed_gc_dat;
201 196
202 nilfs->ns_sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO); 197 nilfs->ns_sufile = nilfs_sufile_new(nilfs, segment_usage_size);
203 if (unlikely(!nilfs->ns_sufile)) 198 if (unlikely(!nilfs->ns_sufile))
204 goto failed_cpfile; 199 goto failed_cpfile;
205 200
206 err = nilfs_palloc_init_blockgroup(nilfs->ns_dat, dat_entry_size);
207 if (unlikely(err))
208 goto failed_sufile;
209
210 err = nilfs_palloc_init_blockgroup(nilfs->ns_gc_dat, dat_entry_size);
211 if (unlikely(err))
212 goto failed_sufile;
213
214 lockdep_set_class(&NILFS_MDT(nilfs->ns_dat)->mi_sem, &dat_lock_key);
215 lockdep_set_class(&NILFS_MDT(nilfs->ns_gc_dat)->mi_sem, &dat_lock_key);
216
217 nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat); 201 nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
218 nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size,
219 sizeof(struct nilfs_cpfile_header));
220 nilfs_mdt_set_entry_size(nilfs->ns_sufile, segment_usage_size,
221 sizeof(struct nilfs_sufile_header));
222 202
223 err = nilfs_mdt_read_inode_direct( 203 err = nilfs_dat_read(nilfs->ns_dat, (void *)bh_sr->b_data +
224 nilfs->ns_dat, bh_sr, NILFS_SR_DAT_OFFSET(inode_size)); 204 NILFS_SR_DAT_OFFSET(inode_size));
225 if (unlikely(err)) 205 if (unlikely(err))
226 goto failed_sufile; 206 goto failed_sufile;
227 207
228 err = nilfs_mdt_read_inode_direct( 208 err = nilfs_cpfile_read(nilfs->ns_cpfile, (void *)bh_sr->b_data +
229 nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(inode_size)); 209 NILFS_SR_CPFILE_OFFSET(inode_size));
230 if (unlikely(err)) 210 if (unlikely(err))
231 goto failed_sufile; 211 goto failed_sufile;
232 212
233 err = nilfs_mdt_read_inode_direct( 213 err = nilfs_sufile_read(nilfs->ns_sufile, (void *)bh_sr->b_data +
234 nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(inode_size)); 214 NILFS_SR_SUFILE_OFFSET(inode_size));
235 if (unlikely(err)) 215 if (unlikely(err))
236 goto failed_sufile; 216 goto failed_sufile;
237 217
@@ -281,29 +261,30 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
281 struct nilfs_recovery_info ri; 261 struct nilfs_recovery_info ri;
282 unsigned int s_flags = sbi->s_super->s_flags; 262 unsigned int s_flags = sbi->s_super->s_flags;
283 int really_read_only = bdev_read_only(nilfs->ns_bdev); 263 int really_read_only = bdev_read_only(nilfs->ns_bdev);
284 unsigned valid_fs; 264 int valid_fs = nilfs_valid_fs(nilfs);
285 int err = 0; 265 int err;
286
287 nilfs_init_recovery_info(&ri);
288 266
289 down_write(&nilfs->ns_sem); 267 if (nilfs_loaded(nilfs)) {
290 valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS); 268 if (valid_fs ||
291 up_write(&nilfs->ns_sem); 269 ((s_flags & MS_RDONLY) && nilfs_test_opt(sbi, NORECOVERY)))
270 return 0;
271 printk(KERN_ERR "NILFS: the filesystem is in an incomplete "
272 "recovery state.\n");
273 return -EINVAL;
274 }
292 275
293 if (!valid_fs && (s_flags & MS_RDONLY)) { 276 if (!valid_fs) {
294 printk(KERN_INFO "NILFS: INFO: recovery " 277 printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
295 "required for readonly filesystem.\n"); 278 if (s_flags & MS_RDONLY) {
296 if (really_read_only) { 279 printk(KERN_INFO "NILFS: INFO: recovery "
297 printk(KERN_ERR "NILFS: write access " 280 "required for readonly filesystem.\n");
298 "unavailable, cannot proceed.\n"); 281 printk(KERN_INFO "NILFS: write access will "
299 err = -EROFS; 282 "be enabled during recovery.\n");
300 goto failed;
301 } 283 }
302 printk(KERN_INFO "NILFS: write access will "
303 "be enabled during recovery.\n");
304 sbi->s_super->s_flags &= ~MS_RDONLY;
305 } 284 }
306 285
286 nilfs_init_recovery_info(&ri);
287
307 err = nilfs_search_super_root(nilfs, sbi, &ri); 288 err = nilfs_search_super_root(nilfs, sbi, &ri);
308 if (unlikely(err)) { 289 if (unlikely(err)) {
309 printk(KERN_ERR "NILFS: error searching super root.\n"); 290 printk(KERN_ERR "NILFS: error searching super root.\n");
@@ -316,19 +297,56 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
316 goto failed; 297 goto failed;
317 } 298 }
318 299
319 if (!valid_fs) { 300 if (valid_fs)
320 err = nilfs_recover_logical_segments(nilfs, sbi, &ri); 301 goto skip_recovery;
321 if (unlikely(err)) { 302
322 nilfs_mdt_destroy(nilfs->ns_cpfile); 303 if (s_flags & MS_RDONLY) {
323 nilfs_mdt_destroy(nilfs->ns_sufile); 304 if (nilfs_test_opt(sbi, NORECOVERY)) {
324 nilfs_mdt_destroy(nilfs->ns_dat); 305 printk(KERN_INFO "NILFS: norecovery option specified. "
325 goto failed; 306 "skipping roll-forward recovery\n");
307 goto skip_recovery;
326 } 308 }
327 if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED) 309 if (really_read_only) {
328 sbi->s_super->s_dirt = 1; 310 printk(KERN_ERR "NILFS: write access "
311 "unavailable, cannot proceed.\n");
312 err = -EROFS;
313 goto failed_unload;
314 }
315 sbi->s_super->s_flags &= ~MS_RDONLY;
316 } else if (nilfs_test_opt(sbi, NORECOVERY)) {
317 printk(KERN_ERR "NILFS: recovery cancelled because norecovery "
318 "option was specified for a read/write mount\n");
319 err = -EINVAL;
320 goto failed_unload;
321 }
322
323 err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
324 if (err)
325 goto failed_unload;
326
327 down_write(&nilfs->ns_sem);
328 nilfs->ns_mount_state |= NILFS_VALID_FS;
329 nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
330 err = nilfs_commit_super(sbi, 1);
331 up_write(&nilfs->ns_sem);
332
333 if (err) {
334 printk(KERN_ERR "NILFS: failed to update super block. "
335 "recovery unfinished.\n");
336 goto failed_unload;
329 } 337 }
338 printk(KERN_INFO "NILFS: recovery complete.\n");
330 339
340 skip_recovery:
331 set_nilfs_loaded(nilfs); 341 set_nilfs_loaded(nilfs);
342 nilfs_clear_recovery_info(&ri);
343 sbi->s_super->s_flags = s_flags;
344 return 0;
345
346 failed_unload:
347 nilfs_mdt_destroy(nilfs->ns_cpfile);
348 nilfs_mdt_destroy(nilfs->ns_sufile);
349 nilfs_mdt_destroy(nilfs->ns_dat);
332 350
333 failed: 351 failed:
334 nilfs_clear_recovery_info(&ri); 352 nilfs_clear_recovery_info(&ri);
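The reworked load_nilfs() reduces to a single mount-time policy: a valid filesystem skips recovery; a read-only mount either skips it as well (norecovery) or temporarily clears MS_RDONLY, failing with -EROFS when the device itself cannot be written; and norecovery on a read/write mount is refused with -EINVAL. A minimal sketch restating that decision as a stand-alone function (identifiers are illustrative, not the kernel code):

enum recovery_action { DO_RECOVERY, SKIP_RECOVERY, FAIL_EROFS, FAIL_EINVAL };

static enum recovery_action recovery_policy(int valid_fs, int rdonly,
					    int norecovery, int really_read_only)
{
	if (valid_fs)
		return SKIP_RECOVERY;		/* clean fs: nothing to replay */
	if (rdonly) {
		if (norecovery)
			return SKIP_RECOVERY;	/* mount the stale state as-is */
		if (really_read_only)
			return FAIL_EROFS;	/* device cannot be written */
		return DO_RECOVERY;		/* MS_RDONLY dropped during replay */
	}
	if (norecovery)
		return FAIL_EINVAL;		/* norecovery + read/write refused */
	return DO_RECOVERY;
}

int main(void)
{
	/* read-only mount of an unclean fs on a writable device */
	return recovery_policy(0, 1, 0, 0) == DO_RECOVERY ? 0 : 1;
}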
@@ -368,7 +386,7 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
368 386
369 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); 387 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
370 if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { 388 if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
371 printk(KERN_ERR "NILFS: too short segment. \n"); 389 printk(KERN_ERR "NILFS: too short segment.\n");
372 return -EINVAL; 390 return -EINVAL;
373 } 391 }
374 392
@@ -628,34 +646,65 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
628 goto out; 646 goto out;
629} 647}
630 648
649int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
650 size_t nsegs)
651{
652 sector_t seg_start, seg_end;
653 sector_t start = 0, nblocks = 0;
654 unsigned int sects_per_block;
655 __u64 *sn;
656 int ret = 0;
657
658 sects_per_block = (1 << nilfs->ns_blocksize_bits) /
659 bdev_logical_block_size(nilfs->ns_bdev);
660 for (sn = segnump; sn < segnump + nsegs; sn++) {
661 nilfs_get_segment_range(nilfs, *sn, &seg_start, &seg_end);
662
663 if (!nblocks) {
664 start = seg_start;
665 nblocks = seg_end - seg_start + 1;
666 } else if (start + nblocks == seg_start) {
667 nblocks += seg_end - seg_start + 1;
668 } else {
669 ret = blkdev_issue_discard(nilfs->ns_bdev,
670 start * sects_per_block,
671 nblocks * sects_per_block,
672 GFP_NOFS,
673 DISCARD_FL_BARRIER);
674 if (ret < 0)
675 return ret;
676 nblocks = 0;
677 }
678 }
679 if (nblocks)
680 ret = blkdev_issue_discard(nilfs->ns_bdev,
681 start * sects_per_block,
682 nblocks * sects_per_block,
683 GFP_NOFS, DISCARD_FL_BARRIER);
684 return ret;
685}
686
631int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) 687int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
632{ 688{
633 struct inode *dat = nilfs_dat_inode(nilfs); 689 struct inode *dat = nilfs_dat_inode(nilfs);
634 unsigned long ncleansegs; 690 unsigned long ncleansegs;
635 int err;
636 691
637 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 692 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
638 err = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile, &ncleansegs); 693 ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
639 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 694 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
640 if (likely(!err)) 695 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
641 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; 696 return 0;
642 return err;
643} 697}
644 698
645int nilfs_near_disk_full(struct the_nilfs *nilfs) 699int nilfs_near_disk_full(struct the_nilfs *nilfs)
646{ 700{
647 struct inode *sufile = nilfs->ns_sufile;
648 unsigned long ncleansegs, nincsegs; 701 unsigned long ncleansegs, nincsegs;
649 int ret;
650 702
651 ret = nilfs_sufile_get_ncleansegs(sufile, &ncleansegs); 703 ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
652 if (likely(!ret)) { 704 nincsegs = atomic_read(&nilfs->ns_ndirtyblks) /
653 nincsegs = atomic_read(&nilfs->ns_ndirtyblks) / 705 nilfs->ns_blocks_per_segment + 1;
654 nilfs->ns_blocks_per_segment + 1; 706
655 if (ncleansegs <= nilfs->ns_nrsvsegs + nincsegs) 707 return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs;
656 ret++;
657 }
658 return ret;
659} 708}
660 709
661/** 710/**
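nilfs_discard_segments() batches physically contiguous segments into a single blkdev_issue_discard() call instead of issuing one request per segment. One thing worth noting: in the code as shown, a segment that breaks contiguity only flushes the pending run (nblocks is reset to zero) and is itself never recorded, so it appears to escape being discarded. The coalescing idea, restarted correctly at the breaking segment, as a runnable userspace sketch with the discard call stubbed out:

#include <stdio.h>

struct seg_range { unsigned long long start, end; };	/* block numbers */

/* Illustrative stand-in for blkdev_issue_discard(). */
static int issue_discard(unsigned long long start, unsigned long long nblocks)
{
	printf("discard: start=%llu nblocks=%llu\n", start, nblocks);
	return 0;
}

/*
 * Coalesce adjacent segment ranges the way nilfs_discard_segments()
 * does, but open a new run at a segment that breaks contiguity
 * instead of dropping it.
 */
static int discard_segments(const struct seg_range *seg, int nsegs)
{
	unsigned long long start = 0, nblocks = 0;
	int i, ret = 0;

	for (i = 0; i < nsegs; i++) {
		if (!nblocks) {
			start = seg[i].start;
			nblocks = seg[i].end - seg[i].start + 1;
		} else if (start + nblocks == seg[i].start) {
			/* contiguous with the pending run: extend it */
			nblocks += seg[i].end - seg[i].start + 1;
		} else {
			ret = issue_discard(start, nblocks);
			if (ret < 0)
				return ret;
			start = seg[i].start;	/* open a new run here */
			nblocks = seg[i].end - seg[i].start + 1;
		}
	}
	if (nblocks)	/* flush the final run */
		ret = issue_discard(start, nblocks);
	return ret;
}

int main(void)
{
	/* two contiguous segments, then a detached one */
	const struct seg_range segs[] = { {0, 7}, {8, 15}, {32, 39} };

	return discard_segments(segs, 3);
}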
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 20abd55881e0..1ab974533697 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -29,6 +29,7 @@
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/slab.h>
32#include "sb.h" 33#include "sb.h"
33 34
34/* the_nilfs struct */ 35/* the_nilfs struct */
@@ -38,6 +39,7 @@ enum {
38 the latest checkpoint was loaded */ 39 the latest checkpoint was loaded */
39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ 40 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
40 THE_NILFS_GC_RUNNING, /* gc process is running */ 41 THE_NILFS_GC_RUNNING, /* gc process is running */
42 THE_NILFS_SB_DIRTY, /* super block is dirty */
41}; 43};
42 44
43/** 45/**
@@ -197,6 +199,7 @@ THE_NILFS_FNS(INIT, init)
197THE_NILFS_FNS(LOADED, loaded) 199THE_NILFS_FNS(LOADED, loaded)
198THE_NILFS_FNS(DISCONTINUED, discontinued) 200THE_NILFS_FNS(DISCONTINUED, discontinued)
199THE_NILFS_FNS(GC_RUNNING, gc_running) 201THE_NILFS_FNS(GC_RUNNING, gc_running)
202THE_NILFS_FNS(SB_DIRTY, sb_dirty)
200 203
201/* Minimum interval of periodical update of superblocks (in seconds) */ 204/* Minimum interval of periodical update of superblocks (in seconds) */
202#define NILFS_SB_FREQ 10 205#define NILFS_SB_FREQ 10
@@ -221,6 +224,7 @@ struct the_nilfs *find_or_create_nilfs(struct block_device *);
221void put_nilfs(struct the_nilfs *); 224void put_nilfs(struct the_nilfs *);
222int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); 225int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
223int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); 226int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
227int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
224int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); 228int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
225struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64); 229struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
226int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); 230int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
@@ -258,6 +262,16 @@ static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
258 kfree(sbi); 262 kfree(sbi);
259} 263}
260 264
265static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
266{
267 unsigned valid_fs;
268
269 down_read(&nilfs->ns_sem);
270 valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
271 up_read(&nilfs->ns_sem);
272 return valid_fs;
273}
274
261static inline void 275static inline void
262nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum, 276nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum,
263 sector_t *seg_start, sector_t *seg_end) 277 sector_t *seg_start, sector_t *seg_end)
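The header half of the patch adds THE_NILFS_SB_DIRTY and lets THE_NILFS_FNS(SB_DIRTY, sb_dirty) generate set_nilfs_sb_dirty()/clear_nilfs_sb_dirty()/nilfs_sb_dirty(), as it already does for LOADED and GC_RUNNING. A plausible reconstruction of that generator pattern as self-contained C11, with atomic fetch-or/fetch-and standing in for the kernel's set_bit()/clear_bit() (the exact macro lives in the_nilfs.h and may differ in detail):

#include <stdatomic.h>
#include <stdio.h>

struct nilfs_flags { atomic_ulong ns_flags; };	/* stand-in for the_nilfs */

/* Generate set_/clear_/test helpers for one state bit. */
#define NILFS_FLAG_FNS(bit, name)					\
static inline void set_nilfs_##name(struct nilfs_flags *n)		\
{ atomic_fetch_or(&n->ns_flags, 1UL << (bit)); }			\
static inline void clear_nilfs_##name(struct nilfs_flags *n)		\
{ atomic_fetch_and(&n->ns_flags, ~(1UL << (bit))); }			\
static inline int nilfs_##name(struct nilfs_flags *n)			\
{ return !!(atomic_load(&n->ns_flags) & (1UL << (bit))); }

NILFS_FLAG_FNS(4, sb_dirty)	/* bit index illustrative */

int main(void)
{
	struct nilfs_flags n = { 0 };

	set_nilfs_sb_dirty(&n);
	printf("sb_dirty=%d\n", nilfs_sb_dirty(&n));
	clear_nilfs_sb_dirty(&n);
	printf("sb_dirty=%d\n", nilfs_sb_dirty(&n));
	return 0;
}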
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 037e878e03fc..fcc2f064af83 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/dcache.h> 19#include <linux/dcache.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/gfp.h>
21#include <linux/init.h> 22#include <linux/init.h>
22#include <linux/module.h> 23#include <linux/module.h>
23#include <linux/srcu.h> 24#include <linux/srcu.h>
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 3165d85aada2..0399bcbe09c8 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -87,7 +87,6 @@
87#include <linux/kernel.h> 87#include <linux/kernel.h>
88#include <linux/module.h> 88#include <linux/module.h>
89#include <linux/mutex.h> 89#include <linux/mutex.h>
90#include <linux/slab.h>
91#include <linux/spinlock.h> 90#include <linux/spinlock.h>
92#include <linux/writeback.h> /* for inode_lock */ 91#include <linux/writeback.h> /* for inode_lock */
93 92
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index c9ee67b442e1..1afb0a10229f 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -121,7 +121,7 @@ static int idr_callback(int id, void *p, void *data)
121 if (warned) 121 if (warned)
122 return 0; 122 return 0;
123 123
124 warned = false; 124 warned = true;
125 entry = p; 125 entry = p;
126 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 126 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
127 127
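The one-word change above turns idr_callback()'s guard into a real warn-once latch; with warned = false the early return could never trigger, so the warning fired on every leaked entry instead of once. The pattern in isolation:

#include <stdio.h>
#include <stdbool.h>

/*
 * Warn-once latch as fixed in idr_callback(): the guard must be set
 * to true after the first pass, otherwise the early return never
 * fires and the warning repeats on every call.
 */
static void warn_once(const char *msg)
{
	static bool warned;

	if (warned)
		return;
	warned = true;		/* the fix: latch the guard */
	fprintf(stderr, "warning: %s\n", msg);
}

int main(void)
{
	warn_once("leaked entry");	/* prints */
	warn_once("leaked entry");	/* silent */
	return 0;
}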
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index dcd2040d330c..472cdf29ef82 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -29,14 +29,12 @@
29#include <linux/init.h> /* module_init */ 29#include <linux/init.h> /* module_init */
30#include <linux/inotify.h> 30#include <linux/inotify.h>
31#include <linux/kernel.h> /* roundup() */ 31#include <linux/kernel.h> /* roundup() */
32#include <linux/magic.h> /* superblock magic number */
33#include <linux/mount.h> /* mntget */
34#include <linux/namei.h> /* LOOKUP_FOLLOW */ 32#include <linux/namei.h> /* LOOKUP_FOLLOW */
35#include <linux/path.h> /* struct path */
36#include <linux/sched.h> /* struct user */ 33#include <linux/sched.h> /* struct user */
37#include <linux/slab.h> /* struct kmem_cache */ 34#include <linux/slab.h> /* struct kmem_cache */
38#include <linux/syscalls.h> 35#include <linux/syscalls.h>
39#include <linux/types.h> 36#include <linux/types.h>
37#include <linux/anon_inodes.h>
40#include <linux/uaccess.h> 38#include <linux/uaccess.h>
41#include <linux/poll.h> 39#include <linux/poll.h>
42#include <linux/wait.h> 40#include <linux/wait.h>
@@ -45,8 +43,6 @@
45 43
46#include <asm/ioctls.h> 44#include <asm/ioctls.h>
47 45
48static struct vfsmount *inotify_mnt __read_mostly;
49
50/* these are configurable via /proc/sys/fs/inotify/ */ 46/* these are configurable via /proc/sys/fs/inotify/ */
51static int inotify_max_user_instances __read_mostly; 47static int inotify_max_user_instances __read_mostly;
52static int inotify_max_queued_events __read_mostly; 48static int inotify_max_queued_events __read_mostly;
@@ -69,36 +65,30 @@ static int zero;
69 65
70ctl_table inotify_table[] = { 66ctl_table inotify_table[] = {
71 { 67 {
72 .ctl_name = INOTIFY_MAX_USER_INSTANCES,
73 .procname = "max_user_instances", 68 .procname = "max_user_instances",
74 .data = &inotify_max_user_instances, 69 .data = &inotify_max_user_instances,
75 .maxlen = sizeof(int), 70 .maxlen = sizeof(int),
76 .mode = 0644, 71 .mode = 0644,
77 .proc_handler = &proc_dointvec_minmax, 72 .proc_handler = proc_dointvec_minmax,
78 .strategy = &sysctl_intvec,
79 .extra1 = &zero, 73 .extra1 = &zero,
80 }, 74 },
81 { 75 {
82 .ctl_name = INOTIFY_MAX_USER_WATCHES,
83 .procname = "max_user_watches", 76 .procname = "max_user_watches",
84 .data = &inotify_max_user_watches, 77 .data = &inotify_max_user_watches,
85 .maxlen = sizeof(int), 78 .maxlen = sizeof(int),
86 .mode = 0644, 79 .mode = 0644,
87 .proc_handler = &proc_dointvec_minmax, 80 .proc_handler = proc_dointvec_minmax,
88 .strategy = &sysctl_intvec,
89 .extra1 = &zero, 81 .extra1 = &zero,
90 }, 82 },
91 { 83 {
92 .ctl_name = INOTIFY_MAX_QUEUED_EVENTS,
93 .procname = "max_queued_events", 84 .procname = "max_queued_events",
94 .data = &inotify_max_queued_events, 85 .data = &inotify_max_queued_events,
95 .maxlen = sizeof(int), 86 .maxlen = sizeof(int),
96 .mode = 0644, 87 .mode = 0644,
97 .proc_handler = &proc_dointvec_minmax, 88 .proc_handler = proc_dointvec_minmax,
98 .strategy = &sysctl_intvec,
99 .extra1 = &zero 89 .extra1 = &zero
100 }, 90 },
101 { .ctl_name = 0 } 91 { }
102}; 92};
103#endif /* CONFIG_SYSCTL */ 93#endif /* CONFIG_SYSCTL */
104 94
@@ -558,7 +548,7 @@ retry:
558 548
559 spin_lock(&group->inotify_data.idr_lock); 549 spin_lock(&group->inotify_data.idr_lock);
560 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, 550 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
561 group->inotify_data.last_wd, 551 group->inotify_data.last_wd+1,
562 &tmp_ientry->wd); 552 &tmp_ientry->wd);
563 spin_unlock(&group->inotify_data.idr_lock); 553 spin_unlock(&group->inotify_data.idr_lock);
564 if (ret) { 554 if (ret) {
@@ -638,7 +628,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign
638 628
639 spin_lock_init(&group->inotify_data.idr_lock); 629 spin_lock_init(&group->inotify_data.idr_lock);
640 idr_init(&group->inotify_data.idr); 630 idr_init(&group->inotify_data.idr);
641 group->inotify_data.last_wd = 1; 631 group->inotify_data.last_wd = 0;
642 group->inotify_data.user = user; 632 group->inotify_data.user = user;
643 group->inotify_data.fa = NULL; 633 group->inotify_data.fa = NULL;
644 634
@@ -651,8 +641,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
651{ 641{
652 struct fsnotify_group *group; 642 struct fsnotify_group *group;
653 struct user_struct *user; 643 struct user_struct *user;
654 struct file *filp; 644 int ret;
655 int fd, ret;
656 645
657 /* Check the IN_* constants for consistency. */ 646 /* Check the IN_* constants for consistency. */
658 BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC); 647 BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC);
@@ -661,16 +650,6 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
661 if (flags & ~(IN_CLOEXEC | IN_NONBLOCK)) 650 if (flags & ~(IN_CLOEXEC | IN_NONBLOCK))
662 return -EINVAL; 651 return -EINVAL;
663 652
664 fd = get_unused_fd_flags(flags & O_CLOEXEC);
665 if (fd < 0)
666 return fd;
667
668 filp = get_empty_filp();
669 if (!filp) {
670 ret = -ENFILE;
671 goto out_put_fd;
672 }
673
674 user = get_current_user(); 653 user = get_current_user();
675 if (unlikely(atomic_read(&user->inotify_devs) >= 654 if (unlikely(atomic_read(&user->inotify_devs) >=
676 inotify_max_user_instances)) { 655 inotify_max_user_instances)) {
@@ -685,25 +664,16 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
685 goto out_free_uid; 664 goto out_free_uid;
686 } 665 }
687 666
688 filp->f_op = &inotify_fops;
689 filp->f_path.mnt = mntget(inotify_mnt);
690 filp->f_path.dentry = dget(inotify_mnt->mnt_root);
691 filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
692 filp->f_mode = FMODE_READ;
693 filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
694 filp->private_data = group;
695
696 atomic_inc(&user->inotify_devs); 667 atomic_inc(&user->inotify_devs);
697 668
698 fd_install(fd, filp); 669 ret = anon_inode_getfd("inotify", &inotify_fops, group,
699 670 O_RDONLY | flags);
700 return fd; 671 if (ret >= 0)
672 return ret;
701 673
674 atomic_dec(&user->inotify_devs);
702out_free_uid: 675out_free_uid:
703 free_uid(user); 676 free_uid(user);
704 put_filp(filp);
705out_put_fd:
706 put_unused_fd(fd);
707 return ret; 677 return ret;
708} 678}
709 679
@@ -747,10 +717,6 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
747 717
748 /* create/update an inode mark */ 718 /* create/update an inode mark */
749 ret = inotify_update_watch(group, inode, mask); 719 ret = inotify_update_watch(group, inode, mask);
750 if (unlikely(ret))
751 goto path_put_and_out;
752
753path_put_and_out:
754 path_put(&path); 720 path_put(&path);
755fput_and_out: 721fput_and_out:
756 fput_light(filp, fput_needed); 722 fput_light(filp, fput_needed);
@@ -794,20 +760,6 @@ out:
794 return ret; 760 return ret;
795} 761}
796 762
797static int
798inotify_get_sb(struct file_system_type *fs_type, int flags,
799 const char *dev_name, void *data, struct vfsmount *mnt)
800{
801 return get_sb_pseudo(fs_type, "inotify", NULL,
802 INOTIFYFS_SUPER_MAGIC, mnt);
803}
804
805static struct file_system_type inotify_fs_type = {
806 .name = "inotifyfs",
807 .get_sb = inotify_get_sb,
808 .kill_sb = kill_anon_super,
809};
810
811/* 763/*
812 * inotify_user_setup - Our initialization function. Note that we cannot return 764
813 * error because we have compiled-in VFS hooks. So an (unlikely) failure here 765 * error because we have compiled-in VFS hooks. So an (unlikely) failure here
@@ -815,16 +767,6 @@ static struct file_system_type inotify_fs_type = {
815 */ 767 */
816static int __init inotify_user_setup(void) 768static int __init inotify_user_setup(void)
817{ 769{
818 int ret;
819
820 ret = register_filesystem(&inotify_fs_type);
821 if (unlikely(ret))
822 panic("inotify: register_filesystem returned %d!\n", ret);
823
824 inotify_mnt = kern_mount(&inotify_fs_type);
825 if (IS_ERR(inotify_mnt))
826 panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
827
828 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); 770 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
829 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); 771 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
830 772
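With anon_inode_getfd() allocating the fd, the struct file, and an anonymous inode in one call, inotify can drop its private inotifyfs superblock, the hand-rolled get_unused_fd_flags()/get_empty_filp() sequence, and the matching unwind labels; note that the atomic_inc of user->inotify_devs now happens before the call and is undone only if it fails. None of this is visible from userspace. A minimal, runnable consumer of the unchanged syscall interface (Linux with glibc assumed):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
	char buf[4096];
	ssize_t len;
	int fd, wd;

	/* Same syscall as before the patch; only the in-kernel file
	 * setup moved from a private inotifyfs mount to anon_inode_getfd(). */
	fd = inotify_init1(IN_CLOEXEC);
	if (fd < 0) {
		perror("inotify_init1");
		return EXIT_FAILURE;
	}
	wd = inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE);
	if (wd < 0) {
		perror("inotify_add_watch");
		return EXIT_FAILURE;
	}
	len = read(fd, buf, sizeof(buf));	/* blocks until an event arrives */
	if (len > 0) {
		const struct inotify_event *ev =
			(const struct inotify_event *)buf;

		printf("wd=%d mask=0x%x name=%s\n", ev->wd, ev->mask,
		       ev->len ? ev->name : "");
	}
	close(fd);
	return EXIT_SUCCESS;
}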
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
deleted file mode 100644
index 37c11e194372..000000000000
--- a/fs/ntfs/ChangeLog
+++ /dev/null
@@ -1,1702 +0,0 @@
1ToDo/Notes:
2 - Find and fix bugs.
3 - The only places in the kernel where a file is resized are
4 ntfs_file_write*() and ntfs_truncate() for both of which i_mutex is
5 held. Just have to be careful in read-/writepage and other helpers
6 not running under i_mutex that we play nice. Also need to be careful
7 with initialized_size extension in ntfs_file_write*() and writepage.
8 UPDATE: The only things that need to be checked are the compressed
9 write and the other attribute resize/write cases like index
10 attributes, etc. For now none of these are implemented so are safe.
11 - Implement filling in of holes in aops.c::ntfs_writepage() and its
12 helpers.
13 - Implement mft.c::sync_mft_mirror_umount(). We currently will just
14 leave the volume dirty on umount if the final iput(vol->mft_ino)
15 causes a write of any mirrored mft records due to the mft mirror
16 inode having been discarded already. Whether this can actually ever
17 happen is unclear however so it is worth waiting until someone hits
18 the problem.
19
202.1.29 - Fix a deadlock at mount time.
21
22 - During mount the VFS holds s_umount lock on the superblock. So when
23 we try to empty the journal $LogFile contents by calling
24 ntfs_attr_set() when the machine does not have much memory and the
25 journal is large ntfs_attr_set() results in the VM trying to balance
26 dirty pages which in turn tries to take the s_umount lock and thus we
27 get a deadlock. The solution is to not use ntfs_attr_set() and
28 instead do the zeroing by hand at the block level rather than page
29 cache level.
30 - Fix sparse warnings.
31
322.1.28 - Fix a deadlock.
33
34 - Fix deadlock in fs/ntfs/inode.c::ntfs_put_inode(). Thanks to Sergey
35 Vlasov for the report and detailed analysis of the deadlock. The fix
36 involved getting rid of ntfs_put_inode() altogether and hence NTFS no
37 longer has a ->put_inode super operation.
38
392.1.27 - Various bug fixes and cleanups.
40
41 - Fix two compiler warnings on Alpha. Thanks to Andrew Morton for
42 reporting them.
43 - Fix an (innocent) off-by-one error in the runlist code.
44 - Fix a buggette in a "should be impossible" case handling where we
45 continued the attribute lookup loop instead of aborting it.
46 - Use buffer_migrate_page() for the ->migratepage function of all ntfs
47 address space operations.
48 - Fix comparison of $MFT and $MFTMirr to not bail out when there are
49 unused, invalid mft records which are the same in both $MFT and
50 $MFTMirr.
51 - Add support for sparse files which have a compression unit of 0.
52 - Remove all the make_bad_inode() calls. This should only be called
53 from read inode and new inode code paths.
54 - Limit name length in fs/ntfs/unistr.c::ntfs_nlstoucs() to maximum
55 allowed by NTFS, i.e. 255 Unicode characters, not including the
56 terminating NULL (which is not stored on disk).
57 - Improve comments on file attribute flags in fs/ntfs/layout.h.
58 - Fix a bug in fs/ntfs/inode.c::ntfs_read_locked_index_inode() where we
59 forgot to update a temporary variable so loading index inodes which
60 have an index allocation attribute failed.
61 - Add a missing call to flush_dcache_mft_record_page() in
62 fs/ntfs/inode.c::ntfs_write_inode().
63 - Handle the recently introduced -ENAMETOOLONG return value from
64 fs/ntfs/unistr.c::ntfs_nlstoucs() in fs/ntfs/namei.c::ntfs_lookup().
65 - Semaphore to mutex conversion. (Ingo Molnar)
66
672.1.26 - Minor bug fixes and updates.
68
69 - Fix a potential overflow in file.c where a cast to s64 was missing in
70 a left shift of a page index.
71 - The struct inode has had its i_sem semaphore changed to a mutex named
72 i_mutex.
73 - We have struct kmem_cache now so use it instead of the typedef
74 kmem_cache_t. (Pekka Enberg)
75 - Implement support for sector sizes above 512 bytes (up to the maximum
76 supported by NTFS which is 4096 bytes).
77 - Do more detailed reporting of why we cannot mount read-write by
78 special casing the VOLUME_MODIFIED_BY_CHKDSK flag.
79 - Miscellaneous updates to layout.h.
80 - Cope with attribute list attribute having invalid flags. Windows
81 copes with this and even chkdsk does not detect or fix this so we
82 have to cope with it, too. Thanks to Pawel Kot for reporting the
83 problem.
84
852.1.25 - (Almost) fully implement write(2) and truncate(2).
86
87 - Change ntfs_map_runlist_nolock(), ntfs_attr_find_vcn_nolock() and
88 {__,}ntfs_cluster_free() to also take an optional attribute search
89 context as argument. This allows calling these functions with the
90 mft record mapped. Update all callers.
91 - Fix potential deadlock in ntfs_mft_data_extend_allocation_nolock()
92 error handling by passing in the active search context when calling
93 ntfs_cluster_free().
94 - Change ntfs_cluster_alloc() to take an extra boolean parameter
95 specifying whether the clusters are being allocated to extend an
96 attribute or to fill a hole.
97 - Change ntfs_attr_make_non_resident() to call ntfs_cluster_alloc()
98 with @is_extension set to TRUE and remove the runlist terminator
99 fixup code as this is now done by ntfs_cluster_alloc().
100 - Change ntfs_attr_make_non_resident to take the attribute value size
101 as an extra parameter. This is needed since we need to know the size
102 before we can map the mft record and our callers always know it. The
103 reason we cannot simply read the size from the vfs inode i_size is
104 that this is not necessarily uptodate. This happens when
105 ntfs_attr_make_non_resident() is called in the ->truncate call path.
106 - Fix ntfs_attr_make_non_resident() to update the vfs inode i_blocks
107 which is zero for a resident attribute but should no longer be zero
108 once the attribute is non-resident as it then has real clusters
109 allocated.
110 - Add fs/ntfs/attrib.[hc]::ntfs_attr_extend_allocation(), a function to
111 extend the allocation of an attributes. Optionally, the data size,
112 but not the initialized size can be extended, too.
113 - Implement fs/ntfs/inode.[hc]::ntfs_truncate(). It only supports
114 uncompressed and unencrypted files and it never creates sparse files
115 at least for the moment (making a file sparse requires us to modify
116 its directory entries and we do not support directory operations at
117 the moment). Also, support for highly fragmented files, i.e. ones
118 whose data attribute is split across multiple extents, is severly
119 limited. When such a case is encountered, EOPNOTSUPP is returned.
120 - Enable ATTR_SIZE attribute changes in ntfs_setattr(). This completes
121 the initial implementation of file truncation. Now both open(2)ing
122 a file with the O_TRUNC flag and the {,f}truncate(2) system calls
123 will resize a file appropriately. The limitations are that only
124 uncompressed and unencrypted files are supported. Also, there is
125 only very limited support for highly fragmented files (the ones whose
126 $DATA attribute is split into multiple attribute extents).
127 - In attrib.c::ntfs_attr_set() call balance_dirty_pages_ratelimited()
128 and cond_resched() in the main loop as we could be dirtying a lot of
129 pages and this ensures we play nice with the VM and the system as a
130 whole.
131 - Implement file operations ->write, ->aio_write, ->writev for regular
132 files. This replaces the old use of generic_file_write(), et al and
133 the address space operations ->prepare_write and ->commit_write.
134 This means that both sparse and non-sparse (unencrypted and
135 uncompressed) files can now be extended using the normal write(2)
136 code path. There are two limitations at present and these are that
137 we never create sparse files and that we only have limited support
138 for highly fragmented files, i.e. ones whose data attribute is split
139 across multiple extents. When such a case is encountered,
140 EOPNOTSUPP is returned.
141 - $EA attributes can be both resident and non-resident.
142 - Use %z for size_t to fix compilation warnings. (Andrew Morton)
143 - Fix compilation warnings with gcc-4.0.2 on SUSE 10.0.
144 - Document extended attribute ($EA) NEED_EA flag. (Based on libntfs
145 patch by Yura Pakhuchiy.)
146
1472.1.24 - Lots of bug fixes and support more clean journal states.
148
149 - Support journals ($LogFile) which have been modified by chkdsk. This
150 means users can boot into Windows after we marked the volume dirty.
151 The Windows boot will run chkdsk and then reboot. The user can then
152 immediately boot into Linux rather than having to do a full Windows
153 boot first before rebooting into Linux and we will recognize such a
154 journal and empty it as it is clean by definition. Note, this only
155 works if chkdsk left the journal in an obviously clean state.
156 - Support journals ($LogFile) with only one restart page as well as
157 journals with two different restart pages. We sanity check both and
158 either use the only sane one or the more recent one of the two in the
159 case that both are valid.
160 - Add fs/ntfs/malloc.h::ntfs_malloc_nofs_nofail() which is analogous to
161 ntfs_malloc_nofs() but it performs allocations with __GFP_NOFAIL and
162 hence cannot fail.
163 - Use ntfs_malloc_nofs_nofail() in the two critical regions in
164 fs/ntfs/runlist.c::ntfs_runlists_merge(). This means we no longer
165 need to panic() if the allocation fails as it now cannot fail.
166 - Fix two nasty runlist merging bugs that had gone unnoticed so far.
167 Thanks to Stefano Picerno for the bug report.
168 - Remove two bogus BUG_ON()s from fs/ntfs/mft.c.
169 - Fix handling of valid but empty mapping pairs array in
170 fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress().
171 - Report unrepresentable inodes during ntfs_readdir() as KERN_WARNING
172 messages and include the inode number. Thanks to Yura Pakhuchiy for
173 pointing this out.
174 - Change ntfs_rl_truncate_nolock() to throw away the runlist if the new
175 length is zero.
176 - Add runlist.[hc]::ntfs_rl_punch_nolock() which punches a caller
177 specified hole into a runlist.
178 - Fix a bug in fs/ntfs/index.c::ntfs_index_lookup(). When the returned
179 index entry is in the index root, we forgot to set the @ir pointer in
180 the index context. Thanks to Yura Pakhuchiy for finding this bug.
181 - Remove bogus setting of PageError in ntfs_read_compressed_block().
182 - Add fs/ntfs/attrib.[hc]::ntfs_resident_attr_value_resize().
183 - Fix a bug in ntfs_map_runlist_nolock() where we forgot to protect
184 access to the allocated size in the ntfs inode with the size lock.
185 - Fix ntfs_attr_vcn_to_lcn_nolock() and ntfs_attr_find_vcn_nolock() to
186 return LCN_ENOENT when there is no runlist and the allocated size is
187 zero.
188 - Fix load_attribute_list() to handle the case of a NULL runlist.
189 - Fix handling of sparse attributes in ntfs_attr_make_non_resident().
190 - Add BUG() checks to ntfs_attr_make_non_resident() and ntfs_attr_set()
191 to ensure that these functions are never called for compressed or
192 encrypted attributes.
193 - Fix cluster (de)allocators to work when the runlist is NULL and more
194 importantly to take a locked runlist rather than them locking it
195 which leads to lock reversal.
196 - Truncate {a,c,m}time to the ntfs supported time granularity when
197 updating the times in the inode in ntfs_setattr().
198 - Fixup handling of sparse, compressed, and encrypted attributes in
199 fs/ntfs/inode.c::ntfs_read_locked_{,attr_,index_}inode(),
200 fs/ntfs/aops.c::ntfs_{read,write}page().
201 - Make ntfs_write_block() not instantiate sparse blocks if they contain
202 only zeroes.
203 - Optimize fs/ntfs/aops.c::ntfs_write_block() by extending the page
204 lock protection over the buffer submission for i/o which allows the
205 removal of the get_bh()/put_bh() pairs for each buffer.
206 - Fix fs/ntfs/aops.c::ntfs_{read,write}_block() to handle the case
207 where a concurrent truncate has truncated the runlist under our feet.
208 - Fix page_has_buffers()/page_buffers() handling in fs/ntfs/aops.c.
209 - In fs/ntfs/aops.c::ntfs_end_buffer_async_read(), use a bit spin lock
210 in the first buffer head instead of a driver global spin lock to
211 improve scalability.
212 - Minor fix to error handling and error message display in
213 fs/ntfs/aops.c::ntfs_prepare_nonresident_write().
214 - Change the mount options {u,f,d}mask to always parse the number as
215 an octal number to conform to how chmod(1) works, too. Thanks to
216 Giuseppe Bilotta and Horst von Brand for pointing out the errors of
217 my ways.
218 - Fix various bugs in the runlist merging code. (Based on libntfs
219 changes by Richard Russon.)
220 - Fix sparse warnings that have crept in over time.
221 - Change ntfs_cluster_free() to require a write locked runlist on entry
222 since we otherwise get into a lock reversal deadlock if a read locked
223 runlist is passed in. In the process also change it to take an ntfs
224 inode instead of a vfs inode as parameter.
225 - Fix the definition of the CHKD ntfs record magic. It had an off by
226 two error causing it to be CHKB instead of CHKD.
227 - Fix a stupid bug in __ntfs_bitmap_set_bits_in_run() which caused the
228 count to become negative and hence we had a wild memset() scribbling
229 all over the system's RAM.
230
2312.1.23 - Implement extension of resident files and make writing safe as well as
232 many bug fixes, cleanups, and enhancements...
233
234 - Add printk rate limiting for ntfs_warning() and ntfs_error() when
235 compiled without debug. This avoids a possible denial of service
236 attack. Thanks to Carl-Daniel Hailfinger from SuSE for pointing this
237 out.
238 - Fix compilation warnings on ia64. (Randy Dunlap)
239 - Use i_size_{read,write}() instead of reading i_size by hand and cache
240 the value where appropriate.
241 - Add size_lock to the ntfs_inode structure. This is an rw spinlock
242 and it locks against access to the inode sizes. Note, ->size_lock
243 is also accessed from irq context so you must use the _irqsave and
244 _irqrestore lock and unlock functions, respectively. Protect all
245 accesses to allocated_size, initialized_size, and compressed_size.
246 - Minor optimization to fs/ntfs/super.c::ntfs_statfs() and its helpers.
247 - Implement extension of resident files in the regular file write code
248 paths (fs/ntfs/aops.c::ntfs_{prepare,commit}_write()). At present
249 this only works until the data attribute becomes too big for the mft
250 record after which we abort the write returning -EOPNOTSUPP from
251 ntfs_prepare_write().
252 - Add disable_sparse mount option together with a per volume sparse
253 enable bit which is set appropriately and a per inode sparse disable
254 bit which is preset on some system file inodes as appropriate.
255 - Enforce that sparse support is disabled on NTFS volumes pre 3.0.
256 - Fix a bug in fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress() in
257 the creation of the unmapped runlist element for the base attribute
258 extent.
259 - Split ntfs_map_runlist() into ntfs_map_runlist() and a non-locking
260 helper ntfs_map_runlist_nolock() which is used by ntfs_map_runlist().
261 This allows us to map runlist fragments with the runlist lock already
262 held without having to drop and reacquire it around the call. Adapt
263 all callers.
264 - Change ntfs_find_vcn() to ntfs_find_vcn_nolock() which takes a locked
265 runlist. This allows us to find runlist elements with the runlist
266 lock already held without having to drop and reacquire it around the
267 call. Adapt all callers.
268 - Change time to u64 in time.h::ntfs2utc() as it otherwise generates a
269 warning in the do_div() call on sparc32. Thanks to Meelis Roos for
270 the report and analysis of the warning.
271 - Fix a nasty runlist merge bug when merging two holes.
272 - Set the ntfs_inode->allocated_size to the real allocated size in the
273 mft record for resident attributes (fs/ntfs/inode.c).
274 - Small readability cleanup to use "a" instead of "ctx->attr"
275 everywhere (fs/ntfs/inode.c).
276 - Make fs/ntfs/namei.c::ntfs_get_{parent,dentry} static and move the
277 definition of ntfs_export_ops from fs/ntfs/super.c to namei.c. Also,
278 declare ntfs_export_ops in fs/ntfs/ntfs.h.
279 - Correct sparse file handling. The compressed values need to be
280 checked and set in the ntfs inode as done for compressed files and
281 the compressed size needs to be used for vfs inode->i_blocks instead
282 of the allocated size, again, as done for compressed files.
283 - Add AT_EA in addition to AT_DATA to whitelist for being allowed to be
284 non-resident in fs/ntfs/attrib.c::ntfs_attr_can_be_non_resident().
285 - Add fs/ntfs/attrib.c::ntfs_attr_vcn_to_lcn_nolock() used by the new
286 write code.
287 - Fix bug in fs/ntfs/attrib.c::ntfs_find_vcn_nolock() where after
288 dropping the read lock and taking the write lock we were not checking
289 whether someone else did not already do the work we wanted to do.
290 - Rename fs/ntfs/attrib.c::ntfs_find_vcn_nolock() to
291 ntfs_attr_find_vcn_nolock() and update all callers.
292 - Add fs/ntfs/attrib.[hc]::ntfs_attr_make_non_resident().
293 - Fix sign of various error return values to be negative in
294 fs/ntfs/lcnalloc.c.
295 - Modify ->readpage and ->writepage (fs/ntfs/aops.c) so they detect and
296 handle the case where an attribute is converted from resident to
297 non-resident by a concurrent file write.
298 - Remove checks for NULL before calling kfree() since kfree() does the
299 checking itself. (Jesper Juhl)
300 - Some utilities modify the boot sector but do not update the checksum.
301 Thus, relax the checking in fs/ntfs/super.c::is_boot_sector_ntfs() to
302 only emit a warning when the checksum is incorrect rather than
303 refusing the mount. Thanks to Bernd Casimir for pointing this
304 problem out.
305 - Update attribute definition handling.
306 - Add NTFS_MAX_CLUSTER_SIZE and NTFS_MAX_PAGES_PER_CLUSTER constants.
307 - Use NTFS_MAX_CLUSTER_SIZE in super.c instead of hard coding 0x10000.
308 - Use MAX_BUF_PER_PAGE instead of variable sized array allocation for
309 better code generation and one less sparse warning in fs/ntfs/aops.c.
310 - Remove spurious void pointer casts from fs/ntfs/. (Pekka Enberg)
311 - Use C99 style structure initialization after memory allocation where
312 possible (fs/ntfs/{attrib.c,index.c,super.c}). Thanks to Al Viro and
313 Pekka Enberg.
314 - Stamp the transaction log ($UsnJrnl), aka user space journal, if it
315 is active on the volume and we are mounting read-write or remounting
316 from read-only to read-write.
317 - Fix a bug in address space operations error recovery code paths where
318 if the runlist was not mapped at all and a mapping error occurred we
319 would leave the runlist locked on exit to the function so that the
320 next access to the same file would try to take the lock and deadlock.
321 - Detect the case when Windows has been suspended to disk on the volume
322 to be mounted and if this is the case do not allow (re)mounting
323 read-write. This is done by parsing hiberfil.sys if present.
324 - Fix several occurrences of a bug where we would perform 'var & ~const'
325 with a 64-bit variable and an int, i.e. 32-bit, constant. This causes
326 the higher order 32-bits of the 64-bit variable to be zeroed. To fix
327 this cast the 'const' to the same 64-bit type as 'var'.
328 - Change the runlist terminator of the newly allocated cluster(s) to
329 LCN_ENOENT in ntfs_attr_make_non_resident(). Otherwise the runlist
330 code gets confused.
331 - Add an extra parameter @last_vcn to ntfs_get_size_for_mapping_pairs()
332 and ntfs_mapping_pairs_build() to allow the runlist encoding to be
333 partial which is desirable when filling holes in sparse attributes.
334 Update all callers.
335 - Change ntfs_map_runlist_nolock() to only decompress the mapping pairs
336 if the requested vcn is inside it. Otherwise we get into problems
337 when we try to map an out of bounds vcn because we then try to map
338 the already mapped runlist fragment which causes
339 ntfs_mapping_pairs_decompress() to fail and return error. Update
340 ntfs_attr_find_vcn_nolock() accordingly.
341 - Fix a nasty deadlock that appeared in recent kernels.
342 The situation: VFS inode X on a mounted ntfs volume is dirty. For
343 same inode X, the ntfs_inode is dirty and thus corresponding on-disk
344 inode, i.e. mft record, which is in a dirty PAGE_CACHE_PAGE belonging
345 to the table of inodes, i.e. $MFT, inode 0.
346 What happens:
347 Process 1: sys_sync()/umount()/whatever... calls
348 __sync_single_inode() for $MFT -> do_writepages() -> write_page for
349 the dirty page containing the on-disk inode X, the page is now locked
350 -> ntfs_write_mst_block() which clears PageUptodate() on the page to
351 prevent anyone else getting hold of it whilst it does the write out.
352 This is necessary as the on-disk inode needs "fixups" applied before
353 the write to disk which are removed again after the write and
354 PageUptodate is then set again. It then analyses the page looking
355 for dirty on-disk inodes and when it finds one it calls
356 ntfs_may_write_mft_record() to see if it is safe to write this
357 on-disk inode. This then calls ilookup5() to check if the
358 corresponding VFS inode is in icache(). This in turn calls ifind()
359 which waits on the inode lock via wait_on_inode whilst holding the
360 global inode_lock.
361 Process 2: pdflush results in a call to __sync_single_inode for the
362 same VFS inode X on the ntfs volume. This locks the inode (I_LOCK)
363 then calls write-inode -> ntfs_write_inode -> map_mft_record() ->
364 read_cache_page() for the page (in page cache of table of inodes
365 $MFT, inode 0) containing the on-disk inode. This page has
366 PageUptodate() clear because of Process 1 (see above) so
367 read_cache_page() blocks when it tries to take the page lock for the
368 page so it can call ntfs_read_page().
369 Thus Process 1 is holding the page lock on the page containing the
370 on-disk inode X and it is waiting on the inode X to be unlocked in
371 ifind() so it can write the page out and then unlock the page.
372 And Process 2 is holding the inode lock on inode X and is waiting for
373 the page to be unlocked so it can call ntfs_readpage() or discover
374 that Process 1 set PageUptodate() again and use the page.
375 Thus we have a deadlock due to ifind() waiting on the inode lock.
376 The solution: The fix is to use the newly introduced
377 ilookup5_nowait() which does not wait on the inode's lock and hence
378 avoids the deadlock. This is safe as we do not care about the VFS
379 inode and only use the fact that it is in the VFS inode cache and the
380 fact that the vfs and ntfs inodes are one struct in memory to find
381 the ntfs inode in memory if present. Also, the ntfs inode has its
382 own locking so it does not matter if the vfs inode is locked.
383 - Fix bug in mft record writing where we forgot to set the device in
384 the buffers when mapping them after the VM had discarded them.
385 Thanks to Martin MOKREJŠ for the bug report.
386
3872.1.22 - Many bug and race fixes and error handling improvements.
388
389 - Improve error handling in fs/ntfs/inode.c::ntfs_truncate().
390 - Change fs/ntfs/inode.c::ntfs_truncate() to return an error code
391 instead of void and provide a helper ntfs_truncate_vfs() for the
392 vfs ->truncate method.
393 - Add a new ntfs inode flag NInoTruncateFailed() and modify
394 fs/ntfs/inode.c::ntfs_truncate() to set and clear it appropriately.
395 - Fix min_size and max_size definitions in ATTR_DEF structure in
396 fs/ntfs/layout.h to be signed.
397 - Add attribute definition handling helpers to fs/ntfs/attrib.[hc]:
398 ntfs_attr_size_bounds_check(), ntfs_attr_can_be_non_resident(), and
399 ntfs_attr_can_be_resident(), which in turn use the new private helper
400 ntfs_attr_find_in_attrdef().
401 - In fs/ntfs/aops.c::mark_ntfs_record_dirty(), take the
402 mapping->private_lock around the dirtying of the buffer heads
403 analogous to the way it is done in __set_page_dirty_buffers().
404 - Ensure the mft record size does not exceed the PAGE_CACHE_SIZE at
405 mount time as this cannot work with the current implementation.
406 - Check for location of attribute name and improve error handling in
407 general in fs/ntfs/inode.c::ntfs_read_locked_inode() and friends.
408 - In fs/ntfs/aops.c::ntfs_writepage(), if the page is fully outside
409 i_size, i.e. race with truncate, invalidate the buffers on the page
410 so that they become freeable and hence the page does not leak.
411 - Remove unused function fs/ntfs/runlist.c::ntfs_rl_merge(). (Adrian
412 Bunk)
413 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_find() that resulted in
414 a NULL pointer dereference in the error code path when a corrupt
415 attribute was found. (Thanks to Domen Puncer for the bug report.)
416 - Add MODULE_VERSION() to fs/ntfs/super.c.
417 - Make several functions and variables static. (Adrian Bunk)
418 - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() so it allocates
419 buffers for the page if they are not present and then marks the
420 buffers belonging to the ntfs record dirty. This causes the buffers
421 to become busy and hence they are safe from removal until the page
422 has been written out.
423 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find() in the
424 error handling code path that resulted in a BUG() due to trying to
425 unmap an extent mft record when the mapping of it had failed and it
426 thus was not mapped. (Thanks to Ken MacFerrin for the bug report.)
427 - Drop the runlist lock after the vcn has been read in
428 fs/ntfs/lcnalloc.c::__ntfs_cluster_free().
429 - Rewrite handling of multi sector transfer errors. We now do not set
430 PageError() when such errors are detected in the async i/o handler
431 fs/ntfs/aops.c::ntfs_end_buffer_async_read(). All users of mst
432 protected attributes now check the magic of each ntfs record as they
433 use it and act appropriately. This has the effect of making errors
434 granular per ntfs record rather than per page which solves the case
435 where we cannot access any of the ntfs records in a page when a
436 single one of them had an mst error. (Thanks to Ken MacFerrin for
437 the bug report.)
438 - Fix error handling in fs/ntfs/quota.c::ntfs_mark_quotas_out_of_date()
439 where we failed to release i_mutex on the $Quota/$Q attribute inode.
440 - Fix bug in handling of bad inodes in fs/ntfs/namei.c::ntfs_lookup().
441 - Add mapping of unmapped buffers to all remaining code paths, i.e.
442 fs/ntfs/aops.c::ntfs_write_mst_block(), mft.c::ntfs_sync_mft_mirror(),
443 and write_mft_record_nolock(). From now on we require that the
444 complete runlist for the mft mirror is always mapped into memory.
445 - Add creation of buffers to fs/ntfs/mft.c::ntfs_sync_mft_mirror().
446 - Improve error handling in fs/ntfs/aops.c::ntfs_{read,write}_block().
447 - Cleanup fs/ntfs/aops.c::ntfs_{read,write}page() since we know that a
448 resident attribute will be smaller than a page which makes the code
449 simpler. Also make the code more tolerant to concurrent ->truncate.
450
4512.1.21 - Fix some races and bugs, rewrite mft write code, add mft allocator.
452
453 - Implement extent mft record deallocation
454 fs/ntfs/mft.c::ntfs_extent_mft_record_free().
455 - Split runlist related functions off from attrib.[hc] to runlist.[hc].
456 - Add vol->mft_data_pos and initialize it at mount time.
457 - Rename init_runlist() to ntfs_init_runlist(), ntfs_vcn_to_lcn() to
458 ntfs_rl_vcn_to_lcn(), decompress_mapping_pairs() to
459 ntfs_mapping_pairs_decompress(), ntfs_merge_runlists() to
460 ntfs_runlists_merge() and adapt all callers.
461 - Add fs/ntfs/runlist.[hc]::ntfs_get_nr_significant_bytes(),
462 ntfs_get_size_for_mapping_pairs(), ntfs_write_significant_bytes(),
463 and ntfs_mapping_pairs_build(), adapted from libntfs.
464 - Make fs/ntfs/lcnalloc.c::ntfs_cluster_free_from_rl_nolock() not
465 static and add a declaration for it to lcnalloc.h.
466 - Add fs/ntfs/lcnalloc.h::ntfs_cluster_free_from_rl() which is a static
467 inline wrapper for ntfs_cluster_free_from_rl_nolock() which takes the
468 cluster bitmap lock for the duration of the call.
469 - Add fs/ntfs/attrib.[hc]::ntfs_attr_record_resize().
470 - Implement the equivalent of memset() for an ntfs attribute in
471 fs/ntfs/attrib.[hc]::ntfs_attr_set() and switch
472 fs/ntfs/logfile.c::ntfs_empty_logfile() to using it.
473 - Remove unnecessary casts from LCN_* constants.
474 - Implement fs/ntfs/runlist.c::ntfs_rl_truncate_nolock().
475 - Add MFT_RECORD_OLD as a copy of MFT_RECORD in fs/ntfs/layout.h and
476 change MFT_RECORD to contain the NTFS 3.1+ specific fields.
477 - Add a helper function fs/ntfs/aops.c::mark_ntfs_record_dirty() which
478 marks all buffers belonging to an ntfs record dirty, followed by
479 marking the page the ntfs record is in dirty and also marking the vfs
480 inode containing the ntfs record dirty (I_DIRTY_PAGES).
481 - Switch fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to using the
482 new helper fs/ntfs/aops.c::mark_ntfs_record_dirty() and remove the no
483 longer needed fs/ntfs/index.[hc]::__ntfs_index_entry_mark_dirty().
484 - Move ntfs_{un,}map_page() from ntfs.h to aops.h and fix resulting
485 include errors.
486 - Move the typedefs for runlist_element and runlist from types.h to
487 runlist.h and fix resulting include errors.
488 - Remove unused {__,}format_mft_record() from fs/ntfs/mft.c.
489 - Modify fs/ntfs/mft.c::__mark_mft_record_dirty() to use the helper
490 mark_ntfs_record_dirty() which also changes the behaviour in that we
491 now set the buffers belonging to the mft record dirty as well as the
492 page itself.
493 - Update fs/ntfs/mft.c::write_mft_record_nolock() and sync_mft_mirror()
494 to cope with the fact that there now are dirty buffers in mft pages.
495 - Update fs/ntfs/inode.c::ntfs_write_inode() to also use the helper
496 mark_ntfs_record_dirty() and thus to set the buffers belonging to the
497 mft record dirty as well as the page itself.
498 - Fix compiler warnings on x86-64 in fs/ntfs/dir.c. (Randy Dunlap,
499 slightly modified by me)
500 - Add fs/ntfs/mft.c::try_map_mft_record() which fails with -EALREADY if
501 the mft record is already locked and otherwise behaves the same way
502 as fs/ntfs/mft.c::map_mft_record().
503 - Modify fs/ntfs/mft.c::write_mft_record_nolock() so that it only
504 writes the mft record if the buffers belonging to it are dirty.
505 Otherwise we assume that it was written out by other means already.
506 - Attempting to write outside initialized size is _not_ a bug so remove
507 the bug check from fs/ntfs/aops.c::ntfs_write_mst_block(). It is in
508 fact required to write outside initialized size when preparing to
509 extend the initialized size.
510 - Map the page instead of using page_address() before writing to it in
511 fs/ntfs/aops.c::ntfs_mft_writepage().
512 - Provide exclusion between opening an inode / mapping an mft record
513 and accessing the mft record in fs/ntfs/mft.c::ntfs_mft_writepage()
514 by setting the page not uptodate throughout ntfs_mft_writepage().
515 - Clear the page uptodate flag in fs/ntfs/aops.c::ntfs_write_mst_block()
516 to ensure no one can see the page whilst the mst fixups are applied.
517 - Add the helper fs/ntfs/mft.c::ntfs_may_write_mft_record() which
518 checks if an mft record may be written out safely obtaining any
519 necessary locks in the process. This is used by
520 fs/ntfs/aops.c::ntfs_write_mst_block().
521 - Modify fs/ntfs/aops.c::ntfs_write_mst_block() to also work for
522 writing mft records and improve its error handling in the process.
523 Now if any of the records in the page fail to be written out, all
524 other records will be written out instead of aborting completely.
525 - Remove ntfs_mft_aops and update all users to use ntfs_mst_aops.
526 - Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to set the
527 ntfs_mst_aops for all inodes which are NInoMstProtected() and
528 ntfs_aops for all other inodes.
529 - Rename fs/ntfs/mft.c::sync_mft_mirror{,_umount}() to
530 ntfs_sync_mft_mirror{,_umount}() and change their parameters so they
531 no longer require an ntfs inode to be present. Update all callers.
532 - Cleanup the error handling in fs/ntfs/mft.c::ntfs_sync_mft_mirror().
533 - Clear the page uptodate flag in fs/ntfs/mft.c::ntfs_sync_mft_mirror()
534 to ensure no one can see the page whilst the mst fixups are applied.
535 - Remove the no longer needed fs/ntfs/mft.c::ntfs_mft_writepage() and
536 fs/ntfs/mft.c::try_map_mft_record().
537 - Fix callers of fs/ntfs/aops.c::mark_ntfs_record_dirty() to call it
538 with the ntfs inode which contains the page rather than the ntfs
539 inode the mft record of which is in the page.
540 - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by moving the
541 index inode bitmap inode release code from there to
542 fs/ntfs/inode.c::ntfs_clear_big_inode(). (Thanks to Christoph
543 Hellwig for spotting this.)
544 - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by taking the
545 inode semaphore around the code that sets ni->itype.index.bmp_ino to
546 NULL and reorganize the code to optimize it a bit. (Thanks to
547 Christoph Hellwig for spotting this.)
548 - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() to no longer take the
549 ntfs inode as a parameter as this is confusing and misleading and the
550 needed ntfs inode is available via NTFS_I(page->mapping->host).
551 Adapt all callers to this change.
552 - Modify fs/ntfs/mft.c::write_mft_record_nolock() and
553 fs/ntfs/aops.c::ntfs_write_mst_block() to only check the dirty state
554 of the first buffer in a record and to take this as the ntfs record
555 dirty state. We cannot look at the dirty state for subsequent
556 buffers because we might be racing with
557 fs/ntfs/aops.c::mark_ntfs_record_dirty().
558 - Move the static inline ntfs_init_big_inode() from fs/ntfs/inode.c to
559 inode.h and make fs/ntfs/inode.c::__ntfs_init_inode() non-static and
560 add a declaration for it to inode.h. Fix some compilation issues
561 that resulted due to #includes and header file interdependencies.
562 - Simplify setup of i_mode in fs/ntfs/inode.c::ntfs_read_locked_inode().
563 - Add helpers fs/ntfs/layout.h::MK_MREF() and MK_LE_MREF().
564 - Modify fs/ntfs/mft.c::map_extent_mft_record() to only verify the mft
565 record sequence number if it is specified (i.e. not zero).
566 - Add fs/ntfs/mft.[hc]::ntfs_mft_record_alloc() and various helper
567 functions used by it.
568 - Update Documentation/filesystems/ntfs.txt with instructions on how to
569 use the Device-Mapper driver with NTFS ftdisk/LDM raid. This removes
570 the linear raid problem with the Software RAID / MD driver when one
571 or more of the devices has an odd number of sectors.
572
5732.1.20 - Fix two stupid bugs introduced in 2.1.18 release.
574
575 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_reinit_search_ctx()
576 where we did not clear ctx->al_entry but it was still set due to
577 changes in ntfs_attr_lookup() and ntfs_external_attr_find() in
578 particular.
579 - Fix another stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find()
580 where we forgot to unmap the extent mft record when we had finished
581 enumerating an attribute which caused a bug check to trigger when the
582 VFS calls ->clear_inode.
583
5842.1.19 - Many cleanups, improvements, and a minor bug fix.
585
586 - Update ->setattr (fs/ntfs/inode.c::ntfs_setattr()) to refuse to
587 change the uid, gid, and mode of an inode as we do not support NTFS
588 ACLs yet.
589 - Remove BKL use from ntfs_setattr() syncing up with the rest of the
590 kernel.
591 - Get rid of the ugly transparent union in fs/ntfs/dir.c::ntfs_readdir()
592 and ntfs_filldir() as per suggestion from Al Viro.
593 - Change '\0' and L'\0' to simply 0 as per advice from Linus Torvalds.
594 - Update ->truncate (fs/ntfs/inode.c::ntfs_truncate()) to check if the
595 inode size has changed and to only output an error if so.
596 - Rename fs/ntfs/attrib.h::attribute_value_length() to ntfs_attr_size().
597 - Add le{16,32,64} as well as sle{16,32,64} data types to
598 fs/ntfs/types.h.
599 - Change ntfschar to be le16 instead of u16 in fs/ntfs/types.h.
600 - Add le versions of VCN, LCN, and LSN called leVCN, leLCN, and leLSN,
601 respectively, to fs/ntfs/types.h.
602 - Update endianness conversion macros in fs/ntfs/endian.h to use the
603 new types as appropriate.
604 - Do proper type casting when using sle64_to_cpup() in fs/ntfs/dir.c
605 and index.c.
606 - Add leMFT_REF data type to fs/ntfs/layout.h.
607 - Update all NTFS header files with the new little endian data types.
608 Affected files are fs/ntfs/layout.h, logfile.h, and time.h.
609 - Do proper type casting when using ntfs_is_*_recordp() in
610 fs/ntfs/logfile.c, mft.c, and super.c.
611 - Fix all the sparse bitwise warnings. Had to change all the typedef
612 enums storing little endian values to simple enums plus a typedef for
613 the datatype to make sparse happy.
614 - Fix a bug found by the new sparse bitwise warnings where the default
615 upcase table was defined as a pointer to wchar_t rather than ntfschar
616 in fs/ntfs/ntfs.h and super.c.
617 - Change {const_,}cpu_to_le{16,32}(0) to just 0 as suggested by Al Viro.
618
2.1.18 - Fix scheduling latencies at mount time as well as an endianness bug.

	- Remove vol->nr_mft_records as it was pretty meaningless and optimize
	  the calculation of total/free inodes as used by statfs().
	- Fix scheduling latencies in ntfs_fill_super() by dropping the BKL
	  because the code itself is using the ntfs_lock semaphore which
	  provides safe locking. (Ingo Molnar)
	- Fix a potential bug in fs/ntfs/mft.c::map_extent_mft_record() that
	  could occur in the future for when we start closing/freeing extent
	  inodes if we don't set base_ni->ext.extent_ntfs_inos to NULL after
	  we free it.
	- Rename {find,lookup}_attr() to ntfs_attr_{find,lookup}() as well as
	  find_external_attr() to ntfs_external_attr_find() to cleanup the
	  namespace a bit and to be more consistent with libntfs.
	- Rename {{re,}init,get,put}_attr_search_ctx() to
	  ntfs_attr_{{re,}init,get,put}_search_ctx() as well as the type
	  attr_search_context to ntfs_attr_search_ctx.
	- Force use of ntfs_attr_find() in ntfs_attr_lookup() when searching
	  for the attribute list attribute itself.
	- Fix endianness bug in ntfs_external_attr_find().
	- Change ntfs_{external_,}attr_find() to return 0 on success, -ENOENT
	  if the attribute is not found, and -EIO on real error. In the case
	  of -ENOENT, the search context is updated to describe the attribute
	  before which the attribute being searched for would need to be
	  inserted if such an action were to be desired and in the case of
	  ntfs_external_attr_find() the search context is also updated to
	  indicate the attribute list entry before which the attribute list
	  entry of the attribute being searched for would need to be inserted
	  if such an action were to be desired. Also make ntfs_find_attr()
	  static and remove its prototype from attrib.h as it is not used
	  anywhere other than attrib.c. Update ntfs_attr_lookup() and all
	  callers of ntfs_{external,}attr_{find,lookup}() for the new return
	  values. (See the sketch after this entry.)
	- Minor cleanup of fs/ntfs/inode.c::ntfs_init_locked_inode().

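	To illustrate the new return convention, a hypothetical caller might
	look like this (argument list abbreviated and illustrative; see
	fs/ntfs/attrib.h for the real prototypes):

		ntfs_attr_search_ctx *ctx;
		int err;

		/* ctx set up beforehand via ntfs_attr_get_search_ctx(). */
		err = ntfs_attr_lookup(AT_DATA, NULL, 0, CASE_SENSITIVE, 0,
				NULL, 0, ctx);
		if (!err) {
			/* Found: ctx->attr points at the attribute record. */
		} else if (err == -ENOENT) {
			/*
			 * Not found: ctx now describes the attribute in
			 * front of which the one searched for would need
			 * to be inserted.
			 */
		} else {
			/* err == -EIO: real error, e.g. corrupt mft record. */
		}
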
2.1.17 - Fix bugs in mount time error code paths and other updates.

	- Implement bitmap modification code (fs/ntfs/bitmap.[hc]). This
	  includes functions to set/clear a single bit or a run of bits. (A
	  sketch of the helpers follows this entry.)
	- Add fs/ntfs/attrib.[hc]::ntfs_find_vcn() which returns the locked
	  runlist element containing a particular vcn. It also takes care of
	  mapping any needed runlist fragments.
	- Implement cluster (de-)allocation code (fs/ntfs/lcnalloc.[hc]).
	- Load attribute definition table from $AttrDef at mount time.
	- Fix bugs in mount time error code paths involving (de)allocation of
	  the default and volume upcase tables.
	- Remove ntfs_nr_mounts as it is no longer used.

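	The bitmap helpers might plausibly have the following shape (a
	hypothetical sketch; the real prototypes live in fs/ntfs/bitmap.h and
	may differ). The bitmap attribute is accessed through its attribute
	inode vi via the page cache:

		/* Set or clear the single bit @bit in the bitmap of @vi. */
		int ntfs_bitmap_set_bit(struct inode *vi, const s64 bit);
		int ntfs_bitmap_clear_bit(struct inode *vi, const s64 bit);

		/* Set or clear the run of @count bits starting at @start_bit. */
		int ntfs_bitmap_set_bits_in_run(struct inode *vi,
				const s64 start_bit, const s64 count);
		int ntfs_bitmap_clear_bits_in_run(struct inode *vi,
				const s64 start_bit, const s64 count);
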
2.1.16 - Implement access time updates, file sync, async io, and read/writev.

	- Add support for readv/writev and aio_read/aio_write (fs/ntfs/file.c).
	  This is done by setting the appropriate file operations pointers to
	  the generic helper functions provided by mm/filemap.c. (See the
	  sketch after this entry.)
	- Implement fsync, fdatasync, and msync both for files (fs/ntfs/file.c)
	  and directories (fs/ntfs/dir.c).
	- Add support for {a,m,c}time updates to inode.c::ntfs_write_inode().
	  Note, except for the root directory and any other system files opened
	  by the user, the system files will not have their access times
	  updated as they are only accessed at the inode level and hence the
	  file level functions which cause the times to be updated are never
	  invoked.

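	A minimal sketch of what wiring in the generic helpers looks like
	(illustrative only, using C99 initializers; the real table is in
	fs/ntfs/file.c, the driver method names here are assumptions, and the
	exact set of members depends on the kernel version):

		struct file_operations ntfs_file_ops = {
			.llseek		= generic_file_llseek,
			.read		= generic_file_read,
			.readv		= generic_file_readv,
			.aio_read	= generic_file_aio_read,
			.mmap		= generic_file_mmap,
			.open		= ntfs_file_open,
			.fsync		= ntfs_file_fsync,
		};
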
2.1.15 - Invalidate quotas when (re)mounting read-write.

	- Add new element itype.index.collation_rule to the ntfs inode
	  structure and set it appropriately in ntfs_read_locked_inode().
	- Implement a new inode type "index" to allow efficient access to the
	  indices found in various system files and adapt inode handling
	  accordingly (fs/ntfs/inode.[hc]). An index inode is essentially an
	  attribute inode (NInoAttr() is true) with an attribute type of
	  AT_INDEX_ALLOCATION. As such, it is no longer allowed to call
	  ntfs_attr_iget() with an attribute type of AT_INDEX_ALLOCATION as
	  there would be no way to distinguish between normal attribute inodes
	  and index inodes. The function to obtain an index inode is
	  ntfs_index_iget() and it uses the helper function
	  ntfs_read_locked_index_inode(). Note, we do not overload
	  ntfs_attr_iget() as indices consist of multiple attributes so using
	  ntfs_attr_iget() to obtain an index inode would be confusing.
	- Ensure that there is no overflow when doing page->index <<
	  PAGE_CACHE_SHIFT by casting page->index to s64 in fs/ntfs/aops.c.
	- Use atomic kmap instead of kmap() in fs/ntfs/aops.c::ntfs_read_page()
	  and ntfs_read_block().
	- Use case sensitive attribute lookups instead of case insensitive ones.
	- Lock all page cache pages belonging to mst protected attributes while
	  accessing them to ensure we never see corrupt data while the page is
	  under writeout.
	- Add framework for generic ntfs collation (fs/ntfs/collation.[hc]).
	  We have ntfs_is_collation_rule_supported() to check if the collation
	  rule you want to use is supported and ntfs_collation() which actually
	  collates two data items. We currently only support COLLATION_BINARY
	  and COLLATION_NTOFS_ULONG but support for other collation rules will
	  be added as the need arises.
	- Add a new type, ntfs_index_context, to allow retrieval of an index
	  entry using the corresponding index key. To get an index context,
	  use ntfs_index_ctx_get() and to release it, use ntfs_index_ctx_put().
	  This also adds a new slab cache for the index contexts. To lookup a
	  key in an index inode, use ntfs_index_lookup(). After modifying an
	  index entry, call ntfs_index_entry_flush_dcache_page() followed by
	  ntfs_index_entry_mark_dirty() to ensure the changes are written out
	  to disk. For details see fs/ntfs/index.[hc]. Note, at present, if
	  an index entry is in the index allocation attribute rather than the
	  index root attribute it will not be written out (you will get a
	  warning message about discarded changes instead). (A usage sketch
	  follows this entry.)
	- Load the quota file ($Quota) and check if quota tracking is enabled
	  and if so, mark the quotas out of date. This causes Windows to
	  rescan the volume on boot and update all quota entries.
	- Add a set_page_dirty address space operation for ntfs_m[fs]t_aops.
	  It is simply set to __set_page_dirty_nobuffers() to make sure that
	  running set_page_dirty() on a page containing mft/ntfs records will
	  not affect the dirty state of the page buffers.
	- Add fs/ntfs/index.c::__ntfs_index_entry_mark_dirty() which sets all
	  buffers that are inside the ntfs record in the page dirty after which
	  it sets the page dirty. This allows ->writepage to only write the
	  dirty index records rather than having to write all the records in
	  the page. Modify fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to
	  use this rather than __set_page_dirty_nobuffers().
	- Implement fs/ntfs/aops.c::ntfs_write_mst_block() which enables the
	  writing of page cache pages belonging to mst protected attributes
	  like the index allocation attribute in directory indices and other
	  indices like $Quota/$Q, etc. This means that the quota is now marked
	  out of date on all volumes rather than only on ones where the quota
	  defaults entry is in the index root attribute of the $Quota/$Q index.

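	A sketch of the index context usage pattern described above (function
	names as given in this entry; the surrounding declarations and error
	handling are illustrative):

		ntfs_index_context *ictx;
		int err;

		/* idx_ni is the index inode, key/key_len the index key. */
		ictx = ntfs_index_ctx_get(idx_ni);
		if (!ictx)
			return -ENOMEM;
		err = ntfs_index_lookup(key, key_len, ictx);
		if (!err) {
			/* Modify the found index entry here, then... */
			ntfs_index_entry_flush_dcache_page(ictx);
			ntfs_index_entry_mark_dirty(ictx);
		}
		ntfs_index_ctx_put(ictx);
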
2.1.14 - Fix an NFSd caused deadlock reported by several users.

	- Modify fs/ntfs/dir.c::ntfs_readdir() to copy the index root attribute
	  value to a buffer so that we can put the search context and unmap the
	  mft record before calling the filldir() callback. We need to do this
	  because of NFSd which calls ->lookup() from its filldir() callback,
	  and this causes NTFS to deadlock: ntfs_lookup() maps the mft record
	  of the directory and since ntfs_readdir() has got it mapped already
	  ntfs_lookup() deadlocks.

2.1.13 - Enable overwriting of resident files and housekeeping of system files.

	- Implement writing of mft records (fs/ntfs/mft.[hc]), which includes
	  keeping the mft mirror in sync with the mft when mirrored mft records
	  are written. The functions are write_mft_record{,_nolock}(). The
	  implementation is quite rudimentary for now with lots of things not
	  implemented yet but I am not sure any of them can actually occur so
	  I will wait for people to hit each one and only then implement it.
	- Commit open system inodes at umount time. This should make it
	  virtually impossible for sync_mft_mirror_umount() to ever be needed.
	- Implement ->write_inode (fs/ntfs/inode.c::ntfs_write_inode()) for the
	  ntfs super operations. This gives us inode writing via the VFS inode
	  dirty code paths. Note: Access time updates are not implemented yet.
	- Implement fs/ntfs/mft.[hc]::{,__}mark_mft_record_dirty() and make
	  fs/ntfs/aops.c::ntfs_writepage() and ntfs_commit_write() use it, thus
	  finally enabling resident file overwrite! (-8 This also includes a
	  placeholder for ->writepage (ntfs_mft_writepage()), which for now
	  just redirties the page and returns. Also, at umount time, we for
	  now throw away all mft data page cache pages after the last call to
	  ntfs_commit_inode() in the hope that all inodes will have been
	  written out by then and hence no dirty (meta)data will be lost. We
	  also check for this case and emit an error message telling the user
	  to run chkdsk.
	- Use set_page_writeback() and end_page_writeback() in the resident
	  attribute code path of fs/ntfs/aops.c::ntfs_writepage() otherwise
	  the radix-tree tag PAGECACHE_TAG_DIRTY remains set even though the
	  page is clean.
	- Implement ntfs_mft_writepage() so it now checks if any of the mft
	  records in the page are dirty and if so redirties the page and
	  returns. Otherwise it just returns (after doing set_page_writeback(),
	  unlock_page(), and end_page_writeback(), without which the radix-tree
	  tag PAGECACHE_TAG_DIRTY would remain set even though the page is
	  clean), thus allowing the VM to do with the page as it pleases. Also,
	  at umount time, now only throw away dirty mft (meta)data pages if
	  dirty inodes are present and ask the user to email us if they see
	  this happening.
	- Add functions ntfs_{clear,set}_volume_flags(), to modify the volume
	  information flags (fs/ntfs/super.c).
	- Mark the volume dirty when (re)mounting read-write and mark it clean
	  when unmounting or remounting read-only. If any volume errors are
	  found, the volume is left marked dirty to force chkdsk to run.
	- Add code to set the NT4 compatibility flag when (re)mounting
	  read-write for newer NTFS versions but leave it commented out for now
	  since we do not make any modifications that are NTFS 1.2 specific yet
	  and since setting this flag breaks Captive-NTFS which is not nice.
	  This code must be enabled once we start writing NTFS 1.2 specific
	  changes, otherwise the Windows NTFS driver might crash / cause
	  corruption.

2.1.12 - Fix the second fix to the decompression engine and some cleanups.

	- Add a new address space operations struct, ntfs_mst_aops, for mst
	  protected attributes. This is because the default ntfs_aops do not
	  make sense with mst protected data and were they to write anything to
	  such an attribute they would cause data corruption so we provide
	  ntfs_mst_aops which does not have any write related operations set.
	- Cleanup dirty ntfs inode handling (fs/ntfs/inode.[hc]) which also
	  includes an adapted ntfs_commit_inode() and an implementation of
	  ntfs_write_inode() which for now just cleans dirty inodes without
	  writing them (it does emit a warning that this is happening).
	- Undo the second decompression engine fix (see 2.1.9 release ChangeLog
	  entry) as it was only fixing a theoretical bug but at the same time
	  it badly broke the handling of sparse and uncompressed compression
	  blocks.

2.1.11 - Driver internal cleanups.

	- Only build logfile.o if building the driver with read-write support.
	- Really final white space cleanups.
	- Use generic_ffs() instead of ffs() in logfile.c which allows the
	  log_page_size variable to be optimized by gcc into a constant.
	- Rename uchar_t to ntfschar everywhere as uchar_t is unsigned 1-byte
	  char as defined by POSIX and as found on some systems.

2.1.10 - Force read-only (re)mounting of volumes with unsupported volume flags.

	- Finish off the white space cleanups (remove trailing spaces, etc).
	- Clean up ntfs_fill_super() and ntfs_read_inode_mount() by removing
	  the kludges around the first iget(). Instead of (re)setting ->s_op
	  we have the $MFT inode set up by explicit new_inode() / set ->i_ino /
	  insert_inode_hash() / call ntfs_read_inode_mount() directly. This
	  kills the need for second super_operations and allows us to return
	  an error from ntfs_read_inode_mount() without resorting to ugly
	  "poisoning" tricks. (Al Viro)
	- Force read-only (re)mounting if any of the following bits are set in
	  the volume information flags:
		VOLUME_IS_DIRTY, VOLUME_RESIZE_LOG_FILE,
		VOLUME_UPGRADE_ON_MOUNT, VOLUME_DELETE_USN_UNDERWAY,
		VOLUME_REPAIR_OBJECT_ID, VOLUME_MODIFIED_BY_CHKDSK
	  To make this easier we define VOLUME_MUST_MOUNT_RO_MASK with all the
	  above bits set so the test is made easy. (See the sketch after this
	  entry.)

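	The combined mask and its use might look roughly like this (a sketch;
	in the real fs/ntfs/layout.h the flag constants are little endian
	values, which is omitted here for brevity):

		#define VOLUME_MUST_MOUNT_RO_MASK	\
			(VOLUME_IS_DIRTY		| \
			 VOLUME_RESIZE_LOG_FILE		| \
			 VOLUME_UPGRADE_ON_MOUNT	| \
			 VOLUME_DELETE_USN_UNDERWAY	| \
			 VOLUME_REPAIR_OBJECT_ID	| \
			 VOLUME_MODIFIED_BY_CHKDSK)

		if (vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
			/* Refuse read-write, force a read-only (re)mount. */
		}
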
2.1.9 - Fix two bugs in decompression engine.

	- Fix a bug where we would not always detect that we have reached the
	  end of a compression block because we were ending at minus one byte
	  which is effectively the same as being at the end. The fix is to
	  check whether the uncompressed buffer has been fully filled and if so
	  we assume we have reached the end of the compression block. A big
	  thank you to Marcin Gibuła for the bug report, the assistance in
	  tracking down the bug and testing the fix.
	- Fix a possible bug where when a compressed read is truncated to the
	  end of the file, the offset inside the last page was not truncated.

2.1.8 - Handle $MFT mirror and $LogFile, improve time handling, and cleanups.

	- Use get_bh() instead of manual atomic_inc() in fs/ntfs/compress.c.
	- Modify fs/ntfs/time.c::ntfs2utc(), get_current_ntfs_time(), and
	  utc2ntfs() to work with struct timespec instead of time_t on the
	  Linux UTC time side thus preserving the full precision of the NTFS
	  time and only losing up to 99 nanoseconds in the Linux UTC time.
	  (A sketch of such a conversion follows this entry.)
	- Move fs/ntfs/time.c to fs/ntfs/time.h and make the time functions
	  static inline.
	- Remove unused ntfs_dirty_inode().
	- Cleanup super operations declaration in fs/ntfs/super.c.
	- Wrap flush_dcache_mft_record_page() in #ifdef NTFS_RW.
	- Add NInoTestSetFoo() and NInoTestClearFoo() macro magic to
	  fs/ntfs/inode.h and use it to declare NInoTest{Set,Clear}Dirty.
	- Move typedefs for ntfs_attr and test_t from fs/ntfs/inode.c to
	  fs/ntfs/inode.h so they can be used elsewhere.
	- Determine the mft mirror size as the number of mirrored mft records
	  and store it in ntfs_volume->mftmirr_size (fs/ntfs/super.c).
	- Load the mft mirror at mount time and compare the mft records stored
	  in it to the ones in the mft. Force a read-only mount if the two do
	  not match (fs/ntfs/super.c).
	- Fix type casting related warnings on 64-bit architectures. Thanks
	  to Meelis Roos for reporting them.
	- Move %L to %ll as %L is floating point and %ll is integer which is
	  what we want.
	- Read the journal ($LogFile) and determine if the volume has been
	  shut down cleanly and force a read-only mount if not (fs/ntfs/super.c
	  and fs/ntfs/logfile.c). This is a little bit of a crude check in
	  that we only look at the restart areas and not at the actual log
	  records so that there will be a very small number of cases where we
	  think that a volume is dirty when in fact it is clean. This should
	  only affect volumes that have not been shut down cleanly and did not
	  have any pending, non-check-pointed i/o.
	- If the $LogFile indicates a clean shutdown and a read-write (re)mount
	  is requested, empty $LogFile by overwriting it with 0xff bytes to
	  ensure that Windows cannot cause data corruption by replaying a stale
	  journal after Linux has written to the volume.

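	NTFS timestamps count 100-nanosecond intervals since 1601-01-01, so a
	struct timespec (1ns resolution) can represent an NTFS time exactly,
	while going the other way truncates to the nearest 100ns boundary,
	i.e. loses at most 99ns. A minimal sketch of the NTFS to Linux
	direction (assuming NTFS_TIME_OFFSET is the number of 100ns intervals
	between 1601 and the Unix epoch; the real code is in fs/ntfs/time.h
	and may differ):

		static inline struct timespec ntfs2utc(const sle64 time)
		{
			struct timespec ts;
			/* 100ns intervals since the Unix epoch. */
			u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET);

			/* Split into whole seconds and leftover 100ns units. */
			ts.tv_nsec = do_div(t, 10000000) * 100;
			ts.tv_sec = t;
			return ts;
		}
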
2.1.7 - Enable NFS exporting of mounted NTFS volumes.

	- Set i_generation in the VFS inode from the seq_no of the NTFS inode.
	- Make ntfs_lookup() NFS export safe, i.e. use d_splice_alias(), etc.
	- Implement ->get_dentry() in fs/ntfs/namei.c::ntfs_get_dentry() as the
	  default doesn't allow inode number 0 which is a valid inode on NTFS
	  and even if it did allow that it uses iget() instead of ntfs_iget()
	  which makes it useless for us.
	- Implement ->get_parent() in fs/ntfs/namei.c::ntfs_get_parent() as the
	  default just returns -EACCES which is not very useful.
	- Define export operations (->s_export_op) for NTFS (ntfs_export_ops)
	  and set them up in the super block at mount time (super.c). This
	  allows mounted NTFS volumes to be exported via NFS. (See the sketch
	  after this entry.)
	- Add missing return -EOPNOTSUPP; in
	  fs/ntfs/aops.c::ntfs_commit_nonresident_write().
	- Enforce no atime and no dir atime updates at mount/remount time as
	  they are not implemented yet anyway.
	- Move a few assignments in fs/ntfs/attrib.c::load_attribute_list() to
	  after a NULL check. Thanks to Dave Jones for pointing this out.

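	A sketch of the export operations wiring (the two methods are the ones
	named above; the exact set of struct export_operations members depends
	on the kernel version):

		static struct export_operations ntfs_export_ops = {
			.get_parent	= ntfs_get_parent,
			.get_dentry	= ntfs_get_dentry,
		};

		/* At mount time, e.g. in ntfs_fill_super(): */
		sb->s_export_op = &ntfs_export_ops;
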
2.1.6 - Fix minor bug in handling of compressed directories.

	- Fix bug in handling of compressed directories. A compressed
	  directory is not really compressed so when we set the ->i_blocks
	  field of a compressed directory inode we were setting it from the
	  non-existing field ni->itype.compressed.size which gave random
	  results... For directories we now always use ni->allocated_size.

2.1.5 - Fix minor bug in attribute list attribute handling.

	- Fix bug in attribute list handling. Actually it is not as much a bug
	  as too much protection in that we were not allowing attribute lists
	  which waste space on disk while Windows XP clearly allows it and in
	  fact creates such attribute lists so our driver was failing.
	- Update NTFS documentation ready for 2.6 kernel release.

2.1.4 - Reduce compiler requirements.

	- Remove all uses of unnamed structs and unions in the driver to make
	  old and newer gcc versions happy. Makes it a bit uglier IMO but at
	  least people will stop hassling me about it.

2.1.3 - Important bug fixes in corner cases.

	- super.c::parse_ntfs_boot_sector(): Correct the check for 64-bit
	  clusters. (Philipp Thomas)
	- attrib.c::load_attribute_list(): Fix bug when initialized_size is a
	  multiple of the block_size but not the cluster size. (Szabolcs
	  Szakacsits)

2.1.2 - Important bug fixes alleviating the hangs in statfs.

	- Fix buggy free cluster and free inode determination logic.

2.1.1 - Minor updates.

	- Add handling for initialized_size != data_size in compressed files.
	- Reduce function local stack usage from 0x3d4 bytes to just noise in
	  fs/ntfs/upcase.c. (Randy Dunlap)
	- Remove compiler warnings for newer gcc.
	- Pages are no longer kmapped by mm/filemap.c::generic_file_write()
	  around calls to ->{prepare,commit}_write. Adapt NTFS appropriately
	  in fs/ntfs/aops.c::ntfs_prepare_nonresident_write() by using
	  kmap_atomic(KM_USER0).

2.1.0 - First steps towards write support: implement file overwrite.

	- Add configuration option for developmental write support with an
	  appropriately scary configuration help text.
	- Initial implementation of fs/ntfs/aops.c::ntfs_writepage() and its
	  helper fs/ntfs/aops.c::ntfs_write_block(). This enables mmap(2) based
	  overwriting of existing files on ntfs. Note: Resident files are
	  only written into memory, and not written out to disk at present, so
	  avoid writing to files smaller than about 1kiB.
	- Initial implementation of fs/ntfs/aops.c::ntfs_prepare_write(), its
	  helper fs/ntfs/aops.c::ntfs_prepare_nonresident_write() and their
	  counterparts, fs/ntfs/aops.c::ntfs_commit_write(), and
	  fs/ntfs/aops.c::ntfs_commit_nonresident_write(), respectively. Also,
	  add generic_file_write() to the ntfs file operations (fs/ntfs/file.c).
	  This enables write(2) based overwriting of existing files on ntfs.
	  Note: As with mmap(2) based overwriting, resident files are only
	  written into memory, and not written out to disk at present, so avoid
	  writing to files smaller than about 1kiB.
	- Implement ->truncate (fs/ntfs/inode.c::ntfs_truncate()) and
	  ->setattr() (fs/ntfs/inode.c::ntfs_setattr()) inode operations for
	  files with the purpose of intercepting and aborting all i_size
	  changes which we do not support yet. ntfs_truncate() actually only
	  emits a warning message but AFAICS our interception of i_size changes
	  elsewhere means ntfs_truncate() never gets called for i_size changes.
	  It is only called from generic_file_write() when we fail in
	  ntfs_prepare_{,nonresident_}write() in order to discard any
	  instantiated buffers beyond i_size. Thus i_size is not actually
	  changed so our warning message is enough. Unfortunately it is not
	  possible to easily determine if i_size is being changed or not hence
	  we just emit an appropriately worded error message.

2.0.25 - Small bug fixes and cleanups.

	- Unlock the page in an out of memory error code path in
	  fs/ntfs/aops.c::ntfs_read_block().
	- If fs/ntfs/aops.c::ntfs_read_page() is called on an uptodate page,
	  just unlock the page and return. (This can happen due to ->writepage
	  clearing PageUptodate() during write out of MstProtected()
	  attributes.)
	- Remove leaked write code again.

2.0.24 - Cleanups.

	- Treat BUG_ON() as ASSERT() not VERIFY(), i.e. do not use side effects
	  inside BUG_ON(). (Adam J. Richter)
	- Split logical OR expressions inside BUG_ON() into individual BUG_ON()
	  calls for improved debugging. (Adam J. Richter) (See the example
	  after this entry.)
	- Add errors flag to the ntfs volume state, accessed via
	  NVol{,Set,Clear}Errors(vol).
	- Do not allow read-write remounts of read-only volumes with errors.
	- Clarify comment for ntfs file operation sendfile which was added by
	  Christoph Hellwig a while ago (just using generic_file_sendfile())
	  to say that ntfs ->sendfile is only used for the case where the
	  source data is on the ntfs partition and the destination is
	  somewhere else, i.e. nothing we need to concern ourselves with.
	- Add generic_file_write() as our ntfs file write operation.

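	A hypothetical before/after example of the BUG_ON() changes above
	(illustrative code, not taken from the driver):

		/* Before: side effect and OR-ed conditions inside BUG_ON(). */
		BUG_ON(do_setup(x) < 0 || y < 0);

		/* After: no side effects, and one condition per BUG_ON() so
		 * the reported line number pinpoints which check fired. */
		err = do_setup(x);
		BUG_ON(err < 0);
		BUG_ON(y < 0);
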
2.0.23 - Major bug fixes (races, deadlocks, non-i386 architectures).

	- Massive internal locking changes to mft record locking. Fixes lock
	  recursion and replaces the mrec_lock read/write semaphore with a
	  mutex. Also removes the now superfluous mft_count. This fixes several
	  race conditions and deadlocks, especially in the future write code.
	- Fix ntfs over loopback for compressed files by adding an
	  optimization barrier. (gcc was screwing up otherwise?)
	- Miscellaneous cleanups all over the code and a fix or two in error
	  handling code paths.
	  Thanks go to Christoph Hellwig for pointing out the following two:
	- Remove now unused function fs/ntfs/malloc.h::vmalloc_nofs().
	- Fix ntfs_free() for ia64 and parisc by checking for VMALLOC_END, too.

2.0.22 - Cleanups, mainly to ntfs_readdir(), and use C99 initializers.

	- Change fs/ntfs/dir.c::ntfs_readdir() to only read/write ->f_pos once
	  at entry/exit respectively.
	- Use C99 initializers for structures.
	- Remove unused variable blocks from fs/ntfs/aops.c::ntfs_read_block().

2.0.21 - Check for, and refuse to work with too large files/directories/volumes.

	- Limit volume size at mount time to 2TiB on architectures where
	  unsigned long is 32-bits (fs/ntfs/super.c::parse_ntfs_boot_sector()).
	  This is the most we can do without overflowing the 32-bit limit of
	  the block device size imposed on us by sb_bread() and sb_getblk()
	  for the time being. (2^32 blocks of 512 bytes each is 2TiB.)
	- Limit file/directory size at open() time to 16TiB on architectures
	  where unsigned long is 32-bits (fs/ntfs/file.c::ntfs_file_open() and
	  fs/ntfs/dir.c::ntfs_dir_open()). This is the most we can do without
	  overflowing the page cache page index. (2^32 pages of 4kiB each is
	  16TiB.)

2.0.20 - Support non-resident directory index bitmaps, fix page leak in readdir.

	- Move the directory index bitmap to use an attribute inode instead of
	  having special fields for it inside the ntfs inode structure. This
	  means that the index bitmaps now use the page cache for i/o, too,
	  and also as a side effect we get support for non-resident index
	  bitmaps for free.
	- Simplify/cleanup error handling in fs/ntfs/dir.c::ntfs_readdir() and
	  fix a page leak that manifested itself in some cases.
	- Add fs/ntfs/inode.c::ntfs_put_inode(), which we need to release the
	  index bitmap inode on the final iput().

2.0.19 - Fix race condition, improvements, and optimizations in i/o interface.

	- Apply block optimization added to fs/ntfs/aops.c::ntfs_read_block()
	  to fs/ntfs/compress.c::ntfs_file_read_compressed_block() as well.
	- Drop the "file" from ntfs_file_read_compressed_block().
	- Rename fs/ntfs/aops.c::ntfs_enb_buffer_read_async() to
	  ntfs_end_buffer_async_read() (more like the fs/buffer.c counterpart).
	- Update ntfs_end_buffer_async_read() with the improved logic from
	  its updated counterpart fs/buffer.c::end_buffer_async_read(). Apply
	  further logic improvements to better determine when we set PageError.
	- Update submission of buffers in fs/ntfs/aops.c::ntfs_read_block() to
	  check for the buffers being uptodate first in line with the updated
	  fs/buffer.c::block_read_full_page(). This plugs a small race
	  condition.

2.0.18 - Fix race condition in reading of compressed files.

	- There was a narrow window between checking a buffer head for being
	  uptodate and locking it in ntfs_file_read_compressed_block(). We now
	  lock the buffer and then check whether it is uptodate or not.

2.0.17 - Cleanups and optimizations - shrinking the ToDo list.

	- Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to return an error
	  code and update callers, i.e. ntfs_iget(), to pass that error code
	  up instead of just using -EIO.
	- Modifications to super.c to ensure that both mount and remount
	  cannot set any write related options when the driver is compiled
	  read-only.
	- Optimize block resolution in fs/ntfs/aops.c::ntfs_read_block() to
	  cache the current runlist element. This should improve performance
	  when reading very large and/or very fragmented data.

2.0.16 - Convert access to $MFT/$BITMAP to attribute inode API.

	- Fix a stupid bug introduced in 2.0.15 where we were unmapping the
	  wrong inode in fs/ntfs/inode.c::ntfs_attr_iget().
	- Fix debugging check in fs/ntfs/aops.c::ntfs_read_block().
	- Convert $MFT/$BITMAP access to attribute inode API and remove all
	  remnants of the ugly mftbmp address space and operations hack. This
	  means we finally have only one readpage function as well as only one
	  async io completion handler. Yey! The mft bitmap is now just an
	  attribute inode and is accessed from vol->mftbmp_ino just as if it
	  were a normal file. Fake inodes rule. (-:

2.0.15 - Fake inodes based attribute i/o via the pagecache, fixes and cleanups.

	- Fix silly bug in fs/ntfs/super.c::parse_options() which was causing
	  remounts to fail when the partition had an entry in /etc/fstab and
	  the entry specified the nls= option.
	- Apply same macro magic used in fs/ntfs/inode.h to fs/ntfs/volume.h to
	  expand all the helper functions NVolFoo(), NVolSetFoo(), and
	  NVolClearFoo().
	- Move copyright statement from driver initialisation message to
	  module description (fs/ntfs/super.c). This makes the initialisation
	  message fit on one line and fits in better with rest of kernel.
	- Update fs/ntfs/attrib.c::map_run_list() to work on both real and
	  attribute inodes, and both for files and directories.
	- Implement fake attribute inodes allowing all attribute i/o to go via
	  the page cache and to use all the normal vfs/mm functionality:
	  - Add ntfs_attr_iget() and its helper ntfs_read_locked_attr_inode()
	    to fs/ntfs/inode.c.
	  - Add needed cleanup code to ntfs_clear_big_inode().
	- Merge address space operations for files and directories (aops.c),
	  now just have ntfs_aops:
	  - Rename:
	    end_buffer_read_attr_async() -> ntfs_end_buffer_read_async(),
	    ntfs_attr_read_block() -> ntfs_read_block(),
	    ntfs_file_read_page() -> ntfs_readpage().
	  - Rewrite fs/ntfs/aops.c::ntfs_readpage() to work on both real and
	    attribute inodes, and both for files and directories.
	  - Remove obsolete fs/ntfs/aops.c::ntfs_mst_readpage().

2.0.14 - Run list merging code cleanup, minor locking changes, typo fixes.

	- Change fs/ntfs/super.c::ntfs_statfs() to not rely on BKL by moving
	  the locking out of super.c::get_nr_free_mft_records() and taking and
	  dropping the mftbmp_lock rw_semaphore in ntfs_statfs() itself.
	- Bring attribute runlist merging code (fs/ntfs/attrib.c) in sync with
	  current userspace ntfs library code. This means that if a merge
	  fails the original runlists are always left unmodified instead of
	  being silently corrupted.
	- Misc typo fixes.

2.0.13 - Use iget5_locked() in preparation for fake inodes and small cleanups.

	- Remove nr_mft_bits and the now superfluous union with nr_mft_records
	  from ntfs_volume structure.
	- Remove nr_lcn_bits and the now superfluous union with nr_clusters
	  from ntfs_volume structure.
	- Use iget5_locked() and friends instead of conventional iget(). Wrap
	  the call in fs/ntfs/inode.c::ntfs_iget() and update callers of iget()
	  to use ntfs_iget(). Leave only one iget() call at mount time so we
	  don't need an ntfs_iget_mount().
	- Change fs/ntfs/inode.c::ntfs_new_extent_inode() to take mft_no as an
	  additional argument.

2.0.12 - Initial cleanup of address space operations following 2.0.11 changes.

	- Merge fs/ntfs/aops.c::end_buffer_read_mst_async() and
	  fs/ntfs/aops.c::end_buffer_read_file_async() into one function
	  fs/ntfs/aops.c::end_buffer_read_attr_async() using NInoMstProtected()
	  to determine whether to apply mst fixups or not.
	- Above change allows merging fs/ntfs/aops.c::ntfs_file_read_block()
	  and fs/ntfs/aops.c::ntfs_mst_readpage() into one function
	  fs/ntfs/aops.c::ntfs_attr_read_block(). Also, create a tiny wrapper
	  fs/ntfs/aops.c::ntfs_mst_readpage() to transform the parameters from
	  the VFS readpage function prototype to the ntfs_attr_read_block()
	  function prototype.

2.0.11 - Initial preparations for fake inode based attribute i/o.

	- Move definition of ntfs_inode_state_bits to fs/ntfs/inode.h and
	  do some macro magic (adapted from include/linux/buffer_head.h) to
	  expand all the helper functions NInoFoo(), NInoSetFoo(), and
	  NInoClearFoo(). (See the sketch after this entry.)
	- Add new flag to ntfs_inode_state_bits: NI_Sparse.
	- Add new fields to ntfs_inode structure to allow use of fake inodes
	  for attribute i/o: type, name, name_len. Also add new state bits:
	  NI_Attr, which, if set, indicates the inode is a fake inode, and
	  NI_MstProtected, which, if set, indicates the attribute uses multi
	  sector transfer protection, i.e. fixups need to be applied after
	  reads and before/after writes.
	- Rename fs/ntfs/inode.c::ntfs_{new,clear,destroy}_inode() to
	  ntfs_{new,clear,destroy}_extent_inode() and update callers.
	- Use ntfs_clear_extent_inode() in fs/ntfs/inode.c::__ntfs_clear_inode()
	  instead of ntfs_destroy_extent_inode().
	- Cleanup memory deallocations in {__,}ntfs_clear_{,big_}inode().
	- Make all operations on ntfs inode state bits use the NIno* functions.
	- Set up the new ntfs inode fields and state bits in
	  fs/ntfs/inode.c::ntfs_read_inode() and add appropriate cleanup of
	  allocated memory to __ntfs_clear_inode().
	- Cleanup ntfs_inode structure a bit for better ordering of elements
	  w.r.t. their size to allow better packing of the structure in memory.

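	The macro magic borrowed from include/linux/buffer_head.h might look
	roughly like this (a hypothetical reconstruction; the real macros in
	fs/ntfs/inode.h may differ in detail):

		#define NINO_FNS(flag)					\
		static inline int NIno##flag(ntfs_inode *ni)		\
		{							\
			return test_bit(NI_##flag, &(ni)->state);	\
		}							\
		static inline void NInoSet##flag(ntfs_inode *ni)	\
		{							\
			set_bit(NI_##flag, &(ni)->state);		\
		}							\
		static inline void NInoClear##flag(ntfs_inode *ni)	\
		{							\
			clear_bit(NI_##flag, &(ni)->state);		\
		}

		/* Expands to NInoDirty(), NInoSetDirty(), NInoClearDirty(). */
		NINO_FNS(Dirty)
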
2.0.10 - There can only be 2^32 - 1 inodes on an NTFS volume.

	- Add check at mount time to verify that the number of inodes on the
	  volume does not exceed 2^32 - 1, which is the maximum allowed for
	  NTFS according to Microsoft.
	- Change mft_no member of ntfs_inode structure to be unsigned long.
	  Update all users. This makes ntfs_inode->mft_no just a copy of struct
	  inode->i_ino. But we can't just always use struct inode->i_ino and
	  remove mft_no because extent inodes do not have an attached struct
	  inode.

2.0.9 - Decompression engine now uses a single buffer and other cleanups.

	- Change decompression engine to use a single buffer protected by a
	  spin lock instead of per-CPU buffers. (Rusty Russell)
	- Do not update cb_pos when handling a partial final page during
	  decompression of a sparse compression block, as the value is later
	  reset without being read/used. (Rusty Russell)
	- Switch to using the new KM_BIO_SRC_IRQ for atomic kmap()s. (Andrew
	  Morton)
	- Change buffer size in ntfs_readdir()/ntfs_filldir() to use
	  NLS_MAX_CHARSET_SIZE which makes the buffers almost 1kiB each but
	  it also makes everything safer so it is a good thing.
	- Miscellaneous minor cleanups to comments.

2.0.8 - Major updates for handling of case sensitivity and dcache aliasing.

	Big thanks go to Al Viro and other inhabitants of #kernel for investing
	their time to discuss the case sensitivity and dcache aliasing issues.

	- Remove unused source file fs/ntfs/attraops.c.
	- Remove show_inodes mount option(s), thus dropping support for
	  displaying of short file names.
	- Remove deprecated mount option posix.
	- Restore show_sys_files mount option.
	- Add new mount option case_sensitive, to determine if the driver
	  treats file names as case sensitive or not. If case sensitive, create
	  file names in the POSIX namespace. Otherwise create file names in the
	  LONG/WIN32 namespace. Note, files remain accessible via their short
	  file name, if it exists.
	- Remove really dumb logic bug in boot sector recovery code.
	- Fix dcache aliasing issues wrt short/long file names via changes
	  to fs/ntfs/dir.c::ntfs_lookup_inode_by_name() and
	  fs/ntfs/namei.c::ntfs_lookup():
	  - Add additional argument to ntfs_lookup_inode_by_name() in which we
	    return information about the matching file name if the case is not
	    matching or the match is a short file name. See comments above the
	    function definition for details.
	  - Change ntfs_lookup() to only create dcache entries for the correctly
	    cased file name and only for the WIN32 namespace counterpart of DOS
	    namespace file names. This ensures we have only one dentry per
	    directory and also removes all dcache aliasing issues between short
	    and long file names once we add write support. See comments above
	    function for details.
	- Fix potential 1 byte overflow in fs/ntfs/unistr.c::ntfs_ucstonls().

2.0.7 - Minor cleanups and updates for changes in core kernel code.

	- Remove much of the NULL struct element initializers.
	- Various updates to make compatible with recent kernels.
	- Remove defines of MAX_BUF_PER_PAGE and include linux/buffer_head.h
	  in fs/ntfs/ntfs.h instead.
	- Remove no longer needed KERNEL_VERSION checks. We are now in the
	  kernel proper so they are no longer needed.

2.0.6 - Major bugfix to make compatible with other kernel changes.

	- Initialize the mftbmp address space properly now that there are more
	  fields in the struct address_space. This was leading to hangs and
	  oopses on umount since 2.5.12 because of changes to other parts of
	  the kernel. We probably want a kernel generic init_address_space()
	  function...
	- Drop BKL from ntfs_readdir() after consultation with Al Viro. The
	  only caller of ->readdir() is vfs_readdir() which holds i_mutex
	  during the call, and i_mutex is sufficient protection against changes
	  in the directory inode (including ->i_size).
	- Use generic_file_llseek() for directories (as opposed to
	  default_llseek()) as this downs i_mutex instead of the BKL which is
	  what we now need for exclusion against ->f_pos changes considering we
	  no longer take the BKL in ntfs_readdir().

2.0.5 - Major bugfix. Buffer overflow in extent inode handling.

	- No need to set old blocksize in super.c::ntfs_fill_super() as the
	  VFS does so via invocation of deactivate_super() calling
	  fs->fill_super() calling block_kill_super() which does it.
	- BKL moved from VFS into dir.c::ntfs_readdir(). (Linus Torvalds)
	  -> Do we really need it? I don't think so as we have exclusion on
	  the directory ntfs_inode rw_semaphore mrec_lock. We might have to
	  move the ->f_pos accesses under the mrec_lock though. Check this...
	- Fix really, really, really stupid buffer overflow in extent inode
	  handling in mft.c::map_extent_mft_record().

2.0.4 - Cleanups and updates for kernel 2.5.11.

	- Add documentation on how to use the MD driver to be able to use NTFS
	  stripe and volume sets in Linux and generally cleanup documentation
	  a bit.
	  Remove all uses of kdev_t in favour of struct block_device *:
	- Change compress.c::ntfs_file_read_compressed_block() to use
	  sb_getblk() instead of getblk().
	- Change super.c::ntfs_fill_super() to use bdev_hardsect_size() instead
	  of get_hardsect_size().
	- No need to get old blocksize in super.c::ntfs_fill_super() as
	  fs/super.c::get_sb_bdev() already does this.
	- Set bh->b_bdev instead of bh->b_dev throughout aops.c.

2.0.3 - Small bug fixes, cleanups, and performance improvements.

	- Remove some dead code from mft.c.
	- Optimize readpage and read_block functions throughout aops.c so that
	  only initialized blocks are read. Non-initialized ones have their
	  buffer head mapped, zeroed, and set up to date, without scheduling
	  any i/o. Thanks to Al Viro for advice on how to avoid the device i/o.
	  Thanks go to Andrew Morton for spotting the below:
	- Fix buglet in allocate_compression_buffers() error code path.
	- Call flush_dcache_page() after modifying page cache page contents in
	  ntfs_file_readpage().
	- Check for existence of page buffers throughout aops.c before calling
	  create_empty_buffers(). This happens when an I/O error occurs and the
	  read is retried. (It also happens once writing is implemented, so it
	  needed doing anyway, but I had left it for later...)
	- Don't BUG_ON() uptodate and/or mapped buffers throughout aops.c in
	  readpage and read_block functions. Reasoning same as above (i.e. I/O
	  error retries and future write code paths.)

2.0.2 - Minor updates and cleanups.

	- Cleanup: rename mst.c::__post_read_mst_fixup to post_write_mst_fixup
	  and cleanup the code a bit, removing the unused size parameter.
	- Change default fmask to 0177 and update documentation.
	- Change attrib.c::get_attr_search_ctx() to return the search context
	  directly instead of taking the address of a pointer. A return value
	  of NULL means the allocation failed. Updated all callers
	  appropriately.
	- Update to 2.5.9 kernel (preserving backwards compatibility) by
	  replacing all occurrences of page->buffers with page_buffers(page).
	- Fix minor bugs in runlist merging, also minor cleanup.
	- Updates to bootsector layout and mft mirror contents descriptions.
	- Small bug fix in error detection in unistr.c and some cleanups.
	- Grow name buffer allocations in unistr.c in aligned multiples of 64
	  bytes.

2.0.1 - Minor updates.

	- Make default umask correspond to documentation.
	- Improve documentation.
	- Set default mode to include execute bit. The {u,f,d}mask can be used
	  to take it away if desired. This allows binaries to be executed from
	  a mounted ntfs partition.

2.0.0 - New version number. Remove TNG from the name. Now in the kernel.

	- Add kill_super, just keeping up with the vfs changes in the kernel.
	- Repeat some changes from tng-0.0.8 that somehow got lost on the way
	  from the CVS import into BitKeeper.
	- Begin to implement proper handling of allocated_size vs
	  initialized_size vs data_size (i.e. i_size). Done are
	  mft.c::ntfs_mft_readpage(), aops.c::end_buffer_read_index_async(),
	  and attrib.c::load_attribute_list().
	- Lock the runlist in attrib.c::load_attribute_list() while using it.
	- Fix memory leak in ntfs_file_read_compressed_block() and generally
	  clean up compress.c a little, removing some uncommented/unused debug
	  code.
	- Tidy up dir.c a little bit.
	- Don't bother getting the runlist in inode.c::ntfs_read_inode().
	- Merge mft.c::ntfs_mft_readpage() and aops.c::ntfs_index_readpage()
	  creating aops.c::ntfs_mst_readpage(), improving the handling of
	  holes and overflow in the process and implementing the correct
	  equivalent of ntfs_file_get_block() in ntfs_mst_readpage() itself.
	  I am aiming for correctness at the moment. Modularisation can come
	  later.
	- Rename aops.c::end_buffer_read_index_async() to
	  end_buffer_read_mst_async() and optimize the overflow checking and
	  handling.
	- Use the host of the mftbmp address space mapping to hold the ntfs
	  volume. This is needed so the async i/o completion handler can
	  retrieve a pointer to the volume. Hopefully this will not cause
	  problems elsewhere in the kernel... Otherwise will need to use a
	  fake inode.
	- Complete implementation of proper handling of allocated_size vs
	  initialized_size vs data_size (i.e. i_size) in whole driver.
	  Basically aops.c is now completely rewritten.
	- Change NTFS driver name to just NTFS and set version number to 2.0.0
	  to make a clear distinction from the old driver which is still on
	  version 1.1.22.

tng-0.0.8 - 08/03/2002 - Now using BitKeeper, http://linux-ntfs.bkbits.net/

	- Replace bdevname(sb->s_dev) with sb->s_id.
	- Remove now superfluous new-line characters in all callers of
	  ntfs_debug().
	- Apply kludge in ntfs_read_inode(), setting i_nlink to 1 for
	  directories. Without this the "find" utility gets very upset which is
	  fair enough as Linux/Unix do not support directory hard links.
	- Further runlist merging work. (Richard Russon)
	- Backwards compatibility for gcc-2.95. (Richard Russon)
	- Update to kernel 2.5.5-pre1 and rediff the now tiny patch.
	- Convert to new filesystem declaration using ->ntfs_get_sb() and
	  replacing ntfs_read_super() with ntfs_fill_super().
	- Set s_maxbytes to MAX_LFS_FILESIZE to avoid page cache page index
	  overflow on 32-bit architectures.
	- Cleanup upcase loading code to use ntfs_(un)map_page().
	- Disable/reenable preemption in critical sections of the compression
	  engine.
	- Replace device size determination in ntfs_fill_super() with
	  sb->s_bdev->bd_inode->i_size (in bytes) and remove now superfluous
	  function super.c::get_nr_blocks().
	- Implement a mount time option (show_inodes) allowing choice of which
	  types of inode names readdir() returns and modify ntfs_filldir()
	  accordingly. There are several parameters to show_inodes:
		system: system files
		win32: long file names (including POSIX file names) [DEFAULT]
		long: same as win32
		dos: short file names only (excluding POSIX file names)
		short: same as dos
		posix: same as both win32 and dos
		all: all file names
	  Note that the options are additive, i.e. specifying:
		-o show_inodes=system,show_inodes=win32,show_inodes=dos
	  is the same as specifying:
		-o show_inodes=all
	  Note that the "posix" and "all" options will show all directory
	  names, BUT the link count on each directory inode entry is set to 1,
	  due to Linux not supporting directory hard links. This may well
	  confuse some userspace applications, since the directory names will
	  have the same inode numbers. Thus it is NOT advisable to use the
	  "posix" or "all" options. We provide them only for completeness sake.
	- Add copies of allocated_size, initialized_size, and compressed_size to
	  the ntfs inode structure and set them up in
	  inode.c::ntfs_read_inode(). These reflect the unnamed data attribute
	  for files and the index allocation attribute for directories.
	- Add copies of allocated_size and initialized_size to ntfs inode for
	  $BITMAP attribute of large directories and set them up in
	  inode.c::ntfs_read_inode().
	- Add copies of allocated_size and initialized_size to ntfs volume for
	  $BITMAP attribute of $MFT and set them up in
	  super.c::load_system_files().
	- Parse deprecated ntfs driver options (iocharset, show_sys_files,
	  posix, and utf8) and tell user what the new options to use are. Note
	  we still do support them but they will be removed with kernel 2.7.x.
	- Change all occurrences of integer long long printf formatting to hex
	  as printk() will not support long long integer format if/when the
	  div64 patch goes into the kernel.
	- Make slab caches have stable names and change the names to what they
	  were intended to be. These changes are required/made possible by the
	  new slab cache name handling which removes the length limitation by
	  requiring the caller of kmem_cache_create() to supply a stable name
	  which is then referenced but not copied.
	- Rename run_list structure to run_list_element and create a new
	  run_list structure containing a pointer to a run_list_element
	  structure and a read/write semaphore. Adapt all users of runlists
	  to new scheme and take and release the lock as needed. This fixes a
	  nasty race as the run_list changes even when inodes are locked for
	  reading and even when the inode isn't locked at all, so we really
	  needed the serialization. We use a semaphore rather than a spinlock
	  as memory allocations can sleep and doing everything GFP_ATOMIC
	  would be silly.
	- Cleanup read_inode() removing all code checking for lowest_vcn != 0.
	  This can never happen due to the nature of lookup_attr() and how we
	  support attribute lists. If it did happen it would imply the inode
	  being corrupt.
	- Check for lowest_vcn != 0 in ntfs_read_inode() and mark the inode as
	  bad if found.
	- Update to 2.5.6-pre2 changes in struct address_space.
	- Use parent_ino() when accessing d_parent inode number in dir.c.
	- Import Sourceforge CVS repository into BitKeeper repository:
		http://linux-ntfs.bkbits.net/ntfs-tng-2.5
	- Update fs/Makefile, fs/Config.help, fs/Config.in, and
	  Documentation/filesystems/ntfs.txt for NTFS TNG.
	- Create kernel configuration option controlling whether debugging
	  is enabled or not.
	- Add the required export of end_buffer_io_sync() from the patches
	  directory to the kernel code.
	- Update inode.c::ntfs_show_options() with show_inodes mount option.
	- Update errors mount option.

tng-0.0.7 - 13/02/2002 - The driver is now feature complete for read-only!

	- Cleanup mft.c and its debug/error output in particular. Fix a minor
	  bug in mapping of extent inodes. Update all the comments to fit all
	  the recent code changes.
	- Modify vcn_to_lcn() to cope with entirely unmapped runlists.
	- Cleanups in compress.c, mostly comments and folding help.
	- Implement attrib.c::map_run_list() as a generic helper.
	- Make compress.c::ntfs_file_read_compressed_block() use map_run_list()
	  thus making code shorter and enabling attribute list support.
	- Cleanup incorrect use of [su]64 with %L printf format specifier in
	  all source files. Type casts to [unsigned] long long added to correct
	  the mismatches (important for architectures which have long long not
	  being 64 bits).
	- Merge async io completion handlers for directory indexes and $MFT
	  data into one by setting the index_block_size{_bits} of the ntfs
	  inode for $MFT to the mft_record_size{_bits} of the ntfs_volume.
	- Cleanup aops.c, update comments.
	- Make ntfs_file_get_block() use map_run_list() so all files now
	  support attribute lists.
	- Make ntfs_dir_readpage() almost verbatim copy of
	  block_read_full_page() by using ntfs_file_get_block() with only real
	  difference being the use of our own async io completion handler
	  rather than the default one, thus reducing the amount of code and
	  automatically enabling attribute list support for directory indices.
	- Fix bug in load_attribute_list() - forgot to call brelse in error
	  code path.
	- Change parameters to find_attr() and lookup_attr(). We no longer
	  pass in the upcase table and its length. These can be gotten from
	  ctx->ntfs_ino->vol->upcase{_len}. Update all callers.
	- Cleanups in attrib.c.
	- Implement merging of runlists, attrib.c::merge_run_lists() and its
	  helpers. (Richard Russon)
	- Attribute lists part 2, attribute extents and multi part runlists:
	  enable proper support for LCN_RL_NOT_MAPPED and automatic mapping of
	  further runlist parts via attrib.c::map_run_list().
	- Tiny endianness bug fix in decompress_mapping_pairs().

tng-0.0.6 - Encrypted directories, bug fixes, cleanups, debugging enhancements.

	- Enable encrypted directories. (Their index root is marked encrypted
	  to indicate that new files in that directory should be created
	  encrypted.)
	- Fix bug in NInoBmpNonResident() macro. (Cut and paste error.)
	- Enable $Extend system directory. Most (if not all) extended system
	  files do not have unnamed data attributes so ntfs_read_inode() had to
	  special case them but that is ok, as the special casing recovery
	  happens inside an error code path so there is zero slow down in the
	  normal fast path. The special casing is done by introducing a new
	  function inode.c::ntfs_is_extended_system_file() which checks if any
	  of the hard links in the inode point to $Extend as being their parent
	  directory and if they do we assume this is an extended system file.
	- Create a sysctl/proc interface to allow {dis,en}abling of debug output
	  when compiled with -DDEBUG. Default is debug messages to be disabled.
	  To enable them, one writes a non-zero value to /proc/sys/fs/ntfs-debug
	  (if /proc is enabled) or uses sysctl(2) to effect the same (if sysctl
	  interface is enabled). Inspired by old ntfs driver.
	- Add debug_msgs insmod/kernel boot parameter to set whether debug
	  messages are {dis,en}abled. This is useful to enable debug messages
	  during ntfs initialization and is the only way to activate debugging
	  when the sysctl interface is not enabled.
	- Cleanup debug output in various places.
	- Remove all dollar signs ($) from the source (except comments) to
	  enable compilation on architectures whose gcc compiler does not
	  support dollar signs in the names of variables/constants. Attribute
	  types now start with AT_ instead of $ and $I30 is now just I30.
	- Cleanup ntfs_lookup() and add consistency check of sequence numbers.
	- Load complete runlist for $MFT/$BITMAP during mount and cleanup
	  access functions. This means we now cope with $MFT/$BITMAP being
	  spread across several mft records.
	- Disable modification of mft_zone_multiplier on remount. We can always
	  reenable this later on if we really want to, but we will need to make
	  sure we readjust the mft_zone size / layout accordingly.

tng-0.0.5 - Modernize for 2.5.x and further in line-ing with Al Viro's comments.

	- Use sb_set_blocksize() instead of set_blocksize() and verify the
	  return value.
	- Use sb_bread() instead of bread() throughout.
	- Add index_vcn_size{_bits} to ntfs_inode structure to store the size
	  of a directory index block vcn. Apply resulting simplifications in
	  dir.c everywhere.
	- Fix a small bug somewhere (but forgot what it was).
	- Change ntfs_{debug,error,warning} to enable gcc to do type checking
	  on the printf-format parameter list and fix bugs reported by gcc
	  as a result. (Richard Russon)
	- Move inode allocation strategy to Al's new stuff but maintain the
	  divorce of ntfs_inode from struct inode. To achieve this we have two
	  separate slab caches, one for big ntfs inodes containing a struct
	  inode and pure ntfs inodes and at the same time fix some faulty
	  error code paths in ntfs_read_inode().
	- Show mount options in proc (inode.c::ntfs_show_options()).

tng-0.0.4 - Big changes, getting in line with Al Viro's comments.

	- Modified (un)map_mft_record functions to be common for read and write
	  case. To specify which is which, added extra parameter at front of
	  parameter list. Pass either READ or WRITE to this, each has the
	  obvious meaning.
	- General cleanups to allow for easier folding in vi.
	- attrib.c::decompress_mapping_pairs() now accepts the old runlist
	  argument, and invokes attrib.c::merge_run_lists() to merge the old
	  and the new runlists.
	- Removed attrib.c::find_first_attr().
	- Implemented loading of attribute list and complete runlist for $MFT.
	  This means we now cope with $MFT being spread across several mft
	  records.
	- Adapt to 2.5.2-pre9 and the changed create_empty_buffers() syntax.
	- Adapt major/minor/kdev_t/[bk]devname stuff to new 2.5.x kernels.
	- Make ntfs_volume be allocated via kmalloc() instead of using a slab
	  cache. There are too few ntfs_volume structures at any one time
	  to justify a private slab cache.
	- Fix bogus kmap() use in async io completion. Now use kmap_atomic().
	  Use KM_BIO_IRQ on advice from IRC/kernel...
	- Use ntfs_map_page() in map_mft_record() and create ->readpage method
	  for reading $MFT (ntfs_mft_readpage). In the process create dedicated
	  address space operations (ntfs_mft_aops) for $MFT inode mapping. Also
	  removed the now superfluous exports from the kernel core patch.
	- Fix a bug where kfree() was used instead of ntfs_free().
	- Change map_mft_record() to take ntfs_inode as argument instead of
	  vfs inode. Ditto for unmap_mft_record(). Adapt all callers.
	- Add pointer to ntfs_volume to ntfs_inode.
	- Add mft record number and sequence number to ntfs_inode. Stop using
	  i_ino and i_generation for in-driver purposes.
	- Implement attrib.c::merge_run_lists(). (Richard Russon)
	- Remove use of proper inodes by extent inodes. Move i_ino and
	  i_generation to ntfs_inode to do this. Apply simplifications that
	  result and remove iget_no_wait(), etc.
	- Pass ntfs_inode everywhere in the driver (used to be struct inode).
	- Add reference counting in ntfs_inode for the ntfs inode itself and
	  for the mapped mft record.
	- Extend mft record mapping so we can (un)map extent mft records (new
	  functions (un)map_extent_mft_record), and so mappings are reference
	  counted and don't have to happen twice if already mapped - just ref
	  count increases.
	- Add -o iocharset as alias to -o nls for backwards compatibility.
	- The latest core patch is now tiny. In fact just a single additional
	  export is necessary over the base kernel.

1609tng-0.0.3 - Cleanups, enhancements, bug fixes.
1610
1611 - Work on attrib.c::decompress_mapping_pairs() to detect base extents
1612 and setup the runlist appropriately using knowledge provided by the
1613 sizes in the base attribute record.
1614 - Balance the get_/put_attr_search_ctx() calls so we don't leak memory
1615 any more.
1616 - Introduce ntfs_malloc_nofs() and ntfs_free() to allocate/free a single
1617 page or use vmalloc depending on the amount of memory requested.
1618 - Cleanup error output. The __FUNCTION__ "(): " is now added
1619 automatically. Introduced a new header file debug.h to support this
1620 and also moved ntfs_debug() function into it.
1621 - Make reading of compressed files more intelligent and especially get
1622 rid of the vmalloc_nofs() from readpage(). This now uses per CPU
1623 buffers (allocated at first mount with cluster size <= 4kiB and
1624 deallocated on last umount with cluster size <= 4kiB), and
1625 asynchronous io for the compressed data using a list of buffer heads.
1626 Er, we use synchronous io as async io only works on whole pages
1627 covered by buffers and not on individual buffer heads...
1628 - Bug fix for reading compressed files with sparse compression blocks.
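
      The ntfs_malloc_nofs()/ntfs_free() pair works roughly as below; a
      simplified sketch (the real helpers also sanity check the size), not
      the verbatim code:

          #include <linux/slab.h>
          #include <linux/vmalloc.h>

          static inline void *ntfs_malloc_nofs(unsigned long size)
          {
                  if (size <= PAGE_SIZE)  /* fits in one page: kmalloc() */
                          return kmalloc(PAGE_SIZE, GFP_NOFS);
                  return __vmalloc(size, GFP_NOFS, PAGE_KERNEL);
          }

          static inline void ntfs_free(void *addr)
          {
                  if ((unsigned long)addr >= VMALLOC_START &&
                      (unsigned long)addr < VMALLOC_END)
                          vfree(addr);    /* came from __vmalloc() */
                  else
                          kfree(addr);    /* came from kmalloc() */
          }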
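
      And the automatic "__FUNCTION__ (): " prefixing is just a macro
      wrapper, as in this sketch of the debug.h idea:

          #define ntfs_error(sb, fmt, args...) \
                  __ntfs_error(__FUNCTION__, sb, fmt, ##args)

          /* __ntfs_error() prepends "function(): " to the formatted
           * message before printing it. */
          extern void __ntfs_error(const char *function,
                          const struct super_block *sb, const char *fmt, ...);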
1629
1630tng-0.0.2 - Now handles larger/fragmented/compressed volumes/files/dirs.
1631
1632 - Fixed handling of directories when cluster size exceeds index block
1633 size.
1634 - Hide DOS-only namespace directory entries from readdir() but allow
1635 them in lookup(). This should fix the problem that Linux doesn't
1636 support directory hard links, while still allowing access to entries
1637 via their short file name. This also has the benefit of mimicking
1638 what Windows users are used to, so it is the ideal solution.
1639 - Implemented sync_page everywhere so no more hangs in D state when
1640 waiting for a page.
1641 - Stop using bforget() in favour of brelse().
1642 - Stop locking buffers unnecessarily.
1643 - Implemented compressed files (inode->mapping contains uncompressed
1644 data, raw compressed data is currently bread() into a vmalloc()ed
1645 memory buffer).
1646 - Enable compressed directories. (Their index root is marked compressed
1647 to indicate that new files in that directory should be created
1648 compressed.)
1649 - Use vsnprintf rather than vsprintf in the ntfs_error and ntfs_warning
1650      functions so long messages cannot overrun the buffer. (Thanks to Will Dyson for pointing this out; see the sketch after this list.)
1651 - Moved the ntfs_inode and ntfs_volume (the former ntfs_inode_info and
1652 ntfs_sb_info) out of the common inode and super_block structures and
1653 started using the generic_ip and generic_sbp pointers instead. This
1654 makes ntfs entirely private with respect to the kernel tree.
1655 - Detect the compiler version and abort with an error message if gcc
1656      older than 2.96 is used.
1657 - Fix bug in name comparison function in unistr.c.
1658 - Implement attribute lists part 1, the infrastructure: search contexts
1659      and operations, find_external_attr() and lookup_attr(), and make the
1660      code use the infrastructure.
1661 - Fix stupid buffer overflow bug that became apparent on larger run
1662      lists containing attributes.
1663 - Fix bugs in readdir() that became apparent on larger directories.
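
      The vsnprintf change above is the classic bounded-formatting fix; a
      sketch (the real functions also serialise access to the static buffer
      with a lock and name the device in the message):

          #include <linux/kernel.h>   /* printk(), vsnprintf(), va_list */

          static char err_buf[1024];

          void __ntfs_error(const char *function,
                          const struct super_block *sb, const char *fmt, ...)
          {
                  va_list args;

                  va_start(args, fmt);
                  /* Unlike vsprintf(), this cannot overrun err_buf. */
                  vsnprintf(err_buf, sizeof(err_buf), fmt, args);
                  va_end(args);
                  printk(KERN_ERR "NTFS-fs error: %s(): %s\n",
                                  function, err_buf);
          }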
1664
1665 The driver is now really useful and survives the test
1666 find . -type f -exec md5sum "{}" \;
1667      without any error messages on an over-1GiB partition with >16k
1668 files on it, including compressed files and directories and many files
1669 and directories with attribute lists.
1670
1671tng-0.0.1 - The first useful version.
1672
1673 - Added ntfs_lookup().
1674 - Added default upcase generation and handling.
1675 - Added compile options to be shown on module init.
1676 - Many bug fixes that were "hidden" before.
1677 - Update to latest kernel.
1678 - Added ntfs_readdir().
1679 - Added file operations for mmap(), read(), open() and llseek(). We just
1680      use the generic ones. The whole point of implementing the readpage()
1681      methods and, where possible, get_block() callbacks is that this
1682      allows us to make use of the generic high level methods provided
1683      by the kernel. (See the sketch after this list.)
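
      Concretely, the wiring looks like the sketch below (names illustrative
      for the 2.5.x kernels of the time, not the verbatim driver tables):

          struct file_operations ntfs_file_ops = {
                  .llseek = generic_file_llseek,
                  .read   = generic_file_read,   /* served via readpage() */
                  .mmap   = generic_file_mmap,   /* ditto, via the page cache */
                  .open   = generic_file_open,
          };

          struct address_space_operations ntfs_aops = {
                  .readpage = ntfs_readpage,     /* the one method we supply */
          };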
1684
1685      The driver is now actually useful! Yey. (-: It undoubtedly has bugs
1686      though, and it doesn't implement accessing compressed files yet. Also,
1687      accessing files with attribute list attributes is not implemented yet
1688      either. But for small or simple filesystems it should work and allow
1689      you to list directories, use stat on directory entries and the file
1690      system, and open, read, mmap and llseek around in files. A big
1691      milestone has been reached!
1692
1693tng-0.0.0 - Initial version tag.
1694
1695 Initial driver implementation. The driver can mount and umount simple
1696 NTFS filesystems (i.e. ones without attribute lists in the system
1697 files). If the mount fails there might be problems in the error handling
1698 code paths, so be warned. Otherwise it seems to be loading the system
1699 files nicely and the mft record read mapping/unmapping seems to be
1700      working nicely, too. Proof of concept for keeping inode metadata and
1701      non-resident file unnamed stream data in the page cache is thus
1702      complete.
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index cfce53cb65d7..c3c2c7ac9020 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -23,6 +23,7 @@
23 23
24#include <linux/errno.h> 24#include <linux/errno.h>
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/gfp.h>
26#include <linux/mm.h> 27#include <linux/mm.h>
27#include <linux/pagemap.h> 28#include <linux/pagemap.h>
28#include <linux/swap.h> 29#include <linux/swap.h>
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 50d3b0c258e3..f5094ee224c1 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/slab.h>
25#include <linux/swap.h> 26#include <linux/swap.h>
26#include <linux/writeback.h> 27#include <linux/writeback.h>
27 28
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 9669541d0119..6551c7cbad92 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -25,6 +25,7 @@
25#include <linux/buffer_head.h> 25#include <linux/buffer_head.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
28#include <linux/slab.h>
28 29
29#include "attrib.h" 30#include "attrib.h"
30#include "inode.h" 31#include "inode.h"
@@ -927,7 +928,7 @@ lock_retry_remap:
927 return 0; 928 return 0;
928 929
929 ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ? 930 ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ?
930 "EOVERFLOW" : (!err ? "EIO" : "unkown error")); 931 "EOVERFLOW" : (!err ? "EIO" : "unknown error"));
931 return err < 0 ? err : -EIO; 932 return err < 0 ? err : -EIO;
932 933
933read_err: 934read_err:
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 5a9e34475e37..fe44d3feee4a 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/slab.h>
24 25
25#include "dir.h" 26#include "dir.h"
26#include "aops.h" 27#include "aops.h"
@@ -1545,7 +1546,7 @@ static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry,
1545 write_inode_now(bmp_vi, !datasync); 1546 write_inode_now(bmp_vi, !datasync);
1546 iput(bmp_vi); 1547 iput(bmp_vi);
1547 } 1548 }
1548 ret = ntfs_write_inode(vi, 1); 1549 ret = __ntfs_write_inode(vi, 1);
1549 write_inode_now(vi, !datasync); 1550 write_inode_now(vi, !datasync);
1550 err = sync_blockdev(vi->i_sb->s_bdev); 1551 err = sync_blockdev(vi->i_sb->s_bdev);
1551 if (unlikely(err && !ret)) 1552 if (unlikely(err && !ret))
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 663c0e341f8b..8804f093ba75 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/gfp.h>
23#include <linux/pagemap.h> 24#include <linux/pagemap.h>
24#include <linux/pagevec.h> 25#include <linux/pagevec.h>
25#include <linux/sched.h> 26#include <linux/sched.h>
@@ -399,7 +400,7 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
399 * @cached_page: allocated but as yet unused page 400 * @cached_page: allocated but as yet unused page
400 * @lru_pvec: lru-buffering pagevec of caller 401 * @lru_pvec: lru-buffering pagevec of caller
401 * 402 *
402 * Obtain @nr_pages locked page cache pages from the mapping @maping and 403 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
403 * starting at index @index. 404 * starting at index @index.
404 * 405 *
405 * If a page is newly created, increment its refcount and add it to the 406 * If a page is newly created, increment its refcount and add it to the
@@ -1281,7 +1282,7 @@ rl_not_mapped_enoent:
1281 1282
1282/* 1283/*
1283 * Copy as much as we can into the pages and return the number of bytes which 1284 * Copy as much as we can into the pages and return the number of bytes which
1284 * were sucessfully copied. If a fault is encountered then clear the pages 1285 * were successfully copied. If a fault is encountered then clear the pages
1285 * out to (ofs + bytes) and return the number of bytes which were copied. 1286 * out to (ofs + bytes) and return the number of bytes which were copied.
1286 */ 1287 */
1287static inline size_t ntfs_copy_from_user(struct page **pages, 1288static inline size_t ntfs_copy_from_user(struct page **pages,
@@ -2182,7 +2183,7 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
2182 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); 2183 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
2183 BUG_ON(S_ISDIR(vi->i_mode)); 2184 BUG_ON(S_ISDIR(vi->i_mode));
2184 if (!datasync || !NInoNonResident(NTFS_I(vi))) 2185 if (!datasync || !NInoNonResident(NTFS_I(vi)))
2185 ret = ntfs_write_inode(vi, 1); 2186 ret = __ntfs_write_inode(vi, 1);
2186 write_inode_now(vi, !datasync); 2187 write_inode_now(vi, !datasync);
2187 /* 2188 /*
2188 * NOTE: If we were to use mapping->private_list (see ext2 and 2189 * NOTE: If we were to use mapping->private_list (see ext2 and
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 2194eff49743..096c135691ae 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -19,6 +19,8 @@
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21 21
22#include <linux/slab.h>
23
22#include "aops.h" 24#include "aops.h"
23#include "collate.h" 25#include "collate.h"
24#include "debug.h" 26#include "debug.h"
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 9938034762cc..4b57fb1eac2a 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -530,7 +530,7 @@ err_corrupt_attr:
530 * the ntfs inode. 530 * the ntfs inode.
531 * 531 *
532 * Q: What locks are held when the function is called? 532 * Q: What locks are held when the function is called?
533 * A: i_state has I_LOCK set, hence the inode is locked, also 533 * A: i_state has I_NEW set, hence the inode is locked, also
534 * i_count is set to 1, so it is not going to go away 534 * i_count is set to 1, so it is not going to go away
535 * i_flags is set to 0 and we have no business touching it. Only an ioctl() 535 * i_flags is set to 0 and we have no business touching it. Only an ioctl()
536 * is allowed to write to them. We should of course be honouring them but 536 * is allowed to write to them. We should of course be honouring them but
@@ -1207,7 +1207,7 @@ err_out:
1207 * necessary fields in @vi as well as initializing the ntfs inode. 1207 * necessary fields in @vi as well as initializing the ntfs inode.
1208 * 1208 *
1209 * Q: What locks are held when the function is called? 1209 * Q: What locks are held when the function is called?
1210 * A: i_state has I_LOCK set, hence the inode is locked, also 1210 * A: i_state has I_NEW set, hence the inode is locked, also
1211 * i_count is set to 1, so it is not going to go away 1211 * i_count is set to 1, so it is not going to go away
1212 * 1212 *
1213 * Return 0 on success and -errno on error. In the error case, the inode will 1213 * Return 0 on success and -errno on error. In the error case, the inode will
@@ -1474,7 +1474,7 @@ err_out:
1474 * normal directory inodes. 1474 * normal directory inodes.
1475 * 1475 *
1476 * Q: What locks are held when the function is called? 1476 * Q: What locks are held when the function is called?
1477 * A: i_state has I_LOCK set, hence the inode is locked, also 1477 * A: i_state has I_NEW set, hence the inode is locked, also
1478 * i_count is set to 1, so it is not going to go away 1478 * i_count is set to 1, so it is not going to go away
1479 * 1479 *
1480 * Return 0 on success and -errno on error. In the error case, the inode will 1480 * Return 0 on success and -errno on error. In the error case, the inode will
@@ -2957,7 +2957,7 @@ out:
2957 * 2957 *
2958 * Return 0 on success and -errno on error. 2958 * Return 0 on success and -errno on error.
2959 */ 2959 */
2960int ntfs_write_inode(struct inode *vi, int sync) 2960int __ntfs_write_inode(struct inode *vi, int sync)
2961{ 2961{
2962 sle64 nt; 2962 sle64 nt;
2963 ntfs_inode *ni = NTFS_I(vi); 2963 ntfs_inode *ni = NTFS_I(vi);
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 117eaf8032a3..9a113544605d 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -307,12 +307,12 @@ extern void ntfs_truncate_vfs(struct inode *vi);
307 307
308extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr); 308extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr);
309 309
310extern int ntfs_write_inode(struct inode *vi, int sync); 310extern int __ntfs_write_inode(struct inode *vi, int sync);
311 311
312static inline void ntfs_commit_inode(struct inode *vi) 312static inline void ntfs_commit_inode(struct inode *vi)
313{ 313{
314 if (!is_bad_inode(vi)) 314 if (!is_bad_inode(vi))
315 ntfs_write_inode(vi, 1); 315 __ntfs_write_inode(vi, 1);
316 return; 316 return;
317} 317}
318 318
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 89b02985c054..4dadcdf3d451 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -338,7 +338,7 @@ err_out:
338 * copy of the complete multi sector transfer deprotected page. On failure, 338 * copy of the complete multi sector transfer deprotected page. On failure,
339 * *@wrp is undefined. 339 * *@wrp is undefined.
340 * 340 *
341 * Simillarly, if @lsn is not NULL, on succes *@lsn will be set to the current 341 * Simillarly, if @lsn is not NULL, on success *@lsn will be set to the current
342 * logfile lsn according to this restart page. On failure, *@lsn is undefined. 342 * logfile lsn according to this restart page. On failure, *@lsn is undefined.
343 * 343 *
344 * The following error codes are defined: 344 * The following error codes are defined:
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 1caa0ef0b2bb..b572b6727181 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/slab.h>
24#include <linux/swap.h> 25#include <linux/swap.h>
25 26
26#include "attrib.h" 27#include "attrib.h"
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 2ca00153b6ec..358273e59ade 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -23,6 +23,7 @@
23#include <linux/dcache.h> 23#include <linux/dcache.h>
24#include <linux/exportfs.h> 24#include <linux/exportfs.h>
25#include <linux/security.h> 25#include <linux/security.h>
26#include <linux/slab.h>
26 27
27#include "attrib.h" 28#include "attrib.h"
28#include "debug.h" 29#include "debug.h"
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 80b04770e8e9..0de1db6cddbf 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -31,6 +31,7 @@
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/moduleparam.h> 32#include <linux/moduleparam.h>
33#include <linux/smp_lock.h> 33#include <linux/smp_lock.h>
34#include <linux/bitmap.h>
34 35
35#include "sysctl.h" 36#include "sysctl.h"
36#include "logfile.h" 37#include "logfile.h"
@@ -39,6 +40,7 @@
39#include "dir.h" 40#include "dir.h"
40#include "debug.h" 41#include "debug.h"
41#include "index.h" 42#include "index.h"
43#include "inode.h"
42#include "aops.h" 44#include "aops.h"
43#include "layout.h" 45#include "layout.h"
44#include "malloc.h" 46#include "malloc.h"
@@ -2457,7 +2459,6 @@ static void ntfs_put_super(struct super_block *sb)
2457static s64 get_nr_free_clusters(ntfs_volume *vol) 2459static s64 get_nr_free_clusters(ntfs_volume *vol)
2458{ 2460{
2459 s64 nr_free = vol->nr_clusters; 2461 s64 nr_free = vol->nr_clusters;
2460 u32 *kaddr;
2461 struct address_space *mapping = vol->lcnbmp_ino->i_mapping; 2462 struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
2462 struct page *page; 2463 struct page *page;
2463 pgoff_t index, max_index; 2464 pgoff_t index, max_index;
@@ -2476,7 +2477,8 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2476 ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.", 2477 ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
2477 max_index, PAGE_CACHE_SIZE / 4); 2478 max_index, PAGE_CACHE_SIZE / 4);
2478 for (index = 0; index < max_index; index++) { 2479 for (index = 0; index < max_index; index++) {
2479 unsigned int i; 2480 unsigned long *kaddr;
2481
2480 /* 2482 /*
2481 * Read the page from page cache, getting it from backing store 2483 * Read the page from page cache, getting it from backing store
2482 * if necessary, and increment the use count. 2484 * if necessary, and increment the use count.
@@ -2489,16 +2491,16 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2489 nr_free -= PAGE_CACHE_SIZE * 8; 2491 nr_free -= PAGE_CACHE_SIZE * 8;
2490 continue; 2492 continue;
2491 } 2493 }
2492 kaddr = (u32*)kmap_atomic(page, KM_USER0); 2494 kaddr = kmap_atomic(page, KM_USER0);
2493 /* 2495 /*
2494 * For each 4 bytes, subtract the number of set bits. If this 2496 * Subtract the number of set bits. If this
2495 * is the last page and it is partial we don't really care as 2497 * is the last page and it is partial we don't really care as
2496 * it just means we do a little extra work but it won't affect 2498 * it just means we do a little extra work but it won't affect
2497 * the result as all out of range bytes are set to zero by 2499 * the result as all out of range bytes are set to zero by
2498 * ntfs_readpage(). 2500 * ntfs_readpage().
2499 */ 2501 */
2500 for (i = 0; i < PAGE_CACHE_SIZE / 4; i++) 2502 nr_free -= bitmap_weight(kaddr,
2501 nr_free -= (s64)hweight32(kaddr[i]); 2503 PAGE_CACHE_SIZE * BITS_PER_BYTE);
2502 kunmap_atomic(kaddr, KM_USER0); 2504 kunmap_atomic(kaddr, KM_USER0);
2503 page_cache_release(page); 2505 page_cache_release(page);
2504 } 2506 }
@@ -2537,7 +2539,6 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2537static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, 2539static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2538 s64 nr_free, const pgoff_t max_index) 2540 s64 nr_free, const pgoff_t max_index)
2539{ 2541{
2540 u32 *kaddr;
2541 struct address_space *mapping = vol->mftbmp_ino->i_mapping; 2542 struct address_space *mapping = vol->mftbmp_ino->i_mapping;
2542 struct page *page; 2543 struct page *page;
2543 pgoff_t index; 2544 pgoff_t index;
@@ -2547,7 +2548,8 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2547 ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = " 2548 ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
2548 "0x%lx.", max_index, PAGE_CACHE_SIZE / 4); 2549 "0x%lx.", max_index, PAGE_CACHE_SIZE / 4);
2549 for (index = 0; index < max_index; index++) { 2550 for (index = 0; index < max_index; index++) {
2550 unsigned int i; 2551 unsigned long *kaddr;
2552
2551 /* 2553 /*
2552 * Read the page from page cache, getting it from backing store 2554 * Read the page from page cache, getting it from backing store
2553 * if necessary, and increment the use count. 2555 * if necessary, and increment the use count.
@@ -2560,16 +2562,16 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2560 nr_free -= PAGE_CACHE_SIZE * 8; 2562 nr_free -= PAGE_CACHE_SIZE * 8;
2561 continue; 2563 continue;
2562 } 2564 }
2563 kaddr = (u32*)kmap_atomic(page, KM_USER0); 2565 kaddr = kmap_atomic(page, KM_USER0);
2564 /* 2566 /*
2565 * For each 4 bytes, subtract the number of set bits. If this 2567 * Subtract the number of set bits. If this
2566 * is the last page and it is partial we don't really care as 2568 * is the last page and it is partial we don't really care as
2567 * it just means we do a little extra work but it won't affect 2569 * it just means we do a little extra work but it won't affect
2568 * the result as all out of range bytes are set to zero by 2570 * the result as all out of range bytes are set to zero by
2569 * ntfs_readpage(). 2571 * ntfs_readpage().
2570 */ 2572 */
2571 for (i = 0; i < PAGE_CACHE_SIZE / 4; i++) 2573 nr_free -= bitmap_weight(kaddr,
2572 nr_free -= (s64)hweight32(kaddr[i]); 2574 PAGE_CACHE_SIZE * BITS_PER_BYTE);
2573 kunmap_atomic(kaddr, KM_USER0); 2575 kunmap_atomic(kaddr, KM_USER0);
2574 page_cache_release(page); 2576 page_cache_release(page);
2575 } 2577 }
@@ -2662,6 +2664,13 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
2662 return 0; 2664 return 0;
2663} 2665}
2664 2666
2667#ifdef NTFS_RW
2668static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc)
2669{
2670 return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL);
2671}
2672#endif
2673
2665/** 2674/**
2666 * The complete super operations. 2675 * The complete super operations.
2667 */ 2676 */
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
index 9ef85e628fe1..79a89184cb5e 100644
--- a/fs/ntfs/sysctl.c
+++ b/fs/ntfs/sysctl.c
@@ -36,12 +36,11 @@
36/* Definition of the ntfs sysctl. */ 36/* Definition of the ntfs sysctl. */
37static ctl_table ntfs_sysctls[] = { 37static ctl_table ntfs_sysctls[] = {
38 { 38 {
39 .ctl_name = CTL_UNNUMBERED, /* Binary and text IDs. */
40 .procname = "ntfs-debug", 39 .procname = "ntfs-debug",
41 .data = &debug_msgs, /* Data pointer and size. */ 40 .data = &debug_msgs, /* Data pointer and size. */
42 .maxlen = sizeof(debug_msgs), 41 .maxlen = sizeof(debug_msgs),
43 .mode = 0644, /* Mode, proc handler. */ 42 .mode = 0644, /* Mode, proc handler. */
44 .proc_handler = &proc_dointvec 43 .proc_handler = proc_dointvec
45 }, 44 },
46 {} 45 {}
47}; 46};
@@ -49,7 +48,6 @@ static ctl_table ntfs_sysctls[] = {
49/* Define the parent directory /proc/sys/fs. */ 48/* Define the parent directory /proc/sys/fs. */
50static ctl_table sysctls_root[] = { 49static ctl_table sysctls_root[] = {
51 { 50 {
52 .ctl_name = CTL_FS,
53 .procname = "fs", 51 .procname = "fs",
54 .mode = 0555, 52 .mode = 0555,
55 .child = ntfs_sysctls 53 .child = ntfs_sysctls
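
A note on the get_nr_free_clusters()/__get_nr_free_mft_records() hunks in
fs/ntfs/super.c above: kaddr changes from u32 * to unsigned long * because
bitmap_weight() walks the buffer in unsigned long words. An illustration of
the equivalence (not verbatim driver code):

	#include <linux/bitmap.h>
	#include <linux/pagemap.h>	/* PAGE_CACHE_SIZE */

	/* Illustrative only: subtract the set bits of one mapped bitmap
	 * page from a running count of free units. */
	static s64 subtract_used(const unsigned long *kaddr, s64 nr_free)
	{
		/* The old loop did this one 32-bit word at a time with
		 * hweight32(); bitmap_weight() takes an unsigned long
		 * buffer and a count of bits, covering the page in one
		 * call. */
		return nr_free - bitmap_weight(kaddr,
				PAGE_CACHE_SIZE * BITS_PER_BYTE);
	}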
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 701b7a3a872e..0d840669698e 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -6,6 +6,7 @@ config OCFS2_FS
6 select CRC32 6 select CRC32
7 select QUOTA 7 select QUOTA
8 select QUOTA_TREE 8 select QUOTA_TREE
9 select FS_POSIX_ACL
9 help 10 help
10 OCFS2 is a general purpose extent based shared disk cluster file 11 OCFS2 is a general purpose extent based shared disk cluster file
11 system with many similarities to ext3. It supports 64 bit inode 12 system with many similarities to ext3. It supports 64 bit inode
@@ -74,12 +75,3 @@ config OCFS2_DEBUG_FS
74 This option will enable expensive consistency checks. Enable 75 This option will enable expensive consistency checks. Enable
75 this option for debugging only as it is likely to decrease 76 this option for debugging only as it is likely to decrease
76 performance of the filesystem. 77 performance of the filesystem.
77
78config OCFS2_FS_POSIX_ACL
79 bool "OCFS2 POSIX Access Control Lists"
80 depends on OCFS2_FS
81 select FS_POSIX_ACL
82 default n
83 help
84 Posix Access Control Lists (ACLs) support permissions for users and
85 groups beyond the owner/group/world scheme.
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 31f25ce32c97..791c0886c060 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -39,16 +39,14 @@ ocfs2-objs := \
39 ver.o \ 39 ver.o \
40 quota_local.o \ 40 quota_local.o \
41 quota_global.o \ 41 quota_global.o \
42 xattr.o 42 xattr.o \
43 43 acl.o
44ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y)
45ocfs2-objs += acl.o
46endif
47 44
48ocfs2_stackglue-objs := stackglue.o 45ocfs2_stackglue-objs := stackglue.o
49ocfs2_stack_o2cb-objs := stack_o2cb.o 46ocfs2_stack_o2cb-objs := stack_o2cb.o
50ocfs2_stack_user-objs := stack_user.o 47ocfs2_stack_user-objs := stack_user.o
51 48
49obj-$(CONFIG_OCFS2_FS) += dlmfs/
52# cluster/ is always needed when OCFS2_FS for masklog support 50# cluster/ is always needed when OCFS2_FS for masklog support
53obj-$(CONFIG_OCFS2_FS) += cluster/ 51obj-$(CONFIG_OCFS2_FS) += cluster/
54obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/ 52obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index fbeaec762103..e13fc9e8fcdc 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -21,6 +21,7 @@
21 21
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/slab.h>
24#include <linux/string.h> 25#include <linux/string.h>
25 26
26#define MLOG_MASK_PREFIX ML_INODE 27#define MLOG_MASK_PREFIX ML_INODE
@@ -30,6 +31,8 @@
30#include "alloc.h" 31#include "alloc.h"
31#include "dlmglue.h" 32#include "dlmglue.h"
32#include "file.h" 33#include "file.h"
34#include "inode.h"
35#include "journal.h"
33#include "ocfs2_fs.h" 36#include "ocfs2_fs.h"
34 37
35#include "xattr.h" 38#include "xattr.h"
@@ -98,15 +101,11 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
98 int type, 101 int type,
99 struct buffer_head *di_bh) 102 struct buffer_head *di_bh)
100{ 103{
101 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
102 int name_index; 104 int name_index;
103 char *value = NULL; 105 char *value = NULL;
104 struct posix_acl *acl; 106 struct posix_acl *acl;
105 int retval; 107 int retval;
106 108
107 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
108 return NULL;
109
110 switch (type) { 109 switch (type) {
111 case ACL_TYPE_ACCESS: 110 case ACL_TYPE_ACCESS:
112 name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; 111 name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -170,6 +169,60 @@ static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
170} 169}
171 170
172/* 171/*
172 * Helper function to set i_mode in memory and disk. Some call paths
173 * will not have di_bh or a journal handle to pass, in which case it
174 * will create it's own.
175 */
176static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
177 handle_t *handle, umode_t new_mode)
178{
179 int ret, commit_handle = 0;
180 struct ocfs2_dinode *di;
181
182 if (di_bh == NULL) {
183 ret = ocfs2_read_inode_block(inode, &di_bh);
184 if (ret) {
185 mlog_errno(ret);
186 goto out;
187 }
188 } else
189 get_bh(di_bh);
190
191 if (handle == NULL) {
192 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
193 OCFS2_INODE_UPDATE_CREDITS);
194 if (IS_ERR(handle)) {
195 ret = PTR_ERR(handle);
196 mlog_errno(ret);
197 goto out_brelse;
198 }
199
200 commit_handle = 1;
201 }
202
203 di = (struct ocfs2_dinode *)di_bh->b_data;
204 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
205 OCFS2_JOURNAL_ACCESS_WRITE);
206 if (ret) {
207 mlog_errno(ret);
208 goto out_commit;
209 }
210
211 inode->i_mode = new_mode;
212 di->i_mode = cpu_to_le16(inode->i_mode);
213
214 ocfs2_journal_dirty(handle, di_bh);
215
216out_commit:
217 if (commit_handle)
218 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
219out_brelse:
220 brelse(di_bh);
221out:
222 return ret;
223}
224
225/*
173 * Set the access or default ACL of an inode. 226 * Set the access or default ACL of an inode.
174 */ 227 */
175static int ocfs2_set_acl(handle_t *handle, 228static int ocfs2_set_acl(handle_t *handle,
@@ -197,9 +250,14 @@ static int ocfs2_set_acl(handle_t *handle,
197 if (ret < 0) 250 if (ret < 0)
198 return ret; 251 return ret;
199 else { 252 else {
200 inode->i_mode = mode;
201 if (ret == 0) 253 if (ret == 0)
202 acl = NULL; 254 acl = NULL;
255
256 ret = ocfs2_acl_set_mode(inode, di_bh,
257 handle, mode);
258 if (ret)
259 return ret;
260
203 } 261 }
204 } 262 }
205 break; 263 break;
@@ -287,6 +345,7 @@ int ocfs2_init_acl(handle_t *handle,
287 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 345 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
288 struct posix_acl *acl = NULL; 346 struct posix_acl *acl = NULL;
289 int ret = 0; 347 int ret = 0;
348 mode_t mode;
290 349
291 if (!S_ISLNK(inode->i_mode)) { 350 if (!S_ISLNK(inode->i_mode)) {
292 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { 351 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
@@ -295,12 +354,17 @@ int ocfs2_init_acl(handle_t *handle,
295 if (IS_ERR(acl)) 354 if (IS_ERR(acl))
296 return PTR_ERR(acl); 355 return PTR_ERR(acl);
297 } 356 }
298 if (!acl) 357 if (!acl) {
299 inode->i_mode &= ~current_umask(); 358 mode = inode->i_mode & ~current_umask();
359 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
360 if (ret) {
361 mlog_errno(ret);
362 goto cleanup;
363 }
364 }
300 } 365 }
301 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { 366 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
302 struct posix_acl *clone; 367 struct posix_acl *clone;
303 mode_t mode;
304 368
305 if (S_ISDIR(inode->i_mode)) { 369 if (S_ISDIR(inode->i_mode)) {
306 ret = ocfs2_set_acl(handle, inode, di_bh, 370 ret = ocfs2_set_acl(handle, inode, di_bh,
@@ -317,7 +381,7 @@ int ocfs2_init_acl(handle_t *handle,
317 mode = inode->i_mode; 381 mode = inode->i_mode;
318 ret = posix_acl_create_masq(clone, &mode); 382 ret = posix_acl_create_masq(clone, &mode);
319 if (ret >= 0) { 383 if (ret >= 0) {
320 inode->i_mode = mode; 384 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
321 if (ret > 0) { 385 if (ret > 0) {
322 ret = ocfs2_set_acl(handle, inode, 386 ret = ocfs2_set_acl(handle, inode,
323 di_bh, ACL_TYPE_ACCESS, 387 di_bh, ACL_TYPE_ACCESS,
@@ -331,13 +395,14 @@ cleanup:
331 return ret; 395 return ret;
332} 396}
333 397
334static size_t ocfs2_xattr_list_acl_access(struct inode *inode, 398static size_t ocfs2_xattr_list_acl_access(struct dentry *dentry,
335 char *list, 399 char *list,
336 size_t list_len, 400 size_t list_len,
337 const char *name, 401 const char *name,
338 size_t name_len) 402 size_t name_len,
403 int type)
339{ 404{
340 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 405 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
341 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); 406 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
342 407
343 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 408 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
@@ -348,13 +413,14 @@ static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
348 return size; 413 return size;
349} 414}
350 415
351static size_t ocfs2_xattr_list_acl_default(struct inode *inode, 416static size_t ocfs2_xattr_list_acl_default(struct dentry *dentry,
352 char *list, 417 char *list,
353 size_t list_len, 418 size_t list_len,
354 const char *name, 419 const char *name,
355 size_t name_len) 420 size_t name_len,
421 int type)
356{ 422{
357 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 423 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
358 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); 424 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
359 425
360 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 426 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
@@ -365,19 +431,19 @@ static size_t ocfs2_xattr_list_acl_default(struct inode *inode,
365 return size; 431 return size;
366} 432}
367 433
368static int ocfs2_xattr_get_acl(struct inode *inode, 434static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name,
369 int type, 435 void *buffer, size_t size, int type)
370 void *buffer,
371 size_t size)
372{ 436{
373 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 437 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
374 struct posix_acl *acl; 438 struct posix_acl *acl;
375 int ret; 439 int ret;
376 440
441 if (strcmp(name, "") != 0)
442 return -EINVAL;
377 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 443 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
378 return -EOPNOTSUPP; 444 return -EOPNOTSUPP;
379 445
380 acl = ocfs2_get_acl(inode, type); 446 acl = ocfs2_get_acl(dentry->d_inode, type);
381 if (IS_ERR(acl)) 447 if (IS_ERR(acl))
382 return PTR_ERR(acl); 448 return PTR_ERR(acl);
383 if (acl == NULL) 449 if (acl == NULL)
@@ -388,35 +454,16 @@ static int ocfs2_xattr_get_acl(struct inode *inode,
388 return ret; 454 return ret;
389} 455}
390 456
391static int ocfs2_xattr_get_acl_access(struct inode *inode, 457static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name,
392 const char *name, 458 const void *value, size_t size, int flags, int type)
393 void *buffer,
394 size_t size)
395{
396 if (strcmp(name, "") != 0)
397 return -EINVAL;
398 return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
399}
400
401static int ocfs2_xattr_get_acl_default(struct inode *inode,
402 const char *name,
403 void *buffer,
404 size_t size)
405{
406 if (strcmp(name, "") != 0)
407 return -EINVAL;
408 return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
409}
410
411static int ocfs2_xattr_set_acl(struct inode *inode,
412 int type,
413 const void *value,
414 size_t size)
415{ 459{
460 struct inode *inode = dentry->d_inode;
416 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 461 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
417 struct posix_acl *acl; 462 struct posix_acl *acl;
418 int ret = 0; 463 int ret = 0;
419 464
465 if (strcmp(name, "") != 0)
466 return -EINVAL;
420 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 467 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
421 return -EOPNOTSUPP; 468 return -EOPNOTSUPP;
422 469
@@ -442,38 +489,18 @@ cleanup:
442 return ret; 489 return ret;
443} 490}
444 491
445static int ocfs2_xattr_set_acl_access(struct inode *inode,
446 const char *name,
447 const void *value,
448 size_t size,
449 int flags)
450{
451 if (strcmp(name, "") != 0)
452 return -EINVAL;
453 return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
454}
455
456static int ocfs2_xattr_set_acl_default(struct inode *inode,
457 const char *name,
458 const void *value,
459 size_t size,
460 int flags)
461{
462 if (strcmp(name, "") != 0)
463 return -EINVAL;
464 return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
465}
466
467struct xattr_handler ocfs2_xattr_acl_access_handler = { 492struct xattr_handler ocfs2_xattr_acl_access_handler = {
468 .prefix = POSIX_ACL_XATTR_ACCESS, 493 .prefix = POSIX_ACL_XATTR_ACCESS,
494 .flags = ACL_TYPE_ACCESS,
469 .list = ocfs2_xattr_list_acl_access, 495 .list = ocfs2_xattr_list_acl_access,
470 .get = ocfs2_xattr_get_acl_access, 496 .get = ocfs2_xattr_get_acl,
471 .set = ocfs2_xattr_set_acl_access, 497 .set = ocfs2_xattr_set_acl,
472}; 498};
473 499
474struct xattr_handler ocfs2_xattr_acl_default_handler = { 500struct xattr_handler ocfs2_xattr_acl_default_handler = {
475 .prefix = POSIX_ACL_XATTR_DEFAULT, 501 .prefix = POSIX_ACL_XATTR_DEFAULT,
502 .flags = ACL_TYPE_DEFAULT,
476 .list = ocfs2_xattr_list_acl_default, 503 .list = ocfs2_xattr_list_acl_default,
477 .get = ocfs2_xattr_get_acl_default, 504 .get = ocfs2_xattr_get_acl,
478 .set = ocfs2_xattr_set_acl_default, 505 .set = ocfs2_xattr_set_acl,
479}; 506};
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 8f6389ed4da5..5c5d31f05853 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -26,8 +26,6 @@ struct ocfs2_acl_entry {
26 __le32 e_id; 26 __le32 e_id;
27}; 27};
28 28
29#ifdef CONFIG_OCFS2_FS_POSIX_ACL
30
31extern int ocfs2_check_acl(struct inode *, int); 29extern int ocfs2_check_acl(struct inode *, int);
32extern int ocfs2_acl_chmod(struct inode *); 30extern int ocfs2_acl_chmod(struct inode *);
33extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, 31extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
@@ -35,24 +33,4 @@ extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
35 struct ocfs2_alloc_context *, 33 struct ocfs2_alloc_context *,
36 struct ocfs2_alloc_context *); 34 struct ocfs2_alloc_context *);
37 35
38#else /* CONFIG_OCFS2_FS_POSIX_ACL*/
39
40#define ocfs2_check_acl NULL
41static inline int ocfs2_acl_chmod(struct inode *inode)
42{
43 return 0;
44}
45static inline int ocfs2_init_acl(handle_t *handle,
46 struct inode *inode,
47 struct inode *dir,
48 struct buffer_head *di_bh,
49 struct buffer_head *dir_bh,
50 struct ocfs2_alloc_context *meta_ac,
51 struct ocfs2_alloc_context *data_ac)
52{
53 return 0;
54}
55
56#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/
57
58#endif /* OCFS2_ACL_H */ 36#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 38a42f5d59ff..9f8bd913c51e 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1050,7 +1050,8 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1050 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 1050 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
1051 eb->h_blkno = cpu_to_le64(first_blkno); 1051 eb->h_blkno = cpu_to_le64(first_blkno);
1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1053 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); 1053 eb->h_suballoc_slot =
1054 cpu_to_le16(meta_ac->ac_alloc_slot);
1054 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1055 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1055 eb->h_list.l_count = 1056 eb->h_list.l_count =
1056 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 1057 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -1765,9 +1766,9 @@ set_and_inc:
1765 * 1766 *
1766 * The array index of the subtree root is passed back. 1767 * The array index of the subtree root is passed back.
1767 */ 1768 */
1768static int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, 1769int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
1769 struct ocfs2_path *left, 1770 struct ocfs2_path *left,
1770 struct ocfs2_path *right) 1771 struct ocfs2_path *right)
1771{ 1772{
1772 int i = 0; 1773 int i = 0;
1773 1774
@@ -2398,7 +2399,7 @@ static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2398 * 2399 *
2399 * The array is assumed to be large enough to hold an entire path (tree depth). 2400 * The array is assumed to be large enough to hold an entire path (tree depth).
2400 * 2401 *
2401 * Upon succesful return from this function: 2402 * Upon successful return from this function:
2402 * 2403 *
2403 * - The 'right_path' array will contain a path to the leaf block 2404 * - The 'right_path' array will contain a path to the leaf block
2404 * whose range contains e_cpos. 2405 * whose range contains e_cpos.
@@ -2872,8 +2873,8 @@ out:
2872 * This looks similar, but is subtly different to 2873 * This looks similar, but is subtly different to
2873 * ocfs2_find_cpos_for_left_leaf(). 2874 * ocfs2_find_cpos_for_left_leaf().
2874 */ 2875 */
2875static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, 2876int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2876 struct ocfs2_path *path, u32 *cpos) 2877 struct ocfs2_path *path, u32 *cpos)
2877{ 2878{
2878 int i, j, ret = 0; 2879 int i, j, ret = 0;
2879 u64 blkno; 2880 u64 blkno;
@@ -5712,7 +5713,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5712 goto out; 5713 goto out;
5713 } 5714 }
5714 5715
5715 vfs_dq_free_space_nodirty(inode, 5716 dquot_free_space_nodirty(inode,
5716 ocfs2_clusters_to_bytes(inode->i_sb, len)); 5717 ocfs2_clusters_to_bytes(inode->i_sb, len));
5717 5718
5718 ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc); 5719 ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
@@ -6037,7 +6038,7 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
6037 if (status < 0) 6038 if (status < 0)
6038 mlog_errno(status); 6039 mlog_errno(status);
6039 else 6040 else
6040 ocfs2_init_inode_steal_slot(osb); 6041 ocfs2_init_steal_slots(osb);
6041 6042
6042 mlog_exit(status); 6043 mlog_exit(status);
6043} 6044}
@@ -6935,7 +6936,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6935 goto bail; 6936 goto bail;
6936 } 6937 }
6937 6938
6938 vfs_dq_free_space_nodirty(inode, 6939 dquot_free_space_nodirty(inode,
6939 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del)); 6940 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6940 spin_lock(&OCFS2_I(inode)->ip_lock); 6941 spin_lock(&OCFS2_I(inode)->ip_lock);
6941 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - 6942 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
@@ -7190,8 +7191,8 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
7190 * wait on them - the truncate_inode_pages() call later will 7191 * wait on them - the truncate_inode_pages() call later will
7191 * do that for us. 7192 * do that for us.
7192 */ 7193 */
7193 ret = do_sync_mapping_range(inode->i_mapping, range_start, 7194 ret = filemap_fdatawrite_range(inode->i_mapping, range_start,
7194 range_end - 1, SYNC_FILE_RANGE_WRITE); 7195 range_end - 1);
7195 if (ret) 7196 if (ret)
7196 mlog_errno(ret); 7197 mlog_errno(ret);
7197 7198
@@ -7300,11 +7301,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7300 unsigned int page_end; 7301 unsigned int page_end;
7301 u64 phys; 7302 u64 phys;
7302 7303
7303 if (vfs_dq_alloc_space_nodirty(inode, 7304 ret = dquot_alloc_space_nodirty(inode,
7304 ocfs2_clusters_to_bytes(osb->sb, 1))) { 7305 ocfs2_clusters_to_bytes(osb->sb, 1));
7305 ret = -EDQUOT; 7306 if (ret)
7306 goto out_commit; 7307 goto out_commit;
7307 }
7308 did_quota = 1; 7308 did_quota = 1;
7309 7309
7310 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 7310 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
@@ -7380,7 +7380,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7380 7380
7381out_commit: 7381out_commit:
7382 if (ret < 0 && did_quota) 7382 if (ret < 0 && did_quota)
7383 vfs_dq_free_space_nodirty(inode, 7383 dquot_free_space_nodirty(inode,
7384 ocfs2_clusters_to_bytes(osb->sb, 1)); 7384 ocfs2_clusters_to_bytes(osb->sb, 1));
7385 7385
7386 ocfs2_commit_trans(osb, handle); 7386 ocfs2_commit_trans(osb, handle);
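
The vfs_dq_* to dquot_* conversions in the hunks above all follow one
pattern: the new helpers return an errno directly instead of a boolean, so
callers propagate it rather than hard-coding -EDQUOT. A sketch, with a
hypothetical worker function standing in for the cluster allocation:

	#include <linux/quotaops.h>

	static int example_alloc(struct inode *inode, s64 bytes)
	{
		int ret;

		ret = dquot_alloc_space_nodirty(inode, bytes);
		if (ret)	/* already an errno, e.g. -EDQUOT */
			return ret;

		ret = claim_the_space(inode);	/* hypothetical worker */
		if (ret)	/* roll the quota charge back on failure */
			dquot_free_space_nodirty(inode, bytes);
		return ret;
	}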
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 9c122d574464..1db4359ccb90 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -317,4 +317,9 @@ int ocfs2_path_bh_journal_access(handle_t *handle,
317int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, 317int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
318 handle_t *handle, 318 handle_t *handle,
319 struct ocfs2_path *path); 319 struct ocfs2_path *path);
320int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
321 struct ocfs2_path *path, u32 *cpos);
322int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
323 struct ocfs2_path *left,
324 struct ocfs2_path *right);
320#endif /* OCFS2_ALLOC_H */ 325#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index deb2b132ae5e..21441ddb5506 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -547,6 +547,9 @@ bail:
547 * 547 *
548 * called like this: dio->get_blocks(dio->inode, fs_startblk, 548 * called like this: dio->get_blocks(dio->inode, fs_startblk,
549 * fs_count, map_bh, dio->rw == WRITE); 549 * fs_count, map_bh, dio->rw == WRITE);
550 *
551 * Note that we never bother to allocate blocks here, and thus ignore the
552 * create argument.
550 */ 553 */
551static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, 554static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
552 struct buffer_head *bh_result, int create) 555 struct buffer_head *bh_result, int create)
@@ -563,14 +566,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
563 566
564 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 567 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
565 568
566 /*
567 * Any write past EOF is not allowed because we'd be extending.
568 */
569 if (create && (iblock + max_blocks) > inode_blocks) {
570 ret = -EIO;
571 goto bail;
572 }
573
574 /* This figures out the size of the next contiguous block, and 569 /* This figures out the size of the next contiguous block, and
575 * our logical offset */ 570 * our logical offset */
576 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, 571 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
@@ -582,17 +577,9 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
582 goto bail; 577 goto bail;
583 } 578 }
584 579
585 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) { 580 /* We should already CoW the refcounted extent in case of create. */
586 ocfs2_error(inode->i_sb, 581 BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
587 "Inode %llu has a hole at block %llu\n",
588 (unsigned long long)OCFS2_I(inode)->ip_blkno,
589 (unsigned long long)iblock);
590 ret = -EROFS;
591 goto bail;
592 }
593 582
594 /* We should already CoW the refcounted extent. */
595 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
596 /* 583 /*
597 * get_more_blocks() expects us to describe a hole by clearing 584 * get_more_blocks() expects us to describe a hole by clearing
598 * the mapped bit on bh_result(). 585 * the mapped bit on bh_result().
@@ -601,20 +588,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
601 */ 588 */
602 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 589 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
603 map_bh(bh_result, inode->i_sb, p_blkno); 590 map_bh(bh_result, inode->i_sb, p_blkno);
604 else { 591 else
605 /*
606 * ocfs2_prepare_inode_for_write() should have caught
607 * the case where we'd be filling a hole and triggered
608 * a buffered write instead.
609 */
610 if (create) {
611 ret = -EIO;
612 mlog_errno(ret);
613 goto bail;
614 }
615
616 clear_buffer_mapped(bh_result); 592 clear_buffer_mapped(bh_result);
617 }
618 593
619 /* make sure we don't map more than max_blocks blocks here as 594 /* make sure we don't map more than max_blocks blocks here as
620 that's all the kernel will handle at this point. */ 595 that's all the kernel will handle at this point. */
@@ -625,7 +600,7 @@ bail:
625 return ret; 600 return ret;
626} 601}
627 602
628/* 603/*
629 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're 604 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
630 * particularly interested in the aio/dio case. Like the core uses 605 * particularly interested in the aio/dio case. Like the core uses
631 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from 606 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
@@ -696,7 +671,7 @@ static ssize_t ocfs2_direct_IO(int rw,
696 671
697 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 672 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
698 inode->i_sb->s_bdev, iov, offset, 673 inode->i_sb->s_bdev, iov, offset,
699 nr_segs, 674 nr_segs,
700 ocfs2_direct_IO_get_blocks, 675 ocfs2_direct_IO_get_blocks,
701 ocfs2_dio_end_io); 676 ocfs2_dio_end_io);
702 677
@@ -1789,10 +1764,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1789 1764
1790 wc->w_handle = handle; 1765 wc->w_handle = handle;
1791 1766
1792 if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode, 1767 if (clusters_to_alloc) {
1793 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) { 1768 ret = dquot_alloc_space_nodirty(inode,
1794 ret = -EDQUOT; 1769 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
1795 goto out_commit; 1770 if (ret)
1771 goto out_commit;
1796 } 1772 }
1797 /* 1773 /*
1798 * We don't want this to fail in ocfs2_write_end(), so do it 1774 * We don't want this to fail in ocfs2_write_end(), so do it
@@ -1835,7 +1811,7 @@ success:
1835 return 0; 1811 return 0;
1836out_quota: 1812out_quota:
1837 if (clusters_to_alloc) 1813 if (clusters_to_alloc)
1838 vfs_dq_free_space(inode, 1814 dquot_free_space(inode,
1839 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); 1815 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
1840out_commit: 1816out_commit:
1841 ocfs2_commit_trans(osb, handle); 1817 ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index a1163b8b417c..b7428c5d0d3b 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -47,7 +47,7 @@
47 * Calculate the bit offset in the hamming code buffer based on the bit's 47 * Calculate the bit offset in the hamming code buffer based on the bit's
48 * offset in the data buffer. Since the hamming code reserves all 48 * offset in the data buffer. Since the hamming code reserves all
49 * power-of-two bits for parity, the data bit number and the code bit 49 * power-of-two bits for parity, the data bit number and the code bit
50 * number are offest by all the parity bits beforehand. 50 * number are offset by all the parity bits beforehand.
51 * 51 *
52 * Recall that bit numbers in hamming code are 1-based. This function 52 * Recall that bit numbers in hamming code are 1-based. This function
53 * takes the 0-based data bit from the caller. 53 * takes the 0-based data bit from the caller.
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index d43d34a1dd31..ecebb2276790 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30 29
31#include <cluster/masklog.h> 30#include <cluster/masklog.h>
@@ -368,7 +367,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
368 } 367 }
369 ocfs2_metadata_cache_io_unlock(ci); 368 ocfs2_metadata_cache_io_unlock(ci);
370 369
371 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 370 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
372 (unsigned long long)block, nr, 371 (unsigned long long)block, nr,
373 ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes", 372 ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
374 flags); 373 flags);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index c452d116b892..41d5f1f92d56 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -34,6 +34,7 @@
34#include <linux/crc32.h> 34#include <linux/crc32.h>
35#include <linux/time.h> 35#include <linux/time.h>
36#include <linux/debugfs.h> 36#include <linux/debugfs.h>
37#include <linux/slab.h>
37 38
38#include "heartbeat.h" 39#include "heartbeat.h"
39#include "tcp.h" 40#include "tcp.h"
@@ -78,7 +79,7 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
78 79
79unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; 80unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
80 81
81/* Only sets a new threshold if there are no active regions. 82/* Only sets a new threshold if there are no active regions.
82 * 83 *
83 * No locking or otherwise interesting code is required for reading 84 * No locking or otherwise interesting code is required for reading
84 * o2hb_dead_threshold as it can't change once regions are active and 85 * o2hb_dead_threshold as it can't change once regions are active and
@@ -170,13 +171,14 @@ static void o2hb_write_timeout(struct work_struct *work)
170 171
171 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " 172 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
172 "milliseconds\n", reg->hr_dev_name, 173 "milliseconds\n", reg->hr_dev_name,
173 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); 174 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
174 o2quo_disk_timeout(); 175 o2quo_disk_timeout();
175} 176}
176 177
177static void o2hb_arm_write_timeout(struct o2hb_region *reg) 178static void o2hb_arm_write_timeout(struct o2hb_region *reg)
178{ 179{
179 mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS); 180 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
181 O2HB_MAX_WRITE_TIMEOUT_MS);
180 182
181 cancel_delayed_work(&reg->hr_write_timeout_work); 183 cancel_delayed_work(&reg->hr_write_timeout_work);
182 reg->hr_last_timeout_start = jiffies; 184 reg->hr_last_timeout_start = jiffies;
@@ -623,7 +625,7 @@ static int o2hb_check_slot(struct o2hb_region *reg,
623 "seq %llu last %llu changed %u equal %u\n", 625 "seq %llu last %llu changed %u equal %u\n",
624 slot->ds_node_num, (long long)slot->ds_last_generation, 626 slot->ds_node_num, (long long)slot->ds_last_generation,
625 le32_to_cpu(hb_block->hb_cksum), 627 le32_to_cpu(hb_block->hb_cksum),
626 (unsigned long long)le64_to_cpu(hb_block->hb_seq), 628 (unsigned long long)le64_to_cpu(hb_block->hb_seq),
627 (unsigned long long)slot->ds_last_time, slot->ds_changed_samples, 629 (unsigned long long)slot->ds_last_time, slot->ds_changed_samples,
628 slot->ds_equal_samples); 630 slot->ds_equal_samples);
629 631
@@ -874,7 +876,8 @@ static int o2hb_thread(void *data)
874 do_gettimeofday(&after_hb); 876 do_gettimeofday(&after_hb);
875 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); 877 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
876 878
877 mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n", 879 mlog(ML_HEARTBEAT,
880 "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
878 before_hb.tv_sec, (unsigned long) before_hb.tv_usec, 881 before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
879 after_hb.tv_sec, (unsigned long) after_hb.tv_usec, 882 after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
880 elapsed_msec); 883 elapsed_msec);
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 1cd2934de615..3bb928a2bf7d 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -112,6 +112,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
112 define_mask(XATTR), 112 define_mask(XATTR),
113 define_mask(QUOTA), 113 define_mask(QUOTA),
114 define_mask(REFCOUNT), 114 define_mask(REFCOUNT),
115 define_mask(BASTS),
115 define_mask(ERROR), 116 define_mask(ERROR),
116 define_mask(NOTICE), 117 define_mask(NOTICE),
117 define_mask(KTHREAD), 118 define_mask(KTHREAD),
@@ -135,7 +136,7 @@ static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
135 return mlog_mask_store(mlog_attr->mask, buf, count); 136 return mlog_mask_store(mlog_attr->mask, buf, count);
136} 137}
137 138
138static struct sysfs_ops mlog_attr_ops = { 139static const struct sysfs_ops mlog_attr_ops = {
139 .show = mlog_show, 140 .show = mlog_show,
140 .store = mlog_store, 141 .store = mlog_store,
141}; 142};
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 9b4d11726cf2..3dfddbec32f2 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -114,6 +114,7 @@
114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ 114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ 115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */ 116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */
117#define ML_BASTS 0x0000001000000000ULL /* dlmglue asts and basts */
117/* bits that are infrequently given and frequently matched in the high word */ 118/* bits that are infrequently given and frequently matched in the high word */
118#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
119#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ 120#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
@@ -194,9 +195,9 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
194 * previous token if args expands to nothing. 195 * previous token if args expands to nothing.
195 */ 196 */
196#define __mlog_printk(level, fmt, args...) \ 197#define __mlog_printk(level, fmt, args...) \
197 printk(level "(%u,%lu):%s:%d " fmt, task_pid_nr(current), \ 198 printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \
198 __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ , \ 199 task_pid_nr(current), __mlog_cpu_guess, \
199 ##args) 200 __PRETTY_FUNCTION__, __LINE__ , ##args)
200 201
201#define mlog(mask, fmt, args...) do { \ 202#define mlog(mask, fmt, args...) do { \
202 u64 __m = MLOG_MASK_PREFIX | (mask); \ 203 u64 __m = MLOG_MASK_PREFIX | (mask); \
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index da794bc07a6c..a3f150e52b02 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -294,10 +294,10 @@ static int sc_seq_show(struct seq_file *seq, void *v)
294 if (sc->sc_sock) { 294 if (sc->sc_sock) {
295 inet = inet_sk(sc->sc_sock->sk); 295 inet = inet_sk(sc->sc_sock->sk);
296 /* the stack's structs aren't sparse endian clean */ 296 /* the stack's structs aren't sparse endian clean */
297 saddr = (__force __be32)inet->saddr; 297 saddr = (__force __be32)inet->inet_saddr;
298 daddr = (__force __be32)inet->daddr; 298 daddr = (__force __be32)inet->inet_daddr;
299 sport = (__force __be16)inet->sport; 299 sport = (__force __be16)inet->inet_sport;
300 dport = (__force __be16)inet->dport; 300 dport = (__force __be16)inet->inet_dport;
301 } 301 }
302 302
303 /* XXX sigh, inet-> doesn't have sparse annotation so any 303 /* XXX sigh, inet-> doesn't have sparse annotation so any
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 7ee6188bc79a..ed0c9f367fed 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -19,6 +19,7 @@
19 * Boston, MA 021110-1307, USA. 19 * Boston, MA 021110-1307, USA.
20 */ 20 */
21 21
22#include <linux/slab.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
23#include <linux/module.h> 24#include <linux/module.h>
24#include <linux/configfs.h> 25#include <linux/configfs.h>
@@ -35,6 +36,10 @@
35 * cluster references throughout where nodes are looked up */ 36 * cluster references throughout where nodes are looked up */
36struct o2nm_cluster *o2nm_single_cluster = NULL; 37struct o2nm_cluster *o2nm_single_cluster = NULL;
37 38
39char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = {
40 "reset", /* O2NM_FENCE_RESET */
41 "panic", /* O2NM_FENCE_PANIC */
42};
38 43
39struct o2nm_node *o2nm_get_node_by_num(u8 node_num) 44struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
40{ 45{
@@ -579,6 +584,43 @@ static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write(
579 return o2nm_cluster_attr_write(page, count, 584 return o2nm_cluster_attr_write(page, count,
580 &cluster->cl_reconnect_delay_ms); 585 &cluster->cl_reconnect_delay_ms);
581} 586}
587
588static ssize_t o2nm_cluster_attr_fence_method_read(
589 struct o2nm_cluster *cluster, char *page)
590{
591 ssize_t ret = 0;
592
593 if (cluster)
594 ret = sprintf(page, "%s\n",
595 o2nm_fence_method_desc[cluster->cl_fence_method]);
596 return ret;
597}
598
599static ssize_t o2nm_cluster_attr_fence_method_write(
600 struct o2nm_cluster *cluster, const char *page, size_t count)
601{
602 unsigned int i;
603
604 if (page[count - 1] != '\n')
605 goto bail;
606
607 for (i = 0; i < O2NM_FENCE_METHODS; ++i) {
608 if (count != strlen(o2nm_fence_method_desc[i]) + 1)
609 continue;
610 if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1))
611 continue;
612 if (cluster->cl_fence_method != i) {
613 printk(KERN_INFO "ocfs2: Changing fence method to %s\n",
614 o2nm_fence_method_desc[i]);
615 cluster->cl_fence_method = i;
616 }
617 return count;
618 }
619
620bail:
621 return -EINVAL;
622}
623
582static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = { 624static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = {
583 .attr = { .ca_owner = THIS_MODULE, 625 .attr = { .ca_owner = THIS_MODULE,
584 .ca_name = "idle_timeout_ms", 626 .ca_name = "idle_timeout_ms",
@@ -603,10 +645,19 @@ static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = {
603 .store = o2nm_cluster_attr_reconnect_delay_ms_write, 645 .store = o2nm_cluster_attr_reconnect_delay_ms_write,
604}; 646};
605 647
648static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = {
649 .attr = { .ca_owner = THIS_MODULE,
650 .ca_name = "fence_method",
651 .ca_mode = S_IRUGO | S_IWUSR },
652 .show = o2nm_cluster_attr_fence_method_read,
653 .store = o2nm_cluster_attr_fence_method_write,
654};
655
606static struct configfs_attribute *o2nm_cluster_attrs[] = { 656static struct configfs_attribute *o2nm_cluster_attrs[] = {
607 &o2nm_cluster_attr_idle_timeout_ms.attr, 657 &o2nm_cluster_attr_idle_timeout_ms.attr,
608 &o2nm_cluster_attr_keepalive_delay_ms.attr, 658 &o2nm_cluster_attr_keepalive_delay_ms.attr,
609 &o2nm_cluster_attr_reconnect_delay_ms.attr, 659 &o2nm_cluster_attr_reconnect_delay_ms.attr,
660 &o2nm_cluster_attr_fence_method.attr,
610 NULL, 661 NULL,
611}; 662};
612static ssize_t o2nm_cluster_show(struct config_item *item, 663static ssize_t o2nm_cluster_show(struct config_item *item,
@@ -778,6 +829,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
778 cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT; 829 cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT;
779 cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT; 830 cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT;
780 cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT; 831 cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT;
832 cluster->cl_fence_method = O2NM_FENCE_RESET;
781 833
782 ret = &cluster->cl_group; 834 ret = &cluster->cl_group;
783 o2nm_single_cluster = cluster; 835 o2nm_single_cluster = cluster;
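
Note: the new fence_method attribute is a plain configfs file. A hypothetical userspace sketch of selecting the panic method; the configfs mount point and the cluster name "mycluster" are assumptions, not part of this patch:

#include <fcntl.h>
#include <unistd.h>

int set_fence_method(void)
{
	int fd = open("/sys/kernel/config/cluster/mycluster/fence_method",
		      O_WRONLY);
	if (fd < 0)
		return -1;
	/* the write handler above requires the trailing newline and matches
	 * the method name case-insensitively; "reset\n" restores the default */
	if (write(fd, "panic\n", 6) != 6) {
		close(fd);
		return -1;
	}
	return close(fd);
}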
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
index c992ea0da4ad..09ea2d388bbb 100644
--- a/fs/ocfs2/cluster/nodemanager.h
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -33,6 +33,12 @@
33#include <linux/configfs.h> 33#include <linux/configfs.h>
34#include <linux/rbtree.h> 34#include <linux/rbtree.h>
35 35
36enum o2nm_fence_method {
37 O2NM_FENCE_RESET = 0,
38 O2NM_FENCE_PANIC,
39 O2NM_FENCE_METHODS, /* Number of fence methods */
40};
41
36struct o2nm_node { 42struct o2nm_node {
37 spinlock_t nd_lock; 43 spinlock_t nd_lock;
38 struct config_item nd_item; 44 struct config_item nd_item;
@@ -58,6 +64,7 @@ struct o2nm_cluster {
58 unsigned int cl_idle_timeout_ms; 64 unsigned int cl_idle_timeout_ms;
59 unsigned int cl_keepalive_delay_ms; 65 unsigned int cl_keepalive_delay_ms;
60 unsigned int cl_reconnect_delay_ms; 66 unsigned int cl_reconnect_delay_ms;
67 enum o2nm_fence_method cl_fence_method;
61 68
62 /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */ 69 /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */
63 unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; 70 unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index bbacf7da48a4..cf3e16696216 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -44,7 +44,6 @@
44 * and if they're the last, they fire off the decision. 44 * and if they're the last, they fire off the decision.
45 */ 45 */
46#include <linux/kernel.h> 46#include <linux/kernel.h>
47#include <linux/slab.h>
48#include <linux/workqueue.h> 47#include <linux/workqueue.h>
49#include <linux/reboot.h> 48#include <linux/reboot.h>
50 49
@@ -74,8 +73,20 @@ static void o2quo_fence_self(void)
74 * threads can still schedule, etc, etc */ 73 * threads can still schedule, etc, etc */
75 o2hb_stop_all_regions(); 74 o2hb_stop_all_regions();
76 75
77 printk("ocfs2 is very sorry to be fencing this system by restarting\n"); 76 switch (o2nm_single_cluster->cl_fence_method) {
78 emergency_restart(); 77 case O2NM_FENCE_PANIC:
78 panic("*** ocfs2 is very sorry to be fencing this system by "
79 "panicing ***\n");
80 break;
81 default:
82 WARN_ON(o2nm_single_cluster->cl_fence_method >=
83 O2NM_FENCE_METHODS);
84 case O2NM_FENCE_RESET:
85 printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this "
86 "system by restarting ***\n");
87 emergency_restart();
88 break;
89 }
79} 90}
80 91
81/* Indicate that a timeout occurred on a heartbeat region write. The 92/* Indicate that a timeout occurred on a heartbeat region write. The
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 334f231a422c..73e743eea2c8 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -72,9 +72,9 @@
72 72
73#include "tcp_internal.h" 73#include "tcp_internal.h"
74 74
75#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u" 75#define SC_NODEF_FMT "node %s (num %u) at %pI4:%u"
76#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \ 76#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \
77 NIPQUAD(sc->sc_node->nd_ipv4_address), \ 77 &sc->sc_node->nd_ipv4_address, \
78 ntohs(sc->sc_node->nd_ipv4_port) 78 ntohs(sc->sc_node->nd_ipv4_port)
79 79
80/* 80/*
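
Note: the SC_NODEF_FMT change above swaps the old NIPQUAD() expansion for the %pI4 specifier, which takes a pointer to the big-endian address. A small sketch, independent of this patch:

#include <linux/kernel.h>
#include <linux/types.h>

static void print_peer(__be32 addr, unsigned int port)
{
	/* %pI4 dereferences the pointer and prints the dotted-quad form,
	 * e.g. "node at 192.168.0.1:7777" */
	printk(KERN_INFO "node at %pI4:%u\n", &addr, port);
}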
@@ -485,7 +485,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
485 } 485 }
486 486
487 if (was_valid && !valid) { 487 if (was_valid && !valid) {
488 printk(KERN_INFO "o2net: no longer connected to " 488 printk(KERN_NOTICE "o2net: no longer connected to "
489 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); 489 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
490 o2net_complete_nodes_nsw(nn); 490 o2net_complete_nodes_nsw(nn);
491 } 491 }
@@ -493,7 +493,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
493 if (!was_valid && valid) { 493 if (!was_valid && valid) {
494 o2quo_conn_up(o2net_num_from_nn(nn)); 494 o2quo_conn_up(o2net_num_from_nn(nn));
495 cancel_delayed_work(&nn->nn_connect_expired); 495 cancel_delayed_work(&nn->nn_connect_expired);
496 printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n", 496 printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n",
497 o2nm_this_node() > sc->sc_node->nd_num ? 497 o2nm_this_node() > sc->sc_node->nd_num ?
498 "connected to" : "accepted connection from", 498 "connected to" : "accepted connection from",
499 SC_NODEF_ARGS(sc)); 499 SC_NODEF_ARGS(sc));
@@ -930,7 +930,7 @@ static void o2net_sendpage(struct o2net_sock_container *sc,
930 cond_resched(); 930 cond_resched();
931 continue; 931 continue;
932 } 932 }
933 mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT 933 mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT
934 " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret); 934 " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret);
935 o2net_ensure_shutdown(nn, sc, 0); 935 o2net_ensure_shutdown(nn, sc, 0);
936 break; 936 break;
@@ -1476,14 +1476,14 @@ static void o2net_idle_timer(unsigned long data)
1476 1476
1477 do_gettimeofday(&now); 1477 do_gettimeofday(&now);
1478 1478
1479 printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " 1479 printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
1480 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), 1480 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
1481 o2net_idle_timeout() / 1000, 1481 o2net_idle_timeout() / 1000,
1482 o2net_idle_timeout() % 1000); 1482 o2net_idle_timeout() % 1000);
1483 mlog(ML_NOTICE, "here are some times that might help debug the " 1483 mlog(ML_NOTICE, "here are some times that might help debug the "
1484 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " 1484 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
1485 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", 1485 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
1486 sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec, 1486 sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec,
1487 now.tv_sec, (long) now.tv_usec, 1487 now.tv_sec, (long) now.tv_usec,
1488 sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec, 1488 sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec,
1489 sc->sc_tv_advance_start.tv_sec, 1489 sc->sc_tv_advance_start.tv_sec,
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 8d58cfe410b1..96fa7ebc530c 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -32,10 +32,10 @@
32 * on their number */ 32 * on their number */
33#define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS) 33#define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
34 34
35/* 35/*
36 * This version number represents quite a lot, unfortunately. It not 36 * This version number represents quite a lot, unfortunately. It not
37 * only represents the raw network message protocol on the wire but also 37 * only represents the raw network message protocol on the wire but also
38 * locking semantics of the file system using the protocol. It should 38 * locking semantics of the file system using the protocol. It should
39 * be somewhere else, I'm sure, but right now it isn't. 39 * be somewhere else, I'm sure, but right now it isn't.
40 * 40 *
41 * With version 11, we separate out the filesystem locking portion. The 41 * With version 11, we separate out the filesystem locking portion. The
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 28c3ec238796..efd77d071c80 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2439,7 +2439,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2439 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 2439 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2440 memset(dx_root, 0, osb->sb->s_blocksize); 2440 memset(dx_root, 0, osb->sb->s_blocksize);
2441 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); 2441 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2442 dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num); 2442 dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
2443 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); 2443 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2444 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); 2444 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2445 dx_root->dr_blkno = cpu_to_le64(dr_blkno); 2445 dx_root->dr_blkno = cpu_to_le64(dr_blkno);
@@ -2964,12 +2964,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2964 goto out; 2964 goto out;
2965 } 2965 }
2966 2966
2967 if (vfs_dq_alloc_space_nodirty(dir, 2967 ret = dquot_alloc_space_nodirty(dir,
2968 ocfs2_clusters_to_bytes(osb->sb, 2968 ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc));
2969 alloc + dx_alloc))) { 2969 if (ret)
2970 ret = -EDQUOT;
2971 goto out_commit; 2970 goto out_commit;
2972 }
2973 did_quota = 1; 2971 did_quota = 1;
2974 2972
2975 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { 2973 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
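
Note: the same quota conversion repeats throughout the rest of dir.c. Since dquot_alloc_space_nodirty() returns an errno directly, the hand-rolled -EDQUOT mapping disappears. A sketch of the resulting pattern, with illustrative names:

#include <linux/fs.h>
#include <linux/quotaops.h>

static int example_quota_alloc(struct inode *dir, qsize_t bytes)
{
	/* the return value is already -EDQUOT, -ENOSPC, etc., so the
	 * caller can jump straight to its error path on nonzero */
	return dquot_alloc_space_nodirty(dir, bytes);
}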
@@ -3178,7 +3176,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3178 3176
3179out_commit: 3177out_commit:
3180 if (ret < 0 && did_quota) 3178 if (ret < 0 && did_quota)
3181 vfs_dq_free_space_nodirty(dir, bytes_allocated); 3179 dquot_free_space_nodirty(dir, bytes_allocated);
3182 3180
3183 ocfs2_commit_trans(osb, handle); 3181 ocfs2_commit_trans(osb, handle);
3184 3182
@@ -3221,11 +3219,10 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
3221 if (extend) { 3219 if (extend) {
3222 u32 offset = OCFS2_I(dir)->ip_clusters; 3220 u32 offset = OCFS2_I(dir)->ip_clusters;
3223 3221
3224 if (vfs_dq_alloc_space_nodirty(dir, 3222 status = dquot_alloc_space_nodirty(dir,
3225 ocfs2_clusters_to_bytes(sb, 1))) { 3223 ocfs2_clusters_to_bytes(sb, 1));
3226 status = -EDQUOT; 3224 if (status)
3227 goto bail; 3225 goto bail;
3228 }
3229 did_quota = 1; 3226 did_quota = 1;
3230 3227
3231 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, 3228 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
@@ -3254,7 +3251,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
3254 status = 0; 3251 status = 0;
3255bail: 3252bail:
3256 if (did_quota && status < 0) 3253 if (did_quota && status < 0)
3257 vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); 3254 dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
3258 mlog_exit(status); 3255 mlog_exit(status);
3259 return status; 3256 return status;
3260} 3257}
@@ -3889,11 +3886,10 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3889 goto out; 3886 goto out;
3890 } 3887 }
3891 3888
3892 if (vfs_dq_alloc_space_nodirty(dir, 3889 ret = dquot_alloc_space_nodirty(dir,
3893 ocfs2_clusters_to_bytes(dir->i_sb, 1))) { 3890 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3894 ret = -EDQUOT; 3891 if (ret)
3895 goto out_commit; 3892 goto out_commit;
3896 }
3897 did_quota = 1; 3893 did_quota = 1;
3898 3894
3899 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, 3895 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
@@ -3983,7 +3979,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3983 3979
3984out_commit: 3980out_commit:
3985 if (ret < 0 && did_quota) 3981 if (ret < 0 && did_quota)
3986 vfs_dq_free_space_nodirty(dir, 3982 dquot_free_space_nodirty(dir,
3987 ocfs2_clusters_to_bytes(dir->i_sb, 1)); 3983 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3988 3984
3989 ocfs2_commit_trans(osb, handle); 3985 ocfs2_commit_trans(osb, handle);
@@ -4165,11 +4161,10 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
4165 goto out; 4161 goto out;
4166 } 4162 }
4167 4163
4168 if (vfs_dq_alloc_space_nodirty(dir, 4164 ret = dquot_alloc_space_nodirty(dir,
4169 ocfs2_clusters_to_bytes(osb->sb, 1))) { 4165 ocfs2_clusters_to_bytes(osb->sb, 1));
4170 ret = -EDQUOT; 4166 if (ret)
4171 goto out_commit; 4167 goto out_commit;
4172 }
4173 did_quota = 1; 4168 did_quota = 1;
4174 4169
4175 /* 4170 /*
@@ -4229,7 +4224,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
4229 4224
4230out_commit: 4225out_commit:
4231 if (ret < 0 && did_quota) 4226 if (ret < 0 && did_quota)
4232 vfs_dq_free_space_nodirty(dir, 4227 dquot_free_space_nodirty(dir,
4233 ocfs2_clusters_to_bytes(dir->i_sb, 1)); 4228 ocfs2_clusters_to_bytes(dir->i_sb, 1));
4234 4229
4235 ocfs2_commit_trans(osb, handle); 4230 ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index 190361375700..dcebf0d920fa 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,8 +1,7 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1EXTRA_CFLAGS += -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o 3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
4 4
5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ 5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o 6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
7 7
8ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h
index b5786a787fab..3cfa114aa391 100644
--- a/fs/ocfs2/dlm/dlmapi.h
+++ b/fs/ocfs2/dlm/dlmapi.h
@@ -95,7 +95,7 @@ const char *dlm_errname(enum dlm_status err);
95 mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st))); \ 95 mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st))); \
96} while (0) 96} while (0)
97 97
98#define DLM_LKSB_UNUSED1 0x01 98#define DLM_LKSB_UNUSED1 0x01
99#define DLM_LKSB_PUT_LVB 0x02 99#define DLM_LKSB_PUT_LVB 0x02
100#define DLM_LKSB_GET_LVB 0x04 100#define DLM_LKSB_GET_LVB 0x04
101#define DLM_LKSB_UNUSED2 0x08 101#define DLM_LKSB_UNUSED2 0x08
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 01cf8cc3d286..a795eb91f4ea 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -123,7 +122,7 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
123 dlm_lock_put(lock); 122 dlm_lock_put(lock);
124 /* free up the reserved bast that we are cancelling. 123 /* free up the reserved bast that we are cancelling.
125 * guaranteed that this will not be the last reserved 124 * guaranteed that this will not be the last reserved
126 * ast because *both* an ast and a bast were reserved 125 * ast because *both* an ast and a bast were reserved
127 * to get to this point. the res->spinlock will not be 126 * to get to this point. the res->spinlock will not be
128 * taken here */ 127 * taken here */
129 dlm_lockres_release_ast(dlm, res); 128 dlm_lockres_release_ast(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index ca96bce50e18..90803b47cd8c 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -396,7 +395,7 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
396 /* instead of logging the same network error over 395 /* instead of logging the same network error over
397 * and over, sleep here and wait for the heartbeat 396 * and over, sleep here and wait for the heartbeat
398 * to notice the node is dead. times out after 5s. */ 397 * to notice the node is dead. times out after 5s. */
399 dlm_wait_for_node_death(dlm, res->owner, 398 dlm_wait_for_node_death(dlm, res->owner,
400 DLM_NODE_DEATH_WAIT_MAX); 399 DLM_NODE_DEATH_WAIT_MAX);
401 ret = DLM_RECOVERING; 400 ret = DLM_RECOVERING;
402 mlog(0, "node %u died so returning DLM_RECOVERING " 401 mlog(0, "node %u died so returning DLM_RECOVERING "
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 42b0bad7a612..0cd24cf54396 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -102,7 +102,7 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
102 assert_spin_locked(&res->spinlock); 102 assert_spin_locked(&res->spinlock);
103 103
104 stringify_lockname(res->lockname.name, res->lockname.len, 104 stringify_lockname(res->lockname.name, res->lockname.len,
105 buf, sizeof(buf) - 1); 105 buf, sizeof(buf));
106 printk("lockres: %s, owner=%u, state=%u\n", 106 printk("lockres: %s, owner=%u, state=%u\n",
107 buf, res->owner, res->state); 107 buf, res->owner, res->state);
108 printk(" last used: %lu, refcnt: %u, on purge list: %s\n", 108 printk(" last used: %lu, refcnt: %u, on purge list: %s\n",
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 0334000676d3..988c9055fd4e 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -816,7 +816,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
816 } 816 }
817 817
818 /* Once the dlm ctxt is marked as leaving then we don't want 818 /* Once the dlm ctxt is marked as leaving then we don't want
819 * to be put in someone's domain map. 819 * to be put in someone's domain map.
820 * Also, explicitly disallow joining at certain troublesome 820 * Also, explicitly disallow joining at certain troublesome
821 * times (ie. during recovery). */ 821 * times (ie. during recovery). */
822 if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { 822 if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) {
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 437698e9465f..733337772671 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -269,7 +269,7 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
269 } 269 }
270 dlm_revert_pending_lock(res, lock); 270 dlm_revert_pending_lock(res, lock);
271 dlm_lock_put(lock); 271 dlm_lock_put(lock);
272 } else if (dlm_is_recovery_lock(res->lockname.name, 272 } else if (dlm_is_recovery_lock(res->lockname.name,
273 res->lockname.len)) { 273 res->lockname.len)) {
274 /* special case for the $RECOVERY lock. 274 /* special case for the $RECOVERY lock.
275 * there will never be an AST delivered to put 275 * there will never be an AST delivered to put
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 83bcaf266b35..9289b4357d27 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -366,7 +366,7 @@ void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
366 struct dlm_master_list_entry *mle; 366 struct dlm_master_list_entry *mle;
367 367
368 assert_spin_locked(&dlm->spinlock); 368 assert_spin_locked(&dlm->spinlock);
369 369
370 list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) { 370 list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
371 if (node_up) 371 if (node_up)
372 dlm_mle_node_up(dlm, mle, NULL, idx); 372 dlm_mle_node_up(dlm, mle, NULL, idx);
@@ -833,7 +833,7 @@ lookup:
833 __dlm_insert_mle(dlm, mle); 833 __dlm_insert_mle(dlm, mle);
834 834
835 /* still holding the dlm spinlock, check the recovery map 835 /* still holding the dlm spinlock, check the recovery map
836 * to see if there are any nodes that still need to be 836 * to see if there are any nodes that still need to be
837 * considered. these will not appear in the mle nodemap 837 * considered. these will not appear in the mle nodemap
838 * but they might own this lockres. wait on them. */ 838 * but they might own this lockres. wait on them. */
839 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 839 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
@@ -883,7 +883,7 @@ redo_request:
883 msleep(500); 883 msleep(500);
884 } 884 }
885 continue; 885 continue;
886 } 886 }
887 887
888 dlm_kick_recovery_thread(dlm); 888 dlm_kick_recovery_thread(dlm);
889 msleep(1000); 889 msleep(1000);
@@ -939,8 +939,8 @@ wait:
939 res->lockname.name, blocked); 939 res->lockname.name, blocked);
940 if (++tries > 20) { 940 if (++tries > 20) {
941 mlog(ML_ERROR, "%s:%.*s: spinning on " 941 mlog(ML_ERROR, "%s:%.*s: spinning on "
942 "dlm_wait_for_lock_mastery, blocked=%d\n", 942 "dlm_wait_for_lock_mastery, blocked=%d\n",
943 dlm->name, res->lockname.len, 943 dlm->name, res->lockname.len,
944 res->lockname.name, blocked); 944 res->lockname.name, blocked);
945 dlm_print_one_lock_resource(res); 945 dlm_print_one_lock_resource(res);
946 dlm_print_one_mle(mle); 946 dlm_print_one_mle(mle);
@@ -1029,7 +1029,7 @@ recheck:
1029 ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked); 1029 ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
1030 b = (mle->type == DLM_MLE_BLOCK); 1030 b = (mle->type == DLM_MLE_BLOCK);
1031 if ((*blocked && !b) || (!*blocked && b)) { 1031 if ((*blocked && !b) || (!*blocked && b)) {
1032 mlog(0, "%s:%.*s: status change: old=%d new=%d\n", 1032 mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
1033 dlm->name, res->lockname.len, res->lockname.name, 1033 dlm->name, res->lockname.len, res->lockname.name,
1034 *blocked, b); 1034 *blocked, b);
1035 *blocked = b; 1035 *blocked = b;
@@ -1602,7 +1602,7 @@ send_response:
1602 } 1602 }
1603 mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", 1603 mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
1604 dlm->node_num, res->lockname.len, res->lockname.name); 1604 dlm->node_num, res->lockname.len, res->lockname.name);
1605 ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, 1605 ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
1606 DLM_ASSERT_MASTER_MLE_CLEANUP); 1606 DLM_ASSERT_MASTER_MLE_CLEANUP);
1607 if (ret < 0) { 1607 if (ret < 0) {
1608 mlog(ML_ERROR, "failed to dispatch assert master work\n"); 1608 mlog(ML_ERROR, "failed to dispatch assert master work\n");
@@ -1701,7 +1701,7 @@ again:
1701 1701
1702 if (r & DLM_ASSERT_RESPONSE_REASSERT) { 1702 if (r & DLM_ASSERT_RESPONSE_REASSERT) {
1703 mlog(0, "%.*s: node %u create mles on other " 1703 mlog(0, "%.*s: node %u create mles on other "
1704 "nodes and requests a re-assert\n", 1704 "nodes and requests a re-assert\n",
1705 namelen, lockname, to); 1705 namelen, lockname, to);
1706 reassert = 1; 1706 reassert = 1;
1707 } 1707 }
@@ -1812,7 +1812,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
1812 spin_unlock(&dlm->master_lock); 1812 spin_unlock(&dlm->master_lock);
1813 spin_unlock(&dlm->spinlock); 1813 spin_unlock(&dlm->spinlock);
1814 goto done; 1814 goto done;
1815 } 1815 }
1816 } 1816 }
1817 } 1817 }
1818 spin_unlock(&dlm->master_lock); 1818 spin_unlock(&dlm->master_lock);
@@ -1875,7 +1875,6 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
1875ok: 1875ok:
1876 spin_unlock(&res->spinlock); 1876 spin_unlock(&res->spinlock);
1877 } 1877 }
1878 spin_unlock(&dlm->spinlock);
1879 1878
1880 // mlog(0, "woo! got an assert_master from node %u!\n", 1879 // mlog(0, "woo! got an assert_master from node %u!\n",
1881 // assert->node_idx); 1880 // assert->node_idx);
@@ -1883,7 +1882,7 @@ ok:
1883 int extra_ref = 0; 1882 int extra_ref = 0;
1884 int nn = -1; 1883 int nn = -1;
1885 int rr, err = 0; 1884 int rr, err = 0;
1886 1885
1887 spin_lock(&mle->spinlock); 1886 spin_lock(&mle->spinlock);
1888 if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) 1887 if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
1889 extra_ref = 1; 1888 extra_ref = 1;
@@ -1891,7 +1890,7 @@ ok:
1891 /* MASTER mle: if any bits set in the response map 1890 /* MASTER mle: if any bits set in the response map
1892 * then the calling node needs to re-assert to clear 1891 * then the calling node needs to re-assert to clear
1893 * up nodes that this node contacted */ 1892 * up nodes that this node contacted */
1894 while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES, 1893 while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
1895 nn+1)) < O2NM_MAX_NODES) { 1894 nn+1)) < O2NM_MAX_NODES) {
1896 if (nn != dlm->node_num && nn != assert->node_idx) 1895 if (nn != dlm->node_num && nn != assert->node_idx)
1897 master_request = 1; 1896 master_request = 1;
@@ -1926,7 +1925,6 @@ ok:
1926 /* master is known, detach if not already detached. 1925 /* master is known, detach if not already detached.
1927 * ensures that only one assert_master call will happen 1926 * ensures that only one assert_master call will happen
1928 * on this mle. */ 1927 * on this mle. */
1929 spin_lock(&dlm->spinlock);
1930 spin_lock(&dlm->master_lock); 1928 spin_lock(&dlm->master_lock);
1931 1929
1932 rr = atomic_read(&mle->mle_refs.refcount); 1930 rr = atomic_read(&mle->mle_refs.refcount);
@@ -1959,7 +1957,6 @@ ok:
1959 __dlm_put_mle(mle); 1957 __dlm_put_mle(mle);
1960 } 1958 }
1961 spin_unlock(&dlm->master_lock); 1959 spin_unlock(&dlm->master_lock);
1962 spin_unlock(&dlm->spinlock);
1963 } else if (res) { 1960 } else if (res) {
1964 if (res->owner != assert->node_idx) { 1961 if (res->owner != assert->node_idx) {
1965 mlog(0, "assert_master from %u, but current " 1962 mlog(0, "assert_master from %u, but current "
@@ -1967,6 +1964,7 @@ ok:
1967 res->owner, namelen, name); 1964 res->owner, namelen, name);
1968 } 1965 }
1969 } 1966 }
1967 spin_unlock(&dlm->spinlock);
1970 1968
1971done: 1969done:
1972 ret = 0; 1970 ret = 0;
@@ -2002,7 +2000,7 @@ kill:
2002 __dlm_print_one_lock_resource(res); 2000 __dlm_print_one_lock_resource(res);
2003 spin_unlock(&res->spinlock); 2001 spin_unlock(&res->spinlock);
2004 spin_unlock(&dlm->spinlock); 2002 spin_unlock(&dlm->spinlock);
2005 *ret_data = (void *)res; 2003 *ret_data = (void *)res;
2006 dlm_put(dlm); 2004 dlm_put(dlm);
2007 return -EINVAL; 2005 return -EINVAL;
2008} 2006}
@@ -2040,10 +2038,10 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
2040 item->u.am.request_from = request_from; 2038 item->u.am.request_from = request_from;
2041 item->u.am.flags = flags; 2039 item->u.am.flags = flags;
2042 2040
2043 if (ignore_higher) 2041 if (ignore_higher)
2044 mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len, 2042 mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len,
2045 res->lockname.name); 2043 res->lockname.name);
2046 2044
2047 spin_lock(&dlm->work_lock); 2045 spin_lock(&dlm->work_lock);
2048 list_add_tail(&item->list, &dlm->work_list); 2046 list_add_tail(&item->list, &dlm->work_list);
2049 spin_unlock(&dlm->work_lock); 2047 spin_unlock(&dlm->work_lock);
@@ -2133,7 +2131,7 @@ put:
2133 * think that $RECOVERY is currently mastered by a dead node. If so, 2131 * think that $RECOVERY is currently mastered by a dead node. If so,
2134 * we wait a short time to allow that node to get notified by its own 2132 * we wait a short time to allow that node to get notified by its own
2135 * heartbeat stack, then check again. All $RECOVERY lock resources 2133 * heartbeat stack, then check again. All $RECOVERY lock resources
2136 * mastered by dead nodes are purged when the heartbeat callback is 2134 * mastered by dead nodes are purged when the heartbeat callback is
2137 * fired, so we can know for sure that it is safe to continue once 2135 * fired, so we can know for sure that it is safe to continue once
2138 * the node returns a live node or no node. */ 2136 * the node returns a live node or no node. */
2139static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, 2137static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
@@ -2174,7 +2172,7 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
2174 ret = -EAGAIN; 2172 ret = -EAGAIN;
2175 } 2173 }
2176 spin_unlock(&dlm->spinlock); 2174 spin_unlock(&dlm->spinlock);
2177 mlog(0, "%s: reco lock master is %u\n", dlm->name, 2175 mlog(0, "%s: reco lock master is %u\n", dlm->name,
2178 master); 2176 master);
2179 break; 2177 break;
2180 } 2178 }
@@ -2586,7 +2584,7 @@ fail:
2586 * is complete everywhere. if the target dies while this is 2584 * is complete everywhere. if the target dies while this is
2587 * going on, some nodes could potentially see the target as the 2585 * going on, some nodes could potentially see the target as the
2588 * master, so it is important that my recovery finds the migration 2586 * master, so it is important that my recovery finds the migration
2589 * mle and sets the master to UNKNONWN. */ 2587 * mle and sets the master to UNKNOWN. */
2590 2588
2591 2589
2592 /* wait for new node to assert master */ 2590 /* wait for new node to assert master */
@@ -2602,7 +2600,7 @@ fail:
2602 2600
2603 mlog(0, "%s:%.*s: timed out during migration\n", 2601 mlog(0, "%s:%.*s: timed out during migration\n",
2604 dlm->name, res->lockname.len, res->lockname.name); 2602 dlm->name, res->lockname.len, res->lockname.name);
2605 /* avoid hang during shutdown when migrating lockres 2603 /* avoid hang during shutdown when migrating lockres
2606 * to a node which also goes down */ 2604 * to a node which also goes down */
2607 if (dlm_is_node_dead(dlm, target)) { 2605 if (dlm_is_node_dead(dlm, target)) {
2608 mlog(0, "%s:%.*s: expected migration " 2606 mlog(0, "%s:%.*s: expected migration "
@@ -2738,7 +2736,7 @@ static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
2738 can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING); 2736 can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
2739 spin_unlock(&res->spinlock); 2737 spin_unlock(&res->spinlock);
2740 2738
2741 /* target has died, so make the caller break out of the 2739 /* target has died, so make the caller break out of the
2742 * wait_event, but caller must recheck the domain_map */ 2740 * wait_event, but caller must recheck the domain_map */
2743 spin_lock(&dlm->spinlock); 2741 spin_lock(&dlm->spinlock);
2744 if (!test_bit(mig_target, dlm->domain_map)) 2742 if (!test_bit(mig_target, dlm->domain_map))
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index d9fa3d22e17c..b4f99de2caf3 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -310,7 +310,7 @@ static int dlm_recovery_thread(void *data)
310 mlog(0, "dlm thread running for %s...\n", dlm->name); 310 mlog(0, "dlm thread running for %s...\n", dlm->name);
311 311
312 while (!kthread_should_stop()) { 312 while (!kthread_should_stop()) {
313 if (dlm_joined(dlm)) { 313 if (dlm_domain_fully_joined(dlm)) {
314 status = dlm_do_recovery(dlm); 314 status = dlm_do_recovery(dlm);
315 if (status == -EAGAIN) { 315 if (status == -EAGAIN) {
316 /* do not sleep, recheck immediately. */ 316 /* do not sleep, recheck immediately. */
@@ -1050,7 +1050,7 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
1050 if (lock->ml.node == dead_node) { 1050 if (lock->ml.node == dead_node) {
1051 mlog(0, "AHA! there was " 1051 mlog(0, "AHA! there was "
1052 "a $RECOVERY lock for dead " 1052 "a $RECOVERY lock for dead "
1053 "node %u (%s)!\n", 1053 "node %u (%s)!\n",
1054 dead_node, dlm->name); 1054 dead_node, dlm->name);
1055 list_del_init(&lock->list); 1055 list_del_init(&lock->list);
1056 dlm_lock_put(lock); 1056 dlm_lock_put(lock);
@@ -1164,6 +1164,39 @@ static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
1164 mres->master = master; 1164 mres->master = master;
1165} 1165}
1166 1166
1167static void dlm_prepare_lvb_for_migration(struct dlm_lock *lock,
1168 struct dlm_migratable_lockres *mres,
1169 int queue)
1170{
1171 if (!lock->lksb)
1172 return;
1173
1174 /* Ignore lvb in all locks in the blocked list */
1175 if (queue == DLM_BLOCKED_LIST)
1176 return;
1177
1178 /* Only consider lvbs in locks with granted EX or PR lock levels */
1179 if (lock->ml.type != LKM_EXMODE && lock->ml.type != LKM_PRMODE)
1180 return;
1181
1182 if (dlm_lvb_is_empty(mres->lvb)) {
1183 memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
1184 return;
1185 }
1186
1187 /* Ensure the lvb copied for migration matches in other valid locks */
1188 if (!memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))
1189 return;
1190
1191 mlog(ML_ERROR, "Mismatched lvb in lock cookie=%u:%llu, name=%.*s, "
1192 "node=%u\n",
1193 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
1194 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
1195 lock->lockres->lockname.len, lock->lockres->lockname.name,
1196 lock->ml.node);
1197 dlm_print_one_lock_resource(lock->lockres);
1198 BUG();
1199}
1167 1200
1168/* returns 1 if this lock fills the network structure, 1201/* returns 1 if this lock fills the network structure,
1169 * 0 otherwise */ 1202 * 0 otherwise */
@@ -1181,20 +1214,7 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock,
1181 ml->list = queue; 1214 ml->list = queue;
1182 if (lock->lksb) { 1215 if (lock->lksb) {
1183 ml->flags = lock->lksb->flags; 1216 ml->flags = lock->lksb->flags;
1184 /* send our current lvb */ 1217 dlm_prepare_lvb_for_migration(lock, mres, queue);
1185 if (ml->type == LKM_EXMODE ||
1186 ml->type == LKM_PRMODE) {
1187 /* if it is already set, this had better be a PR
1188 * and it has to match */
1189 if (!dlm_lvb_is_empty(mres->lvb) &&
1190 (ml->type == LKM_EXMODE ||
1191 memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
1192 mlog(ML_ERROR, "mismatched lvbs!\n");
1193 dlm_print_one_lock_resource(lock->lockres);
1194 BUG();
1195 }
1196 memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
1197 }
1198 } 1218 }
1199 ml->node = lock->ml.node; 1219 ml->node = lock->ml.node;
1200 mres->num_locks++; 1220 mres->num_locks++;
@@ -1730,6 +1750,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1730 struct dlm_lock *lock = NULL; 1750 struct dlm_lock *lock = NULL;
1731 u8 from = O2NM_MAX_NODES; 1751 u8 from = O2NM_MAX_NODES;
1732 unsigned int added = 0; 1752 unsigned int added = 0;
1753 __be64 c;
1733 1754
1734 mlog(0, "running %d locks for this lockres\n", mres->num_locks); 1755 mlog(0, "running %d locks for this lockres\n", mres->num_locks);
1735 for (i=0; i<mres->num_locks; i++) { 1756 for (i=0; i<mres->num_locks; i++) {
@@ -1777,19 +1798,48 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1777 /* lock is always created locally first, and 1798 /* lock is always created locally first, and
1778 * destroyed locally last. it must be on the list */ 1799 * destroyed locally last. it must be on the list */
1779 if (!lock) { 1800 if (!lock) {
1780 __be64 c = ml->cookie; 1801 c = ml->cookie;
1781 mlog(ML_ERROR, "could not find local lock " 1802 mlog(ML_ERROR, "Could not find local lock "
1782 "with cookie %u:%llu!\n", 1803 "with cookie %u:%llu, node %u, "
1804 "list %u, flags 0x%x, type %d, "
1805 "conv %d, highest blocked %d\n",
1783 dlm_get_lock_cookie_node(be64_to_cpu(c)), 1806 dlm_get_lock_cookie_node(be64_to_cpu(c)),
1784 dlm_get_lock_cookie_seq(be64_to_cpu(c))); 1807 dlm_get_lock_cookie_seq(be64_to_cpu(c)),
1808 ml->node, ml->list, ml->flags, ml->type,
1809 ml->convert_type, ml->highest_blocked);
1810 __dlm_print_one_lock_resource(res);
1811 BUG();
1812 }
1813
1814 if (lock->ml.node != ml->node) {
1815 c = lock->ml.cookie;
1816 mlog(ML_ERROR, "Mismatched node# in lock "
1817 "cookie %u:%llu, name %.*s, node %u\n",
1818 dlm_get_lock_cookie_node(be64_to_cpu(c)),
1819 dlm_get_lock_cookie_seq(be64_to_cpu(c)),
1820 res->lockname.len, res->lockname.name,
1821 lock->ml.node);
1822 c = ml->cookie;
1823 mlog(ML_ERROR, "Migrate lock cookie %u:%llu, "
1824 "node %u, list %u, flags 0x%x, type %d, "
1825 "conv %d, highest blocked %d\n",
1826 dlm_get_lock_cookie_node(be64_to_cpu(c)),
1827 dlm_get_lock_cookie_seq(be64_to_cpu(c)),
1828 ml->node, ml->list, ml->flags, ml->type,
1829 ml->convert_type, ml->highest_blocked);
1785 __dlm_print_one_lock_resource(res); 1830 __dlm_print_one_lock_resource(res);
1786 BUG(); 1831 BUG();
1787 } 1832 }
1788 BUG_ON(lock->ml.node != ml->node);
1789 1833
1790 if (tmpq != queue) { 1834 if (tmpq != queue) {
1791 mlog(0, "lock was on %u instead of %u for %.*s\n", 1835 c = ml->cookie;
1792 j, ml->list, res->lockname.len, res->lockname.name); 1836 mlog(0, "Lock cookie %u:%llu was on list %u "
1837 "instead of list %u for %.*s\n",
1838 dlm_get_lock_cookie_node(be64_to_cpu(c)),
1839 dlm_get_lock_cookie_seq(be64_to_cpu(c)),
1840 j, ml->list, res->lockname.len,
1841 res->lockname.name);
1842 __dlm_print_one_lock_resource(res);
1793 spin_unlock(&res->spinlock); 1843 spin_unlock(&res->spinlock);
1794 continue; 1844 continue;
1795 } 1845 }
@@ -1839,7 +1889,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1839 * the lvb. */ 1889 * the lvb. */
1840 memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); 1890 memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
1841 } else { 1891 } else {
1842 /* otherwise, the node is sending its 1892 /* otherwise, the node is sending its
1843 * most recent valid lvb info */ 1893 * most recent valid lvb info */
1844 BUG_ON(ml->type != LKM_EXMODE && 1894 BUG_ON(ml->type != LKM_EXMODE &&
1845 ml->type != LKM_PRMODE); 1895 ml->type != LKM_PRMODE);
@@ -1886,7 +1936,7 @@ skip_lvb:
1886 spin_lock(&res->spinlock); 1936 spin_lock(&res->spinlock);
1887 list_for_each_entry(lock, queue, list) { 1937 list_for_each_entry(lock, queue, list) {
1888 if (lock->ml.cookie == ml->cookie) { 1938 if (lock->ml.cookie == ml->cookie) {
1889 __be64 c = lock->ml.cookie; 1939 c = lock->ml.cookie;
1890 mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " 1940 mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
1891 "exists on this lockres!\n", dlm->name, 1941 "exists on this lockres!\n", dlm->name,
1892 res->lockname.len, res->lockname.name, 1942 res->lockname.len, res->lockname.name,
@@ -2114,7 +2164,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
2114 assert_spin_locked(&res->spinlock); 2164 assert_spin_locked(&res->spinlock);
2115 2165
2116 if (res->owner == dlm->node_num) 2166 if (res->owner == dlm->node_num)
2117 /* if this node owned the lockres, and if the dead node 2167 /* if this node owned the lockres, and if the dead node
2118 * had an EX when he died, blank out the lvb */ 2168 * had an EX when he died, blank out the lvb */
2119 search_node = dead_node; 2169 search_node = dead_node;
2120 else { 2170 else {
@@ -2152,7 +2202,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2152 2202
2153 /* this node is the lockres master: 2203 /* this node is the lockres master:
2154 * 1) remove any stale locks for the dead node 2204 * 1) remove any stale locks for the dead node
2155 * 2) if the dead node had an EX when he died, blank out the lvb 2205 * 2) if the dead node had an EX when he died, blank out the lvb
2156 */ 2206 */
2157 assert_spin_locked(&dlm->spinlock); 2207 assert_spin_locked(&dlm->spinlock);
2158 assert_spin_locked(&res->spinlock); 2208 assert_spin_locked(&res->spinlock);
@@ -2193,7 +2243,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2193 mlog(0, "%s:%.*s: freed %u locks for dead node %u, " 2243 mlog(0, "%s:%.*s: freed %u locks for dead node %u, "
2194 "dropping ref from lockres\n", dlm->name, 2244 "dropping ref from lockres\n", dlm->name,
2195 res->lockname.len, res->lockname.name, freed, dead_node); 2245 res->lockname.len, res->lockname.name, freed, dead_node);
2196 BUG_ON(!test_bit(dead_node, res->refmap)); 2246 if (!test_bit(dead_node, res->refmap)) {
2247 mlog(ML_ERROR, "%s:%.*s: freed %u locks for dead node %u, "
2248 "but ref was not set\n", dlm->name,
2249 res->lockname.len, res->lockname.name, freed, dead_node);
2250 __dlm_print_one_lock_resource(res);
2251 }
2197 dlm_lockres_clear_refmap_bit(dead_node, res); 2252 dlm_lockres_clear_refmap_bit(dead_node, res);
2198 } else if (test_bit(dead_node, res->refmap)) { 2253 } else if (test_bit(dead_node, res->refmap)) {
2199 mlog(0, "%s:%.*s: dead node %u had a ref, but had " 2254 mlog(0, "%s:%.*s: dead node %u had a ref, but had "
@@ -2260,7 +2315,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2260 } 2315 }
2261 spin_unlock(&res->spinlock); 2316 spin_unlock(&res->spinlock);
2262 continue; 2317 continue;
2263 } 2318 }
2264 spin_lock(&res->spinlock); 2319 spin_lock(&res->spinlock);
2265 /* zero the lvb if necessary */ 2320 /* zero the lvb if necessary */
2266 dlm_revalidate_lvb(dlm, res, dead_node); 2321 dlm_revalidate_lvb(dlm, res, dead_node);
@@ -2411,7 +2466,7 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
2411 * this function on each node racing to become the recovery 2466 * this function on each node racing to become the recovery
2412 * master will not stop attempting this until either: 2467 * master will not stop attempting this until either:
2413 * a) this node gets the EX (and becomes the recovery master), 2468 * a) this node gets the EX (and becomes the recovery master),
2414 * or b) dlm->reco.new_master gets set to some nodenum 2469 * or b) dlm->reco.new_master gets set to some nodenum
2415 * != O2NM_INVALID_NODE_NUM (another node will do the reco). 2470 * != O2NM_INVALID_NODE_NUM (another node will do the reco).
2416 * so each time a recovery master is needed, the entire cluster 2471 * so each time a recovery master is needed, the entire cluster
2417 * will sync at this point. if the new master dies, that will 2472 * will sync at this point. if the new master dies, that will
@@ -2424,7 +2479,7 @@ static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
2424 2479
2425 mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", 2480 mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
2426 dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); 2481 dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
2427again: 2482again:
2428 memset(&lksb, 0, sizeof(lksb)); 2483 memset(&lksb, 0, sizeof(lksb));
2429 2484
2430 ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, 2485 ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
@@ -2437,8 +2492,8 @@ again:
2437 if (ret == DLM_NORMAL) { 2492 if (ret == DLM_NORMAL) {
2438 mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", 2493 mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
2439 dlm->name, dlm->node_num); 2494 dlm->name, dlm->node_num);
2440 2495
2441 /* got the EX lock. check to see if another node 2496 /* got the EX lock. check to see if another node
2442 * just became the reco master */ 2497 * just became the reco master */
2443 if (dlm_reco_master_ready(dlm)) { 2498 if (dlm_reco_master_ready(dlm)) {
2444 mlog(0, "%s: got reco EX lock, but %u will " 2499 mlog(0, "%s: got reco EX lock, but %u will "
@@ -2451,12 +2506,12 @@ again:
2451 /* see if recovery was already finished elsewhere */ 2506 /* see if recovery was already finished elsewhere */
2452 spin_lock(&dlm->spinlock); 2507 spin_lock(&dlm->spinlock);
2453 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { 2508 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
2454 status = -EINVAL; 2509 status = -EINVAL;
2455 mlog(0, "%s: got reco EX lock, but " 2510 mlog(0, "%s: got reco EX lock, but "
2456 "node got recovered already\n", dlm->name); 2511 "node got recovered already\n", dlm->name);
2457 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { 2512 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
2458 mlog(ML_ERROR, "%s: new master is %u " 2513 mlog(ML_ERROR, "%s: new master is %u "
2459 "but no dead node!\n", 2514 "but no dead node!\n",
2460 dlm->name, dlm->reco.new_master); 2515 dlm->name, dlm->reco.new_master);
2461 BUG(); 2516 BUG();
2462 } 2517 }
@@ -2468,7 +2523,7 @@ again:
2468 * set the master and send the messages to begin recovery */ 2523 * set the master and send the messages to begin recovery */
2469 if (!status) { 2524 if (!status) {
2470 mlog(0, "%s: dead=%u, this=%u, sending " 2525 mlog(0, "%s: dead=%u, this=%u, sending "
2471 "begin_reco now\n", dlm->name, 2526 "begin_reco now\n", dlm->name,
2472 dlm->reco.dead_node, dlm->node_num); 2527 dlm->reco.dead_node, dlm->node_num);
2473 status = dlm_send_begin_reco_message(dlm, 2528 status = dlm_send_begin_reco_message(dlm,
2474 dlm->reco.dead_node); 2529 dlm->reco.dead_node);
@@ -2501,7 +2556,7 @@ again:
2501 mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", 2556 mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
2502 dlm->name, dlm->node_num); 2557 dlm->name, dlm->node_num);
2503 /* another node is master. wait on 2558 /* another node is master. wait on
2504 * reco.new_master != O2NM_INVALID_NODE_NUM 2559 * reco.new_master != O2NM_INVALID_NODE_NUM
2505 * for at most one second */ 2560 * for at most one second */
2506 wait_event_timeout(dlm->dlm_reco_thread_wq, 2561 wait_event_timeout(dlm->dlm_reco_thread_wq,
2507 dlm_reco_master_ready(dlm), 2562 dlm_reco_master_ready(dlm),
@@ -2589,9 +2644,23 @@ retry:
2589 "begin reco msg (%d)\n", dlm->name, nodenum, ret); 2644 "begin reco msg (%d)\n", dlm->name, nodenum, ret);
2590 ret = 0; 2645 ret = 0;
2591 } 2646 }
2647
2648 /*
2649 * Prior to commit aad1b15310b9bcd59fa81ab8f2b1513b59553ea8,
2650 * dlm_begin_reco_handler() returned EAGAIN and not -EAGAIN.
2651 * We are handling both for compatibility reasons.
2652 */
2653 if (ret == -EAGAIN || ret == EAGAIN) {
2654 mlog(0, "%s: trying to start recovery of node "
2655 "%u, but node %u is waiting for last recovery "
2656 "to complete, backoff for a bit\n", dlm->name,
2657 dead_node, nodenum);
2658 msleep(100);
2659 goto retry;
2660 }
2592 if (ret < 0) { 2661 if (ret < 0) {
2593 struct dlm_lock_resource *res; 2662 struct dlm_lock_resource *res;
2594 /* this is now a serious problem, possibly ENOMEM 2663 /* this is now a serious problem, possibly ENOMEM
2595 * in the network stack. must retry */ 2664 * in the network stack. must retry */
2596 mlog_errno(ret); 2665 mlog_errno(ret);
2597 mlog(ML_ERROR, "begin reco of dlm %s to node %u " 2666 mlog(ML_ERROR, "begin reco of dlm %s to node %u "
@@ -2604,18 +2673,10 @@ retry:
2604 } else { 2673 } else {
2605 mlog(ML_ERROR, "recovery lock not found\n"); 2674 mlog(ML_ERROR, "recovery lock not found\n");
2606 } 2675 }
2607 /* sleep for a bit in hopes that we can avoid 2676 /* sleep for a bit in hopes that we can avoid
2608 * another ENOMEM */ 2677 * another ENOMEM */
2609 msleep(100); 2678 msleep(100);
2610 goto retry; 2679 goto retry;
2611 } else if (ret == EAGAIN) {
2612 mlog(0, "%s: trying to start recovery of node "
2613 "%u, but node %u is waiting for last recovery "
2614 "to complete, backoff for a bit\n", dlm->name,
2615 dead_node, nodenum);
2616 /* TODO Look into replacing msleep with cond_resched() */
2617 msleep(100);
2618 goto retry;
2619 } 2680 }
2620 } 2681 }
2621 2682
@@ -2639,7 +2700,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2639 dlm->name, br->node_idx, br->dead_node, 2700 dlm->name, br->node_idx, br->dead_node,
2640 dlm->reco.dead_node, dlm->reco.new_master); 2701 dlm->reco.dead_node, dlm->reco.new_master);
2641 spin_unlock(&dlm->spinlock); 2702 spin_unlock(&dlm->spinlock);
2642 return EAGAIN; 2703 return -EAGAIN;
2643 } 2704 }
2644 spin_unlock(&dlm->spinlock); 2705 spin_unlock(&dlm->spinlock);
2645 2706
@@ -2664,7 +2725,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2664 } 2725 }
2665 if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { 2726 if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
2666 mlog(ML_NOTICE, "%s: dead_node previously set to %u, " 2727 mlog(ML_NOTICE, "%s: dead_node previously set to %u, "
2667 "node %u changing it to %u\n", dlm->name, 2728 "node %u changing it to %u\n", dlm->name,
2668 dlm->reco.dead_node, br->node_idx, br->dead_node); 2729 dlm->reco.dead_node, br->node_idx, br->dead_node);
2669 } 2730 }
2670 dlm_set_reco_master(dlm, br->node_idx); 2731 dlm_set_reco_master(dlm, br->node_idx);
@@ -2730,8 +2791,8 @@ stage2:
2730 if (ret < 0) { 2791 if (ret < 0) {
2731 mlog_errno(ret); 2792 mlog_errno(ret);
2732 if (dlm_is_host_down(ret)) { 2793 if (dlm_is_host_down(ret)) {
2733 /* this has no effect on this recovery 2794 /* this has no effect on this recovery
2734 * session, so set the status to zero to 2795 * session, so set the status to zero to
2735 * finish out the last recovery */ 2796 * finish out the last recovery */
2736 mlog(ML_ERROR, "node %u went down after this " 2797 mlog(ML_ERROR, "node %u went down after this "
2737 "node finished recovery.\n", nodenum); 2798 "node finished recovery.\n", nodenum);
@@ -2768,7 +2829,7 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2768 mlog(0, "%s: node %u finalizing recovery stage%d of " 2829 mlog(0, "%s: node %u finalizing recovery stage%d of "
2769 "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage, 2830 "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage,
2770 fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master); 2831 fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master);
2771 2832
2772 spin_lock(&dlm->spinlock); 2833 spin_lock(&dlm->spinlock);
2773 2834
2774 if (dlm->reco.new_master != fr->node_idx) { 2835 if (dlm->reco.new_master != fr->node_idx) {
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 52ec020ea78b..11a6d1fd1d35 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 00f53b2aea76..b47c1b92b82b 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -190,8 +189,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
190 actions &= ~(DLM_UNLOCK_REMOVE_LOCK| 189 actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
191 DLM_UNLOCK_REGRANT_LOCK| 190 DLM_UNLOCK_REGRANT_LOCK|
192 DLM_UNLOCK_CLEAR_CONVERT_TYPE); 191 DLM_UNLOCK_CLEAR_CONVERT_TYPE);
193 } else if (status == DLM_RECOVERING || 192 } else if (status == DLM_RECOVERING ||
194 status == DLM_MIGRATING || 193 status == DLM_MIGRATING ||
195 status == DLM_FORWARD) { 194 status == DLM_FORWARD) {
196 /* must clear the actions because this unlock 195 /* must clear the actions because this unlock
197 * is about to be retried. cannot free or do 196 * is about to be retried. cannot free or do
@@ -661,14 +660,14 @@ retry:
661 if (call_ast) { 660 if (call_ast) {
662 mlog(0, "calling unlockast(%p, %d)\n", data, status); 661 mlog(0, "calling unlockast(%p, %d)\n", data, status);
663 if (is_master) { 662 if (is_master) {
664 /* it is possible that there is one last bast 663 /* it is possible that there is one last bast
665 * pending. make sure it is flushed, then 664 * pending. make sure it is flushed, then
666 * call the unlockast. 665 * call the unlockast.
667 * not an issue if this is mastered remotely, 666 * not an issue if this is mastered remotely,
668 * since this lock has been removed from the 667 * since this lock has been removed from the
669 * lockres queues and cannot be found. */ 668 * lockres queues and cannot be found. */
670 dlm_kick_thread(dlm, NULL); 669 dlm_kick_thread(dlm, NULL);
671 wait_event(dlm->ast_wq, 670 wait_event(dlm->ast_wq,
672 dlm_lock_basts_flushed(dlm, lock)); 671 dlm_lock_basts_flushed(dlm, lock));
673 } 672 }
674 (*unlockast)(data, status); 673 (*unlockast)(data, status);
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
new file mode 100644
index 000000000000..df69b4856d0d
--- /dev/null
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -0,0 +1,5 @@
1EXTRA_CFLAGS += -Ifs/ocfs2
2
3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
4
5ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 02bf17808bdc..1b0de157a08c 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -43,24 +43,17 @@
43#include <linux/init.h> 43#include <linux/init.h>
44#include <linux/string.h> 44#include <linux/string.h>
45#include <linux/backing-dev.h> 45#include <linux/backing-dev.h>
46#include <linux/poll.h>
46 47
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
48 49
49 50#include "stackglue.h"
50#include "cluster/nodemanager.h"
51#include "cluster/heartbeat.h"
52#include "cluster/tcp.h"
53
54#include "dlmapi.h"
55
56#include "userdlm.h" 51#include "userdlm.h"
57
58#include "dlmfsver.h" 52#include "dlmfsver.h"
59 53
60#define MLOG_MASK_PREFIX ML_DLMFS 54#define MLOG_MASK_PREFIX ML_DLMFS
61#include "cluster/masklog.h" 55#include "cluster/masklog.h"
62 56
63#include "ocfs2_lockingver.h"
64 57
65static const struct super_operations dlmfs_ops; 58static const struct super_operations dlmfs_ops;
66static const struct file_operations dlmfs_file_operations; 59static const struct file_operations dlmfs_file_operations;
@@ -71,15 +64,46 @@ static struct kmem_cache *dlmfs_inode_cache;
71 64
72struct workqueue_struct *user_dlm_worker; 65struct workqueue_struct *user_dlm_worker;
73 66
67
68
74/* 69/*
75 * This is the userdlmfs locking protocol version. 70 * These are the ABI capabilities of dlmfs.
71 *
72 * Over time, dlmfs has added some features that were not part of the
73 * initial ABI. Unfortunately, some of these features are not detectable
74 * via standard usage. For example, Linux's default poll always returns
75 * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs
76 * added poll support. Instead, we provide this list of new capabilities.
77 *
78 * Capabilities is a read-only attribute. We do it as a module parameter
79 * so we can discover it whether dlmfs is built in, loaded, or even not
80 * loaded.
76 * 81 *
77 * See fs/ocfs2/dlmglue.c for more details on locking versions. 82 * The ABI features are local to this machine's dlmfs mount. This is
83 * distinct from the locking protocol, which is concerned with inter-node
84 * interaction.
85 *
86 * Capabilities:
87 * - bast : POLLIN against the file descriptor of a held lock
88 * signifies a bast fired on the lock.
78 */ 89 */
79static const struct dlm_protocol_version user_locking_protocol = { 90#define DLMFS_CAPABILITIES "bast stackglue"
80 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, 91extern int param_set_dlmfs_capabilities(const char *val,
81 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, 92 struct kernel_param *kp)
82}; 93{
94 printk(KERN_ERR "%s: readonly parameter\n", kp->name);
95 return -EINVAL;
96}
97static int param_get_dlmfs_capabilities(char *buffer,
98 struct kernel_param *kp)
99{
100 return strlcpy(buffer, DLMFS_CAPABILITIES,
101 strlen(DLMFS_CAPABILITIES) + 1);
102}
103module_param_call(capabilities, param_set_dlmfs_capabilities,
104 param_get_dlmfs_capabilities, NULL, 0444);
105MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
106
83 107
84/* 108/*
85 * decodes a set of open flags into a valid lock level and a set of flags. 109 * decodes a set of open flags into a valid lock level and a set of flags.
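The capabilities export above relies on a standard trick for read-only module parameters: module_param_call() is given a setter that always fails and a getter that emits a fixed string, so the value is visible under /sys/module/.../parameters whether or not it can ever change. A minimal self-contained sketch of the same pattern (module and parameter names here are hypothetical):

	#include <linux/kernel.h>
	#include <linux/module.h>
	#include <linux/moduleparam.h>
	#include <linux/string.h>

	#define DEMO_CAPS "featA featB"

	/* Reject every write, making the parameter effectively immutable. */
	static int demo_set_caps(const char *val, struct kernel_param *kp)
	{
		printk(KERN_ERR "%s: read-only parameter\n", kp->name);
		return -EINVAL;
	}

	/* sysfs hands us a page-sized buffer; return the bytes we wrote. */
	static int demo_get_caps(char *buffer, struct kernel_param *kp)
	{
		return strlcpy(buffer, DEMO_CAPS, strlen(DEMO_CAPS) + 1);
	}

	module_param_call(capabilities, demo_set_caps, demo_get_caps, NULL, 0444);
	MODULE_PARM_DESC(capabilities, DEMO_CAPS);
	MODULE_LICENSE("GPL");

Userspace can then probe with cat /sys/module/<module>/parameters/capabilities and look for "bast" before relying on poll(2).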
@@ -179,13 +203,46 @@ static int dlmfs_file_release(struct inode *inode,
179 return 0; 203 return 0;
180} 204}
181 205
206/*
207 * We do ->setattr() just to override size changes. Our size is the size
208 * of the LVB and nothing else.
209 */
210static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
211{
212 int error;
213 struct inode *inode = dentry->d_inode;
214
215 attr->ia_valid &= ~ATTR_SIZE;
216 error = inode_change_ok(inode, attr);
217 if (!error)
218 error = inode_setattr(inode, attr);
219
220 return error;
221}
222
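From userspace, the effect of masking ATTR_SIZE is that truncate(2) on a lock file succeeds but changes nothing; the inode's size stays pinned to the LVB size. A hypothetical demonstration (the mount point and lock path are illustrative):

	#include <stdio.h>
	#include <sys/stat.h>
	#include <unistd.h>

	int main(void)
	{
		struct stat st;

		if (truncate("/dlm/mydomain/mylock", 0) == 0)
			puts("truncate(2) returned success...");
		if (stat("/dlm/mydomain/mylock", &st) == 0)
			printf("...but st_size is still %lld\n",
			       (long long)st.st_size);
		return 0;
	}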
223static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
224{
225 int event = 0;
226 struct inode *inode = file->f_path.dentry->d_inode;
227 struct dlmfs_inode_private *ip = DLMFS_I(inode);
228
229 poll_wait(file, &ip->ip_lockres.l_event, wait);
230
231 spin_lock(&ip->ip_lockres.l_lock);
232 if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED)
233 event = POLLIN | POLLRDNORM;
234 spin_unlock(&ip->ip_lockres.l_lock);
235
236 return event;
237}
238
182static ssize_t dlmfs_file_read(struct file *filp, 239static ssize_t dlmfs_file_read(struct file *filp,
183 char __user *buf, 240 char __user *buf,
184 size_t count, 241 size_t count,
185 loff_t *ppos) 242 loff_t *ppos)
186{ 243{
187 int bytes_left; 244 int bytes_left;
188 ssize_t readlen; 245 ssize_t readlen, got;
189 char *lvb_buf; 246 char *lvb_buf;
190 struct inode *inode = filp->f_path.dentry->d_inode; 247 struct inode *inode = filp->f_path.dentry->d_inode;
191 248
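dlmfs_file_poll() above is what the new "bast" capability advertises: POLLIN on a held lock's file descriptor means another node is blocked on the lock. A hedged userspace sketch (paths are illustrative; per dlmfs convention, O_RDONLY takes a shared lock that is dropped on close):

	#include <fcntl.h>
	#include <poll.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/dlm/mydomain/mylock", O_RDONLY);
		struct pollfd pfd;

		if (fd < 0)
			return 1;
		pfd.fd = fd;
		pfd.events = POLLIN;

		/* Block until a bast fires, i.e. someone wants this lock. */
		if (poll(&pfd, 1, -1) == 1 && (pfd.revents & POLLIN))
			puts("lock is blocking another node; releasing");
		close(fd);	/* dropping the fd releases the lock */
		return 0;
	}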
@@ -211,9 +268,13 @@ static ssize_t dlmfs_file_read(struct file *filp,
211 if (!lvb_buf) 268 if (!lvb_buf)
212 return -ENOMEM; 269 return -ENOMEM;
213 270
214 user_dlm_read_lvb(inode, lvb_buf, readlen); 271 got = user_dlm_read_lvb(inode, lvb_buf, readlen);
215 bytes_left = __copy_to_user(buf, lvb_buf, readlen); 272 if (got) {
216 readlen -= bytes_left; 273 BUG_ON(got != readlen);
274 bytes_left = __copy_to_user(buf, lvb_buf, readlen);
275 readlen -= bytes_left;
276 } else
277 readlen = 0;
217 278
218 kfree(lvb_buf); 279 kfree(lvb_buf);
219 280
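With user_dlm_read_lvb() now returning the number of bytes copied, a read(2) of a lock file can legitimately return 0 when the value block is not valid (for instance after the previous EX holder died without writing it). A hypothetical caller should treat that as "no LVB", not as EOF of a once-valid value:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char lvb[64];	/* DLM_LVB_LEN is the real upper bound */
		int fd = open("/dlm/mydomain/mylock", O_RDONLY);
		ssize_t n;

		if (fd < 0)
			return 1;
		n = read(fd, lvb, sizeof(lvb));
		if (n == 0)
			fprintf(stderr, "LVB not valid on this lock\n");
		else if (n > 0)
			printf("read %zd LVB bytes\n", n);
		close(fd);
		return 0;
	}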
@@ -272,7 +333,7 @@ static void dlmfs_init_once(void *foo)
272 struct dlmfs_inode_private *ip = 333 struct dlmfs_inode_private *ip =
273 (struct dlmfs_inode_private *) foo; 334 (struct dlmfs_inode_private *) foo;
274 335
275 ip->ip_dlm = NULL; 336 ip->ip_conn = NULL;
276 ip->ip_parent = NULL; 337 ip->ip_parent = NULL;
277 338
278 inode_init_once(&ip->ip_vfs_inode); 339 inode_init_once(&ip->ip_vfs_inode);
@@ -314,14 +375,14 @@ static void dlmfs_clear_inode(struct inode *inode)
314 goto clear_fields; 375 goto clear_fields;
315 } 376 }
316 377
317 mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm); 378 mlog(0, "we're a directory, ip->ip_conn = 0x%p\n", ip->ip_conn);
318 /* we must be a directory. If required, lets unregister the 379 /* we must be a directory. If required, lets unregister the
319 * dlm context now. */ 380 * dlm context now. */
320 if (ip->ip_dlm) 381 if (ip->ip_conn)
321 user_dlm_unregister_context(ip->ip_dlm); 382 user_dlm_unregister(ip->ip_conn);
322clear_fields: 383clear_fields:
323 ip->ip_parent = NULL; 384 ip->ip_parent = NULL;
324 ip->ip_dlm = NULL; 385 ip->ip_conn = NULL;
325} 386}
326 387
327static struct backing_dev_info dlmfs_backing_dev_info = { 388static struct backing_dev_info dlmfs_backing_dev_info = {
@@ -371,7 +432,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
371 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 432 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
372 433
373 ip = DLMFS_I(inode); 434 ip = DLMFS_I(inode);
374 ip->ip_dlm = DLMFS_I(parent)->ip_dlm; 435 ip->ip_conn = DLMFS_I(parent)->ip_conn;
375 436
376 switch (mode & S_IFMT) { 437 switch (mode & S_IFMT) {
377 default: 438 default:
@@ -425,13 +486,12 @@ static int dlmfs_mkdir(struct inode * dir,
425 struct inode *inode = NULL; 486 struct inode *inode = NULL;
426 struct qstr *domain = &dentry->d_name; 487 struct qstr *domain = &dentry->d_name;
427 struct dlmfs_inode_private *ip; 488 struct dlmfs_inode_private *ip;
428 struct dlm_ctxt *dlm; 489 struct ocfs2_cluster_connection *conn;
429 struct dlm_protocol_version proto = user_locking_protocol;
430 490
431 mlog(0, "mkdir %.*s\n", domain->len, domain->name); 491 mlog(0, "mkdir %.*s\n", domain->len, domain->name);
432 492
433 /* verify that we have a proper domain */ 493 /* verify that we have a proper domain */
434 if (domain->len >= O2NM_MAX_NAME_LEN) { 494 if (domain->len >= GROUP_NAME_MAX) {
435 status = -EINVAL; 495 status = -EINVAL;
436 mlog(ML_ERROR, "invalid domain name for directory.\n"); 496 mlog(ML_ERROR, "invalid domain name for directory.\n");
437 goto bail; 497 goto bail;
@@ -446,14 +506,14 @@ static int dlmfs_mkdir(struct inode * dir,
446 506
447 ip = DLMFS_I(inode); 507 ip = DLMFS_I(inode);
448 508
449 dlm = user_dlm_register_context(domain, &proto); 509 conn = user_dlm_register(domain);
450 if (IS_ERR(dlm)) { 510 if (IS_ERR(conn)) {
451 status = PTR_ERR(dlm); 511 status = PTR_ERR(conn);
452 mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n", 512 mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
453 status, domain->len, domain->name); 513 status, domain->len, domain->name);
454 goto bail; 514 goto bail;
455 } 515 }
456 ip->ip_dlm = dlm; 516 ip->ip_conn = conn;
457 517
458 inc_nlink(dir); 518 inc_nlink(dir);
459 d_instantiate(dentry, inode); 519 d_instantiate(dentry, inode);
@@ -549,6 +609,7 @@ static int dlmfs_fill_super(struct super_block * sb,
549static const struct file_operations dlmfs_file_operations = { 609static const struct file_operations dlmfs_file_operations = {
550 .open = dlmfs_file_open, 610 .open = dlmfs_file_open,
551 .release = dlmfs_file_release, 611 .release = dlmfs_file_release,
612 .poll = dlmfs_file_poll,
552 .read = dlmfs_file_read, 613 .read = dlmfs_file_read,
553 .write = dlmfs_file_write, 614 .write = dlmfs_file_write,
554}; 615};
@@ -576,6 +637,7 @@ static const struct super_operations dlmfs_ops = {
576 637
577static const struct inode_operations dlmfs_file_inode_operations = { 638static const struct inode_operations dlmfs_file_inode_operations = {
578 .getattr = simple_getattr, 639 .getattr = simple_getattr,
640 .setattr = dlmfs_file_setattr,
579}; 641};
580 642
581static int dlmfs_get_sb(struct file_system_type *fs_type, 643static int dlmfs_get_sb(struct file_system_type *fs_type,
@@ -620,6 +682,7 @@ static int __init init_dlmfs_fs(void)
620 } 682 }
621 cleanup_worker = 1; 683 cleanup_worker = 1;
622 684
685 user_dlm_set_locking_protocol();
623 status = register_filesystem(&dlmfs_fs_type); 686 status = register_filesystem(&dlmfs_fs_type);
624bail: 687bail:
625 if (status) { 688 if (status) {
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c
index a733b3321f83..a733b3321f83 100644
--- a/fs/ocfs2/dlm/dlmfsver.c
+++ b/fs/ocfs2/dlmfs/dlmfsver.c
diff --git a/fs/ocfs2/dlm/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h
index f35eadbed25c..f35eadbed25c 100644
--- a/fs/ocfs2/dlm/dlmfsver.h
+++ b/fs/ocfs2/dlmfs/dlmfsver.h
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 4cb1d3dae250..0499e3fb7bdb 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -34,18 +34,19 @@
34#include <linux/types.h> 34#include <linux/types.h>
35#include <linux/crc32.h> 35#include <linux/crc32.h>
36 36
37 37#include "ocfs2_lockingver.h"
38#include "cluster/nodemanager.h" 38#include "stackglue.h"
39#include "cluster/heartbeat.h"
40#include "cluster/tcp.h"
41
42#include "dlmapi.h"
43
44#include "userdlm.h" 39#include "userdlm.h"
45 40
46#define MLOG_MASK_PREFIX ML_DLMFS 41#define MLOG_MASK_PREFIX ML_DLMFS
47#include "cluster/masklog.h" 42#include "cluster/masklog.h"
48 43
44
45static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
46{
47 return container_of(lksb, struct user_lock_res, l_lksb);
48}
49
49static inline int user_check_wait_flag(struct user_lock_res *lockres, 50static inline int user_check_wait_flag(struct user_lock_res *lockres,
50 int flag) 51 int flag)
51{ 52{
@@ -73,15 +74,15 @@ static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
73} 74}
74 75
75/* I heart container_of... */ 76/* I heart container_of... */
76static inline struct dlm_ctxt * 77static inline struct ocfs2_cluster_connection *
77dlm_ctxt_from_user_lockres(struct user_lock_res *lockres) 78cluster_connection_from_user_lockres(struct user_lock_res *lockres)
78{ 79{
79 struct dlmfs_inode_private *ip; 80 struct dlmfs_inode_private *ip;
80 81
81 ip = container_of(lockres, 82 ip = container_of(lockres,
82 struct dlmfs_inode_private, 83 struct dlmfs_inode_private,
83 ip_lockres); 84 ip_lockres);
84 return ip->ip_dlm; 85 return ip->ip_conn;
85} 86}
86 87
87static struct inode * 88static struct inode *
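The two container_of() helpers above are the heart of this conversion: the stackglue callbacks no longer carry an opaque astarg, so the lksb must be embedded by value in the structure that owns it, and the callback climbs back out. A self-contained sketch with stand-in types (the real ones are struct ocfs2_dlm_lksb and struct user_lock_res):

	#include <linux/kernel.h>

	struct demo_lksb {
		int status;
	};

	struct demo_lock_res {
		const char *name;
		struct demo_lksb lksb;	/* embedded by value, never a pointer */
	};

	static inline struct demo_lock_res *demo_lksb_to_res(struct demo_lksb *lksb)
	{
		return container_of(lksb, struct demo_lock_res, lksb);
	}

	/* Shape of a post-conversion AST: only the lksb comes in. */
	static void demo_lock_ast(struct demo_lksb *lksb)
	{
		struct demo_lock_res *res = demo_lksb_to_res(lksb);

		pr_info("AST for %s, status %d\n", res->name, res->lksb.status);
	}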
@@ -103,9 +104,9 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
103} 104}
104 105
105#define user_log_dlm_error(_func, _stat, _lockres) do { \ 106#define user_log_dlm_error(_func, _stat, _lockres) do { \
106 mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ 107 mlog(ML_ERROR, "Dlm error %d while calling %s on " \
107 "resource %.*s: %s\n", dlm_errname(_stat), _func, \ 108 "resource %.*s\n", _stat, _func, \
108 _lockres->l_namelen, _lockres->l_name, dlm_errmsg(_stat)); \ 109 _lockres->l_namelen, _lockres->l_name); \
109} while (0) 110} while (0)
110 111
111/* WARNING: This function lives in a world where the only three lock 112/* WARNING: This function lives in a world where the only three lock
@@ -113,34 +114,35 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
113 * lock types are added. */ 114 * lock types are added. */
114static inline int user_highest_compat_lock_level(int level) 115static inline int user_highest_compat_lock_level(int level)
115{ 116{
116 int new_level = LKM_EXMODE; 117 int new_level = DLM_LOCK_EX;
117 118
118 if (level == LKM_EXMODE) 119 if (level == DLM_LOCK_EX)
119 new_level = LKM_NLMODE; 120 new_level = DLM_LOCK_NL;
120 else if (level == LKM_PRMODE) 121 else if (level == DLM_LOCK_PR)
121 new_level = LKM_PRMODE; 122 new_level = DLM_LOCK_PR;
122 return new_level; 123 return new_level;
123} 124}
124 125
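user_highest_compat_lock_level() encodes one column of the classic DLM compatibility matrix, restricted to the three modes dlmfs uses: NL conflicts with nothing, PR is shared among readers, EX excludes everything. The same table as a symmetric check, a sketch using the fs/dlm mode constants:

	#include <linux/dlmconstants.h>

	static int demo_modes_compatible(int a, int b)
	{
		if (a == DLM_LOCK_NL || b == DLM_LOCK_NL)
			return 1;
		return a == DLM_LOCK_PR && b == DLM_LOCK_PR;
	}

Given a blocking mode, the helper above simply picks the highest of NL/PR/EX that demo_modes_compatible() would accept against it.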
125static void user_ast(void *opaque) 126static void user_ast(struct ocfs2_dlm_lksb *lksb)
126{ 127{
127 struct user_lock_res *lockres = opaque; 128 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
128 struct dlm_lockstatus *lksb; 129 int status;
129 130
130 mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen, 131 mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n",
131 lockres->l_name); 132 lockres->l_namelen, lockres->l_name, lockres->l_level,
133 lockres->l_requested);
132 134
133 spin_lock(&lockres->l_lock); 135 spin_lock(&lockres->l_lock);
134 136
135 lksb = &(lockres->l_lksb); 137 status = ocfs2_dlm_lock_status(&lockres->l_lksb);
136 if (lksb->status != DLM_NORMAL) { 138 if (status) {
137 mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n", 139 mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
138 lksb->status, lockres->l_namelen, lockres->l_name); 140 status, lockres->l_namelen, lockres->l_name);
139 spin_unlock(&lockres->l_lock); 141 spin_unlock(&lockres->l_lock);
140 return; 142 return;
141 } 143 }
142 144
143 mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE, 145 mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV,
144 "Lockres %.*s, requested ivmode. flags 0x%x\n", 146 "Lockres %.*s, requested ivmode. flags 0x%x\n",
145 lockres->l_namelen, lockres->l_name, lockres->l_flags); 147 lockres->l_namelen, lockres->l_name, lockres->l_flags);
146 148
@@ -148,13 +150,13 @@ static void user_ast(void *opaque)
148 if (lockres->l_requested < lockres->l_level) { 150 if (lockres->l_requested < lockres->l_level) {
149 if (lockres->l_requested <= 151 if (lockres->l_requested <=
150 user_highest_compat_lock_level(lockres->l_blocking)) { 152 user_highest_compat_lock_level(lockres->l_blocking)) {
151 lockres->l_blocking = LKM_NLMODE; 153 lockres->l_blocking = DLM_LOCK_NL;
152 lockres->l_flags &= ~USER_LOCK_BLOCKED; 154 lockres->l_flags &= ~USER_LOCK_BLOCKED;
153 } 155 }
154 } 156 }
155 157
156 lockres->l_level = lockres->l_requested; 158 lockres->l_level = lockres->l_requested;
157 lockres->l_requested = LKM_IVMODE; 159 lockres->l_requested = DLM_LOCK_IV;
158 lockres->l_flags |= USER_LOCK_ATTACHED; 160 lockres->l_flags |= USER_LOCK_ATTACHED;
159 lockres->l_flags &= ~USER_LOCK_BUSY; 161 lockres->l_flags &= ~USER_LOCK_BUSY;
160 162
@@ -193,11 +195,11 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
193 return; 195 return;
194 196
195 switch (lockres->l_blocking) { 197 switch (lockres->l_blocking) {
196 case LKM_EXMODE: 198 case DLM_LOCK_EX:
197 if (!lockres->l_ex_holders && !lockres->l_ro_holders) 199 if (!lockres->l_ex_holders && !lockres->l_ro_holders)
198 queue = 1; 200 queue = 1;
199 break; 201 break;
200 case LKM_PRMODE: 202 case DLM_LOCK_PR:
201 if (!lockres->l_ex_holders) 203 if (!lockres->l_ex_holders)
202 queue = 1; 204 queue = 1;
203 break; 205 break;
@@ -209,12 +211,12 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
209 __user_dlm_queue_lockres(lockres); 211 __user_dlm_queue_lockres(lockres);
210} 212}
211 213
212static void user_bast(void *opaque, int level) 214static void user_bast(struct ocfs2_dlm_lksb *lksb, int level)
213{ 215{
214 struct user_lock_res *lockres = opaque; 216 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
215 217
216 mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n", 218 mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n",
217 lockres->l_namelen, lockres->l_name, level); 219 lockres->l_namelen, lockres->l_name, level, lockres->l_level);
218 220
219 spin_lock(&lockres->l_lock); 221 spin_lock(&lockres->l_lock);
220 lockres->l_flags |= USER_LOCK_BLOCKED; 222 lockres->l_flags |= USER_LOCK_BLOCKED;
@@ -227,15 +229,15 @@ static void user_bast(void *opaque, int level)
227 wake_up(&lockres->l_event); 229 wake_up(&lockres->l_event);
228} 230}
229 231
230static void user_unlock_ast(void *opaque, enum dlm_status status) 232static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status)
231{ 233{
232 struct user_lock_res *lockres = opaque; 234 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
233 235
234 mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen, 236 mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n",
235 lockres->l_name); 237 lockres->l_namelen, lockres->l_name, lockres->l_flags);
236 238
237 if (status != DLM_NORMAL && status != DLM_CANCELGRANT) 239 if (status)
238 mlog(ML_ERROR, "Dlm returns status %d\n", status); 240 mlog(ML_ERROR, "dlm returns status %d\n", status);
239 241
240 spin_lock(&lockres->l_lock); 242 spin_lock(&lockres->l_lock);
241 /* The teardown flag gets set early during the unlock process, 243 /* The teardown flag gets set early during the unlock process,
@@ -243,7 +245,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
243 * for a concurrent cancel. */ 245 * for a concurrent cancel. */
244 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN 246 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN
245 && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { 247 && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
246 lockres->l_level = LKM_IVMODE; 248 lockres->l_level = DLM_LOCK_IV;
247 } else if (status == DLM_CANCELGRANT) { 249 } else if (status == DLM_CANCELGRANT) {
248 /* We tried to cancel a convert request, but it was 250 /* We tried to cancel a convert request, but it was
249 * already granted. Don't clear the busy flag - the 251 * already granted. Don't clear the busy flag - the
@@ -254,7 +256,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
254 } else { 256 } else {
255 BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); 257 BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
256 /* Cancel succeeded, we want to re-queue */ 258 /* Cancel succeeded, we want to re-queue */
257 lockres->l_requested = LKM_IVMODE; /* cancel an 259 lockres->l_requested = DLM_LOCK_IV; /* cancel an
258 * upconvert 260 * upconvert
259 * request. */ 261 * request. */
260 lockres->l_flags &= ~USER_LOCK_IN_CANCEL; 262 lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
@@ -271,6 +273,21 @@ out_noclear:
271 wake_up(&lockres->l_event); 273 wake_up(&lockres->l_event);
272} 274}
273 275
276/*
277 * This is the userdlmfs locking protocol version.
278 *
279 * See fs/ocfs2/dlmglue.c for more details on locking versions.
280 */
281static struct ocfs2_locking_protocol user_dlm_lproto = {
282 .lp_max_version = {
283 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
284 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
285 },
286 .lp_lock_ast = user_ast,
287 .lp_blocking_ast = user_bast,
288 .lp_unlock_ast = user_unlock_ast,
289};
290
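This struct is effectively a vtable the cluster stack calls back through; dlmfs supplies its three handlers and the stack plugin decides when they fire. A stand-in sketch of the dispatch shape (types simplified and names illustrative; the real plumbing lives in stackglue):

	struct demo_lksb {
		int status;
	};

	struct demo_locking_protocol {
		void (*lp_lock_ast)(struct demo_lksb *lksb);
		void (*lp_blocking_ast)(struct demo_lksb *lksb, int level);
		void (*lp_unlock_ast)(struct demo_lksb *lksb, int error);
	};

	/* What a stack plugin does, in effect, when a grant completes. */
	static void demo_grant_complete(struct demo_locking_protocol *proto,
					struct demo_lksb *lksb)
	{
		lksb->status = 0;
		proto->lp_lock_ast(lksb);
	}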
274static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres) 291static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
275{ 292{
276 struct inode *inode; 293 struct inode *inode;
@@ -283,10 +300,10 @@ static void user_dlm_unblock_lock(struct work_struct *work)
283 int new_level, status; 300 int new_level, status;
284 struct user_lock_res *lockres = 301 struct user_lock_res *lockres =
285 container_of(work, struct user_lock_res, l_work); 302 container_of(work, struct user_lock_res, l_work);
286 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 303 struct ocfs2_cluster_connection *conn =
304 cluster_connection_from_user_lockres(lockres);
287 305
288 mlog(0, "processing lockres %.*s\n", lockres->l_namelen, 306 mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
289 lockres->l_name);
290 307
291 spin_lock(&lockres->l_lock); 308 spin_lock(&lockres->l_lock);
292 309
@@ -304,17 +321,23 @@ static void user_dlm_unblock_lock(struct work_struct *work)
304 * flag, and finally we might get another bast which re-queues 321 * flag, and finally we might get another bast which re-queues
305 * us before our ast for the downconvert is called. */ 322 * us before our ast for the downconvert is called. */
306 if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { 323 if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
324 mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n",
325 lockres->l_namelen, lockres->l_name);
307 spin_unlock(&lockres->l_lock); 326 spin_unlock(&lockres->l_lock);
308 goto drop_ref; 327 goto drop_ref;
309 } 328 }
310 329
311 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { 330 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
331 mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n",
332 lockres->l_namelen, lockres->l_name);
312 spin_unlock(&lockres->l_lock); 333 spin_unlock(&lockres->l_lock);
313 goto drop_ref; 334 goto drop_ref;
314 } 335 }
315 336
316 if (lockres->l_flags & USER_LOCK_BUSY) { 337 if (lockres->l_flags & USER_LOCK_BUSY) {
317 if (lockres->l_flags & USER_LOCK_IN_CANCEL) { 338 if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
339 mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n",
340 lockres->l_namelen, lockres->l_name);
318 spin_unlock(&lockres->l_lock); 341 spin_unlock(&lockres->l_lock);
319 goto drop_ref; 342 goto drop_ref;
320 } 343 }
@@ -322,32 +345,31 @@ static void user_dlm_unblock_lock(struct work_struct *work)
322 lockres->l_flags |= USER_LOCK_IN_CANCEL; 345 lockres->l_flags |= USER_LOCK_IN_CANCEL;
323 spin_unlock(&lockres->l_lock); 346 spin_unlock(&lockres->l_lock);
324 347
325 status = dlmunlock(dlm, 348 status = ocfs2_dlm_unlock(conn, &lockres->l_lksb,
326 &lockres->l_lksb, 349 DLM_LKF_CANCEL);
327 LKM_CANCEL, 350 if (status)
328 user_unlock_ast, 351 user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
329 lockres);
330 if (status != DLM_NORMAL)
331 user_log_dlm_error("dlmunlock", status, lockres);
332 goto drop_ref; 352 goto drop_ref;
333 } 353 }
334 354
335 /* If there are still incompat holders, we can exit safely 355 /* If there are still incompat holders, we can exit safely
336 * without worrying about re-queueing this lock as that will 356 * without worrying about re-queueing this lock as that will
337 * happen on the last call to user_cluster_unlock. */ 357 * happen on the last call to user_cluster_unlock. */
338 if ((lockres->l_blocking == LKM_EXMODE) 358 if ((lockres->l_blocking == DLM_LOCK_EX)
339 && (lockres->l_ex_holders || lockres->l_ro_holders)) { 359 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
340 spin_unlock(&lockres->l_lock); 360 spin_unlock(&lockres->l_lock);
341 mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n", 361 mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n",
342 lockres->l_ro_holders, lockres->l_ex_holders); 362 lockres->l_namelen, lockres->l_name,
363 lockres->l_ex_holders, lockres->l_ro_holders);
343 goto drop_ref; 364 goto drop_ref;
344 } 365 }
345 366
346 if ((lockres->l_blocking == LKM_PRMODE) 367 if ((lockres->l_blocking == DLM_LOCK_PR)
347 && lockres->l_ex_holders) { 368 && lockres->l_ex_holders) {
348 spin_unlock(&lockres->l_lock); 369 spin_unlock(&lockres->l_lock);
349 mlog(0, "can't downconvert for pr: ex = %u\n", 370 mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n",
350 lockres->l_ex_holders); 371 lockres->l_namelen, lockres->l_name,
372 lockres->l_ex_holders);
351 goto drop_ref; 373 goto drop_ref;
352 } 374 }
353 375
@@ -355,22 +377,17 @@ static void user_dlm_unblock_lock(struct work_struct *work)
355 new_level = user_highest_compat_lock_level(lockres->l_blocking); 377 new_level = user_highest_compat_lock_level(lockres->l_blocking);
356 lockres->l_requested = new_level; 378 lockres->l_requested = new_level;
357 lockres->l_flags |= USER_LOCK_BUSY; 379 lockres->l_flags |= USER_LOCK_BUSY;
358 mlog(0, "Downconvert lock from %d to %d\n", 380 mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n",
359 lockres->l_level, new_level); 381 lockres->l_namelen, lockres->l_name, lockres->l_level, new_level);
360 spin_unlock(&lockres->l_lock); 382 spin_unlock(&lockres->l_lock);
361 383
362 /* need lock downconvert request now... */ 384 /* need lock downconvert request now... */
363 status = dlmlock(dlm, 385 status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb,
364 new_level, 386 DLM_LKF_CONVERT|DLM_LKF_VALBLK,
365 &lockres->l_lksb, 387 lockres->l_name,
366 LKM_CONVERT|LKM_VALBLK, 388 lockres->l_namelen);
367 lockres->l_name, 389 if (status) {
368 lockres->l_namelen, 390 user_log_dlm_error("ocfs2_dlm_lock", status, lockres);
369 user_ast,
370 lockres,
371 user_bast);
372 if (status != DLM_NORMAL) {
373 user_log_dlm_error("dlmlock", status, lockres);
374 user_recover_from_dlm_error(lockres); 391 user_recover_from_dlm_error(lockres);
375 } 392 }
376 393
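The old o2dlm LKM_* flags correspond directly to the fs/dlm-style DLM_LKF_* constants that stackglue expects, which is why the conversion above is largely mechanical. A trivial sketch using the real constants from <linux/dlmconstants.h>:

	#include <linux/dlmconstants.h>

	/* A few of the mappings this patch performs:
	 *   LKM_CONVERT -> DLM_LKF_CONVERT   (modify an existing lock)
	 *   LKM_VALBLK  -> DLM_LKF_VALBLK    (read/write the LVB)
	 *   LKM_CANCEL  -> DLM_LKF_CANCEL    (cancel an in-flight convert)
	 *   LKM_NOQUEUE -> DLM_LKF_NOQUEUE   (fail with -EAGAIN, don't wait)
	 */
	static inline unsigned int demo_downconvert_flags(void)
	{
		/* Convert an existing lock and ship its value block along. */
		return DLM_LKF_CONVERT | DLM_LKF_VALBLK;
	}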
@@ -382,10 +399,10 @@ static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
382 int level) 399 int level)
383{ 400{
384 switch(level) { 401 switch(level) {
385 case LKM_EXMODE: 402 case DLM_LOCK_EX:
386 lockres->l_ex_holders++; 403 lockres->l_ex_holders++;
387 break; 404 break;
388 case LKM_PRMODE: 405 case DLM_LOCK_PR:
389 lockres->l_ro_holders++; 406 lockres->l_ro_holders++;
390 break; 407 break;
391 default: 408 default:
@@ -410,20 +427,19 @@ int user_dlm_cluster_lock(struct user_lock_res *lockres,
410 int lkm_flags) 427 int lkm_flags)
411{ 428{
412 int status, local_flags; 429 int status, local_flags;
413 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 430 struct ocfs2_cluster_connection *conn =
431 cluster_connection_from_user_lockres(lockres);
414 432
415 if (level != LKM_EXMODE && 433 if (level != DLM_LOCK_EX &&
416 level != LKM_PRMODE) { 434 level != DLM_LOCK_PR) {
417 mlog(ML_ERROR, "lockres %.*s: invalid request!\n", 435 mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
418 lockres->l_namelen, lockres->l_name); 436 lockres->l_namelen, lockres->l_name);
419 status = -EINVAL; 437 status = -EINVAL;
420 goto bail; 438 goto bail;
421 } 439 }
422 440
423 mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n", 441 mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n",
424 lockres->l_namelen, lockres->l_name, 442 lockres->l_namelen, lockres->l_name, level, lkm_flags);
425 (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
426 lkm_flags);
427 443
428again: 444again:
429 if (signal_pending(current)) { 445 if (signal_pending(current)) {
@@ -457,35 +473,26 @@ again:
457 } 473 }
458 474
459 if (level > lockres->l_level) { 475 if (level > lockres->l_level) {
460 local_flags = lkm_flags | LKM_VALBLK; 476 local_flags = lkm_flags | DLM_LKF_VALBLK;
461 if (lockres->l_level != LKM_IVMODE) 477 if (lockres->l_level != DLM_LOCK_IV)
462 local_flags |= LKM_CONVERT; 478 local_flags |= DLM_LKF_CONVERT;
463 479
464 lockres->l_requested = level; 480 lockres->l_requested = level;
465 lockres->l_flags |= USER_LOCK_BUSY; 481 lockres->l_flags |= USER_LOCK_BUSY;
466 spin_unlock(&lockres->l_lock); 482 spin_unlock(&lockres->l_lock);
467 483
468 BUG_ON(level == LKM_IVMODE); 484 BUG_ON(level == DLM_LOCK_IV);
469 BUG_ON(level == LKM_NLMODE); 485 BUG_ON(level == DLM_LOCK_NL);
470 486
471 /* call dlm_lock to upgrade lock now */ 487 /* call dlm_lock to upgrade lock now */
472 status = dlmlock(dlm, 488 status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb,
473 level, 489 local_flags, lockres->l_name,
474 &lockres->l_lksb, 490 lockres->l_namelen);
475 local_flags, 491 if (status) {
476 lockres->l_name, 492 if ((lkm_flags & DLM_LKF_NOQUEUE) &&
477 lockres->l_namelen, 493 (status != -EAGAIN))
478 user_ast, 494 user_log_dlm_error("ocfs2_dlm_lock",
479 lockres, 495 status, lockres);
480 user_bast);
481 if (status != DLM_NORMAL) {
482 if ((lkm_flags & LKM_NOQUEUE) &&
483 (status == DLM_NOTQUEUED))
484 status = -EAGAIN;
485 else {
486 user_log_dlm_error("dlmlock", status, lockres);
487 status = -EINVAL;
488 }
489 user_recover_from_dlm_error(lockres); 496 user_recover_from_dlm_error(lockres);
490 goto bail; 497 goto bail;
491 } 498 }
@@ -506,11 +513,11 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
506 int level) 513 int level)
507{ 514{
508 switch(level) { 515 switch(level) {
509 case LKM_EXMODE: 516 case DLM_LOCK_EX:
510 BUG_ON(!lockres->l_ex_holders); 517 BUG_ON(!lockres->l_ex_holders);
511 lockres->l_ex_holders--; 518 lockres->l_ex_holders--;
512 break; 519 break;
513 case LKM_PRMODE: 520 case DLM_LOCK_PR:
514 BUG_ON(!lockres->l_ro_holders); 521 BUG_ON(!lockres->l_ro_holders);
515 lockres->l_ro_holders--; 522 lockres->l_ro_holders--;
516 break; 523 break;
@@ -522,8 +529,8 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
522void user_dlm_cluster_unlock(struct user_lock_res *lockres, 529void user_dlm_cluster_unlock(struct user_lock_res *lockres,
523 int level) 530 int level)
524{ 531{
525 if (level != LKM_EXMODE && 532 if (level != DLM_LOCK_EX &&
526 level != LKM_PRMODE) { 533 level != DLM_LOCK_PR) {
527 mlog(ML_ERROR, "lockres %.*s: invalid request!\n", 534 mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
528 lockres->l_namelen, lockres->l_name); 535 lockres->l_namelen, lockres->l_name);
529 return; 536 return;
@@ -540,33 +547,40 @@ void user_dlm_write_lvb(struct inode *inode,
540 unsigned int len) 547 unsigned int len)
541{ 548{
542 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; 549 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
543 char *lvb = lockres->l_lksb.lvb; 550 char *lvb;
544 551
545 BUG_ON(len > DLM_LVB_LEN); 552 BUG_ON(len > DLM_LVB_LEN);
546 553
547 spin_lock(&lockres->l_lock); 554 spin_lock(&lockres->l_lock);
548 555
549 BUG_ON(lockres->l_level < LKM_EXMODE); 556 BUG_ON(lockres->l_level < DLM_LOCK_EX);
557 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
550 memcpy(lvb, val, len); 558 memcpy(lvb, val, len);
551 559
552 spin_unlock(&lockres->l_lock); 560 spin_unlock(&lockres->l_lock);
553} 561}
554 562
555void user_dlm_read_lvb(struct inode *inode, 563ssize_t user_dlm_read_lvb(struct inode *inode,
556 char *val, 564 char *val,
557 unsigned int len) 565 unsigned int len)
558{ 566{
559 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; 567 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
560 char *lvb = lockres->l_lksb.lvb; 568 char *lvb;
569 ssize_t ret = len;
561 570
562 BUG_ON(len > DLM_LVB_LEN); 571 BUG_ON(len > DLM_LVB_LEN);
563 572
564 spin_lock(&lockres->l_lock); 573 spin_lock(&lockres->l_lock);
565 574
566 BUG_ON(lockres->l_level < LKM_PRMODE); 575 BUG_ON(lockres->l_level < DLM_LOCK_PR);
567 memcpy(val, lvb, len); 576 if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) {
577 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
578 memcpy(val, lvb, len);
579 } else
580 ret = 0;
568 581
569 spin_unlock(&lockres->l_lock); 582 spin_unlock(&lockres->l_lock);
583 return ret;
570} 584}
571 585
572void user_dlm_lock_res_init(struct user_lock_res *lockres, 586void user_dlm_lock_res_init(struct user_lock_res *lockres,
@@ -576,9 +590,9 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
576 590
577 spin_lock_init(&lockres->l_lock); 591 spin_lock_init(&lockres->l_lock);
578 init_waitqueue_head(&lockres->l_event); 592 init_waitqueue_head(&lockres->l_event);
579 lockres->l_level = LKM_IVMODE; 593 lockres->l_level = DLM_LOCK_IV;
580 lockres->l_requested = LKM_IVMODE; 594 lockres->l_requested = DLM_LOCK_IV;
581 lockres->l_blocking = LKM_IVMODE; 595 lockres->l_blocking = DLM_LOCK_IV;
582 596
583 /* should have been checked before getting here. */ 597 /* should have been checked before getting here. */
584 BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN); 598 BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
@@ -592,9 +606,10 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
592int user_dlm_destroy_lock(struct user_lock_res *lockres) 606int user_dlm_destroy_lock(struct user_lock_res *lockres)
593{ 607{
594 int status = -EBUSY; 608 int status = -EBUSY;
595 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 609 struct ocfs2_cluster_connection *conn =
610 cluster_connection_from_user_lockres(lockres);
596 611
597 mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name); 612 mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
598 613
599 spin_lock(&lockres->l_lock); 614 spin_lock(&lockres->l_lock);
600 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { 615 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
@@ -627,14 +642,9 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
627 lockres->l_flags |= USER_LOCK_BUSY; 642 lockres->l_flags |= USER_LOCK_BUSY;
628 spin_unlock(&lockres->l_lock); 643 spin_unlock(&lockres->l_lock);
629 644
630 status = dlmunlock(dlm, 645 status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK);
631 &lockres->l_lksb, 646 if (status) {
632 LKM_VALBLK, 647 user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
633 user_unlock_ast,
634 lockres);
635 if (status != DLM_NORMAL) {
636 user_log_dlm_error("dlmunlock", status, lockres);
637 status = -EINVAL;
638 goto bail; 648 goto bail;
639 } 649 }
640 650
@@ -645,32 +655,34 @@ bail:
645 return status; 655 return status;
646} 656}
647 657
648struct dlm_ctxt *user_dlm_register_context(struct qstr *name, 658static void user_dlm_recovery_handler_noop(int node_num,
649 struct dlm_protocol_version *proto) 659 void *recovery_data)
650{ 660{
651 struct dlm_ctxt *dlm; 661 /* We ignore recovery events */
652 u32 dlm_key; 662 return;
653 char *domain; 663}
654
655 domain = kmalloc(name->len + 1, GFP_NOFS);
656 if (!domain) {
657 mlog_errno(-ENOMEM);
658 return ERR_PTR(-ENOMEM);
659 }
660 664
661 dlm_key = crc32_le(0, name->name, name->len); 665void user_dlm_set_locking_protocol(void)
666{
667 ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version);
668}
662 669
663 snprintf(domain, name->len + 1, "%.*s", name->len, name->name); 670struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name)
671{
672 int rc;
673 struct ocfs2_cluster_connection *conn;
664 674
665 dlm = dlm_register_domain(domain, dlm_key, proto); 675 rc = ocfs2_cluster_connect_agnostic(name->name, name->len,
666 if (IS_ERR(dlm)) 676 &user_dlm_lproto,
667 mlog_errno(PTR_ERR(dlm)); 677 user_dlm_recovery_handler_noop,
678 NULL, &conn);
679 if (rc)
680 mlog_errno(rc);
668 681
669 kfree(domain); 682 return rc ? ERR_PTR(rc) : conn;
670 return dlm;
671} 683}
672 684
673void user_dlm_unregister_context(struct dlm_ctxt *dlm) 685void user_dlm_unregister(struct ocfs2_cluster_connection *conn)
674{ 686{
675 dlm_unregister_domain(dlm); 687 ocfs2_cluster_disconnect(conn, 0);
676} 688}
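user_dlm_register() folds an integer errno into its pointer return with the standard ERR_PTR()/IS_ERR()/PTR_ERR() trio, which is why dlmfs_mkdir() earlier in the patch can test IS_ERR(conn). A minimal sketch of the convention (demo_connect is a hypothetical stand-in for the real connect call):

	#include <linux/err.h>
	#include <linux/errno.h>

	struct demo_conn {
		int id;
	};

	static struct demo_conn demo_the_conn = { .id = 1 };

	static struct demo_conn *demo_connect(int make_it_fail)
	{
		if (make_it_fail)
			return ERR_PTR(-ENOTCONN);	/* errno hides in the pointer */
		return &demo_the_conn;
	}

	static int demo_caller(void)
	{
		struct demo_conn *conn = demo_connect(0);

		if (IS_ERR(conn))
			return PTR_ERR(conn);		/* recover the errno */
		return conn->id;
	}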
diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h
index 0c3cc03c61fa..3b42d79531d7 100644
--- a/fs/ocfs2/dlm/userdlm.h
+++ b/fs/ocfs2/dlmfs/userdlm.h
@@ -57,7 +57,7 @@ struct user_lock_res {
57 int l_level; 57 int l_level;
58 unsigned int l_ro_holders; 58 unsigned int l_ro_holders;
59 unsigned int l_ex_holders; 59 unsigned int l_ex_holders;
60 struct dlm_lockstatus l_lksb; 60 struct ocfs2_dlm_lksb l_lksb;
61 61
62 int l_requested; 62 int l_requested;
63 int l_blocking; 63 int l_blocking;
@@ -80,15 +80,15 @@ void user_dlm_cluster_unlock(struct user_lock_res *lockres,
80void user_dlm_write_lvb(struct inode *inode, 80void user_dlm_write_lvb(struct inode *inode,
81 const char *val, 81 const char *val,
82 unsigned int len); 82 unsigned int len);
83void user_dlm_read_lvb(struct inode *inode, 83ssize_t user_dlm_read_lvb(struct inode *inode,
84 char *val, 84 char *val,
85 unsigned int len); 85 unsigned int len);
86struct dlm_ctxt *user_dlm_register_context(struct qstr *name, 86struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name);
87 struct dlm_protocol_version *proto); 87void user_dlm_unregister(struct ocfs2_cluster_connection *conn);
88void user_dlm_unregister_context(struct dlm_ctxt *dlm); 88void user_dlm_set_locking_protocol(void);
89 89
90struct dlmfs_inode_private { 90struct dlmfs_inode_private {
91 struct dlm_ctxt *ip_dlm; 91 struct ocfs2_cluster_connection *ip_conn;
92 92
93 struct user_lock_res ip_lockres; /* unused for directories. */ 93 struct user_lock_res ip_lockres; /* unused for directories. */
94 struct inode *ip_parent; 94 struct inode *ip_parent;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 0d38d67194cb..50c4ee805da4 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -297,6 +297,11 @@ static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
297 lockres->l_type == OCFS2_LOCK_TYPE_OPEN; 297 lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
298} 298}
299 299
300static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
301{
302 return container_of(lksb, struct ocfs2_lock_res, l_lksb);
303}
304
300static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) 305static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
301{ 306{
302 BUG_ON(!ocfs2_is_inode_lock(lockres)); 307 BUG_ON(!ocfs2_is_inode_lock(lockres));
@@ -875,6 +880,14 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
875 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); 880 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
876 881
877 lockres->l_level = lockres->l_requested; 882 lockres->l_level = lockres->l_requested;
883
884 /*
885 * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing
886 * the OCFS2_LOCK_BUSY flag to prevent the dc thread from
887 * downconverting the lock before the upconvert has fully completed.
888 */
889 lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
890
878 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 891 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
879 892
880 mlog_exit_void(); 893 mlog_exit_void();
@@ -907,8 +920,6 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
907 920
908 assert_spin_locked(&lockres->l_lock); 921 assert_spin_locked(&lockres->l_lock);
909 922
910 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
911
912 if (level > lockres->l_blocking) { 923 if (level > lockres->l_blocking) {
913 /* only schedule a downconvert if we haven't already scheduled 924 /* only schedule a downconvert if we haven't already scheduled
914 * one that goes low enough to satisfy the level we're 925 * one that goes low enough to satisfy the level we're
@@ -921,6 +932,13 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
921 lockres->l_blocking = level; 932 lockres->l_blocking = level;
922 } 933 }
923 934
935 mlog(ML_BASTS, "lockres %s, block %d, level %d, l_block %d, dwn %d\n",
936 lockres->l_name, level, lockres->l_level, lockres->l_blocking,
937 needs_downconvert);
938
939 if (needs_downconvert)
940 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
941
924 mlog_exit(needs_downconvert); 942 mlog_exit(needs_downconvert);
925 return needs_downconvert; 943 return needs_downconvert;
926} 944}
@@ -1031,18 +1049,17 @@ static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
1031 return lockres->l_pending_gen; 1049 return lockres->l_pending_gen;
1032} 1050}
1033 1051
1034 1052static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level)
1035static void ocfs2_blocking_ast(void *opaque, int level)
1036{ 1053{
1037 struct ocfs2_lock_res *lockres = opaque; 1054 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1038 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); 1055 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1039 int needs_downconvert; 1056 int needs_downconvert;
1040 unsigned long flags; 1057 unsigned long flags;
1041 1058
1042 BUG_ON(level <= DLM_LOCK_NL); 1059 BUG_ON(level <= DLM_LOCK_NL);
1043 1060
1044 mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n", 1061 mlog(ML_BASTS, "BAST fired for lockres %s, blocking %d, level %d, "
1045 lockres->l_name, level, lockres->l_level, 1062 "type %s\n", lockres->l_name, level, lockres->l_level,
1046 ocfs2_lock_type_string(lockres->l_type)); 1063 ocfs2_lock_type_string(lockres->l_type));
1047 1064
1048 /* 1065 /*
@@ -1063,9 +1080,9 @@ static void ocfs2_blocking_ast(void *opaque, int level)
1063 ocfs2_wake_downconvert_thread(osb); 1080 ocfs2_wake_downconvert_thread(osb);
1064} 1081}
1065 1082
1066static void ocfs2_locking_ast(void *opaque) 1083static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb)
1067{ 1084{
1068 struct ocfs2_lock_res *lockres = opaque; 1085 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1069 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); 1086 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1070 unsigned long flags; 1087 unsigned long flags;
1071 int status; 1088 int status;
@@ -1086,6 +1103,10 @@ static void ocfs2_locking_ast(void *opaque)
1086 return; 1103 return;
1087 } 1104 }
1088 1105
1106 mlog(ML_BASTS, "AST fired for lockres %s, action %d, unlock %d, "
1107 "level %d => %d\n", lockres->l_name, lockres->l_action,
1108 lockres->l_unlock_action, lockres->l_level, lockres->l_requested);
1109
1089 switch(lockres->l_action) { 1110 switch(lockres->l_action) {
1090 case OCFS2_AST_ATTACH: 1111 case OCFS2_AST_ATTACH:
1091 ocfs2_generic_handle_attach_action(lockres); 1112 ocfs2_generic_handle_attach_action(lockres);
@@ -1098,8 +1119,8 @@ static void ocfs2_locking_ast(void *opaque)
1098 ocfs2_generic_handle_downconvert_action(lockres); 1119 ocfs2_generic_handle_downconvert_action(lockres);
1099 break; 1120 break;
1100 default: 1121 default:
1101 mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u " 1122 mlog(ML_ERROR, "lockres %s: AST fired with invalid action: %u, "
1102 "lockres flags = 0x%lx, unlock action: %u\n", 1123 "flags 0x%lx, unlock: %u\n",
1103 lockres->l_name, lockres->l_action, lockres->l_flags, 1124 lockres->l_name, lockres->l_action, lockres->l_flags,
1104 lockres->l_unlock_action); 1125 lockres->l_unlock_action);
1105 BUG(); 1126 BUG();
@@ -1125,6 +1146,88 @@ out:
1125 spin_unlock_irqrestore(&lockres->l_lock, flags); 1146 spin_unlock_irqrestore(&lockres->l_lock, flags);
1126} 1147}
1127 1148
1149static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
1150{
1151 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1152 unsigned long flags;
1153
1154 mlog_entry_void();
1155
1156 mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n",
1157 lockres->l_name, lockres->l_unlock_action);
1158
1159 spin_lock_irqsave(&lockres->l_lock, flags);
1160 if (error) {
1161 mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
1162 "unlock_action %d\n", error, lockres->l_name,
1163 lockres->l_unlock_action);
1164 spin_unlock_irqrestore(&lockres->l_lock, flags);
1165 mlog_exit_void();
1166 return;
1167 }
1168
1169 switch(lockres->l_unlock_action) {
1170 case OCFS2_UNLOCK_CANCEL_CONVERT:
1171 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
1172 lockres->l_action = OCFS2_AST_INVALID;
1173 /* Downconvert thread may have requeued this lock, we
1174 * need to wake it. */
1175 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
1176 ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
1177 break;
1178 case OCFS2_UNLOCK_DROP_LOCK:
1179 lockres->l_level = DLM_LOCK_IV;
1180 break;
1181 default:
1182 BUG();
1183 }
1184
1185 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
1186 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
1187 wake_up(&lockres->l_event);
1188 spin_unlock_irqrestore(&lockres->l_lock, flags);
1189
1190 mlog_exit_void();
1191}
1192
1193/*
1194 * This is the filesystem locking protocol. It provides the lock handling
1195 * hooks for the underlying DLM. It has a maximum version number.
1196 * The version number allows interoperability with systems running at
1197 * the same major number and an equal or smaller minor number.
1198 *
1199 * Whenever the filesystem does new things with locks (adds or removes a
1200 * lock, orders them differently, does different things underneath a lock),
1201 * the version must be changed. The protocol is negotiated when joining
1202 * the dlm domain. A node may join the domain if its major version is
1203 * identical to all other nodes and its minor version is greater than
1204 * or equal to all other nodes. When its minor version is greater than
1205 * the other nodes, it will run at the minor version specified by the
1206 * other nodes.
1207 *
1208 * If a locking change is made that will not be compatible with older
1209 * versions, the major number must be increased and the minor version set
1210 * to zero. If a change merely adds a behavior that can be disabled when
1211 * speaking to older versions, the minor version must be increased. If a
1212 * change adds a fully backwards compatible change (eg, LVB changes that
1213 * are just ignored by older versions), the version does not need to be
1214 * updated.
1215 */
1216static struct ocfs2_locking_protocol lproto = {
1217 .lp_max_version = {
1218 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
1219 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
1220 },
1221 .lp_lock_ast = ocfs2_locking_ast,
1222 .lp_blocking_ast = ocfs2_blocking_ast,
1223 .lp_unlock_ast = ocfs2_unlock_ast,
1224};
1225
1226void ocfs2_set_locking_protocol(void)
1227{
1228 ocfs2_stack_glue_set_max_proto_version(&lproto.lp_max_version);
1229}
1230
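The join rule in the comment above reduces to two comparisons. Here is the same logic as a hedged sketch (struct and function names are illustrative; the field names follow struct ocfs2_protocol_version):

	#include <linux/errno.h>
	#include <linux/types.h>

	struct demo_proto_version {
		u8 pv_major;
		u8 pv_minor;
	};

	/* May 'joiner' enter a domain already running at 'running'? */
	static int demo_may_join(const struct demo_proto_version *running,
				 const struct demo_proto_version *joiner)
	{
		if (joiner->pv_major != running->pv_major)
			return -EINVAL;		/* majors must match exactly */
		if (joiner->pv_minor < running->pv_minor)
			return -EINVAL;		/* joiner is too old */
		return 0;			/* joiner runs at running->pv_minor */
	}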
1128static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, 1231static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
1129 int convert) 1232 int convert)
1130{ 1233{
@@ -1133,6 +1236,7 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
1133 mlog_entry_void(); 1236 mlog_entry_void();
1134 spin_lock_irqsave(&lockres->l_lock, flags); 1237 spin_lock_irqsave(&lockres->l_lock, flags);
1135 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 1238 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
1239 lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
1136 if (convert) 1240 if (convert)
1137 lockres->l_action = OCFS2_AST_INVALID; 1241 lockres->l_action = OCFS2_AST_INVALID;
1138 else 1242 else
@@ -1179,8 +1283,7 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
1179 &lockres->l_lksb, 1283 &lockres->l_lksb,
1180 dlm_flags, 1284 dlm_flags,
1181 lockres->l_name, 1285 lockres->l_name,
1182 OCFS2_LOCK_ID_MAX_LEN - 1, 1286 OCFS2_LOCK_ID_MAX_LEN - 1);
1183 lockres);
1184 lockres_clear_pending(lockres, gen, osb); 1287 lockres_clear_pending(lockres, gen, osb);
1185 if (ret) { 1288 if (ret) {
1186 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 1289 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -1323,13 +1426,13 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
1323again: 1426again:
1324 wait = 0; 1427 wait = 0;
1325 1428
1429 spin_lock_irqsave(&lockres->l_lock, flags);
1430
1326 if (catch_signals && signal_pending(current)) { 1431 if (catch_signals && signal_pending(current)) {
1327 ret = -ERESTARTSYS; 1432 ret = -ERESTARTSYS;
1328 goto out; 1433 goto unlock;
1329 } 1434 }
1330 1435
1331 spin_lock_irqsave(&lockres->l_lock, flags);
1332
1333 mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING, 1436 mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
1334 "Cluster lock called on freeing lockres %s! flags " 1437 "Cluster lock called on freeing lockres %s! flags "
1335 "0x%lx\n", lockres->l_name, lockres->l_flags); 1438 "0x%lx\n", lockres->l_name, lockres->l_flags);
@@ -1346,6 +1449,25 @@ again:
1346 goto unlock; 1449 goto unlock;
1347 } 1450 }
1348 1451
1452 if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) {
1453 /*
1454 * We've upconverted. If the lock now has a level we can
1455 * work with, we take it. If, however, the lock is not at the
1456 * required level, we go thru the full cycle. One way this could
1457 * happen is if a process requesting an upconvert to PR is
1458 * closely followed by another requesting upconvert to an EX.
1459 * If the process requesting EX lands here, we want it to
1460 * continue attempting to upconvert and let the process
1461 * requesting PR take the lock.
1462 * If multiple processes request upconvert to PR, the first one
1463 * here will take the lock. The others will have to go thru the
1464 * OCFS2_LOCK_BLOCKED check to ensure that there is no pending
1465 * downconvert request.
1466 */
1467 if (level <= lockres->l_level)
1468 goto update_holders;
1469 }
1470
1349 if (lockres->l_flags & OCFS2_LOCK_BLOCKED && 1471 if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
1350 !ocfs2_may_continue_on_blocked_lock(lockres, level)) { 1472 !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
1351 /* is the lock is currently blocked on behalf of 1473 /* is the lock is currently blocked on behalf of
@@ -1383,7 +1505,7 @@ again:
1383 BUG_ON(level == DLM_LOCK_IV); 1505 BUG_ON(level == DLM_LOCK_IV);
1384 BUG_ON(level == DLM_LOCK_NL); 1506 BUG_ON(level == DLM_LOCK_NL);
1385 1507
1386 mlog(0, "lock %s, convert from %d to level = %d\n", 1508 mlog(ML_BASTS, "lockres %s, convert from %d to %d\n",
1387 lockres->l_name, lockres->l_level, level); 1509 lockres->l_name, lockres->l_level, level);
1388 1510
1389 /* call dlm_lock to upgrade lock now */ 1511 /* call dlm_lock to upgrade lock now */
@@ -1392,8 +1514,7 @@ again:
1392 &lockres->l_lksb, 1514 &lockres->l_lksb,
1393 lkm_flags, 1515 lkm_flags,
1394 lockres->l_name, 1516 lockres->l_name,
1395 OCFS2_LOCK_ID_MAX_LEN - 1, 1517 OCFS2_LOCK_ID_MAX_LEN - 1);
1396 lockres);
1397 lockres_clear_pending(lockres, gen, osb); 1518 lockres_clear_pending(lockres, gen, osb);
1398 if (ret) { 1519 if (ret) {
1399 if (!(lkm_flags & DLM_LKF_NOQUEUE) || 1520 if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
@@ -1416,11 +1537,14 @@ again:
1416 goto again; 1537 goto again;
1417 } 1538 }
1418 1539
1540update_holders:
1419 /* Ok, if we get here then we're good to go. */ 1541 /* Ok, if we get here then we're good to go. */
1420 ocfs2_inc_holders(lockres, level); 1542 ocfs2_inc_holders(lockres, level);
1421 1543
1422 ret = 0; 1544 ret = 0;
1423unlock: 1545unlock:
1546 lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
1547
1424 spin_unlock_irqrestore(&lockres->l_lock, flags); 1548 spin_unlock_irqrestore(&lockres->l_lock, flags);
1425out: 1549out:
1426 /* 1550 /*
@@ -1757,7 +1881,7 @@ out:
1757 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of 1881 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
1758 * flock() calls. The locking approach this requires is sufficiently 1882 * flock() calls. The locking approach this requires is sufficiently
1759 * different from all other cluster lock types that we implement a 1883 * different from all other cluster lock types that we implement a
1760 * seperate path to the "low-level" dlm calls. In particular: 1884 * separate path to the "low-level" dlm calls. In particular:
1761 * 1885 *
1762 * - No optimization of lock levels is done - we take at exactly 1886 * - No optimization of lock levels is done - we take at exactly
1763 * what's been requested. 1887 * what's been requested.
@@ -1827,8 +1951,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1827 spin_unlock_irqrestore(&lockres->l_lock, flags); 1951 spin_unlock_irqrestore(&lockres->l_lock, flags);
1828 1952
1829 ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags, 1953 ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
1830 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, 1954 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1);
1831 lockres);
1832 if (ret) { 1955 if (ret) {
1833 if (!trylock || (ret != -EAGAIN)) { 1956 if (!trylock || (ret != -EAGAIN)) {
1834 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 1957 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -1855,7 +1978,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1855 * outstanding lock request, so a cancel convert is 1978 * outstanding lock request, so a cancel convert is
1856 * required. We intentionally overwrite 'ret' - if the 1979 * required. We intentionally overwrite 'ret' - if the
1857 * cancel fails and the lock was granted, it's easier 1980 * cancel fails and the lock was granted, it's easier
1858 * to just bubble sucess back up to the user. 1981 * to just bubble success back up to the user.
1859 */ 1982 */
1860 ret = ocfs2_flock_handle_signal(lockres, level); 1983 ret = ocfs2_flock_handle_signal(lockres, level);
1861 } else if (!ret && (level > lockres->l_level)) { 1984 } else if (!ret && (level > lockres->l_level)) {
@@ -2957,7 +3080,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2957 status = ocfs2_cluster_connect(osb->osb_cluster_stack, 3080 status = ocfs2_cluster_connect(osb->osb_cluster_stack,
2958 osb->uuid_str, 3081 osb->uuid_str,
2959 strlen(osb->uuid_str), 3082 strlen(osb->uuid_str),
2960 ocfs2_do_node_down, osb, 3083 &lproto, ocfs2_do_node_down, osb,
2961 &conn); 3084 &conn);
2962 if (status) { 3085 if (status) {
2963 mlog_errno(status); 3086 mlog_errno(status);
@@ -3024,50 +3147,6 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
3024 mlog_exit_void(); 3147 mlog_exit_void();
3025} 3148}
3026 3149
3027static void ocfs2_unlock_ast(void *opaque, int error)
3028{
3029 struct ocfs2_lock_res *lockres = opaque;
3030 unsigned long flags;
3031
3032 mlog_entry_void();
3033
3034 mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
3035 lockres->l_unlock_action);
3036
3037 spin_lock_irqsave(&lockres->l_lock, flags);
3038 if (error) {
3039 mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
3040 "unlock_action %d\n", error, lockres->l_name,
3041 lockres->l_unlock_action);
3042 spin_unlock_irqrestore(&lockres->l_lock, flags);
3043 mlog_exit_void();
3044 return;
3045 }
3046
3047 switch(lockres->l_unlock_action) {
3048 case OCFS2_UNLOCK_CANCEL_CONVERT:
3049 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
3050 lockres->l_action = OCFS2_AST_INVALID;
3051 /* Downconvert thread may have requeued this lock, we
3052 * need to wake it. */
3053 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
3054 ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
3055 break;
3056 case OCFS2_UNLOCK_DROP_LOCK:
3057 lockres->l_level = DLM_LOCK_IV;
3058 break;
3059 default:
3060 BUG();
3061 }
3062
3063 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
3064 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
3065 wake_up(&lockres->l_event);
3066 spin_unlock_irqrestore(&lockres->l_lock, flags);
3067
3068 mlog_exit_void();
3069}
3070
3071static int ocfs2_drop_lock(struct ocfs2_super *osb, 3150static int ocfs2_drop_lock(struct ocfs2_super *osb,
3072 struct ocfs2_lock_res *lockres) 3151 struct ocfs2_lock_res *lockres)
3073{ 3152{
@@ -3135,8 +3214,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
3135 3214
3136 mlog(0, "lock %s\n", lockres->l_name); 3215 mlog(0, "lock %s\n", lockres->l_name);
3137 3216
3138 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags, 3217 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags);
3139 lockres);
3140 if (ret) { 3218 if (ret) {
3141 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); 3219 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
3142 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); 3220 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
@@ -3155,7 +3233,7 @@ out:
3155/* Mark the lockres as being dropped. It will no longer be 3233/* Mark the lockres as being dropped. It will no longer be
3156 * queued if blocking, but we still may have to wait on it 3234 * queued if blocking, but we still may have to wait on it
3157 * being dequeued from the downconvert thread before we can consider 3235 * being dequeued from the downconvert thread before we can consider
3158 * it safe to drop. 3236 * it safe to drop.
3159 * 3237 *
3160 * You can *not* attempt to call cluster_lock on this lockres anymore. */ 3238 * You can *not* attempt to call cluster_lock on this lockres anymore. */
3161void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) 3239void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
@@ -3244,13 +3322,20 @@ static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
3244 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); 3322 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
3245 3323
3246 if (lockres->l_level <= new_level) { 3324 if (lockres->l_level <= new_level) {
3247 mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n", 3325 mlog(ML_ERROR, "lockres %s, lvl %d <= %d, blcklst %d, mask %d, "
3248 lockres->l_level, new_level); 3326 "type %d, flags 0x%lx, hold %d %d, act %d %d, req %d, "
3327 "block %d, pgen %d\n", lockres->l_name, lockres->l_level,
3328 new_level, list_empty(&lockres->l_blocked_list),
3329 list_empty(&lockres->l_mask_waiters), lockres->l_type,
3330 lockres->l_flags, lockres->l_ro_holders,
3331 lockres->l_ex_holders, lockres->l_action,
3332 lockres->l_unlock_action, lockres->l_requested,
3333 lockres->l_blocking, lockres->l_pending_gen);
3249 BUG(); 3334 BUG();
3250 } 3335 }
3251 3336
3252 mlog(0, "lock %s, new_level = %d, l_blocking = %d\n", 3337 mlog(ML_BASTS, "lockres %s, level %d => %d, blocking %d\n",
3253 lockres->l_name, new_level, lockres->l_blocking); 3338 lockres->l_name, lockres->l_level, new_level, lockres->l_blocking);
3254 3339
3255 lockres->l_action = OCFS2_AST_DOWNCONVERT; 3340 lockres->l_action = OCFS2_AST_DOWNCONVERT;
3256 lockres->l_requested = new_level; 3341 lockres->l_requested = new_level;
@@ -3269,6 +3354,9 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
3269 3354
3270 mlog_entry_void(); 3355 mlog_entry_void();
3271 3356
3357 mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
3358 lockres->l_level, new_level);
3359
3272 if (lvb) 3360 if (lvb)
3273 dlm_flags |= DLM_LKF_VALBLK; 3361 dlm_flags |= DLM_LKF_VALBLK;
3274 3362
@@ -3277,8 +3365,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
3277 &lockres->l_lksb, 3365 &lockres->l_lksb,
3278 dlm_flags, 3366 dlm_flags,
3279 lockres->l_name, 3367 lockres->l_name,
3280 OCFS2_LOCK_ID_MAX_LEN - 1, 3368 OCFS2_LOCK_ID_MAX_LEN - 1);
3281 lockres);
3282 lockres_clear_pending(lockres, generation, osb); 3369 lockres_clear_pending(lockres, generation, osb);
3283 if (ret) { 3370 if (ret) {
3284 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 3371 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -3299,14 +3386,12 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
3299 assert_spin_locked(&lockres->l_lock); 3386 assert_spin_locked(&lockres->l_lock);
3300 3387
3301 mlog_entry_void(); 3388 mlog_entry_void();
3302 mlog(0, "lock %s\n", lockres->l_name);
3303 3389
3304 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { 3390 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
3305 /* If we're already trying to cancel a lock conversion 3391 /* If we're already trying to cancel a lock conversion
3306 * then just drop the spinlock and allow the caller to 3392 * then just drop the spinlock and allow the caller to
3307 * requeue this lock. */ 3393 * requeue this lock. */
3308 3394 mlog(ML_BASTS, "lockres %s, skip convert\n", lockres->l_name);
3309 mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
3310 return 0; 3395 return 0;
3311 } 3396 }
3312 3397
@@ -3321,6 +3406,8 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
3321 "lock %s, invalid flags: 0x%lx\n", 3406 "lock %s, invalid flags: 0x%lx\n",
3322 lockres->l_name, lockres->l_flags); 3407 lockres->l_name, lockres->l_flags);
3323 3408
3409 mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
3410
3324 return 1; 3411 return 1;
3325} 3412}
3326 3413
@@ -3330,16 +3417,15 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
3330 int ret; 3417 int ret;
3331 3418
3332 mlog_entry_void(); 3419 mlog_entry_void();
3333 mlog(0, "lock %s\n", lockres->l_name);
3334 3420
3335 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, 3421 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
3336 DLM_LKF_CANCEL, lockres); 3422 DLM_LKF_CANCEL);
3337 if (ret) { 3423 if (ret) {
3338 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); 3424 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
3339 ocfs2_recover_from_dlm_error(lockres, 0); 3425 ocfs2_recover_from_dlm_error(lockres, 0);
3340 } 3426 }
3341 3427
3342 mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name); 3428 mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
3343 3429
3344 mlog_exit(ret); 3430 mlog_exit(ret);
3345 return ret; 3431 return ret;
@@ -3352,6 +3438,7 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
3352 unsigned long flags; 3438 unsigned long flags;
3353 int blocking; 3439 int blocking;
3354 int new_level; 3440 int new_level;
3441 int level;
3355 int ret = 0; 3442 int ret = 0;
3356 int set_lvb = 0; 3443 int set_lvb = 0;
3357 unsigned int gen; 3444 unsigned int gen;
@@ -3360,9 +3447,17 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
3360 3447
3361 spin_lock_irqsave(&lockres->l_lock, flags); 3448 spin_lock_irqsave(&lockres->l_lock, flags);
3362 3449
3363 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
3364
3365recheck: 3450recheck:
3451 /*
3452 * Is it still blocking? If not, we have no more work to do.
3453 */
3454 if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) {
3455 BUG_ON(lockres->l_blocking != DLM_LOCK_NL);
3456 spin_unlock_irqrestore(&lockres->l_lock, flags);
3457 ret = 0;
3458 goto leave;
3459 }
3460
3366 if (lockres->l_flags & OCFS2_LOCK_BUSY) { 3461 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
3367 /* XXX 3462 /* XXX
3368 * This is a *big* race. The OCFS2_LOCK_PENDING flag 3463 * This is a *big* race. The OCFS2_LOCK_PENDING flag
@@ -3387,8 +3482,11 @@ recheck:
3387 * at the same time they set OCFS2_DLM_BUSY. They must 3482 * at the same time they set OCFS2_DLM_BUSY. They must
3388 * clear OCFS2_DLM_PENDING after dlm_lock() returns. 3483 * clear OCFS2_DLM_PENDING after dlm_lock() returns.
3389 */ 3484 */
3390 if (lockres->l_flags & OCFS2_LOCK_PENDING) 3485 if (lockres->l_flags & OCFS2_LOCK_PENDING) {
3486 mlog(ML_BASTS, "lockres %s, ReQ: Pending\n",
3487 lockres->l_name);
3391 goto leave_requeue; 3488 goto leave_requeue;
3489 }
3392 3490
3393 ctl->requeue = 1; 3491 ctl->requeue = 1;
3394 ret = ocfs2_prepare_cancel_convert(osb, lockres); 3492 ret = ocfs2_prepare_cancel_convert(osb, lockres);
@@ -3401,31 +3499,70 @@ recheck:
3401 goto leave; 3499 goto leave;
3402 } 3500 }
3403 3501
3502 /*
3503 * This prevents livelocks. OCFS2_LOCK_UPCONVERT_FINISHING flag is
3504 * set when the ast is received for an upconvert just before the
3505 * OCFS2_LOCK_BUSY flag is cleared. Now if the fs received a bast
3506 * on the heels of the ast, we want to delay the downconvert just
3507 * enough to allow the up requestor to do its task. Because this
3508 * lock is in the blocked queue, the lock will be downconverted
3509 * as soon as the requestor is done with the lock.
3510 */
3511 if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING)
3512 goto leave_requeue;
3513
3514 /*
3515 * How can we block and yet be at NL? We were trying to upconvert
3516 * from NL and got canceled. The code comes back here, and now
3517 * we notice and clear BLOCKING.
3518 */
3519 if (lockres->l_level == DLM_LOCK_NL) {
3520 BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders);
3521 mlog(ML_BASTS, "lockres %s, Aborting dc\n", lockres->l_name);
3522 lockres->l_blocking = DLM_LOCK_NL;
3523 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
3524 spin_unlock_irqrestore(&lockres->l_lock, flags);
3525 goto leave;
3526 }
3527
3404 /* if we're blocking an exclusive and we have *any* holders, 3528 /* if we're blocking an exclusive and we have *any* holders,
3405 * then requeue. */ 3529 * then requeue. */
3406 if ((lockres->l_blocking == DLM_LOCK_EX) 3530 if ((lockres->l_blocking == DLM_LOCK_EX)
3407 && (lockres->l_ex_holders || lockres->l_ro_holders)) 3531 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
3532 mlog(ML_BASTS, "lockres %s, ReQ: EX/PR Holders %u,%u\n",
3533 lockres->l_name, lockres->l_ex_holders,
3534 lockres->l_ro_holders);
3408 goto leave_requeue; 3535 goto leave_requeue;
3536 }
3409 3537
3410 /* If it's a PR we're blocking, then only 3538 /* If it's a PR we're blocking, then only
3411 * requeue if we've got any EX holders */ 3539 * requeue if we've got any EX holders */
3412 if (lockres->l_blocking == DLM_LOCK_PR && 3540 if (lockres->l_blocking == DLM_LOCK_PR &&
3413 lockres->l_ex_holders) 3541 lockres->l_ex_holders) {
3542 mlog(ML_BASTS, "lockres %s, ReQ: EX Holders %u\n",
3543 lockres->l_name, lockres->l_ex_holders);
3414 goto leave_requeue; 3544 goto leave_requeue;
3545 }
3415 3546
3416 /* 3547 /*
3417 * Can we get a lock in this state if the holder counts are 3548 * Can we get a lock in this state if the holder counts are
3418 * zero? The meta data unblock code used to check this. 3549 * zero? The meta data unblock code used to check this.
3419 */ 3550 */
3420 if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) 3551 if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
3421 && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) 3552 && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) {
3553 mlog(ML_BASTS, "lockres %s, ReQ: Lock Refreshing\n",
3554 lockres->l_name);
3422 goto leave_requeue; 3555 goto leave_requeue;
3556 }
3423 3557
3424 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); 3558 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
3425 3559
3426 if (lockres->l_ops->check_downconvert 3560 if (lockres->l_ops->check_downconvert
3427 && !lockres->l_ops->check_downconvert(lockres, new_level)) 3561 && !lockres->l_ops->check_downconvert(lockres, new_level)) {
3562 mlog(ML_BASTS, "lockres %s, ReQ: Checkpointing\n",
3563 lockres->l_name);
3428 goto leave_requeue; 3564 goto leave_requeue;
3565 }
3429 3566
3430 /* If we get here, then we know that there are no more 3567 /* If we get here, then we know that there are no more
3431 * incompatible holders (and anyone asking for an incompatible 3568 * incompatible holders (and anyone asking for an incompatible
@@ -3438,17 +3575,24 @@ recheck:
3438 * may sleep, so we save off a copy of what we're blocking as 3575 * may sleep, so we save off a copy of what we're blocking as
3439 * it may change while we're not holding the spin lock. */ 3576 * it may change while we're not holding the spin lock. */
3440 blocking = lockres->l_blocking; 3577 blocking = lockres->l_blocking;
3578 level = lockres->l_level;
3441 spin_unlock_irqrestore(&lockres->l_lock, flags); 3579 spin_unlock_irqrestore(&lockres->l_lock, flags);
3442 3580
3443 ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking); 3581 ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking);
3444 3582
3445 if (ctl->unblock_action == UNBLOCK_STOP_POST) 3583 if (ctl->unblock_action == UNBLOCK_STOP_POST) {
3584 mlog(ML_BASTS, "lockres %s, UNBLOCK_STOP_POST\n",
3585 lockres->l_name);
3446 goto leave; 3586 goto leave;
3587 }
3447 3588
3448 spin_lock_irqsave(&lockres->l_lock, flags); 3589 spin_lock_irqsave(&lockres->l_lock, flags);
3449 if (blocking != lockres->l_blocking) { 3590 if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) {
3450 /* If this changed underneath us, then we can't drop 3591 /* If this changed underneath us, then we can't drop
3451 * it just yet. */ 3592 * it just yet. */
3593 mlog(ML_BASTS, "lockres %s, block=%d:%d, level=%d:%d, "
3594 "Recheck\n", lockres->l_name, blocking,
3595 lockres->l_blocking, level, lockres->l_level);
3452 goto recheck; 3596 goto recheck;
3453 } 3597 }
3454 3598
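Because the downconvert worker may sleep, ocfs2_unblock_lock() uses a snapshot-and-recheck pattern, and this hunk widens the snapshot: both l_blocking and (newly) l_level are recorded under the spinlock, and if either has moved by the time the lock is retaken, control jumps back to recheck: rather than acting on stale state. Condensed control flow, a fragment sketch with do_downconvert_work() standing in for the ->downconvert_worker() callback:

    spin_lock_irqsave(&lockres->l_lock, flags);
    recheck:
            if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) {
                    /* a cancel already cleared BLOCKED: nothing to do */
                    spin_unlock_irqrestore(&lockres->l_lock, flags);
                    goto leave;
            }
            blocking = lockres->l_blocking;          /* snapshot ... */
            level = lockres->l_level;                /* ... both fields */
            spin_unlock_irqrestore(&lockres->l_lock, flags);

            do_downconvert_work(lockres, blocking);  /* may sleep */

            spin_lock_irqsave(&lockres->l_lock, flags);
            if (blocking != lockres->l_blocking || level != lockres->l_level)
                    goto recheck;                    /* state moved underneath us */

The same rework replaces the old entry BUG_ON(!BLOCKED) with the graceful early exit shown in the hunk, since a cancelled upconvert can legitimately leave the lock unblocked by the time the downconvert thread gets here.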
@@ -3843,45 +3987,6 @@ void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex)
3843 ocfs2_cluster_unlock(osb, lockres, level); 3987 ocfs2_cluster_unlock(osb, lockres, level);
3844} 3988}
3845 3989
3846/*
3847 * This is the filesystem locking protocol. It provides the lock handling
3848 * hooks for the underlying DLM. It has a maximum version number.
3849 * The version number allows interoperability with systems running at
3850 * the same major number and an equal or smaller minor number.
3851 *
3852 * Whenever the filesystem does new things with locks (adds or removes a
3853 * lock, orders them differently, does different things underneath a lock),
3854 * the version must be changed. The protocol is negotiated when joining
3855 * the dlm domain. A node may join the domain if its major version is
3856 * identical to all other nodes and its minor version is greater than
3857 * or equal to all other nodes. When its minor version is greater than
3858 * the other nodes, it will run at the minor version specified by the
3859 * other nodes.
3860 *
3861 * If a locking change is made that will not be compatible with older
3862 * versions, the major number must be increased and the minor version set
3863 * to zero. If a change merely adds a behavior that can be disabled when
3864 * speaking to older versions, the minor version must be increased. If a
3865 * change adds a fully backwards compatible change (eg, LVB changes that
3866 * are just ignored by older versions), the version does not need to be
3867 * updated.
3868 */
3869static struct ocfs2_locking_protocol lproto = {
3870 .lp_max_version = {
3871 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
3872 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
3873 },
3874 .lp_lock_ast = ocfs2_locking_ast,
3875 .lp_blocking_ast = ocfs2_blocking_ast,
3876 .lp_unlock_ast = ocfs2_unlock_ast,
3877};
3878
3879void ocfs2_set_locking_protocol(void)
3880{
3881 ocfs2_stack_glue_set_locking_protocol(&lproto);
3882}
3883
3884
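The removed block relocates rather than disappears: judging by the matching removals here, the protocol definition and its AST thunks move out to the stack-glue layer together. The join rule its comment describes can be stated as a tiny standalone check (illustrative only, not the kernel's actual negotiation code):

    struct proto_version { unsigned char major, minor; };

    /* A node may join iff majors match and its minor is >= the
     * domain's; it then runs at the domain's (lower) minor. */
    static int may_join(struct proto_version mine, struct proto_version domain)
    {
            if (mine.major != domain.major)
                    return 0;
            return mine.minor >= domain.minor;
    }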
3885static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, 3990static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3886 struct ocfs2_lock_res *lockres) 3991 struct ocfs2_lock_res *lockres)
3887{ 3992{
@@ -3898,7 +4003,7 @@ static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3898 BUG_ON(!lockres); 4003 BUG_ON(!lockres);
3899 BUG_ON(!lockres->l_ops); 4004 BUG_ON(!lockres->l_ops);
3900 4005
3901 mlog(0, "lockres %s blocked.\n", lockres->l_name); 4006 mlog(ML_BASTS, "lockres %s blocked\n", lockres->l_name);
3902 4007
3903 /* Detect whether a lock has been marked as going away while 4008 /* Detect whether a lock has been marked as going away while
3904 * the downconvert thread was processing other things. A lock can 4009 * the downconvert thread was processing other things. A lock can
@@ -3921,7 +4026,7 @@ unqueue:
3921 } else 4026 } else
3922 ocfs2_schedule_blocked_lock(osb, lockres); 4027 ocfs2_schedule_blocked_lock(osb, lockres);
3923 4028
3924 mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name, 4029 mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name,
3925 ctl.requeue ? "yes" : "no"); 4030 ctl.requeue ? "yes" : "no");
3926 spin_unlock_irqrestore(&lockres->l_lock, flags); 4031 spin_unlock_irqrestore(&lockres->l_lock, flags);
3927 4032
@@ -3943,7 +4048,7 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
3943 /* Do not schedule a lock for downconvert when it's on 4048 /* Do not schedule a lock for downconvert when it's on
3944 * the way to destruction - any nodes wanting access 4049 * the way to destruction - any nodes wanting access
3945 * to the resource will get it soon. */ 4050 * to the resource will get it soon. */
3946 mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n", 4051 mlog(ML_BASTS, "lockres %s won't be scheduled: flags 0x%lx\n",
3947 lockres->l_name, lockres->l_flags); 4052 lockres->l_name, lockres->l_flags);
3948 return; 4053 return;
3949 } 4054 }
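Throughout this file, mlog(0, ...) calls on the blocking-AST paths become mlog(ML_BASTS, ...), so the whole class of messages can be switched on or silenced with one mask bit instead of being compiled-in noise. A conceptual model of such a masked log macro (not the real cluster/masklog.h machinery):

    #define MY_ML_BASTS   (1UL << 14)      /* hypothetical mask bit */
    static unsigned long my_log_mask;      /* toggled from userspace */

    #define my_mlog(mask, fmt, ...)                           \
            do {                                              \
                    if (my_log_mask & (mask))                 \
                            printk(fmt, ##__VA_ARGS__);       \
            } while (0)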
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 15713cbb865c..19ad145d2af3 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -239,7 +239,7 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
239 mlog(0, "Encoding parent: blkno: %llu, generation: %u\n", 239 mlog(0, "Encoding parent: blkno: %llu, generation: %u\n",
240 (unsigned long long)blkno, generation); 240 (unsigned long long)blkno, generation);
241 } 241 }
242 242
243 *max_len = len; 243 *max_len = len;
244 244
245bail: 245bail:
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 843db64e9d4a..09e3fdfa6d33 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -24,6 +24,7 @@
24 24
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/slab.h>
27#include <linux/types.h> 28#include <linux/types.h>
28#include <linux/fiemap.h> 29#include <linux/fiemap.h>
29 30
@@ -37,6 +38,7 @@
37#include "extent_map.h" 38#include "extent_map.h"
38#include "inode.h" 39#include "inode.h"
39#include "super.h" 40#include "super.h"
41#include "symlink.h"
40 42
41#include "buffer_head_io.h" 43#include "buffer_head_io.h"
42 44
@@ -191,7 +193,7 @@ static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
191 emi->ei_clusters += ins->ei_clusters; 193 emi->ei_clusters += ins->ei_clusters;
192 return 1; 194 return 1;
193 } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys && 195 } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
194 (ins->ei_cpos + ins->ei_clusters) == emi->ei_phys && 196 (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos &&
195 ins->ei_flags == emi->ei_flags) { 197 ins->ei_flags == emi->ei_flags) {
196 emi->ei_phys = ins->ei_phys; 198 emi->ei_phys = ins->ei_phys;
197 emi->ei_cpos = ins->ei_cpos; 199 emi->ei_cpos = ins->ei_cpos;
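The one-character fix above deserves a spelled-out version: merging ins onto the front of an existing cached extent emi requires adjacency in both the physical range and the logical (cpos) range, but the old code tested the logical end against emi->ei_phys, so unrelated extents could be fused. A standalone sketch of the corrected test, with field names mirroring the ocfs2 ones:

    #include <stdint.h>

    struct ext { uint32_t cpos, phys, clusters, flags; };

    /* May `ins` be glued onto the front of `emi`? */
    static int can_front_merge(const struct ext *emi, const struct ext *ins)
    {
            return ins->phys + ins->clusters == emi->phys   /* physically adjacent */
                && ins->cpos + ins->clusters == emi->cpos   /* logically adjacent (the fix) */
                && ins->flags == emi->flags;                /* identical extent flags */
    }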
@@ -452,7 +454,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
452 if (i == -1) { 454 if (i == -1) {
453 /* 455 /*
454 * Holes can be larger than the maximum size of an 456 * Holes can be larger than the maximum size of an
455 * extent, so we return their lengths in a seperate 457 * extent, so we return their lengths in a separate
456 * field. 458 * field.
457 */ 459 */
458 if (hole_len) { 460 if (hole_len) {
@@ -703,6 +705,12 @@ out:
703 return ret; 705 return ret;
704} 706}
705 707
708/*
 709 * The name ocfs2_fiemap_inline() may be a little misleading: it
 710 * handles fiemap not only for inline-data files but also for fast
 711 * symlinks, because the two are identical as far as extent mapping
 712 * is concerned.
713 */
706static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, 714static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
707 struct fiemap_extent_info *fieinfo, 715 struct fiemap_extent_info *fieinfo,
708 u64 map_start) 716 u64 map_start)
@@ -715,11 +723,18 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
715 struct ocfs2_inode_info *oi = OCFS2_I(inode); 723 struct ocfs2_inode_info *oi = OCFS2_I(inode);
716 724
717 di = (struct ocfs2_dinode *)di_bh->b_data; 725 di = (struct ocfs2_dinode *)di_bh->b_data;
718 id_count = le16_to_cpu(di->id2.i_data.id_count); 726 if (ocfs2_inode_is_fast_symlink(inode))
727 id_count = ocfs2_fast_symlink_chars(inode->i_sb);
728 else
729 id_count = le16_to_cpu(di->id2.i_data.id_count);
719 730
720 if (map_start < id_count) { 731 if (map_start < id_count) {
721 phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits; 732 phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
722 phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data); 733 if (ocfs2_inode_is_fast_symlink(inode))
734 phys += offsetof(struct ocfs2_dinode, id2.i_symlink);
735 else
736 phys += offsetof(struct ocfs2_dinode,
737 id2.i_data.id_data);
723 738
724 ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, 739 ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
725 flags); 740 flags);
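Fast symlinks keep their target string in the dinode itself, just at a different offset than inline file data, which is all the branching above amounts to. Condensed into one unit (a sketch; locking and error handling omitted, variables as in the hunk):

    u64 phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;  /* dinode block */
    u32 count;

    if (ocfs2_inode_is_fast_symlink(inode)) {
            count = ocfs2_fast_symlink_chars(inode->i_sb);
            phys += offsetof(struct ocfs2_dinode, id2.i_symlink);
    } else {
            count = le16_to_cpu(di->id2.i_data.id_count);
            phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data);
    }
    /* report [phys, phys + count) as the single inline extent */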
@@ -756,9 +771,10 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
756 down_read(&OCFS2_I(inode)->ip_alloc_sem); 771 down_read(&OCFS2_I(inode)->ip_alloc_sem);
757 772
758 /* 773 /*
759 * Handle inline-data separately. 774 * Handle inline-data and fast symlink separately.
760 */ 775 */
761 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 776 if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
777 ocfs2_inode_is_fast_symlink(inode)) {
762 ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start); 778 ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
763 goto out_unlock; 779 goto out_unlock;
764 } 780 }
@@ -786,6 +802,8 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
786 fe_flags = 0; 802 fe_flags = 0;
787 if (rec.e_flags & OCFS2_EXT_UNWRITTEN) 803 if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
788 fe_flags |= FIEMAP_EXTENT_UNWRITTEN; 804 fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
805 if (rec.e_flags & OCFS2_EXT_REFCOUNTED)
806 fe_flags |= FIEMAP_EXTENT_SHARED;
789 if (is_last) 807 if (is_last)
790 fe_flags |= FIEMAP_EXTENT_LAST; 808 fe_flags |= FIEMAP_EXTENT_LAST;
791 len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; 809 len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
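With OCFS2_EXT_REFCOUNTED now translated to FIEMAP_EXTENT_SHARED, userspace can see which ranges of a reflinked file are still CoW-shared. A small userspace sketch of reading that flag back through the generic FIEMAP ioctl (standard kernel UAPI, nothing ocfs2-specific assumed):

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>
    #include <linux/fiemap.h>

    /* Returns 1 if the first extent covering `start` is CoW-shared. */
    static int first_extent_shared(int fd, __u64 start, __u64 len)
    {
            char buf[sizeof(struct fiemap) + sizeof(struct fiemap_extent)];
            struct fiemap *fm = (struct fiemap *)buf;

            memset(buf, 0, sizeof(buf));
            fm->fm_start = start;
            fm->fm_length = len;
            fm->fm_extent_count = 1;
            if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0 || !fm->fm_mapped_extents)
                    return 0;
            return !!(fm->fm_extents[0].fe_flags & FIEMAP_EXTENT_SHARED);
    }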
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index de059f490586..17947dc8341e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -107,6 +107,9 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
107 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 107 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
108 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); 108 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
109 109
110 if (file->f_mode & FMODE_WRITE)
111 dquot_initialize(inode);
112
110 spin_lock(&oi->ip_lock); 113 spin_lock(&oi->ip_lock);
111 114
112 /* Check that the inode hasn't been wiped from disk by another 115 /* Check that the inode hasn't been wiped from disk by another
@@ -629,11 +632,10 @@ restart_all:
629 } 632 }
630 633
631restarted_transaction: 634restarted_transaction:
632 if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 635 status = dquot_alloc_space_nodirty(inode,
633 clusters_to_add))) { 636 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
634 status = -EDQUOT; 637 if (status)
635 goto leave; 638 goto leave;
636 }
637 did_quota = 1; 639 did_quota = 1;
638 640
 639 /* reserve a write to the file entry early on - that way if we 641
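These hunks are part of the tree-wide quota API migration: the old vfs_dq_* wrappers returned a truth value that every caller translated to -EDQUOT by hand, while the dquot_* replacements return an errno directly; the dquot_initialize() calls added in the open/setattr/delete paths attach quota structures before the first quota-affecting operation. The calling-convention change, schematically (bytes stands in for the ocfs2_clusters_to_bytes() expression used above):

    /* before: boolean wrapper, caller invents the errno */
    if (vfs_dq_alloc_space_nodirty(inode, bytes)) {
            status = -EDQUOT;
            goto leave;
    }

    /* after: the API reports its own error (-EDQUOT, ...) */
    status = dquot_alloc_space_nodirty(inode, bytes);
    if (status)
            goto leave;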
@@ -674,7 +676,7 @@ restarted_transaction:
674 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 676 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
675 spin_unlock(&OCFS2_I(inode)->ip_lock); 677 spin_unlock(&OCFS2_I(inode)->ip_lock);
676 /* Release unused quota reservation */ 678 /* Release unused quota reservation */
677 vfs_dq_free_space(inode, 679 dquot_free_space(inode,
678 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 680 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
679 did_quota = 0; 681 did_quota = 0;
680 682
@@ -710,7 +712,7 @@ restarted_transaction:
710 712
711leave: 713leave:
712 if (status < 0 && did_quota) 714 if (status < 0 && did_quota)
713 vfs_dq_free_space(inode, 715 dquot_free_space(inode,
714 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 716 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
715 if (handle) { 717 if (handle) {
716 ocfs2_commit_trans(osb, handle); 718 ocfs2_commit_trans(osb, handle);
@@ -749,7 +751,7 @@ static int ocfs2_write_zero_page(struct inode *inode,
749 int ret; 751 int ret;
750 752
751 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ 753 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
752 /* ugh. in prepare/commit_write, if from==to==start of block, we 754 /* ugh. in prepare/commit_write, if from==to==start of block, we
753 ** skip the prepare. make sure we never send an offset for the start 755 ** skip the prepare. make sure we never send an offset for the start
754 ** of a block 756 ** of a block
755 */ 757 */
@@ -978,6 +980,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
978 980
979 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 981 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
980 if (size_change) { 982 if (size_change) {
983 dquot_initialize(inode);
984
981 status = ocfs2_rw_lock(inode, 1); 985 status = ocfs2_rw_lock(inode, 1);
982 if (status < 0) { 986 if (status < 0) {
983 mlog_errno(status); 987 mlog_errno(status);
@@ -993,10 +997,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
993 } 997 }
994 998
995 if (size_change && attr->ia_size != i_size_read(inode)) { 999 if (size_change && attr->ia_size != i_size_read(inode)) {
996 if (attr->ia_size > sb->s_maxbytes) { 1000 status = inode_newsize_ok(inode, attr->ia_size);
997 status = -EFBIG; 1001 if (status)
998 goto bail_unlock; 1002 goto bail_unlock;
999 }
1000 1003
1001 if (i_size_read(inode) > attr->ia_size) { 1004 if (i_size_read(inode) > attr->ia_size) {
1002 if (ocfs2_should_order_data(inode)) { 1005 if (ocfs2_should_order_data(inode)) {
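inode_newsize_ok() subsumes the open-coded s_maxbytes test: for a size-extending change it enforces both the process's RLIMIT_FSIZE and sb->s_maxbytes, returning -EFBIG where appropriate, so the caller only propagates its status:

    status = inode_newsize_ok(inode, attr->ia_size);
    if (status)              /* typically -EFBIG */
            goto bail_unlock;
    /* size acceptable; continue with the truncate or extend */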
@@ -1021,7 +1024,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1021 /* 1024 /*
1022 * Gather pointers to quota structures so that allocation / 1025 * Gather pointers to quota structures so that allocation /
1023 * freeing of quota structures happens here and not inside 1026 * freeing of quota structures happens here and not inside
1024 * vfs_dq_transfer() where we have problems with lock ordering 1027 * dquot_transfer() where we have problems with lock ordering
1025 */ 1028 */
1026 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid 1029 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
1027 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1030 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
@@ -1054,7 +1057,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1054 mlog_errno(status); 1057 mlog_errno(status);
1055 goto bail_unlock; 1058 goto bail_unlock;
1056 } 1059 }
1057 status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; 1060 status = dquot_transfer(inode, attr);
1058 if (status < 0) 1061 if (status < 0)
1059 goto bail_commit; 1062 goto bail_commit;
1060 } else { 1063 } else {
@@ -1772,13 +1775,14 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1772 loff_t *ppos, 1775 loff_t *ppos,
1773 size_t count, 1776 size_t count,
1774 int appending, 1777 int appending,
1775 int *direct_io) 1778 int *direct_io,
1779 int *has_refcount)
1776{ 1780{
1777 int ret = 0, meta_level = 0; 1781 int ret = 0, meta_level = 0;
1778 struct inode *inode = dentry->d_inode; 1782 struct inode *inode = dentry->d_inode;
1779 loff_t saved_pos, end; 1783 loff_t saved_pos, end;
1780 1784
1781 /* 1785 /*
1782 * We start with a read level meta lock and only jump to an ex 1786 * We start with a read level meta lock and only jump to an ex
1783 * if we need to make modifications here. 1787 * if we need to make modifications here.
1784 */ 1788 */
@@ -1833,6 +1837,10 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1833 saved_pos, 1837 saved_pos,
1834 count, 1838 count,
1835 &meta_level); 1839 &meta_level);
1840 if (has_refcount)
1841 *has_refcount = 1;
1842 if (direct_io)
1843 *direct_io = 0;
1836 } 1844 }
1837 1845
1838 if (ret < 0) { 1846 if (ret < 0) {
@@ -1899,7 +1907,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1899 loff_t pos) 1907 loff_t pos)
1900{ 1908{
1901 int ret, direct_io, appending, rw_level, have_alloc_sem = 0; 1909 int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
1902 int can_do_direct; 1910 int can_do_direct, has_refcount = 0;
1903 ssize_t written = 0; 1911 ssize_t written = 0;
1904 size_t ocount; /* original count */ 1912 size_t ocount; /* original count */
1905 size_t count; /* after file limit checks */ 1913 size_t count; /* after file limit checks */
@@ -1942,7 +1950,7 @@ relock:
1942 can_do_direct = direct_io; 1950 can_do_direct = direct_io;
1943 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, 1951 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
1944 iocb->ki_left, appending, 1952 iocb->ki_left, appending,
1945 &can_do_direct); 1953 &can_do_direct, &has_refcount);
1946 if (ret < 0) { 1954 if (ret < 0) {
1947 mlog_errno(ret); 1955 mlog_errno(ret);
1948 goto out; 1956 goto out;
@@ -2006,14 +2014,16 @@ out_dio:
2006 /* buffered aio wouldn't have proper lock coverage today */ 2014 /* buffered aio wouldn't have proper lock coverage today */
2007 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 2015 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
2008 2016
2009 if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) { 2017 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
2018 ((file->f_flags & O_DIRECT) && has_refcount)) {
2010 ret = filemap_fdatawrite_range(file->f_mapping, pos, 2019 ret = filemap_fdatawrite_range(file->f_mapping, pos,
2011 pos + count - 1); 2020 pos + count - 1);
2012 if (ret < 0) 2021 if (ret < 0)
2013 written = ret; 2022 written = ret;
2014 2023
2015 if (!ret && (old_size != i_size_read(inode) || 2024 if (!ret && (old_size != i_size_read(inode) ||
2016 old_clusters != OCFS2_I(inode)->ip_clusters)) { 2025 old_clusters != OCFS2_I(inode)->ip_clusters ||
2026 has_refcount)) {
2017 ret = jbd2_journal_force_commit(osb->journal->j_journal); 2027 ret = jbd2_journal_force_commit(osb->journal->j_journal);
2018 if (ret < 0) 2028 if (ret < 0)
2019 written = ret; 2029 written = ret;
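The has_refcount plumbing exists because an O_DIRECT write into a refcounted extent triggers copy-on-write, and the new extent mapping lives only in journaled metadata; without a commit, a crash could lose a mapping the application believed stable. Such writes are therefore treated like O_DSYNC: flush the data range and force a jbd2 commit. The decision, restated as a hedged pseudo-C condition (variable names invented):

    bool need_flush  = (o_dsync && !direct_io)        /* classic O_DSYNC     */
                    || IS_SYNC(inode)                 /* sync inode or mount */
                    || (direct_io && has_refcount);   /* CoW under O_DIRECT  */

    bool need_commit = need_flush
                    && (size_changed || clusters_changed || has_refcount);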
@@ -2024,7 +2034,7 @@ out_dio:
2024 pos + count - 1); 2034 pos + count - 1);
2025 } 2035 }
2026 2036
2027 /* 2037 /*
2028 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 2038 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
2029 * function pointer which is called when o_direct io completes so that 2039 * function pointer which is called when o_direct io completes so that
2030 * it can unlock our rw lock. (it's the clustered equivalent of 2040 * it can unlock our rw lock. (it's the clustered equivalent of
@@ -2034,7 +2044,7 @@ out_dio:
2034 * async dio is going to do it in the future or an end_io after an 2044 * async dio is going to do it in the future or an end_io after an
2035 * error has already done it. 2045 * error has already done it.
2036 */ 2046 */
2037 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 2047 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2038 rw_level = -1; 2048 rw_level = -1;
2039 have_alloc_sem = 0; 2049 have_alloc_sem = 0;
2040 } 2050 }
@@ -2062,7 +2072,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
2062 int ret; 2072 int ret;
2063 2073
2064 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, 2074 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos,
2065 sd->total_len, 0, NULL); 2075 sd->total_len, 0, NULL, NULL);
2066 if (ret < 0) { 2076 if (ret < 0) {
2067 mlog_errno(ret); 2077 mlog_errno(ret);
2068 return ret; 2078 return ret;
@@ -2189,7 +2199,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2189 goto bail; 2199 goto bail;
2190 } 2200 }
2191 2201
2192 /* 2202 /*
2193 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 2203 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
2194 * need locks to protect pending reads from racing with truncate. 2204 * need locks to protect pending reads from racing with truncate.
2195 */ 2205 */
@@ -2211,10 +2221,10 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2211 * We're fine letting folks race truncates and extending 2221 * We're fine letting folks race truncates and extending
2212 * writes with read across the cluster, just like they can 2222 * writes with read across the cluster, just like they can
2213 * locally. Hence no rw_lock during read. 2223 * locally. Hence no rw_lock during read.
2214 * 2224 *
2215 * Take and drop the meta data lock to update inode fields 2225 * Take and drop the meta data lock to update inode fields
2216 * like i_size. This allows the checks down below 2226 * like i_size. This allows the checks down below
2217 * generic_file_aio_read() a chance of actually working. 2227 * generic_file_aio_read() a chance of actually working.
2218 */ 2228 */
2219 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); 2229 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
2220 if (ret < 0) { 2230 if (ret < 0) {
@@ -2239,7 +2249,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2239bail: 2249bail:
2240 if (have_alloc_sem) 2250 if (have_alloc_sem)
2241 up_read(&inode->i_alloc_sem); 2251 up_read(&inode->i_alloc_sem);
2242 if (rw_level != -1) 2252 if (rw_level != -1)
2243 ocfs2_rw_unlock(inode, rw_level); 2253 ocfs2_rw_unlock(inode, rw_level);
2244 mlog_exit(ret); 2254 mlog_exit(ret);
2245 2255
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c6e7213db868..1aa863dd901f 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -26,7 +26,6 @@
26 26
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h> 29#include <linux/highmem.h>
31 30
32#define MLOG_MASK_PREFIX ML_SUPER 31#define MLOG_MASK_PREFIX ML_SUPER
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 0297fb8982b8..07cc8bb68b6d 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/quotaops.h> 30#include <linux/quotaops.h>
@@ -475,7 +474,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
475 if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) { 474 if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
476 status = ocfs2_try_open_lock(inode, 0); 475 status = ocfs2_try_open_lock(inode, 0);
477 if (status) { 476 if (status) {
478 make_bad_inode(inode); 477 make_bad_inode(inode);
479 return status; 478 return status;
480 } 479 }
481 } 480 }
@@ -665,7 +664,7 @@ static int ocfs2_remove_inode(struct inode *inode,
665 } 664 }
666 665
667 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); 666 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
668 vfs_dq_free_inode(inode); 667 dquot_free_inode(inode);
669 668
670 status = ocfs2_free_dinode(handle, inode_alloc_inode, 669 status = ocfs2_free_dinode(handle, inode_alloc_inode,
671 inode_alloc_bh, di); 670 inode_alloc_bh, di);
@@ -684,7 +683,7 @@ bail:
684 return status; 683 return status;
685} 684}
686 685
687/* 686/*
688 * Serialize with orphan dir recovery. If the process doing 687 * Serialize with orphan dir recovery. If the process doing
689 * recovery on this orphan dir does an iget() with the dir 688 * recovery on this orphan dir does an iget() with the dir
690 * i_mutex held, we'll deadlock here. Instead we detect this 689 * i_mutex held, we'll deadlock here. Instead we detect this
@@ -891,6 +890,21 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
891 /* Do some basic inode verification... */ 890 /* Do some basic inode verification... */
892 di = (struct ocfs2_dinode *) di_bh->b_data; 891 di = (struct ocfs2_dinode *) di_bh->b_data;
893 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) { 892 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
893 /*
894 * Inodes in the orphan dir must have ORPHANED_FL. The only
895 * inodes that come back out of the orphan dir are reflink
896 * targets. A reflink target may be moved out of the orphan
897 * dir between the time we scan the directory and the time we
898 * process it. This would lead to HAS_REFCOUNT_FL being set but
899 * ORPHANED_FL not.
900 */
901 if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
902 mlog(0, "Reflinked inode %llu is no longer orphaned. "
903 "it shouldn't be deleted\n",
904 (unsigned long long)oi->ip_blkno);
905 goto bail;
906 }
907
894 /* for lack of a better error? */ 908 /* for lack of a better error? */
895 status = -EEXIST; 909 status = -EEXIST;
896 mlog(ML_ERROR, 910 mlog(ML_ERROR,
@@ -971,6 +985,8 @@ void ocfs2_delete_inode(struct inode *inode)
971 goto bail; 985 goto bail;
972 } 986 }
973 987
988 dquot_initialize(inode);
989
974 if (!ocfs2_inode_is_valid_to_delete(inode)) { 990 if (!ocfs2_inode_is_valid_to_delete(inode)) {
975 /* It's probably not necessary to truncate_inode_pages 991 /* It's probably not necessary to truncate_inode_pages
976 * here but we do it for safety anyway (it will most 992 * here but we do it for safety anyway (it will most
@@ -1087,6 +1103,8 @@ void ocfs2_clear_inode(struct inode *inode)
1087 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, 1103 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
1088 "Inode=%lu\n", inode->i_ino); 1104 "Inode=%lu\n", inode->i_ino);
1089 1105
1106 dquot_drop(inode);
1107
 1090 /* To prevent remote deletes we hold open lock before, now it 1108
1091 * is time to unlock PR and EX open locks. */ 1109 * is time to unlock PR and EX open locks. */
1092 ocfs2_open_unlock(inode); 1110 ocfs2_open_unlock(inode);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 31fbb0619510..7d9d9c132cef 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,6 +7,7 @@
7 7
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/compat.h>
10 11
11#define MLOG_MASK_PREFIX ML_INODE 12#define MLOG_MASK_PREFIX ML_INODE
12#include <cluster/masklog.h> 13#include <cluster/masklog.h>
@@ -181,6 +182,10 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
181#ifdef CONFIG_COMPAT 182#ifdef CONFIG_COMPAT
182long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) 183long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
183{ 184{
185 bool preserve;
186 struct reflink_arguments args;
187 struct inode *inode = file->f_path.dentry->d_inode;
188
184 switch (cmd) { 189 switch (cmd) {
185 case OCFS2_IOC32_GETFLAGS: 190 case OCFS2_IOC32_GETFLAGS:
186 cmd = OCFS2_IOC_GETFLAGS; 191 cmd = OCFS2_IOC_GETFLAGS;
@@ -195,8 +200,15 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
195 case OCFS2_IOC_GROUP_EXTEND: 200 case OCFS2_IOC_GROUP_EXTEND:
196 case OCFS2_IOC_GROUP_ADD: 201 case OCFS2_IOC_GROUP_ADD:
197 case OCFS2_IOC_GROUP_ADD64: 202 case OCFS2_IOC_GROUP_ADD64:
198 case OCFS2_IOC_REFLINK:
199 break; 203 break;
204 case OCFS2_IOC_REFLINK:
205 if (copy_from_user(&args, (struct reflink_arguments *)arg,
206 sizeof(args)))
207 return -EFAULT;
208 preserve = (args.preserve != 0);
209
210 return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
211 compat_ptr(args.new_path), preserve);
200 default: 212 default:
201 return -ENOIOCTLCMD; 213 return -ENOIOCTLCMD;
202 } 214 }
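OCFS2_IOC_REFLINK can no longer share the pass-through path because struct reflink_arguments embeds user pointers: a 32-bit caller stores 32-bit values that the 64-bit kernel must widen with compat_ptr() before calling the native handler. The pattern, as applied above:

    /* sketch of the compat thunk for pointer-carrying ioctls */
    if (copy_from_user(&args, (struct reflink_arguments *)arg, sizeof(args)))
            return -EFAULT;
    preserve = (args.preserve != 0);

    /* compat_ptr() converts each 32-bit user pointer to a full one */
    return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
                               compat_ptr(args.new_path), preserve);

The remaining commands stay in the fall-through list because their arguments are flat integers with identical 32/64-bit layouts.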
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index cf9a5ee30fef..0cd5323bd3f0 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -7,10 +7,10 @@
7 * 7 *
8 */ 8 */
9 9
10#ifndef OCFS2_IOCTL_H 10#ifndef OCFS2_IOCTL_PROTO_H
11#define OCFS2_IOCTL_H 11#define OCFS2_IOCTL_PROTO_H
12 12
13long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); 13long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
14long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); 14long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
15 15
16#endif /* OCFS2_IOCTL_H */ 16#endif /* OCFS2_IOCTL_PROTO_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 54c16b66327e..9336c60e3a36 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -659,7 +659,7 @@ static int __ocfs2_journal_access(handle_t *handle,
659 659
660 default: 660 default:
661 status = -EINVAL; 661 status = -EINVAL;
662 mlog(ML_ERROR, "Uknown access type!\n"); 662 mlog(ML_ERROR, "Unknown access type!\n");
663 } 663 }
664 if (!status && ocfs2_meta_ecc(osb) && triggers) 664 if (!status && ocfs2_meta_ecc(osb) && triggers)
665 jbd2_journal_set_triggers(bh, &triggers->ot_triggers); 665 jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
@@ -2034,7 +2034,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
2034 status = -ENOENT; 2034 status = -ENOENT;
2035 mlog_errno(status); 2035 mlog_errno(status);
2036 return status; 2036 return status;
2037 } 2037 }
2038 2038
2039 mutex_lock(&orphan_dir_inode->i_mutex); 2039 mutex_lock(&orphan_dir_inode->i_mutex);
2040 status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0); 2040 status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ac10f83edb95..c983715d8d8c 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -476,7 +476,7 @@ out_mutex:
476 476
477out: 477out:
478 if (!status) 478 if (!status)
479 ocfs2_init_inode_steal_slot(osb); 479 ocfs2_init_steal_slots(osb);
480 mlog_exit(status); 480 mlog_exit(status);
481 return status; 481 return status;
482} 482}
@@ -872,8 +872,10 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
872 (unsigned long long)la_start_blk, 872 (unsigned long long)la_start_blk,
873 (unsigned long long)blkno); 873 (unsigned long long)blkno);
874 874
875 status = ocfs2_free_clusters(handle, main_bm_inode, 875 status = ocfs2_release_clusters(handle,
876 main_bm_bh, blkno, count); 876 main_bm_inode,
877 main_bm_bh, blkno,
878 count);
877 if (status < 0) { 879 if (status < 0) {
878 mlog_errno(status); 880 mlog_errno(status);
879 goto bail; 881 goto bail;
@@ -984,8 +986,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
984 } 986 }
985 987
986retry_enospc: 988retry_enospc:
987 (*ac)->ac_bits_wanted = osb->local_alloc_bits; 989 (*ac)->ac_bits_wanted = osb->local_alloc_default_bits;
988
989 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); 990 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
990 if (status == -ENOSPC) { 991 if (status == -ENOSPC) {
991 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == 992 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1061,6 +1062,7 @@ retry_enospc:
1061 OCFS2_LA_DISABLED) 1062 OCFS2_LA_DISABLED)
1062 goto bail; 1063 goto bail;
1063 1064
1065 ac->ac_bits_wanted = osb->local_alloc_default_bits;
1064 status = ocfs2_claim_clusters(osb, handle, ac, 1066 status = ocfs2_claim_clusters(osb, handle, ac,
1065 osb->local_alloc_bits, 1067 osb->local_alloc_bits,
1066 &cluster_off, 1068 &cluster_off,
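The localalloc.c changes target the ENOSPC retry path: ocfs2_recalc_la_window() may shrink the local alloc window, but the allocation context still advertised the stale ac_bits_wanted, so each attempt now resets it to the default window size before reserving or claiming again. Condensed (a sketch; cluster_count is assumed, since the hunk is cut off before that argument):

    retry_enospc:
            ac->ac_bits_wanted = osb->local_alloc_default_bits;  /* refresh per attempt */
            status = ocfs2_claim_clusters(osb, handle, ac,
                                          osb->local_alloc_bits,
                                          &cluster_off, &cluster_count);
            if (status == -ENOSPC &&
                ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) !=
                OCFS2_LA_DISABLED)
                    goto retry_enospc;      /* window shrunk; try again */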
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 544ac6245175..b5cb3ede9408 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -133,7 +133,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
133 133
134 if (!(fl->fl_flags & FL_POSIX)) 134 if (!(fl->fl_flags & FL_POSIX))
135 return -ENOLCK; 135 return -ENOLCK;
136 if (__mandatory_lock(inode)) 136 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
137 return -ENOLCK; 137 return -ENOLCK;
138 138
139 return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); 139 return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
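The one-line locks.c fix closes an unlock trap: with mandatory locking in effect on an inode (its mode can change after a lock was acquired), ocfs2_lock() rejected every request with -ENOLCK, including F_UNLCK, leaving a holder unable to release its own lock. Unlock requests must always pass through:

    if (!(fl->fl_flags & FL_POSIX))
            return -ENOLCK;
    /* mandatory locking is unsupported, but never refuse an unlock */
    if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
            return -ENOLCK;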
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 39737613424a..7898bd3a99f5 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/uio.h> 30#include <linux/uio.h>
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index f010b22b1c44..b1eb50ae4097 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -84,7 +84,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
84static int ocfs2_orphan_add(struct ocfs2_super *osb, 84static int ocfs2_orphan_add(struct ocfs2_super *osb,
85 handle_t *handle, 85 handle_t *handle,
86 struct inode *inode, 86 struct inode *inode,
87 struct ocfs2_dinode *fe, 87 struct buffer_head *fe_bh,
88 char *name, 88 char *name,
89 struct ocfs2_dir_lookup_result *lookup, 89 struct ocfs2_dir_lookup_result *lookup,
90 struct inode *orphan_dir_inode); 90 struct inode *orphan_dir_inode);
@@ -212,7 +212,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
212 } else 212 } else
213 inode->i_gid = current_fsgid(); 213 inode->i_gid = current_fsgid();
214 inode->i_mode = mode; 214 inode->i_mode = mode;
215 vfs_dq_init(inode); 215 dquot_initialize(inode);
216 return inode; 216 return inode;
217} 217}
218 218
@@ -244,6 +244,8 @@ static int ocfs2_mknod(struct inode *dir,
244 (unsigned long)dev, dentry->d_name.len, 244 (unsigned long)dev, dentry->d_name.len,
245 dentry->d_name.name); 245 dentry->d_name.name);
246 246
247 dquot_initialize(dir);
248
247 /* get our super block */ 249 /* get our super block */
248 osb = OCFS2_SB(dir->i_sb); 250 osb = OCFS2_SB(dir->i_sb);
249 251
@@ -348,13 +350,9 @@ static int ocfs2_mknod(struct inode *dir,
348 goto leave; 350 goto leave;
349 } 351 }
350 352
351 /* We don't use standard VFS wrapper because we don't want vfs_dq_init 353 status = dquot_alloc_inode(inode);
352 * to be called. */ 354 if (status)
353 if (sb_any_quota_active(osb->sb) &&
354 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
355 status = -EDQUOT;
356 goto leave; 355 goto leave;
357 }
358 did_quota_inode = 1; 356 did_quota_inode = 1;
359 357
360 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, 358 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
@@ -431,7 +429,7 @@ static int ocfs2_mknod(struct inode *dir,
431 status = 0; 429 status = 0;
432leave: 430leave:
433 if (status < 0 && did_quota_inode) 431 if (status < 0 && did_quota_inode)
434 vfs_dq_free_inode(inode); 432 dquot_free_inode(inode);
435 if (handle) 433 if (handle)
436 ocfs2_commit_trans(osb, handle); 434 ocfs2_commit_trans(osb, handle);
437 435
@@ -636,6 +634,8 @@ static int ocfs2_link(struct dentry *old_dentry,
636 if (S_ISDIR(inode->i_mode)) 634 if (S_ISDIR(inode->i_mode))
637 return -EPERM; 635 return -EPERM;
638 636
637 dquot_initialize(dir);
638
639 err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT); 639 err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
640 if (err < 0) { 640 if (err < 0) {
641 if (err != -ENOENT) 641 if (err != -ENOENT)
@@ -791,6 +791,8 @@ static int ocfs2_unlink(struct inode *dir,
791 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, 791 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
792 dentry->d_name.len, dentry->d_name.name); 792 dentry->d_name.len, dentry->d_name.name);
793 793
794 dquot_initialize(dir);
795
794 BUG_ON(dentry->d_parent->d_inode != dir); 796 BUG_ON(dentry->d_parent->d_inode != dir);
795 797
796 mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); 798 mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -877,7 +879,7 @@ static int ocfs2_unlink(struct inode *dir,
877 fe = (struct ocfs2_dinode *) fe_bh->b_data; 879 fe = (struct ocfs2_dinode *) fe_bh->b_data;
878 880
879 if (inode_is_unlinkable(inode)) { 881 if (inode_is_unlinkable(inode)) {
880 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, 882 status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
881 &orphan_insert, orphan_dir); 883 &orphan_insert, orphan_dir);
882 if (status < 0) { 884 if (status < 0) {
883 mlog_errno(status); 885 mlog_errno(status);
@@ -1051,6 +1053,9 @@ static int ocfs2_rename(struct inode *old_dir,
1051 old_dentry->d_name.len, old_dentry->d_name.name, 1053 old_dentry->d_name.len, old_dentry->d_name.name,
1052 new_dentry->d_name.len, new_dentry->d_name.name); 1054 new_dentry->d_name.len, new_dentry->d_name.name);
1053 1055
1056 dquot_initialize(old_dir);
1057 dquot_initialize(new_dir);
1058
1054 osb = OCFS2_SB(old_dir->i_sb); 1059 osb = OCFS2_SB(old_dir->i_sb);
1055 1060
1056 if (new_inode) { 1061 if (new_inode) {
@@ -1295,7 +1300,7 @@ static int ocfs2_rename(struct inode *old_dir,
1295 if (S_ISDIR(new_inode->i_mode) || 1300 if (S_ISDIR(new_inode->i_mode) ||
1296 (ocfs2_read_links_count(newfe) == 1)) { 1301 (ocfs2_read_links_count(newfe) == 1)) {
1297 status = ocfs2_orphan_add(osb, handle, new_inode, 1302 status = ocfs2_orphan_add(osb, handle, new_inode,
1298 newfe, orphan_name, 1303 newfe_bh, orphan_name,
1299 &orphan_insert, orphan_dir); 1304 &orphan_insert, orphan_dir);
1300 if (status < 0) { 1305 if (status < 0) {
1301 mlog_errno(status); 1306 mlog_errno(status);
@@ -1599,6 +1604,8 @@ static int ocfs2_symlink(struct inode *dir,
1599 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1604 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1600 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1605 dentry, symname, dentry->d_name.len, dentry->d_name.name);
1601 1606
1607 dquot_initialize(dir);
1608
1602 sb = dir->i_sb; 1609 sb = dir->i_sb;
1603 osb = OCFS2_SB(sb); 1610 osb = OCFS2_SB(sb);
1604 1611
@@ -1688,13 +1695,9 @@ static int ocfs2_symlink(struct inode *dir,
1688 goto bail; 1695 goto bail;
1689 } 1696 }
1690 1697
1691 /* We don't use standard VFS wrapper because we don't want vfs_dq_init 1698 status = dquot_alloc_inode(inode);
1692 * to be called. */ 1699 if (status)
1693 if (sb_any_quota_active(osb->sb) &&
1694 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
1695 status = -EDQUOT;
1696 goto bail; 1700 goto bail;
1697 }
1698 did_quota_inode = 1; 1701 did_quota_inode = 1;
1699 1702
1700 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, 1703 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry,
@@ -1716,11 +1719,10 @@ static int ocfs2_symlink(struct inode *dir,
1716 u32 offset = 0; 1719 u32 offset = 0;
1717 1720
1718 inode->i_op = &ocfs2_symlink_inode_operations; 1721 inode->i_op = &ocfs2_symlink_inode_operations;
1719 if (vfs_dq_alloc_space_nodirty(inode, 1722 status = dquot_alloc_space_nodirty(inode,
1720 ocfs2_clusters_to_bytes(osb->sb, 1))) { 1723 ocfs2_clusters_to_bytes(osb->sb, 1));
1721 status = -EDQUOT; 1724 if (status)
1722 goto bail; 1725 goto bail;
1723 }
1724 did_quota = 1; 1726 did_quota = 1;
1725 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, 1727 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
1726 new_fe_bh, 1728 new_fe_bh,
@@ -1788,10 +1790,10 @@ static int ocfs2_symlink(struct inode *dir,
1788 d_instantiate(dentry, inode); 1790 d_instantiate(dentry, inode);
1789bail: 1791bail:
1790 if (status < 0 && did_quota) 1792 if (status < 0 && did_quota)
1791 vfs_dq_free_space_nodirty(inode, 1793 dquot_free_space_nodirty(inode,
1792 ocfs2_clusters_to_bytes(osb->sb, 1)); 1794 ocfs2_clusters_to_bytes(osb->sb, 1));
1793 if (status < 0 && did_quota_inode) 1795 if (status < 0 && did_quota_inode)
1794 vfs_dq_free_inode(inode); 1796 dquot_free_inode(inode);
1795 if (handle) 1797 if (handle)
1796 ocfs2_commit_trans(osb, handle); 1798 ocfs2_commit_trans(osb, handle);
1797 1799
@@ -1909,7 +1911,7 @@ leave:
1909static int ocfs2_orphan_add(struct ocfs2_super *osb, 1911static int ocfs2_orphan_add(struct ocfs2_super *osb,
1910 handle_t *handle, 1912 handle_t *handle,
1911 struct inode *inode, 1913 struct inode *inode,
1912 struct ocfs2_dinode *fe, 1914 struct buffer_head *fe_bh,
1913 char *name, 1915 char *name,
1914 struct ocfs2_dir_lookup_result *lookup, 1916 struct ocfs2_dir_lookup_result *lookup,
1915 struct inode *orphan_dir_inode) 1917 struct inode *orphan_dir_inode)
@@ -1917,6 +1919,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1917 struct buffer_head *orphan_dir_bh = NULL; 1919 struct buffer_head *orphan_dir_bh = NULL;
1918 int status = 0; 1920 int status = 0;
1919 struct ocfs2_dinode *orphan_fe; 1921 struct ocfs2_dinode *orphan_fe;
1922 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1920 1923
1921 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 1924 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
1922 1925
@@ -1957,6 +1960,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1957 goto leave; 1960 goto leave;
1958 } 1961 }
1959 1962
1963 /*
1964 * We're going to journal the change of i_flags and i_orphaned_slot.
 1965 * It's safe even though some callers may duplicate the journaling;
 1966 * journaling within this function simply makes the logic more
 1967 * straightforward.
1968 */
1969 status = ocfs2_journal_access_di(handle,
1970 INODE_CACHE(inode),
1971 fe_bh,
1972 OCFS2_JOURNAL_ACCESS_WRITE);
1973 if (status < 0) {
1974 mlog_errno(status);
1975 goto leave;
1976 }
1977
1960 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); 1978 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
1961 1979
1962 /* Record which orphan dir our inode now resides 1980 /* Record which orphan dir our inode now resides
@@ -1964,6 +1982,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1964 * dir to lock. */ 1982 * dir to lock. */
1965 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); 1983 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
1966 1984
1985 ocfs2_journal_dirty(handle, fe_bh);
1986
1967 mlog(0, "Inode %llu orphaned in slot %d\n", 1987 mlog(0, "Inode %llu orphaned in slot %d\n",
1968 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); 1988 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
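ocfs2_orphan_add() used to modify fe->i_flags and fe->i_orphaned_slot without declaring the buffer to the journal, which is the real reason its signature changes from a bare dinode to the buffer_head. What the new code applies is the standard jbd2 three-step discipline:

    status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
                                     OCFS2_JOURNAL_ACCESS_WRITE);
    if (status < 0)
            goto leave;        /* buffer not claimed: do not touch it */

    le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);      /* modify in memory */
    fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);

    ocfs2_journal_dirty(handle, fe_bh);                 /* tie it to the handle */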
1969 1989
@@ -2099,15 +2119,12 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2099 goto leave; 2119 goto leave;
2100 } 2120 }
2101 2121
2102 /* We don't use standard VFS wrapper because we don't want vfs_dq_init 2122 status = dquot_alloc_inode(inode);
2103 * to be called. */ 2123 if (status)
2104 if (sb_any_quota_active(osb->sb) &&
2105 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
2106 status = -EDQUOT;
2107 goto leave; 2124 goto leave;
2108 }
2109 did_quota_inode = 1; 2125 did_quota_inode = 1;
2110 2126
2127 inode->i_nlink = 0;
2111 /* do the real work now. */ 2128 /* do the real work now. */
2112 status = ocfs2_mknod_locked(osb, dir, inode, 2129 status = ocfs2_mknod_locked(osb, dir, inode,
2113 0, &new_di_bh, parent_di_bh, handle, 2130 0, &new_di_bh, parent_di_bh, handle,
@@ -2124,7 +2141,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2124 } 2141 }
2125 2142
2126 di = (struct ocfs2_dinode *)new_di_bh->b_data; 2143 di = (struct ocfs2_dinode *)new_di_bh->b_data;
2127 status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name, 2144 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
2128 &orphan_insert, orphan_dir); 2145 &orphan_insert, orphan_dir);
2129 if (status < 0) { 2146 if (status < 0) {
2130 mlog_errno(status); 2147 mlog_errno(status);
@@ -2136,9 +2153,10 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2136 if (status < 0) 2153 if (status < 0)
2137 mlog_errno(status); 2154 mlog_errno(status);
2138 2155
2156 insert_inode_hash(inode);
2139leave: 2157leave:
2140 if (status < 0 && did_quota_inode) 2158 if (status < 0 && did_quota_inode)
2141 vfs_dq_free_inode(inode); 2159 dquot_free_inode(inode);
2142 if (handle) 2160 if (handle)
2143 ocfs2_commit_trans(osb, handle); 2161 ocfs2_commit_trans(osb, handle);
2144 2162
@@ -2267,6 +2285,8 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2267 di = (struct ocfs2_dinode *)di_bh->b_data; 2285 di = (struct ocfs2_dinode *)di_bh->b_data;
2268 le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); 2286 le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL);
2269 di->i_orphaned_slot = 0; 2287 di->i_orphaned_slot = 0;
2288 inode->i_nlink = 1;
2289 ocfs2_set_links_count(di, inode->i_nlink);
2270 ocfs2_journal_dirty(handle, di_bh); 2290 ocfs2_journal_dirty(handle, di_bh);
2271 2291
2272 status = ocfs2_add_entry(handle, dentry, inode, 2292 status = ocfs2_add_entry(handle, dentry, inode,
@@ -2284,7 +2304,6 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2284 goto out_commit; 2304 goto out_commit;
2285 } 2305 }
2286 2306
2287 insert_inode_hash(inode);
2288 dentry->d_op = &ocfs2_dentry_ops; 2307 dentry->d_op = &ocfs2_dentry_ops;
2289 d_instantiate(dentry, inode); 2308 d_instantiate(dentry, inode);
2290 status = 0; 2309 status = 0;
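Taken together, the namei.c nlink edits give an orphaned reflink target an honest link count for its whole lifecycle: i_nlink is 0 while the inode lives only in the orphan dir, and becomes 1 (mirrored to disk via ocfs2_set_links_count()) only once it gains a real directory entry; insert_inode_hash() moves earlier for the same reason, so the inode is reachable as soon as it exists. Schematically (a sketch, details elided):

    /* creation into the orphan dir */
    inode->i_nlink = 0;                    /* no real directory entry yet */
    /* ... allocate the dinode, add the orphan dir entry ... */
    insert_inode_hash(inode);              /* now visible to iget/igrab */

    /* later: moving it out to a real name */
    inode->i_nlink = 1;
    ocfs2_set_links_count(di, inode->i_nlink);   /* keep the dinode in sync */
    ocfs2_journal_dirty(handle, di_bh);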
@@ -2326,4 +2345,5 @@ const struct inode_operations ocfs2_dir_iops = {
2326 .getxattr = generic_getxattr, 2345 .getxattr = generic_getxattr,
2327 .listxattr = ocfs2_listxattr, 2346 .listxattr = ocfs2_listxattr,
2328 .removexattr = generic_removexattr, 2347 .removexattr = generic_removexattr,
2348 .fiemap = ocfs2_fiemap,
2329}; 2349};
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d963d8638709..adf5e2ebc2c4 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -42,6 +42,7 @@
42 42
43#include "ocfs2_fs.h" 43#include "ocfs2_fs.h"
44#include "ocfs2_lockid.h" 44#include "ocfs2_lockid.h"
45#include "ocfs2_ioctl.h"
45 46
46/* For struct ocfs2_blockcheck_stats */ 47/* For struct ocfs2_blockcheck_stats */
47#include "blockcheck.h" 48#include "blockcheck.h"
@@ -136,6 +137,10 @@ enum ocfs2_unlock_action {
136#define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a 137#define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a
137 call to dlm_lock. Only 138 call to dlm_lock. Only
138 exists with BUSY set. */ 139 exists with BUSY set. */
140#define OCFS2_LOCK_UPCONVERT_FINISHING (0x00000800) /* blocks the dc thread
141 * from downconverting
142 * before the upconvert
143 * has completed */
139 144
140struct ocfs2_lock_res_ops; 145struct ocfs2_lock_res_ops;
141 146
@@ -155,7 +160,7 @@ struct ocfs2_lock_res {
155 int l_level; 160 int l_level;
156 unsigned int l_ro_holders; 161 unsigned int l_ro_holders;
157 unsigned int l_ex_holders; 162 unsigned int l_ex_holders;
158 union ocfs2_dlm_lksb l_lksb; 163 struct ocfs2_dlm_lksb l_lksb;
159 164
160 /* used from AST/BAST funcs. */ 165 /* used from AST/BAST funcs. */
161 enum ocfs2_ast_action l_action; 166 enum ocfs2_ast_action l_action;
@@ -245,9 +250,11 @@ enum ocfs2_mount_options
245 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ 250 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
246 OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ 251 OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
247 OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ 252 OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */
248 OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */ 253 OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* Force POSIX access control lists */
249 OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */ 254 OCFS2_MOUNT_NO_POSIX_ACL = 1 << 9, /* Disable POSIX access
250 OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */ 255 control lists */
256 OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
257 OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
251}; 258};
252 259
253#define OCFS2_OSB_SOFT_RO 0x0001 260#define OCFS2_OSB_SOFT_RO 0x0001
@@ -299,7 +306,9 @@ struct ocfs2_super
299 u32 s_next_generation; 306 u32 s_next_generation;
300 unsigned long osb_flags; 307 unsigned long osb_flags;
301 s16 s_inode_steal_slot; 308 s16 s_inode_steal_slot;
309 s16 s_meta_steal_slot;
302 atomic_t s_num_inodes_stolen; 310 atomic_t s_num_inodes_stolen;
311 atomic_t s_num_meta_stolen;
303 312
304 unsigned long s_mount_opt; 313 unsigned long s_mount_opt;
305 unsigned int s_atime_quantum; 314 unsigned int s_atime_quantum;
@@ -754,35 +763,18 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
754 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); 763 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
755} 764}
756 765
757static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) 766static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
758{ 767{
759 spin_lock(&osb->osb_lock); 768 ext2_set_bit(bit, bitmap);
760 osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
761 spin_unlock(&osb->osb_lock);
762 atomic_set(&osb->s_num_inodes_stolen, 0);
763} 769}
770#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
764 771
765static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb, 772static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
766 s16 slot)
767{ 773{
768 spin_lock(&osb->osb_lock); 774 ext2_clear_bit(bit, bitmap);
769 osb->s_inode_steal_slot = slot;
770 spin_unlock(&osb->osb_lock);
771}
772
773static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
774{
775 s16 slot;
776
777 spin_lock(&osb->osb_lock);
778 slot = osb->s_inode_steal_slot;
779 spin_unlock(&osb->osb_lock);
780
781 return slot;
782} 775}
776#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
783 777
784#define ocfs2_set_bit ext2_set_bit
785#define ocfs2_clear_bit ext2_clear_bit
786#define ocfs2_test_bit ext2_test_bit 778#define ocfs2_test_bit ext2_test_bit
787#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit 779#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
788#define ocfs2_find_next_bit ext2_find_next_bit 780#define ocfs2_find_next_bit ext2_find_next_bit
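
The _ocfs2_set_bit()/_ocfs2_clear_bit() wrappers above exist so the unavoidable (unsigned long *) cast happens in exactly one place: ocfs2 bitmaps live in on-disk structures typed as byte arrays, while ext2_set_bit()/ext2_clear_bit() take word pointers. A hedged sketch of a call site (the quota chunk fields are real, the helper itself is hypothetical):

    /* dqc_bitmap is a __u8 array in struct ocfs2_local_disk_chunk; the
     * ocfs2_set_bit() macro's cast adapts it to ext2_set_bit() for us. */
    static void example_mark_entry_used(struct ocfs2_local_disk_chunk *dchunk,
                                        unsigned int entry)
    {
            ocfs2_set_bit(entry, dchunk->dqc_bitmap);
            le32_add_cpu(&dchunk->dqc_free, -1);   /* one fewer free slot */
    }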
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index e9431e4a5e7c..bb37218a7978 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -254,63 +254,6 @@
254 * refcount tree */ 254 * refcount tree */
255 255
256/* 256/*
257 * ioctl commands
258 */
259#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
260#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long)
261#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int)
262#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
263
264/*
265 * Space reservation / allocation / free ioctls and argument structure
266 * are designed to be compatible with XFS.
267 *
268 * ALLOCSP* and FREESP* are not and will never be supported, but are
269 * included here for completeness.
270 */
271struct ocfs2_space_resv {
272 __s16 l_type;
273 __s16 l_whence;
274 __s64 l_start;
275 __s64 l_len; /* len == 0 means until end of file */
276 __s32 l_sysid;
277 __u32 l_pid;
278 __s32 l_pad[4]; /* reserve area */
279};
280
281#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
282#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
283#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
284#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
285#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
286#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
287#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
288#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
289
290/* Used to pass group descriptor data when online resize is done */
291struct ocfs2_new_group_input {
292 __u64 group; /* Group descriptor's blkno. */
293 __u32 clusters; /* Total number of clusters in this group */
294 __u32 frees; /* Total free clusters in this group */
295 __u16 chain; /* Chain for this group */
296 __u16 reserved1;
297 __u32 reserved2;
298};
299
300#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
301#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
302#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
303
304/* Used to pass 2 file names to reflink. */
305struct reflink_arguments {
306 __u64 old_path;
307 __u64 new_path;
308 __u64 preserve;
309};
310#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
311
312
313/*
314 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 257 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
315 */ 258 */
316#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ 259#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
@@ -1202,7 +1145,7 @@ struct ocfs2_local_disk_dqinfo {
1202/* Header of one chunk of a quota file */ 1145/* Header of one chunk of a quota file */
1203struct ocfs2_local_disk_chunk { 1146struct ocfs2_local_disk_chunk {
1204 __le32 dqc_free; /* Number of free entries in the bitmap */ 1147 __le32 dqc_free; /* Number of free entries in the bitmap */
1205 u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding 1148 __u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding
1206 * chunk of quota file */ 1149 * chunk of quota file */
1207}; 1150};
1208 1151
@@ -1417,9 +1360,16 @@ static inline int ocfs2_fast_symlink_chars(int blocksize)
1417 return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); 1360 return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
1418} 1361}
1419 1362
1420static inline int ocfs2_max_inline_data(int blocksize) 1363static inline int ocfs2_max_inline_data_with_xattr(int blocksize,
1364 struct ocfs2_dinode *di)
1421{ 1365{
1422 return blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data); 1366 if (di && (di->i_dyn_features & OCFS2_INLINE_XATTR_FL))
1367 return blocksize -
1368 offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
1369 di->i_xattr_inline_size;
1370 else
1371 return blocksize -
1372 offsetof(struct ocfs2_dinode, id2.i_data.id_data);
1423} 1373}
1424 1374
1425static inline int ocfs2_extent_recs_per_inode(int blocksize) 1375static inline int ocfs2_extent_recs_per_inode(int blocksize)
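
To see what ocfs2_max_inline_data_with_xattr() computes, here is a standalone arithmetic sketch. The layout and sizes are invented for illustration (the real offsets come from struct ocfs2_dinode); only the subtraction pattern matches the helper above:

    #include <stddef.h>
    #include <stdio.h>

    /* Toy stand-in for struct ocfs2_dinode; 192 bytes of fixed fields
     * before the inline-data area is an assumption, not the real value. */
    struct toy_dinode {
            char header[192];
            char id_data[];      /* inline file data starts here */
    };

    int main(void)
    {
            int blocksize = 4096;
            int xattr_inline_size = 256;  /* assumed OCFS2_INLINE_XATTR_FL case */
            int plain = blocksize - (int)offsetof(struct toy_dinode, id_data);
            int with_xattr = plain - xattr_inline_size;

            /* prints 3904 and 3648 with the toy numbers above */
            printf("inline data: %d bytes (%d with inline xattrs)\n",
                   plain, with_xattr);
            return 0;
    }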
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
new file mode 100644
index 000000000000..2d3420af1a83
--- /dev/null
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -0,0 +1,79 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_ioctl.h
5 *
6 * Defines OCFS2 ioctls.
7 *
8 * Copyright (C) 2010 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_IOCTL_H
21#define OCFS2_IOCTL_H
22
23/*
24 * ioctl commands
25 */
26#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
27#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long)
28#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int)
29#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
30
31/*
32 * Space reservation / allocation / free ioctls and argument structure
33 * are designed to be compatible with XFS.
34 *
35 * ALLOCSP* and FREESP* are not and will never be supported, but are
36 * included here for completeness.
37 */
38struct ocfs2_space_resv {
39 __s16 l_type;
40 __s16 l_whence;
41 __s64 l_start;
42 __s64 l_len; /* len == 0 means until end of file */
43 __s32 l_sysid;
44 __u32 l_pid;
45 __s32 l_pad[4]; /* reserve area */
46};
47
48#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
49#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
50#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
51#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
52#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
53#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
54#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
55#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
56
57/* Used to pass group descriptor data when online resize is done */
58struct ocfs2_new_group_input {
59 __u64 group; /* Group descriptor's blkno. */
60 __u32 clusters; /* Total number of clusters in this group */
61 __u32 frees; /* Total free clusters in this group */
62 __u16 chain; /* Chain for this group */
63 __u16 reserved1;
64 __u32 reserved2;
65};
66
67#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
68#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
69#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
70
71/* Used to pass 2 file names to reflink. */
72struct reflink_arguments {
73 __u64 old_path;
74 __u64 new_path;
75 __u64 preserve;
76};
77#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
78
79#endif /* OCFS2_IOCTL_H */
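
Since these definitions are now exported in their own header, a userspace program can drive them directly. A minimal, hedged sketch (error handling trimmed; the RESVSP semantics follow the XFS-compatible layout described above):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include "ocfs2_ioctl.h"

    int main(int argc, char **argv)
    {
            long flags;
            struct ocfs2_space_resv sr;
            int fd;

            if (argc < 2)
                    return 1;
            fd = open(argv[1], O_RDWR);
            if (fd < 0)
                    return 1;

            if (ioctl(fd, OCFS2_IOC_GETFLAGS, &flags) == 0)
                    printf("inode flags: %#lx\n", flags);

            /* reserve 1MB at offset 0 without changing i_size */
            memset(&sr, 0, sizeof(sr));
            sr.l_whence = 0;              /* offsets relative to file start */
            sr.l_start = 0;
            sr.l_len = 1024 * 1024;
            if (ioctl(fd, OCFS2_IOC_RESVSP64, &sr) != 0)
                    perror("OCFS2_IOC_RESVSP64");

            close(fd);
            return 0;
    }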
diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h
index 82d5eeac0fff..2e45c8d2ea7e 100644
--- a/fs/ocfs2/ocfs2_lockingver.h
+++ b/fs/ocfs2/ocfs2_lockingver.h
@@ -23,6 +23,8 @@
23/* 23/*
24 * The protocol version for ocfs2 cluster locking. See dlmglue.c for 24 * The protocol version for ocfs2 cluster locking. See dlmglue.c for
25 * more details. 25 * more details.
26 *
27 * 1.0 - Initial locking version from ocfs2 1.4.
26 */ 28 */
27#define OCFS2_LOCKING_PROTOCOL_MAJOR 1 29#define OCFS2_LOCKING_PROTOCOL_MAJOR 1
28#define OCFS2_LOCKING_PROTOCOL_MINOR 0 30#define OCFS2_LOCKING_PROTOCOL_MINOR 0
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index e5df9d170b0c..123bc520a2c0 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -17,10 +17,6 @@
17 17
18#include "ocfs2.h" 18#include "ocfs2.h"
19 19
20/* Common stuff */
21/* id number of quota format */
22#define QFMT_OCFS2 3
23
24/* 20/*
25 * In-memory structures 21 * In-memory structures
26 */ 22 */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index b437dc0c4cad..ab42a74c7539 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -3,6 +3,7 @@
3 */ 3 */
4#include <linux/spinlock.h> 4#include <linux/spinlock.h>
5#include <linux/fs.h> 5#include <linux/fs.h>
6#include <linux/slab.h>
6#include <linux/quota.h> 7#include <linux/quota.h>
7#include <linux/quotaops.h> 8#include <linux/quotaops.h>
8#include <linux/dqblk_qtree.h> 9#include <linux/dqblk_qtree.h>
@@ -851,13 +852,6 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
851} 852}
852 853
853const struct dquot_operations ocfs2_quota_operations = { 854const struct dquot_operations ocfs2_quota_operations = {
854 .initialize = dquot_initialize,
855 .drop = dquot_drop,
856 .alloc_space = dquot_alloc_space,
857 .alloc_inode = dquot_alloc_inode,
858 .free_space = dquot_free_space,
859 .free_inode = dquot_free_inode,
860 .transfer = dquot_transfer,
861 .write_dquot = ocfs2_write_dquot, 855 .write_dquot = ocfs2_write_dquot,
862 .acquire_dquot = ocfs2_acquire_dquot, 856 .acquire_dquot = ocfs2_acquire_dquot,
863 .release_dquot = ocfs2_release_dquot, 857 .release_dquot = ocfs2_release_dquot,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 1a2c50a759fa..9ad49305f450 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -3,6 +3,7 @@
3 */ 3 */
4 4
5#include <linux/fs.h> 5#include <linux/fs.h>
6#include <linux/slab.h>
6#include <linux/quota.h> 7#include <linux/quota.h>
7#include <linux/quotaops.h> 8#include <linux/quotaops.h>
8#include <linux/module.h> 9#include <linux/module.h>
@@ -457,7 +458,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
457 break; 458 break;
458 } 459 }
459 dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; 460 dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
460 for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) { 461 for_each_set_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
461 qbh = NULL; 462 qbh = NULL;
462 status = ocfs2_read_quota_block(lqinode, 463 status = ocfs2_read_quota_block(lqinode,
463 ol_dqblk_block(sb, chunk, bit), 464 ol_dqblk_block(sb, chunk, bit),
@@ -1325,7 +1326,7 @@ out:
1325 return status; 1326 return status;
1326} 1327}
1327 1328
1328static struct quota_format_ops ocfs2_format_ops = { 1329static const struct quota_format_ops ocfs2_format_ops = {
1329 .check_quota_file = ocfs2_local_check_quota_file, 1330 .check_quota_file = ocfs2_local_check_quota_file,
1330 .read_file_info = ocfs2_local_read_info, 1331 .read_file_info = ocfs2_local_read_info,
1331 .write_file_info = ocfs2_global_write_info, 1332 .write_file_info = ocfs2_global_write_info,
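
The for_each_bit to for_each_set_bit change above is a tree-wide rename; the arguments and semantics are identical. A kernel-context sketch (not a standalone program; handle_entry() is hypothetical):

    unsigned int bit;
    DECLARE_BITMAP(map, 128);

    bitmap_zero(map, 128);
    set_bit(0, map);
    set_bit(2, map);

    /* visits bit indexes 0 and 2, exactly as for_each_bit() used to */
    for_each_set_bit(bit, map, 128)
            handle_entry(bit);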
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3a0df7a1b810..bd96f6c7877e 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -37,7 +37,6 @@
37 37
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <linux/blkdev.h> 39#include <linux/blkdev.h>
40#include <linux/gfp.h>
41#include <linux/slab.h> 40#include <linux/slab.h>
42#include <linux/writeback.h> 41#include <linux/writeback.h>
43#include <linux/pagevec.h> 42#include <linux/pagevec.h>
@@ -276,7 +275,7 @@ static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
276 spin_unlock(&osb->osb_lock); 275 spin_unlock(&osb->osb_lock);
277} 276}
278 277
279void ocfs2_kref_remove_refcount_tree(struct kref *kref) 278static void ocfs2_kref_remove_refcount_tree(struct kref *kref)
280{ 279{
281 struct ocfs2_refcount_tree *tree = 280 struct ocfs2_refcount_tree *tree =
282 container_of(kref, struct ocfs2_refcount_tree, rf_getcnt); 281 container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
@@ -524,23 +523,6 @@ out:
524 return ret; 523 return ret;
525} 524}
526 525
527int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw,
528 struct ocfs2_refcount_tree **ret_tree,
529 struct buffer_head **ref_bh)
530{
531 int ret;
532 u64 ref_blkno;
533
534 ret = ocfs2_get_refcount_block(inode, &ref_blkno);
535 if (ret) {
536 mlog_errno(ret);
537 return ret;
538 }
539
540 return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno,
541 rw, ret_tree, ref_bh);
542}
543
544void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, 526void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
545 struct ocfs2_refcount_tree *tree, int rw) 527 struct ocfs2_refcount_tree *tree, int rw)
546{ 528{
@@ -643,7 +625,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
643 rb = (struct ocfs2_refcount_block *)new_bh->b_data; 625 rb = (struct ocfs2_refcount_block *)new_bh->b_data;
644 memset(rb, 0, inode->i_sb->s_blocksize); 626 memset(rb, 0, inode->i_sb->s_blocksize);
645 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 627 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
646 rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num); 628 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
647 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 629 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
648 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); 630 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
649 rb->rf_blkno = cpu_to_le64(first_blkno); 631 rb->rf_blkno = cpu_to_le64(first_blkno);
@@ -969,6 +951,103 @@ out:
969} 951}
970 952
971/* 953/*
954 * Find the end range for a leaf refcount block indicated by
955 * el->l_recs[index].e_blkno.
956 */
957static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
958 struct buffer_head *ref_root_bh,
959 struct ocfs2_extent_block *eb,
960 struct ocfs2_extent_list *el,
961 int index, u32 *cpos_end)
962{
963 int ret, i, subtree_root;
964 u32 cpos;
965 u64 blkno;
966 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
967 struct ocfs2_path *left_path = NULL, *right_path = NULL;
968 struct ocfs2_extent_tree et;
969 struct ocfs2_extent_list *tmp_el;
970
971 if (index < le16_to_cpu(el->l_next_free_rec) - 1) {
972 /*
973 * We have an extent rec after index, so just use the e_cpos
974 * of the next extent rec.
975 */
976 *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos);
977 return 0;
978 }
979
980 if (!eb || (eb && !eb->h_next_leaf_blk)) {
981 /*
982 * We are the last extent rec, so any high cpos should
983 * be stored in this leaf refcount block.
984 */
985 *cpos_end = UINT_MAX;
986 return 0;
987 }
988
989 /*
990 * If the extent block isn't the last one, we have to find
991 * the subtree root between this extent block and the next
992 * leaf extent block and get the corresponding e_cpos from
993 * the subtree root. Otherwise we may corrupt the b-tree.
994 */
995 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
996
997 left_path = ocfs2_new_path_from_et(&et);
998 if (!left_path) {
999 ret = -ENOMEM;
1000 mlog_errno(ret);
1001 goto out;
1002 }
1003
1004 cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos);
1005 ret = ocfs2_find_path(ci, left_path, cpos);
1006 if (ret) {
1007 mlog_errno(ret);
1008 goto out;
1009 }
1010
1011 right_path = ocfs2_new_path_from_path(left_path);
1012 if (!right_path) {
1013 ret = -ENOMEM;
1014 mlog_errno(ret);
1015 goto out;
1016 }
1017
1018 ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos);
1019 if (ret) {
1020 mlog_errno(ret);
1021 goto out;
1022 }
1023
1024 ret = ocfs2_find_path(ci, right_path, cpos);
1025 if (ret) {
1026 mlog_errno(ret);
1027 goto out;
1028 }
1029
1030 subtree_root = ocfs2_find_subtree_root(&et, left_path,
1031 right_path);
1032
1033 tmp_el = left_path->p_node[subtree_root].el;
1034 blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
1035 for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) {
1036 if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
1037 *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
1038 break;
1039 }
1040 }
1041
1042 BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec));
1043
1044out:
1045 ocfs2_free_path(left_path);
1046 ocfs2_free_path(right_path);
1047 return ret;
1048}
1049
1050/*
972 * Given a cpos and len, try to find the refcount record which contains cpos. 1051 * Given a cpos and len, try to find the refcount record which contains cpos.
973 * 1. If cpos can be found in one refcount record, return the record. 1052 * 1. If cpos can be found in one refcount record, return the record.
974 * 2. If cpos can't be found, return a fake record which start from cpos 1053 * 2. If cpos can't be found, return a fake record which start from cpos
@@ -983,10 +1062,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
983 struct buffer_head **ret_bh) 1062 struct buffer_head **ret_bh)
984{ 1063{
985 int ret = 0, i, found; 1064 int ret = 0, i, found;
986 u32 low_cpos; 1065 u32 low_cpos, uninitialized_var(cpos_end);
987 struct ocfs2_extent_list *el; 1066 struct ocfs2_extent_list *el;
988 struct ocfs2_extent_rec *tmp, *rec = NULL; 1067 struct ocfs2_extent_rec *rec = NULL;
989 struct ocfs2_extent_block *eb; 1068 struct ocfs2_extent_block *eb = NULL;
990 struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL; 1069 struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
991 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1070 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
992 struct ocfs2_refcount_block *rb = 1071 struct ocfs2_refcount_block *rb =
@@ -1034,12 +1113,16 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
1034 } 1113 }
1035 } 1114 }
1036 1115
1037 /* adjust len when we have ocfs2_extent_rec after it. */ 1116 if (found) {
1038 if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) { 1117 ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh,
1039 tmp = &el->l_recs[i+1]; 1118 eb, el, i, &cpos_end);
1119 if (ret) {
1120 mlog_errno(ret);
1121 goto out;
1122 }
1040 1123
1041 if (le32_to_cpu(tmp->e_cpos) < cpos + len) 1124 if (cpos_end < low_cpos + len)
1042 len = le32_to_cpu(tmp->e_cpos) - cpos; 1125 len = cpos_end - low_cpos;
1043 } 1126 }
1044 1127
1045 ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno), 1128 ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
@@ -1246,7 +1329,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1246 memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize); 1329 memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
1247 1330
1248 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1331 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1249 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); 1332 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1250 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1333 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1251 new_rb->rf_blkno = cpu_to_le64(blkno); 1334 new_rb->rf_blkno = cpu_to_le64(blkno);
1252 new_rb->rf_cpos = cpu_to_le32(0); 1335 new_rb->rf_cpos = cpu_to_le32(0);
@@ -1418,7 +1501,7 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
1418 1501
1419 /* change old and new rl_used accordingly. */ 1502 /* change old and new rl_used accordingly. */
1420 le16_add_cpu(&rl->rl_used, -num_moved); 1503 le16_add_cpu(&rl->rl_used, -num_moved);
1421 new_rl->rl_used = cpu_to_le32(num_moved); 1504 new_rl->rl_used = cpu_to_le16(num_moved);
1422 1505
1423 sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), 1506 sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1424 sizeof(struct ocfs2_refcount_rec), 1507 sizeof(struct ocfs2_refcount_rec),
@@ -1492,7 +1575,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1492 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1575 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1493 memset(new_rb, 0, sb->s_blocksize); 1576 memset(new_rb, 0, sb->s_blocksize);
1494 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 1577 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1495 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); 1578 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1496 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1579 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1497 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 1580 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1498 new_rb->rf_blkno = cpu_to_le64(blkno); 1581 new_rb->rf_blkno = cpu_to_le64(blkno);
@@ -1797,7 +1880,8 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
1797 recs_need++; 1880 recs_need++;
1798 1881
1799 /* If the leaf block doesn't have enough records, expand it. */ 1882
1800 if (le16_to_cpu(rf_list->rl_used) + recs_need > rf_list->rl_count) { 1883 if (le16_to_cpu(rf_list->rl_used) + recs_need >
1884 le16_to_cpu(rf_list->rl_count)) {
1801 struct ocfs2_refcount_rec tmp_rec; 1885 struct ocfs2_refcount_rec tmp_rec;
1802 u64 cpos = le64_to_cpu(orig_rec->r_cpos); 1886 u64 cpos = le64_to_cpu(orig_rec->r_cpos);
1803 len = le32_to_cpu(orig_rec->r_clusters); 1887 len = le32_to_cpu(orig_rec->r_clusters);
@@ -1859,7 +1943,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
1859 memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec)); 1943 memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
1860 le64_add_cpu(&tail_rec->r_cpos, 1944 le64_add_cpu(&tail_rec->r_cpos,
1861 le32_to_cpu(tail_rec->r_clusters) - len); 1945 le32_to_cpu(tail_rec->r_clusters) - len);
1862 tail_rec->r_clusters = le32_to_cpu(len); 1946 tail_rec->r_clusters = cpu_to_le32(len);
1863 } 1947 }
1864 1948
1865 /* 1949 /*
@@ -2431,7 +2515,7 @@ out:
2431 * we are going to touch and whether we need to create new blocks. 2515
2432 * 2516
2433 * Normally the refcount blocks storing these refcounts should be 2517
2434 * continguous also, so that we can get the number easily. 2518 * contiguous also, so that we can get the number easily.
2435 * As for meta_ac, we will at most split 2 refcount records and add 2519
2436 * 2 more refcount blocks, so just check it in a rough way. 2520
2437 * 2521 *
@@ -2860,7 +2944,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2860 2944
2861 while (offset < end) { 2945 while (offset < end) {
2862 page_index = offset >> PAGE_CACHE_SHIFT; 2946 page_index = offset >> PAGE_CACHE_SHIFT;
2863 map_end = (page_index + 1) << PAGE_CACHE_SHIFT; 2947 map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
2864 if (map_end > end) 2948 if (map_end > end)
2865 map_end = end; 2949 map_end = end;
2866 2950
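
The (loff_t) cast above matters on 32-bit kernels, where page-index arithmetic wraps once the byte offset passes 4GB. A standalone demonstration of the truncation (4K pages assumed):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t page_index = 0x100000;  /* page 1M => byte offset 4GB */
            unsigned int shift = 12;         /* assuming PAGE_CACHE_SHIFT == 12 */

            /* 32-bit arithmetic wraps: yields 0x1000 instead of 4GB + 4K */
            uint64_t bad = (uint32_t)((page_index + 1) << shift);
            /* widening before the shift preserves the high bits */
            uint64_t good = ((uint64_t)page_index + 1) << shift;

            printf("truncated: %#llx  correct: %#llx\n",
                   (unsigned long long)bad, (unsigned long long)good);
            return 0;
    }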
@@ -2872,8 +2956,12 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2872 2956
2873 page = grab_cache_page(mapping, page_index); 2957 page = grab_cache_page(mapping, page_index);
2874 2958
2875 /* This page can't be dirtied before we CoW it out. */ 2959 /*
2876 BUG_ON(PageDirty(page)); 2960 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
2961 * can't be dirtied before we CoW it out.
2962 */
2963 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2964 BUG_ON(PageDirty(page));
2877 2965
2878 if (!PageUptodate(page)) { 2966 if (!PageUptodate(page)) {
2879 ret = block_read_full_page(page, ocfs2_get_block); 2967 ret = block_read_full_page(page, ocfs2_get_block);
@@ -3085,7 +3173,7 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
3085 3173
3086 while (offset < end) { 3174 while (offset < end) {
3087 page_index = offset >> PAGE_CACHE_SHIFT; 3175 page_index = offset >> PAGE_CACHE_SHIFT;
3088 map_end = (page_index + 1) << PAGE_CACHE_SHIFT; 3176 map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
3089 if (map_end > end) 3177 if (map_end > end)
3090 map_end = end; 3178 map_end = end;
3091 3179
@@ -3840,8 +3928,7 @@ static int ocfs2_add_refcounted_extent(struct inode *inode,
3840 } 3928 }
3841 3929
3842 ret = ocfs2_insert_extent(handle, et, cpos, 3930 ret = ocfs2_insert_extent(handle, et, cpos,
3843 cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, 3931 ocfs2_clusters_to_blocks(inode->i_sb, p_cluster),
3844 p_cluster)),
3845 num_clusters, ext_flags, meta_ac); 3932 num_clusters, ext_flags, meta_ac);
3846 if (ret) { 3933 if (ret) {
3847 mlog_errno(ret); 3934 mlog_errno(ret);
@@ -3987,6 +4074,7 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
3987 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features; 4074 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
3988 spin_unlock(&OCFS2_I(t_inode)->ip_lock); 4075 spin_unlock(&OCFS2_I(t_inode)->ip_lock);
3989 i_size_write(t_inode, size); 4076 i_size_write(t_inode, size);
4077 t_inode->i_blocks = s_inode->i_blocks;
3990 4078
3991 di->i_xattr_inline_size = s_di->i_xattr_inline_size; 4079 di->i_xattr_inline_size = s_di->i_xattr_inline_size;
3992 di->i_clusters = s_di->i_clusters; 4080 di->i_clusters = s_di->i_clusters;
@@ -4253,8 +4341,8 @@ static int ocfs2_user_path_parent(const char __user *path,
4253 * @new_dentry: target dentry 4341 * @new_dentry: target dentry
4254 * @preserve: if true, preserve all file attributes 4342 * @preserve: if true, preserve all file attributes
4255 */ 4343 */
4256int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, 4344static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
4257 struct dentry *new_dentry, bool preserve) 4345 struct dentry *new_dentry, bool preserve)
4258{ 4346{
4259 struct inode *inode = old_dentry->d_inode; 4347 struct inode *inode = old_dentry->d_inode;
4260 int error; 4348 int error;
@@ -4302,7 +4390,7 @@ int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
4302 } 4390 }
4303 4391
4304 mutex_lock(&inode->i_mutex); 4392 mutex_lock(&inode->i_mutex);
4305 vfs_dq_init(dir); 4393 dquot_initialize(dir);
4306 error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); 4394 error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
4307 mutex_unlock(&inode->i_mutex); 4395 mutex_unlock(&inode->i_mutex);
4308 if (!error) 4396 if (!error)
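
Two of the hunks above (rl_used and tail_rec->r_clusters) fix the same class of endianness bug: a converter of the wrong width, or the wrong direction, on a little-endian disk field. A userspace analogue of the rule, using glibc's <endian.h> (the field names are stand-ins for the __le16/__le32 disk fields):

    #include <endian.h>
    #include <stdint.h>

    static uint16_t rl_used_disk;      /* stand-in for a __le16 field */
    static uint32_t r_clusters_disk;   /* stand-in for a __le32 field */

    static void example_store(uint16_t num_moved, uint32_t len)
    {
            /* width must match the field: htole16 for 16-bit storage */
            rl_used_disk = htole16(num_moved);
            /* direction must match the assignment: storing to disk is
             * host-to-little-endian, not little-endian-to-host */
            r_clusters_disk = htole32(len);
    }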
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index e49c41050264..0d3049f696c5 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/kernel.h> 20#include <linux/kernel.h>
21#include <linux/crc32.h> 21#include <linux/crc32.h>
22#include <linux/slab.h>
22#include <linux/module.h> 23#include <linux/module.h>
23 24
24/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */ 25/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
@@ -161,24 +162,23 @@ static int dlm_status_to_errno(enum dlm_status status)
161 162
162static void o2dlm_lock_ast_wrapper(void *astarg) 163static void o2dlm_lock_ast_wrapper(void *astarg)
163{ 164{
164 BUG_ON(o2cb_stack.sp_proto == NULL); 165 struct ocfs2_dlm_lksb *lksb = astarg;
165 166
166 o2cb_stack.sp_proto->lp_lock_ast(astarg); 167 lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
167} 168}
168 169
169static void o2dlm_blocking_ast_wrapper(void *astarg, int level) 170static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
170{ 171{
171 BUG_ON(o2cb_stack.sp_proto == NULL); 172 struct ocfs2_dlm_lksb *lksb = astarg;
172 173
173 o2cb_stack.sp_proto->lp_blocking_ast(astarg, level); 174 lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
174} 175}
175 176
176static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) 177static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
177{ 178{
179 struct ocfs2_dlm_lksb *lksb = astarg;
178 int error = dlm_status_to_errno(status); 180 int error = dlm_status_to_errno(status);
179 181
180 BUG_ON(o2cb_stack.sp_proto == NULL);
181
182 /* 182 /*
183 * In o2dlm, you can get both the lock_ast() for the lock being 183 * In o2dlm, you can get both the lock_ast() for the lock being
184 * granted and the unlock_ast() for the CANCEL failing. A 184 * granted and the unlock_ast() for the CANCEL failing. A
@@ -193,16 +193,15 @@ static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
193 if (status == DLM_CANCELGRANT) 193 if (status == DLM_CANCELGRANT)
194 return; 194 return;
195 195
196 o2cb_stack.sp_proto->lp_unlock_ast(astarg, error); 196 lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, error);
197} 197}
198 198
199static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, 199static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
200 int mode, 200 int mode,
201 union ocfs2_dlm_lksb *lksb, 201 struct ocfs2_dlm_lksb *lksb,
202 u32 flags, 202 u32 flags,
203 void *name, 203 void *name,
204 unsigned int namelen, 204 unsigned int namelen)
205 void *astarg)
206{ 205{
207 enum dlm_status status; 206 enum dlm_status status;
208 int o2dlm_mode = mode_to_o2dlm(mode); 207 int o2dlm_mode = mode_to_o2dlm(mode);
@@ -211,28 +210,27 @@ static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
211 210
212 status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm, 211 status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
213 o2dlm_flags, name, namelen, 212 o2dlm_flags, name, namelen,
214 o2dlm_lock_ast_wrapper, astarg, 213 o2dlm_lock_ast_wrapper, lksb,
215 o2dlm_blocking_ast_wrapper); 214 o2dlm_blocking_ast_wrapper);
216 ret = dlm_status_to_errno(status); 215 ret = dlm_status_to_errno(status);
217 return ret; 216 return ret;
218} 217}
219 218
220static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn, 219static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
221 union ocfs2_dlm_lksb *lksb, 220 struct ocfs2_dlm_lksb *lksb,
222 u32 flags, 221 u32 flags)
223 void *astarg)
224{ 222{
225 enum dlm_status status; 223 enum dlm_status status;
226 int o2dlm_flags = flags_to_o2dlm(flags); 224 int o2dlm_flags = flags_to_o2dlm(flags);
227 int ret; 225 int ret;
228 226
229 status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm, 227 status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
230 o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg); 228 o2dlm_flags, o2dlm_unlock_ast_wrapper, lksb);
231 ret = dlm_status_to_errno(status); 229 ret = dlm_status_to_errno(status);
232 return ret; 230 return ret;
233} 231}
234 232
235static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 233static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
236{ 234{
237 return dlm_status_to_errno(lksb->lksb_o2dlm.status); 235 return dlm_status_to_errno(lksb->lksb_o2dlm.status);
238} 236}
@@ -242,17 +240,17 @@ static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
242 * contents, it will zero out the LVB. Thus the caller can always trust 240 * contents, it will zero out the LVB. Thus the caller can always trust
243 * the contents. 241 * the contents.
244 */ 242 */
245static int o2cb_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 243static int o2cb_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
246{ 244{
247 return 1; 245 return 1;
248} 246}
249 247
250static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb) 248static void *o2cb_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
251{ 249{
252 return (void *)(lksb->lksb_o2dlm.lvb); 250 return (void *)(lksb->lksb_o2dlm.lvb);
253} 251}
254 252
255static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb) 253static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
256{ 254{
257 dlm_print_one_lock(lksb->lksb_o2dlm.lockid); 255 dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
258} 256}
@@ -277,10 +275,10 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
277 u32 dlm_key; 275 u32 dlm_key;
278 struct dlm_ctxt *dlm; 276 struct dlm_ctxt *dlm;
279 struct o2dlm_private *priv; 277 struct o2dlm_private *priv;
280 struct dlm_protocol_version dlm_version; 278 struct dlm_protocol_version fs_version;
281 279
282 BUG_ON(conn == NULL); 280 BUG_ON(conn == NULL);
283 BUG_ON(o2cb_stack.sp_proto == NULL); 281 BUG_ON(conn->cc_proto == NULL);
284 282
285 /* for now we only have one cluster/node, make sure we see it 283 /* for now we only have one cluster/node, make sure we see it
286 * in the heartbeat universe */ 284 * in the heartbeat universe */
@@ -304,18 +302,18 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
304 /* used by the dlm code to make message headers unique, each 302 /* used by the dlm code to make message headers unique, each
305 * node in this domain must agree on this. */ 303 * node in this domain must agree on this. */
306 dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen); 304 dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen);
307 dlm_version.pv_major = conn->cc_version.pv_major; 305 fs_version.pv_major = conn->cc_version.pv_major;
308 dlm_version.pv_minor = conn->cc_version.pv_minor; 306 fs_version.pv_minor = conn->cc_version.pv_minor;
309 307
310 dlm = dlm_register_domain(conn->cc_name, dlm_key, &dlm_version); 308 dlm = dlm_register_domain(conn->cc_name, dlm_key, &fs_version);
311 if (IS_ERR(dlm)) { 309 if (IS_ERR(dlm)) {
312 rc = PTR_ERR(dlm); 310 rc = PTR_ERR(dlm);
313 mlog_errno(rc); 311 mlog_errno(rc);
314 goto out_free; 312 goto out_free;
315 } 313 }
316 314
317 conn->cc_version.pv_major = dlm_version.pv_major; 315 conn->cc_version.pv_major = fs_version.pv_major;
318 conn->cc_version.pv_minor = dlm_version.pv_minor; 316 conn->cc_version.pv_minor = fs_version.pv_minor;
319 conn->cc_lockspace = dlm; 317 conn->cc_lockspace = dlm;
320 318
321 dlm_register_eviction_cb(dlm, &priv->op_eviction_cb); 319 dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index ff4c798a5635..2dc57bca0688 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,11 +21,11 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/miscdevice.h> 22#include <linux/miscdevice.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h>
24#include <linux/smp_lock.h> 25#include <linux/smp_lock.h>
25#include <linux/reboot.h> 26#include <linux/reboot.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27 28
28#include "ocfs2.h" /* For struct ocfs2_lock_res */
29#include "stackglue.h" 29#include "stackglue.h"
30 30
31#include <linux/dlm_plock.h> 31#include <linux/dlm_plock.h>
@@ -63,8 +63,8 @@
63 * negotiated by the client. The client negotiates based on the maximum 63 * negotiated by the client. The client negotiates based on the maximum
64 * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major 64 * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major
65 * number from the "SETV" message must match 65 * number from the "SETV" message must match
66 * ocfs2_user_plugin.sp_proto->lp_max_version.pv_major, and the minor number 66 * ocfs2_user_plugin.sp_max_proto.pv_major, and the minor number
67 * must be less than or equal to ...->lp_max_version.pv_minor. 67 * must be less than or equal to ...sp_max_proto.pv_minor.
68 * 68 *
69 * Once this information has been set, mounts will be allowed. From this 69 * Once this information has been set, mounts will be allowed. From this
70 * point on, the "DOWN" message can be sent for node down notification. 70 * point on, the "DOWN" message can be sent for node down notification.
@@ -401,7 +401,7 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
401 char *ptr = NULL; 401 char *ptr = NULL;
402 struct ocfs2_control_private *p = file->private_data; 402 struct ocfs2_control_private *p = file->private_data;
403 struct ocfs2_protocol_version *max = 403 struct ocfs2_protocol_version *max =
404 &ocfs2_user_plugin.sp_proto->lp_max_version; 404 &ocfs2_user_plugin.sp_max_proto;
405 405
406 if (ocfs2_control_get_handshake_state(file) != 406 if (ocfs2_control_get_handshake_state(file) !=
407 OCFS2_CONTROL_HANDSHAKE_PROTOCOL) 407 OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
@@ -664,18 +664,10 @@ static void ocfs2_control_exit(void)
664 -rc); 664 -rc);
665} 665}
666 666
667static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg)
668{
669 struct ocfs2_lock_res *res = astarg;
670 return &res->l_lksb.lksb_fsdlm;
671}
672
673static void fsdlm_lock_ast_wrapper(void *astarg) 667static void fsdlm_lock_ast_wrapper(void *astarg)
674{ 668{
675 struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg); 669 struct ocfs2_dlm_lksb *lksb = astarg;
676 int status = lksb->sb_status; 670 int status = lksb->lksb_fsdlm.sb_status;
677
678 BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
679 671
680 /* 672 /*
681 * For now we're punting on the issue of other non-standard errors 673 * For now we're punting on the issue of other non-standard errors
@@ -688,25 +680,24 @@ static void fsdlm_lock_ast_wrapper(void *astarg)
688 */ 680 */
689 681
690 if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL) 682 if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
691 ocfs2_user_plugin.sp_proto->lp_unlock_ast(astarg, 0); 683 lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, 0);
692 else 684 else
693 ocfs2_user_plugin.sp_proto->lp_lock_ast(astarg); 685 lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
694} 686}
695 687
696static void fsdlm_blocking_ast_wrapper(void *astarg, int level) 688static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
697{ 689{
698 BUG_ON(ocfs2_user_plugin.sp_proto == NULL); 690 struct ocfs2_dlm_lksb *lksb = astarg;
699 691
700 ocfs2_user_plugin.sp_proto->lp_blocking_ast(astarg, level); 692 lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
701} 693}
702 694
703static int user_dlm_lock(struct ocfs2_cluster_connection *conn, 695static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
704 int mode, 696 int mode,
705 union ocfs2_dlm_lksb *lksb, 697 struct ocfs2_dlm_lksb *lksb,
706 u32 flags, 698 u32 flags,
707 void *name, 699 void *name,
708 unsigned int namelen, 700 unsigned int namelen)
709 void *astarg)
710{ 701{
711 int ret; 702 int ret;
712 703
@@ -716,36 +707,35 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
716 707
717 ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, 708 ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
718 flags|DLM_LKF_NODLCKWT, name, namelen, 0, 709 flags|DLM_LKF_NODLCKWT, name, namelen, 0,
719 fsdlm_lock_ast_wrapper, astarg, 710 fsdlm_lock_ast_wrapper, lksb,
720 fsdlm_blocking_ast_wrapper); 711 fsdlm_blocking_ast_wrapper);
721 return ret; 712 return ret;
722} 713}
723 714
724static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, 715static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
725 union ocfs2_dlm_lksb *lksb, 716 struct ocfs2_dlm_lksb *lksb,
726 u32 flags, 717 u32 flags)
727 void *astarg)
728{ 718{
729 int ret; 719 int ret;
730 720
731 ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, 721 ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
732 flags, &lksb->lksb_fsdlm, astarg); 722 flags, &lksb->lksb_fsdlm, lksb);
733 return ret; 723 return ret;
734} 724}
735 725
736static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 726static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
737{ 727{
738 return lksb->lksb_fsdlm.sb_status; 728 return lksb->lksb_fsdlm.sb_status;
739} 729}
740 730
741static int user_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 731static int user_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
742{ 732{
743 int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID; 733 int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID;
744 734
745 return !invalid; 735 return !invalid;
746} 736}
747 737
748static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb) 738static void *user_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
749{ 739{
750 if (!lksb->lksb_fsdlm.sb_lvbptr) 740 if (!lksb->lksb_fsdlm.sb_lvbptr)
751 lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + 741 lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
@@ -753,7 +743,7 @@ static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
753 return (void *)(lksb->lksb_fsdlm.sb_lvbptr); 743 return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
754} 744}
755 745
756static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) 746static void user_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
757{ 747{
758} 748}
759 749
@@ -814,7 +804,7 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
814static int user_cluster_connect(struct ocfs2_cluster_connection *conn) 804static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
815{ 805{
816 dlm_lockspace_t *fsdlm; 806 dlm_lockspace_t *fsdlm;
817 struct ocfs2_live_connection *control; 807 struct ocfs2_live_connection *uninitialized_var(control);
818 int rc = 0; 808 int rc = 0;
819 809
820 BUG_ON(conn == NULL); 810 BUG_ON(conn == NULL);
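
The SETV handshake described above enforces the usual major/minor rule: the major number must match exactly, and the requested minor may not exceed the advertised maximum. A sketch of that check (the function name is hypothetical; the struct is the real ocfs2_protocol_version):

    static int example_proto_compatible(struct ocfs2_protocol_version *req,
                                        struct ocfs2_protocol_version *max)
    {
            /* major must match exactly; minor may be lower or equal */
            return req->pv_major == max->pv_major &&
                   req->pv_minor <= max->pv_minor;
    }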
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 3f2f1c45b7b6..39abf89697ed 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -36,7 +36,7 @@
36#define OCFS2_STACK_PLUGIN_USER "user" 36#define OCFS2_STACK_PLUGIN_USER "user"
37#define OCFS2_MAX_HB_CTL_PATH 256 37#define OCFS2_MAX_HB_CTL_PATH 256
38 38
39static struct ocfs2_locking_protocol *lproto; 39static struct ocfs2_protocol_version locking_max_version;
40static DEFINE_SPINLOCK(ocfs2_stack_lock); 40static DEFINE_SPINLOCK(ocfs2_stack_lock);
41static LIST_HEAD(ocfs2_stack_list); 41static LIST_HEAD(ocfs2_stack_list);
42static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; 42static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
@@ -176,7 +176,7 @@ int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
176 spin_lock(&ocfs2_stack_lock); 176 spin_lock(&ocfs2_stack_lock);
177 if (!ocfs2_stack_lookup(plugin->sp_name)) { 177 if (!ocfs2_stack_lookup(plugin->sp_name)) {
178 plugin->sp_count = 0; 178 plugin->sp_count = 0;
179 plugin->sp_proto = lproto; 179 plugin->sp_max_proto = locking_max_version;
180 list_add(&plugin->sp_list, &ocfs2_stack_list); 180 list_add(&plugin->sp_list, &ocfs2_stack_list);
181 printk(KERN_INFO "ocfs2: Registered cluster interface %s\n", 181 printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
182 plugin->sp_name); 182 plugin->sp_name);
@@ -213,77 +213,76 @@ void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin)
213} 213}
214EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister); 214EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister);
215 215
216void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto) 216void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto)
217{ 217{
218 struct ocfs2_stack_plugin *p; 218 struct ocfs2_stack_plugin *p;
219 219
220 BUG_ON(proto == NULL);
221
222 spin_lock(&ocfs2_stack_lock); 220 spin_lock(&ocfs2_stack_lock);
223 BUG_ON(active_stack != NULL); 221 if (memcmp(max_proto, &locking_max_version,
222 sizeof(struct ocfs2_protocol_version))) {
223 BUG_ON(locking_max_version.pv_major != 0);
224 224
225 lproto = proto; 225 locking_max_version = *max_proto;
226 list_for_each_entry(p, &ocfs2_stack_list, sp_list) { 226 list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
227 p->sp_proto = lproto; 227 p->sp_max_proto = locking_max_version;
228 }
228 } 229 }
229
230 spin_unlock(&ocfs2_stack_lock); 230 spin_unlock(&ocfs2_stack_lock);
231} 231}
232EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol); 232EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version);
233 233
234 234
235/* 235/*
236 * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take 236 * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no argument
237 * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the 237 * for the ast and bast functions. They will pass the lksb to the ast
238 * underlying stack plugins need to pilfer the lksb off of the lock_res. 238 * and bast. The caller can wrap the lksb with their own structure to
239 * If some other structure needs to be passed as an astarg, the plugins 239 * get more information.
240 * will need to be given a different avenue to the lksb.
241 */ 240 */
242int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, 241int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
243 int mode, 242 int mode,
244 union ocfs2_dlm_lksb *lksb, 243 struct ocfs2_dlm_lksb *lksb,
245 u32 flags, 244 u32 flags,
246 void *name, 245 void *name,
247 unsigned int namelen, 246 unsigned int namelen)
248 struct ocfs2_lock_res *astarg)
249{ 247{
250 BUG_ON(lproto == NULL); 248 if (!lksb->lksb_conn)
251 249 lksb->lksb_conn = conn;
250 else
251 BUG_ON(lksb->lksb_conn != conn);
252 return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags, 252 return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
253 name, namelen, astarg); 253 name, namelen);
254} 254}
255EXPORT_SYMBOL_GPL(ocfs2_dlm_lock); 255EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
256 256
257int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, 257int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
258 union ocfs2_dlm_lksb *lksb, 258 struct ocfs2_dlm_lksb *lksb,
259 u32 flags, 259 u32 flags)
260 struct ocfs2_lock_res *astarg)
261{ 260{
262 BUG_ON(lproto == NULL); 261 BUG_ON(lksb->lksb_conn == NULL);
263 262
264 return active_stack->sp_ops->dlm_unlock(conn, lksb, flags, astarg); 263 return active_stack->sp_ops->dlm_unlock(conn, lksb, flags);
265} 264}
266EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock); 265EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
267 266
268int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 267int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
269{ 268{
270 return active_stack->sp_ops->lock_status(lksb); 269 return active_stack->sp_ops->lock_status(lksb);
271} 270}
272EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status); 271EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
273 272
274int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 273int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
275{ 274{
276 return active_stack->sp_ops->lvb_valid(lksb); 275 return active_stack->sp_ops->lvb_valid(lksb);
277} 276}
278EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid); 277EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid);
279 278
280void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb) 279void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
281{ 280{
282 return active_stack->sp_ops->lock_lvb(lksb); 281 return active_stack->sp_ops->lock_lvb(lksb);
283} 282}
284EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb); 283EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
285 284
286void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) 285void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
287{ 286{
288 active_stack->sp_ops->dump_lksb(lksb); 287 active_stack->sp_ops->dump_lksb(lksb);
289} 288}
@@ -312,6 +311,7 @@ EXPORT_SYMBOL_GPL(ocfs2_plock);
312int ocfs2_cluster_connect(const char *stack_name, 311int ocfs2_cluster_connect(const char *stack_name,
313 const char *group, 312 const char *group,
314 int grouplen, 313 int grouplen,
314 struct ocfs2_locking_protocol *lproto,
315 void (*recovery_handler)(int node_num, 315 void (*recovery_handler)(int node_num,
316 void *recovery_data), 316 void *recovery_data),
317 void *recovery_data, 317 void *recovery_data,
@@ -329,6 +329,12 @@ int ocfs2_cluster_connect(const char *stack_name,
329 goto out; 329 goto out;
330 } 330 }
331 331
332 if (memcmp(&lproto->lp_max_version, &locking_max_version,
333 sizeof(struct ocfs2_protocol_version))) {
334 rc = -EINVAL;
335 goto out;
336 }
337
332 new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection), 338 new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
333 GFP_KERNEL); 339 GFP_KERNEL);
334 if (!new_conn) { 340 if (!new_conn) {
@@ -341,6 +347,7 @@ int ocfs2_cluster_connect(const char *stack_name,
341 new_conn->cc_recovery_handler = recovery_handler; 347 new_conn->cc_recovery_handler = recovery_handler;
342 new_conn->cc_recovery_data = recovery_data; 348 new_conn->cc_recovery_data = recovery_data;
343 349
350 new_conn->cc_proto = lproto;
344 /* Start the new connection at our maximum compatibility level */ 351 /* Start the new connection at our maximum compatibility level */
345 new_conn->cc_version = lproto->lp_max_version; 352 new_conn->cc_version = lproto->lp_max_version;
346 353
@@ -366,6 +373,24 @@ out:
366} 373}
367EXPORT_SYMBOL_GPL(ocfs2_cluster_connect); 374EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
368 375
376/* The caller will ensure all nodes have the same cluster stack */
377int ocfs2_cluster_connect_agnostic(const char *group,
378 int grouplen,
379 struct ocfs2_locking_protocol *lproto,
380 void (*recovery_handler)(int node_num,
381 void *recovery_data),
382 void *recovery_data,
383 struct ocfs2_cluster_connection **conn)
384{
385 char *stack_name = NULL;
386
387 if (cluster_stack_name[0])
388 stack_name = cluster_stack_name;
389 return ocfs2_cluster_connect(stack_name, group, grouplen, lproto,
390 recovery_handler, recovery_data, conn);
391}
392EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
393
369/* If hangup_pending is 0, the stack driver will be dropped */ 394/* If hangup_pending is 0, the stack driver will be dropped */
370int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, 395int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
371 int hangup_pending) 396 int hangup_pending)
@@ -453,10 +478,10 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
453 ssize_t ret = 0; 478 ssize_t ret = 0;
454 479
455 spin_lock(&ocfs2_stack_lock); 480 spin_lock(&ocfs2_stack_lock);
456 if (lproto) 481 if (locking_max_version.pv_major)
457 ret = snprintf(buf, PAGE_SIZE, "%u.%u\n", 482 ret = snprintf(buf, PAGE_SIZE, "%u.%u\n",
458 lproto->lp_max_version.pv_major, 483 locking_max_version.pv_major,
459 lproto->lp_max_version.pv_minor); 484 locking_max_version.pv_minor);
460 spin_unlock(&ocfs2_stack_lock); 485 spin_unlock(&ocfs2_stack_lock);
461 486
462 return ret; 487 return ret;
@@ -620,51 +645,46 @@ error:
620 645
621static ctl_table ocfs2_nm_table[] = { 646static ctl_table ocfs2_nm_table[] = {
622 { 647 {
623 .ctl_name = 1,
624 .procname = "hb_ctl_path", 648 .procname = "hb_ctl_path",
625 .data = ocfs2_hb_ctl_path, 649 .data = ocfs2_hb_ctl_path,
626 .maxlen = OCFS2_MAX_HB_CTL_PATH, 650 .maxlen = OCFS2_MAX_HB_CTL_PATH,
627 .mode = 0644, 651 .mode = 0644,
628 .proc_handler = &proc_dostring, 652 .proc_handler = proc_dostring,
629 .strategy = &sysctl_string,
630 }, 653 },
631 { .ctl_name = 0 } 654 { }
632}; 655};
633 656
634static ctl_table ocfs2_mod_table[] = { 657static ctl_table ocfs2_mod_table[] = {
635 { 658 {
636 .ctl_name = FS_OCFS2_NM,
637 .procname = "nm", 659 .procname = "nm",
638 .data = NULL, 660 .data = NULL,
639 .maxlen = 0, 661 .maxlen = 0,
640 .mode = 0555, 662 .mode = 0555,
641 .child = ocfs2_nm_table 663 .child = ocfs2_nm_table
642 }, 664 },
643 { .ctl_name = 0} 665 { }
644}; 666};
645 667
646static ctl_table ocfs2_kern_table[] = { 668static ctl_table ocfs2_kern_table[] = {
647 { 669 {
648 .ctl_name = FS_OCFS2,
649 .procname = "ocfs2", 670 .procname = "ocfs2",
650 .data = NULL, 671 .data = NULL,
651 .maxlen = 0, 672 .maxlen = 0,
652 .mode = 0555, 673 .mode = 0555,
653 .child = ocfs2_mod_table 674 .child = ocfs2_mod_table
654 }, 675 },
655 { .ctl_name = 0} 676 { }
656}; 677};
657 678
658static ctl_table ocfs2_root_table[] = { 679static ctl_table ocfs2_root_table[] = {
659 { 680 {
660 .ctl_name = CTL_FS,
661 .procname = "fs", 681 .procname = "fs",
662 .data = NULL, 682 .data = NULL,
663 .maxlen = 0, 683 .maxlen = 0,
664 .mode = 0555, 684 .mode = 0555,
665 .child = ocfs2_kern_table 685 .child = ocfs2_kern_table
666 }, 686 },
667 { .ctl_name = 0 } 687 { }
668}; 688};
669 689
670static struct ctl_table_header *ocfs2_table_header = NULL; 690static struct ctl_table_header *ocfs2_table_header = NULL;
@@ -690,7 +710,10 @@ static int __init ocfs2_stack_glue_init(void)
690 710
691static void __exit ocfs2_stack_glue_exit(void) 711static void __exit ocfs2_stack_glue_exit(void)
692{ 712{
693 lproto = NULL; 713 memset(&locking_max_version, 0,
714 sizeof(struct ocfs2_protocol_version));
715 locking_max_version.pv_major = 0;
716 locking_max_version.pv_minor = 0;
694 ocfs2_sysfs_exit(); 717 ocfs2_sysfs_exit();
695 if (ocfs2_table_header) 718 if (ocfs2_table_header)
696 unregister_sysctl_table(ocfs2_table_header); 719 unregister_sysctl_table(ocfs2_table_header);
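
A hedged sketch of a caller of the new ocfs2_cluster_connect_agnostic() (all example_* names are invented): the protocol's lp_max_version must equal the version previously registered through ocfs2_stack_glue_set_max_proto_version(), or ocfs2_cluster_connect() rejects the connection with -EINVAL via the memcmp() above.

    static struct ocfs2_locking_protocol example_proto = {
            .lp_max_version  = { .pv_major = 1, .pv_minor = 0 },
            .lp_lock_ast     = example_lock_ast,
            .lp_blocking_ast = example_blocking_ast,
            .lp_unlock_ast   = example_unlock_ast,
    };

    static int example_connect(struct ocfs2_cluster_connection **conn)
    {
            /* NULL stack name means "use whatever stack is configured" */
            return ocfs2_cluster_connect_agnostic("examplegrp", 10,
                                                  &example_proto,
                                                  example_recovery, NULL,
                                                  conn);
    }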
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 03a44d60eac9..8ce7398ae1d2 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -56,17 +56,6 @@ struct ocfs2_protocol_version {
56}; 56};
57 57
58/* 58/*
59 * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
60 */
61struct ocfs2_locking_protocol {
62 struct ocfs2_protocol_version lp_max_version;
63 void (*lp_lock_ast)(void *astarg);
64 void (*lp_blocking_ast)(void *astarg, int level);
65 void (*lp_unlock_ast)(void *astarg, int error);
66};
67
68
69/*
70 * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only 59 * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
71 * has a pointer to separately allocated lvb space. This struct exists only to 60 * has a pointer to separately allocated lvb space. This struct exists only to
72 * include in the lksb union to make space for a combined dlm_lksb and lvb. 61 * include in the lksb union to make space for a combined dlm_lksb and lvb.
@@ -81,12 +70,27 @@ struct fsdlm_lksb_plus_lvb {
81 * size of the union is known. Lock status structures are embedded in 70 * size of the union is known. Lock status structures are embedded in
82 * ocfs2 inodes. 71 * ocfs2 inodes.
83 */ 72 */
84union ocfs2_dlm_lksb { 73struct ocfs2_cluster_connection;
85 struct dlm_lockstatus lksb_o2dlm; 74struct ocfs2_dlm_lksb {
86 struct dlm_lksb lksb_fsdlm; 75 union {
87 struct fsdlm_lksb_plus_lvb padding; 76 struct dlm_lockstatus lksb_o2dlm;
77 struct dlm_lksb lksb_fsdlm;
78 struct fsdlm_lksb_plus_lvb padding;
79 };
80 struct ocfs2_cluster_connection *lksb_conn;
81};
82
83/*
84 * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
85 */
86struct ocfs2_locking_protocol {
87 struct ocfs2_protocol_version lp_max_version;
88 void (*lp_lock_ast)(struct ocfs2_dlm_lksb *lksb);
89 void (*lp_blocking_ast)(struct ocfs2_dlm_lksb *lksb, int level);
90 void (*lp_unlock_ast)(struct ocfs2_dlm_lksb *lksb, int error);
88}; 91};
89 92
93
90/* 94/*
91 * A cluster connection. Mostly opaque to ocfs2, the connection holds 95 * A cluster connection. Mostly opaque to ocfs2, the connection holds
92 * state for the underlying stack. ocfs2 does use cc_version to determine 96 * state for the underlying stack. ocfs2 does use cc_version to determine
@@ -96,6 +100,7 @@ struct ocfs2_cluster_connection {
96 char cc_name[GROUP_NAME_MAX]; 100 char cc_name[GROUP_NAME_MAX];
97 int cc_namelen; 101 int cc_namelen;
98 struct ocfs2_protocol_version cc_version; 102 struct ocfs2_protocol_version cc_version;
103 struct ocfs2_locking_protocol *cc_proto;
99 void (*cc_recovery_handler)(int node_num, void *recovery_data); 104 void (*cc_recovery_handler)(int node_num, void *recovery_data);
100 void *cc_recovery_data; 105 void *cc_recovery_data;
101 void *cc_lockspace; 106 void *cc_lockspace;
@@ -155,27 +160,29 @@ struct ocfs2_stack_operations {
155 * 160 *
156 * ast and bast functions are not part of the call because the 161 * ast and bast functions are not part of the call because the
157 * stack will likely want to wrap ast and bast calls before passing 162 * stack will likely want to wrap ast and bast calls before passing
158 * them to stack->sp_proto. 163 * them to stack->sp_proto. There is no astarg. The lksb will
164 * be passed back to the ast and bast functions. The caller can
165 * use this to find their object.
159 */ 166 */
160 int (*dlm_lock)(struct ocfs2_cluster_connection *conn, 167 int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
161 int mode, 168 int mode,
162 union ocfs2_dlm_lksb *lksb, 169 struct ocfs2_dlm_lksb *lksb,
163 u32 flags, 170 u32 flags,
164 void *name, 171 void *name,
165 unsigned int namelen, 172 unsigned int namelen);
166 void *astarg);
167 173
168 /* 174 /*
169 * Call the underlying dlm unlock function. The ->dlm_unlock() 175 * Call the underlying dlm unlock function. The ->dlm_unlock()
170 * function should convert the flags as appropriate. 176 * function should convert the flags as appropriate.
171 * 177 *
172 * The unlock ast is not passed, as the stack will want to wrap 178 * The unlock ast is not passed, as the stack will want to wrap
173 * it before calling stack->sp_proto->lp_unlock_ast(). 179 * it before calling stack->sp_proto->lp_unlock_ast(). There is
180 * no astarg. The lksb will be passed back to the unlock ast
181 * function. The caller can use this to find their object.
174 */ 182 */
175 int (*dlm_unlock)(struct ocfs2_cluster_connection *conn, 183 int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
176 union ocfs2_dlm_lksb *lksb, 184 struct ocfs2_dlm_lksb *lksb,
177 u32 flags, 185 u32 flags);
178 void *astarg);
179 186
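Dropping astarg works because the lksb is embedded in the caller's lock resource, so the handlers can recover their object with container_of(). A minimal sketch, assuming an embedding field named l_lksb (the example_ names are illustrative):

	struct example_lock_res {
		struct ocfs2_dlm_lksb	l_lksb;
		/* ... fs-private state ... */
	};

	static void example_lock_ast(struct ocfs2_dlm_lksb *lksb)
	{
		struct example_lock_res *res =
			container_of(lksb, struct example_lock_res, l_lksb);

		/* wake waiters, update res state, etc. */
	}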
180 /* 187 /*
181 * Return the status of the current lock status block. The fs 188 * Return the status of the current lock status block. The fs
@@ -183,17 +190,17 @@ struct ocfs2_stack_operations {
183 * callback pulls out the stack-specific lksb, converts the status 190 * callback pulls out the stack-specific lksb, converts the status
184 * to a proper errno, and returns it. 191 * to a proper errno, and returns it.
185 */ 192 */
186 int (*lock_status)(union ocfs2_dlm_lksb *lksb); 193 int (*lock_status)(struct ocfs2_dlm_lksb *lksb);
187 194
188 /* 195 /*
189 * Return non-zero if the LVB is valid. 196 * Return non-zero if the LVB is valid.
190 */ 197 */
191 int (*lvb_valid)(union ocfs2_dlm_lksb *lksb); 198 int (*lvb_valid)(struct ocfs2_dlm_lksb *lksb);
192 199
193 /* 200 /*
194 * Pull the lvb pointer off of the stack-specific lksb. 201 * Pull the lvb pointer off of the stack-specific lksb.
195 */ 202 */
196 void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb); 203 void *(*lock_lvb)(struct ocfs2_dlm_lksb *lksb);
197 204
198 /* 205 /*
199 * Cluster-aware posix locks 206 * Cluster-aware posix locks
@@ -210,7 +217,7 @@ struct ocfs2_stack_operations {
210	 * This is an optional debugging hook. If provided, the 217	 * This is an optional debugging hook. If provided, the
211	 * stack can dump debugging information about this lock. 218	 * stack can dump debugging information about this lock.
212 */ 219 */
213 void (*dump_lksb)(union ocfs2_dlm_lksb *lksb); 220 void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb);
214}; 221};
215 222
216/* 223/*
@@ -226,7 +233,7 @@ struct ocfs2_stack_plugin {
226 /* These are managed by the stackglue code. */ 233 /* These are managed by the stackglue code. */
227 struct list_head sp_list; 234 struct list_head sp_list;
228 unsigned int sp_count; 235 unsigned int sp_count;
229 struct ocfs2_locking_protocol *sp_proto; 236 struct ocfs2_protocol_version sp_max_proto;
230}; 237};
231 238
232 239
@@ -234,10 +241,22 @@ struct ocfs2_stack_plugin {
234int ocfs2_cluster_connect(const char *stack_name, 241int ocfs2_cluster_connect(const char *stack_name,
235 const char *group, 242 const char *group,
236 int grouplen, 243 int grouplen,
244 struct ocfs2_locking_protocol *lproto,
237 void (*recovery_handler)(int node_num, 245 void (*recovery_handler)(int node_num,
238 void *recovery_data), 246 void *recovery_data),
239 void *recovery_data, 247 void *recovery_data,
240 struct ocfs2_cluster_connection **conn); 248 struct ocfs2_cluster_connection **conn);
249/*
250 * Used by callers that don't store their stack name. They must ensure
251 * all nodes have the same stack.
252 */
253int ocfs2_cluster_connect_agnostic(const char *group,
254 int grouplen,
255 struct ocfs2_locking_protocol *lproto,
256 void (*recovery_handler)(int node_num,
257 void *recovery_data),
258 void *recovery_data,
259 struct ocfs2_cluster_connection **conn);
241int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, 260int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
242 int hangup_pending); 261 int hangup_pending);
243void ocfs2_cluster_hangup(const char *group, int grouplen); 262void ocfs2_cluster_hangup(const char *group, int grouplen);
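A hedged usage sketch of the new agnostic connect path, for a caller that records no stack name (all example_ identifiers are illustrative):

	static struct ocfs2_locking_protocol example_proto = {
		.lp_max_version	 = { .pv_major = 1, .pv_minor = 0 },
		.lp_lock_ast	 = example_lock_ast,
		.lp_blocking_ast = example_blocking_ast,
		.lp_unlock_ast	 = example_unlock_ast,
	};

	static void example_recovery(int node_num, void *recovery_data)
	{
		/* node node_num died; recover its state */
	}

	static int example_connect(struct ocfs2_cluster_connection **conn)
	{
		/* No stack name is checked: every node must run the same stack. */
		return ocfs2_cluster_connect_agnostic("example_group",
						      strlen("example_group"),
						      &example_proto,
						      example_recovery,
						      NULL, conn);
	}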
@@ -246,26 +265,24 @@ int ocfs2_cluster_this_node(unsigned int *node);
246struct ocfs2_lock_res; 265struct ocfs2_lock_res;
247int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, 266int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
248 int mode, 267 int mode,
249 union ocfs2_dlm_lksb *lksb, 268 struct ocfs2_dlm_lksb *lksb,
250 u32 flags, 269 u32 flags,
251 void *name, 270 void *name,
252 unsigned int namelen, 271 unsigned int namelen);
253 struct ocfs2_lock_res *astarg);
254int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, 272int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
255 union ocfs2_dlm_lksb *lksb, 273 struct ocfs2_dlm_lksb *lksb,
256 u32 flags, 274 u32 flags);
257 struct ocfs2_lock_res *astarg);
258 275
259int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb); 276int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb);
260int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb); 277int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb);
261void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb); 278void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb);
262void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb); 279void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb);
263 280
264int ocfs2_stack_supports_plocks(void); 281int ocfs2_stack_supports_plocks(void);
265int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, 282int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
266 struct file *file, int cmd, struct file_lock *fl); 283 struct file *file, int cmd, struct file_lock *fl);
267 284
268void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto); 285void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto);
269 286
270 287
271/* Used by stack plugins */ 288/* Used by stack plugins */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c30b644d9572..19ba00f28547 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -51,7 +51,7 @@
51#define ALLOC_NEW_GROUP 0x1 51#define ALLOC_NEW_GROUP 0x1
52#define ALLOC_GROUPS_FROM_GLOBAL 0x2 52#define ALLOC_GROUPS_FROM_GLOBAL 0x2
53 53
54#define OCFS2_MAX_INODES_TO_STEAL 1024 54#define OCFS2_MAX_TO_STEAL 1024
55 55
56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
@@ -95,13 +95,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
95 struct buffer_head *group_bh, 95 struct buffer_head *group_bh,
96 unsigned int bit_off, 96 unsigned int bit_off,
97 unsigned int num_bits); 97 unsigned int num_bits);
98static inline int ocfs2_block_group_clear_bits(handle_t *handle,
99 struct inode *alloc_inode,
100 struct ocfs2_group_desc *bg,
101 struct buffer_head *group_bh,
102 unsigned int bit_off,
103 unsigned int num_bits);
104
105static int ocfs2_relink_block_group(handle_t *handle, 98static int ocfs2_relink_block_group(handle_t *handle,
106 struct inode *alloc_inode, 99 struct inode *alloc_inode,
107 struct buffer_head *fe_bh, 100 struct buffer_head *fe_bh,
@@ -152,7 +145,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
152 145
153#define do_error(fmt, ...) \ 146#define do_error(fmt, ...) \
154 do{ \ 147 do{ \
155 if (clean_error) \ 148 if (resize) \
156 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ 149 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
157 else \ 150 else \
158 ocfs2_error(sb, fmt, ##__VA_ARGS__); \ 151 ocfs2_error(sb, fmt, ##__VA_ARGS__); \
@@ -160,7 +153,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
160 153
161static int ocfs2_validate_gd_self(struct super_block *sb, 154static int ocfs2_validate_gd_self(struct super_block *sb,
162 struct buffer_head *bh, 155 struct buffer_head *bh,
163 int clean_error) 156 int resize)
164{ 157{
165 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 158 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
166 159
@@ -211,7 +204,7 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
211static int ocfs2_validate_gd_parent(struct super_block *sb, 204static int ocfs2_validate_gd_parent(struct super_block *sb,
212 struct ocfs2_dinode *di, 205 struct ocfs2_dinode *di,
213 struct buffer_head *bh, 206 struct buffer_head *bh,
214 int clean_error) 207 int resize)
215{ 208{
216 unsigned int max_bits; 209 unsigned int max_bits;
217 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 210 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -233,8 +226,11 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
233 return -EINVAL; 226 return -EINVAL;
234 } 227 }
235 228
236	if (le16_to_cpu(gd->bg_chain) >= 229	/* In resize, we may encounter the case where bg_chain == cl_next_free_rec. */
237 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { 230 if ((le16_to_cpu(gd->bg_chain) >
231 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
232 ((le16_to_cpu(gd->bg_chain) ==
233 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
238 do_error("Group descriptor #%llu has bad chain %u", 234 do_error("Group descriptor #%llu has bad chain %u",
239 (unsigned long long)bh->b_blocknr, 235 (unsigned long long)bh->b_blocknr,
240 le16_to_cpu(gd->bg_chain)); 236 le16_to_cpu(gd->bg_chain));
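Spelled out, the new test rejects (a sketch equivalent to the code above):

	int chain = le16_to_cpu(gd->bg_chain);
	int next_free = le16_to_cpu(di->id2.i_chain.cl_next_free_rec);
	int bad_chain = (chain > next_free) ||
			(chain == next_free && !resize);

Equality is tolerated only during resize, when a freshly added group may sit on the chain slot that is about to become valid.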
@@ -637,12 +633,113 @@ bail:
637 return status; 633 return status;
638} 634}
639 635
636static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
637{
638 spin_lock(&osb->osb_lock);
639 osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
640 spin_unlock(&osb->osb_lock);
641 atomic_set(&osb->s_num_inodes_stolen, 0);
642}
643
644static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
645{
646 spin_lock(&osb->osb_lock);
647 osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
648 spin_unlock(&osb->osb_lock);
649 atomic_set(&osb->s_num_meta_stolen, 0);
650}
651
652void ocfs2_init_steal_slots(struct ocfs2_super *osb)
653{
654 ocfs2_init_inode_steal_slot(osb);
655 ocfs2_init_meta_steal_slot(osb);
656}
657
658static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
659{
660 spin_lock(&osb->osb_lock);
661 if (type == INODE_ALLOC_SYSTEM_INODE)
662 osb->s_inode_steal_slot = slot;
663 else if (type == EXTENT_ALLOC_SYSTEM_INODE)
664 osb->s_meta_steal_slot = slot;
665 spin_unlock(&osb->osb_lock);
666}
667
668static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
669{
670 int slot = OCFS2_INVALID_SLOT;
671
672 spin_lock(&osb->osb_lock);
673 if (type == INODE_ALLOC_SYSTEM_INODE)
674 slot = osb->s_inode_steal_slot;
675 else if (type == EXTENT_ALLOC_SYSTEM_INODE)
676 slot = osb->s_meta_steal_slot;
677 spin_unlock(&osb->osb_lock);
678
679 return slot;
680}
681
682static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
683{
684 return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
685}
686
687static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
688{
689 return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
690}
691
692static int ocfs2_steal_resource(struct ocfs2_super *osb,
693 struct ocfs2_alloc_context *ac,
694 int type)
695{
696 int i, status = -ENOSPC;
697 int slot = __ocfs2_get_steal_slot(osb, type);
698
699	/* Start stealing resources from the first slot after ours. */
700 if (slot == OCFS2_INVALID_SLOT)
701 slot = osb->slot_num + 1;
702
703 for (i = 0; i < osb->max_slots; i++, slot++) {
704 if (slot == osb->max_slots)
705 slot = 0;
706
707 if (slot == osb->slot_num)
708 continue;
709
710 status = ocfs2_reserve_suballoc_bits(osb, ac,
711 type,
712 (u32)slot, NULL,
713 NOT_ALLOC_NEW_GROUP);
714 if (status >= 0) {
715 __ocfs2_set_steal_slot(osb, slot, type);
716 break;
717 }
718
719 ocfs2_free_ac_resource(ac);
720 }
721
722 return status;
723}
724
725static int ocfs2_steal_inode(struct ocfs2_super *osb,
726 struct ocfs2_alloc_context *ac)
727{
728 return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
729}
730
731static int ocfs2_steal_meta(struct ocfs2_super *osb,
732 struct ocfs2_alloc_context *ac)
733{
734 return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
735}
736
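The helpers above generalize the old inode-only stealing to any suballocator type. A hedged sketch of the per-superblock state they depend on (the real fields live in struct ocfs2_super, modified in ocfs2.h and not shown in this diff):

	/* illustrative only -- see struct ocfs2_super in ocfs2.h */
	spinlock_t	osb_lock;		/* guards the slot fields */
	int		s_inode_steal_slot;	/* slot we last stole inodes from */
	int		s_meta_steal_slot;	/* slot we last stole metadata from */
	atomic_t	s_num_inodes_stolen;	/* reset when we retry our own slot */
	atomic_t	s_num_meta_stolen;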
640int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, 737int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
641 int blocks, 738 int blocks,
642 struct ocfs2_alloc_context **ac) 739 struct ocfs2_alloc_context **ac)
643{ 740{
644 int status; 741 int status;
645 u32 slot; 742 int slot = ocfs2_get_meta_steal_slot(osb);
646 743
647 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 744 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
648 if (!(*ac)) { 745 if (!(*ac)) {
@@ -653,12 +750,34 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
653 750
654 (*ac)->ac_bits_wanted = blocks; 751 (*ac)->ac_bits_wanted = blocks;
655 (*ac)->ac_which = OCFS2_AC_USE_META; 752 (*ac)->ac_which = OCFS2_AC_USE_META;
656 slot = osb->slot_num;
657 (*ac)->ac_group_search = ocfs2_block_group_search; 753 (*ac)->ac_group_search = ocfs2_block_group_search;
658 754
755 if (slot != OCFS2_INVALID_SLOT &&
756 atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
757 goto extent_steal;
758
759 atomic_set(&osb->s_num_meta_stolen, 0);
659 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 760 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
660 EXTENT_ALLOC_SYSTEM_INODE, 761 EXTENT_ALLOC_SYSTEM_INODE,
661 slot, NULL, ALLOC_NEW_GROUP); 762 (u32)osb->slot_num, NULL,
763 ALLOC_NEW_GROUP);
764
765
766 if (status >= 0) {
767 status = 0;
768 if (slot != OCFS2_INVALID_SLOT)
769 ocfs2_init_meta_steal_slot(osb);
770 goto bail;
771 } else if (status < 0 && status != -ENOSPC) {
772 mlog_errno(status);
773 goto bail;
774 }
775
776 ocfs2_free_ac_resource(*ac);
777
778extent_steal:
779 status = ocfs2_steal_meta(osb, *ac);
780 atomic_inc(&osb->s_num_meta_stolen);
662 if (status < 0) { 781 if (status < 0) {
663 if (status != -ENOSPC) 782 if (status != -ENOSPC)
664 mlog_errno(status); 783 mlog_errno(status);
@@ -685,43 +804,11 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
685 ac); 804 ac);
686} 805}
687 806
688static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
689 struct ocfs2_alloc_context *ac)
690{
691 int i, status = -ENOSPC;
692 s16 slot = ocfs2_get_inode_steal_slot(osb);
693
694 /* Start to steal inodes from the first slot after ours. */
695 if (slot == OCFS2_INVALID_SLOT)
696 slot = osb->slot_num + 1;
697
698 for (i = 0; i < osb->max_slots; i++, slot++) {
699 if (slot == osb->max_slots)
700 slot = 0;
701
702 if (slot == osb->slot_num)
703 continue;
704
705 status = ocfs2_reserve_suballoc_bits(osb, ac,
706 INODE_ALLOC_SYSTEM_INODE,
707 slot, NULL,
708 NOT_ALLOC_NEW_GROUP);
709 if (status >= 0) {
710 ocfs2_set_inode_steal_slot(osb, slot);
711 break;
712 }
713
714 ocfs2_free_ac_resource(ac);
715 }
716
717 return status;
718}
719
720int ocfs2_reserve_new_inode(struct ocfs2_super *osb, 807int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
721 struct ocfs2_alloc_context **ac) 808 struct ocfs2_alloc_context **ac)
722{ 809{
723 int status; 810 int status;
724 s16 slot = ocfs2_get_inode_steal_slot(osb); 811 int slot = ocfs2_get_inode_steal_slot(osb);
725 u64 alloc_group; 812 u64 alloc_group;
726 813
727 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 814 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
@@ -754,14 +841,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
754 * need to check our slots to see whether there is some space for us. 841 * need to check our slots to see whether there is some space for us.
755 */ 842 */
756 if (slot != OCFS2_INVALID_SLOT && 843 if (slot != OCFS2_INVALID_SLOT &&
757 atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL) 844 atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
758 goto inode_steal; 845 goto inode_steal;
759 846
760 atomic_set(&osb->s_num_inodes_stolen, 0); 847 atomic_set(&osb->s_num_inodes_stolen, 0);
761 alloc_group = osb->osb_inode_alloc_group; 848 alloc_group = osb->osb_inode_alloc_group;
762 status = ocfs2_reserve_suballoc_bits(osb, *ac, 849 status = ocfs2_reserve_suballoc_bits(osb, *ac,
763 INODE_ALLOC_SYSTEM_INODE, 850 INODE_ALLOC_SYSTEM_INODE,
764 osb->slot_num, 851 (u32)osb->slot_num,
765 &alloc_group, 852 &alloc_group,
766 ALLOC_NEW_GROUP | 853 ALLOC_NEW_GROUP |
767 ALLOC_GROUPS_FROM_GLOBAL); 854 ALLOC_GROUPS_FROM_GLOBAL);
@@ -789,7 +876,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
789 ocfs2_free_ac_resource(*ac); 876 ocfs2_free_ac_resource(*ac);
790 877
791inode_steal: 878inode_steal:
792 status = ocfs2_steal_inode_from_other_nodes(osb, *ac); 879 status = ocfs2_steal_inode(osb, *ac);
793 atomic_inc(&osb->s_num_inodes_stolen); 880 atomic_inc(&osb->s_num_inodes_stolen);
794 if (status < 0) { 881 if (status < 0) {
795 if (status != -ENOSPC) 882 if (status != -ENOSPC)
@@ -1884,18 +1971,18 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
1884 bits_wanted, cluster_start, num_clusters); 1971 bits_wanted, cluster_start, num_clusters);
1885} 1972}
1886 1973
1887static inline int ocfs2_block_group_clear_bits(handle_t *handle, 1974static int ocfs2_block_group_clear_bits(handle_t *handle,
1888 struct inode *alloc_inode, 1975 struct inode *alloc_inode,
1889 struct ocfs2_group_desc *bg, 1976 struct ocfs2_group_desc *bg,
1890 struct buffer_head *group_bh, 1977 struct buffer_head *group_bh,
1891 unsigned int bit_off, 1978 unsigned int bit_off,
1892 unsigned int num_bits) 1979 unsigned int num_bits,
1980 void (*undo_fn)(unsigned int bit,
1981 unsigned long *bmap))
1893{ 1982{
1894 int status; 1983 int status;
1895 unsigned int tmp; 1984 unsigned int tmp;
1896 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1897 struct ocfs2_group_desc *undo_bg = NULL; 1985 struct ocfs2_group_desc *undo_bg = NULL;
1898 int cluster_bitmap = 0;
1899 1986
1900 mlog_entry_void(); 1987 mlog_entry_void();
1901 1988
@@ -1905,20 +1992,18 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1905 1992
1906 mlog(0, "off = %u, num = %u\n", bit_off, num_bits); 1993 mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1907 1994
1908 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1995 BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
1909 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1910
1911 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1996 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1912 group_bh, journal_type); 1997 group_bh,
1998 undo_fn ?
1999 OCFS2_JOURNAL_ACCESS_UNDO :
2000 OCFS2_JOURNAL_ACCESS_WRITE);
1913 if (status < 0) { 2001 if (status < 0) {
1914 mlog_errno(status); 2002 mlog_errno(status);
1915 goto bail; 2003 goto bail;
1916 } 2004 }
1917 2005
1918 if (ocfs2_is_cluster_bitmap(alloc_inode)) 2006 if (undo_fn) {
1919 cluster_bitmap = 1;
1920
1921 if (cluster_bitmap) {
1922 jbd_lock_bh_state(group_bh); 2007 jbd_lock_bh_state(group_bh);
1923 undo_bg = (struct ocfs2_group_desc *) 2008 undo_bg = (struct ocfs2_group_desc *)
1924 bh2jh(group_bh)->b_committed_data; 2009 bh2jh(group_bh)->b_committed_data;
@@ -1929,13 +2014,13 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1929 while(tmp--) { 2014 while(tmp--) {
1930 ocfs2_clear_bit((bit_off + tmp), 2015 ocfs2_clear_bit((bit_off + tmp),
1931 (unsigned long *) bg->bg_bitmap); 2016 (unsigned long *) bg->bg_bitmap);
1932 if (cluster_bitmap) 2017 if (undo_fn)
1933 ocfs2_set_bit(bit_off + tmp, 2018 undo_fn(bit_off + tmp,
1934 (unsigned long *) undo_bg->bg_bitmap); 2019 (unsigned long *) undo_bg->bg_bitmap);
1935 } 2020 }
1936 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2021 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1937 2022
1938 if (cluster_bitmap) 2023 if (undo_fn)
1939 jbd_unlock_bh_state(group_bh); 2024 jbd_unlock_bh_state(group_bh);
1940 2025
1941 status = ocfs2_journal_dirty(handle, group_bh); 2026 status = ocfs2_journal_dirty(handle, group_bh);
@@ -1948,12 +2033,14 @@ bail:
1948/* 2033/*
1949 * expects the suballoc inode to already be locked. 2034 * expects the suballoc inode to already be locked.
1950 */ 2035 */
1951int ocfs2_free_suballoc_bits(handle_t *handle, 2036static int _ocfs2_free_suballoc_bits(handle_t *handle,
1952 struct inode *alloc_inode, 2037 struct inode *alloc_inode,
1953 struct buffer_head *alloc_bh, 2038 struct buffer_head *alloc_bh,
1954 unsigned int start_bit, 2039 unsigned int start_bit,
1955 u64 bg_blkno, 2040 u64 bg_blkno,
1956 unsigned int count) 2041 unsigned int count,
2042 void (*undo_fn)(unsigned int bit,
2043 unsigned long *bitmap))
1957{ 2044{
1958 int status = 0; 2045 int status = 0;
1959 u32 tmp_used; 2046 u32 tmp_used;
@@ -1988,7 +2075,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
1988 2075
1989 status = ocfs2_block_group_clear_bits(handle, alloc_inode, 2076 status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1990 group, group_bh, 2077 group, group_bh,
1991 start_bit, count); 2078 start_bit, count, undo_fn);
1992 if (status < 0) { 2079 if (status < 0) {
1993 mlog_errno(status); 2080 mlog_errno(status);
1994 goto bail; 2081 goto bail;
@@ -2019,6 +2106,17 @@ bail:
2019 return status; 2106 return status;
2020} 2107}
2021 2108
2109int ocfs2_free_suballoc_bits(handle_t *handle,
2110 struct inode *alloc_inode,
2111 struct buffer_head *alloc_bh,
2112 unsigned int start_bit,
2113 u64 bg_blkno,
2114 unsigned int count)
2115{
2116 return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2117 start_bit, bg_blkno, count, NULL);
2118}
2119
2022int ocfs2_free_dinode(handle_t *handle, 2120int ocfs2_free_dinode(handle_t *handle,
2023 struct inode *inode_alloc_inode, 2121 struct inode *inode_alloc_inode,
2024 struct buffer_head *inode_alloc_bh, 2122 struct buffer_head *inode_alloc_bh,
@@ -2032,11 +2130,13 @@ int ocfs2_free_dinode(handle_t *handle,
2032 inode_alloc_bh, bit, bg_blkno, 1); 2130 inode_alloc_bh, bit, bg_blkno, 1);
2033} 2131}
2034 2132
2035int ocfs2_free_clusters(handle_t *handle, 2133static int _ocfs2_free_clusters(handle_t *handle,
2036 struct inode *bitmap_inode, 2134 struct inode *bitmap_inode,
2037 struct buffer_head *bitmap_bh, 2135 struct buffer_head *bitmap_bh,
2038 u64 start_blk, 2136 u64 start_blk,
2039 unsigned int num_clusters) 2137 unsigned int num_clusters,
2138 void (*undo_fn)(unsigned int bit,
2139 unsigned long *bitmap))
2040{ 2140{
2041 int status; 2141 int status;
2042 u16 bg_start_bit; 2142 u16 bg_start_bit;
@@ -2063,9 +2163,9 @@ int ocfs2_free_clusters(handle_t *handle,
2063 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n", 2163 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2064 (unsigned long long)bg_blkno, bg_start_bit); 2164 (unsigned long long)bg_blkno, bg_start_bit);
2065 2165
2066 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, 2166 status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2067 bg_start_bit, bg_blkno, 2167 bg_start_bit, bg_blkno,
2068 num_clusters); 2168 num_clusters, undo_fn);
2069 if (status < 0) { 2169 if (status < 0) {
2070 mlog_errno(status); 2170 mlog_errno(status);
2071 goto out; 2171 goto out;
@@ -2079,6 +2179,32 @@ out:
2079 return status; 2179 return status;
2080} 2180}
2081 2181
2182int ocfs2_free_clusters(handle_t *handle,
2183 struct inode *bitmap_inode,
2184 struct buffer_head *bitmap_bh,
2185 u64 start_blk,
2186 unsigned int num_clusters)
2187{
2188 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2189 start_blk, num_clusters,
2190 _ocfs2_set_bit);
2191}
2192
2193/*
2194 * Give never-used clusters back to the global bitmap. We don't need
2195 * to protect these bits in the undo buffer.
2196 */
2197int ocfs2_release_clusters(handle_t *handle,
2198 struct inode *bitmap_inode,
2199 struct buffer_head *bitmap_bh,
2200 u64 start_blk,
2201 unsigned int num_clusters)
2202{
2203 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2204 start_blk, num_clusters,
2205 _ocfs2_clear_bit);
2206}
2207
2082static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) 2208static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2083{ 2209{
2084 printk("Block Group:\n"); 2210 printk("Block Group:\n");
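The undo_fn callback controls what happens to the journal's committed copy of the bitmap. Clusters freed from live files must stay allocated in that copy until the transaction commits, so a crash cannot hand them out while committed metadata still references them; never-used clusters carry no such risk. A hedged sketch of the two callbacks, assumed to wrap the existing ocfs2_set_bit()/ocfs2_clear_bit() helpers in the undo_fn signature:

	static void _ocfs2_set_bit(unsigned int bit, unsigned long *bmap)
	{
		ocfs2_set_bit(bit, bmap);	/* keep allocated in the undo copy */
	}

	static void _ocfs2_clear_bit(unsigned int bit, unsigned long *bmap)
	{
		ocfs2_clear_bit(bit, bmap);	/* never used: no undo protection */
	}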
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 8c9a78a43164..e0f46df357e6 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -56,6 +56,7 @@ struct ocfs2_alloc_context {
56 is the same as ~0 - unlimited */ 56 is the same as ~0 - unlimited */
57}; 57};
58 58
59void ocfs2_init_steal_slots(struct ocfs2_super *osb);
59void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); 60void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
60static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac) 61static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
61{ 62{
@@ -126,6 +127,11 @@ int ocfs2_free_clusters(handle_t *handle,
126 struct buffer_head *bitmap_bh, 127 struct buffer_head *bitmap_bh,
127 u64 start_blk, 128 u64 start_blk,
128 unsigned int num_clusters); 129 unsigned int num_clusters);
130int ocfs2_release_clusters(handle_t *handle,
131 struct inode *bitmap_inode,
132 struct buffer_head *bitmap_bh,
133 u64 start_blk,
134 unsigned int num_clusters);
129 135
130static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) 136static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
131{ 137{
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 14f47d2bfe02..dee03197a494 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -69,6 +69,7 @@
69#include "xattr.h" 69#include "xattr.h"
70#include "quota.h" 70#include "quota.h"
71#include "refcounttree.h" 71#include "refcounttree.h"
72#include "suballoc.h"
72 73
73#include "buffer_head_io.h" 74#include "buffer_head_io.h"
74 75
@@ -100,6 +101,8 @@ struct mount_options
100static int ocfs2_parse_options(struct super_block *sb, char *options, 101static int ocfs2_parse_options(struct super_block *sb, char *options,
101 struct mount_options *mopt, 102 struct mount_options *mopt,
102 int is_remount); 103 int is_remount);
104static int ocfs2_check_set_options(struct super_block *sb,
105 struct mount_options *options);
103static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt); 106static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt);
104static void ocfs2_put_super(struct super_block *sb); 107static void ocfs2_put_super(struct super_block *sb);
105static int ocfs2_mount_volume(struct super_block *sb); 108static int ocfs2_mount_volume(struct super_block *sb);
@@ -299,9 +302,12 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
299 302
300 spin_lock(&osb->osb_lock); 303 spin_lock(&osb->osb_lock);
301 out += snprintf(buf + out, len - out, 304 out += snprintf(buf + out, len - out,
302 "%10s => Slot: %d NumStolen: %d\n", "Steal", 305 "%10s => InodeSlot: %d StolenInodes: %d, "
306 "MetaSlot: %d StolenMeta: %d\n", "Steal",
303 osb->s_inode_steal_slot, 307 osb->s_inode_steal_slot,
304 atomic_read(&osb->s_num_inodes_stolen)); 308 atomic_read(&osb->s_num_inodes_stolen),
309 osb->s_meta_steal_slot,
310 atomic_read(&osb->s_num_meta_stolen));
305 spin_unlock(&osb->osb_lock); 311 spin_unlock(&osb->osb_lock);
306 312
307 out += snprintf(buf + out, len - out, "OrphanScan => "); 313 out += snprintf(buf + out, len - out, "OrphanScan => ");
@@ -600,7 +606,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
600 606
601 lock_kernel(); 607 lock_kernel();
602 608
603 if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { 609 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
610 !ocfs2_check_set_options(sb, &parsed_options)) {
604 ret = -EINVAL; 611 ret = -EINVAL;
605 goto out; 612 goto out;
606 } 613 }
@@ -691,8 +698,6 @@ unlock_osb:
691 if (!ret) { 698 if (!ret) {
692 /* Only save off the new mount options in case of a successful 699 /* Only save off the new mount options in case of a successful
693 * remount. */ 700 * remount. */
694 if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
695 parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
696 osb->s_mount_opt = parsed_options.mount_opt; 701 osb->s_mount_opt = parsed_options.mount_opt;
697 osb->s_atime_quantum = parsed_options.atime_quantum; 702 osb->s_atime_quantum = parsed_options.atime_quantum;
698 osb->preferred_slot = parsed_options.slot; 703 osb->preferred_slot = parsed_options.slot;
@@ -701,6 +706,10 @@ unlock_osb:
701 706
702 if (!ocfs2_is_hard_readonly(osb)) 707 if (!ocfs2_is_hard_readonly(osb))
703 ocfs2_set_journal_params(osb); 708 ocfs2_set_journal_params(osb);
709
710 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
711 ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ?
712 MS_POSIXACL : 0);
704 } 713 }
705out: 714out:
706 unlock_kernel(); 715 unlock_kernel();
@@ -1011,31 +1020,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1011 brelse(bh); 1020 brelse(bh);
1012 bh = NULL; 1021 bh = NULL;
1013 1022
1014 if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)) 1023 if (!ocfs2_check_set_options(sb, &parsed_options)) {
1015 parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; 1024 status = -EINVAL;
1016 1025 goto read_super_error;
1026 }
1017 osb->s_mount_opt = parsed_options.mount_opt; 1027 osb->s_mount_opt = parsed_options.mount_opt;
1018 osb->s_atime_quantum = parsed_options.atime_quantum; 1028 osb->s_atime_quantum = parsed_options.atime_quantum;
1019 osb->preferred_slot = parsed_options.slot; 1029 osb->preferred_slot = parsed_options.slot;
1020 osb->osb_commit_interval = parsed_options.commit_interval; 1030 osb->osb_commit_interval = parsed_options.commit_interval;
1021 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); 1031 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
1022 osb->local_alloc_bits = osb->local_alloc_default_bits; 1032 osb->local_alloc_bits = osb->local_alloc_default_bits;
1023 if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA &&
1024 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1025 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1026 status = -EINVAL;
1027 mlog(ML_ERROR, "User quotas were requested, but this "
1028 "filesystem does not have the feature enabled.\n");
1029 goto read_super_error;
1030 }
1031 if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA &&
1032 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1033 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1034 status = -EINVAL;
1035 mlog(ML_ERROR, "Group quotas were requested, but this "
1036 "filesystem does not have the feature enabled.\n");
1037 goto read_super_error;
1038 }
1039 1033
1040 status = ocfs2_verify_userspace_stack(osb, &parsed_options); 1034 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
1041 if (status) 1035 if (status)
@@ -1072,7 +1066,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1072 "file system, but write access is " 1066 "file system, but write access is "
1073 "unavailable.\n"); 1067 "unavailable.\n");
1074 else 1068 else
1075 mlog_errno(status); 1069 mlog_errno(status);
1076 goto read_super_error; 1070 goto read_super_error;
1077 } 1071 }
1078 1072
@@ -1245,6 +1239,40 @@ static struct file_system_type ocfs2_fs_type = {
1245 .next = NULL 1239 .next = NULL
1246}; 1240};
1247 1241
1242static int ocfs2_check_set_options(struct super_block *sb,
1243 struct mount_options *options)
1244{
1245 if (options->mount_opt & OCFS2_MOUNT_USRQUOTA &&
1246 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1247 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1248 mlog(ML_ERROR, "User quotas were requested, but this "
1249 "filesystem does not have the feature enabled.\n");
1250 return 0;
1251 }
1252 if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA &&
1253 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1254 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1255 mlog(ML_ERROR, "Group quotas were requested, but this "
1256 "filesystem does not have the feature enabled.\n");
1257 return 0;
1258 }
1259 if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL &&
1260 !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) {
1261 mlog(ML_ERROR, "ACL support requested but extended attributes "
1262 "feature is not enabled\n");
1263 return 0;
1264 }
1265 /* No ACL setting specified? Use XATTR feature... */
1266 if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL |
1267 OCFS2_MOUNT_NO_POSIX_ACL))) {
1268 if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR))
1269 options->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1270 else
1271 options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
1272 }
1273 return 1;
1274}
1275
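The net effect of ocfs2_check_set_options() on ACLs, as a summary table derived from the code above:

	/*
	 * mount option   INCOMPAT_XATTR    result
	 * -o acl         set               POSIX_ACL set, MS_POSIXACL on
	 * -o acl         clear             mount/remount fails (-EINVAL)
	 * -o noacl       set or clear      NO_POSIX_ACL set, MS_POSIXACL off
	 * (neither)      set               defaults to acl
	 * (neither)      clear             defaults to noacl
	 */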
1248static int ocfs2_parse_options(struct super_block *sb, 1276static int ocfs2_parse_options(struct super_block *sb,
1249 char *options, 1277 char *options,
1250 struct mount_options *mopt, 1278 struct mount_options *mopt,
@@ -1392,40 +1420,19 @@ static int ocfs2_parse_options(struct super_block *sb,
1392 mopt->mount_opt |= OCFS2_MOUNT_INODE64; 1420 mopt->mount_opt |= OCFS2_MOUNT_INODE64;
1393 break; 1421 break;
1394 case Opt_usrquota: 1422 case Opt_usrquota:
1395 /* We check only on remount, otherwise features
1396 * aren't yet initialized. */
1397 if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1398 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1399 mlog(ML_ERROR, "User quota requested but "
1400 "filesystem feature is not set\n");
1401 status = 0;
1402 goto bail;
1403 }
1404 mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; 1423 mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
1405 break; 1424 break;
1406 case Opt_grpquota: 1425 case Opt_grpquota:
1407 if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1408 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1409 mlog(ML_ERROR, "Group quota requested but "
1410 "filesystem feature is not set\n");
1411 status = 0;
1412 goto bail;
1413 }
1414 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; 1426 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
1415 break; 1427 break;
1416#ifdef CONFIG_OCFS2_FS_POSIX_ACL
1417 case Opt_acl: 1428 case Opt_acl:
1418 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; 1429 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1430 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
1419 break; 1431 break;
1420 case Opt_noacl: 1432 case Opt_noacl:
1433 mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
1421 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; 1434 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
1422 break; 1435 break;
1423#else
1424 case Opt_acl:
1425 case Opt_noacl:
1426 printk(KERN_INFO "ocfs2 (no)acl options not supported\n");
1427 break;
1428#endif
1429 default: 1436 default:
1430 mlog(ML_ERROR, 1437 mlog(ML_ERROR,
1431 "Unrecognized mount option \"%s\" " 1438 "Unrecognized mount option \"%s\" "
@@ -1502,12 +1509,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1502 if (opts & OCFS2_MOUNT_INODE64) 1509 if (opts & OCFS2_MOUNT_INODE64)
1503 seq_printf(s, ",inode64"); 1510 seq_printf(s, ",inode64");
1504 1511
1505#ifdef CONFIG_OCFS2_FS_POSIX_ACL
1506 if (opts & OCFS2_MOUNT_POSIX_ACL) 1512 if (opts & OCFS2_MOUNT_POSIX_ACL)
1507 seq_printf(s, ",acl"); 1513 seq_printf(s, ",acl");
1508 else 1514 else
1509 seq_printf(s, ",noacl"); 1515 seq_printf(s, ",noacl");
1510#endif
1511 1516
1512 return 0; 1517 return 0;
1513} 1518}
@@ -1996,7 +2001,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1996 osb->blocked_lock_count = 0; 2001 osb->blocked_lock_count = 0;
1997 spin_lock_init(&osb->osb_lock); 2002 spin_lock_init(&osb->osb_lock);
1998 spin_lock_init(&osb->osb_xattr_lock); 2003 spin_lock_init(&osb->osb_xattr_lock);
1999 ocfs2_init_inode_steal_slot(osb); 2004 ocfs2_init_steal_slots(osb);
2000 2005
2001 atomic_set(&osb->alloc_stats.moves, 0); 2006 atomic_set(&osb->alloc_stats.moves, 0);
2002 atomic_set(&osb->alloc_stats.local_data, 0); 2007 atomic_set(&osb->alloc_stats.local_data, 0);
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index e3421030a69f..32499d213fc4 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -137,20 +137,20 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry,
137 } 137 }
138 138
139 memcpy(link, target, len); 139 memcpy(link, target, len);
140 nd_set_link(nd, link);
141 140
142bail: 141bail:
142 nd_set_link(nd, status ? ERR_PTR(status) : link);
143 brelse(bh); 143 brelse(bh);
144 144
145 mlog_exit(status); 145 mlog_exit(status);
146 return status ? ERR_PTR(status) : link; 146 return NULL;
147} 147}
148 148
149static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) 149static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
150{ 150{
151 char *link = cookie; 151 char *link = nd_get_link(nd);
152 152 if (!IS_ERR(link))
153 kfree(link); 153 kfree(link);
154} 154}
155 155
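The fix keeps follow_link and put_link symmetric: whatever ocfs2_fast_follow_link() stores through nd_set_link(), including an ERR_PTR() on failure, is exactly what ocfs2_fast_put_link() reads back and must test before freeing. A minimal sketch of that contract (example_ names are illustrative):

	static void *example_follow_link(struct dentry *dentry,
					 struct nameidata *nd)
	{
		char *link = kzalloc(PATH_MAX, GFP_NOFS);

		if (!link)
			link = ERR_PTR(-ENOMEM);
		nd_set_link(nd, link);
		return NULL;	/* no cookie; put_link uses nd_get_link() */
	}

	static void example_put_link(struct dentry *dentry,
				     struct nameidata *nd, void *cookie)
	{
		char *link = nd_get_link(nd);

		if (!IS_ERR(link))
			kfree(link);
	}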
156const struct inode_operations ocfs2_symlink_inode_operations = { 156const struct inode_operations ocfs2_symlink_inode_operations = {
@@ -163,6 +163,7 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
163 .getxattr = generic_getxattr, 163 .getxattr = generic_getxattr,
164 .listxattr = ocfs2_listxattr, 164 .listxattr = ocfs2_listxattr,
165 .removexattr = generic_removexattr, 165 .removexattr = generic_removexattr,
166 .fiemap = ocfs2_fiemap,
166}; 167};
167const struct inode_operations ocfs2_fast_symlink_inode_operations = { 168const struct inode_operations ocfs2_fast_symlink_inode_operations = {
168 .readlink = ocfs2_readlink, 169 .readlink = ocfs2_readlink,
@@ -174,4 +175,5 @@ const struct inode_operations ocfs2_fast_symlink_inode_operations = {
174 .getxattr = generic_getxattr, 175 .getxattr = generic_getxattr,
175 .listxattr = ocfs2_listxattr, 176 .listxattr = ocfs2_listxattr,
176 .removexattr = generic_removexattr, 177 .removexattr = generic_removexattr,
178 .fiemap = ocfs2_fiemap,
177}; 179};
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 40e53702948c..bfe7190cdbf1 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30 29
31#define MLOG_MASK_PREFIX ML_INODE 30#define MLOG_MASK_PREFIX ML_INODE
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index c61369342a27..a0a120e82b97 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -267,8 +267,8 @@ static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
267} 267}
268 268
269/* Warning: even if it returns true, this does *not* guarantee that 269/* Warning: even if it returns true, this does *not* guarantee that
270 * the block is stored in our inode metadata cache. 270 * the block is stored in our inode metadata cache.
271 * 271 *
272 * This can be called under lock_buffer() 272 * This can be called under lock_buffer()
273 */ 273 */
274int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci, 274int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index fe3419068df2..3e7773089b96 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -98,10 +98,8 @@ static struct ocfs2_xattr_def_value_root def_xv = {
98 98
99struct xattr_handler *ocfs2_xattr_handlers[] = { 99struct xattr_handler *ocfs2_xattr_handlers[] = {
100 &ocfs2_xattr_user_handler, 100 &ocfs2_xattr_user_handler,
101#ifdef CONFIG_OCFS2_FS_POSIX_ACL
102 &ocfs2_xattr_acl_access_handler, 101 &ocfs2_xattr_acl_access_handler,
103 &ocfs2_xattr_acl_default_handler, 102 &ocfs2_xattr_acl_default_handler,
104#endif
105 &ocfs2_xattr_trusted_handler, 103 &ocfs2_xattr_trusted_handler,
106 &ocfs2_xattr_security_handler, 104 &ocfs2_xattr_security_handler,
107 NULL 105 NULL
@@ -109,21 +107,20 @@ struct xattr_handler *ocfs2_xattr_handlers[] = {
109 107
110static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { 108static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
111 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, 109 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
112#ifdef CONFIG_OCFS2_FS_POSIX_ACL
113 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] 110 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
114 = &ocfs2_xattr_acl_access_handler, 111 = &ocfs2_xattr_acl_access_handler,
115 [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] 112 [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
116 = &ocfs2_xattr_acl_default_handler, 113 = &ocfs2_xattr_acl_default_handler,
117#endif
118 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, 114 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
119 [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, 115 [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler,
120}; 116};
121 117
122struct ocfs2_xattr_info { 118struct ocfs2_xattr_info {
123 int name_index; 119 int xi_name_index;
124 const char *name; 120 const char *xi_name;
125 const void *value; 121 int xi_name_len;
126 size_t value_len; 122 const void *xi_value;
123 size_t xi_value_len;
127}; 124};
128 125
129struct ocfs2_xattr_search { 126struct ocfs2_xattr_search {
@@ -141,6 +138,115 @@ struct ocfs2_xattr_search {
141 int not_found; 138 int not_found;
142}; 139};
143 140
141/* Operations on struct ocfs2_xa_entry */
142struct ocfs2_xa_loc;
143struct ocfs2_xa_loc_operations {
144 /*
145 * Journal functions
146 */
147 int (*xlo_journal_access)(handle_t *handle, struct ocfs2_xa_loc *loc,
148 int type);
149 void (*xlo_journal_dirty)(handle_t *handle, struct ocfs2_xa_loc *loc);
150
151 /*
152 * Return a pointer to the appropriate buffer in loc->xl_storage
153 * at the given offset from loc->xl_header.
154 */
155 void *(*xlo_offset_pointer)(struct ocfs2_xa_loc *loc, int offset);
156
157 /* Can we reuse the existing entry for the new value? */
158 int (*xlo_can_reuse)(struct ocfs2_xa_loc *loc,
159 struct ocfs2_xattr_info *xi);
160
161 /* How much space is needed for the new value? */
162 int (*xlo_check_space)(struct ocfs2_xa_loc *loc,
163 struct ocfs2_xattr_info *xi);
164
165 /*
166 * Return the offset of the first name+value pair. This is
167 * the start of our downward-filling free space.
168 */
169 int (*xlo_get_free_start)(struct ocfs2_xa_loc *loc);
170
171 /*
172 * Remove the name+value at this location. Do whatever is
173 * appropriate with the remaining name+value pairs.
174 */
175 void (*xlo_wipe_namevalue)(struct ocfs2_xa_loc *loc);
176
177 /* Fill xl_entry with a new entry */
178 void (*xlo_add_entry)(struct ocfs2_xa_loc *loc, u32 name_hash);
179
180 /* Add name+value storage to an entry */
181 void (*xlo_add_namevalue)(struct ocfs2_xa_loc *loc, int size);
182
183 /*
184 * Initialize the value buf's access and bh fields for this entry.
185 * ocfs2_xa_fill_value_buf() will handle the xv pointer.
186 */
187 void (*xlo_fill_value_buf)(struct ocfs2_xa_loc *loc,
188 struct ocfs2_xattr_value_buf *vb);
189};
190
191/*
192 * Describes an xattr entry location. This is a memory structure
193 * tracking the on-disk structure.
194 */
195struct ocfs2_xa_loc {
196 /* This xattr belongs to this inode */
197 struct inode *xl_inode;
198
199 /* The ocfs2_xattr_header inside the on-disk storage. Not NULL. */
200 struct ocfs2_xattr_header *xl_header;
201
202 /* Bytes from xl_header to the end of the storage */
203 int xl_size;
204
205 /*
206 * The ocfs2_xattr_entry this location describes. If this is
207 * NULL, this location describes the on-disk structure where it
208 * would have been.
209 */
210 struct ocfs2_xattr_entry *xl_entry;
211
212 /*
213 * Internal housekeeping
214 */
215
216 /* Buffer(s) containing this entry */
217 void *xl_storage;
218
219 /* Operations on the storage backing this location */
220 const struct ocfs2_xa_loc_operations *xl_ops;
221};
222
223/*
224 * Convenience functions to calculate how much space is needed for a
225 * given name+value pair
226 */
227static int namevalue_size(int name_len, uint64_t value_len)
228{
229 if (value_len > OCFS2_XATTR_INLINE_SIZE)
230 return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
231 else
232 return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
233}
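A worked sizing example (OCFS2_XATTR_SIZE() pads to a 4-byte boundary; the numbers here are illustrative):

	/*
	 * name "user.data", name_len = 9, padded to 12:
	 *
	 *   inline 20-byte value:
	 *     namevalue_size(9, 20) = 12 + 20 = 32 bytes
	 *   value larger than OCFS2_XATTR_INLINE_SIZE:
	 *     namevalue_size(9, big) = 12 + OCFS2_XATTR_ROOT_SIZE
	 *     (the name plus a value tree root; data lives in extents)
	 */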
234
235static int namevalue_size_xi(struct ocfs2_xattr_info *xi)
236{
237 return namevalue_size(xi->xi_name_len, xi->xi_value_len);
238}
239
240static int namevalue_size_xe(struct ocfs2_xattr_entry *xe)
241{
242 u64 value_len = le64_to_cpu(xe->xe_value_size);
243
244 BUG_ON((value_len > OCFS2_XATTR_INLINE_SIZE) &&
245 ocfs2_xattr_is_local(xe));
246 return namevalue_size(xe->xe_name_len, value_len);
247}
248
249
144static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, 250static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
145 struct ocfs2_xattr_header *xh, 251 struct ocfs2_xattr_header *xh,
146 int index, 252 int index,
@@ -205,8 +311,6 @@ static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
205 int offset, 311 int offset,
206 struct ocfs2_xattr_value_root **xv, 312 struct ocfs2_xattr_value_root **xv,
207 struct buffer_head **bh); 313 struct buffer_head **bh);
208static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
209 const void *value, size_t size, int flags);
210 314
211static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) 315static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
212{ 316{
@@ -218,14 +322,6 @@ static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
218 return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits); 322 return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
219} 323}
220 324
221static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
222{
223 u16 len = sb->s_blocksize -
224 offsetof(struct ocfs2_xattr_header, xh_entries);
225
226 return len / sizeof(struct ocfs2_xattr_entry);
227}
228
229#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr) 325#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
230#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data) 326#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
231#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0)) 327#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
@@ -469,35 +565,22 @@ static u32 ocfs2_xattr_name_hash(struct inode *inode,
469 return hash; 565 return hash;
470} 566}
471 567
472/* 568static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
473 * ocfs2_xattr_hash_entry()
474 *
475 * Compute the hash of an extended attribute.
476 */
477static void ocfs2_xattr_hash_entry(struct inode *inode,
478 struct ocfs2_xattr_header *header,
479 struct ocfs2_xattr_entry *entry)
480{ 569{
481 u32 hash = 0; 570 return namevalue_size(name_len, value_len) +
482 char *name = (char *)header + le16_to_cpu(entry->xe_name_offset); 571 sizeof(struct ocfs2_xattr_entry);
483
484 hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len);
485 entry->xe_name_hash = cpu_to_le32(hash);
486
487 return;
488} 572}
489 573
490static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len) 574static int ocfs2_xi_entry_usage(struct ocfs2_xattr_info *xi)
491{ 575{
492 int size = 0; 576 return namevalue_size_xi(xi) +
493 577 sizeof(struct ocfs2_xattr_entry);
494 if (value_len <= OCFS2_XATTR_INLINE_SIZE) 578}
495 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
496 else
497 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
498 size += sizeof(struct ocfs2_xattr_entry);
499 579
500 return size; 580static int ocfs2_xe_entry_usage(struct ocfs2_xattr_entry *xe)
581{
582 return namevalue_size_xe(xe) +
583 sizeof(struct ocfs2_xattr_entry);
501} 584}
502 585
503int ocfs2_calc_security_init(struct inode *dir, 586int ocfs2_calc_security_init(struct inode *dir,
@@ -1314,452 +1397,897 @@ out:
1314 return ret; 1397 return ret;
1315} 1398}
1316 1399
1317static int ocfs2_xattr_cleanup(struct inode *inode, 1400static int ocfs2_xa_check_space_helper(int needed_space, int free_start,
1318 handle_t *handle, 1401 int num_entries)
1319 struct ocfs2_xattr_info *xi,
1320 struct ocfs2_xattr_search *xs,
1321 struct ocfs2_xattr_value_buf *vb,
1322 size_t offs)
1323{ 1402{
1324 int ret = 0; 1403 int free_space;
1325 size_t name_len = strlen(xi->name);
1326 void *val = xs->base + offs;
1327 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1328 1404
1329 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, 1405 if (!needed_space)
1330 OCFS2_JOURNAL_ACCESS_WRITE); 1406 return 0;
1331 if (ret) {
1332 mlog_errno(ret);
1333 goto out;
1334 }
1335 /* Decrease xattr count */
1336 le16_add_cpu(&xs->header->xh_count, -1);
1337 /* Remove the xattr entry and tree root which has already be set*/
1338 memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
1339 memset(val, 0, size);
1340 1407
1341 ret = ocfs2_journal_dirty(handle, vb->vb_bh); 1408 free_space = free_start -
1342 if (ret < 0) 1409 sizeof(struct ocfs2_xattr_header) -
1343 mlog_errno(ret); 1410 (num_entries * sizeof(struct ocfs2_xattr_entry)) -
1344out: 1411 OCFS2_XATTR_HEADER_GAP;
1345 return ret; 1412 if (free_space < 0)
1413 return -EIO;
1414 if (free_space < needed_space)
1415 return -ENOSPC;
1416
1417 return 0;
1346} 1418}
1347 1419
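The arithmetic in ocfs2_xa_check_space_helper() reflects the storage geometry: entries grow upward from the header while name+value pairs grow downward from the end, and free_start marks the lowest pair. The space left between the two regions is (sketch):

	free_space = free_start
		     - sizeof(struct ocfs2_xattr_header)
		     - num_entries * sizeof(struct ocfs2_xattr_entry)
		     - OCFS2_XATTR_HEADER_GAP;	/* slack kept between regions */

A negative result indicates on-disk inconsistency (-EIO); merely insufficient space is -ENOSPC.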
1348static int ocfs2_xattr_update_entry(struct inode *inode, 1420static int ocfs2_xa_journal_access(handle_t *handle, struct ocfs2_xa_loc *loc,
1349 handle_t *handle, 1421 int type)
1350 struct ocfs2_xattr_info *xi,
1351 struct ocfs2_xattr_search *xs,
1352 struct ocfs2_xattr_value_buf *vb,
1353 size_t offs)
1354{ 1422{
1355 int ret; 1423 return loc->xl_ops->xlo_journal_access(handle, loc, type);
1424}
1356 1425
1357 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, 1426static void ocfs2_xa_journal_dirty(handle_t *handle, struct ocfs2_xa_loc *loc)
1358 OCFS2_JOURNAL_ACCESS_WRITE); 1427{
1359 if (ret) { 1428 loc->xl_ops->xlo_journal_dirty(handle, loc);
1360 mlog_errno(ret); 1429}
1361 goto out;
1362 }
1363 1430
1364 xs->here->xe_name_offset = cpu_to_le16(offs); 1431/* Give a pointer into the storage for the given offset */
1365 xs->here->xe_value_size = cpu_to_le64(xi->value_len); 1432static void *ocfs2_xa_offset_pointer(struct ocfs2_xa_loc *loc, int offset)
1366 if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE) 1433{
1367 ocfs2_xattr_set_local(xs->here, 1); 1434 BUG_ON(offset >= loc->xl_size);
1368 else 1435 return loc->xl_ops->xlo_offset_pointer(loc, offset);
1369 ocfs2_xattr_set_local(xs->here, 0); 1436}
1370 ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
1371 1437
1372 ret = ocfs2_journal_dirty(handle, vb->vb_bh); 1438/*
1373 if (ret < 0) 1439 * Wipe the name+value pair and allow the storage to reclaim it. This
1374 mlog_errno(ret); 1440 * must be followed by either removal of the entry or a call to
1375out: 1441 * ocfs2_xa_add_namevalue().
1376 return ret; 1442 */
1443static void ocfs2_xa_wipe_namevalue(struct ocfs2_xa_loc *loc)
1444{
1445 loc->xl_ops->xlo_wipe_namevalue(loc);
1377} 1446}
1378 1447
1379/* 1448/*
1380 * ocfs2_xattr_set_value_outside() 1449 * Find lowest offset to a name+value pair. This is the start of our
1381 * 1450 * downward-growing free space.
1382 * Set large size value in B tree.
1383 */ 1451 */
1384static int ocfs2_xattr_set_value_outside(struct inode *inode, 1452static int ocfs2_xa_get_free_start(struct ocfs2_xa_loc *loc)
1385 struct ocfs2_xattr_info *xi,
1386 struct ocfs2_xattr_search *xs,
1387 struct ocfs2_xattr_set_ctxt *ctxt,
1388 struct ocfs2_xattr_value_buf *vb,
1389 size_t offs)
1390{ 1453{
1391 size_t name_len = strlen(xi->name); 1454 return loc->xl_ops->xlo_get_free_start(loc);
1392 void *val = xs->base + offs; 1455}
1393 struct ocfs2_xattr_value_root *xv = NULL;
1394 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1395 int ret = 0;
1396 1456
1397 memset(val, 0, size); 1457/* Can we reuse loc->xl_entry for xi? */
1398 memcpy(val, xi->name, name_len); 1458static int ocfs2_xa_can_reuse_entry(struct ocfs2_xa_loc *loc,
1399 xv = (struct ocfs2_xattr_value_root *) 1459 struct ocfs2_xattr_info *xi)
1400 (val + OCFS2_XATTR_SIZE(name_len)); 1460{
1401 xv->xr_clusters = 0; 1461 return loc->xl_ops->xlo_can_reuse(loc, xi);
1402 xv->xr_last_eb_blk = 0; 1462}
1403 xv->xr_list.l_tree_depth = 0; 1463
1404 xv->xr_list.l_count = cpu_to_le16(1); 1464/* How much free space is needed to set the new value */
1405 xv->xr_list.l_next_free_rec = 0; 1465static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc,
1406 vb->vb_xv = xv; 1466 struct ocfs2_xattr_info *xi)
1407 1467{
1408 ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt); 1468 return loc->xl_ops->xlo_check_space(loc, xi);
1409 if (ret < 0) { 1469}
1410 mlog_errno(ret); 1470
1411 return ret; 1471static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1472{
1473 loc->xl_ops->xlo_add_entry(loc, name_hash);
1474 loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash);
1475 /*
1476 * We can't leave the new entry's xe_name_offset at zero or
1477 * add_namevalue() will go nuts. We set it to the size of our
1478 * storage so that it can never be less than any other entry.
1479 */
1480 loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size);
1481}
1482
1483static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc,
1484 struct ocfs2_xattr_info *xi)
1485{
1486 int size = namevalue_size_xi(xi);
1487 int nameval_offset;
1488 char *nameval_buf;
1489
1490 loc->xl_ops->xlo_add_namevalue(loc, size);
1491 loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
1492 loc->xl_entry->xe_name_len = xi->xi_name_len;
1493 ocfs2_xattr_set_type(loc->xl_entry, xi->xi_name_index);
1494 ocfs2_xattr_set_local(loc->xl_entry,
1495 xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE);
1496
1497 nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1498 nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
1499 memset(nameval_buf, 0, size);
1500 memcpy(nameval_buf, xi->xi_name, xi->xi_name_len);
1501}
1502
1503static void ocfs2_xa_fill_value_buf(struct ocfs2_xa_loc *loc,
1504 struct ocfs2_xattr_value_buf *vb)
1505{
1506 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1507 int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
1508
1509 /* Value bufs are for value trees */
1510 BUG_ON(ocfs2_xattr_is_local(loc->xl_entry));
1511 BUG_ON(namevalue_size_xe(loc->xl_entry) !=
1512 (name_size + OCFS2_XATTR_ROOT_SIZE));
1513
1514 loc->xl_ops->xlo_fill_value_buf(loc, vb);
1515 vb->vb_xv =
1516 (struct ocfs2_xattr_value_root *)ocfs2_xa_offset_pointer(loc,
1517 nameval_offset +
1518 name_size);
1519}
1520
1521static int ocfs2_xa_block_journal_access(handle_t *handle,
1522 struct ocfs2_xa_loc *loc, int type)
1523{
1524 struct buffer_head *bh = loc->xl_storage;
1525 ocfs2_journal_access_func access;
1526
1527 if (loc->xl_size == (bh->b_size -
1528 offsetof(struct ocfs2_xattr_block,
1529 xb_attrs.xb_header)))
1530 access = ocfs2_journal_access_xb;
1531 else
1532 access = ocfs2_journal_access_di;
1533 return access(handle, INODE_CACHE(loc->xl_inode), bh, type);
1534}
1535
1536static void ocfs2_xa_block_journal_dirty(handle_t *handle,
1537 struct ocfs2_xa_loc *loc)
1538{
1539 struct buffer_head *bh = loc->xl_storage;
1540
1541 ocfs2_journal_dirty(handle, bh);
1542}
1543
1544static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc,
1545 int offset)
1546{
1547 return (char *)loc->xl_header + offset;
1548}
1549
1550static int ocfs2_xa_block_can_reuse(struct ocfs2_xa_loc *loc,
1551 struct ocfs2_xattr_info *xi)
1552{
1553 /*
1554 * Block storage is strict. If the sizes aren't exact, we will
1555 * remove the old one and reinsert the new.
1556 */
1557 return namevalue_size_xe(loc->xl_entry) ==
1558 namevalue_size_xi(xi);
1559}
1560
1561static int ocfs2_xa_block_get_free_start(struct ocfs2_xa_loc *loc)
1562{
1563 struct ocfs2_xattr_header *xh = loc->xl_header;
1564 int i, count = le16_to_cpu(xh->xh_count);
1565 int offset, free_start = loc->xl_size;
1566
1567 for (i = 0; i < count; i++) {
1568 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
1569 if (offset < free_start)
1570 free_start = offset;
1412 } 1571 }
1413 ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs); 1572
1414 if (ret < 0) { 1573 return free_start;
1415 mlog_errno(ret); 1574}
1416 return ret; 1575
1576static int ocfs2_xa_block_check_space(struct ocfs2_xa_loc *loc,
1577 struct ocfs2_xattr_info *xi)
1578{
1579 int count = le16_to_cpu(loc->xl_header->xh_count);
1580 int free_start = ocfs2_xa_get_free_start(loc);
1581 int needed_space = ocfs2_xi_entry_usage(xi);
1582
1583 /*
1584 * Block storage will reclaim the original entry before inserting
1585 * the new value, so we only need the difference. If the new
1586 * entry is smaller than the old one, we don't need anything.
1587 */
1588 if (loc->xl_entry) {
1589 /* Don't need space if we're reusing! */
1590 if (ocfs2_xa_can_reuse_entry(loc, xi))
1591 needed_space = 0;
1592 else
1593 needed_space -= ocfs2_xe_entry_usage(loc->xl_entry);
1417 } 1594 }
1418 ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb, 1595 if (needed_space < 0)
1419 xi->value, xi->value_len); 1596 needed_space = 0;
1420 if (ret < 0) 1597 return ocfs2_xa_check_space_helper(needed_space, free_start, count);
1421 mlog_errno(ret); 1598}
1422 1599
1423 return ret; 1600/*
1601 * Block storage for xattrs keeps the name+value pairs compacted. When
1602 * we remove one, we have to shift any that preceded it towards the end.
1603 */
1604static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
1605{
1606 int i, offset;
1607 int namevalue_offset, first_namevalue_offset, namevalue_size;
1608 struct ocfs2_xattr_entry *entry = loc->xl_entry;
1609 struct ocfs2_xattr_header *xh = loc->xl_header;
1610 int count = le16_to_cpu(xh->xh_count);
1611
1612 namevalue_offset = le16_to_cpu(entry->xe_name_offset);
1613 namevalue_size = namevalue_size_xe(entry);
1614 first_namevalue_offset = ocfs2_xa_get_free_start(loc);
1615
1616 /* Shift the name+value pairs */
1617 memmove((char *)xh + first_namevalue_offset + namevalue_size,
1618 (char *)xh + first_namevalue_offset,
1619 namevalue_offset - first_namevalue_offset);
1620 memset((char *)xh + first_namevalue_offset, 0, namevalue_size);
1621
1622 /* Now tell xh->xh_entries about it */
1623 for (i = 0; i < count; i++) {
1624 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
1625 if (offset <= namevalue_offset)
1626 le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
1627 namevalue_size);
1628 }
1629
1630 /*
1631 * Note that we don't update xh_free_start or xh_name_value_len
1632 * because they're not used in block-stored xattrs.
1633 */
1634}
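
The memmove here is easier to see on a toy buffer. In this sketch (illustrative offsets, not ocfs2 structures), pairs are packed at the end of the region; wiping the middle pair slides everything below it toward the end. In the real code, each xe_name_offset at or below the hole is then bumped by the wiped size:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* 12-byte region, pairs packed at the end: AA@5, BBB@7, CC@10. */
	char buf[13] = ".....AABBBCC";
	int first = 5;		/* lowest used offset (the "free start") */
	int off = 7, size = 3;	/* the pair being wiped: BBB */

	/* Slide everything below the hole toward the end of the region. */
	memmove(buf + first + size, buf + first, off - first);
	/* Clear the space that opened up at the bottom. */
	memset(buf + first, '.', size);

	printf("%s\n", buf);	/* prints "........AACC" */
	return 0;
}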
1635
1636static void ocfs2_xa_block_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1637{
1638 int count = le16_to_cpu(loc->xl_header->xh_count);
1639 loc->xl_entry = &(loc->xl_header->xh_entries[count]);
1640 le16_add_cpu(&loc->xl_header->xh_count, 1);
1641 memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
1642}
1643
1644static void ocfs2_xa_block_add_namevalue(struct ocfs2_xa_loc *loc, int size)
1645{
1646 int free_start = ocfs2_xa_get_free_start(loc);
1647
1648 loc->xl_entry->xe_name_offset = cpu_to_le16(free_start - size);
1649}
1650
1651static void ocfs2_xa_block_fill_value_buf(struct ocfs2_xa_loc *loc,
1652 struct ocfs2_xattr_value_buf *vb)
1653{
1654 struct buffer_head *bh = loc->xl_storage;
1655
1656 if (loc->xl_size == (bh->b_size -
1657 offsetof(struct ocfs2_xattr_block,
1658 xb_attrs.xb_header)))
1659 vb->vb_access = ocfs2_journal_access_xb;
1660 else
1661 vb->vb_access = ocfs2_journal_access_di;
1662 vb->vb_bh = bh;
1424} 1663}
1425 1664
1426/* 1665/*
1427 * ocfs2_xattr_set_entry_local() 1666 * Operations for xattrs stored in blocks. This includes inline inode
1428 * 1667 * storage and unindexed ocfs2_xattr_blocks.
1429 * Set, replace or remove extended attribute in local.
1430 */ 1668 */
1431static void ocfs2_xattr_set_entry_local(struct inode *inode, 1669static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = {
1432 struct ocfs2_xattr_info *xi, 1670 .xlo_journal_access = ocfs2_xa_block_journal_access,
1433 struct ocfs2_xattr_search *xs, 1671 .xlo_journal_dirty = ocfs2_xa_block_journal_dirty,
1434 struct ocfs2_xattr_entry *last, 1672 .xlo_offset_pointer = ocfs2_xa_block_offset_pointer,
1435 size_t min_offs) 1673 .xlo_check_space = ocfs2_xa_block_check_space,
1674 .xlo_can_reuse = ocfs2_xa_block_can_reuse,
1675 .xlo_get_free_start = ocfs2_xa_block_get_free_start,
1676 .xlo_wipe_namevalue = ocfs2_xa_block_wipe_namevalue,
1677 .xlo_add_entry = ocfs2_xa_block_add_entry,
1678 .xlo_add_namevalue = ocfs2_xa_block_add_namevalue,
1679 .xlo_fill_value_buf = ocfs2_xa_block_fill_value_buf,
1680};
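
This ops table is the heart of the refactor: the generic ocfs2_xa_* wrappers earlier in the hunk simply dispatch through loc->xl_ops, so block and bucket storage share one code path. A minimal userspace sketch of that pattern, with hypothetical simplified types:

#include <stdio.h>

struct xa_loc;

/* Hypothetical, pared-down analogue of ocfs2_xa_loc_operations. */
struct xa_loc_ops {
	int (*get_free_start)(struct xa_loc *loc);
};

struct xa_loc {
	const struct xa_loc_ops *ops;
	int size;
};

/* Storage-specific behavior, like ocfs2_xa_block_get_free_start(). */
static int block_get_free_start(struct xa_loc *loc)
{
	return loc->size;
}

static const struct xa_loc_ops block_ops = {
	.get_free_start = block_get_free_start,
};

/* Generic wrapper, like ocfs2_xa_get_free_start(). */
static int xa_get_free_start(struct xa_loc *loc)
{
	return loc->ops->get_free_start(loc);
}

int main(void)
{
	struct xa_loc loc = { .ops = &block_ops, .size = 4096 };

	printf("free start: %d\n", xa_get_free_start(&loc));
	return 0;
}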
1681
1682static int ocfs2_xa_bucket_journal_access(handle_t *handle,
1683 struct ocfs2_xa_loc *loc, int type)
1436{ 1684{
1437 size_t name_len = strlen(xi->name); 1685 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1438 int i;
1439 1686
1440 if (xi->value && xs->not_found) { 1687 return ocfs2_xattr_bucket_journal_access(handle, bucket, type);
1441 /* Insert the new xattr entry. */ 1688}
1442 le16_add_cpu(&xs->header->xh_count, 1); 1689
1443 ocfs2_xattr_set_type(last, xi->name_index); 1690static void ocfs2_xa_bucket_journal_dirty(handle_t *handle,
1444 ocfs2_xattr_set_local(last, 1); 1691 struct ocfs2_xa_loc *loc)
1445 last->xe_name_len = name_len; 1692{
1446 } else { 1693 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1447 void *first_val; 1694
1448 void *val; 1695 ocfs2_xattr_bucket_journal_dirty(handle, bucket);
1449 size_t offs, size; 1696}
1450 1697
1451 first_val = xs->base + min_offs; 1698static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc,
1452 offs = le16_to_cpu(xs->here->xe_name_offset); 1699 int offset)
1453 val = xs->base + offs; 1700{
1454 1701 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1455 if (le64_to_cpu(xs->here->xe_value_size) > 1702 int block, block_offset;
1456 OCFS2_XATTR_INLINE_SIZE) 1703
1457 size = OCFS2_XATTR_SIZE(name_len) + 1704 /* The header is at the front of the bucket */
1458 OCFS2_XATTR_ROOT_SIZE; 1705 block = offset >> loc->xl_inode->i_sb->s_blocksize_bits;
1706 block_offset = offset % loc->xl_inode->i_sb->s_blocksize;
1707
1708 return bucket_block(bucket, block) + block_offset;
1709}
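
Buckets span several blocks, so a bucket-relative offset must be split into a block index and an intra-block offset. A worked example of the arithmetic above, assuming 4096-byte blocks:

#include <stdio.h>

int main(void)
{
	const int blocksize_bits = 12;		/* log2(4096) */
	const int blocksize = 1 << blocksize_bits;
	int offset = 5000;			/* bucket-relative offset */

	int block = offset >> blocksize_bits;	/* 1 */
	int block_offset = offset % blocksize;	/* 904 */

	printf("block %d, offset %d\n", block, block_offset);
	return 0;
}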
1710
1711static int ocfs2_xa_bucket_can_reuse(struct ocfs2_xa_loc *loc,
1712 struct ocfs2_xattr_info *xi)
1713{
1714 return namevalue_size_xe(loc->xl_entry) >=
1715 namevalue_size_xi(xi);
1716}
1717
1718static int ocfs2_xa_bucket_get_free_start(struct ocfs2_xa_loc *loc)
1719{
1720 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1721 return le16_to_cpu(bucket_xh(bucket)->xh_free_start);
1722}
1723
1724static int ocfs2_bucket_align_free_start(struct super_block *sb,
1725 int free_start, int size)
1726{
1727 /*
1728 * We need to make sure that the name+value pair fits within
1729 * one block.
1730 */
1731 if (((free_start - size) >> sb->s_blocksize_bits) !=
1732 ((free_start - 1) >> sb->s_blocksize_bits))
1733 free_start -= free_start % sb->s_blocksize;
1734
1735 return free_start;
1736}
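
A standalone version of this straddle check makes the behavior concrete. The pair occupies [free_start - size, free_start); if its first and last bytes land in different blocks, free_start is pulled back to the previous block boundary. A sketch assuming 4096-byte blocks:

#include <stdio.h>

static int align_free_start(int free_start, int size)
{
	const int blocksize_bits = 12;
	const int blocksize = 1 << blocksize_bits;

	if (((free_start - size) >> blocksize_bits) !=
	    ((free_start - 1) >> blocksize_bits))
		free_start -= free_start % blocksize;

	return free_start;
}

int main(void)
{
	/* 4100 - 200 = 3900 is in block 0 but byte 4099 is in block 1: realign. */
	printf("%d\n", align_free_start(4100, 200));	/* prints 4096 */
	/* 4000 - 200 = 3800 and byte 3999 are both in block 0: unchanged. */
	printf("%d\n", align_free_start(4000, 200));	/* prints 4000 */
	return 0;
}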
1737
1738static int ocfs2_xa_bucket_check_space(struct ocfs2_xa_loc *loc,
1739 struct ocfs2_xattr_info *xi)
1740{
1741 int rc;
1742 int count = le16_to_cpu(loc->xl_header->xh_count);
1743 int free_start = ocfs2_xa_get_free_start(loc);
1744 int needed_space = ocfs2_xi_entry_usage(xi);
1745 int size = namevalue_size_xi(xi);
1746 struct super_block *sb = loc->xl_inode->i_sb;
1747
1748 /*
1749 * Bucket storage does not reclaim name+value pairs it cannot
1750 * reuse. They live as holes until the bucket fills, and then
1751 * the bucket is defragmented. However, the bucket can reclaim
1752 * the ocfs2_xattr_entry.
1753 */
1754 if (loc->xl_entry) {
1755 /* Don't need space if we're reusing! */
1756 if (ocfs2_xa_can_reuse_entry(loc, xi))
1757 needed_space = 0;
1459 else 1758 else
1460 size = OCFS2_XATTR_SIZE(name_len) + 1759 needed_space -= sizeof(struct ocfs2_xattr_entry);
1461 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); 1760 }
1462 1761 BUG_ON(needed_space < 0);
1463 if (xi->value && size == OCFS2_XATTR_SIZE(name_len) +
1464 OCFS2_XATTR_SIZE(xi->value_len)) {
1465 /* The old and the new value have the
1466 same size. Just replace the value. */
1467 ocfs2_xattr_set_local(xs->here, 1);
1468 xs->here->xe_value_size = cpu_to_le64(xi->value_len);
1469 /* Clear value bytes. */
1470 memset(val + OCFS2_XATTR_SIZE(name_len),
1471 0,
1472 OCFS2_XATTR_SIZE(xi->value_len));
1473 memcpy(val + OCFS2_XATTR_SIZE(name_len),
1474 xi->value,
1475 xi->value_len);
1476 return;
1477 }
1478 /* Remove the old name+value. */
1479 memmove(first_val + size, first_val, val - first_val);
1480 memset(first_val, 0, size);
1481 xs->here->xe_name_hash = 0;
1482 xs->here->xe_name_offset = 0;
1483 ocfs2_xattr_set_local(xs->here, 1);
1484 xs->here->xe_value_size = 0;
1485
1486 min_offs += size;
1487
1488 /* Adjust all value offsets. */
1489 last = xs->header->xh_entries;
1490 for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
1491 size_t o = le16_to_cpu(last->xe_name_offset);
1492
1493 if (o < offs)
1494 last->xe_name_offset = cpu_to_le16(o + size);
1495 last += 1;
1496 }
1497 1762
1498 if (!xi->value) { 1763 if (free_start < size) {
1499 /* Remove the old entry. */ 1764 if (needed_space)
1500 last -= 1; 1765 return -ENOSPC;
1501 memmove(xs->here, xs->here + 1, 1766 } else {
1502 (void *)last - (void *)xs->here); 1767 /*
1503 memset(last, 0, sizeof(struct ocfs2_xattr_entry)); 1768 * First we check if it would fit in the first place.
1504 le16_add_cpu(&xs->header->xh_count, -1); 1769 * Below, we align the free start to a block. This may
1505 } 1770 * slide us below the minimum gap. By checking unaligned
1771 * first, we avoid that error.
1772 */
1773 rc = ocfs2_xa_check_space_helper(needed_space, free_start,
1774 count);
1775 if (rc)
1776 return rc;
1777 free_start = ocfs2_bucket_align_free_start(sb, free_start,
1778 size);
1506 } 1779 }
1507 if (xi->value) { 1780 return ocfs2_xa_check_space_helper(needed_space, free_start, count);
1508 /* Insert the new name+value. */ 1781}
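
ocfs2_xi_entry_usage() and ocfs2_xa_check_space_helper() are defined earlier in the file; a toy model of the accounting, under the assumption that the helper simply compares the gap between the entry array and free_start against the need, with an assumed 16-byte entry size:

#include <stdio.h>

#define ENTRY_SIZE	16	/* assumed sizeof(struct ocfs2_xattr_entry) */
#define ENOSPC_ERR	(-28)	/* -ENOSPC */

/* Assumed shape of ocfs2_xa_check_space_helper(), for illustration. */
static int check_space_helper(int needed, int free_start, int count)
{
	int free_space = free_start - count * ENTRY_SIZE;

	return free_space >= needed ? 0 : ENOSPC_ERR;
}

int main(void)
{
	int needed = ENTRY_SIZE + 24;	/* new entry plus a 24-byte name+value pair */

	/*
	 * Overwriting an entry the bucket cannot reuse: the entry slot is
	 * reclaimed, but the old name+value hole is not (it waits for a
	 * defragment), so only ENTRY_SIZE comes off the bill.
	 */
	needed -= ENTRY_SIZE;
	printf("%d\n", check_space_helper(needed, 128, 4));	/* 0: 64 >= 24 */
	return 0;
}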
1509 size_t size = OCFS2_XATTR_SIZE(name_len) + 1782
1510 OCFS2_XATTR_SIZE(xi->value_len); 1783static void ocfs2_xa_bucket_wipe_namevalue(struct ocfs2_xa_loc *loc)
1511 void *val = xs->base + min_offs - size; 1784{
1785 le16_add_cpu(&loc->xl_header->xh_name_value_len,
1786 -namevalue_size_xe(loc->xl_entry));
1787}
1788
1789static void ocfs2_xa_bucket_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1790{
1791 struct ocfs2_xattr_header *xh = loc->xl_header;
1792 int count = le16_to_cpu(xh->xh_count);
1793 int low = 0, high = count - 1, tmp;
1794 struct ocfs2_xattr_entry *tmp_xe;
1512 1795
1513 xs->here->xe_name_offset = cpu_to_le16(min_offs - size); 1796 /*
1514 memset(val, 0, size); 1797 * We keep buckets sorted by name_hash, so we need to find
1515 memcpy(val, xi->name, name_len); 1798 * our insert place.
1516 memcpy(val + OCFS2_XATTR_SIZE(name_len), 1799 */
1517 xi->value, 1800 while (low <= high && count) {
1518 xi->value_len); 1801 tmp = (low + high) / 2;
1519 xs->here->xe_value_size = cpu_to_le64(xi->value_len); 1802 tmp_xe = &xh->xh_entries[tmp];
1520 ocfs2_xattr_set_local(xs->here, 1); 1803
1521 ocfs2_xattr_hash_entry(inode, xs->header, xs->here); 1804 if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
1805 low = tmp + 1;
1806 else if (name_hash < le32_to_cpu(tmp_xe->xe_name_hash))
1807 high = tmp - 1;
1808 else {
1809 low = tmp;
1810 break;
1811 }
1522 } 1812 }
1523 1813
1524 return; 1814 if (low != count)
1815 memmove(&xh->xh_entries[low + 1],
1816 &xh->xh_entries[low],
1817 ((count - low) * sizeof(struct ocfs2_xattr_entry)));
1818
1819 le16_add_cpu(&xh->xh_count, 1);
1820 loc->xl_entry = &xh->xh_entries[low];
1821 memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
1822}
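
The search above keeps xh_entries ordered by xe_name_hash. The same insert, reduced to a standalone sketch over plain hash values:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Find the slot with a binary search, then memmove the tail up by one. */
static void sorted_insert(uint32_t *entries, int *count, uint32_t name_hash)
{
	int low = 0, high = *count - 1, tmp;

	while (low <= high && *count) {
		tmp = (low + high) / 2;
		if (name_hash > entries[tmp])
			low = tmp + 1;
		else if (name_hash < entries[tmp])
			high = tmp - 1;
		else {
			low = tmp;
			break;
		}
	}

	if (low != *count)
		memmove(&entries[low + 1], &entries[low],
			(*count - low) * sizeof(entries[0]));

	entries[low] = name_hash;
	(*count)++;
}

int main(void)
{
	uint32_t entries[8] = { 10, 20, 40 };
	int count = 3, i;

	sorted_insert(entries, &count, 30);
	for (i = 0; i < count; i++)
		printf("%u ", entries[i]);	/* 10 20 30 40 */
	printf("\n");
	return 0;
}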
1823
1824static void ocfs2_xa_bucket_add_namevalue(struct ocfs2_xa_loc *loc, int size)
1825{
1826 int free_start = ocfs2_xa_get_free_start(loc);
1827 struct ocfs2_xattr_header *xh = loc->xl_header;
1828 struct super_block *sb = loc->xl_inode->i_sb;
1829 int nameval_offset;
1830
1831 free_start = ocfs2_bucket_align_free_start(sb, free_start, size);
1832 nameval_offset = free_start - size;
1833 loc->xl_entry->xe_name_offset = cpu_to_le16(nameval_offset);
1834 xh->xh_free_start = cpu_to_le16(nameval_offset);
1835 le16_add_cpu(&xh->xh_name_value_len, size);
1836
1837}
1838
1839static void ocfs2_xa_bucket_fill_value_buf(struct ocfs2_xa_loc *loc,
1840 struct ocfs2_xattr_value_buf *vb)
1841{
1842 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1843 struct super_block *sb = loc->xl_inode->i_sb;
1844 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1845 int size = namevalue_size_xe(loc->xl_entry);
1846 int block_offset = nameval_offset >> sb->s_blocksize_bits;
1847
1848 /* Values are not allowed to straddle block boundaries */
1849 BUG_ON(block_offset !=
1850 ((nameval_offset + size - 1) >> sb->s_blocksize_bits));
1851 /* We expect the bucket to be filled in */
1852 BUG_ON(!bucket->bu_bhs[block_offset]);
1853
1854 vb->vb_access = ocfs2_journal_access;
1855 vb->vb_bh = bucket->bu_bhs[block_offset];
1856}
1857
1858/* Operations for xattrs stored in buckets. */
1859static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = {
1860 .xlo_journal_access = ocfs2_xa_bucket_journal_access,
1861 .xlo_journal_dirty = ocfs2_xa_bucket_journal_dirty,
1862 .xlo_offset_pointer = ocfs2_xa_bucket_offset_pointer,
1863 .xlo_check_space = ocfs2_xa_bucket_check_space,
1864 .xlo_can_reuse = ocfs2_xa_bucket_can_reuse,
1865 .xlo_get_free_start = ocfs2_xa_bucket_get_free_start,
1866 .xlo_wipe_namevalue = ocfs2_xa_bucket_wipe_namevalue,
1867 .xlo_add_entry = ocfs2_xa_bucket_add_entry,
1868 .xlo_add_namevalue = ocfs2_xa_bucket_add_namevalue,
1869 .xlo_fill_value_buf = ocfs2_xa_bucket_fill_value_buf,
1870};
1871
1872static unsigned int ocfs2_xa_value_clusters(struct ocfs2_xa_loc *loc)
1873{
1874 struct ocfs2_xattr_value_buf vb;
1875
1876 if (ocfs2_xattr_is_local(loc->xl_entry))
1877 return 0;
1878
1879 ocfs2_xa_fill_value_buf(loc, &vb);
1880 return le32_to_cpu(vb.vb_xv->xr_clusters);
1881}
1882
1883static int ocfs2_xa_value_truncate(struct ocfs2_xa_loc *loc, u64 bytes,
1884 struct ocfs2_xattr_set_ctxt *ctxt)
1885{
1886 int trunc_rc, access_rc;
1887 struct ocfs2_xattr_value_buf vb;
1888
1889 ocfs2_xa_fill_value_buf(loc, &vb);
1890 trunc_rc = ocfs2_xattr_value_truncate(loc->xl_inode, &vb, bytes,
1891 ctxt);
1892
1893 /*
1894 * The caller of ocfs2_xa_value_truncate() has already called
1895 * ocfs2_xa_journal_access on the loc. However, the truncate code
1896 * calls ocfs2_extend_trans(). This may commit the previous
1897 * transaction and open a new one. If this is a bucket, truncate
1898 * could leave only vb->vb_bh set up for journaling. Meanwhile,
1899 * the caller is expecting to dirty the entire bucket. So we must
1900 * reset the journal work. We do this even if truncate has failed,
1901 * as it could have failed after committing the extend.
1902 */
1903 access_rc = ocfs2_xa_journal_access(ctxt->handle, loc,
1904 OCFS2_JOURNAL_ACCESS_WRITE);
1905
1906 /* Errors in truncate take precedence */
1907 return trunc_rc ? trunc_rc : access_rc;
1908}
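
The control flow here is a small but important pattern: the journal re-access runs unconditionally, and the truncate's error wins if both steps fail. Reduced to a sketch:

#include <stdio.h>

static int do_truncate_step(int fail) { return fail ? -5 /* -EIO */ : 0; }
static int redo_journal_access(void) { return 0; }

/* Cleanup runs even when the main step failed; first error wins. */
static int value_truncate(int fail)
{
	int trunc_rc = do_truncate_step(fail);
	int access_rc = redo_journal_access();	/* unconditionally */

	return trunc_rc ? trunc_rc : access_rc;
}

int main(void)
{
	printf("%d %d\n", value_truncate(0), value_truncate(1));	/* 0 -5 */
	return 0;
}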
1909
1910static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc)
1911{
1912 int index, count;
1913 struct ocfs2_xattr_header *xh = loc->xl_header;
1914 struct ocfs2_xattr_entry *entry = loc->xl_entry;
1915
1916 ocfs2_xa_wipe_namevalue(loc);
1917 loc->xl_entry = NULL;
1918
1919 le16_add_cpu(&xh->xh_count, -1);
1920 count = le16_to_cpu(xh->xh_count);
1921
1922 /*
1923 * Only zero out the entry if there are more remaining. This is
1924 * important for an empty bucket, as it keeps track of the
1925 * bucket's hash value. It doesn't hurt empty block storage.
1926 */
1927 if (count) {
1928 index = ((char *)entry - (char *)&xh->xh_entries) /
1929 sizeof(struct ocfs2_xattr_entry);
1930 memmove(&xh->xh_entries[index], &xh->xh_entries[index + 1],
1931 (count - index) * sizeof(struct ocfs2_xattr_entry));
1932 memset(&xh->xh_entries[count], 0,
1933 sizeof(struct ocfs2_xattr_entry));
1934 }
1525} 1935}
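
The index recovery and compaction above, as a standalone sketch over a plain array (illustrative types, not the kernel structures):

#include <stdio.h>
#include <string.h>

struct entry { unsigned int hash; };

int main(void)
{
	struct entry entries[4] = { {1}, {2}, {3}, {4} };
	struct entry *victim = &entries[1];
	int count = 4;
	int i, index = (int)(victim - entries);	/* pointer difference: 1 */

	/* Close the gap, then zero the now-unused tail slot. */
	memmove(&entries[index], &entries[index + 1],
		(count - 1 - index) * sizeof(struct entry));
	count--;
	memset(&entries[count], 0, sizeof(struct entry));

	for (i = 0; i < count; i++)
		printf("%u ", entries[i].hash);	/* 1 3 4 */
	printf("\n");
	return 0;
}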
1526 1936
1527/* 1937/*
1528 * ocfs2_xattr_set_entry() 1938 * If we have a problem adjusting the size of an external value during
1939 * ocfs2_xa_prepare_entry() or ocfs2_xa_remove(), we may have an xattr
1940 * in an intermediate state. For example, the value may be partially
1941 * truncated.
1529 * 1942 *
1530 * Set extended attribute entry into inode or block. 1943 * If the value tree hasn't changed, the extend/truncate went nowhere.
1944 * We have nothing to do. The caller can treat it as a straight error.
1531 * 1945 *
1532 * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE, 1946 * If the value tree got partially truncated, we now have a corrupted
1533 * We first insert the tree root (ocfs2_xattr_value_root) with set_entry_local(), 1947 * extended attribute. We're going to wipe its entry and leak the
1534 * then set the value in the B-tree with set_value_outside(). 1948 * clusters. Better to leak some storage than leave a corrupt entry.
1949 *
1950 * If the value tree grew, it obviously didn't grow enough for the
1951 * new entry. We're not going to try and reclaim those clusters either.
1952 * If there was already an external value there (orig_clusters != 0),
1953 * the new clusters are attached safely and we can just leave the old
1954 * value in place. If there was no external value there, we remove
1955 * the entry.
1956 *
1957 * This way, the xattr block we store in the journal will be consistent.
1958 * If the size change broke because of the journal, no changes will hit
1959 * disk anyway.
1535 */ 1960 */
1536static int ocfs2_xattr_set_entry(struct inode *inode, 1961static void ocfs2_xa_cleanup_value_truncate(struct ocfs2_xa_loc *loc,
1537 struct ocfs2_xattr_info *xi, 1962 const char *what,
1538 struct ocfs2_xattr_search *xs, 1963 unsigned int orig_clusters)
1539 struct ocfs2_xattr_set_ctxt *ctxt, 1964{
1540 int flag) 1965 unsigned int new_clusters = ocfs2_xa_value_clusters(loc);
1541{ 1966 char *nameval_buf = ocfs2_xa_offset_pointer(loc,
1542 struct ocfs2_xattr_entry *last; 1967 le16_to_cpu(loc->xl_entry->xe_name_offset));
1543 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1968
1544 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 1969 if (new_clusters < orig_clusters) {
1545 size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name); 1970 mlog(ML_ERROR,
1546 size_t size_l = 0; 1971 "Partial truncate while %s xattr %.*s. Leaking "
1547 handle_t *handle = ctxt->handle; 1972 "%u clusters and removing the entry\n",
1548 int free, i, ret; 1973 what, loc->xl_entry->xe_name_len, nameval_buf,
1549 struct ocfs2_xattr_info xi_l = { 1974 orig_clusters - new_clusters);
1550 .name_index = xi->name_index, 1975 ocfs2_xa_remove_entry(loc);
1551 .name = xi->name, 1976 } else if (!orig_clusters) {
1552 .value = xi->value, 1977 mlog(ML_ERROR,
1553 .value_len = xi->value_len, 1978 "Unable to allocate an external value for xattr "
1554 }; 1979 "%.*s safely. Leaking %u clusters and removing the "
1555 struct ocfs2_xattr_value_buf vb = { 1980 "entry\n",
1556 .vb_bh = xs->xattr_bh, 1981 loc->xl_entry->xe_name_len, nameval_buf,
1557 .vb_access = ocfs2_journal_access_di, 1982 new_clusters - orig_clusters);
1558 }; 1983 ocfs2_xa_remove_entry(loc);
1984 } else if (new_clusters > orig_clusters)
1985 mlog(ML_ERROR,
1986 "Unable to grow xattr %.*s safely. %u new clusters "
1987 "have been added, but the value will not be "
1988 "modified\n",
1989 loc->xl_entry->xe_name_len, nameval_buf,
1990 new_clusters - orig_clusters);
1991}
1992
1993static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc,
1994 struct ocfs2_xattr_set_ctxt *ctxt)
1995{
1996 int rc = 0;
1997 unsigned int orig_clusters;
1998
1999 if (!ocfs2_xattr_is_local(loc->xl_entry)) {
2000 orig_clusters = ocfs2_xa_value_clusters(loc);
2001 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2002 if (rc) {
2003 mlog_errno(rc);
2004 /*
2005 * Since this is remove, we can return 0 if
2006 * ocfs2_xa_cleanup_value_truncate() is going to
2007 * wipe the entry anyway. So we check the
2008 * cluster count as well.
2009 */
2010 if (orig_clusters != ocfs2_xa_value_clusters(loc))
2011 rc = 0;
2012 ocfs2_xa_cleanup_value_truncate(loc, "removing",
2013 orig_clusters);
2014 if (rc)
2015 goto out;
2016 }
2017 }
1559 2018
1560 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 2019 ocfs2_xa_remove_entry(loc);
1561 BUG_ON(xs->xattr_bh == xs->inode_bh);
1562 vb.vb_access = ocfs2_journal_access_xb;
1563 } else
1564 BUG_ON(xs->xattr_bh != xs->inode_bh);
1565 2020
1566 /* Compute min_offs, last and free space. */ 2021out:
1567 last = xs->header->xh_entries; 2022 return rc;
2023}
1568 2024
1569 for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) { 2025static void ocfs2_xa_install_value_root(struct ocfs2_xa_loc *loc)
1570 size_t offs = le16_to_cpu(last->xe_name_offset); 2026{
1571 if (offs < min_offs) 2027 int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
1572 min_offs = offs; 2028 char *nameval_buf;
1573 last += 1;
1574 }
1575 2029
1576 free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP; 2030 nameval_buf = ocfs2_xa_offset_pointer(loc,
1577 if (free < 0) 2031 le16_to_cpu(loc->xl_entry->xe_name_offset));
1578 return -EIO; 2032 memcpy(nameval_buf + name_size, &def_xv, OCFS2_XATTR_ROOT_SIZE);
2033}
1579 2034
1580 if (!xs->not_found) { 2035/*
1581 size_t size = 0; 2036 * Take an existing entry and make it ready for the new value. This
1582 if (ocfs2_xattr_is_local(xs->here)) 2037 * won't allocate space, but it may free space. It should be ready for
1583 size = OCFS2_XATTR_SIZE(name_len) + 2038 * ocfs2_xa_prepare_entry() to finish the work.
1584 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); 2039 */
1585 else 2040static int ocfs2_xa_reuse_entry(struct ocfs2_xa_loc *loc,
1586 size = OCFS2_XATTR_SIZE(name_len) + 2041 struct ocfs2_xattr_info *xi,
1587 OCFS2_XATTR_ROOT_SIZE; 2042 struct ocfs2_xattr_set_ctxt *ctxt)
1588 free += (size + sizeof(struct ocfs2_xattr_entry)); 2043{
1589 } 2044 int rc = 0;
1590 /* Check free space in inode or block */ 2045 int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
1591 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2046 unsigned int orig_clusters;
1592 if (free < sizeof(struct ocfs2_xattr_entry) + 2047 char *nameval_buf;
1593 OCFS2_XATTR_SIZE(name_len) + 2048 int xe_local = ocfs2_xattr_is_local(loc->xl_entry);
1594 OCFS2_XATTR_ROOT_SIZE) { 2049 int xi_local = xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE;
1595 ret = -ENOSPC; 2050
1596 goto out; 2051 BUG_ON(OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len) !=
2052 name_size);
2053
2054 nameval_buf = ocfs2_xa_offset_pointer(loc,
2055 le16_to_cpu(loc->xl_entry->xe_name_offset));
2056 if (xe_local) {
2057 memset(nameval_buf + name_size, 0,
2058 namevalue_size_xe(loc->xl_entry) - name_size);
2059 if (!xi_local)
2060 ocfs2_xa_install_value_root(loc);
2061 } else {
2062 orig_clusters = ocfs2_xa_value_clusters(loc);
2063 if (xi_local) {
2064 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2065 if (rc < 0)
2066 mlog_errno(rc);
2067 else
2068 memset(nameval_buf + name_size, 0,
2069 namevalue_size_xe(loc->xl_entry) -
2070 name_size);
2071 } else if (le64_to_cpu(loc->xl_entry->xe_value_size) >
2072 xi->xi_value_len) {
2073 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len,
2074 ctxt);
2075 if (rc < 0)
2076 mlog_errno(rc);
1597 } 2077 }
1598 size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; 2078
1599 xi_l.value = (void *)&def_xv; 2079 if (rc) {
1600 xi_l.value_len = OCFS2_XATTR_ROOT_SIZE; 2080 ocfs2_xa_cleanup_value_truncate(loc, "reusing",
1601 } else if (xi->value) { 2081 orig_clusters);
1602 if (free < sizeof(struct ocfs2_xattr_entry) +
1603 OCFS2_XATTR_SIZE(name_len) +
1604 OCFS2_XATTR_SIZE(xi->value_len)) {
1605 ret = -ENOSPC;
1606 goto out; 2082 goto out;
1607 } 2083 }
1608 } 2084 }
1609 2085
1610 if (!xs->not_found) { 2086 loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
1611 /* For existing extended attribute */ 2087 ocfs2_xattr_set_local(loc->xl_entry, xi_local);
1612 size_t size = OCFS2_XATTR_SIZE(name_len) +
1613 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
1614 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1615 void *val = xs->base + offs;
1616 2088
1617 if (ocfs2_xattr_is_local(xs->here) && size == size_l) { 2089out:
1618 /* Replace existing local xattr with tree root */ 2090 return rc;
1619 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, 2091}
1620 ctxt, &vb, offs);
1621 if (ret < 0)
1622 mlog_errno(ret);
1623 goto out;
1624 } else if (!ocfs2_xattr_is_local(xs->here)) {
1625 /* For existing xattr which has value outside */
1626 vb.vb_xv = (struct ocfs2_xattr_value_root *)
1627 (val + OCFS2_XATTR_SIZE(name_len));
1628 2092
1629 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2093/*
1630 /* 2094 * Prepares loc->xl_entry to receive the new xattr. This includes
1631 * If the new value must also be stored outside, 2095 * properly setting up the name+value pair region. If loc->xl_entry
1632 * first truncate the old value to the new length, 2096 * already exists, it will take care of modifying it appropriately.
1633 * then set the new value with set_value_outside(). 2097 *
1634 */ 2098 * Note that this modifies the data. You did journal_access already,
1635 ret = ocfs2_xattr_value_truncate(inode, 2099 * right?
1636 &vb, 2100 */
1637 xi->value_len, 2101static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
1638 ctxt); 2102 struct ocfs2_xattr_info *xi,
1639 if (ret < 0) { 2103 u32 name_hash,
1640 mlog_errno(ret); 2104 struct ocfs2_xattr_set_ctxt *ctxt)
1641 goto out; 2105{
1642 } 2106 int rc = 0;
2107 unsigned int orig_clusters;
2108 __le64 orig_value_size = 0;
1643 2109
1644 ret = ocfs2_xattr_update_entry(inode, 2110 rc = ocfs2_xa_check_space(loc, xi);
1645 handle, 2111 if (rc)
1646 xi, 2112 goto out;
1647 xs,
1648 &vb,
1649 offs);
1650 if (ret < 0) {
1651 mlog_errno(ret);
1652 goto out;
1653 }
1654 2113
1655 ret = __ocfs2_xattr_set_value_outside(inode, 2114 if (loc->xl_entry) {
1656 handle, 2115 if (ocfs2_xa_can_reuse_entry(loc, xi)) {
1657 &vb, 2116 orig_value_size = loc->xl_entry->xe_value_size;
1658 xi->value, 2117 rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
1659 xi->value_len); 2118 if (rc)
1660 if (ret < 0) 2119 goto out;
1661 mlog_errno(ret); 2120 goto alloc_value;
2121 }
2122
2123 if (!ocfs2_xattr_is_local(loc->xl_entry)) {
2124 orig_clusters = ocfs2_xa_value_clusters(loc);
2125 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2126 if (rc) {
2127 mlog_errno(rc);
2128 ocfs2_xa_cleanup_value_truncate(loc,
2129 "overwriting",
2130 orig_clusters);
1662 goto out; 2131 goto out;
1663 } else {
1664 /*
1665 * If the new value will be stored locally,
1666 * just truncate the old value to zero.
1667 */
1668 ret = ocfs2_xattr_value_truncate(inode,
1669 &vb,
1670 0,
1671 ctxt);
1672 if (ret < 0)
1673 mlog_errno(ret);
1674 } 2132 }
1675 } 2133 }
2134 ocfs2_xa_wipe_namevalue(loc);
2135 } else
2136 ocfs2_xa_add_entry(loc, name_hash);
2137
2138 /*
2139 * If we get here, we have a blank entry. Fill it. We grow our
2140 * name+value pair back from the end.
2141 */
2142 ocfs2_xa_add_namevalue(loc, xi);
2143 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
2144 ocfs2_xa_install_value_root(loc);
2145
2146alloc_value:
2147 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2148 orig_clusters = ocfs2_xa_value_clusters(loc);
2149 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
2150 if (rc < 0) {
2151 /*
2152 * If we tried to grow an existing external value,
2153 * ocfs2_xa_cleanuP-value_truncate() is going to
2154 * let it stand. We have to restore its original
2155 * value size.
2156 */
2157 loc->xl_entry->xe_value_size = orig_value_size;
2158 ocfs2_xa_cleanup_value_truncate(loc, "growing",
2159 orig_clusters);
2160 mlog_errno(rc);
2161 }
1676 } 2162 }
1677 2163
1678 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), xs->inode_bh, 2164out:
2165 return rc;
2166}
2167
2168/*
2169 * Store the value portion of the name+value pair. This will skip
2170 * values that are stored externally. Their tree roots were set up
2171 * by ocfs2_xa_prepare_entry().
2172 */
2173static int ocfs2_xa_store_value(struct ocfs2_xa_loc *loc,
2174 struct ocfs2_xattr_info *xi,
2175 struct ocfs2_xattr_set_ctxt *ctxt)
2176{
2177 int rc = 0;
2178 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
2179 int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
2180 char *nameval_buf;
2181 struct ocfs2_xattr_value_buf vb;
2182
2183 nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
2184 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2185 ocfs2_xa_fill_value_buf(loc, &vb);
2186 rc = __ocfs2_xattr_set_value_outside(loc->xl_inode,
2187 ctxt->handle, &vb,
2188 xi->xi_value,
2189 xi->xi_value_len);
2190 } else
2191 memcpy(nameval_buf + name_size, xi->xi_value, xi->xi_value_len);
2192
2193 return rc;
2194}
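
The branch here mirrors the on-disk split: small values sit inline right after the (padded) name, large ones leave only a tree root in the pair and write through the value buf. A toy model, assuming OCFS2_XATTR_INLINE_SIZE is 80 and with the external cluster I/O stubbed out:

#include <stdio.h>
#include <string.h>

#define INLINE_SIZE	80	/* assumed value of OCFS2_XATTR_INLINE_SIZE */

/* Only the inline/external dispatch is modeled here. */
static void store_value(char *nameval_buf, int name_size,
			const char *value, int value_len)
{
	if (value_len > INLINE_SIZE)
		printf("external: %d bytes go through the value tree\n",
		       value_len);
	else
		memcpy(nameval_buf + name_size, value, value_len);
}

int main(void)
{
	char buf[128] = { 0 };

	store_value(buf, 8, "small", 6);	/* 6 includes the NUL */
	printf("inline: %s\n", buf + 8);
	store_value(buf, 8, "", 200);		/* only the length matters */
	return 0;
}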
2195
2196static int ocfs2_xa_set(struct ocfs2_xa_loc *loc,
2197 struct ocfs2_xattr_info *xi,
2198 struct ocfs2_xattr_set_ctxt *ctxt)
2199{
2200 int ret;
2201 u32 name_hash = ocfs2_xattr_name_hash(loc->xl_inode, xi->xi_name,
2202 xi->xi_name_len);
2203
2204 ret = ocfs2_xa_journal_access(ctxt->handle, loc,
1679 OCFS2_JOURNAL_ACCESS_WRITE); 2205 OCFS2_JOURNAL_ACCESS_WRITE);
1680 if (ret) { 2206 if (ret) {
1681 mlog_errno(ret); 2207 mlog_errno(ret);
1682 goto out; 2208 goto out;
1683 } 2209 }
1684 2210
1685 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1686 ret = vb.vb_access(handle, INODE_CACHE(inode), vb.vb_bh,
1687 OCFS2_JOURNAL_ACCESS_WRITE);
1688 if (ret) {
1689 mlog_errno(ret);
1690 goto out;
1691 }
1692 }
1693
1694 /* 2211 /*
1695 * Set the value locally, including setting the tree root locally. 2212 * From here on out, everything is going to modify the buffer a
1696 * This is the first step for value sizes > INLINE_SIZE. 2213 * little. Errors are going to leave the xattr header in a
2214 * sane state. Thus, even with errors we dirty the sucker.
1697 */ 2215 */
1698 ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs);
1699 2216
1700 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 2217 /* Don't worry, we are never called with !xi_value and !xl_entry */
1701 ret = ocfs2_journal_dirty(handle, xs->xattr_bh); 2218 if (!xi->xi_value) {
1702 if (ret < 0) { 2219 ret = ocfs2_xa_remove(loc, ctxt);
1703 mlog_errno(ret); 2220 goto out_dirty;
1704 goto out;
1705 }
1706 } 2221 }
1707 2222
1708 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) && 2223 ret = ocfs2_xa_prepare_entry(loc, xi, name_hash, ctxt);
1709 (flag & OCFS2_INLINE_XATTR_FL)) { 2224 if (ret) {
1710 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2225 if (ret != -ENOSPC)
1711 unsigned int xattrsize = osb->s_xattr_inline_size; 2226 mlog_errno(ret);
1712 2227 goto out_dirty;
1713 /*
1714 * Adjust extent record count or inline data size
1715 * to reserve space for extended attribute.
1716 */
1717 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1718 struct ocfs2_inline_data *idata = &di->id2.i_data;
1719 le16_add_cpu(&idata->id_count, -xattrsize);
1720 } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
1721 struct ocfs2_extent_list *el = &di->id2.i_list;
1722 le16_add_cpu(&el->l_count, -(xattrsize /
1723 sizeof(struct ocfs2_extent_rec)));
1724 }
1725 di->i_xattr_inline_size = cpu_to_le16(xattrsize);
1726 } 2228 }
1727 /* Update xattr flag */
1728 spin_lock(&oi->ip_lock);
1729 oi->ip_dyn_features |= flag;
1730 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
1731 spin_unlock(&oi->ip_lock);
1732 2229
1733 ret = ocfs2_journal_dirty(handle, xs->inode_bh); 2230 ret = ocfs2_xa_store_value(loc, xi, ctxt);
1734 if (ret < 0) 2231 if (ret)
1735 mlog_errno(ret); 2232 mlog_errno(ret);
1736 2233
1737 if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2234out_dirty:
1738 /* 2235 ocfs2_xa_journal_dirty(ctxt->handle, loc);
1739 * Set the value outside, in the B-tree.
1740 * This is the second step for value sizes > INLINE_SIZE.
1741 */
1742 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1743 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt,
1744 &vb, offs);
1745 if (ret < 0) {
1746 int ret2;
1747 2236
1748 mlog_errno(ret);
1749 /*
1750 * If set value outside failed, we have to clean
1751 * the junk tree root we have already set in local.
1752 */
1753 ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
1754 xi, xs, &vb, offs);
1755 if (ret2 < 0)
1756 mlog_errno(ret2);
1757 }
1758 }
1759out: 2237out:
1760 return ret; 2238 return ret;
1761} 2239}
1762 2240
2241static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc,
2242 struct inode *inode,
2243 struct buffer_head *bh,
2244 struct ocfs2_xattr_entry *entry)
2245{
2246 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
2247
2248 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_XATTR_FL));
2249
2250 loc->xl_inode = inode;
2251 loc->xl_ops = &ocfs2_xa_block_loc_ops;
2252 loc->xl_storage = bh;
2253 loc->xl_entry = entry;
2254 loc->xl_size = le16_to_cpu(di->i_xattr_inline_size);
2255 loc->xl_header =
2256 (struct ocfs2_xattr_header *)(bh->b_data + bh->b_size -
2257 loc->xl_size);
2258}
2259
2260static void ocfs2_init_xattr_block_xa_loc(struct ocfs2_xa_loc *loc,
2261 struct inode *inode,
2262 struct buffer_head *bh,
2263 struct ocfs2_xattr_entry *entry)
2264{
2265 struct ocfs2_xattr_block *xb =
2266 (struct ocfs2_xattr_block *)bh->b_data;
2267
2268 BUG_ON(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED);
2269
2270 loc->xl_inode = inode;
2271 loc->xl_ops = &ocfs2_xa_block_loc_ops;
2272 loc->xl_storage = bh;
2273 loc->xl_header = &(xb->xb_attrs.xb_header);
2274 loc->xl_entry = entry;
2275 loc->xl_size = bh->b_size - offsetof(struct ocfs2_xattr_block,
2276 xb_attrs.xb_header);
2277}
2278
2279static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc,
2280 struct ocfs2_xattr_bucket *bucket,
2281 struct ocfs2_xattr_entry *entry)
2282{
2283 loc->xl_inode = bucket->bu_inode;
2284 loc->xl_ops = &ocfs2_xa_bucket_loc_ops;
2285 loc->xl_storage = bucket;
2286 loc->xl_header = bucket_xh(bucket);
2287 loc->xl_entry = entry;
2288 loc->xl_size = OCFS2_XATTR_BUCKET_SIZE;
2289}
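
Note how the dinode initializer aims the header at the tail of the inode block, while the xattr-block initializer uses everything past the block's own header. A worked example of the dinode case, with illustrative sizes:

#include <stdio.h>

int main(void)
{
	size_t b_size = 4096;	/* inode block size, illustrative */
	size_t xl_size = 256;	/* di->i_xattr_inline_size, illustrative */
	size_t header_offset = b_size - xl_size;

	/* The inline xattr area occupies the last xl_size bytes. */
	printf("inline xattr area: bytes %zu..%zu\n",
	       header_offset, b_size - 1);	/* 3840..4095 */
	return 0;
}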
2290
1763/* 2291/*
1764 * In xattr remove, if it is stored outside and refcounted, we may have 2292 * In xattr remove, if it is stored outside and refcounted, we may have
1765 * the chance to split the refcount tree. So need the allocators. 2293 * the chance to split the refcount tree. So need the allocators.
@@ -2155,6 +2683,55 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
2155 return 0; 2683 return 0;
2156} 2684}
2157 2685
2686static int ocfs2_xattr_ibody_init(struct inode *inode,
2687 struct buffer_head *di_bh,
2688 struct ocfs2_xattr_set_ctxt *ctxt)
2689{
2690 int ret;
2691 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2692 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2693 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2694 unsigned int xattrsize = osb->s_xattr_inline_size;
2695
2696 if (!ocfs2_xattr_has_space_inline(inode, di)) {
2697 ret = -ENOSPC;
2698 goto out;
2699 }
2700
2701 ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), di_bh,
2702 OCFS2_JOURNAL_ACCESS_WRITE);
2703 if (ret) {
2704 mlog_errno(ret);
2705 goto out;
2706 }
2707
2708 /*
2709 * Adjust extent record count or inline data size
2710 * to reserve space for extended attribute.
2711 */
2712 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2713 struct ocfs2_inline_data *idata = &di->id2.i_data;
2714 le16_add_cpu(&idata->id_count, -xattrsize);
2715 } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
2716 struct ocfs2_extent_list *el = &di->id2.i_list;
2717 le16_add_cpu(&el->l_count, -(xattrsize /
2718 sizeof(struct ocfs2_extent_rec)));
2719 }
2720 di->i_xattr_inline_size = cpu_to_le16(xattrsize);
2721
2722 spin_lock(&oi->ip_lock);
2723 oi->ip_dyn_features |= OCFS2_INLINE_XATTR_FL|OCFS2_HAS_XATTR_FL;
2724 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2725 spin_unlock(&oi->ip_lock);
2726
2727 ret = ocfs2_journal_dirty(ctxt->handle, di_bh);
2728 if (ret < 0)
2729 mlog_errno(ret);
2730
2731out:
2732 return ret;
2733}
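
The reservation arithmetic here is simple but easy to miss: the inline xattr area is carved out of space the inode would otherwise use for extent records (or inline data). A sketch with assumed sizes:

#include <stdio.h>

int main(void)
{
	int xattrsize = 256;		/* osb->s_xattr_inline_size, illustrative */
	int extent_rec_size = 16;	/* assumed sizeof(struct ocfs2_extent_rec) */

	/* l_count shrinks by 256 / 16 = 16 records; for inline data,
	 * id_count shrinks by the full 256 bytes instead. */
	printf("l_count shrinks by %d records\n", xattrsize / extent_rec_size);
	return 0;
}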
2734
2158/* 2735/*
2159 * ocfs2_xattr_ibody_set() 2736 * ocfs2_xattr_ibody_set()
2160 * 2737 *
@@ -2166,9 +2743,10 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
2166 struct ocfs2_xattr_search *xs, 2743 struct ocfs2_xattr_search *xs,
2167 struct ocfs2_xattr_set_ctxt *ctxt) 2744 struct ocfs2_xattr_set_ctxt *ctxt)
2168{ 2745{
2746 int ret;
2169 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2747 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2170 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 2748 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
2171 int ret; 2749 struct ocfs2_xa_loc loc;
2172 2750
2173 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) 2751 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
2174 return -ENOSPC; 2752 return -ENOSPC;
@@ -2181,8 +2759,25 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
2181 } 2759 }
2182 } 2760 }
2183 2761
2184 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, 2762 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
2185 (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL)); 2763 ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
2764 if (ret) {
2765 if (ret != -ENOSPC)
2766 mlog_errno(ret);
2767 goto out;
2768 }
2769 }
2770
2771 ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
2772 xs->not_found ? NULL : xs->here);
2773 ret = ocfs2_xa_set(&loc, xi, ctxt);
2774 if (ret) {
2775 if (ret != -ENOSPC)
2776 mlog_errno(ret);
2777 goto out;
2778 }
2779 xs->here = loc.xl_entry;
2780
2186out: 2781out:
2187 up_write(&oi->ip_alloc_sem); 2782 up_write(&oi->ip_alloc_sem);
2188 2783
@@ -2242,12 +2837,11 @@ cleanup:
2242 return ret; 2837 return ret;
2243} 2838}
2244 2839
2245static int ocfs2_create_xattr_block(handle_t *handle, 2840static int ocfs2_create_xattr_block(struct inode *inode,
2246 struct inode *inode,
2247 struct buffer_head *inode_bh, 2841 struct buffer_head *inode_bh,
2248 struct ocfs2_alloc_context *meta_ac, 2842 struct ocfs2_xattr_set_ctxt *ctxt,
2249 struct buffer_head **ret_bh, 2843 int indexed,
2250 int indexed) 2844 struct buffer_head **ret_bh)
2251{ 2845{
2252 int ret; 2846 int ret;
2253 u16 suballoc_bit_start; 2847 u16 suballoc_bit_start;
@@ -2258,14 +2852,14 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2258 struct buffer_head *new_bh = NULL; 2852 struct buffer_head *new_bh = NULL;
2259 struct ocfs2_xattr_block *xblk; 2853 struct ocfs2_xattr_block *xblk;
2260 2854
2261 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), inode_bh, 2855 ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode),
2262 OCFS2_JOURNAL_ACCESS_CREATE); 2856 inode_bh, OCFS2_JOURNAL_ACCESS_CREATE);
2263 if (ret < 0) { 2857 if (ret < 0) {
2264 mlog_errno(ret); 2858 mlog_errno(ret);
2265 goto end; 2859 goto end;
2266 } 2860 }
2267 2861
2268 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 2862 ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1,
2269 &suballoc_bit_start, &num_got, 2863 &suballoc_bit_start, &num_got,
2270 &first_blkno); 2864 &first_blkno);
2271 if (ret < 0) { 2865 if (ret < 0) {
@@ -2276,7 +2870,7 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2276 new_bh = sb_getblk(inode->i_sb, first_blkno); 2870 new_bh = sb_getblk(inode->i_sb, first_blkno);
2277 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); 2871 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
2278 2872
2279 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), 2873 ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode),
2280 new_bh, 2874 new_bh,
2281 OCFS2_JOURNAL_ACCESS_CREATE); 2875 OCFS2_JOURNAL_ACCESS_CREATE);
2282 if (ret < 0) { 2876 if (ret < 0) {
@@ -2288,11 +2882,10 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2288 xblk = (struct ocfs2_xattr_block *)new_bh->b_data; 2882 xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
2289 memset(xblk, 0, inode->i_sb->s_blocksize); 2883 memset(xblk, 0, inode->i_sb->s_blocksize);
2290 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); 2884 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
2291 xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num); 2885 xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
2292 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); 2886 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
2293 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); 2887 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
2294 xblk->xb_blkno = cpu_to_le64(first_blkno); 2888 xblk->xb_blkno = cpu_to_le64(first_blkno);
2295
2296 if (indexed) { 2889 if (indexed) {
2297 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; 2890 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
2298 xr->xt_clusters = cpu_to_le32(1); 2891 xr->xt_clusters = cpu_to_le32(1);
@@ -2303,14 +2896,17 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2303 xr->xt_list.l_next_free_rec = cpu_to_le16(1); 2896 xr->xt_list.l_next_free_rec = cpu_to_le16(1);
2304 xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED); 2897 xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED);
2305 } 2898 }
2899 ocfs2_journal_dirty(ctxt->handle, new_bh);
2306 2900
2307 ret = ocfs2_journal_dirty(handle, new_bh); 2901 /* Add it to the inode */
2308 if (ret < 0) {
2309 mlog_errno(ret);
2310 goto end;
2311 }
2312 di->i_xattr_loc = cpu_to_le64(first_blkno); 2902 di->i_xattr_loc = cpu_to_le64(first_blkno);
2313 ocfs2_journal_dirty(handle, inode_bh); 2903
2904 spin_lock(&OCFS2_I(inode)->ip_lock);
2905 OCFS2_I(inode)->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
2906 di->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features);
2907 spin_unlock(&OCFS2_I(inode)->ip_lock);
2908
2909 ocfs2_journal_dirty(ctxt->handle, inode_bh);
2314 2910
2315 *ret_bh = new_bh; 2911 *ret_bh = new_bh;
2316 new_bh = NULL; 2912 new_bh = NULL;
@@ -2332,13 +2928,13 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2332 struct ocfs2_xattr_set_ctxt *ctxt) 2928 struct ocfs2_xattr_set_ctxt *ctxt)
2333{ 2929{
2334 struct buffer_head *new_bh = NULL; 2930 struct buffer_head *new_bh = NULL;
2335 handle_t *handle = ctxt->handle;
2336 struct ocfs2_xattr_block *xblk = NULL; 2931 struct ocfs2_xattr_block *xblk = NULL;
2337 int ret; 2932 int ret;
2933 struct ocfs2_xa_loc loc;
2338 2934
2339 if (!xs->xattr_bh) { 2935 if (!xs->xattr_bh) {
2340 ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh, 2936 ret = ocfs2_create_xattr_block(inode, xs->inode_bh, ctxt,
2341 ctxt->meta_ac, &new_bh, 0); 2937 0, &new_bh);
2342 if (ret) { 2938 if (ret) {
2343 mlog_errno(ret); 2939 mlog_errno(ret);
2344 goto end; 2940 goto end;
@@ -2354,21 +2950,25 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2354 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; 2950 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
2355 2951
2356 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { 2952 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
2357 /* Set extended attribute into external block */ 2953 ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh,
2358 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, 2954 xs->not_found ? NULL : xs->here);
2359 OCFS2_HAS_XATTR_FL);
2360 if (!ret || ret != -ENOSPC)
2361 goto end;
2362 2955
2363 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); 2956 ret = ocfs2_xa_set(&loc, xi, ctxt);
2364 if (ret) 2957 if (!ret)
2958 xs->here = loc.xl_entry;
2959 else if (ret != -ENOSPC)
2365 goto end; 2960 goto end;
2961 else {
2962 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
2963 if (ret)
2964 goto end;
2965 }
2366 } 2966 }
2367 2967
2368 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt); 2968 if (le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)
2969 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
2369 2970
2370end: 2971end:
2371
2372 return ret; 2972 return ret;
2373} 2973}
2374 2974
@@ -2377,7 +2977,6 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
2377 struct ocfs2_xattr_info *xi, 2977 struct ocfs2_xattr_info *xi,
2378 struct ocfs2_xattr_search *xs) 2978 struct ocfs2_xattr_search *xs)
2379{ 2979{
2380 u64 value_size;
2381 struct ocfs2_xattr_entry *last; 2980 struct ocfs2_xattr_entry *last;
2382 int free, i; 2981 int free, i;
2383 size_t min_offs = xs->end - xs->base; 2982 size_t min_offs = xs->end - xs->base;
@@ -2400,13 +2999,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
2400 2999
2401 BUG_ON(!xs->not_found); 3000 BUG_ON(!xs->not_found);
2402 3001
2403 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) 3002 if (free >= (sizeof(struct ocfs2_xattr_entry) + namevalue_size_xi(xi)))
2404 value_size = OCFS2_XATTR_ROOT_SIZE;
2405 else
2406 value_size = OCFS2_XATTR_SIZE(xi->value_len);
2407
2408 if (free >= sizeof(struct ocfs2_xattr_entry) +
2409 OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size)
2410 return 1; 3003 return 1;
2411 3004
2412 return 0; 3005 return 0;
@@ -2430,7 +3023,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2430 char *base = NULL; 3023 char *base = NULL;
2431 int name_offset, name_len = 0; 3024 int name_offset, name_len = 0;
2432 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, 3025 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
2433 xi->value_len); 3026 xi->xi_value_len);
2434 u64 value_size; 3027 u64 value_size;
2435 3028
2436 /* 3029 /*
@@ -2438,14 +3031,14 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2438 * No matter whether we replace an old one or add a new one, 3031 * No matter whether we replace an old one or add a new one,
2439 * we need this for writing. 3032 * we need this for writing.
2440 */ 3033 */
2441 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) 3034 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
2442 credits += new_clusters * 3035 credits += new_clusters *
2443 ocfs2_clusters_to_blocks(inode->i_sb, 1); 3036 ocfs2_clusters_to_blocks(inode->i_sb, 1);
2444 3037
2445 if (xis->not_found && xbs->not_found) { 3038 if (xis->not_found && xbs->not_found) {
2446 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); 3039 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2447 3040
2448 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 3041 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2449 clusters_add += new_clusters; 3042 clusters_add += new_clusters;
2450 credits += ocfs2_calc_extend_credits(inode->i_sb, 3043 credits += ocfs2_calc_extend_credits(inode->i_sb,
2451 &def_xv.xv.xr_list, 3044 &def_xv.xv.xr_list,
@@ -2490,7 +3083,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2490 * The credits for removing the value tree will be extended 3083 * The credits for removing the value tree will be extended
2491 * by ocfs2_remove_extent itself. 3084 * by ocfs2_remove_extent itself.
2492 */ 3085 */
2493 if (!xi->value) { 3086 if (!xi->xi_value) {
2494 if (!ocfs2_xattr_is_local(xe)) 3087 if (!ocfs2_xattr_is_local(xe))
2495 credits += ocfs2_remove_extent_credits(inode->i_sb); 3088 credits += ocfs2_remove_extent_credits(inode->i_sb);
2496 3089
@@ -2520,7 +3113,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2520 } 3113 }
2521 } 3114 }
2522 3115
2523 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 3116 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2524 /* the new values will be stored outside. */ 3117 /* the new values will be stored outside. */
2525 u32 old_clusters = 0; 3118 u32 old_clusters = 0;
2526 3119
@@ -2553,9 +3146,10 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2553 * value, we don't need any allocation, otherwise we have 3146 * value, we don't need any allocation, otherwise we have
2554 * to guess metadata allocation. 3147 * to guess metadata allocation.
2555 */ 3148 */
2556 if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) || 3149 if ((ocfs2_xattr_is_local(xe) &&
3150 (value_size >= xi->xi_value_len)) ||
2557 (!ocfs2_xattr_is_local(xe) && 3151 (!ocfs2_xattr_is_local(xe) &&
2558 OCFS2_XATTR_ROOT_SIZE >= xi->value_len)) 3152 OCFS2_XATTR_ROOT_SIZE >= xi->xi_value_len))
2559 goto out; 3153 goto out;
2560 } 3154 }
2561 3155
@@ -2645,7 +3239,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
2645 3239
2646 meta_add += extra_meta; 3240 meta_add += extra_meta;
2647 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, " 3241 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
2648 "credits = %d\n", xi->name, meta_add, clusters_add, *credits); 3242 "credits = %d\n", xi->xi_name, meta_add, clusters_add, *credits);
2649 3243
2650 if (meta_add) { 3244 if (meta_add) {
2651 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, 3245 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
@@ -2685,7 +3279,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2685{ 3279{
2686 int ret = 0, credits, old_found; 3280 int ret = 0, credits, old_found;
2687 3281
2688 if (!xi->value) { 3282 if (!xi->xi_value) {
2689 /* Remove existing extended attribute */ 3283 /* Remove existing extended attribute */
2690 if (!xis->not_found) 3284 if (!xis->not_found)
2691 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt); 3285 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
@@ -2699,8 +3293,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2699 * If succeed and that extended attribute existing in 3293 * If succeed and that extended attribute existing in
2700 * external block, then we will remove it. 3294 * external block, then we will remove it.
2701 */ 3295 */
2702 xi->value = NULL; 3296 xi->xi_value = NULL;
2703 xi->value_len = 0; 3297 xi->xi_value_len = 0;
2704 3298
2705 old_found = xis->not_found; 3299 old_found = xis->not_found;
2706 xis->not_found = -ENODATA; 3300 xis->not_found = -ENODATA;
@@ -2728,8 +3322,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2728 } else if (ret == -ENOSPC) { 3322 } else if (ret == -ENOSPC) {
2729 if (di->i_xattr_loc && !xbs->xattr_bh) { 3323 if (di->i_xattr_loc && !xbs->xattr_bh) {
2730 ret = ocfs2_xattr_block_find(inode, 3324 ret = ocfs2_xattr_block_find(inode,
2731 xi->name_index, 3325 xi->xi_name_index,
2732 xi->name, xbs); 3326 xi->xi_name, xbs);
2733 if (ret) 3327 if (ret)
2734 goto out; 3328 goto out;
2735 3329
@@ -2768,8 +3362,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2768 * If succeed and that extended attribute 3362 * If succeed and that extended attribute
2769 * existing in inode, we will remove it. 3363 * existing in inode, we will remove it.
2770 */ 3364 */
2771 xi->value = NULL; 3365 xi->xi_value = NULL;
2772 xi->value_len = 0; 3366 xi->xi_value_len = 0;
2773 xbs->not_found = -ENODATA; 3367 xbs->not_found = -ENODATA;
2774 ret = ocfs2_calc_xattr_set_need(inode, 3368 ret = ocfs2_calc_xattr_set_need(inode,
2775 di, 3369 di,
@@ -2835,10 +3429,11 @@ int ocfs2_xattr_set_handle(handle_t *handle,
2835 int ret; 3429 int ret;
2836 3430
2837 struct ocfs2_xattr_info xi = { 3431 struct ocfs2_xattr_info xi = {
2838 .name_index = name_index, 3432 .xi_name_index = name_index,
2839 .name = name, 3433 .xi_name = name,
2840 .value = value, 3434 .xi_name_len = strlen(name),
2841 .value_len = value_len, 3435 .xi_value = value,
3436 .xi_value_len = value_len,
2842 }; 3437 };
2843 3438
2844 struct ocfs2_xattr_search xis = { 3439 struct ocfs2_xattr_search xis = {
@@ -2918,10 +3513,11 @@ int ocfs2_xattr_set(struct inode *inode,
2918 struct ocfs2_refcount_tree *ref_tree = NULL; 3513 struct ocfs2_refcount_tree *ref_tree = NULL;
2919 3514
2920 struct ocfs2_xattr_info xi = { 3515 struct ocfs2_xattr_info xi = {
2921 .name_index = name_index, 3516 .xi_name_index = name_index,
2922 .name = name, 3517 .xi_name = name,
2923 .value = value, 3518 .xi_name_len = strlen(name),
2924 .value_len = value_len, 3519 .xi_value = value,
3520 .xi_value_len = value_len,
2925 }; 3521 };
2926 3522
2927 struct ocfs2_xattr_search xis = { 3523 struct ocfs2_xattr_search xis = {
@@ -3765,7 +4361,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
3765 struct ocfs2_xattr_bucket *bucket) 4361 struct ocfs2_xattr_bucket *bucket)
3766{ 4362{
3767 int ret, i; 4363 int ret, i;
3768 size_t end, offset, len, value_len; 4364 size_t end, offset, len;
3769 struct ocfs2_xattr_header *xh; 4365 struct ocfs2_xattr_header *xh;
3770 char *entries, *buf, *bucket_buf = NULL; 4366 char *entries, *buf, *bucket_buf = NULL;
3771 u64 blkno = bucket_blkno(bucket); 4367 u64 blkno = bucket_blkno(bucket);
@@ -3819,12 +4415,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
3819 end = OCFS2_XATTR_BUCKET_SIZE; 4415 end = OCFS2_XATTR_BUCKET_SIZE;
3820 for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) { 4416 for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
3821 offset = le16_to_cpu(xe->xe_name_offset); 4417 offset = le16_to_cpu(xe->xe_name_offset);
3822 if (ocfs2_xattr_is_local(xe)) 4418 len = namevalue_size_xe(xe);
3823 value_len = OCFS2_XATTR_SIZE(
3824 le64_to_cpu(xe->xe_value_size));
3825 else
3826 value_len = OCFS2_XATTR_ROOT_SIZE;
3827 len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len;
3828 4419
3829 /* 4420 /*
3830 * We must make sure that the name/value pair 4421 * We must make sure that the name/value pair
@@ -4013,7 +4604,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4013 int new_bucket_head) 4604 int new_bucket_head)
4014{ 4605{
4015 int ret, i; 4606 int ret, i;
4016 int count, start, len, name_value_len = 0, xe_len, name_offset = 0; 4607 int count, start, len, name_value_len = 0, name_offset = 0;
4017 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL; 4608 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
4018 struct ocfs2_xattr_header *xh; 4609 struct ocfs2_xattr_header *xh;
4019 struct ocfs2_xattr_entry *xe; 4610 struct ocfs2_xattr_entry *xe;
@@ -4104,13 +4695,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4104 name_value_len = 0; 4695 name_value_len = 0;
4105 for (i = 0; i < start; i++) { 4696 for (i = 0; i < start; i++) {
4106 xe = &xh->xh_entries[i]; 4697 xe = &xh->xh_entries[i];
4107 xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len); 4698 name_value_len += namevalue_size_xe(xe);
4108 if (ocfs2_xattr_is_local(xe))
4109 xe_len +=
4110 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4111 else
4112 xe_len += OCFS2_XATTR_ROOT_SIZE;
4113 name_value_len += xe_len;
4114 if (le16_to_cpu(xe->xe_name_offset) < name_offset) 4699 if (le16_to_cpu(xe->xe_name_offset) < name_offset)
4115 name_offset = le16_to_cpu(xe->xe_name_offset); 4700 name_offset = le16_to_cpu(xe->xe_name_offset);
4116 } 4701 }
@@ -4140,12 +4725,6 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4140 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); 4725 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
4141 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { 4726 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
4142 xe = &xh->xh_entries[i]; 4727 xe = &xh->xh_entries[i];
4143 xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
4144 if (ocfs2_xattr_is_local(xe))
4145 xe_len +=
4146 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4147 else
4148 xe_len += OCFS2_XATTR_ROOT_SIZE;
4149 if (le16_to_cpu(xe->xe_name_offset) < 4728 if (le16_to_cpu(xe->xe_name_offset) <
4150 le16_to_cpu(xh->xh_free_start)) 4729 le16_to_cpu(xh->xh_free_start))
4151 xh->xh_free_start = xe->xe_name_offset; 4730 xh->xh_free_start = xe->xe_name_offset;
@@ -4757,195 +5336,6 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
4757} 5336}
4758 5337
4759/* 5338/*
4760 * Handle the normal xattr set, including replace, delete and new.
4761 *
4762 * Note: "local" indicates the real data's locality. So we can't
4763 * just infer its bucket locality from its length.
4764 */
4765static void ocfs2_xattr_set_entry_normal(struct inode *inode,
4766 struct ocfs2_xattr_info *xi,
4767 struct ocfs2_xattr_search *xs,
4768 u32 name_hash,
4769 int local)
4770{
4771 struct ocfs2_xattr_entry *last, *xe;
4772 int name_len = strlen(xi->name);
4773 struct ocfs2_xattr_header *xh = xs->header;
4774 u16 count = le16_to_cpu(xh->xh_count), start;
4775 size_t blocksize = inode->i_sb->s_blocksize;
4776 char *val;
4777 size_t offs, size, new_size;
4778
4779 last = &xh->xh_entries[count];
4780 if (!xs->not_found) {
4781 xe = xs->here;
4782 offs = le16_to_cpu(xe->xe_name_offset);
4783 if (ocfs2_xattr_is_local(xe))
4784 size = OCFS2_XATTR_SIZE(name_len) +
4785 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4786 else
4787 size = OCFS2_XATTR_SIZE(name_len) +
4788 OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
4789
4790 /*
4791 * If the new value will be stored outside, xi->value has been
4792 * initialized as an empty ocfs2_xattr_value_root, and the same
4793 * goes with xi->value_len, so we can set new_size safely here.
4794 * See ocfs2_xattr_set_in_bucket.
4795 */
4796 new_size = OCFS2_XATTR_SIZE(name_len) +
4797 OCFS2_XATTR_SIZE(xi->value_len);
4798
4799 le16_add_cpu(&xh->xh_name_value_len, -size);
4800 if (xi->value) {
4801 if (new_size > size)
4802 goto set_new_name_value;
4803
4804 /* Now replace the old value with new one. */
4805 if (local)
4806 xe->xe_value_size = cpu_to_le64(xi->value_len);
4807 else
4808 xe->xe_value_size = 0;
4809
4810 val = ocfs2_xattr_bucket_get_val(inode,
4811 xs->bucket, offs);
4812 memset(val + OCFS2_XATTR_SIZE(name_len), 0,
4813 size - OCFS2_XATTR_SIZE(name_len));
4814 if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
4815 memcpy(val + OCFS2_XATTR_SIZE(name_len),
4816 xi->value, xi->value_len);
4817
4818 le16_add_cpu(&xh->xh_name_value_len, new_size);
4819 ocfs2_xattr_set_local(xe, local);
4820 return;
4821 } else {
4822 /*
4823 * Remove the old entry if there is more than one.
4824 * We don't remove the last entry so that we can
4825 * use it to indicate the hash value of the empty
4826 * bucket.
4827 */
4828 last -= 1;
4829 le16_add_cpu(&xh->xh_count, -1);
4830 if (xh->xh_count) {
4831 memmove(xe, xe + 1,
4832 (void *)last - (void *)xe);
4833 memset(last, 0,
4834 sizeof(struct ocfs2_xattr_entry));
4835 } else
4836 xh->xh_free_start =
4837 cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
4838
4839 return;
4840 }
4841 } else {
4842 /* find a new entry for insert. */
4843 int low = 0, high = count - 1, tmp;
4844 struct ocfs2_xattr_entry *tmp_xe;
4845
4846 while (low <= high && count) {
4847 tmp = (low + high) / 2;
4848 tmp_xe = &xh->xh_entries[tmp];
4849
4850 if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
4851 low = tmp + 1;
4852 else if (name_hash <
4853 le32_to_cpu(tmp_xe->xe_name_hash))
4854 high = tmp - 1;
4855 else {
4856 low = tmp;
4857 break;
4858 }
4859 }
4860
4861 xe = &xh->xh_entries[low];
4862 if (low != count)
4863 memmove(xe + 1, xe, (void *)last - (void *)xe);
4864
4865 le16_add_cpu(&xh->xh_count, 1);
4866 memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
4867 xe->xe_name_hash = cpu_to_le32(name_hash);
4868 xe->xe_name_len = name_len;
4869 ocfs2_xattr_set_type(xe, xi->name_index);
4870 }
4871
4872set_new_name_value:
4873 /* Insert the new name+value. */
4874 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len);
4875
4876 /*
4877 * We must make sure that the name/value pair
4878 * exists in the same block.
4879 */
4880 offs = le16_to_cpu(xh->xh_free_start);
4881 start = offs - size;
4882
4883 if (start >> inode->i_sb->s_blocksize_bits !=
4884 (offs - 1) >> inode->i_sb->s_blocksize_bits) {
4885 offs = offs - offs % blocksize;
4886 xh->xh_free_start = cpu_to_le16(offs);
4887 }
4888
4889 val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size);
4890 xe->xe_name_offset = cpu_to_le16(offs - size);
4891
4892 memset(val, 0, size);
4893 memcpy(val, xi->name, name_len);
4894 memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len);
4895
4896 xe->xe_value_size = cpu_to_le64(xi->value_len);
4897 ocfs2_xattr_set_local(xe, local);
4898 xs->here = xe;
4899 le16_add_cpu(&xh->xh_free_start, -size);
4900 le16_add_cpu(&xh->xh_name_value_len, size);
4901
4902 return;
4903}
4904
4905/*
4906 * Set the xattr entry in the specified bucket.
4907 * The bucket is indicated by xs->bucket and it should have enough
4908 * space for the xattr insertion.
4909 */
4910static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
4911 handle_t *handle,
4912 struct ocfs2_xattr_info *xi,
4913 struct ocfs2_xattr_search *xs,
4914 u32 name_hash,
4915 int local)
4916{
4917 int ret;
4918 u64 blkno;
4919
4920 mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
4921 (unsigned long)xi->value_len, xi->name_index,
4922 (unsigned long long)bucket_blkno(xs->bucket));
4923
4924 if (!xs->bucket->bu_bhs[1]) {
4925 blkno = bucket_blkno(xs->bucket);
4926 ocfs2_xattr_bucket_relse(xs->bucket);
4927 ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
4928 if (ret) {
4929 mlog_errno(ret);
4930 goto out;
4931 }
4932 }
4933
4934 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
4935 OCFS2_JOURNAL_ACCESS_WRITE);
4936 if (ret < 0) {
4937 mlog_errno(ret);
4938 goto out;
4939 }
4940
4941 ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
4942 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
4943
4944out:
4945 return ret;
4946}
4947
4948/*
4949 * Truncate the specified xe_off entry in xattr bucket. 5339 * Truncate the specified xe_off entry in xattr bucket.
4950 * bucket is indicated by header_bh and len is the new length. 5340 * bucket is indicated by header_bh and len is the new length.
4951 * Both the ocfs2_xattr_value_root and the entry will be updated here. 5341 * Both the ocfs2_xattr_value_root and the entry will be updated here.
@@ -5015,66 +5405,6 @@ out:
5015 return ret; 5405 return ret;
5016} 5406}
5017 5407
5018static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
5019 struct ocfs2_xattr_search *xs,
5020 int len,
5021 struct ocfs2_xattr_set_ctxt *ctxt)
5022{
5023 int ret, offset;
5024 struct ocfs2_xattr_entry *xe = xs->here;
5025 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
5026
5027 BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
5028
5029 offset = xe - xh->xh_entries;
5030 ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket,
5031 offset, len, ctxt);
5032 if (ret)
5033 mlog_errno(ret);
5034
5035 return ret;
5036}
5037
5038static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
5039 handle_t *handle,
5040 struct ocfs2_xattr_search *xs,
5041 char *val,
5042 int value_len)
5043{
5044 int ret, offset, block_off;
5045 struct ocfs2_xattr_value_root *xv;
5046 struct ocfs2_xattr_entry *xe = xs->here;
5047 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
5048 void *base;
5049 struct ocfs2_xattr_value_buf vb = {
5050 .vb_access = ocfs2_journal_access,
5051 };
5052
5053 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
5054
5055 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh,
5056 xe - xh->xh_entries,
5057 &block_off,
5058 &offset);
5059 if (ret) {
5060 mlog_errno(ret);
5061 goto out;
5062 }
5063
5064 base = bucket_block(xs->bucket, block_off);
5065 xv = (struct ocfs2_xattr_value_root *)(base + offset +
5066 OCFS2_XATTR_SIZE(xe->xe_name_len));
5067
5068 vb.vb_xv = xv;
5069 vb.vb_bh = xs->bucket->bu_bhs[block_off];
5070 ret = __ocfs2_xattr_set_value_outside(inode, handle,
5071 &vb, val, value_len);
5072 if (ret)
5073 mlog_errno(ret);
5074out:
5075 return ret;
5076}
5077
5078static int ocfs2_rm_xattr_cluster(struct inode *inode, 5408static int ocfs2_rm_xattr_cluster(struct inode *inode,
5079 struct buffer_head *root_bh, 5409 struct buffer_head *root_bh,
5080 u64 blkno, 5410 u64 blkno,
@@ -5173,128 +5503,6 @@ out:
5173 return ret; 5503 return ret;
5174} 5504}
5175 5505
5176static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
5177 handle_t *handle,
5178 struct ocfs2_xattr_search *xs)
5179{
5180 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
5181 struct ocfs2_xattr_entry *last = &xh->xh_entries[
5182 le16_to_cpu(xh->xh_count) - 1];
5183 int ret = 0;
5184
5185 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
5186 OCFS2_JOURNAL_ACCESS_WRITE);
5187 if (ret) {
5188 mlog_errno(ret);
5189 return;
5190 }
5191
5192 /* Remove the old entry. */
5193 memmove(xs->here, xs->here + 1,
5194 (void *)last - (void *)xs->here);
5195 memset(last, 0, sizeof(struct ocfs2_xattr_entry));
5196 le16_add_cpu(&xh->xh_count, -1);
5197
5198 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
5199}
5200
5201/*
5202 * Set the xattr name/value in the bucket specified in xs.
5203 *
5204 * As the new value in xi may be stored in the bucket or in an outside cluster,
5205 * we divide the whole process into 4 steps:
5206 * 1. insert name/value in the bucket(ocfs2_xattr_set_entry_in_bucket)
5207 * 2. truncate the outside cluster(ocfs2_xattr_bucket_value_truncate_xs)
5208 * 3. Set the value to the outside cluster(ocfs2_xattr_bucket_set_value_outside)
5209 * 4. If the clusters for the new outside value can't be allocated, we need
5210 * to free the xattr we allocated in set.
5211 */
5212static int ocfs2_xattr_set_in_bucket(struct inode *inode,
5213 struct ocfs2_xattr_info *xi,
5214 struct ocfs2_xattr_search *xs,
5215 struct ocfs2_xattr_set_ctxt *ctxt)
5216{
5217 int ret, local = 1;
5218 size_t value_len;
5219 char *val = (char *)xi->value;
5220 struct ocfs2_xattr_entry *xe = xs->here;
5221 u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name,
5222 strlen(xi->name));
5223
5224 if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
5225 /*
5226 * We need to truncate the xattr storage first.
5227 *
5228 * If both the old and new values are stored in an
5229 * outside block, we only need to truncate
5230 * the storage and then set the value outside.
5231 *
5232 * If the new value should be stored within the block,
5233 * we should free all the outside blocks first; the
5234 * modification to the xattr block will be done
5235 * by the following steps.
5236 */
5237 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
5238 value_len = xi->value_len;
5239 else
5240 value_len = 0;
5241
5242 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
5243 value_len,
5244 ctxt);
5245 if (ret)
5246 goto out;
5247
5248 if (value_len)
5249 goto set_value_outside;
5250 }
5251
5252 value_len = xi->value_len;
5253 /* So we have to handle the inside block change now. */
5254 if (value_len > OCFS2_XATTR_INLINE_SIZE) {
5255 /*
5256 * If the new value will be stored outside the block,
5257 * initialize a new empty value root and insert it first.
5258 */
5259 local = 0;
5260 xi->value = &def_xv;
5261 xi->value_len = OCFS2_XATTR_ROOT_SIZE;
5262 }
5263
5264 ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
5265 name_hash, local);
5266 if (ret) {
5267 mlog_errno(ret);
5268 goto out;
5269 }
5270
5271 if (value_len <= OCFS2_XATTR_INLINE_SIZE)
5272 goto out;
5273
5274 /* allocate the space now for the outside block storage. */
5275 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
5276 value_len, ctxt);
5277 if (ret) {
5278 mlog_errno(ret);
5279
5280 if (xs->not_found) {
5281 /*
5282 * We can't allocate enough clusters for outside
5283 * storage and we have already allocated the xattr,
5284 * so we need to remove it.
5285 */
5286 ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs);
5287 }
5288 goto out;
5289 }
5290
5291set_value_outside:
5292 ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle,
5293 xs, val, value_len);
5294out:
5295 return ret;
5296}
5297
5298/* 5506/*
5299 * check whether the xattr bucket is filled up with the same hash value. 5507 * check whether the xattr bucket is filled up with the same hash value.
5300 * If we want to insert the xattr with the same hash, return -ENOSPC. 5508 * If we want to insert the xattr with the same hash, return -ENOSPC.
@@ -5323,156 +5531,116 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
5323 return 0; 5531 return 0;
5324} 5532}
5325 5533
5326static int ocfs2_xattr_set_entry_index_block(struct inode *inode, 5534/*
5327 struct ocfs2_xattr_info *xi, 5535 * Try to set the entry in the current bucket. If we fail, the caller
5328 struct ocfs2_xattr_search *xs, 5536 * will handle getting us another bucket.
5329 struct ocfs2_xattr_set_ctxt *ctxt) 5537 */
5538static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
5539 struct ocfs2_xattr_info *xi,
5540 struct ocfs2_xattr_search *xs,
5541 struct ocfs2_xattr_set_ctxt *ctxt)
5330{ 5542{
5331 struct ocfs2_xattr_header *xh; 5543 int ret;
5332 struct ocfs2_xattr_entry *xe; 5544 struct ocfs2_xa_loc loc;
5333 u16 count, header_size, xh_free_start;
5334 int free, max_free, need, old;
5335 size_t value_size = 0, name_len = strlen(xi->name);
5336 size_t blocksize = inode->i_sb->s_blocksize;
5337 int ret, allocation = 0;
5338
5339 mlog_entry("Set xattr %s in xattr index block\n", xi->name);
5340
5341try_again:
5342 xh = xs->header;
5343 count = le16_to_cpu(xh->xh_count);
5344 xh_free_start = le16_to_cpu(xh->xh_free_start);
5345 header_size = sizeof(struct ocfs2_xattr_header) +
5346 count * sizeof(struct ocfs2_xattr_entry);
5347 max_free = OCFS2_XATTR_BUCKET_SIZE - header_size -
5348 le16_to_cpu(xh->xh_name_value_len) - OCFS2_XATTR_HEADER_GAP;
5349
5350 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
5351 "of %u which exceed block size\n",
5352 (unsigned long long)bucket_blkno(xs->bucket),
5353 header_size);
5354 5545
5355 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) 5546 mlog_entry("Set xattr %s in xattr bucket\n", xi->xi_name);
5356 value_size = OCFS2_XATTR_ROOT_SIZE;
5357 else if (xi->value)
5358 value_size = OCFS2_XATTR_SIZE(xi->value_len);
5359 5547
5360 if (xs->not_found) 5548 ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
5361 need = sizeof(struct ocfs2_xattr_entry) + 5549 xs->not_found ? NULL : xs->here);
5362 OCFS2_XATTR_SIZE(name_len) + value_size; 5550 ret = ocfs2_xa_set(&loc, xi, ctxt);
5363 else { 5551 if (!ret) {
5364 need = value_size + OCFS2_XATTR_SIZE(name_len); 5552 xs->here = loc.xl_entry;
5553 goto out;
5554 }
5555 if (ret != -ENOSPC) {
5556 mlog_errno(ret);
5557 goto out;
5558 }
5365 5559
5366 /* 5560 /* Ok, we need space. Let's try defragmenting the bucket. */
5367 * We only replace the old value if the new length is smaller 5561 ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
5368 * than the old one. Otherwise we will allocate new space in the 5562 xs->bucket);
5369 * bucket to store it. 5563 if (ret) {
5370 */ 5564 mlog_errno(ret);
5371 xe = xs->here; 5565 goto out;
5372 if (ocfs2_xattr_is_local(xe)) 5566 }
5373 old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
5374 else
5375 old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
5376 5567
5377 if (old >= value_size) 5568 ret = ocfs2_xa_set(&loc, xi, ctxt);
5378 need = 0; 5569 if (!ret) {
5570 xs->here = loc.xl_entry;
5571 goto out;
5379 } 5572 }
5573 if (ret != -ENOSPC)
5574 mlog_errno(ret);
5380 5575
5381 free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP;
5382 /*
5383 * We need to make sure the new name/value pair
5384 * can exist in the same block.
5385 */
5386 if (xh_free_start % blocksize < need)
5387 free -= xh_free_start % blocksize;
5388
5389 mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
5390 "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
5391 " %u\n", xs->not_found,
5392 (unsigned long long)bucket_blkno(xs->bucket),
5393 free, need, max_free, le16_to_cpu(xh->xh_free_start),
5394 le16_to_cpu(xh->xh_name_value_len));
5395
5396 if (free < need ||
5397 (xs->not_found &&
5398 count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) {
5399 if (need <= max_free &&
5400 count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
5401 /*
5402 * We can create the space by defragmenting. Since only the
5403 * name/value will be moved, the xe shouldn't be changed
5404 * in xs.
5405 */
5406 ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
5407 xs->bucket);
5408 if (ret) {
5409 mlog_errno(ret);
5410 goto out;
5411 }
5412 5576
5413 xh_free_start = le16_to_cpu(xh->xh_free_start); 5577out:
5414 free = xh_free_start - header_size 5578 mlog_exit(ret);
5415 - OCFS2_XATTR_HEADER_GAP; 5579 return ret;
5416 if (xh_free_start % blocksize < need) 5580}
5417 free -= xh_free_start % blocksize;
5418 5581
5419 if (free >= need) 5582static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
5420 goto xattr_set; 5583 struct ocfs2_xattr_info *xi,
5584 struct ocfs2_xattr_search *xs,
5585 struct ocfs2_xattr_set_ctxt *ctxt)
5586{
5587 int ret;
5421 5588
5422 mlog(0, "Can't get enough space for xattr insert by " 5589 mlog_entry("Set xattr %s in xattr index block\n", xi->xi_name);
5423 "defragment. Need %u bytes, but we have %d, so "
5424 "allocate new bucket for it.\n", need, free);
5425 }
5426 5590
5427 /* 5591 ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
5428 * We have to add new buckets or clusters and one 5592 if (!ret)
5429 * allocation should leave us enough space for insert. 5593 goto out;
5430 */ 5594 if (ret != -ENOSPC) {
5431 BUG_ON(allocation); 5595 mlog_errno(ret);
5596 goto out;
5597 }
5432 5598
5433 /* 5599 /* Ack, need more space. Let's try to get another bucket! */
5434 * We do not allow for overlapping ranges between buckets. And
5435 * the maximum number of collisions we will allow for then is
5436 * one bucket's worth, so check here whether we need to
5437 * add a new bucket for the insert.
5438 */
5439 ret = ocfs2_check_xattr_bucket_collision(inode,
5440 xs->bucket,
5441 xi->name);
5442 if (ret) {
5443 mlog_errno(ret);
5444 goto out;
5445 }
5446 5600
5447 ret = ocfs2_add_new_xattr_bucket(inode, 5601 /*
5448 xs->xattr_bh, 5602 * We do not allow for overlapping ranges between buckets. And
5603 * the maximum number of collisions we will allow for then is
5604 * one bucket's worth, so check here whether we need to
5605 * add a new bucket for the insert.
5606 */
5607 ret = ocfs2_check_xattr_bucket_collision(inode,
5449 xs->bucket, 5608 xs->bucket,
5450 ctxt); 5609 xi->xi_name);
5451 if (ret) { 5610 if (ret) {
5452 mlog_errno(ret); 5611 mlog_errno(ret);
5453 goto out; 5612 goto out;
5454 } 5613 }
5455 5614
5456 /* 5615 ret = ocfs2_add_new_xattr_bucket(inode,
5457 * ocfs2_add_new_xattr_bucket() will have updated 5616 xs->xattr_bh,
5458 * xs->bucket if it moved, but it will not have updated 5617 xs->bucket,
5459 * any of the other search fields. Thus, we drop it and 5618 ctxt);
5460 * re-search. Everything should be cached, so it'll be 5619 if (ret) {
5461 * quick. 5620 mlog_errno(ret);
5462 */ 5621 goto out;
5463 ocfs2_xattr_bucket_relse(xs->bucket);
5464 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
5465 xi->name_index,
5466 xi->name, xs);
5467 if (ret && ret != -ENODATA)
5468 goto out;
5469 xs->not_found = ret;
5470 allocation = 1;
5471 goto try_again;
5472 } 5622 }
5473 5623
5474xattr_set: 5624 /*
5475 ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt); 5625 * ocfs2_add_new_xattr_bucket() will have updated
5626 * xs->bucket if it moved, but it will not have updated
5627 * any of the other search fields. Thus, we drop it and
5628 * re-search. Everything should be cached, so it'll be
5629 * quick.
5630 */
5631 ocfs2_xattr_bucket_relse(xs->bucket);
5632 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
5633 xi->xi_name_index,
5634 xi->xi_name, xs);
5635 if (ret && ret != -ENODATA)
5636 goto out;
5637 xs->not_found = ret;
5638
5639 /* Ok, we have a new bucket, let's try again */
5640 ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
5641 if (ret && (ret != -ENOSPC))
5642 mlog_errno(ret);
5643
5476out: 5644out:
5477 mlog_exit(ret); 5645 mlog_exit(ret);
5478 return ret; 5646 return ret;
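The rewritten bucket path funnels every add, replace, and delete through ocfs2_xa_set() against an ocfs2_xa_loc location descriptor, retrying once after a defragment on -ENOSPC. Only the xl_entry field is visible in this hunk; the sketch below is a guess at the shape such a descriptor implies, not the actual definition:

struct ocfs2_xa_loc {
	struct ocfs2_xattr_header *xl_header;	/* containing header (assumed) */
	struct ocfs2_xattr_entry *xl_entry;	/* existing entry, or NULL on
						 * insert -- see the xs->here
						 * handling above */
	void *xl_storage;			/* backing bucket/block (assumed) */
	const struct ocfs2_xa_loc_operations *xl_ops;	/* storage-specific
							 * ops (assumed) */
};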
@@ -5684,7 +5852,7 @@ static int ocfs2_prepare_refcount_xattr(struct inode *inode,
5684 * refcount tree, and make the original extent become 3. So we will need 5852 * refcount tree, and make the original extent become 3. So we will need
5685 * 2 * cluster more extent recs at most. 5853 * 2 * cluster more extent recs at most.
5686 */ 5854 */
5687 if (!xi->value || xi->value_len <= OCFS2_XATTR_INLINE_SIZE) { 5855 if (!xi->xi_value || xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) {
5688 5856
5689 ret = ocfs2_refcounted_xattr_delete_need(inode, 5857 ret = ocfs2_refcounted_xattr_delete_need(inode,
5690 &(*ref_tree)->rf_ci, 5858 &(*ref_tree)->rf_ci,
@@ -6066,7 +6234,7 @@ static int ocfs2_value_metas_in_xattr_header(struct super_block *sb,
6066 * to the extent block, so just calculate a maximum record num. 6234 * to the extent block, so just calculate a maximum record num.
6067 */ 6235 */
6068 if (!xv->xr_list.l_tree_depth) 6236 if (!xv->xr_list.l_tree_depth)
6069 *num_recs += xv->xr_list.l_next_free_rec; 6237 *num_recs += le16_to_cpu(xv->xr_list.l_next_free_rec);
6070 else 6238 else
6071 *num_recs += ocfs2_clusters_for_bytes(sb, 6239 *num_recs += ocfs2_clusters_for_bytes(sb,
6072 XATTR_SIZE_MAX); 6240 XATTR_SIZE_MAX);
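The one-line fix above matters on big-endian hosts: l_next_free_rec is a little-endian on-disk field, and using it raw silently byte-swaps the record count. A small illustration (snippet only):

__le16 on_disk = cpu_to_le16(3);	/* stored as bytes 03 00 */
u16 recs = le16_to_cpu(on_disk);	/* 3 on any host */
u16 raw  = (__force u16)on_disk;	/* 3 on little-endian, 0x0300 = 768
					 * on big-endian */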
@@ -6360,33 +6528,33 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6360 int indexed) 6528 int indexed)
6361{ 6529{
6362 int ret; 6530 int ret;
6363 handle_t *handle;
6364 struct ocfs2_alloc_context *meta_ac;
6365 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 6531 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6532 struct ocfs2_xattr_set_ctxt ctxt;
6366 6533
6367 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); 6534 memset(&ctxt, 0, sizeof(ctxt));
6535 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &ctxt.meta_ac);
6368 if (ret < 0) { 6536 if (ret < 0) {
6369 mlog_errno(ret); 6537 mlog_errno(ret);
6370 return ret; 6538 return ret;
6371 } 6539 }
6372 6540
6373 handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS); 6541 ctxt.handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
6374 if (IS_ERR(handle)) { 6542 if (IS_ERR(ctxt.handle)) {
6375 ret = PTR_ERR(handle); 6543 ret = PTR_ERR(ctxt.handle);
6376 mlog_errno(ret); 6544 mlog_errno(ret);
6377 goto out; 6545 goto out;
6378 } 6546 }
6379 6547
6380 mlog(0, "create new xattr block for inode %llu, index = %d\n", 6548 mlog(0, "create new xattr block for inode %llu, index = %d\n",
6381 (unsigned long long)fe_bh->b_blocknr, indexed); 6549 (unsigned long long)fe_bh->b_blocknr, indexed);
6382 ret = ocfs2_create_xattr_block(handle, inode, fe_bh, 6550 ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed,
6383 meta_ac, ret_bh, indexed); 6551 ret_bh);
6384 if (ret) 6552 if (ret)
6385 mlog_errno(ret); 6553 mlog_errno(ret);
6386 6554
6387 ocfs2_commit_trans(osb, handle); 6555 ocfs2_commit_trans(osb, ctxt.handle);
6388out: 6556out:
6389 ocfs2_free_alloc_context(meta_ac); 6557 ocfs2_free_alloc_context(ctxt.meta_ac);
6390 return ret; 6558 return ret;
6391} 6559}
6392 6560
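This hunk repackages the transaction handle and the metadata reservation into an ocfs2_xattr_set_ctxt so ocfs2_create_xattr_block() takes one context argument. Judging from the two fields touched here, the context is presumably along these lines; the last two members are assumptions:

struct ocfs2_xattr_set_ctxt {
	handle_t *handle;			/* journal transaction */
	struct ocfs2_alloc_context *meta_ac;	/* metadata block reservation */
	struct ocfs2_alloc_context *data_ac;	/* data reservation (assumed) */
	struct ocfs2_cached_dealloc_ctxt dealloc; /* deferred frees (assumed) */
};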
@@ -6978,9 +7146,9 @@ int ocfs2_init_security_and_acl(struct inode *dir,
6978 7146
6979 ret = ocfs2_init_security_get(inode, dir, &si); 7147 ret = ocfs2_init_security_get(inode, dir, &si);
6980 if (!ret) { 7148 if (!ret) {
6981 ret = ocfs2_xattr_security_set(inode, si.name, 7149 ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
6982 si.value, si.value_len, 7150 si.name, si.value, si.value_len,
6983 XATTR_CREATE); 7151 XATTR_CREATE);
6984 if (ret) { 7152 if (ret) {
6985 mlog_errno(ret); 7153 mlog_errno(ret);
6986 goto leave; 7154 goto leave;
@@ -7008,9 +7176,9 @@ leave:
7008/* 7176/*
7009 * 'security' attributes support 7177 * 'security' attributes support
7010 */ 7178 */
7011static size_t ocfs2_xattr_security_list(struct inode *inode, char *list, 7179static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list,
7012 size_t list_size, const char *name, 7180 size_t list_size, const char *name,
7013 size_t name_len) 7181 size_t name_len, int type)
7014{ 7182{
7015 const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; 7183 const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
7016 const size_t total_len = prefix_len + name_len + 1; 7184 const size_t total_len = prefix_len + name_len + 1;
@@ -7023,23 +7191,23 @@ static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
7023 return total_len; 7191 return total_len;
7024} 7192}
7025 7193
7026static int ocfs2_xattr_security_get(struct inode *inode, const char *name, 7194static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name,
7027 void *buffer, size_t size) 7195 void *buffer, size_t size, int type)
7028{ 7196{
7029 if (strcmp(name, "") == 0) 7197 if (strcmp(name, "") == 0)
7030 return -EINVAL; 7198 return -EINVAL;
7031 return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name, 7199 return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY,
7032 buffer, size); 7200 name, buffer, size);
7033} 7201}
7034 7202
7035static int ocfs2_xattr_security_set(struct inode *inode, const char *name, 7203static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
7036 const void *value, size_t size, int flags) 7204 const void *value, size_t size, int flags, int type)
7037{ 7205{
7038 if (strcmp(name, "") == 0) 7206 if (strcmp(name, "") == 0)
7039 return -EINVAL; 7207 return -EINVAL;
7040 7208
7041 return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value, 7209 return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY,
7042 size, flags); 7210 name, value, size, flags);
7043} 7211}
7044 7212
7045int ocfs2_init_security_get(struct inode *inode, 7213int ocfs2_init_security_get(struct inode *inode,
@@ -7076,9 +7244,9 @@ struct xattr_handler ocfs2_xattr_security_handler = {
7076/* 7244/*
7077 * 'trusted' attributes support 7245 * 'trusted' attributes support
7078 */ 7246 */
7079static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, 7247static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
7080 size_t list_size, const char *name, 7248 size_t list_size, const char *name,
7081 size_t name_len) 7249 size_t name_len, int type)
7082{ 7250{
7083 const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; 7251 const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
7084 const size_t total_len = prefix_len + name_len + 1; 7252 const size_t total_len = prefix_len + name_len + 1;
@@ -7091,23 +7259,23 @@ static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
7091 return total_len; 7259 return total_len;
7092} 7260}
7093 7261
7094static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name, 7262static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name,
7095 void *buffer, size_t size) 7263 void *buffer, size_t size, int type)
7096{ 7264{
7097 if (strcmp(name, "") == 0) 7265 if (strcmp(name, "") == 0)
7098 return -EINVAL; 7266 return -EINVAL;
7099 return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name, 7267 return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED,
7100 buffer, size); 7268 name, buffer, size);
7101} 7269}
7102 7270
7103static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name, 7271static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
7104 const void *value, size_t size, int flags) 7272 const void *value, size_t size, int flags, int type)
7105{ 7273{
7106 if (strcmp(name, "") == 0) 7274 if (strcmp(name, "") == 0)
7107 return -EINVAL; 7275 return -EINVAL;
7108 7276
7109 return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value, 7277 return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED,
7110 size, flags); 7278 name, value, size, flags);
7111} 7279}
7112 7280
7113struct xattr_handler ocfs2_xattr_trusted_handler = { 7281struct xattr_handler ocfs2_xattr_trusted_handler = {
@@ -7120,13 +7288,13 @@ struct xattr_handler ocfs2_xattr_trusted_handler = {
7120/* 7288/*
7121 * 'user' attributes support 7289 * 'user' attributes support
7122 */ 7290 */
7123static size_t ocfs2_xattr_user_list(struct inode *inode, char *list, 7291static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list,
7124 size_t list_size, const char *name, 7292 size_t list_size, const char *name,
7125 size_t name_len) 7293 size_t name_len, int type)
7126{ 7294{
7127 const size_t prefix_len = XATTR_USER_PREFIX_LEN; 7295 const size_t prefix_len = XATTR_USER_PREFIX_LEN;
7128 const size_t total_len = prefix_len + name_len + 1; 7296 const size_t total_len = prefix_len + name_len + 1;
7129 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 7297 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
7130 7298
7131 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) 7299 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
7132 return 0; 7300 return 0;
@@ -7139,31 +7307,31 @@ static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
7139 return total_len; 7307 return total_len;
7140} 7308}
7141 7309
7142static int ocfs2_xattr_user_get(struct inode *inode, const char *name, 7310static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name,
7143 void *buffer, size_t size) 7311 void *buffer, size_t size, int type)
7144{ 7312{
7145 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 7313 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
7146 7314
7147 if (strcmp(name, "") == 0) 7315 if (strcmp(name, "") == 0)
7148 return -EINVAL; 7316 return -EINVAL;
7149 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) 7317 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
7150 return -EOPNOTSUPP; 7318 return -EOPNOTSUPP;
7151 return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name, 7319 return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_USER, name,
7152 buffer, size); 7320 buffer, size);
7153} 7321}
7154 7322
7155static int ocfs2_xattr_user_set(struct inode *inode, const char *name, 7323static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
7156 const void *value, size_t size, int flags) 7324 const void *value, size_t size, int flags, int type)
7157{ 7325{
7158 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 7326 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
7159 7327
7160 if (strcmp(name, "") == 0) 7328 if (strcmp(name, "") == 0)
7161 return -EINVAL; 7329 return -EINVAL;
7162 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) 7330 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
7163 return -EOPNOTSUPP; 7331 return -EOPNOTSUPP;
7164 7332
7165 return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value, 7333 return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_USER,
7166 size, flags); 7334 name, value, size, flags);
7167} 7335}
7168 7336
7169struct xattr_handler ocfs2_xattr_user_handler = { 7337struct xattr_handler ocfs2_xattr_user_handler = {
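All three handler groups above now take a dentry plus a handler-private type argument instead of an inode. The struct body is cut off by the diff context; for orientation, the 'user' handler is presumably wired up along these lines (a sketch consistent with the callbacks shown):

struct xattr_handler ocfs2_xattr_user_handler = {
	.prefix	= XATTR_USER_PREFIX,
	.list	= ocfs2_xattr_user_list,
	.get	= ocfs2_xattr_user_get,
	.set	= ocfs2_xattr_user_set,
};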
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 08e36389f56d..abd72a47f520 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -40,10 +40,8 @@ struct ocfs2_security_xattr_info {
40extern struct xattr_handler ocfs2_xattr_user_handler; 40extern struct xattr_handler ocfs2_xattr_user_handler;
41extern struct xattr_handler ocfs2_xattr_trusted_handler; 41extern struct xattr_handler ocfs2_xattr_trusted_handler;
42extern struct xattr_handler ocfs2_xattr_security_handler; 42extern struct xattr_handler ocfs2_xattr_security_handler;
43#ifdef CONFIG_OCFS2_FS_POSIX_ACL
44extern struct xattr_handler ocfs2_xattr_acl_access_handler; 43extern struct xattr_handler ocfs2_xattr_acl_access_handler;
45extern struct xattr_handler ocfs2_xattr_acl_default_handler; 44extern struct xattr_handler ocfs2_xattr_acl_default_handler;
46#endif
47extern struct xattr_handler *ocfs2_xattr_handlers[]; 45extern struct xattr_handler *ocfs2_xattr_handlers[];
48 46
49ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); 47ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c
index e1c0ec0ae989..082234581d05 100644
--- a/fs/omfs/bitmap.c
+++ b/fs/omfs/bitmap.c
@@ -85,7 +85,7 @@ out:
85} 85}
86 86
87/* 87/*
88 * Tries to allocate exactly one block. Returns true if sucessful. 88 * Tries to allocate exactly one block. Returns true if successful.
89 */ 89 */
90int omfs_allocate_block(struct super_block *sb, u64 block) 90int omfs_allocate_block(struct super_block *sb, u64 block)
91{ 91{
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index f3b7c1541f3a..c82af6acc2e7 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -6,11 +6,13 @@
6#include <linux/version.h> 6#include <linux/version.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/slab.h>
9#include <linux/fs.h> 10#include <linux/fs.h>
10#include <linux/vfs.h> 11#include <linux/vfs.h>
11#include <linux/parser.h> 12#include <linux/parser.h>
12#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
13#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/writeback.h>
14#include <linux/crc-itu-t.h> 16#include <linux/crc-itu-t.h>
15#include "omfs.h" 17#include "omfs.h"
16 18
@@ -89,7 +91,7 @@ static void omfs_update_checksums(struct omfs_inode *oi)
89 oi->i_head.h_check_xor = xor; 91 oi->i_head.h_check_xor = xor;
90} 92}
91 93
92static int omfs_write_inode(struct inode *inode, int wait) 94static int __omfs_write_inode(struct inode *inode, int wait)
93{ 95{
94 struct omfs_inode *oi; 96 struct omfs_inode *oi;
95 struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); 97 struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
@@ -162,9 +164,14 @@ out:
162 return ret; 164 return ret;
163} 165}
164 166
167static int omfs_write_inode(struct inode *inode, struct writeback_control *wbc)
168{
169 return __omfs_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
170}
171
165int omfs_sync_inode(struct inode *inode) 172int omfs_sync_inode(struct inode *inode)
166{ 173{
167 return omfs_write_inode(inode, 1); 174 return __omfs_write_inode(inode, 1);
168} 175}
169 176
170/* 177/*
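The split keeps omfs_sync_inode() synchronous while letting writeback drive omfs_write_inode() with a writeback_control. Roughly, the two entry points map onto __omfs_write_inode() like this (illustrative snippet):

struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL };
omfs_write_inode(inode, &wbc);	/* wait == 1, same as omfs_sync_inode() */
wbc.sync_mode = WB_SYNC_NONE;
omfs_write_inode(inode, &wbc);	/* wait == 0: asynchronous writeout */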
diff --git a/fs/open.c b/fs/open.c
index 4f01e06227c6..74e5cd9f718e 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -8,10 +8,8 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/file.h> 9#include <linux/file.h>
10#include <linux/fdtable.h> 10#include <linux/fdtable.h>
11#include <linux/quotaops.h>
12#include <linux/fsnotify.h> 11#include <linux/fsnotify.h>
13#include <linux/module.h> 12#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/tty.h> 13#include <linux/tty.h>
16#include <linux/namei.h> 14#include <linux/namei.h>
17#include <linux/backing-dev.h> 15#include <linux/backing-dev.h>
@@ -21,6 +19,7 @@
21#include <linux/mount.h> 19#include <linux/mount.h>
22#include <linux/vfs.h> 20#include <linux/vfs.h>
23#include <linux/fcntl.h> 21#include <linux/fcntl.h>
22#include <linux/slab.h>
24#include <asm/uaccess.h> 23#include <asm/uaccess.h>
25#include <linux/fs.h> 24#include <linux/fs.h>
26#include <linux/personality.h> 25#include <linux/personality.h>
@@ -30,6 +29,9 @@
30#include <linux/audit.h> 29#include <linux/audit.h>
31#include <linux/falloc.h> 30#include <linux/falloc.h>
32#include <linux/fs_struct.h> 31#include <linux/fs_struct.h>
32#include <linux/ima.h>
33
34#include "internal.h"
33 35
34int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) 36int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
35{ 37{
@@ -268,17 +270,15 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
268 * Make sure that there are no leases. get_write_access() protects 270 * Make sure that there are no leases. get_write_access() protects
269 * against the truncate racing with a lease-granting setlease(). 271 * against the truncate racing with a lease-granting setlease().
270 */ 272 */
271 error = break_lease(inode, FMODE_WRITE); 273 error = break_lease(inode, O_WRONLY);
272 if (error) 274 if (error)
273 goto put_write_and_out; 275 goto put_write_and_out;
274 276
275 error = locks_verify_truncate(inode, NULL, length); 277 error = locks_verify_truncate(inode, NULL, length);
276 if (!error) 278 if (!error)
277 error = security_path_truncate(&path, length, 0); 279 error = security_path_truncate(&path, length, 0);
278 if (!error) { 280 if (!error)
279 vfs_dq_init(inode);
280 error = do_truncate(path.dentry, length, 0, NULL); 281 error = do_truncate(path.dentry, length, 0, NULL);
281 }
282 282
283put_write_and_out: 283put_write_and_out:
284 put_write_access(inode); 284 put_write_access(inode);
@@ -587,6 +587,9 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
587 error = -EPERM; 587 error = -EPERM;
588 if (!capable(CAP_SYS_CHROOT)) 588 if (!capable(CAP_SYS_CHROOT))
589 goto dput_and_out; 589 goto dput_and_out;
590 error = security_path_chroot(&path);
591 if (error)
592 goto dput_and_out;
590 593
591 set_fs_root(current->fs, &path); 594 set_fs_root(current->fs, &path);
592 error = 0; 595 error = 0;
@@ -617,11 +620,15 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
617 if (err) 620 if (err)
618 goto out_putf; 621 goto out_putf;
619 mutex_lock(&inode->i_mutex); 622 mutex_lock(&inode->i_mutex);
623 err = security_path_chmod(dentry, file->f_vfsmnt, mode);
624 if (err)
625 goto out_unlock;
620 if (mode == (mode_t) -1) 626 if (mode == (mode_t) -1)
621 mode = inode->i_mode; 627 mode = inode->i_mode;
622 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); 628 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
623 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; 629 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
624 err = notify_change(dentry, &newattrs); 630 err = notify_change(dentry, &newattrs);
631out_unlock:
625 mutex_unlock(&inode->i_mutex); 632 mutex_unlock(&inode->i_mutex);
626 mnt_drop_write(file->f_path.mnt); 633 mnt_drop_write(file->f_path.mnt);
627out_putf: 634out_putf:
@@ -646,11 +653,15 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
646 if (error) 653 if (error)
647 goto dput_and_out; 654 goto dput_and_out;
648 mutex_lock(&inode->i_mutex); 655 mutex_lock(&inode->i_mutex);
656 error = security_path_chmod(path.dentry, path.mnt, mode);
657 if (error)
658 goto out_unlock;
649 if (mode == (mode_t) -1) 659 if (mode == (mode_t) -1)
650 mode = inode->i_mode; 660 mode = inode->i_mode;
651 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); 661 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
652 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; 662 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
653 error = notify_change(path.dentry, &newattrs); 663 error = notify_change(path.dentry, &newattrs);
664out_unlock:
654 mutex_unlock(&inode->i_mutex); 665 mutex_unlock(&inode->i_mutex);
655 mnt_drop_write(path.mnt); 666 mnt_drop_write(path.mnt);
656dput_and_out: 667dput_and_out:
@@ -664,9 +675,9 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode)
664 return sys_fchmodat(AT_FDCWD, filename, mode); 675 return sys_fchmodat(AT_FDCWD, filename, mode);
665} 676}
666 677
667static int chown_common(struct dentry * dentry, uid_t user, gid_t group) 678static int chown_common(struct path *path, uid_t user, gid_t group)
668{ 679{
669 struct inode *inode = dentry->d_inode; 680 struct inode *inode = path->dentry->d_inode;
670 int error; 681 int error;
671 struct iattr newattrs; 682 struct iattr newattrs;
672 683
@@ -683,7 +694,9 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
683 newattrs.ia_valid |= 694 newattrs.ia_valid |=
684 ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; 695 ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
685 mutex_lock(&inode->i_mutex); 696 mutex_lock(&inode->i_mutex);
686 error = notify_change(dentry, &newattrs); 697 error = security_path_chown(path, user, group);
698 if (!error)
699 error = notify_change(path->dentry, &newattrs);
687 mutex_unlock(&inode->i_mutex); 700 mutex_unlock(&inode->i_mutex);
688 701
689 return error; 702 return error;
@@ -700,7 +713,7 @@ SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
700 error = mnt_want_write(path.mnt); 713 error = mnt_want_write(path.mnt);
701 if (error) 714 if (error)
702 goto out_release; 715 goto out_release;
703 error = chown_common(path.dentry, user, group); 716 error = chown_common(&path, user, group);
704 mnt_drop_write(path.mnt); 717 mnt_drop_write(path.mnt);
705out_release: 718out_release:
706 path_put(&path); 719 path_put(&path);
@@ -725,7 +738,7 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
725 error = mnt_want_write(path.mnt); 738 error = mnt_want_write(path.mnt);
726 if (error) 739 if (error)
727 goto out_release; 740 goto out_release;
728 error = chown_common(path.dentry, user, group); 741 error = chown_common(&path, user, group);
729 mnt_drop_write(path.mnt); 742 mnt_drop_write(path.mnt);
730out_release: 743out_release:
731 path_put(&path); 744 path_put(&path);
@@ -744,7 +757,7 @@ SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group
744 error = mnt_want_write(path.mnt); 757 error = mnt_want_write(path.mnt);
745 if (error) 758 if (error)
746 goto out_release; 759 goto out_release;
747 error = chown_common(path.dentry, user, group); 760 error = chown_common(&path, user, group);
748 mnt_drop_write(path.mnt); 761 mnt_drop_write(path.mnt);
749out_release: 762out_release:
750 path_put(&path); 763 path_put(&path);
@@ -767,7 +780,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
767 goto out_fput; 780 goto out_fput;
768 dentry = file->f_path.dentry; 781 dentry = file->f_path.dentry;
769 audit_inode(NULL, dentry); 782 audit_inode(NULL, dentry);
770 error = chown_common(dentry, user, group); 783 error = chown_common(&file->f_path, user, group);
771 mnt_drop_write(file->f_path.mnt); 784 mnt_drop_write(file->f_path.mnt);
772out_fput: 785out_fput:
773 fput(file); 786 fput(file);
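chown_common() switches from a bare dentry to a struct path because the new security_path_chown() hook needs the vfsmount as well as the dentry, and struct path carries both:

struct path {
	struct vfsmount *mnt;	/* the mount the object is reached through */
	struct dentry *dentry;	/* the object itself */
};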
@@ -805,15 +818,14 @@ static inline int __get_file_write_access(struct inode *inode,
805} 818}
806 819
807static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, 820static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
808 int flags, struct file *f, 821 struct file *f,
809 int (*open)(struct inode *, struct file *), 822 int (*open)(struct inode *, struct file *),
810 const struct cred *cred) 823 const struct cred *cred)
811{ 824{
812 struct inode *inode; 825 struct inode *inode;
813 int error; 826 int error;
814 827
815 f->f_flags = flags; 828 f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
816 f->f_mode = (__force fmode_t)((flags+1) & O_ACCMODE) | FMODE_LSEEK |
817 FMODE_PREAD | FMODE_PWRITE; 829 FMODE_PREAD | FMODE_PWRITE;
818 inode = dentry->d_inode; 830 inode = dentry->d_inode;
819 if (f->f_mode & FMODE_WRITE) { 831 if (f->f_mode & FMODE_WRITE) {
@@ -842,6 +854,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
842 if (error) 854 if (error)
843 goto cleanup_all; 855 goto cleanup_all;
844 } 856 }
857 ima_counts_get(f);
845 858
846 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); 859 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
847 860
@@ -913,7 +926,6 @@ struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry
913 if (IS_ERR(dentry)) 926 if (IS_ERR(dentry))
914 goto out_err; 927 goto out_err;
915 nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt), 928 nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt),
916 nd->intent.open.flags - 1,
917 nd->intent.open.file, 929 nd->intent.open.file,
918 open, cred); 930 open, cred);
919out: 931out:
@@ -932,7 +944,7 @@ EXPORT_SYMBOL_GPL(lookup_instantiate_filp);
932 * 944 *
933 * Note that this function destroys the original nameidata 945 * Note that this function destroys the original nameidata
934 */ 946 */
935struct file *nameidata_to_filp(struct nameidata *nd, int flags) 947struct file *nameidata_to_filp(struct nameidata *nd)
936{ 948{
937 const struct cred *cred = current_cred(); 949 const struct cred *cred = current_cred();
938 struct file *filp; 950 struct file *filp;
@@ -941,7 +953,7 @@ struct file *nameidata_to_filp(struct nameidata *nd, int flags)
941 filp = nd->intent.open.file; 953 filp = nd->intent.open.file;
942 /* Has the filesystem initialised the file for us? */ 954 /* Has the filesystem initialised the file for us? */
943 if (filp->f_path.dentry == NULL) 955 if (filp->f_path.dentry == NULL)
944 filp = __dentry_open(nd->path.dentry, nd->path.mnt, flags, filp, 956 filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp,
945 NULL, cred); 957 NULL, cred);
946 else 958 else
947 path_put(&nd->path); 959 path_put(&nd->path);
@@ -980,7 +992,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
980 return ERR_PTR(error); 992 return ERR_PTR(error);
981 } 993 }
982 994
983 return __dentry_open(dentry, mnt, flags, f, NULL, cred); 995 f->f_flags = flags;
996 return __dentry_open(dentry, mnt, f, NULL, cred);
984} 997}
985EXPORT_SYMBOL(dentry_open); 998EXPORT_SYMBOL(dentry_open);
986 999
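The (flags + 1) & O_ACCMODE expression deleted from __dentry_open() survives as the OPEN_FMODE() helper used above. Assuming it matches the code it replaces, its definition is roughly:

/* O_RDONLY = 0, O_WRONLY = 1, O_RDWR = 2; adding 1 turns these into
 * the FMODE_READ (1) / FMODE_WRITE (2) bit pattern. */
#define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE))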
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7b685e10cbad..e238ab23a9e7 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -16,6 +16,7 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/slab.h>
19#include <linux/kmod.h> 20#include <linux/kmod.h>
20#include <linux/ctype.h> 21#include <linux/ctype.h>
21#include <linux/genhd.h> 22#include <linux/genhd.h>
@@ -226,6 +227,13 @@ ssize_t part_alignment_offset_show(struct device *dev,
226 return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); 227 return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
227} 228}
228 229
230ssize_t part_discard_alignment_show(struct device *dev,
231 struct device_attribute *attr, char *buf)
232{
233 struct hd_struct *p = dev_to_part(dev);
234 return sprintf(buf, "%u\n", p->discard_alignment);
235}
236
229ssize_t part_stat_show(struct device *dev, 237ssize_t part_stat_show(struct device *dev,
230 struct device_attribute *attr, char *buf) 238 struct device_attribute *attr, char *buf)
231{ 239{
@@ -288,6 +296,8 @@ static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
288static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); 296static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
289static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 297static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
290static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); 298static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
299static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
300 NULL);
291static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); 301static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
292static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); 302static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
293#ifdef CONFIG_FAIL_MAKE_REQUEST 303#ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -300,6 +310,7 @@ static struct attribute *part_attrs[] = {
300 &dev_attr_start.attr, 310 &dev_attr_start.attr,
301 &dev_attr_size.attr, 311 &dev_attr_size.attr,
302 &dev_attr_alignment_offset.attr, 312 &dev_attr_alignment_offset.attr,
313 &dev_attr_discard_alignment.attr,
303 &dev_attr_stat.attr, 314 &dev_attr_stat.attr,
304 &dev_attr_inflight.attr, 315 &dev_attr_inflight.attr,
305#ifdef CONFIG_FAIL_MAKE_REQUEST 316#ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -402,7 +413,10 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
402 pdev = part_to_dev(p); 413 pdev = part_to_dev(p);
403 414
404 p->start_sect = start; 415 p->start_sect = start;
405 p->alignment_offset = queue_sector_alignment_offset(disk->queue, start); 416 p->alignment_offset =
417 queue_limit_alignment_offset(&disk->queue->limits, start);
418 p->discard_alignment =
419 queue_limit_discard_alignment(&disk->queue->limits, start);
406 p->nr_sects = len; 420 p->nr_sects = len;
407 p->partno = partno; 421 p->partno = partno;
408 p->policy = get_disk_ro(disk); 422 p->policy = get_disk_ro(disk);
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 038a6022152f..91babdae7587 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -1,7 +1,9 @@
1/************************************************************ 1/************************************************************
2 * EFI GUID Partition Table handling 2 * EFI GUID Partition Table handling
3 * Per Intel EFI Specification v1.02 3 *
4 * http://developer.intel.com/technology/efi/efi.htm 4 * http://www.uefi.org/specs/
5 * http://www.intel.com/technology/efi/
6 *
5 * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com> 7 * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com>
6 * Copyright 2000,2001,2002,2004 Dell Inc. 8 * Copyright 2000,2001,2002,2004 Dell Inc.
7 * 9 *
@@ -92,6 +94,8 @@
92 * 94 *
93 ************************************************************/ 95 ************************************************************/
94#include <linux/crc32.h> 96#include <linux/crc32.h>
97#include <linux/math64.h>
98#include <linux/slab.h>
95#include "check.h" 99#include "check.h"
96#include "efi.h" 100#include "efi.h"
97 101
@@ -141,7 +145,8 @@ last_lba(struct block_device *bdev)
141{ 145{
142 if (!bdev || !bdev->bd_inode) 146 if (!bdev || !bdev->bd_inode)
143 return 0; 147 return 0;
144 return (bdev->bd_inode->i_size >> 9) - 1ULL; 148 return div_u64(bdev->bd_inode->i_size,
149 bdev_logical_block_size(bdev)) - 1ULL;
145} 150}
146 151
147static inline int 152static inline int
@@ -188,6 +193,7 @@ static size_t
188read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) 193read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
189{ 194{
190 size_t totalreadcount = 0; 195 size_t totalreadcount = 0;
196 sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
191 197
192 if (!bdev || !buffer || lba > last_lba(bdev)) 198 if (!bdev || !buffer || lba > last_lba(bdev))
193 return 0; 199 return 0;
@@ -195,7 +201,7 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
195 while (count) { 201 while (count) {
196 int copied = 512; 202 int copied = 512;
197 Sector sect; 203 Sector sect;
198 unsigned char *data = read_dev_sector(bdev, lba++, &sect); 204 unsigned char *data = read_dev_sector(bdev, n++, &sect);
199 if (!data) 205 if (!data)
200 break; 206 break;
201 if (copied > count) 207 if (copied > count)
@@ -257,15 +263,16 @@ static gpt_header *
257alloc_read_gpt_header(struct block_device *bdev, u64 lba) 263alloc_read_gpt_header(struct block_device *bdev, u64 lba)
258{ 264{
259 gpt_header *gpt; 265 gpt_header *gpt;
266 unsigned ssz = bdev_logical_block_size(bdev);
267
260 if (!bdev) 268 if (!bdev)
261 return NULL; 269 return NULL;
262 270
263 gpt = kzalloc(sizeof (gpt_header), GFP_KERNEL); 271 gpt = kzalloc(ssz, GFP_KERNEL);
264 if (!gpt) 272 if (!gpt)
265 return NULL; 273 return NULL;
266 274
267 if (read_lba(bdev, lba, (u8 *) gpt, 275 if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) {
268 sizeof (gpt_header)) < sizeof (gpt_header)) {
269 kfree(gpt); 276 kfree(gpt);
270 gpt=NULL; 277 gpt=NULL;
271 return NULL; 278 return NULL;
@@ -601,6 +608,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
601 gpt_header *gpt = NULL; 608 gpt_header *gpt = NULL;
602 gpt_entry *ptes = NULL; 609 gpt_entry *ptes = NULL;
603 u32 i; 610 u32 i;
611 unsigned ssz = bdev_logical_block_size(bdev) / 512;
604 612
605 if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) { 613 if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) {
606 kfree(gpt); 614 kfree(gpt);
@@ -611,13 +619,14 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
611 pr_debug("GUID Partition Table is valid! Yea!\n"); 619 pr_debug("GUID Partition Table is valid! Yea!\n");
612 620
613 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { 621 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
622 u64 start = le64_to_cpu(ptes[i].starting_lba);
623 u64 size = le64_to_cpu(ptes[i].ending_lba) -
624 le64_to_cpu(ptes[i].starting_lba) + 1ULL;
625
614 if (!is_pte_valid(&ptes[i], last_lba(bdev))) 626 if (!is_pte_valid(&ptes[i], last_lba(bdev)))
615 continue; 627 continue;
616 628
617 put_partition(state, i+1, le64_to_cpu(ptes[i].starting_lba), 629 put_partition(state, i+1, start * ssz, size * ssz);
618 (le64_to_cpu(ptes[i].ending_lba) -
619 le64_to_cpu(ptes[i].starting_lba) +
620 1ULL));
621 630
622 /* If this is a RAID volume, tell md */ 631 /* If this is a RAID volume, tell md */
623 if (!efi_guidcmp(ptes[i].partition_type_guid, 632 if (!efi_guidcmp(ptes[i].partition_type_guid,
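Every efi.c change above scales GPT LBAs by the device's logical block size instead of hard-coding 512-byte sectors. A worked example with illustrative numbers for a 4096-byte-sector device:

unsigned ssz_bytes = 4096;	/* bdev_logical_block_size(bdev) */
unsigned ssz = ssz_bytes / 512;	/* 8 kernel sectors per LBA */
u64 gpt_lba = 2;		/* e.g. the partition entry array */
sector_t n = gpt_lba * ssz;	/* read_dev_sector() offset: 16 */
u64 part_lbas = 1000;
u64 part_sects = part_lbas * ssz; /* put_partition() length: 8000 */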
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index 2cc89d0475bf..6998b589abf9 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -37,7 +37,6 @@
37#define EFI_PMBR_OSTYPE_EFI 0xEF 37#define EFI_PMBR_OSTYPE_EFI 0xEF
38#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE 38#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE
39 39
40#define GPT_BLOCK_SIZE 512
41#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL 40#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL
42#define GPT_HEADER_REVISION_V1 0x00010000 41#define GPT_HEADER_REVISION_V1 0x00010000
43#define GPT_PRIMARY_PARTITION_TABLE_LBA 1 42#define GPT_PRIMARY_PARTITION_TABLE_LBA 1
@@ -79,7 +78,12 @@ typedef struct _gpt_header {
79 __le32 num_partition_entries; 78 __le32 num_partition_entries;
80 __le32 sizeof_partition_entry; 79 __le32 sizeof_partition_entry;
81 __le32 partition_entry_array_crc32; 80 __le32 partition_entry_array_crc32;
82 u8 reserved2[GPT_BLOCK_SIZE - 92]; 81
82 /* The rest of the logical block is reserved by UEFI and must be zero.
83 * EFI standard handles this by:
84 *
85 * uint8_t reserved2[ BlockSize - 92 ];
86 */
83} __attribute__ ((packed)) gpt_header; 87} __attribute__ ((packed)) gpt_header;
84 88
85typedef struct _gpt_entry_attributes { 89typedef struct _gpt_entry_attributes {
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 0028d2ef0662..90be97f1f5a8 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -31,14 +31,17 @@
31 */ 31 */
32#include <asm/unaligned.h> 32#include <asm/unaligned.h>
33 33
34#define SYS_IND(p) (get_unaligned(&p->sys_ind)) 34#define SYS_IND(p) get_unaligned(&p->sys_ind)
35#define NR_SECTS(p) ({ __le32 __a = get_unaligned(&p->nr_sects); \
36 le32_to_cpu(__a); \
37 })
38 35
39#define START_SECT(p) ({ __le32 __a = get_unaligned(&p->start_sect); \ 36static inline sector_t nr_sects(struct partition *p)
40 le32_to_cpu(__a); \ 37{
41 }) 38 return (sector_t)get_unaligned_le32(&p->nr_sects);
39}
40
41static inline sector_t start_sect(struct partition *p)
42{
43 return (sector_t)get_unaligned_le32(&p->start_sect);
44}
42 45
43static inline int is_extended_partition(struct partition *p) 46static inline int is_extended_partition(struct partition *p)
44{ 47{
@@ -104,13 +107,13 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
104 107
105static void 108static void
106parse_extended(struct parsed_partitions *state, struct block_device *bdev, 109parse_extended(struct parsed_partitions *state, struct block_device *bdev,
107 u32 first_sector, u32 first_size) 110 sector_t first_sector, sector_t first_size)
108{ 111{
109 struct partition *p; 112 struct partition *p;
110 Sector sect; 113 Sector sect;
111 unsigned char *data; 114 unsigned char *data;
112 u32 this_sector, this_size; 115 sector_t this_sector, this_size;
113 int sector_size = bdev_logical_block_size(bdev) / 512; 116 sector_t sector_size = bdev_logical_block_size(bdev) / 512;
114 int loopct = 0; /* number of links followed 117 int loopct = 0; /* number of links followed
115 without finding a data partition */ 118 without finding a data partition */
116 int i; 119 int i;
@@ -145,14 +148,14 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
145 * First process the data partition(s) 148 * First process the data partition(s)
146 */ 149 */
147 for (i=0; i<4; i++, p++) { 150 for (i=0; i<4; i++, p++) {
148 u32 offs, size, next; 151 sector_t offs, size, next;
149 if (!NR_SECTS(p) || is_extended_partition(p)) 152 if (!nr_sects(p) || is_extended_partition(p))
150 continue; 153 continue;
151 154
152 /* Check the 3rd and 4th entries - 155 /* Check the 3rd and 4th entries -
153 these sometimes contain random garbage */ 156 these sometimes contain random garbage */
154 offs = START_SECT(p)*sector_size; 157 offs = start_sect(p)*sector_size;
155 size = NR_SECTS(p)*sector_size; 158 size = nr_sects(p)*sector_size;
156 next = this_sector + offs; 159 next = this_sector + offs;
157 if (i >= 2) { 160 if (i >= 2) {
158 if (offs + size > this_size) 161 if (offs + size > this_size)
@@ -179,13 +182,13 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
179 */ 182 */
180 p -= 4; 183 p -= 4;
181 for (i=0; i<4; i++, p++) 184 for (i=0; i<4; i++, p++)
182 if (NR_SECTS(p) && is_extended_partition(p)) 185 if (nr_sects(p) && is_extended_partition(p))
183 break; 186 break;
184 if (i == 4) 187 if (i == 4)
185 goto done; /* nothing left to do */ 188 goto done; /* nothing left to do */
186 189
187 this_sector = first_sector + START_SECT(p) * sector_size; 190 this_sector = first_sector + start_sect(p) * sector_size;
188 this_size = NR_SECTS(p) * sector_size; 191 this_size = nr_sects(p) * sector_size;
189 put_dev_sector(sect); 192 put_dev_sector(sect);
190 } 193 }
191done: 194done:
@@ -197,7 +200,7 @@ done:
197 200
198static void 201static void
199parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, 202parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
200 u32 offset, u32 size, int origin) 203 sector_t offset, sector_t size, int origin)
201{ 204{
202#ifdef CONFIG_SOLARIS_X86_PARTITION 205#ifdef CONFIG_SOLARIS_X86_PARTITION
203 Sector sect; 206 Sector sect;
@@ -244,7 +247,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
244 */ 247 */
245static void 248static void
246parse_bsd(struct parsed_partitions *state, struct block_device *bdev, 249parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
247 u32 offset, u32 size, int origin, char *flavour, 250 sector_t offset, sector_t size, int origin, char *flavour,
248 int max_partitions) 251 int max_partitions)
249{ 252{
250 Sector sect; 253 Sector sect;
@@ -263,7 +266,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
263 if (le16_to_cpu(l->d_npartitions) < max_partitions) 266 if (le16_to_cpu(l->d_npartitions) < max_partitions)
264 max_partitions = le16_to_cpu(l->d_npartitions); 267 max_partitions = le16_to_cpu(l->d_npartitions);
265 for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) { 268 for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
266 u32 bsd_start, bsd_size; 269 sector_t bsd_start, bsd_size;
267 270
268 if (state->next == state->limit) 271 if (state->next == state->limit)
269 break; 272 break;
@@ -290,7 +293,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
290 293
291static void 294static void
292parse_freebsd(struct parsed_partitions *state, struct block_device *bdev, 295parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
293 u32 offset, u32 size, int origin) 296 sector_t offset, sector_t size, int origin)
294{ 297{
295#ifdef CONFIG_BSD_DISKLABEL 298#ifdef CONFIG_BSD_DISKLABEL
296 parse_bsd(state, bdev, offset, size, origin, 299 parse_bsd(state, bdev, offset, size, origin,
@@ -300,7 +303,7 @@ parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
300 303
301static void 304static void
302parse_netbsd(struct parsed_partitions *state, struct block_device *bdev, 305parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
303 u32 offset, u32 size, int origin) 306 sector_t offset, sector_t size, int origin)
304{ 307{
305#ifdef CONFIG_BSD_DISKLABEL 308#ifdef CONFIG_BSD_DISKLABEL
306 parse_bsd(state, bdev, offset, size, origin, 309 parse_bsd(state, bdev, offset, size, origin,
@@ -310,7 +313,7 @@ parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
310 313
311static void 314static void
312parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, 315parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
313 u32 offset, u32 size, int origin) 316 sector_t offset, sector_t size, int origin)
314{ 317{
315#ifdef CONFIG_BSD_DISKLABEL 318#ifdef CONFIG_BSD_DISKLABEL
316 parse_bsd(state, bdev, offset, size, origin, 319 parse_bsd(state, bdev, offset, size, origin,
@@ -324,7 +327,7 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
324 */ 327 */
325static void 328static void
326parse_unixware(struct parsed_partitions *state, struct block_device *bdev, 329parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
327 u32 offset, u32 size, int origin) 330 sector_t offset, sector_t size, int origin)
328{ 331{
329#ifdef CONFIG_UNIXWARE_DISKLABEL 332#ifdef CONFIG_UNIXWARE_DISKLABEL
330 Sector sect; 333 Sector sect;
@@ -348,7 +351,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
348 351
349 if (p->s_label != UNIXWARE_FS_UNUSED) 352 if (p->s_label != UNIXWARE_FS_UNUSED)
350 put_partition(state, state->next++, 353 put_partition(state, state->next++,
351 START_SECT(p), NR_SECTS(p)); 354 le32_to_cpu(p->start_sect),
355 le32_to_cpu(p->nr_sects));
352 p++; 356 p++;
353 } 357 }
354 put_dev_sector(sect); 358 put_dev_sector(sect);
@@ -363,7 +367,7 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
363 */ 367 */
364static void 368static void
365parse_minix(struct parsed_partitions *state, struct block_device *bdev, 369parse_minix(struct parsed_partitions *state, struct block_device *bdev,
366 u32 offset, u32 size, int origin) 370 sector_t offset, sector_t size, int origin)
367{ 371{
368#ifdef CONFIG_MINIX_SUBPARTITION 372#ifdef CONFIG_MINIX_SUBPARTITION
369 Sector sect; 373 Sector sect;
@@ -390,7 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
390 /* add each partition in use */ 394 /* add each partition in use */
391 if (SYS_IND(p) == MINIX_PARTITION) 395 if (SYS_IND(p) == MINIX_PARTITION)
392 put_partition(state, state->next++, 396 put_partition(state, state->next++,
393 START_SECT(p), NR_SECTS(p)); 397 start_sect(p), nr_sects(p));
394 } 398 }
395 printk(" >\n"); 399 printk(" >\n");
396 } 400 }
@@ -401,7 +405,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
401static struct { 405static struct {
402 unsigned char id; 406 unsigned char id;
403 void (*parse)(struct parsed_partitions *, struct block_device *, 407 void (*parse)(struct parsed_partitions *, struct block_device *,
404 u32, u32, int); 408 sector_t, sector_t, int);
405} subtypes[] = { 409} subtypes[] = {
406 {FREEBSD_PARTITION, parse_freebsd}, 410 {FREEBSD_PARTITION, parse_freebsd},
407 {NETBSD_PARTITION, parse_netbsd}, 411 {NETBSD_PARTITION, parse_netbsd},
@@ -415,7 +419,7 @@ static struct {
415 419
416int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) 420int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
417{ 421{
418 int sector_size = bdev_logical_block_size(bdev) / 512; 422 sector_t sector_size = bdev_logical_block_size(bdev) / 512;
419 Sector sect; 423 Sector sect;
420 unsigned char *data; 424 unsigned char *data;
421 struct partition *p; 425 struct partition *p;
@@ -483,14 +487,21 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
483 487
484 state->next = 5; 488 state->next = 5;
485 for (slot = 1 ; slot <= 4 ; slot++, p++) { 489 for (slot = 1 ; slot <= 4 ; slot++, p++) {
486 u32 start = START_SECT(p)*sector_size; 490 sector_t start = start_sect(p)*sector_size;
487 u32 size = NR_SECTS(p)*sector_size; 491 sector_t size = nr_sects(p)*sector_size;
488 if (!size) 492 if (!size)
489 continue; 493 continue;
490 if (is_extended_partition(p)) { 494 if (is_extended_partition(p)) {
491 /* prevent someone doing mkfs or mkswap on an 495 /*
492 extended partition, but leave room for LILO */ 496 * prevent someone doing mkfs or mkswap on an
493 put_partition(state, slot, start, size == 1 ? 1 : 2); 497 * extended partition, but leave room for LILO
 498 * FIXME: this reserves one logical sector for devices with
 499 * sectors larger than 512 bytes, which may not be enough or proper.
500 */
501 sector_t n = 2;
502 n = min(size, max(sector_size, n));
503 put_partition(state, slot, start, n);
504
494 printk(" <"); 505 printk(" <");
495 parse_extended(state, bdev, start, size); 506 parse_extended(state, bdev, start, size);
496 printk(" >"); 507 printk(" >");
@@ -513,7 +524,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
513 unsigned char id = SYS_IND(p); 524 unsigned char id = SYS_IND(p);
514 int n; 525 int n;
515 526
516 if (!NR_SECTS(p)) 527 if (!nr_sects(p))
517 continue; 528 continue;
518 529
519 for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++) 530 for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
@@ -521,8 +532,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
521 532
522 if (!subtypes[n].parse) 533 if (!subtypes[n].parse)
523 continue; 534 continue;
524 subtypes[n].parse(state, bdev, START_SECT(p)*sector_size, 535 subtypes[n].parse(state, bdev, start_sect(p)*sector_size,
525 NR_SECTS(p)*sector_size, slot); 536 nr_sects(p)*sector_size, slot);
526 } 537 }
527 put_dev_sector(sect); 538 put_dev_sector(sect);
528 return 1; 539 return 1;
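The msdos.c hunks above retire the SYS_IND()/NR_SECTS()/START_SECT() statement-expression macros in favour of inline helpers that return sector_t, widening offset and size arithmetic past 32 bits and reading the packed, potentially unaligned MBR fields with get_unaligned_le32(). A minimal userspace sketch of the same pattern follows; the struct layout and the values are illustrative stand-ins, not the kernel's definitions.

#include <stdint.h>
#include <stdio.h>

struct mbr_entry {                 /* simplified stand-in, not the kernel's */
    uint8_t boot_ind;
    uint8_t chs_start[3];
    uint8_t sys_ind;
    uint8_t chs_end[3];
    uint8_t start_sect[4];         /* little-endian, possibly unaligned */
    uint8_t nr_sects[4];           /* little-endian, possibly unaligned */
} __attribute__((packed));

static uint32_t get_unaligned_le32(const uint8_t *b)
{
    /* byte-wise assembly is safe at any alignment and host endianness */
    return (uint32_t)b[0] | (uint32_t)b[1] << 8 |
           (uint32_t)b[2] << 16 | (uint32_t)b[3] << 24;
}

int main(void)
{
    struct mbr_entry e = {
        .start_sect = { 0x00, 0x08, 0x00, 0x00 },   /* 2048 */
        .nr_sects   = { 0x00, 0x00, 0x10, 0x00 },   /* 1048576 */
    };
    uint64_t sector_size = 8;      /* 4096-byte logical sectors */

    /* widen before multiplying, as the sector_t helpers now do */
    uint64_t start = (uint64_t)get_unaligned_le32(e.start_sect) * sector_size;
    uint64_t size  = (uint64_t)get_unaligned_le32(e.nr_sects) * sector_size;

    printf("start=%llu size=%llu\n",
           (unsigned long long)start, (unsigned long long)size);
    return 0;
}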
diff --git a/fs/pipe.c b/fs/pipe.c
index ae17d026aaa3..37ba29ff3158 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -906,17 +906,6 @@ void free_pipe_info(struct inode *inode)
906} 906}
907 907
908static struct vfsmount *pipe_mnt __read_mostly; 908static struct vfsmount *pipe_mnt __read_mostly;
909static int pipefs_delete_dentry(struct dentry *dentry)
910{
911 /*
912 * At creation time, we pretended this dentry was hashed
913 * (by clearing DCACHE_UNHASHED bit in d_flags)
914 * At delete time, we restore the truth : not hashed.
915 * (so that dput() can proceed correctly)
916 */
917 dentry->d_flags |= DCACHE_UNHASHED;
918 return 0;
919}
920 909
921/* 910/*
922 * pipefs_dname() is called from d_path(). 911 * pipefs_dname() is called from d_path().
@@ -928,7 +917,6 @@ static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
928} 917}
929 918
930static const struct dentry_operations pipefs_dentry_operations = { 919static const struct dentry_operations pipefs_dentry_operations = {
931 .d_delete = pipefs_delete_dentry,
932 .d_dname = pipefs_dname, 920 .d_dname = pipefs_dname,
933}; 921};
934 922
@@ -974,7 +962,7 @@ struct file *create_write_pipe(int flags)
974 int err; 962 int err;
975 struct inode *inode; 963 struct inode *inode;
976 struct file *f; 964 struct file *f;
977 struct dentry *dentry; 965 struct path path;
978 struct qstr name = { .name = "" }; 966 struct qstr name = { .name = "" };
979 967
980 err = -ENFILE; 968 err = -ENFILE;
@@ -983,21 +971,16 @@ struct file *create_write_pipe(int flags)
983 goto err; 971 goto err;
984 972
985 err = -ENOMEM; 973 err = -ENOMEM;
986 dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name); 974 path.dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name);
987 if (!dentry) 975 if (!path.dentry)
988 goto err_inode; 976 goto err_inode;
977 path.mnt = mntget(pipe_mnt);
989 978
990 dentry->d_op = &pipefs_dentry_operations; 979 path.dentry->d_op = &pipefs_dentry_operations;
991 /* 980 d_instantiate(path.dentry, inode);
992 * We dont want to publish this dentry into global dentry hash table.
993 * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
994 * This permits a working /proc/$pid/fd/XXX on pipes
995 */
996 dentry->d_flags &= ~DCACHE_UNHASHED;
997 d_instantiate(dentry, inode);
998 981
999 err = -ENFILE; 982 err = -ENFILE;
1000 f = alloc_file(pipe_mnt, dentry, FMODE_WRITE, &write_pipefifo_fops); 983 f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);
1001 if (!f) 984 if (!f)
1002 goto err_dentry; 985 goto err_dentry;
1003 f->f_mapping = inode->i_mapping; 986 f->f_mapping = inode->i_mapping;
@@ -1009,7 +992,7 @@ struct file *create_write_pipe(int flags)
1009 992
1010 err_dentry: 993 err_dentry:
1011 free_pipe_info(inode); 994 free_pipe_info(inode);
1012 dput(dentry); 995 path_put(&path);
1013 return ERR_PTR(err); 996 return ERR_PTR(err);
1014 997
1015 err_inode: 998 err_inode:
@@ -1028,20 +1011,14 @@ void free_write_pipe(struct file *f)
1028 1011
1029struct file *create_read_pipe(struct file *wrf, int flags) 1012struct file *create_read_pipe(struct file *wrf, int flags)
1030{ 1013{
1031 struct file *f = get_empty_filp(); 1014 /* Grab pipe from the writer */
1015 struct file *f = alloc_file(&wrf->f_path, FMODE_READ,
1016 &read_pipefifo_fops);
1032 if (!f) 1017 if (!f)
1033 return ERR_PTR(-ENFILE); 1018 return ERR_PTR(-ENFILE);
1034 1019
1035 /* Grab pipe from the writer */
1036 f->f_path = wrf->f_path;
1037 path_get(&wrf->f_path); 1020 path_get(&wrf->f_path);
1038 f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping;
1039
1040 f->f_pos = 0;
1041 f->f_flags = O_RDONLY | (flags & O_NONBLOCK); 1021 f->f_flags = O_RDONLY | (flags & O_NONBLOCK);
1042 f->f_op = &read_pipefifo_fops;
1043 f->f_mode = FMODE_READ;
1044 f->f_version = 0;
1045 1022
1046 return f; 1023 return f;
1047} 1024}
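The pipe.c change switches create_write_pipe() to carry a struct path (dentry plus vfsmount) and lets create_read_pipe() clone the writer's f_path through alloc_file() instead of hand-filling an empty file. The sketch below models the reference discipline this relies on, with toy types standing in for the kernel's path_get()/path_put().

#include <stdio.h>

struct path { int refs; };                     /* toy stand-in */
struct file { struct path *p; };

static void path_get(struct path *p) { p->refs++; }

static void path_put(struct path *p)
{
    if (--p->refs == 0)
        printf("last reference dropped, path freed\n");
}

int main(void)
{
    struct path pipe_path = { .refs = 1 };     /* held by the write end */
    struct file wr = { .p = &pipe_path };
    struct file rd = { .p = wr.p };            /* read end shares the path */

    path_get(rd.p);                            /* so it takes its own ref */
    path_put(rd.p);                            /* close the reader */
    path_put(wr.p);                            /* close the writer: freed */
    return 0;
}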
diff --git a/fs/pnode.c b/fs/pnode.c
index 8d5f392ec3d3..5cc564a83149 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -86,7 +86,7 @@ static int do_make_slave(struct vfsmount *mnt)
86 86
87 /* 87 /*
88 * slave 'mnt' to a peer mount that has the 88 * slave 'mnt' to a peer mount that has the
89 * same root dentry. If none is available than 89 * same root dentry. If none is available then
90 * slave it to anything that is available. 90 * slave it to anything that is available.
91 */ 91 */
92 while ((peer_mnt = next_peer(peer_mnt)) != mnt && 92 while ((peer_mnt = next_peer(peer_mnt)) != mnt &&
@@ -147,6 +147,11 @@ void change_mnt_propagation(struct vfsmount *mnt, int type)
147 * get the next mount in the propagation tree. 147 * get the next mount in the propagation tree.
148 * @m: the mount seen last 148 * @m: the mount seen last
149 * @origin: the original mount from where the tree walk initiated 149 * @origin: the original mount from where the tree walk initiated
150 *
151 * Note that peer groups form contiguous segments of slave lists.
152 * We rely on that in get_source() to be able to find out if
 153 * a vfsmount found while iterating with propagation_next() is
154 * a peer of one we'd found earlier.
150 */ 155 */
151static struct vfsmount *propagation_next(struct vfsmount *m, 156static struct vfsmount *propagation_next(struct vfsmount *m,
152 struct vfsmount *origin) 157 struct vfsmount *origin)
@@ -186,10 +191,6 @@ static struct vfsmount *get_source(struct vfsmount *dest,
186{ 191{
187 struct vfsmount *p_last_src = NULL; 192 struct vfsmount *p_last_src = NULL;
188 struct vfsmount *p_last_dest = NULL; 193 struct vfsmount *p_last_dest = NULL;
189 *type = CL_PROPAGATION;
190
191 if (IS_MNT_SHARED(dest))
192 *type |= CL_MAKE_SHARED;
193 194
194 while (last_dest != dest->mnt_master) { 195 while (last_dest != dest->mnt_master) {
195 p_last_dest = last_dest; 196 p_last_dest = last_dest;
@@ -202,13 +203,18 @@ static struct vfsmount *get_source(struct vfsmount *dest,
202 do { 203 do {
203 p_last_dest = next_peer(p_last_dest); 204 p_last_dest = next_peer(p_last_dest);
204 } while (IS_MNT_NEW(p_last_dest)); 205 } while (IS_MNT_NEW(p_last_dest));
206 /* is that a peer of the earlier? */
207 if (dest == p_last_dest) {
208 *type = CL_MAKE_SHARED;
209 return p_last_src;
210 }
205 } 211 }
206 212 /* slave of the earlier, then */
207 if (dest != p_last_dest) { 213 *type = CL_SLAVE;
208 *type |= CL_SLAVE; 214 /* beginning of peer group among the slaves? */
209 return last_src; 215 if (IS_MNT_SHARED(dest))
210 } else 216 *type |= CL_MAKE_SHARED;
211 return p_last_src; 217 return last_src;
212} 218}
213 219
214/* 220/*
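The new comment above documents the invariant get_source() now exploits: peer groups occupy contiguous runs on a slave list, so a walk can classify each mount by comparing it with its neighbour. A toy illustration of that contiguity test, with made-up data rather than real vfsmounts:

#include <stdio.h>

struct mnt { int peer_group; };                /* toy stand-in */

int main(void)
{
    /* a slave list: two peer groups, each stored contiguously */
    struct mnt slaves[] = { {1}, {1}, {1}, {2}, {2} };
    int n = sizeof(slaves) / sizeof(slaves[0]);
    int i;

    for (i = 1; i < n; i++) {
        if (slaves[i].peer_group == slaves[i - 1].peer_group)
            printf("slave %d: peer of the one before it\n", i);
        else
            printf("slave %d: starts a new peer group\n", i);
    }
    return 0;
}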
diff --git a/fs/pnode.h b/fs/pnode.h
index 958665d662af..1ea4ae1efcd3 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -21,12 +21,11 @@
21#define CL_SLAVE 0x02 21#define CL_SLAVE 0x02
22#define CL_COPY_ALL 0x04 22#define CL_COPY_ALL 0x04
23#define CL_MAKE_SHARED 0x08 23#define CL_MAKE_SHARED 0x08
24#define CL_PROPAGATION 0x10 24#define CL_PRIVATE 0x10
25#define CL_PRIVATE 0x20
26 25
27static inline void set_mnt_shared(struct vfsmount *mnt) 26static inline void set_mnt_shared(struct vfsmount *mnt)
28{ 27{
29 mnt->mnt_flags &= ~MNT_PNODE_MASK; 28 mnt->mnt_flags &= ~MNT_SHARED_MASK;
30 mnt->mnt_flags |= MNT_SHARED; 29 mnt->mnt_flags |= MNT_SHARED;
31} 30}
32 31
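With CL_PROPAGATION gone, set_mnt_shared() now clears the whole MNT_SHARED_MASK before setting MNT_SHARED. The clear-then-set idiom, shown with invented flag values (the real MNT_* constants live in the mount headers):

#include <stdio.h>

#define MNT_SHARED      0x10    /* invented values for illustration */
#define MNT_UNBINDABLE  0x20
#define MNT_SHARED_MASK (MNT_SHARED | MNT_UNBINDABLE)

int main(void)
{
    unsigned int flags = MNT_UNBINDABLE;   /* previously unbindable */

    flags &= ~MNT_SHARED_MASK;             /* wipe all propagation state */
    flags |= MNT_SHARED;                   /* then mark shared */
    printf("flags = %#x\n", flags);        /* 0x10: shared, nothing else */
    return 0;
}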
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 822c2d506518..e51f2ec2c5e5 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -68,7 +68,6 @@
68#include <linux/hugetlb.h> 68#include <linux/hugetlb.h>
69#include <linux/pagemap.h> 69#include <linux/pagemap.h>
70#include <linux/swap.h> 70#include <linux/swap.h>
71#include <linux/slab.h>
72#include <linux/smp.h> 71#include <linux/smp.h>
73#include <linux/signal.h> 72#include <linux/signal.h>
74#include <linux/highmem.h> 73#include <linux/highmem.h>
@@ -134,13 +133,16 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
134 * simple bit tests. 133 * simple bit tests.
135 */ 134 */
136static const char *task_state_array[] = { 135static const char *task_state_array[] = {
137 "R (running)", /* 0 */ 136 "R (running)", /* 0 */
138 "S (sleeping)", /* 1 */ 137 "S (sleeping)", /* 1 */
139 "D (disk sleep)", /* 2 */ 138 "D (disk sleep)", /* 2 */
140 "T (stopped)", /* 4 */ 139 "T (stopped)", /* 4 */
141 "T (tracing stop)", /* 8 */ 140 "t (tracing stop)", /* 8 */
142 "Z (zombie)", /* 16 */ 141 "Z (zombie)", /* 16 */
143 "X (dead)" /* 32 */ 142 "X (dead)", /* 32 */
143 "x (dead)", /* 64 */
144 "K (wakekill)", /* 128 */
145 "W (waking)", /* 256 */
144}; 146};
145 147
146static inline const char *get_task_state(struct task_struct *tsk) 148static inline const char *get_task_state(struct task_struct *tsk)
@@ -148,6 +150,8 @@ static inline const char *get_task_state(struct task_struct *tsk)
148 unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; 150 unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state;
149 const char **p = &task_state_array[0]; 151 const char **p = &task_state_array[0];
150 152
153 BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array));
154
151 while (state) { 155 while (state) {
152 p++; 156 p++;
153 state >>= 1; 157 state >>= 1;
@@ -265,8 +269,10 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
265 blocked = p->blocked; 269 blocked = p->blocked;
266 collect_sigign_sigcatch(p, &ignored, &caught); 270 collect_sigign_sigcatch(p, &ignored, &caught);
267 num_threads = atomic_read(&p->signal->count); 271 num_threads = atomic_read(&p->signal->count);
272 rcu_read_lock(); /* FIXME: is this correct? */
268 qsize = atomic_read(&__task_cred(p)->user->sigpending); 273 qsize = atomic_read(&__task_cred(p)->user->sigpending);
269 qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; 274 rcu_read_unlock();
275 qlim = task_rlimit(p, RLIMIT_SIGPENDING);
270 unlock_task_sighand(p, &flags); 276 unlock_task_sighand(p, &flags);
271 } 277 }
272 278
@@ -322,93 +328,15 @@ static inline void task_context_switch_counts(struct seq_file *m,
322 p->nivcsw); 328 p->nivcsw);
323} 329}
324 330
325#ifdef CONFIG_MMU 331static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
326
327struct stack_stats {
328 struct vm_area_struct *vma;
329 unsigned long startpage;
330 unsigned long usage;
331};
332
333static int stack_usage_pte_range(pmd_t *pmd, unsigned long addr,
334 unsigned long end, struct mm_walk *walk)
335{
336 struct stack_stats *ss = walk->private;
337 struct vm_area_struct *vma = ss->vma;
338 pte_t *pte, ptent;
339 spinlock_t *ptl;
340 int ret = 0;
341
342 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
343 for (; addr != end; pte++, addr += PAGE_SIZE) {
344 ptent = *pte;
345
346#ifdef CONFIG_STACK_GROWSUP
347 if (pte_present(ptent) || is_swap_pte(ptent))
348 ss->usage = addr - ss->startpage + PAGE_SIZE;
349#else
350 if (pte_present(ptent) || is_swap_pte(ptent)) {
351 ss->usage = ss->startpage - addr + PAGE_SIZE;
352 pte++;
353 ret = 1;
354 break;
355 }
356#endif
357 }
358 pte_unmap_unlock(pte - 1, ptl);
359 cond_resched();
360 return ret;
361}
362
363static inline unsigned long get_stack_usage_in_bytes(struct vm_area_struct *vma,
364 struct task_struct *task)
365{
366 struct stack_stats ss;
367 struct mm_walk stack_walk = {
368 .pmd_entry = stack_usage_pte_range,
369 .mm = vma->vm_mm,
370 .private = &ss,
371 };
372
373 if (!vma->vm_mm || is_vm_hugetlb_page(vma))
374 return 0;
375
376 ss.vma = vma;
377 ss.startpage = task->stack_start & PAGE_MASK;
378 ss.usage = 0;
379
380#ifdef CONFIG_STACK_GROWSUP
381 walk_page_range(KSTK_ESP(task) & PAGE_MASK, vma->vm_end,
382 &stack_walk);
383#else
384 walk_page_range(vma->vm_start, (KSTK_ESP(task) & PAGE_MASK) + PAGE_SIZE,
385 &stack_walk);
386#endif
387 return ss.usage;
388}
389
390static inline void task_show_stack_usage(struct seq_file *m,
391 struct task_struct *task)
392{
393 struct vm_area_struct *vma;
394 struct mm_struct *mm = get_task_mm(task);
395
396 if (mm) {
397 down_read(&mm->mmap_sem);
398 vma = find_vma(mm, task->stack_start);
399 if (vma)
400 seq_printf(m, "Stack usage:\t%lu kB\n",
401 get_stack_usage_in_bytes(vma, task) >> 10);
402
403 up_read(&mm->mmap_sem);
404 mmput(mm);
405 }
406}
407#else
408static void task_show_stack_usage(struct seq_file *m, struct task_struct *task)
409{ 332{
333 seq_printf(m, "Cpus_allowed:\t");
334 seq_cpumask(m, &task->cpus_allowed);
335 seq_printf(m, "\n");
336 seq_printf(m, "Cpus_allowed_list:\t");
337 seq_cpumask_list(m, &task->cpus_allowed);
338 seq_printf(m, "\n");
410} 339}
411#endif /* CONFIG_MMU */
412 340
413int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, 341int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
414 struct pid *pid, struct task_struct *task) 342 struct pid *pid, struct task_struct *task)
@@ -424,12 +352,12 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
424 } 352 }
425 task_sig(m, task); 353 task_sig(m, task);
426 task_cap(m, task); 354 task_cap(m, task);
355 task_cpus_allowed(m, task);
427 cpuset_task_status_allowed(m, task); 356 cpuset_task_status_allowed(m, task);
428#if defined(CONFIG_S390) 357#if defined(CONFIG_S390)
429 task_show_regs(m, task); 358 task_show_regs(m, task);
430#endif 359#endif
431 task_context_switch_counts(m, task); 360 task_context_switch_counts(m, task);
432 task_show_stack_usage(m, task);
433 return 0; 361 return 0;
434} 362}
435 363
@@ -491,24 +419,21 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
491 cutime = sig->cutime; 419 cutime = sig->cutime;
492 cstime = sig->cstime; 420 cstime = sig->cstime;
493 cgtime = sig->cgtime; 421 cgtime = sig->cgtime;
494 rsslim = sig->rlim[RLIMIT_RSS].rlim_cur; 422 rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
495 423
496 /* add up live thread stats at the group level */ 424 /* add up live thread stats at the group level */
497 if (whole) { 425 if (whole) {
498 struct task_cputime cputime;
499 struct task_struct *t = task; 426 struct task_struct *t = task;
500 do { 427 do {
501 min_flt += t->min_flt; 428 min_flt += t->min_flt;
502 maj_flt += t->maj_flt; 429 maj_flt += t->maj_flt;
503 gtime = cputime_add(gtime, task_gtime(t)); 430 gtime = cputime_add(gtime, t->gtime);
504 t = next_thread(t); 431 t = next_thread(t);
505 } while (t != task); 432 } while (t != task);
506 433
507 min_flt += sig->min_flt; 434 min_flt += sig->min_flt;
508 maj_flt += sig->maj_flt; 435 maj_flt += sig->maj_flt;
509 thread_group_cputime(task, &cputime); 436 thread_group_times(task, &utime, &stime);
510 utime = cputime.utime;
511 stime = cputime.stime;
512 gtime = cputime_add(gtime, sig->gtime); 437 gtime = cputime_add(gtime, sig->gtime);
513 } 438 }
514 439
@@ -524,9 +449,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
524 if (!whole) { 449 if (!whole) {
525 min_flt = task->min_flt; 450 min_flt = task->min_flt;
526 maj_flt = task->maj_flt; 451 maj_flt = task->maj_flt;
527 utime = task_utime(task); 452 task_times(task, &utime, &stime);
528 stime = task_stime(task); 453 gtime = task->gtime;
529 gtime = task_gtime(task);
530 } 454 }
531 455
532 /* scale priority and nice values from timeslices to -20..20 */ 456 /* scale priority and nice values from timeslices to -20..20 */
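The array.c hunks extend task_state_array for the new dead and wakekill states, lowercase the tracing-stop letter to "t", and add a BUILD_BUG_ON() tying the array length to TASK_STATE_MAX. get_task_state() picks an entry by shifting the state word right until it empties; a standalone sketch of that walk, with an abridged copy of the table:

#include <stdio.h>

static const char *state_names[] = {
    "R (running)",        /*  0 */
    "S (sleeping)",       /*  1 */
    "D (disk sleep)",     /*  2 */
    "T (stopped)",        /*  4 */
    "t (tracing stop)",   /*  8 */
    "Z (zombie)",         /* 16 */
    "X (dead)",           /* 32 */
};

static const char *name_of(unsigned int state)
{
    const char **p = &state_names[0];

    while (state) {        /* same loop shape as get_task_state() */
        p++;
        state >>= 1;
    }
    return *p;
}

int main(void)
{
    printf("%s\n", name_of(0));    /* R (running) */
    printf("%s\n", name_of(4));    /* T (stopped) */
    printf("%s\n", name_of(16));   /* Z (zombie) */
    return 0;
}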
diff --git a/fs/proc/base.c b/fs/proc/base.c
index af643b5aefe8..7621db800a74 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -81,6 +81,7 @@
81#include <linux/elf.h> 81#include <linux/elf.h>
82#include <linux/pid_namespace.h> 82#include <linux/pid_namespace.h>
83#include <linux/fs_struct.h> 83#include <linux/fs_struct.h>
84#include <linux/slab.h>
84#include "internal.h" 85#include "internal.h"
85 86
86/* NOTE: 87/* NOTE:
@@ -442,12 +443,13 @@ static const struct file_operations proc_lstats_operations = {
442unsigned long badness(struct task_struct *p, unsigned long uptime); 443unsigned long badness(struct task_struct *p, unsigned long uptime);
443static int proc_oom_score(struct task_struct *task, char *buffer) 444static int proc_oom_score(struct task_struct *task, char *buffer)
444{ 445{
445 unsigned long points; 446 unsigned long points = 0;
446 struct timespec uptime; 447 struct timespec uptime;
447 448
448 do_posix_clock_monotonic_gettime(&uptime); 449 do_posix_clock_monotonic_gettime(&uptime);
449 read_lock(&tasklist_lock); 450 read_lock(&tasklist_lock);
450 points = badness(task->group_leader, uptime.tv_sec); 451 if (pid_alive(task))
452 points = badness(task, uptime.tv_sec);
451 read_unlock(&tasklist_lock); 453 read_unlock(&tasklist_lock);
452 return sprintf(buffer, "%lu\n", points); 454 return sprintf(buffer, "%lu\n", points);
453} 455}
@@ -647,17 +649,11 @@ static int mounts_release(struct inode *inode, struct file *file)
647static unsigned mounts_poll(struct file *file, poll_table *wait) 649static unsigned mounts_poll(struct file *file, poll_table *wait)
648{ 650{
649 struct proc_mounts *p = file->private_data; 651 struct proc_mounts *p = file->private_data;
650 struct mnt_namespace *ns = p->ns;
651 unsigned res = POLLIN | POLLRDNORM; 652 unsigned res = POLLIN | POLLRDNORM;
652 653
653 poll_wait(file, &ns->poll, wait); 654 poll_wait(file, &p->ns->poll, wait);
654 655 if (mnt_had_events(p))
655 spin_lock(&vfsmount_lock);
656 if (p->event != ns->event) {
657 p->event = ns->event;
658 res |= POLLERR | POLLPRI; 656 res |= POLLERR | POLLPRI;
659 }
660 spin_unlock(&vfsmount_lock);
661 657
662 return res; 658 return res;
663} 659}
@@ -1095,8 +1091,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1095 if (!capable(CAP_AUDIT_CONTROL)) 1091 if (!capable(CAP_AUDIT_CONTROL))
1096 return -EPERM; 1092 return -EPERM;
1097 1093
1098 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) 1094 rcu_read_lock();
1095 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
1096 rcu_read_unlock();
1099 return -EPERM; 1097 return -EPERM;
1098 }
1099 rcu_read_unlock();
1100 1100
1101 if (count >= PAGE_SIZE) 1101 if (count >= PAGE_SIZE)
1102 count = PAGE_SIZE - 1; 1102 count = PAGE_SIZE - 1;
@@ -1265,6 +1265,72 @@ static const struct file_operations proc_pid_sched_operations = {
1265 1265
1266#endif 1266#endif
1267 1267
1268static ssize_t comm_write(struct file *file, const char __user *buf,
1269 size_t count, loff_t *offset)
1270{
1271 struct inode *inode = file->f_path.dentry->d_inode;
1272 struct task_struct *p;
1273 char buffer[TASK_COMM_LEN];
1274
1275 memset(buffer, 0, sizeof(buffer));
1276 if (count > sizeof(buffer) - 1)
1277 count = sizeof(buffer) - 1;
1278 if (copy_from_user(buffer, buf, count))
1279 return -EFAULT;
1280
1281 p = get_proc_task(inode);
1282 if (!p)
1283 return -ESRCH;
1284
1285 if (same_thread_group(current, p))
1286 set_task_comm(p, buffer);
1287 else
1288 count = -EINVAL;
1289
1290 put_task_struct(p);
1291
1292 return count;
1293}
1294
1295static int comm_show(struct seq_file *m, void *v)
1296{
1297 struct inode *inode = m->private;
1298 struct task_struct *p;
1299
1300 p = get_proc_task(inode);
1301 if (!p)
1302 return -ESRCH;
1303
1304 task_lock(p);
1305 seq_printf(m, "%s\n", p->comm);
1306 task_unlock(p);
1307
1308 put_task_struct(p);
1309
1310 return 0;
1311}
1312
1313static int comm_open(struct inode *inode, struct file *filp)
1314{
1315 int ret;
1316
1317 ret = single_open(filp, comm_show, NULL);
1318 if (!ret) {
1319 struct seq_file *m = filp->private_data;
1320
1321 m->private = inode;
1322 }
1323 return ret;
1324}
1325
1326static const struct file_operations proc_pid_set_comm_operations = {
1327 .open = comm_open,
1328 .read = seq_read,
1329 .write = comm_write,
1330 .llseek = seq_lseek,
1331 .release = single_release,
1332};
1333
1268/* 1334/*
1269 * We added or removed a vma mapping the executable. The vmas are only mapped 1335 * We added or removed a vma mapping the executable. The vmas are only mapped
1270 * during exec and are not mapped with the mmap system call. 1336 * during exec and are not mapped with the mmap system call.
@@ -1353,7 +1419,6 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
1353 goto out; 1419 goto out;
1354 1420
1355 error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); 1421 error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
1356 nd->last_type = LAST_BIND;
1357out: 1422out:
1358 return ERR_PTR(error); 1423 return ERR_PTR(error);
1359} 1424}
@@ -2200,7 +2265,7 @@ static const struct inode_operations proc_attr_dir_inode_operations = {
2200 2265
2201#endif 2266#endif
2202 2267
2203#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) 2268#ifdef CONFIG_ELF_CORE
2204static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, 2269static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
2205 size_t count, loff_t *ppos) 2270 size_t count, loff_t *ppos)
2206{ 2271{
@@ -2304,16 +2369,30 @@ static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
2304{ 2369{
2305 struct pid_namespace *ns = dentry->d_sb->s_fs_info; 2370 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
2306 pid_t tgid = task_tgid_nr_ns(current, ns); 2371 pid_t tgid = task_tgid_nr_ns(current, ns);
2307 char tmp[PROC_NUMBUF]; 2372 char *name = ERR_PTR(-ENOENT);
2308 if (!tgid) 2373 if (tgid) {
2309 return ERR_PTR(-ENOENT); 2374 name = __getname();
2310 sprintf(tmp, "%d", task_tgid_nr_ns(current, ns)); 2375 if (!name)
2311 return ERR_PTR(vfs_follow_link(nd,tmp)); 2376 name = ERR_PTR(-ENOMEM);
2377 else
2378 sprintf(name, "%d", tgid);
2379 }
2380 nd_set_link(nd, name);
2381 return NULL;
2382}
2383
2384static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
2385 void *cookie)
2386{
2387 char *s = nd_get_link(nd);
2388 if (!IS_ERR(s))
2389 __putname(s);
2312} 2390}
2313 2391
2314static const struct inode_operations proc_self_inode_operations = { 2392static const struct inode_operations proc_self_inode_operations = {
2315 .readlink = proc_self_readlink, 2393 .readlink = proc_self_readlink,
2316 .follow_link = proc_self_follow_link, 2394 .follow_link = proc_self_follow_link,
2395 .put_link = proc_self_put_link,
2317}; 2396};
2318 2397
2319/* 2398/*
@@ -2504,6 +2583,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2504#ifdef CONFIG_SCHED_DEBUG 2583#ifdef CONFIG_SCHED_DEBUG
2505 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2584 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2506#endif 2585#endif
2586 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2507#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2587#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2508 INF("syscall", S_IRUSR, proc_pid_syscall), 2588 INF("syscall", S_IRUSR, proc_pid_syscall),
2509#endif 2589#endif
@@ -2556,7 +2636,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2556#ifdef CONFIG_FAULT_INJECTION 2636#ifdef CONFIG_FAULT_INJECTION
2557 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), 2637 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
2558#endif 2638#endif
2559#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) 2639#ifdef CONFIG_ELF_CORE
2560 REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), 2640 REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
2561#endif 2641#endif
2562#ifdef CONFIG_TASK_IO_ACCOUNTING 2642#ifdef CONFIG_TASK_IO_ACCOUNTING
@@ -2838,6 +2918,7 @@ static const struct pid_entry tid_base_stuff[] = {
2838#ifdef CONFIG_SCHED_DEBUG 2918#ifdef CONFIG_SCHED_DEBUG
2839 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2919 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2840#endif 2920#endif
2921 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2841#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2922#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2842 INF("syscall", S_IRUSR, proc_pid_syscall), 2923 INF("syscall", S_IRUSR, proc_pid_syscall),
2843#endif 2924#endif
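The largest base.c addition is a per-task comm file: comm_write() renames a task when the writer is in the same thread group, and comm_show() prints the name through seq_file. Assuming a kernel that carries this patch, the file can be exercised from userspace like so:

#include <stdio.h>

int main(void)
{
    char name[32];
    FILE *f = fopen("/proc/self/comm", "r+");

    if (!f)
        return 1;                  /* pre-patch kernel: no such file */
    fputs("renamed", f);           /* truncated to TASK_COMM_LEN - 1 bytes */
    fflush(f);
    rewind(f);
    if (fgets(name, sizeof(name), f))
        printf("comm is now: %s", name);   /* "renamed" plus newline */
    fclose(f);
    return 0;
}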
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index fa678abc9db1..43c127490606 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -13,6 +13,7 @@
13#include <linux/proc_fs.h> 13#include <linux/proc_fs.h>
14#include <linux/stat.h> 14#include <linux/stat.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h>
16#include <linux/mount.h> 17#include <linux/mount.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/idr.h> 19#include <linux/idr.h>
@@ -291,19 +292,17 @@ static const struct inode_operations proc_file_inode_operations = {
291 * returns the struct proc_dir_entry for "/proc/tty/driver", and 292 * returns the struct proc_dir_entry for "/proc/tty/driver", and
292 * returns "serial" in residual. 293 * returns "serial" in residual.
293 */ 294 */
294static int xlate_proc_name(const char *name, 295static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
295 struct proc_dir_entry **ret, const char **residual) 296 const char **residual)
296{ 297{
297 const char *cp = name, *next; 298 const char *cp = name, *next;
298 struct proc_dir_entry *de; 299 struct proc_dir_entry *de;
299 int len; 300 int len;
300 int rtn = 0;
301 301
302 de = *ret; 302 de = *ret;
303 if (!de) 303 if (!de)
304 de = &proc_root; 304 de = &proc_root;
305 305
306 spin_lock(&proc_subdir_lock);
307 while (1) { 306 while (1) {
308 next = strchr(cp, '/'); 307 next = strchr(cp, '/');
309 if (!next) 308 if (!next)
@@ -315,16 +314,25 @@ static int xlate_proc_name(const char *name,
315 break; 314 break;
316 } 315 }
317 if (!de) { 316 if (!de) {
318 rtn = -ENOENT; 317 WARN(1, "name '%s'\n", name);
319 goto out; 318 return -ENOENT;
320 } 319 }
321 cp += len + 1; 320 cp += len + 1;
322 } 321 }
323 *residual = cp; 322 *residual = cp;
324 *ret = de; 323 *ret = de;
325out: 324 return 0;
325}
326
327static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
328 const char **residual)
329{
330 int rv;
331
332 spin_lock(&proc_subdir_lock);
333 rv = __xlate_proc_name(name, ret, residual);
326 spin_unlock(&proc_subdir_lock); 334 spin_unlock(&proc_subdir_lock);
327 return rtn; 335 return rv;
328} 336}
329 337
330static DEFINE_IDA(proc_inum_ida); 338static DEFINE_IDA(proc_inum_ida);
@@ -429,7 +437,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
429 unsigned int ino; 437 unsigned int ino;
430 438
431 ino = de->low_ino; 439 ino = de->low_ino;
432 de_get(de); 440 pde_get(de);
433 spin_unlock(&proc_subdir_lock); 441 spin_unlock(&proc_subdir_lock);
434 error = -EINVAL; 442 error = -EINVAL;
435 inode = proc_get_inode(dir->i_sb, ino, de); 443 inode = proc_get_inode(dir->i_sb, ino, de);
@@ -445,7 +453,7 @@ out_unlock:
445 return NULL; 453 return NULL;
446 } 454 }
447 if (de) 455 if (de)
448 de_put(de); 456 pde_put(de);
449 return ERR_PTR(error); 457 return ERR_PTR(error);
450} 458}
451 459
@@ -509,17 +517,17 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
509 struct proc_dir_entry *next; 517 struct proc_dir_entry *next;
510 518
511 /* filldir passes info to user space */ 519 /* filldir passes info to user space */
512 de_get(de); 520 pde_get(de);
513 spin_unlock(&proc_subdir_lock); 521 spin_unlock(&proc_subdir_lock);
514 if (filldir(dirent, de->name, de->namelen, filp->f_pos, 522 if (filldir(dirent, de->name, de->namelen, filp->f_pos,
515 de->low_ino, de->mode >> 12) < 0) { 523 de->low_ino, de->mode >> 12) < 0) {
516 de_put(de); 524 pde_put(de);
517 goto out; 525 goto out;
518 } 526 }
519 spin_lock(&proc_subdir_lock); 527 spin_lock(&proc_subdir_lock);
520 filp->f_pos++; 528 filp->f_pos++;
521 next = de->next; 529 next = de->next;
522 de_put(de); 530 pde_put(de);
523 de = next; 531 de = next;
524 } while (de); 532 } while (de);
525 spin_unlock(&proc_subdir_lock); 533 spin_unlock(&proc_subdir_lock);
@@ -662,6 +670,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
662 } 670 }
663 return ent; 671 return ent;
664} 672}
673EXPORT_SYMBOL(proc_symlink);
665 674
666struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, 675struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
667 struct proc_dir_entry *parent) 676 struct proc_dir_entry *parent)
@@ -700,6 +709,7 @@ struct proc_dir_entry *proc_mkdir(const char *name,
700{ 709{
701 return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent); 710 return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent);
702} 711}
712EXPORT_SYMBOL(proc_mkdir);
703 713
704struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, 714struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
705 struct proc_dir_entry *parent) 715 struct proc_dir_entry *parent)
@@ -728,6 +738,7 @@ struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
728 } 738 }
729 return ent; 739 return ent;
730} 740}
741EXPORT_SYMBOL(create_proc_entry);
731 742
732struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, 743struct proc_dir_entry *proc_create_data(const char *name, mode_t mode,
733 struct proc_dir_entry *parent, 744 struct proc_dir_entry *parent,
@@ -762,8 +773,9 @@ out_free:
762out: 773out:
763 return NULL; 774 return NULL;
764} 775}
776EXPORT_SYMBOL(proc_create_data);
765 777
766void free_proc_entry(struct proc_dir_entry *de) 778static void free_proc_entry(struct proc_dir_entry *de)
767{ 779{
768 unsigned int ino = de->low_ino; 780 unsigned int ino = de->low_ino;
769 781
@@ -777,6 +789,12 @@ void free_proc_entry(struct proc_dir_entry *de)
777 kfree(de); 789 kfree(de);
778} 790}
779 791
792void pde_put(struct proc_dir_entry *pde)
793{
794 if (atomic_dec_and_test(&pde->count))
795 free_proc_entry(pde);
796}
797
780/* 798/*
781 * Remove a /proc entry and free it if it's not currently in use. 799 * Remove a /proc entry and free it if it's not currently in use.
782 */ 800 */
@@ -787,11 +805,13 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
787 const char *fn = name; 805 const char *fn = name;
788 int len; 806 int len;
789 807
790 if (xlate_proc_name(name, &parent, &fn) != 0) 808 spin_lock(&proc_subdir_lock);
809 if (__xlate_proc_name(name, &parent, &fn) != 0) {
810 spin_unlock(&proc_subdir_lock);
791 return; 811 return;
812 }
792 len = strlen(fn); 813 len = strlen(fn);
793 814
794 spin_lock(&proc_subdir_lock);
795 for (p = &parent->subdir; *p; p=&(*p)->next ) { 815 for (p = &parent->subdir; *p; p=&(*p)->next ) {
796 if (proc_match(len, fn, *p)) { 816 if (proc_match(len, fn, *p)) {
797 de = *p; 817 de = *p;
@@ -801,8 +821,10 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
801 } 821 }
802 } 822 }
803 spin_unlock(&proc_subdir_lock); 823 spin_unlock(&proc_subdir_lock);
804 if (!de) 824 if (!de) {
825 WARN(1, "name '%s'\n", name);
805 return; 826 return;
827 }
806 828
807 spin_lock(&de->pde_unload_lock); 829 spin_lock(&de->pde_unload_lock);
808 /* 830 /*
@@ -845,6 +867,6 @@ continue_removing:
845 WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory " 867 WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory "
846 "'%s/%s', leaking at least '%s'\n", __func__, 868 "'%s/%s', leaking at least '%s'\n", __func__,
847 de->parent->name, de->name, de->subdir->name); 869 de->parent->name, de->name, de->subdir->name);
848 if (atomic_dec_and_test(&de->count)) 870 pde_put(de);
849 free_proc_entry(de);
850} 871}
872EXPORT_SYMBOL(remove_proc_entry);
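generic.c folds de_get()/de_put() into pde_get()/pde_put(), moves the EXPORT_SYMBOLs next to their definitions, and hoists proc_subdir_lock out of the name translation so remove_proc_entry() holds it across both lookup and unlink. The put side is the classic free-on-last-reference pattern; a portable C11 rendering of it, not the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct pde {
    atomic_int count;
    const char *name;
};

static struct pde *pde_get(struct pde *p)
{
    atomic_fetch_add(&p->count, 1);
    return p;
}

static void pde_put(struct pde *p)
{
    /* the thread that drops the last reference frees, exactly once */
    if (atomic_fetch_sub(&p->count, 1) == 1) {
        printf("freeing %s\n", p->name);
        free(p);
    }
}

int main(void)
{
    struct pde *p = malloc(sizeof(*p));

    if (!p)
        return 1;
    atomic_init(&p->count, 1);             /* creator's reference */
    p->name = "demo";
    pde_get(p);                            /* e.g. held across a lock drop */
    pde_put(p);                            /* that user is done */
    pde_put(p);                            /* creator is done: freed */
    return 0;
}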
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d78ade305541..d35b23238fb1 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -18,35 +18,13 @@
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/smp_lock.h> 19#include <linux/smp_lock.h>
20#include <linux/sysctl.h> 20#include <linux/sysctl.h>
21#include <linux/slab.h>
21 22
22#include <asm/system.h> 23#include <asm/system.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
24 25
25#include "internal.h" 26#include "internal.h"
26 27
27struct proc_dir_entry *de_get(struct proc_dir_entry *de)
28{
29 atomic_inc(&de->count);
30 return de;
31}
32
33/*
34 * Decrements the use count and checks for deferred deletion.
35 */
36void de_put(struct proc_dir_entry *de)
37{
38 if (!atomic_read(&de->count)) {
39 printk("de_put: entry %s already free!\n", de->name);
40 return;
41 }
42
43 if (atomic_dec_and_test(&de->count))
44 free_proc_entry(de);
45}
46
47/*
48 * Decrement the use count of the proc_dir_entry.
49 */
50static void proc_delete_inode(struct inode *inode) 28static void proc_delete_inode(struct inode *inode)
51{ 29{
52 struct proc_dir_entry *de; 30 struct proc_dir_entry *de;
@@ -59,7 +37,7 @@ static void proc_delete_inode(struct inode *inode)
59 /* Let go of any associated proc directory entry */ 37 /* Let go of any associated proc directory entry */
60 de = PROC_I(inode)->pde; 38 de = PROC_I(inode)->pde;
61 if (de) 39 if (de)
62 de_put(de); 40 pde_put(de);
63 if (PROC_I(inode)->sysctl) 41 if (PROC_I(inode)->sysctl)
64 sysctl_head_put(PROC_I(inode)->sysctl); 42 sysctl_head_put(PROC_I(inode)->sysctl);
65 clear_inode(inode); 43 clear_inode(inode);
@@ -480,7 +458,7 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
480 } 458 }
481 unlock_new_inode(inode); 459 unlock_new_inode(inode);
482 } else 460 } else
483 de_put(de); 461 pde_put(de);
484 return inode; 462 return inode;
485} 463}
486 464
@@ -495,7 +473,7 @@ int proc_fill_super(struct super_block *s)
495 s->s_op = &proc_sops; 473 s->s_op = &proc_sops;
496 s->s_time_gran = 1; 474 s->s_time_gran = 1;
497 475
498 de_get(&proc_root); 476 pde_get(&proc_root);
499 root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root); 477 root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root);
500 if (!root_inode) 478 if (!root_inode)
501 goto out_no_root; 479 goto out_no_root;
@@ -509,6 +487,6 @@ int proc_fill_super(struct super_block *s)
509out_no_root: 487out_no_root:
510 printk("proc_read_super: get root inode failed\n"); 488 printk("proc_read_super: get root inode failed\n");
511 iput(root_inode); 489 iput(root_inode);
512 de_put(&proc_root); 490 pde_put(&proc_root);
513 return -ENOMEM; 491 return -ENOMEM;
514} 492}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 753ca37002c8..1f24a3eddd12 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -61,8 +61,6 @@ extern const struct file_operations proc_pagemap_operations;
61extern const struct file_operations proc_net_operations; 61extern const struct file_operations proc_net_operations;
62extern const struct inode_operations proc_net_inode_operations; 62extern const struct inode_operations proc_net_inode_operations;
63 63
64void free_proc_entry(struct proc_dir_entry *de);
65
66void proc_init_inodecache(void); 64void proc_init_inodecache(void);
67 65
68static inline struct pid *proc_pid(struct inode *inode) 66static inline struct pid *proc_pid(struct inode *inode)
@@ -101,8 +99,12 @@ unsigned long task_vsize(struct mm_struct *);
101int task_statm(struct mm_struct *, int *, int *, int *, int *); 99int task_statm(struct mm_struct *, int *, int *, int *, int *);
102void task_mem(struct seq_file *, struct mm_struct *); 100void task_mem(struct seq_file *, struct mm_struct *);
103 101
104struct proc_dir_entry *de_get(struct proc_dir_entry *de); 102static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
105void de_put(struct proc_dir_entry *de); 103{
104 atomic_inc(&pde->count);
105 return pde;
106}
107void pde_put(struct proc_dir_entry *pde);
106 108
107extern struct vfsmount *proc_mnt; 109extern struct vfsmount *proc_mnt;
108int proc_fill_super(struct super_block *); 110int proc_fill_super(struct super_block *);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a44a7897fd4d..19979a2ce272 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -19,6 +19,7 @@
19#include <linux/highmem.h> 19#include <linux/highmem.h>
20#include <linux/bootmem.h> 20#include <linux/bootmem.h>
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/slab.h>
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23#include <asm/io.h> 24#include <asm/io.h>
24#include <linux/list.h> 25#include <linux/list.h>
@@ -490,7 +491,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
490 } 491 }
491 read_unlock(&kclist_lock); 492 read_unlock(&kclist_lock);
492 493
493 if (m == NULL) { 494 if (&m->list == &kclist_head) {
494 if (clear_user(buffer, tsz)) 495 if (clear_user(buffer, tsz))
495 return -EFAULT; 496 return -EFAULT;
496 } else if (is_vmalloc_or_module_addr((void *)start)) { 497 } else if (is_vmalloc_or_module_addr((void *)start)) {
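The kcore.c one-liner fixes a common list pitfall: when list_for_each_entry() runs off the end, the cursor is not NULL, it points at the container of the list head, so the old "m == NULL" test never fired after a full traversal. A self-contained demonstration with a minimal list implementation:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct kc_entry { long start; struct list_head list; };

int main(void)
{
    struct list_head head = { &head, &head };
    struct kc_entry a = { .start = 1 };
    struct kc_entry *m;

    /* insert a after head */
    a.list.next = head.next;
    a.list.prev = &head;
    head.next->prev = &a.list;
    head.next = &a.list;

    /* open-coded list_for_each_entry(m, &head, list), searching for 42 */
    for (m = container_of(head.next, struct kc_entry, list);
         &m->list != &head;
         m = container_of(m->list.next, struct kc_entry, list))
        if (m->start == 42)
            break;

    if (m == NULL)
        printf("never reached: the cursor is not NULL\n");
    if (&m->list == &head)
        printf("correct 'not found' test, as in the fix\n");
    return 0;
}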
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index 7ca78346d3f0..cfe90a48a6e8 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -12,37 +12,37 @@
12#include <linux/poll.h> 12#include <linux/poll.h>
13#include <linux/proc_fs.h> 13#include <linux/proc_fs.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/syslog.h>
15 16
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17#include <asm/io.h> 18#include <asm/io.h>
18 19
19extern wait_queue_head_t log_wait; 20extern wait_queue_head_t log_wait;
20 21
21extern int do_syslog(int type, char __user *bug, int count);
22
23static int kmsg_open(struct inode * inode, struct file * file) 22static int kmsg_open(struct inode * inode, struct file * file)
24{ 23{
25 return do_syslog(1,NULL,0); 24 return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE);
26} 25}
27 26
28static int kmsg_release(struct inode * inode, struct file * file) 27static int kmsg_release(struct inode * inode, struct file * file)
29{ 28{
30 (void) do_syslog(0,NULL,0); 29 (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE);
31 return 0; 30 return 0;
32} 31}
33 32
34static ssize_t kmsg_read(struct file *file, char __user *buf, 33static ssize_t kmsg_read(struct file *file, char __user *buf,
35 size_t count, loff_t *ppos) 34 size_t count, loff_t *ppos)
36{ 35{
37 if ((file->f_flags & O_NONBLOCK) && !do_syslog(9, NULL, 0)) 36 if ((file->f_flags & O_NONBLOCK) &&
37 !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
38 return -EAGAIN; 38 return -EAGAIN;
39 return do_syslog(2, buf, count); 39 return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE);
40} 40}
41 41
42static unsigned int kmsg_poll(struct file *file, poll_table *wait) 42static unsigned int kmsg_poll(struct file *file, poll_table *wait)
43{ 43{
44 poll_wait(file, &log_wait, wait); 44 poll_wait(file, &log_wait, wait);
45 if (do_syslog(9, NULL, 0)) 45 if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
46 return POLLIN | POLLRDNORM; 46 return POLLIN | POLLRDNORM;
47 return 0; 47 return 0;
48} 48}
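kmsg.c drops the bare do_syslog() numbers for named actions from the new <linux/syslog.h>, plus an explicit SYSLOG_FROM_FILE origin. The mapping below is read straight off this hunk (a sketch; consult the header for the full set):

#include <stdio.h>

enum syslog_action {
    SYSLOG_ACTION_CLOSE       = 0,   /* was do_syslog(0, ...) */
    SYSLOG_ACTION_OPEN        = 1,   /* was do_syslog(1, ...) */
    SYSLOG_ACTION_READ        = 2,   /* was do_syslog(2, ...) */
    SYSLOG_ACTION_SIZE_UNREAD = 9,   /* was do_syslog(9, ...) */
};

int main(void)
{
    printf("open=%d read=%d unread=%d close=%d\n",
           SYSLOG_ACTION_OPEN, SYSLOG_ACTION_READ,
           SYSLOG_ACTION_SIZE_UNREAD, SYSLOG_ACTION_CLOSE);
    return 0;
}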
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 9fe7d7ebe115..b1822dde55c2 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -21,7 +21,6 @@
21#include <linux/mmzone.h> 21#include <linux/mmzone.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h>
25#include <linux/smp.h> 24#include <linux/smp.h>
26#include <linux/seq_file.h> 25#include <linux/seq_file.h>
27#include <linux/hugetlb.h> 26#include <linux/hugetlb.h>
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 5033ce0d254b..180cf5a0bd67 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -8,6 +8,7 @@
8#include <linux/proc_fs.h> 8#include <linux/proc_fs.h>
9#include <linux/seq_file.h> 9#include <linux/seq_file.h>
10#include <linux/hugetlb.h> 10#include <linux/hugetlb.h>
11#include <linux/kernel-page-flags.h>
11#include <asm/uaccess.h> 12#include <asm/uaccess.h>
12#include "internal.h" 13#include "internal.h"
13 14
@@ -71,52 +72,12 @@ static const struct file_operations proc_kpagecount_operations = {
71 * physical page flags. 72 * physical page flags.
72 */ 73 */
73 74
74/* These macros are used to decouple internal flags from exported ones */
75
76#define KPF_LOCKED 0
77#define KPF_ERROR 1
78#define KPF_REFERENCED 2
79#define KPF_UPTODATE 3
80#define KPF_DIRTY 4
81#define KPF_LRU 5
82#define KPF_ACTIVE 6
83#define KPF_SLAB 7
84#define KPF_WRITEBACK 8
85#define KPF_RECLAIM 9
86#define KPF_BUDDY 10
87
88/* 11-20: new additions in 2.6.31 */
89#define KPF_MMAP 11
90#define KPF_ANON 12
91#define KPF_SWAPCACHE 13
92#define KPF_SWAPBACKED 14
93#define KPF_COMPOUND_HEAD 15
94#define KPF_COMPOUND_TAIL 16
95#define KPF_HUGE 17
96#define KPF_UNEVICTABLE 18
97#define KPF_HWPOISON 19
98#define KPF_NOPAGE 20
99
100#define KPF_KSM 21
101
102/* kernel hacking assistances
103 * WARNING: subject to change, never rely on them!
104 */
105#define KPF_RESERVED 32
106#define KPF_MLOCKED 33
107#define KPF_MAPPEDTODISK 34
108#define KPF_PRIVATE 35
109#define KPF_PRIVATE_2 36
110#define KPF_OWNER_PRIVATE 37
111#define KPF_ARCH 38
112#define KPF_UNCACHED 39
113
114static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) 75static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
115{ 76{
116 return ((kflags >> kbit) & 1) << ubit; 77 return ((kflags >> kbit) & 1) << ubit;
117} 78}
118 79
119static u64 get_uflags(struct page *page) 80u64 stable_page_flags(struct page *page)
120{ 81{
121 u64 k; 82 u64 k;
122 u64 u; 83 u64 u;
@@ -219,7 +180,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
219 else 180 else
220 ppage = NULL; 181 ppage = NULL;
221 182
222 if (put_user(get_uflags(ppage), out)) { 183 if (put_user(stable_page_flags(ppage), out)) {
223 ret = -EFAULT; 184 ret = -EFAULT;
224 break; 185 break;
225 } 186 }
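page.c exports the KPF_* numbering through <linux/kernel-page-flags.h> and renames get_uflags() to stable_page_flags(), keeping kpf_copy_bit() as the translator between kernel-internal bit positions and the stable userspace ABI. The helper in isolation, with illustrative bit numbers:

#include <stdint.h>
#include <stdio.h>

static inline uint64_t kpf_copy_bit(uint64_t kflags, int ubit, int kbit)
{
    return ((kflags >> kbit) & 1) << ubit;
}

int main(void)
{
    /* pretend the in-kernel "locked" bit sits at 12 while the exported
     * KPF_LOCKED position is 0; both numbers are illustrative */
    uint64_t kflags = 1ULL << 12;
    uint64_t u = kpf_copy_bit(kflags, 0, 12);

    printf("exported flags: %#llx\n", (unsigned long long)u);  /* 0x1 */
    return 0;
}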
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index 7ba79a54948c..ce94801f48ca 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -7,44 +7,50 @@
7#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/time.h> 8#include <linux/time.h>
9#include <linux/proc_fs.h> 9#include <linux/proc_fs.h>
10#include <linux/seq_file.h>
10#include <linux/stat.h> 11#include <linux/stat.h>
11#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/of.h>
14#include <linux/module.h>
15#include <linux/slab.h>
12#include <asm/prom.h> 16#include <asm/prom.h>
13#include <asm/uaccess.h> 17#include <asm/uaccess.h>
14#include "internal.h" 18#include "internal.h"
15 19
16#ifndef HAVE_ARCH_DEVTREE_FIXUPS
17static inline void set_node_proc_entry(struct device_node *np, 20static inline void set_node_proc_entry(struct device_node *np,
18 struct proc_dir_entry *de) 21 struct proc_dir_entry *de)
19{ 22{
20} 23#ifdef HAVE_ARCH_DEVTREE_FIXUPS
24 np->pde = de;
21#endif 25#endif
26}
22 27
23static struct proc_dir_entry *proc_device_tree; 28static struct proc_dir_entry *proc_device_tree;
24 29
25/* 30/*
26 * Supply data on a read from /proc/device-tree/node/property. 31 * Supply data on a read from /proc/device-tree/node/property.
27 */ 32 */
28static int property_read_proc(char *page, char **start, off_t off, 33static int property_proc_show(struct seq_file *m, void *v)
29 int count, int *eof, void *data)
30{ 34{
31 struct property *pp = data; 35 struct property *pp = m->private;
32 int n;
33 36
34 if (off >= pp->length) { 37 seq_write(m, pp->value, pp->length);
35 *eof = 1; 38 return 0;
36 return 0;
37 }
38 n = pp->length - off;
39 if (n > count)
40 n = count;
41 else
42 *eof = 1;
43 memcpy(page, (char *)pp->value + off, n);
44 *start = page;
45 return n;
46} 39}
47 40
41static int property_proc_open(struct inode *inode, struct file *file)
42{
43 return single_open(file, property_proc_show, PDE(inode)->data);
44}
45
46static const struct file_operations property_proc_fops = {
47 .owner = THIS_MODULE,
48 .open = property_proc_open,
49 .read = seq_read,
50 .llseek = seq_lseek,
51 .release = single_release,
52};
53
48/* 54/*
49 * For a node with a name like "gc@10", we make symlinks called "gc" 55 * For a node with a name like "gc@10", we make symlinks called "gc"
50 * and "@10" to it. 56 * and "@10" to it.
@@ -63,10 +69,9 @@ __proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp,
63 * Unfortunately proc_register puts each new entry 69 * Unfortunately proc_register puts each new entry
64 * at the beginning of the list. So we rearrange them. 70 * at the beginning of the list. So we rearrange them.
65 */ 71 */
66 ent = create_proc_read_entry(name, 72 ent = proc_create_data(name,
67 strncmp(name, "security-", 9) 73 strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR,
68 ? S_IRUGO : S_IRUSR, de, 74 de, &property_proc_fops, pp);
69 property_read_proc, pp);
70 if (ent == NULL) 75 if (ent == NULL)
71 return NULL; 76 return NULL;
72 77
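proc_devtree.c converts the property files from the legacy read_proc interface to seq_file: property_proc_show() just seq_write()s the whole value and the seq_file core handles offsets and EOF, which is exactly the bookkeeping the removed code did by hand. A userspace rendering of that removed contract, reconstructed from the deleted lines, shows what the conversion saves:

#include <stdio.h>
#include <string.h>

/* old-style contract: copy at most count bytes starting at off */
static int property_read_proc(char *page, long off, int count,
                              int *eof, const char *value, int length)
{
    int n;

    if (off >= length) {
        *eof = 1;
        return 0;
    }
    n = length - off;
    if (n > count)
        n = count;
    else
        *eof = 1;
    memcpy(page, value + off, n);
    return n;
}

int main(void)
{
    const char value[] = "board-model-0";   /* DT props are NUL-terminated */
    char page[4];
    int eof = 0;
    long off = 0;
    int n;

    while (!eof && (n = property_read_proc(page, off, sizeof(page),
                                           &eof, value, sizeof(value))) > 0) {
        fwrite(page, 1, n, stdout);
        off += n;
    }
    putchar('\n');
    return 0;
}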
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 04d1270f1c38..9020ac15baaa 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -14,6 +14,7 @@
14#include <linux/time.h> 14#include <linux/time.h>
15#include <linux/proc_fs.h> 15#include <linux/proc_fs.h>
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/slab.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/module.h> 20#include <linux/module.h>
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index f667e8aeabdf..6ff9981f0a18 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -48,7 +48,7 @@ out:
48static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name) 48static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name)
49{ 49{
50 int len; 50 int len;
51 for ( ; p->ctl_name || p->procname; p++) { 51 for ( ; p->procname; p++) {
52 52
53 if (!p->procname) 53 if (!p->procname)
54 continue; 54 continue;
@@ -218,7 +218,7 @@ static int scan(struct ctl_table_header *head, ctl_table *table,
218 void *dirent, filldir_t filldir) 218 void *dirent, filldir_t filldir)
219{ 219{
220 220
221 for (; table->ctl_name || table->procname; table++, (*pos)++) { 221 for (; table->procname; table++, (*pos)++) {
222 int res; 222 int res;
223 223
224 /* Can't do anything without a proc name */ 224 /* Can't do anything without a proc name */
diff --git a/fs/proc/root.c b/fs/proc/root.c
index b080b791d9e3..757c069f2a65 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -220,9 +220,3 @@ void pid_ns_release_proc(struct pid_namespace *ns)
220{ 220{
221 mntput(ns->proc_mnt); 221 mntput(ns->proc_mnt);
222} 222}
223
224EXPORT_SYMBOL(proc_symlink);
225EXPORT_SYMBOL(proc_mkdir);
226EXPORT_SYMBOL(create_proc_entry);
227EXPORT_SYMBOL(proc_create_data);
228EXPORT_SYMBOL(remove_proc_entry);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 7cc726c6d70a..bf31b03fc275 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -1,6 +1,5 @@
1#include <linux/cpumask.h> 1#include <linux/cpumask.h>
2#include <linux/fs.h> 2#include <linux/fs.h>
3#include <linux/gfp.h>
4#include <linux/init.h> 3#include <linux/init.h>
5#include <linux/interrupt.h> 4#include <linux/interrupt.h>
6#include <linux/kernel_stat.h> 5#include <linux/kernel_stat.h>
@@ -27,7 +26,7 @@ static int show_stat(struct seq_file *p, void *v)
27 int i, j; 26 int i, j;
28 unsigned long jif; 27 unsigned long jif;
29 cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; 28 cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
30 cputime64_t guest; 29 cputime64_t guest, guest_nice;
31 u64 sum = 0; 30 u64 sum = 0;
32 u64 sum_softirq = 0; 31 u64 sum_softirq = 0;
33 unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; 32 unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
@@ -36,7 +35,7 @@ static int show_stat(struct seq_file *p, void *v)
36 35
37 user = nice = system = idle = iowait = 36 user = nice = system = idle = iowait =
38 irq = softirq = steal = cputime64_zero; 37 irq = softirq = steal = cputime64_zero;
39 guest = cputime64_zero; 38 guest = guest_nice = cputime64_zero;
40 getboottime(&boottime); 39 getboottime(&boottime);
41 jif = boottime.tv_sec; 40 jif = boottime.tv_sec;
42 41
@@ -51,6 +50,8 @@ static int show_stat(struct seq_file *p, void *v)
51 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); 50 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
52 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); 51 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
53 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); 52 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
53 guest_nice = cputime64_add(guest_nice,
54 kstat_cpu(i).cpustat.guest_nice);
54 for_each_irq_nr(j) { 55 for_each_irq_nr(j) {
55 sum += kstat_irqs_cpu(j, i); 56 sum += kstat_irqs_cpu(j, i);
56 } 57 }
@@ -65,7 +66,8 @@ static int show_stat(struct seq_file *p, void *v)
65 } 66 }
66 sum += arch_irq_stat(); 67 sum += arch_irq_stat();
67 68
68 seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", 69 seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu "
70 "%llu\n",
69 (unsigned long long)cputime64_to_clock_t(user), 71 (unsigned long long)cputime64_to_clock_t(user),
70 (unsigned long long)cputime64_to_clock_t(nice), 72 (unsigned long long)cputime64_to_clock_t(nice),
71 (unsigned long long)cputime64_to_clock_t(system), 73 (unsigned long long)cputime64_to_clock_t(system),
@@ -74,7 +76,8 @@ static int show_stat(struct seq_file *p, void *v)
74 (unsigned long long)cputime64_to_clock_t(irq), 76 (unsigned long long)cputime64_to_clock_t(irq),
75 (unsigned long long)cputime64_to_clock_t(softirq), 77 (unsigned long long)cputime64_to_clock_t(softirq),
76 (unsigned long long)cputime64_to_clock_t(steal), 78 (unsigned long long)cputime64_to_clock_t(steal),
77 (unsigned long long)cputime64_to_clock_t(guest)); 79 (unsigned long long)cputime64_to_clock_t(guest),
80 (unsigned long long)cputime64_to_clock_t(guest_nice));
78 for_each_online_cpu(i) { 81 for_each_online_cpu(i) {
79 82
80 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ 83 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
@@ -88,8 +91,10 @@ static int show_stat(struct seq_file *p, void *v)
88 softirq = kstat_cpu(i).cpustat.softirq; 91 softirq = kstat_cpu(i).cpustat.softirq;
89 steal = kstat_cpu(i).cpustat.steal; 92 steal = kstat_cpu(i).cpustat.steal;
90 guest = kstat_cpu(i).cpustat.guest; 93 guest = kstat_cpu(i).cpustat.guest;
94 guest_nice = kstat_cpu(i).cpustat.guest_nice;
91 seq_printf(p, 95 seq_printf(p,
92 "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", 96 "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
97 "%llu\n",
93 i, 98 i,
94 (unsigned long long)cputime64_to_clock_t(user), 99 (unsigned long long)cputime64_to_clock_t(user),
95 (unsigned long long)cputime64_to_clock_t(nice), 100 (unsigned long long)cputime64_to_clock_t(nice),
@@ -99,7 +104,8 @@ static int show_stat(struct seq_file *p, void *v)
99 (unsigned long long)cputime64_to_clock_t(irq), 104 (unsigned long long)cputime64_to_clock_t(irq),
100 (unsigned long long)cputime64_to_clock_t(softirq), 105 (unsigned long long)cputime64_to_clock_t(softirq),
101 (unsigned long long)cputime64_to_clock_t(steal), 106 (unsigned long long)cputime64_to_clock_t(steal),
102 (unsigned long long)cputime64_to_clock_t(guest)); 107 (unsigned long long)cputime64_to_clock_t(guest),
108 (unsigned long long)cputime64_to_clock_t(guest_nice));
103 } 109 }
104 seq_printf(p, "intr %llu", (unsigned long long)sum); 110 seq_printf(p, "intr %llu", (unsigned long long)sum);
105 111
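
The hunks above extend both the aggregate "cpu" line and the per-cpu lines of /proc/stat with a tenth column, guest_nice. A minimal userspace sketch (not part of the patch) of reading it, assuming the field order shown in the patched seq_printf; on kernels without this change fscanf matches only nine fields:

#include <stdio.h>

int main(void)
{
	unsigned long long user, nice, system, idle, iowait,
			   irq, softirq, steal, guest, guest_nice;
	FILE *f = fopen("/proc/stat", "r");

	if (!f)
		return 1;
	/* the aggregate "cpu" line comes first in /proc/stat */
	if (fscanf(f, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
		   &user, &nice, &system, &idle, &iowait,
		   &irq, &softirq, &steal, &guest, &guest_nice) == 10)
		printf("guest=%llu guest_nice=%llu\n", guest, guest_nice);
	fclose(f);
	return 0;
}
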
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 2a1bef9203c6..caf0337dff73 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -4,6 +4,7 @@
4#include <linux/seq_file.h> 4#include <linux/seq_file.h>
5#include <linux/highmem.h> 5#include <linux/highmem.h>
6#include <linux/ptrace.h> 6#include <linux/ptrace.h>
7#include <linux/slab.h>
7#include <linux/pagemap.h> 8#include <linux/pagemap.h>
8#include <linux/mempolicy.h> 9#include <linux/mempolicy.h>
9#include <linux/swap.h> 10#include <linux/swap.h>
@@ -16,7 +17,7 @@
16 17
17void task_mem(struct seq_file *m, struct mm_struct *mm) 18void task_mem(struct seq_file *m, struct mm_struct *mm)
18{ 19{
19 unsigned long data, text, lib; 20 unsigned long data, text, lib, swap;
20 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; 21 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
21 22
22 /* 23 /*
@@ -36,6 +37,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
36 data = mm->total_vm - mm->shared_vm - mm->stack_vm; 37 data = mm->total_vm - mm->shared_vm - mm->stack_vm;
37 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; 38 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
38 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; 39 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
40 swap = get_mm_counter(mm, MM_SWAPENTS);
39 seq_printf(m, 41 seq_printf(m,
40 "VmPeak:\t%8lu kB\n" 42 "VmPeak:\t%8lu kB\n"
41 "VmSize:\t%8lu kB\n" 43 "VmSize:\t%8lu kB\n"
@@ -46,7 +48,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
46 "VmStk:\t%8lu kB\n" 48 "VmStk:\t%8lu kB\n"
47 "VmExe:\t%8lu kB\n" 49 "VmExe:\t%8lu kB\n"
48 "VmLib:\t%8lu kB\n" 50 "VmLib:\t%8lu kB\n"
49 "VmPTE:\t%8lu kB\n", 51 "VmPTE:\t%8lu kB\n"
52 "VmSwap:\t%8lu kB\n",
50 hiwater_vm << (PAGE_SHIFT-10), 53 hiwater_vm << (PAGE_SHIFT-10),
51 (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), 54 (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
52 mm->locked_vm << (PAGE_SHIFT-10), 55 mm->locked_vm << (PAGE_SHIFT-10),
@@ -54,7 +57,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
54 total_rss << (PAGE_SHIFT-10), 57 total_rss << (PAGE_SHIFT-10),
55 data << (PAGE_SHIFT-10), 58 data << (PAGE_SHIFT-10),
56 mm->stack_vm << (PAGE_SHIFT-10), text, lib, 59 mm->stack_vm << (PAGE_SHIFT-10), text, lib,
57 (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); 60 (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
61 swap << (PAGE_SHIFT-10));
58} 62}
59 63
60unsigned long task_vsize(struct mm_struct *mm) 64unsigned long task_vsize(struct mm_struct *mm)
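
The task_mem() hunk above adds a VmSwap line to /proc/pid/status, fed by the new MM_SWAPENTS counter. A minimal sketch (not part of the patch) of reading it back from userspace:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "VmSwap:", 7))
			fputs(line, stdout);	/* e.g. "VmSwap:       0 kB" */
	fclose(f);
	return 0;
}
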
@@ -65,11 +69,11 @@ unsigned long task_vsize(struct mm_struct *mm)
65int task_statm(struct mm_struct *mm, int *shared, int *text, 69int task_statm(struct mm_struct *mm, int *shared, int *text,
66 int *data, int *resident) 70 int *data, int *resident)
67{ 71{
68 *shared = get_mm_counter(mm, file_rss); 72 *shared = get_mm_counter(mm, MM_FILEPAGES);
69 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) 73 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
70 >> PAGE_SHIFT; 74 >> PAGE_SHIFT;
71 *data = mm->total_vm - mm->shared_vm; 75 *data = mm->total_vm - mm->shared_vm;
72 *resident = *shared + get_mm_counter(mm, anon_rss); 76 *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
73 return mm->total_vm; 77 return mm->total_vm;
74} 78}
75 79
@@ -361,12 +365,11 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
361 if (!pte_present(ptent)) 365 if (!pte_present(ptent))
362 continue; 366 continue;
363 367
364 mss->resident += PAGE_SIZE;
365
366 page = vm_normal_page(vma, addr, ptent); 368 page = vm_normal_page(vma, addr, ptent);
367 if (!page) 369 if (!page)
368 continue; 370 continue;
369 371
372 mss->resident += PAGE_SIZE;
370 /* Accumulate the size in pages that have been accessed. */ 373 /* Accumulate the size in pages that have been accessed. */
371 if (pte_young(ptent) || PageReferenced(page)) 374 if (pte_young(ptent) || PageReferenced(page))
372 mss->referenced += PAGE_SIZE; 375 mss->referenced += PAGE_SIZE;
@@ -404,6 +407,7 @@ static int show_smap(struct seq_file *m, void *v)
404 407
405 memset(&mss, 0, sizeof mss); 408 memset(&mss, 0, sizeof mss);
406 mss.vma = vma; 409 mss.vma = vma;
410 /* mmap_sem is held in m_start */
407 if (vma->vm_mm && !is_vm_hugetlb_page(vma)) 411 if (vma->vm_mm && !is_vm_hugetlb_page(vma))
408 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); 412 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
409 413
@@ -550,7 +554,8 @@ const struct file_operations proc_clear_refs_operations = {
550}; 554};
551 555
552struct pagemapread { 556struct pagemapread {
553 u64 __user *out, *end; 557 int pos, len;
558 u64 *buffer;
554}; 559};
555 560
556#define PM_ENTRY_BYTES sizeof(u64) 561#define PM_ENTRY_BYTES sizeof(u64)
@@ -573,10 +578,8 @@ struct pagemapread {
573static int add_to_pagemap(unsigned long addr, u64 pfn, 578static int add_to_pagemap(unsigned long addr, u64 pfn,
574 struct pagemapread *pm) 579 struct pagemapread *pm)
575{ 580{
576 if (put_user(pfn, pm->out)) 581 pm->buffer[pm->pos++] = pfn;
577 return -EFAULT; 582 if (pm->pos >= pm->len)
578 pm->out++;
579 if (pm->out >= pm->end)
580 return PM_END_OF_BUFFER; 583 return PM_END_OF_BUFFER;
581 return 0; 584 return 0;
582} 585}
@@ -650,6 +653,50 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
650 return err; 653 return err;
651} 654}
652 655
656static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
657{
658 u64 pme = 0;
659 if (pte_present(pte))
660 pme = PM_PFRAME(pte_pfn(pte) + offset)
661 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
662 return pme;
663}
664
665static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
666 unsigned long end, struct mm_walk *walk)
667{
668 struct vm_area_struct *vma;
669 struct pagemapread *pm = walk->private;
670 struct hstate *hs = NULL;
671 int err = 0;
672
673 vma = find_vma(walk->mm, addr);
674 if (vma)
675 hs = hstate_vma(vma);
676 for (; addr != end; addr += PAGE_SIZE) {
677 u64 pfn = PM_NOT_PRESENT;
678
679 if (vma && (addr >= vma->vm_end)) {
680 vma = find_vma(walk->mm, addr);
681 if (vma)
682 hs = hstate_vma(vma);
683 }
684
685 if (vma && (vma->vm_start <= addr) && is_vm_hugetlb_page(vma)) {
686 /* calculate pfn of the "raw" page in the hugepage. */
687 int offset = (addr & ~huge_page_mask(hs)) >> PAGE_SHIFT;
688 pfn = huge_pte_to_pagemap_entry(*pte, offset);
689 }
690 err = add_to_pagemap(addr, pfn, pm);
691 if (err)
692 return err;
693 }
694
695 cond_resched();
696
697 return err;
698}
699
653/* 700/*
654 * /proc/pid/pagemap - an array mapping virtual pages to pfns 701 * /proc/pid/pagemap - an array mapping virtual pages to pfns
655 * 702 *
@@ -674,21 +721,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
674 * determine which areas of memory are actually mapped and llseek to 721 * determine which areas of memory are actually mapped and llseek to
675 * skip over unmapped regions. 722 * skip over unmapped regions.
676 */ 723 */
724#define PAGEMAP_WALK_SIZE (PMD_SIZE)
677static ssize_t pagemap_read(struct file *file, char __user *buf, 725static ssize_t pagemap_read(struct file *file, char __user *buf,
678 size_t count, loff_t *ppos) 726 size_t count, loff_t *ppos)
679{ 727{
680 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 728 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
681 struct page **pages, *page;
682 unsigned long uaddr, uend;
683 struct mm_struct *mm; 729 struct mm_struct *mm;
684 struct pagemapread pm; 730 struct pagemapread pm;
685 int pagecount;
686 int ret = -ESRCH; 731 int ret = -ESRCH;
687 struct mm_walk pagemap_walk = {}; 732 struct mm_walk pagemap_walk = {};
688 unsigned long src; 733 unsigned long src;
689 unsigned long svpfn; 734 unsigned long svpfn;
690 unsigned long start_vaddr; 735 unsigned long start_vaddr;
691 unsigned long end_vaddr; 736 unsigned long end_vaddr;
737 int copied = 0;
692 738
693 if (!task) 739 if (!task)
694 goto out; 740 goto out;
@@ -711,37 +757,15 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
711 if (!mm) 757 if (!mm)
712 goto out_task; 758 goto out_task;
713 759
714 760 pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
715 uaddr = (unsigned long)buf & PAGE_MASK; 761 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
716 uend = (unsigned long)(buf + count);
717 pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
718 ret = 0;
719 if (pagecount == 0)
720 goto out_mm;
721 pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
722 ret = -ENOMEM; 762 ret = -ENOMEM;
723 if (!pages) 763 if (!pm.buffer)
724 goto out_mm; 764 goto out_mm;
725 765
726 down_read(&current->mm->mmap_sem);
727 ret = get_user_pages(current, current->mm, uaddr, pagecount,
728 1, 0, pages, NULL);
729 up_read(&current->mm->mmap_sem);
730
731 if (ret < 0)
732 goto out_free;
733
734 if (ret != pagecount) {
735 pagecount = ret;
736 ret = -EFAULT;
737 goto out_pages;
738 }
739
740 pm.out = (u64 __user *)buf;
741 pm.end = (u64 __user *)(buf + count);
742
743 pagemap_walk.pmd_entry = pagemap_pte_range; 766 pagemap_walk.pmd_entry = pagemap_pte_range;
744 pagemap_walk.pte_hole = pagemap_pte_hole; 767 pagemap_walk.pte_hole = pagemap_pte_hole;
768 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
745 pagemap_walk.mm = mm; 769 pagemap_walk.mm = mm;
746 pagemap_walk.private = &pm; 770 pagemap_walk.private = &pm;
747 771
@@ -760,23 +784,36 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
760 * user buffer is tracked in "pm", and the walk 784 * user buffer is tracked in "pm", and the walk
761 * will stop when we hit the end of the buffer. 785 * will stop when we hit the end of the buffer.
762 */ 786 */
763 ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk); 787 ret = 0;
764 if (ret == PM_END_OF_BUFFER) 788 while (count && (start_vaddr < end_vaddr)) {
765 ret = 0; 789 int len;
766 /* don't need mmap_sem for these, but this looks cleaner */ 790 unsigned long end;
767 *ppos += (char __user *)pm.out - buf; 791
768 if (!ret) 792 pm.pos = 0;
769 ret = (char __user *)pm.out - buf; 793 end = start_vaddr + PAGEMAP_WALK_SIZE;
770 794 /* overflow ? */
771out_pages: 795 if (end < start_vaddr || end > end_vaddr)
772 for (; pagecount; pagecount--) { 796 end = end_vaddr;
773 page = pages[pagecount-1]; 797 down_read(&mm->mmap_sem);
774 if (!PageReserved(page)) 798 ret = walk_page_range(start_vaddr, end, &pagemap_walk);
775 SetPageDirty(page); 799 up_read(&mm->mmap_sem);
776 page_cache_release(page); 800 start_vaddr = end;
801
802 len = min(count, PM_ENTRY_BYTES * pm.pos);
803 if (copy_to_user(buf, pm.buffer, len) < 0) {
804 ret = -EFAULT;
805 goto out_free;
806 }
807 copied += len;
808 buf += len;
809 count -= len;
777 } 810 }
811 *ppos += copied;
812 if (!ret || ret == PM_END_OF_BUFFER)
813 ret = copied;
814
778out_free: 815out_free:
779 kfree(pages); 816 kfree(pm.buffer);
780out_mm: 817out_mm:
781 mmput(mm); 818 mmput(mm);
782out_task: 819out_task:
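
With the rewrite above, pagemap_read() walks the address space in PAGEMAP_WALK_SIZE windows, fills a kernel-side buffer of 8-byte entries (PM_ENTRY_BYTES), and copies it out, instead of pinning the caller's own pages with get_user_pages(). The userspace ABI is unchanged: one u64 per virtual page, indexed by (vaddr / page size) * 8. A sketch (not part of the patch) probing one of this process's own pages, assuming 4 KiB pages and the bit layout used in this file (bit 63 present, low bits PFN):

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	uint64_t entry;
	uintptr_t vaddr = (uintptr_t)&entry;	/* probe our own stack page */
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0)
		return 1;
	if (pread(fd, &entry, sizeof(entry),
		  (vaddr / 4096) * sizeof(entry)) == sizeof(entry))
		printf("present=%d pfn=%llu\n",
		       (int)(entry >> 63),
		       (unsigned long long)(entry & ((1ULL << 55) - 1)));
	close(fd);
	return 0;
}
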
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 8f5c05d3dbd3..46d4b5d72bd3 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -5,6 +5,7 @@
5#include <linux/fs_struct.h> 5#include <linux/fs_struct.h>
6#include <linux/mount.h> 6#include <linux/mount.h>
7#include <linux/ptrace.h> 7#include <linux/ptrace.h>
8#include <linux/slab.h>
8#include <linux/seq_file.h> 9#include <linux/seq_file.h>
9#include "internal.h" 10#include "internal.h"
10 11
@@ -110,9 +111,13 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
110 } 111 }
111 } 112 }
112 113
113 size += (*text = mm->end_code - mm->start_code); 114 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
114 size += (*data = mm->start_stack - mm->start_data); 115 >> PAGE_SHIFT;
116 *data = (PAGE_ALIGN(mm->start_stack) - (mm->start_data & PAGE_MASK))
117 >> PAGE_SHIFT;
115 up_read(&mm->mmap_sem); 118 up_read(&mm->mmap_sem);
119 size >>= PAGE_SHIFT;
120 size += *text + *data;
116 *resident = size; 121 *resident = size;
117 return size; 122 return size;
118} 123}
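
The nommu task_statm() above now reports text and data in pages, rounded the same way as the MMU version in task_mmu.c. A sketch (not part of the patch) of that rounding, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	unsigned long start = 0x1010, end = 0x3010;

	/* 0x1000..0x4000 spans three whole pages */
	printf("%lu pages\n",
	       (PAGE_ALIGN(end) - (start & PAGE_MASK)) / PAGE_SIZE);
	return 0;
}
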
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 0872afa58d39..9fbc99ec799a 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -12,6 +12,7 @@
12#include <linux/user.h> 12#include <linux/user.h>
13#include <linux/elf.h> 13#include <linux/elf.h>
14#include <linux/elfcore.h> 14#include <linux/elfcore.h>
15#include <linux/slab.h>
15#include <linux/highmem.h> 16#include <linux/highmem.h>
16#include <linux/bootmem.h> 17#include <linux/bootmem.h>
17#include <linux/init.h> 18#include <linux/init.h>
diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index 0afba069d567..22e0d60e53ef 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -17,13 +17,6 @@
17#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include "qnx4.h" 18#include "qnx4.h"
19 19
20#if 0
21int qnx4_new_block(struct super_block *sb)
22{
23 return 0;
24}
25#endif /* 0 */
26
27static void count_bits(register const char *bmPart, register int size, 20static void count_bits(register const char *bmPart, register int size,
28 int *const tf) 21 int *const tf)
29{ 22{
@@ -35,22 +28,7 @@ static void count_bits(register const char *bmPart, register int size,
35 } 28 }
36 do { 29 do {
37 b = *bmPart++; 30 b = *bmPart++;
38 if ((b & 1) == 0) 31 tot += 8 - hweight8(b);
39 tot++;
40 if ((b & 2) == 0)
41 tot++;
42 if ((b & 4) == 0)
43 tot++;
44 if ((b & 8) == 0)
45 tot++;
46 if ((b & 16) == 0)
47 tot++;
48 if ((b & 32) == 0)
49 tot++;
50 if ((b & 64) == 0)
51 tot++;
52 if ((b & 128) == 0)
53 tot++;
54 size--; 32 size--;
55 } while (size != 0); 33 } while (size != 0);
56 *tf = tot; 34 *tf = tot;
@@ -67,7 +45,7 @@ unsigned long qnx4_count_free_blocks(struct super_block *sb)
67 45
68 while (total < size) { 46 while (total < size) {
69 if ((bh = sb_bread(sb, start + offset)) == NULL) { 47 if ((bh = sb_bread(sb, start + offset)) == NULL) {
70 printk("qnx4: I/O error in counting free blocks\n"); 48 printk(KERN_ERR "qnx4: I/O error in counting free blocks\n");
71 break; 49 break;
72 } 50 }
73 count_bits(bh->b_data, size - total, &total_free); 51 count_bits(bh->b_data, size - total, &total_free);
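
count_bits() above replaces eight explicit bit tests with 8 - hweight8(b): free blocks are the zero bits in a bitmap byte. A userspace sketch (not part of the patch) of the same identity, with GCC's __builtin_popcount standing in for hweight8():

#include <stdio.h>

static int free_bits(unsigned char b)
{
	return 8 - __builtin_popcount(b);	/* zero bits = free blocks */
}

int main(void)
{
	printf("%d\n", free_bits(0x00));	/* 8: all free */
	printf("%d\n", free_bits(0xf0));	/* 4 */
	printf("%d\n", free_bits(0xff));	/* 0: none free */
	return 0;
}
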
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 86cc39cb1398..6f30c3d5bcbf 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -26,8 +26,8 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
26 int ix, ino; 26 int ix, ino;
27 int size; 27 int size;
28 28
29 QNX4DEBUG(("qnx4_readdir:i_size = %ld\n", (long) inode->i_size)); 29 QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
30 QNX4DEBUG(("filp->f_pos = %ld\n", (long) filp->f_pos)); 30 QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos));
31 31
32 lock_kernel(); 32 lock_kernel();
33 33
@@ -50,7 +50,7 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
50 size = QNX4_NAME_MAX; 50 size = QNX4_NAME_MAX;
51 51
52 if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) { 52 if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) {
53 QNX4DEBUG(("qnx4_readdir:%.*s\n", size, de->di_fname)); 53 QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname));
54 if ( ( de->di_status & QNX4_FILE_LINK ) == 0 ) 54 if ( ( de->di_status & QNX4_FILE_LINK ) == 0 )
55 ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1; 55 ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
56 else { 56 else {
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index d2cd1798d8c4..277575ddc05c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -64,25 +64,7 @@ static struct buffer_head *qnx4_getblk(struct inode *inode, int nr,
64 result = sb_getblk(inode->i_sb, nr); 64 result = sb_getblk(inode->i_sb, nr);
65 return result; 65 return result;
66 } 66 }
67 if (!create) { 67 return NULL;
68 return NULL;
69 }
70#if 0
71 tmp = qnx4_new_block(inode->i_sb);
72 if (!tmp) {
73 return NULL;
74 }
75 result = sb_getblk(inode->i_sb, tmp);
76 if (tst) {
77 qnx4_free_block(inode->i_sb, tmp);
78 brelse(result);
79 goto repeat;
80 }
81 tst = tmp;
82#endif
83 inode->i_ctime = CURRENT_TIME_SEC;
84 mark_inode_dirty(inode);
85 return result;
86} 68}
87 69
88struct buffer_head *qnx4_bread(struct inode *inode, int block, int create) 70struct buffer_head *qnx4_bread(struct inode *inode, int block, int create)
@@ -107,14 +89,12 @@ static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_h
107{ 89{
108 unsigned long phys; 90 unsigned long phys;
109 91
110 QNX4DEBUG(("qnx4: qnx4_get_block inode=[%ld] iblock=[%ld]\n",inode->i_ino,iblock)); 92 QNX4DEBUG((KERN_INFO "qnx4: qnx4_get_block inode=[%ld] iblock=[%ld]\n",inode->i_ino,iblock));
111 93
112 phys = qnx4_block_map( inode, iblock ); 94 phys = qnx4_block_map( inode, iblock );
113 if ( phys ) { 95 if ( phys ) {
114 // logical block is before EOF 96 // logical block is before EOF
115 map_bh(bh, inode->i_sb, phys); 97 map_bh(bh, inode->i_sb, phys);
116 } else if ( create ) {
117 // to be done.
118 } 98 }
119 return 0; 99 return 0;
120} 100}
@@ -142,12 +122,12 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
142 // read next xtnt block. 122 // read next xtnt block.
143 bh = sb_bread(inode->i_sb, i_xblk - 1); 123 bh = sb_bread(inode->i_sb, i_xblk - 1);
144 if ( !bh ) { 124 if ( !bh ) {
145 QNX4DEBUG(("qnx4: I/O error reading xtnt block [%ld])\n", i_xblk - 1)); 125 QNX4DEBUG((KERN_ERR "qnx4: I/O error reading xtnt block [%ld])\n", i_xblk - 1));
146 return -EIO; 126 return -EIO;
147 } 127 }
148 xblk = (struct qnx4_xblk*)bh->b_data; 128 xblk = (struct qnx4_xblk*)bh->b_data;
149 if ( memcmp( xblk->xblk_signature, "IamXblk", 7 ) ) { 129 if ( memcmp( xblk->xblk_signature, "IamXblk", 7 ) ) {
150 QNX4DEBUG(("qnx4: block at %ld is not a valid xtnt\n", qnx4_inode->i_xblk)); 130 QNX4DEBUG((KERN_ERR "qnx4: block at %ld is not a valid xtnt\n", qnx4_inode->i_xblk));
151 return -EIO; 131 return -EIO;
152 } 132 }
153 } 133 }
@@ -168,7 +148,7 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
168 brelse( bh ); 148 brelse( bh );
169 } 149 }
170 150
171 QNX4DEBUG(("qnx4: mapping block %ld of inode %ld = %ld\n",iblock,inode->i_ino,block)); 151 QNX4DEBUG((KERN_INFO "qnx4: mapping block %ld of inode %ld = %ld\n",iblock,inode->i_ino,block));
172 return block; 152 return block;
173} 153}
174 154
@@ -209,7 +189,7 @@ static const char *qnx4_checkroot(struct super_block *sb)
209 if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') { 189 if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') {
210 return "no qnx4 filesystem (no root dir)."; 190 return "no qnx4 filesystem (no root dir).";
211 } else { 191 } else {
212 QNX4DEBUG(("QNX4 filesystem found on dev %s.\n", sb->s_id)); 192 QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
213 rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1; 193 rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1;
214 rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size); 194 rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size);
215 for (j = 0; j < rl; j++) { 195 for (j = 0; j < rl; j++) {
@@ -220,8 +200,9 @@ static const char *qnx4_checkroot(struct super_block *sb)
220 for (i = 0; i < QNX4_INODES_PER_BLOCK; i++) { 200 for (i = 0; i < QNX4_INODES_PER_BLOCK; i++) {
221 rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE); 201 rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE);
222 if (rootdir->di_fname != NULL) { 202 if (rootdir->di_fname != NULL) {
223 QNX4DEBUG(("Rootdir entry found : [%s]\n", rootdir->di_fname)); 203 QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
224 if (!strncmp(rootdir->di_fname, QNX4_BMNAME, sizeof QNX4_BMNAME)) { 204 if (!strcmp(rootdir->di_fname,
205 QNX4_BMNAME)) {
225 found = 1; 206 found = 1;
226 qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL ); 207 qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL );
227 if (!qnx4_sb(sb)->BitMap) { 208 if (!qnx4_sb(sb)->BitMap) {
@@ -265,12 +246,12 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
265 if we don't belong here... */ 246 if we don't belong here... */
266 bh = sb_bread(s, 1); 247 bh = sb_bread(s, 1);
267 if (!bh) { 248 if (!bh) {
268 printk("qnx4: unable to read the superblock\n"); 249 printk(KERN_ERR "qnx4: unable to read the superblock\n");
269 goto outnobh; 250 goto outnobh;
270 } 251 }
271 if ( le32_to_cpup((__le32*) bh->b_data) != QNX4_SUPER_MAGIC ) { 252 if ( le32_to_cpup((__le32*) bh->b_data) != QNX4_SUPER_MAGIC ) {
272 if (!silent) 253 if (!silent)
273 printk("qnx4: wrong fsid in superblock.\n"); 254 printk(KERN_ERR "qnx4: wrong fsid in superblock.\n");
274 goto out; 255 goto out;
275 } 256 }
276 s->s_op = &qnx4_sops; 257 s->s_op = &qnx4_sops;
@@ -284,14 +265,14 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
284 errmsg = qnx4_checkroot(s); 265 errmsg = qnx4_checkroot(s);
285 if (errmsg != NULL) { 266 if (errmsg != NULL) {
286 if (!silent) 267 if (!silent)
287 printk("qnx4: %s\n", errmsg); 268 printk(KERN_ERR "qnx4: %s\n", errmsg);
288 goto out; 269 goto out;
289 } 270 }
290 271
291 /* does root not have inode number QNX4_ROOT_INO ?? */ 272 /* does root not have inode number QNX4_ROOT_INO ?? */
292 root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK); 273 root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK);
293 if (IS_ERR(root)) { 274 if (IS_ERR(root)) {
294 printk("qnx4: get inode failed\n"); 275 printk(KERN_ERR "qnx4: get inode failed\n");
295 ret = PTR_ERR(root); 276 ret = PTR_ERR(root);
296 goto out; 277 goto out;
297 } 278 }
@@ -374,7 +355,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
374 qnx4_inode = qnx4_raw_inode(inode); 355 qnx4_inode = qnx4_raw_inode(inode);
375 inode->i_mode = 0; 356 inode->i_mode = 0;
376 357
377 QNX4DEBUG(("Reading inode : [%d]\n", ino)); 358 QNX4DEBUG((KERN_INFO "reading inode : [%d]\n", ino));
378 if (!ino) { 359 if (!ino) {
379 printk(KERN_ERR "qnx4: bad inode number on dev %s: %lu is " 360 printk(KERN_ERR "qnx4: bad inode number on dev %s: %lu is "
380 "out of range\n", 361 "out of range\n",
@@ -385,7 +366,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
385 block = ino / QNX4_INODES_PER_BLOCK; 366 block = ino / QNX4_INODES_PER_BLOCK;
386 367
387 if (!(bh = sb_bread(sb, block))) { 368 if (!(bh = sb_bread(sb, block))) {
388 printk("qnx4: major problem: unable to read inode from dev " 369 printk(KERN_ERR "qnx4: major problem: unable to read inode from dev "
389 "%s\n", sb->s_id); 370 "%s\n", sb->s_id);
390 iget_failed(inode); 371 iget_failed(inode);
391 return ERR_PTR(-EIO); 372 return ERR_PTR(-EIO);
@@ -499,7 +480,7 @@ static int __init init_qnx4_fs(void)
499 return err; 480 return err;
500 } 481 }
501 482
502 printk("QNX4 filesystem 0.2.3 registered.\n"); 483 printk(KERN_INFO "QNX4 filesystem 0.2.3 registered.\n");
503 return 0; 484 return 0;
504} 485}
505 486
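
The qnx4_checkroot() hunk above swaps strncmp(..., sizeof QNX4_BMNAME) for strcmp(). Since sizeof of a string literal counts the terminating NUL, the bounded compare already included the terminator, so the two are equivalent for NUL-terminated names. A sketch (not part of the patch; ".bitmap" as the value of QNX4_BMNAME is an assumption here):

#include <assert.h>
#include <string.h>

#define QNX4_BMNAME ".bitmap"	/* assumed value, for illustration */

int main(void)
{
	const char *name = ".bitmap";

	/* a bounded compare over sizeof(lit) bytes includes the NUL */
	assert(!strncmp(name, QNX4_BMNAME, sizeof QNX4_BMNAME) ==
	       !strcmp(name, QNX4_BMNAME));
	return 0;
}
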
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index ae1e7edbacd6..58703ebba879 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -30,7 +30,7 @@ static int qnx4_match(int len, const char *name,
30 int namelen, thislen; 30 int namelen, thislen;
31 31
32 if (bh == NULL) { 32 if (bh == NULL) {
33 printk("qnx4: matching unassigned buffer !\n"); 33 printk(KERN_WARNING "qnx4: matching unassigned buffer !\n");
34 return 0; 34 return 0;
35 } 35 }
36 de = (struct qnx4_inode_entry *) (bh->b_data + *offset); 36 de = (struct qnx4_inode_entry *) (bh->b_data + *offset);
@@ -66,7 +66,7 @@ static struct buffer_head *qnx4_find_entry(int len, struct inode *dir,
66 66
67 *res_dir = NULL; 67 *res_dir = NULL;
68 if (!dir->i_sb) { 68 if (!dir->i_sb) {
69 printk("qnx4: no superblock on dir.\n"); 69 printk(KERN_WARNING "qnx4: no superblock on dir.\n");
70 return NULL; 70 return NULL;
71 } 71 }
72 bh = NULL; 72 bh = NULL;
@@ -124,7 +124,7 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
124 foundinode = qnx4_iget(dir->i_sb, ino); 124 foundinode = qnx4_iget(dir->i_sb, ino);
125 if (IS_ERR(foundinode)) { 125 if (IS_ERR(foundinode)) {
126 unlock_kernel(); 126 unlock_kernel();
127 QNX4DEBUG(("qnx4: lookup->iget -> error %ld\n", 127 QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n",
128 PTR_ERR(foundinode))); 128 PTR_ERR(foundinode)));
129 return ERR_CAST(foundinode); 129 return ERR_CAST(foundinode);
130 } 130 }
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 8047e01ef46b..dad7fb247ddc 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -17,7 +17,7 @@ config QUOTA
17 17
18config QUOTA_NETLINK_INTERFACE 18config QUOTA_NETLINK_INTERFACE
19 bool "Report quota messages through netlink interface" 19 bool "Report quota messages through netlink interface"
20 depends on QUOTA && NET 20 depends on QUOTACTL && NET
21 help 21 help
22 If you say Y here, quota warnings (about exceeding softlimit, reaching 22 If you say Y here, quota warnings (about exceeding softlimit, reaching
23 hardlimit, etc.) will be reported through netlink interface. If unsure, 23 hardlimit, etc.) will be reported through netlink interface. If unsure,
@@ -46,14 +46,21 @@ config QFMT_V1
46 format say Y here. 46 format say Y here.
47 47
48config QFMT_V2 48config QFMT_V2
49 tristate "Quota format v2 support" 49 tristate "Quota format vfsv0 and vfsv1 support"
50 depends on QUOTA 50 depends on QUOTA
51 select QUOTA_TREE 51 select QUOTA_TREE
52 help 52 help
53 This quota format allows using quotas with 32-bit UIDs/GIDs. If you 53 This config option enables kernel support for vfsv0 and vfsv1 quota
 54 need this functionality say Y here. 54 formats. Both these formats support 32-bit UIDs/GIDs and the vfsv1 format
55 also supports 64-bit inode and block quota limits. If you need this
56 functionality say Y here.
55 57
56config QUOTACTL 58config QUOTACTL
57 bool 59 bool
58 depends on XFS_QUOTA || QUOTA 60 depends on XFS_QUOTA || QUOTA
59 default y 61 default y
62
63config QUOTACTL_COMPAT
64 bool
65 depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT
66 default y
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
index 68d4f6dc0578..5f9e9e276af0 100644
--- a/fs/quota/Makefile
+++ b/fs/quota/Makefile
@@ -3,3 +3,5 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o
3obj-$(CONFIG_QFMT_V2) += quota_v2.o 3obj-$(CONFIG_QFMT_V2) += quota_v2.o
4obj-$(CONFIG_QUOTA_TREE) += quota_tree.o 4obj-$(CONFIG_QUOTA_TREE) += quota_tree.o
5obj-$(CONFIG_QUOTACTL) += quota.o 5obj-$(CONFIG_QUOTACTL) += quota.o
6obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o
7obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o
diff --git a/fs/quota/compat.c b/fs/quota/compat.c
new file mode 100644
index 000000000000..fb1892fe3e56
--- /dev/null
+++ b/fs/quota/compat.c
@@ -0,0 +1,118 @@
1
2#include <linux/syscalls.h>
3#include <linux/compat.h>
4#include <linux/quotaops.h>
5
6/*
7 * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
8 * and is necessary due to alignment problems.
9 */
10struct compat_if_dqblk {
11 compat_u64 dqb_bhardlimit;
12 compat_u64 dqb_bsoftlimit;
13 compat_u64 dqb_curspace;
14 compat_u64 dqb_ihardlimit;
15 compat_u64 dqb_isoftlimit;
16 compat_u64 dqb_curinodes;
17 compat_u64 dqb_btime;
18 compat_u64 dqb_itime;
19 compat_uint_t dqb_valid;
20};
21
22/* XFS structures */
23struct compat_fs_qfilestat {
24 compat_u64 dqb_bhardlimit;
25 compat_u64 qfs_nblks;
26 compat_uint_t qfs_nextents;
27};
28
29struct compat_fs_quota_stat {
30 __s8 qs_version;
31 __u16 qs_flags;
32 __s8 qs_pad;
33 struct compat_fs_qfilestat qs_uquota;
34 struct compat_fs_qfilestat qs_gquota;
35 compat_uint_t qs_incoredqs;
36 compat_int_t qs_btimelimit;
37 compat_int_t qs_itimelimit;
38 compat_int_t qs_rtbtimelimit;
39 __u16 qs_bwarnlimit;
40 __u16 qs_iwarnlimit;
41};
42
43asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
44 qid_t id, void __user *addr)
45{
46 unsigned int cmds;
47 struct if_dqblk __user *dqblk;
48 struct compat_if_dqblk __user *compat_dqblk;
49 struct fs_quota_stat __user *fsqstat;
50 struct compat_fs_quota_stat __user *compat_fsqstat;
51 compat_uint_t data;
52 u16 xdata;
53 long ret;
54
55 cmds = cmd >> SUBCMDSHIFT;
56
57 switch (cmds) {
58 case Q_GETQUOTA:
59 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
60 compat_dqblk = addr;
61 ret = sys_quotactl(cmd, special, id, dqblk);
62 if (ret)
63 break;
64 if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
65 get_user(data, &dqblk->dqb_valid) ||
66 put_user(data, &compat_dqblk->dqb_valid))
67 ret = -EFAULT;
68 break;
69 case Q_SETQUOTA:
70 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
71 compat_dqblk = addr;
72 ret = -EFAULT;
73 if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
74 get_user(data, &compat_dqblk->dqb_valid) ||
75 put_user(data, &dqblk->dqb_valid))
76 break;
77 ret = sys_quotactl(cmd, special, id, dqblk);
78 break;
79 case Q_XGETQSTAT:
80 fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
81 compat_fsqstat = addr;
82 ret = sys_quotactl(cmd, special, id, fsqstat);
83 if (ret)
84 break;
85 ret = -EFAULT;
86 /* Copying qs_version, qs_flags, qs_pad */
87 if (copy_in_user(compat_fsqstat, fsqstat,
88 offsetof(struct compat_fs_quota_stat, qs_uquota)))
89 break;
90 /* Copying qs_uquota */
91 if (copy_in_user(&compat_fsqstat->qs_uquota,
92 &fsqstat->qs_uquota,
93 sizeof(compat_fsqstat->qs_uquota)) ||
94 get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
95 put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
96 break;
97 /* Copying qs_gquota */
98 if (copy_in_user(&compat_fsqstat->qs_gquota,
99 &fsqstat->qs_gquota,
100 sizeof(compat_fsqstat->qs_gquota)) ||
101 get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
102 put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
103 break;
104 /* Copying the rest */
105 if (copy_in_user(&compat_fsqstat->qs_incoredqs,
106 &fsqstat->qs_incoredqs,
107 sizeof(struct compat_fs_quota_stat) -
108 offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
109 get_user(xdata, &fsqstat->qs_iwarnlimit) ||
110 put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
111 break;
112 ret = 0;
113 break;
114 default:
115 ret = sys_quotactl(cmd, special, id, addr);
116 }
117 return ret;
118}
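
The compat shim above exists because i386 aligns u64 to 4 bytes while x86_64 aligns it to 8, so the same quota structs carry different padding in the two ABIs; compat_u64 keeps the 32-bit alignment, and the handler copies through a kernel-allocated native struct, fixing up the fields that shift. A userspace sketch (not part of the patch) of the layout difference, emulating compat_u64 with a typedef (GCC permits an aligned attribute on a typedef to lower alignment):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t compat_u64 __attribute__((aligned(4)));

struct native_blk {		/* 64-bit ABI layout */
	uint64_t hardlimit;
	uint32_t valid;		/* + 4 bytes of tail padding */
};

struct compat_blk {		/* 32-bit (i386) ABI layout */
	compat_u64 hardlimit;
	uint32_t valid;		/* no tail padding */
};

int main(void)
{
	printf("native: %zu bytes\n", sizeof(struct native_blk));	/* 16 */
	printf("compat: %zu bytes\n", sizeof(struct compat_blk));	/* 12 */
	return 0;
}
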
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 39b49c42a7ed..e0b870f4749f 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -77,10 +77,6 @@
77#include <linux/capability.h> 77#include <linux/capability.h>
78#include <linux/quotaops.h> 78#include <linux/quotaops.h>
79#include <linux/writeback.h> /* for inode_lock, oddly enough.. */ 79#include <linux/writeback.h> /* for inode_lock, oddly enough.. */
80#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
81#include <net/netlink.h>
82#include <net/genetlink.h>
83#endif
84 80
85#include <asm/uaccess.h> 81#include <asm/uaccess.h>
86 82
@@ -104,9 +100,13 @@
104 * 100 *
105 * Any operation working on dquots via inode pointers must hold dqptr_sem. If 101 * Any operation working on dquots via inode pointers must hold dqptr_sem. If
106 * operation is just reading pointers from inode (or not using them at all) the 102 * operation is just reading pointers from inode (or not using them at all) the
107 * read lock is enough. If pointers are altered function must hold write lock 103 * read lock is enough. If pointers are altered function must hold write lock.
 108 * (these locking rules also apply for S_NOQUOTA flag in the inode - note that 104 * Special care needs to be taken about the S_NOQUOTA inode flag (marking that
109 * for altering the flag i_mutex is also needed). 105 * inode is a quota file). Functions adding pointers from inode to dquots have
106 * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they
107 * have to do all pointer modifications before dropping dqptr_sem. This makes
108 * sure they cannot race with quotaon which first sets S_NOQUOTA flag and
109 * then drops all pointers to dquots from an inode.
110 * 110 *
111 * Each dquot has its dq_lock mutex. Locked dquots might not be referenced 111 * Each dquot has its dq_lock mutex. Locked dquots might not be referenced
112 * from inodes (dquot_alloc_space() and such don't check the dq_lock). 112 * from inodes (dquot_alloc_space() and such don't check the dq_lock).
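
The reworded comment above pins down the S_NOQUOTA protocol: quotaon sets the flag first and then drops the inode's dquot pointers, while anyone attaching pointers must re-check the flag under dqptr_sem and finish every pointer update before releasing it. A userspace pthread analogue (not kernel code, just a sketch of the ordering):

#include <pthread.h>
#include <stddef.h>

static pthread_rwlock_t dqptr_sem = PTHREAD_RWLOCK_INITIALIZER;
static int s_noquota;		/* stand-in for the S_NOQUOTA inode flag */
static void *i_dquot;		/* stand-in for inode->i_dquot[] */

static void attach_dquot(void *dq)
{
	pthread_rwlock_wrlock(&dqptr_sem);
	if (!s_noquota)		/* re-check the flag under the lock */
		i_dquot = dq;	/* all pointer updates before unlock */
	pthread_rwlock_unlock(&dqptr_sem);
}

static void quota_on_file(void)
{
	pthread_rwlock_wrlock(&dqptr_sem);
	s_noquota = 1;		/* set the flag first ... */
	i_dquot = NULL;		/* ... then drop the pointers */
	pthread_rwlock_unlock(&dqptr_sem);
}

int main(void)
{
	quota_on_file();
	attach_dquot(&dqptr_sem);	/* refused: flag already set */
	return i_dquot != NULL;
}
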
@@ -229,6 +229,9 @@ static struct hlist_head *dquot_hash;
229struct dqstats dqstats; 229struct dqstats dqstats;
230EXPORT_SYMBOL(dqstats); 230EXPORT_SYMBOL(dqstats);
231 231
232static qsize_t inode_get_rsv_space(struct inode *inode);
233static void __dquot_initialize(struct inode *inode, int type);
234
232static inline unsigned int 235static inline unsigned int
233hashfn(const struct super_block *sb, unsigned int id, int type) 236hashfn(const struct super_block *sb, unsigned int id, int type)
234{ 237{
@@ -327,6 +330,30 @@ int dquot_mark_dquot_dirty(struct dquot *dquot)
327} 330}
328EXPORT_SYMBOL(dquot_mark_dquot_dirty); 331EXPORT_SYMBOL(dquot_mark_dquot_dirty);
329 332
333/* Dirtify all the dquots - this can block when journalling */
334static inline int mark_all_dquot_dirty(struct dquot * const *dquot)
335{
336 int ret, err, cnt;
337
338 ret = err = 0;
339 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
340 if (dquot[cnt])
341 /* Even in case of error we have to continue */
342 ret = mark_dquot_dirty(dquot[cnt]);
343 if (!err)
344 err = ret;
345 }
346 return err;
347}
348
349static inline void dqput_all(struct dquot **dquot)
350{
351 unsigned int cnt;
352
353 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
354 dqput(dquot[cnt]);
355}
356
330/* This function needs dq_list_lock */ 357/* This function needs dq_list_lock */
331static inline int clear_dquot_dirty(struct dquot *dquot) 358static inline int clear_dquot_dirty(struct dquot *dquot)
332{ 359{
@@ -544,7 +571,7 @@ out:
544} 571}
545EXPORT_SYMBOL(dquot_scan_active); 572EXPORT_SYMBOL(dquot_scan_active);
546 573
547int vfs_quota_sync(struct super_block *sb, int type) 574int vfs_quota_sync(struct super_block *sb, int type, int wait)
548{ 575{
549 struct list_head *dirty; 576 struct list_head *dirty;
550 struct dquot *dquot; 577 struct dquot *dquot;
@@ -589,6 +616,33 @@ int vfs_quota_sync(struct super_block *sb, int type)
589 spin_unlock(&dq_list_lock); 616 spin_unlock(&dq_list_lock);
590 mutex_unlock(&dqopt->dqonoff_mutex); 617 mutex_unlock(&dqopt->dqonoff_mutex);
591 618
619 if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE))
620 return 0;
621
 622 /* This is not very clever (or fast) but currently I don't know of
 623 * any other simple way of getting quota data to disk, and we must get
 624 * it there for userspace to see it... */
625 if (sb->s_op->sync_fs)
626 sb->s_op->sync_fs(sb, 1);
627 sync_blockdev(sb->s_bdev);
628
629 /*
630 * Now when everything is written we can discard the pagecache so
631 * that userspace sees the changes.
632 */
633 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
634 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
635 if (type != -1 && cnt != type)
636 continue;
637 if (!sb_has_quota_active(sb, cnt))
638 continue;
639 mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex,
640 I_MUTEX_QUOTA);
641 truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
642 mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex);
643 }
644 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
645
592 return 0; 646 return 0;
593} 647}
594EXPORT_SYMBOL(vfs_quota_sync); 648EXPORT_SYMBOL(vfs_quota_sync);
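
vfs_quota_sync() above gains a wait argument: when set (and quota lives in regular files rather than filesystem-internal storage), it forces the dquots to disk via sync_fs/sync_blockdev and then truncates the quota files' pagecache so userspace reads see fresh data. From userspace this path is reached through quotactl(2); a sketch (not part of the patch; /dev/sda1 is a placeholder device):

#include <stdio.h>
#include <sys/quota.h>

int main(void)
{
	/* ask the kernel to sync quota state for one filesystem */
	if (quotactl(QCMD(Q_SYNC, USRQUOTA), "/dev/sda1", 0, NULL))
		perror("quotactl");
	return 0;
}
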
@@ -820,11 +874,14 @@ static int dqinit_needed(struct inode *inode, int type)
820static void add_dquot_ref(struct super_block *sb, int type) 874static void add_dquot_ref(struct super_block *sb, int type)
821{ 875{
822 struct inode *inode, *old_inode = NULL; 876 struct inode *inode, *old_inode = NULL;
877 int reserved = 0;
823 878
824 spin_lock(&inode_lock); 879 spin_lock(&inode_lock);
825 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 880 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
826 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) 881 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
827 continue; 882 continue;
883 if (unlikely(inode_get_rsv_space(inode) > 0))
884 reserved = 1;
828 if (!atomic_read(&inode->i_writecount)) 885 if (!atomic_read(&inode->i_writecount))
829 continue; 886 continue;
830 if (!dqinit_needed(inode, type)) 887 if (!dqinit_needed(inode, type))
@@ -834,7 +891,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
834 spin_unlock(&inode_lock); 891 spin_unlock(&inode_lock);
835 892
836 iput(old_inode); 893 iput(old_inode);
837 sb->dq_op->initialize(inode, type); 894 __dquot_initialize(inode, type);
838 /* We hold a reference to 'inode' so it couldn't have been 895 /* We hold a reference to 'inode' so it couldn't have been
839 * removed from s_inodes list while we dropped the inode_lock. 896 * removed from s_inodes list while we dropped the inode_lock.
840 * We cannot iput the inode now as we can be holding the last 897 * We cannot iput the inode now as we can be holding the last
@@ -845,6 +902,12 @@ static void add_dquot_ref(struct super_block *sb, int type)
845 } 902 }
846 spin_unlock(&inode_lock); 903 spin_unlock(&inode_lock);
847 iput(old_inode); 904 iput(old_inode);
905
906 if (reserved) {
907 printk(KERN_WARNING "VFS (%s): Writes happened before quota"
 908 " was turned on, so quota information is probably "
909 "inconsistent. Please run quotacheck(8).\n", sb->s_id);
910 }
848} 911}
849 912
850/* 913/*
@@ -958,10 +1021,12 @@ static inline void dquot_resv_space(struct dquot *dquot, qsize_t number)
958/* 1021/*
959 * Claim reserved quota space 1022 * Claim reserved quota space
960 */ 1023 */
961static void dquot_claim_reserved_space(struct dquot *dquot, 1024static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number)
962 qsize_t number)
963{ 1025{
964 WARN_ON(dquot->dq_dqb.dqb_rsvspace < number); 1026 if (dquot->dq_dqb.dqb_rsvspace < number) {
1027 WARN_ON_ONCE(1);
1028 number = dquot->dq_dqb.dqb_rsvspace;
1029 }
965 dquot->dq_dqb.dqb_curspace += number; 1030 dquot->dq_dqb.dqb_curspace += number;
966 dquot->dq_dqb.dqb_rsvspace -= number; 1031 dquot->dq_dqb.dqb_rsvspace -= number;
967} 1032}
@@ -969,7 +1034,12 @@ static void dquot_claim_reserved_space(struct dquot *dquot,
969static inline 1034static inline
970void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) 1035void dquot_free_reserved_space(struct dquot *dquot, qsize_t number)
971{ 1036{
972 dquot->dq_dqb.dqb_rsvspace -= number; 1037 if (dquot->dq_dqb.dqb_rsvspace >= number)
1038 dquot->dq_dqb.dqb_rsvspace -= number;
1039 else {
1040 WARN_ON_ONCE(1);
1041 dquot->dq_dqb.dqb_rsvspace = 0;
1042 }
973} 1043}
974 1044
975static void dquot_decr_inodes(struct dquot *dquot, qsize_t number) 1045static void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
@@ -1071,73 +1141,6 @@ static void print_warning(struct dquot *dquot, const int warntype)
1071} 1141}
1072#endif 1142#endif
1073 1143
1074#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
1075
1076/* Netlink family structure for quota */
1077static struct genl_family quota_genl_family = {
1078 .id = GENL_ID_GENERATE,
1079 .hdrsize = 0,
1080 .name = "VFS_DQUOT",
1081 .version = 1,
1082 .maxattr = QUOTA_NL_A_MAX,
1083};
1084
1085/* Send warning to userspace about user which exceeded quota */
1086static void send_warning(const struct dquot *dquot, const char warntype)
1087{
1088 static atomic_t seq;
1089 struct sk_buff *skb;
1090 void *msg_head;
1091 int ret;
1092 int msg_size = 4 * nla_total_size(sizeof(u32)) +
1093 2 * nla_total_size(sizeof(u64));
1094
1095 /* We have to allocate using GFP_NOFS as we are called from a
1096 * filesystem performing write and thus further recursion into
1097 * the fs to free some data could cause deadlocks. */
1098 skb = genlmsg_new(msg_size, GFP_NOFS);
1099 if (!skb) {
1100 printk(KERN_ERR
1101 "VFS: Not enough memory to send quota warning.\n");
1102 return;
1103 }
1104 msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
1105 &quota_genl_family, 0, QUOTA_NL_C_WARNING);
1106 if (!msg_head) {
1107 printk(KERN_ERR
1108 "VFS: Cannot store netlink header in quota warning.\n");
1109 goto err_out;
1110 }
1111 ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, dquot->dq_type);
1112 if (ret)
1113 goto attr_err_out;
1114 ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, dquot->dq_id);
1115 if (ret)
1116 goto attr_err_out;
1117 ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
1118 if (ret)
1119 goto attr_err_out;
1120 ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR,
1121 MAJOR(dquot->dq_sb->s_dev));
1122 if (ret)
1123 goto attr_err_out;
1124 ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR,
1125 MINOR(dquot->dq_sb->s_dev));
1126 if (ret)
1127 goto attr_err_out;
1128 ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
1129 if (ret)
1130 goto attr_err_out;
1131 genlmsg_end(skb, msg_head);
1132
1133 genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
1134 return;
1135attr_err_out:
1136 printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
1137err_out:
1138 kfree_skb(skb);
1139}
1140#endif
1141/* 1144/*
1142 * Write warnings to the console and send warning messages over netlink. 1145 * Write warnings to the console and send warning messages over netlink.
1143 * 1146 *
@@ -1145,18 +1148,20 @@ err_out:
1145 */ 1148 */
1146static void flush_warnings(struct dquot *const *dquots, char *warntype) 1149static void flush_warnings(struct dquot *const *dquots, char *warntype)
1147{ 1150{
1151 struct dquot *dq;
1148 int i; 1152 int i;
1149 1153
1150 for (i = 0; i < MAXQUOTAS; i++) 1154 for (i = 0; i < MAXQUOTAS; i++) {
1151 if (dquots[i] && warntype[i] != QUOTA_NL_NOWARN && 1155 dq = dquots[i];
1152 !warning_issued(dquots[i], warntype[i])) { 1156 if (dq && warntype[i] != QUOTA_NL_NOWARN &&
1157 !warning_issued(dq, warntype[i])) {
1153#ifdef CONFIG_PRINT_QUOTA_WARNING 1158#ifdef CONFIG_PRINT_QUOTA_WARNING
1154 print_warning(dquots[i], warntype[i]); 1159 print_warning(dq, warntype[i]);
1155#endif
1156#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
1157 send_warning(dquots[i], warntype[i]);
1158#endif 1160#endif
1161 quota_send_warning(dq->dq_type, dq->dq_id,
1162 dq->dq_sb->s_dev, warntype[i]);
1159 } 1163 }
1164 }
1160} 1165}
1161 1166
1162static int ignore_hardlimit(struct dquot *dquot) 1167static int ignore_hardlimit(struct dquot *dquot)
@@ -1176,13 +1181,13 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1176 *warntype = QUOTA_NL_NOWARN; 1181 *warntype = QUOTA_NL_NOWARN;
1177 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || 1182 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
1178 test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1183 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1179 return QUOTA_OK; 1184 return 0;
1180 1185
1181 if (dquot->dq_dqb.dqb_ihardlimit && 1186 if (dquot->dq_dqb.dqb_ihardlimit &&
1182 newinodes > dquot->dq_dqb.dqb_ihardlimit && 1187 newinodes > dquot->dq_dqb.dqb_ihardlimit &&
1183 !ignore_hardlimit(dquot)) { 1188 !ignore_hardlimit(dquot)) {
1184 *warntype = QUOTA_NL_IHARDWARN; 1189 *warntype = QUOTA_NL_IHARDWARN;
1185 return NO_QUOTA; 1190 return -EDQUOT;
1186 } 1191 }
1187 1192
1188 if (dquot->dq_dqb.dqb_isoftlimit && 1193 if (dquot->dq_dqb.dqb_isoftlimit &&
@@ -1191,7 +1196,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1191 get_seconds() >= dquot->dq_dqb.dqb_itime && 1196 get_seconds() >= dquot->dq_dqb.dqb_itime &&
1192 !ignore_hardlimit(dquot)) { 1197 !ignore_hardlimit(dquot)) {
1193 *warntype = QUOTA_NL_ISOFTLONGWARN; 1198 *warntype = QUOTA_NL_ISOFTLONGWARN;
1194 return NO_QUOTA; 1199 return -EDQUOT;
1195 } 1200 }
1196 1201
1197 if (dquot->dq_dqb.dqb_isoftlimit && 1202 if (dquot->dq_dqb.dqb_isoftlimit &&
@@ -1202,7 +1207,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1202 sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; 1207 sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace;
1203 } 1208 }
1204 1209
1205 return QUOTA_OK; 1210 return 0;
1206} 1211}
1207 1212
1208/* needs dq_data_lock */ 1213/* needs dq_data_lock */
@@ -1214,7 +1219,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1214 *warntype = QUOTA_NL_NOWARN; 1219 *warntype = QUOTA_NL_NOWARN;
1215 if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) || 1220 if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) ||
1216 test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1221 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1217 return QUOTA_OK; 1222 return 0;
1218 1223
1219 tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace 1224 tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace
1220 + space; 1225 + space;
@@ -1224,7 +1229,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1224 !ignore_hardlimit(dquot)) { 1229 !ignore_hardlimit(dquot)) {
1225 if (!prealloc) 1230 if (!prealloc)
1226 *warntype = QUOTA_NL_BHARDWARN; 1231 *warntype = QUOTA_NL_BHARDWARN;
1227 return NO_QUOTA; 1232 return -EDQUOT;
1228 } 1233 }
1229 1234
1230 if (dquot->dq_dqb.dqb_bsoftlimit && 1235 if (dquot->dq_dqb.dqb_bsoftlimit &&
@@ -1234,7 +1239,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1234 !ignore_hardlimit(dquot)) { 1239 !ignore_hardlimit(dquot)) {
1235 if (!prealloc) 1240 if (!prealloc)
1236 *warntype = QUOTA_NL_BSOFTLONGWARN; 1241 *warntype = QUOTA_NL_BSOFTLONGWARN;
1237 return NO_QUOTA; 1242 return -EDQUOT;
1238 } 1243 }
1239 1244
1240 if (dquot->dq_dqb.dqb_bsoftlimit && 1245 if (dquot->dq_dqb.dqb_bsoftlimit &&
@@ -1250,10 +1255,10 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1250 * We don't allow preallocation to exceed softlimit so exceeding will 1255 * We don't allow preallocation to exceed softlimit so exceeding will
1251 * be always printed 1256 * be always printed
1252 */ 1257 */
1253 return NO_QUOTA; 1258 return -EDQUOT;
1254 } 1259 }
1255 1260
1256 return QUOTA_OK; 1261 return 0;
1257} 1262}
1258 1263
1259static int info_idq_free(struct dquot *dquot, qsize_t inodes) 1264static int info_idq_free(struct dquot *dquot, qsize_t inodes)
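
The check_idq()/check_bdq() hunks above move from the private QUOTA_OK/NO_QUOTA convention to the kernel-wide one: 0 on success, a negative errno (-EDQUOT) on failure, so callers can test and propagate the value directly. A sketch (not part of the patch) of the convention:

#include <errno.h>
#include <stdio.h>
#include <string.h>

static int check_limit(unsigned long long cur, unsigned long long hard)
{
	return (hard && cur > hard) ? -EDQUOT : 0;	/* 0 == success */
}

int main(void)
{
	int ret = check_limit(10, 5);

	if (ret)	/* callers just test and propagate */
		printf("alloc failed: %s\n", strerror(-ret));
	return 0;
}
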
@@ -1287,25 +1292,32 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
1287 return QUOTA_NL_BHARDBELOW; 1292 return QUOTA_NL_BHARDBELOW;
1288 return QUOTA_NL_NOWARN; 1293 return QUOTA_NL_NOWARN;
1289} 1294}
1295
1290/* 1296/*
1291 * Initialize quota pointers in inode 1297 * Initialize quota pointers in inode
1292 * We do things in a bit complicated way but by that we avoid calling 1298 *
1293 * dqget() and thus filesystem callbacks under dqptr_sem. 1299 * We do things in a bit complicated way but by that we avoid calling
1300 * dqget() and thus filesystem callbacks under dqptr_sem.
1301 *
1302 * It is better to call this function outside of any transaction as it
1303 * might need a lot of space in journal for dquot structure allocation.
1294 */ 1304 */
1295int dquot_initialize(struct inode *inode, int type) 1305static void __dquot_initialize(struct inode *inode, int type)
1296{ 1306{
1297 unsigned int id = 0; 1307 unsigned int id = 0;
1298 int cnt, ret = 0; 1308 int cnt;
1299 struct dquot *got[MAXQUOTAS] = { NULL, NULL }; 1309 struct dquot *got[MAXQUOTAS];
1300 struct super_block *sb = inode->i_sb; 1310 struct super_block *sb = inode->i_sb;
1311 qsize_t rsv;
1301 1312
1302 /* First test before acquiring mutex - solves deadlocks when we 1313 /* First test before acquiring mutex - solves deadlocks when we
1303 * re-enter the quota code and are already holding the mutex */ 1314 * re-enter the quota code and are already holding the mutex */
1304 if (IS_NOQUOTA(inode)) 1315 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
1305 return 0; 1316 return;
1306 1317
1307 /* First get references to structures we might need. */ 1318 /* First get references to structures we might need. */
1308 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1319 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1320 got[cnt] = NULL;
1309 if (type != -1 && cnt != type) 1321 if (type != -1 && cnt != type)
1310 continue; 1322 continue;
1311 switch (cnt) { 1323 switch (cnt) {
@@ -1320,7 +1332,6 @@ int dquot_initialize(struct inode *inode, int type)
1320 } 1332 }
1321 1333
1322 down_write(&sb_dqopt(sb)->dqptr_sem); 1334 down_write(&sb_dqopt(sb)->dqptr_sem);
1323 /* Having dqptr_sem we know NOQUOTA flags can't be altered... */
1324 if (IS_NOQUOTA(inode)) 1335 if (IS_NOQUOTA(inode))
1325 goto out_err; 1336 goto out_err;
1326 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1337 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1332,21 +1343,31 @@ int dquot_initialize(struct inode *inode, int type)
1332 if (!inode->i_dquot[cnt]) { 1343 if (!inode->i_dquot[cnt]) {
1333 inode->i_dquot[cnt] = got[cnt]; 1344 inode->i_dquot[cnt] = got[cnt];
1334 got[cnt] = NULL; 1345 got[cnt] = NULL;
1346 /*
1347 * Make quota reservation system happy if someone
1348 * did a write before quota was turned on
1349 */
1350 rsv = inode_get_rsv_space(inode);
1351 if (unlikely(rsv))
1352 dquot_resv_space(inode->i_dquot[cnt], rsv);
1335 } 1353 }
1336 } 1354 }
1337out_err: 1355out_err:
1338 up_write(&sb_dqopt(sb)->dqptr_sem); 1356 up_write(&sb_dqopt(sb)->dqptr_sem);
1339 /* Drop unused references */ 1357 /* Drop unused references */
1340 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1358 dqput_all(got);
1341 dqput(got[cnt]); 1359}
1342 return ret; 1360
1361void dquot_initialize(struct inode *inode)
1362{
1363 __dquot_initialize(inode, -1);
1343} 1364}
1344EXPORT_SYMBOL(dquot_initialize); 1365EXPORT_SYMBOL(dquot_initialize);
1345 1366
1346/* 1367/*
1347 * Release all quotas referenced by inode 1368 * Release all quotas referenced by inode
1348 */ 1369 */
1349int dquot_drop(struct inode *inode) 1370static void __dquot_drop(struct inode *inode)
1350{ 1371{
1351 int cnt; 1372 int cnt;
1352 struct dquot *put[MAXQUOTAS]; 1373 struct dquot *put[MAXQUOTAS];
@@ -1357,54 +1378,128 @@ int dquot_drop(struct inode *inode)
1357 inode->i_dquot[cnt] = NULL; 1378 inode->i_dquot[cnt] = NULL;
1358 } 1379 }
1359 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1380 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1381 dqput_all(put);
1382}
1360 1383
1361 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1384void dquot_drop(struct inode *inode)
1362 dqput(put[cnt]); 1385{
1363 return 0; 1386 int cnt;
1387
1388 if (IS_NOQUOTA(inode))
1389 return;
1390
1391 /*
1392 * Test before calling to rule out calls from proc and such
1393 * where we are not allowed to block. Note that this is
 1394 * actually a reliable test even without the lock - the caller
 1395 * must ensure that nobody can come after the DQUOT_DROP and
1396 * add quota pointers back anyway.
1397 */
1398 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1399 if (inode->i_dquot[cnt])
1400 break;
1401 }
1402
1403 if (cnt < MAXQUOTAS)
1404 __dquot_drop(inode);
1364} 1405}
1365EXPORT_SYMBOL(dquot_drop); 1406EXPORT_SYMBOL(dquot_drop);
1366 1407
1367/* Wrapper to remove references to quota structures from inode */ 1408/*
1368void vfs_dq_drop(struct inode *inode) 1409 * inode_reserved_space is managed internally by quota, and protected by
1369{ 1410 * i_lock similar to i_blocks+i_bytes.
1370 /* Here we can get arbitrary inode from clear_inode() so we have 1411 */
1371 * to be careful. OTOH we don't need locking as quota operations 1412static qsize_t *inode_reserved_space(struct inode * inode)
1372 * are allowed to change only at mount time */ 1413{
 1373 if (!IS_NOQUOTA(inode) && inode->i_sb && inode->i_sb->dq_op 1414 /* Filesystem must explicitly define its own method in order to use
1374 && inode->i_sb->dq_op->drop) { 1415 * quota reservation interface */
1375 int cnt; 1416 BUG_ON(!inode->i_sb->dq_op->get_reserved_space);
1376 /* Test before calling to rule out calls from proc and such 1417 return inode->i_sb->dq_op->get_reserved_space(inode);
1377 * where we are not allowed to block. Note that this is 1418}
1378 * actually reliable test even without the lock - the caller 1419
1379 * must assure that nobody can come after the DQUOT_DROP and 1420void inode_add_rsv_space(struct inode *inode, qsize_t number)
1380 * add quota pointers back anyway */ 1421{
1381 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1422 spin_lock(&inode->i_lock);
1382 if (inode->i_dquot[cnt]) 1423 *inode_reserved_space(inode) += number;
1383 break; 1424 spin_unlock(&inode->i_lock);
1384 if (cnt < MAXQUOTAS) 1425}
1385 inode->i_sb->dq_op->drop(inode); 1426EXPORT_SYMBOL(inode_add_rsv_space);
1386 } 1427
1387} 1428void inode_claim_rsv_space(struct inode *inode, qsize_t number)
1388EXPORT_SYMBOL(vfs_dq_drop); 1429{
1430 spin_lock(&inode->i_lock);
1431 *inode_reserved_space(inode) -= number;
1432 __inode_add_bytes(inode, number);
1433 spin_unlock(&inode->i_lock);
1434}
1435EXPORT_SYMBOL(inode_claim_rsv_space);
1436
1437void inode_sub_rsv_space(struct inode *inode, qsize_t number)
1438{
1439 spin_lock(&inode->i_lock);
1440 *inode_reserved_space(inode) -= number;
1441 spin_unlock(&inode->i_lock);
1442}
1443EXPORT_SYMBOL(inode_sub_rsv_space);
1444
1445static qsize_t inode_get_rsv_space(struct inode *inode)
1446{
1447 qsize_t ret;
1448
1449 if (!inode->i_sb->dq_op->get_reserved_space)
1450 return 0;
1451 spin_lock(&inode->i_lock);
1452 ret = *inode_reserved_space(inode);
1453 spin_unlock(&inode->i_lock);
1454 return ret;
1455}
1456
1457static void inode_incr_space(struct inode *inode, qsize_t number,
1458 int reserve)
1459{
1460 if (reserve)
1461 inode_add_rsv_space(inode, number);
1462 else
1463 inode_add_bytes(inode, number);
1464}
1465
1466static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
1467{
1468 if (reserve)
1469 inode_sub_rsv_space(inode, number);
1470 else
1471 inode_sub_bytes(inode, number);
1472}
1389 1473
1390/* 1474/*
 1391 * Following four functions update i_blocks+i_bytes fields and 1475 * This function updates i_blocks+i_bytes fields and quota information
1392 * quota information (together with appropriate checks) 1476 * (together with appropriate checks).
1393 * NOTE: We absolutely rely on the fact that caller dirties 1477 *
1394 * the inode (usually macros in quotaops.h care about this) and 1478 * NOTE: We absolutely rely on the fact that caller dirties the inode
1395 * holds a handle for the current transaction so that dquot write and 1479 * (usually helpers in quotaops.h care about this) and holds a handle for
1396 * inode write go into the same transaction. 1480 * the current transaction so that dquot write and inode write go into the
1481 * same transaction.
1397 */ 1482 */
1398 1483
1399/* 1484/*
1400 * This operation can block, but only after everything is updated 1485 * This operation can block, but only after everything is updated
1401 */ 1486 */
1402int __dquot_alloc_space(struct inode *inode, qsize_t number, 1487int __dquot_alloc_space(struct inode *inode, qsize_t number,
1403 int warn, int reserve) 1488 int warn, int reserve)
1404{ 1489{
1405 int cnt, ret = QUOTA_OK; 1490 int cnt, ret = 0;
1406 char warntype[MAXQUOTAS]; 1491 char warntype[MAXQUOTAS];
1407 1492
1493 /*
1494 * First test before acquiring mutex - solves deadlocks when we
1495 * re-enter the quota code and are already holding the mutex
1496 */
1497 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
1498 inode_incr_space(inode, number, reserve);
1499 goto out;
1500 }
1501
1502 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1408 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1503 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1409 warntype[cnt] = QUOTA_NL_NOWARN; 1504 warntype[cnt] = QUOTA_NL_NOWARN;
1410 1505
@@ -1412,10 +1507,11 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
1412 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1507 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1413 if (!inode->i_dquot[cnt]) 1508 if (!inode->i_dquot[cnt])
1414 continue; 1509 continue;
1415 if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) 1510 ret = check_bdq(inode->i_dquot[cnt], number, !warn,
1416 == NO_QUOTA) { 1511 warntype+cnt);
1417 ret = NO_QUOTA; 1512 if (ret) {
1418 goto out_unlock; 1513 spin_unlock(&dq_data_lock);
1514 goto out_flush_warn;
1419 } 1515 }
1420 } 1516 }
1421 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1517 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1426,131 +1522,73 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
1426 else 1522 else
1427 dquot_incr_space(inode->i_dquot[cnt], number); 1523 dquot_incr_space(inode->i_dquot[cnt], number);
1428 } 1524 }
1429 if (!reserve) 1525 inode_incr_space(inode, number, reserve);
1430 inode_add_bytes(inode, number);
1431out_unlock:
1432 spin_unlock(&dq_data_lock); 1526 spin_unlock(&dq_data_lock);
1433 flush_warnings(inode->i_dquot, warntype);
1434 return ret;
1435}
1436
1437int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
1438{
1439 int cnt, ret = QUOTA_OK;
1440
1441 /*
1442 * First test before acquiring mutex - solves deadlocks when we
1443 * re-enter the quota code and are already holding the mutex
1444 */
1445 if (IS_NOQUOTA(inode)) {
1446 inode_add_bytes(inode, number);
1447 goto out;
1448 }
1449 1527
1450 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1528 if (reserve)
1451 if (IS_NOQUOTA(inode)) { 1529 goto out_flush_warn;
1452 inode_add_bytes(inode, number); 1530 mark_all_dquot_dirty(inode->i_dquot);
1453 goto out_unlock; 1531out_flush_warn:
1454 } 1532 flush_warnings(inode->i_dquot, warntype);
1455
1456 ret = __dquot_alloc_space(inode, number, warn, 0);
1457 if (ret == NO_QUOTA)
1458 goto out_unlock;
1459
1460 /* Dirtify all the dquots - this can block when journalling */
1461 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1462 if (inode->i_dquot[cnt])
1463 mark_dquot_dirty(inode->i_dquot[cnt]);
1464out_unlock:
1465 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1466out:
1467 return ret;
1468}
1469EXPORT_SYMBOL(dquot_alloc_space);
1470
1471int dquot_reserve_space(struct inode *inode, qsize_t number, int warn)
1472{
1473 int ret = QUOTA_OK;
1474
1475 if (IS_NOQUOTA(inode))
1476 goto out;
1477
1478 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1479 if (IS_NOQUOTA(inode))
1480 goto out_unlock;
1481
1482 ret = __dquot_alloc_space(inode, number, warn, 1);
1483out_unlock:
1484 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1533 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1485out: 1534out:
1486 return ret; 1535 return ret;
1487} 1536}
1488EXPORT_SYMBOL(dquot_reserve_space); 1537EXPORT_SYMBOL(__dquot_alloc_space);
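
Note the changed calling convention: __dquot_alloc_space() now returns 0 on
success and a negative errno on failure (presumably -EDQUOT, coming from
check_bdq()), replacing QUOTA_OK/NO_QUOTA, and the warn flag is inverted into
a no-warn argument internally. A hedged sketch of a filesystem block
allocation path using it (hypothetical caller):

	static int example_alloc_blocks(struct inode *inode, qsize_t bytes)
	{
		int ret;

		/* warn=1: emit quota warnings; reserve=0: charge space immediately */
		ret = __dquot_alloc_space(inode, bytes, 1, 0);
		if (ret)
			return ret;	/* over quota; nothing was accounted */

		/* ... allocate the blocks on disk ... */
		return 0;
	}
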
1489 1538
1490/* 1539/*
1491 * This operation can block, but only after everything is updated 1540 * This operation can block, but only after everything is updated
1492 */ 1541 */
1493int dquot_alloc_inode(const struct inode *inode, qsize_t number) 1542int dquot_alloc_inode(const struct inode *inode)
1494{ 1543{
1495 int cnt, ret = NO_QUOTA; 1544 int cnt, ret = 0;
1496 char warntype[MAXQUOTAS]; 1545 char warntype[MAXQUOTAS];
1497 1546
1498 /* First test before acquiring mutex - solves deadlocks when we 1547 /* First test before acquiring mutex - solves deadlocks when we
1499 * re-enter the quota code and are already holding the mutex */ 1548 * re-enter the quota code and are already holding the mutex */
1500 if (IS_NOQUOTA(inode)) 1549 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
1501 return QUOTA_OK; 1550 return 0;
1502 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1551 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1503 warntype[cnt] = QUOTA_NL_NOWARN; 1552 warntype[cnt] = QUOTA_NL_NOWARN;
1504 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1553 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1505 if (IS_NOQUOTA(inode)) {
1506 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1507 return QUOTA_OK;
1508 }
1509 spin_lock(&dq_data_lock); 1554 spin_lock(&dq_data_lock);
1510 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1555 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1511 if (!inode->i_dquot[cnt]) 1556 if (!inode->i_dquot[cnt])
1512 continue; 1557 continue;
1513 if (check_idq(inode->i_dquot[cnt], number, warntype+cnt) 1558 ret = check_idq(inode->i_dquot[cnt], 1, warntype + cnt);
1514 == NO_QUOTA) 1559 if (ret)
1515 goto warn_put_all; 1560 goto warn_put_all;
1516 } 1561 }
1517 1562
1518 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1563 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1519 if (!inode->i_dquot[cnt]) 1564 if (!inode->i_dquot[cnt])
1520 continue; 1565 continue;
1521 dquot_incr_inodes(inode->i_dquot[cnt], number); 1566 dquot_incr_inodes(inode->i_dquot[cnt], 1);
1522 } 1567 }
1523 ret = QUOTA_OK; 1568
1524warn_put_all: 1569warn_put_all:
1525 spin_unlock(&dq_data_lock); 1570 spin_unlock(&dq_data_lock);
1526 if (ret == QUOTA_OK) 1571 if (ret == 0)
1527 /* Dirtify all the dquots - this can block when journalling */ 1572 mark_all_dquot_dirty(inode->i_dquot);
1528 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1529 if (inode->i_dquot[cnt])
1530 mark_dquot_dirty(inode->i_dquot[cnt]);
1531 flush_warnings(inode->i_dquot, warntype); 1573 flush_warnings(inode->i_dquot, warntype);
1532 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1574 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1533 return ret; 1575 return ret;
1534} 1576}
1535EXPORT_SYMBOL(dquot_alloc_inode); 1577EXPORT_SYMBOL(dquot_alloc_inode);
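
dquot_alloc_inode() drops its number argument since every caller allocated
exactly one inode at a time; the return value likewise moves from
QUOTA_OK/NO_QUOTA to 0/errno. A sketch of an updated create path
(hypothetical caller):

	static int example_create(struct inode *inode)
	{
		int ret;

		ret = dquot_alloc_inode(inode);	/* was: dquot_alloc_inode(inode, 1) */
		if (ret)
			return ret;		/* inode quota exceeded */
		/* ... continue creating the file ... */
		return 0;
	}
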
1536 1578
1537int dquot_claim_space(struct inode *inode, qsize_t number) 1579/*
1580 * Convert in-memory reserved quotas to real consumed quotas
1581 */
1582int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
1538{ 1583{
1539 int cnt; 1584 int cnt;
1540 int ret = QUOTA_OK;
1541 1585
1542 if (IS_NOQUOTA(inode)) { 1586 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
1543 inode_add_bytes(inode, number); 1587 inode_claim_rsv_space(inode, number);
1544 goto out; 1588 return 0;
1545 } 1589 }
1546 1590
1547 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1591 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1548 if (IS_NOQUOTA(inode)) {
1549 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1550 inode_add_bytes(inode, number);
1551 goto out;
1552 }
1553
1554 spin_lock(&dq_data_lock); 1592 spin_lock(&dq_data_lock);
1555 /* Claim reserved quotas to allocated quotas */ 1593 /* Claim reserved quotas to allocated quotas */
1556 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1594 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1559,191 +1597,129 @@ int dquot_claim_space(struct inode *inode, qsize_t number)
1559 number); 1597 number);
1560 } 1598 }
1561 /* Update inode bytes */ 1599 /* Update inode bytes */
1562 inode_add_bytes(inode, number); 1600 inode_claim_rsv_space(inode, number);
1563 spin_unlock(&dq_data_lock); 1601 spin_unlock(&dq_data_lock);
1564 /* Dirtify all the dquots - this can block when journalling */ 1602 mark_all_dquot_dirty(inode->i_dquot);
1565 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1566 if (inode->i_dquot[cnt])
1567 mark_dquot_dirty(inode->i_dquot[cnt]);
1568 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1603 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1569out: 1604 return 0;
1570 return ret;
1571}
1572EXPORT_SYMBOL(dquot_claim_space);
1573
1574/*
1575 * Release reserved quota space
1576 */
1577void dquot_release_reserved_space(struct inode *inode, qsize_t number)
1578{
1579 int cnt;
1580
1581 if (IS_NOQUOTA(inode))
1582 goto out;
1583
1584 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1585 if (IS_NOQUOTA(inode))
1586 goto out_unlock;
1587
1588 spin_lock(&dq_data_lock);
1589 /* Release reserved dquots */
1590 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1591 if (inode->i_dquot[cnt])
1592 dquot_free_reserved_space(inode->i_dquot[cnt], number);
1593 }
1594 spin_unlock(&dq_data_lock);
1595
1596out_unlock:
1597 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1598out:
1599 return;
1600} 1605}
1601EXPORT_SYMBOL(dquot_release_reserved_space); 1606EXPORT_SYMBOL(dquot_claim_space_nodirty);
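
Together with the reserve flag of __dquot_alloc_space(), this gives
delayed-allocation filesystems a two-phase pattern: reserve at write time,
claim when the blocks are finally allocated. A sketch under that assumption
(hypothetical caller; the _nodirty suffix suggests the caller is expected to
dirty the inode itself):

	static int example_delalloc_write(struct inode *inode, qsize_t bytes)
	{
		int ret;

		/* Phase 1: warn=1, reserve=1 - charge only the reserved counters. */
		ret = __dquot_alloc_space(inode, bytes, 1, 1);
		if (ret)
			return ret;

		/* ... later, at writeback time, once real blocks exist ... */

		/* Phase 2: move the reservation into consumed quota and i_bytes. */
		dquot_claim_space_nodirty(inode, bytes);
		return 0;
	}
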
1602 1607
1603/* 1608/*
1604 * This operation can block, but only after everything is updated 1609 * This operation can block, but only after everything is updated
1605 */ 1610 */
1606int dquot_free_space(struct inode *inode, qsize_t number) 1611void __dquot_free_space(struct inode *inode, qsize_t number, int reserve)
1607{ 1612{
1608 unsigned int cnt; 1613 unsigned int cnt;
1609 char warntype[MAXQUOTAS]; 1614 char warntype[MAXQUOTAS];
1610 1615
1611 /* First test before acquiring mutex - solves deadlocks when we 1616 /* First test before acquiring mutex - solves deadlocks when we
1612 * re-enter the quota code and are already holding the mutex */ 1617 * re-enter the quota code and are already holding the mutex */
1613 if (IS_NOQUOTA(inode)) { 1618 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
1614out_sub: 1619 inode_decr_space(inode, number, reserve);
1615 inode_sub_bytes(inode, number); 1620 return;
1616 return QUOTA_OK;
1617 } 1621 }
1618 1622
1619 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1623 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1620 /* Now recheck reliably when holding dqptr_sem */
1621 if (IS_NOQUOTA(inode)) {
1622 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1623 goto out_sub;
1624 }
1625 spin_lock(&dq_data_lock); 1624 spin_lock(&dq_data_lock);
1626 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1625 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1627 if (!inode->i_dquot[cnt]) 1626 if (!inode->i_dquot[cnt])
1628 continue; 1627 continue;
1629 warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); 1628 warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number);
1630 dquot_decr_space(inode->i_dquot[cnt], number); 1629 if (reserve)
1630 dquot_free_reserved_space(inode->i_dquot[cnt], number);
1631 else
1632 dquot_decr_space(inode->i_dquot[cnt], number);
1631 } 1633 }
1632 inode_sub_bytes(inode, number); 1634 inode_decr_space(inode, number, reserve);
1633 spin_unlock(&dq_data_lock); 1635 spin_unlock(&dq_data_lock);
1634 /* Dirtify all the dquots - this can block when journalling */ 1636
1635 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1637 if (reserve)
1636 if (inode->i_dquot[cnt]) 1638 goto out_unlock;
1637 mark_dquot_dirty(inode->i_dquot[cnt]); 1639 mark_all_dquot_dirty(inode->i_dquot);
1640out_unlock:
1638 flush_warnings(inode->i_dquot, warntype); 1641 flush_warnings(inode->i_dquot, warntype);
1639 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1642 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1640 return QUOTA_OK;
1641} 1643}
1642EXPORT_SYMBOL(dquot_free_space); 1644EXPORT_SYMBOL(__dquot_free_space);
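
The two old entry points dquot_free_space() and
dquot_release_reserved_space() collapse into one worker distinguished by the
reserve flag; the old names presumably survive as thin wrappers (likely in
quotaops.h). A sketch of what such wrappers would look like:

	static inline void example_free_space(struct inode *inode, qsize_t nr)
	{
		__dquot_free_space(inode, nr, 0);	/* really allocated space */
	}

	static inline void example_release_reservation(struct inode *inode, qsize_t nr)
	{
		__dquot_free_space(inode, nr, 1);	/* reserved-only space */
	}
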
1643 1645
1644/* 1646/*
1645 * This operation can block, but only after everything is updated 1647 * This operation can block, but only after everything is updated
1646 */ 1648 */
1647int dquot_free_inode(const struct inode *inode, qsize_t number) 1649void dquot_free_inode(const struct inode *inode)
1648{ 1650{
1649 unsigned int cnt; 1651 unsigned int cnt;
1650 char warntype[MAXQUOTAS]; 1652 char warntype[MAXQUOTAS];
1651 1653
1652 /* First test before acquiring mutex - solves deadlocks when we 1654 /* First test before acquiring mutex - solves deadlocks when we
1653 * re-enter the quota code and are already holding the mutex */ 1655 * re-enter the quota code and are already holding the mutex */
1654 if (IS_NOQUOTA(inode)) 1656 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
1655 return QUOTA_OK; 1657 return;
1656 1658
1657 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1659 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1658 /* Now recheck reliably when holding dqptr_sem */
1659 if (IS_NOQUOTA(inode)) {
1660 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1661 return QUOTA_OK;
1662 }
1663 spin_lock(&dq_data_lock); 1660 spin_lock(&dq_data_lock);
1664 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1661 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1665 if (!inode->i_dquot[cnt]) 1662 if (!inode->i_dquot[cnt])
1666 continue; 1663 continue;
1667 warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number); 1664 warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1);
1668 dquot_decr_inodes(inode->i_dquot[cnt], number); 1665 dquot_decr_inodes(inode->i_dquot[cnt], 1);
1669 } 1666 }
1670 spin_unlock(&dq_data_lock); 1667 spin_unlock(&dq_data_lock);
1671 /* Dirtify all the dquots - this can block when journalling */ 1668 mark_all_dquot_dirty(inode->i_dquot);
1672 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1673 if (inode->i_dquot[cnt])
1674 mark_dquot_dirty(inode->i_dquot[cnt]);
1675 flush_warnings(inode->i_dquot, warntype); 1669 flush_warnings(inode->i_dquot, warntype);
1676 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1670 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1677 return QUOTA_OK;
1678} 1671}
1679EXPORT_SYMBOL(dquot_free_inode); 1672EXPORT_SYMBOL(dquot_free_inode);
1680 1673
1681/* 1674/*
1682 * call back function, get reserved quota space from underlying fs
1683 */
1684qsize_t dquot_get_reserved_space(struct inode *inode)
1685{
1686 qsize_t reserved_space = 0;
1687
1688 if (sb_any_quota_active(inode->i_sb) &&
1689 inode->i_sb->dq_op->get_reserved_space)
1690 reserved_space = inode->i_sb->dq_op->get_reserved_space(inode);
1691 return reserved_space;
1692}
1693
1694/*
1695 * Transfer the number of inodes and blocks from one diskquota to another. 1675 * Transfer the number of inodes and blocks from one diskquota to another.
1696 * 1676 *
1697 * This operation can block, but only after everything is updated 1677 * This operation can block, but only after everything is updated
1698 * A transaction must be started when entering this function. 1678 * A transaction must be started when entering this function.
1699 */ 1679 */
1700int dquot_transfer(struct inode *inode, struct iattr *iattr) 1680static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask)
1701{ 1681{
1702 qsize_t space, cur_space; 1682 qsize_t space, cur_space;
1703 qsize_t rsv_space = 0; 1683 qsize_t rsv_space = 0;
1704 struct dquot *transfer_from[MAXQUOTAS]; 1684 struct dquot *transfer_from[MAXQUOTAS];
1705 struct dquot *transfer_to[MAXQUOTAS]; 1685 struct dquot *transfer_to[MAXQUOTAS];
1706 int cnt, ret = QUOTA_OK; 1686 int cnt, ret = 0;
1707 int chuid = iattr->ia_valid & ATTR_UID && inode->i_uid != iattr->ia_uid,
1708 chgid = iattr->ia_valid & ATTR_GID && inode->i_gid != iattr->ia_gid;
1709 char warntype_to[MAXQUOTAS]; 1687 char warntype_to[MAXQUOTAS];
1710 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; 1688 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
1711 1689
1712 /* First test before acquiring mutex - solves deadlocks when we 1690 /* First test before acquiring mutex - solves deadlocks when we
1713 * re-enter the quota code and are already holding the mutex */ 1691 * re-enter the quota code and are already holding the mutex */
1714 if (IS_NOQUOTA(inode)) 1692 if (IS_NOQUOTA(inode))
1715 return QUOTA_OK; 1693 return 0;
1716 /* Initialize the arrays */ 1694 /* Initialize the arrays */
1717 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1695 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1718 transfer_from[cnt] = NULL; 1696 transfer_from[cnt] = NULL;
1719 transfer_to[cnt] = NULL; 1697 transfer_to[cnt] = NULL;
1720 warntype_to[cnt] = QUOTA_NL_NOWARN; 1698 warntype_to[cnt] = QUOTA_NL_NOWARN;
1721 } 1699 }
1722 if (chuid) 1700 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1723 transfer_to[USRQUOTA] = dqget(inode->i_sb, iattr->ia_uid, 1701 if (mask & (1 << cnt))
1724 USRQUOTA); 1702 transfer_to[cnt] = dqget(inode->i_sb, chid[cnt], cnt);
1725 if (chgid) 1703 }
1726 transfer_to[GRPQUOTA] = dqget(inode->i_sb, iattr->ia_gid,
1727 GRPQUOTA);
1728
1729 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1704 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1730 /* Now recheck reliably when holding dqptr_sem */
1731 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ 1705 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
1732 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1706 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1733 goto put_all; 1707 goto put_all;
1734 } 1708 }
1735 spin_lock(&dq_data_lock); 1709 spin_lock(&dq_data_lock);
1736 cur_space = inode_get_bytes(inode); 1710 cur_space = inode_get_bytes(inode);
1737 rsv_space = dquot_get_reserved_space(inode); 1711 rsv_space = inode_get_rsv_space(inode);
1738 space = cur_space + rsv_space; 1712 space = cur_space + rsv_space;
1739 /* Build the transfer_from list and check the limits */ 1713 /* Build the transfer_from list and check the limits */
1740 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1714 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1741 if (!transfer_to[cnt]) 1715 if (!transfer_to[cnt])
1742 continue; 1716 continue;
1743 transfer_from[cnt] = inode->i_dquot[cnt]; 1717 transfer_from[cnt] = inode->i_dquot[cnt];
1744 if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) == 1718 ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt);
1745 NO_QUOTA || check_bdq(transfer_to[cnt], space, 0, 1719 if (ret)
1746 warntype_to + cnt) == NO_QUOTA) 1720 goto over_quota;
1721 ret = check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt);
1722 if (ret)
1747 goto over_quota; 1723 goto over_quota;
1748 } 1724 }
1749 1725
@@ -1778,25 +1754,18 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1778 spin_unlock(&dq_data_lock); 1754 spin_unlock(&dq_data_lock);
1779 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1755 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1780 1756
1781 /* Dirtify all the dquots - this can block when journalling */ 1757 mark_all_dquot_dirty(transfer_from);
1782 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1758 mark_all_dquot_dirty(transfer_to);
1783 if (transfer_from[cnt]) 1759 /* The reference we got is transferred to the inode */
1784 mark_dquot_dirty(transfer_from[cnt]); 1760 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1785 if (transfer_to[cnt]) { 1761 transfer_to[cnt] = NULL;
1786 mark_dquot_dirty(transfer_to[cnt]);
1787 /* The reference we got is transferred to the inode */
1788 transfer_to[cnt] = NULL;
1789 }
1790 }
1791warn_put_all: 1762warn_put_all:
1792 flush_warnings(transfer_to, warntype_to); 1763 flush_warnings(transfer_to, warntype_to);
1793 flush_warnings(transfer_from, warntype_from_inodes); 1764 flush_warnings(transfer_from, warntype_from_inodes);
1794 flush_warnings(transfer_from, warntype_from_space); 1765 flush_warnings(transfer_from, warntype_from_space);
1795put_all: 1766put_all:
1796 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1767 dqput_all(transfer_from);
1797 dqput(transfer_from[cnt]); 1768 dqput_all(transfer_to);
1798 dqput(transfer_to[cnt]);
1799 }
1800 return ret; 1769 return ret;
1801over_quota: 1770over_quota:
1802 spin_unlock(&dq_data_lock); 1771 spin_unlock(&dq_data_lock);
@@ -1804,22 +1773,32 @@ over_quota:
1804 /* Clear dquot pointers we don't want to dqput() */ 1773 /* Clear dquot pointers we don't want to dqput() */
1805 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1774 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1806 transfer_from[cnt] = NULL; 1775 transfer_from[cnt] = NULL;
1807 ret = NO_QUOTA;
1808 goto warn_put_all; 1776 goto warn_put_all;
1809} 1777}
1810EXPORT_SYMBOL(dquot_transfer);
1811 1778
1812/* Wrapper for transferring ownership of an inode */ 1779/* Wrapper for transferring ownership of an inode for uid/gid only
1813int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) 1780 * Called from FSXXX_setattr()
1781 */
1782int dquot_transfer(struct inode *inode, struct iattr *iattr)
1814{ 1783{
1784 qid_t chid[MAXQUOTAS];
1785 unsigned long mask = 0;
1786
1787 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) {
1788 mask |= 1 << USRQUOTA;
1789 chid[USRQUOTA] = iattr->ia_uid;
1790 }
1791 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) {
1792 mask |= 1 << GRPQUOTA;
1793 chid[GRPQUOTA] = iattr->ia_gid;
1794 }
1815 if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { 1795 if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) {
1816 vfs_dq_init(inode); 1796 dquot_initialize(inode);
1817 if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) 1797 return __dquot_transfer(inode, chid, mask);
1818 return 1;
1819 } 1798 }
1820 return 0; 1799 return 0;
1821} 1800}
1822EXPORT_SYMBOL(vfs_dq_transfer); 1801EXPORT_SYMBOL(dquot_transfer);
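
Per the comment above, filesystems now call dquot_transfer() directly from
their setattr methods instead of going through the ->transfer operation. A
minimal sketch of such a call site (hypothetical filesystem; error handling
abbreviated):

	static int example_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int error;

		/* Transfer quota before the uid/gid actually change on the inode. */
		if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
		    (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
			error = dquot_transfer(inode, attr);
			if (error)
				return error;
		}
		/* ... apply the remaining attribute changes ... */
		return 0;
	}
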
1823 1802
1824/* 1803/*
1825 * Write info of quota file to disk 1804 * Write info of quota file to disk
@@ -1840,13 +1819,6 @@ EXPORT_SYMBOL(dquot_commit_info);
1840 * Definitions of diskquota operations. 1819 * Definitions of diskquota operations.
1841 */ 1820 */
1842const struct dquot_operations dquot_operations = { 1821const struct dquot_operations dquot_operations = {
1843 .initialize = dquot_initialize,
1844 .drop = dquot_drop,
1845 .alloc_space = dquot_alloc_space,
1846 .alloc_inode = dquot_alloc_inode,
1847 .free_space = dquot_free_space,
1848 .free_inode = dquot_free_inode,
1849 .transfer = dquot_transfer,
1850 .write_dquot = dquot_commit, 1822 .write_dquot = dquot_commit,
1851 .acquire_dquot = dquot_acquire, 1823 .acquire_dquot = dquot_acquire,
1852 .release_dquot = dquot_release, 1824 .release_dquot = dquot_release,
@@ -1857,6 +1829,20 @@ const struct dquot_operations dquot_operations = {
1857}; 1829};
1858 1830
1859/* 1831/*
1832 * Generic helper for ->open on filesystems supporting disk quotas.
1833 */
1834int dquot_file_open(struct inode *inode, struct file *file)
1835{
1836 int error;
1837
1838 error = generic_file_open(inode, file);
1839 if (!error && (file->f_mode & FMODE_WRITE))
1840 dquot_initialize(inode);
1841 return error;
1842}
1843EXPORT_SYMBOL(dquot_file_open);
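
A quota-aware filesystem adopts the new helper by pointing its ->open at
dquot_file_open() rather than open-coding generic_file_open() plus the dquot
initialization. A sketch (hypothetical file_operations; other methods
elided):

	static const struct file_operations example_file_ops = {
		.open	= dquot_file_open,	/* initializes dquots on writable opens */
		/* .read, .write, .fsync, ... as before */
	};
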
1844
1845/*
1860 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) 1846 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
1861 */ 1847 */
1862int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags) 1848int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
@@ -2035,11 +2021,13 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
2035 } 2021 }
2036 2022
2037 if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { 2023 if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
2038 /* As we bypass the pagecache we must now flush the inode so 2024 /* As we bypass the pagecache we must now flush all the
2039 * that we see all the changes from userspace... */ 2025 * dirty data and invalidate caches so that the kernel sees
2040 write_inode_now(inode, 1); 2026 * changes from userspace. It is not enough to just flush
2041 /* And now flush the block cache so that kernel sees the 2027 * the quota file since if blocksize < pagesize, invalidation
2042 * changes */ 2028 * of the cache could fail because of other unrelated dirty
2029 * data */
2030 sync_filesystem(sb);
2043 invalidate_bdev(sb->s_bdev); 2031 invalidate_bdev(sb->s_bdev);
2044 } 2032 }
2045 mutex_lock(&dqopt->dqonoff_mutex); 2033 mutex_lock(&dqopt->dqonoff_mutex);
@@ -2052,14 +2040,16 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
2052 /* We don't want quota and atime on quota files (deadlocks 2040 /* We don't want quota and atime on quota files (deadlocks
2053 * possible). Also nobody should write to the file - we use 2041 * possible). Also nobody should write to the file - we use
2054 * special IO operations which ignore the immutable bit. */ 2042 * special IO operations which ignore the immutable bit. */
2055 down_write(&dqopt->dqptr_sem);
2056 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); 2043 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2057 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | 2044 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
2058 S_NOQUOTA); 2045 S_NOQUOTA);
2059 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; 2046 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
2060 mutex_unlock(&inode->i_mutex); 2047 mutex_unlock(&inode->i_mutex);
2061 up_write(&dqopt->dqptr_sem); 2048 /*
2062 sb->dq_op->drop(inode); 2049 * When S_NOQUOTA is set, remove dquot references as no more
2050 * references can be added
2051 */
2052 __dquot_drop(inode);
2063 } 2053 }
2064 2054
2065 error = -EIO; 2055 error = -EIO;
@@ -2095,14 +2085,12 @@ out_file_init:
2095 iput(inode); 2085 iput(inode);
2096out_lock: 2086out_lock:
2097 if (oldflags != -1) { 2087 if (oldflags != -1) {
2098 down_write(&dqopt->dqptr_sem);
2099 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); 2088 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2100 /* Set the flags back (in the case of accidental quotaon() 2089 /* Set the flags back (in the case of accidental quotaon()
2101 * on a wrong file we don't want to mess up the flags) */ 2090 * on a wrong file we don't want to mess up the flags) */
2102 inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); 2091 inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
2103 inode->i_flags |= oldflags; 2092 inode->i_flags |= oldflags;
2104 mutex_unlock(&inode->i_mutex); 2093 mutex_unlock(&inode->i_mutex);
2105 up_write(&dqopt->dqptr_sem);
2106 } 2094 }
2107 mutex_unlock(&dqopt->dqonoff_mutex); 2095 mutex_unlock(&dqopt->dqonoff_mutex);
2108out_fmt: 2096out_fmt:
@@ -2233,7 +2221,9 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
2233 struct dentry *dentry; 2221 struct dentry *dentry;
2234 int error; 2222 int error;
2235 2223
2224 mutex_lock(&sb->s_root->d_inode->i_mutex);
2236 dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name)); 2225 dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name));
2226 mutex_unlock(&sb->s_root->d_inode->i_mutex);
2237 if (IS_ERR(dentry)) 2227 if (IS_ERR(dentry))
2238 return PTR_ERR(dentry); 2228 return PTR_ERR(dentry);
2239 2229
@@ -2473,100 +2463,89 @@ const struct quotactl_ops vfs_quotactl_ops = {
2473 2463
2474static ctl_table fs_dqstats_table[] = { 2464static ctl_table fs_dqstats_table[] = {
2475 { 2465 {
2476 .ctl_name = FS_DQ_LOOKUPS,
2477 .procname = "lookups", 2466 .procname = "lookups",
2478 .data = &dqstats.lookups, 2467 .data = &dqstats.lookups,
2479 .maxlen = sizeof(int), 2468 .maxlen = sizeof(int),
2480 .mode = 0444, 2469 .mode = 0444,
2481 .proc_handler = &proc_dointvec, 2470 .proc_handler = proc_dointvec,
2482 }, 2471 },
2483 { 2472 {
2484 .ctl_name = FS_DQ_DROPS,
2485 .procname = "drops", 2473 .procname = "drops",
2486 .data = &dqstats.drops, 2474 .data = &dqstats.drops,
2487 .maxlen = sizeof(int), 2475 .maxlen = sizeof(int),
2488 .mode = 0444, 2476 .mode = 0444,
2489 .proc_handler = &proc_dointvec, 2477 .proc_handler = proc_dointvec,
2490 }, 2478 },
2491 { 2479 {
2492 .ctl_name = FS_DQ_READS,
2493 .procname = "reads", 2480 .procname = "reads",
2494 .data = &dqstats.reads, 2481 .data = &dqstats.reads,
2495 .maxlen = sizeof(int), 2482 .maxlen = sizeof(int),
2496 .mode = 0444, 2483 .mode = 0444,
2497 .proc_handler = &proc_dointvec, 2484 .proc_handler = proc_dointvec,
2498 }, 2485 },
2499 { 2486 {
2500 .ctl_name = FS_DQ_WRITES,
2501 .procname = "writes", 2487 .procname = "writes",
2502 .data = &dqstats.writes, 2488 .data = &dqstats.writes,
2503 .maxlen = sizeof(int), 2489 .maxlen = sizeof(int),
2504 .mode = 0444, 2490 .mode = 0444,
2505 .proc_handler = &proc_dointvec, 2491 .proc_handler = proc_dointvec,
2506 }, 2492 },
2507 { 2493 {
2508 .ctl_name = FS_DQ_CACHE_HITS,
2509 .procname = "cache_hits", 2494 .procname = "cache_hits",
2510 .data = &dqstats.cache_hits, 2495 .data = &dqstats.cache_hits,
2511 .maxlen = sizeof(int), 2496 .maxlen = sizeof(int),
2512 .mode = 0444, 2497 .mode = 0444,
2513 .proc_handler = &proc_dointvec, 2498 .proc_handler = proc_dointvec,
2514 }, 2499 },
2515 { 2500 {
2516 .ctl_name = FS_DQ_ALLOCATED,
2517 .procname = "allocated_dquots", 2501 .procname = "allocated_dquots",
2518 .data = &dqstats.allocated_dquots, 2502 .data = &dqstats.allocated_dquots,
2519 .maxlen = sizeof(int), 2503 .maxlen = sizeof(int),
2520 .mode = 0444, 2504 .mode = 0444,
2521 .proc_handler = &proc_dointvec, 2505 .proc_handler = proc_dointvec,
2522 }, 2506 },
2523 { 2507 {
2524 .ctl_name = FS_DQ_FREE,
2525 .procname = "free_dquots", 2508 .procname = "free_dquots",
2526 .data = &dqstats.free_dquots, 2509 .data = &dqstats.free_dquots,
2527 .maxlen = sizeof(int), 2510 .maxlen = sizeof(int),
2528 .mode = 0444, 2511 .mode = 0444,
2529 .proc_handler = &proc_dointvec, 2512 .proc_handler = proc_dointvec,
2530 }, 2513 },
2531 { 2514 {
2532 .ctl_name = FS_DQ_SYNCS,
2533 .procname = "syncs", 2515 .procname = "syncs",
2534 .data = &dqstats.syncs, 2516 .data = &dqstats.syncs,
2535 .maxlen = sizeof(int), 2517 .maxlen = sizeof(int),
2536 .mode = 0444, 2518 .mode = 0444,
2537 .proc_handler = &proc_dointvec, 2519 .proc_handler = proc_dointvec,
2538 }, 2520 },
2539#ifdef CONFIG_PRINT_QUOTA_WARNING 2521#ifdef CONFIG_PRINT_QUOTA_WARNING
2540 { 2522 {
2541 .ctl_name = FS_DQ_WARNINGS,
2542 .procname = "warnings", 2523 .procname = "warnings",
2543 .data = &flag_print_warnings, 2524 .data = &flag_print_warnings,
2544 .maxlen = sizeof(int), 2525 .maxlen = sizeof(int),
2545 .mode = 0644, 2526 .mode = 0644,
2546 .proc_handler = &proc_dointvec, 2527 .proc_handler = proc_dointvec,
2547 }, 2528 },
2548#endif 2529#endif
2549 { .ctl_name = 0 }, 2530 { },
2550}; 2531};
2551 2532
2552static ctl_table fs_table[] = { 2533static ctl_table fs_table[] = {
2553 { 2534 {
2554 .ctl_name = FS_DQSTATS,
2555 .procname = "quota", 2535 .procname = "quota",
2556 .mode = 0555, 2536 .mode = 0555,
2557 .child = fs_dqstats_table, 2537 .child = fs_dqstats_table,
2558 }, 2538 },
2559 { .ctl_name = 0 }, 2539 { },
2560}; 2540};
2561 2541
2562static ctl_table sys_table[] = { 2542static ctl_table sys_table[] = {
2563 { 2543 {
2564 .ctl_name = CTL_FS,
2565 .procname = "fs", 2544 .procname = "fs",
2566 .mode = 0555, 2545 .mode = 0555,
2567 .child = fs_table, 2546 .child = fs_table,
2568 }, 2547 },
2569 { .ctl_name = 0 }, 2548 { },
2570}; 2549};
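
With the .ctl_name fields gone, these tables are matched by procname alone
and the terminating entries shrink to { }. Registration itself is unchanged;
a sketch of how the table tree is hooked up (as dquot_init() presumably does
elsewhere in this file):

	static struct ctl_table_header *example_sysctl_register(void)
	{
		/* exposes /proc/sys/fs/quota/{lookups,drops,reads,...} */
		return register_sysctl_table(sys_table);
	}
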
2571 2550
2572static int __init dquot_init(void) 2551static int __init dquot_init(void)
@@ -2607,12 +2586,6 @@ static int __init dquot_init(void)
2607 2586
2608 register_shrinker(&dqcache_shrinker); 2587 register_shrinker(&dqcache_shrinker);
2609 2588
2610#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
2611 if (genl_register_family(&quota_genl_family) != 0)
2612 printk(KERN_ERR
2613 "VFS: Failed to create quota netlink interface.\n");
2614#endif
2615
2616 return 0; 2589 return 0;
2617} 2590}
2618module_init(dquot_init); 2591module_init(dquot_init);
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
new file mode 100644
index 000000000000..d67908b407d9
--- /dev/null
+++ b/fs/quota/netlink.c
@@ -0,0 +1,96 @@
1
2#include <linux/cred.h>
3#include <linux/init.h>
4#include <linux/module.h>
5#include <linux/kernel.h>
6#include <linux/quotaops.h>
7#include <linux/sched.h>
8#include <linux/slab.h>
9#include <net/netlink.h>
10#include <net/genetlink.h>
11
12/* Netlink family structure for quota */
13static struct genl_family quota_genl_family = {
14 .id = GENL_ID_GENERATE,
15 .hdrsize = 0,
16 .name = "VFS_DQUOT",
17 .version = 1,
18 .maxattr = QUOTA_NL_A_MAX,
19};
20
21/**
22 * quota_send_warning - Send warning to userspace about exceeded quota
23 * @type: The quota type: USRQUOTA, GRPQUOTA,...
24 * @id: The user or group id of the quota that was exceeded
25 * @dev: The device on which the fs is mounted (sb->s_dev)
26 * @warntype: The type of the warning: QUOTA_NL_...
27 *
28 * This can be used by filesystems (including those which don't use
29 * dquot) to send a message to userspace relating to quota limits.
30 *
31 */
32
33void quota_send_warning(short type, unsigned int id, dev_t dev,
34 const char warntype)
35{
36 static atomic_t seq;
37 struct sk_buff *skb;
38 void *msg_head;
39 int ret;
40 int msg_size = 4 * nla_total_size(sizeof(u32)) +
41 2 * nla_total_size(sizeof(u64));
42
43 /* We have to allocate using GFP_NOFS as we are called from a
44 * filesystem performing write and thus further recursion into
45 * the fs to free some data could cause deadlocks. */
46 skb = genlmsg_new(msg_size, GFP_NOFS);
47 if (!skb) {
48 printk(KERN_ERR
49 "VFS: Not enough memory to send quota warning.\n");
50 return;
51 }
52 msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
53 &quota_genl_family, 0, QUOTA_NL_C_WARNING);
54 if (!msg_head) {
55 printk(KERN_ERR
56 "VFS: Cannot store netlink header in quota warning.\n");
57 goto err_out;
58 }
59 ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type);
60 if (ret)
61 goto attr_err_out;
62 ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id);
63 if (ret)
64 goto attr_err_out;
65 ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
66 if (ret)
67 goto attr_err_out;
68 ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev));
69 if (ret)
70 goto attr_err_out;
71 ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
72 if (ret)
73 goto attr_err_out;
74 ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
75 if (ret)
76 goto attr_err_out;
77 genlmsg_end(skb, msg_head);
78
79 genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
80 return;
81attr_err_out:
82 printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
83err_out:
84 kfree_skb(skb);
85}
86EXPORT_SYMBOL(quota_send_warning);
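
A filesystem that tracks quotas itself (without dquot) would call this when a
limit is crossed, for example on a hard block-limit hit. A hedged sketch with
hypothetical values:

	static void example_report_over_quota(struct super_block *sb, uid_t uid)
	{
		/* QUOTA_NL_BHARDWARN: block hard limit exceeded for this user */
		quota_send_warning(USRQUOTA, uid, sb->s_dev, QUOTA_NL_BHARDWARN);
	}
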
87
88static int __init quota_init(void)
89{
90 if (genl_register_family(&quota_genl_family) != 0)
91 printk(KERN_ERR
92 "VFS: Failed to create quota netlink interface.\n");
93 return 0;
94}
95
96module_init(quota_init);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 95c5b42384b2..95388f9b7356 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -10,7 +10,6 @@
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <asm/current.h> 11#include <asm/current.h>
12#include <asm/uaccess.h> 12#include <asm/uaccess.h>
13#include <linux/compat.h>
14#include <linux/kernel.h> 13#include <linux/kernel.h>
15#include <linux/security.h> 14#include <linux/security.h>
16#include <linux/syscalls.h> 15#include <linux/syscalls.h>
@@ -18,218 +17,205 @@
18#include <linux/capability.h> 17#include <linux/capability.h>
19#include <linux/quotaops.h> 18#include <linux/quotaops.h>
20#include <linux/types.h> 19#include <linux/types.h>
20#include <linux/writeback.h>
21 21
22/* Check validity of generic quotactl commands */ 22static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
23static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, 23 qid_t id)
24 qid_t id)
25{ 24{
26 if (type >= MAXQUOTAS)
27 return -EINVAL;
28 if (!sb && cmd != Q_SYNC)
29 return -ENODEV;
30 /* Is operation supported? */
31 if (sb && !sb->s_qcop)
32 return -ENOSYS;
33
34 switch (cmd) { 25 switch (cmd) {
35 case Q_GETFMT: 26 /* these commands do not require any special privileges */
36 break; 27 case Q_GETFMT:
37 case Q_QUOTAON: 28 case Q_SYNC:
38 if (!sb->s_qcop->quota_on) 29 case Q_GETINFO:
39 return -ENOSYS; 30 case Q_XGETQSTAT:
40 break; 31 case Q_XQUOTASYNC:
41 case Q_QUOTAOFF: 32 break;
42 if (!sb->s_qcop->quota_off) 33 /* allow querying information for dquots we "own" */
43 return -ENOSYS; 34 case Q_GETQUOTA:
44 break; 35 case Q_XGETQUOTA:
45 case Q_SETINFO: 36 if ((type == USRQUOTA && current_euid() == id) ||
46 if (!sb->s_qcop->set_info) 37 (type == GRPQUOTA && in_egroup_p(id)))
47 return -ENOSYS;
48 break;
49 case Q_GETINFO:
50 if (!sb->s_qcop->get_info)
51 return -ENOSYS;
52 break;
53 case Q_SETQUOTA:
54 if (!sb->s_qcop->set_dqblk)
55 return -ENOSYS;
56 break;
57 case Q_GETQUOTA:
58 if (!sb->s_qcop->get_dqblk)
59 return -ENOSYS;
60 break;
61 case Q_SYNC:
62 if (sb && !sb->s_qcop->quota_sync)
63 return -ENOSYS;
64 break; 38 break;
65 default: 39 /*FALLTHROUGH*/
66 return -EINVAL; 40 default:
41 if (!capable(CAP_SYS_ADMIN))
42 return -EPERM;
67 } 43 }
68 44
69 /* Is quota turned on for commands which need it? */ 45 return security_quotactl(cmd, type, id, sb);
70 switch (cmd) { 46}
71 case Q_GETFMT:
72 case Q_GETINFO:
73 case Q_SETINFO:
74 case Q_SETQUOTA:
75 case Q_GETQUOTA:
76 /* This is just an informative test so we are satisfied
77 * without the lock */
78 if (!sb_has_quota_active(sb, type))
79 return -ESRCH;
80 }
81 47
82 /* Check privileges */ 48static int quota_sync_all(int type)
83 if (cmd == Q_GETQUOTA) { 49{
84 if (((type == USRQUOTA && current_euid() != id) || 50 struct super_block *sb;
85 (type == GRPQUOTA && !in_egroup_p(id))) && 51 int ret;
86 !capable(CAP_SYS_ADMIN)) 52
87 return -EPERM; 53 if (type >= MAXQUOTAS)
54 return -EINVAL;
55 ret = security_quotactl(Q_SYNC, type, 0, NULL);
56 if (ret)
57 return ret;
58
59 spin_lock(&sb_lock);
60restart:
61 list_for_each_entry(sb, &super_blocks, s_list) {
62 if (!sb->s_qcop || !sb->s_qcop->quota_sync)
63 continue;
64
65 sb->s_count++;
66 spin_unlock(&sb_lock);
67 down_read(&sb->s_umount);
68 if (sb->s_root)
69 sb->s_qcop->quota_sync(sb, type, 1);
70 up_read(&sb->s_umount);
71 spin_lock(&sb_lock);
72 if (__put_super_and_need_restart(sb))
73 goto restart;
88 } 74 }
89 else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO) 75 spin_unlock(&sb_lock);
90 if (!capable(CAP_SYS_ADMIN))
91 return -EPERM;
92 76
93 return 0; 77 return 0;
94} 78}
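
quota_sync_all() is reached from userspace when quotactl(2) is invoked with
Q_SYNC and no device argument, as the syscall change further below shows. A
sketch of such a userspace caller (hypothetical example program):

	#include <stddef.h>
	#include <sys/quota.h>

	int main(void)
	{
		/* special == NULL selects the iterate-all-superblocks path */
		return quotactl(QCMD(Q_SYNC, USRQUOTA), NULL, 0, NULL);
	}
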
95 79
96/* Check validity of XFS Quota Manager commands */ 80static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
97static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, 81 void __user *addr)
98 qid_t id)
99{ 82{
100 if (type >= XQM_MAXQUOTAS) 83 char *pathname;
101 return -EINVAL; 84 int ret = -ENOSYS;
102 if (!sb) 85
103 return -ENODEV; 86 pathname = getname(addr);
104 if (!sb->s_qcop) 87 if (IS_ERR(pathname))
105 return -ENOSYS; 88 return PTR_ERR(pathname);
89 if (sb->s_qcop->quota_on)
90 ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0);
91 putname(pathname);
92 return ret;
93}
106 94
107 switch (cmd) { 95static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
108 case Q_XQUOTAON: 96{
109 case Q_XQUOTAOFF: 97 __u32 fmt;
110 case Q_XQUOTARM:
111 if (!sb->s_qcop->set_xstate)
112 return -ENOSYS;
113 break;
114 case Q_XGETQSTAT:
115 if (!sb->s_qcop->get_xstate)
116 return -ENOSYS;
117 break;
118 case Q_XSETQLIM:
119 if (!sb->s_qcop->set_xquota)
120 return -ENOSYS;
121 break;
122 case Q_XGETQUOTA:
123 if (!sb->s_qcop->get_xquota)
124 return -ENOSYS;
125 break;
126 case Q_XQUOTASYNC:
127 if (!sb->s_qcop->quota_sync)
128 return -ENOSYS;
129 break;
130 default:
131 return -EINVAL;
132 }
133 98
134 /* Check privileges */ 99 down_read(&sb_dqopt(sb)->dqptr_sem);
135 if (cmd == Q_XGETQUOTA) { 100 if (!sb_has_quota_active(sb, type)) {
136 if (((type == XQM_USRQUOTA && current_euid() != id) || 101 up_read(&sb_dqopt(sb)->dqptr_sem);
137 (type == XQM_GRPQUOTA && !in_egroup_p(id))) && 102 return -ESRCH;
138 !capable(CAP_SYS_ADMIN))
139 return -EPERM;
140 } else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) {
141 if (!capable(CAP_SYS_ADMIN))
142 return -EPERM;
143 } 103 }
104 fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id;
105 up_read(&sb_dqopt(sb)->dqptr_sem);
106 if (copy_to_user(addr, &fmt, sizeof(fmt)))
107 return -EFAULT;
108 return 0;
109}
110
111static int quota_getinfo(struct super_block *sb, int type, void __user *addr)
112{
113 struct if_dqinfo info;
114 int ret;
115
116 if (!sb_has_quota_active(sb, type))
117 return -ESRCH;
118 if (!sb->s_qcop->get_info)
119 return -ENOSYS;
120 ret = sb->s_qcop->get_info(sb, type, &info);
121 if (!ret && copy_to_user(addr, &info, sizeof(info)))
122 return -EFAULT;
123 return ret;
124}
125
126static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
127{
128 struct if_dqinfo info;
129
130 if (copy_from_user(&info, addr, sizeof(info)))
131 return -EFAULT;
132 if (!sb_has_quota_active(sb, type))
133 return -ESRCH;
134 if (!sb->s_qcop->set_info)
135 return -ENOSYS;
136 return sb->s_qcop->set_info(sb, type, &info);
137}
138
139static int quota_getquota(struct super_block *sb, int type, qid_t id,
140 void __user *addr)
141{
142 struct if_dqblk idq;
143 int ret;
144 144
145 if (!sb_has_quota_active(sb, type))
146 return -ESRCH;
147 if (!sb->s_qcop->get_dqblk)
148 return -ENOSYS;
149 ret = sb->s_qcop->get_dqblk(sb, type, id, &idq);
150 if (ret)
151 return ret;
152 if (copy_to_user(addr, &idq, sizeof(idq)))
153 return -EFAULT;
145 return 0; 154 return 0;
146} 155}
147 156
148static int check_quotactl_valid(struct super_block *sb, int type, int cmd, 157static int quota_setquota(struct super_block *sb, int type, qid_t id,
149 qid_t id) 158 void __user *addr)
150{ 159{
151 int error; 160 struct if_dqblk idq;
152 161
153 if (XQM_COMMAND(cmd)) 162 if (copy_from_user(&idq, addr, sizeof(idq)))
154 error = xqm_quotactl_valid(sb, type, cmd, id); 163 return -EFAULT;
155 else 164 if (!sb_has_quota_active(sb, type))
156 error = generic_quotactl_valid(sb, type, cmd, id); 165 return -ESRCH;
157 if (!error) 166 if (!sb->s_qcop->set_dqblk)
158 error = security_quotactl(cmd, type, id, sb); 167 return -ENOSYS;
159 return error; 168 return sb->s_qcop->set_dqblk(sb, type, id, &idq);
160} 169}
161 170
162#ifdef CONFIG_QUOTA 171static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr)
163void sync_quota_sb(struct super_block *sb, int type)
164{ 172{
165 int cnt; 173 __u32 flags;
166 174
167 if (!sb->s_qcop->quota_sync) 175 if (copy_from_user(&flags, addr, sizeof(flags)))
168 return; 176 return -EFAULT;
177 if (!sb->s_qcop->set_xstate)
178 return -ENOSYS;
179 return sb->s_qcop->set_xstate(sb, flags, cmd);
180}
181
182static int quota_getxstate(struct super_block *sb, void __user *addr)
183{
184 struct fs_quota_stat fqs;
185 int ret;
169 186
170 sb->s_qcop->quota_sync(sb, type); 187 if (!sb->s_qcop->get_xstate)
188 return -ENOSYS;
189 ret = sb->s_qcop->get_xstate(sb, &fqs);
190 if (!ret && copy_to_user(addr, &fqs, sizeof(fqs)))
191 return -EFAULT;
192 return ret;
193}
171 194
172 if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE) 195static int quota_setxquota(struct super_block *sb, int type, qid_t id,
173 return; 196 void __user *addr)
174 /* This is not very clever (and fast) but currently I don't know about 197{
175 * any other simple way of getting quota data to disk and we must get 198 struct fs_disk_quota fdq;
176 * them there for userspace to be visible... */
177 if (sb->s_op->sync_fs)
178 sb->s_op->sync_fs(sb, 1);
179 sync_blockdev(sb->s_bdev);
180 199
181 /* 200 if (copy_from_user(&fdq, addr, sizeof(fdq)))
182 * Now when everything is written we can discard the pagecache so 201 return -EFAULT;
183 * that userspace sees the changes. 202 if (!sb->s_qcop->set_xquota)
184 */ 203 return -ENOSYS;
185 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 204 return sb->s_qcop->set_xquota(sb, type, id, &fdq);
186 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
187 if (type != -1 && cnt != type)
188 continue;
189 if (!sb_has_quota_active(sb, cnt))
190 continue;
191 mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex,
192 I_MUTEX_QUOTA);
193 truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
194 mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex);
195 }
196 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
197} 205}
198#endif
199 206
200static void sync_dquots(int type) 207static int quota_getxquota(struct super_block *sb, int type, qid_t id,
208 void __user *addr)
201{ 209{
202 struct super_block *sb; 210 struct fs_disk_quota fdq;
203 int cnt; 211 int ret;
204 212
205 spin_lock(&sb_lock); 213 if (!sb->s_qcop->get_xquota)
206restart: 214 return -ENOSYS;
207 list_for_each_entry(sb, &super_blocks, s_list) { 215 ret = sb->s_qcop->get_xquota(sb, type, id, &fdq);
208 /* This test just improves performance so it needn't be 216 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq)))
209 * reliable... */ 217 return -EFAULT;
210 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 218 return ret;
211 if (type != -1 && type != cnt)
212 continue;
213 if (!sb_has_quota_active(sb, cnt))
214 continue;
215 if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
216 list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
217 continue;
218 break;
219 }
220 if (cnt == MAXQUOTAS)
221 continue;
222 sb->s_count++;
223 spin_unlock(&sb_lock);
224 down_read(&sb->s_umount);
225 if (sb->s_root)
226 sync_quota_sb(sb, type);
227 up_read(&sb->s_umount);
228 spin_lock(&sb_lock);
229 if (__put_super_and_need_restart(sb))
230 goto restart;
231 }
232 spin_unlock(&sb_lock);
233} 219}
234 220
235/* Copy parameters and call proper function */ 221/* Copy parameters and call proper function */
@@ -238,117 +224,55 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
238{ 224{
239 int ret; 225 int ret;
240 226
227 if (type >= (XQM_COMMAND(cmd) ? XQM_MAXQUOTAS : MAXQUOTAS))
228 return -EINVAL;
229 if (!sb->s_qcop)
230 return -ENOSYS;
231
232 ret = check_quotactl_permission(sb, type, cmd, id);
233 if (ret < 0)
234 return ret;
235
241 switch (cmd) { 236 switch (cmd) {
242 case Q_QUOTAON: { 237 case Q_QUOTAON:
243 char *pathname; 238 return quota_quotaon(sb, type, cmd, id, addr);
244 239 case Q_QUOTAOFF:
245 pathname = getname(addr); 240 if (!sb->s_qcop->quota_off)
246 if (IS_ERR(pathname)) 241 return -ENOSYS;
247 return PTR_ERR(pathname); 242 return sb->s_qcop->quota_off(sb, type, 0);
248 ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); 243 case Q_GETFMT:
249 putname(pathname); 244 return quota_getfmt(sb, type, addr);
250 return ret; 245 case Q_GETINFO:
251 } 246 return quota_getinfo(sb, type, addr);
252 case Q_QUOTAOFF: 247 case Q_SETINFO:
253 return sb->s_qcop->quota_off(sb, type, 0); 248 return quota_setinfo(sb, type, addr);
254 249 case Q_GETQUOTA:
255 case Q_GETFMT: { 250 return quota_getquota(sb, type, id, addr);
256 __u32 fmt; 251 case Q_SETQUOTA:
257 252 return quota_setquota(sb, type, id, addr);
258 down_read(&sb_dqopt(sb)->dqptr_sem); 253 case Q_SYNC:
259 if (!sb_has_quota_active(sb, type)) { 254 if (!sb->s_qcop->quota_sync)
260 up_read(&sb_dqopt(sb)->dqptr_sem); 255 return -ENOSYS;
261 return -ESRCH; 256 return sb->s_qcop->quota_sync(sb, type, 1);
262 } 257 case Q_XQUOTAON:
263 fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; 258 case Q_XQUOTAOFF:
264 up_read(&sb_dqopt(sb)->dqptr_sem); 259 case Q_XQUOTARM:
265 if (copy_to_user(addr, &fmt, sizeof(fmt))) 260 return quota_setxstate(sb, cmd, addr);
266 return -EFAULT; 261 case Q_XGETQSTAT:
267 return 0; 262 return quota_getxstate(sb, addr);
268 } 263 case Q_XSETQLIM:
269 case Q_GETINFO: { 264 return quota_setxquota(sb, type, id, addr);
270 struct if_dqinfo info; 265 case Q_XGETQUOTA:
271 266 return quota_getxquota(sb, type, id, addr);
272 ret = sb->s_qcop->get_info(sb, type, &info); 267 case Q_XQUOTASYNC:
273 if (ret) 268 /* caller already holds s_umount */
274 return ret; 269 if (sb->s_flags & MS_RDONLY)
275 if (copy_to_user(addr, &info, sizeof(info))) 270 return -EROFS;
276 return -EFAULT; 271 writeback_inodes_sb(sb);
277 return 0; 272 return 0;
278 } 273 default:
279 case Q_SETINFO: { 274 return -EINVAL;
280 struct if_dqinfo info;
281
282 if (copy_from_user(&info, addr, sizeof(info)))
283 return -EFAULT;
284 return sb->s_qcop->set_info(sb, type, &info);
285 }
286 case Q_GETQUOTA: {
287 struct if_dqblk idq;
288
289 ret = sb->s_qcop->get_dqblk(sb, type, id, &idq);
290 if (ret)
291 return ret;
292 if (copy_to_user(addr, &idq, sizeof(idq)))
293 return -EFAULT;
294 return 0;
295 }
296 case Q_SETQUOTA: {
297 struct if_dqblk idq;
298
299 if (copy_from_user(&idq, addr, sizeof(idq)))
300 return -EFAULT;
301 return sb->s_qcop->set_dqblk(sb, type, id, &idq);
302 }
303 case Q_SYNC:
304 if (sb)
305 sync_quota_sb(sb, type);
306 else
307 sync_dquots(type);
308 return 0;
309
310 case Q_XQUOTAON:
311 case Q_XQUOTAOFF:
312 case Q_XQUOTARM: {
313 __u32 flags;
314
315 if (copy_from_user(&flags, addr, sizeof(flags)))
316 return -EFAULT;
317 return sb->s_qcop->set_xstate(sb, flags, cmd);
318 }
319 case Q_XGETQSTAT: {
320 struct fs_quota_stat fqs;
321
322 if ((ret = sb->s_qcop->get_xstate(sb, &fqs)))
323 return ret;
324 if (copy_to_user(addr, &fqs, sizeof(fqs)))
325 return -EFAULT;
326 return 0;
327 }
328 case Q_XSETQLIM: {
329 struct fs_disk_quota fdq;
330
331 if (copy_from_user(&fdq, addr, sizeof(fdq)))
332 return -EFAULT;
333 return sb->s_qcop->set_xquota(sb, type, id, &fdq);
334 }
335 case Q_XGETQUOTA: {
336 struct fs_disk_quota fdq;
337
338 ret = sb->s_qcop->get_xquota(sb, type, id, &fdq);
339 if (ret)
340 return ret;
341 if (copy_to_user(addr, &fdq, sizeof(fdq)))
342 return -EFAULT;
343 return 0;
344 }
345 case Q_XQUOTASYNC:
346 return sb->s_qcop->quota_sync(sb, type);
347 /* We never reach here unless validity check is broken */
348 default:
349 BUG();
350 } 275 }
351 return 0;
352} 276}
353 277
354/* 278/*
@@ -395,133 +319,23 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
395 cmds = cmd >> SUBCMDSHIFT; 319 cmds = cmd >> SUBCMDSHIFT;
396 type = cmd & SUBCMDMASK; 320 type = cmd & SUBCMDMASK;
397 321
398 if (cmds != Q_SYNC || special) { 322 /*
399 sb = quotactl_block(special); 323 * As a special case Q_SYNC can be called without a specific device.
400 if (IS_ERR(sb)) 324 * It will iterate all superblocks that have quota enabled and call
401 return PTR_ERR(sb); 325 * the sync action on each of them.
326 */
327 if (!special) {
328 if (cmds == Q_SYNC)
329 return quota_sync_all(type);
330 return -ENODEV;
402 } 331 }
403 332
404 ret = check_quotactl_valid(sb, type, cmds, id); 333 sb = quotactl_block(special);
405 if (ret >= 0) 334 if (IS_ERR(sb))
406 ret = do_quotactl(sb, type, cmds, id, addr); 335 return PTR_ERR(sb);
407 if (sb)
408 drop_super(sb);
409 336
410 return ret; 337 ret = do_quotactl(sb, type, cmds, id, addr);
411}
412 338
413#if defined(CONFIG_COMPAT_FOR_U64_ALIGNMENT) 339 drop_super(sb);
414/*
415 * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
416 * and is necessary due to alignment problems.
417 */
418struct compat_if_dqblk {
419 compat_u64 dqb_bhardlimit;
420 compat_u64 dqb_bsoftlimit;
421 compat_u64 dqb_curspace;
422 compat_u64 dqb_ihardlimit;
423 compat_u64 dqb_isoftlimit;
424 compat_u64 dqb_curinodes;
425 compat_u64 dqb_btime;
426 compat_u64 dqb_itime;
427 compat_uint_t dqb_valid;
428};
429
430/* XFS structures */
431struct compat_fs_qfilestat {
432 compat_u64 dqb_bhardlimit;
433 compat_u64 qfs_nblks;
434 compat_uint_t qfs_nextents;
435};
436
437struct compat_fs_quota_stat {
438 __s8 qs_version;
439 __u16 qs_flags;
440 __s8 qs_pad;
441 struct compat_fs_qfilestat qs_uquota;
442 struct compat_fs_qfilestat qs_gquota;
443 compat_uint_t qs_incoredqs;
444 compat_int_t qs_btimelimit;
445 compat_int_t qs_itimelimit;
446 compat_int_t qs_rtbtimelimit;
447 __u16 qs_bwarnlimit;
448 __u16 qs_iwarnlimit;
449};
450
451asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
452 qid_t id, void __user *addr)
453{
454 unsigned int cmds;
455 struct if_dqblk __user *dqblk;
456 struct compat_if_dqblk __user *compat_dqblk;
457 struct fs_quota_stat __user *fsqstat;
458 struct compat_fs_quota_stat __user *compat_fsqstat;
459 compat_uint_t data;
460 u16 xdata;
461 long ret;
462
463 cmds = cmd >> SUBCMDSHIFT;
464
465 switch (cmds) {
466 case Q_GETQUOTA:
467 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
468 compat_dqblk = addr;
469 ret = sys_quotactl(cmd, special, id, dqblk);
470 if (ret)
471 break;
472 if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
473 get_user(data, &dqblk->dqb_valid) ||
474 put_user(data, &compat_dqblk->dqb_valid))
475 ret = -EFAULT;
476 break;
477 case Q_SETQUOTA:
478 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
479 compat_dqblk = addr;
480 ret = -EFAULT;
481 if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
482 get_user(data, &compat_dqblk->dqb_valid) ||
483 put_user(data, &dqblk->dqb_valid))
484 break;
485 ret = sys_quotactl(cmd, special, id, dqblk);
486 break;
487 case Q_XGETQSTAT:
488 fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
489 compat_fsqstat = addr;
490 ret = sys_quotactl(cmd, special, id, fsqstat);
491 if (ret)
492 break;
493 ret = -EFAULT;
494 /* Copying qs_version, qs_flags, qs_pad */
495 if (copy_in_user(compat_fsqstat, fsqstat,
496 offsetof(struct compat_fs_quota_stat, qs_uquota)))
497 break;
498 /* Copying qs_uquota */
499 if (copy_in_user(&compat_fsqstat->qs_uquota,
500 &fsqstat->qs_uquota,
501 sizeof(compat_fsqstat->qs_uquota)) ||
502 get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
503 put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
504 break;
505 /* Copying qs_gquota */
506 if (copy_in_user(&compat_fsqstat->qs_gquota,
507 &fsqstat->qs_gquota,
508 sizeof(compat_fsqstat->qs_gquota)) ||
509 get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
510 put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
511 break;
512 /* Copying the rest */
513 if (copy_in_user(&compat_fsqstat->qs_incoredqs,
514 &fsqstat->qs_incoredqs,
515 sizeof(struct compat_fs_quota_stat) -
516 offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
517 get_user(xdata, &fsqstat->qs_iwarnlimit) ||
518 put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
519 break;
520 ret = 0;
521 break;
522 default:
523 ret = sys_quotactl(cmd, special, id, addr);
524 }
525 return ret; 340 return ret;
526} 341}
527#endif
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 0edcf42b1778..2ae757e9c008 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -204,7 +204,7 @@ out:
204 return ret; 204 return ret;
205} 205}
206 206
207static struct quota_format_ops v1_format_ops = { 207static const struct quota_format_ops v1_format_ops = {
208 .check_quota_file = v1_check_quota_file, 208 .check_quota_file = v1_check_quota_file,
209 .read_file_info = v1_read_file_info, 209 .read_file_info = v1_read_file_info,
210 .write_file_info = v1_write_file_info, 210 .write_file_info = v1_write_file_info,
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index a5475fb1ae44..e3da02f4986f 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -23,14 +23,23 @@ MODULE_LICENSE("GPL");
23 23
24#define __QUOTA_V2_PARANOIA 24#define __QUOTA_V2_PARANOIA
25 25
26static void v2_mem2diskdqb(void *dp, struct dquot *dquot); 26static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot);
27static void v2_disk2memdqb(struct dquot *dquot, void *dp); 27static void v2r0_disk2memdqb(struct dquot *dquot, void *dp);
28static int v2_is_id(void *dp, struct dquot *dquot); 28static int v2r0_is_id(void *dp, struct dquot *dquot);
29 29static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot);
30static struct qtree_fmt_operations v2_qtree_ops = { 30static void v2r1_disk2memdqb(struct dquot *dquot, void *dp);
31 .mem2disk_dqblk = v2_mem2diskdqb, 31static int v2r1_is_id(void *dp, struct dquot *dquot);
32 .disk2mem_dqblk = v2_disk2memdqb, 32
33 .is_id = v2_is_id, 33static struct qtree_fmt_operations v2r0_qtree_ops = {
34 .mem2disk_dqblk = v2r0_mem2diskdqb,
35 .disk2mem_dqblk = v2r0_disk2memdqb,
36 .is_id = v2r0_is_id,
37};
38
39static struct qtree_fmt_operations v2r1_qtree_ops = {
40 .mem2disk_dqblk = v2r1_mem2diskdqb,
41 .disk2mem_dqblk = v2r1_disk2memdqb,
42 .is_id = v2r1_is_id,
34}; 43};
35 44
36#define QUOTABLOCK_BITS 10 45#define QUOTABLOCK_BITS 10
@@ -46,23 +55,33 @@ static inline qsize_t v2_qbtos(qsize_t blocks)
46 return blocks << QUOTABLOCK_BITS; 55 return blocks << QUOTABLOCK_BITS;
47} 56}
48 57
58static int v2_read_header(struct super_block *sb, int type,
59 struct v2_disk_dqheader *dqhead)
60{
61 ssize_t size;
62
63 size = sb->s_op->quota_read(sb, type, (char *)dqhead,
64 sizeof(struct v2_disk_dqheader), 0);
65 if (size != sizeof(struct v2_disk_dqheader)) {
66 printk(KERN_WARNING "quota_v2: Failed header read:"
67 " expected=%zd got=%zd\n",
68 sizeof(struct v2_disk_dqheader), size);
69 return 0;
70 }
71 return 1;
72}
73
49/* Check whether given file is really vfsv0 quotafile */ 74/* Check whether given file is really vfsv0 quotafile */
50static int v2_check_quota_file(struct super_block *sb, int type) 75static int v2_check_quota_file(struct super_block *sb, int type)
51{ 76{
52 struct v2_disk_dqheader dqhead; 77 struct v2_disk_dqheader dqhead;
53 ssize_t size;
54 static const uint quota_magics[] = V2_INITQMAGICS; 78 static const uint quota_magics[] = V2_INITQMAGICS;
55 static const uint quota_versions[] = V2_INITQVERSIONS; 79 static const uint quota_versions[] = V2_INITQVERSIONS;
56 80
57 size = sb->s_op->quota_read(sb, type, (char *)&dqhead, 81 if (!v2_read_header(sb, type, &dqhead))
58 sizeof(struct v2_disk_dqheader), 0);
59 if (size != sizeof(struct v2_disk_dqheader)) {
60 printk("quota_v2: failed read expected=%zd got=%zd\n",
61 sizeof(struct v2_disk_dqheader), size);
62 return 0; 82 return 0;
63 }
64 if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] || 83 if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] ||
65 le32_to_cpu(dqhead.dqh_version) != quota_versions[type]) 84 le32_to_cpu(dqhead.dqh_version) > quota_versions[type])
66 return 0; 85 return 0;
67 return 1; 86 return 1;
68} 87}
@@ -71,14 +90,23 @@ static int v2_check_quota_file(struct super_block *sb, int type)
71static int v2_read_file_info(struct super_block *sb, int type) 90static int v2_read_file_info(struct super_block *sb, int type)
72{ 91{
73 struct v2_disk_dqinfo dinfo; 92 struct v2_disk_dqinfo dinfo;
93 struct v2_disk_dqheader dqhead;
74 struct mem_dqinfo *info = sb_dqinfo(sb, type); 94 struct mem_dqinfo *info = sb_dqinfo(sb, type);
75 struct qtree_mem_dqinfo *qinfo; 95 struct qtree_mem_dqinfo *qinfo;
76 ssize_t size; 96 ssize_t size;
97 unsigned int version;
98
99 if (!v2_read_header(sb, type, &dqhead))
100 return -1;
101 version = le32_to_cpu(dqhead.dqh_version);
102 if ((info->dqi_fmt_id == QFMT_VFS_V0 && version != 0) ||
103 (info->dqi_fmt_id == QFMT_VFS_V1 && version != 1))
104 return -1;
77 105
78 size = sb->s_op->quota_read(sb, type, (char *)&dinfo, 106 size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
79 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); 107 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
80 if (size != sizeof(struct v2_disk_dqinfo)) { 108 if (size != sizeof(struct v2_disk_dqinfo)) {
81 printk(KERN_WARNING "Can't read info structure on device %s.\n", 109 printk(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n",
82 sb->s_id); 110 sb->s_id);
83 return -1; 111 return -1;
84 } 112 }
@@ -89,9 +117,15 @@ static int v2_read_file_info(struct super_block *sb, int type)
89 return -1; 117 return -1;
90 } 118 }
91 qinfo = info->dqi_priv; 119 qinfo = info->dqi_priv;
92 /* limits are stored as unsigned 32-bit data */ 120 if (version == 0) {
93 info->dqi_maxblimit = 0xffffffff; 121 /* limits are stored as unsigned 32-bit data */
94 info->dqi_maxilimit = 0xffffffff; 122 info->dqi_maxblimit = 0xffffffff;
123 info->dqi_maxilimit = 0xffffffff;
124 } else {
125 /* used space is stored as unsigned 64-bit value */
126 info->dqi_maxblimit = 0xffffffffffffffffULL; /* 2^64-1 */
127 info->dqi_maxilimit = 0xffffffffffffffffULL;
128 }
95 info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); 129 info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
96 info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); 130 info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
97 info->dqi_flags = le32_to_cpu(dinfo.dqi_flags); 131 info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
@@ -103,8 +137,13 @@ static int v2_read_file_info(struct super_block *sb, int type)
103 qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS; 137 qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS;
104 qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS; 138 qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS;
105 qinfo->dqi_qtree_depth = qtree_depth(qinfo); 139 qinfo->dqi_qtree_depth = qtree_depth(qinfo);
106 qinfo->dqi_entry_size = sizeof(struct v2_disk_dqblk); 140 if (version == 0) {
107 qinfo->dqi_ops = &v2_qtree_ops; 141 qinfo->dqi_entry_size = sizeof(struct v2r0_disk_dqblk);
142 qinfo->dqi_ops = &v2r0_qtree_ops;
143 } else {
144 qinfo->dqi_entry_size = sizeof(struct v2r1_disk_dqblk);
145 qinfo->dqi_ops = &v2r1_qtree_ops;
146 }
108 return 0; 147 return 0;
109} 148}
110 149
@@ -135,9 +174,9 @@ static int v2_write_file_info(struct super_block *sb, int type)
135 return 0; 174 return 0;
136} 175}
137 176
138static void v2_disk2memdqb(struct dquot *dquot, void *dp) 177static void v2r0_disk2memdqb(struct dquot *dquot, void *dp)
139{ 178{
140 struct v2_disk_dqblk *d = dp, empty; 179 struct v2r0_disk_dqblk *d = dp, empty;
141 struct mem_dqblk *m = &dquot->dq_dqb; 180 struct mem_dqblk *m = &dquot->dq_dqb;
142 181
143 m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit); 182 m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
@@ -149,15 +188,15 @@ static void v2_disk2memdqb(struct dquot *dquot, void *dp)
149 m->dqb_curspace = le64_to_cpu(d->dqb_curspace); 188 m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
150 m->dqb_btime = le64_to_cpu(d->dqb_btime); 189 m->dqb_btime = le64_to_cpu(d->dqb_btime);
151 /* We need to escape back the all-zero structure */ 190 /* We need to escape back the all-zero structure */
152 memset(&empty, 0, sizeof(struct v2_disk_dqblk)); 191 memset(&empty, 0, sizeof(struct v2r0_disk_dqblk));
153 empty.dqb_itime = cpu_to_le64(1); 192 empty.dqb_itime = cpu_to_le64(1);
154 if (!memcmp(&empty, dp, sizeof(struct v2_disk_dqblk))) 193 if (!memcmp(&empty, dp, sizeof(struct v2r0_disk_dqblk)))
155 m->dqb_itime = 0; 194 m->dqb_itime = 0;
156} 195}
157 196
158static void v2_mem2diskdqb(void *dp, struct dquot *dquot) 197static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot)
159{ 198{
160 struct v2_disk_dqblk *d = dp; 199 struct v2r0_disk_dqblk *d = dp;
161 struct mem_dqblk *m = &dquot->dq_dqb; 200 struct mem_dqblk *m = &dquot->dq_dqb;
162 struct qtree_mem_dqinfo *info = 201 struct qtree_mem_dqinfo *info =
163 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 202 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
@@ -175,9 +214,60 @@ static void v2_mem2diskdqb(void *dp, struct dquot *dquot)
175 d->dqb_itime = cpu_to_le64(1); 214 d->dqb_itime = cpu_to_le64(1);
176} 215}
177 216
178static int v2_is_id(void *dp, struct dquot *dquot) 217static int v2r0_is_id(void *dp, struct dquot *dquot)
179{ 218{
180 struct v2_disk_dqblk *d = dp; 219 struct v2r0_disk_dqblk *d = dp;
220 struct qtree_mem_dqinfo *info =
221 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
222
223 if (qtree_entry_unused(info, dp))
224 return 0;
225 return le32_to_cpu(d->dqb_id) == dquot->dq_id;
226}
227
228static void v2r1_disk2memdqb(struct dquot *dquot, void *dp)
229{
230 struct v2r1_disk_dqblk *d = dp, empty;
231 struct mem_dqblk *m = &dquot->dq_dqb;
232
233 m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit);
234 m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit);
235 m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes);
236 m->dqb_itime = le64_to_cpu(d->dqb_itime);
237 m->dqb_bhardlimit = v2_qbtos(le64_to_cpu(d->dqb_bhardlimit));
238 m->dqb_bsoftlimit = v2_qbtos(le64_to_cpu(d->dqb_bsoftlimit));
239 m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
240 m->dqb_btime = le64_to_cpu(d->dqb_btime);
241 /* We need to escape back the all-zero structure */
242 memset(&empty, 0, sizeof(struct v2r1_disk_dqblk));
243 empty.dqb_itime = cpu_to_le64(1);
244 if (!memcmp(&empty, dp, sizeof(struct v2r1_disk_dqblk)))
245 m->dqb_itime = 0;
246}
247
248static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot)
249{
250 struct v2r1_disk_dqblk *d = dp;
251 struct mem_dqblk *m = &dquot->dq_dqb;
252 struct qtree_mem_dqinfo *info =
253 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
254
255 d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
256 d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
257 d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes);
258 d->dqb_itime = cpu_to_le64(m->dqb_itime);
259 d->dqb_bhardlimit = cpu_to_le64(v2_stoqb(m->dqb_bhardlimit));
260 d->dqb_bsoftlimit = cpu_to_le64(v2_stoqb(m->dqb_bsoftlimit));
261 d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
262 d->dqb_btime = cpu_to_le64(m->dqb_btime);
263 d->dqb_id = cpu_to_le32(dquot->dq_id);
264 if (qtree_entry_unused(info, dp))
265 d->dqb_itime = cpu_to_le64(1);
266}
267
268static int v2r1_is_id(void *dp, struct dquot *dquot)
269{
270 struct v2r1_disk_dqblk *d = dp;
181 struct qtree_mem_dqinfo *info = 271 struct qtree_mem_dqinfo *info =
182 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 272 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
183 273
@@ -207,7 +297,7 @@ static int v2_free_file_info(struct super_block *sb, int type)
207 return 0; 297 return 0;
208} 298}
209 299
210static struct quota_format_ops v2_format_ops = { 300static const struct quota_format_ops v2_format_ops = {
211 .check_quota_file = v2_check_quota_file, 301 .check_quota_file = v2_check_quota_file,
212 .read_file_info = v2_read_file_info, 302 .read_file_info = v2_read_file_info,
213 .write_file_info = v2_write_file_info, 303 .write_file_info = v2_write_file_info,
@@ -217,20 +307,32 @@ static struct quota_format_ops v2_format_ops = {
217 .release_dqblk = v2_release_dquot, 307 .release_dqblk = v2_release_dquot,
218}; 308};
219 309
220static struct quota_format_type v2_quota_format = { 310static struct quota_format_type v2r0_quota_format = {
221 .qf_fmt_id = QFMT_VFS_V0, 311 .qf_fmt_id = QFMT_VFS_V0,
222 .qf_ops = &v2_format_ops, 312 .qf_ops = &v2_format_ops,
223 .qf_owner = THIS_MODULE 313 .qf_owner = THIS_MODULE
224}; 314};
225 315
316static struct quota_format_type v2r1_quota_format = {
317 .qf_fmt_id = QFMT_VFS_V1,
318 .qf_ops = &v2_format_ops,
319 .qf_owner = THIS_MODULE
320};
321
226static int __init init_v2_quota_format(void) 322static int __init init_v2_quota_format(void)
227{ 323{
228 return register_quota_format(&v2_quota_format); 324 int ret;
325
326 ret = register_quota_format(&v2r0_quota_format);
327 if (ret)
328 return ret;
329 return register_quota_format(&v2r1_quota_format);
229} 330}
230 331
231static void __exit exit_v2_quota_format(void) 332static void __exit exit_v2_quota_format(void)
232{ 333{
233 unregister_quota_format(&v2_quota_format); 334 unregister_quota_format(&v2r0_quota_format);
335 unregister_quota_format(&v2r1_quota_format);
234} 336}
235 337
236module_init(init_v2_quota_format); 338module_init(init_v2_quota_format);
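
Taken together, the quota_v2.c changes dispatch on the on-disk header version: revision 0 keeps 32-bit limits and the v2r0 entry layout, revision 1 uses 64-bit limits and the wider v2r1 layout. A hedged sketch of just that selection logic, with simplified stand-in types (pick_layout and its fields are illustrative, not kernel API):

	#include <stddef.h>
	#include <stdint.h>

	struct layout {
		size_t entry_size;	/* bytes per on-disk dquot entry */
		uint64_t max_limit;	/* largest representable limit */
	};

	static struct layout pick_layout(unsigned int version,
					 size_t r0_size, size_t r1_size)
	{
		struct layout l;

		if (version == 0) {
			l.entry_size = r0_size;			/* v2r0_disk_dqblk */
			l.max_limit = 0xffffffffULL;		/* 32-bit limits */
		} else {
			l.entry_size = r1_size;			/* v2r1_disk_dqblk */
			l.max_limit = 0xffffffffffffffffULL;	/* 64-bit limits */
		}
		return l;
	}
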
diff --git a/fs/quota/quotaio_v2.h b/fs/quota/quotaio_v2.h
index 530fe580685c..f1966b42c2fd 100644
--- a/fs/quota/quotaio_v2.h
+++ b/fs/quota/quotaio_v2.h
@@ -17,8 +17,8 @@
17} 17}
18 18
19#define V2_INITQVERSIONS {\ 19#define V2_INITQVERSIONS {\
20 0, /* USRQUOTA */\ 20 1, /* USRQUOTA */\
21 0 /* GRPQUOTA */\ 21 1 /* GRPQUOTA */\
22} 22}
23 23
24/* First generic header */ 24/* First generic header */
@@ -32,7 +32,7 @@ struct v2_disk_dqheader {
32 * (as it appears on disk) - the file is a radix tree whose leaves point 32 * (as it appears on disk) - the file is a radix tree whose leaves point
33 * to blocks of these structures. 33 * to blocks of these structures.
34 */ 34 */
35struct v2_disk_dqblk { 35struct v2r0_disk_dqblk {
36 __le32 dqb_id; /* id this quota applies to */ 36 __le32 dqb_id; /* id this quota applies to */
37 __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */ 37 __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */
38 __le32 dqb_isoftlimit; /* preferred inode limit */ 38 __le32 dqb_isoftlimit; /* preferred inode limit */
@@ -44,6 +44,19 @@ struct v2_disk_dqblk {
44 __le64 dqb_itime; /* time limit for excessive inode use */ 44 __le64 dqb_itime; /* time limit for excessive inode use */
45}; 45};
46 46
47struct v2r1_disk_dqblk {
48 __le32 dqb_id; /* id this quota applies to */
49 __le32 dqb_pad;
50 __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */
51 __le64 dqb_isoftlimit; /* preferred inode limit */
52 __le64 dqb_curinodes; /* current # allocated inodes */
53 __le64 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */
54 __le64 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */
55 __le64 dqb_curspace; /* current space occupied (in bytes) */
56 __le64 dqb_btime; /* time limit for excessive disk use */
57 __le64 dqb_itime; /* time limit for excessive inode use */
58};
59
47/* Header with type and version specific information */ 60/* Header with type and version specific information */
48struct v2_disk_dqinfo { 61struct v2_disk_dqinfo {
49 __le32 dqi_bgrace; /* Time before block soft limit becomes hard limit */ 62 __le32 dqi_bgrace; /* Time before block soft limit becomes hard limit */
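
Since v2r1_disk_dqblk is an on-disk format, its size must not depend on compiler padding: a 32-bit id plus an explicit 32-bit pad followed by eight 64-bit fields gives exactly 72 bytes. A standalone check one could compile (plain C99 integer types stand in for the kernel's __le32/__le64):

	#include <stdint.h>

	struct v2r1_disk_dqblk_sketch {
		uint32_t dqb_id;
		uint32_t dqb_pad;	/* keeps the 64-bit fields aligned */
		uint64_t dqb_ihardlimit;
		uint64_t dqb_isoftlimit;
		uint64_t dqb_curinodes;
		uint64_t dqb_bhardlimit;
		uint64_t dqb_bsoftlimit;
		uint64_t dqb_curspace;
		uint64_t dqb_btime;
		uint64_t dqb_itime;
	};

	_Static_assert(sizeof(struct v2r1_disk_dqblk_sketch) == 72,
		       "unexpected padding in on-disk quota entry");
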
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 32fae4040ebf..5ea4ad81a429 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -21,6 +21,7 @@
21#include <linux/pagevec.h> 21#include <linux/pagevec.h>
22#include <linux/mman.h> 22#include <linux/mman.h>
23#include <linux/sched.h> 23#include <linux/sched.h>
24#include <linux/slab.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26#include "internal.h" 27#include "internal.h"
@@ -60,7 +61,7 @@ const struct inode_operations ramfs_file_inode_operations = {
60 */ 61 */
61int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) 62int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
62{ 63{
63 unsigned long npages, xpages, loop, limit; 64 unsigned long npages, xpages, loop;
64 struct page *pages; 65 struct page *pages;
65 unsigned order; 66 unsigned order;
66 void *data; 67 void *data;
@@ -123,30 +124,6 @@ add_error:
123 124
124/*****************************************************************************/ 125/*****************************************************************************/
125/* 126/*
126 * check that file shrinkage doesn't leave any VMAs dangling in midair
127 */
128static int ramfs_nommu_check_mappings(struct inode *inode,
129 size_t newsize, size_t size)
130{
131 struct vm_area_struct *vma;
132 struct prio_tree_iter iter;
133
134 /* search for VMAs that fall within the dead zone */
135 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
136 newsize >> PAGE_SHIFT,
137 (size + PAGE_SIZE - 1) >> PAGE_SHIFT
138 ) {
139 /* found one - only interested if it's shared out of the page
140 * cache */
141 if (vma->vm_flags & VM_SHARED)
142 return -ETXTBSY; /* not quite true, but near enough */
143 }
144
145 return 0;
146}
147
148/*****************************************************************************/
149/*
150 * 127 *
151 */ 128 */
152static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) 129static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
@@ -164,7 +141,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
164 141
165 /* check that a decrease in size doesn't cut off any shared mappings */ 142 /* check that a decrease in size doesn't cut off any shared mappings */
166 if (newsize < size) { 143 if (newsize < size) {
167 ret = ramfs_nommu_check_mappings(inode, newsize, size); 144 ret = nommu_shrink_inode_mappings(inode, size, newsize);
168 if (ret < 0) 145 if (ret < 0)
169 return ret; 146 return ret;
170 } 147 }
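
The removed ramfs_nommu_check_mappings() is replaced by the generic nommu_shrink_inode_mappings() helper, but its intent is visible in the deleted lines: refuse to shrink past any shared mapping of the dead range. A self-contained model of that check (the vma struct below is a stand-in for the kernel's VMA tree iteration):

	#include <errno.h>

	struct vma_model {
		unsigned long start_pg, end_pg;	/* mapped page range */
		int shared;			/* VM_SHARED equivalent */
	};

	/* returns -ETXTBSY if a shared mapping overlaps [new_pages, old_pages) */
	static int check_shrink(const struct vma_model *vmas, int n,
				unsigned long new_pages, unsigned long old_pages)
	{
		int i;

		for (i = 0; i < n; i++) {
			const struct vma_model *v = &vmas[i];

			/* only shared mappings pin pages out of the page cache */
			if (v->shared && v->start_pg < old_pages &&
			    v->end_pg > new_pages)
				return -ETXTBSY; /* not quite true, but near enough */
		}
		return 0;
	}
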
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a6090aa1a7c1..c94853473ca9 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -35,6 +35,7 @@
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/parser.h> 36#include <linux/parser.h>
37#include <linux/magic.h> 37#include <linux/magic.h>
38#include <linux/slab.h>
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
39#include "internal.h" 40#include "internal.h"
40 41
diff --git a/fs/read_write.c b/fs/read_write.c
index 3ac28987f22a..113386d6fd2d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -258,6 +258,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
258 init_sync_kiocb(&kiocb, filp); 258 init_sync_kiocb(&kiocb, filp);
259 kiocb.ki_pos = *ppos; 259 kiocb.ki_pos = *ppos;
260 kiocb.ki_left = len; 260 kiocb.ki_left = len;
261 kiocb.ki_nbytes = len;
261 262
262 for (;;) { 263 for (;;) {
263 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); 264 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
@@ -313,6 +314,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
313 init_sync_kiocb(&kiocb, filp); 314 init_sync_kiocb(&kiocb, filp);
314 kiocb.ki_pos = *ppos; 315 kiocb.ki_pos = *ppos;
315 kiocb.ki_left = len; 316 kiocb.ki_left = len;
317 kiocb.ki_nbytes = len;
316 318
317 for (;;) { 319 for (;;) {
318 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); 320 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
@@ -826,8 +828,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
826 if (!(out_file->f_mode & FMODE_WRITE)) 828 if (!(out_file->f_mode & FMODE_WRITE))
827 goto fput_out; 829 goto fput_out;
828 retval = -EINVAL; 830 retval = -EINVAL;
829 if (!out_file->f_op || !out_file->f_op->sendpage)
830 goto fput_out;
831 in_inode = in_file->f_path.dentry->d_inode; 831 in_inode = in_file->f_path.dentry->d_inode;
832 out_inode = out_file->f_path.dentry->d_inode; 832 out_inode = out_file->f_path.dentry->d_inode;
833 retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); 833 retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
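
The do_sync_read/do_sync_write hunks set ki_nbytes alongside ki_left when building the synchronous kiocb, since parts of the aio path consult ki_nbytes rather than ki_left. A minimal model of keeping the two fields consistent (the struct mirrors only the relevant kiocb fields and is not the real one):

	#include <stddef.h>

	struct sync_kiocb_model {
		long long ki_pos;	/* file position */
		size_t ki_left;		/* bytes remaining */
		size_t ki_nbytes;	/* total bytes for this iocb */
	};

	static void init_model_kiocb(struct sync_kiocb_model *kiocb,
				     long long pos, size_t len)
	{
		kiocb->ki_pos = pos;
		kiocb->ki_left = len;
		kiocb->ki_nbytes = len;	/* must agree with ki_left at setup */
	}
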
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index 7c5ab6330dd6..792b3cb2cd18 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -7,7 +7,11 @@ obj-$(CONFIG_REISERFS_FS) += reiserfs.o
7reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \ 7reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
8 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \ 8 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
9 hashes.o tail_conversion.o journal.o resize.o \ 9 hashes.o tail_conversion.o journal.o resize.o \
10 item_ops.o ioctl.o procfs.o xattr.o 10 item_ops.o ioctl.o xattr.o lock.o
11
12ifeq ($(CONFIG_REISERFS_PROC_INFO),y)
13reiserfs-objs += procfs.o
14endif
11 15
12ifeq ($(CONFIG_REISERFS_FS_XATTR),y) 16ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
13reiserfs-objs += xattr_user.o xattr_trusted.o 17reiserfs-objs += xattr_user.o xattr_trusted.o
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index e716161ab325..483442e66ed6 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -169,7 +169,7 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
169 return 0; // No free blocks in this bitmap 169 return 0; // No free blocks in this bitmap
170 } 170 }
171 171
172 /* search for a first zero bit -- beggining of a window */ 172 /* search for a first zero bit -- beginning of a window */
173 *beg = reiserfs_find_next_zero_le_bit 173 *beg = reiserfs_find_next_zero_le_bit
174 ((unsigned long *)(bh->b_data), boundary, *beg); 174 ((unsigned long *)(bh->b_data), boundary, *beg);
175 175
@@ -425,7 +425,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
425 425
426 journal_mark_dirty(th, s, sbh); 426 journal_mark_dirty(th, s, sbh);
427 if (for_unformatted) 427 if (for_unformatted)
428 vfs_dq_free_block_nodirty(inode, 1); 428 dquot_free_block_nodirty(inode, 1);
429} 429}
430 430
431void reiserfs_free_block(struct reiserfs_transaction_handle *th, 431void reiserfs_free_block(struct reiserfs_transaction_handle *th,
@@ -1049,7 +1049,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1049 amount_needed, hint->inode->i_uid); 1049 amount_needed, hint->inode->i_uid);
1050#endif 1050#endif
1051 quota_ret = 1051 quota_ret =
1052 vfs_dq_alloc_block_nodirty(hint->inode, amount_needed); 1052 dquot_alloc_block_nodirty(hint->inode, amount_needed);
1053 if (quota_ret) /* Quota exceeded? */ 1053 if (quota_ret) /* Quota exceeded? */
1054 return QUOTA_EXCEEDED; 1054 return QUOTA_EXCEEDED;
1055 if (hint->preallocate && hint->prealloc_size) { 1055 if (hint->preallocate && hint->prealloc_size) {
@@ -1058,7 +1058,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1058 "reiserquota: allocating (prealloc) %d blocks id=%u", 1058 "reiserquota: allocating (prealloc) %d blocks id=%u",
1059 hint->prealloc_size, hint->inode->i_uid); 1059 hint->prealloc_size, hint->inode->i_uid);
1060#endif 1060#endif
1061 quota_ret = vfs_dq_prealloc_block_nodirty(hint->inode, 1061 quota_ret = dquot_prealloc_block_nodirty(hint->inode,
1062 hint->prealloc_size); 1062 hint->prealloc_size);
1063 if (quota_ret) 1063 if (quota_ret)
1064 hint->preallocate = hint->prealloc_size = 0; 1064 hint->preallocate = hint->prealloc_size = 0;
@@ -1092,7 +1092,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1092 hint->inode->i_uid); 1092 hint->inode->i_uid);
1093#endif 1093#endif
1094 /* Free not allocated blocks */ 1094 /* Free not allocated blocks */
1095 vfs_dq_free_block_nodirty(hint->inode, 1095 dquot_free_block_nodirty(hint->inode,
1096 amount_needed + hint->prealloc_size - 1096 amount_needed + hint->prealloc_size -
1097 nr_allocated); 1097 nr_allocated);
1098 } 1098 }
@@ -1125,7 +1125,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1125 REISERFS_I(hint->inode)->i_prealloc_count, 1125 REISERFS_I(hint->inode)->i_prealloc_count,
1126 hint->inode->i_uid); 1126 hint->inode->i_uid);
1127#endif 1127#endif
1128 vfs_dq_free_block_nodirty(hint->inode, amount_needed + 1128 dquot_free_block_nodirty(hint->inode, amount_needed +
1129 hint->prealloc_size - nr_allocated - 1129 hint->prealloc_size - nr_allocated -
1130 REISERFS_I(hint->inode)-> 1130 REISERFS_I(hint->inode)->
1131 i_prealloc_count); 1131 i_prealloc_count);
@@ -1249,14 +1249,18 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
1249 else if (bitmap == 0) 1249 else if (bitmap == 0)
1250 block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1; 1250 block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
1251 1251
1252 reiserfs_write_unlock(sb);
1252 bh = sb_bread(sb, block); 1253 bh = sb_bread(sb, block);
1254 reiserfs_write_lock(sb);
1253 if (bh == NULL) 1255 if (bh == NULL)
1254 reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) " 1256 reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
1255 "reading failed", __func__, block); 1257 "reading failed", __func__, block);
1256 else { 1258 else {
1257 if (buffer_locked(bh)) { 1259 if (buffer_locked(bh)) {
1258 PROC_INFO_INC(sb, scan_bitmap.wait); 1260 PROC_INFO_INC(sb, scan_bitmap.wait);
1261 reiserfs_write_unlock(sb);
1259 __wait_on_buffer(bh); 1262 __wait_on_buffer(bh);
1263 reiserfs_write_lock(sb);
1260 } 1264 }
1261 BUG_ON(!buffer_uptodate(bh)); 1265 BUG_ON(!buffer_uptodate(bh));
1262 BUG_ON(atomic_read(&bh->b_count) == 0); 1266 BUG_ON(atomic_read(&bh->b_count) == 0);
@@ -1273,7 +1277,10 @@ int reiserfs_init_bitmap_cache(struct super_block *sb)
1273 struct reiserfs_bitmap_info *bitmap; 1277 struct reiserfs_bitmap_info *bitmap;
1274 unsigned int bmap_nr = reiserfs_bmap_count(sb); 1278 unsigned int bmap_nr = reiserfs_bmap_count(sb);
1275 1279
1280 /* Avoid lock recursion in fault case */
1281 reiserfs_write_unlock(sb);
1276 bitmap = vmalloc(sizeof(*bitmap) * bmap_nr); 1282 bitmap = vmalloc(sizeof(*bitmap) * bmap_nr);
1283 reiserfs_write_lock(sb);
1277 if (bitmap == NULL) 1284 if (bitmap == NULL)
1278 return -ENOMEM; 1285 return -ENOMEM;
1279 1286
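
The recurring shape in the bitmap.c hunks is: drop the per-superblock write lock before anything that can sleep (sb_bread, __wait_on_buffer, vmalloc), then retake it afterwards, so other tasks are not stalled behind disk I/O. A reduced model of the bracket (all names below are stand-ins, not reiserfs API):

	struct sb_model { int locked; };

	static void write_lock_model(struct sb_model *sb)   { sb->locked = 1; }
	static void write_unlock_model(struct sb_model *sb) { sb->locked = 0; }

	static void blocking_read_model(struct sb_model *sb, unsigned long block)
	{
		(void)sb; (void)block;	/* pretend to sleep on disk I/O */
	}

	static void read_block_unlocked(struct sb_model *sb, unsigned long block)
	{
		write_unlock_model(sb);		/* reiserfs_write_unlock(sb) */
		blocking_read_model(sb, block);	/* sb_bread() may sleep */
		write_lock_model(sb);		/* reiserfs_write_lock(sb) */
	}
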
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 6d2668fdc384..f8a6075abf50 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -8,6 +8,7 @@
8#include <linux/reiserfs_fs.h> 8#include <linux/reiserfs_fs.h>
9#include <linux/stat.h> 9#include <linux/stat.h>
10#include <linux/buffer_head.h> 10#include <linux/buffer_head.h>
11#include <linux/slab.h>
11#include <asm/uaccess.h> 12#include <asm/uaccess.h>
12 13
13extern const struct reiserfs_key MIN_KEY; 14extern const struct reiserfs_key MIN_KEY;
@@ -20,7 +21,7 @@ const struct file_operations reiserfs_dir_operations = {
20 .read = generic_read_dir, 21 .read = generic_read_dir,
21 .readdir = reiserfs_readdir, 22 .readdir = reiserfs_readdir,
22 .fsync = reiserfs_dir_fsync, 23 .fsync = reiserfs_dir_fsync,
23 .ioctl = reiserfs_ioctl, 24 .unlocked_ioctl = reiserfs_ioctl,
24#ifdef CONFIG_COMPAT 25#ifdef CONFIG_COMPAT
25 .compat_ioctl = reiserfs_compat_ioctl, 26 .compat_ioctl = reiserfs_compat_ioctl,
26#endif 27#endif
@@ -174,14 +175,22 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
174 // user space buffer is swapped out. At that time 175 // user space buffer is swapped out. At that time
175 // entry can move somewhere else 176 // entry can move somewhere else
176 memcpy(local_buf, d_name, d_reclen); 177 memcpy(local_buf, d_name, d_reclen);
178
179 /*
180 * Since filldir might sleep, we can release
181 * the write lock here for other waiters
182 */
183 reiserfs_write_unlock(inode->i_sb);
177 if (filldir 184 if (filldir
178 (dirent, local_buf, d_reclen, d_off, d_ino, 185 (dirent, local_buf, d_reclen, d_off, d_ino,
179 DT_UNKNOWN) < 0) { 186 DT_UNKNOWN) < 0) {
187 reiserfs_write_lock(inode->i_sb);
180 if (local_buf != small_buf) { 188 if (local_buf != small_buf) {
181 kfree(local_buf); 189 kfree(local_buf);
182 } 190 }
183 goto end; 191 goto end;
184 } 192 }
193 reiserfs_write_lock(inode->i_sb);
185 if (local_buf != small_buf) { 194 if (local_buf != small_buf) {
186 kfree(local_buf); 195 kfree(local_buf);
187 } 196 }
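
The readdir change applies the same discipline to a callback: filldir may sleep copying the entry to userspace, so the write lock is released across the call and retaken on both the error and the normal path. A compact standalone model of that control flow (lock helpers and the callback type are illustrative):

	struct dir_sb_model { int locked; };

	static void dir_lock(struct dir_sb_model *sb)   { sb->locked = 1; }
	static void dir_unlock(struct dir_sb_model *sb) { sb->locked = 0; }

	static int emit_entry(struct dir_sb_model *sb,
			      int (*filldir_cb)(const char *name),
			      const char *name)
	{
		int ret;

		dir_unlock(sb);		/* callback may sleep */
		ret = filldir_cb(name);
		dir_lock(sb);		/* retaken before either return path */
		return ret;		/* caller frees its buffer either way */
	}
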
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 128d3f7c8aa5..60c080440661 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -21,14 +21,6 @@
21#include <linux/buffer_head.h> 21#include <linux/buffer_head.h>
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23 23
24#ifdef CONFIG_REISERFS_CHECK
25
26struct tree_balance *cur_tb = NULL; /* detects whether more than one
27 copy of tb exists as a means
28 of checking whether schedule
29 is interrupting do_balance */
30#endif
31
32static inline void buffer_info_init_left(struct tree_balance *tb, 24static inline void buffer_info_init_left(struct tree_balance *tb,
33 struct buffer_info *bi) 25 struct buffer_info *bi)
34{ 26{
@@ -1840,11 +1832,12 @@ static int check_before_balancing(struct tree_balance *tb)
1840{ 1832{
1841 int retval = 0; 1833 int retval = 0;
1842 1834
1843 if (cur_tb) { 1835 if (REISERFS_SB(tb->tb_sb)->cur_tb) {
1844 reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule " 1836 reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule "
1845 "occurred based on cur_tb not being null at " 1837 "occurred based on cur_tb not being null at "
1846 "this point in code. do_balance cannot properly " 1838 "this point in code. do_balance cannot properly "
1847 "handle schedule occurring while it runs."); 1839 "handle concurrent tree accesses on a same "
1840 "mount point.");
1848 } 1841 }
1849 1842
1850 /* double check that buffers that we will modify are unlocked. (fix_nodes should already have 1843 /* double check that buffers that we will modify are unlocked. (fix_nodes should already have
@@ -1986,7 +1979,7 @@ static inline void do_balance_starts(struct tree_balance *tb)
1986 "check");*/ 1979 "check");*/
1987 RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB"); 1980 RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB");
1988#ifdef CONFIG_REISERFS_CHECK 1981#ifdef CONFIG_REISERFS_CHECK
1989 cur_tb = tb; 1982 REISERFS_SB(tb->tb_sb)->cur_tb = tb;
1990#endif 1983#endif
1991} 1984}
1992 1985
@@ -1996,7 +1989,7 @@ static inline void do_balance_completed(struct tree_balance *tb)
1996#ifdef CONFIG_REISERFS_CHECK 1989#ifdef CONFIG_REISERFS_CHECK
1997 check_leaf_level(tb); 1990 check_leaf_level(tb);
1998 check_internal_levels(tb); 1991 check_internal_levels(tb);
1999 cur_tb = NULL; 1992 REISERFS_SB(tb->tb_sb)->cur_tb = NULL;
2000#endif 1993#endif
2001 1994
2002 /* reiserfs_free_block is no longer schedule safe. So, we need to 1995 /* reiserfs_free_block is no longer schedule safe. So, we need to
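
The do_balan.c change retires the file-scope cur_tb debug pointer in favour of a field in the per-superblock info, so two mounts balancing concurrently no longer trip each other's schedule check. A reduced model of the scoping change (struct names are stand-ins):

	#include <stddef.h>

	struct tb_model;	/* stands in for struct tree_balance */

	struct sb_info_model {
		/* non-NULL while a balance runs on this superblock */
		struct tb_model *cur_tb;
	};

	static void balance_starts(struct sb_info_model *sbi, struct tb_model *tb)
	{
		sbi->cur_tb = tb;	/* was: a single global for all mounts */
	}

	static void balance_completed(struct sb_info_model *sbi)
	{
		sbi->cur_tb = NULL;
	}
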
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 9f436668b7f8..1d9c12714c5c 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -284,12 +284,12 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t
284const struct file_operations reiserfs_file_operations = { 284const struct file_operations reiserfs_file_operations = {
285 .read = do_sync_read, 285 .read = do_sync_read,
286 .write = reiserfs_file_write, 286 .write = reiserfs_file_write,
287 .ioctl = reiserfs_ioctl, 287 .unlocked_ioctl = reiserfs_ioctl,
288#ifdef CONFIG_COMPAT 288#ifdef CONFIG_COMPAT
289 .compat_ioctl = reiserfs_compat_ioctl, 289 .compat_ioctl = reiserfs_compat_ioctl,
290#endif 290#endif
291 .mmap = reiserfs_file_mmap, 291 .mmap = reiserfs_file_mmap,
292 .open = generic_file_open, 292 .open = dquot_file_open,
293 .release = reiserfs_file_release, 293 .release = reiserfs_file_release,
294 .fsync = reiserfs_sync_file, 294 .fsync = reiserfs_sync_file,
295 .aio_read = generic_file_aio_read, 295 .aio_read = generic_file_aio_read,
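
Both file_operations hunks move reiserfs_ioctl from the BKL-protected .ioctl slot to .unlocked_ioctl, whose prototype drops the explicit inode argument and returns long; the handler recovers the inode from the file and takes the filesystem's own lock, as the ioctl.c hunk further down shows. A sketch of the new shape (model types, not the kernel's):

	struct inode_model;
	struct file_model { struct inode_model *inode; };

	/* old: int (*ioctl)(struct inode *, struct file *, unsigned int, unsigned long) */
	static long ioctl_unlocked(struct file_model *filp, unsigned int cmd,
				   unsigned long arg)
	{
		struct inode_model *inode = filp->inode; /* no longer passed in */

		(void)inode; (void)cmd; (void)arg;
		/* take the subsystem's own lock here instead of the BKL */
		return 0;
	}
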
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 5e5a4e6fbaf8..1e4250bc3a6f 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -35,6 +35,7 @@
35 **/ 35 **/
36 36
37#include <linux/time.h> 37#include <linux/time.h>
38#include <linux/slab.h>
38#include <linux/string.h> 39#include <linux/string.h>
39#include <linux/reiserfs_fs.h> 40#include <linux/reiserfs_fs.h>
40#include <linux/buffer_head.h> 41#include <linux/buffer_head.h>
@@ -563,9 +564,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
563 return needed_nodes; 564 return needed_nodes;
564} 565}
565 566
566#ifdef CONFIG_REISERFS_CHECK
567extern struct tree_balance *cur_tb;
568#endif
569 567
570/* Set parameters for balancing. 568/* Set parameters for balancing.
571 * Performs write of results of analysis of balancing into structure tb, 569 * Performs write of results of analysis of balancing into structure tb,
@@ -834,7 +832,7 @@ static int get_empty_nodes(struct tree_balance *tb, int h)
834 RFALSE(buffer_dirty(new_bh) || 832 RFALSE(buffer_dirty(new_bh) ||
835 buffer_journaled(new_bh) || 833 buffer_journaled(new_bh) ||
836 buffer_journal_dirty(new_bh), 834 buffer_journal_dirty(new_bh),
837 "PAP-8140: journlaled or dirty buffer %b for the new block", 835 "PAP-8140: journaled or dirty buffer %b for the new block",
838 new_bh); 836 new_bh);
839 837
840 /* Put empty buffers into the array. */ 838 /* Put empty buffers into the array. */
@@ -1022,7 +1020,11 @@ static int get_far_parent(struct tree_balance *tb,
1022 /* Check whether the common parent is locked. */ 1020 /* Check whether the common parent is locked. */
1023 1021
1024 if (buffer_locked(*pcom_father)) { 1022 if (buffer_locked(*pcom_father)) {
1023
1024 /* Release the write lock while the buffer is busy */
1025 reiserfs_write_unlock(tb->tb_sb);
1025 __wait_on_buffer(*pcom_father); 1026 __wait_on_buffer(*pcom_father);
1027 reiserfs_write_lock(tb->tb_sb);
1026 if (FILESYSTEM_CHANGED_TB(tb)) { 1028 if (FILESYSTEM_CHANGED_TB(tb)) {
1027 brelse(*pcom_father); 1029 brelse(*pcom_father);
1028 return REPEAT_SEARCH; 1030 return REPEAT_SEARCH;
@@ -1927,7 +1929,9 @@ static int get_direct_parent(struct tree_balance *tb, int h)
1927 return REPEAT_SEARCH; 1929 return REPEAT_SEARCH;
1928 1930
1929 if (buffer_locked(bh)) { 1931 if (buffer_locked(bh)) {
1932 reiserfs_write_unlock(tb->tb_sb);
1930 __wait_on_buffer(bh); 1933 __wait_on_buffer(bh);
1934 reiserfs_write_lock(tb->tb_sb);
1931 if (FILESYSTEM_CHANGED_TB(tb)) 1935 if (FILESYSTEM_CHANGED_TB(tb))
1932 return REPEAT_SEARCH; 1936 return REPEAT_SEARCH;
1933 } 1937 }
@@ -1965,7 +1969,9 @@ static int get_neighbors(struct tree_balance *tb, int h)
1965 tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb-> 1969 tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
1966 FL[h]); 1970 FL[h]);
1967 son_number = B_N_CHILD_NUM(tb->FL[h], child_position); 1971 son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
1972 reiserfs_write_unlock(sb);
1968 bh = sb_bread(sb, son_number); 1973 bh = sb_bread(sb, son_number);
1974 reiserfs_write_lock(sb);
1969 if (!bh) 1975 if (!bh)
1970 return IO_ERROR; 1976 return IO_ERROR;
1971 if (FILESYSTEM_CHANGED_TB(tb)) { 1977 if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -2003,7 +2009,9 @@ static int get_neighbors(struct tree_balance *tb, int h)
2003 child_position = 2009 child_position =
2004 (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0; 2010 (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
2005 son_number = B_N_CHILD_NUM(tb->FR[h], child_position); 2011 son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
2012 reiserfs_write_unlock(sb);
2006 bh = sb_bread(sb, son_number); 2013 bh = sb_bread(sb, son_number);
2014 reiserfs_write_lock(sb);
2007 if (!bh) 2015 if (!bh)
2008 return IO_ERROR; 2016 return IO_ERROR;
2009 if (FILESYSTEM_CHANGED_TB(tb)) { 2017 if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -2278,7 +2286,9 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
2278 REPEAT_SEARCH : CARRY_ON; 2286 REPEAT_SEARCH : CARRY_ON;
2279 } 2287 }
2280#endif 2288#endif
2289 reiserfs_write_unlock(tb->tb_sb);
2281 __wait_on_buffer(locked); 2290 __wait_on_buffer(locked);
2291 reiserfs_write_lock(tb->tb_sb);
2282 if (FILESYSTEM_CHANGED_TB(tb)) 2292 if (FILESYSTEM_CHANGED_TB(tb))
2283 return REPEAT_SEARCH; 2293 return REPEAT_SEARCH;
2284 } 2294 }
@@ -2349,12 +2359,14 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
2349 2359
2350 /* if it is possible in indirect_to_direct conversion */ 2360 /* if it is possible in indirect_to_direct conversion */
2351 if (buffer_locked(tbS0)) { 2361 if (buffer_locked(tbS0)) {
2362 reiserfs_write_unlock(tb->tb_sb);
2352 __wait_on_buffer(tbS0); 2363 __wait_on_buffer(tbS0);
2364 reiserfs_write_lock(tb->tb_sb);
2353 if (FILESYSTEM_CHANGED_TB(tb)) 2365 if (FILESYSTEM_CHANGED_TB(tb))
2354 return REPEAT_SEARCH; 2366 return REPEAT_SEARCH;
2355 } 2367 }
2356#ifdef CONFIG_REISERFS_CHECK 2368#ifdef CONFIG_REISERFS_CHECK
2357 if (cur_tb) { 2369 if (REISERFS_SB(tb->tb_sb)->cur_tb) {
2358 print_cur_tb("fix_nodes"); 2370 print_cur_tb("fix_nodes");
2359 reiserfs_panic(tb->tb_sb, "PAP-8305", 2371 reiserfs_panic(tb->tb_sb, "PAP-8305",
2360 "there is pending do_balance"); 2372 "there is pending do_balance");
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a14d6cd9eeda..dc2c65e04853 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -11,6 +11,7 @@
11#include <linux/smp_lock.h> 11#include <linux/smp_lock.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/highmem.h> 13#include <linux/highmem.h>
14#include <linux/slab.h>
14#include <asm/uaccess.h> 15#include <asm/uaccess.h>
15#include <asm/unaligned.h> 16#include <asm/unaligned.h>
16#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
@@ -31,11 +32,15 @@ void reiserfs_delete_inode(struct inode *inode)
31 JOURNAL_PER_BALANCE_CNT * 2 + 32 JOURNAL_PER_BALANCE_CNT * 2 +
32 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); 33 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
33 struct reiserfs_transaction_handle th; 34 struct reiserfs_transaction_handle th;
35 int depth;
34 int err; 36 int err;
35 37
38 if (!is_bad_inode(inode))
39 dquot_initialize(inode);
40
36 truncate_inode_pages(&inode->i_data, 0); 41 truncate_inode_pages(&inode->i_data, 0);
37 42
38 reiserfs_write_lock(inode->i_sb); 43 depth = reiserfs_write_lock_once(inode->i_sb);
39 44
40 /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ 45 /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
41 if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ 46 if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
@@ -53,7 +58,7 @@ void reiserfs_delete_inode(struct inode *inode)
53 * after delete_object so that quota updates go into the same transaction as 58 * after delete_object so that quota updates go into the same transaction as
54 * stat data deletion */ 59 * stat data deletion */
55 if (!err) 60 if (!err)
56 vfs_dq_free_inode(inode); 61 dquot_free_inode(inode);
57 62
58 if (journal_end(&th, inode->i_sb, jbegin_count)) 63 if (journal_end(&th, inode->i_sb, jbegin_count))
59 goto out; 64 goto out;
@@ -74,7 +79,7 @@ void reiserfs_delete_inode(struct inode *inode)
74 out: 79 out:
75 clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ 80 clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */
76 inode->i_blocks = 0; 81 inode->i_blocks = 0;
77 reiserfs_write_unlock(inode->i_sb); 82 reiserfs_write_unlock_once(inode->i_sb, depth);
78} 83}
79 84
80static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, 85static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
@@ -251,7 +256,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
251 struct cpu_key key; 256 struct cpu_key key;
252 struct buffer_head *bh; 257 struct buffer_head *bh;
253 struct item_head *ih, tmp_ih; 258 struct item_head *ih, tmp_ih;
254 int fs_gen;
255 b_blocknr_t blocknr; 259 b_blocknr_t blocknr;
256 char *p = NULL; 260 char *p = NULL;
257 int chars; 261 int chars;
@@ -265,7 +269,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
265 (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY, 269 (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
266 3); 270 3);
267 271
268 research:
269 result = search_for_position_by_key(inode->i_sb, &key, &path); 272 result = search_for_position_by_key(inode->i_sb, &key, &path);
270 if (result != POSITION_FOUND) { 273 if (result != POSITION_FOUND) {
271 pathrelse(&path); 274 pathrelse(&path);
@@ -340,7 +343,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
340 } 343 }
341 // read file tail into part of page 344 // read file tail into part of page
342 offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1); 345 offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
343 fs_gen = get_generation(inode->i_sb);
344 copy_item_head(&tmp_ih, ih); 346 copy_item_head(&tmp_ih, ih);
345 347
346 /* we only want to kmap if we are reading the tail into the page. 348 /* we only want to kmap if we are reading the tail into the page.
@@ -348,13 +350,9 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
348 ** sure we need to. But, this means the item might move if 350 ** sure we need to. But, this means the item might move if
349 ** kmap schedules 351 ** kmap schedules
350 */ 352 */
351 if (!p) { 353 if (!p)
352 p = (char *)kmap(bh_result->b_page); 354 p = (char *)kmap(bh_result->b_page);
353 if (fs_changed(fs_gen, inode->i_sb) 355
354 && item_moved(&tmp_ih, &path)) {
355 goto research;
356 }
357 }
358 p += offset; 356 p += offset;
359 memset(p, 0, inode->i_sb->s_blocksize); 357 memset(p, 0, inode->i_sb->s_blocksize);
360 do { 358 do {
@@ -489,10 +487,14 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
489 disappeared */ 487 disappeared */
490 if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { 488 if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
491 int err; 489 int err;
492 lock_kernel(); 490
491 reiserfs_write_lock(inode->i_sb);
492
493 err = reiserfs_commit_for_inode(inode); 493 err = reiserfs_commit_for_inode(inode);
494 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; 494 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
495 unlock_kernel(); 495
496 reiserfs_write_unlock(inode->i_sb);
497
496 if (err < 0) 498 if (err < 0)
497 ret = err; 499 ret = err;
498 } 500 }
@@ -601,6 +603,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
601 __le32 *item; 603 __le32 *item;
602 int done; 604 int done;
603 int fs_gen; 605 int fs_gen;
606 int lock_depth;
604 struct reiserfs_transaction_handle *th = NULL; 607 struct reiserfs_transaction_handle *th = NULL;
605 /* space reserved in transaction batch: 608 /* space reserved in transaction batch:
606 . 3 balancings in direct->indirect conversion 609 . 3 balancings in direct->indirect conversion
@@ -616,12 +619,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
616 loff_t new_offset = 619 loff_t new_offset =
617 (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; 620 (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
618 621
619 /* bad.... */ 622 lock_depth = reiserfs_write_lock_once(inode->i_sb);
620 reiserfs_write_lock(inode->i_sb);
621 version = get_inode_item_key_version(inode); 623 version = get_inode_item_key_version(inode);
622 624
623 if (!file_capable(inode, block)) { 625 if (!file_capable(inode, block)) {
624 reiserfs_write_unlock(inode->i_sb); 626 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
625 return -EFBIG; 627 return -EFBIG;
626 } 628 }
627 629
@@ -633,7 +635,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
633 /* find number of block-th logical block of the file */ 635 /* find number of block-th logical block of the file */
634 ret = _get_block_create_0(inode, block, bh_result, 636 ret = _get_block_create_0(inode, block, bh_result,
635 create | GET_BLOCK_READ_DIRECT); 637 create | GET_BLOCK_READ_DIRECT);
636 reiserfs_write_unlock(inode->i_sb); 638 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
637 return ret; 639 return ret;
638 } 640 }
639 /* 641 /*
@@ -751,7 +753,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
751 if (!dangle && th) 753 if (!dangle && th)
752 retval = reiserfs_end_persistent_transaction(th); 754 retval = reiserfs_end_persistent_transaction(th);
753 755
754 reiserfs_write_unlock(inode->i_sb); 756 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
755 757
756 /* the item was found, so new blocks were not added to the file 758 /* the item was found, so new blocks were not added to the file
757 ** there is no need to make sure the inode is updated with this 759 ** there is no need to make sure the inode is updated with this
@@ -935,7 +937,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
935 if (blocks_needed == 1) { 937 if (blocks_needed == 1) {
936 un = &unf_single; 938 un = &unf_single;
937 } else { 939 } else {
938 un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_ATOMIC); // We need to avoid scheduling. 940 un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_NOFS);
939 if (!un) { 941 if (!un) {
940 un = &unf_single; 942 un = &unf_single;
941 blocks_needed = 1; 943 blocks_needed = 1;
@@ -997,10 +999,16 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
997 if (retval) 999 if (retval)
998 goto failure; 1000 goto failure;
999 } 1001 }
1000 /* inserting indirect pointers for a hole can take a 1002 /*
1001 ** long time. reschedule if needed 1003 * inserting indirect pointers for a hole can take a
1004 * long time. reschedule if needed and also release the write
1005 * lock for others.
1002 */ 1006 */
1003 cond_resched(); 1007 if (need_resched()) {
1008 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
1009 schedule();
1010 lock_depth = reiserfs_write_lock_once(inode->i_sb);
1011 }
1004 1012
1005 retval = search_for_position_by_key(inode->i_sb, &key, &path); 1013 retval = search_for_position_by_key(inode->i_sb, &key, &path);
1006 if (retval == IO_ERROR) { 1014 if (retval == IO_ERROR) {
@@ -1035,7 +1043,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
1035 retval = err; 1043 retval = err;
1036 } 1044 }
1037 1045
1038 reiserfs_write_unlock(inode->i_sb); 1046 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
1039 reiserfs_check_path(&path); 1047 reiserfs_check_path(&path);
1040 return retval; 1048 return retval;
1041} 1049}
@@ -1493,9 +1501,11 @@ struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
1493 1501
1494 args.objectid = key->on_disk_key.k_objectid; 1502 args.objectid = key->on_disk_key.k_objectid;
1495 args.dirid = key->on_disk_key.k_dir_id; 1503 args.dirid = key->on_disk_key.k_dir_id;
1504 reiserfs_write_unlock(s);
1496 inode = iget5_locked(s, key->on_disk_key.k_objectid, 1505 inode = iget5_locked(s, key->on_disk_key.k_objectid,
1497 reiserfs_find_actor, reiserfs_init_locked_inode, 1506 reiserfs_find_actor, reiserfs_init_locked_inode,
1498 (void *)(&args)); 1507 (void *)(&args));
1508 reiserfs_write_lock(s);
1499 if (!inode) 1509 if (!inode)
1500 return ERR_PTR(-ENOMEM); 1510 return ERR_PTR(-ENOMEM);
1501 1511
@@ -1609,7 +1619,7 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
1609** to properly mark inodes for datasync and such, but only actually 1619** to properly mark inodes for datasync and such, but only actually
1610** does something when called for a synchronous update. 1620** does something when called for a synchronous update.
1611*/ 1621*/
1612int reiserfs_write_inode(struct inode *inode, int do_sync) 1622int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1613{ 1623{
1614 struct reiserfs_transaction_handle th; 1624 struct reiserfs_transaction_handle th;
1615 int jbegin_count = 1; 1625 int jbegin_count = 1;
@@ -1621,7 +1631,7 @@ int reiserfs_write_inode(struct inode *inode, int do_sync)
1621 ** inode needs to reach disk for safety, and they can safely be 1631 ** inode needs to reach disk for safety, and they can safely be
1622 ** ignored because the altered inode has already been logged. 1632 ** ignored because the altered inode has already been logged.
1623 */ 1633 */
1624 if (do_sync && !(current->flags & PF_MEMALLOC)) { 1634 if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) {
1625 reiserfs_write_lock(inode->i_sb); 1635 reiserfs_write_lock(inode->i_sb);
1626 if (!journal_begin(&th, inode->i_sb, jbegin_count)) { 1636 if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
1627 reiserfs_update_sd(&th, inode); 1637 reiserfs_update_sd(&th, inode);
@@ -1759,10 +1769,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1759 1769
1760 BUG_ON(!th->t_trans_id); 1770 BUG_ON(!th->t_trans_id);
1761 1771
1762 if (vfs_dq_alloc_inode(inode)) { 1772 dquot_initialize(inode);
1763 err = -EDQUOT; 1773 err = dquot_alloc_inode(inode);
1774 if (err)
1764 goto out_end_trans; 1775 goto out_end_trans;
1765 }
1766 if (!dir->i_nlink) { 1776 if (!dir->i_nlink) {
1767 err = -EPERM; 1777 err = -EPERM;
1768 goto out_bad_inode; 1778 goto out_bad_inode;
@@ -1953,12 +1963,12 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1953 INODE_PKEY(inode)->k_objectid = 0; 1963 INODE_PKEY(inode)->k_objectid = 0;
1954 1964
1955 /* Quota change must be inside a transaction for journaling */ 1965 /* Quota change must be inside a transaction for journaling */
1956 vfs_dq_free_inode(inode); 1966 dquot_free_inode(inode);
1957 1967
1958 out_end_trans: 1968 out_end_trans:
1959 journal_end(th, th->t_super, th->t_blocks_allocated); 1969 journal_end(th, th->t_super, th->t_blocks_allocated);
1960 /* Drop can be outside and it needs more credits so it's better to have it outside */ 1970 /* Drop can be outside and it needs more credits so it's better to have it outside */
1961 vfs_dq_drop(inode); 1971 dquot_drop(inode);
1962 inode->i_flags |= S_NOQUOTA; 1972 inode->i_flags |= S_NOQUOTA;
1963 make_bad_inode(inode); 1973 make_bad_inode(inode);
1964 1974
@@ -2072,8 +2082,9 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
2072 int error; 2082 int error;
2073 struct buffer_head *bh = NULL; 2083 struct buffer_head *bh = NULL;
2074 int err2; 2084 int err2;
2085 int lock_depth;
2075 2086
2076 reiserfs_write_lock(inode->i_sb); 2087 lock_depth = reiserfs_write_lock_once(inode->i_sb);
2077 2088
2078 if (inode->i_size > 0) { 2089 if (inode->i_size > 0) {
2079 error = grab_tail_page(inode, &page, &bh); 2090 error = grab_tail_page(inode, &page, &bh);
@@ -2142,14 +2153,17 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
2142 page_cache_release(page); 2153 page_cache_release(page);
2143 } 2154 }
2144 2155
2145 reiserfs_write_unlock(inode->i_sb); 2156 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
2157
2146 return 0; 2158 return 0;
2147 out: 2159 out:
2148 if (page) { 2160 if (page) {
2149 unlock_page(page); 2161 unlock_page(page);
2150 page_cache_release(page); 2162 page_cache_release(page);
2151 } 2163 }
2152 reiserfs_write_unlock(inode->i_sb); 2164
2165 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
2166
2153 return error; 2167 return error;
2154} 2168}
2155 2169
@@ -2531,6 +2545,12 @@ static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
2531 return reiserfs_write_full_page(page, wbc); 2545 return reiserfs_write_full_page(page, wbc);
2532} 2546}
2533 2547
2548static void reiserfs_truncate_failed_write(struct inode *inode)
2549{
2550 truncate_inode_pages(inode->i_mapping, inode->i_size);
2551 reiserfs_truncate_file(inode, 0);
2552}
2553
2534static int reiserfs_write_begin(struct file *file, 2554static int reiserfs_write_begin(struct file *file,
2535 struct address_space *mapping, 2555 struct address_space *mapping,
2536 loff_t pos, unsigned len, unsigned flags, 2556 loff_t pos, unsigned len, unsigned flags,
@@ -2597,6 +2617,8 @@ static int reiserfs_write_begin(struct file *file,
2597 if (ret) { 2617 if (ret) {
2598 unlock_page(page); 2618 unlock_page(page);
2599 page_cache_release(page); 2619 page_cache_release(page);
2620 /* Truncate allocated blocks */
2621 reiserfs_truncate_failed_write(inode);
2600 } 2622 }
2601 return ret; 2623 return ret;
2602} 2624}
@@ -2608,7 +2630,10 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
2608 int ret; 2630 int ret;
2609 int old_ref = 0; 2631 int old_ref = 0;
2610 2632
2633 reiserfs_write_unlock(inode->i_sb);
2611 reiserfs_wait_on_write_block(inode->i_sb); 2634 reiserfs_wait_on_write_block(inode->i_sb);
2635 reiserfs_write_lock(inode->i_sb);
2636
2612 fix_tail_page_for_writing(page); 2637 fix_tail_page_for_writing(page);
2613 if (reiserfs_transaction_running(inode->i_sb)) { 2638 if (reiserfs_transaction_running(inode->i_sb)) {
2614 struct reiserfs_transaction_handle *th; 2639 struct reiserfs_transaction_handle *th;
@@ -2664,6 +2689,8 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
2664 int update_sd = 0; 2689 int update_sd = 0;
2665 struct reiserfs_transaction_handle *th; 2690 struct reiserfs_transaction_handle *th;
2666 unsigned start; 2691 unsigned start;
2692 int lock_depth = 0;
2693 bool locked = false;
2667 2694
2668 if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND) 2695 if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND)
2669 pos ++; 2696 pos ++;
@@ -2689,10 +2716,10 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
2689 ** transaction tracking stuff when the size changes. So, we have 2716 ** transaction tracking stuff when the size changes. So, we have
2690 ** to do the i_size updates here. 2717 ** to do the i_size updates here.
2691 */ 2718 */
2692 pos += copied; 2719 if (pos + copied > inode->i_size) {
2693 if (pos > inode->i_size) {
2694 struct reiserfs_transaction_handle myth; 2720 struct reiserfs_transaction_handle myth;
2695 reiserfs_write_lock(inode->i_sb); 2721 lock_depth = reiserfs_write_lock_once(inode->i_sb);
2722 locked = true;
2696 /* If the file has grown beyond the border where it 2723 /* If the file has grown beyond the border where it
2697 can have a tail, unmark it as needing a tail 2724 can have a tail, unmark it as needing a tail
2698 packing */ 2725 packing */
@@ -2703,12 +2730,11 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
2703 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; 2730 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
2704 2731
2705 ret = journal_begin(&myth, inode->i_sb, 1); 2732 ret = journal_begin(&myth, inode->i_sb, 1);
2706 if (ret) { 2733 if (ret)
2707 reiserfs_write_unlock(inode->i_sb);
2708 goto journal_error; 2734 goto journal_error;
2709 } 2735
2710 reiserfs_update_inode_transaction(inode); 2736 reiserfs_update_inode_transaction(inode);
2711 inode->i_size = pos; 2737 inode->i_size = pos + copied;
2712 /* 2738 /*
2713 * this will just nest into our transaction. It's important 2739 * this will just nest into our transaction. It's important
2714 * to use mark_inode_dirty so the inode gets pushed around on the 2740 * to use mark_inode_dirty so the inode gets pushed around on the
@@ -2718,34 +2744,40 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
2718 reiserfs_update_sd(&myth, inode); 2744 reiserfs_update_sd(&myth, inode);
2719 update_sd = 1; 2745 update_sd = 1;
2720 ret = journal_end(&myth, inode->i_sb, 1); 2746 ret = journal_end(&myth, inode->i_sb, 1);
2721 reiserfs_write_unlock(inode->i_sb);
2722 if (ret) 2747 if (ret)
2723 goto journal_error; 2748 goto journal_error;
2724 } 2749 }
2725 if (th) { 2750 if (th) {
2726 reiserfs_write_lock(inode->i_sb); 2751 if (!locked) {
2752 lock_depth = reiserfs_write_lock_once(inode->i_sb);
2753 locked = true;
2754 }
2727 if (!update_sd) 2755 if (!update_sd)
2728 mark_inode_dirty(inode); 2756 mark_inode_dirty(inode);
2729 ret = reiserfs_end_persistent_transaction(th); 2757 ret = reiserfs_end_persistent_transaction(th);
2730 reiserfs_write_unlock(inode->i_sb);
2731 if (ret) 2758 if (ret)
2732 goto out; 2759 goto out;
2733 } 2760 }
2734 2761
2735 out: 2762 out:
2763 if (locked)
2764 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
2736 unlock_page(page); 2765 unlock_page(page);
2737 page_cache_release(page); 2766 page_cache_release(page);
2767
2768 if (pos + len > inode->i_size)
2769 reiserfs_truncate_failed_write(inode);
2770
2738 return ret == 0 ? copied : ret; 2771 return ret == 0 ? copied : ret;
2739 2772
2740 journal_error: 2773 journal_error:
2774 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
2775 locked = false;
2741 if (th) { 2776 if (th) {
2742 reiserfs_write_lock(inode->i_sb);
2743 if (!update_sd) 2777 if (!update_sd)
2744 reiserfs_update_sd(th, inode); 2778 reiserfs_update_sd(th, inode);
2745 ret = reiserfs_end_persistent_transaction(th); 2779 ret = reiserfs_end_persistent_transaction(th);
2746 reiserfs_write_unlock(inode->i_sb);
2747 } 2780 }
2748
2749 goto out; 2781 goto out;
2750} 2782}
2751 2783
@@ -2758,7 +2790,10 @@ int reiserfs_commit_write(struct file *f, struct page *page,
2758 int update_sd = 0; 2790 int update_sd = 0;
2759 struct reiserfs_transaction_handle *th = NULL; 2791 struct reiserfs_transaction_handle *th = NULL;
2760 2792
2793 reiserfs_write_unlock(inode->i_sb);
2761 reiserfs_wait_on_write_block(inode->i_sb); 2794 reiserfs_wait_on_write_block(inode->i_sb);
2795 reiserfs_write_lock(inode->i_sb);
2796
2762 if (reiserfs_transaction_running(inode->i_sb)) { 2797 if (reiserfs_transaction_running(inode->i_sb)) {
2763 th = current->journal_info; 2798 th = current->journal_info;
2764 } 2799 }
@@ -2770,7 +2805,6 @@ int reiserfs_commit_write(struct file *f, struct page *page,
2770 */ 2805 */
2771 if (pos > inode->i_size) { 2806 if (pos > inode->i_size) {
2772 struct reiserfs_transaction_handle myth; 2807 struct reiserfs_transaction_handle myth;
2773 reiserfs_write_lock(inode->i_sb);
2774 /* If the file has grown beyond the border where it 2808 /* If the file has grown beyond the border where it
2775 can have a tail, unmark it as needing a tail 2809 can have a tail, unmark it as needing a tail
2776 packing */ 2810 packing */
@@ -2781,10 +2815,9 @@ int reiserfs_commit_write(struct file *f, struct page *page,
2781 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; 2815 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
2782 2816
2783 ret = journal_begin(&myth, inode->i_sb, 1); 2817 ret = journal_begin(&myth, inode->i_sb, 1);
2784 if (ret) { 2818 if (ret)
2785 reiserfs_write_unlock(inode->i_sb);
2786 goto journal_error; 2819 goto journal_error;
2787 } 2820
2788 reiserfs_update_inode_transaction(inode); 2821 reiserfs_update_inode_transaction(inode);
2789 inode->i_size = pos; 2822 inode->i_size = pos;
2790 /* 2823 /*
@@ -2796,16 +2829,13 @@ int reiserfs_commit_write(struct file *f, struct page *page,
2796 reiserfs_update_sd(&myth, inode); 2829 reiserfs_update_sd(&myth, inode);
2797 update_sd = 1; 2830 update_sd = 1;
2798 ret = journal_end(&myth, inode->i_sb, 1); 2831 ret = journal_end(&myth, inode->i_sb, 1);
2799 reiserfs_write_unlock(inode->i_sb);
2800 if (ret) 2832 if (ret)
2801 goto journal_error; 2833 goto journal_error;
2802 } 2834 }
2803 if (th) { 2835 if (th) {
2804 reiserfs_write_lock(inode->i_sb);
2805 if (!update_sd) 2836 if (!update_sd)
2806 mark_inode_dirty(inode); 2837 mark_inode_dirty(inode);
2807 ret = reiserfs_end_persistent_transaction(th); 2838 ret = reiserfs_end_persistent_transaction(th);
2808 reiserfs_write_unlock(inode->i_sb);
2809 if (ret) 2839 if (ret)
2810 goto out; 2840 goto out;
2811 } 2841 }
@@ -2815,11 +2845,9 @@ int reiserfs_commit_write(struct file *f, struct page *page,
2815 2845
2816 journal_error: 2846 journal_error:
2817 if (th) { 2847 if (th) {
2818 reiserfs_write_lock(inode->i_sb);
2819 if (!update_sd) 2848 if (!update_sd)
2820 reiserfs_update_sd(th, inode); 2849 reiserfs_update_sd(th, inode);
2821 ret = reiserfs_end_persistent_transaction(th); 2850 ret = reiserfs_end_persistent_transaction(th);
2822 reiserfs_write_unlock(inode->i_sb);
2823 } 2851 }
2824 2852
2825 return ret; 2853 return ret;
@@ -3040,14 +3068,17 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
3040int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) 3068int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3041{ 3069{
3042 struct inode *inode = dentry->d_inode; 3070 struct inode *inode = dentry->d_inode;
3043 int error;
3044 unsigned int ia_valid; 3071 unsigned int ia_valid;
3072 int depth;
3073 int error;
3045 3074
3046 /* must be turned off for recursive notify_change calls */ 3075 /* must be turned off for recursive notify_change calls */
3047 ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); 3076 ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
3048 3077
3049 reiserfs_write_lock(inode->i_sb); 3078 depth = reiserfs_write_lock_once(inode->i_sb);
3050 if (attr->ia_valid & ATTR_SIZE) { 3079 if (attr->ia_valid & ATTR_SIZE) {
3080 dquot_initialize(inode);
3081
3051 /* version 2 items will be caught by the s_maxbytes check 3082 /* version 2 items will be caught by the s_maxbytes check
3052 ** done for us in vmtruncate 3083 ** done for us in vmtruncate
3053 */ 3084 */
@@ -3109,8 +3140,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3109 jbegin_count); 3140 jbegin_count);
3110 if (error) 3141 if (error)
3111 goto out; 3142 goto out;
3112 error = 3143 error = dquot_transfer(inode, attr);
3113 vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
3114 if (error) { 3144 if (error) {
3115 journal_end(&th, inode->i_sb, 3145 journal_end(&th, inode->i_sb,
3116 jbegin_count); 3146 jbegin_count);
@@ -3127,8 +3157,17 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3127 journal_end(&th, inode->i_sb, jbegin_count); 3157 journal_end(&th, inode->i_sb, jbegin_count);
3128 } 3158 }
3129 } 3159 }
3130 if (!error) 3160 if (!error) {
3161 /*
3162 * Relax the lock here: inode_setattr() may truncate the
3163 * inode pages and wait on page locks, and the owner of
3164 * such a page lock may itself need the reiserfs lock
3165 * to release it.
3166 */
3167 reiserfs_write_unlock_once(inode->i_sb, depth);
3131 error = inode_setattr(inode, attr); 3168 error = inode_setattr(inode, attr);
3169 depth = reiserfs_write_lock_once(inode->i_sb);
3170 }
3132 } 3171 }
3133 3172
3134 if (!error && reiserfs_posixacl(inode->i_sb)) { 3173 if (!error && reiserfs_posixacl(inode->i_sb)) {
@@ -3137,7 +3176,8 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3137 } 3176 }
3138 3177
3139 out: 3178 out:
3140 reiserfs_write_unlock(inode->i_sb); 3179 reiserfs_write_unlock_once(inode->i_sb, depth);
3180
3141 return error; 3181 return error;
3142} 3182}
3143 3183
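
The reiserfs_setattr() hunks above show the central pattern of this series: the new per-superblock mutex, unlike the BKL, is not dropped automatically across schedule(), so it has to be released explicitly around calls that can sleep on page locks. A minimal sketch of that pattern, using the reiserfs_write_lock_once()/reiserfs_write_unlock_once() primitives added in fs/reiserfs/lock.c further down (illustration only, not the verbatim kernel code):

    int depth;

    depth = reiserfs_write_lock_once(inode->i_sb);
    /* ... journaled work that requires the write lock ... */

    /*
     * inode_setattr() may truncate pages and wait on page locks,
     * and a page-lock owner may in turn need the reiserfs lock,
     * so drop it across the call to avoid a deadlock.
     */
    reiserfs_write_unlock_once(inode->i_sb, depth);
    error = inode_setattr(inode, attr);
    depth = reiserfs_write_lock_once(inode->i_sb);

    reiserfs_write_unlock_once(inode->i_sb, depth);
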
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 0ccc3fdda7bf..f53505de0712 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -13,44 +13,52 @@
13#include <linux/compat.h> 13#include <linux/compat.h>
14 14
15/* 15/*
16** reiserfs_ioctl - handler for ioctl for inode 16 * reiserfs_ioctl - handler for ioctl for inode
17** supported commands: 17 * supported commands:
18** 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect 18 * 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
19** and prevent packing file (argument arg has to be non-zero) 19 * and prevent packing file (argument arg has to be non-zero)
20** 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION 20 * 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
21** 3) That's all for a while ... 21 * 3) That's all for a while ...
22*/ 22 */
23int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 23long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
24 unsigned long arg)
25{ 24{
25 struct inode *inode = filp->f_path.dentry->d_inode;
26 unsigned int flags; 26 unsigned int flags;
27 int err = 0; 27 int err = 0;
28 28
29 reiserfs_write_lock(inode->i_sb);
30
29 switch (cmd) { 31 switch (cmd) {
30 case REISERFS_IOC_UNPACK: 32 case REISERFS_IOC_UNPACK:
31 if (S_ISREG(inode->i_mode)) { 33 if (S_ISREG(inode->i_mode)) {
32 if (arg) 34 if (arg)
33 return reiserfs_unpack(inode, filp); 35 err = reiserfs_unpack(inode, filp);
34 else
35 return 0;
36 } else 36 } else
37 return -ENOTTY; 37 err = -ENOTTY;
38 /* following two cases are taken from fs/ext2/ioctl.c by Remy 38 break;
39 Card (card@masi.ibp.fr) */ 39 /*
40 * The following two cases are taken from fs/ext2/ioctl.c by Remy
41 * Card (card@masi.ibp.fr)
42 */
40 case REISERFS_IOC_GETFLAGS: 43 case REISERFS_IOC_GETFLAGS:
41 if (!reiserfs_attrs(inode->i_sb)) 44 if (!reiserfs_attrs(inode->i_sb)) {
42 return -ENOTTY; 45 err = -ENOTTY;
46 break;
47 }
43 48
44 flags = REISERFS_I(inode)->i_attrs; 49 flags = REISERFS_I(inode)->i_attrs;
45 i_attrs_to_sd_attrs(inode, (__u16 *) & flags); 50 i_attrs_to_sd_attrs(inode, (__u16 *) & flags);
46 return put_user(flags, (int __user *)arg); 51 err = put_user(flags, (int __user *)arg);
52 break;
47 case REISERFS_IOC_SETFLAGS:{ 53 case REISERFS_IOC_SETFLAGS:{
48 if (!reiserfs_attrs(inode->i_sb)) 54 if (!reiserfs_attrs(inode->i_sb)) {
49 return -ENOTTY; 55 err = -ENOTTY;
56 break;
57 }
50 58
51 err = mnt_want_write(filp->f_path.mnt); 59 err = mnt_want_write(filp->f_path.mnt);
52 if (err) 60 if (err)
53 return err; 61 break;
54 62
55 if (!is_owner_or_cap(inode)) { 63 if (!is_owner_or_cap(inode)) {
56 err = -EPERM; 64 err = -EPERM;
@@ -90,16 +98,19 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
90 mark_inode_dirty(inode); 98 mark_inode_dirty(inode);
91setflags_out: 99setflags_out:
92 mnt_drop_write(filp->f_path.mnt); 100 mnt_drop_write(filp->f_path.mnt);
93 return err; 101 break;
94 } 102 }
95 case REISERFS_IOC_GETVERSION: 103 case REISERFS_IOC_GETVERSION:
96 return put_user(inode->i_generation, (int __user *)arg); 104 err = put_user(inode->i_generation, (int __user *)arg);
105 break;
97 case REISERFS_IOC_SETVERSION: 106 case REISERFS_IOC_SETVERSION:
98 if (!is_owner_or_cap(inode)) 107 if (!is_owner_or_cap(inode)) {
99 return -EPERM; 108 err = -EPERM;
109 break;
110 }
100 err = mnt_want_write(filp->f_path.mnt); 111 err = mnt_want_write(filp->f_path.mnt);
101 if (err) 112 if (err)
102 return err; 113 break;
103 if (get_user(inode->i_generation, (int __user *)arg)) { 114 if (get_user(inode->i_generation, (int __user *)arg)) {
104 err = -EFAULT; 115 err = -EFAULT;
105 goto setversion_out; 116 goto setversion_out;
@@ -108,19 +119,20 @@ setflags_out:
108 mark_inode_dirty(inode); 119 mark_inode_dirty(inode);
109setversion_out: 120setversion_out:
110 mnt_drop_write(filp->f_path.mnt); 121 mnt_drop_write(filp->f_path.mnt);
111 return err; 122 break;
112 default: 123 default:
113 return -ENOTTY; 124 err = -ENOTTY;
114 } 125 }
126
127 reiserfs_write_unlock(inode->i_sb);
128
129 return err;
115} 130}
116 131
117#ifdef CONFIG_COMPAT 132#ifdef CONFIG_COMPAT
118long reiserfs_compat_ioctl(struct file *file, unsigned int cmd, 133long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
119 unsigned long arg) 134 unsigned long arg)
120{ 135{
121 struct inode *inode = file->f_path.dentry->d_inode;
122 int ret;
123
124 /* These are just misnamed, they actually get/put from/to user an int */ 136 /* These are just misnamed, they actually get/put from/to user an int */
125 switch (cmd) { 137 switch (cmd) {
126 case REISERFS_IOC32_UNPACK: 138 case REISERFS_IOC32_UNPACK:
@@ -141,10 +153,8 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
141 default: 153 default:
142 return -ENOIOCTLCMD; 154 return -ENOIOCTLCMD;
143 } 155 }
144 lock_kernel(); 156
145 ret = reiserfs_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg)); 157 return reiserfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
146 unlock_kernel();
147 return ret;
148} 158}
149#endif 159#endif
150 160
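
The ioctl conversion above also switches to a single-exit discipline: the write lock is taken once at entry, every early return becomes an err assignment plus break, and the lock is released at the one exit point. A sketch of the resulting shape, with a hypothetical command and helper standing in for the real cases:

    long example_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
    {
        struct inode *inode = filp->f_path.dentry->d_inode;
        int err = 0;

        reiserfs_write_lock(inode->i_sb);

        switch (cmd) {
        case EXAMPLE_IOC_CMD:                 /* hypothetical command */
            err = example_do_cmd(inode, arg); /* hypothetical helper */
            break;
        default:
            err = -ENOTTY;
        }

        reiserfs_write_unlock(inode->i_sb);
        return err;
    }
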
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 90622200b39c..19fbc810e8e7 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -50,6 +50,7 @@
50#include <linux/blkdev.h> 50#include <linux/blkdev.h>
51#include <linux/backing-dev.h> 51#include <linux/backing-dev.h>
52#include <linux/uaccess.h> 52#include <linux/uaccess.h>
53#include <linux/slab.h>
53 54
54#include <asm/system.h> 55#include <asm/system.h>
55 56
@@ -429,21 +430,6 @@ static void clear_prepared_bits(struct buffer_head *bh)
429 clear_buffer_journal_restore_dirty(bh); 430 clear_buffer_journal_restore_dirty(bh);
430} 431}
431 432
432/* utility function to force a BUG if it is called without the big
433** kernel lock held. caller is the string printed just before calling BUG()
434*/
435void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
436{
437#ifdef CONFIG_SMP
438 if (current->lock_depth < 0) {
439 reiserfs_panic(sb, "journal-1", "%s called without kernel "
440 "lock held", caller);
441 }
442#else
443 ;
444#endif
445}
446
447/* return a cnode with same dev, block number and size in table, or null if not found */ 433/* return a cnode with same dev, block number and size in table, or null if not found */
448static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct 434static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
449 super_block 435 super_block
@@ -556,7 +542,8 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
556static inline void lock_journal(struct super_block *sb) 542static inline void lock_journal(struct super_block *sb)
557{ 543{
558 PROC_INFO_INC(sb, journal.lock_journal); 544 PROC_INFO_INC(sb, journal.lock_journal);
559 mutex_lock(&SB_JOURNAL(sb)->j_mutex); 545
546 reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb);
560} 547}
561 548
562/* unlock the current transaction */ 549/* unlock the current transaction */
@@ -708,7 +695,9 @@ static void check_barrier_completion(struct super_block *s,
708 disable_barrier(s); 695 disable_barrier(s);
709 set_buffer_uptodate(bh); 696 set_buffer_uptodate(bh);
710 set_buffer_dirty(bh); 697 set_buffer_dirty(bh);
698 reiserfs_write_unlock(s);
711 sync_dirty_buffer(bh); 699 sync_dirty_buffer(bh);
700 reiserfs_write_lock(s);
712 } 701 }
713} 702}
714 703
@@ -996,8 +985,13 @@ static int reiserfs_async_progress_wait(struct super_block *s)
996{ 985{
997 DEFINE_WAIT(wait); 986 DEFINE_WAIT(wait);
998 struct reiserfs_journal *j = SB_JOURNAL(s); 987 struct reiserfs_journal *j = SB_JOURNAL(s);
999 if (atomic_read(&j->j_async_throttle)) 988
989 if (atomic_read(&j->j_async_throttle)) {
990 reiserfs_write_unlock(s);
1000 congestion_wait(BLK_RW_ASYNC, HZ / 10); 991 congestion_wait(BLK_RW_ASYNC, HZ / 10);
992 reiserfs_write_lock(s);
993 }
994
1001 return 0; 995 return 0;
1002} 996}
1003 997
@@ -1043,7 +1037,8 @@ static int flush_commit_list(struct super_block *s,
1043 } 1037 }
1044 1038
1045 /* make sure nobody is trying to flush this one at the same time */ 1039 /* make sure nobody is trying to flush this one at the same time */
1046 mutex_lock(&jl->j_commit_mutex); 1040 reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
1041
1047 if (!journal_list_still_alive(s, trans_id)) { 1042 if (!journal_list_still_alive(s, trans_id)) {
1048 mutex_unlock(&jl->j_commit_mutex); 1043 mutex_unlock(&jl->j_commit_mutex);
1049 goto put_jl; 1044 goto put_jl;
@@ -1061,12 +1056,17 @@ static int flush_commit_list(struct super_block *s,
1061 1056
1062 if (!list_empty(&jl->j_bh_list)) { 1057 if (!list_empty(&jl->j_bh_list)) {
1063 int ret; 1058 int ret;
1064 unlock_kernel(); 1059
1060 /*
1061 * We might sleep in numerous places inside
1062 * write_ordered_buffers. Relax the write lock.
1063 */
1064 reiserfs_write_unlock(s);
1065 ret = write_ordered_buffers(&journal->j_dirty_buffers_lock, 1065 ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
1066 journal, jl, &jl->j_bh_list); 1066 journal, jl, &jl->j_bh_list);
1067 if (ret < 0 && retval == 0) 1067 if (ret < 0 && retval == 0)
1068 retval = ret; 1068 retval = ret;
1069 lock_kernel(); 1069 reiserfs_write_lock(s);
1070 } 1070 }
1071 BUG_ON(!list_empty(&jl->j_bh_list)); 1071 BUG_ON(!list_empty(&jl->j_bh_list));
1072 /* 1072 /*
@@ -1085,8 +1085,11 @@ static int flush_commit_list(struct super_block *s,
1085 SB_ONDISK_JOURNAL_SIZE(s); 1085 SB_ONDISK_JOURNAL_SIZE(s);
1086 tbh = journal_find_get_block(s, bn); 1086 tbh = journal_find_get_block(s, bn);
1087 if (tbh) { 1087 if (tbh) {
1088 if (buffer_dirty(tbh)) 1088 if (buffer_dirty(tbh)) {
1089 ll_rw_block(WRITE, 1, &tbh) ; 1089 reiserfs_write_unlock(s);
1090 ll_rw_block(WRITE, 1, &tbh);
1091 reiserfs_write_lock(s);
1092 }
1090 put_bh(tbh) ; 1093 put_bh(tbh) ;
1091 } 1094 }
1092 } 1095 }
@@ -1114,12 +1117,19 @@ static int flush_commit_list(struct super_block *s,
1114 bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + 1117 bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
1115 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); 1118 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
1116 tbh = journal_find_get_block(s, bn); 1119 tbh = journal_find_get_block(s, bn);
1120
1121 reiserfs_write_unlock(s);
1117 wait_on_buffer(tbh); 1122 wait_on_buffer(tbh);
1123 reiserfs_write_lock(s);
1118 // since we're using ll_rw_blk above, it might have skipped over 1124 // since we're using ll_rw_blk above, it might have skipped over
1119 // a locked buffer. Double check here 1125 // a locked buffer. Double check here
1120 // 1126 //
1121 if (buffer_dirty(tbh)) /* redundant, sync_dirty_buffer() checks */ 1127 /* redundant, sync_dirty_buffer() checks */
1128 if (buffer_dirty(tbh)) {
1129 reiserfs_write_unlock(s);
1122 sync_dirty_buffer(tbh); 1130 sync_dirty_buffer(tbh);
1131 reiserfs_write_lock(s);
1132 }
1123 if (unlikely(!buffer_uptodate(tbh))) { 1133 if (unlikely(!buffer_uptodate(tbh))) {
1124#ifdef CONFIG_REISERFS_CHECK 1134#ifdef CONFIG_REISERFS_CHECK
1125 reiserfs_warning(s, "journal-601", 1135 reiserfs_warning(s, "journal-601",
@@ -1143,10 +1153,15 @@ static int flush_commit_list(struct super_block *s,
1143 if (buffer_dirty(jl->j_commit_bh)) 1153 if (buffer_dirty(jl->j_commit_bh))
1144 BUG(); 1154 BUG();
1145 mark_buffer_dirty(jl->j_commit_bh) ; 1155 mark_buffer_dirty(jl->j_commit_bh) ;
1156 reiserfs_write_unlock(s);
1146 sync_dirty_buffer(jl->j_commit_bh) ; 1157 sync_dirty_buffer(jl->j_commit_bh) ;
1158 reiserfs_write_lock(s);
1147 } 1159 }
1148 } else 1160 } else {
1161 reiserfs_write_unlock(s);
1149 wait_on_buffer(jl->j_commit_bh); 1162 wait_on_buffer(jl->j_commit_bh);
1163 reiserfs_write_lock(s);
1164 }
1150 1165
1151 check_barrier_completion(s, jl->j_commit_bh); 1166 check_barrier_completion(s, jl->j_commit_bh);
1152 1167
@@ -1286,7 +1301,9 @@ static int _update_journal_header_block(struct super_block *sb,
1286 1301
1287 if (trans_id >= journal->j_last_flush_trans_id) { 1302 if (trans_id >= journal->j_last_flush_trans_id) {
1288 if (buffer_locked((journal->j_header_bh))) { 1303 if (buffer_locked((journal->j_header_bh))) {
1304 reiserfs_write_unlock(sb);
1289 wait_on_buffer((journal->j_header_bh)); 1305 wait_on_buffer((journal->j_header_bh));
1306 reiserfs_write_lock(sb);
1290 if (unlikely(!buffer_uptodate(journal->j_header_bh))) { 1307 if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
1291#ifdef CONFIG_REISERFS_CHECK 1308#ifdef CONFIG_REISERFS_CHECK
1292 reiserfs_warning(sb, "journal-699", 1309 reiserfs_warning(sb, "journal-699",
@@ -1312,12 +1329,16 @@ static int _update_journal_header_block(struct super_block *sb,
1312 disable_barrier(sb); 1329 disable_barrier(sb);
1313 goto sync; 1330 goto sync;
1314 } 1331 }
1332 reiserfs_write_unlock(sb);
1315 wait_on_buffer(journal->j_header_bh); 1333 wait_on_buffer(journal->j_header_bh);
1334 reiserfs_write_lock(sb);
1316 check_barrier_completion(sb, journal->j_header_bh); 1335 check_barrier_completion(sb, journal->j_header_bh);
1317 } else { 1336 } else {
1318 sync: 1337 sync:
1319 set_buffer_dirty(journal->j_header_bh); 1338 set_buffer_dirty(journal->j_header_bh);
1339 reiserfs_write_unlock(sb);
1320 sync_dirty_buffer(journal->j_header_bh); 1340 sync_dirty_buffer(journal->j_header_bh);
1341 reiserfs_write_lock(sb);
1321 } 1342 }
1322 if (!buffer_uptodate(journal->j_header_bh)) { 1343 if (!buffer_uptodate(journal->j_header_bh)) {
1323 reiserfs_warning(sb, "journal-837", 1344 reiserfs_warning(sb, "journal-837",
@@ -1409,7 +1430,7 @@ static int flush_journal_list(struct super_block *s,
1409 1430
1410 /* if flushall == 0, the lock is already held */ 1431 /* if flushall == 0, the lock is already held */
1411 if (flushall) { 1432 if (flushall) {
1412 mutex_lock(&journal->j_flush_mutex); 1433 reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
1413 } else if (mutex_trylock(&journal->j_flush_mutex)) { 1434 } else if (mutex_trylock(&journal->j_flush_mutex)) {
1414 BUG(); 1435 BUG();
1415 } 1436 }
@@ -1553,7 +1574,11 @@ static int flush_journal_list(struct super_block *s,
1553 reiserfs_panic(s, "journal-1011", 1574 reiserfs_panic(s, "journal-1011",
1554 "cn->bh is NULL"); 1575 "cn->bh is NULL");
1555 } 1576 }
1577
1578 reiserfs_write_unlock(s);
1556 wait_on_buffer(cn->bh); 1579 wait_on_buffer(cn->bh);
1580 reiserfs_write_lock(s);
1581
1557 if (!cn->bh) { 1582 if (!cn->bh) {
1558 reiserfs_panic(s, "journal-1012", 1583 reiserfs_panic(s, "journal-1012",
1559 "cn->bh is NULL"); 1584 "cn->bh is NULL");
@@ -1769,7 +1794,7 @@ static int kupdate_transactions(struct super_block *s,
1769 struct reiserfs_journal *journal = SB_JOURNAL(s); 1794 struct reiserfs_journal *journal = SB_JOURNAL(s);
1770 chunk.nr = 0; 1795 chunk.nr = 0;
1771 1796
1772 mutex_lock(&journal->j_flush_mutex); 1797 reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
1773 if (!journal_list_still_alive(s, orig_trans_id)) { 1798 if (!journal_list_still_alive(s, orig_trans_id)) {
1774 goto done; 1799 goto done;
1775 } 1800 }
@@ -1973,7 +1998,14 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
1973 reiserfs_mounted_fs_count--; 1998 reiserfs_mounted_fs_count--;
1974 /* wait for all commits to finish */ 1999 /* wait for all commits to finish */
1975 cancel_delayed_work(&SB_JOURNAL(sb)->j_work); 2000 cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
2001
2002 /*
2003 * We must release the write lock here because
2004 * the workqueue job (flush_async_commits) needs this lock
2005 */
2006 reiserfs_write_unlock(sb);
1976 flush_workqueue(commit_wq); 2007 flush_workqueue(commit_wq);
2008
1977 if (!reiserfs_mounted_fs_count) { 2009 if (!reiserfs_mounted_fs_count) {
1978 destroy_workqueue(commit_wq); 2010 destroy_workqueue(commit_wq);
1979 commit_wq = NULL; 2011 commit_wq = NULL;
@@ -1981,6 +2013,8 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
1981 2013
1982 free_journal_ram(sb); 2014 free_journal_ram(sb);
1983 2015
2016 reiserfs_write_lock(sb);
2017
1984 return 0; 2018 return 0;
1985} 2019}
1986 2020
@@ -2184,6 +2218,15 @@ static int journal_read_transaction(struct super_block *sb,
2184 brelse(d_bh); 2218 brelse(d_bh);
2185 return 1; 2219 return 1;
2186 } 2220 }
2221
2222 if (bdev_read_only(sb->s_bdev)) {
2223 reiserfs_warning(sb, "clm-2076",
2224 "device is readonly, unable to replay log");
2225 brelse(c_bh);
2226 brelse(d_bh);
2227 return -EROFS;
2228 }
2229
2187 trans_id = get_desc_trans_id(desc); 2230 trans_id = get_desc_trans_id(desc);
2188 /* now we know we've got a good transaction, and it was inside the valid time ranges */ 2231 /* now we know we've got a good transaction, and it was inside the valid time ranges */
2189 log_blocks = kmalloc(get_desc_trans_len(desc) * 2232 log_blocks = kmalloc(get_desc_trans_len(desc) *
@@ -2243,7 +2286,11 @@ static int journal_read_transaction(struct super_block *sb,
2243 /* read in the log blocks, memcpy to the corresponding real block */ 2286 /* read in the log blocks, memcpy to the corresponding real block */
2244 ll_rw_block(READ, get_desc_trans_len(desc), log_blocks); 2287 ll_rw_block(READ, get_desc_trans_len(desc), log_blocks);
2245 for (i = 0; i < get_desc_trans_len(desc); i++) { 2288 for (i = 0; i < get_desc_trans_len(desc); i++) {
2289
2290 reiserfs_write_unlock(sb);
2246 wait_on_buffer(log_blocks[i]); 2291 wait_on_buffer(log_blocks[i]);
2292 reiserfs_write_lock(sb);
2293
2247 if (!buffer_uptodate(log_blocks[i])) { 2294 if (!buffer_uptodate(log_blocks[i])) {
2248 reiserfs_warning(sb, "journal-1212", 2295 reiserfs_warning(sb, "journal-1212",
2249 "REPLAY FAILURE fsck required! " 2296 "REPLAY FAILURE fsck required! "
@@ -2422,12 +2469,6 @@ static int journal_read(struct super_block *sb)
2422 goto start_log_replay; 2469 goto start_log_replay;
2423 } 2470 }
2424 2471
2425 if (continue_replay && bdev_read_only(sb->s_bdev)) {
2426 reiserfs_warning(sb, "clm-2076",
2427 "device is readonly, unable to replay log");
2428 return -1;
2429 }
2430
2431 /* ok, there are transactions that need to be replayed. start with the first log block, find 2472 /* ok, there are transactions that need to be replayed. start with the first log block, find
2432 ** all the valid transactions, and pick out the oldest. 2473 ** all the valid transactions, and pick out the oldest.
2433 */ 2474 */
@@ -2722,11 +2763,18 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2722 struct reiserfs_journal *journal; 2763 struct reiserfs_journal *journal;
2723 struct reiserfs_journal_list *jl; 2764 struct reiserfs_journal_list *jl;
2724 char b[BDEVNAME_SIZE]; 2765 char b[BDEVNAME_SIZE];
2766 int ret;
2725 2767
2768 /*
2769 * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS
2770 * dependency inversion warnings.
2771 */
2772 reiserfs_write_unlock(sb);
2726 journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal)); 2773 journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal));
2727 if (!journal) { 2774 if (!journal) {
2728 reiserfs_warning(sb, "journal-1256", 2775 reiserfs_warning(sb, "journal-1256",
2729 "unable to get memory for journal structure"); 2776 "unable to get memory for journal structure");
2777 reiserfs_write_lock(sb);
2730 return 1; 2778 return 1;
2731 } 2779 }
2732 memset(journal, 0, sizeof(struct reiserfs_journal)); 2780 memset(journal, 0, sizeof(struct reiserfs_journal));
@@ -2735,10 +2783,12 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2735 INIT_LIST_HEAD(&journal->j_working_list); 2783 INIT_LIST_HEAD(&journal->j_working_list);
2736 INIT_LIST_HEAD(&journal->j_journal_list); 2784 INIT_LIST_HEAD(&journal->j_journal_list);
2737 journal->j_persistent_trans = 0; 2785 journal->j_persistent_trans = 0;
2738 if (reiserfs_allocate_list_bitmaps(sb, 2786 ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
2739 journal->j_list_bitmap, 2787 reiserfs_bmap_count(sb));
2740 reiserfs_bmap_count(sb))) 2788 reiserfs_write_lock(sb);
2789 if (ret)
2741 goto free_and_return; 2790 goto free_and_return;
2791
2742 allocate_bitmap_nodes(sb); 2792 allocate_bitmap_nodes(sb);
2743 2793
2744 /* reserved for journal area support */ 2794 /* reserved for journal area support */
@@ -2765,11 +2815,27 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2765 goto free_and_return; 2815 goto free_and_return;
2766 } 2816 }
2767 2817
2818 /*
2819 * We need to unlock here to avoid creating the following
2820 * dependency:
2821 * reiserfs_lock -> sysfs_mutex
2822 * Because the reiserfs mmap path creates the following dependency:
2823 * mm->mmap -> reiserfs_lock, hence we have
2824 * mm->mmap -> reiserfs_lock -> sysfs_mutex
2825 * This would end up in a circular dependency with the sysfs readdir
2826 * path, which does sysfs_mutex -> mm->mmap_sem.
2827 * This is fine because the reiserfs lock is useless in the mount path,
2828 * at least until we call journal_begin. We keep it for paranoid
2829 * reasons.
2830 */
2831 reiserfs_write_unlock(sb);
2768 if (journal_init_dev(sb, journal, j_dev_name) != 0) { 2832 if (journal_init_dev(sb, journal, j_dev_name) != 0) {
2833 reiserfs_write_lock(sb);
2769 reiserfs_warning(sb, "sh-462", 2834 reiserfs_warning(sb, "sh-462",
2770 "unable to initialize jornal device"); 2835 "unable to initialize jornal device");
2771 goto free_and_return; 2836 goto free_and_return;
2772 } 2837 }
2838 reiserfs_write_lock(sb);
2773 2839
2774 rs = SB_DISK_SUPER_BLOCK(sb); 2840 rs = SB_DISK_SUPER_BLOCK(sb);
2775 2841
@@ -2851,7 +2917,9 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2851 journal->j_mount_id = 10; 2917 journal->j_mount_id = 10;
2852 journal->j_state = 0; 2918 journal->j_state = 0;
2853 atomic_set(&(journal->j_jlock), 0); 2919 atomic_set(&(journal->j_jlock), 0);
2920 reiserfs_write_unlock(sb);
2854 journal->j_cnode_free_list = allocate_cnodes(num_cnodes); 2921 journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
2922 reiserfs_write_lock(sb);
2855 journal->j_cnode_free_orig = journal->j_cnode_free_list; 2923 journal->j_cnode_free_orig = journal->j_cnode_free_list;
2856 journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; 2924 journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
2857 journal->j_cnode_used = 0; 2925 journal->j_cnode_used = 0;
@@ -2881,8 +2949,11 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2881 } 2949 }
2882 2950
2883 reiserfs_mounted_fs_count++; 2951 reiserfs_mounted_fs_count++;
2884 if (reiserfs_mounted_fs_count <= 1) 2952 if (reiserfs_mounted_fs_count <= 1) {
2953 reiserfs_write_unlock(sb);
2885 commit_wq = create_workqueue("reiserfs"); 2954 commit_wq = create_workqueue("reiserfs");
2955 reiserfs_write_lock(sb);
2956 }
2886 2957
2887 INIT_DELAYED_WORK(&journal->j_work, flush_async_commits); 2958 INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
2888 journal->j_work_sb = sb; 2959 journal->j_work_sb = sb;
@@ -2964,8 +3035,11 @@ static void queue_log_writer(struct super_block *s)
2964 init_waitqueue_entry(&wait, current); 3035 init_waitqueue_entry(&wait, current);
2965 add_wait_queue(&journal->j_join_wait, &wait); 3036 add_wait_queue(&journal->j_join_wait, &wait);
2966 set_current_state(TASK_UNINTERRUPTIBLE); 3037 set_current_state(TASK_UNINTERRUPTIBLE);
2967 if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) 3038 if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
3039 reiserfs_write_unlock(s);
2968 schedule(); 3040 schedule();
3041 reiserfs_write_lock(s);
3042 }
2969 __set_current_state(TASK_RUNNING); 3043 __set_current_state(TASK_RUNNING);
2970 remove_wait_queue(&journal->j_join_wait, &wait); 3044 remove_wait_queue(&journal->j_join_wait, &wait);
2971} 3045}
@@ -2982,7 +3056,9 @@ static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
2982 struct reiserfs_journal *journal = SB_JOURNAL(sb); 3056 struct reiserfs_journal *journal = SB_JOURNAL(sb);
2983 unsigned long bcount = journal->j_bcount; 3057 unsigned long bcount = journal->j_bcount;
2984 while (1) { 3058 while (1) {
3059 reiserfs_write_unlock(sb);
2985 schedule_timeout_uninterruptible(1); 3060 schedule_timeout_uninterruptible(1);
3061 reiserfs_write_lock(sb);
2986 journal->j_current_jl->j_state |= LIST_COMMIT_PENDING; 3062 journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
2987 while ((atomic_read(&journal->j_wcount) > 0 || 3063 while ((atomic_read(&journal->j_wcount) > 0 ||
2988 atomic_read(&journal->j_jlock)) && 3064 atomic_read(&journal->j_jlock)) &&
@@ -3033,7 +3109,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
3033 3109
3034 if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) { 3110 if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
3035 unlock_journal(sb); 3111 unlock_journal(sb);
3112 reiserfs_write_unlock(sb);
3036 reiserfs_wait_on_write_block(sb); 3113 reiserfs_wait_on_write_block(sb);
3114 reiserfs_write_lock(sb);
3037 PROC_INFO_INC(sb, journal.journal_relock_writers); 3115 PROC_INFO_INC(sb, journal.journal_relock_writers);
3038 goto relock; 3116 goto relock;
3039 } 3117 }
@@ -3506,14 +3584,14 @@ static void flush_async_commits(struct work_struct *work)
3506 struct reiserfs_journal_list *jl; 3584 struct reiserfs_journal_list *jl;
3507 struct list_head *entry; 3585 struct list_head *entry;
3508 3586
3509 lock_kernel(); 3587 reiserfs_write_lock(sb);
3510 if (!list_empty(&journal->j_journal_list)) { 3588 if (!list_empty(&journal->j_journal_list)) {
3511 /* last entry is the youngest, commit it and you get everything */ 3589 /* last entry is the youngest, commit it and you get everything */
3512 entry = journal->j_journal_list.prev; 3590 entry = journal->j_journal_list.prev;
3513 jl = JOURNAL_LIST_ENTRY(entry); 3591 jl = JOURNAL_LIST_ENTRY(entry);
3514 flush_commit_list(sb, jl, 1); 3592 flush_commit_list(sb, jl, 1);
3515 } 3593 }
3516 unlock_kernel(); 3594 reiserfs_write_unlock(sb);
3517} 3595}
3518 3596
3519/* 3597/*
@@ -4041,7 +4119,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4041 * the new transaction is fully setup, and we've already flushed the 4119 * the new transaction is fully setup, and we've already flushed the
4042 * ordered bh list 4120 * ordered bh list
4043 */ 4121 */
4044 mutex_lock(&jl->j_commit_mutex); 4122 reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
4045 4123
4046 /* save the transaction id in case we need to commit it later */ 4124 /* save the transaction id in case we need to commit it later */
4047 commit_trans_id = jl->j_trans_id; 4125 commit_trans_id = jl->j_trans_id;
@@ -4156,7 +4234,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4156 next = cn->next; 4234 next = cn->next;
4157 free_cnode(sb, cn); 4235 free_cnode(sb, cn);
4158 cn = next; 4236 cn = next;
4237 reiserfs_write_unlock(sb);
4159 cond_resched(); 4238 cond_resched();
4239 reiserfs_write_lock(sb);
4160 } 4240 }
4161 4241
4162 /* we are done with both the c_bh and d_bh, but 4242 /* we are done with both the c_bh and d_bh, but
@@ -4203,10 +4283,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4203 * is lost. 4283 * is lost.
4204 */ 4284 */
4205 if (!list_empty(&jl->j_tail_bh_list)) { 4285 if (!list_empty(&jl->j_tail_bh_list)) {
4206 unlock_kernel(); 4286 reiserfs_write_unlock(sb);
4207 write_ordered_buffers(&journal->j_dirty_buffers_lock, 4287 write_ordered_buffers(&journal->j_dirty_buffers_lock,
4208 journal, jl, &jl->j_tail_bh_list); 4288 journal, jl, &jl->j_tail_bh_list);
4209 lock_kernel(); 4289 reiserfs_write_lock(sb);
4210 } 4290 }
4211 BUG_ON(!list_empty(&jl->j_tail_bh_list)); 4291 BUG_ON(!list_empty(&jl->j_tail_bh_list));
4212 mutex_unlock(&jl->j_commit_mutex); 4292 mutex_unlock(&jl->j_commit_mutex);
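
Nearly every journal.c hunk above applies one mechanical recipe: release the write lock before anything that may sleep on I/O or the scheduler (wait_on_buffer(), sync_dirty_buffer(), ll_rw_block() on a dirty buffer, schedule(), congestion_wait()), then retake it, and acquire nested mutexes through reiserfs_mutex_lock_safe() so the write lock is not held while blocking on them. A minimal sketch of the buffer-wait case, assuming s is the mounted superblock and bh a buffer already submitted for write:

    if (buffer_dirty(bh)) {
        /* sync_dirty_buffer() sleeps; don't hold the write lock across it */
        reiserfs_write_unlock(s);
        sync_dirty_buffer(bh);
        reiserfs_write_lock(s);
    }
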
diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
new file mode 100644
index 000000000000..b87aa2c1afc1
--- /dev/null
+++ b/fs/reiserfs/lock.c
@@ -0,0 +1,97 @@
1#include <linux/reiserfs_fs.h>
2#include <linux/mutex.h>
3
4/*
5 * The previous reiserfs locking scheme was heavily based on
6 * the tricky properties of the Bkl:
7 *
8 * - it was acquired recursively by the same task
9 * - performance relied on the release-while-schedule() property
10 *
11 * Now that we replace it with a mutex, we still want to keep the same
12 * recursive property to avoid big changes in the code structure.
13 * We use our own lock_owner here because the owner field on a mutex
14 * is only available in SMP or mutex debugging; also, we only need this
15 * field for this mutex, with no need for a system-wide mutex facility.
16 *
17 * Also, this lock is often released before a call that could block because
18 * reiserfs performance was partially based on the release-while-schedule()
19 * property of the Bkl.
20 */
21void reiserfs_write_lock(struct super_block *s)
22{
23 struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
24
25 if (sb_i->lock_owner != current) {
26 mutex_lock(&sb_i->lock);
27 sb_i->lock_owner = current;
28 }
29
30 /* No need to protect it, only the current task touches it */
31 sb_i->lock_depth++;
32}
33
34void reiserfs_write_unlock(struct super_block *s)
35{
36 struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
37
38 /*
39 * Are we unlocking without even holding the lock?
40 * Such a situation must raise a BUG() if we don't want
41 * to corrupt the data.
42 */
43 BUG_ON(sb_i->lock_owner != current);
44
45 if (--sb_i->lock_depth == -1) {
46 sb_i->lock_owner = NULL;
47 mutex_unlock(&sb_i->lock);
48 }
49}
50
51/*
52 * If we already own the lock, just exit and don't increase the depth.
53 * Useful when we don't want to lock more than once.
54 *
55 * We always return the lock_depth we had before calling
56 * this function.
57 */
58int reiserfs_write_lock_once(struct super_block *s)
59{
60 struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
61
62 if (sb_i->lock_owner != current) {
63 mutex_lock(&sb_i->lock);
64 sb_i->lock_owner = current;
65 return sb_i->lock_depth++;
66 }
67
68 return sb_i->lock_depth;
69}
70
71void reiserfs_write_unlock_once(struct super_block *s, int lock_depth)
72{
73 if (lock_depth == -1)
74 reiserfs_write_unlock(s);
75}
76
77/*
78 * Utility function to force a BUG if it is called without the superblock
79 * write lock held. caller is the string printed just before calling BUG()
80 */
81void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
82{
83 struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
84
85 if (sb_i->lock_depth < 0)
86 reiserfs_panic(sb, "%s called without write lock held",
87 caller);
88}
89
90#ifdef CONFIG_REISERFS_CHECK
91void reiserfs_lock_check_recursive(struct super_block *sb)
92{
93 struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
94
95 WARN_ONCE((sb_i->lock_depth > 0), "Unwanted recursive reiserfs lock!\n");
96}
97#endif
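
The _once variants above exist for paths that can be entered either with or without the lock already held (lookup, setattr, dirty_inode): they record the depth at entry and only drop the mutex when the caller was the outermost acquirer. A sketch of the calling convention, following the definitions above:

    int depth;

    /* no-op recursion bump if this task already holds the lock */
    depth = reiserfs_write_lock_once(sb);

    /* ... work that must not take the lock a second time ... */

    /* releases the mutex only if we were the outermost acquirer */
    reiserfs_write_unlock_once(sb, depth);
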
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 271579128634..d0c43cb99ffc 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -13,6 +13,7 @@
13 13
14#include <linux/time.h> 14#include <linux/time.h>
15#include <linux/bitops.h> 15#include <linux/bitops.h>
16#include <linux/slab.h>
16#include <linux/reiserfs_fs.h> 17#include <linux/reiserfs_fs.h>
17#include <linux/reiserfs_acl.h> 18#include <linux/reiserfs_acl.h>
18#include <linux/reiserfs_xattr.h> 19#include <linux/reiserfs_xattr.h>
@@ -324,6 +325,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
324 struct nameidata *nd) 325 struct nameidata *nd)
325{ 326{
326 int retval; 327 int retval;
328 int lock_depth;
327 struct inode *inode = NULL; 329 struct inode *inode = NULL;
328 struct reiserfs_dir_entry de; 330 struct reiserfs_dir_entry de;
329 INITIALIZE_PATH(path_to_entry); 331 INITIALIZE_PATH(path_to_entry);
@@ -331,7 +333,13 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
331 if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len) 333 if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len)
332 return ERR_PTR(-ENAMETOOLONG); 334 return ERR_PTR(-ENAMETOOLONG);
333 335
334 reiserfs_write_lock(dir->i_sb); 336 /*
337 * This may be called with or without the write lock held; be careful
338 * not to take it recursively, since we may want to release the lock
339 * before rescheduling.
340 */
341 lock_depth = reiserfs_write_lock_once(dir->i_sb);
342
335 de.de_gen_number_bit_string = NULL; 343 de.de_gen_number_bit_string = NULL;
336 retval = 344 retval =
337 reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, 345 reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
@@ -341,7 +349,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
341 inode = reiserfs_iget(dir->i_sb, 349 inode = reiserfs_iget(dir->i_sb,
342 (struct cpu_key *)&(de.de_dir_id)); 350 (struct cpu_key *)&(de.de_dir_id));
343 if (!inode || IS_ERR(inode)) { 351 if (!inode || IS_ERR(inode)) {
344 reiserfs_write_unlock(dir->i_sb); 352 reiserfs_write_unlock_once(dir->i_sb, lock_depth);
345 return ERR_PTR(-EACCES); 353 return ERR_PTR(-EACCES);
346 } 354 }
347 355
@@ -350,7 +358,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
350 if (IS_PRIVATE(dir)) 358 if (IS_PRIVATE(dir))
351 inode->i_flags |= S_PRIVATE; 359 inode->i_flags |= S_PRIVATE;
352 } 360 }
353 reiserfs_write_unlock(dir->i_sb); 361 reiserfs_write_unlock_once(dir->i_sb, lock_depth);
354 if (retval == IO_ERROR) { 362 if (retval == IO_ERROR) {
355 return ERR_PTR(-EIO); 363 return ERR_PTR(-EIO);
356 } 364 }
@@ -539,7 +547,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
539*/ 547*/
540static int drop_new_inode(struct inode *inode) 548static int drop_new_inode(struct inode *inode)
541{ 549{
542 vfs_dq_drop(inode); 550 dquot_drop(inode);
543 make_bad_inode(inode); 551 make_bad_inode(inode);
544 inode->i_flags |= S_NOQUOTA; 552 inode->i_flags |= S_NOQUOTA;
545 iput(inode); 553 iput(inode);
@@ -547,7 +555,7 @@ static int drop_new_inode(struct inode *inode)
547} 555}
548 556
549/* utility function that does setup for reiserfs_new_inode. 557/* utility function that does setup for reiserfs_new_inode.
550** vfs_dq_init needs lots of credits so it's better to have it 558** dquot_initialize needs lots of credits so it's better to have it
551** outside of a transaction, so we had to pull some bits of 559** outside of a transaction, so we had to pull some bits of
552** reiserfs_new_inode out into this func. 560** reiserfs_new_inode out into this func.
553*/ 561*/
@@ -570,7 +578,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
570 } else { 578 } else {
571 inode->i_gid = current_fsgid(); 579 inode->i_gid = current_fsgid();
572 } 580 }
573 vfs_dq_init(inode); 581 dquot_initialize(inode);
574 return 0; 582 return 0;
575} 583}
576 584
@@ -587,6 +595,8 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
587 struct reiserfs_transaction_handle th; 595 struct reiserfs_transaction_handle th;
588 struct reiserfs_security_handle security; 596 struct reiserfs_security_handle security;
589 597
598 dquot_initialize(dir);
599
590 if (!(inode = new_inode(dir->i_sb))) { 600 if (!(inode = new_inode(dir->i_sb))) {
591 return -ENOMEM; 601 return -ENOMEM;
592 } 602 }
@@ -659,6 +669,8 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
659 if (!new_valid_dev(rdev)) 669 if (!new_valid_dev(rdev))
660 return -EINVAL; 670 return -EINVAL;
661 671
672 dquot_initialize(dir);
673
662 if (!(inode = new_inode(dir->i_sb))) { 674 if (!(inode = new_inode(dir->i_sb))) {
663 return -ENOMEM; 675 return -ENOMEM;
664 } 676 }
@@ -725,12 +737,15 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
725 struct inode *inode; 737 struct inode *inode;
726 struct reiserfs_transaction_handle th; 738 struct reiserfs_transaction_handle th;
727 struct reiserfs_security_handle security; 739 struct reiserfs_security_handle security;
740 int lock_depth;
728 /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ 741 /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
729 int jbegin_count = 742 int jbegin_count =
730 JOURNAL_PER_BALANCE_CNT * 3 + 743 JOURNAL_PER_BALANCE_CNT * 3 +
731 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + 744 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
732 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); 745 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
733 746
747 dquot_initialize(dir);
748
734#ifdef DISPLACE_NEW_PACKING_LOCALITIES 749#ifdef DISPLACE_NEW_PACKING_LOCALITIES
735 /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ 750 /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */
736 REISERFS_I(dir)->new_packing_locality = 1; 751 REISERFS_I(dir)->new_packing_locality = 1;
@@ -748,7 +763,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
748 return retval; 763 return retval;
749 } 764 }
750 jbegin_count += retval; 765 jbegin_count += retval;
751 reiserfs_write_lock(dir->i_sb); 766 lock_depth = reiserfs_write_lock_once(dir->i_sb);
752 767
753 retval = journal_begin(&th, dir->i_sb, jbegin_count); 768 retval = journal_begin(&th, dir->i_sb, jbegin_count);
754 if (retval) { 769 if (retval) {
@@ -798,8 +813,8 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
798 d_instantiate(dentry, inode); 813 d_instantiate(dentry, inode);
799 unlock_new_inode(inode); 814 unlock_new_inode(inode);
800 retval = journal_end(&th, dir->i_sb, jbegin_count); 815 retval = journal_end(&th, dir->i_sb, jbegin_count);
801 out_failed: 816out_failed:
802 reiserfs_write_unlock(dir->i_sb); 817 reiserfs_write_unlock_once(dir->i_sb, lock_depth);
803 return retval; 818 return retval;
804} 819}
805 820
@@ -834,6 +849,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
834 JOURNAL_PER_BALANCE_CNT * 2 + 2 + 849 JOURNAL_PER_BALANCE_CNT * 2 + 2 +
835 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); 850 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
836 851
852 dquot_initialize(dir);
853
837 reiserfs_write_lock(dir->i_sb); 854 reiserfs_write_lock(dir->i_sb);
838 retval = journal_begin(&th, dir->i_sb, jbegin_count); 855 retval = journal_begin(&th, dir->i_sb, jbegin_count);
839 if (retval) 856 if (retval)
@@ -913,6 +930,9 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
913 struct reiserfs_transaction_handle th; 930 struct reiserfs_transaction_handle th;
914 int jbegin_count; 931 int jbegin_count;
915 unsigned long savelink; 932 unsigned long savelink;
933 int depth;
934
935 dquot_initialize(dir);
916 936
917 inode = dentry->d_inode; 937 inode = dentry->d_inode;
918 938
@@ -924,7 +944,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
924 JOURNAL_PER_BALANCE_CNT * 2 + 2 + 944 JOURNAL_PER_BALANCE_CNT * 2 + 2 +
925 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); 945 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
926 946
927 reiserfs_write_lock(dir->i_sb); 947 depth = reiserfs_write_lock_once(dir->i_sb);
928 retval = journal_begin(&th, dir->i_sb, jbegin_count); 948 retval = journal_begin(&th, dir->i_sb, jbegin_count);
929 if (retval) 949 if (retval)
930 goto out_unlink; 950 goto out_unlink;
@@ -985,7 +1005,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
985 1005
986 retval = journal_end(&th, dir->i_sb, jbegin_count); 1006 retval = journal_end(&th, dir->i_sb, jbegin_count);
987 reiserfs_check_path(&path); 1007 reiserfs_check_path(&path);
988 reiserfs_write_unlock(dir->i_sb); 1008 reiserfs_write_unlock_once(dir->i_sb, depth);
989 return retval; 1009 return retval;
990 1010
991 end_unlink: 1011 end_unlink:
@@ -995,7 +1015,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
995 if (err) 1015 if (err)
996 retval = err; 1016 retval = err;
997 out_unlink: 1017 out_unlink:
998 reiserfs_write_unlock(dir->i_sb); 1018 reiserfs_write_unlock_once(dir->i_sb, depth);
999 return retval; 1019 return retval;
1000} 1020}
1001 1021
@@ -1015,6 +1035,8 @@ static int reiserfs_symlink(struct inode *parent_dir,
1015 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + 1035 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) +
1016 REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb)); 1036 REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb));
1017 1037
1038 dquot_initialize(parent_dir);
1039
1018 if (!(inode = new_inode(parent_dir->i_sb))) { 1040 if (!(inode = new_inode(parent_dir->i_sb))) {
1019 return -ENOMEM; 1041 return -ENOMEM;
1020 } 1042 }
@@ -1102,6 +1124,8 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
1102 JOURNAL_PER_BALANCE_CNT * 3 + 1124 JOURNAL_PER_BALANCE_CNT * 3 +
1103 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); 1125 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
1104 1126
1127 dquot_initialize(dir);
1128
1105 reiserfs_write_lock(dir->i_sb); 1129 reiserfs_write_lock(dir->i_sb);
1106 if (inode->i_nlink >= REISERFS_LINK_MAX) { 1130 if (inode->i_nlink >= REISERFS_LINK_MAX) {
1107 //FIXME: sd_nlink is 32 bit for new files 1131 //FIXME: sd_nlink is 32 bit for new files
@@ -1226,6 +1250,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1226 JOURNAL_PER_BALANCE_CNT * 3 + 5 + 1250 JOURNAL_PER_BALANCE_CNT * 3 + 5 +
1227 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); 1251 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb);
1228 1252
1253 dquot_initialize(old_dir);
1254 dquot_initialize(new_dir);
1255
1229 old_inode = old_dentry->d_inode; 1256 old_inode = old_dentry->d_inode;
1230 new_dentry_inode = new_dentry->d_inode; 1257 new_dentry_inode = new_dentry->d_inode;
1231 1258
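
The namei.c hunks replace the old vfs_dq_* hooks with direct dquot_*() calls and hoist dquot_initialize() to the top of each directory-modifying entry point (create, mknod, mkdir, rmdir, unlink, symlink, link, rename), before journal_begin(), since initializing dquots needs journal credits of its own and must not run inside the live transaction (see the comment kept in new_inode_init() above). A sketch of the resulting shape; the operation body and the jbegin_count value are placeholders:

    static int example_dir_op(struct inode *dir, struct dentry *dentry)
    {
        struct reiserfs_transaction_handle th;
        int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; /* placeholder */
        int retval;

        dquot_initialize(dir);          /* before the transaction begins */

        reiserfs_write_lock(dir->i_sb);
        retval = journal_begin(&th, dir->i_sb, jbegin_count);
        if (retval)
            goto out;

        /* ... journaled directory modification ... */

        retval = journal_end(&th, dir->i_sb, jbegin_count);
    out:
        reiserfs_write_unlock(dir->i_sb);
        return retval;
    }
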
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 536eacaeb710..adbc6f538515 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -349,10 +349,6 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
349 349
350 . */ 350 . */
351 351
352#ifdef CONFIG_REISERFS_CHECK
353extern struct tree_balance *cur_tb;
354#endif
355
356void __reiserfs_panic(struct super_block *sb, const char *id, 352void __reiserfs_panic(struct super_block *sb, const char *id,
357 const char *function, const char *fmt, ...) 353 const char *function, const char *fmt, ...)
358{ 354{
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 9229e5514a4e..7a9981196c1c 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -17,8 +17,6 @@
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/proc_fs.h> 18#include <linux/proc_fs.h>
19 19
20#ifdef CONFIG_REISERFS_PROC_INFO
21
22/* 20/*
23 * LOCKING: 21 * LOCKING:
24 * 22 *
@@ -48,14 +46,6 @@ static int show_version(struct seq_file *m, struct super_block *sb)
48 return 0; 46 return 0;
49} 47}
50 48
51int reiserfs_global_version_in_proc(char *buffer, char **start, off_t offset,
52 int count, int *eof, void *data)
53{
54 *start = buffer;
55 *eof = 1;
56 return 0;
57}
58
59#define SF( x ) ( r -> x ) 49#define SF( x ) ( r -> x )
60#define SFP( x ) SF( s_proc_info_data.x ) 50#define SFP( x ) SF( s_proc_info_data.x )
61#define SFPL( x ) SFP( x[ level ] ) 51#define SFPL( x ) SFP( x[ level ] )
@@ -538,19 +528,6 @@ int reiserfs_proc_info_done(struct super_block *sb)
538 return 0; 528 return 0;
539} 529}
540 530
541struct proc_dir_entry *reiserfs_proc_register_global(char *name,
542 read_proc_t * func)
543{
544 return (proc_info_root) ? create_proc_read_entry(name, 0,
545 proc_info_root,
546 func, NULL) : NULL;
547}
548
549void reiserfs_proc_unregister_global(const char *name)
550{
551 remove_proc_entry(name, proc_info_root);
552}
553
554int reiserfs_proc_info_global_init(void) 531int reiserfs_proc_info_global_init(void)
555{ 532{
556 if (proc_info_root == NULL) { 533 if (proc_info_root == NULL) {
@@ -572,48 +549,6 @@ int reiserfs_proc_info_global_done(void)
572 } 549 }
573 return 0; 550 return 0;
574} 551}
575
576/* REISERFS_PROC_INFO */
577#else
578
579int reiserfs_proc_info_init(struct super_block *sb)
580{
581 return 0;
582}
583int reiserfs_proc_info_done(struct super_block *sb)
584{
585 return 0;
586}
587
588struct proc_dir_entry *reiserfs_proc_register_global(char *name,
589 read_proc_t * func)
590{
591 return NULL;
592}
593
594void reiserfs_proc_unregister_global(const char *name)
595{;
596}
597
598int reiserfs_proc_info_global_init(void)
599{
600 return 0;
601}
602int reiserfs_proc_info_global_done(void)
603{
604 return 0;
605}
606
607int reiserfs_global_version_in_proc(char *buffer, char **start,
608 off_t offset,
609 int count, int *eof, void *data)
610{
611 return 0;
612}
613
614/* REISERFS_PROC_INFO */
615#endif
616
617/* 552/*
618 * Revision 1.1.8.2 2001/07/15 17:08:42 god 553 * Revision 1.1.8.2 2001/07/15 17:08:42 god
619 * . use get_super() in procfs.c 554 * . use get_super() in procfs.c
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 18b315d3d104..b3a94d20f0fc 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -141,7 +141,9 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
141 141
142 set_buffer_uptodate(bh); 142 set_buffer_uptodate(bh);
143 mark_buffer_dirty(bh); 143 mark_buffer_dirty(bh);
144 reiserfs_write_unlock(s);
144 sync_dirty_buffer(bh); 145 sync_dirty_buffer(bh);
146 reiserfs_write_lock(s);
145 // update bitmap_info stuff 147 // update bitmap_info stuff
146 bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; 148 bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
147 brelse(bh); 149 brelse(bh);
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index d036ee5b1c81..313d39d639eb 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -222,9 +222,6 @@ static inline int bin_search(const void *key, /* Key to search for. */
222 return ITEM_NOT_FOUND; 222 return ITEM_NOT_FOUND;
223} 223}
224 224
225#ifdef CONFIG_REISERFS_CHECK
226extern struct tree_balance *cur_tb;
227#endif
228 225
229/* Minimal possible key. It is never in the tree. */ 226/* Minimal possible key. It is never in the tree. */
230const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} }; 227const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} };
@@ -519,25 +516,48 @@ static int is_tree_node(struct buffer_head *bh, int level)
519 516
520#define SEARCH_BY_KEY_READA 16 517#define SEARCH_BY_KEY_READA 16
521 518
522/* The function is NOT SCHEDULE-SAFE! */ 519/*
523static void search_by_key_reada(struct super_block *s, 520 * The function is NOT SCHEDULE-SAFE!
521 * It might unlock the write lock if we needed to wait for a block
522 * to be read. Note that in this case it won't re-acquire the lock, to
523 * avoid high contention resulting from too many lock requests; moreover,
524 * the caller (search_by_key) will perform other schedule-unsafe
525 * operations just after calling this function.
526 *
527 * @return true if the write lock was released
528 */
529static bool search_by_key_reada(struct super_block *s,
524 struct buffer_head **bh, 530 struct buffer_head **bh,
525 b_blocknr_t *b, int num) 531 b_blocknr_t *b, int num)
526{ 532{
527 int i, j; 533 int i, j;
534 bool unlocked = false;
528 535
529 for (i = 0; i < num; i++) { 536 for (i = 0; i < num; i++) {
530 bh[i] = sb_getblk(s, b[i]); 537 bh[i] = sb_getblk(s, b[i]);
531 } 538 }
539 /*
540 * We are going to read some blocks on which we
541 * hold a reference. It's safe: even though the blocks
542 * may be changed concurrently once we release the
543 * lock, that's still fine because we check later
544 * whether the tree changed.
545 */
532 for (j = 0; j < i; j++) { 546 for (j = 0; j < i; j++) {
533 /* 547 /*
534 * note, this needs attention if we are getting rid of the BKL 548 * note, this needs attention if we are getting rid of the BKL
535 * you have to make sure the prepared bit isn't set on this buffer 549 * you have to make sure the prepared bit isn't set on this buffer
536 */ 550 */
537 if (!buffer_uptodate(bh[j])) 551 if (!buffer_uptodate(bh[j])) {
552 if (!unlocked) {
553 reiserfs_write_unlock(s);
554 unlocked = true;
555 }
538 ll_rw_block(READA, 1, bh + j); 556 ll_rw_block(READA, 1, bh + j);
557 }
539 brelse(bh[j]); 558 brelse(bh[j]);
540 } 559 }
560 return unlocked;
541} 561}
542 562
543/************************************************************************** 563/**************************************************************************
@@ -625,11 +645,26 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
625 have a pointer to it. */ 645 have a pointer to it. */
626 if ((bh = last_element->pe_buffer = 646 if ((bh = last_element->pe_buffer =
627 sb_getblk(sb, block_number))) { 647 sb_getblk(sb, block_number))) {
648 bool unlocked = false;
649
628 if (!buffer_uptodate(bh) && reada_count > 1) 650 if (!buffer_uptodate(bh) && reada_count > 1)
629 search_by_key_reada(sb, reada_bh, 651 /* may unlock the write lock */
652 unlocked = search_by_key_reada(sb, reada_bh,
630 reada_blocks, reada_count); 653 reada_blocks, reada_count);
654 /*
655 * If we haven't already unlocked the write lock,
656 * then we need to do that here before reading
657 * the current block
658 */
659 if (!buffer_uptodate(bh) && !unlocked) {
660 reiserfs_write_unlock(sb);
661 unlocked = true;
662 }
631 ll_rw_block(READ, 1, &bh); 663 ll_rw_block(READ, 1, &bh);
632 wait_on_buffer(bh); 664 wait_on_buffer(bh);
665
666 if (unlocked)
667 reiserfs_write_lock(sb);
633 if (!buffer_uptodate(bh)) 668 if (!buffer_uptodate(bh))
634 goto io_error; 669 goto io_error;
635 } else { 670 } else {
@@ -673,7 +708,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
673 !key_in_buffer(search_path, key, sb), 708 !key_in_buffer(search_path, key, sb),
674 "PAP-5130: key is not in the buffer"); 709 "PAP-5130: key is not in the buffer");
675#ifdef CONFIG_REISERFS_CHECK 710#ifdef CONFIG_REISERFS_CHECK
676 if (cur_tb) { 711 if (REISERFS_SB(sb)->cur_tb) {
677 print_cur_tb("5140"); 712 print_cur_tb("5140");
678 reiserfs_panic(sb, "PAP-5140", 713 reiserfs_panic(sb, "PAP-5140",
679 "schedule occurred in do_balance!"); 714 "schedule occurred in do_balance!");
@@ -1024,7 +1059,9 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
1024 reiserfs_free_block(th, inode, block, 1); 1059 reiserfs_free_block(th, inode, block, 1);
1025 } 1060 }
1026 1061
1062 reiserfs_write_unlock(sb);
1027 cond_resched(); 1063 cond_resched();
1064 reiserfs_write_lock(sb);
1028 1065
1029 if (item_moved (&s_ih, path)) { 1066 if (item_moved (&s_ih, path)) {
1030 need_re_search = 1; 1067 need_re_search = 1;
@@ -1262,7 +1299,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
1262 "reiserquota delete_item(): freeing %u, id=%u type=%c", 1299 "reiserquota delete_item(): freeing %u, id=%u type=%c",
1263 quota_cut_bytes, inode->i_uid, head2type(&s_ih)); 1300 quota_cut_bytes, inode->i_uid, head2type(&s_ih));
1264#endif 1301#endif
1265 vfs_dq_free_space_nodirty(inode, quota_cut_bytes); 1302 dquot_free_space_nodirty(inode, quota_cut_bytes);
1266 1303
1267 /* Return deleted body length */ 1304 /* Return deleted body length */
1268 return ret_value; 1305 return ret_value;
@@ -1346,7 +1383,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
1346 quota_cut_bytes, inode->i_uid, 1383 quota_cut_bytes, inode->i_uid,
1347 key2type(key)); 1384 key2type(key));
1348#endif 1385#endif
1349 vfs_dq_free_space_nodirty(inode, 1386 dquot_free_space_nodirty(inode,
1350 quota_cut_bytes); 1387 quota_cut_bytes);
1351 } 1388 }
1352 break; 1389 break;
@@ -1696,7 +1733,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
1696 "reiserquota cut_from_item(): freeing %u id=%u type=%c", 1733 "reiserquota cut_from_item(): freeing %u id=%u type=%c",
1697 quota_cut_bytes, inode->i_uid, '?'); 1734 quota_cut_bytes, inode->i_uid, '?');
1698#endif 1735#endif
1699 vfs_dq_free_space_nodirty(inode, quota_cut_bytes); 1736 dquot_free_space_nodirty(inode, quota_cut_bytes);
1700 return ret_value; 1737 return ret_value;
1701} 1738}
1702 1739
@@ -1931,9 +1968,10 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
1931 key2type(&(key->on_disk_key))); 1968 key2type(&(key->on_disk_key)));
1932#endif 1969#endif
1933 1970
1934 if (vfs_dq_alloc_space_nodirty(inode, pasted_size)) { 1971 retval = dquot_alloc_space_nodirty(inode, pasted_size);
1972 if (retval) {
1935 pathrelse(search_path); 1973 pathrelse(search_path);
1936 return -EDQUOT; 1974 return retval;
1937 } 1975 }
1938 init_tb_struct(th, &s_paste_balance, th->t_super, search_path, 1976 init_tb_struct(th, &s_paste_balance, th->t_super, search_path,
1939 pasted_size); 1977 pasted_size);
@@ -1987,7 +2025,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
1987 pasted_size, inode->i_uid, 2025 pasted_size, inode->i_uid,
1988 key2type(&(key->on_disk_key))); 2026 key2type(&(key->on_disk_key)));
1989#endif 2027#endif
1990 vfs_dq_free_space_nodirty(inode, pasted_size); 2028 dquot_free_space_nodirty(inode, pasted_size);
1991 return retval; 2029 return retval;
1992} 2030}
1993 2031
@@ -2025,9 +2063,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
2025#endif 2063#endif
2026 /* We can't dirty inode here. It would be immediately written but 2064 /* We can't dirty inode here. It would be immediately written but
2027 * appropriate stat item isn't inserted yet... */ 2065 * appropriate stat item isn't inserted yet... */
2028 if (vfs_dq_alloc_space_nodirty(inode, quota_bytes)) { 2066 retval = dquot_alloc_space_nodirty(inode, quota_bytes);
2067 if (retval) {
2029 pathrelse(path); 2068 pathrelse(path);
2030 return -EDQUOT; 2069 return retval;
2031 } 2070 }
2032 } 2071 }
2033 init_tb_struct(th, &s_ins_balance, th->t_super, path, 2072 init_tb_struct(th, &s_ins_balance, th->t_super, path,
@@ -2076,6 +2115,6 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
2076 quota_bytes, inode->i_uid, head2type(ih)); 2115 quota_bytes, inode->i_uid, head2type(ih));
2077#endif 2116#endif
2078 if (inode) 2117 if (inode)
2079 vfs_dq_free_space_nodirty(inode, quota_bytes); 2118 dquot_free_space_nodirty(inode, quota_bytes);
2080 return retval; 2119 return retval;
2081} 2120}
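
The stree.c quota hunks also change the error convention: dquot_alloc_space_nodirty() returns an errno directly, so callers now propagate that value instead of mapping every failure to -EDQUOT as the old boolean vfs_dq_alloc_space_nodirty() forced them to. The pattern, as used in reiserfs_paste_into_item() and reiserfs_insert_item() above:

    retval = dquot_alloc_space_nodirty(inode, pasted_size);
    if (retval) {
        pathrelse(search_path);
        return retval;  /* propagate the errno from the quota code */
    }
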
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f0ad05f38022..59125fb36d42 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/time.h>
 #include <asm/uaccess.h>
@@ -246,7 +247,7 @@ static int finish_unfinished(struct super_block *s)
 			retval = remove_save_link_only(s, &save_link_key, 0);
 			continue;
 		}
-		vfs_dq_init(inode);
+		dquot_initialize(inode);
 
 		if (truncate && S_ISDIR(inode->i_mode)) {
 			/* We got a truncate request for a dir which is impossible.
@@ -465,7 +466,7 @@ static void reiserfs_put_super(struct super_block *s)
 	struct reiserfs_transaction_handle th;
 	th.t_trans_id = 0;
 
-	lock_kernel();
+	reiserfs_write_lock(s);
 
 	if (s->s_dirt)
 		reiserfs_write_super(s);
@@ -499,10 +500,10 @@ static void reiserfs_put_super(struct super_block *s)
 
 	reiserfs_proc_info_done(s);
 
+	reiserfs_write_unlock(s);
+	mutex_destroy(&REISERFS_SB(s)->lock);
 	kfree(s->s_fs_info);
 	s->s_fs_info = NULL;
-
-	unlock_kernel();
 }
 
 static struct kmem_cache *reiserfs_inode_cachep;
@@ -554,25 +555,33 @@ static void reiserfs_dirty_inode(struct inode *inode)
 	struct reiserfs_transaction_handle th;
 
 	int err = 0;
+	int lock_depth;
+
 	if (inode->i_sb->s_flags & MS_RDONLY) {
 		reiserfs_warning(inode->i_sb, "clm-6006",
 				 "writing inode %lu on readonly FS",
 				 inode->i_ino);
 		return;
 	}
-	reiserfs_write_lock(inode->i_sb);
+	lock_depth = reiserfs_write_lock_once(inode->i_sb);
 
 	/* this is really only used for atime updates, so they don't have
 	** to be included in O_SYNC or fsync
 	*/
 	err = journal_begin(&th, inode->i_sb, 1);
-	if (err) {
-		reiserfs_write_unlock(inode->i_sb);
-		return;
-	}
+	if (err)
+		goto out;
+
 	reiserfs_update_sd(&th, inode);
 	journal_end(&th, inode->i_sb, 1);
-	reiserfs_write_unlock(inode->i_sb);
+
+out:
+	reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+}
+
+static void reiserfs_clear_inode(struct inode *inode)
+{
+	dquot_drop(inode);
 }
 
 #ifdef CONFIG_QUOTA
@@ -587,6 +596,7 @@ static const struct super_operations reiserfs_sops = {
 	.destroy_inode = reiserfs_destroy_inode,
 	.write_inode = reiserfs_write_inode,
 	.dirty_inode = reiserfs_dirty_inode,
+	.clear_inode = reiserfs_clear_inode,
 	.delete_inode = reiserfs_delete_inode,
 	.put_super = reiserfs_put_super,
 	.write_super = reiserfs_write_super,
@@ -613,13 +623,6 @@ static int reiserfs_write_info(struct super_block *, int);
 static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
 
 static const struct dquot_operations reiserfs_quota_operations = {
-	.initialize = dquot_initialize,
-	.drop = dquot_drop,
-	.alloc_space = dquot_alloc_space,
-	.alloc_inode = dquot_alloc_inode,
-	.free_space = dquot_free_space,
-	.free_inode = dquot_free_inode,
-	.transfer = dquot_transfer,
 	.write_dquot = reiserfs_write_dquot,
 	.acquire_dquot = reiserfs_acquire_dquot,
 	.release_dquot = reiserfs_release_dquot,
@@ -1168,11 +1171,14 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	unsigned int qfmt = 0;
 #ifdef CONFIG_QUOTA
 	int i;
+#endif
+
+	reiserfs_write_lock(s);
 
+#ifdef CONFIG_QUOTA
 	memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
 #endif
 
-	lock_kernel();
 	rs = SB_DISK_SUPER_BLOCK(s);
 
 	if (!reiserfs_parse_options
@@ -1295,12 +1301,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 
 out_ok:
 	replace_mount_options(s, new_opts);
-	unlock_kernel();
+	reiserfs_write_unlock(s);
 	return 0;
 
 out_err:
 	kfree(new_opts);
-	unlock_kernel();
+	reiserfs_write_unlock(s);
 	return err;
 }
 
@@ -1404,7 +1410,9 @@ static int read_super_block(struct super_block *s, int offset)
 static int reread_meta_blocks(struct super_block *s)
 {
 	ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
+	reiserfs_write_unlock(s);
 	wait_on_buffer(SB_BUFFER_WITH_SB(s));
+	reiserfs_write_lock(s);
 	if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
 		reiserfs_warning(s, "reiserfs-2504", "error reading the super");
 		return 1;
@@ -1611,10 +1619,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	save_mount_options(s, data);
 
 	sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
-	if (!sbi) {
-		errval = -ENOMEM;
-		goto error;
-	}
+	if (!sbi)
+		return -ENOMEM;
 	s->s_fs_info = sbi;
 	/* Set default values for options: non-aggressive tails, RO on errors */
 	REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
@@ -1627,6 +1633,20 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	/* setup default block allocator options */
 	reiserfs_init_alloc_options(s);
 
+	mutex_init(&REISERFS_SB(s)->lock);
+	REISERFS_SB(s)->lock_depth = -1;
+
+	/*
+	 * This function is called with the bkl, which also was the old
+	 * locking used here.
+	 * do_journal_begin() will soon check if we hold the lock (ie: was
+	 * the bkl). This is likely because do_journal_begin() has several
+	 * other callers; at this time, it doesn't seem to be necessary to
+	 * protect against anything.
+	 * Anyway, let's be conservative and lock for now.
+	 */
+	reiserfs_write_lock(s);
+
 	jdev_name = NULL;
 	if (reiserfs_parse_options
 	    (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name,
@@ -1852,6 +1872,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	init_waitqueue_head(&(sbi->s_wait));
 	spin_lock_init(&sbi->bitmap_lock);
 
+	reiserfs_write_unlock(s);
+
 	return (0);
 
 error:
@@ -1859,6 +1881,8 @@ error:
 		journal_release_error(NULL, s);
 	}
 
+	reiserfs_write_unlock(s);
+
 	reiserfs_free_bitmap_cache(s);
 	if (SB_BUFFER_WITH_SB(s))
 		brelse(SB_BUFFER_WITH_SB(s));
@@ -2196,8 +2220,6 @@ static int __init init_reiserfs_fs(void)
 	}
 
 	reiserfs_proc_info_global_init();
-	reiserfs_proc_register_global("version",
-				      reiserfs_global_version_in_proc);
 
 	ret = register_filesystem(&reiserfs_fs_type);
 
@@ -2205,7 +2227,6 @@ static int __init init_reiserfs_fs(void)
 		return 0;
 	}
 
-	reiserfs_proc_unregister_global("version");
 	reiserfs_proc_info_global_done();
 	destroy_inodecache();
 
@@ -2214,7 +2235,6 @@ static int __init init_reiserfs_fs(void)
 
 static void __exit exit_reiserfs_fs(void)
 {
-	reiserfs_proc_unregister_global("version");
 	reiserfs_proc_info_global_done();
 	unregister_filesystem(&reiserfs_fs_type);
 	destroy_inodecache();
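
Taken together, the super.c hunks retire the BKL (lock_kernel()/unlock_kernel()) in favor of a per-superblock reiserfs write lock, and reiserfs_dirty_inode() switches to the reentrant *_once variants so it works whether or not the caller already holds the lock. A hedged sketch of that pattern, assuming only the lock-depth API visible in the diff:

	/* Illustrative only: take the write lock unless this task already
	 * holds it. reiserfs_write_lock_once() records the previous depth,
	 * and reiserfs_write_unlock_once() restores exactly that depth, so
	 * a nested callee never drops a lock its caller still needs. */
	static void example_journaled_op(struct super_block *sb)
	{
		int lock_depth;

		lock_depth = reiserfs_write_lock_once(sb);
		/* ... journal_begin()/journal_end() work goes here ... */
		reiserfs_write_unlock_once(sb, lock_depth);
	}
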
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 6925b835a43b..4f9586bb7631 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -38,6 +38,7 @@
 #include <linux/dcache.h>
 #include <linux/namei.h>
 #include <linux/errno.h>
+#include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/pagemap.h>
@@ -48,6 +49,7 @@
 #include <net/checksum.h>
 #include <linux/stat.h>
 #include <linux/quotaops.h>
+#include <linux/security.h>
 
 #define PRIVROOT_NAME ".reiserfs_priv"
 #define XAROOT_NAME "xattrs"
@@ -60,7 +62,6 @@
 static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
 {
 	BUG_ON(!mutex_is_locked(&dir->i_mutex));
-	vfs_dq_init(dir);
 	return dir->i_op->create(dir, dentry, mode, NULL);
 }
 #endif
@@ -68,7 +69,6 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
 static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	BUG_ON(!mutex_is_locked(&dir->i_mutex));
-	vfs_dq_init(dir);
 	return dir->i_op->mkdir(dir, dentry, mode);
 }
 
@@ -80,9 +80,9 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int error;
 	BUG_ON(!mutex_is_locked(&dir->i_mutex));
-	vfs_dq_init(dir);
 
-	mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
+	reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
+					I_MUTEX_CHILD, dir->i_sb);
 	error = dir->i_op->unlink(dir, dentry);
 	mutex_unlock(&dentry->d_inode->i_mutex);
 
@@ -95,9 +95,9 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	int error;
 	BUG_ON(!mutex_is_locked(&dir->i_mutex));
-	vfs_dq_init(dir);
 
-	mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
+	reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
+					I_MUTEX_CHILD, dir->i_sb);
 	dentry_unhash(dentry);
 	error = dir->i_op->rmdir(dir, dentry);
 	if (!error)
@@ -234,16 +234,22 @@ static int reiserfs_for_each_xattr(struct inode *inode,
 	if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1)
 		return 0;
 
+	reiserfs_write_unlock(inode->i_sb);
 	dir = open_xa_dir(inode, XATTR_REPLACE);
 	if (IS_ERR(dir)) {
 		err = PTR_ERR(dir);
+		reiserfs_write_lock(inode->i_sb);
 		goto out;
 	} else if (!dir->d_inode) {
 		err = 0;
+		reiserfs_write_lock(inode->i_sb);
 		goto out_dir;
 	}
 
 	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
+
+	reiserfs_write_lock(inode->i_sb);
+
 	buf.xadir = dir;
 	err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos);
 	while ((err == 0 || err == -ENOSPC) && buf.count) {
@@ -282,8 +288,9 @@ static int reiserfs_for_each_xattr(struct inode *inode,
 		err = journal_begin(&th, inode->i_sb, blocks);
 		if (!err) {
 			int jerror;
-			mutex_lock_nested(&dir->d_parent->d_inode->i_mutex,
-					  I_MUTEX_XATTR);
+			reiserfs_mutex_lock_nested_safe(
+					  &dir->d_parent->d_inode->i_mutex,
+					  I_MUTEX_XATTR, inode->i_sb);
 			err = action(dir, data);
 			jerror = journal_end(&th, inode->i_sb, blocks);
 			mutex_unlock(&dir->d_parent->d_inode->i_mutex);
@@ -442,7 +449,9 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name)
 	}
 
 	if (dentry->d_inode) {
+		reiserfs_write_lock(inode->i_sb);
 		err = xattr_unlink(xadir->d_inode, dentry);
+		reiserfs_write_unlock(inode->i_sb);
 		update_ctime(inode);
 	}
 
@@ -476,15 +485,24 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
 	if (get_inode_sd_version(inode) == STAT_DATA_V1)
 		return -EOPNOTSUPP;
 
-	if (!buffer)
-		return lookup_and_delete_xattr(inode, name);
+	reiserfs_write_unlock(inode->i_sb);
+
+	if (!buffer) {
+		err = lookup_and_delete_xattr(inode, name);
+		reiserfs_write_lock(inode->i_sb);
+		return err;
+	}
 
 	dentry = xattr_lookup(inode, name, flags);
-	if (IS_ERR(dentry))
+	if (IS_ERR(dentry)) {
+		reiserfs_write_lock(inode->i_sb);
 		return PTR_ERR(dentry);
+	}
 
 	down_write(&REISERFS_I(inode)->i_xattr_sem);
 
+	reiserfs_write_lock(inode->i_sb);
+
 	xahash = xattr_hash(buffer, buffer_size);
 	while (buffer_pos < buffer_size || buffer_pos == 0) {
 		size_t chunk;
@@ -539,8 +557,12 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
 			.ia_size = buffer_size,
 			.ia_valid = ATTR_SIZE | ATTR_CTIME,
 		};
+
+		reiserfs_write_unlock(inode->i_sb);
 		mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR);
 		down_write(&dentry->d_inode->i_alloc_sem);
+		reiserfs_write_lock(inode->i_sb);
+
 		err = reiserfs_setattr(dentry, &newattrs);
 		up_write(&dentry->d_inode->i_alloc_sem);
 		mutex_unlock(&dentry->d_inode->i_mutex);
@@ -726,15 +748,14 @@ ssize_t
 reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
		  size_t size)
 {
-	struct inode *inode = dentry->d_inode;
 	struct xattr_handler *handler;
 
-	handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name);
+	handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
 
-	if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1)
+	if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
 		return -EOPNOTSUPP;
 
-	return handler->get(inode, name, buffer, size);
+	return handler->get(dentry, name, buffer, size, handler->flags);
 }
 
 /*
@@ -746,15 +767,14 @@ int
 reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
		  size_t size, int flags)
 {
-	struct inode *inode = dentry->d_inode;
 	struct xattr_handler *handler;
 
-	handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name);
+	handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
 
-	if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1)
+	if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
 		return -EOPNOTSUPP;
 
-	return handler->set(inode, name, value, size, flags);
+	return handler->set(dentry, name, value, size, flags, handler->flags);
 }
 
 /*
@@ -764,21 +784,20 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
  */
 int reiserfs_removexattr(struct dentry *dentry, const char *name)
 {
-	struct inode *inode = dentry->d_inode;
 	struct xattr_handler *handler;
-	handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name);
+	handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
 
-	if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1)
+	if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
 		return -EOPNOTSUPP;
 
-	return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
+	return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags);
 }
 
 struct listxattr_buf {
 	size_t size;
 	size_t pos;
 	char *buf;
-	struct inode *inode;
+	struct dentry *dentry;
 };
 
 static int listxattr_filler(void *buf, const char *name, int namelen,
@@ -789,17 +808,19 @@ static int listxattr_filler(void *buf, const char *name, int namelen,
 	if (name[0] != '.' ||
	    (namelen != 1 && (name[1] != '.' || namelen != 2))) {
 		struct xattr_handler *handler;
-		handler = find_xattr_handler_prefix(b->inode->i_sb->s_xattr,
+		handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
						    name);
 		if (!handler)	/* Unsupported xattr name */
 			return 0;
 		if (b->buf) {
-			size = handler->list(b->inode, b->buf + b->pos,
-					b->size, name, namelen);
+			size = handler->list(b->dentry, b->buf + b->pos,
-					b->size, name, namelen,
+					handler->flags);
 			if (size > b->size)
 				return -ERANGE;
 		} else {
-			size = handler->list(b->inode, NULL, 0, name, namelen);
+			size = handler->list(b->dentry, NULL, 0, name,
+					namelen, handler->flags);
 		}
 
 		b->pos += size;
@@ -820,7 +841,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
 	int err = 0;
 	loff_t pos = 0;
 	struct listxattr_buf buf = {
-		.inode = dentry->d_inode,
+		.dentry = dentry,
 		.buf = buffer,
 		.size = buffer ? size : 0,
 	};
@@ -975,7 +996,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
 	int err = 0;
 
 	/* If we don't have the privroot located yet - go find it */
-	mutex_lock(&s->s_root->d_inode->i_mutex);
+	reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s);
 	dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
				strlen(PRIVROOT_NAME));
 	if (!IS_ERR(dentry)) {
@@ -1004,14 +1025,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
 		goto error;
 
 	if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) {
-		mutex_lock(&s->s_root->d_inode->i_mutex);
+		reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s);
 		err = create_privroot(REISERFS_SB(s)->priv_root);
 		mutex_unlock(&s->s_root->d_inode->i_mutex);
 	}
 
 	if (privroot->d_inode) {
 		s->s_xattr = reiserfs_xattr_handlers;
-		mutex_lock(&privroot->d_inode->i_mutex);
+		reiserfs_mutex_lock_safe(&privroot->d_inode->i_mutex, s);
 		if (!REISERFS_SB(s)->xattr_root) {
 			struct dentry *dentry;
 			dentry = lookup_one_len(XAROOT_NAME, privroot,
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 35d6e672a279..9cdb759645a9 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -5,6 +5,7 @@
 #include <linux/errno.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/reiserfs_acl.h>
@@ -15,8 +16,10 @@ static int reiserfs_set_acl(struct reiserfs_transaction_handle *th,
			    struct posix_acl *acl);
 
 static int
-xattr_set_acl(struct inode *inode, int type, const void *value, size_t size)
+posix_acl_set(struct dentry *dentry, const char *name, const void *value,
+		size_t size, int flags, int type)
 {
+	struct inode *inode = dentry->d_inode;
 	struct posix_acl *acl;
 	int error, error2;
 	struct reiserfs_transaction_handle th;
@@ -60,15 +63,16 @@ xattr_set_acl(struct inode *inode, int type, const void *value, size_t size)
 }
 
 static int
-xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
+posix_acl_get(struct dentry *dentry, const char *name, void *buffer,
+		size_t size, int type)
 {
 	struct posix_acl *acl;
 	int error;
 
-	if (!reiserfs_posixacl(inode->i_sb))
+	if (!reiserfs_posixacl(dentry->d_sb))
 		return -EOPNOTSUPP;
 
-	acl = reiserfs_get_acl(inode, type);
+	acl = reiserfs_get_acl(dentry->d_inode, type);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
 	if (acl == NULL)
@@ -452,7 +456,9 @@ int reiserfs_acl_chmod(struct inode *inode)
 		return 0;
 	}
 
+	reiserfs_write_unlock(inode->i_sb);
 	acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
+	reiserfs_write_lock(inode->i_sb);
 	if (!acl)
 		return 0;
 	if (IS_ERR(acl))
@@ -482,30 +488,12 @@ int reiserfs_acl_chmod(struct inode *inode)
 	return error;
 }
 
-static int
-posix_acl_access_get(struct inode *inode, const char *name,
-		     void *buffer, size_t size)
-{
-	if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1)
-		return -EINVAL;
-	return xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
-}
-
-static int
-posix_acl_access_set(struct inode *inode, const char *name,
-		     const void *value, size_t size, int flags)
-{
-	if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1)
-		return -EINVAL;
-	return xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
-}
-
-static size_t posix_acl_access_list(struct inode *inode, char *list,
+static size_t posix_acl_access_list(struct dentry *dentry, char *list,
				    size_t list_size, const char *name,
-				    size_t name_len)
+				    size_t name_len, int type)
 {
 	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-	if (!reiserfs_posixacl(inode->i_sb))
+	if (!reiserfs_posixacl(dentry->d_sb))
 		return 0;
 	if (list && size <= list_size)
 		memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
@@ -514,35 +502,18 @@ static size_t posix_acl_access_list(struct inode *inode, char *list,
 
 struct xattr_handler reiserfs_posix_acl_access_handler = {
 	.prefix = POSIX_ACL_XATTR_ACCESS,
-	.get = posix_acl_access_get,
-	.set = posix_acl_access_set,
+	.flags = ACL_TYPE_ACCESS,
+	.get = posix_acl_get,
+	.set = posix_acl_set,
 	.list = posix_acl_access_list,
 };
 
-static int
-posix_acl_default_get(struct inode *inode, const char *name,
-		      void *buffer, size_t size)
-{
-	if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1)
-		return -EINVAL;
-	return xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
-}
-
-static int
-posix_acl_default_set(struct inode *inode, const char *name,
-		      const void *value, size_t size, int flags)
-{
-	if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1)
-		return -EINVAL;
-	return xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
-}
-
-static size_t posix_acl_default_list(struct inode *inode, char *list,
+static size_t posix_acl_default_list(struct dentry *dentry, char *list,
				     size_t list_size, const char *name,
-				     size_t name_len)
+				     size_t name_len, int type)
 {
 	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-	if (!reiserfs_posixacl(inode->i_sb))
+	if (!reiserfs_posixacl(dentry->d_sb))
 		return 0;
 	if (list && size <= list_size)
 		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
@@ -551,7 +522,8 @@ static size_t posix_acl_default_list(struct inode *inode, char *list,
 
 struct xattr_handler reiserfs_posix_acl_default_handler = {
 	.prefix = POSIX_ACL_XATTR_DEFAULT,
-	.get = posix_acl_default_get,
-	.set = posix_acl_default_set,
+	.flags = ACL_TYPE_DEFAULT,
+	.get = posix_acl_get,
+	.set = posix_acl_set,
 	.list = posix_acl_default_list,
 };
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index a92c8792c0f6..7271a477c041 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -3,41 +3,43 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/security.h>
 #include <asm/uaccess.h>
 
 static int
-security_get(struct inode *inode, const char *name, void *buffer, size_t size)
+security_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
+		int handler_flags)
 {
 	if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
 		return -EINVAL;
 
-	if (IS_PRIVATE(inode))
+	if (IS_PRIVATE(dentry->d_inode))
 		return -EPERM;
 
-	return reiserfs_xattr_get(inode, name, buffer, size);
+	return reiserfs_xattr_get(dentry->d_inode, name, buffer, size);
 }
 
 static int
-security_set(struct inode *inode, const char *name, const void *buffer,
-	     size_t size, int flags)
+security_set(struct dentry *dentry, const char *name, const void *buffer,
+	     size_t size, int flags, int handler_flags)
 {
 	if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
 		return -EINVAL;
 
-	if (IS_PRIVATE(inode))
+	if (IS_PRIVATE(dentry->d_inode))
 		return -EPERM;
 
-	return reiserfs_xattr_set(inode, name, buffer, size, flags);
+	return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags);
 }
 
-static size_t security_list(struct inode *inode, char *list, size_t list_len,
-			    const char *name, size_t namelen)
+static size_t security_list(struct dentry *dentry, char *list, size_t list_len,
+			    const char *name, size_t namelen, int handler_flags)
 {
 	const size_t len = namelen + 1;
 
-	if (IS_PRIVATE(inode))
+	if (IS_PRIVATE(dentry->d_inode))
 		return 0;
 
 	if (list && len <= list_len) {
@@ -75,7 +77,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
 		return error;
 	}
 
-	if (sec->length) {
+	if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
 		blocks = reiserfs_xattr_jcreate_nblocks(inode) +
			 reiserfs_xattr_nblocks(inode, sec->length);
 		/* We don't want to count the directories twice if we have
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index a865042f75e2..5b08aaca3daf 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -8,36 +8,37 @@
 #include <asm/uaccess.h>
 
 static int
-trusted_get(struct inode *inode, const char *name, void *buffer, size_t size)
+trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
+		int handler_flags)
 {
 	if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
+	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode))
 		return -EPERM;
 
-	return reiserfs_xattr_get(inode, name, buffer, size);
+	return reiserfs_xattr_get(dentry->d_inode, name, buffer, size);
 }
 
 static int
-trusted_set(struct inode *inode, const char *name, const void *buffer,
-	    size_t size, int flags)
+trusted_set(struct dentry *dentry, const char *name, const void *buffer,
+	    size_t size, int flags, int handler_flags)
 {
 	if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
+	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode))
 		return -EPERM;
 
-	return reiserfs_xattr_set(inode, name, buffer, size, flags);
+	return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags);
 }
 
-static size_t trusted_list(struct inode *inode, char *list, size_t list_size,
-			   const char *name, size_t name_len)
+static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
+			   const char *name, size_t name_len, int handler_flags)
 {
 	const size_t len = name_len + 1;
 
-	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
+	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode))
 		return 0;
 
 	if (list && len <= list_size) {
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index e3238dc4f3db..75d59c49b911 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -7,34 +7,35 @@
 #include <asm/uaccess.h>
 
 static int
-user_get(struct inode *inode, const char *name, void *buffer, size_t size)
+user_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
+		int handler_flags)
 {
 
 	if (strlen(name) < sizeof(XATTR_USER_PREFIX))
 		return -EINVAL;
-	if (!reiserfs_xattrs_user(inode->i_sb))
+	if (!reiserfs_xattrs_user(dentry->d_sb))
 		return -EOPNOTSUPP;
-	return reiserfs_xattr_get(inode, name, buffer, size);
+	return reiserfs_xattr_get(dentry->d_inode, name, buffer, size);
 }
 
 static int
-user_set(struct inode *inode, const char *name, const void *buffer,
-	 size_t size, int flags)
+user_set(struct dentry *dentry, const char *name, const void *buffer,
+	 size_t size, int flags, int handler_flags)
 {
 	if (strlen(name) < sizeof(XATTR_USER_PREFIX))
 		return -EINVAL;
 
-	if (!reiserfs_xattrs_user(inode->i_sb))
+	if (!reiserfs_xattrs_user(dentry->d_sb))
 		return -EOPNOTSUPP;
-	return reiserfs_xattr_set(inode, name, buffer, size, flags);
+	return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags);
 }
 
-static size_t user_list(struct inode *inode, char *list, size_t list_size,
-			const char *name, size_t name_len)
+static size_t user_list(struct dentry *dentry, char *list, size_t list_size,
+			const char *name, size_t name_len, int handler_flags)
 {
 	const size_t len = name_len + 1;
 
-	if (!reiserfs_xattrs_user(inode->i_sb))
+	if (!reiserfs_xattrs_user(dentry->d_sb))
 		return 0;
 	if (list && len <= list_size) {
 		memcpy(list, name, name_len);
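
The reiserfs handler families above (POSIX ACL access/default, security, trusted, user) are all converted to the same new struct xattr_handler calling convention: get/set/list receive the dentry rather than a bare inode, plus the handler's own flags word as a trailing argument. A sketch of a handler written against the new signatures; the "example" names are hypothetical and only the shapes are taken from the diff:

	static int example_get(struct dentry *dentry, const char *name,
			       void *buffer, size_t size, int handler_flags)
	{
		/* handler_flags is the .flags value from the table below */
		if (IS_PRIVATE(dentry->d_inode))
			return -EPERM;
		return reiserfs_xattr_get(dentry->d_inode, name, buffer, size);
	}

	struct xattr_handler example_handler = {
		.prefix = "user.",	/* dispatched by prefix match */
		.flags  = 0,		/* handed back as handler_flags */
		.get    = example_get,
	};
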
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index c117fa80d1e9..42d213546894 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -544,6 +544,7 @@ error:
 error_rsb_inval:
 	ret = -EINVAL;
 error_rsb:
+	kfree(rsb);
 	return ret;
 }
 
diff --git a/fs/select.c b/fs/select.c
index fd38ce2e32e3..500a669f7790 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -691,6 +691,23 @@ SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
 }
 #endif /* HAVE_SET_RESTORE_SIGMASK */
 
+#ifdef __ARCH_WANT_SYS_OLD_SELECT
+struct sel_arg_struct {
+	unsigned long n;
+	fd_set __user *inp, *outp, *exp;
+	struct timeval __user *tvp;
+};
+
+SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
+{
+	struct sel_arg_struct a;
+
+	if (copy_from_user(&a, arg, sizeof(a)))
+		return -EFAULT;
+	return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
+}
+#endif
+
 struct poll_list {
 	struct poll_list *next;
 	int len;
@@ -821,7 +838,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 	struct poll_list *walk = head;
 	unsigned long todo = nfds;
 
-	if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
+	if (nfds > rlimit(RLIMIT_NOFILE))
 		return -EINVAL;
 
 	len = min_t(unsigned int, nfds, N_STACK_PPS);
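
The first hunk adds a common implementation of the legacy one-argument select() entry point: a few older 32-bit ABIs pass all five arguments packed in a single userspace struct, and the new SYSCALL_DEFINE1(old_select, ...) copies that struct in and forwards to sys_select(). Seen from userspace, the convention looks roughly like this sketch (the syscall number 82 is the historical i386 slot and is shown purely for illustration):

	#include <sys/select.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Userspace mirror of the kernel's sel_arg_struct shown above. */
	struct sel_arg_struct {
		unsigned long n;
		fd_set *inp, *outp, *exp;
		struct timeval *tvp;
	};

	static long old_select_call(int nfds, fd_set *r, fd_set *w, fd_set *e,
				    struct timeval *tv)
	{
		struct sel_arg_struct a = { nfds, r, w, e, tv };
		return syscall(82, &a);	/* one pointer, not five arguments */
	}
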
diff --git a/fs/seq_file.c b/fs/seq_file.c
index eae7d9dbf3ff..e1f437be6c3c 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -674,7 +674,6 @@ struct list_head *seq_list_start(struct list_head *head, loff_t pos)
 
 	return NULL;
 }
-
 EXPORT_SYMBOL(seq_list_start);
 
 struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
@@ -684,7 +683,6 @@ struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
 
 	return seq_list_start(head, pos - 1);
 }
-
 EXPORT_SYMBOL(seq_list_start_head);
 
 struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
@@ -695,5 +693,131 @@ struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
 	++*ppos;
 	return lh == head ? NULL : lh;
 }
-
 EXPORT_SYMBOL(seq_list_next);
+
+/**
+ * seq_hlist_start - start an iteration of a hlist
+ * @head: the head of the hlist
+ * @pos: the start position of the sequence
+ *
+ * Called at seq_file->op->start().
+ */
+struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos)
+{
+	struct hlist_node *node;
+
+	hlist_for_each(node, head)
+		if (pos-- == 0)
+			return node;
+	return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_start);
+
+/**
+ * seq_hlist_start_head - start an iteration of a hlist
+ * @head: the head of the hlist
+ * @pos: the start position of the sequence
+ *
+ * Called at seq_file->op->start(). Call this function if you want to
+ * print a header at the top of the output.
+ */
+struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos)
+{
+	if (!pos)
+		return SEQ_START_TOKEN;
+
+	return seq_hlist_start(head, pos - 1);
+}
+EXPORT_SYMBOL(seq_hlist_start_head);
+
+/**
+ * seq_hlist_next - move to the next position of the hlist
+ * @v: the current iterator
+ * @head: the head of the hlist
+ * @ppos: the current position
+ *
+ * Called at seq_file->op->next().
+ */
+struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head,
+				  loff_t *ppos)
+{
+	struct hlist_node *node = v;
+
+	++*ppos;
+	if (v == SEQ_START_TOKEN)
+		return head->first;
+	else
+		return node->next;
+}
+EXPORT_SYMBOL(seq_hlist_next);
+
+/**
+ * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU
+ * @head: the head of the hlist
+ * @pos: the start position of the sequence
+ *
+ * Called at seq_file->op->start().
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head,
+				       loff_t pos)
+{
+	struct hlist_node *node;
+
+	__hlist_for_each_rcu(node, head)
+		if (pos-- == 0)
+			return node;
+	return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_start_rcu);
+
+/**
+ * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU
+ * @head: the head of the hlist
+ * @pos: the start position of the sequence
+ *
+ * Called at seq_file->op->start(). Call this function if you want to
+ * print a header at the top of the output.
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head,
+					    loff_t pos)
+{
+	if (!pos)
+		return SEQ_START_TOKEN;
+
+	return seq_hlist_start_rcu(head, pos - 1);
+}
+EXPORT_SYMBOL(seq_hlist_start_head_rcu);
+
+/**
+ * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU
+ * @v: the current iterator
+ * @head: the head of the hlist
+ * @ppos: the current position
+ *
+ * Called at seq_file->op->next().
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+struct hlist_node *seq_hlist_next_rcu(void *v,
+				      struct hlist_head *head,
+				      loff_t *ppos)
+{
+	struct hlist_node *node = v;
+
+	++*ppos;
+	if (v == SEQ_START_TOKEN)
+		return rcu_dereference(head->first);
+	else
+		return rcu_dereference(node->next);
+}
+EXPORT_SYMBOL(seq_hlist_next_rcu);
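
The new seq_hlist_* helpers mirror the existing seq_list_* ones for hash-list heads, including SEQ_START_TOKEN handling for printing a header row. A sketch of seq_operations built on them; example_lock and example_list are hypothetical stand-ins for a real subsystem's data:

	static void *example_start(struct seq_file *m, loff_t *pos)
	{
		spin_lock(&example_lock);
		/* returns SEQ_START_TOKEN at *pos == 0 so that show()
		 * can emit a header before the first entry */
		return seq_hlist_start_head(&example_list, *pos);
	}

	static void *example_next(struct seq_file *m, void *v, loff_t *pos)
	{
		return seq_hlist_next(v, &example_list, pos);
	}

	static void example_stop(struct seq_file *m, void *v)
	{
		spin_unlock(&example_lock);
	}

	static int example_show(struct seq_file *m, void *v)
	{
		if (v == SEQ_START_TOKEN) {
			seq_puts(m, "node\n");
			return 0;
		}
		seq_printf(m, "%p\n", v);	/* or container_of(v, ...) */
		return 0;
	}

For an RCU-managed list the same shape works with the _rcu variants, taking rcu_read_lock()/rcu_read_unlock() in start/stop instead of the spinlock.
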
diff --git a/fs/signalfd.c b/fs/signalfd.c
index b07565c94386..f329849ce3c0 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/list.h>
@@ -236,7 +237,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
		 * anon_inode_getfd() will install the fd.
		 */
 		ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx,
-				       flags & (O_CLOEXEC | O_NONBLOCK));
+				       O_RDWR | (flags & (O_CLOEXEC | O_NONBLOCK)));
 		if (ufd < 0)
 			kfree(ctx);
 	} else {
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index 92d5e8ffb639..dbf6548bbf06 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -13,7 +13,6 @@
 #include <linux/fcntl.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
 #include <linux/net.h>
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 6bd9b691a463..0e39a924f10a 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -12,7 +12,6 @@
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/file.h>
 #include <linux/dcache.h>
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
index 00b2909bd469..54350b59046b 100644
--- a/fs/smbfs/symlink.c
+++ b/fs/smbfs/symlink.c
@@ -15,6 +15,7 @@
 #include <linux/pagemap.h>
 #include <linux/net.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
diff --git a/fs/splice.c b/fs/splice.c
index 7394e9e17534..9313b6124a2e 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -30,6 +30,7 @@
 #include <linux/syscalls.h>
 #include <linux/uio.h>
 #include <linux/security.h>
+#include <linux/gfp.h>
 
 /*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
@@ -648,9 +649,11 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
 	ret = buf->ops->confirm(pipe, buf);
 	if (!ret) {
 		more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
-
+		if (file->f_op && file->f_op->sendpage)
 		ret = file->f_op->sendpage(file, buf->page, buf->offset,
					   sd->len, &pos, more);
+		else
+			ret = -EINVAL;
 	}
 
 	return ret;
@@ -1068,8 +1071,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 	if (unlikely(ret < 0))
 		return ret;
 
-	splice_write = out->f_op->splice_write;
-	if (!splice_write)
+	if (out->f_op && out->f_op->splice_write)
+		splice_write = out->f_op->splice_write;
+	else
 		splice_write = default_file_splice_write;
 
 	return splice_write(pipe, out, ppos, len, flags);
@@ -1093,8 +1097,9 @@ static long do_splice_to(struct file *in, loff_t *ppos,
 	if (unlikely(ret < 0))
 		return ret;
 
-	splice_read = in->f_op->splice_read;
-	if (!splice_read)
+	if (in->f_op && in->f_op->splice_read)
+		splice_read = in->f_op->splice_read;
+	else
 		splice_read = default_file_splice_read;
 
 	return splice_read(in, ppos, pipe, len, flags);
@@ -1316,7 +1321,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		if (off_in)
 			return -ESPIPE;
 		if (off_out) {
-			if (out->f_op->llseek == no_llseek)
+			if (!out->f_op || !out->f_op->llseek ||
+			    out->f_op->llseek == no_llseek)
 				return -EINVAL;
 			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
@@ -1336,7 +1342,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		if (off_out)
 			return -ESPIPE;
 		if (off_in) {
-			if (in->f_op->llseek == no_llseek)
+			if (!in->f_op || !in->f_op->llseek ||
+			    in->f_op->llseek == no_llseek)
 				return -EINVAL;
 			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 70e3244fa30f..df8a19ef870d 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
-squashfs-y += namei.o super.o symlink.o
+squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 2a7960310349..1cb0d81b164b 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -29,15 +29,14 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
-#include <linux/mutex.h>
 #include <linux/string.h>
 #include <linux/buffer_head.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "decompressor.h"
 
 /*
  * Read the metadata block length, this is stored in the first two
@@ -153,72 +152,10 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 	}
 
 	if (compressed) {
-		int zlib_err = 0, zlib_init = 0;
-
-		/*
-		 * Uncompress block.
-		 */
-
-		mutex_lock(&msblk->read_data_mutex);
-
-		msblk->stream.avail_out = 0;
-		msblk->stream.avail_in = 0;
-
-		bytes = length;
-		do {
-			if (msblk->stream.avail_in == 0 && k < b) {
-				avail = min(bytes, msblk->devblksize - offset);
-				bytes -= avail;
-				wait_on_buffer(bh[k]);
-				if (!buffer_uptodate(bh[k]))
-					goto release_mutex;
-
-				if (avail == 0) {
-					offset = 0;
-					put_bh(bh[k++]);
-					continue;
-				}
-
-				msblk->stream.next_in = bh[k]->b_data + offset;
-				msblk->stream.avail_in = avail;
-				offset = 0;
-			}
-
-			if (msblk->stream.avail_out == 0 && page < pages) {
-				msblk->stream.next_out = buffer[page++];
-				msblk->stream.avail_out = PAGE_CACHE_SIZE;
-			}
-
-			if (!zlib_init) {
-				zlib_err = zlib_inflateInit(&msblk->stream);
-				if (zlib_err != Z_OK) {
-					ERROR("zlib_inflateInit returned"
-						" unexpected result 0x%x,"
-						" srclength %d\n", zlib_err,
-						srclength);
-					goto release_mutex;
-				}
-				zlib_init = 1;
-			}
-
-			zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH);
-
-			if (msblk->stream.avail_in == 0 && k < b)
-				put_bh(bh[k++]);
-		} while (zlib_err == Z_OK);
-
-		if (zlib_err != Z_STREAM_END) {
-			ERROR("zlib_inflate error, data probably corrupt\n");
-			goto release_mutex;
-		}
-
-		zlib_err = zlib_inflateEnd(&msblk->stream);
-		if (zlib_err != Z_OK) {
-			ERROR("zlib_inflate error, data probably corrupt\n");
-			goto release_mutex;
-		}
-		length = msblk->stream.total_out;
-		mutex_unlock(&msblk->read_data_mutex);
+		length = squashfs_decompress(msblk, buffer, bh, b, offset,
+			length, srclength, pages);
+		if (length < 0)
+			goto read_failure;
 	} else {
 		/*
 		 * Block is uncompressed.
@@ -255,9 +192,6 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 	kfree(bh);
 	return length;
 
-release_mutex:
-	mutex_unlock(&msblk->read_data_mutex);
-
 block_release:
 	for (; k < b; k++)
 		put_bh(bh[k]);
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 40c98fa6b5d6..57314bee9059 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -51,7 +51,6 @@
 #include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/wait.h>
-#include <linux/zlib.h>
 #include <linux/pagemap.h>
 
 #include "squashfs_fs.h"
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
new file mode 100644
index 000000000000..157478da6ac9
--- /dev/null
+++ b/fs/squashfs/decompressor.c
@@ -0,0 +1,68 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * decompressor.c
+ */
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/buffer_head.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "decompressor.h"
+#include "squashfs.h"
+
+/*
+ * This file (and decompressor.h) implements a decompressor framework for
+ * Squashfs, allowing multiple decompressors to be easily supported
+ */
+
+static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
+	NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
+};
+
+static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = {
+	NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
+};
+
+static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
+	NULL, NULL, NULL, 0, "unknown", 0
+};
+
+static const struct squashfs_decompressor *decompressor[] = {
+	&squashfs_zlib_comp_ops,
+	&squashfs_lzma_unsupported_comp_ops,
+	&squashfs_lzo_unsupported_comp_ops,
+	&squashfs_unknown_comp_ops
+};
+
+
+const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
+{
+	int i;
+
+	for (i = 0; decompressor[i]->id; i++)
+		if (id == decompressor[i]->id)
+			break;
+
+	return decompressor[i];
+}
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
new file mode 100644
index 000000000000..7425f80783f6
--- /dev/null
+++ b/fs/squashfs/decompressor.h
@@ -0,0 +1,55 @@
+#ifndef DECOMPRESSOR_H
+#define DECOMPRESSOR_H
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * decompressor.h
+ */
+
+struct squashfs_decompressor {
+	void	*(*init)(struct squashfs_sb_info *);
+	void	(*free)(void *);
+	int	(*decompress)(struct squashfs_sb_info *, void **,
+		struct buffer_head **, int, int, int, int, int);
+	int	id;
+	char	*name;
+	int	supported;
+};
+
+static inline void *squashfs_decompressor_init(struct squashfs_sb_info *msblk)
+{
+	return msblk->decompressor->init(msblk);
+}
+
+static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk,
+	void *s)
+{
+	if (msblk->decompressor)
+		msblk->decompressor->free(s);
+}
+
+static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
+	void **buffer, struct buffer_head **bh, int b, int offset, int length,
+	int srclength, int pages)
+{
+	return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
+		length, srclength, pages);
+}
+#endif
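
With this header in place, a concrete decompressor is just an ops table matched by the id stored in the superblock; unsupported formats register a stub with .supported = 0 so mount can print a useful error. A hedged sketch of a real entry (the zlib implementation itself lives in the new zlib_wrapper.c added to the Makefile; the function bodies here are illustrative stand-ins, not the kernel's code):

	static void *example_init(struct squashfs_sb_info *msblk)
	{
		/* allocate per-mount decompressor state (e.g. a z_stream) */
		return kmalloc(sizeof(int), GFP_KERNEL);
	}

	static void example_free(void *strm)
	{
		kfree(strm);
	}

	static int example_uncompress(struct squashfs_sb_info *msblk,
		void **buffer, struct buffer_head **bh, int b, int offset,
		int length, int srclength, int pages)
	{
		/* fill the buffer[] pages from the compressed data in bh[]
		 * and return the number of bytes produced, or a negative
		 * value on corruption */
		return -EIO;
	}

	const struct squashfs_decompressor example_comp_ops = {
		.init = example_init,
		.free = example_free,
		.decompress = example_uncompress,
		.id = ZLIB_COMPRESSION,
		.name = "example",
		.supported = 1
	};
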
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 566b0eaed868..12b933ac6585 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -30,7 +30,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 2b1b8fe5e037..7f93d5a9ee05 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -39,7 +39,6 @@
 #include <linux/vfs.h>
 #include <linux/dcache.h>
 #include <linux/exportfs.h>
-#include <linux/zlib.h>
 #include <linux/slab.h>
 
 #include "squashfs_fs.h"
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 717767d831df..a25c5060bdcb 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -47,7 +47,6 @@
 #include <linux/string.h>
 #include <linux/pagemap.h>
 #include <linux/mutex.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index b5a2c15bbbc7..7c90bbd6879d 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -36,7 +36,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index 3795b837ba28..b7f64bcd2b70 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -34,7 +34,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 9101dbde39ec..49daaf669e41 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,7 +40,6 @@
 
 #include <linux/fs.h>
 #include <linux/vfs.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 9e398653b22b..5266bd8ad932 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,7 +57,6 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/dcache.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 0e9feb6adf7e..fe2587af5512 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -51,6 +51,9 @@ extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *,
 		u64, int);
 extern int squashfs_read_table(struct super_block *, void *, u64, int);
 
+/* decompressor.c */
+extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
+
 /* export.c */
 extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
 		unsigned int);
@@ -71,7 +74,7 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
 extern int squashfs_read_inode(struct inode *, long long);
 
 /*
- * Inodes and files operations
+ * Inodes, files and decompressor operations
  */
 
 /* dir.c */
@@ -88,3 +91,6 @@ extern const struct inode_operations squashfs_dir_inode_ops;
 
 /* symlink.c */
 extern const struct address_space_operations squashfs_symlink_aops;
+
+/* zlib_wrapper.c */
+extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 283daafc568e..79024245ea00 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -183,8 +183,6 @@
 #define SQUASHFS_MAX_FILE_SIZE		(1LL << \
 					(SQUASHFS_MAX_FILE_SIZE_LOG - 2))
 
-#define SQUASHFS_MARKER_BYTE		0xff
-
 /* meta index cache */
 #define SQUASHFS_META_INDEXES	(SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
 #define SQUASHFS_META_ENTRIES	127
@@ -211,7 +209,9 @@ struct meta_index {
 /*
  * definitions for structures on disk
  */
 #define ZLIB_COMPRESSION	1
+#define LZMA_COMPRESSION	2
+#define LZO_COMPRESSION		3
 
 struct squashfs_super_block {
 	__le32 s_magic;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index c8c65614dd1c..2e77dc547e25 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -52,25 +52,25 @@ struct squashfs_cache_entry {
 };
 
 struct squashfs_sb_info {
+	const struct squashfs_decompressor	*decompressor;
 	int			devblksize;
 	int			devblksize_log2;
 	struct squashfs_cache	*block_cache;
 	struct squashfs_cache	*fragment_cache;
 	struct squashfs_cache	*read_page;
 	int			next_meta_index;
 	__le64			*id_table;
 	__le64			*fragment_index;
-	unsigned int		*fragment_index_2;
 	struct mutex		read_data_mutex;
 	struct mutex		meta_index_mutex;
 	struct meta_index	*meta_index;
-	z_stream		stream;
+	void			*stream;
 	__le64			*inode_lookup_table;
 	u64			inode_table;
 	u64			directory_table;
 	unsigned int		block_size;
 	unsigned short		block_log;
 	long long		bytes_used;
 	unsigned int		inodes;
 };
 #endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 6c197ef53add..3550aec2f655 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -35,34 +35,41 @@
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/zlib.h>
 #include <linux/magic.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "decompressor.h"
 
 static struct file_system_type squashfs_fs_type;
 static const struct super_operations squashfs_super_ops;
 
-static int supported_squashfs_filesystem(short major, short minor, short comp)
+static const struct squashfs_decompressor *supported_squashfs_filesystem(short
+	major, short minor, short id)
 {
+	const struct squashfs_decompressor *decompressor;
+
 	if (major < SQUASHFS_MAJOR) {
 		ERROR("Major/Minor mismatch, older Squashfs %d.%d "
			"filesystems are unsupported\n", major, minor);
-		return -EINVAL;
+		return NULL;
 	} else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) {
 		ERROR("Major/Minor mismatch, trying to mount newer "
			"%d.%d filesystem\n", major, minor);
 		ERROR("Please update your kernel\n");
-		return -EINVAL;
+		return NULL;
 	}
 
-	if (comp != ZLIB_COMPRESSION)
-		return -EINVAL;
+	decompressor = squashfs_lookup_decompressor(id);
+	if (!decompressor->supported) {
+		ERROR("Filesystem uses \"%s\" compression. This is not "
+			"supported\n", decompressor->name);
+		return NULL;
+	}
 
-	return 0;
+	return decompressor;
 }
 
 
@@ -87,13 +94,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	msblk = sb->s_fs_info;
 
-	msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(),
-		GFP_KERNEL);
-	if (msblk->stream.workspace == NULL) {
-		ERROR("Failed to allocate zlib workspace\n");
-		goto failure;
-	}
-
 	sblk = kzalloc(sizeof(*sblk), GFP_KERNEL);
 	if (sblk == NULL) {
 		ERROR("Failed to allocate squashfs_super_block\n");
@@ -120,25 +120,25 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
+	err = -EINVAL;
+
 	/* Check it is a SQUASHFS superblock */
 	sb->s_magic = le32_to_cpu(sblk->s_magic);
 	if (sb->s_magic != SQUASHFS_MAGIC) {
 		if (!silent)
			ERROR("Can't find a SQUASHFS superblock on %s\n",
						bdevname(sb->s_bdev, b));
-		err = -EINVAL;
 		goto failed_mount;
 	}
 
-	/* Check the MAJOR & MINOR versions and compression type */
-	err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major),
+	/* Check the MAJOR & MINOR versions and lookup compression type */
+	msblk->decompressor = supported_squashfs_filesystem(
+			le16_to_cpu(sblk->s_major),
			le16_to_cpu(sblk->s_minor),
			le16_to_cpu(sblk->compression));
-	if (err < 0)
+	if (msblk->decompressor == NULL)
 		goto failed_mount;
 
-	err = -EINVAL;
-
 	/*
	 * Check if there's xattrs in the filesystem.  These are not
	 * supported in this version, so warn that they will be ignored.
@@ -205,6 +205,10 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	err = -ENOMEM;
 
+	msblk->stream = squashfs_decompressor_init(msblk);
+	if (msblk->stream == NULL)
+		goto failed_mount;
+
 	msblk->block_cache = squashfs_cache_init("metadata",
			SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
 	if (msblk->block_cache == NULL)
@@ -292,17 +296,16 @@ failed_mount:
 	squashfs_cache_delete(msblk->block_cache);
 	squashfs_cache_delete(msblk->fragment_cache);
 	squashfs_cache_delete(msblk->read_page);
+	squashfs_decompressor_free(msblk, msblk->stream);
 	kfree(msblk->inode_lookup_table);
 	kfree(msblk->fragment_index);
 	kfree(msblk->id_table);
-	kfree(msblk->stream.workspace);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 	kfree(sblk);
 	return err;
 
 failure:
-	kfree(msblk->stream.workspace);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 	return -ENOMEM;
@@ -346,10 +349,10 @@ static void squashfs_put_super(struct super_block *sb)
 		squashfs_cache_delete(sbi->block_cache);
 		squashfs_cache_delete(sbi->fragment_cache);
 		squashfs_cache_delete(sbi->read_page);
+		squashfs_decompressor_free(sbi, sbi->stream);
 		kfree(sbi->id_table);
 		kfree(sbi->fragment_index);
 		kfree(sbi->meta_index);
-		kfree(sbi->stream.workspace);
 		kfree(sb->s_fs_info);
 		sb->s_fs_info = NULL;
 	}
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 83d87880aac8..32b911f4ee39 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -33,10 +33,8 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/pagemap.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
new file mode 100644
index 000000000000..15a03d0fb9f3
--- /dev/null
+++ b/fs/squashfs/zlib_wrapper.c
@@ -0,0 +1,151 @@
/*
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
 * Phillip Lougher <phillip@lougher.demon.co.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2,
 * or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * zlib_wrapper.c
 */


#include <linux/mutex.h>
#include <linux/buffer_head.h>
#include <linux/slab.h>
#include <linux/zlib.h>

#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
#include "squashfs_fs_i.h"
#include "squashfs.h"
#include "decompressor.h"

static void *zlib_init(struct squashfs_sb_info *dummy)
{
	z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
	if (stream == NULL)
		goto failed;
	stream->workspace = kmalloc(zlib_inflate_workspacesize(),
		GFP_KERNEL);
	if (stream->workspace == NULL)
		goto failed;

	return stream;

failed:
	ERROR("Failed to allocate zlib workspace\n");
	kfree(stream);
	return NULL;
}


static void zlib_free(void *strm)
{
	z_stream *stream = strm;

	if (stream)
		kfree(stream->workspace);
	kfree(stream);
}


static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
	struct buffer_head **bh, int b, int offset, int length, int srclength,
	int pages)
{
	int zlib_err = 0, zlib_init = 0;
	int avail, bytes, k = 0, page = 0;
	z_stream *stream = msblk->stream;

	mutex_lock(&msblk->read_data_mutex);

	stream->avail_out = 0;
	stream->avail_in = 0;

	bytes = length;
	do {
		if (stream->avail_in == 0 && k < b) {
			avail = min(bytes, msblk->devblksize - offset);
			bytes -= avail;
			wait_on_buffer(bh[k]);
			if (!buffer_uptodate(bh[k]))
				goto release_mutex;

			if (avail == 0) {
				offset = 0;
				put_bh(bh[k++]);
				continue;
			}

			stream->next_in = bh[k]->b_data + offset;
			stream->avail_in = avail;
			offset = 0;
		}

		if (stream->avail_out == 0 && page < pages) {
			stream->next_out = buffer[page++];
			stream->avail_out = PAGE_CACHE_SIZE;
		}

		if (!zlib_init) {
			zlib_err = zlib_inflateInit(stream);
			if (zlib_err != Z_OK) {
				ERROR("zlib_inflateInit returned unexpected "
					"result 0x%x, srclength %d\n",
					zlib_err, srclength);
				goto release_mutex;
			}
			zlib_init = 1;
		}

		zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH);

		if (stream->avail_in == 0 && k < b)
			put_bh(bh[k++]);
	} while (zlib_err == Z_OK);

	if (zlib_err != Z_STREAM_END) {
		ERROR("zlib_inflate error, data probably corrupt\n");
		goto release_mutex;
	}

	zlib_err = zlib_inflateEnd(stream);
	if (zlib_err != Z_OK) {
		ERROR("zlib_inflate error, data probably corrupt\n");
		goto release_mutex;
	}

	mutex_unlock(&msblk->read_data_mutex);
	return stream->total_out;

release_mutex:
	mutex_unlock(&msblk->read_data_mutex);

	for (; k < b; k++)
		put_bh(bh[k]);

	return -EIO;
}

const struct squashfs_decompressor squashfs_zlib_comp_ops = {
	.init = zlib_init,
	.free = zlib_free,
	.decompress = zlib_uncompress,
	.id = ZLIB_COMPRESSION,
	.name = "zlib",
	.supported = 1
};

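zlib_uncompress() is the standard incremental inflate pattern: input arrives in device-block-sized chunks, output is handed out one page at a time, and zlib_inflate() is called until Z_STREAM_END. (Note the local zlib_init flag shadows the zlib_init() function above; harmless, but easy to misread.) A hedged userspace sketch of the same pattern against flat buffers, using ordinary zlib rather than the kernel wrappers:

#include <string.h>
#include <zlib.h>

/* Inflate src[0..srclen) into dst[0..dstlen); returns bytes produced
 * or -1 on error.  Single-shot variant of the loop above. */
static long inflate_buffer(const unsigned char *src, size_t srclen,
			   unsigned char *dst, size_t dstlen)
{
	z_stream s;
	int err;

	memset(&s, 0, sizeof(s));
	if (inflateInit(&s) != Z_OK)
		return -1;

	s.next_in = (unsigned char *)src;
	s.avail_in = srclen;
	s.next_out = dst;
	s.avail_out = dstlen;

	do {
		err = inflate(&s, Z_SYNC_FLUSH);
	} while (err == Z_OK && s.avail_out > 0);

	inflateEnd(&s);
	return err == Z_STREAM_END ? (long)s.total_out : -1;
}
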
diff --git a/fs/stack.c b/fs/stack.c
index 67716f6a1a4a..4a6f7f440658 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -7,18 +7,63 @@
  * This function cannot be inlined since i_size_{read,write} is rather
  * heavy-weight on 32-bit systems
  */
-void fsstack_copy_inode_size(struct inode *dst, const struct inode *src)
+void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
 {
-	i_size_write(dst, i_size_read((struct inode *)src));
-	dst->i_blocks = src->i_blocks;
+	loff_t i_size;
+	blkcnt_t i_blocks;
+
+	/*
+	 * i_size_read() includes its own seqlocking and protection from
+	 * preemption (see include/linux/fs.h): we need nothing extra for
+	 * that here, and prefer to avoid nesting locks than attempt to keep
+	 * i_size and i_blocks in sync together.
+	 */
+	i_size = i_size_read(src);
+
+	/*
+	 * But if CONFIG_LBDAF (on 32-bit), we ought to make an effort to
+	 * keep the two halves of i_blocks in sync despite SMP or PREEMPT -
+	 * though stat's generic_fillattr() doesn't bother, and we won't be
+	 * applying quotas (where i_blocks does become important) at the
+	 * upper level.
+	 *
+	 * We don't actually know what locking is used at the lower level;
+	 * but if it's a filesystem that supports quotas, it will be using
+	 * i_lock as in inode_add_bytes().  tmpfs uses other locking, and
+	 * its 32-bit is (just) able to exceed 2TB i_size with the aid of
+	 * holes; but its i_blocks cannot carry into the upper long without
+	 * almost 2TB swap - let's ignore that case.
+	 */
+	if (sizeof(i_blocks) > sizeof(long))
+		spin_lock(&src->i_lock);
+	i_blocks = src->i_blocks;
+	if (sizeof(i_blocks) > sizeof(long))
+		spin_unlock(&src->i_lock);
+
+	/*
+	 * If CONFIG_SMP or CONFIG_PREEMPT on 32-bit, it's vital for
+	 * fsstack_copy_inode_size() to hold some lock around
+	 * i_size_write(), otherwise i_size_read() may spin forever (see
+	 * include/linux/fs.h).  We don't necessarily hold i_mutex when this
+	 * is called, so take i_lock for that case.
+	 *
+	 * And if CONFIG_LBDAF (on 32-bit), continue our effort to keep the
+	 * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock
+	 * for that case too, and do both at once by combining the tests.
+	 *
+	 * There is none of this locking overhead in the 64-bit case.
+	 */
+	if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long))
+		spin_lock(&dst->i_lock);
+	i_size_write(dst, i_size);
+	dst->i_blocks = i_blocks;
+	if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long))
+		spin_unlock(&dst->i_lock);
 }
 EXPORT_SYMBOL_GPL(fsstack_copy_inode_size);
 
-/* copy all attributes; get_nlinks is optional way to override the i_nlink
- * copying
- */
-void fsstack_copy_attr_all(struct inode *dest, const struct inode *src,
-			   int (*get_nlinks)(struct inode *))
+/* copy all attributes */
+void fsstack_copy_attr_all(struct inode *dest, const struct inode *src)
 {
 	dest->i_mode = src->i_mode;
 	dest->i_uid = src->i_uid;
@@ -29,14 +74,6 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src,
 	dest->i_ctime = src->i_ctime;
 	dest->i_blkbits = src->i_blkbits;
 	dest->i_flags = src->i_flags;
-
-	/*
-	 * Update the nlinks AFTER updating the above fields, because the
-	 * get_links callback may depend on them.
-	 */
-	if (!get_nlinks)
-		dest->i_nlink = src->i_nlink;
-	else
-		dest->i_nlink = (*get_nlinks)(dest);
+	dest->i_nlink = src->i_nlink;
 }
 EXPORT_SYMBOL_GPL(fsstack_copy_attr_all);
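
The sizeof() guards above are compile-time constants, so the locking only exists where it is needed. A standalone sketch (not from the patch) of why the guards cost nothing on 64-bit:

#include <stdio.h>

int main(void)
{
	long word;
	long long counter;	/* stands in for a 64-bit blkcnt_t/loff_t */

	/* On LP64, sizeof(long long) == sizeof(long), so the kernel's
	 * "if (sizeof(i_blocks) > sizeof(long))" folds to if (0) and the
	 * spin_lock calls are compiled out; on ILP32 this prints 1 and
	 * the locking remains. */
	printf("%d\n", sizeof(counter) > sizeof(word));
	return 0;
}
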
diff --git a/fs/stat.c b/fs/stat.c
index 075694e31d8b..c4ecd52c5737 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -401,9 +401,9 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename,
 }
 #endif /* __ARCH_WANT_STAT64 */
 
-void inode_add_bytes(struct inode *inode, loff_t bytes)
+/* The caller is responsible for sufficient locking here (i.e. inode->i_lock) */
+void __inode_add_bytes(struct inode *inode, loff_t bytes)
 {
-	spin_lock(&inode->i_lock);
 	inode->i_blocks += bytes >> 9;
 	bytes &= 511;
 	inode->i_bytes += bytes;
@@ -411,6 +411,12 @@ void inode_add_bytes(struct inode *inode, loff_t bytes)
 		inode->i_blocks++;
 		inode->i_bytes -= 512;
 	}
+}
+
+void inode_add_bytes(struct inode *inode, loff_t bytes)
+{
+	spin_lock(&inode->i_lock);
+	__inode_add_bytes(inode, bytes);
 	spin_unlock(&inode->i_lock);
 }
 
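
Splitting out __inode_add_bytes() lets a caller that already holds i_lock fold the byte accounting into its own critical section instead of taking the lock twice. A hedged sketch of such a caller (the helper name is hypothetical):

static void charge_blocks_locked(struct inode *inode, loff_t bytes)
{
	spin_lock(&inode->i_lock);
	/* ... other i_lock-protected bookkeeping ... */
	__inode_add_bytes(inode, bytes);	/* no nested spin_lock */
	spin_unlock(&inode->i_lock);
}
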
diff --git a/fs/super.c b/fs/super.c
index 19eb70b374bc..f35ac6022109 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -568,7 +568,7 @@ out:
 int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 {
 	int retval;
-	int remount_rw;
+	int remount_rw, remount_ro;
 
 	if (sb->s_frozen != SB_UNFROZEN)
 		return -EBUSY;
@@ -583,9 +583,12 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	shrink_dcache_sb(sb);
 	sync_filesystem(sb);
 
+	remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
+	remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
+
 	/* If we are remounting RDONLY and current sb is read/write,
	   make sure there are no rw files opened */
-	if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
+	if (remount_ro) {
 		if (force)
			mark_files_ro(sb);
 		else if (!fs_may_remount_ro(sb))
@@ -594,7 +597,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 		if (retval < 0 && retval != -ENOSYS)
			return -EBUSY;
 	}
-	remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
 
 	if (sb->s_op->remount_fs) {
 		retval = sb->s_op->remount_fs(sb, &flags, data);
@@ -604,6 +606,16 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
 	if (remount_rw)
 		vfs_dq_quota_on_remount(sb);
+	/*
+	 * Some filesystems modify their metadata via some other path than the
+	 * bdev buffer cache (eg. use a private mapping, or directories in
+	 * pagecache, etc).  Also file data modifications go via their own
+	 * mappings.  So if we remount read-only and then read the filesystem
+	 * from the bdev, we could get stale data, so invalidate it to give a
+	 * best effort at coherency.
+	 */
+	if (remount_ro && sb->s_bdev)
+		invalidate_bdev(sb->s_bdev);
 	return 0;
 }
 
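remount_ro is true only on a read/write to read-only transition, which is exactly when the bdev page cache may still hold data newer than what a raw reader of the device would otherwise see. A hedged userspace sketch of the operation that now triggers the invalidation:

#include <sys/mount.h>

/* Remount an existing mount read-only; source and fstype are ignored
 * for MS_REMOUNT.  With the change above, the kernel also invalidates
 * the backing device's cached pages for best-effort coherency. */
int remount_readonly(const char *target)
{
	return mount(NULL, target, NULL, MS_REMOUNT | MS_RDONLY, NULL);
}
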
@@ -901,8 +913,9 @@ int get_sb_single(struct file_system_type *fs_type,
 			return error;
 		}
 		s->s_flags |= MS_ACTIVE;
+	} else {
+		do_remount_sb(s, flags, data, 0);
 	}
-	do_remount_sb(s, flags, data, 0);
 	simple_set_mnt(mnt, s);
 	return 0;
 }
@@ -924,6 +937,9 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 	if (!mnt)
 		goto out;
 
+	if (flags & MS_KERNMOUNT)
+		mnt->mnt_flags = MNT_INTERNAL;
+
 	if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
 		secdata = alloc_secdata();
 		if (!secdata)
diff --git a/fs/sync.c b/fs/sync.c
index d104591b066b..fc5c3d75cf3c 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -5,6 +5,7 @@
 #include <linux/kernel.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/writeback.h>
@@ -34,14 +35,14 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 	if (!sb->s_bdi)
 		return 0;
 
-	/* Avoid doing twice syncing and cache pruning for quota sync */
-	if (!wait) {
-		writeout_quota_sb(sb, -1);
+	if (sb->s_qcop && sb->s_qcop->quota_sync)
+		sb->s_qcop->quota_sync(sb, -1, wait);
+
+	if (wait)
+		sync_inodes_sb(sb);
+	else
 		writeback_inodes_sb(sb);
-	} else {
-		sync_quota_sb(sb, -1);
-		sync_inodes_sb(sb);
-	}
+
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
 	return __sync_blockdev(sb->s_bdev, wait);
@@ -295,10 +296,11 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
  */
 int generic_write_sync(struct file *file, loff_t pos, loff_t count)
 {
-	if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host))
+	if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
 		return 0;
 	return vfs_fsync_range(file, file->f_path.dentry, pos,
-			pos + count - 1, 1);
+			pos + count - 1,
+			(file->f_flags & __O_SYNC) ? 0 : 1);
 }
 EXPORT_SYMBOL(generic_write_sync);
 
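With O_SYNC and O_DSYNC now distinct flags, generic_write_sync() requests a full fsync (datasync argument 0) only for O_SYNC files, and the cheaper fdatasync behaviour for O_DSYNC. A hedged userspace sketch of the two open modes this distinguishes:

#include <fcntl.h>

int open_journals(int *data_fd, int *full_fd)
{
	/* O_DSYNC: each write flushes data plus only the metadata needed
	 * to retrieve it later (fdatasync semantics) */
	*data_fd = open("log.dat", O_WRONLY | O_CREAT | O_DSYNC, 0644);
	/* O_SYNC: additionally flushes non-essential metadata such as
	 * timestamps (full fsync semantics) */
	*full_fd = open("meta.dat", O_WRONLY | O_CREAT | O_SYNC, 0644);
	return (*data_fd < 0 || *full_fd < 0) ? -1 : 0;
}
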
@@ -354,6 +356,7 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
 {
 	int ret;
 	struct file *file;
+	struct address_space *mapping;
 	loff_t endbyte;			/* inclusive */
 	int fput_needed;
 	umode_t i_mode;
@@ -404,7 +407,28 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
			!S_ISLNK(i_mode))
 		goto out_put;
 
-	ret = do_sync_mapping_range(file->f_mapping, offset, endbyte, flags);
+	mapping = file->f_mapping;
+	if (!mapping) {
+		ret = -EINVAL;
+		goto out_put;
+	}
+
+	ret = 0;
+	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
+		ret = filemap_fdatawait_range(mapping, offset, endbyte);
+		if (ret < 0)
+			goto out_put;
+	}
+
+	if (flags & SYNC_FILE_RANGE_WRITE) {
+		ret = filemap_fdatawrite_range(mapping, offset, endbyte);
+		if (ret < 0)
+			goto out_put;
+	}
+
+	if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
+		ret = filemap_fdatawait_range(mapping, offset, endbyte);
+
 out_put:
 	fput_light(file, fput_needed);
 out:
@@ -436,42 +460,3 @@ asmlinkage long SyS_sync_file_range2(long fd, long flags,
 }
 SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2);
 #endif
-
-/*
- * `endbyte' is inclusive
- */
-int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
-			  loff_t endbyte, unsigned int flags)
-{
-	int ret;
-
-	if (!mapping) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	ret = 0;
-	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
-		ret = wait_on_page_writeback_range(mapping,
-					offset >> PAGE_CACHE_SHIFT,
-					endbyte >> PAGE_CACHE_SHIFT);
-		if (ret < 0)
-			goto out;
-	}
-
-	if (flags & SYNC_FILE_RANGE_WRITE) {
-		ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
-						WB_SYNC_ALL);
-		if (ret < 0)
-			goto out;
-	}
-
-	if (flags & SYNC_FILE_RANGE_WAIT_AFTER) {
-		ret = wait_on_page_writeback_range(mapping,
-					offset >> PAGE_CACHE_SHIFT,
-					endbyte >> PAGE_CACHE_SHIFT);
-	}
-out:
-	return ret;
-}
-EXPORT_SYMBOL_GPL(do_sync_mapping_range);
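
Inlining do_sync_mapping_range() leaves sync_file_range() decoding the same three flag bits, now via filemap_fdatawait_range()/filemap_fdatawrite_range(), which take byte offsets directly instead of page indices. A hedged sketch of the classic write-behind use of the syscall from userspace:

#define _GNU_SOURCE
#include <fcntl.h>

/* Kick off writeback for a finished chunk, waiting first for any
 * previous writeback of the same range - the usual streaming-write
 * idiom. */
int flush_chunk(int fd, off_t offset, off_t nbytes)
{
	return sync_file_range(fd, offset, nbytes,
			       SYNC_FILE_RANGE_WAIT_BEFORE |
			       SYNC_FILE_RANGE_WRITE |
			       SYNC_FILE_RANGE_WAIT_AFTER);
}
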
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 60c702bc10ae..e9d293593e52 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -54,14 +54,14 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
 	int rc;
 
 	/* need attr_sd for attr, its parent for kobj */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	rc = -EIO;
 	if (attr->read)
 		rc = attr->read(kobj, attr, buffer, off, count);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 
 	return rc;
 }
@@ -125,14 +125,14 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
 	int rc;
 
 	/* need attr_sd for attr, its parent for kobj */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	rc = -EIO;
 	if (attr->write)
 		rc = attr->write(kobj, attr, buffer, offset, count);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 
 	return rc;
 }
@@ -184,12 +184,12 @@ static void bin_vma_open(struct vm_area_struct *vma)
 	if (!bb->vm_ops || !bb->vm_ops->open)
 		return;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return;
 
 	bb->vm_ops->open(vma);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 }
 
 static void bin_vma_close(struct vm_area_struct *vma)
@@ -201,12 +201,12 @@ static void bin_vma_close(struct vm_area_struct *vma)
 	if (!bb->vm_ops || !bb->vm_ops->close)
 		return;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return;
 
 	bb->vm_ops->close(vma);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 }
 
 static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -219,12 +219,12 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!bb->vm_ops || !bb->vm_ops->fault)
 		return VM_FAULT_SIGBUS;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return VM_FAULT_SIGBUS;
 
 	ret = bb->vm_ops->fault(vma, vmf);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 
@@ -241,12 +241,12 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!bb->vm_ops->page_mkwrite)
 		return 0;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return VM_FAULT_SIGBUS;
 
 	ret = bb->vm_ops->page_mkwrite(vma, vmf);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 
@@ -261,12 +261,12 @@ static int bin_access(struct vm_area_struct *vma, unsigned long addr,
 	if (!bb->vm_ops || !bb->vm_ops->access)
 		return -EINVAL;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -EINVAL;
 
 	ret = bb->vm_ops->access(vma, addr, buf, len, write);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 
@@ -281,12 +281,12 @@ static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
 	if (!bb->vm_ops || !bb->vm_ops->set_policy)
 		return 0;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -EINVAL;
 
 	ret = bb->vm_ops->set_policy(vma, new);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 
@@ -301,12 +301,12 @@ static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
 	if (!bb->vm_ops || !bb->vm_ops->get_policy)
 		return vma->vm_policy;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return vma->vm_policy;
 
 	pol = bb->vm_ops->get_policy(vma, addr);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return pol;
 }
 
@@ -321,12 +321,12 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
 	if (!bb->vm_ops || !bb->vm_ops->migrate)
 		return 0;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return 0;
 
 	ret = bb->vm_ops->migrate(vma, from, to, flags);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 #endif
@@ -356,7 +356,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
 
 	/* need attr_sd for attr, its parent for kobj */
 	rc = -ENODEV;
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		goto out_unlock;
 
 	rc = -EINVAL;
@@ -384,7 +384,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
 	bb->vm_ops = vma->vm_ops;
 	vma->vm_ops = &bin_vm_ops;
 out_put:
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 out_unlock:
 	mutex_unlock(&bb->mutex);
 
@@ -399,7 +399,7 @@ static int open(struct inode * inode, struct file * file)
 	int error;
 
 	/* binary file operations requires both @sd and its parent */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	error = -EACCES;
@@ -426,11 +426,11 @@ static int open(struct inode * inode, struct file * file)
 	mutex_unlock(&sysfs_bin_lock);
 
 	/* open succeeded, put active references */
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return 0;
 
  err_out:
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	kfree(bb);
 	return error;
 }
@@ -483,7 +483,8 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd)
  * @attr: attribute descriptor.
  */
 
-int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr)
+int sysfs_create_bin_file(struct kobject *kobj,
+			  const struct bin_attribute *attr)
 {
 	BUG_ON(!kobj || !kobj->sd || !attr);
 
@@ -497,7 +498,8 @@ int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr)
  * @attr: attribute descriptor.
  */
 
-void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr)
+void sysfs_remove_bin_file(struct kobject *kobj,
+			   const struct bin_attribute *attr)
 {
 	sysfs_hash_and_remove(kobj->sd, attr->attr.name);
 }
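
Every entry point in bin.c now brackets its callback with a single active reference on the attribute's own sysfs_dirent; the parent reference that sysfs_get_active_two() used to take is no longer required. A hedged sketch of the shared shape of these functions:

static int bin_op_template(struct sysfs_dirent *attr_sd)
{
	int ret;

	if (!sysfs_get_active(attr_sd))
		return -ENODEV;		/* node is going away */

	ret = 0;			/* ... invoke read/write/vm callback ... */

	sysfs_put_active(attr_sd);
	return ret;
}
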
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index e0201837d244..590717861c7a 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -25,7 +25,6 @@
 #include "sysfs.h"
 
 DEFINE_MUTEX(sysfs_mutex);
-DEFINE_MUTEX(sysfs_rename_mutex);
 DEFINE_SPINLOCK(sysfs_assoc_lock);
 
 static DEFINE_SPINLOCK(sysfs_ino_lock);
@@ -85,46 +84,6 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
 }
 
 /**
- * sysfs_get_dentry - get dentry for the given sysfs_dirent
- * @sd: sysfs_dirent of interest
- *
- * Get dentry for @sd.  Dentry is looked up if currently not
- * present.  This function descends from the root looking up
- * dentry for each step.
- *
- * LOCKING:
- * mutex_lock(sysfs_rename_mutex)
- *
- * RETURNS:
- * Pointer to found dentry on success, ERR_PTR() value on error.
- */
-struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd)
-{
-	struct dentry *dentry = dget(sysfs_sb->s_root);
-
-	while (dentry->d_fsdata != sd) {
-		struct sysfs_dirent *cur;
-		struct dentry *parent;
-
-		/* find the first ancestor which hasn't been looked up */
-		cur = sd;
-		while (cur->s_parent != dentry->d_fsdata)
-			cur = cur->s_parent;
-
-		/* look it up */
-		parent = dentry;
-		mutex_lock(&parent->d_inode->i_mutex);
-		dentry = lookup_one_noperm(cur->s_name, parent);
-		mutex_unlock(&parent->d_inode->i_mutex);
-		dput(parent);
-
-		if (IS_ERR(dentry))
-			break;
-	}
-	return dentry;
-}
-
-/**
  * sysfs_get_active - get an active reference to sysfs_dirent
  * @sd: sysfs_dirent to get an active reference to
  *
@@ -134,7 +93,7 @@ struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd)
  * RETURNS:
  * Pointer to @sd on success, NULL on failure.
  */
-static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
+struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
 {
 	if (unlikely(!sd))
 		return NULL;
@@ -147,8 +106,10 @@ static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
 		return NULL;
 
 	t = atomic_cmpxchg(&sd->s_active, v, v + 1);
-	if (likely(t == v))
+	if (likely(t == v)) {
+		rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_);
 		return sd;
+	}
 	if (t < 0)
 		return NULL;
 
@@ -163,7 +124,7 @@ static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
  * Put an active reference to @sd.  This function is noop if @sd
 * is NULL.
 */
-static void sysfs_put_active(struct sysfs_dirent *sd)
+void sysfs_put_active(struct sysfs_dirent *sd)
 {
 	struct completion *cmpl;
 	int v;
@@ -171,6 +132,7 @@ static void sysfs_put_active(struct sysfs_dirent *sd)
 	if (unlikely(!sd))
 		return;
 
+	rwsem_release(&sd->dep_map, 1, _RET_IP_);
 	v = atomic_dec_return(&sd->s_active);
 	if (likely(v != SD_DEACTIVATED_BIAS))
 		return;
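The dep_map annotations teach lockdep to model the active-reference count as a reader-writer lock: every successful get is a read acquire, the put is the matching release, and draining the count in sysfs_deactivate() below is a write acquire. A hedged, self-contained sketch of that pattern with hypothetical names:

struct tracked_ref {
	atomic_t		count;
	struct lockdep_map	dep_map;
};

static int tracked_ref_get(struct tracked_ref *r)
{
	if (!atomic_inc_not_zero(&r->count))
		return 0;				/* already draining */
	rwsem_acquire_read(&r->dep_map, 0, 1, _RET_IP_);	/* "reader" side */
	return 1;
}

static void tracked_ref_put(struct tracked_ref *r)
{
	rwsem_release(&r->dep_map, 1, _RET_IP_);
	atomic_dec(&r->count);
}
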
@@ -183,45 +145,6 @@ static void sysfs_put_active(struct sysfs_dirent *sd)
183} 145}
184 146
185/** 147/**
186 * sysfs_get_active_two - get active references to sysfs_dirent and parent
187 * @sd: sysfs_dirent of interest
188 *
189 * Get active reference to @sd and its parent. Parent's active
190 * reference is grabbed first. This function is noop if @sd is
191 * NULL.
192 *
193 * RETURNS:
194 * Pointer to @sd on success, NULL on failure.
195 */
196struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd)
197{
198 if (sd) {
199 if (sd->s_parent && unlikely(!sysfs_get_active(sd->s_parent)))
200 return NULL;
201 if (unlikely(!sysfs_get_active(sd))) {
202 sysfs_put_active(sd->s_parent);
203 return NULL;
204 }
205 }
206 return sd;
207}
208
209/**
210 * sysfs_put_active_two - put active references to sysfs_dirent and parent
211 * @sd: sysfs_dirent of interest
212 *
213 * Put active references to @sd and its parent. This function is
214 * noop if @sd is NULL.
215 */
216void sysfs_put_active_two(struct sysfs_dirent *sd)
217{
218 if (sd) {
219 sysfs_put_active(sd);
220 sysfs_put_active(sd->s_parent);
221 }
222}
223
224/**
225 * sysfs_deactivate - deactivate sysfs_dirent 148 * sysfs_deactivate - deactivate sysfs_dirent
226 * @sd: sysfs_dirent to deactivate 149 * @sd: sysfs_dirent to deactivate
227 * 150 *
@@ -233,17 +156,27 @@ static void sysfs_deactivate(struct sysfs_dirent *sd)
233 int v; 156 int v;
234 157
235 BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED)); 158 BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED));
159
160 if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
161 return;
162
236 sd->s_sibling = (void *)&wait; 163 sd->s_sibling = (void *)&wait;
237 164
165 rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
238 /* atomic_add_return() is a mb(), put_active() will always see 166 /* atomic_add_return() is a mb(), put_active() will always see
239 * the updated sd->s_sibling. 167 * the updated sd->s_sibling.
240 */ 168 */
241 v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); 169 v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active);
242 170
243 if (v != SD_DEACTIVATED_BIAS) 171 if (v != SD_DEACTIVATED_BIAS) {
172 lock_contended(&sd->dep_map, _RET_IP_);
244 wait_for_completion(&wait); 173 wait_for_completion(&wait);
174 }
245 175
246 sd->s_sibling = NULL; 176 sd->s_sibling = NULL;
177
178 lock_acquired(&sd->dep_map, _RET_IP_);
179 rwsem_release(&sd->dep_map, 1, _RET_IP_);
247} 180}
248 181
249static int sysfs_alloc_ino(ino_t *pino) 182static int sysfs_alloc_ino(ino_t *pino)
@@ -298,7 +231,61 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
298 goto repeat; 231 goto repeat;
299} 232}
300 233
301static void sysfs_d_iput(struct dentry * dentry, struct inode * inode) 234static int sysfs_dentry_delete(struct dentry *dentry)
235{
236 struct sysfs_dirent *sd = dentry->d_fsdata;
237 return !!(sd->s_flags & SYSFS_FLAG_REMOVED);
238}
239
240static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd)
241{
242 struct sysfs_dirent *sd = dentry->d_fsdata;
243 int is_dir;
244
245 mutex_lock(&sysfs_mutex);
246
247 /* The sysfs dirent has been deleted */
248 if (sd->s_flags & SYSFS_FLAG_REMOVED)
249 goto out_bad;
250
251 /* The sysfs dirent has been moved? */
252 if (dentry->d_parent->d_fsdata != sd->s_parent)
253 goto out_bad;
254
255 /* The sysfs dirent has been renamed */
256 if (strcmp(dentry->d_name.name, sd->s_name) != 0)
257 goto out_bad;
258
259 mutex_unlock(&sysfs_mutex);
260out_valid:
261 return 1;
262out_bad:
263 /* Remove the dentry from the dcache hashes.
264 * If this is a deleted dentry we use d_drop instead of d_delete
265 * so sysfs doesn't need to cope with negative dentries.
266 *
267 * If this is a dentry that has simply been renamed we
268 * use d_drop to remove it from the dcache lookup on its
269 * old parent. If this dentry persists later when a lookup
270 * is performed at its new name the dentry will be readded
271 * to the dcache hashes.
272 */
273 is_dir = (sysfs_type(sd) == SYSFS_DIR);
274 mutex_unlock(&sysfs_mutex);
275 if (is_dir) {
276 /* If we have submounts we must allow the vfs caches
277 * to lie about the state of the filesystem to prevent
278 * leaks and other nasty things.
279 */
280 if (have_submounts(dentry))
281 goto out_valid;
282 shrink_dcache_parent(dentry);
283 }
284 d_drop(dentry);
285 return 0;
286}
287
288static void sysfs_dentry_iput(struct dentry *dentry, struct inode *inode)
302{ 289{
303 struct sysfs_dirent * sd = dentry->d_fsdata; 290 struct sysfs_dirent * sd = dentry->d_fsdata;
304 291
@@ -307,7 +294,9 @@ static void sysfs_d_iput(struct dentry * dentry, struct inode * inode)
307} 294}
308 295
309static const struct dentry_operations sysfs_dentry_ops = { 296static const struct dentry_operations sysfs_dentry_ops = {
310 .d_iput = sysfs_d_iput, 297 .d_revalidate = sysfs_dentry_revalidate,
298 .d_delete = sysfs_dentry_delete,
299 .d_iput = sysfs_dentry_iput,
311}; 300};
312 301
313struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) 302struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
@@ -344,12 +333,6 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
344 return NULL; 333 return NULL;
345} 334}
346 335
347static int sysfs_ilookup_test(struct inode *inode, void *arg)
348{
349 struct sysfs_dirent *sd = arg;
350 return inode->i_ino == sd->s_ino;
351}
352
353/** 336/**
354 * sysfs_addrm_start - prepare for sysfs_dirent add/remove 337 * sysfs_addrm_start - prepare for sysfs_dirent add/remove
355 * @acxt: pointer to sysfs_addrm_cxt to be used 338 * @acxt: pointer to sysfs_addrm_cxt to be used
@@ -357,47 +340,20 @@ static int sysfs_ilookup_test(struct inode *inode, void *arg)
357 * 340 *
358 * This function is called when the caller is about to add or 341 * This function is called when the caller is about to add or
359 * remove sysfs_dirent under @parent_sd. This function acquires 342 * remove sysfs_dirent under @parent_sd. This function acquires
360 * sysfs_mutex, grabs inode for @parent_sd if available and lock 343 * sysfs_mutex. @acxt is used to keep and pass context to
361 * i_mutex of it. @acxt is used to keep and pass context to
362 * other addrm functions. 344 * other addrm functions.
363 * 345 *
364 * LOCKING: 346 * LOCKING:
365 * Kernel thread context (may sleep). sysfs_mutex is locked on 347 * Kernel thread context (may sleep). sysfs_mutex is locked on
366 * return. i_mutex of parent inode is locked on return if 348 * return.
367 * available.
368 */ 349 */
369void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, 350void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
370 struct sysfs_dirent *parent_sd) 351 struct sysfs_dirent *parent_sd)
371{ 352{
372 struct inode *inode;
373
374 memset(acxt, 0, sizeof(*acxt)); 353 memset(acxt, 0, sizeof(*acxt));
375 acxt->parent_sd = parent_sd; 354 acxt->parent_sd = parent_sd;
376 355
377 /* Lookup parent inode. inode initialization is protected by
378 * sysfs_mutex, so inode existence can be determined by
379 * looking up inode while holding sysfs_mutex.
380 */
381 mutex_lock(&sysfs_mutex); 356 mutex_lock(&sysfs_mutex);
382
383 inode = ilookup5(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test,
384 parent_sd);
385 if (inode) {
386 WARN_ON(inode->i_state & I_NEW);
387
388 /* parent inode available */
389 acxt->parent_inode = inode;
390
391 /* sysfs_mutex is below i_mutex in lock hierarchy.
392 * First, trylock i_mutex. If fails, unlock
393 * sysfs_mutex and lock them in order.
394 */
395 if (!mutex_trylock(&inode->i_mutex)) {
396 mutex_unlock(&sysfs_mutex);
397 mutex_lock(&inode->i_mutex);
398 mutex_lock(&sysfs_mutex);
399 }
400 }
401} 357}
402 358
403/** 359/**
@@ -422,18 +378,22 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
422 */ 378 */
423int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) 379int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
424{ 380{
381 struct sysfs_inode_attrs *ps_iattr;
382
425 if (sysfs_find_dirent(acxt->parent_sd, sd->s_name)) 383 if (sysfs_find_dirent(acxt->parent_sd, sd->s_name))
426 return -EEXIST; 384 return -EEXIST;
427 385
428 sd->s_parent = sysfs_get(acxt->parent_sd); 386 sd->s_parent = sysfs_get(acxt->parent_sd);
429 387
430 if (sysfs_type(sd) == SYSFS_DIR && acxt->parent_inode)
431 inc_nlink(acxt->parent_inode);
432
433 acxt->cnt++;
434
435 sysfs_link_sibling(sd); 388 sysfs_link_sibling(sd);
436 389
390 /* Update timestamps on the parent */
391 ps_iattr = acxt->parent_sd->s_iattr;
392 if (ps_iattr) {
393 struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
394 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
395 }
396
437 return 0; 397 return 0;
438} 398}
439 399
@@ -512,70 +472,22 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
512 */ 472 */
513void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) 473void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
514{ 474{
475 struct sysfs_inode_attrs *ps_iattr;
476
515 BUG_ON(sd->s_flags & SYSFS_FLAG_REMOVED); 477 BUG_ON(sd->s_flags & SYSFS_FLAG_REMOVED);
516 478
517 sysfs_unlink_sibling(sd); 479 sysfs_unlink_sibling(sd);
518 480
481 /* Update timestamps on the parent */
482 ps_iattr = acxt->parent_sd->s_iattr;
483 if (ps_iattr) {
484 struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
485 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
486 }
487
519 sd->s_flags |= SYSFS_FLAG_REMOVED; 488 sd->s_flags |= SYSFS_FLAG_REMOVED;
520 sd->s_sibling = acxt->removed; 489 sd->s_sibling = acxt->removed;
521 acxt->removed = sd; 490 acxt->removed = sd;
522
523 if (sysfs_type(sd) == SYSFS_DIR && acxt->parent_inode)
524 drop_nlink(acxt->parent_inode);
525
526 acxt->cnt++;
527}
528
529/**
530 * sysfs_drop_dentry - drop dentry for the specified sysfs_dirent
531 * @sd: target sysfs_dirent
532 *
533 * Drop dentry for @sd. @sd must have been unlinked from its
534 * parent on entry to this function such that it can't be looked
535 * up anymore.
536 */
537static void sysfs_drop_dentry(struct sysfs_dirent *sd)
538{
539 struct inode *inode;
540 struct dentry *dentry;
541
542 inode = ilookup(sysfs_sb, sd->s_ino);
543 if (!inode)
544 return;
545
546 /* Drop any existing dentries associated with sd.
547 *
548 * For the dentry to be properly freed we need to grab a
549 * reference to the dentry under the dcache lock, unhash it,
550 * and then put it. The playing with the dentry count allows
551 * dput to immediately free the dentry if it is not in use.
552 */
553repeat:
554 spin_lock(&dcache_lock);
555 list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
556 if (d_unhashed(dentry))
557 continue;
558 dget_locked(dentry);
559 spin_lock(&dentry->d_lock);
560 __d_drop(dentry);
561 spin_unlock(&dentry->d_lock);
562 spin_unlock(&dcache_lock);
563 dput(dentry);
564 goto repeat;
565 }
566 spin_unlock(&dcache_lock);
567
568 /* adjust nlink and update timestamp */
569 mutex_lock(&inode->i_mutex);
570
571 inode->i_ctime = CURRENT_TIME;
572 drop_nlink(inode);
573 if (sysfs_type(sd) == SYSFS_DIR)
574 drop_nlink(inode);
575
576 mutex_unlock(&inode->i_mutex);
577
578 iput(inode);
579} 491}
580 492
581/** 493/**
@@ -584,25 +496,15 @@ repeat:
584 * 496 *
585 * Finish up sysfs_dirent add/remove. Resources acquired by 497 * Finish up sysfs_dirent add/remove. Resources acquired by
586 * sysfs_addrm_start() are released and removed sysfs_dirents are 498 * sysfs_addrm_start() are released and removed sysfs_dirents are
587 * cleaned up. Timestamps on the parent inode are updated. 499 * cleaned up.
588 * 500 *
589 * LOCKING: 501 * LOCKING:
590 * All mutexes acquired by sysfs_addrm_start() are released. 502 * sysfs_mutex is released.
591 */ 503 */
592void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) 504void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
593{ 505{
594 /* release resources acquired by sysfs_addrm_start() */ 506 /* release resources acquired by sysfs_addrm_start() */
595 mutex_unlock(&sysfs_mutex); 507 mutex_unlock(&sysfs_mutex);
596 if (acxt->parent_inode) {
597 struct inode *inode = acxt->parent_inode;
598
599 /* if added/removed, update timestamps on the parent */
600 if (acxt->cnt)
601 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
602
603 mutex_unlock(&inode->i_mutex);
604 iput(inode);
605 }
606 508
607 /* kill removed sysfs_dirents */ 509 /* kill removed sysfs_dirents */
608 while (acxt->removed) { 510 while (acxt->removed) {
@@ -611,7 +513,6 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
611 acxt->removed = sd->s_sibling; 513 acxt->removed = sd->s_sibling;
612 sd->s_sibling = NULL; 514 sd->s_sibling = NULL;
613 515
614 sysfs_drop_dentry(sd);
615 sysfs_deactivate(sd); 516 sysfs_deactivate(sd);
616 unmap_bin_file(sd); 517 unmap_bin_file(sd);
617 sysfs_put(sd); 518 sysfs_put(sd);
@@ -744,17 +645,22 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
744 } 645 }
745 646
746 /* attach dentry and inode */ 647 /* attach dentry and inode */
747 inode = sysfs_get_inode(sd); 648 inode = sysfs_get_inode(dir->i_sb, sd);
748 if (!inode) { 649 if (!inode) {
749 ret = ERR_PTR(-ENOMEM); 650 ret = ERR_PTR(-ENOMEM);
750 goto out_unlock; 651 goto out_unlock;
751 } 652 }
752 653
753 /* instantiate and hash dentry */ 654 /* instantiate and hash dentry */
754 dentry->d_op = &sysfs_dentry_ops; 655 ret = d_find_alias(inode);
755 dentry->d_fsdata = sysfs_get(sd); 656 if (!ret) {
756 d_instantiate(dentry, inode); 657 dentry->d_op = &sysfs_dentry_ops;
757 d_rehash(dentry); 658 dentry->d_fsdata = sysfs_get(sd);
659 d_add(dentry, inode);
660 } else {
661 d_move(ret, dentry);
662 iput(inode);
663 }
758 664
759 out_unlock: 665 out_unlock:
760 mutex_unlock(&sysfs_mutex); 666 mutex_unlock(&sysfs_mutex);
@@ -763,7 +669,9 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
763 669
764const struct inode_operations sysfs_dir_inode_operations = { 670const struct inode_operations sysfs_dir_inode_operations = {
765 .lookup = sysfs_lookup, 671 .lookup = sysfs_lookup,
672 .permission = sysfs_permission,
766 .setattr = sysfs_setattr, 673 .setattr = sysfs_setattr,
674 .getattr = sysfs_getattr,
767 .setxattr = sysfs_setxattr, 675 .setxattr = sysfs_setxattr,
768}; 676};
769 677
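The lookup rework above stops unconditionally instantiating a fresh dentry and
instead reuses an existing alias when one is already attached to the inode. A
minimal sketch of that idiom under the dcache API of this era (names
hypothetical, locking and error handling elided):

        static struct dentry *lookup_reuse_alias(struct dentry *dentry,
                                                 struct inode *inode)
        {
                struct dentry *alias;

                /* Is some dentry already bound to this inode? */
                alias = d_find_alias(inode);
                if (!alias) {
                        /* No: bind and hash the dentry the VFS gave us;
                         * d_add() consumes our inode reference. */
                        d_add(dentry, inode);
                        return NULL;
                }
                /* Yes: move the existing alias into place. d_add() was
                 * never called, so drop the inode reference ourselves. */
                d_move(alias, dentry);
                iput(inode);
                return alias;
        }

Returning the alias rather than NULL tells the VFS to use it in place of the
dentry it passed in.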
@@ -826,141 +734,65 @@ void sysfs_remove_dir(struct kobject * kobj)
826 __sysfs_remove_dir(sd); 734 __sysfs_remove_dir(sd);
827} 735}
828 736
829int sysfs_rename_dir(struct kobject * kobj, const char *new_name) 737int sysfs_rename(struct sysfs_dirent *sd,
738 struct sysfs_dirent *new_parent_sd, const char *new_name)
830{ 739{
831 struct sysfs_dirent *sd = kobj->sd;
832 struct dentry *parent = NULL;
833 struct dentry *old_dentry = NULL, *new_dentry = NULL;
834 const char *dup_name = NULL; 740 const char *dup_name = NULL;
835 int error; 741 int error;
836 742
837 mutex_lock(&sysfs_rename_mutex); 743 mutex_lock(&sysfs_mutex);
838 744
839 error = 0; 745 error = 0;
840 if (strcmp(sd->s_name, new_name) == 0) 746 if ((sd->s_parent == new_parent_sd) &&
747 (strcmp(sd->s_name, new_name) == 0))
841 goto out; /* nothing to rename */ 748 goto out; /* nothing to rename */
842 749
843 /* get the original dentry */
844 old_dentry = sysfs_get_dentry(sd);
845 if (IS_ERR(old_dentry)) {
846 error = PTR_ERR(old_dentry);
847 old_dentry = NULL;
848 goto out;
849 }
850
851 parent = old_dentry->d_parent;
852
853 /* lock parent and get dentry for new name */
854 mutex_lock(&parent->d_inode->i_mutex);
855 mutex_lock(&sysfs_mutex);
856
857 error = -EEXIST; 750 error = -EEXIST;
858 if (sysfs_find_dirent(sd->s_parent, new_name)) 751 if (sysfs_find_dirent(new_parent_sd, new_name))
859 goto out_unlock; 752 goto out;
860
861 error = -ENOMEM;
862 new_dentry = d_alloc_name(parent, new_name);
863 if (!new_dentry)
864 goto out_unlock;
865 753
866 /* rename sysfs_dirent */ 754 /* rename sysfs_dirent */
867 error = -ENOMEM; 755 if (strcmp(sd->s_name, new_name) != 0) {
868 new_name = dup_name = kstrdup(new_name, GFP_KERNEL); 756 error = -ENOMEM;
869 if (!new_name) 757 new_name = dup_name = kstrdup(new_name, GFP_KERNEL);
870 goto out_unlock; 758 if (!new_name)
871 759 goto out;
872 dup_name = sd->s_name; 760
873 sd->s_name = new_name; 761 dup_name = sd->s_name;
762 sd->s_name = new_name;
763 }
874 764
875 /* rename */ 765 /* Remove from old parent's list and insert into new parent's list. */
876 d_add(new_dentry, NULL); 766 if (sd->s_parent != new_parent_sd) {
877 d_move(old_dentry, new_dentry); 767 sysfs_unlink_sibling(sd);
768 sysfs_get(new_parent_sd);
769 sysfs_put(sd->s_parent);
770 sd->s_parent = new_parent_sd;
771 sysfs_link_sibling(sd);
772 }
878 773
879 error = 0; 774 error = 0;
880 out_unlock: 775 out:
881 mutex_unlock(&sysfs_mutex); 776 mutex_unlock(&sysfs_mutex);
882 mutex_unlock(&parent->d_inode->i_mutex);
883 kfree(dup_name); 777 kfree(dup_name);
884 dput(old_dentry);
885 dput(new_dentry);
886 out:
887 mutex_unlock(&sysfs_rename_mutex);
888 return error; 778 return error;
889} 779}
890 780
781int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
782{
783 return sysfs_rename(kobj->sd, kobj->sd->s_parent, new_name);
784}
785
891int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) 786int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
892{ 787{
893 struct sysfs_dirent *sd = kobj->sd; 788 struct sysfs_dirent *sd = kobj->sd;
894 struct sysfs_dirent *new_parent_sd; 789 struct sysfs_dirent *new_parent_sd;
895 struct dentry *old_parent, *new_parent = NULL;
896 struct dentry *old_dentry = NULL, *new_dentry = NULL;
897 int error;
898 790
899 mutex_lock(&sysfs_rename_mutex);
900 BUG_ON(!sd->s_parent); 791 BUG_ON(!sd->s_parent);
901 new_parent_sd = (new_parent_kobj && new_parent_kobj->sd) ? 792 new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
902 new_parent_kobj->sd : &sysfs_root; 793 new_parent_kobj->sd : &sysfs_root;
903 794
904 error = 0; 795 return sysfs_rename(sd, new_parent_sd, sd->s_name);
905 if (sd->s_parent == new_parent_sd)
906 goto out; /* nothing to move */
907
908 /* get dentries */
909 old_dentry = sysfs_get_dentry(sd);
910 if (IS_ERR(old_dentry)) {
911 error = PTR_ERR(old_dentry);
912 old_dentry = NULL;
913 goto out;
914 }
915 old_parent = old_dentry->d_parent;
916
917 new_parent = sysfs_get_dentry(new_parent_sd);
918 if (IS_ERR(new_parent)) {
919 error = PTR_ERR(new_parent);
920 new_parent = NULL;
921 goto out;
922 }
923
924again:
925 mutex_lock(&old_parent->d_inode->i_mutex);
926 if (!mutex_trylock(&new_parent->d_inode->i_mutex)) {
927 mutex_unlock(&old_parent->d_inode->i_mutex);
928 goto again;
929 }
930 mutex_lock(&sysfs_mutex);
931
932 error = -EEXIST;
933 if (sysfs_find_dirent(new_parent_sd, sd->s_name))
934 goto out_unlock;
935
936 error = -ENOMEM;
937 new_dentry = d_alloc_name(new_parent, sd->s_name);
938 if (!new_dentry)
939 goto out_unlock;
940
941 error = 0;
942 d_add(new_dentry, NULL);
943 d_move(old_dentry, new_dentry);
944
945 /* Remove from old parent's list and insert into new parent's list. */
946 sysfs_unlink_sibling(sd);
947 sysfs_get(new_parent_sd);
948 drop_nlink(old_parent->d_inode);
949 sysfs_put(sd->s_parent);
950 sd->s_parent = new_parent_sd;
951 inc_nlink(new_parent->d_inode);
952 sysfs_link_sibling(sd);
953
954 out_unlock:
955 mutex_unlock(&sysfs_mutex);
956 mutex_unlock(&new_parent->d_inode->i_mutex);
957 mutex_unlock(&old_parent->d_inode->i_mutex);
958 out:
959 dput(new_parent);
960 dput(old_dentry);
961 dput(new_dentry);
962 mutex_unlock(&sysfs_rename_mutex);
963 return error;
964} 796}
965 797
966/* Relationship between s_mode and the DT_xxx types */ 798/* Relationship between s_mode and the DT_xxx types */
@@ -969,11 +801,46 @@ static inline unsigned char dt_type(struct sysfs_dirent *sd)
969 return (sd->s_mode >> 12) & 15; 801 return (sd->s_mode >> 12) & 15;
970} 802}
971 803
804static int sysfs_dir_release(struct inode *inode, struct file *filp)
805{
806 sysfs_put(filp->private_data);
807 return 0;
808}
809
810static struct sysfs_dirent *sysfs_dir_pos(struct sysfs_dirent *parent_sd,
811 ino_t ino, struct sysfs_dirent *pos)
812{
813 if (pos) {
814 int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
815 pos->s_parent == parent_sd &&
816 ino == pos->s_ino;
817 sysfs_put(pos);
818 if (valid)
819 return pos;
820 }
821 pos = NULL;
822 if ((ino > 1) && (ino < INT_MAX)) {
823 pos = parent_sd->s_dir.children;
824 while (pos && (ino > pos->s_ino))
825 pos = pos->s_sibling;
826 }
827 return pos;
828}
829
830static struct sysfs_dirent *sysfs_dir_next_pos(struct sysfs_dirent *parent_sd,
831 ino_t ino, struct sysfs_dirent *pos)
832{
833 pos = sysfs_dir_pos(parent_sd, ino, pos);
834 if (pos)
835 pos = pos->s_sibling;
836 return pos;
837}
838
972static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) 839static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
973{ 840{
974 struct dentry *dentry = filp->f_path.dentry; 841 struct dentry *dentry = filp->f_path.dentry;
975 struct sysfs_dirent * parent_sd = dentry->d_fsdata; 842 struct sysfs_dirent * parent_sd = dentry->d_fsdata;
976 struct sysfs_dirent *pos; 843 struct sysfs_dirent *pos = filp->private_data;
977 ino_t ino; 844 ino_t ino;
978 845
979 if (filp->f_pos == 0) { 846 if (filp->f_pos == 0) {
@@ -989,29 +856,31 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
989 if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0) 856 if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0)
990 filp->f_pos++; 857 filp->f_pos++;
991 } 858 }
992 if ((filp->f_pos > 1) && (filp->f_pos < INT_MAX)) { 859 mutex_lock(&sysfs_mutex);
993 mutex_lock(&sysfs_mutex); 860 for (pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos);
994 861 pos;
995 /* Skip the dentries we have already reported */ 862 pos = sysfs_dir_next_pos(parent_sd, filp->f_pos, pos)) {
996 pos = parent_sd->s_dir.children; 863 const char * name;
997 while (pos && (filp->f_pos > pos->s_ino)) 864 unsigned int type;
998 pos = pos->s_sibling; 865 int len, ret;
999 866
1000 for ( ; pos; pos = pos->s_sibling) { 867 name = pos->s_name;
1001 const char * name; 868 len = strlen(name);
1002 int len; 869 ino = pos->s_ino;
1003 870 type = dt_type(pos);
1004 name = pos->s_name; 871 filp->f_pos = ino;
1005 len = strlen(name); 872 filp->private_data = sysfs_get(pos);
1006 filp->f_pos = ino = pos->s_ino;
1007 873
1008 if (filldir(dirent, name, len, filp->f_pos, ino,
1009 dt_type(pos)) < 0)
1010 break;
1011 }
1012 if (!pos)
1013 filp->f_pos = INT_MAX;
1014 mutex_unlock(&sysfs_mutex); 874 mutex_unlock(&sysfs_mutex);
875 ret = filldir(dirent, name, len, filp->f_pos, ino, type);
876 mutex_lock(&sysfs_mutex);
877 if (ret < 0)
878 break;
879 }
880 mutex_unlock(&sysfs_mutex);
881 if ((filp->f_pos > 1) && !pos) { /* EOF */
882 filp->f_pos = INT_MAX;
883 filp->private_data = NULL;
1015 } 884 }
1016 return 0; 885 return 0;
1017} 886}
@@ -1020,5 +889,6 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
1020const struct file_operations sysfs_dir_operations = { 889const struct file_operations sysfs_dir_operations = {
1021 .read = generic_read_dir, 890 .read = generic_read_dir,
1022 .readdir = sysfs_readdir, 891 .readdir = sysfs_readdir,
892 .release = sysfs_dir_release,
1023 .llseek = generic_file_llseek, 893 .llseek = generic_file_llseek,
1024}; 894};
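The readdir loop above now drops sysfs_mutex around each filldir() call, which
may sleep while copying to userspace. The cursor pinned in filp->private_data
is what makes reacquiring the mutex safe: sysfs_dir_pos() revalidates it on
every re-entry. A condensed sketch of that revalidation, assuming only the
sysfs_dirent fields used above:

        /* A cached cursor may be trusted only if it has not been removed,
         * still hangs off the same parent, and still matches the f_pos
         * cookie (its inode number); otherwise the walk restarts from the
         * parent's child list. */
        static int cursor_still_valid(struct sysfs_dirent *parent_sd,
                                      ino_t ino, struct sysfs_dirent *pos)
        {
                return !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
                       pos->s_parent == parent_sd &&
                       pos->s_ino == ino;
        }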
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index f5ea4680f15f..e222b2582746 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -53,7 +53,7 @@ struct sysfs_buffer {
53 size_t count; 53 size_t count;
54 loff_t pos; 54 loff_t pos;
55 char * page; 55 char * page;
56 struct sysfs_ops * ops; 56 const struct sysfs_ops * ops;
57 struct mutex mutex; 57 struct mutex mutex;
58 int needs_read_fill; 58 int needs_read_fill;
59 int event; 59 int event;
@@ -75,7 +75,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
75{ 75{
76 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 76 struct sysfs_dirent *attr_sd = dentry->d_fsdata;
77 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 77 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
78 struct sysfs_ops * ops = buffer->ops; 78 const struct sysfs_ops * ops = buffer->ops;
79 int ret = 0; 79 int ret = 0;
80 ssize_t count; 80 ssize_t count;
81 81
@@ -85,13 +85,13 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
85 return -ENOMEM; 85 return -ENOMEM;
86 86
87 /* need attr_sd for attr and ops, its parent for kobj */ 87 /* need attr_sd for attr and ops, its parent for kobj */
88 if (!sysfs_get_active_two(attr_sd)) 88 if (!sysfs_get_active(attr_sd))
89 return -ENODEV; 89 return -ENODEV;
90 90
91 buffer->event = atomic_read(&attr_sd->s_attr.open->event); 91 buffer->event = atomic_read(&attr_sd->s_attr.open->event);
92 count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page); 92 count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page);
93 93
94 sysfs_put_active_two(attr_sd); 94 sysfs_put_active(attr_sd);
95 95
96 /* 96 /*
97 * The code works fine with PAGE_SIZE return but it's likely to 97 * The code works fine with PAGE_SIZE return but it's likely to
@@ -199,16 +199,16 @@ flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t
199{ 199{
200 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 200 struct sysfs_dirent *attr_sd = dentry->d_fsdata;
201 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 201 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
202 struct sysfs_ops * ops = buffer->ops; 202 const struct sysfs_ops * ops = buffer->ops;
203 int rc; 203 int rc;
204 204
205 /* need attr_sd for attr and ops, its parent for kobj */ 205 /* need attr_sd for attr and ops, its parent for kobj */
206 if (!sysfs_get_active_two(attr_sd)) 206 if (!sysfs_get_active(attr_sd))
207 return -ENODEV; 207 return -ENODEV;
208 208
209 rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count); 209 rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count);
210 210
211 sysfs_put_active_two(attr_sd); 211 sysfs_put_active(attr_sd);
212 212
213 return rc; 213 return rc;
214} 214}
@@ -335,7 +335,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
335 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 335 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
336 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 336 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
337 struct sysfs_buffer *buffer; 337 struct sysfs_buffer *buffer;
338 struct sysfs_ops *ops; 338 const struct sysfs_ops *ops;
339 int error = -EACCES; 339 int error = -EACCES;
340 char *p; 340 char *p;
341 341
@@ -344,7 +344,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
344 memmove(last_sysfs_file, p, strlen(p) + 1); 344 memmove(last_sysfs_file, p, strlen(p) + 1);
345 345
346 /* need attr_sd for attr and ops, its parent for kobj */ 346 /* need attr_sd for attr and ops, its parent for kobj */
347 if (!sysfs_get_active_two(attr_sd)) 347 if (!sysfs_get_active(attr_sd))
348 return -ENODEV; 348 return -ENODEV;
349 349
350 /* every kobject with an attribute needs a ktype assigned */ 350 /* every kobject with an attribute needs a ktype assigned */
@@ -393,13 +393,13 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
393 goto err_free; 393 goto err_free;
394 394
395 /* open succeeded, put active references */ 395 /* open succeeded, put active references */
396 sysfs_put_active_two(attr_sd); 396 sysfs_put_active(attr_sd);
397 return 0; 397 return 0;
398 398
399 err_free: 399 err_free:
400 kfree(buffer); 400 kfree(buffer);
401 err_out: 401 err_out:
402 sysfs_put_active_two(attr_sd); 402 sysfs_put_active(attr_sd);
403 return error; 403 return error;
404} 404}
405 405
@@ -437,12 +437,12 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
437 struct sysfs_open_dirent *od = attr_sd->s_attr.open; 437 struct sysfs_open_dirent *od = attr_sd->s_attr.open;
438 438
439 /* need parent for the kobj, grab both */ 439 /* need parent for the kobj, grab both */
440 if (!sysfs_get_active_two(attr_sd)) 440 if (!sysfs_get_active(attr_sd))
441 goto trigger; 441 goto trigger;
442 442
443 poll_wait(filp, &od->poll, wait); 443 poll_wait(filp, &od->poll, wait);
444 444
445 sysfs_put_active_two(attr_sd); 445 sysfs_put_active(attr_sd);
446 446
447 if (buffer->event != atomic_read(&od->event)) 447 if (buffer->event != atomic_read(&od->event))
448 goto trigger; 448 goto trigger;
@@ -509,6 +509,7 @@ int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
509 if (!sd) 509 if (!sd)
510 return -ENOMEM; 510 return -ENOMEM;
511 sd->s_attr.attr = (void *)attr; 511 sd->s_attr.attr = (void *)attr;
512 sysfs_dirent_init_lockdep(sd);
512 513
513 sysfs_addrm_start(&acxt, dir_sd); 514 sysfs_addrm_start(&acxt, dir_sd);
514 rc = sysfs_add_one(&acxt, sd); 515 rc = sysfs_add_one(&acxt, sd);
@@ -542,6 +543,18 @@ int sysfs_create_file(struct kobject * kobj, const struct attribute * attr)
542 543
543} 544}
544 545
546int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr)
547{
548 int err = 0;
549 int i;
550
551 for (i = 0; ptr[i] && !err; i++)
552 err = sysfs_create_file(kobj, ptr[i]);
553 if (err)
554 while (--i >= 0)
555 sysfs_remove_file(kobj, ptr[i]);
556 return err;
557}
545 558
546/** 559/**
547 * sysfs_add_file_to_group - add an attribute file to a pre-existing group. 560 * sysfs_add_file_to_group - add an attribute file to a pre-existing group.
@@ -579,46 +592,23 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
579 */ 592 */
580int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) 593int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
581{ 594{
582 struct sysfs_dirent *victim_sd = NULL; 595 struct sysfs_dirent *sd;
583 struct dentry *victim = NULL;
584 struct inode * inode;
585 struct iattr newattrs; 596 struct iattr newattrs;
586 int rc; 597 int rc;
587 598
588 rc = -ENOENT; 599 mutex_lock(&sysfs_mutex);
589 victim_sd = sysfs_get_dirent(kobj->sd, attr->name);
590 if (!victim_sd)
591 goto out;
592 600
593 mutex_lock(&sysfs_rename_mutex); 601 rc = -ENOENT;
594 victim = sysfs_get_dentry(victim_sd); 602 sd = sysfs_find_dirent(kobj->sd, attr->name);
595 mutex_unlock(&sysfs_rename_mutex); 603 if (!sd)
596 if (IS_ERR(victim)) {
597 rc = PTR_ERR(victim);
598 victim = NULL;
599 goto out; 604 goto out;
600 }
601
602 inode = victim->d_inode;
603
604 mutex_lock(&inode->i_mutex);
605
606 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
607 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
608 newattrs.ia_ctime = current_fs_time(inode->i_sb);
609 rc = sysfs_setattr(victim, &newattrs);
610 605
611 if (rc == 0) { 606 newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO);
612 fsnotify_change(victim, newattrs.ia_valid); 607 newattrs.ia_valid = ATTR_MODE;
613 mutex_lock(&sysfs_mutex); 608 rc = sysfs_sd_setattr(sd, &newattrs);
614 victim_sd->s_mode = newattrs.ia_mode;
615 mutex_unlock(&sysfs_mutex);
616 }
617 609
618 mutex_unlock(&inode->i_mutex);
619 out: 610 out:
620 dput(victim); 611 mutex_unlock(&sysfs_mutex);
621 sysfs_put(victim_sd);
622 return rc; 612 return rc;
623} 613}
624EXPORT_SYMBOL_GPL(sysfs_chmod_file); 614EXPORT_SYMBOL_GPL(sysfs_chmod_file);
@@ -637,6 +627,12 @@ void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
637 sysfs_hash_and_remove(kobj->sd, attr->name); 627 sysfs_hash_and_remove(kobj->sd, attr->name);
638} 628}
639 629
630void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
631{
632 int i;
633 for (i = 0; ptr[i]; i++)
634 sysfs_remove_file(kobj, ptr[i]);
635}
640 636
641/** 637/**
642 * sysfs_remove_file_from_group - remove an attribute file from a group. 638 * sysfs_remove_file_from_group - remove an attribute file from a group.
@@ -755,3 +751,5 @@ EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
755 751
756EXPORT_SYMBOL_GPL(sysfs_create_file); 752EXPORT_SYMBOL_GPL(sysfs_create_file);
757EXPORT_SYMBOL_GPL(sysfs_remove_file); 753EXPORT_SYMBOL_GPL(sysfs_remove_file);
754EXPORT_SYMBOL_GPL(sysfs_remove_files);
755EXPORT_SYMBOL_GPL(sysfs_create_files);
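sysfs_create_files() walks a NULL-terminated array and removes the files it
already created if a later creation fails, so callers get all-or-nothing
semantics. A hedged usage sketch; the attribute objects are invented for
illustration, and reads/writes on them would be serviced by the owning
kobject's ktype sysfs_ops, which is not shown:

        static struct attribute ex_foo_attr = { .name = "foo", .mode = 0444 };
        static struct attribute ex_bar_attr = { .name = "bar", .mode = 0444 };

        static const struct attribute *ex_attrs[] = {
                &ex_foo_attr,
                &ex_bar_attr,
                NULL,           /* the array must be NULL-terminated */
        };

        /* On failure nothing is left behind in the directory. */
        error = sysfs_create_files(kobj, ex_attrs);
        ...
        sysfs_remove_files(kobj, ex_attrs);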
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index e28cecf179f5..a4a0a9419711 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -18,6 +18,7 @@
18#include <linux/capability.h> 18#include <linux/capability.h>
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h>
21#include <linux/xattr.h> 22#include <linux/xattr.h>
22#include <linux/security.h> 23#include <linux/security.h>
23#include "sysfs.h" 24#include "sysfs.h"
@@ -37,7 +38,9 @@ static struct backing_dev_info sysfs_backing_dev_info = {
37}; 38};
38 39
39static const struct inode_operations sysfs_inode_operations ={ 40static const struct inode_operations sysfs_inode_operations ={
41 .permission = sysfs_permission,
40 .setattr = sysfs_setattr, 42 .setattr = sysfs_setattr,
43 .getattr = sysfs_getattr,
41 .setxattr = sysfs_setxattr, 44 .setxattr = sysfs_setxattr,
42}; 45};
43 46
@@ -46,7 +49,7 @@ int __init sysfs_inode_init(void)
46 return bdi_init(&sysfs_backing_dev_info); 49 return bdi_init(&sysfs_backing_dev_info);
47} 50}
48 51
49struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) 52static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
50{ 53{
51 struct sysfs_inode_attrs *attrs; 54 struct sysfs_inode_attrs *attrs;
52 struct iattr *iattrs; 55 struct iattr *iattrs;
@@ -64,81 +67,101 @@ struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
64 67
65 return attrs; 68 return attrs;
66} 69}
67int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) 70
71int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr * iattr)
68{ 72{
69 struct inode * inode = dentry->d_inode;
70 struct sysfs_dirent * sd = dentry->d_fsdata;
71 struct sysfs_inode_attrs *sd_attrs; 73 struct sysfs_inode_attrs *sd_attrs;
72 struct iattr *iattrs; 74 struct iattr *iattrs;
73 unsigned int ia_valid = iattr->ia_valid; 75 unsigned int ia_valid = iattr->ia_valid;
76
77 sd_attrs = sd->s_iattr;
78
79 if (!sd_attrs) {
80 /* setting attributes for the first time, allocate now */
81 sd_attrs = sysfs_init_inode_attrs(sd);
82 if (!sd_attrs)
83 return -ENOMEM;
84 sd->s_iattr = sd_attrs;
85 }
86 /* attributes were changed at least once in past */
87 iattrs = &sd_attrs->ia_iattr;
88
89 if (ia_valid & ATTR_UID)
90 iattrs->ia_uid = iattr->ia_uid;
91 if (ia_valid & ATTR_GID)
92 iattrs->ia_gid = iattr->ia_gid;
93 if (ia_valid & ATTR_ATIME)
94 iattrs->ia_atime = iattr->ia_atime;
95 if (ia_valid & ATTR_MTIME)
96 iattrs->ia_mtime = iattr->ia_mtime;
97 if (ia_valid & ATTR_CTIME)
98 iattrs->ia_ctime = iattr->ia_ctime;
99 if (ia_valid & ATTR_MODE) {
100 umode_t mode = iattr->ia_mode;
101 iattrs->ia_mode = sd->s_mode = mode;
102 }
103 return 0;
104}
105
106int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
107{
108 struct inode *inode = dentry->d_inode;
109 struct sysfs_dirent *sd = dentry->d_fsdata;
74 int error; 110 int error;
75 111
76 if (!sd) 112 if (!sd)
77 return -EINVAL; 113 return -EINVAL;
78 114
79 sd_attrs = sd->s_iattr; 115 mutex_lock(&sysfs_mutex);
80
81 error = inode_change_ok(inode, iattr); 116 error = inode_change_ok(inode, iattr);
82 if (error) 117 if (error)
83 return error; 118 goto out;
84 119
85 iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ 120 iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */
86 121
87 error = inode_setattr(inode, iattr); 122 error = inode_setattr(inode, iattr);
88 if (error) 123 if (error)
89 return error; 124 goto out;
90 125
91 if (!sd_attrs) { 126 error = sysfs_sd_setattr(sd, iattr);
92 /* setting attributes for the first time, allocate now */ 127out:
93 sd_attrs = sysfs_init_inode_attrs(sd); 128 mutex_unlock(&sysfs_mutex);
94 if (!sd_attrs)
95 return -ENOMEM;
96 sd->s_iattr = sd_attrs;
97 } else {
98 /* attributes were changed at least once in past */
99 iattrs = &sd_attrs->ia_iattr;
100
101 if (ia_valid & ATTR_UID)
102 iattrs->ia_uid = iattr->ia_uid;
103 if (ia_valid & ATTR_GID)
104 iattrs->ia_gid = iattr->ia_gid;
105 if (ia_valid & ATTR_ATIME)
106 iattrs->ia_atime = timespec_trunc(iattr->ia_atime,
107 inode->i_sb->s_time_gran);
108 if (ia_valid & ATTR_MTIME)
109 iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime,
110 inode->i_sb->s_time_gran);
111 if (ia_valid & ATTR_CTIME)
112 iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime,
113 inode->i_sb->s_time_gran);
114 if (ia_valid & ATTR_MODE) {
115 umode_t mode = iattr->ia_mode;
116
117 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
118 mode &= ~S_ISGID;
119 iattrs->ia_mode = sd->s_mode = mode;
120 }
121 }
122 return error; 129 return error;
123} 130}
124 131
132static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, u32 *secdata_len)
133{
134 struct sysfs_inode_attrs *iattrs;
135 void *old_secdata;
136 size_t old_secdata_len;
137
138 iattrs = sd->s_iattr;
139 if (!iattrs)
140 iattrs = sysfs_init_inode_attrs(sd);
141 if (!iattrs)
142 return -ENOMEM;
143
144 old_secdata = iattrs->ia_secdata;
145 old_secdata_len = iattrs->ia_secdata_len;
146
147 iattrs->ia_secdata = *secdata;
148 iattrs->ia_secdata_len = *secdata_len;
149
150 *secdata = old_secdata;
151 *secdata_len = old_secdata_len;
152 return 0;
153}
154
125int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, 155int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
126 size_t size, int flags) 156 size_t size, int flags)
127{ 157{
128 struct sysfs_dirent *sd = dentry->d_fsdata; 158 struct sysfs_dirent *sd = dentry->d_fsdata;
129 struct sysfs_inode_attrs *iattrs;
130 void *secdata; 159 void *secdata;
131 int error; 160 int error;
132 u32 secdata_len = 0; 161 u32 secdata_len = 0;
133 162
134 if (!sd) 163 if (!sd)
135 return -EINVAL; 164 return -EINVAL;
136 if (!sd->s_iattr)
137 sd->s_iattr = sysfs_init_inode_attrs(sd);
138 if (!sd->s_iattr)
139 return -ENOMEM;
140
141 iattrs = sd->s_iattr;
142 165
143 if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { 166 if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
144 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; 167 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
@@ -150,12 +173,13 @@ int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
150 &secdata, &secdata_len); 173 &secdata, &secdata_len);
151 if (error) 174 if (error)
152 goto out; 175 goto out;
153 if (iattrs->ia_secdata)
154 security_release_secctx(iattrs->ia_secdata,
155 iattrs->ia_secdata_len);
156 iattrs->ia_secdata = secdata;
157 iattrs->ia_secdata_len = secdata_len;
158 176
177 mutex_lock(&sysfs_mutex);
178 error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len);
179 mutex_unlock(&sysfs_mutex);
180
181 if (secdata)
182 security_release_secctx(secdata, secdata_len);
159 } else 183 } else
160 return -EINVAL; 184 return -EINVAL;
161out: 185out:
@@ -170,7 +194,6 @@ static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
170 194
171static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) 195static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
172{ 196{
173 inode->i_mode = iattr->ia_mode;
174 inode->i_uid = iattr->ia_uid; 197 inode->i_uid = iattr->ia_uid;
175 inode->i_gid = iattr->ia_gid; 198 inode->i_gid = iattr->ia_gid;
176 inode->i_atime = iattr->ia_atime; 199 inode->i_atime = iattr->ia_atime;
@@ -178,17 +201,6 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
178 inode->i_ctime = iattr->ia_ctime; 201 inode->i_ctime = iattr->ia_ctime;
179} 202}
180 203
181
182/*
183 * sysfs has a different i_mutex lock order behavior for i_mutex than other
184 * filesystems; sysfs i_mutex is called in many places with subsystem locks
185 * held. At the same time, many of the VFS locking rules do not apply to
186 * sysfs at all (cross directory rename for example). To untangle this mess
187 * (which gives false positives in lockdep), we're giving sysfs inodes their
188 * own class for i_mutex.
189 */
190static struct lock_class_key sysfs_inode_imutex_key;
191
192static int sysfs_count_nlink(struct sysfs_dirent *sd) 204static int sysfs_count_nlink(struct sysfs_dirent *sd)
193{ 205{
194 struct sysfs_dirent *child; 206 struct sysfs_dirent *child;
@@ -201,38 +213,55 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd)
201 return nr + 2; 213 return nr + 2;
202} 214}
203 215
216static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
217{
218 struct sysfs_inode_attrs *iattrs = sd->s_iattr;
219
220 inode->i_mode = sd->s_mode;
221 if (iattrs) {
222 /* sysfs_dirent has non-default attributes
223 * get them from persistent copy in sysfs_dirent
224 */
225 set_inode_attr(inode, &iattrs->ia_iattr);
226 security_inode_notifysecctx(inode,
227 iattrs->ia_secdata,
228 iattrs->ia_secdata_len);
229 }
230
231 if (sysfs_type(sd) == SYSFS_DIR)
232 inode->i_nlink = sysfs_count_nlink(sd);
233}
234
235int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
236{
237 struct sysfs_dirent *sd = dentry->d_fsdata;
238 struct inode *inode = dentry->d_inode;
239
240 mutex_lock(&sysfs_mutex);
241 sysfs_refresh_inode(sd, inode);
242 mutex_unlock(&sysfs_mutex);
243
244 generic_fillattr(inode, stat);
245 return 0;
246}
247
204static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) 248static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
205{ 249{
206 struct bin_attribute *bin_attr; 250 struct bin_attribute *bin_attr;
207 struct sysfs_inode_attrs *iattrs;
208 251
209 inode->i_private = sysfs_get(sd); 252 inode->i_private = sysfs_get(sd);
210 inode->i_mapping->a_ops = &sysfs_aops; 253 inode->i_mapping->a_ops = &sysfs_aops;
211 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; 254 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
212 inode->i_op = &sysfs_inode_operations; 255 inode->i_op = &sysfs_inode_operations;
213 inode->i_ino = sd->s_ino;
214 lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key);
215 256
216 iattrs = sd->s_iattr; 257 set_default_inode_attr(inode, sd->s_mode);
217 if (iattrs) { 258 sysfs_refresh_inode(sd, inode);
218 /* sysfs_dirent has non-default attributes
219 * get them for the new inode from persistent copy
220 * in sysfs_dirent
221 */
222 set_inode_attr(inode, &iattrs->ia_iattr);
223 if (iattrs->ia_secdata)
224 security_inode_notifysecctx(inode,
225 iattrs->ia_secdata,
226 iattrs->ia_secdata_len);
227 } else
228 set_default_inode_attr(inode, sd->s_mode);
229 259
230 /* initialize inode according to type */ 260 /* initialize inode according to type */
231 switch (sysfs_type(sd)) { 261 switch (sysfs_type(sd)) {
232 case SYSFS_DIR: 262 case SYSFS_DIR:
233 inode->i_op = &sysfs_dir_inode_operations; 263 inode->i_op = &sysfs_dir_inode_operations;
234 inode->i_fop = &sysfs_dir_operations; 264 inode->i_fop = &sysfs_dir_operations;
235 inode->i_nlink = sysfs_count_nlink(sd);
236 break; 265 break;
237 case SYSFS_KOBJ_ATTR: 266 case SYSFS_KOBJ_ATTR:
238 inode->i_size = PAGE_SIZE; 267 inode->i_size = PAGE_SIZE;
@@ -255,6 +284,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
255 284
256/** 285/**
257 * sysfs_get_inode - get inode for sysfs_dirent 286 * sysfs_get_inode - get inode for sysfs_dirent
287 * @sb: super block
258 * @sd: sysfs_dirent to allocate inode for 288 * @sd: sysfs_dirent to allocate inode for
259 * 289 *
260 * Get inode for @sd. If such inode doesn't exist, a new inode 290 * Get inode for @sd. If such inode doesn't exist, a new inode
@@ -267,11 +297,11 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
267 * RETURNS: 297 * RETURNS:
268 * Pointer to allocated inode on success, NULL on failure. 298 * Pointer to allocated inode on success, NULL on failure.
269 */ 299 */
270struct inode * sysfs_get_inode(struct sysfs_dirent *sd) 300struct inode * sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
271{ 301{
272 struct inode *inode; 302 struct inode *inode;
273 303
274 inode = iget_locked(sysfs_sb, sd->s_ino); 304 inode = iget_locked(sb, sd->s_ino);
275 if (inode && (inode->i_state & I_NEW)) 305 if (inode && (inode->i_state & I_NEW))
276 sysfs_init_inode(sd, inode); 306 sysfs_init_inode(sd, inode);
277 307
@@ -315,3 +345,14 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
315 else 345 else
316 return -ENOENT; 346 return -ENOENT;
317} 347}
348
349int sysfs_permission(struct inode *inode, int mask)
350{
351 struct sysfs_dirent *sd = inode->i_private;
352
353 mutex_lock(&sysfs_mutex);
354 sysfs_refresh_inode(sd, inode);
355 mutex_unlock(&sysfs_mutex);
356
357 return generic_permission(inode, mask, NULL);
358}
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 49749955ccaf..776137828dca 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -18,12 +18,12 @@
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/magic.h> 20#include <linux/magic.h>
21#include <linux/slab.h>
21 22
22#include "sysfs.h" 23#include "sysfs.h"
23 24
24 25
25static struct vfsmount *sysfs_mount; 26static struct vfsmount *sysfs_mount;
26struct super_block * sysfs_sb = NULL;
27struct kmem_cache *sysfs_dir_cachep; 27struct kmem_cache *sysfs_dir_cachep;
28 28
29static const struct super_operations sysfs_ops = { 29static const struct super_operations sysfs_ops = {
@@ -50,11 +50,10 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
50 sb->s_magic = SYSFS_MAGIC; 50 sb->s_magic = SYSFS_MAGIC;
51 sb->s_op = &sysfs_ops; 51 sb->s_op = &sysfs_ops;
52 sb->s_time_gran = 1; 52 sb->s_time_gran = 1;
53 sysfs_sb = sb;
54 53
55 /* get root inode, initialize and unlock it */ 54 /* get root inode, initialize and unlock it */
56 mutex_lock(&sysfs_mutex); 55 mutex_lock(&sysfs_mutex);
57 inode = sysfs_get_inode(&sysfs_root); 56 inode = sysfs_get_inode(sb, &sysfs_root);
58 mutex_unlock(&sysfs_mutex); 57 mutex_unlock(&sysfs_mutex);
59 if (!inode) { 58 if (!inode) {
60 pr_debug("sysfs: could not get root inode\n"); 59 pr_debug("sysfs: could not get root inode\n");
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index c5081ad77026..b93ec51fa7ac 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -11,6 +11,7 @@
11 */ 11 */
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/gfp.h>
14#include <linux/mount.h> 15#include <linux/mount.h>
15#include <linux/module.h> 16#include <linux/module.h>
16#include <linux/kobject.h> 17#include <linux/kobject.h>
@@ -123,6 +124,44 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
123 sysfs_hash_and_remove(parent_sd, name); 124 sysfs_hash_and_remove(parent_sd, name);
124} 125}
125 126
127/**
128 * sysfs_rename_link - rename symlink in object's directory.
129 * @kobj: object we're acting for.
130 * @targ: object we're pointing to.
131 * @old: previous name of the symlink.
132 * @new: new name of the symlink.
133 *
134 * A helper function for the common rename symlink idiom.
135 */
136int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
137 const char *old, const char *new)
138{
139 struct sysfs_dirent *parent_sd, *sd = NULL;
140 int result;
141
142 if (!kobj)
143 parent_sd = &sysfs_root;
144 else
145 parent_sd = kobj->sd;
146
147 result = -ENOENT;
148 sd = sysfs_get_dirent(parent_sd, old);
149 if (!sd)
150 goto out;
151
152 result = -EINVAL;
153 if (sysfs_type(sd) != SYSFS_KOBJ_LINK)
154 goto out;
155 if (sd->s_symlink.target_sd->s_dir.kobj != targ)
156 goto out;
157
158 result = sysfs_rename(sd, parent_sd, new);
159
160out:
161 sysfs_put(sd);
162 return result;
163}
164
126static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, 165static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
127 struct sysfs_dirent *target_sd, char *path) 166 struct sysfs_dirent *target_sd, char *path)
128{ 167{
@@ -210,10 +249,13 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co
210} 249}
211 250
212const struct inode_operations sysfs_symlink_inode_operations = { 251const struct inode_operations sysfs_symlink_inode_operations = {
213 .setxattr = sysfs_setxattr, 252 .setxattr = sysfs_setxattr,
214 .readlink = generic_readlink, 253 .readlink = generic_readlink,
215 .follow_link = sysfs_follow_link, 254 .follow_link = sysfs_follow_link,
216 .put_link = sysfs_put_link, 255 .put_link = sysfs_put_link,
256 .setattr = sysfs_setattr,
257 .getattr = sysfs_getattr,
258 .permission = sysfs_permission,
217}; 259};
218 260
219 261
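sysfs_rename_link() above packages the rename-a-symlink idiom (look up the old
link, verify it really points at the expected target, then rename it in place)
so callers no longer have to remove and recreate the link. A hedged example of
the intended call, with invented kobject names:

        /* A device named "olddev" was renamed; keep the symlink in the
         * parent's directory pointing at the same target kobject. */
        error = sysfs_rename_link(parent_kobj, target_kobj,
                                  "olddev", "newdev");
        /* -ENOENT: no such link; -EINVAL: link exists but does not point
         * at target_kobj; -EEXIST: "newdev" is already taken. */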
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index af4c4e7482ac..30f5a44fb5d3 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -8,6 +8,7 @@
8 * This file is released under the GPLv2. 8 * This file is released under the GPLv2.
9 */ 9 */
10 10
11#include <linux/lockdep.h>
11#include <linux/fs.h> 12#include <linux/fs.h>
12 13
13struct sysfs_open_dirent; 14struct sysfs_open_dirent;
@@ -50,6 +51,9 @@ struct sysfs_inode_attrs {
50struct sysfs_dirent { 51struct sysfs_dirent {
51 atomic_t s_count; 52 atomic_t s_count;
52 atomic_t s_active; 53 atomic_t s_active;
54#ifdef CONFIG_DEBUG_LOCK_ALLOC
55 struct lockdep_map dep_map;
56#endif
53 struct sysfs_dirent *s_parent; 57 struct sysfs_dirent *s_parent;
54 struct sysfs_dirent *s_sibling; 58 struct sysfs_dirent *s_sibling;
55 const char *s_name; 59 const char *s_name;
@@ -62,8 +66,8 @@ struct sysfs_dirent {
62 }; 66 };
63 67
64 unsigned int s_flags; 68 unsigned int s_flags;
69 unsigned short s_mode;
65 ino_t s_ino; 70 ino_t s_ino;
66 umode_t s_mode;
67 struct sysfs_inode_attrs *s_iattr; 71 struct sysfs_inode_attrs *s_iattr;
68}; 72};
69 73
@@ -75,6 +79,7 @@ struct sysfs_dirent {
75#define SYSFS_KOBJ_BIN_ATTR 0x0004 79#define SYSFS_KOBJ_BIN_ATTR 0x0004
76#define SYSFS_KOBJ_LINK 0x0008 80#define SYSFS_KOBJ_LINK 0x0008
77#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) 81#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
82#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
78 83
79#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK 84#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK
80#define SYSFS_FLAG_REMOVED 0x0200 85#define SYSFS_FLAG_REMOVED 0x0200
@@ -84,36 +89,46 @@ static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
84 return sd->s_flags & SYSFS_TYPE_MASK; 89 return sd->s_flags & SYSFS_TYPE_MASK;
85} 90}
86 91
92#ifdef CONFIG_DEBUG_LOCK_ALLOC
93#define sysfs_dirent_init_lockdep(sd) \
94do { \
95 struct attribute *attr = sd->s_attr.attr; \
96 struct lock_class_key *key = attr->key; \
97 if (!key) \
98 key = &attr->skey; \
99 \
100 lockdep_init_map(&sd->dep_map, "s_active", key, 0); \
101} while(0)
102#else
103#define sysfs_dirent_init_lockdep(sd) do {} while(0)
104#endif
105
87/* 106/*
88 * Context structure to be used while adding/removing nodes. 107 * Context structure to be used while adding/removing nodes.
89 */ 108 */
90struct sysfs_addrm_cxt { 109struct sysfs_addrm_cxt {
91 struct sysfs_dirent *parent_sd; 110 struct sysfs_dirent *parent_sd;
92 struct inode *parent_inode;
93 struct sysfs_dirent *removed; 111 struct sysfs_dirent *removed;
94 int cnt;
95}; 112};
96 113
97/* 114/*
98 * mount.c 115 * mount.c
99 */ 116 */
100extern struct sysfs_dirent sysfs_root; 117extern struct sysfs_dirent sysfs_root;
101extern struct super_block *sysfs_sb;
102extern struct kmem_cache *sysfs_dir_cachep; 118extern struct kmem_cache *sysfs_dir_cachep;
103 119
104/* 120/*
105 * dir.c 121 * dir.c
106 */ 122 */
107extern struct mutex sysfs_mutex; 123extern struct mutex sysfs_mutex;
108extern struct mutex sysfs_rename_mutex;
109extern spinlock_t sysfs_assoc_lock; 124extern spinlock_t sysfs_assoc_lock;
110 125
111extern const struct file_operations sysfs_dir_operations; 126extern const struct file_operations sysfs_dir_operations;
112extern const struct inode_operations sysfs_dir_inode_operations; 127extern const struct inode_operations sysfs_dir_inode_operations;
113 128
114struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd); 129struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
115struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd); 130struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
116void sysfs_put_active_two(struct sysfs_dirent *sd); 131void sysfs_put_active(struct sysfs_dirent *sd);
117void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, 132void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
118 struct sysfs_dirent *parent_sd); 133 struct sysfs_dirent *parent_sd);
119int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); 134int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
@@ -133,6 +148,9 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
133 struct sysfs_dirent **p_sd); 148 struct sysfs_dirent **p_sd);
134void sysfs_remove_subdir(struct sysfs_dirent *sd); 149void sysfs_remove_subdir(struct sysfs_dirent *sd);
135 150
151int sysfs_rename(struct sysfs_dirent *sd,
152 struct sysfs_dirent *new_parent_sd, const char *new_name);
153
136static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) 154static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
137{ 155{
138 if (sd) { 156 if (sd) {
@@ -153,9 +171,12 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
153/* 171/*
154 * inode.c 172 * inode.c
155 */ 173 */
156struct inode *sysfs_get_inode(struct sysfs_dirent *sd); 174struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
157void sysfs_delete_inode(struct inode *inode); 175void sysfs_delete_inode(struct inode *inode);
176int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
177int sysfs_permission(struct inode *inode, int mask);
158int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); 178int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
179int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
159int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, 180int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
160 size_t size, int flags); 181 size_t size, int flags);
161int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); 182int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 9824743832a7..4573734d723d 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -26,6 +26,7 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/vfs.h> 28#include <linux/vfs.h>
29#include <linux/writeback.h>
29#include <linux/namei.h> 30#include <linux/namei.h>
30#include <asm/byteorder.h> 31#include <asm/byteorder.h>
31#include "sysv.h" 32#include "sysv.h"
@@ -246,7 +247,7 @@ bad_inode:
246 return ERR_PTR(-EIO); 247 return ERR_PTR(-EIO);
247} 248}
248 249
249int sysv_write_inode(struct inode *inode, int wait) 250static int __sysv_write_inode(struct inode *inode, int wait)
250{ 251{
251 struct super_block * sb = inode->i_sb; 252 struct super_block * sb = inode->i_sb;
252 struct sysv_sb_info * sbi = SYSV_SB(sb); 253 struct sysv_sb_info * sbi = SYSV_SB(sb);
@@ -296,9 +297,14 @@ int sysv_write_inode(struct inode *inode, int wait)
296 return 0; 297 return 0;
297} 298}
298 299
300int sysv_write_inode(struct inode *inode, struct writeback_control *wbc)
301{
302 return __sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
303}
304
299int sysv_sync_inode(struct inode *inode) 305int sysv_sync_inode(struct inode *inode)
300{ 306{
301 return sysv_write_inode(inode, 1); 307 return __sysv_write_inode(inode, 1);
302} 308}
303 309
304static void sysv_delete_inode(struct inode *inode) 310static void sysv_delete_inode(struct inode *inode)
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 53786eb5cf60..94cb9b4d76c2 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -142,7 +142,7 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping,
142 142
143/* inode.c */ 143/* inode.c */
144extern struct inode *sysv_iget(struct super_block *, unsigned int); 144extern struct inode *sysv_iget(struct super_block *, unsigned int);
145extern int sysv_write_inode(struct inode *, int); 145extern int sysv_write_inode(struct inode *, struct writeback_control *wbc);
146extern int sysv_sync_inode(struct inode *); 146extern int sysv_sync_inode(struct inode *);
147extern void sysv_set_inode(struct inode *, dev_t); 147extern void sysv_set_inode(struct inode *, dev_t);
148extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *); 148extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *);
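The sysv conversion above is the standard recipe for the ->write_inode()
signature change: keep the old int-taking body as a static helper and derive
the synchronicity from the writeback control. A generic sketch of the same
adaptation (function names hypothetical):

        #include <linux/fs.h>
        #include <linux/writeback.h>

        static int __example_write_inode(struct inode *inode, int wait)
        {
                /* original body, unchanged */
                return 0;
        }

        int example_write_inode(struct inode *inode,
                                struct writeback_control *wbc)
        {
                /* WB_SYNC_ALL means the caller wants the write on stable
                 * storage before we return, i.e. the old wait=1 case. */
                return __example_write_inode(inode,
                                wbc->sync_mode == WB_SYNC_ALL);
        }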
diff --git a/fs/timerfd.c b/fs/timerfd.c
index b042bd7034b1..98158de91d24 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -14,6 +14,7 @@
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/slab.h>
17#include <linux/list.h> 18#include <linux/list.h>
18#include <linux/spinlock.h> 19#include <linux/spinlock.h>
19#include <linux/time.h> 20#include <linux/time.h>
@@ -200,7 +201,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
200 hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); 201 hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
201 202
202 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, 203 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
203 flags & TFD_SHARED_FCNTL_FLAGS); 204 O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS));
204 if (ufd < 0) 205 if (ufd < 0)
205 kfree(ctx); 206 kfree(ctx);
206 207
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 4775af401167..37fa7ed062d8 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -45,6 +45,7 @@
45 45
46#include <linux/freezer.h> 46#include <linux/freezer.h>
47#include <linux/kthread.h> 47#include <linux/kthread.h>
48#include <linux/slab.h>
48#include "ubifs.h" 49#include "ubifs.h"
49 50
50/** 51/**
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index dbc093afd946..c2a68baa782f 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -34,6 +34,7 @@
34#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
35#include <linux/debugfs.h> 35#include <linux/debugfs.h>
36#include <linux/math64.h> 36#include <linux/math64.h>
37#include <linux/slab.h>
37 38
38#ifdef CONFIG_UBIFS_FS_DEBUG 39#ifdef CONFIG_UBIFS_FS_DEBUG
39 40
@@ -350,13 +351,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
350 le32_to_cpu(sup->fmt_version)); 351 le32_to_cpu(sup->fmt_version));
351 printk(KERN_DEBUG "\ttime_gran %u\n", 352 printk(KERN_DEBUG "\ttime_gran %u\n",
352 le32_to_cpu(sup->time_gran)); 353 le32_to_cpu(sup->time_gran));
353 printk(KERN_DEBUG "\tUUID %02X%02X%02X%02X-%02X%02X" 354 printk(KERN_DEBUG "\tUUID %pUB\n",
354 "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n", 355 sup->uuid);
355 sup->uuid[0], sup->uuid[1], sup->uuid[2], sup->uuid[3],
356 sup->uuid[4], sup->uuid[5], sup->uuid[6], sup->uuid[7],
357 sup->uuid[8], sup->uuid[9], sup->uuid[10], sup->uuid[11],
358 sup->uuid[12], sup->uuid[13], sup->uuid[14],
359 sup->uuid[15]);
360 break; 356 break;
361 } 357 }
362 case UBIFS_MST_NODE: 358 case UBIFS_MST_NODE:
@@ -2014,7 +2010,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
2014 inum = key_inum_flash(c, &dent->key); 2010 inum = key_inum_flash(c, &dent->key);
2015 fscki1 = read_add_inode(c, priv, inum); 2011 fscki1 = read_add_inode(c, priv, inum);
2016 if (IS_ERR(fscki1)) { 2012 if (IS_ERR(fscki1)) {
2017 err = PTR_ERR(fscki); 2013 err = PTR_ERR(fscki1);
2018 ubifs_err("error %d while processing entry node and " 2014 ubifs_err("error %d while processing entry node and "
2019 "trying to find parent inode node %lu", 2015 "trying to find parent inode node %lu",
2020 err, (unsigned long)inum); 2016 err, (unsigned long)inum);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 552fb0111fff..401e503d44a1 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1120,7 +1120,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
1120 if (release) 1120 if (release)
1121 ubifs_release_budget(c, &ino_req); 1121 ubifs_release_budget(c, &ino_req);
1122 if (IS_SYNC(old_inode)) 1122 if (IS_SYNC(old_inode))
1123 err = old_inode->i_sb->s_op->write_inode(old_inode, 1); 1123 err = old_inode->i_sb->s_op->write_inode(old_inode, NULL);
1124 return err; 1124 return err;
1125 1125
1126out_cancel: 1126out_cancel:
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 1009adc8d602..5692cf72b807 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -45,13 +45,14 @@
45 * 45 *
46 * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the 46 * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the
47 * read-ahead path does not lock it ("sys_read -> generic_file_aio_read -> 47 * read-ahead path does not lock it ("sys_read -> generic_file_aio_read ->
48 * ondemand_readahead -> readpage"). In case of readahead, @I_LOCK flag is not 48 * ondemand_readahead -> readpage"). In case of readahead, @I_SYNC flag is not
49 * set as well. However, UBIFS disables readahead. 49 * set as well. However, UBIFS disables readahead.
50 */ 50 */
51 51
52#include "ubifs.h" 52#include "ubifs.h"
53#include <linux/mount.h> 53#include <linux/mount.h>
54#include <linux/namei.h> 54#include <linux/namei.h>
55#include <linux/slab.h>
55 56
56static int read_block(struct inode *inode, void *addr, unsigned int block, 57static int read_block(struct inode *inode, void *addr, unsigned int block,
57 struct ubifs_data_node *dn) 58 struct ubifs_data_node *dn)
@@ -1011,7 +1012,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
1011 /* Is the page fully inside @i_size? */ 1012 /* Is the page fully inside @i_size? */
1012 if (page->index < end_index) { 1013 if (page->index < end_index) {
1013 if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) { 1014 if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) {
1014 err = inode->i_sb->s_op->write_inode(inode, 1); 1015 err = inode->i_sb->s_op->write_inode(inode, NULL);
1015 if (err) 1016 if (err)
1016 goto out_unlock; 1017 goto out_unlock;
1017 /* 1018 /*
@@ -1039,7 +1040,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
1039 kunmap_atomic(kaddr, KM_USER0); 1040 kunmap_atomic(kaddr, KM_USER0);
1040 1041
1041 if (i_size > synced_i_size) { 1042 if (i_size > synced_i_size) {
1042 err = inode->i_sb->s_op->write_inode(inode, 1); 1043 err = inode->i_sb->s_op->write_inode(inode, NULL);
1043 if (err) 1044 if (err)
1044 goto out_unlock; 1045 goto out_unlock;
1045 } 1046 }
@@ -1242,7 +1243,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
1242 if (release) 1243 if (release)
1243 ubifs_release_budget(c, &req); 1244 ubifs_release_budget(c, &req);
1244 if (IS_SYNC(inode)) 1245 if (IS_SYNC(inode))
1245 err = inode->i_sb->s_op->write_inode(inode, 1); 1246 err = inode->i_sb->s_op->write_inode(inode, NULL);
1246 return err; 1247 return err;
1247 1248
1248out: 1249out:
@@ -1316,7 +1317,7 @@ int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1316 * the inode unless this is a 'datasync()' call. 1317 * the inode unless this is a 'datasync()' call.
1317 */ 1318 */
1318 if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { 1319 if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
1319 err = inode->i_sb->s_op->write_inode(inode, 1); 1320 err = inode->i_sb->s_op->write_inode(inode, NULL);
1320 if (err) 1321 if (err)
1321 return err; 1322 return err;
1322 } 1323 }
@@ -1389,7 +1390,6 @@ static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
1389 unsigned long nr_segs, loff_t pos) 1390 unsigned long nr_segs, loff_t pos)
1390{ 1391{
1391 int err; 1392 int err;
1392 ssize_t ret;
1393 struct inode *inode = iocb->ki_filp->f_mapping->host; 1393 struct inode *inode = iocb->ki_filp->f_mapping->host;
1394 struct ubifs_info *c = inode->i_sb->s_fs_info; 1394 struct ubifs_info *c = inode->i_sb->s_fs_info;
1395 1395
@@ -1397,17 +1397,7 @@ static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
1397 if (err) 1397 if (err)
1398 return err; 1398 return err;
1399 1399
1400 ret = generic_file_aio_write(iocb, iov, nr_segs, pos); 1400 return generic_file_aio_write(iocb, iov, nr_segs, pos);
1401 if (ret < 0)
1402 return ret;
1403
1404 if (ret > 0 && (IS_SYNC(inode) || iocb->ki_filp->f_flags & O_SYNC)) {
1405 err = ubifs_sync_wbufs_by_inode(c, inode);
1406 if (err)
1407 return err;
1408 }
1409
1410 return ret;
1411} 1401}
1412 1402
1413static int ubifs_set_page_dirty(struct page *page) 1403static int ubifs_set_page_dirty(struct page *page)
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 618c2701d3a7..918d1582ca05 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -53,7 +53,9 @@
53 * good, and GC takes extra care when moving them. 53 * good, and GC takes extra care when moving them.
54 */ 54 */
55 55
56#include <linux/slab.h>
56#include <linux/pagemap.h> 57#include <linux/pagemap.h>
58#include <linux/list_sort.h>
57#include "ubifs.h" 59#include "ubifs.h"
58 60
59/* 61/*
@@ -108,101 +110,6 @@ static int switch_gc_head(struct ubifs_info *c)
108} 110}
109 111
110/** 112/**
111 * list_sort - sort a list.
112 * @priv: private data, passed to @cmp
113 * @head: the list to sort
114 * @cmp: the elements comparison function
115 *
116 * This function has been implemented by Mark J Roberts <mjr@znex.org>. It
117 * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted
118 * in ascending order.
119 *
120 * The comparison function @cmp is supposed to return a negative value if @a is
121 * than @b, and a positive value if @a is greater than @b. If @a and @b are
122 * equivalent, then it does not matter what this function returns.
123 */
124static void list_sort(void *priv, struct list_head *head,
125 int (*cmp)(void *priv, struct list_head *a,
126 struct list_head *b))
127{
128 struct list_head *p, *q, *e, *list, *tail, *oldhead;
129 int insize, nmerges, psize, qsize, i;
130
131 if (list_empty(head))
132 return;
133
134 list = head->next;
135 list_del(head);
136 insize = 1;
137 for (;;) {
138 p = oldhead = list;
139 list = tail = NULL;
140 nmerges = 0;
141
142 while (p) {
143 nmerges++;
144 q = p;
145 psize = 0;
146 for (i = 0; i < insize; i++) {
147 psize++;
148 q = q->next == oldhead ? NULL : q->next;
149 if (!q)
150 break;
151 }
152
153 qsize = insize;
154 while (psize > 0 || (qsize > 0 && q)) {
155 if (!psize) {
156 e = q;
157 q = q->next;
158 qsize--;
159 if (q == oldhead)
160 q = NULL;
161 } else if (!qsize || !q) {
162 e = p;
163 p = p->next;
164 psize--;
165 if (p == oldhead)
166 p = NULL;
167 } else if (cmp(priv, p, q) <= 0) {
168 e = p;
169 p = p->next;
170 psize--;
171 if (p == oldhead)
172 p = NULL;
173 } else {
174 e = q;
175 q = q->next;
176 qsize--;
177 if (q == oldhead)
178 q = NULL;
179 }
180 if (tail)
181 tail->next = e;
182 else
183 list = e;
184 e->prev = tail;
185 tail = e;
186 }
187 p = q;
188 }
189
190 tail->next = list;
191 list->prev = tail;
192
193 if (nmerges <= 1)
194 break;
195
196 insize *= 2;
197 }
198
199 head->next = list;
200 head->prev = list->prev;
201 list->prev->next = head;
202 list->prev = head;
203}
204
205/**
206 * data_nodes_cmp - compare 2 data nodes. 113 * data_nodes_cmp - compare 2 data nodes.
207 * @priv: UBIFS file-system description object 114 * @priv: UBIFS file-system description object
208 * @a: first data node 115 * @a: first data node
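With the private copy above deleted, UBIFS now uses the shared lib/list_sort.c
implementation via <linux/list_sort.h>; the calling convention is unchanged. A
minimal sketch of a caller (the element type is invented):

        #include <linux/list.h>
        #include <linux/list_sort.h>

        struct item {
                struct list_head list;
                int key;
        };

        /* Return a negative value if @a sorts before @b, positive if
         * after; ties may return anything. */
        static int item_cmp(void *priv, struct list_head *a,
                            struct list_head *b)
        {
                struct item *ia = list_entry(a, struct item, list);
                struct item *ib = list_entry(b, struct item, list);

                return ia->key - ib->key;
        }

        static void sort_items(struct list_head *items)
        {
                /* priv is passed straight through to the cmp callback;
                 * NULL is fine when the comparison needs no context. */
                list_sort(NULL, items, item_cmp);
        }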
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index e589fedaf1ef..77d5cf4a7547 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -51,6 +51,7 @@
51 */ 51 */
52 52
53#include <linux/crc32.h> 53#include <linux/crc32.h>
54#include <linux/slab.h>
54#include "ubifs.h" 55#include "ubifs.h"
55 56
56/** 57/**
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index b2792e84d245..ad7f67b827ea 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -46,6 +46,7 @@
46#include "ubifs.h" 46#include "ubifs.h"
47#include <linux/crc16.h> 47#include <linux/crc16.h>
48#include <linux/math64.h> 48#include <linux/math64.h>
49#include <linux/slab.h>
49 50
50/** 51/**
51 * do_calc_lpt_geom - calculate sizes for the LPT area. 52 * do_calc_lpt_geom - calculate sizes for the LPT area.
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 8cbfb8248025..13cb7a4237bf 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -26,6 +26,7 @@
26 */ 26 */
27 27
28#include <linux/crc16.h> 28#include <linux/crc16.h>
29#include <linux/slab.h>
29#include "ubifs.h" 30#include "ubifs.h"
30 31
31/** 32/**
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index f94ddf7efba0..109c6ea03bb5 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -23,7 +23,7 @@
23/* 23/*
24 * This file implements functions needed to recover from unclean un-mounts. 24 * This file implements functions needed to recover from unclean un-mounts.
25 * When UBIFS is mounted, it checks a flag on the master node to determine if 25 * When UBIFS is mounted, it checks a flag on the master node to determine if
26 * an un-mount was completed sucessfully. If not, the process of mounting 26 * an un-mount was completed successfully. If not, the process of mounting
 27 * incorporates additional checking and fixing of on-flash data structures. 27 * incorporates additional checking and fixing of on-flash data structures.
28 * UBIFS always cleans away all remnants of an unclean un-mount, so that 28 * UBIFS always cleans away all remnants of an unclean un-mount, so that
29 * errors do not accumulate. However UBIFS defers recovery if it is mounted 29 * errors do not accumulate. However UBIFS defers recovery if it is mounted
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <linux/slab.h>
34#include "ubifs.h" 35#include "ubifs.h"
35 36
36/** 37/**
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 57085e43320f..96cb62c8a9dd 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -27,6 +27,7 @@
27 */ 27 */
28 28
29#include "ubifs.h" 29#include "ubifs.h"
30#include <linux/slab.h>
30#include <linux/random.h> 31#include <linux/random.h>
31#include <linux/math64.h> 32#include <linux/math64.h>
32 33
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 333e181ee987..4d2f2157dd3f 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -283,7 +283,7 @@ static void ubifs_destroy_inode(struct inode *inode)
283/* 283/*
284 * Note, Linux write-back code calls this without 'i_mutex'. 284 * Note, Linux write-back code calls this without 'i_mutex'.
285 */ 285 */
286static int ubifs_write_inode(struct inode *inode, int wait) 286static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
287{ 287{
288 int err = 0; 288 int err = 0;
289 struct ubifs_info *c = inode->i_sb->s_fs_info; 289 struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -1393,12 +1393,7 @@ static int mount_ubifs(struct ubifs_info *c)
1393 c->leb_size, c->leb_size >> 10); 1393 c->leb_size, c->leb_size >> 10);
1394 dbg_msg("data journal heads: %d", 1394 dbg_msg("data journal heads: %d",
1395 c->jhead_cnt - NONDATA_JHEADS_CNT); 1395 c->jhead_cnt - NONDATA_JHEADS_CNT);
1396 dbg_msg("UUID: %02X%02X%02X%02X-%02X%02X" 1396 dbg_msg("UUID: %pUB", c->uuid);
1397 "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X",
1398 c->uuid[0], c->uuid[1], c->uuid[2], c->uuid[3],
1399 c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7],
1400 c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11],
1401 c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]);
1402 dbg_msg("big_lpt %d", c->big_lpt); 1397 dbg_msg("big_lpt %d", c->big_lpt);
1403 dbg_msg("log LEBs: %d (%d - %d)", 1398 dbg_msg("log LEBs: %d (%d - %d)",
1404 c->log_lebs, UBIFS_LOG_LNUM, c->log_last); 1399 c->log_lebs, UBIFS_LOG_LNUM, c->log_last);
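The UUID hunk above replaces an open-coded sixteen-argument hex dump with the %pU pointer extension that vsprintf gained around 2.6.33. A small illustrative wrapper (show_uuid() is made up; the format semantics are the kernel's):

static void show_uuid(const u8 uuid[16])
{
	/* %pUb/%pUB print the 16 bytes in stored (big endian) order,
	 * %pUl/%pUL byte-swap the first three fields; an upper case
	 * letter selects upper case hex digits. */
	pr_info("UUID: %pUB\n", uuid);
}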
@@ -1842,22 +1837,32 @@ const struct super_operations ubifs_super_operations = {
1842 * @name: UBI volume name 1837 * @name: UBI volume name
1843 * @mode: UBI volume open mode 1838 * @mode: UBI volume open mode
1844 * 1839 *
1845 * There are several ways to specify UBI volumes when mounting UBIFS: 1840 * The primary method of mounting UBIFS is by specifying the UBI volume
1846 * o ubiX_Y - UBI device number X, volume Y; 1841 * character device node path. However, UBIFS may also be mounted without any
1847 * o ubiY - UBI device number 0, volume Y; 1842 * character device node using one of the following methods:
1843 *
1844 * o ubiX_Y - mount UBI device number X, volume Y;
1845 * o ubiY - mount UBI device number 0, volume Y;
1848 * o ubiX:NAME - mount UBI device X, volume with name NAME; 1846 * o ubiX:NAME - mount UBI device X, volume with name NAME;
1849 * o ubi:NAME - mount UBI device 0, volume with name NAME. 1847 * o ubi:NAME - mount UBI device 0, volume with name NAME.
1850 * 1848 *
1851 * Alternative '!' separator may be used instead of ':' (because some shells 1849 * Alternative '!' separator may be used instead of ':' (because some shells
1852 * like busybox may interpret ':' as an NFS host name separator). This function 1850 * like busybox may interpret ':' as an NFS host name separator). This function
1853 * returns ubi volume object in case of success and a negative error code in 1851 * returns UBI volume description object in case of success and a negative
1854 * case of failure. 1852 * error code in case of failure.
1855 */ 1853 */
1856static struct ubi_volume_desc *open_ubi(const char *name, int mode) 1854static struct ubi_volume_desc *open_ubi(const char *name, int mode)
1857{ 1855{
1856 struct ubi_volume_desc *ubi;
1858 int dev, vol; 1857 int dev, vol;
1859 char *endptr; 1858 char *endptr;
1860 1859
1860 /* First, try to open using the device node path method */
1861 ubi = ubi_open_volume_path(name, mode);
1862 if (!IS_ERR(ubi))
1863 return ubi;
1864
1865 /* Try the "nodev" method */
1861 if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i') 1866 if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i')
1862 return ERR_PTR(-EINVAL); 1867 return ERR_PTR(-EINVAL);
1863 1868
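The comment block above spells out the "nodev" name grammar. A standalone userspace sketch of that grammar under simplified assumptions (struct ubi_spec and parse_ubi_name() are illustrative, not kernel API; the real open_ubi() resolves the parsed values through ubi_open_volume() and ubi_open_volume_nm()):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct ubi_spec {
	int dev;		/* UBI device number */
	int vol;		/* volume id, -1 if named */
	const char *name;	/* volume name, NULL if numbered */
};

static int parse_ubi_name(const char *s, struct ubi_spec *out)
{
	char *end;

	if (strncmp(s, "ubi", 3))
		return -1;			/* not a nodev spec */
	s += 3;
	out->dev = 0;
	out->vol = -1;
	out->name = NULL;

	if (*s >= '0' && *s <= '9') {
		long n = strtol(s, &end, 0);

		if (!*end) {			/* "ubiY": device 0 */
			out->vol = n;
			return 0;
		}
		out->dev = n;
		s = end;
	}
	if (*s == '_') {			/* "ubiX_Y" */
		out->vol = strtol(s + 1, &end, 0);
		return *end ? -1 : 0;
	}
	if (*s == ':' || *s == '!') {		/* "ubiX:NAME", "ubi!NAME" */
		out->name = s + 1;
		return *out->name ? 0 : -1;
	}
	return -1;
}

int main(void)
{
	struct ubi_spec spec;

	if (!parse_ubi_name("ubi0_3", &spec))
		printf("dev %d vol %d\n", spec.dev, spec.vol);
	return 0;
}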
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e5b1a7d00fa0..2194915220e5 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <linux/slab.h>
34#include "ubifs.h" 35#include "ubifs.h"
35 36
36/* 37/*
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index b2d976366a46..bd2542dad014 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -28,6 +28,7 @@
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/err.h> 29#include <linux/err.h>
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/slab.h>
31#include <linux/vmalloc.h> 32#include <linux/vmalloc.h>
32#include <linux/spinlock.h> 33#include <linux/spinlock.h>
33#include <linux/mutex.h> 34#include <linux/mutex.h>
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 195830f47569..c74400f88fe0 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -56,6 +56,7 @@
56 */ 56 */
57 57
58#include "ubifs.h" 58#include "ubifs.h"
59#include <linux/slab.h>
59#include <linux/xattr.h> 60#include <linux/xattr.h>
60#include <linux/posix_acl_xattr.h> 61#include <linux/posix_acl_xattr.h>
61 62
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 1e068535b58b..19626e2491c4 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -31,55 +31,8 @@
31#define udf_clear_bit(nr, addr) ext2_clear_bit(nr, addr) 31#define udf_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
32#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr) 32#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr)
33#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr) 33#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr)
34#define udf_find_first_one_bit(addr, size) find_first_one_bit(addr, size)
35#define udf_find_next_one_bit(addr, size, offset) \ 34#define udf_find_next_one_bit(addr, size, offset) \
36 find_next_one_bit(addr, size, offset) 35 ext2_find_next_bit(addr, size, offset)
37
38#define leBPL_to_cpup(x) leNUM_to_cpup(BITS_PER_LONG, x)
39#define leNUM_to_cpup(x, y) xleNUM_to_cpup(x, y)
40#define xleNUM_to_cpup(x, y) (le ## x ## _to_cpup(y))
41#define uintBPL_t uint(BITS_PER_LONG)
42#define uint(x) xuint(x)
43#define xuint(x) __le ## x
44
45static inline int find_next_one_bit(void *addr, int size, int offset)
46{
47 uintBPL_t *p = ((uintBPL_t *) addr) + (offset / BITS_PER_LONG);
48 int result = offset & ~(BITS_PER_LONG - 1);
49 unsigned long tmp;
50
51 if (offset >= size)
52 return size;
53 size -= result;
54 offset &= (BITS_PER_LONG - 1);
55 if (offset) {
56 tmp = leBPL_to_cpup(p++);
57 tmp &= ~0UL << offset;
58 if (size < BITS_PER_LONG)
59 goto found_first;
60 if (tmp)
61 goto found_middle;
62 size -= BITS_PER_LONG;
63 result += BITS_PER_LONG;
64 }
65 while (size & ~(BITS_PER_LONG - 1)) {
66 tmp = leBPL_to_cpup(p++);
67 if (tmp)
68 goto found_middle;
69 result += BITS_PER_LONG;
70 size -= BITS_PER_LONG;
71 }
72 if (!size)
73 return result;
74 tmp = leBPL_to_cpup(p);
75found_first:
76 tmp &= ~0UL >> (BITS_PER_LONG - size);
77found_middle:
78 return result + ffz(~tmp);
79}
80
81#define find_first_one_bit(addr, size)\
82 find_next_one_bit((addr), (size), 0)
83 36
84static int read_block_bitmap(struct super_block *sb, 37static int read_block_bitmap(struct super_block *sb,
85 struct udf_bitmap *bitmap, unsigned int block, 38 struct udf_bitmap *bitmap, unsigned int block,
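The helper removed above duplicated functionality that ext2_find_next_bit() already provides: finding the next set bit in a little-endian on-disk bitmap. A byte-wise sketch of the same semantics, for clarity only (the real helper works a word at a time and is arch-optimised):

/* Returns the index of the next set bit at or after 'offset', or
 * 'size' if no set bit remains. */
static int next_one_bit_le(const unsigned char *addr, int size, int offset)
{
	int bit;

	for (bit = offset; bit < size; bit++)
		if (addr[bit >> 3] & (1 << (bit & 7)))
			return bit;
	return size;
}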
@@ -208,7 +161,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
208 ((char *)bh->b_data)[(bit + i) >> 3]); 161 ((char *)bh->b_data)[(bit + i) >> 3]);
209 } else { 162 } else {
210 if (inode) 163 if (inode)
211 vfs_dq_free_block(inode, 1); 164 dquot_free_block(inode, 1);
212 udf_add_free_space(sb, sbi->s_partition, 1); 165 udf_add_free_space(sb, sbi->s_partition, 1);
213 } 166 }
214 } 167 }
@@ -260,11 +213,11 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
260 while (bit < (sb->s_blocksize << 3) && block_count > 0) { 213 while (bit < (sb->s_blocksize << 3) && block_count > 0) {
261 if (!udf_test_bit(bit, bh->b_data)) 214 if (!udf_test_bit(bit, bh->b_data))
262 goto out; 215 goto out;
263 else if (vfs_dq_prealloc_block(inode, 1)) 216 else if (dquot_prealloc_block(inode, 1))
264 goto out; 217 goto out;
265 else if (!udf_clear_bit(bit, bh->b_data)) { 218 else if (!udf_clear_bit(bit, bh->b_data)) {
266 udf_debug("bit already cleared for block %d\n", bit); 219 udf_debug("bit already cleared for block %d\n", bit);
267 vfs_dq_free_block(inode, 1); 220 dquot_free_block(inode, 1);
268 goto out; 221 goto out;
269 } 222 }
270 block_count--; 223 block_count--;
@@ -390,10 +343,14 @@ got_block:
390 /* 343 /*
391 * Check quota for allocation of this block. 344 * Check quota for allocation of this block.
392 */ 345 */
393 if (inode && vfs_dq_alloc_block(inode, 1)) { 346 if (inode) {
394 mutex_unlock(&sbi->s_alloc_mutex); 347 int ret = dquot_alloc_block(inode, 1);
395 *err = -EDQUOT; 348
396 return 0; 349 if (ret) {
350 mutex_unlock(&sbi->s_alloc_mutex);
351 *err = ret;
352 return 0;
353 }
397 } 354 }
398 355
399 newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) - 356 newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) -
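This hunk shows the calling-convention change the quota rework applies throughout fs/: vfs_dq_alloc_block() returned a boolean failure and every caller hard-coded -EDQUOT, whereas dquot_alloc_block() returns the real error code for the caller to propagate. A hedged sketch of the new shape (alloc_quota_checked() is illustrative):

#include <linux/fs.h>
#include <linux/quotaops.h>

static int alloc_quota_checked(struct inode *inode, qsize_t nr)
{
	int ret = dquot_alloc_block(inode, nr);

	if (ret)
		return ret;	/* -EDQUOT, -EIO, ... not a guess */
	/* ... proceed with the on-disk block allocation ... */
	return 0;
}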
@@ -440,7 +397,7 @@ static void udf_table_free_blocks(struct super_block *sb,
440 (bloc->logicalBlockNum + count) > 397 (bloc->logicalBlockNum + count) >
441 partmap->s_partition_len) { 398 partmap->s_partition_len) {
442 udf_debug("%d < %d || %d + %d > %d\n", 399 udf_debug("%d < %d || %d + %d > %d\n",
443 bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count, 400 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count,
444 partmap->s_partition_len); 401 partmap->s_partition_len);
445 goto error_return; 402 goto error_return;
446 } 403 }
@@ -449,7 +406,7 @@ static void udf_table_free_blocks(struct super_block *sb,
449 /* We do this up front - There are some error conditions that 406 /* We do this up front - There are some error conditions that
450 could occur, but.. oh well */ 407 could occur, but.. oh well */
451 if (inode) 408 if (inode)
452 vfs_dq_free_block(inode, count); 409 dquot_free_block(inode, count);
453 udf_add_free_space(sb, sbi->s_partition, count); 410 udf_add_free_space(sb, sbi->s_partition, count);
454 411
455 start = bloc->logicalBlockNum + offset; 412 start = bloc->logicalBlockNum + offset;
@@ -547,7 +504,7 @@ static void udf_table_free_blocks(struct super_block *sb,
547 } 504 }
548 505
549 if (epos.offset + (2 * adsize) > sb->s_blocksize) { 506 if (epos.offset + (2 * adsize) > sb->s_blocksize) {
550 char *sptr, *dptr; 507 unsigned char *sptr, *dptr;
551 int loffset; 508 int loffset;
552 509
553 brelse(oepos.bh); 510 brelse(oepos.bh);
@@ -694,7 +651,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
694 epos.offset -= adsize; 651 epos.offset -= adsize;
695 652
696 alloc_count = (elen >> sb->s_blocksize_bits); 653 alloc_count = (elen >> sb->s_blocksize_bits);
697 if (inode && vfs_dq_prealloc_block(inode, 654 if (inode && dquot_prealloc_block(inode,
698 alloc_count > block_count ? block_count : alloc_count)) 655 alloc_count > block_count ? block_count : alloc_count))
699 alloc_count = 0; 656 alloc_count = 0;
700 else if (alloc_count > block_count) { 657 else if (alloc_count > block_count) {
@@ -797,12 +754,13 @@ static int udf_table_new_block(struct super_block *sb,
797 newblock = goal_eloc.logicalBlockNum; 754 newblock = goal_eloc.logicalBlockNum;
798 goal_eloc.logicalBlockNum++; 755 goal_eloc.logicalBlockNum++;
799 goal_elen -= sb->s_blocksize; 756 goal_elen -= sb->s_blocksize;
800 757 if (inode) {
801 if (inode && vfs_dq_alloc_block(inode, 1)) { 758 *err = dquot_alloc_block(inode, 1);
802 brelse(goal_epos.bh); 759 if (*err) {
803 mutex_unlock(&sbi->s_alloc_mutex); 760 brelse(goal_epos.bh);
804 *err = -EDQUOT; 761 mutex_unlock(&sbi->s_alloc_mutex);
805 return 0; 762 return 0;
763 }
806 } 764 }
807 765
808 if (goal_elen) 766 if (goal_elen)
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 61d9a76a3a69..f0f2a436251e 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -45,8 +45,8 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
45 int block, iblock; 45 int block, iblock;
46 loff_t nf_pos = (filp->f_pos - 1) << 2; 46 loff_t nf_pos = (filp->f_pos - 1) << 2;
47 int flen; 47 int flen;
48 char *fname = NULL; 48 unsigned char *fname = NULL;
49 char *nameptr; 49 unsigned char *nameptr;
50 uint16_t liu; 50 uint16_t liu;
51 uint8_t lfi; 51 uint8_t lfi;
52 loff_t size = udf_ext0_offset(dir) + dir->i_size; 52 loff_t size = udf_ext0_offset(dir) + dir->i_size;
diff --git a/fs/udf/file.c b/fs/udf/file.c
index b80cbd78833c..1eb06774ed90 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -34,6 +34,7 @@
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/smp_lock.h> 35#include <linux/smp_lock.h>
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/quotaops.h>
37#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
38#include <linux/aio.h> 39#include <linux/aio.h>
39 40
@@ -196,6 +197,7 @@ static int udf_release_file(struct inode *inode, struct file *filp)
196 mutex_lock(&inode->i_mutex); 197 mutex_lock(&inode->i_mutex);
197 lock_kernel(); 198 lock_kernel();
198 udf_discard_prealloc(inode); 199 udf_discard_prealloc(inode);
200 udf_truncate_tail_extent(inode);
199 unlock_kernel(); 201 unlock_kernel();
200 mutex_unlock(&inode->i_mutex); 202 mutex_unlock(&inode->i_mutex);
201 } 203 }
@@ -206,7 +208,7 @@ const struct file_operations udf_file_operations = {
206 .read = do_sync_read, 208 .read = do_sync_read,
207 .aio_read = generic_file_aio_read, 209 .aio_read = generic_file_aio_read,
208 .ioctl = udf_ioctl, 210 .ioctl = udf_ioctl,
209 .open = generic_file_open, 211 .open = dquot_file_open,
210 .mmap = generic_file_mmap, 212 .mmap = generic_file_mmap,
211 .write = do_sync_write, 213 .write = do_sync_write,
212 .aio_write = udf_file_aio_write, 214 .aio_write = udf_file_aio_write,
@@ -216,6 +218,29 @@ const struct file_operations udf_file_operations = {
216 .llseek = generic_file_llseek, 218 .llseek = generic_file_llseek,
217}; 219};
218 220
221static int udf_setattr(struct dentry *dentry, struct iattr *iattr)
222{
223 struct inode *inode = dentry->d_inode;
224 int error;
225
226 error = inode_change_ok(inode, iattr);
227 if (error)
228 return error;
229
230 if (iattr->ia_valid & ATTR_SIZE)
231 dquot_initialize(inode);
232
233 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
234 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
235 error = dquot_transfer(inode, iattr);
236 if (error)
237 return error;
238 }
239
240 return inode_setattr(inode, iattr);
241}
242
219const struct inode_operations udf_file_inode_operations = { 243const struct inode_operations udf_file_inode_operations = {
220 .truncate = udf_truncate, 244 .truncate = udf_truncate,
245 .setattr = udf_setattr,
221}; 246};
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index c10fa39f97e2..fb68c9cd0c3e 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -36,8 +36,8 @@ void udf_free_inode(struct inode *inode)
36 * Note: we must free any quota before locking the superblock, 36 * Note: we must free any quota before locking the superblock,
37 * as writing the quota to disk may need the lock as well. 37 * as writing the quota to disk may need the lock as well.
38 */ 38 */
39 vfs_dq_free_inode(inode); 39 dquot_free_inode(inode);
40 vfs_dq_drop(inode); 40 dquot_drop(inode);
41 41
42 clear_inode(inode); 42 clear_inode(inode);
43 43
@@ -61,7 +61,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
61 struct super_block *sb = dir->i_sb; 61 struct super_block *sb = dir->i_sb;
62 struct udf_sb_info *sbi = UDF_SB(sb); 62 struct udf_sb_info *sbi = UDF_SB(sb);
63 struct inode *inode; 63 struct inode *inode;
64 int block; 64 int block, ret;
65 uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; 65 uint32_t start = UDF_I(dir)->i_location.logicalBlockNum;
66 struct udf_inode_info *iinfo; 66 struct udf_inode_info *iinfo;
67 struct udf_inode_info *dinfo = UDF_I(dir); 67 struct udf_inode_info *dinfo = UDF_I(dir);
@@ -153,12 +153,14 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
153 insert_inode_hash(inode); 153 insert_inode_hash(inode);
154 mark_inode_dirty(inode); 154 mark_inode_dirty(inode);
155 155
156 if (vfs_dq_alloc_inode(inode)) { 156 dquot_initialize(inode);
157 vfs_dq_drop(inode); 157 ret = dquot_alloc_inode(inode);
158 if (ret) {
159 dquot_drop(inode);
158 inode->i_flags |= S_NOQUOTA; 160 inode->i_flags |= S_NOQUOTA;
159 inode->i_nlink = 0; 161 inode->i_nlink = 0;
160 iput(inode); 162 iput(inode);
161 *err = -EDQUOT; 163 *err = ret;
162 return NULL; 164 return NULL;
163 } 165 }
164 166
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 6d24c2c63f93..bb863fe579ac 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -36,6 +36,7 @@
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/quotaops.h>
39#include <linux/slab.h> 40#include <linux/slab.h>
40#include <linux/crc-itu-t.h> 41#include <linux/crc-itu-t.h>
41 42
@@ -70,6 +71,9 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
70 71
71void udf_delete_inode(struct inode *inode) 72void udf_delete_inode(struct inode *inode)
72{ 73{
74 if (!is_bad_inode(inode))
75 dquot_initialize(inode);
76
73 truncate_inode_pages(&inode->i_data, 0); 77 truncate_inode_pages(&inode->i_data, 0);
74 78
75 if (is_bad_inode(inode)) 79 if (is_bad_inode(inode))
@@ -97,15 +101,19 @@ no_delete:
97 */ 101 */
98void udf_clear_inode(struct inode *inode) 102void udf_clear_inode(struct inode *inode)
99{ 103{
100 struct udf_inode_info *iinfo; 104 struct udf_inode_info *iinfo = UDF_I(inode);
101 if (!(inode->i_sb->s_flags & MS_RDONLY)) { 105
102 lock_kernel(); 106 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
103 udf_truncate_tail_extent(inode); 107 inode->i_size != iinfo->i_lenExtents) {
104 unlock_kernel(); 108 printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has "
105 write_inode_now(inode, 0); 109 "inode size %llu different from extent length %llu. "
106 invalidate_inode_buffers(inode); 110 "Filesystem need not be standards compliant.\n",
111 inode->i_sb->s_id, inode->i_ino, inode->i_mode,
112 (unsigned long long)inode->i_size,
113 (unsigned long long)iinfo->i_lenExtents);
107 } 114 }
108 iinfo = UDF_I(inode); 115
116 dquot_drop(inode);
109 kfree(iinfo->i_ext.i_data); 117 kfree(iinfo->i_ext.i_data);
110 iinfo->i_ext.i_data = NULL; 118 iinfo->i_ext.i_data = NULL;
111} 119}
@@ -198,7 +206,6 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
198 int newblock; 206 int newblock;
199 struct buffer_head *dbh = NULL; 207 struct buffer_head *dbh = NULL;
200 struct kernel_lb_addr eloc; 208 struct kernel_lb_addr eloc;
201 uint32_t elen;
202 uint8_t alloctype; 209 uint8_t alloctype;
203 struct extent_position epos; 210 struct extent_position epos;
204 211
@@ -273,12 +280,11 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
273 eloc.logicalBlockNum = *block; 280 eloc.logicalBlockNum = *block;
274 eloc.partitionReferenceNum = 281 eloc.partitionReferenceNum =
275 iinfo->i_location.partitionReferenceNum; 282 iinfo->i_location.partitionReferenceNum;
276 elen = inode->i_sb->s_blocksize; 283 iinfo->i_lenExtents = inode->i_size;
277 iinfo->i_lenExtents = elen;
278 epos.bh = NULL; 284 epos.bh = NULL;
279 epos.block = iinfo->i_location; 285 epos.block = iinfo->i_location;
280 epos.offset = udf_file_entry_alloc_offset(inode); 286 epos.offset = udf_file_entry_alloc_offset(inode);
281 udf_add_aext(inode, &epos, &eloc, elen, 0); 287 udf_add_aext(inode, &epos, &eloc, inode->i_size, 0);
282 /* UniqueID stuff */ 288 /* UniqueID stuff */
283 289
284 brelse(epos.bh); 290 brelse(epos.bh);
@@ -1373,12 +1379,12 @@ static mode_t udf_convert_permissions(struct fileEntry *fe)
1373 return mode; 1379 return mode;
1374} 1380}
1375 1381
1376int udf_write_inode(struct inode *inode, int sync) 1382int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
1377{ 1383{
1378 int ret; 1384 int ret;
1379 1385
1380 lock_kernel(); 1386 lock_kernel();
1381 ret = udf_update_inode(inode, sync); 1387 ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1382 unlock_kernel(); 1388 unlock_kernel();
1383 1389
1384 return ret; 1390 return ret;
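The same prototype change recurs across ubifs, udf and ufs in this series: ->write_inode() now receives the writeback_control the VFS already holds, and the old 'wait'/'sync' flag is derived from it. A minimal sketch (the foofs_* names are illustrative):

#include <linux/fs.h>
#include <linux/writeback.h>

static int foofs_update_inode(struct inode *inode, int do_sync);

static int foofs_write_inode(struct inode *inode,
			     struct writeback_control *wbc)
{
	/* WB_SYNC_ALL is what the old 'wait != 0' used to mean */
	return foofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
}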
@@ -1402,20 +1408,19 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1402 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 1408 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
1403 struct udf_inode_info *iinfo = UDF_I(inode); 1409 struct udf_inode_info *iinfo = UDF_I(inode);
1404 1410
1405 bh = udf_tread(inode->i_sb, 1411 bh = udf_tgetblk(inode->i_sb,
1406 udf_get_lb_pblock(inode->i_sb, 1412 udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0));
1407 &iinfo->i_location, 0));
1408 if (!bh) { 1413 if (!bh) {
1409 udf_debug("bread failure\n"); 1414 udf_debug("getblk failure\n");
1410 return -EIO; 1415 return -ENOMEM;
1411 } 1416 }
1412 1417
1413 memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); 1418 lock_buffer(bh);
1414 1419 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1415 fe = (struct fileEntry *)bh->b_data; 1420 fe = (struct fileEntry *)bh->b_data;
1416 efe = (struct extendedFileEntry *)bh->b_data; 1421 efe = (struct extendedFileEntry *)bh->b_data;
1417 1422
1418 if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) { 1423 if (iinfo->i_use) {
1419 struct unallocSpaceEntry *use = 1424 struct unallocSpaceEntry *use =
1420 (struct unallocSpaceEntry *)bh->b_data; 1425 (struct unallocSpaceEntry *)bh->b_data;
1421 1426
@@ -1423,20 +1428,18 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1423 memcpy(bh->b_data + sizeof(struct unallocSpaceEntry), 1428 memcpy(bh->b_data + sizeof(struct unallocSpaceEntry),
1424 iinfo->i_ext.i_data, inode->i_sb->s_blocksize - 1429 iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
1425 sizeof(struct unallocSpaceEntry)); 1430 sizeof(struct unallocSpaceEntry));
1431 use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE);
1432 use->descTag.tagLocation =
1433 cpu_to_le32(iinfo->i_location.logicalBlockNum);
1426 crclen = sizeof(struct unallocSpaceEntry) + 1434 crclen = sizeof(struct unallocSpaceEntry) +
1427 iinfo->i_lenAlloc - sizeof(struct tag); 1435 iinfo->i_lenAlloc - sizeof(struct tag);
1428 use->descTag.tagLocation = cpu_to_le32(
1429 iinfo->i_location.
1430 logicalBlockNum);
1431 use->descTag.descCRCLength = cpu_to_le16(crclen); 1436 use->descTag.descCRCLength = cpu_to_le16(crclen);
1432 use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use + 1437 use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
1433 sizeof(struct tag), 1438 sizeof(struct tag),
1434 crclen)); 1439 crclen));
1435 use->descTag.tagChecksum = udf_tag_checksum(&use->descTag); 1440 use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
1436 1441
1437 mark_buffer_dirty(bh); 1442 goto out;
1438 brelse(bh);
1439 return err;
1440 } 1443 }
1441 1444
1442 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) 1445 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET))
@@ -1591,18 +1594,21 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1591 fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number); 1594 fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number);
1592 fe->descTag.tagLocation = cpu_to_le32( 1595 fe->descTag.tagLocation = cpu_to_le32(
1593 iinfo->i_location.logicalBlockNum); 1596 iinfo->i_location.logicalBlockNum);
1594 crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - 1597 crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - sizeof(struct tag);
1595 sizeof(struct tag);
1596 fe->descTag.descCRCLength = cpu_to_le16(crclen); 1598 fe->descTag.descCRCLength = cpu_to_le16(crclen);
1597 fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag), 1599 fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag),
1598 crclen)); 1600 crclen));
1599 fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); 1601 fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
1600 1602
1603out:
1604 set_buffer_uptodate(bh);
1605 unlock_buffer(bh);
1606
1601 /* write the data blocks */ 1607 /* write the data blocks */
1602 mark_buffer_dirty(bh); 1608 mark_buffer_dirty(bh);
1603 if (do_sync) { 1609 if (do_sync) {
1604 sync_dirty_buffer(bh); 1610 sync_dirty_buffer(bh);
1605 if (buffer_req(bh) && !buffer_uptodate(bh)) { 1611 if (buffer_write_io_error(bh)) {
1606 printk(KERN_WARNING "IO error syncing udf inode " 1612 printk(KERN_WARNING "IO error syncing udf inode "
1607 "[%s:%08lx]\n", inode->i_sb->s_id, 1613 "[%s:%08lx]\n", inode->i_sb->s_id,
1608 inode->i_ino); 1614 inode->i_ino);
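The hunk above stops reading a block that is about to be rewritten wholesale: udf_tread() (a bread-style read) becomes udf_tgetblk() (a getblk-style lookup), at the price of locking the buffer and marking it uptodate by hand. The generic form of the pattern, sketched with the plain buffer-head API (fill_block() is illustrative):

#include <linux/buffer_head.h>

static void fill_block(char *data);	/* illustrative contents writer */

static int rewrite_block(struct super_block *sb, sector_t blk)
{
	struct buffer_head *bh = sb_getblk(sb, blk);	/* no read I/O */

	if (!bh)
		return -ENOMEM;
	lock_buffer(bh);
	memset(bh->b_data, 0, sb->s_blocksize);
	fill_block(bh->b_data);
	set_buffer_uptodate(bh);	/* contents are now valid */
	unlock_buffer(bh);
	mark_buffer_dirty(bh);
	brelse(bh);
	return 0;
}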
@@ -1672,7 +1678,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1672 return -1; 1678 return -1;
1673 1679
1674 if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) { 1680 if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) {
1675 char *sptr, *dptr; 1681 unsigned char *sptr, *dptr;
1676 struct buffer_head *nbh; 1682 struct buffer_head *nbh;
1677 int err, loffset; 1683 int err, loffset;
1678 struct kernel_lb_addr obloc = epos->block; 1684 struct kernel_lb_addr obloc = epos->block;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 21dad8c608f9..db423ab078b1 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -34,8 +34,8 @@
34#include <linux/crc-itu-t.h> 34#include <linux/crc-itu-t.h>
35#include <linux/exportfs.h> 35#include <linux/exportfs.h>
36 36
37static inline int udf_match(int len1, const char *name1, int len2, 37static inline int udf_match(int len1, const unsigned char *name1, int len2,
38 const char *name2) 38 const unsigned char *name2)
39{ 39{
40 if (len1 != len2) 40 if (len1 != len2)
41 return 0; 41 return 0;
@@ -142,15 +142,15 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
142} 142}
143 143
144static struct fileIdentDesc *udf_find_entry(struct inode *dir, 144static struct fileIdentDesc *udf_find_entry(struct inode *dir,
145 struct qstr *child, 145 const struct qstr *child,
146 struct udf_fileident_bh *fibh, 146 struct udf_fileident_bh *fibh,
147 struct fileIdentDesc *cfi) 147 struct fileIdentDesc *cfi)
148{ 148{
149 struct fileIdentDesc *fi = NULL; 149 struct fileIdentDesc *fi = NULL;
150 loff_t f_pos; 150 loff_t f_pos;
151 int block, flen; 151 int block, flen;
152 char *fname = NULL; 152 unsigned char *fname = NULL;
153 char *nameptr; 153 unsigned char *nameptr;
154 uint8_t lfi; 154 uint8_t lfi;
155 uint16_t liu; 155 uint16_t liu;
156 loff_t size; 156 loff_t size;
@@ -308,7 +308,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
308{ 308{
309 struct super_block *sb = dir->i_sb; 309 struct super_block *sb = dir->i_sb;
310 struct fileIdentDesc *fi = NULL; 310 struct fileIdentDesc *fi = NULL;
311 char *name = NULL; 311 unsigned char *name = NULL;
312 int namelen; 312 int namelen;
313 loff_t f_pos; 313 loff_t f_pos;
314 loff_t size = udf_ext0_offset(dir) + dir->i_size; 314 loff_t size = udf_ext0_offset(dir) + dir->i_size;
@@ -408,15 +408,6 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
408 } 408 }
409 409
410add: 410add:
411 /* Is there any extent whose size we need to round up? */
412 if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) {
413 elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1);
414 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
415 epos.offset -= sizeof(struct short_ad);
416 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
417 epos.offset -= sizeof(struct long_ad);
418 udf_write_aext(dir, &epos, &eloc, elen, 1);
419 }
420 f_pos += nfidlen; 411 f_pos += nfidlen;
421 412
422 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB && 413 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB &&
@@ -439,6 +430,7 @@ add:
439 udf_current_aext(dir, &epos, &eloc, &elen, 1); 430 udf_current_aext(dir, &epos, &eloc, &elen, 1);
440 } 431 }
441 432
433 /* Entry fits into current block? */
442 if (sb->s_blocksize - fibh->eoffset >= nfidlen) { 434 if (sb->s_blocksize - fibh->eoffset >= nfidlen) {
443 fibh->soffset = fibh->eoffset; 435 fibh->soffset = fibh->eoffset;
444 fibh->eoffset += nfidlen; 436 fibh->eoffset += nfidlen;
@@ -462,6 +454,16 @@ add:
462 (fibh->sbh->b_data + fibh->soffset); 454 (fibh->sbh->b_data + fibh->soffset);
463 } 455 }
464 } else { 456 } else {
457 /* Round up last extent in the file */
458 elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1);
459 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
460 epos.offset -= sizeof(struct short_ad);
461 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
462 epos.offset -= sizeof(struct long_ad);
463 udf_write_aext(dir, &epos, &eloc, elen, 1);
464 dinfo->i_lenExtents = (dinfo->i_lenExtents + sb->s_blocksize
465 - 1) & ~(sb->s_blocksize - 1);
466
465 fibh->soffset = fibh->eoffset - sb->s_blocksize; 467 fibh->soffset = fibh->eoffset - sb->s_blocksize;
466 fibh->eoffset += nfidlen - sb->s_blocksize; 468 fibh->eoffset += nfidlen - sb->s_blocksize;
467 if (fibh->sbh != fibh->ebh) { 469 if (fibh->sbh != fibh->ebh) {
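The "round up last extent" arithmetic above appears twice in this hunk. As a one-liner helper sketch, assuming power-of-two block sizes as everywhere in this code (round_up_block() is illustrative):

static inline unsigned long round_up_block(unsigned long len, unsigned bs)
{
	return (len + bs - 1) & ~(unsigned long)(bs - 1);
}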
@@ -508,6 +510,20 @@ add:
508 dir->i_size += nfidlen; 510 dir->i_size += nfidlen;
509 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 511 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
510 dinfo->i_lenAlloc += nfidlen; 512 dinfo->i_lenAlloc += nfidlen;
513 else {
514 /* Find the last extent and truncate it to proper size */
515 while (udf_next_aext(dir, &epos, &eloc, &elen, 1) ==
516 (EXT_RECORDED_ALLOCATED >> 30))
517 ;
518 elen -= dinfo->i_lenExtents - dir->i_size;
519 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
520 epos.offset -= sizeof(struct short_ad);
521 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
522 epos.offset -= sizeof(struct long_ad);
523 udf_write_aext(dir, &epos, &eloc, elen, 1);
524 dinfo->i_lenExtents = dir->i_size;
525 }
526
511 mark_inode_dirty(dir); 527 mark_inode_dirty(dir);
512 goto out_ok; 528 goto out_ok;
513 } else { 529 } else {
@@ -547,6 +563,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
547 int err; 563 int err;
548 struct udf_inode_info *iinfo; 564 struct udf_inode_info *iinfo;
549 565
566 dquot_initialize(dir);
567
550 lock_kernel(); 568 lock_kernel();
551 inode = udf_new_inode(dir, mode, &err); 569 inode = udf_new_inode(dir, mode, &err);
552 if (!inode) { 570 if (!inode) {
@@ -600,6 +618,8 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
600 if (!old_valid_dev(rdev)) 618 if (!old_valid_dev(rdev))
601 return -EINVAL; 619 return -EINVAL;
602 620
621 dquot_initialize(dir);
622
603 lock_kernel(); 623 lock_kernel();
604 err = -EIO; 624 err = -EIO;
605 inode = udf_new_inode(dir, mode, &err); 625 inode = udf_new_inode(dir, mode, &err);
@@ -646,6 +666,8 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
646 struct udf_inode_info *dinfo = UDF_I(dir); 666 struct udf_inode_info *dinfo = UDF_I(dir);
647 struct udf_inode_info *iinfo; 667 struct udf_inode_info *iinfo;
648 668
669 dquot_initialize(dir);
670
649 lock_kernel(); 671 lock_kernel();
650 err = -EMLINK; 672 err = -EMLINK;
651 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 673 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
@@ -783,6 +805,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
783 struct fileIdentDesc *fi, cfi; 805 struct fileIdentDesc *fi, cfi;
784 struct kernel_lb_addr tloc; 806 struct kernel_lb_addr tloc;
785 807
808 dquot_initialize(dir);
809
786 retval = -ENOENT; 810 retval = -ENOENT;
787 lock_kernel(); 811 lock_kernel();
788 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 812 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -829,6 +853,8 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
829 struct fileIdentDesc cfi; 853 struct fileIdentDesc cfi;
830 struct kernel_lb_addr tloc; 854 struct kernel_lb_addr tloc;
831 855
856 dquot_initialize(dir);
857
832 retval = -ENOENT; 858 retval = -ENOENT;
833 lock_kernel(); 859 lock_kernel();
834 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 860 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -869,20 +895,22 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
869{ 895{
870 struct inode *inode; 896 struct inode *inode;
871 struct pathComponent *pc; 897 struct pathComponent *pc;
872 char *compstart; 898 const char *compstart;
873 struct udf_fileident_bh fibh; 899 struct udf_fileident_bh fibh;
874 struct extent_position epos = {}; 900 struct extent_position epos = {};
875 int eoffset, elen = 0; 901 int eoffset, elen = 0;
876 struct fileIdentDesc *fi; 902 struct fileIdentDesc *fi;
877 struct fileIdentDesc cfi; 903 struct fileIdentDesc cfi;
878 char *ea; 904 uint8_t *ea;
879 int err; 905 int err;
880 int block; 906 int block;
881 char *name = NULL; 907 unsigned char *name = NULL;
882 int namelen; 908 int namelen;
883 struct buffer_head *bh; 909 struct buffer_head *bh;
884 struct udf_inode_info *iinfo; 910 struct udf_inode_info *iinfo;
885 911
912 dquot_initialize(dir);
913
886 lock_kernel(); 914 lock_kernel();
887 inode = udf_new_inode(dir, S_IFLNK, &err); 915 inode = udf_new_inode(dir, S_IFLNK, &err);
888 if (!inode) 916 if (!inode)
@@ -922,7 +950,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
922 block = udf_get_pblock(inode->i_sb, block, 950 block = udf_get_pblock(inode->i_sb, block,
923 iinfo->i_location.partitionReferenceNum, 951 iinfo->i_location.partitionReferenceNum,
924 0); 952 0);
925 epos.bh = udf_tread(inode->i_sb, block); 953 epos.bh = udf_tgetblk(inode->i_sb, block);
926 lock_buffer(epos.bh); 954 lock_buffer(epos.bh);
927 memset(epos.bh->b_data, 0x00, inode->i_sb->s_blocksize); 955 memset(epos.bh->b_data, 0x00, inode->i_sb->s_blocksize);
928 set_buffer_uptodate(epos.bh); 956 set_buffer_uptodate(epos.bh);
@@ -954,7 +982,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
954 982
955 pc = (struct pathComponent *)(ea + elen); 983 pc = (struct pathComponent *)(ea + elen);
956 984
957 compstart = (char *)symname; 985 compstart = symname;
958 986
959 do { 987 do {
960 symname++; 988 symname++;
@@ -999,6 +1027,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
999 inode->i_size = elen; 1027 inode->i_size = elen;
1000 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 1028 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
1001 iinfo->i_lenAlloc = inode->i_size; 1029 iinfo->i_lenAlloc = inode->i_size;
1030 else
1031 udf_truncate_tail_extent(inode);
1002 mark_inode_dirty(inode); 1032 mark_inode_dirty(inode);
1003 1033
1004 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 1034 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -1051,6 +1081,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1051 int err; 1081 int err;
1052 struct buffer_head *bh; 1082 struct buffer_head *bh;
1053 1083
1084 dquot_initialize(dir);
1085
1054 lock_kernel(); 1086 lock_kernel();
1055 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1087 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
1056 unlock_kernel(); 1088 unlock_kernel();
@@ -1113,6 +1145,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1113 struct kernel_lb_addr tloc; 1145 struct kernel_lb_addr tloc;
1114 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1146 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1115 1147
1148 dquot_initialize(old_dir);
1149 dquot_initialize(new_dir);
1150
1116 lock_kernel(); 1151 lock_kernel();
1117 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1152 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1118 if (ofi) { 1153 if (ofi) {
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 4b540ee632d5..745eb209be0c 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -24,7 +24,6 @@
24 24
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/string.h> 26#include <linux/string.h>
27#include <linux/slab.h>
28#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
29 28
30uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, 29uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 9d1b8c2e6c45..1e4543cbcd27 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1078,21 +1078,39 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1078 return 0; 1078 return 0;
1079} 1079}
1080 1080
1081static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) 1081static void udf_find_vat_block(struct super_block *sb, int p_index,
1082 int type1_index, sector_t start_block)
1082{ 1083{
1083 struct udf_sb_info *sbi = UDF_SB(sb); 1084 struct udf_sb_info *sbi = UDF_SB(sb);
1084 struct udf_part_map *map = &sbi->s_partmaps[p_index]; 1085 struct udf_part_map *map = &sbi->s_partmaps[p_index];
1086 sector_t vat_block;
1085 struct kernel_lb_addr ino; 1087 struct kernel_lb_addr ino;
1088
1089 /*
1090 * VAT file entry is in the last recorded block. Some broken disks have
1091 * it a few blocks before so try a bit harder...
1092 */
1093 ino.partitionReferenceNum = type1_index;
1094 for (vat_block = start_block;
1095 vat_block >= map->s_partition_root &&
1096 vat_block >= start_block - 3 &&
1097 !sbi->s_vat_inode; vat_block--) {
1098 ino.logicalBlockNum = vat_block - map->s_partition_root;
1099 sbi->s_vat_inode = udf_iget(sb, &ino);
1100 }
1101}
1102
1103static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1104{
1105 struct udf_sb_info *sbi = UDF_SB(sb);
1106 struct udf_part_map *map = &sbi->s_partmaps[p_index];
1086 struct buffer_head *bh = NULL; 1107 struct buffer_head *bh = NULL;
1087 struct udf_inode_info *vati; 1108 struct udf_inode_info *vati;
1088 uint32_t pos; 1109 uint32_t pos;
1089 struct virtualAllocationTable20 *vat20; 1110 struct virtualAllocationTable20 *vat20;
1090 sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; 1111 sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
1091 1112
1092 /* VAT file entry is in the last recorded block */ 1113 udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block);
1093 ino.partitionReferenceNum = type1_index;
1094 ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
1095 sbi->s_vat_inode = udf_iget(sb, &ino);
1096 if (!sbi->s_vat_inode && 1114 if (!sbi->s_vat_inode &&
1097 sbi->s_last_block != blocks - 1) { 1115 sbi->s_last_block != blocks - 1) {
1098 printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the" 1116 printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the"
@@ -1100,9 +1118,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1100 "block of the device (%lu).\n", 1118 "block of the device (%lu).\n",
1101 (unsigned long)sbi->s_last_block, 1119 (unsigned long)sbi->s_last_block,
1102 (unsigned long)blocks - 1); 1120 (unsigned long)blocks - 1);
1103 ino.partitionReferenceNum = type1_index; 1121 udf_find_vat_block(sb, p_index, type1_index, blocks - 1);
1104 ino.logicalBlockNum = blocks - 1 - map->s_partition_root;
1105 sbi->s_vat_inode = udf_iget(sb, &ino);
1106 } 1122 }
1107 if (!sbi->s_vat_inode) 1123 if (!sbi->s_vat_inode)
1108 return 1; 1124 return 1;
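udf_find_vat_block() encodes the probe order described in its comment: start at the advertised last block and walk back at most three blocks, never past the partition root. A standalone sketch of just that loop (try_block() stands in for the udf_iget() attempt):

/* Returns the block whose probe succeeded, or -1 if none did. */
static long find_vat(long start, long root, int (*try_block)(long blk))
{
	long blk;

	for (blk = start; blk >= root && blk >= start - 3; blk--)
		if (try_block(blk))
			return blk;
	return -1;
}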
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index c3265e1385d4..16064787d2b7 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -26,18 +26,17 @@
26#include <linux/time.h> 26#include <linux/time.h>
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/slab.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
32#include <linux/buffer_head.h> 31#include <linux/buffer_head.h>
33#include "udf_i.h" 32#include "udf_i.h"
34 33
35static void udf_pc_to_char(struct super_block *sb, char *from, int fromlen, 34static void udf_pc_to_char(struct super_block *sb, unsigned char *from,
36 char *to) 35 int fromlen, unsigned char *to)
37{ 36{
38 struct pathComponent *pc; 37 struct pathComponent *pc;
39 int elen = 0; 38 int elen = 0;
40 char *p = to; 39 unsigned char *p = to;
41 40
42 while (elen < fromlen) { 41 while (elen < fromlen) {
43 pc = (struct pathComponent *)(from + elen); 42 pc = (struct pathComponent *)(from + elen);
@@ -75,9 +74,9 @@ static int udf_symlink_filler(struct file *file, struct page *page)
75{ 74{
76 struct inode *inode = page->mapping->host; 75 struct inode *inode = page->mapping->host;
77 struct buffer_head *bh = NULL; 76 struct buffer_head *bh = NULL;
78 char *symlink; 77 unsigned char *symlink;
79 int err = -EIO; 78 int err = -EIO;
80 char *p = kmap(page); 79 unsigned char *p = kmap(page);
81 struct udf_inode_info *iinfo; 80 struct udf_inode_info *iinfo;
82 81
83 lock_kernel(); 82 lock_kernel();
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 8d46f4294ee7..4223ac855da9 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -142,7 +142,7 @@ extern void udf_truncate(struct inode *);
142extern void udf_read_inode(struct inode *); 142extern void udf_read_inode(struct inode *);
143extern void udf_delete_inode(struct inode *); 143extern void udf_delete_inode(struct inode *);
144extern void udf_clear_inode(struct inode *); 144extern void udf_clear_inode(struct inode *);
145extern int udf_write_inode(struct inode *, int); 145extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
146extern long udf_block_map(struct inode *, sector_t); 146extern long udf_block_map(struct inode *, sector_t);
147extern int udf_extend_file(struct inode *, struct extent_position *, 147extern int udf_extend_file(struct inode *, struct extent_position *,
148 struct kernel_long_ad *, sector_t); 148 struct kernel_long_ad *, sector_t);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index cefa8c8913e6..d03a90b6ad69 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -24,6 +24,7 @@
24#include <linux/string.h> /* for memset */ 24#include <linux/string.h> /* for memset */
25#include <linux/nls.h> 25#include <linux/nls.h>
26#include <linux/crc-itu-t.h> 26#include <linux/crc-itu-t.h>
27#include <linux/slab.h>
27 28
28#include "udf_sb.h" 29#include "udf_sb.h"
29 30
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 54c16ec95dff..5cfa4d85ccf2 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -85,7 +85,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
85 "bit already cleared for fragment %u", i); 85 "bit already cleared for fragment %u", i);
86 } 86 }
87 87
88 vfs_dq_free_block(inode, count); 88 dquot_free_block(inode, count);
89 89
90 90
91 fs32_add(sb, &ucg->cg_cs.cs_nffree, count); 91 fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
@@ -195,7 +195,7 @@ do_more:
195 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 195 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
196 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 196 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
197 ufs_clusteracct (sb, ucpi, blkno, 1); 197 ufs_clusteracct (sb, ucpi, blkno, 1);
198 vfs_dq_free_block(inode, uspi->s_fpb); 198 dquot_free_block(inode, uspi->s_fpb);
199 199
200 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); 200 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
201 uspi->cs_total.cs_nbfree++; 201 uspi->cs_total.cs_nbfree++;
@@ -511,6 +511,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
511 struct ufs_cg_private_info * ucpi; 511 struct ufs_cg_private_info * ucpi;
512 struct ufs_cylinder_group * ucg; 512 struct ufs_cylinder_group * ucg;
513 unsigned cgno, fragno, fragoff, count, fragsize, i; 513 unsigned cgno, fragno, fragoff, count, fragsize, i;
514 int ret;
514 515
515 UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", 516 UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n",
516 (unsigned long long)fragment, oldcount, newcount); 517 (unsigned long long)fragment, oldcount, newcount);
@@ -556,8 +557,9 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
556 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); 557 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
557 for (i = oldcount; i < newcount; i++) 558 for (i = oldcount; i < newcount; i++)
558 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); 559 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
559 if (vfs_dq_alloc_block(inode, count)) { 560 ret = dquot_alloc_block(inode, count);
560 *err = -EDQUOT; 561 if (ret) {
562 *err = ret;
561 return 0; 563 return 0;
562 } 564 }
563 565
@@ -596,6 +598,7 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
596 struct ufs_cylinder_group * ucg; 598 struct ufs_cylinder_group * ucg;
597 unsigned oldcg, i, j, k, allocsize; 599 unsigned oldcg, i, j, k, allocsize;
598 u64 result; 600 u64 result;
601 int ret;
599 602
600 UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", 603 UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n",
601 inode->i_ino, cgno, (unsigned long long)goal, count); 604 inode->i_ino, cgno, (unsigned long long)goal, count);
@@ -664,7 +667,7 @@ cg_found:
664 for (i = count; i < uspi->s_fpb; i++) 667 for (i = count; i < uspi->s_fpb; i++)
665 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); 668 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
666 i = uspi->s_fpb - count; 669 i = uspi->s_fpb - count;
667 vfs_dq_free_block(inode, i); 670 dquot_free_block(inode, i);
668 671
669 fs32_add(sb, &ucg->cg_cs.cs_nffree, i); 672 fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
670 uspi->cs_total.cs_nffree += i; 673 uspi->cs_total.cs_nffree += i;
@@ -676,8 +679,9 @@ cg_found:
676 result = ufs_bitmap_search (sb, ucpi, goal, allocsize); 679 result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
677 if (result == INVBLOCK) 680 if (result == INVBLOCK)
678 return 0; 681 return 0;
679 if (vfs_dq_alloc_block(inode, count)) { 682 ret = dquot_alloc_block(inode, count);
680 *err = -EDQUOT; 683 if (ret) {
684 *err = ret;
681 return 0; 685 return 0;
682 } 686 }
683 for (i = 0; i < count; i++) 687 for (i = 0; i < count; i++)
@@ -714,6 +718,7 @@ static u64 ufs_alloccg_block(struct inode *inode,
714 struct ufs_super_block_first * usb1; 718 struct ufs_super_block_first * usb1;
715 struct ufs_cylinder_group * ucg; 719 struct ufs_cylinder_group * ucg;
716 u64 result, blkno; 720 u64 result, blkno;
721 int ret;
717 722
718 UFSD("ENTER, goal %llu\n", (unsigned long long)goal); 723 UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
719 724
@@ -747,8 +752,9 @@ gotit:
747 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 752 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
748 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 753 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
749 ufs_clusteracct (sb, ucpi, blkno, -1); 754 ufs_clusteracct (sb, ucpi, blkno, -1);
750 if (vfs_dq_alloc_block(inode, uspi->s_fpb)) { 755 ret = dquot_alloc_block(inode, uspi->s_fpb);
751 *err = -EDQUOT; 756 if (ret) {
757 *err = ret;
752 return INVBLOCK; 758 return INVBLOCK;
753 } 759 }
754 760
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 6f671f1ac271..317a0d444f6b 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -31,7 +31,7 @@
31 * len <= UFS_MAXNAMLEN and de != NULL are guaranteed by caller. 31 * len <= UFS_MAXNAMLEN and de != NULL are guaranteed by caller.
32 */ 32 */
33static inline int ufs_match(struct super_block *sb, int len, 33static inline int ufs_match(struct super_block *sb, int len,
34 const char * const name, struct ufs_dir_entry * de) 34 const unsigned char *name, struct ufs_dir_entry *de)
35{ 35{
36 if (len != ufs_get_de_namlen(sb, de)) 36 if (len != ufs_get_de_namlen(sb, de))
37 return 0; 37 return 0;
@@ -70,13 +70,13 @@ static inline unsigned long ufs_dir_pages(struct inode *inode)
70 return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; 70 return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
71} 71}
72 72
73ino_t ufs_inode_by_name(struct inode *dir, struct dentry *dentry) 73ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr)
74{ 74{
75 ino_t res = 0; 75 ino_t res = 0;
76 struct ufs_dir_entry *de; 76 struct ufs_dir_entry *de;
77 struct page *page; 77 struct page *page;
78 78
79 de = ufs_find_entry(dir, dentry, &page); 79 de = ufs_find_entry(dir, qstr, &page);
80 if (de) { 80 if (de) {
81 res = fs32_to_cpu(dir->i_sb, de->d_ino); 81 res = fs32_to_cpu(dir->i_sb, de->d_ino);
82 ufs_put_page(page); 82 ufs_put_page(page);
@@ -249,12 +249,12 @@ struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p)
249 * (as a parameter - res_dir). Page is returned mapped and unlocked. 249 * (as a parameter - res_dir). Page is returned mapped and unlocked.
250 * Entry is guaranteed to be valid. 250 * Entry is guaranteed to be valid.
251 */ 251 */
252struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct dentry *dentry, 252struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr,
253 struct page **res_page) 253 struct page **res_page)
254{ 254{
255 struct super_block *sb = dir->i_sb; 255 struct super_block *sb = dir->i_sb;
256 const char *name = dentry->d_name.name; 256 const unsigned char *name = qstr->name;
257 int namelen = dentry->d_name.len; 257 int namelen = qstr->len;
258 unsigned reclen = UFS_DIR_REC_LEN(namelen); 258 unsigned reclen = UFS_DIR_REC_LEN(namelen);
259 unsigned long start, n; 259 unsigned long start, n;
260 unsigned long npages = ufs_dir_pages(dir); 260 unsigned long npages = ufs_dir_pages(dir);
@@ -313,7 +313,7 @@ found:
313int ufs_add_link(struct dentry *dentry, struct inode *inode) 313int ufs_add_link(struct dentry *dentry, struct inode *inode)
314{ 314{
315 struct inode *dir = dentry->d_parent->d_inode; 315 struct inode *dir = dentry->d_parent->d_inode;
316 const char *name = dentry->d_name.name; 316 const unsigned char *name = dentry->d_name.name;
317 int namelen = dentry->d_name.len; 317 int namelen = dentry->d_name.len;
318 struct super_block *sb = dir->i_sb; 318 struct super_block *sb = dir->i_sb;
319 unsigned reclen = UFS_DIR_REC_LEN(namelen); 319 unsigned reclen = UFS_DIR_REC_LEN(namelen);
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 73655c61240a..a8962cecde5b 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,6 +24,7 @@
24 */ 24 */
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/quotaops.h>
27 28
28#include "ufs_fs.h" 29#include "ufs_fs.h"
29#include "ufs.h" 30#include "ufs.h"
@@ -40,7 +41,7 @@ const struct file_operations ufs_file_operations = {
40 .write = do_sync_write, 41 .write = do_sync_write,
41 .aio_write = generic_file_aio_write, 42 .aio_write = generic_file_aio_write,
42 .mmap = generic_file_mmap, 43 .mmap = generic_file_mmap,
43 .open = generic_file_open, 44 .open = dquot_file_open,
44 .fsync = simple_fsync, 45 .fsync = simple_fsync,
45 .splice_read = generic_file_splice_read, 46 .splice_read = generic_file_splice_read,
46}; 47};
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 3527c00fef0d..230ecf608026 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -95,8 +95,8 @@ void ufs_free_inode (struct inode * inode)
95 95
96 is_directory = S_ISDIR(inode->i_mode); 96 is_directory = S_ISDIR(inode->i_mode);
97 97
98 vfs_dq_free_inode(inode); 98 dquot_free_inode(inode);
99 vfs_dq_drop(inode); 99 dquot_drop(inode);
100 100
101 clear_inode (inode); 101 clear_inode (inode);
102 102
@@ -355,9 +355,10 @@ cg_found:
355 355
356 unlock_super (sb); 356 unlock_super (sb);
357 357
358 if (vfs_dq_alloc_inode(inode)) { 358 dquot_initialize(inode);
359 vfs_dq_drop(inode); 359 err = dquot_alloc_inode(inode);
360 err = -EDQUOT; 360 if (err) {
361 dquot_drop(inode);
361 goto fail_without_unlock; 362 goto fail_without_unlock;
362 } 363 }
363 364
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 7cf33379fd46..80b68c3702d1 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -36,6 +36,8 @@
36#include <linux/mm.h> 36#include <linux/mm.h>
37#include <linux/smp_lock.h> 37#include <linux/smp_lock.h>
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/writeback.h>
40#include <linux/quotaops.h>
39 41
40#include "ufs_fs.h" 42#include "ufs_fs.h"
41#include "ufs.h" 43#include "ufs.h"
@@ -890,11 +892,11 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
890 return 0; 892 return 0;
891} 893}
892 894
893int ufs_write_inode (struct inode * inode, int wait) 895int ufs_write_inode(struct inode *inode, struct writeback_control *wbc)
894{ 896{
895 int ret; 897 int ret;
896 lock_kernel(); 898 lock_kernel();
897 ret = ufs_update_inode (inode, wait); 899 ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
898 unlock_kernel(); 900 unlock_kernel();
899 return ret; 901 return ret;
900} 902}
@@ -908,6 +910,9 @@ void ufs_delete_inode (struct inode * inode)
908{ 910{
909 loff_t old_i_size; 911 loff_t old_i_size;
910 912
913 if (!is_bad_inode(inode))
914 dquot_initialize(inode);
915
911 truncate_inode_pages(&inode->i_data, 0); 916 truncate_inode_pages(&inode->i_data, 0);
912 if (is_bad_inode(inode)) 917 if (is_bad_inode(inode))
913 goto no_delete; 918 goto no_delete;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 23119fe7ad62..118556243e7a 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -30,6 +30,7 @@
30#include <linux/time.h> 30#include <linux/time.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/smp_lock.h> 32#include <linux/smp_lock.h>
33#include <linux/quotaops.h>
33 34
34#include "ufs_fs.h" 35#include "ufs_fs.h"
35#include "ufs.h" 36#include "ufs.h"
@@ -56,7 +57,7 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru
56 return ERR_PTR(-ENAMETOOLONG); 57 return ERR_PTR(-ENAMETOOLONG);
57 58
58 lock_kernel(); 59 lock_kernel();
59 ino = ufs_inode_by_name(dir, dentry); 60 ino = ufs_inode_by_name(dir, &dentry->d_name);
60 if (ino) { 61 if (ino) {
61 inode = ufs_iget(dir->i_sb, ino); 62 inode = ufs_iget(dir->i_sb, ino);
62 if (IS_ERR(inode)) { 63 if (IS_ERR(inode)) {
@@ -84,6 +85,9 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
84 int err; 85 int err;
85 86
86 UFSD("BEGIN\n"); 87 UFSD("BEGIN\n");
88
89 dquot_initialize(dir);
90
87 inode = ufs_new_inode(dir, mode); 91 inode = ufs_new_inode(dir, mode);
88 err = PTR_ERR(inode); 92 err = PTR_ERR(inode);
89 93
@@ -107,6 +111,9 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
107 111
108 if (!old_valid_dev(rdev)) 112 if (!old_valid_dev(rdev))
109 return -EINVAL; 113 return -EINVAL;
114
115 dquot_initialize(dir);
116
110 inode = ufs_new_inode(dir, mode); 117 inode = ufs_new_inode(dir, mode);
111 err = PTR_ERR(inode); 118 err = PTR_ERR(inode);
112 if (!IS_ERR(inode)) { 119 if (!IS_ERR(inode)) {
@@ -131,6 +138,8 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
131 if (l > sb->s_blocksize) 138 if (l > sb->s_blocksize)
132 goto out_notlocked; 139 goto out_notlocked;
133 140
141 dquot_initialize(dir);
142
134 lock_kernel(); 143 lock_kernel();
135 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); 144 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
136 err = PTR_ERR(inode); 145 err = PTR_ERR(inode);
@@ -176,6 +185,8 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
176 return -EMLINK; 185 return -EMLINK;
177 } 186 }
178 187
188 dquot_initialize(dir);
189
179 inode->i_ctime = CURRENT_TIME_SEC; 190 inode->i_ctime = CURRENT_TIME_SEC;
180 inode_inc_link_count(inode); 191 inode_inc_link_count(inode);
181 atomic_inc(&inode->i_count); 192 atomic_inc(&inode->i_count);
@@ -193,6 +204,8 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
193 if (dir->i_nlink >= UFS_LINK_MAX) 204 if (dir->i_nlink >= UFS_LINK_MAX)
194 goto out; 205 goto out;
195 206
207 dquot_initialize(dir);
208
196 lock_kernel(); 209 lock_kernel();
197 inode_inc_link_count(dir); 210 inode_inc_link_count(dir);
198 211
@@ -237,7 +250,9 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
237 struct page *page; 250 struct page *page;
238 int err = -ENOENT; 251 int err = -ENOENT;
239 252
240 de = ufs_find_entry(dir, dentry, &page); 253 dquot_initialize(dir);
254
255 de = ufs_find_entry(dir, &dentry->d_name, &page);
241 if (!de) 256 if (!de)
242 goto out; 257 goto out;
243 258
@@ -281,7 +296,10 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
281 struct ufs_dir_entry *old_de; 296 struct ufs_dir_entry *old_de;
282 int err = -ENOENT; 297 int err = -ENOENT;
283 298
284 old_de = ufs_find_entry(old_dir, old_dentry, &old_page); 299 dquot_initialize(old_dir);
300 dquot_initialize(new_dir);
301
302 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
285 if (!old_de) 303 if (!old_de)
286 goto out; 304 goto out;
287 305
@@ -301,7 +319,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
301 goto out_dir; 319 goto out_dir;
302 320
303 err = -ENOENT; 321 err = -ENOENT;
304 new_de = ufs_find_entry(new_dir, new_dentry, &new_page); 322 new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page);
305 if (!new_de) 323 if (!new_de)
306 goto out_dir; 324 goto out_dir;
307 inode_inc_link_count(old_inode); 325 inode_inc_link_count(old_inode);
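
The namei.c hunks above combine two changes: directory lookups now take a struct qstr instead of a full dentry, and every operation that can allocate or free blocks calls dquot_initialize() on the affected directory first. A minimal sketch of the resulting pattern; example_dir_op is a hypothetical name, not part of the patch, and error handling is trimmed:

/*
 * Sketch only: quota state is initialized on the directory before any
 * lookup or allocation, and the directory entry is found by qstr
 * rather than by dentry.
 */
static int example_dir_op(struct inode *dir, struct dentry *dentry)
{
	struct ufs_dir_entry *de;
	struct page *page;

	dquot_initialize(dir);		/* must precede block/inode changes */

	de = ufs_find_entry(dir, &dentry->d_name, &page);
	if (!de)
		return -ENOENT;

	/* ... modify or delete the entry; the real callers release
	 * the page via ufs_delete_entry() or ufs_set_link() ... */
	return 0;
}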
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 5faed7954d0a..14743d935a93 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -66,6 +66,7 @@
66 */ 66 */
67 67
68 68
69#include <linux/exportfs.h>
69#include <linux/module.h> 70#include <linux/module.h>
70#include <linux/bitops.h> 71#include <linux/bitops.h>
71 72
@@ -96,6 +97,56 @@
96#include "swab.h" 97#include "swab.h"
97#include "util.h" 98#include "util.h"
98 99
100static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
101{
102 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
103 struct inode *inode;
104
105 if (ino < UFS_ROOTINO || ino > uspi->s_ncg * uspi->s_ipg)
106 return ERR_PTR(-ESTALE);
107
108 inode = ufs_iget(sb, ino);
109 if (IS_ERR(inode))
110 return ERR_CAST(inode);
111 if (generation && inode->i_generation != generation) {
112 iput(inode);
113 return ERR_PTR(-ESTALE);
114 }
115 return inode;
116}
117
118static struct dentry *ufs_fh_to_dentry(struct super_block *sb, struct fid *fid,
119 int fh_len, int fh_type)
120{
121 return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ufs_nfs_get_inode);
122}
123
124static struct dentry *ufs_fh_to_parent(struct super_block *sb, struct fid *fid,
125 int fh_len, int fh_type)
126{
127 return generic_fh_to_parent(sb, fid, fh_len, fh_type, ufs_nfs_get_inode);
128}
129
130static struct dentry *ufs_get_parent(struct dentry *child)
131{
132 struct qstr dot_dot = {
133 .name = "..",
134 .len = 2,
135 };
136 ino_t ino;
137
138 ino = ufs_inode_by_name(child->d_inode, &dot_dot);
139 if (!ino)
140 return ERR_PTR(-ENOENT);
141 return d_obtain_alias(ufs_iget(child->d_inode->i_sb, ino));
142}
143
144static const struct export_operations ufs_export_ops = {
145 .fh_to_dentry = ufs_fh_to_dentry,
146 .fh_to_parent = ufs_fh_to_parent,
147 .get_parent = ufs_get_parent,
148};
149
99#ifdef CONFIG_UFS_DEBUG 150#ifdef CONFIG_UFS_DEBUG
100/* 151/*
101 * Print contents of ufs_super_block, useful for debugging 152 * Print contents of ufs_super_block, useful for debugging
@@ -965,6 +1016,9 @@ magic_found:
965 case UFS_FSSTABLE: 1016 case UFS_FSSTABLE:
966 UFSD("fs is stable\n"); 1017 UFSD("fs is stable\n");
967 break; 1018 break;
1019 case UFS_FSLOG:
1020 UFSD("fs is logging fs\n");
1021 break;
968 case UFS_FSOSF1: 1022 case UFS_FSOSF1:
969 UFSD("fs is DEC OSF/1\n"); 1023 UFSD("fs is DEC OSF/1\n");
970 break; 1024 break;
@@ -990,6 +1044,7 @@ magic_found:
990 * Read ufs_super_block into internal data structures 1044 * Read ufs_super_block into internal data structures
991 */ 1045 */
992 sb->s_op = &ufs_super_ops; 1046 sb->s_op = &ufs_super_ops;
1047 sb->s_export_op = &ufs_export_ops;
993 sb->dq_op = NULL; /***/ 1048 sb->dq_op = NULL; /***/
994 sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic); 1049 sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic);
995 1050
@@ -1380,6 +1435,11 @@ static void destroy_inodecache(void)
1380 kmem_cache_destroy(ufs_inode_cachep); 1435 kmem_cache_destroy(ufs_inode_cachep);
1381} 1436}
1382 1437
1438static void ufs_clear_inode(struct inode *inode)
1439{
1440 dquot_drop(inode);
1441}
1442
1383#ifdef CONFIG_QUOTA 1443#ifdef CONFIG_QUOTA
1384static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t); 1444static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t);
1385static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t); 1445static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t);
@@ -1390,6 +1450,7 @@ static const struct super_operations ufs_super_ops = {
1390 .destroy_inode = ufs_destroy_inode, 1450 .destroy_inode = ufs_destroy_inode,
1391 .write_inode = ufs_write_inode, 1451 .write_inode = ufs_write_inode,
1392 .delete_inode = ufs_delete_inode, 1452 .delete_inode = ufs_delete_inode,
1453 .clear_inode = ufs_clear_inode,
1393 .put_super = ufs_put_super, 1454 .put_super = ufs_put_super,
1394 .write_super = ufs_write_super, 1455 .write_super = ufs_write_super,
1395 .sync_fs = ufs_sync_fs, 1456 .sync_fs = ufs_sync_fs,
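
The export operations added above are what let knfsd hand out and decode file handles for UFS. As a hedged illustration of the decode path: generic_fh_to_dentry() unpacks the inode number and generation from the handle and calls back into ufs_nfs_get_inode(), roughly as follows (simplified from fs/exportfs; decode_fh_sketch is not part of the patch):

/* Simplified decode path, assuming a FILEID_INO32_GEN-style handle. */
static struct dentry *decode_fh_sketch(struct super_block *sb,
				       struct fid *fid)
{
	struct inode *inode;

	/* fid->i32 carries the 32-bit inode number and generation */
	inode = ufs_nfs_get_inode(sb, fid->i32.ino, fid->i32.gen);

	/* d_obtain_alias() copes with ERR_PTR() inodes directly */
	return d_obtain_alias(inode);
}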
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 41dd431ce228..d3b6270cb377 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -44,6 +44,7 @@
44#include <linux/buffer_head.h> 44#include <linux/buffer_head.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/quotaops.h>
47 48
48#include "ufs_fs.h" 49#include "ufs_fs.h"
49#include "ufs.h" 50#include "ufs.h"
@@ -517,9 +518,18 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr)
517 if (error) 518 if (error)
518 return error; 519 return error;
519 520
521 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
522 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
523 error = dquot_transfer(inode, attr);
524 if (error)
525 return error;
526 }
520 if (ia_valid & ATTR_SIZE && 527 if (ia_valid & ATTR_SIZE &&
521 attr->ia_size != i_size_read(inode)) { 528 attr->ia_size != i_size_read(inode)) {
522 loff_t old_i_size = inode->i_size; 529 loff_t old_i_size = inode->i_size;
530
531 dquot_initialize(inode);
532
523 error = vmtruncate(inode, attr->ia_size); 533 error = vmtruncate(inode, attr->ia_size);
524 if (error) 534 if (error)
525 return error; 535 return error;
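
The setattr hunk follows the contract of the new dquot interface: an ownership change must transfer the inode's charged usage between quota accounts before the new uid/gid is committed, and a size change needs quotas initialized before blocks are freed. The core of that pattern in isolation (a sketch, error paths trimmed):

/* Quota steps a setattr implementation is expected to take. */
static int setattr_quota_sketch(struct inode *inode, struct iattr *attr)
{
	if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
	    (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
		int error = dquot_transfer(inode, attr);
		if (error)
			return error;
	}
	if (attr->ia_valid & ATTR_SIZE)
		dquot_initialize(inode);	/* before freeing blocks */
	return 0;
}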
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 644e77e13599..43f9f5d5670e 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -86,9 +86,9 @@ extern void ufs_put_cylinder (struct super_block *, unsigned);
86/* dir.c */ 86/* dir.c */
87extern const struct inode_operations ufs_dir_inode_operations; 87extern const struct inode_operations ufs_dir_inode_operations;
88extern int ufs_add_link (struct dentry *, struct inode *); 88extern int ufs_add_link (struct dentry *, struct inode *);
89extern ino_t ufs_inode_by_name(struct inode *, struct dentry *); 89extern ino_t ufs_inode_by_name(struct inode *, const struct qstr *);
90extern int ufs_make_empty(struct inode *, struct inode *); 90extern int ufs_make_empty(struct inode *, struct inode *);
91extern struct ufs_dir_entry *ufs_find_entry(struct inode *, struct dentry *, struct page **); 91extern struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, struct page **);
92extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *); 92extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *);
93extern int ufs_empty_dir (struct inode *); 93extern int ufs_empty_dir (struct inode *);
94extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **); 94extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **);
@@ -106,7 +106,7 @@ extern struct inode * ufs_new_inode (struct inode *, int);
106 106
107/* inode.c */ 107/* inode.c */
108extern struct inode *ufs_iget(struct super_block *, unsigned long); 108extern struct inode *ufs_iget(struct super_block *, unsigned long);
109extern int ufs_write_inode (struct inode *, int); 109extern int ufs_write_inode (struct inode *, struct writeback_control *);
110extern int ufs_sync_inode (struct inode *); 110extern int ufs_sync_inode (struct inode *);
111extern void ufs_delete_inode (struct inode *); 111extern void ufs_delete_inode (struct inode *);
112extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *); 112extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 54bde1895a80..6943ec677c0b 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -138,6 +138,7 @@ typedef __u16 __bitwise __fs16;
138 138
139#define UFS_USEEFT ((__u16)65535) 139#define UFS_USEEFT ((__u16)65535)
140 140
141/* fs_clean values */
141#define UFS_FSOK 0x7c269d38 142#define UFS_FSOK 0x7c269d38
142#define UFS_FSACTIVE ((__s8)0x00) 143#define UFS_FSACTIVE ((__s8)0x00)
143#define UFS_FSCLEAN ((__s8)0x01) 144#define UFS_FSCLEAN ((__s8)0x01)
@@ -145,6 +146,11 @@ typedef __u16 __bitwise __fs16;
145#define UFS_FSOSF1 ((__s8)0x03) /* is this correct for DEC OSF/1? */ 146#define UFS_FSOSF1 ((__s8)0x03) /* is this correct for DEC OSF/1? */
146#define UFS_FSBAD ((__s8)0xff) 147#define UFS_FSBAD ((__s8)0xff)
147 148
149/* Solaris-specific fs_clean values */
150#define UFS_FSSUSPEND ((__s8)0xfe) /* temporarily suspended */
151#define UFS_FSLOG ((__s8)0xfd) /* logging fs */
152#define UFS_FSFIX ((__s8)0xfc) /* being repaired while mounted */
153
148/* From here to next blank line, s_flags for ufs_sb_info */ 154/* From here to next blank line, s_flags for ufs_sb_info */
149/* directory entry encoding */ 155/* directory entry encoding */
150#define UFS_DE_MASK 0x00000010 /* mask for the following */ 156#define UFS_DE_MASK 0x00000010 /* mask for the following */
@@ -227,11 +233,16 @@ typedef __u16 __bitwise __fs16;
227 */ 233 */
228#define ufs_cbtocylno(bno) \ 234#define ufs_cbtocylno(bno) \
229 ((bno) * uspi->s_nspf / uspi->s_spc) 235 ((bno) * uspi->s_nspf / uspi->s_spc)
230#define ufs_cbtorpos(bno) \ 236#define ufs_cbtorpos(bno) \
237 ((UFS_SB(sb)->s_flags & UFS_CG_SUN) ? \
238 (((((bno) * uspi->s_nspf % uspi->s_spc) % \
239 uspi->s_nsect) * \
240 uspi->s_nrpos) / uspi->s_nsect) \
241 : \
231 ((((bno) * uspi->s_nspf % uspi->s_spc / uspi->s_nsect \ 242 ((((bno) * uspi->s_nspf % uspi->s_spc / uspi->s_nsect \
232 * uspi->s_trackskew + (bno) * uspi->s_nspf % uspi->s_spc \ 243 * uspi->s_trackskew + (bno) * uspi->s_nspf % uspi->s_spc \
233 % uspi->s_nsect * uspi->s_interleave) % uspi->s_nsect \ 244 % uspi->s_nsect * uspi->s_interleave) % uspi->s_nsect \
234 * uspi->s_nrpos) / uspi->s_npsect) 245 * uspi->s_nrpos) / uspi->s_npsect))
235 246
236/* 247/*
237 * The following macros optimize certain frequently calculated 248 * The following macros optimize certain frequently calculated
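
The new UFS_CG_SUN branch of ufs_cbtorpos() drops the interleave and track-skew terms, matching how Solaris computes rotational positions. The same arithmetic rewritten as a function for readability (a sketch; the real macro picks up sb and uspi from the enclosing scope):

static inline unsigned
ufs_cbtorpos_sketch(struct ufs_sb_private_info *uspi, unsigned s_flags,
		    unsigned bno)
{
	unsigned secno = bno * uspi->s_nspf % uspi->s_spc;

	if (s_flags & UFS_CG_SUN)	/* Sun: no interleave/skew terms */
		return (secno % uspi->s_nsect) * uspi->s_nrpos /
			uspi->s_nsect;

	return ((secno / uspi->s_nsect * uspi->s_trackskew +
		 secno % uspi->s_nsect * uspi->s_interleave) %
		uspi->s_nsect * uspi->s_nrpos) / uspi->s_npsect;
}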
diff --git a/fs/xattr.c b/fs/xattr.c
index 6d4f6d3449fb..46f87e828b48 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -615,12 +615,11 @@ ssize_t
615generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) 615generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size)
616{ 616{
617 struct xattr_handler *handler; 617 struct xattr_handler *handler;
618 struct inode *inode = dentry->d_inode;
619 618
620 handler = xattr_resolve_name(inode->i_sb->s_xattr, &name); 619 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
621 if (!handler) 620 if (!handler)
622 return -EOPNOTSUPP; 621 return -EOPNOTSUPP;
623 return handler->get(inode, name, buffer, size); 622 return handler->get(dentry, name, buffer, size, handler->flags);
624} 623}
625 624
626/* 625/*
@@ -630,18 +629,20 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s
630ssize_t 629ssize_t
631generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) 630generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
632{ 631{
633 struct inode *inode = dentry->d_inode; 632 struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
634 struct xattr_handler *handler, **handlers = inode->i_sb->s_xattr;
635 unsigned int size = 0; 633 unsigned int size = 0;
636 634
637 if (!buffer) { 635 if (!buffer) {
638 for_each_xattr_handler(handlers, handler) 636 for_each_xattr_handler(handlers, handler) {
639 size += handler->list(inode, NULL, 0, NULL, 0); 637 size += handler->list(dentry, NULL, 0, NULL, 0,
638 handler->flags);
639 }
640 } else { 640 } else {
641 char *buf = buffer; 641 char *buf = buffer;
642 642
643 for_each_xattr_handler(handlers, handler) { 643 for_each_xattr_handler(handlers, handler) {
644 size = handler->list(inode, buf, buffer_size, NULL, 0); 644 size = handler->list(dentry, buf, buffer_size,
645 NULL, 0, handler->flags);
645 if (size > buffer_size) 646 if (size > buffer_size)
646 return -ERANGE; 647 return -ERANGE;
647 buf += size; 648 buf += size;
@@ -659,14 +660,13 @@ int
659generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) 660generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags)
660{ 661{
661 struct xattr_handler *handler; 662 struct xattr_handler *handler;
662 struct inode *inode = dentry->d_inode;
663 663
664 if (size == 0) 664 if (size == 0)
665 value = ""; /* empty EA, do not remove */ 665 value = ""; /* empty EA, do not remove */
666 handler = xattr_resolve_name(inode->i_sb->s_xattr, &name); 666 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
667 if (!handler) 667 if (!handler)
668 return -EOPNOTSUPP; 668 return -EOPNOTSUPP;
669 return handler->set(inode, name, value, size, flags); 669 return handler->set(dentry, name, value, size, 0, handler->flags);
670} 670}
671 671
672/* 672/*
@@ -677,12 +677,12 @@ int
677generic_removexattr(struct dentry *dentry, const char *name) 677generic_removexattr(struct dentry *dentry, const char *name)
678{ 678{
679 struct xattr_handler *handler; 679 struct xattr_handler *handler;
680 struct inode *inode = dentry->d_inode;
681 680
682 handler = xattr_resolve_name(inode->i_sb->s_xattr, &name); 681 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
683 if (!handler) 682 if (!handler)
684 return -EOPNOTSUPP; 683 return -EOPNOTSUPP;
685 return handler->set(inode, name, NULL, 0, XATTR_REPLACE); 684 return handler->set(dentry, name, NULL, 0,
685 XATTR_REPLACE, handler->flags);
686} 686}
687 687
688EXPORT_SYMBOL(generic_getxattr); 688EXPORT_SYMBOL(generic_getxattr);
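
Under these prototypes an xattr handler no longer digs the inode out itself; it receives the dentry plus its own handler->flags, which lets one callback serve several related attributes. A hypothetical handler under the new signature (the "foo" names are invented for illustration):

static int foo_xattr_get(struct dentry *dentry, const char *name,
			 void *buffer, size_t size, int handler_flags)
{
	struct inode *inode = dentry->d_inode;

	/* handler_flags carries per-handler data, e.g. an ACL type */
	if (!inode)
		return -EINVAL;
	return foo_do_get(inode, name, buffer, size, handler_flags);
}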
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index c6ad7c7e3ee9..8d5a506c82eb 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -6,9 +6,9 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/slab.h>
10#include <linux/fs.h> 9#include <linux/fs.h>
11#include <linux/posix_acl_xattr.h> 10#include <linux/posix_acl_xattr.h>
11#include <linux/gfp.h>
12 12
13 13
14/* 14/*
@@ -36,7 +36,7 @@ posix_acl_from_xattr(const void *value, size_t size)
36 if (count == 0) 36 if (count == 0)
37 return NULL; 37 return NULL;
38 38
39 acl = posix_acl_alloc(count, GFP_KERNEL); 39 acl = posix_acl_alloc(count, GFP_NOFS);
40 if (!acl) 40 if (!acl)
41 return ERR_PTR(-ENOMEM); 41 return ERR_PTR(-ENOMEM);
42 acl_e = acl->a_entries; 42 acl_e = acl->a_entries;
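
The GFP_KERNEL to GFP_NOFS switch matters because posix_acl_from_xattr() can be reached with filesystem locks or a transaction held; a GFP_KERNEL allocation could recurse into filesystem writeback under memory pressure and deadlock. A sketch of the rule for such contexts (illustrative values):

/* When allocating under fs locks, forbid reclaim from re-entering
 * the filesystem: use GFP_NOFS rather than GFP_KERNEL. */
static struct posix_acl *acl_alloc_sketch(int count)
{
	return posix_acl_alloc(count, GFP_NOFS);
}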
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7a59daed1782..b4769e40e8bc 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,7 +16,7 @@
16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17# 17#
18 18
19EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 -funsigned-char 19EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6
20 20
21XFS_LINUX := linux-2.6 21XFS_LINUX := linux-2.6
22 22
@@ -26,6 +26,8 @@ endif
26 26
27obj-$(CONFIG_XFS_FS) += xfs.o 27obj-$(CONFIG_XFS_FS) += xfs.o
28 28
29xfs-y += linux-2.6/xfs_trace.o
30
29xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \ 31xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \
30 xfs_dquot.o \ 32 xfs_dquot.o \
31 xfs_dquot_item.o \ 33 xfs_dquot_item.o \
@@ -90,8 +92,7 @@ xfs-y += xfs_alloc.o \
90 xfs_rw.o \ 92 xfs_rw.o \
91 xfs_dmops.o 93 xfs_dmops.o
92 94
93xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o \ 95xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o
94 xfs_dir2_trace.o
95 96
96# Objects in linux/ 97# Objects in linux/
97xfs-y += $(addprefix $(XFS_LINUX)/, \ 98xfs-y += $(addprefix $(XFS_LINUX)/, \
@@ -104,7 +105,6 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
104 xfs_globals.o \ 105 xfs_globals.o \
105 xfs_ioctl.o \ 106 xfs_ioctl.o \
106 xfs_iops.o \ 107 xfs_iops.o \
107 xfs_lrw.o \
108 xfs_super.o \ 108 xfs_super.o \
109 xfs_sync.o \ 109 xfs_sync.o \
110 xfs_xattr.o) 110 xfs_xattr.o)
@@ -113,6 +113,3 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
113xfs-y += $(addprefix support/, \ 113xfs-y += $(addprefix support/, \
114 debug.o \ 114 debug.o \
115 uuid.o) 115 uuid.o)
116
117xfs-$(CONFIG_XFS_TRACE) += support/ktrace.o
118
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 2d3f90afe5f1..666c9db48eb6 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -16,16 +16,33 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/vmalloc.h>
20#include <linux/highmem.h> 19#include <linux/highmem.h>
20#include <linux/slab.h>
21#include <linux/swap.h> 21#include <linux/swap.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
24#include "time.h" 24#include "time.h"
25#include "kmem.h" 25#include "kmem.h"
26 26
27#define MAX_VMALLOCS 6 27/*
28#define MAX_SLAB_SIZE 0x20000 28 * Greedy allocation. May fail and may return vmalloced memory.
29 *
30 * Must be freed using kmem_free_large.
31 */
32void *
33kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
34{
35 void *ptr;
36 size_t kmsize = maxsize;
37
38 while (!(ptr = kmem_zalloc_large(kmsize))) {
39 if ((kmsize >>= 1) <= minsize)
40 kmsize = minsize;
41 }
42 if (ptr)
43 *size = kmsize;
44 return ptr;
45}
29 46
30void * 47void *
31kmem_alloc(size_t size, unsigned int __nocast flags) 48kmem_alloc(size_t size, unsigned int __nocast flags)
@@ -34,19 +51,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
34 gfp_t lflags = kmem_flags_convert(flags); 51 gfp_t lflags = kmem_flags_convert(flags);
35 void *ptr; 52 void *ptr;
36 53
37#ifdef DEBUG
38 if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) {
39 printk(KERN_WARNING "Large %s attempt, size=%ld\n",
40 __func__, (long)size);
41 dump_stack();
42 }
43#endif
44
45 do { 54 do {
46 if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS) 55 ptr = kmalloc(size, lflags);
47 ptr = kmalloc(size, lflags);
48 else
49 ptr = __vmalloc(size, lflags, PAGE_KERNEL);
50 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 56 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
51 return ptr; 57 return ptr;
52 if (!(++retries % 100)) 58 if (!(++retries % 100))
@@ -68,27 +74,6 @@ kmem_zalloc(size_t size, unsigned int __nocast flags)
68 return ptr; 74 return ptr;
69} 75}
70 76
71void *
72kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize,
73 unsigned int __nocast flags)
74{
75 void *ptr;
76 size_t kmsize = maxsize;
77 unsigned int kmflags = (flags & ~KM_SLEEP) | KM_NOSLEEP;
78
79 while (!(ptr = kmem_zalloc(kmsize, kmflags))) {
80 if ((kmsize <= minsize) && (flags & KM_NOSLEEP))
81 break;
82 if ((kmsize >>= 1) <= minsize) {
83 kmsize = minsize;
84 kmflags = flags;
85 }
86 }
87 if (ptr)
88 *size = kmsize;
89 return ptr;
90}
91
92void 77void
93kmem_free(const void *ptr) 78kmem_free(const void *ptr)
94{ 79{
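
As rewritten above, kmem_zalloc_greedy() always sleeps and always draws from kmem_zalloc_large(), halving the request until it reaches minsize and then retrying at minsize; the loop can only exit with a successful allocation, so the trailing NULL check is defensive. An illustrative caller (sizes are examples and use_buffer() is a made-up consumer):

static void greedy_example(void)
{
	size_t size;
	void *buf;

	/* ask for up to 16 pages, accept as little as one */
	buf = kmem_zalloc_greedy(&size, PAGE_SIZE, 16 * PAGE_SIZE);

	use_buffer(buf, size);		/* size holds the length obtained */
	kmem_free_large(buf);		/* vmalloc-backed: not kmem_free() */
}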
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 179cbd630f69..f7c8f7a9ea6d 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -21,6 +21,7 @@
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/mm.h> 23#include <linux/mm.h>
24#include <linux/vmalloc.h>
24 25
25/* 26/*
26 * General memory allocation interfaces 27 * General memory allocation interfaces
@@ -30,7 +31,6 @@
30#define KM_NOSLEEP 0x0002u 31#define KM_NOSLEEP 0x0002u
31#define KM_NOFS 0x0004u 32#define KM_NOFS 0x0004u
32#define KM_MAYFAIL 0x0008u 33#define KM_MAYFAIL 0x0008u
33#define KM_LARGE 0x0010u
34 34
35/* 35/*
36 * We use a special process flag to avoid recursive callbacks into 36 * We use a special process flag to avoid recursive callbacks into
@@ -42,7 +42,7 @@ kmem_flags_convert(unsigned int __nocast flags)
42{ 42{
43 gfp_t lflags; 43 gfp_t lflags;
44 44
45 BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_LARGE)); 45 BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL));
46 46
47 if (flags & KM_NOSLEEP) { 47 if (flags & KM_NOSLEEP) {
48 lflags = GFP_ATOMIC | __GFP_NOWARN; 48 lflags = GFP_ATOMIC | __GFP_NOWARN;
@@ -56,10 +56,25 @@ kmem_flags_convert(unsigned int __nocast flags)
56 56
57extern void *kmem_alloc(size_t, unsigned int __nocast); 57extern void *kmem_alloc(size_t, unsigned int __nocast);
58extern void *kmem_zalloc(size_t, unsigned int __nocast); 58extern void *kmem_zalloc(size_t, unsigned int __nocast);
59extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast);
60extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast); 59extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast);
61extern void kmem_free(const void *); 60extern void kmem_free(const void *);
62 61
62static inline void *kmem_zalloc_large(size_t size)
63{
64 void *ptr;
65
66 ptr = vmalloc(size);
67 if (ptr)
68 memset(ptr, 0, size);
69 return ptr;
70}
71static inline void kmem_free_large(void *ptr)
72{
73 vfree(ptr);
74}
75
76extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
77
63/* 78/*
64 * Zone interfaces 79 * Zone interfaces
65 */ 80 */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index b23a54506446..a7bc925c4d60 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -21,6 +21,8 @@
21#include "xfs_bmap_btree.h" 21#include "xfs_bmap_btree.h"
22#include "xfs_inode.h" 22#include "xfs_inode.h"
23#include "xfs_vnodeops.h" 23#include "xfs_vnodeops.h"
24#include "xfs_trace.h"
25#include <linux/slab.h>
24#include <linux/xattr.h> 26#include <linux/xattr.h>
25#include <linux/posix_acl_xattr.h> 27#include <linux/posix_acl_xattr.h>
26 28
@@ -105,7 +107,7 @@ xfs_get_acl(struct inode *inode, int type)
105 struct posix_acl *acl; 107 struct posix_acl *acl;
106 struct xfs_acl *xfs_acl; 108 struct xfs_acl *xfs_acl;
107 int len = sizeof(struct xfs_acl); 109 int len = sizeof(struct xfs_acl);
108 char *ea_name; 110 unsigned char *ea_name;
109 int error; 111 int error;
110 112
111 acl = get_cached_acl(inode, type); 113 acl = get_cached_acl(inode, type);
@@ -132,7 +134,8 @@ xfs_get_acl(struct inode *inode, int type)
132 if (!xfs_acl) 134 if (!xfs_acl)
133 return ERR_PTR(-ENOMEM); 135 return ERR_PTR(-ENOMEM);
134 136
135 error = -xfs_attr_get(ip, ea_name, (char *)xfs_acl, &len, ATTR_ROOT); 137 error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
138 &len, ATTR_ROOT);
136 if (error) { 139 if (error) {
137 /* 140 /*
138 * If the attribute doesn't exist make sure we have a negative 141 * If the attribute doesn't exist make sure we have a negative
@@ -161,7 +164,7 @@ STATIC int
161xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) 164xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
162{ 165{
163 struct xfs_inode *ip = XFS_I(inode); 166 struct xfs_inode *ip = XFS_I(inode);
164 char *ea_name; 167 unsigned char *ea_name;
165 int error; 168 int error;
166 169
167 if (S_ISLNK(inode->i_mode)) 170 if (S_ISLNK(inode->i_mode))
@@ -193,7 +196,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
193 (sizeof(struct xfs_acl_entry) * 196 (sizeof(struct xfs_acl_entry) *
194 (XFS_ACL_MAX_ENTRIES - acl->a_count)); 197 (XFS_ACL_MAX_ENTRIES - acl->a_count));
195 198
196 error = -xfs_attr_set(ip, ea_name, (char *)xfs_acl, 199 error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
197 len, ATTR_ROOT); 200 len, ATTR_ROOT);
198 201
199 kfree(xfs_acl); 202 kfree(xfs_acl);
@@ -250,8 +253,9 @@ xfs_set_mode(struct inode *inode, mode_t mode)
250 if (mode != inode->i_mode) { 253 if (mode != inode->i_mode) {
251 struct iattr iattr; 254 struct iattr iattr;
252 255
253 iattr.ia_valid = ATTR_MODE; 256 iattr.ia_valid = ATTR_MODE | ATTR_CTIME;
254 iattr.ia_mode = mode; 257 iattr.ia_mode = mode;
258 iattr.ia_ctime = current_fs_time(inode->i_sb);
255 259
256 error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL); 260 error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
257 } 261 }
@@ -260,7 +264,7 @@ xfs_set_mode(struct inode *inode, mode_t mode)
260} 264}
261 265
262static int 266static int
263xfs_acl_exists(struct inode *inode, char *name) 267xfs_acl_exists(struct inode *inode, unsigned char *name)
264{ 268{
265 int len = sizeof(struct xfs_acl); 269 int len = sizeof(struct xfs_acl);
266 270
@@ -353,37 +357,14 @@ xfs_acl_chmod(struct inode *inode)
353 return error; 357 return error;
354} 358}
355 359
356/*
357 * System xattr handlers.
358 *
359 * Currently Posix ACLs are the only system namespace extended attribute
360 * handlers supported by XFS, so we just implement the handlers here.
361 * If we ever support other system extended attributes this will need
362 * some refactoring.
363 */
364
365static int 360static int
366xfs_decode_acl(const char *name) 361xfs_xattr_acl_get(struct dentry *dentry, const char *name,
367{ 362 void *value, size_t size, int type)
368 if (strcmp(name, "posix_acl_access") == 0)
369 return ACL_TYPE_ACCESS;
370 else if (strcmp(name, "posix_acl_default") == 0)
371 return ACL_TYPE_DEFAULT;
372 return -EINVAL;
373}
374
375static int
376xfs_xattr_system_get(struct inode *inode, const char *name,
377 void *value, size_t size)
378{ 363{
379 struct posix_acl *acl; 364 struct posix_acl *acl;
380 int type, error; 365 int error;
381
382 type = xfs_decode_acl(name);
383 if (type < 0)
384 return type;
385 366
386 acl = xfs_get_acl(inode, type); 367 acl = xfs_get_acl(dentry->d_inode, type);
387 if (IS_ERR(acl)) 368 if (IS_ERR(acl))
388 return PTR_ERR(acl); 369 return PTR_ERR(acl);
389 if (acl == NULL) 370 if (acl == NULL)
@@ -396,15 +377,13 @@ xfs_xattr_system_get(struct inode *inode, const char *name,
396} 377}
397 378
398static int 379static int
399xfs_xattr_system_set(struct inode *inode, const char *name, 380xfs_xattr_acl_set(struct dentry *dentry, const char *name,
400 const void *value, size_t size, int flags) 381 const void *value, size_t size, int flags, int type)
401{ 382{
383 struct inode *inode = dentry->d_inode;
402 struct posix_acl *acl = NULL; 384 struct posix_acl *acl = NULL;
403 int error = 0, type; 385 int error = 0;
404 386
405 type = xfs_decode_acl(name);
406 if (type < 0)
407 return type;
408 if (flags & XATTR_CREATE) 387 if (flags & XATTR_CREATE)
409 return -EINVAL; 388 return -EINVAL;
410 if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) 389 if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
@@ -461,8 +440,16 @@ xfs_xattr_system_set(struct inode *inode, const char *name,
461 return error; 440 return error;
462} 441}
463 442
464struct xattr_handler xfs_xattr_system_handler = { 443struct xattr_handler xfs_xattr_acl_access_handler = {
465 .prefix = XATTR_SYSTEM_PREFIX, 444 .prefix = POSIX_ACL_XATTR_ACCESS,
466 .get = xfs_xattr_system_get, 445 .flags = ACL_TYPE_ACCESS,
467 .set = xfs_xattr_system_set, 446 .get = xfs_xattr_acl_get,
447 .set = xfs_xattr_acl_set,
448};
449
450struct xattr_handler xfs_xattr_acl_default_handler = {
451 .prefix = POSIX_ACL_XATTR_DEFAULT,
452 .flags = ACL_TYPE_DEFAULT,
453 .get = xfs_xattr_acl_get,
454 .set = xfs_xattr_acl_set,
468}; 455};
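
With the generic code forwarding handler->flags, the two handlers above share one get/set pair and the name-based xfs_decode_acl() becomes unnecessary. How the dispatch reaches them, in sketch form (simplified from generic_getxattr() in fs/xattr.c; not new code in this patch):

/* The handler is matched on its prefix and its .flags field (the ACL
 * type) arrives at xfs_xattr_acl_get() as the final argument. */
static int acl_get_dispatch_sketch(struct dentry *dentry, const char *name,
				   void *buf, size_t size)
{
	struct xattr_handler *h;

	h = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
	if (!h)
		return -EOPNOTSUPP;
	return h->get(dentry, name, buf, size, h->flags);
}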
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c2e30eea74dc..0f8b9968a803 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -38,6 +38,9 @@
38#include "xfs_rw.h" 38#include "xfs_rw.h"
39#include "xfs_iomap.h" 39#include "xfs_iomap.h"
40#include "xfs_vnodeops.h" 40#include "xfs_vnodeops.h"
41#include "xfs_trace.h"
42#include "xfs_bmap.h"
43#include <linux/gfp.h>
41#include <linux/mpage.h> 44#include <linux/mpage.h>
42#include <linux/pagevec.h> 45#include <linux/pagevec.h>
43#include <linux/writeback.h> 46#include <linux/writeback.h>
@@ -76,7 +79,7 @@ xfs_ioend_wake(
76 wake_up(to_ioend_wq(ip)); 79 wake_up(to_ioend_wq(ip));
77} 80}
78 81
79STATIC void 82void
80xfs_count_page_state( 83xfs_count_page_state(
81 struct page *page, 84 struct page *page,
82 int *delalloc, 85 int *delalloc,
@@ -98,48 +101,6 @@ xfs_count_page_state(
98 } while ((bh = bh->b_this_page) != head); 101 } while ((bh = bh->b_this_page) != head);
99} 102}
100 103
101#if defined(XFS_RW_TRACE)
102void
103xfs_page_trace(
104 int tag,
105 struct inode *inode,
106 struct page *page,
107 unsigned long pgoff)
108{
109 xfs_inode_t *ip;
110 loff_t isize = i_size_read(inode);
111 loff_t offset = page_offset(page);
112 int delalloc = -1, unmapped = -1, unwritten = -1;
113
114 if (page_has_buffers(page))
115 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
116
117 ip = XFS_I(inode);
118 if (!ip->i_rwtrace)
119 return;
120
121 ktrace_enter(ip->i_rwtrace,
122 (void *)((unsigned long)tag),
123 (void *)ip,
124 (void *)inode,
125 (void *)page,
126 (void *)pgoff,
127 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
128 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
129 (void *)((unsigned long)((isize >> 32) & 0xffffffff)),
130 (void *)((unsigned long)(isize & 0xffffffff)),
131 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
132 (void *)((unsigned long)(offset & 0xffffffff)),
133 (void *)((unsigned long)delalloc),
134 (void *)((unsigned long)unmapped),
135 (void *)((unsigned long)unwritten),
136 (void *)((unsigned long)current_pid()),
137 (void *)NULL);
138}
139#else
140#define xfs_page_trace(tag, inode, page, pgoff)
141#endif
142
143STATIC struct block_device * 104STATIC struct block_device *
144xfs_find_bdev_for_inode( 105xfs_find_bdev_for_inode(
145 struct xfs_inode *ip) 106 struct xfs_inode *ip)
@@ -204,14 +165,17 @@ xfs_ioend_new_eof(
204} 165}
205 166
206/* 167/*
207 * Update on-disk file size now that data has been written to disk. 168 * Update on-disk file size now that data has been written to disk. The
208 * The current in-memory file size is i_size. If a write is beyond 169 * current in-memory file size is i_size. If a write is beyond eof i_new_size
209 * eof i_new_size will be the intended file size until i_size is 170 * will be the intended file size until i_size is updated. If this write does
210 * updated. If this write does not extend all the way to the valid 171 * not extend all the way to the valid file size then restrict this update to
211 * file size then restrict this update to the end of the write. 172 * the end of the write.
173 *
174 * This function does not block as blocking on the inode lock in IO completion
175 * can lead to IO completion order dependency deadlocks. If it can't get the
176 * inode ilock it will return EAGAIN. Callers must handle this.
212 */ 177 */
213 178STATIC int
214STATIC void
215xfs_setfilesize( 179xfs_setfilesize(
216 xfs_ioend_t *ioend) 180 xfs_ioend_t *ioend)
217{ 181{
@@ -222,85 +186,19 @@ xfs_setfilesize(
222 ASSERT(ioend->io_type != IOMAP_READ); 186 ASSERT(ioend->io_type != IOMAP_READ);
223 187
224 if (unlikely(ioend->io_error)) 188 if (unlikely(ioend->io_error))
225 return; 189 return 0;
190
191 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
192 return EAGAIN;
226 193
227 xfs_ilock(ip, XFS_ILOCK_EXCL);
228 isize = xfs_ioend_new_eof(ioend); 194 isize = xfs_ioend_new_eof(ioend);
229 if (isize) { 195 if (isize) {
230 ip->i_d.di_size = isize; 196 ip->i_d.di_size = isize;
231 xfs_mark_inode_dirty_sync(ip); 197 xfs_mark_inode_dirty(ip);
232 } 198 }
233 199
234 xfs_iunlock(ip, XFS_ILOCK_EXCL); 200 xfs_iunlock(ip, XFS_ILOCK_EXCL);
235} 201 return 0;
236
237/*
238 * Buffered IO write completion for delayed allocate extents.
239 */
240STATIC void
241xfs_end_bio_delalloc(
242 struct work_struct *work)
243{
244 xfs_ioend_t *ioend =
245 container_of(work, xfs_ioend_t, io_work);
246
247 xfs_setfilesize(ioend);
248 xfs_destroy_ioend(ioend);
249}
250
251/*
252 * Buffered IO write completion for regular, written extents.
253 */
254STATIC void
255xfs_end_bio_written(
256 struct work_struct *work)
257{
258 xfs_ioend_t *ioend =
259 container_of(work, xfs_ioend_t, io_work);
260
261 xfs_setfilesize(ioend);
262 xfs_destroy_ioend(ioend);
263}
264
265/*
266 * IO write completion for unwritten extents.
267 *
268 * Issue transactions to convert a buffer range from unwritten
269 * to written extents.
270 */
271STATIC void
272xfs_end_bio_unwritten(
273 struct work_struct *work)
274{
275 xfs_ioend_t *ioend =
276 container_of(work, xfs_ioend_t, io_work);
277 struct xfs_inode *ip = XFS_I(ioend->io_inode);
278 xfs_off_t offset = ioend->io_offset;
279 size_t size = ioend->io_size;
280
281 if (likely(!ioend->io_error)) {
282 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
283 int error;
284 error = xfs_iomap_write_unwritten(ip, offset, size);
285 if (error)
286 ioend->io_error = error;
287 }
288 xfs_setfilesize(ioend);
289 }
290 xfs_destroy_ioend(ioend);
291}
292
293/*
294 * IO read completion for regular, written extents.
295 */
296STATIC void
297xfs_end_bio_read(
298 struct work_struct *work)
299{
300 xfs_ioend_t *ioend =
301 container_of(work, xfs_ioend_t, io_work);
302
303 xfs_destroy_ioend(ioend);
304} 202}
305 203
306/* 204/*
@@ -314,10 +212,10 @@ xfs_finish_ioend(
314 int wait) 212 int wait)
315{ 213{
316 if (atomic_dec_and_test(&ioend->io_remaining)) { 214 if (atomic_dec_and_test(&ioend->io_remaining)) {
317 struct workqueue_struct *wq = xfsdatad_workqueue; 215 struct workqueue_struct *wq;
318 if (ioend->io_work.func == xfs_end_bio_unwritten)
319 wq = xfsconvertd_workqueue;
320 216
217 wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
218 xfsconvertd_workqueue : xfsdatad_workqueue;
321 queue_work(wq, &ioend->io_work); 219 queue_work(wq, &ioend->io_work);
322 if (wait) 220 if (wait)
323 flush_workqueue(wq); 221 flush_workqueue(wq);
@@ -325,6 +223,53 @@ xfs_finish_ioend(
325} 223}
326 224
327/* 225/*
226 * IO write completion.
227 */
228STATIC void
229xfs_end_io(
230 struct work_struct *work)
231{
232 xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work);
233 struct xfs_inode *ip = XFS_I(ioend->io_inode);
234 int error = 0;
235
236 /*
237 * For unwritten extents we need to issue transactions to convert a
238 * range to normal written extents after the data I/O has finished.
239 */
240 if (ioend->io_type == IOMAP_UNWRITTEN &&
241 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
242
243 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
244 ioend->io_size);
245 if (error)
246 ioend->io_error = error;
247 }
248
249 /*
250 * We might have to update the on-disk file size after extending
251 * writes.
252 */
253 if (ioend->io_type != IOMAP_READ) {
254 error = xfs_setfilesize(ioend);
255 ASSERT(!error || error == EAGAIN);
256 }
257
258 /*
259 * If we didn't complete processing of the ioend, requeue it to the
260 * tail of the workqueue for another attempt later. Otherwise destroy
261 * it.
262 */
263 if (error == EAGAIN) {
264 atomic_inc(&ioend->io_remaining);
265 xfs_finish_ioend(ioend, 0);
266 /* ensure we don't spin on blocked ioends */
267 delay(1);
268 } else
269 xfs_destroy_ioend(ioend);
270}
271
272/*
328 * Allocate and initialise an IO completion structure. 273 * Allocate and initialise an IO completion structure.
329 * We need to track unwritten extent write completion here initially. 274 * We need to track unwritten extent write completion here initially.
330 * We'll need to extend this for updating the ondisk inode size later 275 * We'll need to extend this for updating the ondisk inode size later
@@ -355,15 +300,7 @@ xfs_alloc_ioend(
355 ioend->io_offset = 0; 300 ioend->io_offset = 0;
356 ioend->io_size = 0; 301 ioend->io_size = 0;
357 302
358 if (type == IOMAP_UNWRITTEN) 303 INIT_WORK(&ioend->io_work, xfs_end_io);
359 INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten);
360 else if (type == IOMAP_DELAY)
361 INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc);
362 else if (type == IOMAP_READ)
363 INIT_WORK(&ioend->io_work, xfs_end_bio_read);
364 else
365 INIT_WORK(&ioend->io_work, xfs_end_bio_written);
366
367 return ioend; 304 return ioend;
368} 305}
369 306
@@ -380,7 +317,7 @@ xfs_map_blocks(
380 return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps); 317 return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps);
381} 318}
382 319
383STATIC_INLINE int 320STATIC int
384xfs_iomap_valid( 321xfs_iomap_valid(
385 xfs_iomap_t *iomapp, 322 xfs_iomap_t *iomapp,
386 loff_t offset) 323 loff_t offset)
@@ -412,8 +349,9 @@ xfs_end_bio(
412 349
413STATIC void 350STATIC void
414xfs_submit_ioend_bio( 351xfs_submit_ioend_bio(
415 xfs_ioend_t *ioend, 352 struct writeback_control *wbc,
416 struct bio *bio) 353 xfs_ioend_t *ioend,
354 struct bio *bio)
417{ 355{
418 atomic_inc(&ioend->io_remaining); 356 atomic_inc(&ioend->io_remaining);
419 bio->bi_private = ioend; 357 bio->bi_private = ioend;
@@ -424,9 +362,10 @@ xfs_submit_ioend_bio(
424 * but don't update the inode size until I/O completion. 362 * but don't update the inode size until I/O completion.
425 */ 363 */
426 if (xfs_ioend_new_eof(ioend)) 364 if (xfs_ioend_new_eof(ioend))
427 xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode)); 365 xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
428 366
429 submit_bio(WRITE, bio); 367 submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
368 WRITE_SYNC_PLUG : WRITE, bio);
430 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); 369 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
431 bio_put(bio); 370 bio_put(bio);
432} 371}
@@ -505,6 +444,7 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
505 */ 444 */
506STATIC void 445STATIC void
507xfs_submit_ioend( 446xfs_submit_ioend(
447 struct writeback_control *wbc,
508 xfs_ioend_t *ioend) 448 xfs_ioend_t *ioend)
509{ 449{
510 xfs_ioend_t *head = ioend; 450 xfs_ioend_t *head = ioend;
@@ -533,19 +473,19 @@ xfs_submit_ioend(
533 retry: 473 retry:
534 bio = xfs_alloc_ioend_bio(bh); 474 bio = xfs_alloc_ioend_bio(bh);
535 } else if (bh->b_blocknr != lastblock + 1) { 475 } else if (bh->b_blocknr != lastblock + 1) {
536 xfs_submit_ioend_bio(ioend, bio); 476 xfs_submit_ioend_bio(wbc, ioend, bio);
537 goto retry; 477 goto retry;
538 } 478 }
539 479
540 if (bio_add_buffer(bio, bh) != bh->b_size) { 480 if (bio_add_buffer(bio, bh) != bh->b_size) {
541 xfs_submit_ioend_bio(ioend, bio); 481 xfs_submit_ioend_bio(wbc, ioend, bio);
542 goto retry; 482 goto retry;
543 } 483 }
544 484
545 lastblock = bh->b_blocknr; 485 lastblock = bh->b_blocknr;
546 } 486 }
547 if (bio) 487 if (bio)
548 xfs_submit_ioend_bio(ioend, bio); 488 xfs_submit_ioend_bio(wbc, ioend, bio);
549 xfs_finish_ioend(ioend, 0); 489 xfs_finish_ioend(ioend, 0);
550 } while ((ioend = next) != NULL); 490 } while ((ioend = next) != NULL);
551} 491}
@@ -904,16 +844,9 @@ xfs_convert_page(
904 844
905 if (startio) { 845 if (startio) {
906 if (count) { 846 if (count) {
907 struct backing_dev_info *bdi;
908
909 bdi = inode->i_mapping->backing_dev_info;
910 wbc->nr_to_write--; 847 wbc->nr_to_write--;
911 if (bdi_write_congested(bdi)) { 848 if (wbc->nr_to_write <= 0)
912 wbc->encountered_congestion = 1;
913 done = 1;
914 } else if (wbc->nr_to_write <= 0) {
915 done = 1; 849 done = 1;
916 }
917 } 850 }
918 xfs_start_page_writeback(page, !page_dirty, count); 851 xfs_start_page_writeback(page, !page_dirty, count);
919 } 852 }
@@ -962,6 +895,125 @@ xfs_cluster_write(
962 } 895 }
963} 896}
964 897
898STATIC void
899xfs_vm_invalidatepage(
900 struct page *page,
901 unsigned long offset)
902{
903 trace_xfs_invalidatepage(page->mapping->host, page, offset);
904 block_invalidatepage(page, offset);
905}
906
907/*
908 * If the page has delalloc buffers on it, we need to punch them out before we
909 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
910 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
911 * is done on that same region - the delalloc extent is returned when none is
912 * supposed to be there.
913 *
914 * We prevent this by truncating away the delalloc regions on the page before
915 * invalidating it. Because they are delalloc, we can do this without needing a
916 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
917 * truncation without a transaction as there is no space left for block
918 * reservation (typically why we see an ENOSPC in writeback).
919 *
920 * This is not a performance critical path, so for now just do the punching a
921 * buffer head at a time.
922 */
923STATIC void
924xfs_aops_discard_page(
925 struct page *page)
926{
927 struct inode *inode = page->mapping->host;
928 struct xfs_inode *ip = XFS_I(inode);
929 struct buffer_head *bh, *head;
930 loff_t offset = page_offset(page);
931 ssize_t len = 1 << inode->i_blkbits;
932
933 if (!xfs_is_delayed_page(page, IOMAP_DELAY))
934 goto out_invalidate;
935
936 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
937 goto out_invalidate;
938
939 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
940 "page discard on page %p, inode 0x%llx, offset %llu.",
941 page, ip->i_ino, offset);
942
943 xfs_ilock(ip, XFS_ILOCK_EXCL);
944 bh = head = page_buffers(page);
945 do {
946 int done;
947 xfs_fileoff_t offset_fsb;
948 xfs_bmbt_irec_t imap;
949 int nimaps = 1;
950 int error;
951 xfs_fsblock_t firstblock;
952 xfs_bmap_free_t flist;
953
954 if (!buffer_delay(bh))
955 goto next_buffer;
956
957 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
958
959 /*
960 * Map the range first and check that it is a delalloc extent
961 * before trying to unmap the range. Otherwise we will be
962 * trying to remove a real extent (which requires a
963 * transaction) or a hole, which is probably a bad idea...
964 */
965 error = xfs_bmapi(NULL, ip, offset_fsb, 1,
966 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
967 &nimaps, NULL, NULL);
968
969 if (error) {
970 /* something screwed, just bail */
971 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
972 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
973 "page discard failed delalloc mapping lookup.");
974 }
975 break;
976 }
977 if (!nimaps) {
978 /* nothing there */
979 goto next_buffer;
980 }
981 if (imap.br_startblock != DELAYSTARTBLOCK) {
982 /* been converted, ignore */
983 goto next_buffer;
984 }
985 WARN_ON(imap.br_blockcount == 0);
986
987 /*
988 * Note: while we initialise the firstblock/flist pair, they
989 * should never be used because blocks should never be
990 * allocated or freed for a delalloc extent, and hence we don't
991 * need to cancel or finish them after the xfs_bunmapi() call.
992 */
993 xfs_bmap_init(&flist, &firstblock);
994 error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
995 &flist, NULL, &done);
996
997 ASSERT(!flist.xbf_count && !flist.xbf_first);
998 if (error) {
999 /* something screwed, just bail */
1000 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1001 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
1002 "page discard unable to remove delalloc mapping.");
1003 }
1004 break;
1005 }
1006next_buffer:
1007 offset += len;
1008
1009 } while ((bh = bh->b_this_page) != head);
1010
1011 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1012out_invalidate:
1013 xfs_vm_invalidatepage(page, 0);
1014 return;
1015}
1016
965/* 1017/*
966 * Calling this without startio set means we are being asked to make a dirty 1018 * Calling this without startio set means we are being asked to make a dirty
967 * page ready for freeing its buffers. When called with startio set then 1019 * page ready for freeing its buffers. When called with startio set then
@@ -1198,7 +1250,7 @@ xfs_page_state_convert(
1198 } 1250 }
1199 1251
1200 if (iohead) 1252 if (iohead)
1201 xfs_submit_ioend(iohead); 1253 xfs_submit_ioend(wbc, iohead);
1202 1254
1203 return page_dirty; 1255 return page_dirty;
1204 1256
@@ -1213,7 +1265,7 @@ error:
1213 */ 1265 */
1214 if (err != -EAGAIN) { 1266 if (err != -EAGAIN) {
1215 if (!unmapped) 1267 if (!unmapped)
1216 block_invalidatepage(page, 0); 1268 xfs_aops_discard_page(page);
1217 ClearPageUptodate(page); 1269 ClearPageUptodate(page);
1218 } 1270 }
1219 return err; 1271 return err;
@@ -1249,7 +1301,7 @@ xfs_vm_writepage(
1249 int delalloc, unmapped, unwritten; 1301 int delalloc, unmapped, unwritten;
1250 struct inode *inode = page->mapping->host; 1302 struct inode *inode = page->mapping->host;
1251 1303
1252 xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0); 1304 trace_xfs_writepage(inode, page, 0);
1253 1305
1254 /* 1306 /*
1255 * We need a transaction if: 1307 * We need a transaction if:
@@ -1354,7 +1406,7 @@ xfs_vm_releasepage(
1354 .nr_to_write = 1, 1406 .nr_to_write = 1,
1355 }; 1407 };
1356 1408
1357 xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, 0); 1409 trace_xfs_releasepage(inode, page, 0);
1358 1410
1359 if (!page_has_buffers(page)) 1411 if (!page_has_buffers(page))
1360 return 0; 1412 return 0;
@@ -1535,7 +1587,7 @@ xfs_end_io_direct(
1535 * didn't map an unwritten extent so switch its completion 1587 * didn't map an unwritten extent so switch its completion
1536 * handler. 1588 * handler.
1537 */ 1589 */
1538 INIT_WORK(&ioend->io_work, xfs_end_bio_written); 1590 ioend->io_type = IOMAP_NEW;
1539 xfs_finish_ioend(ioend, 0); 1591 xfs_finish_ioend(ioend, 0);
1540 } 1592 }
1541 1593
@@ -1562,19 +1614,13 @@ xfs_vm_direct_IO(
1562 1614
1563 bdev = xfs_find_bdev_for_inode(XFS_I(inode)); 1615 bdev = xfs_find_bdev_for_inode(XFS_I(inode));
1564 1616
1565 if (rw == WRITE) { 1617 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
1566 iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); 1618 IOMAP_UNWRITTEN : IOMAP_READ);
1567 ret = blockdev_direct_IO_own_locking(rw, iocb, inode, 1619
1568 bdev, iov, offset, nr_segs, 1620 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
1569 xfs_get_blocks_direct, 1621 offset, nr_segs,
1570 xfs_end_io_direct); 1622 xfs_get_blocks_direct,
1571 } else { 1623 xfs_end_io_direct);
1572 iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
1573 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
1574 bdev, iov, offset, nr_segs,
1575 xfs_get_blocks_direct,
1576 xfs_end_io_direct);
1577 }
1578 1624
1579 if (unlikely(ret != -EIOCBQUEUED && iocb->private)) 1625 if (unlikely(ret != -EIOCBQUEUED && iocb->private))
1580 xfs_destroy_ioend(iocb->private); 1626 xfs_destroy_ioend(iocb->private);
@@ -1629,16 +1675,6 @@ xfs_vm_readpages(
1629 return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); 1675 return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
1630} 1676}
1631 1677
1632STATIC void
1633xfs_vm_invalidatepage(
1634 struct page *page,
1635 unsigned long offset)
1636{
1637 xfs_page_trace(XFS_INVALIDPAGE_ENTER,
1638 page->mapping->host, page, offset);
1639 block_invalidatepage(page, offset);
1640}
1641
1642const struct address_space_operations xfs_address_space_operations = { 1678const struct address_space_operations xfs_address_space_operations = {
1643 .readpage = xfs_vm_readpage, 1679 .readpage = xfs_vm_readpage,
1644 .readpages = xfs_vm_readpages, 1680 .readpages = xfs_vm_readpages,
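
One thread running through the xfs_aops.c hunks is that bio submission is now writeback-aware: integrity writeback (WB_SYNC_ALL) submits synchronous writes so the elevator does not penalize them. The decision point in isolation (a sketch of the logic used in xfs_submit_ioend_bio() above):

static void submit_ioend_bio_sketch(struct writeback_control *wbc,
				    struct bio *bio)
{
	int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC_PLUG : WRITE;

	submit_bio(rw, bio);	/* bi_end_io/bi_private set by the caller */
	bio_put(bio);		/* submission holds its own reference */
}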
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 221b3e66ceef..4cfc6ea87df8 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -45,4 +45,6 @@ extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
45extern void xfs_ioend_init(void); 45extern void xfs_ioend_init(void);
46extern void xfs_ioend_wait(struct xfs_inode *); 46extern void xfs_ioend_wait(struct xfs_inode *);
47 47
48extern void xfs_count_page_state(struct page *, int *, int *, int *);
49
48#endif /* __XFS_AOPS_H__ */ 50#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 965df1227d64..44c2b0ef9a41 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -18,7 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include <linux/stddef.h> 19#include <linux/stddef.h>
20#include <linux/errno.h> 20#include <linux/errno.h>
21#include <linux/slab.h> 21#include <linux/gfp.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
@@ -33,12 +33,14 @@
33#include <linux/migrate.h> 33#include <linux/migrate.h>
34#include <linux/backing-dev.h> 34#include <linux/backing-dev.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36#include <linux/list_sort.h>
36 37
37#include "xfs_sb.h" 38#include "xfs_sb.h"
38#include "xfs_inum.h" 39#include "xfs_inum.h"
39#include "xfs_ag.h" 40#include "xfs_ag.h"
40#include "xfs_dmapi.h" 41#include "xfs_dmapi.h"
41#include "xfs_mount.h" 42#include "xfs_mount.h"
43#include "xfs_trace.h"
42 44
43static kmem_zone_t *xfs_buf_zone; 45static kmem_zone_t *xfs_buf_zone;
44STATIC int xfsbufd(void *); 46STATIC int xfsbufd(void *);
@@ -53,34 +55,6 @@ static struct workqueue_struct *xfslogd_workqueue;
53struct workqueue_struct *xfsdatad_workqueue; 55struct workqueue_struct *xfsdatad_workqueue;
54struct workqueue_struct *xfsconvertd_workqueue; 56struct workqueue_struct *xfsconvertd_workqueue;
55 57
56#ifdef XFS_BUF_TRACE
57void
58xfs_buf_trace(
59 xfs_buf_t *bp,
60 char *id,
61 void *data,
62 void *ra)
63{
64 ktrace_enter(xfs_buf_trace_buf,
65 bp, id,
66 (void *)(unsigned long)bp->b_flags,
67 (void *)(unsigned long)bp->b_hold.counter,
68 (void *)(unsigned long)bp->b_sema.count,
69 (void *)current,
70 data, ra,
71 (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff),
72 (void *)(unsigned long)(bp->b_file_offset & 0xffffffff),
73 (void *)(unsigned long)bp->b_buffer_length,
74 NULL, NULL, NULL, NULL, NULL);
75}
76ktrace_t *xfs_buf_trace_buf;
77#define XFS_BUF_TRACE_SIZE 4096
78#define XB_TRACE(bp, id, data) \
79 xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0))
80#else
81#define XB_TRACE(bp, id, data) do { } while (0)
82#endif
83
84#ifdef XFS_BUF_LOCK_TRACKING 58#ifdef XFS_BUF_LOCK_TRACKING
85# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) 59# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
86# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1) 60# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1)
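
The deleted XB_TRACE()/ktrace machinery is superseded by standard tracepoints (note the new xfs_trace.h include above). For reference, an event like the trace_xfs_buf_init() calls used later in this file is declared roughly as follows; this is a simplified sketch with an abridged field list, while the real header builds these via DECLARE_EVENT_CLASS/DEFINE_EVENT:

TRACE_EVENT(xfs_buf_init,
	TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip),
	TP_ARGS(bp, caller_ip),
	TP_STRUCT__entry(
		__field(void *, bp)
		__field(unsigned long, caller_ip)
	),
	TP_fast_assign(
		__entry->bp = bp;
		__entry->caller_ip = caller_ip;
	),
	TP_printk("bp %p caller 0x%lx",
		  __entry->bp, __entry->caller_ip)
);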
@@ -103,6 +77,27 @@ ktrace_t *xfs_buf_trace_buf;
103#define xfs_buf_deallocate(bp) \ 77#define xfs_buf_deallocate(bp) \
104 kmem_zone_free(xfs_buf_zone, (bp)); 78 kmem_zone_free(xfs_buf_zone, (bp));
105 79
80static inline int
81xfs_buf_is_vmapped(
82 struct xfs_buf *bp)
83{
84 /*
85 * Return true if the buffer is vmapped.
86 *
87 * The XBF_MAPPED flag is set if the buffer should be mapped, but the
88 * code is clever enough to know it doesn't have to map a single page,
89 * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
90 */
91 return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
92}
93
94static inline int
95xfs_buf_vmap_len(
96 struct xfs_buf *bp)
97{
98 return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
99}
100
106/* 101/*
107 * Page Region interfaces. 102 * Page Region interfaces.
108 * 103 *
@@ -149,7 +144,7 @@ page_region_mask(
149 return mask; 144 return mask;
150} 145}
151 146
152STATIC_INLINE void 147STATIC void
153set_page_region( 148set_page_region(
154 struct page *page, 149 struct page *page,
155 size_t offset, 150 size_t offset,
@@ -161,7 +156,7 @@ set_page_region(
161 SetPageUptodate(page); 156 SetPageUptodate(page);
162} 157}
163 158
164STATIC_INLINE int 159STATIC int
165test_page_region( 160test_page_region(
166 struct page *page, 161 struct page *page,
167 size_t offset, 162 size_t offset,
@@ -173,75 +168,6 @@ test_page_region(
173} 168}
174 169
175/* 170/*
176 * Mapping of multi-page buffers into contiguous virtual space
177 */
178
179typedef struct a_list {
180 void *vm_addr;
181 struct a_list *next;
182} a_list_t;
183
184static a_list_t *as_free_head;
185static int as_list_len;
186static DEFINE_SPINLOCK(as_lock);
187
188/*
189 * Try to batch vunmaps because they are costly.
190 */
191STATIC void
192free_address(
193 void *addr)
194{
195 a_list_t *aentry;
196
197#ifdef CONFIG_XEN
198 /*
199 * Xen needs to be able to make sure it can get an exclusive
200 * RO mapping of pages it wants to turn into a pagetable. If
201 * a newly allocated page is also still being vmap()ed by xfs,
202 * it will cause pagetable construction to fail. This is a
203 * quick workaround to always eagerly unmap pages so that Xen
204 * is happy.
205 */
206 vunmap(addr);
207 return;
208#endif
209
210 aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
211 if (likely(aentry)) {
212 spin_lock(&as_lock);
213 aentry->next = as_free_head;
214 aentry->vm_addr = addr;
215 as_free_head = aentry;
216 as_list_len++;
217 spin_unlock(&as_lock);
218 } else {
219 vunmap(addr);
220 }
221}
222
223STATIC void
224purge_addresses(void)
225{
226 a_list_t *aentry, *old;
227
228 if (as_free_head == NULL)
229 return;
230
231 spin_lock(&as_lock);
232 aentry = as_free_head;
233 as_free_head = NULL;
234 as_list_len = 0;
235 spin_unlock(&as_lock);
236
237 while ((old = aentry) != NULL) {
238 vunmap(aentry->vm_addr);
239 aentry = aentry->next;
240 kfree(old);
241 }
242}
243
244/*
245 * Internal xfs_buf_t object manipulation 171 * Internal xfs_buf_t object manipulation
246 */ 172 */
247 173
@@ -279,7 +205,8 @@ _xfs_buf_initialize(
279 init_waitqueue_head(&bp->b_waiters); 205 init_waitqueue_head(&bp->b_waiters);
280 206
281 XFS_STATS_INC(xb_create); 207 XFS_STATS_INC(xb_create);
282 XB_TRACE(bp, "initialize", target); 208
209 trace_xfs_buf_init(bp, _RET_IP_);
283} 210}
284 211
285/* 212/*
@@ -318,6 +245,7 @@ _xfs_buf_free_pages(
318{ 245{
319 if (bp->b_pages != bp->b_page_array) { 246 if (bp->b_pages != bp->b_page_array) {
320 kmem_free(bp->b_pages); 247 kmem_free(bp->b_pages);
248 bp->b_pages = NULL;
321 } 249 }
322} 250}
323 251
@@ -332,15 +260,16 @@ void
332xfs_buf_free( 260xfs_buf_free(
333 xfs_buf_t *bp) 261 xfs_buf_t *bp)
334{ 262{
335 XB_TRACE(bp, "free", 0); 263 trace_xfs_buf_free(bp, _RET_IP_);
336 264
337 ASSERT(list_empty(&bp->b_hash_list)); 265 ASSERT(list_empty(&bp->b_hash_list));
338 266
339 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 267 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
340 uint i; 268 uint i;
341 269
342 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) 270 if (xfs_buf_is_vmapped(bp))
343 free_address(bp->b_addr - bp->b_offset); 271 vm_unmap_ram(bp->b_addr - bp->b_offset,
272 bp->b_page_count);
344 273
345 for (i = 0; i < bp->b_page_count; i++) { 274 for (i = 0; i < bp->b_page_count; i++) {
346 struct page *page = bp->b_pages[i]; 275 struct page *page = bp->b_pages[i];
@@ -349,9 +278,8 @@ xfs_buf_free(
349 ASSERT(!PagePrivate(page)); 278 ASSERT(!PagePrivate(page));
350 page_cache_release(page); 279 page_cache_release(page);
351 } 280 }
352 _xfs_buf_free_pages(bp);
353 } 281 }
354 282 _xfs_buf_free_pages(bp);
355 xfs_buf_deallocate(bp); 283 xfs_buf_deallocate(bp);
356} 284}
357 285
@@ -445,7 +373,6 @@ _xfs_buf_lookup_pages(
445 if (page_count == bp->b_page_count) 373 if (page_count == bp->b_page_count)
446 bp->b_flags |= XBF_DONE; 374 bp->b_flags |= XBF_DONE;
447 375
448 XB_TRACE(bp, "lookup_pages", (long)page_count);
449 return error; 376 return error;
450} 377}
451 378
@@ -462,10 +389,8 @@ _xfs_buf_map_pages(
462 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 389 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
463 bp->b_flags |= XBF_MAPPED; 390 bp->b_flags |= XBF_MAPPED;
464 } else if (flags & XBF_MAPPED) { 391 } else if (flags & XBF_MAPPED) {
465 if (as_list_len > 64) 392 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
466 purge_addresses(); 393 -1, PAGE_KERNEL);
467 bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
468 VM_MAP, PAGE_KERNEL);
469 if (unlikely(bp->b_addr == NULL)) 394 if (unlikely(bp->b_addr == NULL))
470 return -ENOMEM; 395 return -ENOMEM;
471 bp->b_addr += bp->b_offset; 396 bp->b_addr += bp->b_offset;
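
This switch is why the private free_address()/purge_addresses() batching earlier in the file could be deleted: vm_map_ram() and vm_unmap_ram() keep their own per-CPU block pools and defer TLB flushes internally. A sketch of the pairing (signatures as in this kernel series; the unmap side must undo the offset and pass the original page count):

static int map_buf_sketch(struct xfs_buf *bp)
{
	bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
				-1 /* any NUMA node */, PAGE_KERNEL);
	if (!bp->b_addr)
		return -ENOMEM;
	bp->b_addr += bp->b_offset;
	return 0;
}

static void unmap_buf_sketch(struct xfs_buf *bp)
{
	vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count);
}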
@@ -548,7 +473,6 @@ found:
548 if (down_trylock(&bp->b_sema)) { 473 if (down_trylock(&bp->b_sema)) {
549 if (!(flags & XBF_TRYLOCK)) { 474 if (!(flags & XBF_TRYLOCK)) {
550 /* wait for buffer ownership */ 475 /* wait for buffer ownership */
551 XB_TRACE(bp, "get_lock", 0);
552 xfs_buf_lock(bp); 476 xfs_buf_lock(bp);
553 XFS_STATS_INC(xb_get_locked_waited); 477 XFS_STATS_INC(xb_get_locked_waited);
554 } else { 478 } else {
@@ -571,7 +495,8 @@ found:
571 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 495 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
572 bp->b_flags &= XBF_MAPPED; 496 bp->b_flags &= XBF_MAPPED;
573 } 497 }
574 XB_TRACE(bp, "got_lock", 0); 498
499 trace_xfs_buf_find(bp, flags, _RET_IP_);
575 XFS_STATS_INC(xb_get_locked); 500 XFS_STATS_INC(xb_get_locked);
576 return bp; 501 return bp;
577} 502}
@@ -582,7 +507,7 @@ found:
582 * although backing storage may not be. 507 * although backing storage may not be.
583 */ 508 */
584xfs_buf_t * 509xfs_buf_t *
585xfs_buf_get_flags( 510xfs_buf_get(
586 xfs_buftarg_t *target,/* target for buffer */ 511 xfs_buftarg_t *target,/* target for buffer */
587 xfs_off_t ioff, /* starting offset of range */ 512 xfs_off_t ioff, /* starting offset of range */
588 size_t isize, /* length of range */ 513 size_t isize, /* length of range */
@@ -627,7 +552,7 @@ xfs_buf_get_flags(
627 bp->b_bn = ioff; 552 bp->b_bn = ioff;
628 bp->b_count_desired = bp->b_buffer_length; 553 bp->b_count_desired = bp->b_buffer_length;
629 554
630 XB_TRACE(bp, "get", (unsigned long)flags); 555 trace_xfs_buf_get(bp, flags, _RET_IP_);
631 return bp; 556 return bp;
632 557
633 no_buffer: 558 no_buffer:
@@ -644,8 +569,6 @@ _xfs_buf_read(
644{ 569{
645 int status; 570 int status;
646 571
647 XB_TRACE(bp, "_xfs_buf_read", (unsigned long)flags);
648
649 ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE))); 572 ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
650 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); 573 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
651 574
@@ -661,7 +584,7 @@ _xfs_buf_read(
661} 584}
662 585
663xfs_buf_t * 586xfs_buf_t *
664xfs_buf_read_flags( 587xfs_buf_read(
665 xfs_buftarg_t *target, 588 xfs_buftarg_t *target,
666 xfs_off_t ioff, 589 xfs_off_t ioff,
667 size_t isize, 590 size_t isize,
@@ -671,21 +594,20 @@ xfs_buf_read_flags(
671 594
672 flags |= XBF_READ; 595 flags |= XBF_READ;
673 596
674 bp = xfs_buf_get_flags(target, ioff, isize, flags); 597 bp = xfs_buf_get(target, ioff, isize, flags);
675 if (bp) { 598 if (bp) {
599 trace_xfs_buf_read(bp, flags, _RET_IP_);
600
676 if (!XFS_BUF_ISDONE(bp)) { 601 if (!XFS_BUF_ISDONE(bp)) {
677 XB_TRACE(bp, "read", (unsigned long)flags);
678 XFS_STATS_INC(xb_get_read); 602 XFS_STATS_INC(xb_get_read);
679 _xfs_buf_read(bp, flags); 603 _xfs_buf_read(bp, flags);
680 } else if (flags & XBF_ASYNC) { 604 } else if (flags & XBF_ASYNC) {
681 XB_TRACE(bp, "read_async", (unsigned long)flags);
682 /* 605 /*
683 * Read ahead call which is already satisfied, 606 * Read ahead call which is already satisfied,
684 * drop the buffer 607 * drop the buffer
685 */ 608 */
686 goto no_buffer; 609 goto no_buffer;
687 } else { 610 } else {
688 XB_TRACE(bp, "read_done", (unsigned long)flags);
689 /* We do not want read in the flags */ 611 /* We do not want read in the flags */
690 bp->b_flags &= ~XBF_READ; 612 bp->b_flags &= ~XBF_READ;
691 } 613 }
@@ -718,7 +640,7 @@ xfs_buf_readahead(
718 return; 640 return;
719 641
720 flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); 642 flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
721 xfs_buf_read_flags(target, ioff, isize, flags); 643 xfs_buf_read(target, ioff, isize, flags);
722} 644}
723 645
724xfs_buf_t * 646xfs_buf_t *
@@ -823,7 +745,7 @@ xfs_buf_get_noaddr(
823 745
824 xfs_buf_unlock(bp); 746 xfs_buf_unlock(bp);
825 747
826 XB_TRACE(bp, "no_daddr", len); 748 trace_xfs_buf_get_noaddr(bp, _RET_IP_);
827 return bp; 749 return bp;
828 750
829 fail_free_mem: 751 fail_free_mem:
@@ -845,8 +767,8 @@ void
845xfs_buf_hold( 767xfs_buf_hold(
846 xfs_buf_t *bp) 768 xfs_buf_t *bp)
847{ 769{
770 trace_xfs_buf_hold(bp, _RET_IP_);
848 atomic_inc(&bp->b_hold); 771 atomic_inc(&bp->b_hold);
849 XB_TRACE(bp, "hold", 0);
850} 772}
851 773
852/* 774/*
@@ -859,7 +781,7 @@ xfs_buf_rele(
859{ 781{
860 xfs_bufhash_t *hash = bp->b_hash; 782 xfs_bufhash_t *hash = bp->b_hash;
861 783
862 XB_TRACE(bp, "rele", bp->b_relse); 784 trace_xfs_buf_rele(bp, _RET_IP_);
863 785
864 if (unlikely(!hash)) { 786 if (unlikely(!hash)) {
865 ASSERT(!bp->b_relse); 787 ASSERT(!bp->b_relse);
@@ -909,21 +831,19 @@ xfs_buf_cond_lock(
909 int locked; 831 int locked;
910 832
911 locked = down_trylock(&bp->b_sema) == 0; 833 locked = down_trylock(&bp->b_sema) == 0;
912 if (locked) { 834 if (locked)
913 XB_SET_OWNER(bp); 835 XB_SET_OWNER(bp);
914 } 836
915 XB_TRACE(bp, "cond_lock", (long)locked); 837 trace_xfs_buf_cond_lock(bp, _RET_IP_);
916 return locked ? 0 : -EBUSY; 838 return locked ? 0 : -EBUSY;
917} 839}
918 840
919#if defined(DEBUG) || defined(XFS_BLI_TRACE)
920int 841int
921xfs_buf_lock_value( 842xfs_buf_lock_value(
922 xfs_buf_t *bp) 843 xfs_buf_t *bp)
923{ 844{
924 return bp->b_sema.count; 845 return bp->b_sema.count;
925} 846}
926#endif
927 847
928/* 848/*
929 * Locks a buffer object. 849 * Locks a buffer object.
@@ -935,12 +855,14 @@ void
935xfs_buf_lock( 855xfs_buf_lock(
936 xfs_buf_t *bp) 856 xfs_buf_t *bp)
937{ 857{
938 XB_TRACE(bp, "lock", 0); 858 trace_xfs_buf_lock(bp, _RET_IP_);
859
939 if (atomic_read(&bp->b_io_remaining)) 860 if (atomic_read(&bp->b_io_remaining))
940 blk_run_address_space(bp->b_target->bt_mapping); 861 blk_run_address_space(bp->b_target->bt_mapping);
941 down(&bp->b_sema); 862 down(&bp->b_sema);
942 XB_SET_OWNER(bp); 863 XB_SET_OWNER(bp);
943 XB_TRACE(bp, "locked", 0); 864
865 trace_xfs_buf_lock_done(bp, _RET_IP_);
944} 866}
945 867
946/* 868/*
@@ -962,7 +884,8 @@ xfs_buf_unlock(
962 884
963 XB_CLEAR_OWNER(bp); 885 XB_CLEAR_OWNER(bp);
964 up(&bp->b_sema); 886 up(&bp->b_sema);
965 XB_TRACE(bp, "unlock", 0); 887
888 trace_xfs_buf_unlock(bp, _RET_IP_);
966} 889}
967 890
968 891
@@ -974,17 +897,18 @@ void
974xfs_buf_pin( 897xfs_buf_pin(
975 xfs_buf_t *bp) 898 xfs_buf_t *bp)
976{ 899{
900 trace_xfs_buf_pin(bp, _RET_IP_);
977 atomic_inc(&bp->b_pin_count); 901 atomic_inc(&bp->b_pin_count);
978 XB_TRACE(bp, "pin", (long)bp->b_pin_count.counter);
979} 902}
980 903
981void 904void
982xfs_buf_unpin( 905xfs_buf_unpin(
983 xfs_buf_t *bp) 906 xfs_buf_t *bp)
984{ 907{
908 trace_xfs_buf_unpin(bp, _RET_IP_);
909
985 if (atomic_dec_and_test(&bp->b_pin_count)) 910 if (atomic_dec_and_test(&bp->b_pin_count))
986 wake_up_all(&bp->b_waiters); 911 wake_up_all(&bp->b_waiters);
987 XB_TRACE(bp, "unpin", (long)bp->b_pin_count.counter);
988} 912}
989 913
990int 914int
@@ -1035,7 +959,7 @@ xfs_buf_iodone_work(
1035 */ 959 */
1036 if ((bp->b_error == EOPNOTSUPP) && 960 if ((bp->b_error == EOPNOTSUPP) &&
1037 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) { 961 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
1038 XB_TRACE(bp, "ordered_retry", bp->b_iodone); 962 trace_xfs_buf_ordered_retry(bp, _RET_IP_);
1039 bp->b_flags &= ~XBF_ORDERED; 963 bp->b_flags &= ~XBF_ORDERED;
1040 bp->b_flags |= _XFS_BARRIER_FAILED; 964 bp->b_flags |= _XFS_BARRIER_FAILED;
1041 xfs_buf_iorequest(bp); 965 xfs_buf_iorequest(bp);
@@ -1050,12 +974,12 @@ xfs_buf_ioend(
1050 xfs_buf_t *bp, 974 xfs_buf_t *bp,
1051 int schedule) 975 int schedule)
1052{ 976{
977 trace_xfs_buf_iodone(bp, _RET_IP_);
978
1053 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); 979 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1054 if (bp->b_error == 0) 980 if (bp->b_error == 0)
1055 bp->b_flags |= XBF_DONE; 981 bp->b_flags |= XBF_DONE;
1056 982
1057 XB_TRACE(bp, "iodone", bp->b_iodone);
1058
1059 if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { 983 if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
1060 if (schedule) { 984 if (schedule) {
1061 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); 985 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
@@ -1075,26 +999,34 @@ xfs_buf_ioerror(
1075{ 999{
1076 ASSERT(error >= 0 && error <= 0xffff); 1000 ASSERT(error >= 0 && error <= 0xffff);
1077 bp->b_error = (unsigned short)error; 1001 bp->b_error = (unsigned short)error;
1078 XB_TRACE(bp, "ioerror", (unsigned long)error); 1002 trace_xfs_buf_ioerror(bp, error, _RET_IP_);
1079} 1003}
1080 1004
1081int 1005int
1082xfs_bawrite( 1006xfs_bwrite(
1083 void *mp, 1007 struct xfs_mount *mp,
1084 struct xfs_buf *bp) 1008 struct xfs_buf *bp)
1085{ 1009{
1086 XB_TRACE(bp, "bawrite", 0); 1010 int iowait = (bp->b_flags & XBF_ASYNC) == 0;
1011 int error = 0;
1087 1012
1088 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); 1013 bp->b_strat = xfs_bdstrat_cb;
1014 bp->b_mount = mp;
1015 bp->b_flags |= XBF_WRITE;
1016 if (!iowait)
1017 bp->b_flags |= _XBF_RUN_QUEUES;
1089 1018
1090 xfs_buf_delwri_dequeue(bp); 1019 xfs_buf_delwri_dequeue(bp);
1020 xfs_buf_iostrategy(bp);
1091 1021
1092 bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD); 1022 if (iowait) {
1093 bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES); 1023 error = xfs_buf_iowait(bp);
1024 if (error)
1025 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1026 xfs_buf_relse(bp);
1027 }
1094 1028
1095 bp->b_mount = mp; 1029 return error;
1096 bp->b_strat = xfs_bdstrat_cb;
1097 return xfs_bdstrat_cb(bp);
1098} 1030}
1099 1031
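The reworked xfs_bwrite() above folds the old synchronous and asynchronous paths into one entry point keyed off XBF_ASYNC. A hypothetical caller, for illustration only (function name invented):

	/* Hypothetical caller.  With XBF_ASYNC clear, xfs_bwrite() waits
	 * for the I/O, forces a shutdown on error, and releases the
	 * buffer itself; with XBF_ASYNC set it queues the write and
	 * returns 0 immediately. */
	STATIC int
	example_sync_write(
		struct xfs_mount	*mp,
		struct xfs_buf		*bp)
	{
		bp->b_flags &= ~XBF_ASYNC;	/* take the iowait path */
		return xfs_bwrite(mp, bp);	/* bp is gone on return */
	}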
1100void 1032void
@@ -1102,7 +1034,7 @@ xfs_bdwrite(
1102 void *mp, 1034 void *mp,
1103 struct xfs_buf *bp) 1035 struct xfs_buf *bp)
1104{ 1036{
1105 XB_TRACE(bp, "bdwrite", 0); 1037 trace_xfs_buf_bdwrite(bp, _RET_IP_);
1106 1038
1107 bp->b_strat = xfs_bdstrat_cb; 1039 bp->b_strat = xfs_bdstrat_cb;
1108 bp->b_mount = mp; 1040 bp->b_mount = mp;
@@ -1113,7 +1045,127 @@ xfs_bdwrite(
1113 xfs_buf_delwri_queue(bp, 1); 1045 xfs_buf_delwri_queue(bp, 1);
1114} 1046}
1115 1047
1116STATIC_INLINE void 1048/*
1049 * Called when we want to stop a buffer from getting written or read.
1050 * We attach the EIO error, muck with its flags, and call biodone
1051 * so that the proper iodone callbacks get called.
1052 */
1053STATIC int
1054xfs_bioerror(
1055 xfs_buf_t *bp)
1056{
1057#ifdef XFSERRORDEBUG
1058 ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
1059#endif
1060
1061 /*
 1062 * No need to wait until the buffer is unpinned; we aren't flushing it.
1063 */
1064 XFS_BUF_ERROR(bp, EIO);
1065
1066 /*
1067 * We're calling biodone, so delete XBF_DONE flag.
1068 */
1069 XFS_BUF_UNREAD(bp);
1070 XFS_BUF_UNDELAYWRITE(bp);
1071 XFS_BUF_UNDONE(bp);
1072 XFS_BUF_STALE(bp);
1073
1074 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
1075 xfs_biodone(bp);
1076
1077 return EIO;
1078}
1079
1080/*
1081 * Same as xfs_bioerror, except that we are releasing the buffer
1082 * here ourselves, and avoiding the biodone call.
1083 * This is meant for userdata errors; metadata bufs come with
1084 * iodone functions attached, so that we can track down errors.
1085 */
1086STATIC int
1087xfs_bioerror_relse(
1088 struct xfs_buf *bp)
1089{
1090 int64_t fl = XFS_BUF_BFLAGS(bp);
1091 /*
1092 * No need to wait until the buffer is unpinned.
1093 * We aren't flushing it.
1094 *
1095 * chunkhold expects B_DONE to be set, whether
1096 * we actually finish the I/O or not. We don't want to
1097 * change that interface.
1098 */
1099 XFS_BUF_UNREAD(bp);
1100 XFS_BUF_UNDELAYWRITE(bp);
1101 XFS_BUF_DONE(bp);
1102 XFS_BUF_STALE(bp);
1103 XFS_BUF_CLR_IODONE_FUNC(bp);
1104 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
1105 if (!(fl & XBF_ASYNC)) {
1106 /*
1107 * Mark b_error and B_ERROR _both_.
 1108 * Lots of chunkcache code assumes that.
1109 * There's no reason to mark error for
1110 * ASYNC buffers.
1111 */
1112 XFS_BUF_ERROR(bp, EIO);
1113 XFS_BUF_FINISH_IOWAIT(bp);
1114 } else {
1115 xfs_buf_relse(bp);
1116 }
1117
1118 return EIO;
1119}
1120
1121
1122/*
1123 * All xfs metadata buffers except log state machine buffers
1124 * get this attached as their b_bdstrat callback function.
1125 * This is so that we can catch a buffer
 1126 * after prematurely unpinning it to forcibly shut down the filesystem.
1127 */
1128int
1129xfs_bdstrat_cb(
1130 struct xfs_buf *bp)
1131{
1132 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
1133 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1134 /*
1135 * Metadata write that didn't get logged but
1136 * written delayed anyway. These aren't associated
1137 * with a transaction, and can be ignored.
1138 */
1139 if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
1140 return xfs_bioerror_relse(bp);
1141 else
1142 return xfs_bioerror(bp);
1143 }
1144
1145 xfs_buf_iorequest(bp);
1146 return 0;
1147}
1148
1149/*
1150 * Wrapper around bdstrat so that we can stop data from going to disk in case
 1151 * we are shutting down the filesystem. Typically user data goes through this
1152 * path; one of the exceptions is the superblock.
1153 */
1154void
1155xfsbdstrat(
1156 struct xfs_mount *mp,
1157 struct xfs_buf *bp)
1158{
1159 if (XFS_FORCED_SHUTDOWN(mp)) {
1160 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1161 xfs_bioerror_relse(bp);
1162 return;
1163 }
1164
1165 xfs_buf_iorequest(bp);
1166}
1167
1168STATIC void
1117_xfs_buf_ioend( 1169_xfs_buf_ioend(
1118 xfs_buf_t *bp, 1170 xfs_buf_t *bp,
1119 int schedule) 1171 int schedule)
@@ -1135,6 +1187,9 @@ xfs_buf_bio_end_io(
1135 1187
1136 xfs_buf_ioerror(bp, -error); 1188 xfs_buf_ioerror(bp, -error);
1137 1189
1190 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1191 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1192
1138 do { 1193 do {
1139 struct page *page = bvec->bv_page; 1194 struct page *page = bvec->bv_page;
1140 1195
@@ -1177,10 +1232,14 @@ _xfs_buf_ioapply(
1177 if (bp->b_flags & XBF_ORDERED) { 1232 if (bp->b_flags & XBF_ORDERED) {
1178 ASSERT(!(bp->b_flags & XBF_READ)); 1233 ASSERT(!(bp->b_flags & XBF_READ));
1179 rw = WRITE_BARRIER; 1234 rw = WRITE_BARRIER;
1180 } else if (bp->b_flags & _XBF_RUN_QUEUES) { 1235 } else if (bp->b_flags & XBF_LOG_BUFFER) {
1181 ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); 1236 ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
1182 bp->b_flags &= ~_XBF_RUN_QUEUES; 1237 bp->b_flags &= ~_XBF_RUN_QUEUES;
1183 rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC; 1238 rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC;
1239 } else if (bp->b_flags & _XBF_RUN_QUEUES) {
1240 ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
1241 bp->b_flags &= ~_XBF_RUN_QUEUES;
1242 rw = (bp->b_flags & XBF_WRITE) ? WRITE_META : READ_META;
1184 } else { 1243 } else {
1185 rw = (bp->b_flags & XBF_WRITE) ? WRITE : 1244 rw = (bp->b_flags & XBF_WRITE) ? WRITE :
1186 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; 1245 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
@@ -1240,6 +1299,10 @@ next_chunk:
1240 1299
1241submit_io: 1300submit_io:
1242 if (likely(bio->bi_size)) { 1301 if (likely(bio->bi_size)) {
1302 if (xfs_buf_is_vmapped(bp)) {
1303 flush_kernel_vmap_range(bp->b_addr,
1304 xfs_buf_vmap_len(bp));
1305 }
1243 submit_bio(rw, bio); 1306 submit_bio(rw, bio);
1244 if (size) 1307 if (size)
1245 goto next_chunk; 1308 goto next_chunk;
@@ -1253,7 +1316,7 @@ int
1253xfs_buf_iorequest( 1316xfs_buf_iorequest(
1254 xfs_buf_t *bp) 1317 xfs_buf_t *bp)
1255{ 1318{
1256 XB_TRACE(bp, "iorequest", 0); 1319 trace_xfs_buf_iorequest(bp, _RET_IP_);
1257 1320
1258 if (bp->b_flags & XBF_DELWRI) { 1321 if (bp->b_flags & XBF_DELWRI) {
1259 xfs_buf_delwri_queue(bp, 1); 1322 xfs_buf_delwri_queue(bp, 1);
@@ -1287,11 +1350,13 @@ int
1287xfs_buf_iowait( 1350xfs_buf_iowait(
1288 xfs_buf_t *bp) 1351 xfs_buf_t *bp)
1289{ 1352{
1290 XB_TRACE(bp, "iowait", 0); 1353 trace_xfs_buf_iowait(bp, _RET_IP_);
1354
1291 if (atomic_read(&bp->b_io_remaining)) 1355 if (atomic_read(&bp->b_io_remaining))
1292 blk_run_address_space(bp->b_target->bt_mapping); 1356 blk_run_address_space(bp->b_target->bt_mapping);
1293 wait_for_completion(&bp->b_iowait); 1357 wait_for_completion(&bp->b_iowait);
1294 XB_TRACE(bp, "iowaited", (long)bp->b_error); 1358
1359 trace_xfs_buf_iowait_done(bp, _RET_IP_);
1295 return bp->b_error; 1360 return bp->b_error;
1296} 1361}
1297 1362
@@ -1318,7 +1383,7 @@ xfs_buf_iomove(
1318 xfs_buf_t *bp, /* buffer to process */ 1383 xfs_buf_t *bp, /* buffer to process */
1319 size_t boff, /* starting buffer offset */ 1384 size_t boff, /* starting buffer offset */
1320 size_t bsize, /* length to copy */ 1385 size_t bsize, /* length to copy */
1321 caddr_t data, /* data address */ 1386 void *data, /* data address */
1322 xfs_buf_rw_t mode) /* read/write/zero flag */ 1387 xfs_buf_rw_t mode) /* read/write/zero flag */
1323{ 1388{
1324 size_t bend, cpoff, csize; 1389 size_t bend, cpoff, csize;
@@ -1400,8 +1465,8 @@ xfs_alloc_bufhash(
1400 1465
1401 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ 1466 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */
1402 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; 1467 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
1403 btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) * 1468 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
1404 sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE); 1469 sizeof(xfs_bufhash_t));
1405 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1470 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1406 spin_lock_init(&btp->bt_hash[i].bh_lock); 1471 spin_lock_init(&btp->bt_hash[i].bh_lock);
1407 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); 1472 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
@@ -1412,7 +1477,7 @@ STATIC void
1412xfs_free_bufhash( 1477xfs_free_bufhash(
1413 xfs_buftarg_t *btp) 1478 xfs_buftarg_t *btp)
1414{ 1479{
1415 kmem_free(btp->bt_hash); 1480 kmem_free_large(btp->bt_hash);
1416 btp->bt_hash = NULL; 1481 btp->bt_hash = NULL;
1417} 1482}
1418 1483
@@ -1604,7 +1669,8 @@ xfs_buf_delwri_queue(
1604 struct list_head *dwq = &bp->b_target->bt_delwrite_queue; 1669 struct list_head *dwq = &bp->b_target->bt_delwrite_queue;
1605 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; 1670 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock;
1606 1671
1607 XB_TRACE(bp, "delwri_q", (long)unlock); 1672 trace_xfs_buf_delwri_queue(bp, _RET_IP_);
1673
1608 ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC)); 1674 ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));
1609 1675
1610 spin_lock(dwlk); 1676 spin_lock(dwlk);
@@ -1616,6 +1682,11 @@ xfs_buf_delwri_queue(
1616 list_del(&bp->b_list); 1682 list_del(&bp->b_list);
1617 } 1683 }
1618 1684
1685 if (list_empty(dwq)) {
1686 /* start xfsbufd as it is about to have something to do */
1687 wake_up_process(bp->b_target->bt_task);
1688 }
1689
1619 bp->b_flags |= _XBF_DELWRI_Q; 1690 bp->b_flags |= _XBF_DELWRI_Q;
1620 list_add_tail(&bp->b_list, dwq); 1691 list_add_tail(&bp->b_list, dwq);
1621 bp->b_queuetime = jiffies; 1692 bp->b_queuetime = jiffies;
@@ -1644,7 +1715,36 @@ xfs_buf_delwri_dequeue(
1644 if (dequeued) 1715 if (dequeued)
1645 xfs_buf_rele(bp); 1716 xfs_buf_rele(bp);
1646 1717
1647 XB_TRACE(bp, "delwri_dq", (long)dequeued); 1718 trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
1719}
1720
1721/*
1722 * If a delwri buffer needs to be pushed before it has aged out, then promote
1723 * it to the head of the delwri queue so that it will be flushed on the next
1724 * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
1725 * than the age currently needed to flush the buffer. Hence the next time the
 1726 * xfsbufd sees it, it is guaranteed to be considered old enough to flush.
1727 */
1728void
1729xfs_buf_delwri_promote(
1730 struct xfs_buf *bp)
1731{
1732 struct xfs_buftarg *btp = bp->b_target;
1733 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
1734
1735 ASSERT(bp->b_flags & XBF_DELWRI);
1736 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1737
1738 /*
1739 * Check the buffer age before locking the delayed write queue as we
1740 * don't need to promote buffers that are already past the flush age.
1741 */
1742 if (bp->b_queuetime < jiffies - age)
1743 return;
1744 bp->b_queuetime = jiffies - age;
1745 spin_lock(&btp->bt_delwrite_lock);
1746 list_move(&bp->b_list, &btp->bt_delwrite_queue);
1747 spin_unlock(&btp->bt_delwrite_lock);
1648} 1748}
1649 1749
1650STATIC void 1750STATIC void
@@ -1665,6 +1765,8 @@ xfsbufd_wakeup(
1665 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) { 1765 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1666 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) 1766 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1667 continue; 1767 continue;
1768 if (list_empty(&btp->bt_delwrite_queue))
1769 continue;
1668 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); 1770 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1669 wake_up_process(btp->bt_task); 1771 wake_up_process(btp->bt_task);
1670 } 1772 }
@@ -1692,7 +1794,7 @@ xfs_buf_delwri_split(
1692 INIT_LIST_HEAD(list); 1794 INIT_LIST_HEAD(list);
1693 spin_lock(dwlk); 1795 spin_lock(dwlk);
1694 list_for_each_entry_safe(bp, n, dwq, b_list) { 1796 list_for_each_entry_safe(bp, n, dwq, b_list) {
1695 XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp)); 1797 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1696 ASSERT(bp->b_flags & XBF_DELWRI); 1798 ASSERT(bp->b_flags & XBF_DELWRI);
1697 1799
1698 if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) { 1800 if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) {
@@ -1715,20 +1817,53 @@ xfs_buf_delwri_split(
1715 1817
1716} 1818}
1717 1819
1820/*
 1821 * The compare function needs a 64-bit intermediate because block
 1822 * numbers are 64 bits wide while the return value is only 32 bits;
 1823 * truncating the difference could misorder distant blocks.
1824 */
1825static int
1826xfs_buf_cmp(
1827 void *priv,
1828 struct list_head *a,
1829 struct list_head *b)
1830{
1831 struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
1832 struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
1833 xfs_daddr_t diff;
1834
1835 diff = ap->b_bn - bp->b_bn;
1836 if (diff < 0)
1837 return -1;
1838 if (diff > 0)
1839 return 1;
1840 return 0;
1841}
1842
1843void
1844xfs_buf_delwri_sort(
1845 xfs_buftarg_t *target,
1846 struct list_head *list)
1847{
1848 list_sort(NULL, list, xfs_buf_cmp);
1849}
1850
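For contrast, a naive comparator that skipped the 64-bit intermediate would be subtly broken. A hypothetical counter-example:

	/*
	 * WRONG (hypothetical): casting the 64-bit difference straight
	 * to the 32-bit return value truncates it, so two buffers whose
	 * block numbers differ by a multiple of 2^32 compare equal, and
	 * differences beyond 2^31 can flip sign and invert the order.
	 */
	static int
	xfs_buf_cmp_broken(
		void			*priv,
		struct list_head	*a,
		struct list_head	*b)
	{
		struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
		struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);

		return (int)(ap->b_bn - bp->b_bn);	/* truncates! */
	}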
1718STATIC int 1851STATIC int
1719xfsbufd( 1852xfsbufd(
1720 void *data) 1853 void *data)
1721{ 1854{
1722 struct list_head tmp; 1855 xfs_buftarg_t *target = (xfs_buftarg_t *)data;
1723 xfs_buftarg_t *target = (xfs_buftarg_t *)data;
1724 int count;
1725 xfs_buf_t *bp;
1726 1856
1727 current->flags |= PF_MEMALLOC; 1857 current->flags |= PF_MEMALLOC;
1728 1858
1729 set_freezable(); 1859 set_freezable();
1730 1860
1731 do { 1861 do {
1862 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
1863 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
1864 int count = 0;
1865 struct list_head tmp;
1866
1732 if (unlikely(freezing(current))) { 1867 if (unlikely(freezing(current))) {
1733 set_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1868 set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1734 refrigerator(); 1869 refrigerator();
@@ -1736,24 +1871,20 @@ xfsbufd(
1736 clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1871 clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1737 } 1872 }
1738 1873
1739 schedule_timeout_interruptible( 1874 /* sleep for a long time if there is nothing to do. */
1740 xfs_buf_timer_centisecs * msecs_to_jiffies(10)); 1875 if (list_empty(&target->bt_delwrite_queue))
1876 tout = MAX_SCHEDULE_TIMEOUT;
1877 schedule_timeout_interruptible(tout);
1741 1878
1742 xfs_buf_delwri_split(target, &tmp, 1879 xfs_buf_delwri_split(target, &tmp, age);
1743 xfs_buf_age_centisecs * msecs_to_jiffies(10)); 1880 list_sort(NULL, &tmp, xfs_buf_cmp);
1744
1745 count = 0;
1746 while (!list_empty(&tmp)) { 1881 while (!list_empty(&tmp)) {
1747 bp = list_entry(tmp.next, xfs_buf_t, b_list); 1882 struct xfs_buf *bp;
1748 ASSERT(target == bp->b_target); 1883 bp = list_first_entry(&tmp, struct xfs_buf, b_list);
1749
1750 list_del_init(&bp->b_list); 1884 list_del_init(&bp->b_list);
1751 xfs_buf_iostrategy(bp); 1885 xfs_buf_iostrategy(bp);
1752 count++; 1886 count++;
1753 } 1887 }
1754
1755 if (as_list_len > 0)
1756 purge_addresses();
1757 if (count) 1888 if (count)
1758 blk_run_address_space(target->bt_mapping); 1889 blk_run_address_space(target->bt_mapping);
1759 1890
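Taken together with the wake_up_process() call added to xfs_buf_delwri_queue() above, the daemon can now sleep indefinitely when idle. The handshake, distilled from the two hunks:

	/* Producer (xfs_buf_delwri_queue): wake the daemon only on the
	 * empty -> non-empty transition, before adding the buffer. */
	if (list_empty(dwq))
		wake_up_process(bp->b_target->bt_task);
	bp->b_flags |= _XBF_DELWRI_Q;
	list_add_tail(&bp->b_list, dwq);

	/* Consumer (xfsbufd): sleep forever when nothing is queued,
	 * otherwise poll at the usual timer interval. */
	long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);

	if (list_empty(&target->bt_delwrite_queue))
		tout = MAX_SCHEDULE_TIMEOUT;
	schedule_timeout_interruptible(tout);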
@@ -1772,42 +1903,45 @@ xfs_flush_buftarg(
1772 xfs_buftarg_t *target, 1903 xfs_buftarg_t *target,
1773 int wait) 1904 int wait)
1774{ 1905{
1775 struct list_head tmp; 1906 xfs_buf_t *bp;
1776 xfs_buf_t *bp, *n;
1777 int pincount = 0; 1907 int pincount = 0;
1908 LIST_HEAD(tmp_list);
1909 LIST_HEAD(wait_list);
1778 1910
1779 xfs_buf_runall_queues(xfsconvertd_workqueue); 1911 xfs_buf_runall_queues(xfsconvertd_workqueue);
1780 xfs_buf_runall_queues(xfsdatad_workqueue); 1912 xfs_buf_runall_queues(xfsdatad_workqueue);
1781 xfs_buf_runall_queues(xfslogd_workqueue); 1913 xfs_buf_runall_queues(xfslogd_workqueue);
1782 1914
1783 set_bit(XBT_FORCE_FLUSH, &target->bt_flags); 1915 set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1784 pincount = xfs_buf_delwri_split(target, &tmp, 0); 1916 pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
1785 1917
1786 /* 1918 /*
1787 * Dropped the delayed write list lock, now walk the temporary list 1919 * Dropped the delayed write list lock, now walk the temporary list.
1920 * All I/O is issued async and then if we need to wait for completion
 1921 * we do that after issuing all the I/O.
1788 */ 1922 */
1789 list_for_each_entry_safe(bp, n, &tmp, b_list) { 1923 list_sort(NULL, &tmp_list, xfs_buf_cmp);
1924 while (!list_empty(&tmp_list)) {
1925 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
1790 ASSERT(target == bp->b_target); 1926 ASSERT(target == bp->b_target);
1791 if (wait) 1927 list_del_init(&bp->b_list);
1928 if (wait) {
1792 bp->b_flags &= ~XBF_ASYNC; 1929 bp->b_flags &= ~XBF_ASYNC;
1793 else 1930 list_add(&bp->b_list, &wait_list);
1794 list_del_init(&bp->b_list); 1931 }
1795
1796 xfs_buf_iostrategy(bp); 1932 xfs_buf_iostrategy(bp);
1797 } 1933 }
1798 1934
1799 if (wait) 1935 if (wait) {
1936 /* Expedite and wait for IO to complete. */
1800 blk_run_address_space(target->bt_mapping); 1937 blk_run_address_space(target->bt_mapping);
1938 while (!list_empty(&wait_list)) {
1939 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1801 1940
1802 /* 1941 list_del_init(&bp->b_list);
1803 * Remaining list items must be flushed before returning 1942 xfs_iowait(bp);
1804 */ 1943 xfs_buf_relse(bp);
1805 while (!list_empty(&tmp)) { 1944 }
1806 bp = list_entry(tmp.next, xfs_buf_t, b_list);
1807
1808 list_del_init(&bp->b_list);
1809 xfs_iowait(bp);
1810 xfs_buf_relse(bp);
1811 } 1945 }
1812 1946
1813 return pincount; 1947 return pincount;
@@ -1816,14 +1950,10 @@ xfs_flush_buftarg(
1816int __init 1950int __init
1817xfs_buf_init(void) 1951xfs_buf_init(void)
1818{ 1952{
1819#ifdef XFS_BUF_TRACE
1820 xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_NOFS);
1821#endif
1822
1823 xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", 1953 xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
1824 KM_ZONE_HWALIGN, NULL); 1954 KM_ZONE_HWALIGN, NULL);
1825 if (!xfs_buf_zone) 1955 if (!xfs_buf_zone)
1826 goto out_free_trace_buf; 1956 goto out;
1827 1957
1828 xfslogd_workqueue = create_workqueue("xfslogd"); 1958 xfslogd_workqueue = create_workqueue("xfslogd");
1829 if (!xfslogd_workqueue) 1959 if (!xfslogd_workqueue)
@@ -1846,10 +1976,7 @@ xfs_buf_init(void)
1846 destroy_workqueue(xfslogd_workqueue); 1976 destroy_workqueue(xfslogd_workqueue);
1847 out_free_buf_zone: 1977 out_free_buf_zone:
1848 kmem_zone_destroy(xfs_buf_zone); 1978 kmem_zone_destroy(xfs_buf_zone);
1849 out_free_trace_buf: 1979 out:
1850#ifdef XFS_BUF_TRACE
1851 ktrace_free(xfs_buf_trace_buf);
1852#endif
1853 return -ENOMEM; 1980 return -ENOMEM;
1854} 1981}
1855 1982
@@ -1861,9 +1988,6 @@ xfs_buf_terminate(void)
1861 destroy_workqueue(xfsdatad_workqueue); 1988 destroy_workqueue(xfsdatad_workqueue);
1862 destroy_workqueue(xfslogd_workqueue); 1989 destroy_workqueue(xfslogd_workqueue);
1863 kmem_zone_destroy(xfs_buf_zone); 1990 kmem_zone_destroy(xfs_buf_zone);
1864#ifdef XFS_BUF_TRACE
1865 ktrace_free(xfs_buf_trace_buf);
1866#endif
1867} 1991}
1868 1992
1869#ifdef CONFIG_KDB_MODULES 1993#ifdef CONFIG_KDB_MODULES
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 9b4d666ad31f..386e7361e50e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -55,6 +55,7 @@ typedef enum {
55 XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ 55 XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */
56 XBF_ORDERED = (1 << 11), /* use ordered writes */ 56 XBF_ORDERED = (1 << 11), /* use ordered writes */
57 XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */ 57 XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */
58 XBF_LOG_BUFFER = (1 << 13), /* this is a buffer used for the log */
58 59
59 /* flags used only as arguments to access routines */ 60 /* flags used only as arguments to access routines */
60 XBF_LOCK = (1 << 14), /* lock requested */ 61 XBF_LOCK = (1 << 14), /* lock requested */
@@ -95,6 +96,28 @@ typedef enum {
95 _XFS_BARRIER_FAILED = (1 << 23), 96 _XFS_BARRIER_FAILED = (1 << 23),
96} xfs_buf_flags_t; 97} xfs_buf_flags_t;
97 98
99#define XFS_BUF_FLAGS \
100 { XBF_READ, "READ" }, \
101 { XBF_WRITE, "WRITE" }, \
102 { XBF_MAPPED, "MAPPED" }, \
103 { XBF_ASYNC, "ASYNC" }, \
104 { XBF_DONE, "DONE" }, \
105 { XBF_DELWRI, "DELWRI" }, \
106 { XBF_STALE, "STALE" }, \
107 { XBF_FS_MANAGED, "FS_MANAGED" }, \
108 { XBF_ORDERED, "ORDERED" }, \
109 { XBF_READ_AHEAD, "READ_AHEAD" }, \
110 { XBF_LOCK, "LOCK" }, /* should never be set */\
111 { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\
112 { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\
113 { _XBF_PAGE_CACHE, "PAGE_CACHE" }, \
114 { _XBF_PAGES, "PAGES" }, \
115 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \
116 { _XBF_DELWRI_Q, "DELWRI_Q" }, \
117 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }, \
118 { _XFS_BARRIER_FAILED, "BARRIER_FAILED" }
119
120
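This flag table exists to feed ftrace's __print_flags() in the new buffer tracepoints. A representative sketch of an event class that would consume it; the real definitions live in xfs_trace.h (not part of this hunk) and carry more fields, so treat the field list here as an assumption:

	DECLARE_EVENT_CLASS(xfs_buf_class,
		TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip),
		TP_ARGS(bp, caller_ip),
		TP_STRUCT__entry(
			__field(dev_t, dev)
			__field(xfs_daddr_t, bno)
			__field(unsigned, flags)
			__field(unsigned long, caller_ip)
		),
		TP_fast_assign(
			__entry->dev = bp->b_target->bt_dev;
			__entry->bno = bp->b_bn;
			__entry->flags = bp->b_flags;
			__entry->caller_ip = caller_ip;
		),
		TP_printk("dev %d:%d bno 0x%llx flags %s caller %pf",
			  MAJOR(__entry->dev), MINOR(__entry->dev),
			  (unsigned long long)__entry->bno,
			  __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
			  (void *)__entry->caller_ip)
	);
	DEFINE_EVENT(xfs_buf_class, xfs_buf_lock,
		TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip),
		TP_ARGS(bp, caller_ip));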
98typedef enum { 121typedef enum {
99 XBT_FORCE_SLEEP = 0, 122 XBT_FORCE_SLEEP = 0,
100 XBT_FORCE_FLUSH = 1, 123 XBT_FORCE_FLUSH = 1,
@@ -186,15 +209,10 @@ extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t,
186#define xfs_incore(buftarg,blkno,len,lockit) \ 209#define xfs_incore(buftarg,blkno,len,lockit) \
187 _xfs_buf_find(buftarg, blkno ,len, lockit, NULL) 210 _xfs_buf_find(buftarg, blkno ,len, lockit, NULL)
188 211
189extern xfs_buf_t *xfs_buf_get_flags(xfs_buftarg_t *, xfs_off_t, size_t, 212extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t,
190 xfs_buf_flags_t); 213 xfs_buf_flags_t);
191#define xfs_buf_get(target, blkno, len, flags) \ 214extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
192 xfs_buf_get_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED)
193
194extern xfs_buf_t *xfs_buf_read_flags(xfs_buftarg_t *, xfs_off_t, size_t,
195 xfs_buf_flags_t); 215 xfs_buf_flags_t);
196#define xfs_buf_read(target, blkno, len, flags) \
197 xfs_buf_read_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED)
198 216
199extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); 217extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
200extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *); 218extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *);
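Note the semantic shift here: the removed xfs_buf_read() macro ignored its flags argument and always passed XBF_LOCK | XBF_MAPPED, so converted callers must now spell those flags out themselves. A hypothetical call site (function name and parameters invented):

	STATIC int
	example_read_buf(
		struct xfs_mount	*mp,
		xfs_off_t		ioff,
		size_t			isize)
	{
		xfs_buf_t	*bp;

		/* the defaults the old macro injected are now explicit */
		bp = xfs_buf_read(mp->m_ddev_targp, ioff, isize,
				  XBF_LOCK | XBF_MAPPED);
		if (!bp)
			return XFS_ERROR(ENOMEM);
		/* ... bp->b_addr is valid because XBF_MAPPED was set ... */
		xfs_buf_relse(bp);
		return 0;
	}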
@@ -214,13 +232,17 @@ extern void xfs_buf_lock(xfs_buf_t *);
214extern void xfs_buf_unlock(xfs_buf_t *); 232extern void xfs_buf_unlock(xfs_buf_t *);
215 233
216/* Buffer Read and Write Routines */ 234/* Buffer Read and Write Routines */
217extern int xfs_bawrite(void *mp, xfs_buf_t *bp); 235extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
218extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); 236extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
237
238extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
239extern int xfs_bdstrat_cb(struct xfs_buf *);
240
219extern void xfs_buf_ioend(xfs_buf_t *, int); 241extern void xfs_buf_ioend(xfs_buf_t *, int);
220extern void xfs_buf_ioerror(xfs_buf_t *, int); 242extern void xfs_buf_ioerror(xfs_buf_t *, int);
221extern int xfs_buf_iorequest(xfs_buf_t *); 243extern int xfs_buf_iorequest(xfs_buf_t *);
222extern int xfs_buf_iowait(xfs_buf_t *); 244extern int xfs_buf_iowait(xfs_buf_t *);
223extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t, 245extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
224 xfs_buf_rw_t); 246 xfs_buf_rw_t);
225 247
226static inline int xfs_buf_iostrategy(xfs_buf_t *bp) 248static inline int xfs_buf_iostrategy(xfs_buf_t *bp)
@@ -243,49 +265,29 @@ extern int xfs_buf_ispin(xfs_buf_t *);
243 265
244/* Delayed Write Buffer Routines */ 266/* Delayed Write Buffer Routines */
245extern void xfs_buf_delwri_dequeue(xfs_buf_t *); 267extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
268extern void xfs_buf_delwri_promote(xfs_buf_t *);
246 269
247/* Buffer Daemon Setup Routines */ 270/* Buffer Daemon Setup Routines */
248extern int xfs_buf_init(void); 271extern int xfs_buf_init(void);
249extern void xfs_buf_terminate(void); 272extern void xfs_buf_terminate(void);
250 273
251#ifdef XFS_BUF_TRACE
252extern ktrace_t *xfs_buf_trace_buf;
253extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
254#else
255#define xfs_buf_trace(bp,id,ptr,ra) do { } while (0)
256#endif
257
258#define xfs_buf_target_name(target) \ 274#define xfs_buf_target_name(target) \
259 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) 275 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; })
260 276
261 277
262#define XFS_B_ASYNC XBF_ASYNC
263#define XFS_B_DELWRI XBF_DELWRI
264#define XFS_B_READ XBF_READ
265#define XFS_B_WRITE XBF_WRITE
266#define XFS_B_STALE XBF_STALE
267
268#define XFS_BUF_TRYLOCK XBF_TRYLOCK
269#define XFS_INCORE_TRYLOCK XBF_TRYLOCK
270#define XFS_BUF_LOCK XBF_LOCK
271#define XFS_BUF_MAPPED XBF_MAPPED
272
273#define BUF_BUSY XBF_DONT_BLOCK
274
275#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) 278#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags)
276#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ 279#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
277 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) 280 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
278 281
279#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE) 282#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE)
280#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE) 283#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
281#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XFS_B_STALE) 284#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
282#define XFS_BUF_SUPER_STALE(bp) do { \ 285#define XFS_BUF_SUPER_STALE(bp) do { \
283 XFS_BUF_STALE(bp); \ 286 XFS_BUF_STALE(bp); \
284 xfs_buf_delwri_dequeue(bp); \ 287 xfs_buf_delwri_dequeue(bp); \
285 XFS_BUF_DONE(bp); \ 288 XFS_BUF_DONE(bp); \
286 } while (0) 289 } while (0)
287 290
288#define XFS_BUF_MANAGE XBF_FS_MANAGED
289#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED) 291#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED)
290 292
291#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) 293#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
@@ -370,39 +372,15 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
370 372
371#define xfs_bpin(bp) xfs_buf_pin(bp) 373#define xfs_bpin(bp) xfs_buf_pin(bp)
372#define xfs_bunpin(bp) xfs_buf_unpin(bp) 374#define xfs_bunpin(bp) xfs_buf_unpin(bp)
373
374#define xfs_buftrace(id, bp) \
375 xfs_buf_trace(bp, id, NULL, (void *)__builtin_return_address(0))
376
377#define xfs_biodone(bp) xfs_buf_ioend(bp, 0) 375#define xfs_biodone(bp) xfs_buf_ioend(bp, 0)
378 376
379#define xfs_biomove(bp, off, len, data, rw) \ 377#define xfs_biomove(bp, off, len, data, rw) \
380 xfs_buf_iomove((bp), (off), (len), (data), \ 378 xfs_buf_iomove((bp), (off), (len), (data), \
381 ((rw) == XFS_B_WRITE) ? XBRW_WRITE : XBRW_READ) 379 ((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ)
382 380
383#define xfs_biozero(bp, off, len) \ 381#define xfs_biozero(bp, off, len) \
384 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) 382 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
385 383
386
387static inline int XFS_bwrite(xfs_buf_t *bp)
388{
389 int iowait = (bp->b_flags & XBF_ASYNC) == 0;
390 int error = 0;
391
392 if (!iowait)
393 bp->b_flags |= _XBF_RUN_QUEUES;
394
395 xfs_buf_delwri_dequeue(bp);
396 xfs_buf_iostrategy(bp);
397 if (iowait) {
398 error = xfs_buf_iowait(bp);
399 xfs_buf_relse(bp);
400 }
401 return error;
402}
403
404#define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
405
406#define xfs_iowait(bp) xfs_buf_iowait(bp) 384#define xfs_iowait(bp) xfs_buf_iowait(bp)
407 385
408#define xfs_baread(target, rablkno, ralen) \ 386#define xfs_baread(target, rablkno, ralen) \
@@ -417,6 +395,7 @@ extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
417extern void xfs_wait_buftarg(xfs_buftarg_t *); 395extern void xfs_wait_buftarg(xfs_buftarg_t *);
418extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
419extern int xfs_flush_buftarg(xfs_buftarg_t *, int); 397extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
398
420#ifdef CONFIG_KDB_MODULES 399#ifdef CONFIG_KDB_MODULES
421extern struct list_head *xfs_get_buftarg_list(void); 400extern struct list_head *xfs_get_buftarg_list(void);
422#endif 401#endif
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 87b8cbd23d4b..846b75aeb2ab 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -29,6 +29,7 @@
29#include "xfs_vnodeops.h" 29#include "xfs_vnodeops.h"
30#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
31#include "xfs_inode.h" 31#include "xfs_inode.h"
32#include "xfs_inode_item.h"
32 33
33/* 34/*
34 * Note that we only accept fileids which are long enough rather than allow 35 * Note that we only accept fileids which are long enough rather than allow
@@ -215,9 +216,28 @@ xfs_fs_get_parent(
215 return d_obtain_alias(VFS_I(cip)); 216 return d_obtain_alias(VFS_I(cip));
216} 217}
217 218
219STATIC int
220xfs_fs_nfs_commit_metadata(
221 struct inode *inode)
222{
223 struct xfs_inode *ip = XFS_I(inode);
224 struct xfs_mount *mp = ip->i_mount;
225 int error = 0;
226
227 xfs_ilock(ip, XFS_ILOCK_SHARED);
228 if (xfs_ipincount(ip)) {
229 error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn,
230 XFS_LOG_SYNC, NULL);
231 }
232 xfs_iunlock(ip, XFS_ILOCK_SHARED);
233
234 return error;
235}
236
218const struct export_operations xfs_export_operations = { 237const struct export_operations xfs_export_operations = {
219 .encode_fh = xfs_fs_encode_fh, 238 .encode_fh = xfs_fs_encode_fh,
220 .fh_to_dentry = xfs_fs_fh_to_dentry, 239 .fh_to_dentry = xfs_fs_fh_to_dentry,
221 .fh_to_parent = xfs_fs_fh_to_parent, 240 .fh_to_parent = xfs_fs_fh_to_parent,
222 .get_parent = xfs_fs_get_parent, 241 .get_parent = xfs_fs_get_parent,
242 .commit_metadata = xfs_fs_nfs_commit_metadata,
223}; 243};
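The new commit_metadata hook lets an NFS server flush just the log state covering an inode instead of writing the whole inode back. A simplified sketch of the expected caller side (modelled on what nfsd would do; this is not the exact nfsd code):

	static int
	commit_inode_metadata(
		struct inode		*inode)
	{
		const struct export_operations *ops = inode->i_sb->s_export_op;

		if (ops && ops->commit_metadata)
			return ops->commit_metadata(inode);
		/* filesystems without the hook get a full sync writeback */
		return write_inode_now(inode, 1);
	}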
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index eff61e2732af..42dd3bcfba6b 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -16,6 +16,7 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h"
19#include "xfs_bit.h" 20#include "xfs_bit.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
21#include "xfs_inum.h" 22#include "xfs_inum.h"
@@ -34,52 +35,279 @@
34#include "xfs_dir2_sf.h" 35#include "xfs_dir2_sf.h"
35#include "xfs_dinode.h" 36#include "xfs_dinode.h"
36#include "xfs_inode.h" 37#include "xfs_inode.h"
38#include "xfs_inode_item.h"
39#include "xfs_bmap.h"
37#include "xfs_error.h" 40#include "xfs_error.h"
38#include "xfs_rw.h" 41#include "xfs_rw.h"
39#include "xfs_vnodeops.h" 42#include "xfs_vnodeops.h"
40#include "xfs_da_btree.h" 43#include "xfs_da_btree.h"
41#include "xfs_ioctl.h" 44#include "xfs_ioctl.h"
45#include "xfs_trace.h"
42 46
43#include <linux/dcache.h> 47#include <linux/dcache.h>
44 48
45static const struct vm_operations_struct xfs_file_vm_ops; 49static const struct vm_operations_struct xfs_file_vm_ops;
46 50
47STATIC ssize_t 51/*
48xfs_file_aio_read( 52 * xfs_iozero
49 struct kiocb *iocb, 53 *
 50 const struct iovec *iov, 54 * xfs_iozero clears the specified range of the file supplied,
51 unsigned long nr_segs, 55 * and marks all the affected blocks as valid and modified. If
52 loff_t pos) 56 * an affected block is not allocated, it will be allocated. If
57 * an affected block is not completely overwritten, and is not
58 * valid before the operation, it will be read from disk before
59 * being partially zeroed.
60 */
61STATIC int
62xfs_iozero(
63 struct xfs_inode *ip, /* inode */
64 loff_t pos, /* offset in file */
65 size_t count) /* size of data to zero */
53{ 66{
54 struct file *file = iocb->ki_filp; 67 struct page *page;
55 int ioflags = IO_ISAIO; 68 struct address_space *mapping;
69 int status;
56 70
57 BUG_ON(iocb->ki_pos != pos); 71 mapping = VFS_I(ip)->i_mapping;
58 if (unlikely(file->f_flags & O_DIRECT)) 72 do {
59 ioflags |= IO_ISDIRECT; 73 unsigned offset, bytes;
60 if (file->f_mode & FMODE_NOCMTIME) 74 void *fsdata;
61 ioflags |= IO_INVIS; 75
62 return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov, 76 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
63 nr_segs, &iocb->ki_pos, ioflags); 77 bytes = PAGE_CACHE_SIZE - offset;
78 if (bytes > count)
79 bytes = count;
80
81 status = pagecache_write_begin(NULL, mapping, pos, bytes,
82 AOP_FLAG_UNINTERRUPTIBLE,
83 &page, &fsdata);
84 if (status)
85 break;
86
87 zero_user(page, offset, bytes);
88
89 status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
90 page, fsdata);
91 WARN_ON(status <= 0); /* can't return less than zero! */
92 pos += bytes;
93 count -= bytes;
94 status = 0;
95 } while (count);
96
97 return (-status);
98}
99
100STATIC int
101xfs_file_fsync(
102 struct file *file,
103 struct dentry *dentry,
104 int datasync)
105{
106 struct xfs_inode *ip = XFS_I(dentry->d_inode);
107 struct xfs_trans *tp;
108 int error = 0;
109 int log_flushed = 0;
110
111 xfs_itrace_entry(ip);
112
113 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
114 return -XFS_ERROR(EIO);
115
116 xfs_iflags_clear(ip, XFS_ITRUNCATED);
117
118 /*
119 * We always need to make sure that the required inode state is safe on
120 * disk. The inode might be clean but we still might need to force the
121 * log because of committed transactions that haven't hit the disk yet.
122 * Likewise, there could be unflushed non-transactional changes to the
123 * inode core that have to go to disk and this requires us to issue
124 * a synchronous transaction to capture these changes correctly.
125 *
126 * This code relies on the assumption that if the i_update_core field
127 * of the inode is clear and the inode is unpinned then it is clean
128 * and no action is required.
129 */
130 xfs_ilock(ip, XFS_ILOCK_SHARED);
131
132 /*
133 * First check if the VFS inode is marked dirty. All the dirtying
 134 * of non-transactional updates now goes through mark_inode_dirty*,
 135 * which allows us to distinguish between pure timestamp updates
 136 * and i_size updates which need to be caught for fdatasync.
 137 * After that also check for the dirty state in the XFS inode, which
 138 * might get cleared when the inode gets written out via the AIL
139 * or xfs_iflush_cluster.
140 */
141 if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) ||
142 ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
143 ip->i_update_core) {
144 /*
145 * Kick off a transaction to log the inode core to get the
146 * updates. The sync transaction will also force the log.
147 */
148 xfs_iunlock(ip, XFS_ILOCK_SHARED);
149 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
150 error = xfs_trans_reserve(tp, 0,
151 XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
152 if (error) {
153 xfs_trans_cancel(tp, 0);
154 return -error;
155 }
156 xfs_ilock(ip, XFS_ILOCK_EXCL);
157
158 /*
159 * Note - it's possible that we might have pushed ourselves out
160 * of the way during trans_reserve which would flush the inode.
161 * But there's no guarantee that the inode buffer has actually
162 * gone out yet (it's delwri). Plus the buffer could be pinned
163 * anyway if it's part of an inode in another recent
164 * transaction. So we play it safe and fire off the
165 * transaction anyway.
166 */
167 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
168 xfs_trans_ihold(tp, ip);
169 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
170 xfs_trans_set_sync(tp);
171 error = _xfs_trans_commit(tp, 0, &log_flushed);
172
173 xfs_iunlock(ip, XFS_ILOCK_EXCL);
174 } else {
175 /*
176 * Timestamps/size haven't changed since last inode flush or
177 * inode transaction commit. That means either nothing got
178 * written or a transaction committed which caught the updates.
179 * If the latter happened and the transaction hasn't hit the
 180 * disk yet, the inode will still be pinned. If it is,
181 * force the log.
182 */
183 if (xfs_ipincount(ip)) {
184 error = _xfs_log_force_lsn(ip->i_mount,
185 ip->i_itemp->ili_last_lsn,
186 XFS_LOG_SYNC, &log_flushed);
187 }
188 xfs_iunlock(ip, XFS_ILOCK_SHARED);
189 }
190
191 if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
192 /*
193 * If the log write didn't issue an ordered tag we need
194 * to flush the disk cache for the data device now.
195 */
196 if (!log_flushed)
197 xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
198
199 /*
200 * If this inode is on the RT dev we need to flush that
201 * cache as well.
202 */
203 if (XFS_IS_REALTIME_INODE(ip))
204 xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
205 }
206
207 return -error;
64} 208}
65 209
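The dirty-state test in xfs_file_fsync() above packs three conditions into one if(); restated as a hypothetical helper purely for readability:

	/* Do we need a transaction to log the inode core?  Hypothetical
	 * restatement of the test in xfs_file_fsync() above. */
	static inline int
	xfs_fsync_needs_inode_log(
		struct inode		*inode,
		struct xfs_inode	*ip,
		int			datasync)
	{
		if (!ip->i_update_core)
			return 0;	/* XFS inode core already captured */
		if (inode->i_state & I_DIRTY_DATASYNC)
			return 1;	/* i_size-type updates always matter */
		if ((inode->i_state & I_DIRTY_SYNC) && !datasync)
			return 1;	/* timestamp-only: full fsync only */
		return 0;
	}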
66STATIC ssize_t 210STATIC ssize_t
67xfs_file_aio_write( 211xfs_file_aio_read(
68 struct kiocb *iocb, 212 struct kiocb *iocb,
69 const struct iovec *iov, 213 const struct iovec *iovp,
70 unsigned long nr_segs, 214 unsigned long nr_segs,
71 loff_t pos) 215 loff_t pos)
72{ 216{
73 struct file *file = iocb->ki_filp; 217 struct file *file = iocb->ki_filp;
74 int ioflags = IO_ISAIO; 218 struct inode *inode = file->f_mapping->host;
219 struct xfs_inode *ip = XFS_I(inode);
220 struct xfs_mount *mp = ip->i_mount;
221 size_t size = 0;
222 ssize_t ret = 0;
223 int ioflags = 0;
224 xfs_fsize_t n;
225 unsigned long seg;
226
227 XFS_STATS_INC(xs_read_calls);
75 228
76 BUG_ON(iocb->ki_pos != pos); 229 BUG_ON(iocb->ki_pos != pos);
230
77 if (unlikely(file->f_flags & O_DIRECT)) 231 if (unlikely(file->f_flags & O_DIRECT))
78 ioflags |= IO_ISDIRECT; 232 ioflags |= IO_ISDIRECT;
79 if (file->f_mode & FMODE_NOCMTIME) 233 if (file->f_mode & FMODE_NOCMTIME)
80 ioflags |= IO_INVIS; 234 ioflags |= IO_INVIS;
81 return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs, 235
82 &iocb->ki_pos, ioflags); 236 /* START copy & waste from filemap.c */
237 for (seg = 0; seg < nr_segs; seg++) {
238 const struct iovec *iv = &iovp[seg];
239
240 /*
241 * If any segment has a negative length, or the cumulative
 242 * length ever wraps negative, then return -EINVAL.
243 */
244 size += iv->iov_len;
245 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
246 return XFS_ERROR(-EINVAL);
247 }
248 /* END copy & waste from filemap.c */
249
250 if (unlikely(ioflags & IO_ISDIRECT)) {
251 xfs_buftarg_t *target =
252 XFS_IS_REALTIME_INODE(ip) ?
253 mp->m_rtdev_targp : mp->m_ddev_targp;
254 if ((iocb->ki_pos & target->bt_smask) ||
255 (size & target->bt_smask)) {
256 if (iocb->ki_pos == ip->i_size)
257 return 0;
258 return -XFS_ERROR(EINVAL);
259 }
260 }
261
262 n = XFS_MAXIOFFSET(mp) - iocb->ki_pos;
263 if (n <= 0 || size == 0)
264 return 0;
265
266 if (n < size)
267 size = n;
268
269 if (XFS_FORCED_SHUTDOWN(mp))
270 return -EIO;
271
272 if (unlikely(ioflags & IO_ISDIRECT))
273 mutex_lock(&inode->i_mutex);
274 xfs_ilock(ip, XFS_IOLOCK_SHARED);
275
276 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
277 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
278 int iolock = XFS_IOLOCK_SHARED;
279
280 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size,
281 dmflags, &iolock);
282 if (ret) {
283 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
284 if (unlikely(ioflags & IO_ISDIRECT))
285 mutex_unlock(&inode->i_mutex);
286 return ret;
287 }
288 }
289
290 if (unlikely(ioflags & IO_ISDIRECT)) {
291 if (inode->i_mapping->nrpages) {
292 ret = -xfs_flushinval_pages(ip,
293 (iocb->ki_pos & PAGE_CACHE_MASK),
294 -1, FI_REMAPF_LOCKED);
295 }
296 mutex_unlock(&inode->i_mutex);
297 if (ret) {
298 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
299 return ret;
300 }
301 }
302
303 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
304
305 ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
306 if (ret > 0)
307 XFS_STATS_ADD(xs_read_bytes, ret);
308
309 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
310 return ret;
83} 311}
84 312
85STATIC ssize_t 313STATIC ssize_t
@@ -87,16 +315,44 @@ xfs_file_splice_read(
87 struct file *infilp, 315 struct file *infilp,
88 loff_t *ppos, 316 loff_t *ppos,
89 struct pipe_inode_info *pipe, 317 struct pipe_inode_info *pipe,
90 size_t len, 318 size_t count,
91 unsigned int flags) 319 unsigned int flags)
92{ 320{
321 struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
322 struct xfs_mount *mp = ip->i_mount;
93 int ioflags = 0; 323 int ioflags = 0;
324 ssize_t ret;
325
326 XFS_STATS_INC(xs_read_calls);
94 327
95 if (infilp->f_mode & FMODE_NOCMTIME) 328 if (infilp->f_mode & FMODE_NOCMTIME)
96 ioflags |= IO_INVIS; 329 ioflags |= IO_INVIS;
97 330
98 return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), 331 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
99 infilp, ppos, pipe, len, flags, ioflags); 332 return -EIO;
333
334 xfs_ilock(ip, XFS_IOLOCK_SHARED);
335
336 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
337 int iolock = XFS_IOLOCK_SHARED;
338 int error;
339
340 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
341 FILP_DELAY_FLAG(infilp), &iolock);
342 if (error) {
343 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
344 return -error;
345 }
346 }
347
348 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
349
350 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
351 if (ret > 0)
352 XFS_STATS_ADD(xs_read_bytes, ret);
353
354 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
355 return ret;
100} 356}
101 357
102STATIC ssize_t 358STATIC ssize_t
@@ -104,16 +360,538 @@ xfs_file_splice_write(
104 struct pipe_inode_info *pipe, 360 struct pipe_inode_info *pipe,
105 struct file *outfilp, 361 struct file *outfilp,
106 loff_t *ppos, 362 loff_t *ppos,
107 size_t len, 363 size_t count,
108 unsigned int flags) 364 unsigned int flags)
109{ 365{
366 struct inode *inode = outfilp->f_mapping->host;
367 struct xfs_inode *ip = XFS_I(inode);
368 struct xfs_mount *mp = ip->i_mount;
369 xfs_fsize_t isize, new_size;
110 int ioflags = 0; 370 int ioflags = 0;
371 ssize_t ret;
372
373 XFS_STATS_INC(xs_write_calls);
111 374
112 if (outfilp->f_mode & FMODE_NOCMTIME) 375 if (outfilp->f_mode & FMODE_NOCMTIME)
113 ioflags |= IO_INVIS; 376 ioflags |= IO_INVIS;
114 377
115 return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), 378 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
116 pipe, outfilp, ppos, len, flags, ioflags); 379 return -EIO;
380
381 xfs_ilock(ip, XFS_IOLOCK_EXCL);
382
383 if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
384 int iolock = XFS_IOLOCK_EXCL;
385 int error;
386
387 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
388 FILP_DELAY_FLAG(outfilp), &iolock);
389 if (error) {
390 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
391 return -error;
392 }
393 }
394
395 new_size = *ppos + count;
396
397 xfs_ilock(ip, XFS_ILOCK_EXCL);
398 if (new_size > ip->i_size)
399 ip->i_new_size = new_size;
400 xfs_iunlock(ip, XFS_ILOCK_EXCL);
401
402 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
403
404 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
405 if (ret > 0)
406 XFS_STATS_ADD(xs_write_bytes, ret);
407
408 isize = i_size_read(inode);
409 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
410 *ppos = isize;
411
412 if (*ppos > ip->i_size) {
413 xfs_ilock(ip, XFS_ILOCK_EXCL);
414 if (*ppos > ip->i_size)
415 ip->i_size = *ppos;
416 xfs_iunlock(ip, XFS_ILOCK_EXCL);
417 }
418
419 if (ip->i_new_size) {
420 xfs_ilock(ip, XFS_ILOCK_EXCL);
421 ip->i_new_size = 0;
422 if (ip->i_d.di_size > ip->i_size)
423 ip->i_d.di_size = ip->i_size;
424 xfs_iunlock(ip, XFS_ILOCK_EXCL);
425 }
426 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
427 return ret;
428}
429
430/*
431 * This routine is called to handle zeroing any space in the last
432 * block of the file that is beyond the EOF. We do this since the
433 * size is being increased without writing anything to that block
434 * and we don't want anyone to read the garbage on the disk.
435 */
436STATIC int /* error (positive) */
437xfs_zero_last_block(
438 xfs_inode_t *ip,
439 xfs_fsize_t offset,
440 xfs_fsize_t isize)
441{
442 xfs_fileoff_t last_fsb;
443 xfs_mount_t *mp = ip->i_mount;
444 int nimaps;
445 int zero_offset;
446 int zero_len;
447 int error = 0;
448 xfs_bmbt_irec_t imap;
449
450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
451
452 zero_offset = XFS_B_FSB_OFFSET(mp, isize);
453 if (zero_offset == 0) {
454 /*
455 * There are no extra bytes in the last block on disk to
456 * zero, so return.
457 */
458 return 0;
459 }
460
461 last_fsb = XFS_B_TO_FSBT(mp, isize);
462 nimaps = 1;
463 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
464 &nimaps, NULL, NULL);
465 if (error) {
466 return error;
467 }
468 ASSERT(nimaps > 0);
469 /*
470 * If the block underlying isize is just a hole, then there
471 * is nothing to zero.
472 */
473 if (imap.br_startblock == HOLESTARTBLOCK) {
474 return 0;
475 }
476 /*
477 * Zero the part of the last block beyond the EOF, and write it
478 * out sync. We need to drop the ilock while we do this so we
479 * don't deadlock when the buffer cache calls back to us.
480 */
481 xfs_iunlock(ip, XFS_ILOCK_EXCL);
482
483 zero_len = mp->m_sb.sb_blocksize - zero_offset;
484 if (isize + zero_len > offset)
485 zero_len = offset - isize;
486 error = xfs_iozero(ip, isize, zero_len);
487
488 xfs_ilock(ip, XFS_ILOCK_EXCL);
489 ASSERT(error >= 0);
490 return error;
491}
492
493/*
494 * Zero any on disk space between the current EOF and the new,
495 * larger EOF. This handles the normal case of zeroing the remainder
496 * of the last block in the file and the unusual case of zeroing blocks
497 * out beyond the size of the file. This second case only happens
498 * with fixed size extents and when the system crashes before the inode
 499 * size was updated but after blocks were allocated. Holes and
 500 * unwritten extents in the range are left alone; only written
 501 * blocks beyond the old size are zeroed.
502 */
503
504int /* error (positive) */
505xfs_zero_eof(
506 xfs_inode_t *ip,
507 xfs_off_t offset, /* starting I/O offset */
508 xfs_fsize_t isize) /* current inode size */
509{
510 xfs_mount_t *mp = ip->i_mount;
511 xfs_fileoff_t start_zero_fsb;
512 xfs_fileoff_t end_zero_fsb;
513 xfs_fileoff_t zero_count_fsb;
514 xfs_fileoff_t last_fsb;
515 xfs_fileoff_t zero_off;
516 xfs_fsize_t zero_len;
517 int nimaps;
518 int error = 0;
519 xfs_bmbt_irec_t imap;
520
521 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
522 ASSERT(offset > isize);
523
524 /*
525 * First handle zeroing the block on which isize resides.
526 * We only zero a part of that block so it is handled specially.
527 */
528 error = xfs_zero_last_block(ip, offset, isize);
529 if (error) {
530 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
531 return error;
532 }
533
534 /*
535 * Calculate the range between the new size and the old
536 * where blocks needing to be zeroed may exist. To get the
537 * block where the last byte in the file currently resides,
538 * we need to subtract one from the size and truncate back
539 * to a block boundary. We subtract 1 in case the size is
540 * exactly on a block boundary.
541 */
542 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
543 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
544 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
545 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
546 if (last_fsb == end_zero_fsb) {
547 /*
548 * The size was only incremented on its last block.
549 * We took care of that above, so just return.
550 */
551 return 0;
552 }
553
554 ASSERT(start_zero_fsb <= end_zero_fsb);
555 while (start_zero_fsb <= end_zero_fsb) {
556 nimaps = 1;
557 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
558 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
559 0, NULL, 0, &imap, &nimaps, NULL, NULL);
560 if (error) {
561 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
562 return error;
563 }
564 ASSERT(nimaps > 0);
565
566 if (imap.br_state == XFS_EXT_UNWRITTEN ||
567 imap.br_startblock == HOLESTARTBLOCK) {
568 /*
569 * This loop handles initializing pages that were
570 * partially initialized by the code below this
571 * loop. It basically zeroes the part of the page
572 * that sits on a hole and sets the page as P_HOLE
573 * and calls remapf if it is a mapped file.
574 */
575 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
576 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
577 continue;
578 }
579
580 /*
581 * There are blocks we need to zero.
582 * Drop the inode lock while we're doing the I/O.
583 * We'll still have the iolock to protect us.
584 */
585 xfs_iunlock(ip, XFS_ILOCK_EXCL);
586
587 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
588 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
589
590 if ((zero_off + zero_len) > offset)
591 zero_len = offset - zero_off;
592
593 error = xfs_iozero(ip, zero_off, zero_len);
594 if (error) {
595 goto out_lock;
596 }
597
598 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
599 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
600
601 xfs_ilock(ip, XFS_ILOCK_EXCL);
602 }
603
604 return 0;
605
606out_lock:
607 xfs_ilock(ip, XFS_ILOCK_EXCL);
608 ASSERT(error >= 0);
609 return error;
610}
611
612STATIC ssize_t
613xfs_file_aio_write(
614 struct kiocb *iocb,
615 const struct iovec *iovp,
616 unsigned long nr_segs,
617 loff_t pos)
618{
619 struct file *file = iocb->ki_filp;
620 struct address_space *mapping = file->f_mapping;
621 struct inode *inode = mapping->host;
622 struct xfs_inode *ip = XFS_I(inode);
623 struct xfs_mount *mp = ip->i_mount;
624 ssize_t ret = 0, error = 0;
625 int ioflags = 0;
626 xfs_fsize_t isize, new_size;
627 int iolock;
628 int eventsent = 0;
629 size_t ocount = 0, count;
630 int need_i_mutex;
631
632 XFS_STATS_INC(xs_write_calls);
633
634 BUG_ON(iocb->ki_pos != pos);
635
636 if (unlikely(file->f_flags & O_DIRECT))
637 ioflags |= IO_ISDIRECT;
638 if (file->f_mode & FMODE_NOCMTIME)
639 ioflags |= IO_INVIS;
640
641 error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
642 if (error)
643 return error;
644
645 count = ocount;
646 if (count == 0)
647 return 0;
648
649 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
650
651 if (XFS_FORCED_SHUTDOWN(mp))
652 return -EIO;
653
654relock:
655 if (ioflags & IO_ISDIRECT) {
656 iolock = XFS_IOLOCK_SHARED;
657 need_i_mutex = 0;
658 } else {
659 iolock = XFS_IOLOCK_EXCL;
660 need_i_mutex = 1;
661 mutex_lock(&inode->i_mutex);
662 }
663
664 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
665
666start:
667 error = -generic_write_checks(file, &pos, &count,
668 S_ISBLK(inode->i_mode));
669 if (error) {
670 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
671 goto out_unlock_mutex;
672 }
673
674 if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) &&
675 !(ioflags & IO_INVIS) && !eventsent)) {
676 int dmflags = FILP_DELAY_FLAG(file);
677
678 if (need_i_mutex)
679 dmflags |= DM_FLAGS_IMUX;
680
681 xfs_iunlock(ip, XFS_ILOCK_EXCL);
682 error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip,
683 pos, count, dmflags, &iolock);
684 if (error) {
685 goto out_unlock_internal;
686 }
687 xfs_ilock(ip, XFS_ILOCK_EXCL);
688 eventsent = 1;
689
690 /*
691 * The iolock was dropped and reacquired in XFS_SEND_DATA
692 * so we have to recheck the size when appending.
693 * We will only "goto start;" once, since having sent the
694 * event prevents another call to XFS_SEND_DATA, which is
695 * what allows the size to change in the first place.
696 */
697 if ((file->f_flags & O_APPEND) && pos != ip->i_size)
698 goto start;
699 }
700
701 if (ioflags & IO_ISDIRECT) {
702 xfs_buftarg_t *target =
703 XFS_IS_REALTIME_INODE(ip) ?
704 mp->m_rtdev_targp : mp->m_ddev_targp;
705
706 if ((pos & target->bt_smask) || (count & target->bt_smask)) {
707 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
708 return XFS_ERROR(-EINVAL);
709 }
710
711 if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) {
712 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
713 iolock = XFS_IOLOCK_EXCL;
714 need_i_mutex = 1;
715 mutex_lock(&inode->i_mutex);
716 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
717 goto start;
718 }
719 }
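
Aside: the bt_smask test above works because the device sector size is a power of two, so pos & mask and count & mask are nonzero exactly when pos or count is not sector aligned. A tiny illustration, assuming 512-byte sectors (XFS takes the mask from the buftarg rather than hardcoding it):

/* Illustration only: sector-alignment test with a power-of-two mask. */
static int dio_misaligned(unsigned long long pos, unsigned long long count)
{
	const unsigned long long smask = 512 - 1;	/* assumed sector size */

	return (pos & smask) != 0 || (count & smask) != 0;
}

/* dio_misaligned(1024, 4096) == 0 (aligned, direct I/O allowed),
 * dio_misaligned(1000, 4096) == 1 (rejected with EINVAL above). */
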
720
721 new_size = pos + count;
722 if (new_size > ip->i_size)
723 ip->i_new_size = new_size;
724
725 if (likely(!(ioflags & IO_INVIS)))
726 file_update_time(file);
727
 728	/*
 729	 * If the write starts beyond the current size of the file, any
 730	 * already-allocated blocks between the old EOF and the write
 731	 * offset must be zeroed so stale on-disk data is never exposed.
 732	 *
 733	 * In particular, the partial block holding the old EOF must be
 734	 * zeroed out beyond the old size.
 735	 */
736
737 if (pos > ip->i_size) {
738 error = xfs_zero_eof(ip, pos, ip->i_size);
739 if (error) {
740 xfs_iunlock(ip, XFS_ILOCK_EXCL);
741 goto out_unlock_internal;
742 }
743 }
744 xfs_iunlock(ip, XFS_ILOCK_EXCL);
745
746 /*
747 * If we're writing the file then make sure to clear the
748 * setuid and setgid bits if the process is not being run
749 * by root. This keeps people from modifying setuid and
750 * setgid binaries.
751 */
752 error = -file_remove_suid(file);
753 if (unlikely(error))
754 goto out_unlock_internal;
755
756 /* We can write back this queue in page reclaim */
757 current->backing_dev_info = mapping->backing_dev_info;
758
759 if ((ioflags & IO_ISDIRECT)) {
760 if (mapping->nrpages) {
761 WARN_ON(need_i_mutex == 0);
762 error = xfs_flushinval_pages(ip,
763 (pos & PAGE_CACHE_MASK),
764 -1, FI_REMAPF_LOCKED);
765 if (error)
766 goto out_unlock_internal;
767 }
768
769 if (need_i_mutex) {
770 /* demote the lock now the cached pages are gone */
771 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
772 mutex_unlock(&inode->i_mutex);
773
774 iolock = XFS_IOLOCK_SHARED;
775 need_i_mutex = 0;
776 }
777
778 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
779 ret = generic_file_direct_write(iocb, iovp,
780 &nr_segs, pos, &iocb->ki_pos, count, ocount);
781
782 /*
783 * direct-io write to a hole: fall through to buffered I/O
784 * for completing the rest of the request.
785 */
786 if (ret >= 0 && ret != count) {
787 XFS_STATS_ADD(xs_write_bytes, ret);
788
789 pos += ret;
790 count -= ret;
791
792 ioflags &= ~IO_ISDIRECT;
793 xfs_iunlock(ip, iolock);
794 goto relock;
795 }
796 } else {
797 int enospc = 0;
798 ssize_t ret2 = 0;
799
800write_retry:
801 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
802 ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
803 pos, &iocb->ki_pos, count, ret);
804 /*
 805		 * if we just got an ENOSPC, flush the inode now that we
 806		 * aren't holding any page locks, and retry *once*
807 */
808 if (ret2 == -ENOSPC && !enospc) {
809 error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
810 if (error)
811 goto out_unlock_internal;
812 enospc = 1;
813 goto write_retry;
814 }
815 ret = ret2;
816 }
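
The buffered branch just above retries exactly once on ENOSPC: flushing dirty pages can convert delayed allocations and release reserved space, so a second attempt may succeed where the first failed. The shape of the pattern in isolation (do_write and flush_dirty_pages are hypothetical stand-ins for the generic write and xfs_flush_pages calls):

#include <errno.h>
#include <sys/types.h>

struct wctx;				/* opaque write context (stand-in) */
ssize_t do_write(struct wctx *c);	/* returns -ENOSPC when out of space */
int flush_dirty_pages(struct wctx *c);	/* may release delalloc reservations */

ssize_t write_retry_once(struct wctx *c)
{
	int retried = 0;
	ssize_t ret;

retry:
	ret = do_write(c);
	if (ret == -ENOSPC && !retried) {
		if (flush_dirty_pages(c))
			return -EIO;	/* sketch: real code propagates the flush error */
		retried = 1;
		goto retry;		/* retry exactly once */
	}
	return ret;
}
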
817
818 current->backing_dev_info = NULL;
819
820 isize = i_size_read(inode);
821 if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize))
822 iocb->ki_pos = isize;
823
824 if (iocb->ki_pos > ip->i_size) {
825 xfs_ilock(ip, XFS_ILOCK_EXCL);
826 if (iocb->ki_pos > ip->i_size)
827 ip->i_size = iocb->ki_pos;
828 xfs_iunlock(ip, XFS_ILOCK_EXCL);
829 }
830
831 if (ret == -ENOSPC &&
832 DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
833 xfs_iunlock(ip, iolock);
834 if (need_i_mutex)
835 mutex_unlock(&inode->i_mutex);
836 error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip,
837 DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL,
838 0, 0, 0); /* Delay flag intentionally unused */
839 if (need_i_mutex)
840 mutex_lock(&inode->i_mutex);
841 xfs_ilock(ip, iolock);
842 if (error)
843 goto out_unlock_internal;
844 goto start;
845 }
846
847 error = -ret;
848 if (ret <= 0)
849 goto out_unlock_internal;
850
851 XFS_STATS_ADD(xs_write_bytes, ret);
852
853 /* Handle various SYNC-type writes */
854 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
855 loff_t end = pos + ret - 1;
856 int error2;
857
858 xfs_iunlock(ip, iolock);
859 if (need_i_mutex)
860 mutex_unlock(&inode->i_mutex);
861
862 error2 = filemap_write_and_wait_range(mapping, pos, end);
863 if (!error)
864 error = error2;
865 if (need_i_mutex)
866 mutex_lock(&inode->i_mutex);
867 xfs_ilock(ip, iolock);
868
869 error2 = -xfs_file_fsync(file, file->f_path.dentry,
870 (file->f_flags & __O_SYNC) ? 0 : 1);
871 if (!error)
872 error = error2;
873 }
874
875 out_unlock_internal:
876 if (ip->i_new_size) {
877 xfs_ilock(ip, XFS_ILOCK_EXCL);
878 ip->i_new_size = 0;
879 /*
880 * If this was a direct or synchronous I/O that failed (such
881 * as ENOSPC) then part of the I/O may have been written to
 882	 * disk before the error occurred. In this case the on-disk
883 * file size may have been adjusted beyond the in-memory file
884 * size and now needs to be truncated back.
885 */
886 if (ip->i_d.di_size > ip->i_size)
887 ip->i_d.di_size = ip->i_size;
888 xfs_iunlock(ip, XFS_ILOCK_EXCL);
889 }
890 xfs_iunlock(ip, iolock);
891 out_unlock_mutex:
892 if (need_i_mutex)
893 mutex_unlock(&inode->i_mutex);
894 return -error;
117} 895}
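
A convention worth calling out in the function above: core XFS routines return positive errno values while the VFS expects negative ones, which is why the code negates at each boundary (error = -generic_write_checks(...) on the way in, return -error on the way out). In miniature:

#include <errno.h>

/* Core-style helper: positive errno on failure (XFS-internal convention). */
static int core_op(int fail)
{
	return fail ? EIO : 0;
}

/* VFS-facing wrapper: negate at the boundary so callers see -EIO. */
static int vfs_op(int fail)
{
	int error = core_op(fail);

	return -error;
}
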
118 896
119STATIC int 897STATIC int
@@ -160,28 +938,6 @@ xfs_file_release(
160 return -xfs_release(XFS_I(inode)); 938 return -xfs_release(XFS_I(inode));
161} 939}
162 940
163/*
164 * We ignore the datasync flag here because a datasync is effectively
165 * identical to an fsync. That is, datasync implies that we need to write
166 * only the metadata needed to be able to access the data that is written
167 * if we crash after the call completes. Hence if we are writing beyond
168 * EOF we have to log the inode size change as well, which makes it a
169 * full fsync. If we don't write beyond EOF, the inode core will be
170 * clean in memory and so we don't need to log the inode, just like
171 * fsync.
172 */
173STATIC int
174xfs_file_fsync(
175 struct file *file,
176 struct dentry *dentry,
177 int datasync)
178{
179 struct xfs_inode *ip = XFS_I(dentry->d_inode);
180
181 xfs_iflags_clear(ip, XFS_ITRUNCATED);
182 return -xfs_fsync(ip);
183}
184
185STATIC int 941STATIC int
186xfs_file_readdir( 942xfs_file_readdir(
187 struct file *filp, 943 struct file *filp,
@@ -203,9 +959,9 @@ xfs_file_readdir(
203 * 959 *
204 * Try to give it an estimate that's good enough, maybe at some 960 * Try to give it an estimate that's good enough, maybe at some
205 * point we can change the ->readdir prototype to include the 961 * point we can change the ->readdir prototype to include the
206 * buffer size. 962 * buffer size. For now we use the current glibc buffer size.
207 */ 963 */
208 bufsize = (size_t)min_t(loff_t, PAGE_SIZE, ip->i_d.di_size); 964 bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
209 965
210 error = xfs_readdir(ip, dirent, bufsize, 966 error = xfs_readdir(ip, dirent, bufsize,
211 (xfs_off_t *)&filp->f_pos, filldir); 967 (xfs_off_t *)&filp->f_pos, filldir);
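
The 32768 above is an assumption about the caller rather than a protocol constant: it matches the getdents buffer glibc used at the time, and the min_t clamp also keeps small directories from over-allocating. Schematically:

/* Schematic of the estimate: format at most min(assumed glibc buffer,
 * directory size) bytes per ->readdir call. */
static unsigned long readdir_bufsize(long long di_size)
{
	const long long glibc_buf = 32768;

	return (unsigned long)(di_size < glibc_buf ? di_size : glibc_buf);
}
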
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 08be36d7326c..b6918d76bc7b 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -19,6 +19,7 @@
19#include "xfs_vnodeops.h" 19#include "xfs_vnodeops.h"
20#include "xfs_bmap_btree.h" 20#include "xfs_bmap_btree.h"
21#include "xfs_inode.h" 21#include "xfs_inode.h"
22#include "xfs_trace.h"
22 23
23int fs_noerr(void) { return 0; } 24int fs_noerr(void) { return 0; }
24int fs_nosys(void) { return ENOSYS; } 25int fs_nosys(void) { return ENOSYS; }
@@ -51,6 +52,8 @@ xfs_flushinval_pages(
51 struct address_space *mapping = VFS_I(ip)->i_mapping; 52 struct address_space *mapping = VFS_I(ip)->i_mapping;
52 int ret = 0; 53 int ret = 0;
53 54
55 trace_xfs_pagecache_inval(ip, first, last);
56
54 if (mapping->nrpages) { 57 if (mapping->nrpages) {
55 xfs_iflags_clear(ip, XFS_ITRUNCATED); 58 xfs_iflags_clear(ip, XFS_ITRUNCATED);
56 ret = filemap_write_and_wait(mapping); 59 ret = filemap_write_and_wait(mapping);
@@ -76,7 +79,7 @@ xfs_flush_pages(
76 xfs_iflags_clear(ip, XFS_ITRUNCATED); 79 xfs_iflags_clear(ip, XFS_ITRUNCATED);
77 ret = -filemap_fdatawrite(mapping); 80 ret = -filemap_fdatawrite(mapping);
78 } 81 }
79 if (flags & XFS_B_ASYNC) 82 if (flags & XBF_ASYNC)
80 return ret; 83 return ret;
81 ret2 = xfs_wait_on_pages(ip, first, last); 84 ret2 = xfs_wait_on_pages(ip, first, last);
82 if (!ret) 85 if (!ret)
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 5bb523d7f37e..7b26cc2fd284 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -51,12 +51,14 @@
51#include "xfs_quota.h" 51#include "xfs_quota.h"
52#include "xfs_inode_item.h" 52#include "xfs_inode_item.h"
53#include "xfs_export.h" 53#include "xfs_export.h"
54#include "xfs_trace.h"
54 55
55#include <linux/capability.h> 56#include <linux/capability.h>
56#include <linux/dcache.h> 57#include <linux/dcache.h>
57#include <linux/mount.h> 58#include <linux/mount.h>
58#include <linux/namei.h> 59#include <linux/namei.h>
59#include <linux/pagemap.h> 60#include <linux/pagemap.h>
61#include <linux/slab.h>
60#include <linux/exportfs.h> 62#include <linux/exportfs.h>
61 63
62/* 64/*
@@ -446,12 +448,12 @@ xfs_attrlist_by_handle(
446int 448int
447xfs_attrmulti_attr_get( 449xfs_attrmulti_attr_get(
448 struct inode *inode, 450 struct inode *inode,
449 char *name, 451 unsigned char *name,
450 char __user *ubuf, 452 unsigned char __user *ubuf,
451 __uint32_t *len, 453 __uint32_t *len,
452 __uint32_t flags) 454 __uint32_t flags)
453{ 455{
454 char *kbuf; 456 unsigned char *kbuf;
455 int error = EFAULT; 457 int error = EFAULT;
456 458
457 if (*len > XATTR_SIZE_MAX) 459 if (*len > XATTR_SIZE_MAX)
@@ -475,12 +477,12 @@ xfs_attrmulti_attr_get(
475int 477int
476xfs_attrmulti_attr_set( 478xfs_attrmulti_attr_set(
477 struct inode *inode, 479 struct inode *inode,
478 char *name, 480 unsigned char *name,
479 const char __user *ubuf, 481 const unsigned char __user *ubuf,
480 __uint32_t len, 482 __uint32_t len,
481 __uint32_t flags) 483 __uint32_t flags)
482{ 484{
483 char *kbuf; 485 unsigned char *kbuf;
484 int error = EFAULT; 486 int error = EFAULT;
485 487
486 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 488 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -500,7 +502,7 @@ xfs_attrmulti_attr_set(
500int 502int
501xfs_attrmulti_attr_remove( 503xfs_attrmulti_attr_remove(
502 struct inode *inode, 504 struct inode *inode,
503 char *name, 505 unsigned char *name,
504 __uint32_t flags) 506 __uint32_t flags)
505{ 507{
506 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 508 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -518,7 +520,7 @@ xfs_attrmulti_by_handle(
518 xfs_fsop_attrmulti_handlereq_t am_hreq; 520 xfs_fsop_attrmulti_handlereq_t am_hreq;
519 struct dentry *dentry; 521 struct dentry *dentry;
520 unsigned int i, size; 522 unsigned int i, size;
521 char *attr_name; 523 unsigned char *attr_name;
522 524
523 if (!capable(CAP_SYS_ADMIN)) 525 if (!capable(CAP_SYS_ADMIN))
524 return -XFS_ERROR(EPERM); 526 return -XFS_ERROR(EPERM);
@@ -546,7 +548,7 @@ xfs_attrmulti_by_handle(
546 548
547 error = 0; 549 error = 0;
548 for (i = 0; i < am_hreq.opcount; i++) { 550 for (i = 0; i < am_hreq.opcount; i++) {
549 ops[i].am_error = strncpy_from_user(attr_name, 551 ops[i].am_error = strncpy_from_user((char *)attr_name,
550 ops[i].am_attrname, MAXNAMELEN); 552 ops[i].am_attrname, MAXNAMELEN);
551 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) 553 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
552 error = -ERANGE; 554 error = -ERANGE;
@@ -1430,6 +1432,9 @@ xfs_file_ioctl(
1430 if (!capable(CAP_SYS_ADMIN)) 1432 if (!capable(CAP_SYS_ADMIN))
1431 return -EPERM; 1433 return -EPERM;
1432 1434
1435 if (mp->m_flags & XFS_MOUNT_RDONLY)
1436 return -XFS_ERROR(EROFS);
1437
1433 if (copy_from_user(&inout, arg, sizeof(inout))) 1438 if (copy_from_user(&inout, arg, sizeof(inout)))
1434 return -XFS_ERROR(EFAULT); 1439 return -XFS_ERROR(EFAULT);
1435 1440
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
index 7bd7c6afc1eb..d56173b34a2a 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -45,23 +45,23 @@ xfs_readlink_by_handle(
45extern int 45extern int
46xfs_attrmulti_attr_get( 46xfs_attrmulti_attr_get(
47 struct inode *inode, 47 struct inode *inode,
48 char *name, 48 unsigned char *name,
49 char __user *ubuf, 49 unsigned char __user *ubuf,
50 __uint32_t *len, 50 __uint32_t *len,
51 __uint32_t flags); 51 __uint32_t flags);
52 52
53extern int 53extern int
54 xfs_attrmulti_attr_set( 54xfs_attrmulti_attr_set(
55 struct inode *inode, 55 struct inode *inode,
56 char *name, 56 unsigned char *name,
57 const char __user *ubuf, 57 const unsigned char __user *ubuf,
58 __uint32_t len, 58 __uint32_t len,
59 __uint32_t flags); 59 __uint32_t flags);
60 60
61extern int 61extern int
62xfs_attrmulti_attr_remove( 62xfs_attrmulti_attr_remove(
63 struct inode *inode, 63 struct inode *inode,
64 char *name, 64 unsigned char *name,
65 __uint32_t flags); 65 __uint32_t flags);
66 66
67extern struct dentry * 67extern struct dentry *
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index eafcc7c18706..593c05b4df8d 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -18,6 +18,7 @@
18#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/ioctl.h> 19#include <linux/ioctl.h>
20#include <linux/mount.h> 20#include <linux/mount.h>
21#include <linux/slab.h>
21#include <asm/uaccess.h> 22#include <asm/uaccess.h>
22#include "xfs.h" 23#include "xfs.h"
23#include "xfs_fs.h" 24#include "xfs_fs.h"
@@ -46,6 +47,7 @@
46#include "xfs_attr.h" 47#include "xfs_attr.h"
47#include "xfs_ioctl.h" 48#include "xfs_ioctl.h"
48#include "xfs_ioctl32.h" 49#include "xfs_ioctl32.h"
50#include "xfs_trace.h"
49 51
50#define _NATIVE_IOC(cmd, type) \ 52#define _NATIVE_IOC(cmd, type) \
51 _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) 53 _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type))
@@ -410,7 +412,7 @@ xfs_compat_attrmulti_by_handle(
410 compat_xfs_fsop_attrmulti_handlereq_t am_hreq; 412 compat_xfs_fsop_attrmulti_handlereq_t am_hreq;
411 struct dentry *dentry; 413 struct dentry *dentry;
412 unsigned int i, size; 414 unsigned int i, size;
413 char *attr_name; 415 unsigned char *attr_name;
414 416
415 if (!capable(CAP_SYS_ADMIN)) 417 if (!capable(CAP_SYS_ADMIN))
416 return -XFS_ERROR(EPERM); 418 return -XFS_ERROR(EPERM);
@@ -439,7 +441,7 @@ xfs_compat_attrmulti_by_handle(
439 441
440 error = 0; 442 error = 0;
441 for (i = 0; i < am_hreq.opcount; i++) { 443 for (i = 0; i < am_hreq.opcount; i++) {
442 ops[i].am_error = strncpy_from_user(attr_name, 444 ops[i].am_error = strncpy_from_user((char *)attr_name,
443 compat_ptr(ops[i].am_attrname), 445 compat_ptr(ops[i].am_attrname),
444 MAXNAMELEN); 446 MAXNAMELEN);
445 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) 447 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index cd42ef78f6b5..e65a7937f3a4 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -47,6 +47,7 @@
47#include "xfs_buf_item.h" 47#include "xfs_buf_item.h"
48#include "xfs_utils.h" 48#include "xfs_utils.h"
49#include "xfs_vnodeops.h" 49#include "xfs_vnodeops.h"
50#include "xfs_trace.h"
50 51
51#include <linux/capability.h> 52#include <linux/capability.h>
52#include <linux/xattr.h> 53#include <linux/xattr.h>
@@ -55,6 +56,7 @@
55#include <linux/security.h> 56#include <linux/security.h>
56#include <linux/falloc.h> 57#include <linux/falloc.h>
57#include <linux/fiemap.h> 58#include <linux/fiemap.h>
59#include <linux/slab.h>
58 60
59/* 61/*
60 * Bring the timestamps in the XFS inode uptodate. 62 * Bring the timestamps in the XFS inode uptodate.
@@ -90,6 +92,16 @@ xfs_mark_inode_dirty_sync(
90 mark_inode_dirty_sync(inode); 92 mark_inode_dirty_sync(inode);
91} 93}
92 94
95void
96xfs_mark_inode_dirty(
97 xfs_inode_t *ip)
98{
99 struct inode *inode = VFS_I(ip);
100
101 if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
102 mark_inode_dirty(inode);
103}
104
93/* 105/*
94 * Change the requested timestamp in the given inode. 106 * Change the requested timestamp in the given inode.
95 * We don't lock across timestamp updates, and we don't log them but 107 * We don't lock across timestamp updates, and we don't log them but
@@ -139,10 +151,10 @@ xfs_init_security(
139 struct xfs_inode *ip = XFS_I(inode); 151 struct xfs_inode *ip = XFS_I(inode);
140 size_t length; 152 size_t length;
141 void *value; 153 void *value;
142 char *name; 154 unsigned char *name;
143 int error; 155 int error;
144 156
145 error = security_inode_init_security(inode, dir, &name, 157 error = security_inode_init_security(inode, dir, (char **)&name,
146 &value, &length); 158 &value, &length);
147 if (error) { 159 if (error) {
148 if (error == -EOPNOTSUPP) 160 if (error == -EOPNOTSUPP)
@@ -573,8 +585,8 @@ xfs_vn_fallocate(
573 bf.l_len = len; 585 bf.l_len = len;
574 586
575 xfs_ilock(ip, XFS_IOLOCK_EXCL); 587 xfs_ilock(ip, XFS_IOLOCK_EXCL);
576 error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, 588 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
577 0, XFS_ATTR_NOLOCK); 589 0, XFS_ATTR_NOLOCK);
578 if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && 590 if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
579 offset + len > i_size_read(inode)) 591 offset + len > i_size_read(inode))
580 new_size = offset + len; 592 new_size = offset + len;
@@ -585,7 +597,7 @@ xfs_vn_fallocate(
585 597
586 iattr.ia_valid = ATTR_SIZE; 598 iattr.ia_valid = ATTR_SIZE;
587 iattr.ia_size = new_size; 599 iattr.ia_size = new_size;
588 error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); 600 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
589 } 601 }
590 602
591 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 603 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -793,7 +805,7 @@ xfs_setup_inode(
793 struct inode *inode = &ip->i_vnode; 805 struct inode *inode = &ip->i_vnode;
794 806
795 inode->i_ino = ip->i_ino; 807 inode->i_ino = ip->i_ino;
796 inode->i_state = I_NEW|I_LOCK; 808 inode->i_state = I_NEW;
797 inode_add_to_lists(ip->i_mount->m_super, inode); 809 inode_add_to_lists(ip->i_mount->m_super, inode);
798 810
799 inode->i_mode = ip->i_d.di_mode; 811 inode->i_mode = ip->i_d.di_mode;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 6127e24062d0..facfb323a706 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -40,7 +40,6 @@
40#include <sv.h> 40#include <sv.h>
41#include <time.h> 41#include <time.h>
42 42
43#include <support/ktrace.h>
44#include <support/debug.h> 43#include <support/debug.h>
45#include <support/uuid.h> 44#include <support/uuid.h>
46 45
@@ -89,7 +88,6 @@
89#include <xfs_super.h> 88#include <xfs_super.h>
90#include <xfs_globals.h> 89#include <xfs_globals.h>
91#include <xfs_fs_subr.h> 90#include <xfs_fs_subr.h>
92#include <xfs_lrw.h>
93#include <xfs_buf.h> 91#include <xfs_buf.h>
94 92
95/* 93/*
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
deleted file mode 100644
index 072050f8d346..000000000000
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ /dev/null
@@ -1,922 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_bit.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h"
30#include "xfs_mount.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h"
38#include "xfs_bmap.h"
39#include "xfs_btree.h"
40#include "xfs_ialloc.h"
41#include "xfs_rtalloc.h"
42#include "xfs_error.h"
43#include "xfs_itable.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h"
46#include "xfs_inode_item.h"
47#include "xfs_buf_item.h"
48#include "xfs_utils.h"
49#include "xfs_iomap.h"
50#include "xfs_vnodeops.h"
51
52#include <linux/capability.h>
53#include <linux/writeback.h>
54
55
56#if defined(XFS_RW_TRACE)
57void
58xfs_rw_enter_trace(
59 int tag,
60 xfs_inode_t *ip,
61 void *data,
62 size_t segs,
63 loff_t offset,
64 int ioflags)
65{
66 if (ip->i_rwtrace == NULL)
67 return;
68 ktrace_enter(ip->i_rwtrace,
69 (void *)(unsigned long)tag,
70 (void *)ip,
71 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
72 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
73 (void *)data,
74 (void *)((unsigned long)segs),
75 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
76 (void *)((unsigned long)(offset & 0xffffffff)),
77 (void *)((unsigned long)ioflags),
78 (void *)((unsigned long)((ip->i_new_size >> 32) & 0xffffffff)),
79 (void *)((unsigned long)(ip->i_new_size & 0xffffffff)),
80 (void *)((unsigned long)current_pid()),
81 (void *)NULL,
82 (void *)NULL,
83 (void *)NULL,
84 (void *)NULL);
85}
86
87void
88xfs_inval_cached_trace(
89 xfs_inode_t *ip,
90 xfs_off_t offset,
91 xfs_off_t len,
92 xfs_off_t first,
93 xfs_off_t last)
94{
95
96 if (ip->i_rwtrace == NULL)
97 return;
98 ktrace_enter(ip->i_rwtrace,
99 (void *)(__psint_t)XFS_INVAL_CACHED,
100 (void *)ip,
101 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
102 (void *)((unsigned long)(offset & 0xffffffff)),
103 (void *)((unsigned long)((len >> 32) & 0xffffffff)),
104 (void *)((unsigned long)(len & 0xffffffff)),
105 (void *)((unsigned long)((first >> 32) & 0xffffffff)),
106 (void *)((unsigned long)(first & 0xffffffff)),
107 (void *)((unsigned long)((last >> 32) & 0xffffffff)),
108 (void *)((unsigned long)(last & 0xffffffff)),
109 (void *)((unsigned long)current_pid()),
110 (void *)NULL,
111 (void *)NULL,
112 (void *)NULL,
113 (void *)NULL,
114 (void *)NULL);
115}
116#endif
117
118/*
119 * xfs_iozero
120 *
 121 * xfs_iozero clears the specified range of the supplied buffer,
122 * and marks all the affected blocks as valid and modified. If
123 * an affected block is not allocated, it will be allocated. If
124 * an affected block is not completely overwritten, and is not
125 * valid before the operation, it will be read from disk before
126 * being partially zeroed.
127 */
128STATIC int
129xfs_iozero(
130 struct xfs_inode *ip, /* inode */
131 loff_t pos, /* offset in file */
132 size_t count) /* size of data to zero */
133{
134 struct page *page;
135 struct address_space *mapping;
136 int status;
137
138 mapping = VFS_I(ip)->i_mapping;
139 do {
140 unsigned offset, bytes;
141 void *fsdata;
142
143 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
144 bytes = PAGE_CACHE_SIZE - offset;
145 if (bytes > count)
146 bytes = count;
147
148 status = pagecache_write_begin(NULL, mapping, pos, bytes,
149 AOP_FLAG_UNINTERRUPTIBLE,
150 &page, &fsdata);
151 if (status)
152 break;
153
154 zero_user(page, offset, bytes);
155
156 status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
157 page, fsdata);
158 WARN_ON(status <= 0); /* can't return less than zero! */
159 pos += bytes;
160 count -= bytes;
161 status = 0;
162 } while (count);
163
164 return (-status);
165}
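
xfs_iozero above goes through the page cache (pagecache_write_begin/zero_user/pagecache_write_end) instead of issuing block I/O directly, so partially covered blocks are read in and zeroed correctly. The per-iteration chunking is the usual "offset within page, bytes to page end, capped by the remainder" computation; standalone, assuming 4096-byte pages:

/* Standalone sketch of the chunking loop in xfs_iozero. */
#define PGSZ 4096u

static void walk_pages(unsigned long long pos, unsigned long long count)
{
	while (count) {
		unsigned offset = (unsigned)(pos & (PGSZ - 1));
		unsigned bytes = PGSZ - offset;	/* up to end of this page */

		if (bytes > count)
			bytes = (unsigned)count;

		/* ... zero 'bytes' bytes of the page at 'pos' here ... */

		pos += bytes;
		count -= bytes;
	}
}
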
166
167ssize_t /* bytes read, or (-) error */
168xfs_read(
169 xfs_inode_t *ip,
170 struct kiocb *iocb,
171 const struct iovec *iovp,
172 unsigned int segs,
173 loff_t *offset,
174 int ioflags)
175{
176 struct file *file = iocb->ki_filp;
177 struct inode *inode = file->f_mapping->host;
178 xfs_mount_t *mp = ip->i_mount;
179 size_t size = 0;
180 ssize_t ret = 0;
181 xfs_fsize_t n;
182 unsigned long seg;
183
184
185 XFS_STATS_INC(xs_read_calls);
186
187 /* START copy & waste from filemap.c */
188 for (seg = 0; seg < segs; seg++) {
189 const struct iovec *iv = &iovp[seg];
190
191 /*
192 * If any segment has a negative length, or the cumulative
193 * length ever wraps negative then return -EINVAL.
194 */
195 size += iv->iov_len;
196 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
197 return XFS_ERROR(-EINVAL);
198 }
199 /* END copy & waste from filemap.c */
200
201 if (unlikely(ioflags & IO_ISDIRECT)) {
202 xfs_buftarg_t *target =
203 XFS_IS_REALTIME_INODE(ip) ?
204 mp->m_rtdev_targp : mp->m_ddev_targp;
205 if ((*offset & target->bt_smask) ||
206 (size & target->bt_smask)) {
207 if (*offset == ip->i_size) {
208 return (0);
209 }
210 return -XFS_ERROR(EINVAL);
211 }
212 }
213
214 n = XFS_MAXIOFFSET(mp) - *offset;
215 if ((n <= 0) || (size == 0))
216 return 0;
217
218 if (n < size)
219 size = n;
220
221 if (XFS_FORCED_SHUTDOWN(mp))
222 return -EIO;
223
224 if (unlikely(ioflags & IO_ISDIRECT))
225 mutex_lock(&inode->i_mutex);
226 xfs_ilock(ip, XFS_IOLOCK_SHARED);
227
228 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
229 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
230 int iolock = XFS_IOLOCK_SHARED;
231
232 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size,
233 dmflags, &iolock);
234 if (ret) {
235 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
236 if (unlikely(ioflags & IO_ISDIRECT))
237 mutex_unlock(&inode->i_mutex);
238 return ret;
239 }
240 }
241
242 if (unlikely(ioflags & IO_ISDIRECT)) {
243 if (inode->i_mapping->nrpages)
244 ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
245 -1, FI_REMAPF_LOCKED);
246 mutex_unlock(&inode->i_mutex);
247 if (ret) {
248 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
249 return ret;
250 }
251 }
252
253 xfs_rw_enter_trace(XFS_READ_ENTER, ip,
254 (void *)iovp, segs, *offset, ioflags);
255
256 iocb->ki_pos = *offset;
257 ret = generic_file_aio_read(iocb, iovp, segs, *offset);
258 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
259 ret = wait_on_sync_kiocb(iocb);
260 if (ret > 0)
261 XFS_STATS_ADD(xs_read_bytes, ret);
262
263 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
264 return ret;
265}
266
267ssize_t
268xfs_splice_read(
269 xfs_inode_t *ip,
270 struct file *infilp,
271 loff_t *ppos,
272 struct pipe_inode_info *pipe,
273 size_t count,
274 int flags,
275 int ioflags)
276{
277 xfs_mount_t *mp = ip->i_mount;
278 ssize_t ret;
279
280 XFS_STATS_INC(xs_read_calls);
281 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
282 return -EIO;
283
284 xfs_ilock(ip, XFS_IOLOCK_SHARED);
285
286 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
287 int iolock = XFS_IOLOCK_SHARED;
288 int error;
289
290 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
291 FILP_DELAY_FLAG(infilp), &iolock);
292 if (error) {
293 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
294 return -error;
295 }
296 }
297 xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, ip,
298 pipe, count, *ppos, ioflags);
299 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
300 if (ret > 0)
301 XFS_STATS_ADD(xs_read_bytes, ret);
302
303 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
304 return ret;
305}
306
307ssize_t
308xfs_splice_write(
309 xfs_inode_t *ip,
310 struct pipe_inode_info *pipe,
311 struct file *outfilp,
312 loff_t *ppos,
313 size_t count,
314 int flags,
315 int ioflags)
316{
317 xfs_mount_t *mp = ip->i_mount;
318 ssize_t ret;
319 struct inode *inode = outfilp->f_mapping->host;
320 xfs_fsize_t isize, new_size;
321
322 XFS_STATS_INC(xs_write_calls);
323 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
324 return -EIO;
325
326 xfs_ilock(ip, XFS_IOLOCK_EXCL);
327
328 if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
329 int iolock = XFS_IOLOCK_EXCL;
330 int error;
331
332 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
333 FILP_DELAY_FLAG(outfilp), &iolock);
334 if (error) {
335 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
336 return -error;
337 }
338 }
339
340 new_size = *ppos + count;
341
342 xfs_ilock(ip, XFS_ILOCK_EXCL);
343 if (new_size > ip->i_size)
344 ip->i_new_size = new_size;
345 xfs_iunlock(ip, XFS_ILOCK_EXCL);
346
347 xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, ip,
348 pipe, count, *ppos, ioflags);
349 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
350 if (ret > 0)
351 XFS_STATS_ADD(xs_write_bytes, ret);
352
353 isize = i_size_read(inode);
354 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
355 *ppos = isize;
356
357 if (*ppos > ip->i_size) {
358 xfs_ilock(ip, XFS_ILOCK_EXCL);
359 if (*ppos > ip->i_size)
360 ip->i_size = *ppos;
361 xfs_iunlock(ip, XFS_ILOCK_EXCL);
362 }
363
364 if (ip->i_new_size) {
365 xfs_ilock(ip, XFS_ILOCK_EXCL);
366 ip->i_new_size = 0;
367 if (ip->i_d.di_size > ip->i_size)
368 ip->i_d.di_size = ip->i_size;
369 xfs_iunlock(ip, XFS_ILOCK_EXCL);
370 }
371 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
372 return ret;
373}
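
xfs_write and xfs_splice_write share a speculative-size protocol: i_new_size is set to the expected post-write size before I/O is issued (so concurrent writeback may push the on-disk size forward), then cleared afterwards, with di_size trimmed back if the I/O ended short. Reduced to its essentials, with plain-C stand-ins for the inode fields:

/* Reduced sketch of the i_new_size protocol. */
struct sizes {
	long long i_size;	/* in-memory file size */
	long long di_size;	/* on-disk file size */
	long long i_new_size;	/* speculative size hint for writeback */
};

static void before_io(struct sizes *s, long long new_size)
{
	if (new_size > s->i_size)
		s->i_new_size = new_size;
}

static void after_io(struct sizes *s)
{
	s->i_new_size = 0;
	if (s->di_size > s->i_size)	/* I/O failed short: trim back */
		s->di_size = s->i_size;
}
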
374
375/*
376 * This routine is called to handle zeroing any space in the last
377 * block of the file that is beyond the EOF. We do this since the
378 * size is being increased without writing anything to that block
379 * and we don't want anyone to read the garbage on the disk.
380 */
381STATIC int /* error (positive) */
382xfs_zero_last_block(
383 xfs_inode_t *ip,
384 xfs_fsize_t offset,
385 xfs_fsize_t isize)
386{
387 xfs_fileoff_t last_fsb;
388 xfs_mount_t *mp = ip->i_mount;
389 int nimaps;
390 int zero_offset;
391 int zero_len;
392 int error = 0;
393 xfs_bmbt_irec_t imap;
394
395 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
396
397 zero_offset = XFS_B_FSB_OFFSET(mp, isize);
398 if (zero_offset == 0) {
399 /*
400 * There are no extra bytes in the last block on disk to
401 * zero, so return.
402 */
403 return 0;
404 }
405
406 last_fsb = XFS_B_TO_FSBT(mp, isize);
407 nimaps = 1;
408 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
409 &nimaps, NULL, NULL);
410 if (error) {
411 return error;
412 }
413 ASSERT(nimaps > 0);
414 /*
415 * If the block underlying isize is just a hole, then there
416 * is nothing to zero.
417 */
418 if (imap.br_startblock == HOLESTARTBLOCK) {
419 return 0;
420 }
421 /*
422 * Zero the part of the last block beyond the EOF, and write it
423 * out sync. We need to drop the ilock while we do this so we
424 * don't deadlock when the buffer cache calls back to us.
425 */
426 xfs_iunlock(ip, XFS_ILOCK_EXCL);
427
428 zero_len = mp->m_sb.sb_blocksize - zero_offset;
429 if (isize + zero_len > offset)
430 zero_len = offset - isize;
431 error = xfs_iozero(ip, isize, zero_len);
432
433 xfs_ilock(ip, XFS_ILOCK_EXCL);
434 ASSERT(error >= 0);
435 return error;
436}
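
XFS_B_FSB_OFFSET above is the byte offset of isize within its filesystem block; a zero result means the old EOF lands exactly on a block boundary and there is no partial block to clear. With a power-of-two block size the computation is a simple mask, e.g. for 4096-byte blocks:

/* Illustration: offset of isize within a 4096-byte block. */
static unsigned fsb_offset(unsigned long long isize)
{
	return (unsigned)(isize & (4096 - 1));
}

/* fsb_offset(8192) == 0   -> EOF on a block boundary, nothing to zero;
 * fsb_offset(9000) == 808 -> zero the trailing 4096 - 808 bytes,
 *                            capped at 'offset' as in the code above. */
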
437
438/*
439 * Zero any on disk space between the current EOF and the new,
440 * larger EOF. This handles the normal case of zeroing the remainder
441 * of the last block in the file and the unusual case of zeroing blocks
442 * out beyond the size of the file. This second case only happens
443 * with fixed size extents and when the system crashes before the inode
444 * size was updated but after blocks were allocated. If fill is set,
445 * then any holes in the range are filled and zeroed. If not, the holes
446 * are left alone as holes.
447 */
448
449int /* error (positive) */
450xfs_zero_eof(
451 xfs_inode_t *ip,
452 xfs_off_t offset, /* starting I/O offset */
453 xfs_fsize_t isize) /* current inode size */
454{
455 xfs_mount_t *mp = ip->i_mount;
456 xfs_fileoff_t start_zero_fsb;
457 xfs_fileoff_t end_zero_fsb;
458 xfs_fileoff_t zero_count_fsb;
459 xfs_fileoff_t last_fsb;
460 xfs_fileoff_t zero_off;
461 xfs_fsize_t zero_len;
462 int nimaps;
463 int error = 0;
464 xfs_bmbt_irec_t imap;
465
466 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
467 ASSERT(offset > isize);
468
469 /*
470 * First handle zeroing the block on which isize resides.
471 * We only zero a part of that block so it is handled specially.
472 */
473 error = xfs_zero_last_block(ip, offset, isize);
474 if (error) {
475 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
476 return error;
477 }
478
479 /*
480 * Calculate the range between the new size and the old
481 * where blocks needing to be zeroed may exist. To get the
482 * block where the last byte in the file currently resides,
483 * we need to subtract one from the size and truncate back
484 * to a block boundary. We subtract 1 in case the size is
485 * exactly on a block boundary.
486 */
487 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
488 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
489 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
490 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
491 if (last_fsb == end_zero_fsb) {
492 /*
493 * The size was only incremented on its last block.
494 * We took care of that above, so just return.
495 */
496 return 0;
497 }
498
499 ASSERT(start_zero_fsb <= end_zero_fsb);
500 while (start_zero_fsb <= end_zero_fsb) {
501 nimaps = 1;
502 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
503 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
504 0, NULL, 0, &imap, &nimaps, NULL, NULL);
505 if (error) {
506 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
507 return error;
508 }
509 ASSERT(nimaps > 0);
510
511 if (imap.br_state == XFS_EXT_UNWRITTEN ||
512 imap.br_startblock == HOLESTARTBLOCK) {
 513			/*
 514			 * The extent is a hole or an unwritten
 515			 * extent, so there is nothing on disk that
 516			 * needs zeroing.  Skip past the extent and
 517			 * carry on scanning the rest of the range
 518			 * for allocated blocks to zero.
 519			 */
520 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
521 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
522 continue;
523 }
524
525 /*
526 * There are blocks we need to zero.
527 * Drop the inode lock while we're doing the I/O.
528 * We'll still have the iolock to protect us.
529 */
530 xfs_iunlock(ip, XFS_ILOCK_EXCL);
531
532 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
533 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
534
535 if ((zero_off + zero_len) > offset)
536 zero_len = offset - zero_off;
537
538 error = xfs_iozero(ip, zero_off, zero_len);
539 if (error) {
540 goto out_lock;
541 }
542
543 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
544 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
545
546 xfs_ilock(ip, XFS_ILOCK_EXCL);
547 }
548
549 return 0;
550
551out_lock:
552 xfs_ilock(ip, XFS_ILOCK_EXCL);
553 ASSERT(error >= 0);
554 return error;
555}
556
557ssize_t /* bytes written, or (-) error */
558xfs_write(
559 struct xfs_inode *xip,
560 struct kiocb *iocb,
561 const struct iovec *iovp,
562 unsigned int nsegs,
563 loff_t *offset,
564 int ioflags)
565{
566 struct file *file = iocb->ki_filp;
567 struct address_space *mapping = file->f_mapping;
568 struct inode *inode = mapping->host;
569 unsigned long segs = nsegs;
570 xfs_mount_t *mp;
571 ssize_t ret = 0, error = 0;
572 xfs_fsize_t isize, new_size;
573 int iolock;
574 int eventsent = 0;
575 size_t ocount = 0, count;
576 loff_t pos;
577 int need_i_mutex;
578
579 XFS_STATS_INC(xs_write_calls);
580
581 error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ);
582 if (error)
583 return error;
584
585 count = ocount;
586 pos = *offset;
587
588 if (count == 0)
589 return 0;
590
591 mp = xip->i_mount;
592
593 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
594
595 if (XFS_FORCED_SHUTDOWN(mp))
596 return -EIO;
597
598relock:
599 if (ioflags & IO_ISDIRECT) {
600 iolock = XFS_IOLOCK_SHARED;
601 need_i_mutex = 0;
602 } else {
603 iolock = XFS_IOLOCK_EXCL;
604 need_i_mutex = 1;
605 mutex_lock(&inode->i_mutex);
606 }
607
608 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
609
610start:
611 error = -generic_write_checks(file, &pos, &count,
612 S_ISBLK(inode->i_mode));
613 if (error) {
614 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
615 goto out_unlock_mutex;
616 }
617
618 if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) &&
619 !(ioflags & IO_INVIS) && !eventsent)) {
620 int dmflags = FILP_DELAY_FLAG(file);
621
622 if (need_i_mutex)
623 dmflags |= DM_FLAGS_IMUX;
624
625 xfs_iunlock(xip, XFS_ILOCK_EXCL);
626 error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip,
627 pos, count, dmflags, &iolock);
628 if (error) {
629 goto out_unlock_internal;
630 }
631 xfs_ilock(xip, XFS_ILOCK_EXCL);
632 eventsent = 1;
633
634 /*
635 * The iolock was dropped and reacquired in XFS_SEND_DATA
636 * so we have to recheck the size when appending.
637 * We will only "goto start;" once, since having sent the
638 * event prevents another call to XFS_SEND_DATA, which is
639 * what allows the size to change in the first place.
640 */
641 if ((file->f_flags & O_APPEND) && pos != xip->i_size)
642 goto start;
643 }
644
645 if (ioflags & IO_ISDIRECT) {
646 xfs_buftarg_t *target =
647 XFS_IS_REALTIME_INODE(xip) ?
648 mp->m_rtdev_targp : mp->m_ddev_targp;
649
650 if ((pos & target->bt_smask) || (count & target->bt_smask)) {
651 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
652 return XFS_ERROR(-EINVAL);
653 }
654
655 if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) {
656 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
657 iolock = XFS_IOLOCK_EXCL;
658 need_i_mutex = 1;
659 mutex_lock(&inode->i_mutex);
660 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
661 goto start;
662 }
663 }
664
665 new_size = pos + count;
666 if (new_size > xip->i_size)
667 xip->i_new_size = new_size;
668
669 if (likely(!(ioflags & IO_INVIS)))
670 file_update_time(file);
671
 672	/*
 673	 * If the write starts beyond the current size of the file, any
 674	 * already-allocated blocks between the old EOF and the write
 675	 * offset must be zeroed so stale on-disk data is never exposed.
 676	 *
 677	 * In particular, the partial block holding the old EOF must be
 678	 * zeroed out beyond the old size.
 679	 */
680
681 if (pos > xip->i_size) {
682 error = xfs_zero_eof(xip, pos, xip->i_size);
683 if (error) {
684 xfs_iunlock(xip, XFS_ILOCK_EXCL);
685 goto out_unlock_internal;
686 }
687 }
688 xfs_iunlock(xip, XFS_ILOCK_EXCL);
689
690 /*
691 * If we're writing the file then make sure to clear the
692 * setuid and setgid bits if the process is not being run
693 * by root. This keeps people from modifying setuid and
694 * setgid binaries.
695 */
696
697 if (((xip->i_d.di_mode & S_ISUID) ||
698 ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
699 (S_ISGID | S_IXGRP))) &&
700 !capable(CAP_FSETID)) {
701 error = xfs_write_clear_setuid(xip);
702 if (likely(!error))
703 error = -file_remove_suid(file);
704 if (unlikely(error)) {
705 goto out_unlock_internal;
706 }
707 }
708
709 /* We can write back this queue in page reclaim */
710 current->backing_dev_info = mapping->backing_dev_info;
711
712 if ((ioflags & IO_ISDIRECT)) {
713 if (mapping->nrpages) {
714 WARN_ON(need_i_mutex == 0);
715 xfs_inval_cached_trace(xip, pos, -1,
716 (pos & PAGE_CACHE_MASK), -1);
717 error = xfs_flushinval_pages(xip,
718 (pos & PAGE_CACHE_MASK),
719 -1, FI_REMAPF_LOCKED);
720 if (error)
721 goto out_unlock_internal;
722 }
723
724 if (need_i_mutex) {
725 /* demote the lock now the cached pages are gone */
726 xfs_ilock_demote(xip, XFS_IOLOCK_EXCL);
727 mutex_unlock(&inode->i_mutex);
728
729 iolock = XFS_IOLOCK_SHARED;
730 need_i_mutex = 0;
731 }
732
733 xfs_rw_enter_trace(XFS_DIOWR_ENTER, xip, (void *)iovp, segs,
734 *offset, ioflags);
735 ret = generic_file_direct_write(iocb, iovp,
736 &segs, pos, offset, count, ocount);
737
738 /*
739 * direct-io write to a hole: fall through to buffered I/O
740 * for completing the rest of the request.
741 */
742 if (ret >= 0 && ret != count) {
743 XFS_STATS_ADD(xs_write_bytes, ret);
744
745 pos += ret;
746 count -= ret;
747
748 ioflags &= ~IO_ISDIRECT;
749 xfs_iunlock(xip, iolock);
750 goto relock;
751 }
752 } else {
753 int enospc = 0;
754 ssize_t ret2 = 0;
755
756write_retry:
757 xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs,
758 *offset, ioflags);
759 ret2 = generic_file_buffered_write(iocb, iovp, segs,
760 pos, offset, count, ret);
761 /*
 762		 * if we just got an ENOSPC, flush the inode now that we
 763		 * aren't holding any page locks, and retry *once*
764 */
765 if (ret2 == -ENOSPC && !enospc) {
766 error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE);
767 if (error)
768 goto out_unlock_internal;
769 enospc = 1;
770 goto write_retry;
771 }
772 ret = ret2;
773 }
774
775 current->backing_dev_info = NULL;
776
777 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
778 ret = wait_on_sync_kiocb(iocb);
779
780 isize = i_size_read(inode);
781 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
782 *offset = isize;
783
784 if (*offset > xip->i_size) {
785 xfs_ilock(xip, XFS_ILOCK_EXCL);
786 if (*offset > xip->i_size)
787 xip->i_size = *offset;
788 xfs_iunlock(xip, XFS_ILOCK_EXCL);
789 }
790
791 if (ret == -ENOSPC &&
792 DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
793 xfs_iunlock(xip, iolock);
794 if (need_i_mutex)
795 mutex_unlock(&inode->i_mutex);
796 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip,
797 DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL,
798 0, 0, 0); /* Delay flag intentionally unused */
799 if (need_i_mutex)
800 mutex_lock(&inode->i_mutex);
801 xfs_ilock(xip, iolock);
802 if (error)
803 goto out_unlock_internal;
804 goto start;
805 }
806
807 error = -ret;
808 if (ret <= 0)
809 goto out_unlock_internal;
810
811 XFS_STATS_ADD(xs_write_bytes, ret);
812
813 /* Handle various SYNC-type writes */
814 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
815 loff_t end = pos + ret - 1;
816 int error2;
817
818 xfs_iunlock(xip, iolock);
819 if (need_i_mutex)
820 mutex_unlock(&inode->i_mutex);
821
822 error2 = filemap_write_and_wait_range(mapping, pos, end);
823 if (!error)
824 error = error2;
825 if (need_i_mutex)
826 mutex_lock(&inode->i_mutex);
827 xfs_ilock(xip, iolock);
828
829 error2 = xfs_fsync(xip);
830 if (!error)
831 error = error2;
832 }
833
834 out_unlock_internal:
835 if (xip->i_new_size) {
836 xfs_ilock(xip, XFS_ILOCK_EXCL);
837 xip->i_new_size = 0;
838 /*
839 * If this was a direct or synchronous I/O that failed (such
840 * as ENOSPC) then part of the I/O may have been written to
 841	 * disk before the error occurred. In this case the on-disk
842 * file size may have been adjusted beyond the in-memory file
843 * size and now needs to be truncated back.
844 */
845 if (xip->i_d.di_size > xip->i_size)
846 xip->i_d.di_size = xip->i_size;
847 xfs_iunlock(xip, XFS_ILOCK_EXCL);
848 }
849 xfs_iunlock(xip, iolock);
850 out_unlock_mutex:
851 if (need_i_mutex)
852 mutex_unlock(&inode->i_mutex);
853 return -error;
854}
855
856/*
857 * All xfs metadata buffers except log state machine buffers
858 * get this attached as their b_bdstrat callback function.
859 * This is so that we can catch a buffer
 860 * after prematurely unpinning it to forcibly shut down the filesystem.
861 */
862int
863xfs_bdstrat_cb(struct xfs_buf *bp)
864{
865 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
866 xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
867 /*
 868		 * Metadata write that didn't get logged but
 869		 * was written out delayed anyway. These aren't
 870		 * associated with a transaction and can be ignored.
871 */
872 if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
873 (XFS_BUF_ISREAD(bp)) == 0)
874 return (xfs_bioerror_relse(bp));
875 else
876 return (xfs_bioerror(bp));
877 }
878
879 xfs_buf_iorequest(bp);
880 return 0;
881}
882
883/*
884 * Wrapper around bdstrat so that we can stop data from going to disk in case
 885 * we are shutting down the filesystem. Typically user data goes through this
886 * path; one of the exceptions is the superblock.
887 */
888void
889xfsbdstrat(
890 struct xfs_mount *mp,
891 struct xfs_buf *bp)
892{
893 ASSERT(mp);
894 if (!XFS_FORCED_SHUTDOWN(mp)) {
895 xfs_buf_iorequest(bp);
896 return;
897 }
898
899 xfs_buftrace("XFSBDSTRAT IOERROR", bp);
900 xfs_bioerror_relse(bp);
901}
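
Both xfs_bdstrat_cb and xfsbdstrat are shutdown guards: once the filesystem is force-shut-down, buffers are failed immediately rather than sent to a device whose state can no longer be trusted. The skeleton of the pattern (struct names and helpers here are stand-ins):

struct fsdev { int shut_down; };
struct iobuf;

void fail_buf(struct iobuf *bp);	/* stand-in for xfs_bioerror_relse */
void start_io(struct iobuf *bp);	/* stand-in for xfs_buf_iorequest */

static void submit_buf(struct fsdev *fs, struct iobuf *bp)
{
	if (fs->shut_down) {
		fail_buf(bp);	/* error the buffer, never touch the disk */
		return;
	}
	start_io(bp);
}
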
902
903/*
904 * If the underlying (data/log/rt) device is readonly, there are some
905 * operations that cannot proceed.
906 */
907int
908xfs_dev_is_read_only(
909 xfs_mount_t *mp,
910 char *message)
911{
912 if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
913 xfs_readonly_buftarg(mp->m_logdev_targp) ||
914 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
915 cmn_err(CE_NOTE,
916 "XFS: %s required on read-only device.", message);
917 cmn_err(CE_NOTE,
918 "XFS: write access unavailable, cannot proceed.");
919 return EROFS;
920 }
921 return 0;
922}
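
Callers are expected to use xfs_dev_is_read_only as an early bailout before operations that must write to the data, log, or realtime device. A hypothetical caller (the function name is invented for illustration):

/* Hypothetical caller sketch; note the positive-errno convention. */
int xfs_grow_something(xfs_mount_t *mp)
{
	int error = xfs_dev_is_read_only(mp, "grow");

	if (error)
		return error;	/* EROFS, positive per XFS convention */

	/* ... safe to proceed with on-disk modifications ... */
	return 0;
}
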
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
deleted file mode 100644
index e6be37dbd0e9..000000000000
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ /dev/null
@@ -1,77 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_LRW_H__
19#define __XFS_LRW_H__
20
21struct xfs_mount;
22struct xfs_inode;
23struct xfs_bmbt_irec;
24struct xfs_buf;
25struct xfs_iomap;
26
27#if defined(XFS_RW_TRACE)
28/*
29 * Defines for the trace mechanisms in xfs_lrw.c.
30 */
31#define XFS_RW_KTRACE_SIZE 128
32
33#define XFS_READ_ENTER 1
34#define XFS_WRITE_ENTER 2
35#define XFS_IOMAP_READ_ENTER 3
36#define XFS_IOMAP_WRITE_ENTER 4
37#define XFS_IOMAP_READ_MAP 5
38#define XFS_IOMAP_WRITE_MAP 6
39#define XFS_IOMAP_WRITE_NOSPACE 7
40#define XFS_ITRUNC_START 8
41#define XFS_ITRUNC_FINISH1 9
42#define XFS_ITRUNC_FINISH2 10
43#define XFS_CTRUNC1 11
44#define XFS_CTRUNC2 12
45#define XFS_CTRUNC3 13
46#define XFS_CTRUNC4 14
47#define XFS_CTRUNC5 15
48#define XFS_CTRUNC6 16
49#define XFS_BUNMAP 17
50#define XFS_INVAL_CACHED 18
51#define XFS_DIORD_ENTER 19
52#define XFS_DIOWR_ENTER 20
53#define XFS_WRITEPAGE_ENTER 22
54#define XFS_RELEASEPAGE_ENTER 23
55#define XFS_INVALIDPAGE_ENTER 24
56#define XFS_IOMAP_ALLOC_ENTER 25
57#define XFS_IOMAP_ALLOC_MAP 26
58#define XFS_IOMAP_UNWRITTEN 27
59#define XFS_SPLICE_READ_ENTER 28
60#define XFS_SPLICE_WRITE_ENTER 29
61extern void xfs_rw_enter_trace(int, struct xfs_inode *,
62 void *, size_t, loff_t, int);
63extern void xfs_inval_cached_trace(struct xfs_inode *,
64 xfs_off_t, xfs_off_t, xfs_off_t, xfs_off_t);
65#else
66#define xfs_rw_enter_trace(tag, ip, data, size, offset, ioflags)
67#define xfs_inval_cached_trace(ip, offset, len, first, last)
68#endif
69
70/* errors from xfsbdstrat() must be extracted from the buffer */
71extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
72extern int xfs_bdstrat_cb(struct xfs_buf *);
73extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
74
75extern int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
76
77#endif /* __XFS_LRW_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 3d4a0c84d634..1947514ce1ad 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -44,20 +44,6 @@ xfs_quota_type(int type)
44} 44}
45 45
46STATIC int 46STATIC int
47xfs_fs_quota_sync(
48 struct super_block *sb,
49 int type)
50{
51 struct xfs_mount *mp = XFS_M(sb);
52
53 if (sb->s_flags & MS_RDONLY)
54 return -EROFS;
55 if (!XFS_IS_QUOTA_RUNNING(mp))
56 return -ENOSYS;
57 return -xfs_sync_data(mp, 0);
58}
59
60STATIC int
61xfs_fs_get_xstate( 47xfs_fs_get_xstate(
62 struct super_block *sb, 48 struct super_block *sb,
63 struct fs_quota_stat *fqs) 49 struct fs_quota_stat *fqs)
@@ -82,8 +68,6 @@ xfs_fs_set_xstate(
82 return -EROFS; 68 return -EROFS;
83 if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) 69 if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
84 return -ENOSYS; 70 return -ENOSYS;
85 if (!capable(CAP_SYS_ADMIN))
86 return -EPERM;
87 71
88 if (uflags & XFS_QUOTA_UDQ_ACCT) 72 if (uflags & XFS_QUOTA_UDQ_ACCT)
89 flags |= XFS_UQUOTA_ACCT; 73 flags |= XFS_UQUOTA_ACCT;
@@ -144,14 +128,11 @@ xfs_fs_set_xquota(
144 return -ENOSYS; 128 return -ENOSYS;
145 if (!XFS_IS_QUOTA_ON(mp)) 129 if (!XFS_IS_QUOTA_ON(mp))
146 return -ESRCH; 130 return -ESRCH;
147 if (!capable(CAP_SYS_ADMIN))
148 return -EPERM;
149 131
150 return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); 132 return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
151} 133}
152 134
153const struct quotactl_ops xfs_quotactl_operations = { 135const struct quotactl_ops xfs_quotactl_operations = {
154 .quota_sync = xfs_fs_quota_sync,
155 .get_xstate = xfs_fs_get_xstate, 136 .get_xstate = xfs_fs_get_xstate,
156 .set_xstate = xfs_fs_set_xstate, 137 .set_xstate = xfs_fs_set_xstate,
157 .get_xquota = xfs_fs_get_xquota, 138 .get_xquota = xfs_fs_get_xquota,
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 18a4b8e11df2..52e06b487ced 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -15,6 +15,7 @@
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18
18#include "xfs.h" 19#include "xfs.h"
19#include "xfs_bit.h" 20#include "xfs_bit.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
@@ -52,14 +53,15 @@
52#include "xfs_trans_priv.h" 53#include "xfs_trans_priv.h"
53#include "xfs_filestream.h" 54#include "xfs_filestream.h"
54#include "xfs_da_btree.h" 55#include "xfs_da_btree.h"
55#include "xfs_dir2_trace.h"
56#include "xfs_extfree_item.h" 56#include "xfs_extfree_item.h"
57#include "xfs_mru_cache.h" 57#include "xfs_mru_cache.h"
58#include "xfs_inode_item.h" 58#include "xfs_inode_item.h"
59#include "xfs_sync.h" 59#include "xfs_sync.h"
60#include "xfs_trace.h"
60 61
61#include <linux/namei.h> 62#include <linux/namei.h>
62#include <linux/init.h> 63#include <linux/init.h>
64#include <linux/slab.h>
63#include <linux/mount.h> 65#include <linux/mount.h>
64#include <linux/mempool.h> 66#include <linux/mempool.h>
65#include <linux/writeback.h> 67#include <linux/writeback.h>
@@ -876,12 +878,11 @@ xfsaild(
876{ 878{
877 struct xfs_ail *ailp = data; 879 struct xfs_ail *ailp = data;
878 xfs_lsn_t last_pushed_lsn = 0; 880 xfs_lsn_t last_pushed_lsn = 0;
879 long tout = 0; 881 long tout = 0; /* milliseconds */
880 882
881 while (!kthread_should_stop()) { 883 while (!kthread_should_stop()) {
882 if (tout) 884 schedule_timeout_interruptible(tout ?
883 schedule_timeout_interruptible(msecs_to_jiffies(tout)); 885 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
884 tout = 1000;
885 886
886 /* swsusp */ 887 /* swsusp */
887 try_to_freeze(); 888 try_to_freeze();
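
The xfsaild hunk above changes the thread from a fixed one-second poll to "sleep for the timeout the last push computed, or indefinitely (MAX_SCHEDULE_TIMEOUT) when there is nothing to push", so an idle AIL thread no longer wakes up needlessly. Condensed, using the same kernel APIs shown in the hunk (do_push is a stand-in for the push routine):

/* Condensed form of the loop; kernel context assumed. */
static long do_push(void *data);	/* returns ms to next push, 0 = idle */

static int aild_loop(void *data)
{
	long tout = 0;	/* ms until the next push; 0 means fully idle */

	while (!kthread_should_stop()) {
		schedule_timeout_interruptible(tout ?
				msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
		try_to_freeze();	/* cooperate with suspend */
		tout = do_push(data);
	}
	return 0;
}
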
@@ -930,13 +931,37 @@ xfs_fs_alloc_inode(
930 */ 931 */
931STATIC void 932STATIC void
932xfs_fs_destroy_inode( 933xfs_fs_destroy_inode(
933 struct inode *inode) 934 struct inode *inode)
934{ 935{
935 xfs_inode_t *ip = XFS_I(inode); 936 struct xfs_inode *ip = XFS_I(inode);
937
938 xfs_itrace_entry(ip);
936 939
937 XFS_STATS_INC(vn_reclaim); 940 XFS_STATS_INC(vn_reclaim);
938 if (xfs_reclaim(ip)) 941
939 panic("%s: cannot reclaim 0x%p\n", __func__, inode); 942 /* bad inode, get out here ASAP */
943 if (is_bad_inode(inode))
944 goto out_reclaim;
945
946 xfs_ioend_wait(ip);
947
948 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
949
950 /*
951 * We should never get here with one of the reclaim flags already set.
952 */
953 ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
954 ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
955
956 /*
957 * We always use background reclaim here because even if the
958 * inode is clean, it still may be under IO and hence we have
959 * to take the flush lock. The background reclaim path handles
960 * this more efficiently than we can here, so simply let background
961 * reclaim tear down all inodes.
962 */
963out_reclaim:
964 xfs_inode_set_reclaim_tag(ip);
940} 965}
941 966
942/* 967/*
@@ -973,7 +998,6 @@ xfs_fs_inode_init_once(
973 998
974 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, 999 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
975 "xfsino", ip->i_ino); 1000 "xfsino", ip->i_ino);
976 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
977} 1001}
978 1002
979/* 1003/*
@@ -998,59 +1022,108 @@ xfs_fs_dirty_inode(
998 XFS_I(inode)->i_update_core = 1; 1022 XFS_I(inode)->i_update_core = 1;
999} 1023}
1000 1024
1001/* 1025STATIC int
1002 * Attempt to flush the inode, this will actually fail 1026xfs_log_inode(
1003 * if the inode is pinned, but we dirty the inode again 1027 struct xfs_inode *ip)
1004 * at the point when it is unpinned after a log write, 1028{
1005 * since this is when the inode itself becomes flushable. 1029 struct xfs_mount *mp = ip->i_mount;
1006 */ 1030 struct xfs_trans *tp;
1031 int error;
1032
1033 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1034 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
1035 error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
1036
1037 if (error) {
1038 xfs_trans_cancel(tp, 0);
 1039		/* we need to return with the lock held shared */
1040 xfs_ilock(ip, XFS_ILOCK_SHARED);
1041 return error;
1042 }
1043
1044 xfs_ilock(ip, XFS_ILOCK_EXCL);
1045
1046 /*
1047 * Note - it's possible that we might have pushed ourselves out of the
1048 * way during trans_reserve which would flush the inode. But there's
1049 * no guarantee that the inode buffer has actually gone out yet (it's
1050 * delwri). Plus the buffer could be pinned anyway if it's part of
1051 * an inode in another recent transaction. So we play it safe and
1052 * fire off the transaction anyway.
1053 */
1054 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1055 xfs_trans_ihold(tp, ip);
1056 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1057 xfs_trans_set_sync(tp);
1058 error = xfs_trans_commit(tp, 0);
1059 xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
1060
1061 return error;
1062}
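
Worth noting in xfs_log_inode above: the function enters and leaves with the ilock held shared, but cannot hold it across the (potentially blocking) transaction reservation, so it drops the lock, reserves, retakes it exclusive for the join and commit, then demotes back to shared. Outlined with stub lock operations standing in for xfs_ilock()/xfs_iunlock()/xfs_ilock_demote():

struct ilk;
static void unlock_shared(struct ilk *l)    { (void)l; }
static void lock_shared(struct ilk *l)      { (void)l; }
static void lock_excl(struct ilk *l)        { (void)l; }
static void demote_to_shared(struct ilk *l) { (void)l; }
static int reserve_transaction(void)        { return 0; }

static int log_inode_outline(struct ilk *l)
{
	unlock_shared(l);		/* reservation may block */
	if (reserve_transaction() != 0) {
		lock_shared(l);		/* restore caller's lock state */
		return -1;
	}
	lock_excl(l);
	/* ... join the inode and commit a synchronous transaction ... */
	demote_to_shared(l);		/* caller still sees it shared */
	return 0;
}
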
1063
1007STATIC int 1064STATIC int
1008xfs_fs_write_inode( 1065xfs_fs_write_inode(
1009 struct inode *inode, 1066 struct inode *inode,
1010 int sync) 1067 struct writeback_control *wbc)
1011{ 1068{
1012 struct xfs_inode *ip = XFS_I(inode); 1069 struct xfs_inode *ip = XFS_I(inode);
1013 struct xfs_mount *mp = ip->i_mount; 1070 struct xfs_mount *mp = ip->i_mount;
1014 int error = 0; 1071 int error = EAGAIN;
1015 1072
1016 xfs_itrace_entry(ip); 1073 xfs_itrace_entry(ip);
1017 1074
1018 if (XFS_FORCED_SHUTDOWN(mp)) 1075 if (XFS_FORCED_SHUTDOWN(mp))
1019 return XFS_ERROR(EIO); 1076 return XFS_ERROR(EIO);
1020 1077
1021 if (sync) { 1078 if (wbc->sync_mode == WB_SYNC_ALL) {
1022 error = xfs_wait_on_pages(ip, 0, -1); 1079 /*
1023 if (error) 1080 * Make sure the inode has hit stable storage. By using the
1081 * log and the fsync transactions we reduce the IOs we have
1082 * to do here from two (log and inode) to just the log.
1083 *
1084 * Note: We still need to do a delwri write of the inode after
1085 * this to flush it to the backing buffer so that bulkstat
1086 * works properly if this is the first time the inode has been
1087 * written. Because we hold the ilock atomically over the
1088 * transaction commit and the inode flush we are guaranteed
1089 * that the inode is not pinned when it returns. If the flush
1090 * lock is already held, then the inode has already been
1091 * flushed once and we don't need to flush it again. Hence
1092 * the code will only flush the inode if it isn't already
1093 * being flushed.
1094 */
1095 xfs_ilock(ip, XFS_ILOCK_SHARED);
1096 if (ip->i_update_core) {
1097 error = xfs_log_inode(ip);
1098 if (error)
1099 goto out_unlock;
1100 }
1101 } else {
1102 /*
 1104 * We make this non-blocking if the inode is contended, returning
 1105 * EAGAIN to indicate to the caller that they did not succeed.
 1106 * This prevents the flush path from blocking on inodes inside
 1107 * another operation right now; they get caught later by xfs_sync.
1107 */
1108 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
1024 goto out; 1109 goto out;
1025 } 1110 }
1026 1111
1027 /* 1112 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
1028 * Bypass inodes which have already been cleaned by 1113 goto out_unlock;
1029 * the inode flush clustering code inside xfs_iflush
1030 */
1031 if (xfs_inode_clean(ip))
1032 goto out;
1033 1114
1034 /* 1115 /*
1035 * We make this non-blocking if the inode is contended, return 1116 * Now we have the flush lock and the inode is not pinned, we can check
1036 * EAGAIN to indicate to the caller that they did not succeed. 1117 * if the inode is really clean as we know that there are no pending
1037 * This prevents the flush path from blocking on inodes inside 1118 * transaction completions, it is not waiting on the delayed write
1038 * another operation right now, they get caught later by xfs_sync. 1119 * queue and there is no IO in progress.
1039 */ 1120 */
1040 if (sync) { 1121 if (xfs_inode_clean(ip)) {
1041 xfs_ilock(ip, XFS_ILOCK_SHARED); 1122 xfs_ifunlock(ip);
1042 xfs_iflock(ip); 1123 error = 0;
1043 1124 goto out_unlock;
1044 error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
1045 } else {
1046 error = EAGAIN;
1047 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
1048 goto out;
1049 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
1050 goto out_unlock;
1051
1052 error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK);
1053 } 1125 }
1126 error = xfs_iflush(ip, 0);
1054 1127
1055 out_unlock: 1128 out_unlock:
1056 xfs_iunlock(ip, XFS_ILOCK_SHARED); 1129 xfs_iunlock(ip, XFS_ILOCK_SHARED);
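Note how the rewritten xfs_fs_write_inode initialises error to EAGAIN and only clears it once it actually owns the inode: in the non-waiting case every lock is a trylock, so periodic writeback never parks behind an inode that another operation holds. The convention in miniature, with made-up names:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ilock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t flush_lock = PTHREAD_MUTEX_INITIALIZER;

/* Non-blocking flush: EAGAIN means "busy, try me again later". */
static int write_inode_nonblocking(void)
{
	int error = EAGAIN;

	if (pthread_mutex_trylock(&ilock) != 0)
		goto out;
	if (pthread_mutex_trylock(&flush_lock) != 0)
		goto out_unlock;

	/* ... inode is ours: flush it ... */
	error = 0;

	pthread_mutex_unlock(&flush_lock);
out_unlock:
	pthread_mutex_unlock(&ilock);
out:
	return error;
}

int main(void)
{
	printf("flush returned %d\n", write_inode_nonblocking());
	return 0;
}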
@@ -1075,6 +1148,20 @@ xfs_fs_clear_inode(
1075 XFS_STATS_INC(vn_remove); 1148 XFS_STATS_INC(vn_remove);
1076 XFS_STATS_DEC(vn_active); 1149 XFS_STATS_DEC(vn_active);
1077 1150
1151 /*
1152 * The iolock is used by the file system to coordinate reads,
1153 * writes, and block truncates. Up to this point the lock
1154 * protected concurrent accesses by users of the inode. But
1155 * from here forward we're doing some final processing of the
1156 * inode because we're done with it, and although we reuse the
1157 * iolock for protection it is really a distinct lock class
1158 * (in the lockdep sense) from before. To keep lockdep happy
1159 * (and basically indicate what we are doing), we explicitly
1160 * re-init the iolock here.
1161 */
1162 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
1163 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
1164
1078 xfs_inactive(ip); 1165 xfs_inactive(ip);
1079} 1166}
1080 1167
@@ -1092,8 +1179,6 @@ xfs_fs_put_super(
1092 struct super_block *sb) 1179 struct super_block *sb)
1093{ 1180{
1094 struct xfs_mount *mp = XFS_M(sb); 1181 struct xfs_mount *mp = XFS_M(sb);
1095 struct xfs_inode *rip = mp->m_rootip;
1096 int unmount_event_flags = 0;
1097 1182
1098 xfs_syncd_stop(mp); 1183 xfs_syncd_stop(mp);
1099 1184
@@ -1109,20 +1194,7 @@ xfs_fs_put_super(
1109 xfs_sync_attr(mp, 0); 1194 xfs_sync_attr(mp, 0);
1110 } 1195 }
1111 1196
1112#ifdef HAVE_DMAPI 1197 XFS_SEND_PREUNMOUNT(mp);
1113 if (mp->m_flags & XFS_MOUNT_DMAPI) {
1114 unmount_event_flags =
1115 (mp->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ?
1116 0 : DM_FLAGS_UNWANTED;
1117 /*
1118 * Ignore error from dmapi here, first unmount is not allowed
1119 * to fail anyway, and second we wouldn't want to fail a
1120 * unmount because of dmapi.
1121 */
1122 XFS_SEND_PREUNMOUNT(mp, rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL,
1123 NULL, NULL, 0, 0, unmount_event_flags);
1124 }
1125#endif
1126 1198
1127 /* 1199 /*
1128 * Blow away any referenced inode in the filestreams cache. 1200 * Blow away any referenced inode in the filestreams cache.
@@ -1133,10 +1205,7 @@ xfs_fs_put_super(
1133 1205
1134 XFS_bflush(mp->m_ddev_targp); 1206 XFS_bflush(mp->m_ddev_targp);
1135 1207
1136 if (mp->m_flags & XFS_MOUNT_DMAPI) { 1208 XFS_SEND_UNMOUNT(mp);
1137 XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0,
1138 unmount_event_flags);
1139 }
1140 1209
1141 xfs_unmountfs(mp); 1210 xfs_unmountfs(mp);
1142 xfs_freesb(mp); 1211 xfs_freesb(mp);
@@ -1237,6 +1306,29 @@ xfs_fs_statfs(
1237 return 0; 1306 return 0;
1238} 1307}
1239 1308
1309STATIC void
1310xfs_save_resvblks(struct xfs_mount *mp)
1311{
1312 __uint64_t resblks = 0;
1313
1314 mp->m_resblks_save = mp->m_resblks;
1315 xfs_reserve_blocks(mp, &resblks, NULL);
1316}
1317
1318STATIC void
1319xfs_restore_resvblks(struct xfs_mount *mp)
1320{
1321 __uint64_t resblks;
1322
1323 if (mp->m_resblks_save) {
1324 resblks = mp->m_resblks_save;
1325 mp->m_resblks_save = 0;
1326 } else
1327 resblks = xfs_default_resblks(mp);
1328
1329 xfs_reserve_blocks(mp, &resblks, NULL);
1330}
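The two helpers just introduced form a stash-and-restore pair: emptying the pool records its size in m_resblks_save, and refilling prefers the stash over the computed default. Stripped of the mount structure, the pattern is simply (hypothetical names and default value):

#include <stdio.h>

static unsigned long long resblks;	/* current reserve pool */
static unsigned long long resblks_save;	/* stashed size, 0 = none */

static unsigned long long default_resblks(void)
{
	return 8192;	/* stand-in for the filesystem-derived default */
}

static void save_resvblks(void)
{
	resblks_save = resblks;
	resblks = 0;		/* empty the pool, remember its size */
}

static void restore_resvblks(void)
{
	if (resblks_save) {
		resblks = resblks_save;
		resblks_save = 0;
	} else {
		resblks = default_resblks();
	}
}

int main(void)
{
	resblks = 1024;
	save_resvblks();	/* e.g. freeze, or rw -> ro remount */
	restore_resvblks();	/* e.g. unfreeze, or ro -> rw remount */
	printf("pool back to %llu blocks\n", resblks);
	return 0;
}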
1331
1240STATIC int 1332STATIC int
1241xfs_fs_remount( 1333xfs_fs_remount(
1242 struct super_block *sb, 1334 struct super_block *sb,
@@ -1316,11 +1408,27 @@ xfs_fs_remount(
1316 } 1408 }
1317 mp->m_update_flags = 0; 1409 mp->m_update_flags = 0;
1318 } 1410 }
1411
1412 /*
1413 * Fill out the reserve pool if it is empty. Use the stashed
1414 * value if it is non-zero, otherwise go with the default.
1415 */
1416 xfs_restore_resvblks(mp);
1319 } 1417 }
1320 1418
1321 /* rw -> ro */ 1419 /* rw -> ro */
1322 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { 1420 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
1421 /*
1422 * After we have synced the data but before we sync the
1423 * metadata, we need to free up the reserve block pool so that
1424 * the used block count in the superblock on disk is correct at
1425 * the end of the remount. Stash the current reserve pool size
1426 * so that if we get remounted rw, we can return it to the same
1427 * size.
1428 */
1429
1323 xfs_quiesce_data(mp); 1430 xfs_quiesce_data(mp);
1431 xfs_save_resvblks(mp);
1324 xfs_quiesce_attr(mp); 1432 xfs_quiesce_attr(mp);
1325 mp->m_flags |= XFS_MOUNT_RDONLY; 1433 mp->m_flags |= XFS_MOUNT_RDONLY;
1326 } 1434 }
@@ -1339,11 +1447,22 @@ xfs_fs_freeze(
1339{ 1447{
1340 struct xfs_mount *mp = XFS_M(sb); 1448 struct xfs_mount *mp = XFS_M(sb);
1341 1449
1450 xfs_save_resvblks(mp);
1342 xfs_quiesce_attr(mp); 1451 xfs_quiesce_attr(mp);
1343 return -xfs_fs_log_dummy(mp); 1452 return -xfs_fs_log_dummy(mp);
1344} 1453}
1345 1454
1346STATIC int 1455STATIC int
1456xfs_fs_unfreeze(
1457 struct super_block *sb)
1458{
1459 struct xfs_mount *mp = XFS_M(sb);
1460
1461 xfs_restore_resvblks(mp);
1462 return 0;
1463}
1464
1465STATIC int
1347xfs_fs_show_options( 1466xfs_fs_show_options(
1348 struct seq_file *m, 1467 struct seq_file *m,
1349 struct vfsmount *mnt) 1468 struct vfsmount *mnt)
@@ -1504,8 +1623,6 @@ xfs_fs_fill_super(
1504 goto fail_vnrele; 1623 goto fail_vnrele;
1505 1624
1506 kfree(mtpt); 1625 kfree(mtpt);
1507
1508 xfs_itrace_exit(XFS_I(sb->s_root->d_inode));
1509 return 0; 1626 return 0;
1510 1627
1511 out_filestream_unmount: 1628 out_filestream_unmount:
@@ -1567,6 +1684,7 @@ static const struct super_operations xfs_super_operations = {
1567 .put_super = xfs_fs_put_super, 1684 .put_super = xfs_fs_put_super,
1568 .sync_fs = xfs_fs_sync_fs, 1685 .sync_fs = xfs_fs_sync_fs,
1569 .freeze_fs = xfs_fs_freeze, 1686 .freeze_fs = xfs_fs_freeze,
1687 .unfreeze_fs = xfs_fs_unfreeze,
1570 .statfs = xfs_fs_statfs, 1688 .statfs = xfs_fs_statfs,
1571 .remount_fs = xfs_fs_remount, 1689 .remount_fs = xfs_fs_remount,
1572 .show_options = xfs_fs_show_options, 1690 .show_options = xfs_fs_show_options,
@@ -1581,94 +1699,6 @@ static struct file_system_type xfs_fs_type = {
1581}; 1699};
1582 1700
1583STATIC int __init 1701STATIC int __init
1584xfs_alloc_trace_bufs(void)
1585{
1586#ifdef XFS_ALLOC_TRACE
1587 xfs_alloc_trace_buf = ktrace_alloc(XFS_ALLOC_TRACE_SIZE, KM_MAYFAIL);
1588 if (!xfs_alloc_trace_buf)
1589 goto out;
1590#endif
1591#ifdef XFS_BMAP_TRACE
1592 xfs_bmap_trace_buf = ktrace_alloc(XFS_BMAP_TRACE_SIZE, KM_MAYFAIL);
1593 if (!xfs_bmap_trace_buf)
1594 goto out_free_alloc_trace;
1595#endif
1596#ifdef XFS_BTREE_TRACE
1597 xfs_allocbt_trace_buf = ktrace_alloc(XFS_ALLOCBT_TRACE_SIZE,
1598 KM_MAYFAIL);
1599 if (!xfs_allocbt_trace_buf)
1600 goto out_free_bmap_trace;
1601
1602 xfs_inobt_trace_buf = ktrace_alloc(XFS_INOBT_TRACE_SIZE, KM_MAYFAIL);
1603 if (!xfs_inobt_trace_buf)
1604 goto out_free_allocbt_trace;
1605
1606 xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL);
1607 if (!xfs_bmbt_trace_buf)
1608 goto out_free_inobt_trace;
1609#endif
1610#ifdef XFS_ATTR_TRACE
1611 xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL);
1612 if (!xfs_attr_trace_buf)
1613 goto out_free_bmbt_trace;
1614#endif
1615#ifdef XFS_DIR2_TRACE
1616 xfs_dir2_trace_buf = ktrace_alloc(XFS_DIR2_GTRACE_SIZE, KM_MAYFAIL);
1617 if (!xfs_dir2_trace_buf)
1618 goto out_free_attr_trace;
1619#endif
1620
1621 return 0;
1622
1623#ifdef XFS_DIR2_TRACE
1624 out_free_attr_trace:
1625#endif
1626#ifdef XFS_ATTR_TRACE
1627 ktrace_free(xfs_attr_trace_buf);
1628 out_free_bmbt_trace:
1629#endif
1630#ifdef XFS_BTREE_TRACE
1631 ktrace_free(xfs_bmbt_trace_buf);
1632 out_free_inobt_trace:
1633 ktrace_free(xfs_inobt_trace_buf);
1634 out_free_allocbt_trace:
1635 ktrace_free(xfs_allocbt_trace_buf);
1636 out_free_bmap_trace:
1637#endif
1638#ifdef XFS_BMAP_TRACE
1639 ktrace_free(xfs_bmap_trace_buf);
1640 out_free_alloc_trace:
1641#endif
1642#ifdef XFS_ALLOC_TRACE
1643 ktrace_free(xfs_alloc_trace_buf);
1644 out:
1645#endif
1646 return -ENOMEM;
1647}
1648
1649STATIC void
1650xfs_free_trace_bufs(void)
1651{
1652#ifdef XFS_DIR2_TRACE
1653 ktrace_free(xfs_dir2_trace_buf);
1654#endif
1655#ifdef XFS_ATTR_TRACE
1656 ktrace_free(xfs_attr_trace_buf);
1657#endif
1658#ifdef XFS_BTREE_TRACE
1659 ktrace_free(xfs_bmbt_trace_buf);
1660 ktrace_free(xfs_inobt_trace_buf);
1661 ktrace_free(xfs_allocbt_trace_buf);
1662#endif
1663#ifdef XFS_BMAP_TRACE
1664 ktrace_free(xfs_bmap_trace_buf);
1665#endif
1666#ifdef XFS_ALLOC_TRACE
1667 ktrace_free(xfs_alloc_trace_buf);
1668#endif
1669}
1670
1671STATIC int __init
1672xfs_init_zones(void) 1702xfs_init_zones(void)
1673{ 1703{
1674 1704
@@ -1809,7 +1839,6 @@ init_xfs_fs(void)
1809 printk(KERN_INFO XFS_VERSION_STRING " with " 1839 printk(KERN_INFO XFS_VERSION_STRING " with "
1810 XFS_BUILD_OPTIONS " enabled\n"); 1840 XFS_BUILD_OPTIONS " enabled\n");
1811 1841
1812 ktrace_init(64);
1813 xfs_ioend_init(); 1842 xfs_ioend_init();
1814 xfs_dir_startup(); 1843 xfs_dir_startup();
1815 1844
@@ -1817,13 +1846,9 @@ init_xfs_fs(void)
1817 if (error) 1846 if (error)
1818 goto out; 1847 goto out;
1819 1848
1820 error = xfs_alloc_trace_bufs();
1821 if (error)
1822 goto out_destroy_zones;
1823
1824 error = xfs_mru_cache_init(); 1849 error = xfs_mru_cache_init();
1825 if (error) 1850 if (error)
1826 goto out_free_trace_buffers; 1851 goto out_destroy_zones;
1827 1852
1828 error = xfs_filestream_init(); 1853 error = xfs_filestream_init();
1829 if (error) 1854 if (error)
@@ -1858,8 +1883,6 @@ init_xfs_fs(void)
1858 xfs_filestream_uninit(); 1883 xfs_filestream_uninit();
1859 out_mru_cache_uninit: 1884 out_mru_cache_uninit:
1860 xfs_mru_cache_uninit(); 1885 xfs_mru_cache_uninit();
1861 out_free_trace_buffers:
1862 xfs_free_trace_bufs();
1863 out_destroy_zones: 1886 out_destroy_zones:
1864 xfs_destroy_zones(); 1887 xfs_destroy_zones();
1865 out: 1888 out:
@@ -1876,9 +1899,7 @@ exit_xfs_fs(void)
1876 xfs_buf_terminate(); 1899 xfs_buf_terminate();
1877 xfs_filestream_uninit(); 1900 xfs_filestream_uninit();
1878 xfs_mru_cache_uninit(); 1901 xfs_mru_cache_uninit();
1879 xfs_free_trace_bufs();
1880 xfs_destroy_zones(); 1902 xfs_destroy_zones();
1881 ktrace_uninit();
1882} 1903}
1883 1904
1884module_init(init_xfs_fs); 1905module_init(init_xfs_fs);
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 18175ebd58ed..233d4b9881b1 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -56,12 +56,6 @@ extern void xfs_qm_exit(void);
56# define XFS_BIGFS_STRING 56# define XFS_BIGFS_STRING
57#endif 57#endif
58 58
59#ifdef CONFIG_XFS_TRACE
60# define XFS_TRACE_STRING "tracing, "
61#else
62# define XFS_TRACE_STRING
63#endif
64
65#ifdef CONFIG_XFS_DMAPI 59#ifdef CONFIG_XFS_DMAPI
66# define XFS_DMAPI_STRING "dmapi support, " 60# define XFS_DMAPI_STRING "dmapi support, "
67#else 61#else
@@ -78,7 +72,6 @@ extern void xfs_qm_exit(void);
78 XFS_SECURITY_STRING \ 72 XFS_SECURITY_STRING \
79 XFS_REALTIME_STRING \ 73 XFS_REALTIME_STRING \
80 XFS_BIGFS_STRING \ 74 XFS_BIGFS_STRING \
81 XFS_TRACE_STRING \
82 XFS_DMAPI_STRING \ 75 XFS_DMAPI_STRING \
83 XFS_DBG_STRING /* DBG must be last */ 76 XFS_DBG_STRING /* DBG must be last */
84 77
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 961df0a22c78..05cd85317f6f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -44,6 +44,7 @@
44#include "xfs_inode_item.h" 44#include "xfs_inode_item.h"
45#include "xfs_rw.h" 45#include "xfs_rw.h"
46#include "xfs_quota.h" 46#include "xfs_quota.h"
47#include "xfs_trace.h"
47 48
48#include <linux/kthread.h> 49#include <linux/kthread.h>
49#include <linux/freezer.h> 50#include <linux/freezer.h>
@@ -64,7 +65,6 @@ xfs_inode_ag_lookup(
64 * as the tree is sparse and a gang lookup walks to find 65 * as the tree is sparse and a gang lookup walks to find
65 * the number of objects requested. 66 * the number of objects requested.
66 */ 67 */
67 read_lock(&pag->pag_ici_lock);
68 if (tag == XFS_ICI_NO_TAG) { 68 if (tag == XFS_ICI_NO_TAG) {
69 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 69 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
70 (void **)&ip, *first_index, 1); 70 (void **)&ip, *first_index, 1);
@@ -73,7 +73,7 @@ xfs_inode_ag_lookup(
73 (void **)&ip, *first_index, 1, tag); 73 (void **)&ip, *first_index, 1, tag);
74 } 74 }
75 if (!nr_found) 75 if (!nr_found)
76 goto unlock; 76 return NULL;
77 77
78 /* 78 /*
79 * Update the index for the next lookup. Catch overflows 79 * Update the index for the next lookup. Catch overflows
@@ -83,25 +83,20 @@ xfs_inode_ag_lookup(
83 */ 83 */
84 *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 84 *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
85 if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 85 if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
86 goto unlock; 86 return NULL;
87
88 return ip; 87 return ip;
89
90unlock:
91 read_unlock(&pag->pag_ici_lock);
92 return NULL;
93} 88}
94 89
95STATIC int 90STATIC int
96xfs_inode_ag_walk( 91xfs_inode_ag_walk(
97 struct xfs_mount *mp, 92 struct xfs_mount *mp,
98 xfs_agnumber_t ag, 93 struct xfs_perag *pag,
99 int (*execute)(struct xfs_inode *ip, 94 int (*execute)(struct xfs_inode *ip,
100 struct xfs_perag *pag, int flags), 95 struct xfs_perag *pag, int flags),
101 int flags, 96 int flags,
102 int tag) 97 int tag,
98 int exclusive)
103{ 99{
104 struct xfs_perag *pag = &mp->m_perag[ag];
105 uint32_t first_index; 100 uint32_t first_index;
106 int last_error = 0; 101 int last_error = 0;
107 int skipped; 102 int skipped;
@@ -113,10 +108,20 @@ restart:
113 int error = 0; 108 int error = 0;
114 xfs_inode_t *ip; 109 xfs_inode_t *ip;
115 110
111 if (exclusive)
112 write_lock(&pag->pag_ici_lock);
113 else
114 read_lock(&pag->pag_ici_lock);
116 ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); 115 ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
117 if (!ip) 116 if (!ip) {
117 if (exclusive)
118 write_unlock(&pag->pag_ici_lock);
119 else
120 read_unlock(&pag->pag_ici_lock);
118 break; 121 break;
122 }
119 123
124 /* execute releases pag->pag_ici_lock */
120 error = execute(ip, pag, flags); 125 error = execute(ip, pag, flags);
121 if (error == EAGAIN) { 126 if (error == EAGAIN) {
122 skipped++; 127 skipped++;
@@ -124,9 +129,8 @@ restart:
124 } 129 }
125 if (error) 130 if (error)
126 last_error = error; 131 last_error = error;
127 /* 132
128 * bail out if the filesystem is corrupted. 133 /* bail out if the filesystem is corrupted. */
129 */
130 if (error == EFSCORRUPTED) 134 if (error == EFSCORRUPTED)
131 break; 135 break;
132 136
@@ -136,8 +140,6 @@ restart:
136 delay(1); 140 delay(1);
137 goto restart; 141 goto restart;
138 } 142 }
139
140 xfs_put_perag(mp, pag);
141 return last_error; 143 return last_error;
142} 144}
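xfs_inode_ag_walk now takes pag_ici_lock itself, read or write according to the new exclusive argument, with the execute callback responsible for dropping it. The mode selection reduces to a sketch like this (userspace, hypothetical names):

#include <pthread.h>

static pthread_rwlock_t ici_lock = PTHREAD_RWLOCK_INITIALIZER;

static void walk_lock(int exclusive)
{
	if (exclusive)
		pthread_rwlock_wrlock(&ici_lock);	/* reclaim: write */
	else
		pthread_rwlock_rdlock(&ici_lock);	/* sync: shared */
}

static void walk_unlock(void)
{
	pthread_rwlock_unlock(&ici_lock);
}

int main(void)
{
	walk_lock(1);	/* as xfs_reclaim_inodes passes exclusive = 1 */
	walk_unlock();
	walk_lock(0);	/* as xfs_sync_data / xfs_sync_attr pass 0 */
	walk_unlock();
	return 0;
}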
143 145
@@ -147,16 +149,24 @@ xfs_inode_ag_iterator(
147 int (*execute)(struct xfs_inode *ip, 149 int (*execute)(struct xfs_inode *ip,
148 struct xfs_perag *pag, int flags), 150 struct xfs_perag *pag, int flags),
149 int flags, 151 int flags,
150 int tag) 152 int tag,
153 int exclusive)
151{ 154{
152 int error = 0; 155 int error = 0;
153 int last_error = 0; 156 int last_error = 0;
154 xfs_agnumber_t ag; 157 xfs_agnumber_t ag;
155 158
156 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 159 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
157 if (!mp->m_perag[ag].pag_ici_init) 160 struct xfs_perag *pag;
161
162 pag = xfs_perag_get(mp, ag);
163 if (!pag->pag_ici_init) {
164 xfs_perag_put(pag);
158 continue; 165 continue;
159 error = xfs_inode_ag_walk(mp, ag, execute, flags, tag); 166 }
167 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
168 exclusive);
169 xfs_perag_put(pag);
160 if (error) { 170 if (error) {
161 last_error = error; 171 last_error = error;
162 if (error == EFSCORRUPTED) 172 if (error == EFSCORRUPTED)
@@ -173,30 +183,31 @@ xfs_sync_inode_valid(
173 struct xfs_perag *pag) 183 struct xfs_perag *pag)
174{ 184{
175 struct inode *inode = VFS_I(ip); 185 struct inode *inode = VFS_I(ip);
186 int error = EFSCORRUPTED;
176 187
177 /* nothing to sync during shutdown */ 188 /* nothing to sync during shutdown */
178 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 189 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
179 read_unlock(&pag->pag_ici_lock); 190 goto out_unlock;
180 return EFSCORRUPTED;
181 }
182 191
183 /* 192 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
184 * If we can't get a reference on the inode, it must be in reclaim. 193 error = ENOENT;
185 * Leave it for the reclaim code to flush. Also avoid inodes that 194 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
186 * haven't been fully initialised. 195 goto out_unlock;
187 */ 196
 188 if (!igrab(inode)) { 197 /* If we can't grab the inode, it must be on its way to reclaim. */
189 read_unlock(&pag->pag_ici_lock); 198 if (!igrab(inode))
190 return ENOENT; 199 goto out_unlock;
191 }
192 read_unlock(&pag->pag_ici_lock);
193 200
194 if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) { 201 if (is_bad_inode(inode)) {
195 IRELE(ip); 202 IRELE(ip);
196 return ENOENT; 203 goto out_unlock;
197 } 204 }
198 205
199 return 0; 206 /* inode is valid */
207 error = 0;
208out_unlock:
209 read_unlock(&pag->pag_ici_lock);
210 return error;
200} 211}
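The reworked xfs_sync_inode_valid replaces three scattered read_unlock calls with one out_unlock label and a pre-loaded error value, so every early exit funnels through a single unlock. The shape of that idiom, as a generic runnable example:

#include <errno.h>
#include <pthread.h>

static pthread_rwlock_t ici_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Returns 0 if the object is usable, an errno otherwise; the lock
 * is taken and dropped exactly once whichever check fails. */
static int object_valid(int shutdown, int reclaiming)
{
	int error = EIO;	/* pessimistic default, like EFSCORRUPTED above */

	pthread_rwlock_rdlock(&ici_lock);
	if (shutdown)
		goto out_unlock;

	error = ENOENT;
	if (reclaiming)
		goto out_unlock;

	error = 0;		/* object is valid */
out_unlock:
	pthread_rwlock_unlock(&ici_lock);
	return error;
}

int main(void)
{
	return object_valid(0, 0);
}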
201 212
202STATIC int 213STATIC int
@@ -223,7 +234,7 @@ xfs_sync_inode_data(
223 } 234 }
224 235
225 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? 236 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
226 0 : XFS_B_ASYNC, FI_NONE); 237 0 : XBF_ASYNC, FI_NONE);
227 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 238 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
228 239
229 out_wait: 240 out_wait:
@@ -259,8 +270,7 @@ xfs_sync_inode_attr(
259 goto out_unlock; 270 goto out_unlock;
260 } 271 }
261 272
262 error = xfs_iflush(ip, (flags & SYNC_WAIT) ? 273 error = xfs_iflush(ip, flags);
263 XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI);
264 274
265 out_unlock: 275 out_unlock:
266 xfs_iunlock(ip, XFS_ILOCK_SHARED); 276 xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -281,14 +291,11 @@ xfs_sync_data(
281 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); 291 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
282 292
283 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, 293 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
284 XFS_ICI_NO_TAG); 294 XFS_ICI_NO_TAG, 0);
285 if (error) 295 if (error)
286 return XFS_ERROR(error); 296 return XFS_ERROR(error);
287 297
288 xfs_log_force(mp, 0, 298 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
289 (flags & SYNC_WAIT) ?
290 XFS_LOG_FORCE | XFS_LOG_SYNC :
291 XFS_LOG_FORCE);
292 return 0; 299 return 0;
293} 300}
294 301
@@ -303,7 +310,7 @@ xfs_sync_attr(
303 ASSERT((flags & ~SYNC_WAIT) == 0); 310 ASSERT((flags & ~SYNC_WAIT) == 0);
304 311
305 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, 312 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
306 XFS_ICI_NO_TAG); 313 XFS_ICI_NO_TAG, 0);
307} 314}
308 315
309STATIC int 316STATIC int
@@ -314,10 +321,6 @@ xfs_commit_dummy_trans(
314 struct xfs_inode *ip = mp->m_rootip; 321 struct xfs_inode *ip = mp->m_rootip;
315 struct xfs_trans *tp; 322 struct xfs_trans *tp;
316 int error; 323 int error;
317 int log_flags = XFS_LOG_FORCE;
318
319 if (flags & SYNC_WAIT)
320 log_flags |= XFS_LOG_SYNC;
321 324
322 /* 325 /*
323 * Put a dummy transaction in the log to tell recovery 326 * Put a dummy transaction in the log to tell recovery
@@ -339,11 +342,11 @@ xfs_commit_dummy_trans(
339 xfs_iunlock(ip, XFS_ILOCK_EXCL); 342 xfs_iunlock(ip, XFS_ILOCK_EXCL);
340 343
341 /* the log force ensures this transaction is pushed to disk */ 344 /* the log force ensures this transaction is pushed to disk */
342 xfs_log_force(mp, 0, log_flags); 345 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
343 return error; 346 return error;
344} 347}
345 348
346int 349STATIC int
347xfs_sync_fsdata( 350xfs_sync_fsdata(
348 struct xfs_mount *mp, 351 struct xfs_mount *mp,
349 int flags) 352 int flags)
@@ -359,7 +362,7 @@ xfs_sync_fsdata(
359 if (flags & SYNC_TRYLOCK) { 362 if (flags & SYNC_TRYLOCK) {
360 ASSERT(!(flags & SYNC_WAIT)); 363 ASSERT(!(flags & SYNC_WAIT));
361 364
362 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK); 365 bp = xfs_getsb(mp, XBF_TRYLOCK);
363 if (!bp) 366 if (!bp)
364 goto out; 367 goto out;
365 368
@@ -379,7 +382,7 @@ xfs_sync_fsdata(
379 * become pinned in between there and here. 382 * become pinned in between there and here.
380 */ 383 */
381 if (XFS_BUF_ISPINNED(bp)) 384 if (XFS_BUF_ISPINNED(bp))
382 xfs_log_force(mp, 0, XFS_LOG_FORCE); 385 xfs_log_force(mp, 0);
383 } 386 }
384 387
385 388
@@ -440,9 +443,6 @@ xfs_quiesce_data(
440 xfs_sync_data(mp, SYNC_WAIT); 443 xfs_sync_data(mp, SYNC_WAIT);
441 xfs_qm_sync(mp, SYNC_WAIT); 444 xfs_qm_sync(mp, SYNC_WAIT);
442 445
443 /* drop inode references pinned by filestreams */
444 xfs_filestream_flush(mp);
445
446 /* write superblock and hoover up shutdown errors */ 446 /* write superblock and hoover up shutdown errors */
447 error = xfs_sync_fsdata(mp, SYNC_WAIT); 447 error = xfs_sync_fsdata(mp, SYNC_WAIT);
448 448
@@ -459,16 +459,18 @@ xfs_quiesce_fs(
459{ 459{
460 int count = 0, pincount; 460 int count = 0, pincount;
461 461
462 xfs_reclaim_inodes(mp, 0);
462 xfs_flush_buftarg(mp->m_ddev_targp, 0); 463 xfs_flush_buftarg(mp->m_ddev_targp, 0);
463 xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
464 464
465 /* 465 /*
466 * This loop must run at least twice. The first instance of the loop 466 * This loop must run at least twice. The first instance of the loop
467 * will flush most meta data but that will generate more meta data 467 * will flush most meta data but that will generate more meta data
468 * (typically directory updates). Which then must be flushed and 468 * (typically directory updates). Which then must be flushed and
 469 * logged before we can write the unmount record. 469 * logged before we can write the unmount record. We also do sync
470 * reclaim of inodes to catch any that the above delwri flush skipped.
470 */ 471 */
471 do { 472 do {
473 xfs_reclaim_inodes(mp, SYNC_WAIT);
472 xfs_sync_attr(mp, SYNC_WAIT); 474 xfs_sync_attr(mp, SYNC_WAIT);
473 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); 475 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
474 if (!pincount) { 476 if (!pincount) {
@@ -567,7 +569,7 @@ xfs_flush_inodes(
567 igrab(inode); 569 igrab(inode);
568 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion); 570 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
569 wait_for_completion(&completion); 571 wait_for_completion(&completion);
570 xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); 572 xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
571} 573}
572 574
573/* 575/*
@@ -583,8 +585,8 @@ xfs_sync_worker(
583 int error; 585 int error;
584 586
585 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 587 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
586 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 588 xfs_log_force(mp, 0);
587 xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC); 589 xfs_reclaim_inodes(mp, 0);
588 /* dgc: errors ignored here */ 590 /* dgc: errors ignored here */
589 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 591 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
590 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); 592 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
@@ -605,7 +607,8 @@ xfssyncd(
605 set_freezable(); 607 set_freezable();
606 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); 608 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
607 for (;;) { 609 for (;;) {
608 timeleft = schedule_timeout_interruptible(timeleft); 610 if (list_empty(&mp->m_sync_list))
611 timeleft = schedule_timeout_interruptible(timeleft);
609 /* swsusp */ 612 /* swsusp */
610 try_to_freeze(); 613 try_to_freeze();
611 if (kthread_should_stop() && list_empty(&mp->m_sync_list)) 614 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
@@ -625,8 +628,7 @@ xfssyncd(
625 list_add_tail(&mp->m_sync_work.w_list, 628 list_add_tail(&mp->m_sync_work.w_list,
626 &mp->m_sync_list); 629 &mp->m_sync_list);
627 } 630 }
628 list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list) 631 list_splice_init(&mp->m_sync_list, &tmp);
629 list_move(&work->w_list, &tmp);
630 spin_unlock(&mp->m_sync_lock); 632 spin_unlock(&mp->m_sync_lock);
631 633
632 list_for_each_entry_safe(work, n, &tmp, w_list) { 634 list_for_each_entry_safe(work, n, &tmp, w_list) {
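Two xfssyncd tweaks show above: the thread skips its sleep when work is already queued, and the per-item list_move loop becomes one list_splice_init, detaching the whole pending list in a single short lock hold before processing it unlocked. A minimal model of splice-then-process (plain singly-linked list instead of list_head, hypothetical names):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct work {
	int		id;
	struct work	*next;
};

static struct work *pending;
static pthread_mutex_t sync_lock = PTHREAD_MUTEX_INITIALIZER;

static void queue_work(int id)
{
	struct work *w = malloc(sizeof(*w));
	w->id = id;
	pthread_mutex_lock(&sync_lock);
	w->next = pending;
	pending = w;
	pthread_mutex_unlock(&sync_lock);
}

static void run_worker(void)
{
	/* splice: detach everything in one short critical section */
	pthread_mutex_lock(&sync_lock);
	struct work *batch = pending;
	pending = NULL;
	pthread_mutex_unlock(&sync_lock);

	/* process without holding the lock */
	while (batch) {
		struct work *next = batch->next;
		printf("work %d\n", batch->id);
		free(batch);
		batch = next;
	}
}

int main(void)
{
	queue_work(1);
	queue_work(2);
	run_worker();
	return 0;
}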
@@ -663,67 +665,6 @@ xfs_syncd_stop(
663 kthread_stop(mp->m_sync_task); 665 kthread_stop(mp->m_sync_task);
664} 666}
665 667
666int
667xfs_reclaim_inode(
668 xfs_inode_t *ip,
669 int locked,
670 int sync_mode)
671{
672 xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
673
674 /* The hash lock here protects a thread in xfs_iget_core from
675 * racing with us on linking the inode back with a vnode.
676 * Once we have the XFS_IRECLAIM flag set it will not touch
677 * us.
678 */
679 write_lock(&pag->pag_ici_lock);
680 spin_lock(&ip->i_flags_lock);
681 if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
682 !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
683 spin_unlock(&ip->i_flags_lock);
684 write_unlock(&pag->pag_ici_lock);
685 if (locked) {
686 xfs_ifunlock(ip);
687 xfs_iunlock(ip, XFS_ILOCK_EXCL);
688 }
689 return -EAGAIN;
690 }
691 __xfs_iflags_set(ip, XFS_IRECLAIM);
692 spin_unlock(&ip->i_flags_lock);
693 write_unlock(&pag->pag_ici_lock);
694 xfs_put_perag(ip->i_mount, pag);
695
696 /*
697 * If the inode is still dirty, then flush it out. If the inode
698 * is not in the AIL, then it will be OK to flush it delwri as
699 * long as xfs_iflush() does not keep any references to the inode.
700 * We leave that decision up to xfs_iflush() since it has the
701 * knowledge of whether it's OK to simply do a delwri flush of
702 * the inode or whether we need to wait until the inode is
703 * pulled from the AIL.
704 * We get the flush lock regardless, though, just to make sure
705 * we don't free it while it is being flushed.
706 */
707 if (!locked) {
708 xfs_ilock(ip, XFS_ILOCK_EXCL);
709 xfs_iflock(ip);
710 }
711
712 /*
713 * In the case of a forced shutdown we rely on xfs_iflush() to
714 * wait for the inode to be unpinned before returning an error.
715 */
716 if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
717 /* synchronize with xfs_iflush_done */
718 xfs_iflock(ip);
719 xfs_ifunlock(ip);
720 }
721
722 xfs_iunlock(ip, XFS_ILOCK_EXCL);
723 xfs_ireclaim(ip);
724 return 0;
725}
726
727void 668void
728__xfs_inode_set_reclaim_tag( 669__xfs_inode_set_reclaim_tag(
729 struct xfs_perag *pag, 670 struct xfs_perag *pag,
@@ -743,16 +684,17 @@ void
743xfs_inode_set_reclaim_tag( 684xfs_inode_set_reclaim_tag(
744 xfs_inode_t *ip) 685 xfs_inode_t *ip)
745{ 686{
746 xfs_mount_t *mp = ip->i_mount; 687 struct xfs_mount *mp = ip->i_mount;
747 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); 688 struct xfs_perag *pag;
748 689
749 read_lock(&pag->pag_ici_lock); 690 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
691 write_lock(&pag->pag_ici_lock);
750 spin_lock(&ip->i_flags_lock); 692 spin_lock(&ip->i_flags_lock);
751 __xfs_inode_set_reclaim_tag(pag, ip); 693 __xfs_inode_set_reclaim_tag(pag, ip);
752 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 694 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
753 spin_unlock(&ip->i_flags_lock); 695 spin_unlock(&ip->i_flags_lock);
754 read_unlock(&pag->pag_ici_lock); 696 write_unlock(&pag->pag_ici_lock);
755 xfs_put_perag(mp, pag); 697 xfs_perag_put(pag);
756} 698}
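xfs_inode_set_reclaim_tag moves from the old xfs_get_perag/xfs_put_perag pair to the new refcounted xfs_perag_get/xfs_perag_put interface, and upgrades to the write lock a radix-tree tag update actually needs. The get/put half is ordinary reference counting; a sketch over a hypothetical perag struct:

#include <assert.h>
#include <stdio.h>

struct perag {
	int	refcount;	/* would be atomic in the kernel */
};

static struct perag perag_table[4];

static struct perag *perag_get(unsigned agno)
{
	struct perag *pag = &perag_table[agno];
	pag->refcount++;	/* pin: the AG structure can't go away */
	return pag;
}

static void perag_put(struct perag *pag)
{
	assert(pag->refcount > 0);
	pag->refcount--;	/* unpin */
}

int main(void)
{
	struct perag *pag = perag_get(0);
	/* ... take pag_ici_lock for writing, set the reclaim tag ... */
	perag_put(pag);
	printf("refcount back to %d\n", pag->refcount);
	return 0;
}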
757 699
758void 700void
@@ -765,20 +707,145 @@ __xfs_inode_clear_reclaim_tag(
765 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 707 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
766} 708}
767 709
710/*
711 * Inodes in different states need to be treated differently, and the return
712 * value of xfs_iflush is not sufficient to get this right. The following table
713 * lists the inode states and the reclaim actions necessary for non-blocking
714 * reclaim:
715 *
716 *
717 * inode state iflush ret required action
718 * --------------- ---------- ---------------
719 * bad - reclaim
720 * shutdown EIO unpin and reclaim
721 * clean, unpinned 0 reclaim
722 * stale, unpinned 0 reclaim
723 * clean, pinned(*) 0 requeue
724 * stale, pinned EAGAIN requeue
725 * dirty, delwri ok 0 requeue
726 * dirty, delwri blocked EAGAIN requeue
727 * dirty, sync flush 0 reclaim
728 *
729 * (*) dgc: I don't think the clean, pinned state is possible but it gets
730 * handled anyway given the order of checks implemented.
731 *
732 * As can be seen from the table, the return value of xfs_iflush() is not
733 * sufficient to correctly decide the reclaim action here. The checks in
734 * xfs_iflush() might look like duplicates, but they are not.
735 *
736 * Also, because we get the flush lock first, we know that any inode that has
737 * been flushed delwri has had the flush completed by the time we check that
738 * the inode is clean. The clean inode check needs to be done before flushing
739 * the inode delwri otherwise we would loop forever requeuing clean inodes as
740 * we cannot tell apart a successful delwri flush and a clean inode from the
741 * return value of xfs_iflush().
742 *
743 * Note that because the inode is flushed delayed write by background
744 * writeback, the flush lock may already be held here and waiting on it can
745 * result in very long latencies. Hence for sync reclaims, where we wait on the
746 * flush lock, the caller should push out delayed write inodes first before
747 * trying to reclaim them to minimise the amount of time spent waiting. For
 748 * background reclaim, we just requeue the inode for the next pass.
749 *
750 * Hence the order of actions after gaining the locks should be:
751 * bad => reclaim
752 * shutdown => unpin and reclaim
753 * pinned, delwri => requeue
754 * pinned, sync => unpin
755 * stale => reclaim
756 * clean => reclaim
757 * dirty, delwri => flush and requeue
758 * dirty, sync => flush, wait and reclaim
759 */
768STATIC int 760STATIC int
769xfs_reclaim_inode_now( 761xfs_reclaim_inode(
770 struct xfs_inode *ip, 762 struct xfs_inode *ip,
771 struct xfs_perag *pag, 763 struct xfs_perag *pag,
772 int flags) 764 int sync_mode)
773{ 765{
774 /* ignore if already under reclaim */ 766 int error = 0;
775 if (xfs_iflags_test(ip, XFS_IRECLAIM)) { 767
776 read_unlock(&pag->pag_ici_lock); 768 /*
769 * The radix tree lock here protects a thread in xfs_iget from racing
770 * with us starting reclaim on the inode. Once we have the
771 * XFS_IRECLAIM flag set it will not touch us.
772 */
773 spin_lock(&ip->i_flags_lock);
774 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
775 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
776 /* ignore as it is already under reclaim */
777 spin_unlock(&ip->i_flags_lock);
778 write_unlock(&pag->pag_ici_lock);
777 return 0; 779 return 0;
778 } 780 }
779 read_unlock(&pag->pag_ici_lock); 781 __xfs_iflags_set(ip, XFS_IRECLAIM);
782 spin_unlock(&ip->i_flags_lock);
783 write_unlock(&pag->pag_ici_lock);
784
785 xfs_ilock(ip, XFS_ILOCK_EXCL);
786 if (!xfs_iflock_nowait(ip)) {
787 if (!(sync_mode & SYNC_WAIT))
788 goto out;
789 xfs_iflock(ip);
790 }
791
792 if (is_bad_inode(VFS_I(ip)))
793 goto reclaim;
794 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
795 xfs_iunpin_wait(ip);
796 goto reclaim;
797 }
798 if (xfs_ipincount(ip)) {
799 if (!(sync_mode & SYNC_WAIT)) {
800 xfs_ifunlock(ip);
801 goto out;
802 }
803 xfs_iunpin_wait(ip);
804 }
805 if (xfs_iflags_test(ip, XFS_ISTALE))
806 goto reclaim;
807 if (xfs_inode_clean(ip))
808 goto reclaim;
809
810 /* Now we have an inode that needs flushing */
811 error = xfs_iflush(ip, sync_mode);
812 if (sync_mode & SYNC_WAIT) {
813 xfs_iflock(ip);
814 goto reclaim;
815 }
816
817 /*
818 * When we have to flush an inode but don't have SYNC_WAIT set, we
819 * flush the inode out using a delwri buffer and wait for the next
820 * call into reclaim to find it in a clean state instead of waiting for
821 * it now. We also don't return errors here - if the error is transient
822 * then the next reclaim pass will flush the inode, and if the error
 823 * is permanent then the next sync reclaim will reclaim the inode and
824 * pass on the error.
825 */
826 if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
827 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
828 "inode 0x%llx background reclaim flush failed with %d",
829 (long long)ip->i_ino, error);
830 }
831out:
832 xfs_iflags_clear(ip, XFS_IRECLAIM);
833 xfs_iunlock(ip, XFS_ILOCK_EXCL);
834 /*
835 * We could return EAGAIN here to make reclaim rescan the inode tree in
836 * a short while. However, this just burns CPU time scanning the tree
837 * waiting for IO to complete and xfssyncd never goes back to the idle
838 * state. Instead, return 0 to let the next scheduled background reclaim
839 * attempt to reclaim the inode again.
840 */
841 return 0;
842
843reclaim:
844 xfs_ifunlock(ip);
845 xfs_iunlock(ip, XFS_ILOCK_EXCL);
846 xfs_ireclaim(ip);
847 return error;
780 848
781 return xfs_reclaim_inode(ip, 0, flags);
782} 849}
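The new xfs_reclaim_inode follows exactly the ordering the comment block spells out: bad, shutdown, pinned, stale, clean, then dirty. Factored into a pure decision function over a hypothetical state struct, the check order reads:

#include <stdio.h>

enum action { REQUEUE, UNPIN_AND_RECLAIM, RECLAIM, FLUSH_AND_RECLAIM };

struct istate {
	int bad, shutdown, pinned, stale, clean, sync_wait;
};

/* Mirrors the check order in xfs_reclaim_inode; earlier states win. */
static enum action reclaim_action(const struct istate *s)
{
	if (s->bad)
		return RECLAIM;
	if (s->shutdown)
		return UNPIN_AND_RECLAIM;
	if (s->pinned && !s->sync_wait)
		return REQUEUE;		/* background: don't wait on unpin */
	if (s->stale || s->clean)
		return RECLAIM;		/* sync+pinned unpins, then lands here */
	return FLUSH_AND_RECLAIM;	/* dirty: flush (and wait if sync) */
}

int main(void)
{
	struct istate s = { .pinned = 1 };
	printf("pinned background inode -> %d (REQUEUE)\n",
	       reclaim_action(&s));
	return 0;
}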
783 850
784int 851int
@@ -786,6 +853,6 @@ xfs_reclaim_inodes(
786 xfs_mount_t *mp, 853 xfs_mount_t *mp,
787 int mode) 854 int mode)
788{ 855{
789 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode, 856 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
790 XFS_ICI_RECLAIM_TAG); 857 XFS_ICI_RECLAIM_TAG, 1);
791} 858}
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 27920eb7a820..d480c346cabb 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -37,14 +37,12 @@ void xfs_syncd_stop(struct xfs_mount *mp);
37 37
38int xfs_sync_attr(struct xfs_mount *mp, int flags); 38int xfs_sync_attr(struct xfs_mount *mp, int flags);
39int xfs_sync_data(struct xfs_mount *mp, int flags); 39int xfs_sync_data(struct xfs_mount *mp, int flags);
40int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
41 40
42int xfs_quiesce_data(struct xfs_mount *mp); 41int xfs_quiesce_data(struct xfs_mount *mp);
43void xfs_quiesce_attr(struct xfs_mount *mp); 42void xfs_quiesce_attr(struct xfs_mount *mp);
44 43
45void xfs_flush_inodes(struct xfs_inode *ip); 44void xfs_flush_inodes(struct xfs_inode *ip);
46 45
47int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
48int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); 46int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
49 47
50void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); 48void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
@@ -55,6 +53,6 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
55int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); 53int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
56int xfs_inode_ag_iterator(struct xfs_mount *mp, 54int xfs_inode_ag_iterator(struct xfs_mount *mp,
57 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 55 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
58 int flags, int tag); 56 int flags, int tag, int write_lock);
59 57
60#endif 58#endif
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index c5bc67c4e3bb..7bb5092d6ae4 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -55,170 +55,140 @@ xfs_stats_clear_proc_handler(
55 55
56static ctl_table xfs_table[] = { 56static ctl_table xfs_table[] = {
57 { 57 {
58 .ctl_name = XFS_SGID_INHERIT,
59 .procname = "irix_sgid_inherit", 58 .procname = "irix_sgid_inherit",
60 .data = &xfs_params.sgid_inherit.val, 59 .data = &xfs_params.sgid_inherit.val,
61 .maxlen = sizeof(int), 60 .maxlen = sizeof(int),
62 .mode = 0644, 61 .mode = 0644,
63 .proc_handler = &proc_dointvec_minmax, 62 .proc_handler = proc_dointvec_minmax,
64 .strategy = &sysctl_intvec,
65 .extra1 = &xfs_params.sgid_inherit.min, 63 .extra1 = &xfs_params.sgid_inherit.min,
66 .extra2 = &xfs_params.sgid_inherit.max 64 .extra2 = &xfs_params.sgid_inherit.max
67 }, 65 },
68 { 66 {
69 .ctl_name = XFS_SYMLINK_MODE,
70 .procname = "irix_symlink_mode", 67 .procname = "irix_symlink_mode",
71 .data = &xfs_params.symlink_mode.val, 68 .data = &xfs_params.symlink_mode.val,
72 .maxlen = sizeof(int), 69 .maxlen = sizeof(int),
73 .mode = 0644, 70 .mode = 0644,
74 .proc_handler = &proc_dointvec_minmax, 71 .proc_handler = proc_dointvec_minmax,
75 .strategy = &sysctl_intvec,
76 .extra1 = &xfs_params.symlink_mode.min, 72 .extra1 = &xfs_params.symlink_mode.min,
77 .extra2 = &xfs_params.symlink_mode.max 73 .extra2 = &xfs_params.symlink_mode.max
78 }, 74 },
79 { 75 {
80 .ctl_name = XFS_PANIC_MASK,
81 .procname = "panic_mask", 76 .procname = "panic_mask",
82 .data = &xfs_params.panic_mask.val, 77 .data = &xfs_params.panic_mask.val,
83 .maxlen = sizeof(int), 78 .maxlen = sizeof(int),
84 .mode = 0644, 79 .mode = 0644,
85 .proc_handler = &proc_dointvec_minmax, 80 .proc_handler = proc_dointvec_minmax,
86 .strategy = &sysctl_intvec,
87 .extra1 = &xfs_params.panic_mask.min, 81 .extra1 = &xfs_params.panic_mask.min,
88 .extra2 = &xfs_params.panic_mask.max 82 .extra2 = &xfs_params.panic_mask.max
89 }, 83 },
90 84
91 { 85 {
92 .ctl_name = XFS_ERRLEVEL,
93 .procname = "error_level", 86 .procname = "error_level",
94 .data = &xfs_params.error_level.val, 87 .data = &xfs_params.error_level.val,
95 .maxlen = sizeof(int), 88 .maxlen = sizeof(int),
96 .mode = 0644, 89 .mode = 0644,
97 .proc_handler = &proc_dointvec_minmax, 90 .proc_handler = proc_dointvec_minmax,
98 .strategy = &sysctl_intvec,
99 .extra1 = &xfs_params.error_level.min, 91 .extra1 = &xfs_params.error_level.min,
100 .extra2 = &xfs_params.error_level.max 92 .extra2 = &xfs_params.error_level.max
101 }, 93 },
102 { 94 {
103 .ctl_name = XFS_SYNCD_TIMER,
104 .procname = "xfssyncd_centisecs", 95 .procname = "xfssyncd_centisecs",
105 .data = &xfs_params.syncd_timer.val, 96 .data = &xfs_params.syncd_timer.val,
106 .maxlen = sizeof(int), 97 .maxlen = sizeof(int),
107 .mode = 0644, 98 .mode = 0644,
108 .proc_handler = &proc_dointvec_minmax, 99 .proc_handler = proc_dointvec_minmax,
109 .strategy = &sysctl_intvec,
110 .extra1 = &xfs_params.syncd_timer.min, 100 .extra1 = &xfs_params.syncd_timer.min,
111 .extra2 = &xfs_params.syncd_timer.max 101 .extra2 = &xfs_params.syncd_timer.max
112 }, 102 },
113 { 103 {
114 .ctl_name = XFS_INHERIT_SYNC,
115 .procname = "inherit_sync", 104 .procname = "inherit_sync",
116 .data = &xfs_params.inherit_sync.val, 105 .data = &xfs_params.inherit_sync.val,
117 .maxlen = sizeof(int), 106 .maxlen = sizeof(int),
118 .mode = 0644, 107 .mode = 0644,
119 .proc_handler = &proc_dointvec_minmax, 108 .proc_handler = proc_dointvec_minmax,
120 .strategy = &sysctl_intvec,
121 .extra1 = &xfs_params.inherit_sync.min, 109 .extra1 = &xfs_params.inherit_sync.min,
122 .extra2 = &xfs_params.inherit_sync.max 110 .extra2 = &xfs_params.inherit_sync.max
123 }, 111 },
124 { 112 {
125 .ctl_name = XFS_INHERIT_NODUMP,
126 .procname = "inherit_nodump", 113 .procname = "inherit_nodump",
127 .data = &xfs_params.inherit_nodump.val, 114 .data = &xfs_params.inherit_nodump.val,
128 .maxlen = sizeof(int), 115 .maxlen = sizeof(int),
129 .mode = 0644, 116 .mode = 0644,
130 .proc_handler = &proc_dointvec_minmax, 117 .proc_handler = proc_dointvec_minmax,
131 .strategy = &sysctl_intvec,
132 .extra1 = &xfs_params.inherit_nodump.min, 118 .extra1 = &xfs_params.inherit_nodump.min,
133 .extra2 = &xfs_params.inherit_nodump.max 119 .extra2 = &xfs_params.inherit_nodump.max
134 }, 120 },
135 { 121 {
136 .ctl_name = XFS_INHERIT_NOATIME,
137 .procname = "inherit_noatime", 122 .procname = "inherit_noatime",
138 .data = &xfs_params.inherit_noatim.val, 123 .data = &xfs_params.inherit_noatim.val,
139 .maxlen = sizeof(int), 124 .maxlen = sizeof(int),
140 .mode = 0644, 125 .mode = 0644,
141 .proc_handler = &proc_dointvec_minmax, 126 .proc_handler = proc_dointvec_minmax,
142 .strategy = &sysctl_intvec,
143 .extra1 = &xfs_params.inherit_noatim.min, 127 .extra1 = &xfs_params.inherit_noatim.min,
144 .extra2 = &xfs_params.inherit_noatim.max 128 .extra2 = &xfs_params.inherit_noatim.max
145 }, 129 },
146 { 130 {
147 .ctl_name = XFS_BUF_TIMER,
148 .procname = "xfsbufd_centisecs", 131 .procname = "xfsbufd_centisecs",
149 .data = &xfs_params.xfs_buf_timer.val, 132 .data = &xfs_params.xfs_buf_timer.val,
150 .maxlen = sizeof(int), 133 .maxlen = sizeof(int),
151 .mode = 0644, 134 .mode = 0644,
152 .proc_handler = &proc_dointvec_minmax, 135 .proc_handler = proc_dointvec_minmax,
153 .strategy = &sysctl_intvec,
154 .extra1 = &xfs_params.xfs_buf_timer.min, 136 .extra1 = &xfs_params.xfs_buf_timer.min,
155 .extra2 = &xfs_params.xfs_buf_timer.max 137 .extra2 = &xfs_params.xfs_buf_timer.max
156 }, 138 },
157 { 139 {
158 .ctl_name = XFS_BUF_AGE,
159 .procname = "age_buffer_centisecs", 140 .procname = "age_buffer_centisecs",
160 .data = &xfs_params.xfs_buf_age.val, 141 .data = &xfs_params.xfs_buf_age.val,
161 .maxlen = sizeof(int), 142 .maxlen = sizeof(int),
162 .mode = 0644, 143 .mode = 0644,
163 .proc_handler = &proc_dointvec_minmax, 144 .proc_handler = proc_dointvec_minmax,
164 .strategy = &sysctl_intvec,
165 .extra1 = &xfs_params.xfs_buf_age.min, 145 .extra1 = &xfs_params.xfs_buf_age.min,
166 .extra2 = &xfs_params.xfs_buf_age.max 146 .extra2 = &xfs_params.xfs_buf_age.max
167 }, 147 },
168 { 148 {
169 .ctl_name = XFS_INHERIT_NOSYM,
170 .procname = "inherit_nosymlinks", 149 .procname = "inherit_nosymlinks",
171 .data = &xfs_params.inherit_nosym.val, 150 .data = &xfs_params.inherit_nosym.val,
172 .maxlen = sizeof(int), 151 .maxlen = sizeof(int),
173 .mode = 0644, 152 .mode = 0644,
174 .proc_handler = &proc_dointvec_minmax, 153 .proc_handler = proc_dointvec_minmax,
175 .strategy = &sysctl_intvec,
176 .extra1 = &xfs_params.inherit_nosym.min, 154 .extra1 = &xfs_params.inherit_nosym.min,
177 .extra2 = &xfs_params.inherit_nosym.max 155 .extra2 = &xfs_params.inherit_nosym.max
178 }, 156 },
179 { 157 {
180 .ctl_name = XFS_ROTORSTEP,
181 .procname = "rotorstep", 158 .procname = "rotorstep",
182 .data = &xfs_params.rotorstep.val, 159 .data = &xfs_params.rotorstep.val,
183 .maxlen = sizeof(int), 160 .maxlen = sizeof(int),
184 .mode = 0644, 161 .mode = 0644,
185 .proc_handler = &proc_dointvec_minmax, 162 .proc_handler = proc_dointvec_minmax,
186 .strategy = &sysctl_intvec,
187 .extra1 = &xfs_params.rotorstep.min, 163 .extra1 = &xfs_params.rotorstep.min,
188 .extra2 = &xfs_params.rotorstep.max 164 .extra2 = &xfs_params.rotorstep.max
189 }, 165 },
190 { 166 {
191 .ctl_name = XFS_INHERIT_NODFRG,
192 .procname = "inherit_nodefrag", 167 .procname = "inherit_nodefrag",
193 .data = &xfs_params.inherit_nodfrg.val, 168 .data = &xfs_params.inherit_nodfrg.val,
194 .maxlen = sizeof(int), 169 .maxlen = sizeof(int),
195 .mode = 0644, 170 .mode = 0644,
196 .proc_handler = &proc_dointvec_minmax, 171 .proc_handler = proc_dointvec_minmax,
197 .strategy = &sysctl_intvec,
198 .extra1 = &xfs_params.inherit_nodfrg.min, 172 .extra1 = &xfs_params.inherit_nodfrg.min,
199 .extra2 = &xfs_params.inherit_nodfrg.max 173 .extra2 = &xfs_params.inherit_nodfrg.max
200 }, 174 },
201 { 175 {
202 .ctl_name = XFS_FILESTREAM_TIMER,
203 .procname = "filestream_centisecs", 176 .procname = "filestream_centisecs",
204 .data = &xfs_params.fstrm_timer.val, 177 .data = &xfs_params.fstrm_timer.val,
205 .maxlen = sizeof(int), 178 .maxlen = sizeof(int),
206 .mode = 0644, 179 .mode = 0644,
207 .proc_handler = &proc_dointvec_minmax, 180 .proc_handler = proc_dointvec_minmax,
208 .strategy = &sysctl_intvec,
209 .extra1 = &xfs_params.fstrm_timer.min, 181 .extra1 = &xfs_params.fstrm_timer.min,
210 .extra2 = &xfs_params.fstrm_timer.max, 182 .extra2 = &xfs_params.fstrm_timer.max,
211 }, 183 },
212 /* please keep this the last entry */ 184 /* please keep this the last entry */
213#ifdef CONFIG_PROC_FS 185#ifdef CONFIG_PROC_FS
214 { 186 {
215 .ctl_name = XFS_STATS_CLEAR,
216 .procname = "stats_clear", 187 .procname = "stats_clear",
217 .data = &xfs_params.stats_clear.val, 188 .data = &xfs_params.stats_clear.val,
218 .maxlen = sizeof(int), 189 .maxlen = sizeof(int),
219 .mode = 0644, 190 .mode = 0644,
220 .proc_handler = &xfs_stats_clear_proc_handler, 191 .proc_handler = xfs_stats_clear_proc_handler,
221 .strategy = &sysctl_intvec,
222 .extra1 = &xfs_params.stats_clear.min, 192 .extra1 = &xfs_params.stats_clear.min,
223 .extra2 = &xfs_params.stats_clear.max 193 .extra2 = &xfs_params.stats_clear.max
224 }, 194 },
@@ -229,7 +199,6 @@ static ctl_table xfs_table[] = {
229 199
230static ctl_table xfs_dir_table[] = { 200static ctl_table xfs_dir_table[] = {
231 { 201 {
232 .ctl_name = FS_XFS,
233 .procname = "xfs", 202 .procname = "xfs",
234 .mode = 0555, 203 .mode = 0555,
235 .child = xfs_table 204 .child = xfs_table
@@ -239,7 +208,6 @@ static ctl_table xfs_dir_table[] = {
239 208
240static ctl_table xfs_root_table[] = { 209static ctl_table xfs_root_table[] = {
241 { 210 {
242 .ctl_name = CTL_FS,
243 .procname = "fs", 211 .procname = "fs",
244 .mode = 0555, 212 .mode = 0555,
245 .child = xfs_dir_table 213 .child = xfs_dir_table
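The sysctl hunk is mechanical: every .ctl_name and .strategy initializer goes away (the binary-sysctl machinery they fed was removed), along with the stray & before handler names. What makes the slimmer entries safe is that designated initializers zero-fill unnamed members; a small userspace demonstration of that C guarantee:

#include <stdio.h>

struct ctl_entry {
	int		ctl_name;	/* legacy field */
	const char	*procname;
	int		mode;
};

/* Only the fields we still care about are named; the rest become 0. */
static struct ctl_entry entry = {
	.procname	= "irix_sgid_inherit",
	.mode		= 0644,
};

int main(void)
{
	/* ctl_name was never mentioned, so the compiler zeroed it */
	printf("%s: ctl_name=%d mode=%o\n",
	       entry.procname, entry.ctl_name, entry.mode);
	return 0;
}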
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
new file mode 100644
index 000000000000..5a107601e969
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -0,0 +1,59 @@
1/*
2 * Copyright (c) 2009, Christoph Hellwig
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_da_btree.h"
29#include "xfs_bmap_btree.h"
30#include "xfs_alloc_btree.h"
31#include "xfs_ialloc_btree.h"
32#include "xfs_dir2_sf.h"
33#include "xfs_attr_sf.h"
34#include "xfs_dinode.h"
35#include "xfs_inode.h"
36#include "xfs_btree.h"
37#include "xfs_dmapi.h"
38#include "xfs_mount.h"
39#include "xfs_ialloc.h"
40#include "xfs_itable.h"
41#include "xfs_alloc.h"
42#include "xfs_bmap.h"
43#include "xfs_attr.h"
44#include "xfs_attr_sf.h"
45#include "xfs_attr_leaf.h"
46#include "xfs_log_priv.h"
47#include "xfs_buf_item.h"
48#include "xfs_quota.h"
49#include "xfs_iomap.h"
50#include "xfs_aops.h"
51#include "quota/xfs_dquot_item.h"
52#include "quota/xfs_dquot.h"
53
54/*
55 * We include this last to have the helpers above available for the trace
56 * event implementations.
57 */
58#define CREATE_TRACE_POINTS
59#include "xfs_trace.h"
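The new xfs_trace.c ends with the standard tracepoint idiom: ordinary includers of xfs_trace.h see only declarations, while this one translation unit defines CREATE_TRACE_POINTS first so the same header expands into the event definitions. The underlying trick, a header that expands differently per includer, can be illustrated standalone (hypothetical macros, not the real trace machinery):

#include <stdio.h>

/* A real events.h would say:
 *   #ifdef CREATE_POINTS
 *   #define EVENT(name) void trace_##name(void) { puts(#name); }
 *   #else
 *   #define EVENT(name) void trace_##name(void);
 *   #endif
 *   EVENT(xfs_example)
 *   #undef EVENT
 * Here both expansions are inlined to keep the sketch to one file. */

#define EVENT_DECL(name) void trace_##name(void);
#define EVENT_DEFN(name) void trace_##name(void) { puts("hit " #name); }

EVENT_DECL(xfs_example)		/* what ordinary includers see */
EVENT_DEFN(xfs_example)		/* what the CREATE_TRACE_POINTS TU sees */

int main(void)
{
	trace_xfs_example();
	return 0;
}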
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
new file mode 100644
index 000000000000..fcaa62f0799e
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -0,0 +1,1503 @@
1/*
2 * Copyright (c) 2009, Christoph Hellwig
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#undef TRACE_SYSTEM
19#define TRACE_SYSTEM xfs
20
21#if !defined(_TRACE_XFS_H) || defined(TRACE_HEADER_MULTI_READ)
22#define _TRACE_XFS_H
23
24#include <linux/tracepoint.h>
25
26struct xfs_agf;
27struct xfs_alloc_arg;
28struct xfs_attr_list_context;
29struct xfs_buf_log_item;
30struct xfs_da_args;
31struct xfs_da_node_entry;
32struct xfs_dquot;
33struct xlog_ticket;
34struct log;
35
36DECLARE_EVENT_CLASS(xfs_attr_list_class,
37 TP_PROTO(struct xfs_attr_list_context *ctx),
38 TP_ARGS(ctx),
39 TP_STRUCT__entry(
40 __field(dev_t, dev)
41 __field(xfs_ino_t, ino)
42 __field(u32, hashval)
43 __field(u32, blkno)
44 __field(u32, offset)
45 __field(void *, alist)
46 __field(int, bufsize)
47 __field(int, count)
48 __field(int, firstu)
49 __field(int, dupcnt)
50 __field(int, flags)
51 ),
52 TP_fast_assign(
53 __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
54 __entry->ino = ctx->dp->i_ino;
55 __entry->hashval = ctx->cursor->hashval;
56 __entry->blkno = ctx->cursor->blkno;
57 __entry->offset = ctx->cursor->offset;
58 __entry->alist = ctx->alist;
59 __entry->bufsize = ctx->bufsize;
60 __entry->count = ctx->count;
61 __entry->firstu = ctx->firstu;
62 __entry->flags = ctx->flags;
63 ),
64 TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
65 "alist 0x%p size %u count %u firstu %u flags %d %s",
66 MAJOR(__entry->dev), MINOR(__entry->dev),
67 __entry->ino,
68 __entry->hashval,
69 __entry->blkno,
70 __entry->offset,
71 __entry->dupcnt,
72 __entry->alist,
73 __entry->bufsize,
74 __entry->count,
75 __entry->firstu,
76 __entry->flags,
77 __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS)
78 )
79)
80
81#define DEFINE_PERAG_REF_EVENT(name) \
82TRACE_EVENT(name, \
83 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
84 unsigned long caller_ip), \
85 TP_ARGS(mp, agno, refcount, caller_ip), \
86 TP_STRUCT__entry( \
87 __field(dev_t, dev) \
88 __field(xfs_agnumber_t, agno) \
89 __field(int, refcount) \
90 __field(unsigned long, caller_ip) \
91 ), \
92 TP_fast_assign( \
93 __entry->dev = mp->m_super->s_dev; \
94 __entry->agno = agno; \
95 __entry->refcount = refcount; \
96 __entry->caller_ip = caller_ip; \
97 ), \
98 TP_printk("dev %d:%d agno %u refcount %d caller %pf", \
99 MAJOR(__entry->dev), MINOR(__entry->dev), \
100 __entry->agno, \
101 __entry->refcount, \
102 (char *)__entry->caller_ip) \
103);
104
105DEFINE_PERAG_REF_EVENT(xfs_perag_get)
106DEFINE_PERAG_REF_EVENT(xfs_perag_put)
107
108#define DEFINE_ATTR_LIST_EVENT(name) \
109DEFINE_EVENT(xfs_attr_list_class, name, \
110 TP_PROTO(struct xfs_attr_list_context *ctx), \
111 TP_ARGS(ctx))
112DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf);
113DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf_all);
114DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf);
115DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf_end);
116DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full);
117DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
118DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
119DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
120
121TRACE_EVENT(xfs_attr_list_node_descend,
122 TP_PROTO(struct xfs_attr_list_context *ctx,
123 struct xfs_da_node_entry *btree),
124 TP_ARGS(ctx, btree),
125 TP_STRUCT__entry(
126 __field(dev_t, dev)
127 __field(xfs_ino_t, ino)
128 __field(u32, hashval)
129 __field(u32, blkno)
130 __field(u32, offset)
131 __field(void *, alist)
132 __field(int, bufsize)
133 __field(int, count)
134 __field(int, firstu)
135 __field(int, dupcnt)
136 __field(int, flags)
137 __field(u32, bt_hashval)
138 __field(u32, bt_before)
139 ),
140 TP_fast_assign(
141 __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
142 __entry->ino = ctx->dp->i_ino;
143 __entry->hashval = ctx->cursor->hashval;
144 __entry->blkno = ctx->cursor->blkno;
145 __entry->offset = ctx->cursor->offset;
146 __entry->alist = ctx->alist;
147 __entry->bufsize = ctx->bufsize;
148 __entry->count = ctx->count;
149 __entry->firstu = ctx->firstu;
150 __entry->flags = ctx->flags;
151 __entry->bt_hashval = be32_to_cpu(btree->hashval);
152 __entry->bt_before = be32_to_cpu(btree->before);
153 ),
154 TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
155 "alist 0x%p size %u count %u firstu %u flags %d %s "
156 "node hashval %u, node before %u",
157 MAJOR(__entry->dev), MINOR(__entry->dev),
158 __entry->ino,
159 __entry->hashval,
160 __entry->blkno,
161 __entry->offset,
162 __entry->dupcnt,
163 __entry->alist,
164 __entry->bufsize,
165 __entry->count,
166 __entry->firstu,
167 __entry->flags,
168 __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS),
169 __entry->bt_hashval,
170 __entry->bt_before)
171);
172
173TRACE_EVENT(xfs_iext_insert,
174 TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx,
175 struct xfs_bmbt_irec *r, int state, unsigned long caller_ip),
176 TP_ARGS(ip, idx, r, state, caller_ip),
177 TP_STRUCT__entry(
178 __field(dev_t, dev)
179 __field(xfs_ino_t, ino)
180 __field(xfs_extnum_t, idx)
181 __field(xfs_fileoff_t, startoff)
182 __field(xfs_fsblock_t, startblock)
183 __field(xfs_filblks_t, blockcount)
184 __field(xfs_exntst_t, state)
185 __field(int, bmap_state)
186 __field(unsigned long, caller_ip)
187 ),
188 TP_fast_assign(
189 __entry->dev = VFS_I(ip)->i_sb->s_dev;
190 __entry->ino = ip->i_ino;
191 __entry->idx = idx;
192 __entry->startoff = r->br_startoff;
193 __entry->startblock = r->br_startblock;
194 __entry->blockcount = r->br_blockcount;
195 __entry->state = r->br_state;
196 __entry->bmap_state = state;
197 __entry->caller_ip = caller_ip;
198 ),
199 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
200 "offset %lld block %lld count %lld flag %d caller %pf",
201 MAJOR(__entry->dev), MINOR(__entry->dev),
202 __entry->ino,
203 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
204 (long)__entry->idx,
205 __entry->startoff,
206 (__int64_t)__entry->startblock,
207 __entry->blockcount,
208 __entry->state,
209 (char *)__entry->caller_ip)
210);
211
212DECLARE_EVENT_CLASS(xfs_bmap_class,
213 TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state,
214 unsigned long caller_ip),
215 TP_ARGS(ip, idx, state, caller_ip),
216 TP_STRUCT__entry(
217 __field(dev_t, dev)
218 __field(xfs_ino_t, ino)
219 __field(xfs_extnum_t, idx)
220 __field(xfs_fileoff_t, startoff)
221 __field(xfs_fsblock_t, startblock)
222 __field(xfs_filblks_t, blockcount)
223 __field(xfs_exntst_t, state)
224 __field(int, bmap_state)
225 __field(unsigned long, caller_ip)
226 ),
227 TP_fast_assign(
228 struct xfs_ifork *ifp = (state & BMAP_ATTRFORK) ?
229 ip->i_afp : &ip->i_df;
230 struct xfs_bmbt_irec r;
231
232 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r);
233 __entry->dev = VFS_I(ip)->i_sb->s_dev;
234 __entry->ino = ip->i_ino;
235 __entry->idx = idx;
236 __entry->startoff = r.br_startoff;
237 __entry->startblock = r.br_startblock;
238 __entry->blockcount = r.br_blockcount;
239 __entry->state = r.br_state;
240 __entry->bmap_state = state;
241 __entry->caller_ip = caller_ip;
242 ),
243 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
244 "offset %lld block %lld count %lld flag %d caller %pf",
245 MAJOR(__entry->dev), MINOR(__entry->dev),
246 __entry->ino,
247 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
248 (long)__entry->idx,
249 __entry->startoff,
250 (__int64_t)__entry->startblock,
251 __entry->blockcount,
252 __entry->state,
253 (void *)__entry->caller_ip)
254)
255
256#define DEFINE_BMAP_EVENT(name) \
257DEFINE_EVENT(xfs_bmap_class, name, \
258 TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \
259 unsigned long caller_ip), \
260 TP_ARGS(ip, idx, state, caller_ip))
261DEFINE_BMAP_EVENT(xfs_iext_remove);
262DEFINE_BMAP_EVENT(xfs_bmap_pre_update);
263DEFINE_BMAP_EVENT(xfs_bmap_post_update);
264DEFINE_BMAP_EVENT(xfs_extlist);
265
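Each DEFINE_EVENT above expands to a tracepoint callable as trace_<name>() with the class prototype. A minimal sketch of a call site (the surrounding code is hypothetical; XFS passes _THIS_IP_ or _RET_IP_ so the event records where it fired):

	/* hypothetical call site for one of the bmap events above */
	trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);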
266DECLARE_EVENT_CLASS(xfs_buf_class,
267 TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip),
268 TP_ARGS(bp, caller_ip),
269 TP_STRUCT__entry(
270 __field(dev_t, dev)
271 __field(xfs_daddr_t, bno)
272 __field(size_t, buffer_length)
273 __field(int, hold)
274 __field(int, pincount)
275 __field(unsigned, lockval)
276 __field(unsigned, flags)
277 __field(unsigned long, caller_ip)
278 ),
279 TP_fast_assign(
280 __entry->dev = bp->b_target->bt_dev;
281 __entry->bno = bp->b_bn;
282 __entry->buffer_length = bp->b_buffer_length;
283 __entry->hold = atomic_read(&bp->b_hold);
284 __entry->pincount = atomic_read(&bp->b_pin_count);
285 __entry->lockval = xfs_buf_lock_value(bp);
286 __entry->flags = bp->b_flags;
287 __entry->caller_ip = caller_ip;
288 ),
289 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
290 "lock %d flags %s caller %pf",
291 MAJOR(__entry->dev), MINOR(__entry->dev),
292 (unsigned long long)__entry->bno,
293 __entry->buffer_length,
294 __entry->hold,
295 __entry->pincount,
296 __entry->lockval,
297 __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
298 (void *)__entry->caller_ip)
299)
300
301#define DEFINE_BUF_EVENT(name) \
302DEFINE_EVENT(xfs_buf_class, name, \
303 TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \
304 TP_ARGS(bp, caller_ip))
305DEFINE_BUF_EVENT(xfs_buf_init);
306DEFINE_BUF_EVENT(xfs_buf_free);
307DEFINE_BUF_EVENT(xfs_buf_hold);
308DEFINE_BUF_EVENT(xfs_buf_rele);
309DEFINE_BUF_EVENT(xfs_buf_pin);
310DEFINE_BUF_EVENT(xfs_buf_unpin);
311DEFINE_BUF_EVENT(xfs_buf_iodone);
312DEFINE_BUF_EVENT(xfs_buf_iorequest);
313DEFINE_BUF_EVENT(xfs_buf_bawrite);
314DEFINE_BUF_EVENT(xfs_buf_bdwrite);
315DEFINE_BUF_EVENT(xfs_buf_lock);
316DEFINE_BUF_EVENT(xfs_buf_lock_done);
317DEFINE_BUF_EVENT(xfs_buf_cond_lock);
318DEFINE_BUF_EVENT(xfs_buf_unlock);
319DEFINE_BUF_EVENT(xfs_buf_ordered_retry);
320DEFINE_BUF_EVENT(xfs_buf_iowait);
321DEFINE_BUF_EVENT(xfs_buf_iowait_done);
322DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
323DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue);
324DEFINE_BUF_EVENT(xfs_buf_delwri_split);
325DEFINE_BUF_EVENT(xfs_buf_get_noaddr);
326DEFINE_BUF_EVENT(xfs_bdstrat_shut);
327DEFINE_BUF_EVENT(xfs_buf_item_relse);
328DEFINE_BUF_EVENT(xfs_buf_item_iodone);
329DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
330DEFINE_BUF_EVENT(xfs_buf_error_relse);
331DEFINE_BUF_EVENT(xfs_trans_read_buf_io);
332DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
333
334/* not really buffer traces, but the buf provides useful information */
335DEFINE_BUF_EVENT(xfs_btree_corrupt);
336DEFINE_BUF_EVENT(xfs_da_btree_corrupt);
337DEFINE_BUF_EVENT(xfs_reset_dqcounts);
338DEFINE_BUF_EVENT(xfs_inode_item_push);
339
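As with the bmap events, each DEFINE_BUF_EVENT yields a trace_<name>(bp, caller) tracepoint; a sketch of an assumed call site in the buffer cache code:

	/* hypothetical: record a buffer reference being taken */
	trace_xfs_buf_hold(bp, _RET_IP_);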
340/* pass flags explicitly */
341DECLARE_EVENT_CLASS(xfs_buf_flags_class,
342 TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip),
343 TP_ARGS(bp, flags, caller_ip),
344 TP_STRUCT__entry(
345 __field(dev_t, dev)
346 __field(xfs_daddr_t, bno)
347 __field(size_t, buffer_length)
348 __field(int, hold)
349 __field(int, pincount)
350 __field(unsigned, lockval)
351 __field(unsigned, flags)
352 __field(unsigned long, caller_ip)
353 ),
354 TP_fast_assign(
355 __entry->dev = bp->b_target->bt_dev;
356 __entry->bno = bp->b_bn;
357 __entry->buffer_length = bp->b_buffer_length;
358 __entry->flags = flags;
359 __entry->hold = atomic_read(&bp->b_hold);
360 __entry->pincount = atomic_read(&bp->b_pin_count);
361 __entry->lockval = xfs_buf_lock_value(bp);
362 __entry->caller_ip = caller_ip;
363 ),
364 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
365 "lock %d flags %s caller %pf",
366 MAJOR(__entry->dev), MINOR(__entry->dev),
367 (unsigned long long)__entry->bno,
368 __entry->buffer_length,
369 __entry->hold,
370 __entry->pincount,
371 __entry->lockval,
372 __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
373 (void *)__entry->caller_ip)
374)
375
376#define DEFINE_BUF_FLAGS_EVENT(name) \
377DEFINE_EVENT(xfs_buf_flags_class, name, \
378 TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \
379 TP_ARGS(bp, flags, caller_ip))
380DEFINE_BUF_FLAGS_EVENT(xfs_buf_find);
381DEFINE_BUF_FLAGS_EVENT(xfs_buf_get);
382DEFINE_BUF_FLAGS_EVENT(xfs_buf_read);
383
384TRACE_EVENT(xfs_buf_ioerror,
385 TP_PROTO(struct xfs_buf *bp, int error, unsigned long caller_ip),
386 TP_ARGS(bp, error, caller_ip),
387 TP_STRUCT__entry(
388 __field(dev_t, dev)
389 __field(xfs_daddr_t, bno)
390 __field(size_t, buffer_length)
391 __field(unsigned, flags)
392 __field(int, hold)
393 __field(int, pincount)
394 __field(unsigned, lockval)
395 __field(int, error)
396 __field(unsigned long, caller_ip)
397 ),
398 TP_fast_assign(
399 __entry->dev = bp->b_target->bt_dev;
400 __entry->bno = bp->b_bn;
401 __entry->buffer_length = bp->b_buffer_length;
402 __entry->hold = atomic_read(&bp->b_hold);
403 __entry->pincount = atomic_read(&bp->b_pin_count);
404 __entry->lockval = xfs_buf_lock_value(bp);
405 __entry->error = error;
406 __entry->flags = bp->b_flags;
407 __entry->caller_ip = caller_ip;
408 ),
409 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
410 "lock %d error %d flags %s caller %pf",
411 MAJOR(__entry->dev), MINOR(__entry->dev),
412 (unsigned long long)__entry->bno,
413 __entry->buffer_length,
414 __entry->hold,
415 __entry->pincount,
416 __entry->lockval,
417 __entry->error,
418 __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
419 (void *)__entry->caller_ip)
420);
421
422DECLARE_EVENT_CLASS(xfs_buf_item_class,
423 TP_PROTO(struct xfs_buf_log_item *bip),
424 TP_ARGS(bip),
425 TP_STRUCT__entry(
426 __field(dev_t, dev)
427 __field(xfs_daddr_t, buf_bno)
428 __field(size_t, buf_len)
429 __field(int, buf_hold)
430 __field(int, buf_pincount)
431 __field(int, buf_lockval)
432 __field(unsigned, buf_flags)
433 __field(unsigned, bli_recur)
434 __field(int, bli_refcount)
435 __field(unsigned, bli_flags)
436 __field(void *, li_desc)
437 __field(unsigned, li_flags)
438 ),
439 TP_fast_assign(
440 __entry->dev = bip->bli_buf->b_target->bt_dev;
441 __entry->bli_flags = bip->bli_flags;
442 __entry->bli_recur = bip->bli_recur;
443 __entry->bli_refcount = atomic_read(&bip->bli_refcount);
444 __entry->buf_bno = bip->bli_buf->b_bn;
445 __entry->buf_len = bip->bli_buf->b_buffer_length;
446 __entry->buf_flags = bip->bli_buf->b_flags;
447 __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
448 __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
449 __entry->buf_lockval = xfs_buf_lock_value(bip->bli_buf);
450 __entry->li_desc = bip->bli_item.li_desc;
451 __entry->li_flags = bip->bli_item.li_flags;
452 ),
453 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
454 "lock %d flags %s recur %d refcount %d bliflags %s "
455 "lidesc 0x%p liflags %s",
456 MAJOR(__entry->dev), MINOR(__entry->dev),
457 (unsigned long long)__entry->buf_bno,
458 __entry->buf_len,
459 __entry->buf_hold,
460 __entry->buf_pincount,
461 __entry->buf_lockval,
462 __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS),
463 __entry->bli_recur,
464 __entry->bli_refcount,
465 __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS),
466 __entry->li_desc,
467 __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS))
468)
469
470#define DEFINE_BUF_ITEM_EVENT(name) \
471DEFINE_EVENT(xfs_buf_item_class, name, \
472 TP_PROTO(struct xfs_buf_log_item *bip), \
473 TP_ARGS(bip))
474DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
475DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
476DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
477DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
478DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
479DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
480DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
481DEFINE_BUF_ITEM_EVENT(xfs_buf_item_trylock);
482DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
483DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
484DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
485DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
486DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf);
487DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
488DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
489DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
490DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb_recur);
491DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf);
492DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur);
493DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf);
494DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse);
495DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
496DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
497DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
498DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
499
500DECLARE_EVENT_CLASS(xfs_lock_class,
501 TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
502 unsigned long caller_ip),
503 TP_ARGS(ip, lock_flags, caller_ip),
504 TP_STRUCT__entry(
505 __field(dev_t, dev)
506 __field(xfs_ino_t, ino)
507 __field(int, lock_flags)
508 __field(unsigned long, caller_ip)
509 ),
510 TP_fast_assign(
511 __entry->dev = VFS_I(ip)->i_sb->s_dev;
512 __entry->ino = ip->i_ino;
513 __entry->lock_flags = lock_flags;
514 __entry->caller_ip = caller_ip;
515 ),
516 TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf",
517 MAJOR(__entry->dev), MINOR(__entry->dev),
518 __entry->ino,
519 __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS),
520 (void *)__entry->caller_ip)
521)
522
523#define DEFINE_LOCK_EVENT(name) \
524DEFINE_EVENT(xfs_lock_class, name, \
525 TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, \
526 unsigned long caller_ip), \
527 TP_ARGS(ip, lock_flags, caller_ip))
528DEFINE_LOCK_EVENT(xfs_ilock);
529DEFINE_LOCK_EVENT(xfs_ilock_nowait);
530DEFINE_LOCK_EVENT(xfs_ilock_demote);
531DEFINE_LOCK_EVENT(xfs_iunlock);
532
533DECLARE_EVENT_CLASS(xfs_iget_class,
534 TP_PROTO(struct xfs_inode *ip),
535 TP_ARGS(ip),
536 TP_STRUCT__entry(
537 __field(dev_t, dev)
538 __field(xfs_ino_t, ino)
539 ),
540 TP_fast_assign(
541 __entry->dev = VFS_I(ip)->i_sb->s_dev;
542 __entry->ino = ip->i_ino;
543 ),
544 TP_printk("dev %d:%d ino 0x%llx",
545 MAJOR(__entry->dev), MINOR(__entry->dev),
546 __entry->ino)
547)
548
549#define DEFINE_IGET_EVENT(name) \
550DEFINE_EVENT(xfs_iget_class, name, \
551 TP_PROTO(struct xfs_inode *ip), \
552 TP_ARGS(ip))
553DEFINE_IGET_EVENT(xfs_iget_skip);
554DEFINE_IGET_EVENT(xfs_iget_reclaim);
555DEFINE_IGET_EVENT(xfs_iget_found);
556DEFINE_IGET_EVENT(xfs_iget_alloc);
557
558DECLARE_EVENT_CLASS(xfs_inode_class,
559 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
560 TP_ARGS(ip, caller_ip),
561 TP_STRUCT__entry(
562 __field(dev_t, dev)
563 __field(xfs_ino_t, ino)
564 __field(int, count)
565 __field(unsigned long, caller_ip)
566 ),
567 TP_fast_assign(
568 __entry->dev = VFS_I(ip)->i_sb->s_dev;
569 __entry->ino = ip->i_ino;
570 __entry->count = atomic_read(&VFS_I(ip)->i_count);
571 __entry->caller_ip = caller_ip;
572 ),
573 TP_printk("dev %d:%d ino 0x%llx count %d caller %pf",
574 MAJOR(__entry->dev), MINOR(__entry->dev),
575 __entry->ino,
576 __entry->count,
577 (void *)__entry->caller_ip)
578)
579
580#define DEFINE_INODE_EVENT(name) \
581DEFINE_EVENT(xfs_inode_class, name, \
582 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
583 TP_ARGS(ip, caller_ip))
584DEFINE_INODE_EVENT(xfs_ihold);
585DEFINE_INODE_EVENT(xfs_irele);
586/* the old xfs_itrace_entry tracer - to be replaced by something in the VFS */
587DEFINE_INODE_EVENT(xfs_inode);
588#define xfs_itrace_entry(ip) \
589 trace_xfs_inode(ip, _THIS_IP_)
590
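The compatibility macro keeps the old call sites unchanged while routing them to the new tracepoint; an entry-point trace in an inode operation would read (illustrative):

	xfs_itrace_entry(ip);	/* expands to trace_xfs_inode(ip, _THIS_IP_) */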
591DECLARE_EVENT_CLASS(xfs_dquot_class,
592 TP_PROTO(struct xfs_dquot *dqp),
593 TP_ARGS(dqp),
594 TP_STRUCT__entry(
595 __field(dev_t, dev)
596 __field(u32, id)
597 __field(unsigned, flags)
598 __field(unsigned, nrefs)
599 __field(unsigned long long, res_bcount)
600 __field(unsigned long long, bcount)
601 __field(unsigned long long, icount)
602 __field(unsigned long long, blk_hardlimit)
603 __field(unsigned long long, blk_softlimit)
604 __field(unsigned long long, ino_hardlimit)
605 __field(unsigned long long, ino_softlimit)
606 ),
607 TP_fast_assign(
608 __entry->dev = dqp->q_mount->m_super->s_dev;
609 __entry->id = be32_to_cpu(dqp->q_core.d_id);
610 __entry->flags = dqp->dq_flags;
611 __entry->nrefs = dqp->q_nrefs;
612 __entry->res_bcount = dqp->q_res_bcount;
613 __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount);
614 __entry->icount = be64_to_cpu(dqp->q_core.d_icount);
615 __entry->blk_hardlimit =
616 be64_to_cpu(dqp->q_core.d_blk_hardlimit);
617 __entry->blk_softlimit =
618 be64_to_cpu(dqp->q_core.d_blk_softlimit);
619 __entry->ino_hardlimit =
620 be64_to_cpu(dqp->q_core.d_ino_hardlimit);
621 __entry->ino_softlimit =
622 be64_to_cpu(dqp->q_core.d_ino_softlimit);
623 ),
624 TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx "
625 "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx "
626 "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]",
627 MAJOR(__entry->dev), MINOR(__entry->dev),
628 __entry->id,
629 __print_flags(__entry->flags, "|", XFS_DQ_FLAGS),
630 __entry->nrefs,
631 __entry->res_bcount,
632 __entry->bcount,
633 __entry->blk_hardlimit,
634 __entry->blk_softlimit,
635 __entry->icount,
636 __entry->ino_hardlimit,
637 __entry->ino_softlimit)
638)
639
640#define DEFINE_DQUOT_EVENT(name) \
641DEFINE_EVENT(xfs_dquot_class, name, \
642 TP_PROTO(struct xfs_dquot *dqp), \
643 TP_ARGS(dqp))
644DEFINE_DQUOT_EVENT(xfs_dqadjust);
645DEFINE_DQUOT_EVENT(xfs_dqshake_dirty);
646DEFINE_DQUOT_EVENT(xfs_dqshake_unlink);
647DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
648DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
649DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
650DEFINE_DQUOT_EVENT(xfs_dqattach_found);
651DEFINE_DQUOT_EVENT(xfs_dqattach_get);
652DEFINE_DQUOT_EVENT(xfs_dqinit);
653DEFINE_DQUOT_EVENT(xfs_dqreuse);
654DEFINE_DQUOT_EVENT(xfs_dqalloc);
655DEFINE_DQUOT_EVENT(xfs_dqtobp_read);
656DEFINE_DQUOT_EVENT(xfs_dqread);
657DEFINE_DQUOT_EVENT(xfs_dqread_fail);
658DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
659DEFINE_DQUOT_EVENT(xfs_dqlookup_want);
660DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist);
661DEFINE_DQUOT_EVENT(xfs_dqlookup_move);
662DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
663DEFINE_DQUOT_EVENT(xfs_dqget_hit);
664DEFINE_DQUOT_EVENT(xfs_dqget_miss);
665DEFINE_DQUOT_EVENT(xfs_dqput);
666DEFINE_DQUOT_EVENT(xfs_dqput_wait);
667DEFINE_DQUOT_EVENT(xfs_dqput_free);
668DEFINE_DQUOT_EVENT(xfs_dqrele);
669DEFINE_DQUOT_EVENT(xfs_dqflush);
670DEFINE_DQUOT_EVENT(xfs_dqflush_force);
671DEFINE_DQUOT_EVENT(xfs_dqflush_done);
672/* not really iget events, but we re-use the format */
673DEFINE_IGET_EVENT(xfs_dquot_dqalloc);
674DEFINE_IGET_EVENT(xfs_dquot_dqdetach);
675
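Each DEFINE_DQUOT_EVENT becomes a one-argument tracepoint taking the dquot; the xfs_dquot.c hunks later in this patch convert the old string-keyed ktrace calls to exactly these, for example:

	trace_xfs_dqput(dqp);	/* replaces xfs_dqtrace_entry(dqp, "DQPUT") */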
676DECLARE_EVENT_CLASS(xfs_loggrant_class,
677 TP_PROTO(struct log *log, struct xlog_ticket *tic),
678 TP_ARGS(log, tic),
679 TP_STRUCT__entry(
680 __field(dev_t, dev)
681 __field(unsigned, trans_type)
682 __field(char, ocnt)
683 __field(char, cnt)
684 __field(int, curr_res)
685 __field(int, unit_res)
686 __field(unsigned int, flags)
687 __field(void *, reserve_headq)
688 __field(void *, write_headq)
689 __field(int, grant_reserve_cycle)
690 __field(int, grant_reserve_bytes)
691 __field(int, grant_write_cycle)
692 __field(int, grant_write_bytes)
693 __field(int, curr_cycle)
694 __field(int, curr_block)
695 __field(xfs_lsn_t, tail_lsn)
696 ),
697 TP_fast_assign(
698 __entry->dev = log->l_mp->m_super->s_dev;
699 __entry->trans_type = tic->t_trans_type;
700 __entry->ocnt = tic->t_ocnt;
701 __entry->cnt = tic->t_cnt;
702 __entry->curr_res = tic->t_curr_res;
703 __entry->unit_res = tic->t_unit_res;
704 __entry->flags = tic->t_flags;
705 __entry->reserve_headq = log->l_reserve_headq;
706 __entry->write_headq = log->l_write_headq;
707 __entry->grant_reserve_cycle = log->l_grant_reserve_cycle;
708 __entry->grant_reserve_bytes = log->l_grant_reserve_bytes;
709 __entry->grant_write_cycle = log->l_grant_write_cycle;
710 __entry->grant_write_bytes = log->l_grant_write_bytes;
711 __entry->curr_cycle = log->l_curr_cycle;
712 __entry->curr_block = log->l_curr_block;
713 __entry->tail_lsn = log->l_tail_lsn;
714 ),
715 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
716 "t_unit_res %u t_flags %s reserve_headq 0x%p "
717 "write_headq 0x%p grant_reserve_cycle %d "
718 "grant_reserve_bytes %d grant_write_cycle %d "
719 "grant_write_bytes %d curr_cycle %d curr_block %d "
720 "tail_cycle %d tail_block %d",
721 MAJOR(__entry->dev), MINOR(__entry->dev),
722 __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES),
723 __entry->ocnt,
724 __entry->cnt,
725 __entry->curr_res,
726 __entry->unit_res,
727 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
728 __entry->reserve_headq,
729 __entry->write_headq,
730 __entry->grant_reserve_cycle,
731 __entry->grant_reserve_bytes,
732 __entry->grant_write_cycle,
733 __entry->grant_write_bytes,
734 __entry->curr_cycle,
735 __entry->curr_block,
736 CYCLE_LSN(__entry->tail_lsn),
737 BLOCK_LSN(__entry->tail_lsn)
738 )
739)
740
741#define DEFINE_LOGGRANT_EVENT(name) \
742DEFINE_EVENT(xfs_loggrant_class, name, \
743 TP_PROTO(struct log *log, struct xlog_ticket *tic), \
744 TP_ARGS(log, tic))
745DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
746DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
747DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
748DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
749DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter);
750DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit);
751DEFINE_LOGGRANT_EVENT(xfs_log_grant_error);
752DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
753DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
754DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
755DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
756DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
757DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
758DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
759DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
760DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
761DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
762DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
763DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
764DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
765DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
766DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
767DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
768DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
769
770#define DEFINE_RW_EVENT(name) \
771TRACE_EVENT(name, \
772 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
773 TP_ARGS(ip, count, offset, flags), \
774 TP_STRUCT__entry( \
775 __field(dev_t, dev) \
776 __field(xfs_ino_t, ino) \
777 __field(xfs_fsize_t, size) \
778 __field(xfs_fsize_t, new_size) \
779 __field(loff_t, offset) \
780 __field(size_t, count) \
781 __field(int, flags) \
782 ), \
783 TP_fast_assign( \
784 __entry->dev = VFS_I(ip)->i_sb->s_dev; \
785 __entry->ino = ip->i_ino; \
786 __entry->size = ip->i_d.di_size; \
787 __entry->new_size = ip->i_new_size; \
788 __entry->offset = offset; \
789 __entry->count = count; \
790 __entry->flags = flags; \
791 ), \
792 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
793 "offset 0x%llx count 0x%zx ioflags %s", \
794 MAJOR(__entry->dev), MINOR(__entry->dev), \
795 __entry->ino, \
796 __entry->size, \
797 __entry->new_size, \
798 __entry->offset, \
799 __entry->count, \
800 __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) \
801)
802DEFINE_RW_EVENT(xfs_file_read);
803DEFINE_RW_EVENT(xfs_file_buffered_write);
804DEFINE_RW_EVENT(xfs_file_direct_write);
805DEFINE_RW_EVENT(xfs_file_splice_read);
806DEFINE_RW_EVENT(xfs_file_splice_write);
807
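The read/write events record the request size and offset next to the current and pending inode sizes; a sketch of a call site in the file I/O path (variable names assumed, not taken from this patch):

	trace_xfs_file_buffered_write(ip, count, offset, ioflags);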
808
809#define DEFINE_PAGE_EVENT(name) \
810TRACE_EVENT(name, \
811 TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
812 TP_ARGS(inode, page, off), \
813 TP_STRUCT__entry( \
814 __field(dev_t, dev) \
815 __field(xfs_ino_t, ino) \
816 __field(pgoff_t, pgoff) \
817 __field(loff_t, size) \
818 __field(unsigned long, offset) \
819 __field(int, delalloc) \
820 __field(int, unmapped) \
821 __field(int, unwritten) \
822 ), \
823 TP_fast_assign( \
824 int delalloc = -1, unmapped = -1, unwritten = -1; \
825 \
826 if (page_has_buffers(page)) \
827 xfs_count_page_state(page, &delalloc, \
828 &unmapped, &unwritten); \
829 __entry->dev = inode->i_sb->s_dev; \
830 __entry->ino = XFS_I(inode)->i_ino; \
831 __entry->pgoff = page_offset(page); \
832 __entry->size = i_size_read(inode); \
833 __entry->offset = off; \
834 __entry->delalloc = delalloc; \
835 __entry->unmapped = unmapped; \
836 __entry->unwritten = unwritten; \
837 ), \
838 TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " \
839 "delalloc %d unmapped %d unwritten %d", \
840 MAJOR(__entry->dev), MINOR(__entry->dev), \
841 __entry->ino, \
842 __entry->pgoff, \
843 __entry->size, \
844 __entry->offset, \
845 __entry->delalloc, \
846 __entry->unmapped, \
847 __entry->unwritten) \
848)
849DEFINE_PAGE_EVENT(xfs_writepage);
850DEFINE_PAGE_EVENT(xfs_releasepage);
851DEFINE_PAGE_EVENT(xfs_invalidatepage);
852
853#define DEFINE_IOMAP_EVENT(name) \
854TRACE_EVENT(name, \
855 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
856 int flags, struct xfs_bmbt_irec *irec), \
857 TP_ARGS(ip, offset, count, flags, irec), \
858 TP_STRUCT__entry( \
859 __field(dev_t, dev) \
860 __field(xfs_ino_t, ino) \
861 __field(loff_t, size) \
862 __field(loff_t, new_size) \
863 __field(loff_t, offset) \
864 __field(size_t, count) \
865 __field(int, flags) \
866 __field(xfs_fileoff_t, startoff) \
867 __field(xfs_fsblock_t, startblock) \
868 __field(xfs_filblks_t, blockcount) \
869 ), \
870 TP_fast_assign( \
871 __entry->dev = VFS_I(ip)->i_sb->s_dev; \
872 __entry->ino = ip->i_ino; \
873 __entry->size = ip->i_d.di_size; \
874 __entry->new_size = ip->i_new_size; \
875 __entry->offset = offset; \
876 __entry->count = count; \
877 __entry->flags = flags; \
878 __entry->startoff = irec ? irec->br_startoff : 0; \
879 __entry->startblock = irec ? irec->br_startblock : 0; \
880 __entry->blockcount = irec ? irec->br_blockcount : 0; \
881 ), \
882 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
883 "offset 0x%llx count %zd flags %s " \
884 "startoff 0x%llx startblock %lld blockcount 0x%llx", \
885 MAJOR(__entry->dev), MINOR(__entry->dev), \
886 __entry->ino, \
887 __entry->size, \
888 __entry->new_size, \
889 __entry->offset, \
890 __entry->count, \
891 __print_flags(__entry->flags, "|", BMAPI_FLAGS), \
892 __entry->startoff, \
893 (__int64_t)__entry->startblock, \
894 __entry->blockcount) \
895)
896DEFINE_IOMAP_EVENT(xfs_iomap_enter);
897DEFINE_IOMAP_EVENT(xfs_iomap_found);
898DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
899
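The irec ? : 0 guards above let the iomap events accept a null mapping, so the same event works both before and after a mapping exists; an assumed call site:

	/* imap is a hypothetical struct xfs_bmbt_irec holding the result */
	trace_xfs_iomap_found(ip, offset, count, flags, &imap);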
900#define DEFINE_SIMPLE_IO_EVENT(name) \
901TRACE_EVENT(name, \
902 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
903 TP_ARGS(ip, offset, count), \
904 TP_STRUCT__entry( \
905 __field(dev_t, dev) \
906 __field(xfs_ino_t, ino) \
907 __field(loff_t, size) \
908 __field(loff_t, new_size) \
909 __field(loff_t, offset) \
910 __field(size_t, count) \
911 ), \
912 TP_fast_assign( \
913 __entry->dev = VFS_I(ip)->i_sb->s_dev; \
914 __entry->ino = ip->i_ino; \
915 __entry->size = ip->i_d.di_size; \
916 __entry->new_size = ip->i_new_size; \
917 __entry->offset = offset; \
918 __entry->count = count; \
919 ), \
920 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
921 "offset 0x%llx count %zd", \
922 MAJOR(__entry->dev), MINOR(__entry->dev), \
923 __entry->ino, \
924 __entry->size, \
925 __entry->new_size, \
926 __entry->offset, \
927 __entry->count) \
928)
929DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
930DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
931
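The simple I/O form drops the flags and mapping for paths that have neither to report; illustrative use:

	trace_xfs_unwritten_convert(ip, offset, count);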
932
933TRACE_EVENT(xfs_itruncate_start,
934 TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size, int flag,
935 xfs_off_t toss_start, xfs_off_t toss_finish),
936 TP_ARGS(ip, new_size, flag, toss_start, toss_finish),
937 TP_STRUCT__entry(
938 __field(dev_t, dev)
939 __field(xfs_ino_t, ino)
940 __field(xfs_fsize_t, size)
941 __field(xfs_fsize_t, new_size)
942 __field(xfs_off_t, toss_start)
943 __field(xfs_off_t, toss_finish)
944 __field(int, flag)
945 ),
946 TP_fast_assign(
947 __entry->dev = VFS_I(ip)->i_sb->s_dev;
948 __entry->ino = ip->i_ino;
949 __entry->size = ip->i_d.di_size;
950 __entry->new_size = new_size;
951 __entry->toss_start = toss_start;
952 __entry->toss_finish = toss_finish;
953 __entry->flag = flag;
954 ),
955 TP_printk("dev %d:%d ino 0x%llx %s size 0x%llx new_size 0x%llx "
956 "toss start 0x%llx toss finish 0x%llx",
957 MAJOR(__entry->dev), MINOR(__entry->dev),
958 __entry->ino,
959 __print_flags(__entry->flag, "|", XFS_ITRUNC_FLAGS),
960 __entry->size,
961 __entry->new_size,
962 __entry->toss_start,
963 __entry->toss_finish)
964);
965
966DECLARE_EVENT_CLASS(xfs_itrunc_class,
967 TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
968 TP_ARGS(ip, new_size),
969 TP_STRUCT__entry(
970 __field(dev_t, dev)
971 __field(xfs_ino_t, ino)
972 __field(xfs_fsize_t, size)
973 __field(xfs_fsize_t, new_size)
974 ),
975 TP_fast_assign(
976 __entry->dev = VFS_I(ip)->i_sb->s_dev;
977 __entry->ino = ip->i_ino;
978 __entry->size = ip->i_d.di_size;
979 __entry->new_size = new_size;
980 ),
981 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx",
982 MAJOR(__entry->dev), MINOR(__entry->dev),
983 __entry->ino,
984 __entry->size,
985 __entry->new_size)
986)
987
988#define DEFINE_ITRUNC_EVENT(name) \
989DEFINE_EVENT(xfs_itrunc_class, name, \
990 TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
991 TP_ARGS(ip, new_size))
992DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_start);
993DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_end);
994
995TRACE_EVENT(xfs_pagecache_inval,
996 TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
997 TP_ARGS(ip, start, finish),
998 TP_STRUCT__entry(
999 __field(dev_t, dev)
1000 __field(xfs_ino_t, ino)
1001 __field(xfs_fsize_t, size)
1002 __field(xfs_off_t, start)
1003 __field(xfs_off_t, finish)
1004 ),
1005 TP_fast_assign(
1006 __entry->dev = VFS_I(ip)->i_sb->s_dev;
1007 __entry->ino = ip->i_ino;
1008 __entry->size = ip->i_d.di_size;
1009 __entry->start = start;
1010 __entry->finish = finish;
1011 ),
1012 TP_printk("dev %d:%d ino 0x%llx size 0x%llx start 0x%llx finish 0x%llx",
1013 MAJOR(__entry->dev), MINOR(__entry->dev),
1014 __entry->ino,
1015 __entry->size,
1016 __entry->start,
1017 __entry->finish)
1018);
1019
1020TRACE_EVENT(xfs_bunmap,
1021 TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len,
1022 int flags, unsigned long caller_ip),
1023 TP_ARGS(ip, bno, len, flags, caller_ip),
1024 TP_STRUCT__entry(
1025 __field(dev_t, dev)
1026 __field(xfs_ino_t, ino)
1027 __field(xfs_fsize_t, size)
1028 __field(xfs_fileoff_t, bno)
1029 __field(xfs_filblks_t, len)
1030 __field(unsigned long, caller_ip)
1031 __field(int, flags)
1032 ),
1033 TP_fast_assign(
1034 __entry->dev = VFS_I(ip)->i_sb->s_dev;
1035 __entry->ino = ip->i_ino;
1036 __entry->size = ip->i_d.di_size;
1037 __entry->bno = bno;
1038 __entry->len = len;
1039 __entry->caller_ip = caller_ip;
1040 __entry->flags = flags;
1041 ),
1042 TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx "
1043 "flags %s caller %pf",
1044 MAJOR(__entry->dev), MINOR(__entry->dev),
1045 __entry->ino,
1046 __entry->size,
1047 __entry->bno,
1048 __entry->len,
1049 __print_flags(__entry->flags, "|", XFS_BMAPI_FLAGS),
1050 (void *)__entry->caller_ip)
1051
1052);
1053
1054TRACE_EVENT(xfs_alloc_busy,
1055 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
1056 xfs_extlen_t len, int slot),
1057 TP_ARGS(mp, agno, agbno, len, slot),
1058 TP_STRUCT__entry(
1059 __field(dev_t, dev)
1060 __field(xfs_agnumber_t, agno)
1061 __field(xfs_agblock_t, agbno)
1062 __field(xfs_extlen_t, len)
1063 __field(int, slot)
1064 ),
1065 TP_fast_assign(
1066 __entry->dev = mp->m_super->s_dev;
1067 __entry->agno = agno;
1068 __entry->agbno = agbno;
1069 __entry->len = len;
1070 __entry->slot = slot;
1071 ),
1072 TP_printk("dev %d:%d agno %u agbno %u len %u slot %d",
1073 MAJOR(__entry->dev), MINOR(__entry->dev),
1074 __entry->agno,
1075 __entry->agbno,
1076 __entry->len,
1077 __entry->slot)
1078
1079);
1080
1081#define XFS_BUSY_STATES \
1082 { 0, "found" }, \
1083 { 1, "missing" }
1084
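__print_symbolic() resolves the integer through this table when the trace buffer is read, so the final printk argument below renders as "found" or "missing" rather than 0 or 1, e.g. (hypothetical output):

	xfs_alloc_unbusy: dev 253:0 agno 3 slot 2 missing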
1085TRACE_EVENT(xfs_alloc_unbusy,
1086 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1087 int slot, int found),
1088 TP_ARGS(mp, agno, slot, found),
1089 TP_STRUCT__entry(
1090 __field(dev_t, dev)
1091 __field(xfs_agnumber_t, agno)
1092 __field(int, slot)
1093 __field(int, found)
1094 ),
1095 TP_fast_assign(
1096 __entry->dev = mp->m_super->s_dev;
1097 __entry->agno = agno;
1098 __entry->slot = slot;
1099 __entry->found = found;
1100 ),
1101 TP_printk("dev %d:%d agno %u slot %d %s",
1102 MAJOR(__entry->dev), MINOR(__entry->dev),
1103 __entry->agno,
1104 __entry->slot,
1105 __print_symbolic(__entry->found, XFS_BUSY_STATES))
1106);
1107
1108TRACE_EVENT(xfs_alloc_busysearch,
1109 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
1110 xfs_extlen_t len, xfs_lsn_t lsn),
1111 TP_ARGS(mp, agno, agbno, len, lsn),
1112 TP_STRUCT__entry(
1113 __field(dev_t, dev)
1114 __field(xfs_agnumber_t, agno)
1115 __field(xfs_agblock_t, agbno)
1116 __field(xfs_extlen_t, len)
1117 __field(xfs_lsn_t, lsn)
1118 ),
1119 TP_fast_assign(
1120 __entry->dev = mp->m_super->s_dev;
1121 __entry->agno = agno;
1122 __entry->agbno = agbno;
1123 __entry->len = len;
1124 __entry->lsn = lsn;
1125 ),
1126 TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx",
1127 MAJOR(__entry->dev), MINOR(__entry->dev),
1128 __entry->agno,
1129 __entry->agbno,
1130 __entry->len,
1131 __entry->lsn)
1132);
1133
1134TRACE_EVENT(xfs_agf,
1135 TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags,
1136 unsigned long caller_ip),
1137 TP_ARGS(mp, agf, flags, caller_ip),
1138 TP_STRUCT__entry(
1139 __field(dev_t, dev)
1140 __field(xfs_agnumber_t, agno)
1141 __field(int, flags)
1142 __field(__u32, length)
1143 __field(__u32, bno_root)
1144 __field(__u32, cnt_root)
1145 __field(__u32, bno_level)
1146 __field(__u32, cnt_level)
1147 __field(__u32, flfirst)
1148 __field(__u32, fllast)
1149 __field(__u32, flcount)
1150 __field(__u32, freeblks)
1151 __field(__u32, longest)
1152 __field(unsigned long, caller_ip)
1153 ),
1154 TP_fast_assign(
1155 __entry->dev = mp->m_super->s_dev;
1156 __entry->agno = be32_to_cpu(agf->agf_seqno);
1157 __entry->flags = flags;
1158 __entry->length = be32_to_cpu(agf->agf_length);
1159 __entry->bno_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]);
1160 __entry->cnt_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]);
1161 __entry->bno_level =
1162 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
1163 __entry->cnt_level =
1164 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
1165 __entry->flfirst = be32_to_cpu(agf->agf_flfirst);
1166 __entry->fllast = be32_to_cpu(agf->agf_fllast);
1167 __entry->flcount = be32_to_cpu(agf->agf_flcount);
1168 __entry->freeblks = be32_to_cpu(agf->agf_freeblks);
1169 __entry->longest = be32_to_cpu(agf->agf_longest);
1170 __entry->caller_ip = caller_ip;
1171 ),
1172 TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u "
1173 "levels b %u c %u flfirst %u fllast %u flcount %u "
1174 "freeblks %u longest %u caller %pf",
1175 MAJOR(__entry->dev), MINOR(__entry->dev),
1176 __entry->agno,
1177 __print_flags(__entry->flags, "|", XFS_AGF_FLAGS),
1178 __entry->length,
1179 __entry->bno_root,
1180 __entry->cnt_root,
1181 __entry->bno_level,
1182 __entry->cnt_level,
1183 __entry->flfirst,
1184 __entry->fllast,
1185 __entry->flcount,
1186 __entry->freeblks,
1187 __entry->longest,
1188 (void *)__entry->caller_ip)
1189);
1190
1191TRACE_EVENT(xfs_free_extent,
1192 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
1193 xfs_extlen_t len, bool isfl, int haveleft, int haveright),
1194 TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright),
1195 TP_STRUCT__entry(
1196 __field(dev_t, dev)
1197 __field(xfs_agnumber_t, agno)
1198 __field(xfs_agblock_t, agbno)
1199 __field(xfs_extlen_t, len)
1200 __field(int, isfl)
1201 __field(int, haveleft)
1202 __field(int, haveright)
1203 ),
1204 TP_fast_assign(
1205 __entry->dev = mp->m_super->s_dev;
1206 __entry->agno = agno;
1207 __entry->agbno = agbno;
1208 __entry->len = len;
1209 __entry->isfl = isfl;
1210 __entry->haveleft = haveleft;
1211 __entry->haveright = haveright;
1212 ),
1213 TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s",
1214 MAJOR(__entry->dev), MINOR(__entry->dev),
1215 __entry->agno,
1216 __entry->agbno,
1217 __entry->len,
1218 __entry->isfl,
1219 __entry->haveleft ?
1220 (__entry->haveright ? "both" : "left") :
1221 (__entry->haveright ? "right" : "none"))
1222
1223);
1224
1225DECLARE_EVENT_CLASS(xfs_alloc_class,
1226 TP_PROTO(struct xfs_alloc_arg *args),
1227 TP_ARGS(args),
1228 TP_STRUCT__entry(
1229 __field(dev_t, dev)
1230 __field(xfs_agnumber_t, agno)
1231 __field(xfs_agblock_t, agbno)
1232 __field(xfs_extlen_t, minlen)
1233 __field(xfs_extlen_t, maxlen)
1234 __field(xfs_extlen_t, mod)
1235 __field(xfs_extlen_t, prod)
1236 __field(xfs_extlen_t, minleft)
1237 __field(xfs_extlen_t, total)
1238 __field(xfs_extlen_t, alignment)
1239 __field(xfs_extlen_t, minalignslop)
1240 __field(xfs_extlen_t, len)
1241 __field(short, type)
1242 __field(short, otype)
1243 __field(char, wasdel)
1244 __field(char, wasfromfl)
1245 __field(char, isfl)
1246 __field(char, userdata)
1247 __field(xfs_fsblock_t, firstblock)
1248 ),
1249 TP_fast_assign(
1250 __entry->dev = args->mp->m_super->s_dev;
1251 __entry->agno = args->agno;
1252 __entry->agbno = args->agbno;
1253 __entry->minlen = args->minlen;
1254 __entry->maxlen = args->maxlen;
1255 __entry->mod = args->mod;
1256 __entry->prod = args->prod;
1257 __entry->minleft = args->minleft;
1258 __entry->total = args->total;
1259 __entry->alignment = args->alignment;
1260 __entry->minalignslop = args->minalignslop;
1261 __entry->len = args->len;
1262 __entry->type = args->type;
1263 __entry->otype = args->otype;
1264 __entry->wasdel = args->wasdel;
1265 __entry->wasfromfl = args->wasfromfl;
1266 __entry->isfl = args->isfl;
1267 __entry->userdata = args->userdata;
1268 __entry->firstblock = args->firstblock;
1269 ),
1270 TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u "
1271 "prod %u minleft %u total %u alignment %u minalignslop %u "
1272 "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d "
1273 "userdata %d firstblock 0x%llx",
1274 MAJOR(__entry->dev), MINOR(__entry->dev),
1275 __entry->agno,
1276 __entry->agbno,
1277 __entry->minlen,
1278 __entry->maxlen,
1279 __entry->mod,
1280 __entry->prod,
1281 __entry->minleft,
1282 __entry->total,
1283 __entry->alignment,
1284 __entry->minalignslop,
1285 __entry->len,
1286 __print_symbolic(__entry->type, XFS_ALLOC_TYPES),
1287 __print_symbolic(__entry->otype, XFS_ALLOC_TYPES),
1288 __entry->wasdel,
1289 __entry->wasfromfl,
1290 __entry->isfl,
1291 __entry->userdata,
1292 __entry->firstblock)
1293)
1294
1295#define DEFINE_ALLOC_EVENT(name) \
1296DEFINE_EVENT(xfs_alloc_class, name, \
1297 TP_PROTO(struct xfs_alloc_arg *args), \
1298 TP_ARGS(args))
1299DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
1300DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
1301DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
1302DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
1303DEFINE_ALLOC_EVENT(xfs_alloc_near_greater);
1304DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser);
1305DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
1306DEFINE_ALLOC_EVENT(xfs_alloc_size_neither);
1307DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry);
1308DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft);
1309DEFINE_ALLOC_EVENT(xfs_alloc_size_done);
1310DEFINE_ALLOC_EVENT(xfs_alloc_size_error);
1311DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist);
1312DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough);
1313DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
1314DEFINE_ALLOC_EVENT(xfs_alloc_small_error);
1315DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs);
1316DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix);
1317DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp);
1318DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed);
1319DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed);
1320
1321DECLARE_EVENT_CLASS(xfs_dir2_class,
1322 TP_PROTO(struct xfs_da_args *args),
1323 TP_ARGS(args),
1324 TP_STRUCT__entry(
1325 __field(dev_t, dev)
1326 __field(xfs_ino_t, ino)
1327 __dynamic_array(char, name, args->namelen)
1328 __field(int, namelen)
1329 __field(xfs_dahash_t, hashval)
1330 __field(xfs_ino_t, inumber)
1331 __field(int, op_flags)
1332 ),
1333 TP_fast_assign(
1334 __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
1335 __entry->ino = args->dp->i_ino;
1336 if (args->namelen)
1337 memcpy(__get_str(name), args->name, args->namelen);
1338 __entry->namelen = args->namelen;
1339 __entry->hashval = args->hashval;
1340 __entry->inumber = args->inumber;
1341 __entry->op_flags = args->op_flags;
1342 ),
1343 TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x "
1344 "inumber 0x%llx op_flags %s",
1345 MAJOR(__entry->dev), MINOR(__entry->dev),
1346 __entry->ino,
1347 __entry->namelen,
1348 __entry->namelen ? __get_str(name) : NULL,
1349 __entry->namelen,
1350 __entry->hashval,
1351 __entry->inumber,
1352 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
1353)
1354
1355#define DEFINE_DIR2_EVENT(name) \
1356DEFINE_EVENT(xfs_dir2_class, name, \
1357 TP_PROTO(struct xfs_da_args *args), \
1358 TP_ARGS(args))
1359DEFINE_DIR2_EVENT(xfs_dir2_sf_addname);
1360DEFINE_DIR2_EVENT(xfs_dir2_sf_create);
1361DEFINE_DIR2_EVENT(xfs_dir2_sf_lookup);
1362DEFINE_DIR2_EVENT(xfs_dir2_sf_replace);
1363DEFINE_DIR2_EVENT(xfs_dir2_sf_removename);
1364DEFINE_DIR2_EVENT(xfs_dir2_sf_toino4);
1365DEFINE_DIR2_EVENT(xfs_dir2_sf_toino8);
1366DEFINE_DIR2_EVENT(xfs_dir2_sf_to_block);
1367DEFINE_DIR2_EVENT(xfs_dir2_block_addname);
1368DEFINE_DIR2_EVENT(xfs_dir2_block_lookup);
1369DEFINE_DIR2_EVENT(xfs_dir2_block_replace);
1370DEFINE_DIR2_EVENT(xfs_dir2_block_removename);
1371DEFINE_DIR2_EVENT(xfs_dir2_block_to_sf);
1372DEFINE_DIR2_EVENT(xfs_dir2_block_to_leaf);
1373DEFINE_DIR2_EVENT(xfs_dir2_leaf_addname);
1374DEFINE_DIR2_EVENT(xfs_dir2_leaf_lookup);
1375DEFINE_DIR2_EVENT(xfs_dir2_leaf_replace);
1376DEFINE_DIR2_EVENT(xfs_dir2_leaf_removename);
1377DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_block);
1378DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_node);
1379DEFINE_DIR2_EVENT(xfs_dir2_node_addname);
1380DEFINE_DIR2_EVENT(xfs_dir2_node_lookup);
1381DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
1382DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
1383DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
1384
1385DECLARE_EVENT_CLASS(xfs_dir2_space_class,
1386 TP_PROTO(struct xfs_da_args *args, int idx),
1387 TP_ARGS(args, idx),
1388 TP_STRUCT__entry(
1389 __field(dev_t, dev)
1390 __field(xfs_ino_t, ino)
1391 __field(int, op_flags)
1392 __field(int, idx)
1393 ),
1394 TP_fast_assign(
1395 __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
1396 __entry->ino = args->dp->i_ino;
1397 __entry->op_flags = args->op_flags;
1398 __entry->idx = idx;
1399 ),
1400 TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d",
1401 MAJOR(__entry->dev), MINOR(__entry->dev),
1402 __entry->ino,
1403 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS),
1404 __entry->idx)
1405)
1406
1407#define DEFINE_DIR2_SPACE_EVENT(name) \
1408DEFINE_EVENT(xfs_dir2_space_class, name, \
1409 TP_PROTO(struct xfs_da_args *args, int idx), \
1410 TP_ARGS(args, idx))
1411DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_add);
1412DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_remove);
1413DEFINE_DIR2_SPACE_EVENT(xfs_dir2_grow_inode);
1414DEFINE_DIR2_SPACE_EVENT(xfs_dir2_shrink_inode);
1415
1416TRACE_EVENT(xfs_dir2_leafn_moveents,
1417 TP_PROTO(struct xfs_da_args *args, int src_idx, int dst_idx, int count),
1418 TP_ARGS(args, src_idx, dst_idx, count),
1419 TP_STRUCT__entry(
1420 __field(dev_t, dev)
1421 __field(xfs_ino_t, ino)
1422 __field(int, op_flags)
1423 __field(int, src_idx)
1424 __field(int, dst_idx)
1425 __field(int, count)
1426 ),
1427 TP_fast_assign(
1428 __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
1429 __entry->ino = args->dp->i_ino;
1430 __entry->op_flags = args->op_flags;
1431 __entry->src_idx = src_idx;
1432 __entry->dst_idx = dst_idx;
1433 __entry->count = count;
1434 ),
1435 TP_printk("dev %d:%d ino 0x%llx op_flags %s "
1436 "src_idx %d dst_idx %d count %d",
1437 MAJOR(__entry->dev), MINOR(__entry->dev),
1438 __entry->ino,
1439 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS),
1440 __entry->src_idx,
1441 __entry->dst_idx,
1442 __entry->count)
1443);
1444
1445#define XFS_SWAPEXT_INODES \
1446 { 0, "target" }, \
1447 { 1, "temp" }
1448
1449#define XFS_INODE_FORMAT_STR \
1450 { 0, "invalid" }, \
1451 { 1, "local" }, \
1452 { 2, "extent" }, \
1453 { 3, "btree" }
1454
1455DECLARE_EVENT_CLASS(xfs_swap_extent_class,
1456 TP_PROTO(struct xfs_inode *ip, int which),
1457 TP_ARGS(ip, which),
1458 TP_STRUCT__entry(
1459 __field(dev_t, dev)
1460 __field(int, which)
1461 __field(xfs_ino_t, ino)
1462 __field(int, format)
1463 __field(int, nex)
1464 __field(int, max_nex)
1465 __field(int, broot_size)
1466 __field(int, fork_off)
1467 ),
1468 TP_fast_assign(
1469 __entry->dev = VFS_I(ip)->i_sb->s_dev;
1470 __entry->which = which;
1471 __entry->ino = ip->i_ino;
1472 __entry->format = ip->i_d.di_format;
1473 __entry->nex = ip->i_d.di_nextents;
1474 __entry->max_nex = ip->i_df.if_ext_max;
1475 __entry->broot_size = ip->i_df.if_broot_bytes;
1476 __entry->fork_off = XFS_IFORK_BOFF(ip);
1477 ),
1478 TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
1479 "Max in-fork extents %d, broot size %d, fork offset %d",
1480 MAJOR(__entry->dev), MINOR(__entry->dev),
1481 __entry->ino,
1482 __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
1483 __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
1484 __entry->nex,
1485 __entry->max_nex,
1486 __entry->broot_size,
1487 __entry->fork_off)
1488)
1489
1490#define DEFINE_SWAPEXT_EVENT(name) \
1491DEFINE_EVENT(xfs_swap_extent_class, name, \
1492 TP_PROTO(struct xfs_inode *ip, int which), \
1493 TP_ARGS(ip, which))
1494
1495DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
1496DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
1497
1498#endif /* _TRACE_XFS_H */
1499
1500#undef TRACE_INCLUDE_PATH
1501#define TRACE_INCLUDE_PATH .
1502#define TRACE_INCLUDE_FILE xfs_trace
1503#include <trace/define_trace.h>
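This trailer follows the usual ftrace self-include pattern: every compilation unit that includes xfs_trace.h sees only the event declarations, while exactly one .c file defines CREATE_TRACE_POINTS first so that define_trace.h emits the event bodies. A minimal sketch of that one file, assuming the standard convention (contents illustrative):

	/* xfs_trace.c - instantiate the tracepoints declared in xfs_trace.h */
	#include "xfs.h"	/* assumed: pulls in the XFS types the events use */
	#define CREATE_TRACE_POINTS
	#include "xfs_trace.h"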
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index ad7fbead4c97..7c220b4227bc 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -36,10 +36,13 @@ struct attrlist_cursor_kern;
 /*
  * Flags for read/write calls - same values as IRIX
  */
-#define IO_ISAIO	0x00001	/* don't wait for completion */
 #define IO_ISDIRECT	0x00004	/* bypass page cache */
 #define IO_INVIS	0x00020	/* don't update inode timestamps */
 
+#define XFS_IO_FLAGS \
+	{ IO_ISDIRECT,		"DIRECT" }, \
+	{ IO_INVIS,		"INVIS"}
+
 /*
  * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
  */
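XFS_IO_FLAGS is the table consumed by __print_flags() in the DEFINE_RW_EVENT printk earlier, so a flags value of IO_ISDIRECT | IO_INVIS renders as "DIRECT|INVIS" in the trace output:

	/* illustrative: __print_flags(IO_ISDIRECT | IO_INVIS, "|", XFS_IO_FLAGS) -> "DIRECT|INVIS" */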
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index 497c7fb75cc1..fa01b9daba6b 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -30,10 +30,10 @@
 
 
 static int
-__xfs_xattr_get(struct inode *inode, const char *name,
+xfs_xattr_get(struct dentry *dentry, const char *name,
 		void *value, size_t size, int xflags)
 {
-	struct xfs_inode *ip = XFS_I(inode);
+	struct xfs_inode *ip = XFS_I(dentry->d_inode);
 	int error, asize = size;
 
 	if (strcmp(name, "") == 0)
@@ -45,17 +45,17 @@ __xfs_xattr_get(struct inode *inode, const char *name,
 		value = NULL;
 	}
 
-	error = -xfs_attr_get(ip, name, value, &asize, xflags);
+	error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
 	if (error)
 		return error;
 	return asize;
 }
 
 static int
-__xfs_xattr_set(struct inode *inode, const char *name, const void *value,
+xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
 		size_t size, int flags, int xflags)
 {
-	struct xfs_inode *ip = XFS_I(inode);
+	struct xfs_inode *ip = XFS_I(dentry->d_inode);
 
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
@@ -67,79 +67,39 @@ __xfs_xattr_set(struct inode *inode, const char *name, const void *value,
 	xflags |= ATTR_REPLACE;
 
 	if (!value)
-		return -xfs_attr_remove(ip, name, xflags);
-	return -xfs_attr_set(ip, name, (void *)value, size, xflags);
-}
-
-static int
-xfs_xattr_user_get(struct inode *inode, const char *name,
-		void *value, size_t size)
-{
-	return __xfs_xattr_get(inode, name, value, size, 0);
-}
-
-static int
-xfs_xattr_user_set(struct inode *inode, const char *name,
-		const void *value, size_t size, int flags)
-{
-	return __xfs_xattr_set(inode, name, value, size, flags, 0);
+		return -xfs_attr_remove(ip, (unsigned char *)name, xflags);
+	return -xfs_attr_set(ip, (unsigned char *)name,
+				(void *)value, size, xflags);
 }
 
 static struct xattr_handler xfs_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
-	.get	= xfs_xattr_user_get,
-	.set	= xfs_xattr_user_set,
+	.flags	= 0, /* no flags implies user namespace */
+	.get	= xfs_xattr_get,
+	.set	= xfs_xattr_set,
 };
 
-
-static int
-xfs_xattr_trusted_get(struct inode *inode, const char *name,
-		void *value, size_t size)
-{
-	return __xfs_xattr_get(inode, name, value, size, ATTR_ROOT);
-}
-
-static int
-xfs_xattr_trusted_set(struct inode *inode, const char *name,
-		const void *value, size_t size, int flags)
-{
-	return __xfs_xattr_set(inode, name, value, size, flags, ATTR_ROOT);
-}
-
 static struct xattr_handler xfs_xattr_trusted_handler = {
 	.prefix	= XATTR_TRUSTED_PREFIX,
-	.get	= xfs_xattr_trusted_get,
-	.set	= xfs_xattr_trusted_set,
+	.flags	= ATTR_ROOT,
+	.get	= xfs_xattr_get,
+	.set	= xfs_xattr_set,
 };
 
-
-static int
-xfs_xattr_secure_get(struct inode *inode, const char *name,
-		void *value, size_t size)
-{
-	return __xfs_xattr_get(inode, name, value, size, ATTR_SECURE);
-}
-
-static int
-xfs_xattr_secure_set(struct inode *inode, const char *name,
-		const void *value, size_t size, int flags)
-{
-	return __xfs_xattr_set(inode, name, value, size, flags, ATTR_SECURE);
-}
-
 static struct xattr_handler xfs_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
-	.get	= xfs_xattr_secure_get,
-	.set	= xfs_xattr_secure_set,
+	.flags	= ATTR_SECURE,
+	.get	= xfs_xattr_get,
+	.set	= xfs_xattr_set,
 };
 
-
 struct xattr_handler *xfs_xattr_handlers[] = {
 	&xfs_xattr_user_handler,
 	&xfs_xattr_trusted_handler,
 	&xfs_xattr_security_handler,
 #ifdef CONFIG_XFS_POSIX_ACL
-	&xfs_xattr_system_handler,
+	&xfs_xattr_acl_access_handler,
+	&xfs_xattr_acl_default_handler,
 #endif
 	NULL
 };
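The three per-namespace wrapper pairs collapse into one get/set pair because the VFS now hands the handler's own flags back in on each call; a simplified sketch of the dispatch this relies on (illustrative, not part of this patch):

	/* simplified from the generic xattr lookup code */
	error = handler->get(dentry, name, value, size, handler->flags);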
@@ -165,8 +125,13 @@ static const char *xfs_xattr_prefix(int flags)
 }
 
 static int
-xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
-		char *name, int namelen, int valuelen, char *value)
+xfs_xattr_put_listent(
+	struct xfs_attr_list_context *context,
+	int flags,
+	unsigned char *name,
+	int namelen,
+	int valuelen,
+	unsigned char *value)
 {
 	unsigned int prefix_len = xfs_xattr_prefix_len(flags);
 	char *offset;
@@ -189,7 +154,7 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
 	offset = (char *)context->alist + context->count;
 	strncpy(offset, xfs_xattr_prefix(flags), prefix_len);
 	offset += prefix_len;
-	strncpy(offset, name, namelen);			/* real name */
+	strncpy(offset, (char *)name, namelen);		/* real name */
 	offset += namelen;
 	*offset = '\0';
 	context->count += prefix_len + namelen + 1;
@@ -197,8 +162,13 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
 }
 
 static int
-xfs_xattr_put_listent_sizes(struct xfs_attr_list_context *context, int flags,
-		char *name, int namelen, int valuelen, char *value)
+xfs_xattr_put_listent_sizes(
+	struct xfs_attr_list_context *context,
+	int flags,
+	unsigned char *name,
+	int namelen,
+	int valuelen,
+	unsigned char *value)
 {
 	context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
 	return 0;
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 2f3f2229eaaf..5f79dd78626b 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -47,6 +47,7 @@
 #include "xfs_trans_space.h"
 #include "xfs_trans_priv.h"
 #include "xfs_qm.h"
+#include "xfs_trace.h"
 
 
 /*
@@ -112,10 +113,7 @@ xfs_qm_dqinit(
 		init_completion(&dqp->q_flush);
 		complete(&dqp->q_flush);
 
-#ifdef XFS_DQUOT_TRACE
-		dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_NOFS);
-		xfs_dqtrace_entry(dqp, "DQINIT");
-#endif
+		trace_xfs_dqinit(dqp);
 	} else {
 		/*
 		 * Only the q_core portion was zeroed in dqreclaim_one().
@@ -136,10 +134,7 @@ xfs_qm_dqinit(
 		dqp->q_hash = NULL;
 		ASSERT(dqp->dq_flnext == dqp->dq_flprev);
 
-#ifdef XFS_DQUOT_TRACE
-		ASSERT(dqp->q_trace);
-		xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT");
-#endif
+		trace_xfs_dqreuse(dqp);
 	}
 
 	/*
@@ -167,13 +162,8 @@ xfs_qm_dqdestroy(
 
 	mutex_destroy(&dqp->q_qlock);
 	sv_destroy(&dqp->q_pinwait);
-
-#ifdef XFS_DQUOT_TRACE
-	if (dqp->q_trace)
-		ktrace_free(dqp->q_trace);
-	dqp->q_trace = NULL;
-#endif
 	kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
+
 	atomic_dec(&xfs_Gqm->qm_totaldquots);
 }
 
@@ -195,49 +185,6 @@ xfs_qm_dqinit_core(
 	d->dd_diskdq.d_flags = type;
 }
 
-
-#ifdef XFS_DQUOT_TRACE
-/*
- * Dquot tracing for debugging.
- */
-/* ARGSUSED */
-void
-__xfs_dqtrace_entry(
-	xfs_dquot_t	*dqp,
-	char		*func,
-	void		*retaddr,
-	xfs_inode_t	*ip)
-{
-	xfs_dquot_t	*udqp = NULL;
-	xfs_ino_t	ino = 0;
-
-	ASSERT(dqp->q_trace);
-	if (ip) {
-		ino = ip->i_ino;
-		udqp = ip->i_udquot;
-	}
-	ktrace_enter(dqp->q_trace,
-		     (void *)(__psint_t)DQUOT_KTRACE_ENTRY,
-		     (void *)func,
-		     (void *)(__psint_t)dqp->q_nrefs,
-		     (void *)(__psint_t)dqp->dq_flags,
-		     (void *)(__psint_t)dqp->q_res_bcount,
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_bcount),
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_icount),
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_blk_hardlimit),
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_blk_softlimit),
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_ino_hardlimit),
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_ino_softlimit),
-		     (void *)(__psint_t)be32_to_cpu(dqp->q_core.d_id),
-		     (void *)(__psint_t)current_pid(),
-		     (void *)(__psint_t)ino,
-		     (void *)(__psint_t)retaddr,
-		     (void *)(__psint_t)udqp);
-	return;
-}
-#endif
-
-
 /*
  * If default limits are in force, push them into the dquot now.
  * We overwrite the dquot limits only if they are zero and this
@@ -425,7 +372,8 @@ xfs_qm_dqalloc(
 	xfs_trans_t	*tp = *tpp;
 
 	ASSERT(tp != NULL);
-	xfs_dqtrace_entry(dqp, "DQALLOC");
+
+	trace_xfs_dqalloc(dqp);
 
 	/*
 	 * Initialize the bmap freelist prior to calling bmapi code.
@@ -612,7 +560,8 @@ xfs_qm_dqtobp(
 	 * (in which case we already have the buf).
 	 */
 	if (! newdquot) {
-		xfs_dqtrace_entry(dqp, "DQTOBP READBUF");
+		trace_xfs_dqtobp_read(dqp);
+
 		if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 					       dqp->q_blkno,
 					       XFS_QI_DQCHUNKLEN(mp),
@@ -670,11 +619,12 @@ xfs_qm_dqread(
 
 	ASSERT(tpp);
 
+	trace_xfs_dqread(dqp);
+
 	/*
 	 * get a pointer to the on-disk dquot and the buffer containing it
 	 * dqp already knows its own type (GROUP/USER).
 	 */
-	xfs_dqtrace_entry(dqp, "DQREAD");
 	if ((error = xfs_qm_dqtobp(tpp, dqp, &ddqp, &bp, flags))) {
 		return (error);
 	}
@@ -763,7 +713,7 @@ xfs_qm_idtodq(
 	 * or if the dquot didn't exist on disk and we ask to
 	 * allocate (ENOENT).
 	 */
-	xfs_dqtrace_entry(dqp, "DQREAD FAIL");
+	trace_xfs_dqread_fail(dqp);
 	cancelflags |= XFS_TRANS_ABORT;
 	goto error0;
 	}
@@ -817,7 +767,8 @@ xfs_qm_dqlookup(
 	 * id can't be modified without the hashlock anyway.
 	 */
 	if (be32_to_cpu(dqp->q_core.d_id) == id && dqp->q_mount == mp) {
-		xfs_dqtrace_entry(dqp, "DQFOUND BY LOOKUP");
+		trace_xfs_dqlookup_found(dqp);
+
 		/*
 		 * All in core dquots must be on the dqlist of mp
 		 */
@@ -827,7 +778,7 @@ xfs_qm_dqlookup(
 		if (dqp->q_nrefs == 0) {
 			ASSERT (XFS_DQ_IS_ON_FREELIST(dqp));
 			if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
-				xfs_dqtrace_entry(dqp, "DQLOOKUP: WANT");
+				trace_xfs_dqlookup_want(dqp);
 
 				/*
 				 * We may have raced with dqreclaim_one()
@@ -857,8 +808,7 @@ xfs_qm_dqlookup(
 			/*
 			 * take it off the freelist
 			 */
-			xfs_dqtrace_entry(dqp,
-					"DQLOOKUP: TAKEOFF FL");
+			trace_xfs_dqlookup_freelist(dqp);
 			XQM_FREELIST_REMOVE(dqp);
 			/* xfs_qm_freelist_print(&(xfs_Gqm->
 					qm_dqfreelist),
@@ -878,8 +828,7 @@ xfs_qm_dqlookup(
878 */ 828 */
879 ASSERT(mutex_is_locked(&qh->qh_lock)); 829 ASSERT(mutex_is_locked(&qh->qh_lock));
880 if (dqp->HL_PREVP != &qh->qh_next) { 830 if (dqp->HL_PREVP != &qh->qh_next) {
881 xfs_dqtrace_entry(dqp, 831 trace_xfs_dqlookup_move(dqp);
882 "DQLOOKUP: HASH MOVETOFRONT");
883 if ((d = dqp->HL_NEXT)) 832 if ((d = dqp->HL_NEXT))
884 d->HL_PREVP = dqp->HL_PREVP; 833 d->HL_PREVP = dqp->HL_PREVP;
885 *(dqp->HL_PREVP) = d; 834 *(dqp->HL_PREVP) = d;
@@ -889,7 +838,7 @@ xfs_qm_dqlookup(
889 dqp->HL_PREVP = &qh->qh_next; 838 dqp->HL_PREVP = &qh->qh_next;
890 qh->qh_next = dqp; 839 qh->qh_next = dqp;
891 } 840 }
892 xfs_dqtrace_entry(dqp, "LOOKUP END"); 841 trace_xfs_dqlookup_done(dqp);
893 *O_dqpp = dqp; 842 *O_dqpp = dqp;
894 ASSERT(mutex_is_locked(&qh->qh_lock)); 843 ASSERT(mutex_is_locked(&qh->qh_lock));
895 return (0); 844 return (0);
@@ -971,7 +920,7 @@ xfs_qm_dqget(
971 ASSERT(*O_dqpp); 920 ASSERT(*O_dqpp);
972 ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp)); 921 ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
973 mutex_unlock(&h->qh_lock); 922 mutex_unlock(&h->qh_lock);
974 xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)"); 923 trace_xfs_dqget_hit(*O_dqpp);
975 return (0); /* success */ 924 return (0); /* success */
976 } 925 }
977 XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses); 926 XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses);
@@ -1104,7 +1053,7 @@ xfs_qm_dqget(
1104 mutex_unlock(&h->qh_lock); 1053 mutex_unlock(&h->qh_lock);
1105 dqret: 1054 dqret:
1106 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); 1055 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
1107 xfs_dqtrace_entry(dqp, "DQGET DONE"); 1056 trace_xfs_dqget_miss(dqp);
1108 *O_dqpp = dqp; 1057 *O_dqpp = dqp;
1109 return (0); 1058 return (0);
1110} 1059}
@@ -1124,7 +1073,8 @@ xfs_qm_dqput(
1124 1073
1125 ASSERT(dqp->q_nrefs > 0); 1074 ASSERT(dqp->q_nrefs > 0);
1126 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1075 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1127 xfs_dqtrace_entry(dqp, "DQPUT"); 1076
1077 trace_xfs_dqput(dqp);
1128 1078
1129 if (dqp->q_nrefs != 1) { 1079 if (dqp->q_nrefs != 1) {
1130 dqp->q_nrefs--; 1080 dqp->q_nrefs--;
@@ -1137,7 +1087,7 @@ xfs_qm_dqput(
1137 * in the right order; but try to get it out-of-order first 1087 * in the right order; but try to get it out-of-order first
1138 */ 1088 */
1139 if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { 1089 if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
1140 xfs_dqtrace_entry(dqp, "DQPUT: FLLOCK-WAIT"); 1090 trace_xfs_dqput_wait(dqp);
1141 xfs_dqunlock(dqp); 1091 xfs_dqunlock(dqp);
1142 xfs_qm_freelist_lock(xfs_Gqm); 1092 xfs_qm_freelist_lock(xfs_Gqm);
1143 xfs_dqlock(dqp); 1093 xfs_dqlock(dqp);
@@ -1148,7 +1098,8 @@ xfs_qm_dqput(
1148 1098
1149 /* We can't depend on nrefs being == 1 here */ 1099 /* We can't depend on nrefs being == 1 here */
1150 if (--dqp->q_nrefs == 0) { 1100 if (--dqp->q_nrefs == 0) {
1151 xfs_dqtrace_entry(dqp, "DQPUT: ON FREELIST"); 1101 trace_xfs_dqput_free(dqp);
1102
1152 /* 1103 /*
1153 * insert at end of the freelist. 1104 * insert at end of the freelist.
1154 */ 1105 */
@@ -1196,7 +1147,7 @@ xfs_qm_dqrele(
1196 if (!dqp) 1147 if (!dqp)
1197 return; 1148 return;
1198 1149
1199 xfs_dqtrace_entry(dqp, "DQRELE"); 1150 trace_xfs_dqrele(dqp);
1200 1151
1201 xfs_dqlock(dqp); 1152 xfs_dqlock(dqp);
1202 /* 1153 /*
@@ -1229,14 +1180,14 @@ xfs_qm_dqflush(
1229 1180
1230 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1181 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1231 ASSERT(!completion_done(&dqp->q_flush)); 1182 ASSERT(!completion_done(&dqp->q_flush));
1232 xfs_dqtrace_entry(dqp, "DQFLUSH"); 1183 trace_xfs_dqflush(dqp);
1233 1184
1234 /* 1185 /*
1235 * If not dirty, or it's pinned and we are not supposed to 1186 * If not dirty, or it's pinned and we are not supposed to
1236 * block, nada. 1187 * block, nada.
1237 */ 1188 */
1238 if (!XFS_DQ_IS_DIRTY(dqp) || 1189 if (!XFS_DQ_IS_DIRTY(dqp) ||
1239 (!(flags & XFS_QMOPT_SYNC) && atomic_read(&dqp->q_pincount) > 0)) { 1190 (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
1240 xfs_dqfunlock(dqp); 1191 xfs_dqfunlock(dqp);
1241 return 0; 1192 return 0;
1242 } 1193 }
@@ -1259,7 +1210,6 @@ xfs_qm_dqflush(
1259 * the ondisk-dquot has already been allocated for. 1210 * the ondisk-dquot has already been allocated for.
1260 */ 1211 */
1261 if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) { 1212 if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) {
1262 xfs_dqtrace_entry(dqp, "DQTOBP FAIL");
1263 ASSERT(error != ENOENT); 1213 ASSERT(error != ENOENT);
1264 /* 1214 /*
1265 * Quotas could have gotten turned off (ESRCH) 1215 * Quotas could have gotten turned off (ESRCH)
@@ -1297,22 +1247,21 @@ xfs_qm_dqflush(
1297 * get stuck waiting in the write for too long. 1247 * get stuck waiting in the write for too long.
1298 */ 1248 */
1299 if (XFS_BUF_ISPINNED(bp)) { 1249 if (XFS_BUF_ISPINNED(bp)) {
1300 xfs_dqtrace_entry(dqp, "DQFLUSH LOG FORCE"); 1250 trace_xfs_dqflush_force(dqp);
1301 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 1251 xfs_log_force(mp, 0);
1302 } 1252 }
1303 1253
1304 if (flags & XFS_QMOPT_DELWRI) { 1254 if (flags & SYNC_WAIT)
1305 xfs_bdwrite(mp, bp);
1306 } else if (flags & XFS_QMOPT_ASYNC) {
1307 error = xfs_bawrite(mp, bp);
1308 } else {
1309 error = xfs_bwrite(mp, bp); 1255 error = xfs_bwrite(mp, bp);
1310 } 1256 else
1311 xfs_dqtrace_entry(dqp, "DQFLUSH END"); 1257 xfs_bdwrite(mp, bp);
1258
1259 trace_xfs_dqflush_done(dqp);
1260
1312 /* 1261 /*
1313 * dqp is still locked, but caller is free to unlock it now. 1262 * dqp is still locked, but caller is free to unlock it now.
1314 */ 1263 */
1315 return (error); 1264 return error;
1316 1265
1317} 1266}
1318 1267
@@ -1483,7 +1432,7 @@ xfs_qm_dqpurge(
1483 */ 1432 */
1484 if (XFS_DQ_IS_DIRTY(dqp)) { 1433 if (XFS_DQ_IS_DIRTY(dqp)) {
1485 int error; 1434 int error;
1486 xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY"); 1435
1487 /* dqflush unlocks dqflock */ 1436 /* dqflush unlocks dqflock */
1488 /* 1437 /*
1489 * Given that dqpurge is a very rare occurrence, it is OK 1438 * Given that dqpurge is a very rare occurrence, it is OK
@@ -1493,7 +1442,7 @@ xfs_qm_dqpurge(
1493 * We don't care about getting disk errors here. We need 1442 * We don't care about getting disk errors here. We need
1494 * to purge this dquot anyway, so we go ahead regardless. 1443 * to purge this dquot anyway, so we go ahead regardless.
1495 */ 1444 */
1496 error = xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC); 1445 error = xfs_qm_dqflush(dqp, SYNC_WAIT);
1497 if (error) 1446 if (error)
1498 xfs_fs_cmn_err(CE_WARN, mp, 1447 xfs_fs_cmn_err(CE_WARN, mp,
1499 "xfs_qm_dqpurge: dquot %p flush failed", dqp); 1448 "xfs_qm_dqpurge: dquot %p flush failed", dqp);
@@ -1577,25 +1526,17 @@ xfs_qm_dqflock_pushbuf_wait(
1577 * the flush lock when the I/O completes. 1526 * the flush lock when the I/O completes.
1578 */ 1527 */
1579 bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno, 1528 bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno,
1580 XFS_QI_DQCHUNKLEN(dqp->q_mount), 1529 XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK);
1581 XFS_INCORE_TRYLOCK); 1530 if (!bp)
1582 if (bp != NULL) { 1531 goto out_lock;
1583 if (XFS_BUF_ISDELAYWRITE(bp)) { 1532
1584 int error; 1533 if (XFS_BUF_ISDELAYWRITE(bp)) {
1585 if (XFS_BUF_ISPINNED(bp)) { 1534 if (XFS_BUF_ISPINNED(bp))
1586 xfs_log_force(dqp->q_mount, 1535 xfs_log_force(dqp->q_mount, 0);
1587 (xfs_lsn_t)0, 1536 xfs_buf_delwri_promote(bp);
1588 XFS_LOG_FORCE); 1537 wake_up_process(bp->b_target->bt_task);
1589 }
1590 error = xfs_bawrite(dqp->q_mount, bp);
1591 if (error)
1592 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
1593 "xfs_qm_dqflock_pushbuf_wait: "
1594 "pushbuf error %d on dqp %p, bp %p",
1595 error, dqp, bp);
1596 } else {
1597 xfs_buf_relse(bp);
1598 }
1599 } 1538 }
1539 xfs_buf_relse(bp);
1540out_lock:
1600 xfs_dqflock(dqp); 1541 xfs_dqflock(dqp);
1601} 1542}
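
Every conversion in this file follows the same pattern: a free-form xfs_dqtrace_entry(dqp, "STRING") becomes a named, typed tracepoint such as trace_xfs_dqput(dqp). A minimal sketch of how a shared dquot event class is typically declared for this -- the real definitions live in fs/xfs/linux-2.6/xfs_trace.h, and the fields shown here are assumptions chosen to show the shape of the conversion, not the actual ones:

/*
 * Illustrative only: the real event class for these trace_xfs_dq*()
 * calls lives in xfs_trace.h; fields below are assumptions.
 */
DECLARE_EVENT_CLASS(xfs_dquot_class,
	TP_PROTO(struct xfs_dquot *dqp),
	TP_ARGS(dqp),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(u32, id)
		__field(unsigned, flags)
	),
	TP_fast_assign(
		__entry->dev = dqp->q_mount->m_super->s_dev;
		__entry->id = be32_to_cpu(dqp->q_core.d_id);
		__entry->flags = dqp->dq_flags;
	),
	TP_printk("dev %d:%d id 0x%x flags 0x%x",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->id, __entry->flags)
);

#define DEFINE_DQUOT_EVENT(name)		\
DEFINE_EVENT(xfs_dquot_class, name,		\
	TP_PROTO(struct xfs_dquot *dqp),	\
	TP_ARGS(dqp))
DEFINE_DQUOT_EVENT(xfs_dqput);
DEFINE_DQUOT_EVENT(xfs_dqflush);

Unlike the ktrace strings, these events carry structured fields that ftrace and perf can filter on, and the tracepoint call sites are patched to no-ops when tracing is disabled.
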
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 6533ead9b889..a0f7da586d1b 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -85,9 +85,6 @@ typedef struct xfs_dquot {
85 struct completion q_flush; /* flush completion queue */ 85 struct completion q_flush; /* flush completion queue */
86 atomic_t q_pincount; /* dquot pin count */ 86 atomic_t q_pincount; /* dquot pin count */
87 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ 87 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
88#ifdef XFS_DQUOT_TRACE
89 struct ktrace *q_trace; /* trace header structure */
90#endif
91} xfs_dquot_t; 88} xfs_dquot_t;
92 89
93 90
@@ -98,7 +95,7 @@ typedef struct xfs_dquot {
98#define dq_flags q_lists.dqm_flags 95#define dq_flags q_lists.dqm_flags
99 96
100/* 97/*
101 * Lock hierachy for q_qlock: 98 * Lock hierarchy for q_qlock:
102 * XFS_QLOCK_NORMAL is the implicit default, 99 * XFS_QLOCK_NORMAL is the implicit default,
103 * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2 100 * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
104 */ 101 */
@@ -144,24 +141,6 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
144 (XFS_IS_UQUOTA_ON((d)->q_mount)) : \ 141 (XFS_IS_UQUOTA_ON((d)->q_mount)) : \
145 (XFS_IS_OQUOTA_ON((d)->q_mount)))) 142 (XFS_IS_OQUOTA_ON((d)->q_mount))))
146 143
147#ifdef XFS_DQUOT_TRACE
148/*
149 * Dquot Tracing stuff.
150 */
151#define DQUOT_TRACE_SIZE 64
152#define DQUOT_KTRACE_ENTRY 1
153
154extern void __xfs_dqtrace_entry(xfs_dquot_t *dqp, char *func,
155 void *, xfs_inode_t *);
156#define xfs_dqtrace_entry_ino(a,b,ip) \
157 __xfs_dqtrace_entry((a), (b), (void*)__return_address, (ip))
158#define xfs_dqtrace_entry(a,b) \
159 __xfs_dqtrace_entry((a), (b), (void*)__return_address, NULL)
160#else
161#define xfs_dqtrace_entry(a,b)
162#define xfs_dqtrace_entry_ino(a,b,ip)
163#endif
164
165#ifdef QUOTADEBUG 144#ifdef QUOTADEBUG
166extern void xfs_qm_dqprint(xfs_dquot_t *); 145extern void xfs_qm_dqprint(xfs_dquot_t *);
167#else 146#else
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index d0d4a9a0bbd7..4e4ee9a57194 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -74,11 +74,11 @@ xfs_qm_dquot_logitem_format(
74 74
75 logvec->i_addr = (xfs_caddr_t)&logitem->qli_format; 75 logvec->i_addr = (xfs_caddr_t)&logitem->qli_format;
76 logvec->i_len = sizeof(xfs_dq_logformat_t); 76 logvec->i_len = sizeof(xfs_dq_logformat_t);
77 XLOG_VEC_SET_TYPE(logvec, XLOG_REG_TYPE_QFORMAT); 77 logvec->i_type = XLOG_REG_TYPE_QFORMAT;
78 logvec++; 78 logvec++;
79 logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core; 79 logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core;
80 logvec->i_len = sizeof(xfs_disk_dquot_t); 80 logvec->i_len = sizeof(xfs_disk_dquot_t);
81 XLOG_VEC_SET_TYPE(logvec, XLOG_REG_TYPE_DQUOT); 81 logvec->i_type = XLOG_REG_TYPE_DQUOT;
82 82
83 ASSERT(2 == logitem->qli_item.li_desc->lid_size); 83 ASSERT(2 == logitem->qli_item.li_desc->lid_size);
84 logitem->qli_format.qlf_size = 2; 84 logitem->qli_format.qlf_size = 2;
@@ -153,7 +153,7 @@ xfs_qm_dquot_logitem_push(
153 * lock without sleeping, then there must not have been 153 * lock without sleeping, then there must not have been
154 * anyone in the process of flushing the dquot. 154 * anyone in the process of flushing the dquot.
155 */ 155 */
156 error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); 156 error = xfs_qm_dqflush(dqp, 0);
157 if (error) 157 if (error)
158 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 158 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
159 "xfs_qm_dquot_logitem_push: push error %d on dqp %p", 159 "xfs_qm_dquot_logitem_push: push error %d on dqp %p",
@@ -190,7 +190,7 @@ xfs_qm_dqunpin_wait(
190 /* 190 /*
191 * Give the log a push so we don't wait here too long. 191 * Give the log a push so we don't wait here too long.
192 */ 192 */
193 xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE); 193 xfs_log_force(dqp->q_mount, 0);
194 wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); 194 wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
195} 195}
196 196
@@ -212,68 +212,31 @@ xfs_qm_dquot_logitem_pushbuf(
212 xfs_dquot_t *dqp; 212 xfs_dquot_t *dqp;
213 xfs_mount_t *mp; 213 xfs_mount_t *mp;
214 xfs_buf_t *bp; 214 xfs_buf_t *bp;
215 uint dopush;
216 215
217 dqp = qip->qli_dquot; 216 dqp = qip->qli_dquot;
218 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 217 ASSERT(XFS_DQ_IS_LOCKED(dqp));
219 218
220 /* 219 /*
221 * The qli_pushbuf_flag keeps others from
222 * trying to duplicate our effort.
223 */
224 ASSERT(qip->qli_pushbuf_flag != 0);
225 ASSERT(qip->qli_push_owner == current_pid());
226
227 /*
228 * If flushlock isn't locked anymore, chances are that the 220 * If flushlock isn't locked anymore, chances are that the
229 * dquot flush completed and the dquot was taken off the AIL. 221 * dquot flush completed and the dquot was taken off the AIL.
230 * So, just get out. 222 * So, just get out.
231 */ 223 */
232 if (completion_done(&dqp->q_flush) || 224 if (completion_done(&dqp->q_flush) ||
233 ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) { 225 ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
234 qip->qli_pushbuf_flag = 0;
235 xfs_dqunlock(dqp); 226 xfs_dqunlock(dqp);
236 return; 227 return;
237 } 228 }
238 mp = dqp->q_mount; 229 mp = dqp->q_mount;
239 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, 230 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
240 XFS_QI_DQCHUNKLEN(mp), 231 XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK);
241 XFS_INCORE_TRYLOCK); 232 xfs_dqunlock(dqp);
242 if (bp != NULL) { 233 if (!bp)
243 if (XFS_BUF_ISDELAYWRITE(bp)) {
244 dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
245 !completion_done(&dqp->q_flush));
246 qip->qli_pushbuf_flag = 0;
247 xfs_dqunlock(dqp);
248
249 if (XFS_BUF_ISPINNED(bp)) {
250 xfs_log_force(mp, (xfs_lsn_t)0,
251 XFS_LOG_FORCE);
252 }
253 if (dopush) {
254 int error;
255#ifdef XFSRACEDEBUG
256 delay_for_intr();
257 delay(300);
258#endif
259 error = xfs_bawrite(mp, bp);
260 if (error)
261 xfs_fs_cmn_err(CE_WARN, mp,
262 "xfs_qm_dquot_logitem_pushbuf: pushbuf error %d on qip %p, bp %p",
263 error, qip, bp);
264 } else {
265 xfs_buf_relse(bp);
266 }
267 } else {
268 qip->qli_pushbuf_flag = 0;
269 xfs_dqunlock(dqp);
270 xfs_buf_relse(bp);
271 }
272 return; 234 return;
273 } 235 if (XFS_BUF_ISDELAYWRITE(bp))
236 xfs_buf_delwri_promote(bp);
237 xfs_buf_relse(bp);
238 return;
274 239
275 qip->qli_pushbuf_flag = 0;
276 xfs_dqunlock(dqp);
277} 240}
278 241
279/* 242/*
@@ -291,50 +254,24 @@ xfs_qm_dquot_logitem_trylock(
291 xfs_dq_logitem_t *qip) 254 xfs_dq_logitem_t *qip)
292{ 255{
293 xfs_dquot_t *dqp; 256 xfs_dquot_t *dqp;
294 uint retval;
295 257
296 dqp = qip->qli_dquot; 258 dqp = qip->qli_dquot;
297 if (atomic_read(&dqp->q_pincount) > 0) 259 if (atomic_read(&dqp->q_pincount) > 0)
298 return (XFS_ITEM_PINNED); 260 return XFS_ITEM_PINNED;
299 261
300 if (! xfs_qm_dqlock_nowait(dqp)) 262 if (! xfs_qm_dqlock_nowait(dqp))
301 return (XFS_ITEM_LOCKED); 263 return XFS_ITEM_LOCKED;
302 264
303 retval = XFS_ITEM_SUCCESS;
304 if (!xfs_dqflock_nowait(dqp)) { 265 if (!xfs_dqflock_nowait(dqp)) {
305 /* 266 /*
306 * The dquot is already being flushed. It may have been 267 * dquot has already been flushed to the backing buffer,
307 * flushed delayed write, however, and we don't want to 268 * leave it locked; the pushbuf routine will unlock it.
308 * get stuck waiting for that to complete. So, we want to check
309 * to see if we can lock the dquot's buffer without sleeping.
310 * If we can and it is marked for delayed write, then we
311 * hold it and send it out from the push routine. We don't
312 * want to do that now since we might sleep in the device
313 * strategy routine. We also don't want to grab the buffer lock
314 * here because we'd like not to call into the buffer cache
315 * while holding the AIL lock.
316 * Make sure to only return PUSHBUF if we set pushbuf_flag
317 * ourselves. If someone else is doing it then we don't
318 * want to go to the push routine and duplicate their efforts.
319 */ 269 */
320 if (qip->qli_pushbuf_flag == 0) { 270 return XFS_ITEM_PUSHBUF;
321 qip->qli_pushbuf_flag = 1;
322 ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno);
323#ifdef DEBUG
324 qip->qli_push_owner = current_pid();
325#endif
326 /*
327 * The dquot is left locked.
328 */
329 retval = XFS_ITEM_PUSHBUF;
330 } else {
331 retval = XFS_ITEM_FLUSHING;
332 xfs_dqunlock_nonotify(dqp);
333 }
334 } 271 }
335 272
336 ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL); 273 ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL);
337 return (retval); 274 return XFS_ITEM_SUCCESS;
338} 275}
339 276
340 277
@@ -467,7 +404,7 @@ xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t *qf,
467 404
468 log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format); 405 log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format);
469 log_vector->i_len = sizeof(xfs_qoff_logitem_t); 406 log_vector->i_len = sizeof(xfs_qoff_logitem_t);
470 XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_QUOTAOFF); 407 log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF;
471 qf->qql_format.qf_size = 1; 408 qf->qql_format.qf_size = 1;
472} 409}
473 410
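
The rewritten trylock path above drops the qli_pushbuf_flag handshake entirely: if the flush lock is already held, the item simply reports XFS_ITEM_PUSHBUF, and the pushbuf routine promotes the delayed-write buffer instead of issuing its own write. A compact, runnable model of the new decision ladder -- all names here are stand-ins, not kernel identifiers:

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the kernel's XFS_ITEM_* return codes. */
enum push_result { ITEM_SUCCESS, ITEM_PINNED, ITEM_LOCKED, ITEM_PUSHBUF };

struct dquot_model {
	int pincount;	/* outstanding log references */
	bool locked;	/* q_qlock held by someone else */
	bool flushing;	/* flush lock held: a buffer write is in flight */
};

static enum push_result item_trylock(struct dquot_model *dqp)
{
	if (dqp->pincount > 0)
		return ITEM_PINNED;	/* must wait for a log force */
	if (dqp->locked)
		return ITEM_LOCKED;	/* contended: try again later */
	if (dqp->flushing)
		return ITEM_PUSHBUF;	/* flush in flight: just promote the buffer */
	return ITEM_SUCCESS;		/* caller may flush it now */
}

int main(void)
{
	struct dquot_model dqp = { .pincount = 0, .locked = false, .flushing = true };
	printf("result=%d\n", item_trylock(&dqp));	/* prints 3 (ITEM_PUSHBUF) */
	return 0;
}
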
diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/quota/xfs_dquot_item.h
index 5a632531f843..5acae2ada70b 100644
--- a/fs/xfs/quota/xfs_dquot_item.h
+++ b/fs/xfs/quota/xfs_dquot_item.h
@@ -27,10 +27,6 @@ typedef struct xfs_dq_logitem {
27 xfs_log_item_t qli_item; /* common portion */ 27 xfs_log_item_t qli_item; /* common portion */
28 struct xfs_dquot *qli_dquot; /* dquot ptr */ 28 struct xfs_dquot *qli_dquot; /* dquot ptr */
29 xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ 29 xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
30 unsigned short qli_pushbuf_flag; /* 1 bit used in push_ail */
31#ifdef DEBUG
32 uint64_t qli_push_owner;
33#endif
34 xfs_dq_logformat_t qli_format; /* logged structure */ 30 xfs_dq_logformat_t qli_format; /* logged structure */
35} xfs_dq_logitem_t; 31} xfs_dq_logitem_t;
36 32
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 45b1bfef7388..417e61e3d9dd 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -47,6 +47,7 @@
47#include "xfs_trans_space.h" 47#include "xfs_trans_space.h"
48#include "xfs_utils.h" 48#include "xfs_utils.h"
49#include "xfs_qm.h" 49#include "xfs_qm.h"
50#include "xfs_trace.h"
50 51
51/* 52/*
52 * The global quota manager. There is only one of these for the entire 53 * The global quota manager. There is only one of these for the entire
@@ -117,9 +118,14 @@ xfs_Gqm_init(void)
117 */ 118 */
118 udqhash = kmem_zalloc_greedy(&hsize, 119 udqhash = kmem_zalloc_greedy(&hsize,
119 XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t), 120 XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t),
120 XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t), 121 XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t));
121 KM_SLEEP | KM_MAYFAIL | KM_LARGE); 122 if (!udqhash)
122 gdqhash = kmem_zalloc(hsize, KM_SLEEP | KM_LARGE); 123 goto out;
124
125 gdqhash = kmem_zalloc_large(hsize);
126 if (!gdqhash)
127 goto out_free_udqhash;
128
123 hsize /= sizeof(xfs_dqhash_t); 129 hsize /= sizeof(xfs_dqhash_t);
124 ndquot = hsize << 8; 130 ndquot = hsize << 8;
125 131
@@ -169,6 +175,11 @@ xfs_Gqm_init(void)
169 mutex_init(&qcheck_lock); 175 mutex_init(&qcheck_lock);
170#endif 176#endif
171 return xqm; 177 return xqm;
178
179 out_free_udqhash:
180 kmem_free_large(udqhash);
181 out:
182 return NULL;
172} 183}
173 184
174/* 185/*
@@ -188,8 +199,8 @@ xfs_qm_destroy(
188 xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); 199 xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
189 xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i])); 200 xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
190 } 201 }
191 kmem_free(xqm->qm_usr_dqhtable); 202 kmem_free_large(xqm->qm_usr_dqhtable);
192 kmem_free(xqm->qm_grp_dqhtable); 203 kmem_free_large(xqm->qm_grp_dqhtable);
193 xqm->qm_usr_dqhtable = NULL; 204 xqm->qm_usr_dqhtable = NULL;
194 xqm->qm_grp_dqhtable = NULL; 205 xqm->qm_grp_dqhtable = NULL;
195 xqm->qm_dqhashmask = 0; 206 xqm->qm_dqhashmask = 0;
@@ -218,8 +229,12 @@ xfs_qm_hold_quotafs_ref(
218 */ 229 */
219 mutex_lock(&xfs_Gqm_lock); 230 mutex_lock(&xfs_Gqm_lock);
220 231
221 if (xfs_Gqm == NULL) 232 if (!xfs_Gqm) {
222 xfs_Gqm = xfs_Gqm_init(); 233 xfs_Gqm = xfs_Gqm_init();
234 if (!xfs_Gqm)
235 return ENOMEM;
236 }
237
223 /* 238 /*
224 * We can keep a list of all filesystems with quotas mounted for 239 * We can keep a list of all filesystems with quotas mounted for
225 * debugging and statistical purposes, but ... 240 * debugging and statistical purposes, but ...
@@ -435,7 +450,7 @@ xfs_qm_unmount_quotas(
435STATIC int 450STATIC int
436xfs_qm_dqflush_all( 451xfs_qm_dqflush_all(
437 xfs_mount_t *mp, 452 xfs_mount_t *mp,
438 int flags) 453 int sync_mode)
439{ 454{
440 int recl; 455 int recl;
441 xfs_dquot_t *dqp; 456 xfs_dquot_t *dqp;
@@ -453,7 +468,7 @@ again:
453 xfs_dqunlock(dqp); 468 xfs_dqunlock(dqp);
454 continue; 469 continue;
455 } 470 }
456 xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY"); 471
457 /* XXX a sentinel would be better */ 472 /* XXX a sentinel would be better */
458 recl = XFS_QI_MPLRECLAIMS(mp); 473 recl = XFS_QI_MPLRECLAIMS(mp);
459 if (!xfs_dqflock_nowait(dqp)) { 474 if (!xfs_dqflock_nowait(dqp)) {
@@ -471,7 +486,7 @@ again:
471 * across a disk write. 486 * across a disk write.
472 */ 487 */
473 xfs_qm_mplist_unlock(mp); 488 xfs_qm_mplist_unlock(mp);
474 error = xfs_qm_dqflush(dqp, flags); 489 error = xfs_qm_dqflush(dqp, sync_mode);
475 xfs_dqunlock(dqp); 490 xfs_dqunlock(dqp);
476 if (error) 491 if (error)
477 return error; 492 return error;
@@ -651,7 +666,7 @@ xfs_qm_dqattach_one(
651 */ 666 */
652 dqp = *IO_idqpp; 667 dqp = *IO_idqpp;
653 if (dqp) { 668 if (dqp) {
654 xfs_dqtrace_entry(dqp, "DQATTACH: found in ip"); 669 trace_xfs_dqattach_found(dqp);
655 return 0; 670 return 0;
656 } 671 }
657 672
@@ -704,7 +719,7 @@ xfs_qm_dqattach_one(
704 if (error) 719 if (error)
705 return error; 720 return error;
706 721
707 xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget"); 722 trace_xfs_dqattach_get(dqp);
708 723
709 /* 724 /*
710 * dqget may have dropped and re-acquired the ilock, but it guarantees 725 * dqget may have dropped and re-acquired the ilock, but it guarantees
@@ -890,15 +905,15 @@ xfs_qm_dqdetach(
890 if (!(ip->i_udquot || ip->i_gdquot)) 905 if (!(ip->i_udquot || ip->i_gdquot))
891 return; 906 return;
892 907
908 trace_xfs_dquot_dqdetach(ip);
909
893 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino); 910 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino);
894 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino); 911 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
895 if (ip->i_udquot) { 912 if (ip->i_udquot) {
896 xfs_dqtrace_entry_ino(ip->i_udquot, "DQDETTACH", ip);
897 xfs_qm_dqrele(ip->i_udquot); 913 xfs_qm_dqrele(ip->i_udquot);
898 ip->i_udquot = NULL; 914 ip->i_udquot = NULL;
899 } 915 }
900 if (ip->i_gdquot) { 916 if (ip->i_gdquot) {
901 xfs_dqtrace_entry_ino(ip->i_gdquot, "DQDETTACH", ip);
902 xfs_qm_dqrele(ip->i_gdquot); 917 xfs_qm_dqrele(ip->i_gdquot);
903 ip->i_gdquot = NULL; 918 ip->i_gdquot = NULL;
904 } 919 }
@@ -911,13 +926,11 @@ xfs_qm_sync(
911{ 926{
912 int recl, restarts; 927 int recl, restarts;
913 xfs_dquot_t *dqp; 928 xfs_dquot_t *dqp;
914 uint flush_flags;
915 int error; 929 int error;
916 930
917 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) 931 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
918 return 0; 932 return 0;
919 933
920 flush_flags = (flags & SYNC_WAIT) ? XFS_QMOPT_SYNC : XFS_QMOPT_DELWRI;
921 restarts = 0; 934 restarts = 0;
922 935
923 again: 936 again:
@@ -977,8 +990,7 @@ xfs_qm_sync(
977 * across a disk write 990 * across a disk write
978 */ 991 */
979 xfs_qm_mplist_unlock(mp); 992 xfs_qm_mplist_unlock(mp);
980 xfs_dqtrace_entry(dqp, "XQM_SYNC: DQFLUSH"); 993 error = xfs_qm_dqflush(dqp, flags);
981 error = xfs_qm_dqflush(dqp, flush_flags);
982 xfs_dqunlock(dqp); 994 xfs_dqunlock(dqp);
983 if (error && XFS_FORCED_SHUTDOWN(mp)) 995 if (error && XFS_FORCED_SHUTDOWN(mp))
984 return 0; /* Need to prevent umount failure */ 996 return 0; /* Need to prevent umount failure */
@@ -1350,7 +1362,8 @@ xfs_qm_reset_dqcounts(
1350 xfs_disk_dquot_t *ddq; 1362 xfs_disk_dquot_t *ddq;
1351 int j; 1363 int j;
1352 1364
1353 xfs_buftrace("RESET DQUOTS", bp); 1365 trace_xfs_reset_dqcounts(bp, _RET_IP_);
1366
1354 /* 1367 /*
1355 * Reset all counters and timers. They'll be 1368 * Reset all counters and timers. They'll be
1356 * started afresh by xfs_qm_quotacheck. 1369 * started afresh by xfs_qm_quotacheck.
@@ -1543,7 +1556,9 @@ xfs_qm_quotacheck_dqadjust(
1543 xfs_qcnt_t rtblks) 1556 xfs_qcnt_t rtblks)
1544{ 1557{
1545 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1558 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1546 xfs_dqtrace_entry(dqp, "QCHECK DQADJUST"); 1559
1560 trace_xfs_dqadjust(dqp);
1561
1547 /* 1562 /*
1548 * Adjust the inode count and the block count to reflect this inode's 1563 * Adjust the inode count and the block count to reflect this inode's
1549 * resource usage. 1564 * resource usage.
@@ -1779,7 +1794,7 @@ xfs_qm_quotacheck(
1779 * successfully. 1794 * successfully.
1780 */ 1795 */
1781 if (!error) 1796 if (!error)
1782 error = xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI); 1797 error = xfs_qm_dqflush_all(mp, 0);
1783 1798
1784 /* 1799 /*
1785 * We can get this error if we couldn't do a dquot allocation inside 1800 * We can get this error if we couldn't do a dquot allocation inside
@@ -1994,12 +2009,14 @@ xfs_qm_shake_freelist(
1994 */ 2009 */
1995 if (XFS_DQ_IS_DIRTY(dqp)) { 2010 if (XFS_DQ_IS_DIRTY(dqp)) {
1996 int error; 2011 int error;
1997 xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY"); 2012
2013 trace_xfs_dqshake_dirty(dqp);
2014
1998 /* 2015 /*
1999 * We flush it delayed write, so don't bother 2016 * We flush it delayed write, so don't bother
2000 * releasing the mplock. 2017 * releasing the mplock.
2001 */ 2018 */
2002 error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); 2019 error = xfs_qm_dqflush(dqp, 0);
2003 if (error) { 2020 if (error) {
2004 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 2021 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
2005 "xfs_qm_dqflush_all: dquot %p flush failed", dqp); 2022 "xfs_qm_dqflush_all: dquot %p flush failed", dqp);
@@ -2038,7 +2055,9 @@ xfs_qm_shake_freelist(
2038 return nreclaimed; 2055 return nreclaimed;
2039 goto tryagain; 2056 goto tryagain;
2040 } 2057 }
2041 xfs_dqtrace_entry(dqp, "DQSHAKE: UNLINKING"); 2058
2059 trace_xfs_dqshake_unlink(dqp);
2060
2042#ifdef QUOTADEBUG 2061#ifdef QUOTADEBUG
2043 cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n", 2062 cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n",
2044 dqp, be32_to_cpu(dqp->q_core.d_id)); 2063 dqp, be32_to_cpu(dqp->q_core.d_id));
@@ -2125,7 +2144,9 @@ xfs_qm_dqreclaim_one(void)
2125 */ 2144 */
2126 if (dqp->dq_flags & XFS_DQ_WANT) { 2145 if (dqp->dq_flags & XFS_DQ_WANT) {
2127 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); 2146 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
2128 xfs_dqtrace_entry(dqp, "DQRECLAIM: DQWANT"); 2147
2148 trace_xfs_dqreclaim_want(dqp);
2149
2129 xfs_dqunlock(dqp); 2150 xfs_dqunlock(dqp);
2130 xfs_qm_freelist_unlock(xfs_Gqm); 2151 xfs_qm_freelist_unlock(xfs_Gqm);
2131 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 2152 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
@@ -2171,12 +2192,14 @@ xfs_qm_dqreclaim_one(void)
2171 */ 2192 */
2172 if (XFS_DQ_IS_DIRTY(dqp)) { 2193 if (XFS_DQ_IS_DIRTY(dqp)) {
2173 int error; 2194 int error;
2174 xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY"); 2195
2196 trace_xfs_dqreclaim_dirty(dqp);
2197
2175 /* 2198 /*
2176 * We flush it delayed write, so don't bother 2199 * We flush it delayed write, so don't bother
2177 * releasing the freelist lock. 2200 * releasing the freelist lock.
2178 */ 2201 */
2179 error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); 2202 error = xfs_qm_dqflush(dqp, 0);
2180 if (error) { 2203 if (error) {
2181 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 2204 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
2182 "xfs_qm_dqreclaim: dquot %p flush failed", dqp); 2205 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
@@ -2194,8 +2217,9 @@ xfs_qm_dqreclaim_one(void)
2194 if (!mutex_trylock(&dqp->q_hash->qh_lock)) 2217 if (!mutex_trylock(&dqp->q_hash->qh_lock))
2195 goto mplistunlock; 2218 goto mplistunlock;
2196 2219
2220 trace_xfs_dqreclaim_unlink(dqp);
2221
2197 ASSERT(dqp->q_nrefs == 0); 2222 ASSERT(dqp->q_nrefs == 0);
2198 xfs_dqtrace_entry(dqp, "DQRECLAIM: UNLINKING");
2199 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); 2223 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
2200 XQM_HASHLIST_REMOVE(dqp->q_hash, dqp); 2224 XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
2201 XQM_FREELIST_REMOVE(dqp); 2225 XQM_FREELIST_REMOVE(dqp);
@@ -2430,7 +2454,7 @@ xfs_qm_vop_dqalloc(
2430 } 2454 }
2431 } 2455 }
2432 if (uq) 2456 if (uq)
2433 xfs_dqtrace_entry_ino(uq, "DQALLOC", ip); 2457 trace_xfs_dquot_dqalloc(ip);
2434 2458
2435 xfs_iunlock(ip, lockflags); 2459 xfs_iunlock(ip, lockflags);
2436 if (O_udqpp) 2460 if (O_udqpp)
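
The xfs_Gqm_init() hunk above replaces the old sleep-and-hope allocation flags with explicit failure handling, unwinding via goto labels in reverse allocation order -- the standard kernel idiom. A self-contained userspace model of that pattern, with malloc/free standing in for kmem_zalloc_large/kmem_free_large and a simplified struct standing in for the quota manager:

#include <stdlib.h>

struct qm { void *udqhash; void *gdqhash; };

static struct qm *qm_init(size_t hsize)
{
	struct qm *xqm = calloc(1, sizeof(*xqm));
	if (!xqm)
		goto out;
	xqm->udqhash = malloc(hsize);
	if (!xqm->udqhash)
		goto out_free_xqm;
	xqm->gdqhash = malloc(hsize);
	if (!xqm->gdqhash)
		goto out_free_udqhash;
	return xqm;	/* both hash tables allocated */

out_free_udqhash:	/* unwind strictly in reverse order */
	free(xqm->udqhash);
out_free_xqm:
	free(xqm);
out:
	return NULL;
}

int main(void)
{
	struct qm *xqm = qm_init(4096);
	return xqm ? 0 : 1;	/* demo only; teardown omitted */
}
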
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index a5346630dfae..97b410c12794 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -59,7 +59,7 @@ xfs_fill_statvfs_from_dquot(
59 be64_to_cpu(dp->d_blk_hardlimit); 59 be64_to_cpu(dp->d_blk_hardlimit);
60 if (limit && statp->f_blocks > limit) { 60 if (limit && statp->f_blocks > limit) {
61 statp->f_blocks = limit; 61 statp->f_blocks = limit;
62 statp->f_bfree = 62 statp->f_bfree = statp->f_bavail =
63 (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ? 63 (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ?
64 (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0; 64 (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0;
65 } 65 }
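
The one-line xfs_fill_statvfs_from_dquot() fix above matters more than it looks: previously only f_bfree was clamped to the quota limit, so df reported "Avail" from the whole filesystem while writes failed at the quota boundary. A runnable model of the corrected clamp (the struct and names are illustrative, mirroring statvfs fields):

#include <stdio.h>

struct vfsstat { unsigned long long f_blocks, f_bfree, f_bavail; };

static void clamp_to_quota(struct vfsstat *st,
			   unsigned long long limit,	/* block hard limit */
			   unsigned long long used)	/* blocks charged */
{
	if (limit && st->f_blocks > limit) {
		st->f_blocks = limit;
		/* both "free" fields must reflect the quota */
		st->f_bfree = st->f_bavail =
			(st->f_blocks > used) ? st->f_blocks - used : 0;
	}
}

int main(void)
{
	struct vfsstat st = { .f_blocks = 1000, .f_bfree = 900, .f_bavail = 900 };
	clamp_to_quota(&st, 100, 40);
	printf("%llu %llu %llu\n", st.f_blocks, st.f_bfree, st.f_bavail); /* 100 60 60 */
	return 0;
}
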
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 5d1a3b98a6e6..5d0ee8d492db 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -49,6 +49,7 @@
49#include "xfs_buf_item.h" 49#include "xfs_buf_item.h"
50#include "xfs_utils.h" 50#include "xfs_utils.h"
51#include "xfs_qm.h" 51#include "xfs_qm.h"
52#include "xfs_trace.h"
52 53
53#ifdef DEBUG 54#ifdef DEBUG
54# define qdprintk(s, args...) cmn_err(CE_DEBUG, s, ## args) 55# define qdprintk(s, args...) cmn_err(CE_DEBUG, s, ## args)
@@ -496,7 +497,6 @@ xfs_qm_scall_setqlim(
496 ASSERT(error != ENOENT); 497 ASSERT(error != ENOENT);
497 return (error); 498 return (error);
498 } 499 }
499 xfs_dqtrace_entry(dqp, "Q_SETQLIM: AFT DQGET");
500 xfs_trans_dqjoin(tp, dqp); 500 xfs_trans_dqjoin(tp, dqp);
501 ddq = &dqp->q_core; 501 ddq = &dqp->q_core;
502 502
@@ -602,7 +602,6 @@ xfs_qm_scall_setqlim(
602 dqp->dq_flags |= XFS_DQ_DIRTY; 602 dqp->dq_flags |= XFS_DQ_DIRTY;
603 xfs_trans_log_dquot(tp, dqp); 603 xfs_trans_log_dquot(tp, dqp);
604 604
605 xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT");
606 error = xfs_trans_commit(tp, 0); 605 error = xfs_trans_commit(tp, 0);
607 xfs_qm_dqprint(dqp); 606 xfs_qm_dqprint(dqp);
608 xfs_qm_dqrele(dqp); 607 xfs_qm_dqrele(dqp);
@@ -630,7 +629,6 @@ xfs_qm_scall_getquota(
630 return (error); 629 return (error);
631 } 630 }
632 631
633 xfs_dqtrace_entry(dqp, "Q_GETQUOTA SUCCESS");
634 /* 632 /*
635 * If everything's NULL, this dquot doesn't quite exist as far as 633 * If everything's NULL, this dquot doesn't quite exist as far as
636 * our utility programs are concerned. 634 * our utility programs are concerned.
@@ -893,7 +891,7 @@ xfs_qm_dqrele_all_inodes(
893 uint flags) 891 uint flags)
894{ 892{
895 ASSERT(mp->m_quotainfo); 893 ASSERT(mp->m_quotainfo);
896 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG); 894 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
897} 895}
898 896
899/*------------------------------------------------------------------------*/ 897/*------------------------------------------------------------------------*/
@@ -1194,9 +1192,9 @@ xfs_qm_internalqcheck(
1194 if (! XFS_IS_QUOTA_ON(mp)) 1192 if (! XFS_IS_QUOTA_ON(mp))
1195 return XFS_ERROR(ESRCH); 1193 return XFS_ERROR(ESRCH);
1196 1194
1197 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); 1195 xfs_log_force(mp, XFS_LOG_SYNC);
1198 XFS_bflush(mp->m_ddev_targp); 1196 XFS_bflush(mp->m_ddev_targp);
1199 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); 1197 xfs_log_force(mp, XFS_LOG_SYNC);
1200 XFS_bflush(mp->m_ddev_targp); 1198 XFS_bflush(mp->m_ddev_targp);
1201 1199
1202 mutex_lock(&qcheck_lock); 1200 mutex_lock(&qcheck_lock);
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 97ac9640be98..c3ab75cb1d9a 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -589,12 +589,18 @@ xfs_trans_unreserve_and_mod_dquots(
589 } 589 }
590} 590}
591 591
592STATIC int 592STATIC void
593xfs_quota_error(uint flags) 593xfs_quota_warn(
594 struct xfs_mount *mp,
595 struct xfs_dquot *dqp,
596 int type)
594{ 597{
595 if (flags & XFS_QMOPT_ENOSPC) 598 /* no warnings for project quotas - we just return ENOSPC later */
596 return ENOSPC; 599 if (dqp->dq_flags & XFS_DQ_PROJ)
597 return EDQUOT; 600 return;
601 quota_send_warning((dqp->dq_flags & XFS_DQ_USER) ? USRQUOTA : GRPQUOTA,
602 be32_to_cpu(dqp->q_core.d_id), mp->m_super->s_dev,
603 type);
598} 604}
599 605
600/* 606/*
@@ -612,7 +618,6 @@ xfs_trans_dqresv(
612 long ninos, 618 long ninos,
613 uint flags) 619 uint flags)
614{ 620{
615 int error;
616 xfs_qcnt_t hardlimit; 621 xfs_qcnt_t hardlimit;
617 xfs_qcnt_t softlimit; 622 xfs_qcnt_t softlimit;
618 time_t timer; 623 time_t timer;
@@ -649,7 +654,6 @@ xfs_trans_dqresv(
649 warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount); 654 warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount);
650 resbcountp = &dqp->q_res_rtbcount; 655 resbcountp = &dqp->q_res_rtbcount;
651 } 656 }
652 error = 0;
653 657
654 if ((flags & XFS_QMOPT_FORCE_RES) == 0 && 658 if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
655 dqp->q_core.d_id && 659 dqp->q_core.d_id &&
@@ -667,18 +671,20 @@ xfs_trans_dqresv(
667 * nblks. 671 * nblks.
668 */ 672 */
669 if (hardlimit > 0ULL && 673 if (hardlimit > 0ULL &&
670 (hardlimit <= nblks + *resbcountp)) { 674 hardlimit <= nblks + *resbcountp) {
671 error = xfs_quota_error(flags); 675 xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
672 goto error_return; 676 goto error_return;
673 } 677 }
674
675 if (softlimit > 0ULL && 678 if (softlimit > 0ULL &&
676 (softlimit <= nblks + *resbcountp)) { 679 softlimit <= nblks + *resbcountp) {
677 if ((timer != 0 && get_seconds() > timer) || 680 if ((timer != 0 && get_seconds() > timer) ||
678 (warns != 0 && warns >= warnlimit)) { 681 (warns != 0 && warns >= warnlimit)) {
679 error = xfs_quota_error(flags); 682 xfs_quota_warn(mp, dqp,
683 QUOTA_NL_BSOFTLONGWARN);
680 goto error_return; 684 goto error_return;
681 } 685 }
686
687 xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN);
682 } 688 }
683 } 689 }
684 if (ninos > 0) { 690 if (ninos > 0) {
@@ -692,15 +698,19 @@ xfs_trans_dqresv(
692 softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); 698 softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
693 if (!softlimit) 699 if (!softlimit)
694 softlimit = q->qi_isoftlimit; 700 softlimit = q->qi_isoftlimit;
701
695 if (hardlimit > 0ULL && count >= hardlimit) { 702 if (hardlimit > 0ULL && count >= hardlimit) {
696 error = xfs_quota_error(flags); 703 xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
697 goto error_return; 704 goto error_return;
698 } else if (softlimit > 0ULL && count >= softlimit) { 705 }
699 if ((timer != 0 && get_seconds() > timer) || 706 if (softlimit > 0ULL && count >= softlimit) {
707 if ((timer != 0 && get_seconds() > timer) ||
700 (warns != 0 && warns >= warnlimit)) { 708 (warns != 0 && warns >= warnlimit)) {
701 error = xfs_quota_error(flags); 709 xfs_quota_warn(mp, dqp,
710 QUOTA_NL_ISOFTLONGWARN);
702 goto error_return; 711 goto error_return;
703 } 712 }
713 xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN);
704 } 714 }
705 } 715 }
706 } 716 }
@@ -736,9 +746,14 @@ xfs_trans_dqresv(
736 ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount)); 746 ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount));
737 ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount)); 747 ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
738 748
749 xfs_dqunlock(dqp);
750 return 0;
751
739error_return: 752error_return:
740 xfs_dqunlock(dqp); 753 xfs_dqunlock(dqp);
741 return error; 754 if (flags & XFS_QMOPT_ENOSPC)
755 return ENOSPC;
756 return EDQUOT;
742} 757}
743 758
744 759
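
The xfs_trans_dqresv() rework above replaces the private xfs_quota_error() helper with quota_send_warning() notifications at each limit breach, and computes the ENOSPC-versus-EDQUOT error code once at the shared error_return label. A self-contained model of the reworked block-limit check -- the warning enum and every name here are illustrative stand-ins:

#include <stdio.h>
#include <stdint.h>
#include <time.h>

enum warn_type { WARN_BHARD, WARN_BSOFT_LONG, WARN_BSOFT };

static void quota_warn(uint32_t id, enum warn_type w)
{
	/* stands in for quota_send_warning() */
	fprintf(stderr, "quota warning %d for id %u\n", (int)w, (unsigned)id);
}

/* returns 0 on success, -1 when the reservation must be rejected */
static int dqresv_blocks(uint32_t id, uint64_t hard, uint64_t soft,
			 uint64_t resv, uint64_t nblks,
			 time_t timer, int warns, int warnlimit)
{
	if (hard && hard <= nblks + resv) {
		quota_warn(id, WARN_BHARD);
		return -1;
	}
	if (soft && soft <= nblks + resv) {
		if ((timer && time(NULL) > timer) ||
		    (warns && warns >= warnlimit)) {
			quota_warn(id, WARN_BSOFT_LONG);
			return -1;
		}
		quota_warn(id, WARN_BSOFT);	/* soft breach: warn but allow */
	}
	return 0;
}

int main(void)
{
	/* over the soft limit but under the hard limit: warns, returns 0 */
	printf("%d\n", dqresv_blocks(500, 100, 50, 40, 20, 0, 0, 5));
	return 0;
}
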
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index 6f4fd37c67af..d2d20462fd4f 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -41,10 +41,6 @@ extern void assfail(char *expr, char *f, int l);
41# define STATIC static noinline 41# define STATIC static noinline
42#endif 42#endif
43 43
44#ifndef STATIC_INLINE
45# define STATIC_INLINE static inline
46#endif
47
48#else /* DEBUG */ 44#else /* DEBUG */
49 45
50#define ASSERT(expr) \ 46#define ASSERT(expr) \
@@ -54,19 +50,5 @@ extern void assfail(char *expr, char *f, int l);
54# define STATIC noinline 50# define STATIC noinline
55#endif 51#endif
56 52
57/*
58 * We stop inlining of inline functions in debug mode.
59 * Unfortunately, this means static inline in header files
60 * get multiple definitions, so they need to remain static.
61 * This then gives tonnes of warnings about unused but defined
62 * functions, so we need to add the unused attribute to prevent
63 * these spurious warnings.
64 */
65#ifndef STATIC_INLINE
66# define STATIC_INLINE static __attribute__ ((unused)) noinline
67#endif
68
69#endif /* DEBUG */ 53#endif /* DEBUG */
70
71
72#endif /* __XFS_SUPPORT_DEBUG_H__ */ 54#endif /* __XFS_SUPPORT_DEBUG_H__ */
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
deleted file mode 100644
index 2d494c26717f..000000000000
--- a/fs/xfs/support/ktrace.c
+++ /dev/null
@@ -1,323 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include <xfs.h>
19
20static kmem_zone_t *ktrace_hdr_zone;
21static kmem_zone_t *ktrace_ent_zone;
22static int ktrace_zentries;
23
24void __init
25ktrace_init(int zentries)
26{
27 ktrace_zentries = roundup_pow_of_two(zentries);
28
29 ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t),
30 "ktrace_hdr");
31 ASSERT(ktrace_hdr_zone);
32
33 ktrace_ent_zone = kmem_zone_init(ktrace_zentries
34 * sizeof(ktrace_entry_t),
35 "ktrace_ent");
36 ASSERT(ktrace_ent_zone);
37}
38
39void __exit
40ktrace_uninit(void)
41{
42 kmem_zone_destroy(ktrace_hdr_zone);
43 kmem_zone_destroy(ktrace_ent_zone);
44}
45
46/*
47 * ktrace_alloc()
48 *
49 * Allocate a ktrace header and enough buffering for the given
50 * number of entries. Round the number of entries up to a
51 * power of 2 so we can do fast masking to get the index from
52 * the atomic index counter.
53 */
54ktrace_t *
55ktrace_alloc(int nentries, unsigned int __nocast sleep)
56{
57 ktrace_t *ktp;
58 ktrace_entry_t *ktep;
59 int entries;
60
61 ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep);
62
63 if (ktp == (ktrace_t*)NULL) {
64 /*
65 * KM_SLEEP callers don't expect failure.
66 */
67 if (sleep & KM_SLEEP)
68 panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
69
70 return NULL;
71 }
72
73 /*
74 * Special treatment for buffers with the ktrace_zentries entries
75 */
76 entries = roundup_pow_of_two(nentries);
77 if (entries == ktrace_zentries) {
78 ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone,
79 sleep);
80 } else {
81 ktep = (ktrace_entry_t*)kmem_zalloc((entries * sizeof(*ktep)),
82 sleep | KM_LARGE);
83 }
84
85 if (ktep == NULL) {
86 /*
87 * KM_SLEEP callers don't expect failure.
88 */
89 if (sleep & KM_SLEEP)
90 panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
91
92 kmem_free(ktp);
93
94 return NULL;
95 }
96
97 ktp->kt_entries = ktep;
98 ktp->kt_nentries = entries;
99 ASSERT(is_power_of_2(entries));
100 ktp->kt_index_mask = entries - 1;
101 atomic_set(&ktp->kt_index, 0);
102 ktp->kt_rollover = 0;
103 return ktp;
104}
105
106
107/*
108 * ktrace_free()
109 *
110 * Free up the ktrace header and buffer. It is up to the caller
111 * to ensure that no-one is referencing it.
112 */
113void
114ktrace_free(ktrace_t *ktp)
115{
116 if (ktp == (ktrace_t *)NULL)
117 return;
118
119 /*
120 * Special treatment for the Vnode trace buffer.
121 */
122 if (ktp->kt_nentries == ktrace_zentries)
123 kmem_zone_free(ktrace_ent_zone, ktp->kt_entries);
124 else
125 kmem_free(ktp->kt_entries);
126
127 kmem_zone_free(ktrace_hdr_zone, ktp);
128}
129
130
131/*
132 * Enter the given values into the "next" entry in the trace buffer.
133 * kt_index is always the index of the next entry to be filled.
134 */
135void
136ktrace_enter(
137 ktrace_t *ktp,
138 void *val0,
139 void *val1,
140 void *val2,
141 void *val3,
142 void *val4,
143 void *val5,
144 void *val6,
145 void *val7,
146 void *val8,
147 void *val9,
148 void *val10,
149 void *val11,
150 void *val12,
151 void *val13,
152 void *val14,
153 void *val15)
154{
155 int index;
156 ktrace_entry_t *ktep;
157
158 ASSERT(ktp != NULL);
159
160 /*
161 * Grab an entry by pushing the index up to the next one.
162 */
163 index = atomic_add_return(1, &ktp->kt_index);
164 index = (index - 1) & ktp->kt_index_mask;
165 if (!ktp->kt_rollover && index == ktp->kt_nentries - 1)
166 ktp->kt_rollover = 1;
167
168 ASSERT((index >= 0) && (index < ktp->kt_nentries));
169
170 ktep = &(ktp->kt_entries[index]);
171
172 ktep->val[0] = val0;
173 ktep->val[1] = val1;
174 ktep->val[2] = val2;
175 ktep->val[3] = val3;
176 ktep->val[4] = val4;
177 ktep->val[5] = val5;
178 ktep->val[6] = val6;
179 ktep->val[7] = val7;
180 ktep->val[8] = val8;
181 ktep->val[9] = val9;
182 ktep->val[10] = val10;
183 ktep->val[11] = val11;
184 ktep->val[12] = val12;
185 ktep->val[13] = val13;
186 ktep->val[14] = val14;
187 ktep->val[15] = val15;
188}
189
190/*
191 * Return the number of entries in the trace buffer.
192 */
193int
194ktrace_nentries(
195 ktrace_t *ktp)
196{
197 int index;
198 if (ktp == NULL)
199 return 0;
200
201 index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask;
202 return (ktp->kt_rollover ? ktp->kt_nentries : index);
203}
204
205/*
206 * ktrace_first()
207 *
208 * This is used to find the start of the trace buffer.
209 * In conjunction with ktrace_next() it can be used to
210 * iterate through the entire trace buffer. This code does
211 * not do any locking because it is assumed that it is called
212 * from the debugger.
213 *
214 * The caller must pass in a pointer to a ktrace_snap
215 * structure in which we will keep some state used to
216 * iterate through the buffer. This state must not be touched
217 * by any code outside of this module.
218 */
219ktrace_entry_t *
220ktrace_first(ktrace_t *ktp, ktrace_snap_t *ktsp)
221{
222 ktrace_entry_t *ktep;
223 int index;
224 int nentries;
225
226 if (ktp->kt_rollover)
227 index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask;
228 else
229 index = 0;
230
231 ktsp->ks_start = index;
232 ktep = &(ktp->kt_entries[index]);
233
234 nentries = ktrace_nentries(ktp);
235 index++;
236 if (index < nentries) {
237 ktsp->ks_index = index;
238 } else {
239 ktsp->ks_index = 0;
240 if (index > nentries)
241 ktep = NULL;
242 }
243 return ktep;
244}
245
246/*
247 * ktrace_next()
248 *
249 * This is used to iterate through the entries of the given
250 * trace buffer. The caller must pass in the ktrace_snap_t
251 * structure initialized by ktrace_first(). The return value
252 * will be either a pointer to the next ktrace_entry or NULL
253 * if all of the entries have been traversed.
254 */
255ktrace_entry_t *
256ktrace_next(
257 ktrace_t *ktp,
258 ktrace_snap_t *ktsp)
259{
260 int index;
261 ktrace_entry_t *ktep;
262
263 index = ktsp->ks_index;
264 if (index == ktsp->ks_start) {
265 ktep = NULL;
266 } else {
267 ktep = &ktp->kt_entries[index];
268 }
269
270 index++;
271 if (index == ktrace_nentries(ktp)) {
272 ktsp->ks_index = 0;
273 } else {
274 ktsp->ks_index = index;
275 }
276
277 return ktep;
278}
279
280/*
281 * ktrace_skip()
282 *
283 * Skip the next "count" entries and return the entry after that.
284 * Return NULL if this causes us to iterate past the beginning again.
285 */
286ktrace_entry_t *
287ktrace_skip(
288 ktrace_t *ktp,
289 int count,
290 ktrace_snap_t *ktsp)
291{
292 int index;
293 int new_index;
294 ktrace_entry_t *ktep;
295 int nentries = ktrace_nentries(ktp);
296
297 index = ktsp->ks_index;
298 new_index = index + count;
299 while (new_index >= nentries) {
300 new_index -= nentries;
301 }
302 if (index == ktsp->ks_start) {
303 /*
304 * We've iterated around to the start, so we're done.
305 */
306 ktep = NULL;
307 } else if ((new_index < index) && (index < ktsp->ks_index)) {
308 /*
309 * We've skipped past the start again, so we're done.
310 */
311 ktep = NULL;
312 ktsp->ks_index = ktsp->ks_start;
313 } else {
314 ktep = &(ktp->kt_entries[new_index]);
315 new_index++;
316 if (new_index == nentries) {
317 ktsp->ks_index = 0;
318 } else {
319 ktsp->ks_index = new_index;
320 }
321 }
322 return ktep;
323}
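
The deleted ktrace_enter() above shows the one trick worth remembering from this code: the entry index is claimed with an atomic increment and reduced with a power-of-two mask, so concurrent tracers never take a lock. A minimal runnable sketch of that ring-buffer discipline using C11 atomics (names are mine, not the kernel's):

#include <stdatomic.h>
#include <stdio.h>

#define RING_ENTRIES 64			/* must stay a power of two */
#define RING_MASK (RING_ENTRIES - 1)

struct ring {
	atomic_int index;		/* monotonically increasing */
	void *slots[RING_ENTRIES];
};

static void ring_enter(struct ring *r, void *val)
{
	/* claim a slot; wraparound is handled entirely by the mask */
	int i = atomic_fetch_add(&r->index, 1) & RING_MASK;
	r->slots[i] = val;
}

int main(void)
{
	static struct ring r;
	for (int i = 0; i < 100; i++)
		ring_enter(&r, &r);	/* 100 entries wrap into 64 slots */
	printf("next index %d\n", atomic_load(&r.index) & RING_MASK); /* 36 */
	return 0;
}
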
diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h
deleted file mode 100644
index 741d6947ca60..000000000000
--- a/fs/xfs/support/ktrace.h
+++ /dev/null
@@ -1,85 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_KTRACE_H__
19#define __XFS_SUPPORT_KTRACE_H__
20
21/*
22 * Trace buffer entry structure.
23 */
24typedef struct ktrace_entry {
25 void *val[16];
26} ktrace_entry_t;
27
28/*
29 * Trace buffer header structure.
30 */
31typedef struct ktrace {
32 int kt_nentries; /* number of entries in trace buf */
33 atomic_t kt_index; /* current index in entries */
34 unsigned int kt_index_mask;
35 int kt_rollover;
36 ktrace_entry_t *kt_entries; /* buffer of entries */
37} ktrace_t;
38
39/*
40 * Trace buffer snapshot structure.
41 */
42typedef struct ktrace_snap {
43 int ks_start; /* kt_index at time of snap */
44 int ks_index; /* current index */
45} ktrace_snap_t;
46
47
48#ifdef CONFIG_XFS_TRACE
49
50extern void ktrace_init(int zentries);
51extern void ktrace_uninit(void);
52
53extern ktrace_t *ktrace_alloc(int, unsigned int __nocast);
54extern void ktrace_free(ktrace_t *);
55
56extern void ktrace_enter(
57 ktrace_t *,
58 void *,
59 void *,
60 void *,
61 void *,
62 void *,
63 void *,
64 void *,
65 void *,
66 void *,
67 void *,
68 void *,
69 void *,
70 void *,
71 void *,
72 void *,
73 void *);
74
75extern ktrace_entry_t *ktrace_first(ktrace_t *, ktrace_snap_t *);
76extern int ktrace_nentries(ktrace_t *);
77extern ktrace_entry_t *ktrace_next(ktrace_t *, ktrace_snap_t *);
78extern ktrace_entry_t *ktrace_skip(ktrace_t *, int, ktrace_snap_t *);
79
80#else
81#define ktrace_init(x) do { } while (0)
82#define ktrace_uninit() do { } while (0)
83#endif /* CONFIG_XFS_TRACE */
84
85#endif /* __XFS_SUPPORT_KTRACE_H__ */
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 17254b529c54..5ad8ad3a1dcd 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -25,21 +25,5 @@
25/* #define QUOTADEBUG 1 */ 25/* #define QUOTADEBUG 1 */
26#endif 26#endif
27 27
28#ifdef CONFIG_XFS_TRACE
29#define XFS_ALLOC_TRACE 1
30#define XFS_ATTR_TRACE 1
31#define XFS_BLI_TRACE 1
32#define XFS_BMAP_TRACE 1
33#define XFS_BTREE_TRACE 1
34#define XFS_DIR2_TRACE 1
35#define XFS_DQUOT_TRACE 1
36#define XFS_ILOCK_TRACE 1
37#define XFS_LOG_TRACE 1
38#define XFS_RW_TRACE 1
39#define XFS_BUF_TRACE 1
40#define XFS_INODE_TRACE 1
41#define XFS_FILESTREAMS_TRACE 1
42#endif
43
44#include <linux-2.6/xfs_linux.h> 28#include <linux-2.6/xfs_linux.h>
45#endif /* __XFS_H__ */ 29#endif /* __XFS_H__ */
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 947b150df8ed..d13eeba2c8f8 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -36,8 +36,8 @@ struct xfs_acl {
36}; 36};
37 37
38/* On-disk XFS extended attribute names */ 38/* On-disk XFS extended attribute names */
39#define SGI_ACL_FILE "SGI_ACL_FILE" 39#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE"
40#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT" 40#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT"
41#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) 41#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1)
42#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) 42#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
43 43
@@ -49,7 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode);
49extern int posix_acl_access_exists(struct inode *inode); 49extern int posix_acl_access_exists(struct inode *inode);
50extern int posix_acl_default_exists(struct inode *inode); 50extern int posix_acl_default_exists(struct inode *inode);
51 51
52extern struct xattr_handler xfs_xattr_system_handler; 52extern struct xattr_handler xfs_xattr_acl_access_handler;
53extern struct xattr_handler xfs_xattr_acl_default_handler;
53#else 54#else
54# define xfs_check_acl NULL 55# define xfs_check_acl NULL
55# define xfs_get_acl(inode, type) NULL 56# define xfs_get_acl(inode, type) NULL
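
One C subtlety worth flagging in the xfs_acl.h change above: once a string literal is wrapped in a pointer cast, sizeof yields the size of the pointer, not of the array, so a macro like sizeof(SGI_ACL_FILE)-1 no longer computes the string length. Whether that bites depends on where the *_SIZE macros are used after this change; the runnable demo below only shows the language rule:

#include <stdio.h>

#define NAME_PLAIN "SGI_ACL_FILE"
#define NAME_CAST  (unsigned char *)"SGI_ACL_FILE"

int main(void)
{
	printf("%zu\n", sizeof(NAME_PLAIN) - 1);  /* 12: the string length */
	printf("%zu\n", sizeof(NAME_CAST) - 1);   /* pointer size - 1, e.g. 7 */
	return 0;
}
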
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index a5d54bf4931b..b1a5a1ff88ea 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -86,6 +86,20 @@ typedef struct xfs_agf {
86#define XFS_AGF_NUM_BITS 12 86#define XFS_AGF_NUM_BITS 12
87#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) 87#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
88 88
89#define XFS_AGF_FLAGS \
90 { XFS_AGF_MAGICNUM, "MAGICNUM" }, \
91 { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \
92 { XFS_AGF_SEQNO, "SEQNO" }, \
93 { XFS_AGF_LENGTH, "LENGTH" }, \
94 { XFS_AGF_ROOTS, "ROOTS" }, \
95 { XFS_AGF_LEVELS, "LEVELS" }, \
96 { XFS_AGF_FLFIRST, "FLFIRST" }, \
97 { XFS_AGF_FLLAST, "FLLAST" }, \
98 { XFS_AGF_FLCOUNT, "FLCOUNT" }, \
99 { XFS_AGF_FREEBLKS, "FREEBLKS" }, \
100 { XFS_AGF_LONGEST, "LONGEST" }, \
101 { XFS_AGF_BTREEBLKS, "BTREEBLKS" }
102
89/* disk block (xfs_daddr_t) in the AG */ 103/* disk block (xfs_daddr_t) in the AG */
90#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) 104#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
91#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) 105#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
@@ -173,17 +187,13 @@ typedef struct xfs_perag_busy {
173/* 187/*
174 * Per-ag incore structure, copies of information in agf and agi, 188 * Per-ag incore structure, copies of information in agf and agi,
175 * to improve the performance of allocation group selection. 189 * to improve the performance of allocation group selection.
176 *
177 * pick sizes which fit in allocation buckets well
178 */ 190 */
179#if (BITS_PER_LONG == 32)
180#define XFS_PAGB_NUM_SLOTS 84
181#elif (BITS_PER_LONG == 64)
182#define XFS_PAGB_NUM_SLOTS 128 191#define XFS_PAGB_NUM_SLOTS 128
183#endif
184 192
185typedef struct xfs_perag 193typedef struct xfs_perag {
186{ 194 struct xfs_mount *pag_mount; /* owner filesystem */
195 xfs_agnumber_t pag_agno; /* AG this structure belongs to */
196 atomic_t pag_ref; /* perag reference count */
187 char pagf_init; /* this agf's entry is initialized */ 197 char pagf_init; /* this agf's entry is initialized */
188 char pagi_init; /* this agi's entry is initialized */ 198 char pagi_init; /* this agi's entry is initialized */
189 char pagf_metadata; /* the agf is preferred to be metadata */ 199 char pagf_metadata; /* the agf is preferred to be metadata */
@@ -196,8 +206,6 @@ typedef struct xfs_perag
196 __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ 206 __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
197 xfs_agino_t pagi_freecount; /* number of free inodes */ 207 xfs_agino_t pagi_freecount; /* number of free inodes */
198 xfs_agino_t pagi_count; /* number of allocated inodes */ 208 xfs_agino_t pagi_count; /* number of allocated inodes */
199 int pagb_count; /* pagb slots in use */
200 xfs_perag_busy_t *pagb_list; /* unstable blocks */
201 209
202 /* 210 /*
203 * Inode allocation search lookup optimisation. 211 * Inode allocation search lookup optimisation.
@@ -216,6 +224,8 @@ typedef struct xfs_perag
216 rwlock_t pag_ici_lock; /* incore inode lock */ 224 rwlock_t pag_ici_lock; /* incore inode lock */
217 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 225 struct radix_tree_root pag_ici_root; /* incore inode cache root */
218#endif 226#endif
227 int pagb_count; /* pagb slots in use */
228 xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
219} xfs_perag_t; 229} xfs_perag_t;
220 230
221/* 231/*
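
The xfs_ag.h hunk above does two things: it adds back-references to the perag (pag_mount, pag_agno, pag_ref), and it converts the busy-extent list from a separately allocated, word-size-tuned array into fixed storage embedded in the structure itself, so each AG costs one allocation fewer. A type-level sketch of that layout change, compiled with simplified stand-ins for the kernel types:

#include <stdio.h>

#define NUM_SLOTS 128

struct busy { int bno, len; };

struct perag_old {
	int count;
	struct busy *list;		/* allocated and freed separately */
};

struct perag_new {
	int count;
	struct busy list[NUM_SLOTS];	/* lives and dies with the perag */
};

int main(void)
{
	printf("old: %zu bytes plus a side array; new: %zu bytes, one allocation\n",
	       sizeof(struct perag_old), sizeof(struct perag_new));
	return 0;
}
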
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 2cf944eb796d..94cddbfb2560 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -38,6 +38,7 @@
38#include "xfs_ialloc.h" 38#include "xfs_ialloc.h"
39#include "xfs_alloc.h" 39#include "xfs_alloc.h"
40#include "xfs_error.h" 40#include "xfs_error.h"
41#include "xfs_trace.h"
41 42
42 43
43#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b))) 44#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
@@ -51,30 +52,6 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
51 xfs_agblock_t bno, 52 xfs_agblock_t bno,
52 xfs_extlen_t len); 53 xfs_extlen_t len);
53 54
54#if defined(XFS_ALLOC_TRACE)
55ktrace_t *xfs_alloc_trace_buf;
56
57#define TRACE_ALLOC(s,a) \
58 xfs_alloc_trace_alloc(__func__, s, a, __LINE__)
59#define TRACE_FREE(s,a,b,x,f) \
60 xfs_alloc_trace_free(__func__, s, mp, a, b, x, f, __LINE__)
61#define TRACE_MODAGF(s,a,f) \
62 xfs_alloc_trace_modagf(__func__, s, mp, a, f, __LINE__)
63#define TRACE_BUSY(__func__,s,ag,agb,l,sl,tp) \
64 xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
65#define TRACE_UNBUSY(__func__,s,ag,sl,tp) \
66 xfs_alloc_trace_busy(__func__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
67#define TRACE_BUSYSEARCH(__func__,s,ag,agb,l,tp) \
68 xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, 0, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
69#else
70#define TRACE_ALLOC(s,a)
71#define TRACE_FREE(s,a,b,x,f)
72#define TRACE_MODAGF(s,a,f)
73#define TRACE_BUSY(s,a,ag,agb,l,sl,tp)
74#define TRACE_UNBUSY(fname,s,ag,sl,tp)
75#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,tp)
76#endif /* XFS_ALLOC_TRACE */
77
78/* 55/*
79 * Prototypes for per-ag allocation routines 56 * Prototypes for per-ag allocation routines
80 */ 57 */
@@ -498,124 +475,6 @@ xfs_alloc_read_agfl(
 	return 0;
 }
 
-#if defined(XFS_ALLOC_TRACE)
-/*
- * Add an allocation trace entry for an alloc call.
- */
-STATIC void
-xfs_alloc_trace_alloc(
-	const char	*name,		/* function tag string */
-	char		*str,		/* additional string */
-	xfs_alloc_arg_t	*args,		/* allocation argument structure */
-	int		line)		/* source line number */
-{
-	ktrace_enter(xfs_alloc_trace_buf,
-		(void *)(__psint_t)(XFS_ALLOC_KTRACE_ALLOC | (line << 16)),
-		(void *)name,
-		(void *)str,
-		(void *)args->mp,
-		(void *)(__psunsigned_t)args->agno,
-		(void *)(__psunsigned_t)args->agbno,
-		(void *)(__psunsigned_t)args->minlen,
-		(void *)(__psunsigned_t)args->maxlen,
-		(void *)(__psunsigned_t)args->mod,
-		(void *)(__psunsigned_t)args->prod,
-		(void *)(__psunsigned_t)args->minleft,
-		(void *)(__psunsigned_t)args->total,
-		(void *)(__psunsigned_t)args->alignment,
-		(void *)(__psunsigned_t)args->len,
-		(void *)((((__psint_t)args->type) << 16) |
-			 (__psint_t)args->otype),
-		(void *)(__psint_t)((args->wasdel << 3) |
-				    (args->wasfromfl << 2) |
-				    (args->isfl << 1) |
-				    (args->userdata << 0)));
-}
-
-/*
- * Add an allocation trace entry for a free call.
- */
-STATIC void
-xfs_alloc_trace_free(
-	const char	*name,		/* function tag string */
-	char		*str,		/* additional string */
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_agnumber_t	agno,		/* allocation group number */
-	xfs_agblock_t	agbno,		/* a.g. relative block number */
-	xfs_extlen_t	len,		/* length of extent */
-	int		isfl,		/* set if is freelist allocation/free */
-	int		line)		/* source line number */
-{
-	ktrace_enter(xfs_alloc_trace_buf,
-		(void *)(__psint_t)(XFS_ALLOC_KTRACE_FREE | (line << 16)),
-		(void *)name,
-		(void *)str,
-		(void *)mp,
-		(void *)(__psunsigned_t)agno,
-		(void *)(__psunsigned_t)agbno,
-		(void *)(__psunsigned_t)len,
-		(void *)(__psint_t)isfl,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
-}
-
-/*
- * Add an allocation trace entry for modifying an agf.
- */
-STATIC void
-xfs_alloc_trace_modagf(
-	const char	*name,		/* function tag string */
-	char		*str,		/* additional string */
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_agf_t	*agf,		/* new agf value */
-	int		flags,		/* logging flags for agf */
-	int		line)		/* source line number */
-{
-	ktrace_enter(xfs_alloc_trace_buf,
-		(void *)(__psint_t)(XFS_ALLOC_KTRACE_MODAGF | (line << 16)),
-		(void *)name,
-		(void *)str,
-		(void *)mp,
-		(void *)(__psint_t)flags,
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_seqno),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_length),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_flfirst),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_fllast),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_flcount),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_freeblks),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_longest));
-}
-
-STATIC void
-xfs_alloc_trace_busy(
-	const char	*name,		/* function tag string */
-	char		*str,		/* additional string */
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_agnumber_t	agno,		/* allocation group number */
-	xfs_agblock_t	agbno,		/* a.g. relative block number */
-	xfs_extlen_t	len,		/* length of extent */
-	int		slot,		/* perag Busy slot */
-	xfs_trans_t	*tp,
-	int		trtype,		/* type: add, delete, search */
-	int		line)		/* source line number */
-{
-	ktrace_enter(xfs_alloc_trace_buf,
-		(void *)(__psint_t)(trtype | (line << 16)),
-		(void *)name,
-		(void *)str,
-		(void *)mp,
-		(void *)(__psunsigned_t)agno,
-		(void *)(__psunsigned_t)agbno,
-		(void *)(__psunsigned_t)len,
-		(void *)(__psint_t)slot,
-		(void *)tp,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL);
-}
-#endif	/* XFS_ALLOC_TRACE */
-
 /*
  * Allocation group level functions.
  */
@@ -665,9 +524,6 @@ xfs_alloc_ag_vextent(
 	 */
 	if (args->agbno != NULLAGBLOCK) {
 		xfs_agf_t	*agf;	/* allocation group freelist header */
-#ifdef XFS_ALLOC_TRACE
-		xfs_mount_t	*mp = args->mp;
-#endif
 		long		slen = (long)args->len;
 
 		ASSERT(args->len >= args->minlen && args->len <= args->maxlen);
@@ -682,7 +538,6 @@ xfs_alloc_ag_vextent(
 			args->pag->pagf_freeblks -= args->len;
 			ASSERT(be32_to_cpu(agf->agf_freeblks) <=
 				be32_to_cpu(agf->agf_length));
-			TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS);
 			xfs_alloc_log_agf(args->tp, args->agbp,
 						XFS_AGF_FREEBLKS);
 			/* search the busylist for these blocks */
@@ -792,13 +647,14 @@ xfs_alloc_ag_vextent_exact(
 	}
 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-	TRACE_ALLOC("normal", args);
+
+	trace_xfs_alloc_exact_done(args);
 	args->wasfromfl = 0;
 	return 0;
 
 error0:
 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
-	TRACE_ALLOC("error", args);
+	trace_xfs_alloc_exact_error(args);
 	return error;
 }
 
@@ -958,7 +814,7 @@ xfs_alloc_ag_vextent_near(
 			args->len = blen;
 			if (!xfs_alloc_fix_minleft(args)) {
 				xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-				TRACE_ALLOC("nominleft", args);
+				trace_xfs_alloc_near_nominleft(args);
 				return 0;
 			}
 			blen = args->len;
@@ -981,7 +837,8 @@ xfs_alloc_ag_vextent_near(
 			goto error0;
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
-		TRACE_ALLOC("first", args);
+
+		trace_xfs_alloc_near_first(args);
 		return 0;
 	}
 	/*
@@ -1272,7 +1129,7 @@ xfs_alloc_ag_vextent_near(
 	 * If we couldn't get anything, give up.
 	 */
 	if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
-		TRACE_ALLOC("neither", args);
+		trace_xfs_alloc_size_neither(args);
 		args->agbno = NULLAGBLOCK;
 		return 0;
 	}
@@ -1299,7 +1156,7 @@ xfs_alloc_ag_vextent_near(
 		args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 		xfs_alloc_fix_len(args);
 		if (!xfs_alloc_fix_minleft(args)) {
-			TRACE_ALLOC("nominleft", args);
+			trace_xfs_alloc_near_nominleft(args);
 			xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 			return 0;
@@ -1314,13 +1171,18 @@ xfs_alloc_ag_vextent_near(
 	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
 			ltnew, rlen, XFSA_FIXUP_BNO_OK)))
 		goto error0;
-	TRACE_ALLOC(j ? "gt" : "lt", args);
+
+	if (j)
+		trace_xfs_alloc_near_greater(args);
+	else
+		trace_xfs_alloc_near_lesser(args);
+
 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 	xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
 	return 0;
 
  error0:
-	TRACE_ALLOC("error", args);
+	trace_xfs_alloc_near_error(args);
 	if (cnt_cur != NULL)
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
 	if (bno_cur_lt != NULL)
@@ -1371,7 +1233,7 @@ xfs_alloc_ag_vextent_size(
 		goto error0;
 	if (i == 0 || flen == 0) {
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-		TRACE_ALLOC("noentry", args);
+		trace_xfs_alloc_size_noentry(args);
 		return 0;
 	}
 	ASSERT(i == 1);
@@ -1448,7 +1310,7 @@ xfs_alloc_ag_vextent_size(
 	xfs_alloc_fix_len(args);
 	if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) {
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-		TRACE_ALLOC("nominleft", args);
+		trace_xfs_alloc_size_nominleft(args);
 		args->agbno = NULLAGBLOCK;
 		return 0;
 	}
@@ -1471,11 +1333,11 @@ xfs_alloc_ag_vextent_size(
 		args->agbno + args->len <=
 			be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
 		error0);
-	TRACE_ALLOC("normal", args);
+	trace_xfs_alloc_size_done(args);
 	return 0;
 
 error0:
-	TRACE_ALLOC("error", args);
+	trace_xfs_alloc_size_error(args);
 	if (cnt_cur)
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
 	if (bno_cur)
@@ -1534,7 +1396,7 @@ xfs_alloc_ag_vextent_small(
 			be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
 			error0);
 		args->wasfromfl = 1;
-		TRACE_ALLOC("freelist", args);
+		trace_xfs_alloc_small_freelist(args);
 		*stat = 0;
 		return 0;
 	}
@@ -1556,17 +1418,17 @@ xfs_alloc_ag_vextent_small(
 	 */
 	if (flen < args->minlen) {
 		args->agbno = NULLAGBLOCK;
-		TRACE_ALLOC("notenough", args);
+		trace_xfs_alloc_small_notenough(args);
 		flen = 0;
 	}
 	*fbnop = fbno;
 	*flenp = flen;
 	*stat = 1;
-	TRACE_ALLOC("normal", args);
+	trace_xfs_alloc_small_done(args);
 	return 0;
 
 error0:
-	TRACE_ALLOC("error", args);
+	trace_xfs_alloc_small_error(args);
 	return error;
 }
 
@@ -1800,26 +1662,25 @@ xfs_free_ag_extent(
 		xfs_agf_t	*agf;
 		xfs_perag_t	*pag;	/* per allocation group data */
 
+		pag = xfs_perag_get(mp, agno);
+		pag->pagf_freeblks += len;
+		xfs_perag_put(pag);
+
 		agf = XFS_BUF_TO_AGF(agbp);
-		pag = &mp->m_perag[agno];
 		be32_add_cpu(&agf->agf_freeblks, len);
 		xfs_trans_agblocks_delta(tp, len);
-		pag->pagf_freeblks += len;
 		XFS_WANT_CORRUPTED_GOTO(
 			be32_to_cpu(agf->agf_freeblks) <=
 			be32_to_cpu(agf->agf_length),
 			error0);
-		TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS);
 		xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
 		if (!isfl)
 			xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
 		XFS_STATS_INC(xs_freex);
 		XFS_STATS_ADD(xs_freeb, len);
 	}
-	TRACE_FREE(haveleft ?
-		(haveright ? "both" : "left") :
-		(haveright ? "right" : "none"),
-		agno, bno, len, isfl);
+
+	trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
 
 	/*
 	 * Since blocks move to the free list without the coordination
@@ -1836,7 +1697,7 @@ xfs_free_ag_extent(
 	return 0;
 
  error0:
-	TRACE_FREE("error", agno, bno, len, isfl);
+	trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1);
 	if (bno_cur)
 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
 	if (cnt_cur)
@@ -2110,10 +1971,12 @@ xfs_alloc_get_freelist(
 	xfs_trans_brelse(tp, agflbp);
 	if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
 		agf->agf_flfirst = 0;
-	pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)];
+
+	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
 	be32_add_cpu(&agf->agf_flcount, -1);
 	xfs_trans_agflist_delta(tp, -1);
 	pag->pagf_flcount--;
+	xfs_perag_put(pag);
 
 	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
 	if (btreeblk) {
@@ -2122,7 +1985,6 @@ xfs_alloc_get_freelist(
 		logflags |= XFS_AGF_BTREEBLKS;
 	}
 
-	TRACE_MODAGF(NULL, agf, logflags);
 	xfs_alloc_log_agf(tp, agbp, logflags);
 	*bnop = bno;
 
@@ -2165,6 +2027,8 @@ xfs_alloc_log_agf(
 		sizeof(xfs_agf_t)
 	};
 
+	trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
+
 	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
 	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
 }
@@ -2218,7 +2082,8 @@ xfs_alloc_put_freelist(
 	be32_add_cpu(&agf->agf_fllast, 1);
 	if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
 		agf->agf_fllast = 0;
-	pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)];
+
+	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
 	be32_add_cpu(&agf->agf_flcount, 1);
 	xfs_trans_agflist_delta(tp, 1);
 	pag->pagf_flcount++;
@@ -2229,14 +2094,13 @@ xfs_alloc_put_freelist(
 		pag->pagf_btreeblks--;
 		logflags |= XFS_AGF_BTREEBLKS;
 	}
+	xfs_perag_put(pag);
 
-	TRACE_MODAGF(NULL, agf, logflags);
 	xfs_alloc_log_agf(tp, agbp, logflags);
 
 	ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
 	blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)];
 	*blockp = cpu_to_be32(bno);
-	TRACE_MODAGF(NULL, agf, logflags);
 	xfs_alloc_log_agf(tp, agbp, logflags);
 	xfs_trans_log_buf(tp, agflbp,
 		(int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl),
@@ -2294,7 +2158,6 @@ xfs_read_agf(
 		xfs_trans_brelse(tp, *bpp);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
-
 	XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF);
 	return 0;
 }
@@ -2317,7 +2180,7 @@ xfs_alloc_read_agf(
 	ASSERT(agno != NULLAGNUMBER);
 
 	error = xfs_read_agf(mp, tp, agno,
-			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0,
+			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
 			bpp);
 	if (error)
 		return error;
@@ -2326,7 +2189,7 @@ xfs_alloc_read_agf(
 	ASSERT(!XFS_BUF_GETERROR(*bpp));
 
 	agf = XFS_BUF_TO_AGF(*bpp);
-	pag = &mp->m_perag[agno];
+	pag = xfs_perag_get(mp, agno);
 	if (!pag->pagf_init) {
 		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
 		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
@@ -2337,8 +2200,8 @@ xfs_alloc_read_agf(
 		pag->pagf_levels[XFS_BTNUM_CNTi] =
 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
 		spin_lock_init(&pag->pagb_lock);
-		pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS *
-					sizeof(xfs_perag_busy_t), KM_SLEEP);
+		pag->pagb_count = 0;
+		memset(pag->pagb_list, 0, sizeof(pag->pagb_list));
 		pag->pagf_init = 1;
 	}
 #ifdef DEBUG
@@ -2353,6 +2216,7 @@ xfs_alloc_read_agf(
 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
 	}
 #endif
+	xfs_perag_put(pag);
 	return 0;
 }
 
@@ -2399,7 +2263,7 @@ xfs_alloc_vextent(
 	    args->minlen > args->maxlen || args->minlen > agsize ||
 	    args->mod >= args->prod) {
 		args->fsbno = NULLFSBLOCK;
-		TRACE_ALLOC("badargs", args);
+		trace_xfs_alloc_vextent_badargs(args);
 		return 0;
 	}
 	minleft = args->minleft;
@@ -2412,24 +2276,21 @@ xfs_alloc_vextent(
 		 * These three force us into a single a.g.
 		 */
 		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
-		down_read(&mp->m_peraglock);
-		args->pag = &mp->m_perag[args->agno];
+		args->pag = xfs_perag_get(mp, args->agno);
 		args->minleft = 0;
 		error = xfs_alloc_fix_freelist(args, 0);
 		args->minleft = minleft;
 		if (error) {
-			TRACE_ALLOC("nofix", args);
+			trace_xfs_alloc_vextent_nofix(args);
 			goto error0;
 		}
 		if (!args->agbp) {
-			up_read(&mp->m_peraglock);
-			TRACE_ALLOC("noagbp", args);
+			trace_xfs_alloc_vextent_noagbp(args);
 			break;
 		}
 		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
 		if ((error = xfs_alloc_ag_vextent(args)))
 			goto error0;
-		up_read(&mp->m_peraglock);
 		break;
 	case XFS_ALLOCTYPE_START_BNO:
 		/*
@@ -2481,14 +2342,13 @@ xfs_alloc_vextent(
 		 * Loop over allocation groups twice; first time with
 		 * trylock set, second time without.
 		 */
-		down_read(&mp->m_peraglock);
 		for (;;) {
-			args->pag = &mp->m_perag[args->agno];
+			args->pag = xfs_perag_get(mp, args->agno);
 			if (no_min) args->minleft = 0;
 			error = xfs_alloc_fix_freelist(args, flags);
 			args->minleft = minleft;
 			if (error) {
-				TRACE_ALLOC("nofix", args);
+				trace_xfs_alloc_vextent_nofix(args);
 				goto error0;
 			}
 			/*
@@ -2499,7 +2359,9 @@ xfs_alloc_vextent(
 					goto error0;
 				break;
 			}
-			TRACE_ALLOC("loopfailed", args);
+
+			trace_xfs_alloc_vextent_loopfailed(args);
+
 			/*
 			 * Didn't work, figure out the next iteration.
 			 */
@@ -2526,7 +2388,7 @@ xfs_alloc_vextent(
 			if (args->agno == sagno) {
 				if (no_min == 1) {
 					args->agbno = NULLAGBLOCK;
-					TRACE_ALLOC("allfailed", args);
+					trace_xfs_alloc_vextent_allfailed(args);
 					break;
 				}
 				if (flags == 0) {
@@ -2540,8 +2402,8 @@ xfs_alloc_vextent(
 				}
 			}
 		}
+		xfs_perag_put(args->pag);
 	}
-	up_read(&mp->m_peraglock);
 	if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
 		if (args->agno == sagno)
 			mp->m_agfrotor = (mp->m_agfrotor + 1) %
@@ -2567,9 +2429,10 @@ xfs_alloc_vextent(
 			args->len);
 #endif
 	}
+	xfs_perag_put(args->pag);
 	return 0;
 error0:
-	up_read(&mp->m_peraglock);
+	xfs_perag_put(args->pag);
 	return error;
 }
 
2575 2438
@@ -2594,8 +2457,7 @@ xfs_free_extent(
 	args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
 	ASSERT(args.agno < args.mp->m_sb.sb_agcount);
 	args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
-	down_read(&args.mp->m_peraglock);
-	args.pag = &args.mp->m_perag[args.agno];
+	args.pag = xfs_perag_get(args.mp, args.agno);
 	if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING)))
 		goto error0;
 #ifdef DEBUG
@@ -2605,7 +2467,7 @@ xfs_free_extent(
 #endif
 	error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
 error0:
-	up_read(&args.mp->m_peraglock);
+	xfs_perag_put(args.pag);
 	return error;
 }
 
@@ -2626,15 +2488,15 @@ xfs_alloc_mark_busy(xfs_trans_t *tp,
 	xfs_agblock_t	bno,
 	xfs_extlen_t	len)
 {
-	xfs_mount_t	*mp;
 	xfs_perag_busy_t *bsy;
+	struct xfs_perag *pag;
 	int		n;
 
-	mp = tp->t_mountp;
-	spin_lock(&mp->m_perag[agno].pagb_lock);
+	pag = xfs_perag_get(tp->t_mountp, agno);
+	spin_lock(&pag->pagb_lock);
 
 	/* search pagb_list for an open slot */
-	for (bsy = mp->m_perag[agno].pagb_list, n = 0;
+	for (bsy = pag->pagb_list, n = 0;
 	     n < XFS_PAGB_NUM_SLOTS;
 	     bsy++, n++) {
 		if (bsy->busy_tp == NULL) {
@@ -2642,16 +2504,16 @@ xfs_alloc_mark_busy(xfs_trans_t *tp,
 		}
 	}
 
+	trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n);
+
 	if (n < XFS_PAGB_NUM_SLOTS) {
-		bsy = &mp->m_perag[agno].pagb_list[n];
-		mp->m_perag[agno].pagb_count++;
-		TRACE_BUSY("xfs_alloc_mark_busy", "got", agno, bno, len, n, tp);
+		bsy = &pag->pagb_list[n];
+		pag->pagb_count++;
 		bsy->busy_start = bno;
 		bsy->busy_length = len;
 		bsy->busy_tp = tp;
 		xfs_trans_add_busy(tp, agno, n);
 	} else {
-		TRACE_BUSY("xfs_alloc_mark_busy", "FULL", agno, bno, len, -1, tp);
 		/*
 		 * The busy list is full!  Since it is now not possible to
 		 * track the free block, make this a synchronous transaction
@@ -2661,7 +2523,8 @@ xfs_alloc_mark_busy(xfs_trans_t *tp,
 		xfs_trans_set_sync(tp);
 	}
 
-	spin_unlock(&mp->m_perag[agno].pagb_lock);
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
 }
 
 void
@@ -2669,24 +2532,23 @@ xfs_alloc_clear_busy(xfs_trans_t *tp,
 	xfs_agnumber_t	agno,
 	int		idx)
 {
-	xfs_mount_t	*mp;
+	struct xfs_perag *pag;
 	xfs_perag_busy_t *list;
 
-	mp = tp->t_mountp;
+	ASSERT(idx < XFS_PAGB_NUM_SLOTS);
+	pag = xfs_perag_get(tp->t_mountp, agno);
+	spin_lock(&pag->pagb_lock);
+	list = pag->pagb_list;
 
-	spin_lock(&mp->m_perag[agno].pagb_lock);
-	list = mp->m_perag[agno].pagb_list;
+	trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp);
 
-	ASSERT(idx < XFS_PAGB_NUM_SLOTS);
 	if (list[idx].busy_tp == tp) {
-		TRACE_UNBUSY("xfs_alloc_clear_busy", "found", agno, idx, tp);
 		list[idx].busy_tp = NULL;
-		mp->m_perag[agno].pagb_count--;
-	} else {
-		TRACE_UNBUSY("xfs_alloc_clear_busy", "missing", agno, idx, tp);
+		pag->pagb_count--;
 	}
 
-	spin_unlock(&mp->m_perag[agno].pagb_lock);
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
 }
 
 
@@ -2700,48 +2562,44 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 	xfs_agblock_t	bno,
 	xfs_extlen_t	len)
 {
-	xfs_mount_t	*mp;
+	struct xfs_perag *pag;
 	xfs_perag_busy_t *bsy;
 	xfs_agblock_t	uend, bend;
-	xfs_lsn_t	lsn;
+	xfs_lsn_t	lsn = 0;
 	int		cnt;
 
-	mp = tp->t_mountp;
-
-	spin_lock(&mp->m_perag[agno].pagb_lock);
-	cnt = mp->m_perag[agno].pagb_count;
+	pag = xfs_perag_get(tp->t_mountp, agno);
+	spin_lock(&pag->pagb_lock);
+	cnt = pag->pagb_count;
 
+	/*
+	 * search pagb_list for this slot, skipping open slots. We have to
+	 * search the entire array as there may be multiple overlaps and
+	 * we have to get the most recent LSN for the log force to push out
+	 * all the transactions that span the range.
+	 */
 	uend = bno + len - 1;
-
-	/* search pagb_list for this slot, skipping open slots */
-	for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) {
-
-		/*
-		 * (start1,length1) within (start2, length2)
-		 */
-		if (bsy->busy_tp != NULL) {
-			bend = bsy->busy_start + bsy->busy_length - 1;
-			if ((bno > bend) || (uend < bsy->busy_start)) {
-				cnt--;
-			} else {
-				TRACE_BUSYSEARCH("xfs_alloc_search_busy",
-						 "found1", agno, bno, len, tp);
-				break;
-			}
-		}
+	for (cnt = 0; cnt < pag->pagb_count; cnt++) {
+		bsy = &pag->pagb_list[cnt];
+		if (!bsy->busy_tp)
+			continue;
+
+		bend = bsy->busy_start + bsy->busy_length - 1;
+		if (bno > bend || uend < bsy->busy_start)
+			continue;
+
+		/* (start1,length1) within (start2, length2) */
+		if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0)
+			lsn = bsy->busy_tp->t_commit_lsn;
 	}
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
+	trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
 
 	/*
 	 * If a block was found, force the log through the LSN of the
 	 * transaction that freed the block
 	 */
-	if (cnt) {
-		TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, tp);
-		lsn = bsy->busy_tp->t_commit_lsn;
-		spin_unlock(&mp->m_perag[agno].pagb_lock);
-		xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
-	} else {
-		TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, tp);
-		spin_unlock(&mp->m_perag[agno].pagb_lock);
-	}
+	if (lsn)
+		xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC);
 }
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index e704caee10df..599bffa39784 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -37,6 +37,15 @@ typedef enum xfs_alloctype
 	XFS_ALLOCTYPE_THIS_BNO	/* at exactly this block */
 } xfs_alloctype_t;
 
+#define XFS_ALLOC_TYPES \
+	{ XFS_ALLOCTYPE_ANY_AG,		"ANY_AG" }, \
+	{ XFS_ALLOCTYPE_FIRST_AG,	"FIRST_AG" }, \
+	{ XFS_ALLOCTYPE_START_AG,	"START_AG" }, \
+	{ XFS_ALLOCTYPE_THIS_AG,	"THIS_AG" }, \
+	{ XFS_ALLOCTYPE_START_BNO,	"START_BNO" }, \
+	{ XFS_ALLOCTYPE_NEAR_BNO,	"NEAR_BNO" }, \
+	{ XFS_ALLOCTYPE_THIS_BNO,	"THIS_BNO" }
+
 /*
  * Flags for xfs_alloc_fix_freelist.
  */
@@ -109,24 +118,6 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
 
 #ifdef __KERNEL__
 
-#if defined(XFS_ALLOC_TRACE)
-/*
- * Allocation tracing buffer size.
- */
-#define	XFS_ALLOC_TRACE_SIZE	4096
-extern ktrace_t *xfs_alloc_trace_buf;
-
-/*
- * Types for alloc tracing.
- */
-#define	XFS_ALLOC_KTRACE_ALLOC	1
-#define	XFS_ALLOC_KTRACE_FREE	2
-#define	XFS_ALLOC_KTRACE_MODAGF	3
-#define	XFS_ALLOC_KTRACE_BUSY	4
-#define	XFS_ALLOC_KTRACE_UNBUSY	5
-#define	XFS_ALLOC_KTRACE_BUSYSEARCH	6
-#endif
-
 void
 xfs_alloc_mark_busy(xfs_trans_t *tp,
 	xfs_agnumber_t	agno,
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index c10c3a292d30..b726e10d2c1c 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -39,6 +39,7 @@
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
+#include "xfs_trace.h"
 
 
 STATIC struct xfs_btree_cur *
@@ -60,12 +61,14 @@ xfs_allocbt_set_root(
 	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
 	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
 	int			btnum = cur->bc_btnum;
+	struct xfs_perag	*pag = xfs_perag_get(cur->bc_mp, seqno);
 
 	ASSERT(ptr->s != 0);
 
 	agf->agf_roots[btnum] = ptr->s;
 	be32_add_cpu(&agf->agf_levels[btnum], inc);
-	cur->bc_mp->m_perag[seqno].pagf_levels[btnum] += inc;
+	pag->pagf_levels[btnum] += inc;
+	xfs_perag_put(pag);
 
 	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
 }
@@ -149,6 +152,7 @@ xfs_allocbt_update_lastrec(
 {
 	struct xfs_agf		*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
 	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
+	struct xfs_perag	*pag;
 	__be32			len;
 	int			numrecs;
 
@@ -192,7 +196,9 @@ xfs_allocbt_update_lastrec(
 	}
 
 	agf->agf_longest = len;
-	cur->bc_mp->m_perag[seqno].pagf_longest = be32_to_cpu(len);
+	pag = xfs_perag_get(cur->bc_mp, seqno);
+	pag->pagf_longest = be32_to_cpu(len);
+	xfs_perag_put(pag);
 	xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
 }
 
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 4ece1906bd41..b9c196a53c42 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -47,6 +47,7 @@
 #include "xfs_trans_space.h"
 #include "xfs_rw.h"
 #include "xfs_vnodeops.h"
+#include "xfs_trace.h"
 
 /*
  * xfs_attr.c
@@ -89,19 +90,15 @@ STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args);
 
 #define ATTR_RMTVALUE_MAPSIZE	1	/* # of map entries at once */
 
-#if defined(XFS_ATTR_TRACE)
-ktrace_t *xfs_attr_trace_buf;
-#endif
-
 STATIC int
 xfs_attr_name_to_xname(
 	struct xfs_name	*xname,
-	const char	*aname)
+	const unsigned char *aname)
 {
 	if (!aname)
 		return EINVAL;
 	xname->name = aname;
-	xname->len = strlen(aname);
+	xname->len = strlen((char *)aname);
 	if (xname->len >= MAXNAMELEN)
 		return EFAULT;		/* match IRIX behaviour */
 
@@ -123,9 +120,13 @@ xfs_inode_hasattr(
  * Overall external interface routines.
  *========================================================================*/
 
-int
-xfs_attr_fetch(xfs_inode_t *ip, struct xfs_name *name,
-		char *value, int *valuelenp, int flags)
+STATIC int
+xfs_attr_get_int(
+	struct xfs_inode	*ip,
+	struct xfs_name		*name,
+	unsigned char		*value,
+	int			*valuelenp,
+	int			flags)
 {
 	xfs_da_args_t	args;
 	int		error;
@@ -170,8 +171,8 @@ xfs_attr_fetch(xfs_inode_t *ip, struct xfs_name *name,
 int
 xfs_attr_get(
 	xfs_inode_t	*ip,
-	const char	*name,
-	char		*value,
+	const unsigned char *name,
+	unsigned char	*value,
 	int		*valuelenp,
 	int		flags)
 {
@@ -188,7 +189,7 @@ xfs_attr_get(
 		return error;
 
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
-	error = xfs_attr_fetch(ip, &xname, value, valuelenp, flags);
+	error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 	return(error);
 }
@@ -196,7 +197,7 @@ xfs_attr_get(
 /*
  * Calculate how many blocks we need for the new attribute,
  */
-int
+STATIC int
 xfs_attr_calc_size(
 	struct xfs_inode	*ip,
 	int			namelen,
@@ -234,8 +235,12 @@ xfs_attr_calc_size(
 }
 
 STATIC int
-xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
-		char *value, int valuelen, int flags)
+xfs_attr_set_int(
+	struct xfs_inode	*dp,
+	struct xfs_name		*name,
+	unsigned char		*value,
+	int			valuelen,
+	int			flags)
 {
 	xfs_da_args_t	args;
 	xfs_fsblock_t	firstblock;
@@ -451,8 +456,8 @@ out:
 int
 xfs_attr_set(
 	xfs_inode_t	*dp,
-	const char	*name,
-	char		*value,
+	const unsigned char *name,
+	unsigned char	*value,
 	int		valuelen,
 	int		flags)
 {
@@ -599,7 +604,7 @@ out:
 int
 xfs_attr_remove(
 	xfs_inode_t	*dp,
-	const char	*name,
+	const unsigned char *name,
 	int		flags)
 {
 	int		error;
@@ -636,7 +641,6 @@ xfs_attr_list_int(xfs_attr_list_context_t *context)
 		return EIO;
 
 	xfs_ilock(dp, XFS_ILOCK_SHARED);
-	xfs_attr_trace_l_c("syscall start", context);
 
 	/*
 	 * Decide on what work routines to call based on the inode size.
@@ -652,7 +656,6 @@ xfs_attr_list_int(xfs_attr_list_context_t *context)
 	}
 
 	xfs_iunlock(dp, XFS_ILOCK_SHARED);
-	xfs_attr_trace_l_c("syscall end", context);
 
 	return error;
 }
@@ -670,9 +673,13 @@ xfs_attr_list_int(xfs_attr_list_context_t *context)
  */
 /*ARGSUSED*/
 STATIC int
-xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags,
-		     char *name, int namelen,
-		     int valuelen, char *value)
+xfs_attr_put_listent(
+	xfs_attr_list_context_t *context,
+	int		flags,
+	unsigned char	*name,
+	int		namelen,
+	int		valuelen,
+	unsigned char	*value)
 {
 	struct attrlist *alist = (struct attrlist *)context->alist;
 	attrlist_ent_t *aep;
@@ -698,7 +705,7 @@ xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags,
 			context->count * sizeof(alist->al_offset[0]);
 	context->firstu -= ATTR_ENTSIZE(namelen);
 	if (context->firstu < arraytop) {
-		xfs_attr_trace_l_c("buffer full", context);
+		trace_xfs_attr_list_full(context);
 		alist->al_more = 1;
 		context->seen_enough = 1;
 		return 1;
@@ -710,7 +717,7 @@ xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags,
 	aep->a_name[namelen] = 0;
 	alist->al_offset[context->count++] = context->firstu;
 	alist->al_count = context->count;
-	xfs_attr_trace_l_c("add", context);
+	trace_xfs_attr_list_add(context);
 	return 0;
 }
 
@@ -1849,7 +1856,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 	node = bp->data;
 	switch (be16_to_cpu(node->hdr.info.magic)) {
 	case XFS_DA_NODE_MAGIC:
-		xfs_attr_trace_l_cn("wrong blk", context, node);
+		trace_xfs_attr_list_wrong_blk(context);
 		xfs_da_brelse(NULL, bp);
 		bp = NULL;
 		break;
@@ -1857,20 +1864,18 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 		leaf = bp->data;
 		if (cursor->hashval > be32_to_cpu(leaf->entries[
 		    be16_to_cpu(leaf->hdr.count)-1].hashval)) {
-			xfs_attr_trace_l_cl("wrong blk",
-					    context, leaf);
+			trace_xfs_attr_list_wrong_blk(context);
 			xfs_da_brelse(NULL, bp);
 			bp = NULL;
 		} else if (cursor->hashval <=
 			   be32_to_cpu(leaf->entries[0].hashval)) {
-			xfs_attr_trace_l_cl("maybe wrong blk",
-					    context, leaf);
+			trace_xfs_attr_list_wrong_blk(context);
 			xfs_da_brelse(NULL, bp);
 			bp = NULL;
 		}
 		break;
 	default:
-		xfs_attr_trace_l_c("wrong blk - ??", context);
+		trace_xfs_attr_list_wrong_blk(context);
 		xfs_da_brelse(NULL, bp);
 		bp = NULL;
 	}
@@ -1915,8 +1920,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 			if (cursor->hashval
 					<= be32_to_cpu(btree->hashval)) {
 				cursor->blkno = be32_to_cpu(btree->before);
-				xfs_attr_trace_l_cb("descending",
-						    context, btree);
+				trace_xfs_attr_list_node_descend(context,
+								 btree);
 				break;
 			}
 		}
@@ -1983,7 +1988,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 	xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE];
 	xfs_mount_t *mp;
 	xfs_daddr_t dblkno;
-	xfs_caddr_t dst;
+	void *dst;
 	xfs_buf_t *bp;
 	int nmap, error, tmp, valuelen, blkcnt, i;
 	xfs_dablk_t lblkno;
@@ -2010,15 +2015,14 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 			dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
 			blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
 			error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
-					     blkcnt,
-					     XFS_BUF_LOCK | XBF_DONT_BLOCK,
+					     blkcnt, XBF_LOCK | XBF_DONT_BLOCK,
 					     &bp);
 			if (error)
 				return(error);
 
 			tmp = (valuelen < XFS_BUF_SIZE(bp))
 				? valuelen : XFS_BUF_SIZE(bp);
-			xfs_biomove(bp, 0, tmp, dst, XFS_B_READ);
+			xfs_biomove(bp, 0, tmp, dst, XBF_READ);
 			xfs_buf_relse(bp);
 			dst += tmp;
 			valuelen -= tmp;
@@ -2042,7 +2046,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 	xfs_inode_t *dp;
 	xfs_bmbt_irec_t map;
 	xfs_daddr_t dblkno;
-	xfs_caddr_t src;
+	void *src;
 	xfs_buf_t *bp;
 	xfs_dablk_t lblkno;
 	int blkcnt, valuelen, nmap, error, tmp, committed;
@@ -2143,14 +2147,14 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
 		blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
 
-		bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, blkcnt,
-				       XFS_BUF_LOCK | XBF_DONT_BLOCK);
+		bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt,
+				 XBF_LOCK | XBF_DONT_BLOCK);
 		ASSERT(bp);
 		ASSERT(!XFS_BUF_GETERROR(bp));
 
 		tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
 							XFS_BUF_SIZE(bp);
-		xfs_biomove(bp, 0, tmp, src, XFS_B_WRITE);
+		xfs_biomove(bp, 0, tmp, src, XBF_WRITE);
 		if (tmp < XFS_BUF_SIZE(bp))
 			xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
 		if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
@@ -2211,8 +2215,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 	/*
 	 * If the "remote" value is in the cache, remove it.
 	 */
-	bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt,
-			XFS_INCORE_TRYLOCK);
+	bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK);
 	if (bp) {
 		XFS_BUF_STALE(bp);
 		XFS_BUF_UNDELAYWRITE(bp);
@@ -2266,85 +2269,3 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 	}
 	return(0);
 }
-
-#if defined(XFS_ATTR_TRACE)
-/*
- * Add a trace buffer entry for an attr_list context structure.
- */
-void
-xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context)
-{
-	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where, context,
-		(__psunsigned_t)NULL,
-		(__psunsigned_t)NULL,
-		(__psunsigned_t)NULL);
-}
-
-/*
- * Add a trace buffer entry for a context structure and a Btree node.
- */
-void
-xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context,
-			 struct xfs_da_intnode *node)
-{
-	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where, context,
-		(__psunsigned_t)be16_to_cpu(node->hdr.count),
-		(__psunsigned_t)be32_to_cpu(node->btree[0].hashval),
-		(__psunsigned_t)be32_to_cpu(node->btree[
-			be16_to_cpu(node->hdr.count)-1].hashval));
-}
-
-/*
- * Add a trace buffer entry for a context structure and a Btree element.
- */
-void
-xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
-			  struct xfs_da_node_entry *btree)
-{
-	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where, context,
-		(__psunsigned_t)be32_to_cpu(btree->hashval),
-		(__psunsigned_t)be32_to_cpu(btree->before),
-		(__psunsigned_t)NULL);
-}
-
-/*
- * Add a trace buffer entry for a context structure and a leaf block.
- */
-void
-xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
-			      struct xfs_attr_leafblock *leaf)
-{
-	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where, context,
-		(__psunsigned_t)be16_to_cpu(leaf->hdr.count),
-		(__psunsigned_t)be32_to_cpu(leaf->entries[0].hashval),
-		(__psunsigned_t)be32_to_cpu(leaf->entries[
-				be16_to_cpu(leaf->hdr.count)-1].hashval));
-}
-
-/*
- * Add a trace buffer entry for the arguments given to the routine,
- * generic form.
- */
-void
-xfs_attr_trace_enter(int type, char *where,
-		struct xfs_attr_list_context *context,
-		__psunsigned_t a13, __psunsigned_t a14,
-		__psunsigned_t a15)
-{
-	ASSERT(xfs_attr_trace_buf);
-	ktrace_enter(xfs_attr_trace_buf, (void *)((__psunsigned_t)type),
-		(void *)((__psunsigned_t)where),
-		(void *)((__psunsigned_t)context->dp),
-		(void *)((__psunsigned_t)context->cursor->hashval),
-		(void *)((__psunsigned_t)context->cursor->blkno),
-		(void *)((__psunsigned_t)context->cursor->offset),
-		(void *)((__psunsigned_t)context->alist),
-		(void *)((__psunsigned_t)context->bufsize),
-		(void *)((__psunsigned_t)context->count),
-		(void *)((__psunsigned_t)context->firstu),
-		NULL,
-		(void *)((__psunsigned_t)context->dupcnt),
-		(void *)((__psunsigned_t)context->flags),
-		(void *)a13, (void *)a14, (void *)a15);
-}
-#endif	/* XFS_ATTR_TRACE */
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index fb3b2a68b9b9..e920d68ef509 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -48,6 +48,16 @@ struct xfs_attr_list_context;
 #define ATTR_KERNOTIME	0x1000	/* [kernel] don't update inode timestamps */
 #define ATTR_KERNOVAL	0x2000	/* [kernel] get attr size only, not value */
 
+#define XFS_ATTR_FLAGS \
+	{ ATTR_DONTFOLLOW,	"DONTFOLLOW" }, \
+	{ ATTR_ROOT,		"ROOT" }, \
+	{ ATTR_TRUST,		"TRUST" }, \
+	{ ATTR_SECURE,		"SECURE" }, \
+	{ ATTR_CREATE,		"CREATE" }, \
+	{ ATTR_REPLACE,		"REPLACE" }, \
+	{ ATTR_KERNOTIME,	"KERNOTIME" }, \
+	{ ATTR_KERNOVAL,	"KERNOVAL" }
+
 /*
  * The maximum size (into the kernel or returned from the kernel) of an
  * attribute value or the buffer used for an attr_list() call.  Larger
@@ -103,7 +113,7 @@ typedef struct attrlist_cursor_kern {
 
 
 typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
-			      char *, int, int, char *);
+			      unsigned char *, int, int, unsigned char *);
 
 typedef struct xfs_attr_list_context {
 	struct xfs_inode		*dp;		/* inode */
@@ -129,9 +139,7 @@ typedef struct xfs_attr_list_context {
 /*
  * Overall external interface routines.
  */
-int xfs_attr_calc_size(struct xfs_inode *, int, int, int *);
 int xfs_attr_inactive(struct xfs_inode *dp);
-int xfs_attr_fetch(struct xfs_inode *, struct xfs_name *, char *, int *, int);
 int xfs_attr_rmtval_get(struct xfs_da_args *args);
 int xfs_attr_list_int(struct xfs_attr_list_context *);
 
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index afdc8911637d..a90ce74fc256 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -42,6 +42,7 @@
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_error.h"
+#include "xfs_trace.h"
 
 /*
  * xfs_attr_leaf.c
@@ -98,7 +99,7 @@ STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
  * If namespace bits don't match return 0.
  * If all match then return 1.
  */
-STATIC_INLINE int
+STATIC int
 xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
 {
 	return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
@@ -520,11 +521,11 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
 
 	sfe = &sf->list[0];
 	for (i = 0; i < sf->hdr.count; i++) {
-		nargs.name = (char *)sfe->nameval;
+		nargs.name = sfe->nameval;
 		nargs.namelen = sfe->namelen;
-		nargs.value = (char *)&sfe->nameval[nargs.namelen];
+		nargs.value = &sfe->nameval[nargs.namelen];
 		nargs.valuelen = sfe->valuelen;
-		nargs.hashval = xfs_da_hashname((char *)sfe->nameval,
+		nargs.hashval = xfs_da_hashname(sfe->nameval,
 						sfe->namelen);
 		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
 		error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */
@@ -594,7 +595,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	cursor = context->cursor;
 	ASSERT(cursor != NULL);
 
-	xfs_attr_trace_l_c("sf start", context);
+	trace_xfs_attr_list_sf(context);
 
 	/*
 	 * If the buffer is large enough and the cursor is at the start,
@@ -611,10 +612,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 		for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
 			error = context->put_listent(context,
 					   sfe->flags,
-					   (char *)sfe->nameval,
+					   sfe->nameval,
 					   (int)sfe->namelen,
 					   (int)sfe->valuelen,
-					   (char*)&sfe->nameval[sfe->namelen]);
+					   &sfe->nameval[sfe->namelen]);
 
 			/*
 			 * Either search callback finished early or
@@ -627,7 +628,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 				return error;
 			sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
 		}
-		xfs_attr_trace_l_c("sf big-gulp", context);
+		trace_xfs_attr_list_sf_all(context);
 		return(0);
 	}
 
@@ -653,14 +654,13 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 			XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
 					     XFS_ERRLEVEL_LOW,
 					     context->dp->i_mount, sfe);
-			xfs_attr_trace_l_c("sf corrupted", context);
 			kmem_free(sbuf);
 			return XFS_ERROR(EFSCORRUPTED);
 		}
 
 		sbp->entno = i;
-		sbp->hash = xfs_da_hashname((char *)sfe->nameval, sfe->namelen);
-		sbp->name = (char *)sfe->nameval;
+		sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
+		sbp->name = sfe->nameval;
 		sbp->namelen = sfe->namelen;
 		/* These are bytes, and both on-disk, don't endian-flip */
 		sbp->valuelen = sfe->valuelen;
@@ -693,7 +693,6 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	}
 	if (i == nsbuf) {
 		kmem_free(sbuf);
-		xfs_attr_trace_l_c("blk end", context);
 		return(0);
 	}
 
@@ -719,7 +718,6 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	}
 
 	kmem_free(sbuf);
-	xfs_attr_trace_l_c("sf E-O-F", context);
 	return(0);
 }
 
@@ -820,9 +818,9 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
 			continue;
 		ASSERT(entry->flags & XFS_ATTR_LOCAL);
 		name_loc = xfs_attr_leaf_name_local(leaf, i);
-		nargs.name = (char *)name_loc->nameval;
+		nargs.name = name_loc->nameval;
 		nargs.namelen = name_loc->namelen;
-		nargs.value = (char *)&name_loc->nameval[nargs.namelen];
+		nargs.value = &name_loc->nameval[nargs.namelen];
 		nargs.valuelen = be16_to_cpu(name_loc->valuelen);
 		nargs.hashval = be32_to_cpu(entry->hashval);
 		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
@@ -2323,7 +2321,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 	cursor = context->cursor;
 	cursor->initted = 1;
 
-	xfs_attr_trace_l_cl("blk start", context, leaf);
+	trace_xfs_attr_list_leaf(context);
 
 	/*
 	 * Re-find our place in the leaf block if this is a new syscall.
@@ -2344,7 +2342,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 			}
 		}
 		if (i == be16_to_cpu(leaf->hdr.count)) {
-			xfs_attr_trace_l_c("not found", context);
+			trace_xfs_attr_list_notfound(context);
 			return(0);
 		}
 	} else {
@@ -2372,10 +2370,10 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 
 			retval = context->put_listent(context,
 						entry->flags,
-						(char *)name_loc->nameval,
+						name_loc->nameval,
 						(int)name_loc->namelen,
 						be16_to_cpu(name_loc->valuelen),
-						(char *)&name_loc->nameval[name_loc->namelen]);
+						&name_loc->nameval[name_loc->namelen]);
 			if (retval)
 				return retval;
 		} else {
@@ -2399,15 +2397,15 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2399 return retval; 2397 return retval;
2400 retval = context->put_listent(context, 2398 retval = context->put_listent(context,
2401 entry->flags, 2399 entry->flags,
2402 (char *)name_rmt->name, 2400 name_rmt->name,
2403 (int)name_rmt->namelen, 2401 (int)name_rmt->namelen,
2404 valuelen, 2402 valuelen,
2405 (char*)args.value); 2403 args.value);
2406 kmem_free(args.value); 2404 kmem_free(args.value);
2407 } else { 2405 } else {
2408 retval = context->put_listent(context, 2406 retval = context->put_listent(context,
2409 entry->flags, 2407 entry->flags,
2410 (char *)name_rmt->name, 2408 name_rmt->name,
2411 (int)name_rmt->namelen, 2409 (int)name_rmt->namelen,
2412 valuelen, 2410 valuelen,
2413 NULL); 2411 NULL);
@@ -2419,7 +2417,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2419 break; 2417 break;
2420 cursor->offset++; 2418 cursor->offset++;
2421 } 2419 }
2422 xfs_attr_trace_l_cl("blk end", context, leaf); 2420 trace_xfs_attr_list_leaf_end(context);
2423 return(retval); 2421 return(retval);
2424} 2422}
2425 2423
@@ -2952,7 +2950,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
2952 map.br_blockcount); 2950 map.br_blockcount);
2953 bp = xfs_trans_get_buf(*trans, 2951 bp = xfs_trans_get_buf(*trans,
2954 dp->i_mount->m_ddev_targp, 2952 dp->i_mount->m_ddev_targp,
2955 dblkno, dblkcnt, XFS_BUF_LOCK); 2953 dblkno, dblkcnt, XBF_LOCK);
2956 xfs_trans_binval(*trans, bp); 2954 xfs_trans_binval(*trans, bp);
2957 /* 2955 /*
2958 * Roll to next transaction. 2956 * Roll to next transaction.
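The hunks above convert xfs_attr_leaf.c from the old compile-time attribute-list tracing (xfs_attr_trace_l_c() and friends, which logged a free-form "where" string into a private ktrace ring buffer) to standard Linux tracepoints that can be enabled at runtime through debugfs. The event definitions themselves live in fs/xfs/xfs_trace.h, which is not part of this section of the diff; as a hedged sketch of the usual pattern only (the field list here is illustrative, not a copy of the real event class):

/*
 * Sketch only: one event class shared by the attr-list events, then one
 * DEFINE_EVENT per name used in the hunks above.  The real class in
 * fs/xfs/xfs_trace.h records more context fields than this.
 */
DECLARE_EVENT_CLASS(xfs_attr_list_class,
	TP_PROTO(struct xfs_attr_list_context *ctx),
	TP_ARGS(ctx),
	TP_STRUCT__entry(
		__field(xfs_ino_t, ino)
		__field(int, bufsize)
		__field(int, count)
	),
	TP_fast_assign(
		__entry->ino = ctx->dp->i_ino;
		__entry->bufsize = ctx->bufsize;
		__entry->count = ctx->count;
	),
	TP_printk("ino 0x%llx bufsize %d count %d",
		  __entry->ino, __entry->bufsize, __entry->count)
);

#define DEFINE_ATTR_LIST_EVENT(name) \
DEFINE_EVENT(xfs_attr_list_class, name, \
	TP_PROTO(struct xfs_attr_list_context *ctx), \
	TP_ARGS(ctx))
DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf);
DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf_end);
DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);

Each DEFINE_EVENT generates the trace_xfs_attr_list_*() inline called by the new code, so the call sites need no #ifdef scaffolding and cost one predicted-false branch when tracing is off.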
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h
index ea22839caed2..919756e3ba53 100644
--- a/fs/xfs/xfs_attr_sf.h
+++ b/fs/xfs/xfs_attr_sf.h
@@ -25,8 +25,6 @@
  * to fit into the literal area of the inode.
  */
 
-struct xfs_inode;
-
 /*
  * Entries are packed toward the top as tight as possible.
  */
@@ -54,7 +52,7 @@ typedef struct xfs_attr_sf_sort {
 	__uint8_t	valuelen;	/* length of value */
 	__uint8_t	flags;		/* flags bits (see xfs_attr_leaf.h) */
 	xfs_dahash_t	hash;		/* this entry's hash value */
-	char		*name;		/* name value, pointer into buffer */
+	unsigned char	*name;		/* name value, pointer into buffer */
 } xfs_attr_sf_sort_t;
 
 #define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen)	/* space name/value uses */ \
@@ -69,42 +67,4 @@ typedef struct xfs_attr_sf_sort {
 	(be16_to_cpu(((xfs_attr_shortform_t *)	\
 		((dp)->i_afp->if_u1.if_data))->hdr.totsize))
 
-#if defined(XFS_ATTR_TRACE)
-/*
- * Kernel tracing support for attribute lists
- */
-struct xfs_attr_list_context;
-struct xfs_da_intnode;
-struct xfs_da_node_entry;
-struct xfs_attr_leafblock;
-
-#define	XFS_ATTR_TRACE_SIZE	4096	/* size of global trace buffer */
-extern ktrace_t *xfs_attr_trace_buf;
-
-/*
- * Trace record types.
- */
-#define	XFS_ATTR_KTRACE_L_C	1	/* context */
-#define	XFS_ATTR_KTRACE_L_CN	2	/* context, node */
-#define	XFS_ATTR_KTRACE_L_CB	3	/* context, btree */
-#define	XFS_ATTR_KTRACE_L_CL	4	/* context, leaf */
-
-void xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context);
-void xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context,
-			 struct xfs_da_intnode *node);
-void xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
-			 struct xfs_da_node_entry *btree);
-void xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
-			 struct xfs_attr_leafblock *leaf);
-void xfs_attr_trace_enter(int type, char *where,
-			  struct xfs_attr_list_context *context,
-			  __psunsigned_t a13, __psunsigned_t a14,
-			  __psunsigned_t a15);
-#else
-#define	xfs_attr_trace_l_c(w,c)
-#define	xfs_attr_trace_l_cn(w,c,n)
-#define	xfs_attr_trace_l_cb(w,c,b)
-#define	xfs_attr_trace_l_cl(w,c,l)
-#endif /* XFS_ATTR_TRACE */
-
 #endif /* __XFS_ATTR_SF_H__ */
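Two independent cleanups land in xfs_attr_sf.h: the now-unused struct xfs_inode forward declaration and the whole XFS_ATTR_TRACE/ktrace block go away with the old tracing scheme, and the sort entry's name pointer becomes unsigned char * to match the rest of the attr code. Attribute names are raw byte strings, and with plain char (signed on most targets) any byte >= 0x80 sign-extends when widened, which is exactly the surprise a hash over the bytes wants to avoid. A toy, self-contained model of the hazard (this is not xfs_da_hashname(), which lives in xfs_da_btree.c):

#include <stdio.h>

/* same rotate-XOR skeleton, differing only in the pointer's signedness */
static unsigned int toy_hash(const char *name, int len)
{
	unsigned int hash = 0;
	for (; len > 0; len--, name++)
		hash = *name ^ ((hash << 7) | (hash >> 25)); /* *name may sign-extend */
	return hash;
}

static unsigned int toy_hash_u(const unsigned char *name, int len)
{
	unsigned int hash = 0;
	for (; len > 0; len--, name++)
		hash = *name ^ ((hash << 7) | (hash >> 25)); /* always 0..255 */
	return hash;
}

int main(void)
{
	unsigned char name[2] = { 0xc3, 0xa9 };	/* UTF-8 bytes of "e-acute" */

	printf("signed char:   %08x\n", toy_hash((const char *)name, 2));
	printf("unsigned char: %08x\n", toy_hash_u(name, 2));
	return 0;
}

On a signed-char target the two lines print different values; keeping every pointer in the name path unsigned removes the ambiguity, along with the (char *) casts deleted in the xfs_attr_leaf.c hunks above.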
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 8971fb09d387..5c11e4d17010 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -54,6 +54,7 @@
 #include "xfs_buf_item.h"
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
+#include "xfs_trace.h"
 
 
 #ifdef DEBUG
@@ -272,71 +273,6 @@ xfs_bmap_isaeof(
 	int		whichfork,	/* data or attribute fork */
 	char		*aeof);		/* return value */
 
-#ifdef XFS_BMAP_TRACE
-/*
- * Add bmap trace entry prior to a call to xfs_iext_remove.
- */
-STATIC void
-xfs_bmap_trace_delete(
-	const char	*fname,		/* function name */
-	char		*desc,		/* operation description */
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	idx,		/* index of entry(entries) deleted */
-	xfs_extnum_t	cnt,		/* count of entries deleted, 1 or 2 */
-	int		whichfork);	/* data or attr fork */
-
-/*
- * Add bmap trace entry prior to a call to xfs_iext_insert, or
- * reading in the extents list from the disk (in the btree).
- */
-STATIC void
-xfs_bmap_trace_insert(
-	const char	*fname,		/* function name */
-	char		*desc,		/* operation description */
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	idx,		/* index of entry(entries) inserted */
-	xfs_extnum_t	cnt,		/* count of entries inserted, 1 or 2 */
-	xfs_bmbt_irec_t	*r1,		/* inserted record 1 */
-	xfs_bmbt_irec_t	*r2,		/* inserted record 2 or null */
-	int		whichfork);	/* data or attr fork */
-
-/*
- * Add bmap trace entry after updating an extent record in place.
- */
-STATIC void
-xfs_bmap_trace_post_update(
-	const char	*fname,		/* function name */
-	char		*desc,		/* operation description */
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	idx,		/* index of entry updated */
-	int		whichfork);	/* data or attr fork */
-
-/*
- * Add bmap trace entry prior to updating an extent record in place.
- */
-STATIC void
-xfs_bmap_trace_pre_update(
-	const char	*fname,		/* function name */
-	char		*desc,		/* operation description */
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	idx,		/* index of entry to be updated */
-	int		whichfork);	/* data or attr fork */
-
-#define	XFS_BMAP_TRACE_DELETE(d,ip,i,c,w)	\
-	xfs_bmap_trace_delete(__func__,d,ip,i,c,w)
-#define	XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w)	\
-	xfs_bmap_trace_insert(__func__,d,ip,i,c,r1,r2,w)
-#define	XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w)	\
-	xfs_bmap_trace_post_update(__func__,d,ip,i,w)
-#define	XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w)	\
-	xfs_bmap_trace_pre_update(__func__,d,ip,i,w)
-#else
-#define	XFS_BMAP_TRACE_DELETE(d,ip,i,c,w)
-#define	XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w)
-#define	XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w)
-#define	XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w)
-#endif	/* XFS_BMAP_TRACE */
-
 /*
  * Compute the worst-case number of indirect blocks that will be used
  * for ip's delayed extent of length "len".
@@ -363,18 +299,6 @@ xfs_bmap_validate_ret(
 #define	xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
 #endif /* DEBUG */
 
-#if defined(XFS_RW_TRACE)
-STATIC void
-xfs_bunmap_trace(
-	xfs_inode_t	*ip,
-	xfs_fileoff_t	bno,
-	xfs_filblks_t	len,
-	int		flags,
-	inst_t		*ra);
-#else
-#define	xfs_bunmap_trace(ip, bno, len, flags, ra)
-#endif	/* XFS_RW_TRACE */
-
 STATIC int
 xfs_bmap_count_tree(
 	xfs_mount_t	*mp,
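These two removals delete the last of xfs_bmap.c's compile-time trace plumbing: the XFS_BMAP_TRACE prototypes and macros, and the XFS_RW_TRACE xfs_bunmap_trace() declaration. With tracepoints (pulled in by the new #include "xfs_trace.h"), a disabled event costs a single predicted-false branch at the call site, so there is no longer a build-configuration reason to stub the calls out. A self-contained userspace model of that idea, not kernel code:

#include <stdio.h>
#include <stdbool.h>

#define unlikely(x) __builtin_expect(!!(x), 0)

static bool trace_enabled;	/* the kernel flips its equivalent via debugfs */

/* stands in for a generated trace_xfs_bmap_pre_update() stub */
static inline void trace_bmap_pre_update(int idx, int state)
{
	if (unlikely(trace_enabled))
		fprintf(stderr, "bmap pre update: idx %d state 0x%x\n", idx, state);
}

int main(void)
{
	trace_bmap_pre_update(0, 0);	/* tracing off: just a compare-and-skip */
	trace_enabled = true;
	trace_bmap_pre_update(1, 0x5);	/* tracing on: the event fires */
	return 0;
}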
@@ -590,9 +514,9 @@ xfs_bmap_add_extent(
 	 * already extents in the list.
 	 */
 	if (nextents == 0) {
-		XFS_BMAP_TRACE_INSERT("insert empty", ip, 0, 1, new, NULL,
-			whichfork);
-		xfs_iext_insert(ifp, 0, 1, new);
+		xfs_iext_insert(ip, 0, 1, new,
+				whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
+
 		ASSERT(cur == NULL);
 		ifp->if_lastex = 0;
 		if (!isnullstartblock(new->br_startblock)) {
@@ -759,26 +683,10 @@ xfs_bmap_add_extent_delay_real(
 	xfs_filblks_t		temp=0;	/* value for dnew calculations */
 	xfs_filblks_t		temp2=0;/* value for dnew calculations */
 	int			tmp_rval;	/* partial logging flags */
-	enum {				/* bit number definitions for state */
-		LEFT_CONTIG,	RIGHT_CONTIG,
-		LEFT_FILLING,	RIGHT_FILLING,
-		LEFT_DELAY,	RIGHT_DELAY,
-		LEFT_VALID,	RIGHT_VALID
-	};
 
 #define	LEFT		r[0]
 #define	RIGHT		r[1]
 #define	PREV		r[2]
-#define	MASK(b)		(1 << (b))
-#define	MASK2(a,b)	(MASK(a) | MASK(b))
-#define	MASK3(a,b,c)	(MASK2(a,b) | MASK(c))
-#define	MASK4(a,b,c,d)	(MASK3(a,b,c) | MASK(d))
-#define	STATE_SET(b,v)	((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
-#define	STATE_TEST(b)	(state & MASK(b))
-#define	STATE_SET_TEST(b,v)	((v) ? ((state |= MASK(b)), 1) : \
-				       ((state &= ~MASK(b)), 0))
-#define	SWITCH_STATE		\
-	(state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG))
 
 	/*
 	 * Set up a bunch of variables to make the tests simpler.
@@ -790,69 +698,80 @@ xfs_bmap_add_extent_delay_real(
 	new_endoff = new->br_startoff + new->br_blockcount;
 	ASSERT(PREV.br_startoff <= new->br_startoff);
 	ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+
 	/*
 	 * Set flags determining what part of the previous delayed allocation
 	 * extent is being replaced by a real allocation.
 	 */
-	STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff);
-	STATE_SET(RIGHT_FILLING,
-		PREV.br_startoff + PREV.br_blockcount == new_endoff);
+	if (PREV.br_startoff == new->br_startoff)
+		state |= BMAP_LEFT_FILLING;
+	if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+		state |= BMAP_RIGHT_FILLING;
+
 	/*
 	 * Check and set flags if this segment has a left neighbor.
 	 * Don't set contiguous if the combined extent would be too large.
 	 */
-	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
+	if (idx > 0) {
+		state |= BMAP_LEFT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
-		STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock));
+
+		if (isnullstartblock(LEFT.br_startblock))
+			state |= BMAP_LEFT_DELAY;
 	}
-	STATE_SET(LEFT_CONTIG,
-		STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
-		LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
-		LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
-		LEFT.br_state == new->br_state &&
-		LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN);
+
+	if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+	    LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+	    LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+	    LEFT.br_state == new->br_state &&
+	    LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+		state |= BMAP_LEFT_CONTIG;
+
 	/*
 	 * Check and set flags if this segment has a right neighbor.
 	 * Don't set contiguous if the combined extent would be too large.
	 * Also check for all-three-contiguous being too large.
 	 */
-	if (STATE_SET_TEST(RIGHT_VALID,
-			idx <
-			ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
+	if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+		state |= BMAP_RIGHT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
-		STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock));
+
+		if (isnullstartblock(RIGHT.br_startblock))
+			state |= BMAP_RIGHT_DELAY;
 	}
-	STATE_SET(RIGHT_CONTIG,
-		STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
-		new_endoff == RIGHT.br_startoff &&
-		new->br_startblock + new->br_blockcount ==
-		    RIGHT.br_startblock &&
-		new->br_state == RIGHT.br_state &&
-		new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
-		((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) !=
-		 MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) ||
-		 LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
-		     <= MAXEXTLEN));
+
+	if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+	    new_endoff == RIGHT.br_startoff &&
+	    new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+	    new->br_state == RIGHT.br_state &&
+	    new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+	    ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
		       BMAP_RIGHT_FILLING)) !=
+		      (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
		       BMAP_RIGHT_FILLING) ||
+	     LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
			<= MAXEXTLEN))
+		state |= BMAP_RIGHT_CONTIG;
+
 	error = 0;
 	/*
 	 * Switch out based on the FILLING and CONTIG state bits.
 	 */
-	switch (SWITCH_STATE) {
-
-	case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
+	switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
			 BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
	     BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Filling in all of a previously delayed allocation extent.
 		 * The left and right neighbors are both contiguous with new.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC|RC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
 			LEFT.br_blockcount + PREV.br_blockcount +
 			RIGHT.br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC|RC", ip, idx - 1,
-			XFS_DATA_FORK);
-		XFS_BMAP_TRACE_DELETE("LF|RF|LC|RC", ip, idx, 2, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx, 2);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
+		xfs_iext_remove(ip, idx, 2, state);
 		ip->i_df.if_lastex = idx - 1;
 		ip->i_d.di_nextents--;
 		if (cur == NULL)
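From here on the patch's second big change shows up: each extent-merge function used to declare its own anonymous enum of bit numbers plus MASK()/STATE_SET()/SWITCH_STATE macros, and all of that is replaced by one set of BMAP_* flag bits shared across the file. The definitions are not in this part of the diff (they live in a shared XFS header); the values below are illustrative only, but the shape of the converted logic is exactly what the hunks show, and because the same word is handed to trace_xfs_bmap_pre_update() as the state argument, a trace consumer can decode which merge case ran:

#include <stdio.h>

/* illustrative values; the real BMAP_* flags live in an XFS header */
#define BMAP_LEFT_CONTIG	(1 << 0)
#define BMAP_RIGHT_CONTIG	(1 << 1)
#define BMAP_LEFT_FILLING	(1 << 2)
#define BMAP_RIGHT_FILLING	(1 << 3)

int main(void)
{
	int state = 0;

	/* same pattern as the converted code above */
	state |= BMAP_LEFT_FILLING;
	state |= BMAP_LEFT_CONTIG;

	switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
			 BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
		printf("filling first part, left neighbor contiguous\n");
		break;
	default:
		printf("some other case: 0x%x\n", state);
		break;
	}
	return 0;
}

Open-coding the |= and & tests is slightly more verbose than STATE_SET()/STATE_TEST(), but it is grep-able, and the case labels become plain constant expressions instead of MASK2/MASK3/MASK4 spellings.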
@@ -885,20 +804,18 @@ xfs_bmap_add_extent_delay_real(
 			RIGHT.br_blockcount;
 		break;
 
-	case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
 		/*
 		 * Filling in all of a previously delayed allocation extent.
 		 * The left neighbor is contiguous, the right is not.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
 			LEFT.br_blockcount + PREV.br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx - 1;
-		XFS_BMAP_TRACE_DELETE("LF|RF|LC", ip, idx, 1, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx, 1);
+		xfs_iext_remove(ip, idx, 1, state);
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
@@ -921,19 +838,19 @@ xfs_bmap_add_extent_delay_real(
 			PREV.br_blockcount;
 		break;
 
-	case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Filling in all of a previously delayed allocation extent.
 		 * The right neighbor is contiguous, the left is not.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|RC", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep, new->br_startblock);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount + RIGHT.br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF|RC", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx;
-		XFS_BMAP_TRACE_DELETE("LF|RF|RC", ip, idx + 1, 1, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx + 1, 1);
+		xfs_iext_remove(ip, idx + 1, 1, state);
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
@@ -956,15 +873,16 @@ xfs_bmap_add_extent_delay_real(
 			RIGHT.br_blockcount;
 		break;
 
-	case MASK2(LEFT_FILLING, RIGHT_FILLING):
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
 		/*
 		 * Filling in all of a previously delayed allocation extent.
 		 * Neither the left nor right neighbors are contiguous with
 		 * the new one.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep, new->br_startblock);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
@@ -987,19 +905,20 @@ xfs_bmap_add_extent_delay_real(
 		temp2 = new->br_blockcount;
 		break;
 
-	case MASK2(LEFT_FILLING, LEFT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
 		/*
 		 * Filling in the first part of a previous delayed allocation.
 		 * The left neighbor is contiguous.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx - 1, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
 			LEFT.br_blockcount + new->br_blockcount);
 		xfs_bmbt_set_startoff(ep,
 			PREV.br_startoff + new->br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx - 1, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
 		temp = PREV.br_blockcount - new->br_blockcount;
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
 		ip->i_df.if_lastex = idx - 1;
 		if (cur == NULL)
@@ -1021,7 +940,7 @@ xfs_bmap_add_extent_delay_real(
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 			startblockval(PREV.br_startblock));
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		*dnew = temp;
 		/* DELTA: The boundary between two in-core extents moved. */
 		temp = LEFT.br_startoff;
@@ -1029,18 +948,16 @@ xfs_bmap_add_extent_delay_real(
 			PREV.br_blockcount;
 		break;
 
-	case MASK(LEFT_FILLING):
+	case BMAP_LEFT_FILLING:
 		/*
 		 * Filling in the first part of a previous delayed allocation.
 		 * The left neighbor is not contiguous.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_startoff(ep, new_endoff);
 		temp = PREV.br_blockcount - new->br_blockcount;
 		xfs_bmbt_set_blockcount(ep, temp);
-		XFS_BMAP_TRACE_INSERT("LF", ip, idx, 1, new, NULL,
-			XFS_DATA_FORK);
-		xfs_iext_insert(ifp, idx, 1, new);
+		xfs_iext_insert(ip, idx, 1, new, state);
 		ip->i_df.if_lastex = idx;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
@@ -1071,27 +988,27 @@ xfs_bmap_add_extent_delay_real(
 			(cur ? cur->bc_private.b.allocated : 0));
 		ep = xfs_iext_get_ext(ifp, idx + 1);
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx + 1, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
 		*dnew = temp;
 		/* DELTA: One in-core extent is split in two. */
 		temp = PREV.br_startoff;
 		temp2 = PREV.br_blockcount;
 		break;
 
-	case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
+	case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Filling in the last part of a previous delayed allocation.
 		 * The right neighbor is contiguous with the new allocation.
 		 */
 		temp = PREV.br_blockcount - new->br_blockcount;
-		XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK);
-		XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx + 1, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
 		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1),
 			new->br_startoff, new->br_startblock,
 			new->br_blockcount + RIGHT.br_blockcount,
 			RIGHT.br_state);
-		XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx + 1, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
 		ip->i_df.if_lastex = idx + 1;
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
@@ -1112,7 +1029,7 @@ xfs_bmap_add_extent_delay_real(
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 			startblockval(PREV.br_startblock));
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		*dnew = temp;
 		/* DELTA: The boundary between two in-core extents moved. */
 		temp = PREV.br_startoff;
@@ -1120,17 +1037,15 @@ xfs_bmap_add_extent_delay_real(
 			RIGHT.br_blockcount;
 		break;
 
-	case MASK(RIGHT_FILLING):
+	case BMAP_RIGHT_FILLING:
 		/*
 		 * Filling in the last part of a previous delayed allocation.
 		 * The right neighbor is not contiguous.
 		 */
 		temp = PREV.br_blockcount - new->br_blockcount;
-		XFS_BMAP_TRACE_PRE_UPDATE("RF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
-		XFS_BMAP_TRACE_INSERT("RF", ip, idx + 1, 1, new, NULL,
-			XFS_DATA_FORK);
-		xfs_iext_insert(ifp, idx + 1, 1, new);
+		xfs_iext_insert(ip, idx + 1, 1, new, state);
 		ip->i_df.if_lastex = idx + 1;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
@@ -1161,7 +1076,7 @@ xfs_bmap_add_extent_delay_real(
 			(cur ? cur->bc_private.b.allocated : 0));
 		ep = xfs_iext_get_ext(ifp, idx);
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		*dnew = temp;
 		/* DELTA: One in-core extent is split in two. */
 		temp = PREV.br_startoff;
@@ -1175,7 +1090,7 @@ xfs_bmap_add_extent_delay_real(
 		 * This case is avoided almost all the time.
 		 */
 		temp = new->br_startoff - PREV.br_startoff;
-		XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
 		r[0] = *new;
 		r[1].br_state = PREV.br_state;
@@ -1183,9 +1098,7 @@ xfs_bmap_add_extent_delay_real(
 		r[1].br_startoff = new_endoff;
 		temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
 		r[1].br_blockcount = temp2;
-		XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 2, &r[0], &r[1],
-			XFS_DATA_FORK);
-		xfs_iext_insert(ifp, idx + 1, 2, &r[0]);
+		xfs_iext_insert(ip, idx + 1, 2, &r[0], state);
 		ip->i_df.if_lastex = idx + 1;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
@@ -1242,24 +1155,24 @@ xfs_bmap_add_extent_delay_real(
 		}
 		ep = xfs_iext_get_ext(ifp, idx);
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK);
-		XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx + 2, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, idx + 2, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2),
 			nullstartblock((int)temp2));
-		XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx + 2, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_);
 		*dnew = temp + temp2;
 		/* DELTA: One in-core extent is split in three. */
 		temp = PREV.br_startoff;
 		temp2 = PREV.br_blockcount;
 		break;
 
-	case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
-	case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
-	case MASK2(LEFT_FILLING, RIGHT_CONTIG):
-	case MASK2(RIGHT_FILLING, LEFT_CONTIG):
-	case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
-	case MASK(LEFT_CONTIG):
-	case MASK(RIGHT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_LEFT_CONTIG:
+	case BMAP_RIGHT_CONTIG:
 		/*
 		 * These cases are all impossible.
 		 */
@@ -1279,14 +1192,6 @@ done:
 #undef	LEFT
 #undef	RIGHT
 #undef	PREV
-#undef	MASK
-#undef	MASK2
-#undef	MASK3
-#undef	MASK4
-#undef	STATE_SET
-#undef	STATE_TEST
-#undef	STATE_SET_TEST
-#undef	SWITCH_STATE
 }
 
 /*
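Note also the quieter signature change threaded through every case above: xfs_iext_remove()/xfs_iext_insert() now take the xfs_inode and the state word instead of the raw fork pointer. That lets the helper pick the fork itself (via the BMAP_ATTRFORK bit seen earlier in xfs_bmap_add_extent()) and emit its own trace event, so every caller sheds a trace macro. A hypothetical reconstruction of the new helper's shape, for illustration only; the real definition lives elsewhere in the XFS tree, outside this section:

/*
 * Sketch, not the actual implementation.
 */
void
xfs_iext_remove(
	struct xfs_inode	*ip,		/* was: xfs_ifork_t *ifp */
	xfs_extnum_t		idx,		/* first extent to remove */
	int			ext_diff,	/* number of extents removed */
	int			state)		/* BMAP_* flags from the caller */
{
	struct xfs_ifork	*ifp = (state & BMAP_ATTRFORK) ?
						ip->i_afp : &ip->i_df;

	trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
	/* ...the pre-existing extent-array shuffling, unchanged... */
}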
@@ -1316,27 +1221,10 @@ xfs_bmap_add_extent_unwritten_real(
 	int		state = 0;/* state bits, accessed thru macros */
 	xfs_filblks_t	temp=0;
 	xfs_filblks_t	temp2=0;
-	enum {				/* bit number definitions for state */
-		LEFT_CONTIG,	RIGHT_CONTIG,
-		LEFT_FILLING,	RIGHT_FILLING,
-		LEFT_DELAY,	RIGHT_DELAY,
-		LEFT_VALID,	RIGHT_VALID
-	};
 
 #define	LEFT		r[0]
 #define	RIGHT		r[1]
 #define	PREV		r[2]
-#define	MASK(b)		(1 << (b))
-#define	MASK2(a,b)	(MASK(a) | MASK(b))
-#define	MASK3(a,b,c)	(MASK2(a,b) | MASK(c))
-#define	MASK4(a,b,c,d)	(MASK3(a,b,c) | MASK(d))
-#define	STATE_SET(b,v)	((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
-#define	STATE_TEST(b)	(state & MASK(b))
-#define	STATE_SET_TEST(b,v)	((v) ? ((state |= MASK(b)), 1) : \
-				       ((state &= ~MASK(b)), 0))
-#define	SWITCH_STATE		\
-	(state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG))
-
 	/*
 	 * Set up a bunch of variables to make the tests simpler.
 	 */
@@ -1352,68 +1240,78 @@ xfs_bmap_add_extent_unwritten_real(
 	new_endoff = new->br_startoff + new->br_blockcount;
 	ASSERT(PREV.br_startoff <= new->br_startoff);
 	ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+
 	/*
 	 * Set flags determining what part of the previous oldext allocation
 	 * extent is being replaced by a newext allocation.
 	 */
-	STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff);
-	STATE_SET(RIGHT_FILLING,
-		PREV.br_startoff + PREV.br_blockcount == new_endoff);
+	if (PREV.br_startoff == new->br_startoff)
+		state |= BMAP_LEFT_FILLING;
+	if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+		state |= BMAP_RIGHT_FILLING;
+
 	/*
 	 * Check and set flags if this segment has a left neighbor.
 	 * Don't set contiguous if the combined extent would be too large.
 	 */
-	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
+	if (idx > 0) {
+		state |= BMAP_LEFT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
-		STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock));
+
+		if (isnullstartblock(LEFT.br_startblock))
+			state |= BMAP_LEFT_DELAY;
 	}
-	STATE_SET(LEFT_CONTIG,
-		STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
-		LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
-		LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
-		LEFT.br_state == newext &&
-		LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN);
+
+	if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+	    LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+	    LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+	    LEFT.br_state == newext &&
+	    LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+		state |= BMAP_LEFT_CONTIG;
+
 	/*
 	 * Check and set flags if this segment has a right neighbor.
 	 * Don't set contiguous if the combined extent would be too large.
 	 * Also check for all-three-contiguous being too large.
 	 */
-	if (STATE_SET_TEST(RIGHT_VALID,
-			idx <
-			ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
+	if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+		state |= BMAP_RIGHT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
-		STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock));
+		if (isnullstartblock(RIGHT.br_startblock))
+			state |= BMAP_RIGHT_DELAY;
 	}
-	STATE_SET(RIGHT_CONTIG,
-		STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
-		new_endoff == RIGHT.br_startoff &&
-		new->br_startblock + new->br_blockcount ==
-		    RIGHT.br_startblock &&
-		newext == RIGHT.br_state &&
-		new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
-		((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) !=
-		 MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) ||
-		 LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
-		     <= MAXEXTLEN));
+
+	if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+	    new_endoff == RIGHT.br_startoff &&
+	    new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+	    newext == RIGHT.br_state &&
+	    new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+	    ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
		       BMAP_RIGHT_FILLING)) !=
+		      (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
		       BMAP_RIGHT_FILLING) ||
+	     LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
			<= MAXEXTLEN))
+		state |= BMAP_RIGHT_CONTIG;
+
 	/*
 	 * Switch out based on the FILLING and CONTIG state bits.
 	 */
-	switch (SWITCH_STATE) {
-
-	case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
+	switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
			 BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
	     BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Setting all of a previous oldext extent to newext.
 		 * The left and right neighbors are both contiguous with new.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC|RC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
 			LEFT.br_blockcount + PREV.br_blockcount +
 			RIGHT.br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC|RC", ip, idx - 1,
-			XFS_DATA_FORK);
-		XFS_BMAP_TRACE_DELETE("LF|RF|LC|RC", ip, idx, 2, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx, 2);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
+		xfs_iext_remove(ip, idx, 2, state);
 		ip->i_df.if_lastex = idx - 1;
 		ip->i_d.di_nextents -= 2;
 		if (cur == NULL)
@@ -1450,20 +1348,18 @@ xfs_bmap_add_extent_unwritten_real(
 			RIGHT.br_blockcount;
 		break;
 
-	case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
 		/*
 		 * Setting all of a previous oldext extent to newext.
 		 * The left neighbor is contiguous, the right is not.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
 			LEFT.br_blockcount + PREV.br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx - 1;
-		XFS_BMAP_TRACE_DELETE("LF|RF|LC", ip, idx, 1, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx, 1);
+		xfs_iext_remove(ip, idx, 1, state);
 		ip->i_d.di_nextents--;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1492,21 +1388,18 @@ xfs_bmap_add_extent_unwritten_real(
 			PREV.br_blockcount;
 		break;
 
-	case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Setting all of a previous oldext extent to newext.
 		 * The right neighbor is contiguous, the left is not.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|RC", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount + RIGHT.br_blockcount);
 		xfs_bmbt_set_state(ep, newext);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF|RC", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		ip->i_df.if_lastex = idx;
-		XFS_BMAP_TRACE_DELETE("LF|RF|RC", ip, idx + 1, 1, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx + 1, 1);
+		xfs_iext_remove(ip, idx + 1, 1, state);
 		ip->i_d.di_nextents--;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1535,17 +1428,16 @@ xfs_bmap_add_extent_unwritten_real(
 			RIGHT.br_blockcount;
 		break;
 
-	case MASK2(LEFT_FILLING, RIGHT_FILLING):
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
 		/*
 		 * Setting all of a previous oldext extent to newext.
 		 * Neither the left nor right neighbors are contiguous with
 		 * the new one.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_state(ep, newext);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx;
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
@@ -1566,27 +1458,25 @@ xfs_bmap_add_extent_unwritten_real(
 		temp2 = new->br_blockcount;
 		break;
 
-	case MASK2(LEFT_FILLING, LEFT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
 		/*
 		 * Setting the first part of a previous oldext extent to newext.
 		 * The left neighbor is contiguous.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
 			LEFT.br_blockcount + new->br_blockcount);
 		xfs_bmbt_set_startoff(ep,
 			PREV.br_startoff + new->br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx - 1,
-			XFS_DATA_FORK);
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep,
 			new->br_startblock + new->br_blockcount);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount - new->br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx - 1;
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
@@ -1617,22 +1507,21 @@ xfs_bmap_add_extent_unwritten_real(
 			PREV.br_blockcount;
 		break;
 
-	case MASK(LEFT_FILLING):
+	case BMAP_LEFT_FILLING:
 		/*
 		 * Setting the first part of a previous oldext extent to newext.
 		 * The left neighbor is not contiguous.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
 		xfs_bmbt_set_startoff(ep, new_endoff);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount - new->br_blockcount);
 		xfs_bmbt_set_startblock(ep,
 			new->br_startblock + new->br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx, XFS_DATA_FORK);
-		XFS_BMAP_TRACE_INSERT("LF", ip, idx, 1, new, NULL,
-			XFS_DATA_FORK);
-		xfs_iext_insert(ifp, idx, 1, new);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
+		xfs_iext_insert(ip, idx, 1, new, state);
 		ip->i_df.if_lastex = idx;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
@@ -1660,24 +1549,21 @@ xfs_bmap_add_extent_unwritten_real(
 		temp2 = PREV.br_blockcount;
 		break;
 
-	case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
+	case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Setting the last part of a previous oldext extent to newext.
 		 * The right neighbor is contiguous with the new allocation.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx,
-			XFS_DATA_FORK);
-		XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx + 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount - new->br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1),
 			new->br_startoff, new->br_startblock,
 			new->br_blockcount + RIGHT.br_blockcount, newext);
-		XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx + 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx + 1;
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
@@ -1707,18 +1593,17 @@ xfs_bmap_add_extent_unwritten_real(
 			RIGHT.br_blockcount;
 		break;
 
-	case MASK(RIGHT_FILLING):
+	case BMAP_RIGHT_FILLING:
 		/*
 		 * Setting the last part of a previous oldext extent to newext.
 		 * The right neighbor is not contiguous.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("RF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount - new->br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK);
-		XFS_BMAP_TRACE_INSERT("RF", ip, idx + 1, 1, new, NULL,
-			XFS_DATA_FORK);
-		xfs_iext_insert(ifp, idx + 1, 1, new);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
+		xfs_iext_insert(ip, idx + 1, 1, new, state);
 		ip->i_df.if_lastex = idx + 1;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
@@ -1756,19 +1641,18 @@ xfs_bmap_add_extent_unwritten_real(
 		 * newext.  Contiguity is impossible here.
 		 * One extent becomes three extents.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep,
 			new->br_startoff - PREV.br_startoff);
-		XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
 		r[0] = *new;
 		r[1].br_startoff = new_endoff;
 		r[1].br_blockcount =
 			PREV.br_startoff + PREV.br_blockcount - new_endoff;
 		r[1].br_startblock = new->br_startblock + new->br_blockcount;
 		r[1].br_state = oldext;
-		XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 2, &r[0], &r[1],
-			XFS_DATA_FORK);
-		xfs_iext_insert(ifp, idx + 1, 2, &r[0]);
+		xfs_iext_insert(ip, idx + 1, 2, &r[0], state);
 		ip->i_df.if_lastex = idx + 1;
 		ip->i_d.di_nextents += 2;
 		if (cur == NULL)
@@ -1813,13 +1697,13 @@ xfs_bmap_add_extent_unwritten_real(
 		temp2 = PREV.br_blockcount;
 		break;
 
-	case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
-	case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
-	case MASK2(LEFT_FILLING, RIGHT_CONTIG):
-	case MASK2(RIGHT_FILLING, LEFT_CONTIG):
-	case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
-	case MASK(LEFT_CONTIG):
-	case MASK(RIGHT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_LEFT_CONTIG:
+	case BMAP_RIGHT_CONTIG:
 		/*
 		 * These cases are all impossible.
 		 */
@@ -1839,14 +1723,6 @@ done:
 #undef	LEFT
 #undef	RIGHT
 #undef	PREV
-#undef	MASK
-#undef	MASK2
-#undef	MASK3
-#undef	MASK4
-#undef	STATE_SET
-#undef	STATE_TEST
-#undef	STATE_SET_TEST
-#undef	SWITCH_STATE
 }
 
 /*
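One last detail worth calling out before the hole-delay conversion below: the old macros carried strings like "LF|RF|LC" purely to identify which call site fired; the tracepoints pass _THIS_IP_ instead, the address of the call site itself, which tooling can resolve back to a function+offset through kallsyms. _THIS_IP_ is a stock kernel macro built on GCC's labels-as-values extension; a self-contained model of the trick:

#include <stdio.h>

/* same trick as the kernel's _THIS_IP_ in include/linux/kernel.h (GNU C) */
#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; })

static void trace_event(const char *what, unsigned long caller_ip)
{
	printf("%-11s at ip 0x%lx\n", what, caller_ip);
}

int main(void)
{
	trace_event("pre update", _THIS_IP_);	/* two call sites give */
	trace_event("post update", _THIS_IP_);	/* two distinct addresses */
	return 0;
}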
@@ -1872,62 +1748,57 @@ xfs_bmap_add_extent_hole_delay(
 	int		state;  /* state bits, accessed thru macros */
 	xfs_filblks_t	temp=0;	/* temp for indirect calculations */
 	xfs_filblks_t	temp2=0;
-	enum {				/* bit number definitions for state */
-		LEFT_CONTIG,	RIGHT_CONTIG,
-		LEFT_DELAY,	RIGHT_DELAY,
-		LEFT_VALID,	RIGHT_VALID
-	};
-
-#define	MASK(b)			(1 << (b))
-#define	MASK2(a,b)		(MASK(a) | MASK(b))
-#define	STATE_SET(b,v)		((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
-#define	STATE_TEST(b)		(state & MASK(b))
-#define	STATE_SET_TEST(b,v)	((v) ? ((state |= MASK(b)), 1) : \
-				       ((state &= ~MASK(b)), 0))
-#define	SWITCH_STATE		(state & MASK2(LEFT_CONTIG, RIGHT_CONTIG))
 
 	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 	ep = xfs_iext_get_ext(ifp, idx);
 	state = 0;
 	ASSERT(isnullstartblock(new->br_startblock));
+
 	/*
 	 * Check and set flags if this segment has a left neighbor
 	 */
-	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
+	if (idx > 0) {
+		state |= BMAP_LEFT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
-		STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock));
+
+		if (isnullstartblock(left.br_startblock))
+			state |= BMAP_LEFT_DELAY;
 	}
+
 	/*
 	 * Check and set flags if the current (right) segment exists.
 	 * If it doesn't exist, we're converting the hole at end-of-file.
 	 */
-	if (STATE_SET_TEST(RIGHT_VALID,
-			   idx <
-			   ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
+	if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+		state |= BMAP_RIGHT_VALID;
 		xfs_bmbt_get_all(ep, &right);
-		STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock));
+
+		if (isnullstartblock(right.br_startblock))
+			state |= BMAP_RIGHT_DELAY;
 	}
+
 	/*
 	 * Set contiguity flags on the left and right neighbors.
 	 * Don't let extents get too large, even if the pieces are contiguous.
 	 */
-	STATE_SET(LEFT_CONTIG,
-		STATE_TEST(LEFT_VALID) && STATE_TEST(LEFT_DELAY) &&
-		left.br_startoff + left.br_blockcount == new->br_startoff &&
-		left.br_blockcount + new->br_blockcount <= MAXEXTLEN);
-	STATE_SET(RIGHT_CONTIG,
-		STATE_TEST(RIGHT_VALID) && STATE_TEST(RIGHT_DELAY) &&
-		new->br_startoff + new->br_blockcount == right.br_startoff &&
-		new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
-		(!STATE_TEST(LEFT_CONTIG) ||
-		 (left.br_blockcount + new->br_blockcount +
-		     right.br_blockcount <= MAXEXTLEN)));
+	if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
+	    left.br_startoff + left.br_blockcount == new->br_startoff &&
+	    left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+		state |= BMAP_LEFT_CONTIG;
+
+	if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
+	    new->br_startoff + new->br_blockcount == right.br_startoff &&
+	    new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+	    (!(state & BMAP_LEFT_CONTIG) ||
+	     (left.br_blockcount + new->br_blockcount +
	      right.br_blockcount <= MAXEXTLEN)))
+		state |= BMAP_RIGHT_CONTIG;
+
 	/*
 	 * Switch out based on the contiguity flags.
 	 */
-	switch (SWITCH_STATE) {
-
-	case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
+	switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
 		/*
 		 * New allocation is contiguous with delayed allocations
 		 * on the left and on the right.
@@ -1935,8 +1806,8 @@ xfs_bmap_add_extent_hole_delay(
 		 */
 		temp = left.br_blockcount + new->br_blockcount +
 			right.br_blockcount;
-		XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1,
-			XFS_DATA_FORK);
+
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
 		oldlen = startblockval(left.br_startblock) +
 			startblockval(new->br_startblock) +
@@ -1944,53 +1815,52 @@ xfs_bmap_add_extent_hole_delay(
 		newlen = xfs_bmap_worst_indlen(ip, temp);
 		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
 			nullstartblock((int)newlen));
-		XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1,
-			XFS_DATA_FORK);
-		XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx, 1);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
+		xfs_iext_remove(ip, idx, 1, state);
 		ip->i_df.if_lastex = idx - 1;
 		/* DELTA: Two in-core extents were replaced by one. */
 		temp2 = temp;
 		temp = left.br_startoff;
 		break;
 
-	case MASK(LEFT_CONTIG):
+	case BMAP_LEFT_CONTIG:
 		/*
 		 * New allocation is contiguous with a delayed allocation
 		 * on the left.
 		 * Merge the new allocation with the left neighbor.
 		 */
 		temp = left.br_blockcount + new->br_blockcount;
-		XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
 		oldlen = startblockval(left.br_startblock) +
 			startblockval(new->br_startblock);
 		newlen = xfs_bmap_worst_indlen(ip, temp);
 		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
 			nullstartblock((int)newlen));
-		XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx - 1;
 		/* DELTA: One in-core extent grew into a hole. */
 		temp2 = temp;
 		temp = left.br_startoff;
 		break;
 
-	case MASK(RIGHT_CONTIG):
+	case BMAP_RIGHT_CONTIG:
 		/*
 		 * New allocation is contiguous with a delayed allocation
 		 * on the right.
 		 * Merge the new allocation with the right neighbor.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
1987 temp = new->br_blockcount + right.br_blockcount; 1856 temp = new->br_blockcount + right.br_blockcount;
1988 oldlen = startblockval(new->br_startblock) + 1857 oldlen = startblockval(new->br_startblock) +
1989 startblockval(right.br_startblock); 1858 startblockval(right.br_startblock);
1990 newlen = xfs_bmap_worst_indlen(ip, temp); 1859 newlen = xfs_bmap_worst_indlen(ip, temp);
1991 xfs_bmbt_set_allf(ep, new->br_startoff, 1860 xfs_bmbt_set_allf(ep, new->br_startoff,
1992 nullstartblock((int)newlen), temp, right.br_state); 1861 nullstartblock((int)newlen), temp, right.br_state);
1993 XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, XFS_DATA_FORK); 1862 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
1863
1994 ip->i_df.if_lastex = idx; 1864 ip->i_df.if_lastex = idx;
1995 /* DELTA: One in-core extent grew into a hole. */ 1865 /* DELTA: One in-core extent grew into a hole. */
1996 temp2 = temp; 1866 temp2 = temp;
@@ -2004,9 +1874,7 @@ xfs_bmap_add_extent_hole_delay(
2004 * Insert a new entry. 1874 * Insert a new entry.
2005 */ 1875 */
2006 oldlen = newlen = 0; 1876 oldlen = newlen = 0;
2007 XFS_BMAP_TRACE_INSERT("0", ip, idx, 1, new, NULL, 1877 xfs_iext_insert(ip, idx, 1, new, state);
2008 XFS_DATA_FORK);
2009 xfs_iext_insert(ifp, idx, 1, new);
2010 ip->i_df.if_lastex = idx; 1878 ip->i_df.if_lastex = idx;
2011 /* DELTA: A new in-core extent was added in a hole. */ 1879 /* DELTA: A new in-core extent was added in a hole. */
2012 temp2 = new->br_blockcount; 1880 temp2 = new->br_blockcount;
@@ -2030,12 +1898,6 @@ xfs_bmap_add_extent_hole_delay(
2030 } 1898 }
2031 *logflagsp = 0; 1899 *logflagsp = 0;
2032 return 0; 1900 return 0;
2033#undef MASK
2034#undef MASK2
2035#undef STATE_SET
2036#undef STATE_TEST
2037#undef STATE_SET_TEST
2038#undef SWITCH_STATE
2039} 1901}
2040 1902
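
The hunk above replaces the per-function STATE_SET()/STATE_TEST()/SWITCH_STATE macro machinery with plain BMAP_* bit flags shared by all of the xfs_bmap_add_extent_*() helpers. A minimal standalone sketch of the resulting idiom follows; the helper function and its parameters are illustrative stand-ins, and only the BMAP_* flag values come from the patch itself.

	#include <stdio.h>

	/* Flag values as defined by the patch in fs/xfs/xfs_bmap.h. */
	#define BMAP_LEFT_CONTIG	(1 << 0)
	#define BMAP_RIGHT_CONTIG	(1 << 1)
	#define BMAP_LEFT_DELAY		(1 << 4)
	#define BMAP_RIGHT_DELAY	(1 << 5)
	#define BMAP_LEFT_VALID		(1 << 6)
	#define BMAP_RIGHT_VALID	(1 << 7)

	/* Hypothetical helper: compute neighbour state around extent slot idx. */
	static int bmap_neighbor_state(int idx, int nextents,
				       int left_is_delalloc, int right_is_delalloc)
	{
		int state = 0;

		if (idx > 0) {				/* a left neighbour exists */
			state |= BMAP_LEFT_VALID;
			if (left_is_delalloc)
				state |= BMAP_LEFT_DELAY;
		}
		if (idx < nextents) {			/* a right neighbour exists */
			state |= BMAP_RIGHT_VALID;
			if (right_is_delalloc)
				state |= BMAP_RIGHT_DELAY;
		}
		/* contiguity bits would be or'ed in here after the range checks */
		return state;
	}

	int main(void)
	{
		/* Dispatch exactly as the patched switch statements do. */
		switch (bmap_neighbor_state(1, 2, 1, 0) &
			(BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
		case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
			puts("merge with both neighbours");
			break;
		case BMAP_LEFT_CONTIG:
			puts("merge left");
			break;
		case BMAP_RIGHT_CONTIG:
			puts("merge right");
			break;
		default:
			puts("insert a new record");
		}
		return 0;
	}

One practical payoff of keeping the state in a plain int: the same word can be handed directly to the new tracepoints, which the string-tagged macro scheme could not do.
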
2041/* 1903/*
@@ -2062,83 +1924,75 @@ xfs_bmap_add_extent_hole_real(
 2062 int state; /* state bits, accessed thru macros */ 1924 int state; /* state bits */
2063 xfs_filblks_t temp=0; 1925 xfs_filblks_t temp=0;
2064 xfs_filblks_t temp2=0; 1926 xfs_filblks_t temp2=0;
2065 enum { /* bit number definitions for state */
2066 LEFT_CONTIG, RIGHT_CONTIG,
2067 LEFT_DELAY, RIGHT_DELAY,
2068 LEFT_VALID, RIGHT_VALID
2069 };
2070
2071#define MASK(b) (1 << (b))
2072#define MASK2(a,b) (MASK(a) | MASK(b))
2073#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
2074#define STATE_TEST(b) (state & MASK(b))
2075#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \
2076 ((state &= ~MASK(b)), 0))
2077#define SWITCH_STATE (state & MASK2(LEFT_CONTIG, RIGHT_CONTIG))
2078 1927
2079 ifp = XFS_IFORK_PTR(ip, whichfork); 1928 ifp = XFS_IFORK_PTR(ip, whichfork);
2080 ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); 1929 ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
2081 ep = xfs_iext_get_ext(ifp, idx); 1930 ep = xfs_iext_get_ext(ifp, idx);
2082 state = 0; 1931 state = 0;
1932
1933 if (whichfork == XFS_ATTR_FORK)
1934 state |= BMAP_ATTRFORK;
1935
2083 /* 1936 /*
2084 * Check and set flags if this segment has a left neighbor. 1937 * Check and set flags if this segment has a left neighbor.
2085 */ 1938 */
2086 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { 1939 if (idx > 0) {
1940 state |= BMAP_LEFT_VALID;
2087 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); 1941 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
2088 STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock)); 1942 if (isnullstartblock(left.br_startblock))
1943 state |= BMAP_LEFT_DELAY;
2089 } 1944 }
1945
2090 /* 1946 /*
2091 * Check and set flags if this segment has a current value. 1947 * Check and set flags if this segment has a current value.
2092 * Not true if we're inserting into the "hole" at eof. 1948 * Not true if we're inserting into the "hole" at eof.
2093 */ 1949 */
2094 if (STATE_SET_TEST(RIGHT_VALID, 1950 if (idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
2095 idx < 1951 state |= BMAP_RIGHT_VALID;
2096 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
2097 xfs_bmbt_get_all(ep, &right); 1952 xfs_bmbt_get_all(ep, &right);
2098 STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock)); 1953 if (isnullstartblock(right.br_startblock))
1954 state |= BMAP_RIGHT_DELAY;
2099 } 1955 }
1956
2100 /* 1957 /*
2101 * We're inserting a real allocation between "left" and "right". 1958 * We're inserting a real allocation between "left" and "right".
2102 * Set the contiguity flags. Don't let extents get too large. 1959 * Set the contiguity flags. Don't let extents get too large.
2103 */ 1960 */
2104 STATE_SET(LEFT_CONTIG, 1961 if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
2105 STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && 1962 left.br_startoff + left.br_blockcount == new->br_startoff &&
2106 left.br_startoff + left.br_blockcount == new->br_startoff && 1963 left.br_startblock + left.br_blockcount == new->br_startblock &&
2107 left.br_startblock + left.br_blockcount == new->br_startblock && 1964 left.br_state == new->br_state &&
2108 left.br_state == new->br_state && 1965 left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
2109 left.br_blockcount + new->br_blockcount <= MAXEXTLEN); 1966 state |= BMAP_LEFT_CONTIG;
2110 STATE_SET(RIGHT_CONTIG, 1967
2111 STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && 1968 if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
2112 new->br_startoff + new->br_blockcount == right.br_startoff && 1969 new->br_startoff + new->br_blockcount == right.br_startoff &&
2113 new->br_startblock + new->br_blockcount == 1970 new->br_startblock + new->br_blockcount == right.br_startblock &&
2114 right.br_startblock && 1971 new->br_state == right.br_state &&
2115 new->br_state == right.br_state && 1972 new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
2116 new->br_blockcount + right.br_blockcount <= MAXEXTLEN && 1973 (!(state & BMAP_LEFT_CONTIG) ||
2117 (!STATE_TEST(LEFT_CONTIG) || 1974 left.br_blockcount + new->br_blockcount +
2118 left.br_blockcount + new->br_blockcount + 1975 right.br_blockcount <= MAXEXTLEN))
2119 right.br_blockcount <= MAXEXTLEN)); 1976 state |= BMAP_RIGHT_CONTIG;
2120 1977
2121 error = 0; 1978 error = 0;
2122 /* 1979 /*
2123 * Select which case we're in here, and implement it. 1980 * Select which case we're in here, and implement it.
2124 */ 1981 */
2125 switch (SWITCH_STATE) { 1982 switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
2126 1983 case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
2127 case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
2128 /* 1984 /*
2129 * New allocation is contiguous with real allocations on the 1985 * New allocation is contiguous with real allocations on the
2130 * left and on the right. 1986 * left and on the right.
2131 * Merge all three into a single extent record. 1987 * Merge all three into a single extent record.
2132 */ 1988 */
2133 XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1, 1989 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
2134 whichfork);
2135 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1990 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
2136 left.br_blockcount + new->br_blockcount + 1991 left.br_blockcount + new->br_blockcount +
2137 right.br_blockcount); 1992 right.br_blockcount);
2138 XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1, 1993 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
2139 whichfork); 1994
2140 XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, whichfork); 1995 xfs_iext_remove(ip, idx, 1, state);
2141 xfs_iext_remove(ifp, idx, 1);
2142 ifp->if_lastex = idx - 1; 1996 ifp->if_lastex = idx - 1;
2143 XFS_IFORK_NEXT_SET(ip, whichfork, 1997 XFS_IFORK_NEXT_SET(ip, whichfork,
2144 XFS_IFORK_NEXTENTS(ip, whichfork) - 1); 1998 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
@@ -2173,16 +2027,17 @@ xfs_bmap_add_extent_hole_real(
2173 right.br_blockcount; 2027 right.br_blockcount;
2174 break; 2028 break;
2175 2029
2176 case MASK(LEFT_CONTIG): 2030 case BMAP_LEFT_CONTIG:
2177 /* 2031 /*
2178 * New allocation is contiguous with a real allocation 2032 * New allocation is contiguous with a real allocation
2179 * on the left. 2033 * on the left.
2180 * Merge the new allocation with the left neighbor. 2034 * Merge the new allocation with the left neighbor.
2181 */ 2035 */
2182 XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1, whichfork); 2036 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
2183 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 2037 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
2184 left.br_blockcount + new->br_blockcount); 2038 left.br_blockcount + new->br_blockcount);
2185 XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, whichfork); 2039 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
2040
2186 ifp->if_lastex = idx - 1; 2041 ifp->if_lastex = idx - 1;
2187 if (cur == NULL) { 2042 if (cur == NULL) {
2188 rval = xfs_ilog_fext(whichfork); 2043 rval = xfs_ilog_fext(whichfork);
@@ -2207,17 +2062,18 @@ xfs_bmap_add_extent_hole_real(
2207 new->br_blockcount; 2062 new->br_blockcount;
2208 break; 2063 break;
2209 2064
2210 case MASK(RIGHT_CONTIG): 2065 case BMAP_RIGHT_CONTIG:
2211 /* 2066 /*
2212 * New allocation is contiguous with a real allocation 2067 * New allocation is contiguous with a real allocation
2213 * on the right. 2068 * on the right.
2214 * Merge the new allocation with the right neighbor. 2069 * Merge the new allocation with the right neighbor.
2215 */ 2070 */
2216 XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, whichfork); 2071 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
2217 xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock, 2072 xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock,
2218 new->br_blockcount + right.br_blockcount, 2073 new->br_blockcount + right.br_blockcount,
2219 right.br_state); 2074 right.br_state);
2220 XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, whichfork); 2075 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
2076
2221 ifp->if_lastex = idx; 2077 ifp->if_lastex = idx;
2222 if (cur == NULL) { 2078 if (cur == NULL) {
2223 rval = xfs_ilog_fext(whichfork); 2079 rval = xfs_ilog_fext(whichfork);
@@ -2248,8 +2104,7 @@ xfs_bmap_add_extent_hole_real(
2248 * real allocation. 2104 * real allocation.
2249 * Insert a new entry. 2105 * Insert a new entry.
2250 */ 2106 */
2251 XFS_BMAP_TRACE_INSERT("0", ip, idx, 1, new, NULL, whichfork); 2107 xfs_iext_insert(ip, idx, 1, new, state);
2252 xfs_iext_insert(ifp, idx, 1, new);
2253 ifp->if_lastex = idx; 2108 ifp->if_lastex = idx;
2254 XFS_IFORK_NEXT_SET(ip, whichfork, 2109 XFS_IFORK_NEXT_SET(ip, whichfork,
2255 XFS_IFORK_NEXTENTS(ip, whichfork) + 1); 2110 XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
@@ -2283,12 +2138,6 @@ xfs_bmap_add_extent_hole_real(
2283done: 2138done:
2284 *logflagsp = rval; 2139 *logflagsp = rval;
2285 return error; 2140 return error;
2286#undef MASK
2287#undef MASK2
2288#undef STATE_SET
2289#undef STATE_TEST
2290#undef STATE_SET_TEST
2291#undef SWITCH_STATE
2292} 2141}
2293 2142
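
Both hole-conversion functions also swap the string-tagged XFS_BMAP_TRACE_{PRE,POST}_UPDATE() ktrace calls for trace_xfs_bmap_pre_update()/trace_xfs_bmap_post_update() tracepoints, passing the state word and a caller address instead of a description string. A hedged sketch of what such an event definition looks like; the real definition lives in fs/xfs/linux-2.6/xfs_trace.h and its exact field layout may differ.

	/* Illustrative only -- not the verbatim xfs_trace.h definition. */
	TRACE_EVENT(xfs_bmap_pre_update,
		TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state,
			 unsigned long caller_ip),
		TP_ARGS(ip, idx, state, caller_ip),
		TP_STRUCT__entry(
			__field(xfs_ino_t, ino)
			__field(xfs_extnum_t, idx)
			__field(int, state)
			__field(unsigned long, caller_ip)
		),
		TP_fast_assign(
			__entry->ino = ip->i_ino;
			__entry->idx = idx;
			__entry->state = state;
			__entry->caller_ip = caller_ip;
		),
		/* XFS_BMAP_EXT_FLAGS is the flag/name table added in xfs_bmap.h */
		TP_printk("ino 0x%llx idx %ld state %s caller %pf",
			  __entry->ino, (long)__entry->idx,
			  __print_flags(__entry->state, "|", XFS_BMAP_EXT_FLAGS),
			  (void *)__entry->caller_ip)
	);

Unlike the always-compiled ktrace buffers, such events cost almost nothing when disabled and can be switched on per event at runtime through debugfs.
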
2294/* 2143/*
@@ -2701,22 +2550,134 @@ xfs_bmap_rtalloc(
2701} 2550}
2702 2551
2703STATIC int 2552STATIC int
2553xfs_bmap_btalloc_nullfb(
2554 struct xfs_bmalloca *ap,
2555 struct xfs_alloc_arg *args,
2556 xfs_extlen_t *blen)
2557{
2558 struct xfs_mount *mp = ap->ip->i_mount;
2559 struct xfs_perag *pag;
2560 xfs_agnumber_t ag, startag;
2561 int notinit = 0;
2562 int error;
2563
2564 if (ap->userdata && xfs_inode_is_filestream(ap->ip))
2565 args->type = XFS_ALLOCTYPE_NEAR_BNO;
2566 else
2567 args->type = XFS_ALLOCTYPE_START_BNO;
2568 args->total = ap->total;
2569
2570 /*
2571 * Search for an allocation group with a single extent large enough
2572 * for the request. If one isn't found, then adjust the minimum
2573 * allocation size to the largest space found.
2574 */
2575 startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
2576 if (startag == NULLAGNUMBER)
2577 startag = ag = 0;
2578
2579 pag = xfs_perag_get(mp, ag);
2580 while (*blen < ap->alen) {
2581 if (!pag->pagf_init) {
2582 error = xfs_alloc_pagf_init(mp, args->tp, ag,
2583 XFS_ALLOC_FLAG_TRYLOCK);
2584 if (error) {
2585 xfs_perag_put(pag);
2586 return error;
2587 }
2588 }
2589
2590 /*
2591 * See xfs_alloc_fix_freelist...
2592 */
2593 if (pag->pagf_init) {
2594 xfs_extlen_t longest;
2595 longest = xfs_alloc_longest_free_extent(mp, pag);
2596 if (*blen < longest)
2597 *blen = longest;
2598 } else
2599 notinit = 1;
2600
2601 if (xfs_inode_is_filestream(ap->ip)) {
2602 if (*blen >= ap->alen)
2603 break;
2604
2605 if (ap->userdata) {
2606 /*
2607 * If startag is an invalid AG, we've
2608 * come here once before and
2609 * xfs_filestream_new_ag picked the
2610 * best currently available.
2611 *
2612 * Don't continue looping, since we
2613 * could loop forever.
2614 */
2615 if (startag == NULLAGNUMBER)
2616 break;
2617
2618 error = xfs_filestream_new_ag(ap, &ag);
2619 xfs_perag_put(pag);
2620 if (error)
2621 return error;
2622
 2623 /* loop again to set 'blen' */
2624 startag = NULLAGNUMBER;
2625 pag = xfs_perag_get(mp, ag);
2626 continue;
2627 }
2628 }
2629 if (++ag == mp->m_sb.sb_agcount)
2630 ag = 0;
2631 if (ag == startag)
2632 break;
2633 xfs_perag_put(pag);
2634 pag = xfs_perag_get(mp, ag);
2635 }
2636 xfs_perag_put(pag);
2637
2638 /*
 2639 * Since the above loop did a BUF_TRYLOCK, some AGs may not have
 2640 * been examined, so there may still be space for this request.
2641 */
2642 if (notinit || *blen < ap->minlen)
2643 args->minlen = ap->minlen;
2644 /*
2645 * If the best seen length is less than the request
2646 * length, use the best as the minimum.
2647 */
2648 else if (*blen < ap->alen)
2649 args->minlen = *blen;
2650 /*
2651 * Otherwise we've seen an extent as big as alen,
2652 * use that as the minimum.
2653 */
2654 else
2655 args->minlen = ap->alen;
2656
2657 /*
2658 * set the failure fallback case to look in the selected
2659 * AG as the stream may have moved.
2660 */
2661 if (xfs_inode_is_filestream(ap->ip))
2662 ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
2663
2664 return 0;
2665}
2666
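
Besides being split out of xfs_bmap_btalloc(), the AG walk above drops the old global down_read(&mp->m_peraglock) in favour of per-AG references taken with xfs_perag_get() and released with xfs_perag_put(). The skeleton of that walk, with the XFS types and helpers reduced to stand-ins:

	/*
	 * Skeleton of the reference-counted AG walk; struct mount, perag_get()
	 * and perag_put() are stand-ins for the real xfs_mount /
	 * xfs_perag_get() / xfs_perag_put() interfaces.
	 */
	struct mount;
	struct perag;

	struct perag *perag_get(struct mount *mp, unsigned int agno);
	void perag_put(struct perag *pag);

	static int walk_ags(struct mount *mp, unsigned int startag,
			    unsigned int agcount, int (*body)(struct perag *))
	{
		unsigned int ag = startag;
		struct perag *pag = perag_get(mp, ag);
		int error = 0;

		for (;;) {
			error = body(pag);	/* may block; only this AG is pinned */
			if (error)
				break;
			if (++ag == agcount)
				ag = 0;
			if (ag == startag)
				break;
			perag_put(pag);		/* drop the old AG before the next */
			pag = perag_get(mp, ag);
		}
		perag_put(pag);
		return error;
	}

Holding a reference only on the AG currently under examination lets the loop body block (xfs_alloc_pagf_init() may read the AGF from disk) without keeping a mount-wide rwsem held across I/O.
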
2667STATIC int
2704xfs_bmap_btalloc( 2668xfs_bmap_btalloc(
2705 xfs_bmalloca_t *ap) /* bmap alloc argument struct */ 2669 xfs_bmalloca_t *ap) /* bmap alloc argument struct */
2706{ 2670{
2707 xfs_mount_t *mp; /* mount point structure */ 2671 xfs_mount_t *mp; /* mount point structure */
2708 xfs_alloctype_t atype = 0; /* type for allocation routines */ 2672 xfs_alloctype_t atype = 0; /* type for allocation routines */
2709 xfs_extlen_t align; /* minimum allocation alignment */ 2673 xfs_extlen_t align; /* minimum allocation alignment */
2710 xfs_agnumber_t ag;
2711 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ 2674 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
2712 xfs_agnumber_t startag; 2675 xfs_agnumber_t ag;
2713 xfs_alloc_arg_t args; 2676 xfs_alloc_arg_t args;
2714 xfs_extlen_t blen; 2677 xfs_extlen_t blen;
2715 xfs_extlen_t nextminlen = 0; 2678 xfs_extlen_t nextminlen = 0;
2716 xfs_perag_t *pag;
2717 int nullfb; /* true if ap->firstblock isn't set */ 2679 int nullfb; /* true if ap->firstblock isn't set */
2718 int isaligned; 2680 int isaligned;
2719 int notinit;
2720 int tryagain; 2681 int tryagain;
2721 int error; 2682 int error;
2722 2683
@@ -2763,102 +2724,9 @@ xfs_bmap_btalloc(
2763 args.firstblock = ap->firstblock; 2724 args.firstblock = ap->firstblock;
2764 blen = 0; 2725 blen = 0;
2765 if (nullfb) { 2726 if (nullfb) {
2766 if (ap->userdata && xfs_inode_is_filestream(ap->ip)) 2727 error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
2767 args.type = XFS_ALLOCTYPE_NEAR_BNO; 2728 if (error)
2768 else 2729 return error;
2769 args.type = XFS_ALLOCTYPE_START_BNO;
2770 args.total = ap->total;
2771
2772 /*
2773 * Search for an allocation group with a single extent
2774 * large enough for the request.
2775 *
2776 * If one isn't found, then adjust the minimum allocation
2777 * size to the largest space found.
2778 */
2779 startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno);
2780 if (startag == NULLAGNUMBER)
2781 startag = ag = 0;
2782 notinit = 0;
2783 down_read(&mp->m_peraglock);
2784 while (blen < ap->alen) {
2785 pag = &mp->m_perag[ag];
2786 if (!pag->pagf_init &&
2787 (error = xfs_alloc_pagf_init(mp, args.tp,
2788 ag, XFS_ALLOC_FLAG_TRYLOCK))) {
2789 up_read(&mp->m_peraglock);
2790 return error;
2791 }
2792 /*
2793 * See xfs_alloc_fix_freelist...
2794 */
2795 if (pag->pagf_init) {
2796 xfs_extlen_t longest;
2797 longest = xfs_alloc_longest_free_extent(mp, pag);
2798 if (blen < longest)
2799 blen = longest;
2800 } else
2801 notinit = 1;
2802
2803 if (xfs_inode_is_filestream(ap->ip)) {
2804 if (blen >= ap->alen)
2805 break;
2806
2807 if (ap->userdata) {
2808 /*
2809 * If startag is an invalid AG, we've
2810 * come here once before and
2811 * xfs_filestream_new_ag picked the
2812 * best currently available.
2813 *
2814 * Don't continue looping, since we
2815 * could loop forever.
2816 */
2817 if (startag == NULLAGNUMBER)
2818 break;
2819
2820 error = xfs_filestream_new_ag(ap, &ag);
2821 if (error) {
2822 up_read(&mp->m_peraglock);
2823 return error;
2824 }
2825
2826 /* loop again to set 'blen'*/
2827 startag = NULLAGNUMBER;
2828 continue;
2829 }
2830 }
2831 if (++ag == mp->m_sb.sb_agcount)
2832 ag = 0;
2833 if (ag == startag)
2834 break;
2835 }
2836 up_read(&mp->m_peraglock);
2837 /*
2838 * Since the above loop did a BUF_TRYLOCK, it is
2839 * possible that there is space for this request.
2840 */
2841 if (notinit || blen < ap->minlen)
2842 args.minlen = ap->minlen;
2843 /*
2844 * If the best seen length is less than the request
2845 * length, use the best as the minimum.
2846 */
2847 else if (blen < ap->alen)
2848 args.minlen = blen;
2849 /*
2850 * Otherwise we've seen an extent as big as alen,
2851 * use that as the minimum.
2852 */
2853 else
2854 args.minlen = ap->alen;
2855
2856 /*
2857 * set the failure fallback case to look in the selected
2858 * AG as the stream may have moved.
2859 */
2860 if (xfs_inode_is_filestream(ap->ip))
2861 ap->rval = args.fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
2862 } else if (ap->low) { 2730 } else if (ap->low) {
2863 if (xfs_inode_is_filestream(ap->ip)) 2731 if (xfs_inode_is_filestream(ap->ip))
2864 args.type = XFS_ALLOCTYPE_FIRST_AG; 2732 args.type = XFS_ALLOCTYPE_FIRST_AG;
@@ -3115,8 +2983,13 @@ xfs_bmap_del_extent(
3115 uint qfield; /* quota field to update */ 2983 uint qfield; /* quota field to update */
3116 xfs_filblks_t temp; /* for indirect length calculations */ 2984 xfs_filblks_t temp; /* for indirect length calculations */
3117 xfs_filblks_t temp2; /* for indirect length calculations */ 2985 xfs_filblks_t temp2; /* for indirect length calculations */
2986 int state = 0;
3118 2987
3119 XFS_STATS_INC(xs_del_exlist); 2988 XFS_STATS_INC(xs_del_exlist);
2989
2990 if (whichfork == XFS_ATTR_FORK)
2991 state |= BMAP_ATTRFORK;
2992
3120 mp = ip->i_mount; 2993 mp = ip->i_mount;
3121 ifp = XFS_IFORK_PTR(ip, whichfork); 2994 ifp = XFS_IFORK_PTR(ip, whichfork);
3122 ASSERT((idx >= 0) && (idx < ifp->if_bytes / 2995 ASSERT((idx >= 0) && (idx < ifp->if_bytes /
@@ -3196,8 +3069,8 @@ xfs_bmap_del_extent(
3196 /* 3069 /*
3197 * Matches the whole extent. Delete the entry. 3070 * Matches the whole extent. Delete the entry.
3198 */ 3071 */
3199 XFS_BMAP_TRACE_DELETE("3", ip, idx, 1, whichfork); 3072 xfs_iext_remove(ip, idx, 1,
3200 xfs_iext_remove(ifp, idx, 1); 3073 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
3201 ifp->if_lastex = idx; 3074 ifp->if_lastex = idx;
3202 if (delay) 3075 if (delay)
3203 break; 3076 break;
@@ -3217,7 +3090,7 @@ xfs_bmap_del_extent(
3217 /* 3090 /*
3218 * Deleting the first part of the extent. 3091 * Deleting the first part of the extent.
3219 */ 3092 */
3220 XFS_BMAP_TRACE_PRE_UPDATE("2", ip, idx, whichfork); 3093 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
3221 xfs_bmbt_set_startoff(ep, del_endoff); 3094 xfs_bmbt_set_startoff(ep, del_endoff);
3222 temp = got.br_blockcount - del->br_blockcount; 3095 temp = got.br_blockcount - del->br_blockcount;
3223 xfs_bmbt_set_blockcount(ep, temp); 3096 xfs_bmbt_set_blockcount(ep, temp);
@@ -3226,13 +3099,12 @@ xfs_bmap_del_extent(
3226 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 3099 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
3227 da_old); 3100 da_old);
3228 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 3101 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
3229 XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, 3102 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
3230 whichfork);
3231 da_new = temp; 3103 da_new = temp;
3232 break; 3104 break;
3233 } 3105 }
3234 xfs_bmbt_set_startblock(ep, del_endblock); 3106 xfs_bmbt_set_startblock(ep, del_endblock);
3235 XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, whichfork); 3107 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
3236 if (!cur) { 3108 if (!cur) {
3237 flags |= xfs_ilog_fext(whichfork); 3109 flags |= xfs_ilog_fext(whichfork);
3238 break; 3110 break;
@@ -3248,19 +3120,18 @@ xfs_bmap_del_extent(
3248 * Deleting the last part of the extent. 3120 * Deleting the last part of the extent.
3249 */ 3121 */
3250 temp = got.br_blockcount - del->br_blockcount; 3122 temp = got.br_blockcount - del->br_blockcount;
3251 XFS_BMAP_TRACE_PRE_UPDATE("1", ip, idx, whichfork); 3123 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
3252 xfs_bmbt_set_blockcount(ep, temp); 3124 xfs_bmbt_set_blockcount(ep, temp);
3253 ifp->if_lastex = idx; 3125 ifp->if_lastex = idx;
3254 if (delay) { 3126 if (delay) {
3255 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 3127 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
3256 da_old); 3128 da_old);
3257 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 3129 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
3258 XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, 3130 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
3259 whichfork);
3260 da_new = temp; 3131 da_new = temp;
3261 break; 3132 break;
3262 } 3133 }
3263 XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, whichfork); 3134 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
3264 if (!cur) { 3135 if (!cur) {
3265 flags |= xfs_ilog_fext(whichfork); 3136 flags |= xfs_ilog_fext(whichfork);
3266 break; 3137 break;
@@ -3277,7 +3148,7 @@ xfs_bmap_del_extent(
3277 * Deleting the middle of the extent. 3148 * Deleting the middle of the extent.
3278 */ 3149 */
3279 temp = del->br_startoff - got.br_startoff; 3150 temp = del->br_startoff - got.br_startoff;
3280 XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, whichfork); 3151 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
3281 xfs_bmbt_set_blockcount(ep, temp); 3152 xfs_bmbt_set_blockcount(ep, temp);
3282 new.br_startoff = del_endoff; 3153 new.br_startoff = del_endoff;
3283 temp2 = got_endoff - del_endoff; 3154 temp2 = got_endoff - del_endoff;
@@ -3364,10 +3235,8 @@ xfs_bmap_del_extent(
3364 } 3235 }
3365 } 3236 }
3366 } 3237 }
3367 XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, whichfork); 3238 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
3368 XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 1, &new, NULL, 3239 xfs_iext_insert(ip, idx + 1, 1, &new, state);
3369 whichfork);
3370 xfs_iext_insert(ifp, idx + 1, 1, &new);
3371 ifp->if_lastex = idx + 1; 3240 ifp->if_lastex = idx + 1;
3372 break; 3241 break;
3373 } 3242 }
@@ -3687,7 +3556,9 @@ xfs_bmap_local_to_extents(
3687 xfs_iext_add(ifp, 0, 1); 3556 xfs_iext_add(ifp, 0, 1);
3688 ep = xfs_iext_get_ext(ifp, 0); 3557 ep = xfs_iext_get_ext(ifp, 0);
3689 xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); 3558 xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
3690 XFS_BMAP_TRACE_POST_UPDATE("new", ip, 0, whichfork); 3559 trace_xfs_bmap_post_update(ip, 0,
3560 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
3561 _THIS_IP_);
3691 XFS_IFORK_NEXT_SET(ip, whichfork, 1); 3562 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
3692 ip->i_d.di_nblocks = 1; 3563 ip->i_d.di_nblocks = 1;
3693 xfs_trans_mod_dquot_byino(tp, ip, 3564 xfs_trans_mod_dquot_byino(tp, ip,
@@ -3800,158 +3671,6 @@ xfs_bmap_search_extents(
3800 return ep; 3671 return ep;
3801} 3672}
3802 3673
3803
3804#ifdef XFS_BMAP_TRACE
3805ktrace_t *xfs_bmap_trace_buf;
3806
3807/*
3808 * Add a bmap trace buffer entry. Base routine for the others.
3809 */
3810STATIC void
3811xfs_bmap_trace_addentry(
3812 int opcode, /* operation */
3813 const char *fname, /* function name */
3814 char *desc, /* operation description */
3815 xfs_inode_t *ip, /* incore inode pointer */
3816 xfs_extnum_t idx, /* index of entry(ies) */
3817 xfs_extnum_t cnt, /* count of entries, 1 or 2 */
3818 xfs_bmbt_rec_host_t *r1, /* first record */
3819 xfs_bmbt_rec_host_t *r2, /* second record or null */
3820 int whichfork) /* data or attr fork */
3821{
3822 xfs_bmbt_rec_host_t tr2;
3823
3824 ASSERT(cnt == 1 || cnt == 2);
3825 ASSERT(r1 != NULL);
3826 if (cnt == 1) {
3827 ASSERT(r2 == NULL);
3828 r2 = &tr2;
3829 memset(&tr2, 0, sizeof(tr2));
3830 } else
3831 ASSERT(r2 != NULL);
3832 ktrace_enter(xfs_bmap_trace_buf,
3833 (void *)(__psint_t)(opcode | (whichfork << 16)),
3834 (void *)fname, (void *)desc, (void *)ip,
3835 (void *)(__psint_t)idx,
3836 (void *)(__psint_t)cnt,
3837 (void *)(__psunsigned_t)(ip->i_ino >> 32),
3838 (void *)(__psunsigned_t)(unsigned)ip->i_ino,
3839 (void *)(__psunsigned_t)(r1->l0 >> 32),
3840 (void *)(__psunsigned_t)(unsigned)(r1->l0),
3841 (void *)(__psunsigned_t)(r1->l1 >> 32),
3842 (void *)(__psunsigned_t)(unsigned)(r1->l1),
3843 (void *)(__psunsigned_t)(r2->l0 >> 32),
3844 (void *)(__psunsigned_t)(unsigned)(r2->l0),
3845 (void *)(__psunsigned_t)(r2->l1 >> 32),
3846 (void *)(__psunsigned_t)(unsigned)(r2->l1)
3847 );
3848 ASSERT(ip->i_xtrace);
3849 ktrace_enter(ip->i_xtrace,
3850 (void *)(__psint_t)(opcode | (whichfork << 16)),
3851 (void *)fname, (void *)desc, (void *)ip,
3852 (void *)(__psint_t)idx,
3853 (void *)(__psint_t)cnt,
3854 (void *)(__psunsigned_t)(ip->i_ino >> 32),
3855 (void *)(__psunsigned_t)(unsigned)ip->i_ino,
3856 (void *)(__psunsigned_t)(r1->l0 >> 32),
3857 (void *)(__psunsigned_t)(unsigned)(r1->l0),
3858 (void *)(__psunsigned_t)(r1->l1 >> 32),
3859 (void *)(__psunsigned_t)(unsigned)(r1->l1),
3860 (void *)(__psunsigned_t)(r2->l0 >> 32),
3861 (void *)(__psunsigned_t)(unsigned)(r2->l0),
3862 (void *)(__psunsigned_t)(r2->l1 >> 32),
3863 (void *)(__psunsigned_t)(unsigned)(r2->l1)
3864 );
3865}
3866
3867/*
3868 * Add bmap trace entry prior to a call to xfs_iext_remove.
3869 */
3870STATIC void
3871xfs_bmap_trace_delete(
3872 const char *fname, /* function name */
3873 char *desc, /* operation description */
3874 xfs_inode_t *ip, /* incore inode pointer */
3875 xfs_extnum_t idx, /* index of entry(entries) deleted */
3876 xfs_extnum_t cnt, /* count of entries deleted, 1 or 2 */
3877 int whichfork) /* data or attr fork */
3878{
3879 xfs_ifork_t *ifp; /* inode fork pointer */
3880
3881 ifp = XFS_IFORK_PTR(ip, whichfork);
3882 xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_DELETE, fname, desc, ip, idx,
3883 cnt, xfs_iext_get_ext(ifp, idx),
3884 cnt == 2 ? xfs_iext_get_ext(ifp, idx + 1) : NULL,
3885 whichfork);
3886}
3887
3888/*
3889 * Add bmap trace entry prior to a call to xfs_iext_insert, or
3890 * reading in the extents list from the disk (in the btree).
3891 */
3892STATIC void
3893xfs_bmap_trace_insert(
3894 const char *fname, /* function name */
3895 char *desc, /* operation description */
3896 xfs_inode_t *ip, /* incore inode pointer */
3897 xfs_extnum_t idx, /* index of entry(entries) inserted */
3898 xfs_extnum_t cnt, /* count of entries inserted, 1 or 2 */
3899 xfs_bmbt_irec_t *r1, /* inserted record 1 */
3900 xfs_bmbt_irec_t *r2, /* inserted record 2 or null */
3901 int whichfork) /* data or attr fork */
3902{
3903 xfs_bmbt_rec_host_t tr1; /* compressed record 1 */
3904 xfs_bmbt_rec_host_t tr2; /* compressed record 2 if needed */
3905
3906 xfs_bmbt_set_all(&tr1, r1);
3907 if (cnt == 2) {
3908 ASSERT(r2 != NULL);
3909 xfs_bmbt_set_all(&tr2, r2);
3910 } else {
3911 ASSERT(cnt == 1);
3912 ASSERT(r2 == NULL);
3913 }
3914 xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_INSERT, fname, desc, ip, idx,
3915 cnt, &tr1, cnt == 2 ? &tr2 : NULL, whichfork);
3916}
3917
3918/*
3919 * Add bmap trace entry after updating an extent record in place.
3920 */
3921STATIC void
3922xfs_bmap_trace_post_update(
3923 const char *fname, /* function name */
3924 char *desc, /* operation description */
3925 xfs_inode_t *ip, /* incore inode pointer */
3926 xfs_extnum_t idx, /* index of entry updated */
3927 int whichfork) /* data or attr fork */
3928{
3929 xfs_ifork_t *ifp; /* inode fork pointer */
3930
3931 ifp = XFS_IFORK_PTR(ip, whichfork);
3932 xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_POST_UP, fname, desc, ip, idx,
3933 1, xfs_iext_get_ext(ifp, idx), NULL, whichfork);
3934}
3935
3936/*
3937 * Add bmap trace entry prior to updating an extent record in place.
3938 */
3939STATIC void
3940xfs_bmap_trace_pre_update(
3941 const char *fname, /* function name */
3942 char *desc, /* operation description */
3943 xfs_inode_t *ip, /* incore inode pointer */
3944 xfs_extnum_t idx, /* index of entry to be updated */
3945 int whichfork) /* data or attr fork */
3946{
3947 xfs_ifork_t *ifp; /* inode fork pointer */
3948
3949 ifp = XFS_IFORK_PTR(ip, whichfork);
3950 xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_PRE_UP, fname, desc, ip, idx, 1,
3951 xfs_iext_get_ext(ifp, idx), NULL, whichfork);
3952}
3953#endif /* XFS_BMAP_TRACE */
3954
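
The block deleted above is the old ktrace scheme: every event hand-packed sixteen void * slots into a fixed-size ring, once into a global buffer and once into a per-inode one. A tiny self-contained ring in that style, written in plain C rather than against the kernel's ktrace API, shows how little structure those entries carried:

	#include <string.h>

	#define TRACE_SLOTS	32	/* XFS_BMAP_KTRACE_SIZE was 32 per inode */
	#define TRACE_ARGS	16	/* ktrace_enter() took 16 untyped slots  */

	struct trace_entry {
		void *val[TRACE_ARGS];
	};

	static struct trace_entry trace_buf[TRACE_SLOTS];
	static unsigned int trace_idx;

	/* Log one event; the oldest entry is overwritten once the ring wraps. */
	static void trace_enter(void *args[TRACE_ARGS])
	{
		memcpy(trace_buf[trace_idx++ % TRACE_SLOTS].val, args,
		       sizeof(trace_buf[0].val));
	}

Decoding such entries required a debugger and knowledge of the packing order, which is a large part of why this series moves to typed tracepoints.
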
3955/* 3674/*
3956 * Compute the worst-case number of indirect blocks that will be used 3675 * Compute the worst-case number of indirect blocks that will be used
3957 * for ip's delayed extent of length "len". 3676 * for ip's delayed extent of length "len".
@@ -3983,37 +3702,6 @@ xfs_bmap_worst_indlen(
3983 return rval; 3702 return rval;
3984} 3703}
3985 3704
3986#if defined(XFS_RW_TRACE)
3987STATIC void
3988xfs_bunmap_trace(
3989 xfs_inode_t *ip,
3990 xfs_fileoff_t bno,
3991 xfs_filblks_t len,
3992 int flags,
3993 inst_t *ra)
3994{
3995 if (ip->i_rwtrace == NULL)
3996 return;
3997 ktrace_enter(ip->i_rwtrace,
3998 (void *)(__psint_t)XFS_BUNMAP,
3999 (void *)ip,
4000 (void *)(__psint_t)((ip->i_d.di_size >> 32) & 0xffffffff),
4001 (void *)(__psint_t)(ip->i_d.di_size & 0xffffffff),
4002 (void *)(__psint_t)(((xfs_dfiloff_t)bno >> 32) & 0xffffffff),
4003 (void *)(__psint_t)((xfs_dfiloff_t)bno & 0xffffffff),
4004 (void *)(__psint_t)len,
4005 (void *)(__psint_t)flags,
4006 (void *)(unsigned long)current_cpu(),
4007 (void *)ra,
4008 (void *)0,
4009 (void *)0,
4010 (void *)0,
4011 (void *)0,
4012 (void *)0,
4013 (void *)0);
4014}
4015#endif
4016
4017/* 3705/*
4018 * Convert inode from non-attributed to attributed. 3706 * Convert inode from non-attributed to attributed.
4019 * Must not be in a transaction, ip must not be locked. 3707 * Must not be in a transaction, ip must not be locked.
@@ -4702,34 +4390,30 @@ error0:
4702 return XFS_ERROR(EFSCORRUPTED); 4390 return XFS_ERROR(EFSCORRUPTED);
4703} 4391}
4704 4392
4705#ifdef XFS_BMAP_TRACE 4393#ifdef DEBUG
4706/* 4394/*
4707 * Add bmap trace insert entries for all the contents of the extent records. 4395 * Add bmap trace insert entries for all the contents of the extent records.
4708 */ 4396 */
4709void 4397void
4710xfs_bmap_trace_exlist( 4398xfs_bmap_trace_exlist(
4711 const char *fname, /* function name */
4712 xfs_inode_t *ip, /* incore inode pointer */ 4399 xfs_inode_t *ip, /* incore inode pointer */
4713 xfs_extnum_t cnt, /* count of entries in the list */ 4400 xfs_extnum_t cnt, /* count of entries in the list */
4714 int whichfork) /* data or attr fork */ 4401 int whichfork, /* data or attr fork */
4402 unsigned long caller_ip)
4715{ 4403{
4716 xfs_bmbt_rec_host_t *ep; /* current extent record */
4717 xfs_extnum_t idx; /* extent record index */ 4404 xfs_extnum_t idx; /* extent record index */
4718 xfs_ifork_t *ifp; /* inode fork pointer */ 4405 xfs_ifork_t *ifp; /* inode fork pointer */
4719 xfs_bmbt_irec_t s; /* file extent record */ 4406 int state = 0;
4407
4408 if (whichfork == XFS_ATTR_FORK)
4409 state |= BMAP_ATTRFORK;
4720 4410
4721 ifp = XFS_IFORK_PTR(ip, whichfork); 4411 ifp = XFS_IFORK_PTR(ip, whichfork);
4722 ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); 4412 ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
4723 for (idx = 0; idx < cnt; idx++) { 4413 for (idx = 0; idx < cnt; idx++)
4724 ep = xfs_iext_get_ext(ifp, idx); 4414 trace_xfs_extlist(ip, idx, whichfork, caller_ip);
4725 xfs_bmbt_get_all(ep, &s);
4726 XFS_BMAP_TRACE_INSERT("exlist", ip, idx, 1, &s, NULL,
4727 whichfork);
4728 }
4729} 4415}
4730#endif
4731 4416
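
xfs_bmap_trace_exlist() now threads a caller_ip through to the tracepoint so each event records where the dump was requested from; the XFS_BMAP_TRACE_EXLIST() wrapper supplies _THIS_IP_, and xfs_bunmapi() above passes _RET_IP_. For reference, these two macros come from include/linux/kernel.h and rely on GCC extensions:

	/* From include/linux/kernel.h: */
	#define _RET_IP_	(unsigned long)__builtin_return_address(0)
	#define _THIS_IP_	({ __label__ __here; __here: (unsigned long)&&__here; })

	/* Hypothetical use: tag a trace record with an instruction address. */
	static void record_caller(unsigned long ip);

	void traced_op(void)
	{
		record_caller(_THIS_IP_);	/* an address inside traced_op() */
	}

	void traced_wrapper(void)
	{
		record_caller(_RET_IP_);	/* the address traced_wrapper() returns to */
	}
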
4732#ifdef DEBUG
4733/* 4417/*
4734 * Validate that the bmbt_irecs being returned from bmapi are valid 4418 * Validate that the bmbt_irecs being returned from bmapi are valid
4735 * given the callers original parameters. Specifically check the 4419 * given the callers original parameters. Specifically check the
@@ -4805,7 +4489,7 @@ xfs_bmapi(
4805 xfs_fsblock_t abno; /* allocated block number */ 4489 xfs_fsblock_t abno; /* allocated block number */
4806 xfs_extlen_t alen; /* allocated extent length */ 4490 xfs_extlen_t alen; /* allocated extent length */
4807 xfs_fileoff_t aoff; /* allocated file offset */ 4491 xfs_fileoff_t aoff; /* allocated file offset */
4808 xfs_bmalloca_t bma; /* args for xfs_bmap_alloc */ 4492 xfs_bmalloca_t bma = { 0 }; /* args for xfs_bmap_alloc */
4809 xfs_btree_cur_t *cur; /* bmap btree cursor */ 4493 xfs_btree_cur_t *cur; /* bmap btree cursor */
4810 xfs_fileoff_t end; /* end of mapped file region */ 4494 xfs_fileoff_t end; /* end of mapped file region */
4811 int eof; /* we've hit the end of extents */ 4495 int eof; /* we've hit the end of extents */
@@ -5478,7 +5162,8 @@ xfs_bunmapi(
5478 int rsvd; /* OK to allocate reserved blocks */ 5162 int rsvd; /* OK to allocate reserved blocks */
5479 xfs_fsblock_t sum; 5163 xfs_fsblock_t sum;
5480 5164
5481 xfs_bunmap_trace(ip, bno, len, flags, (inst_t *)__return_address); 5165 trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
5166
5482 whichfork = (flags & XFS_BMAPI_ATTRFORK) ? 5167 whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
5483 XFS_ATTR_FORK : XFS_DATA_FORK; 5168 XFS_ATTR_FORK : XFS_DATA_FORK;
5484 ifp = XFS_IFORK_PTR(ip, whichfork); 5169 ifp = XFS_IFORK_PTR(ip, whichfork);
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 56f62d2edc35..419dafb9d87d 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -95,6 +95,21 @@ typedef struct xfs_bmap_free
95 /* need write cache flushing and no */ 95 /* need write cache flushing and no */
96 /* additional allocation alignments */ 96 /* additional allocation alignments */
97 97
98#define XFS_BMAPI_FLAGS \
99 { XFS_BMAPI_WRITE, "WRITE" }, \
100 { XFS_BMAPI_DELAY, "DELAY" }, \
101 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \
102 { XFS_BMAPI_METADATA, "METADATA" }, \
103 { XFS_BMAPI_EXACT, "EXACT" }, \
104 { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \
105 { XFS_BMAPI_ASYNC, "ASYNC" }, \
106 { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \
107 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \
108 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \
109 { XFS_BMAPI_CONTIG, "CONTIG" }, \
110 { XFS_BMAPI_CONVERT, "CONVERT" }
111
112
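
The { flag, "NAME" } pairs above exist so tracepoints can print XFS_BMAPI_* masks symbolically via the tracing core's __print_flags() helper, which consumes exactly this table shape. A standalone userspace analogue of what that helper does (the bit values here are placeholders, not the real XFS_BMAPI_* flags):

	#include <stdio.h>

	struct flag_name {
		unsigned long	mask;
		const char	*name;
	};

	/* Userspace analogue of the tracing core's __print_flags(). */
	static void print_flags(unsigned long flags,
				const struct flag_name *tbl, size_t n)
	{
		const char *sep = "";

		for (size_t i = 0; i < n; i++) {
			if (flags & tbl[i].mask) {
				printf("%s%s", sep, tbl[i].name);
				sep = "|";
			}
		}
		putchar('\n');
	}

	int main(void)
	{
		/* Placeholder bit values; see xfs_bmap.h for the real flags. */
		const struct flag_name bmapi[] = {
			{ 1 << 0, "WRITE" }, { 1 << 1, "DELAY" }, { 1 << 2, "ENTIRE" },
		};

		print_flags((1 << 0) | (1 << 1), bmapi, 3);	/* -> WRITE|DELAY */
		return 0;
	}
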
98static inline int xfs_bmapi_aflag(int w) 113static inline int xfs_bmapi_aflag(int w)
99{ 114{
100 return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0); 115 return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0);
@@ -135,36 +150,43 @@ typedef struct xfs_bmalloca {
135 char conv; /* overwriting unwritten extents */ 150 char conv; /* overwriting unwritten extents */
136} xfs_bmalloca_t; 151} xfs_bmalloca_t;
137 152
138#if defined(__KERNEL__) && defined(XFS_BMAP_TRACE)
139/* 153/*
140 * Trace operations for bmap extent tracing 154 * Flags for xfs_bmap_add_extent*.
141 */ 155 */
142#define XFS_BMAP_KTRACE_DELETE 1 156#define BMAP_LEFT_CONTIG (1 << 0)
143#define XFS_BMAP_KTRACE_INSERT 2 157#define BMAP_RIGHT_CONTIG (1 << 1)
144#define XFS_BMAP_KTRACE_PRE_UP 3 158#define BMAP_LEFT_FILLING (1 << 2)
145#define XFS_BMAP_KTRACE_POST_UP 4 159#define BMAP_RIGHT_FILLING (1 << 3)
146 160#define BMAP_LEFT_DELAY (1 << 4)
147#define XFS_BMAP_TRACE_SIZE 4096 /* size of global trace buffer */ 161#define BMAP_RIGHT_DELAY (1 << 5)
148#define XFS_BMAP_KTRACE_SIZE 32 /* size of per-inode trace buffer */ 162#define BMAP_LEFT_VALID (1 << 6)
149extern ktrace_t *xfs_bmap_trace_buf; 163#define BMAP_RIGHT_VALID (1 << 7)
164#define BMAP_ATTRFORK (1 << 8)
165
166#define XFS_BMAP_EXT_FLAGS \
167 { BMAP_LEFT_CONTIG, "LC" }, \
168 { BMAP_RIGHT_CONTIG, "RC" }, \
169 { BMAP_LEFT_FILLING, "LF" }, \
170 { BMAP_RIGHT_FILLING, "RF" }, \
171 { BMAP_ATTRFORK, "ATTR" }
150 172
151/* 173/*
152 * Add bmap trace insert entries for all the contents of the extent list. 174 * Add bmap trace insert entries for all the contents of the extent list.
175 *
176 * Quite excessive tracing. Only do this for debug builds.
153 */ 177 */
178#if defined(__KERNEL) && defined(DEBUG)
154void 179void
155xfs_bmap_trace_exlist( 180xfs_bmap_trace_exlist(
156 const char *fname, /* function name */
157 struct xfs_inode *ip, /* incore inode pointer */ 181 struct xfs_inode *ip, /* incore inode pointer */
158 xfs_extnum_t cnt, /* count of entries in list */ 182 xfs_extnum_t cnt, /* count of entries in list */
 159 int whichfork); /* data or attr fork */ 183 int whichfork, /* data or attr fork */
 184 unsigned long caller_ip);
160#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ 185#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \
161 xfs_bmap_trace_exlist(__func__,ip,c,w) 186 xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_)
162 187#else
163#else /* __KERNEL__ && XFS_BMAP_TRACE */
164
165#define XFS_BMAP_TRACE_EXLIST(ip,c,w) 188#define XFS_BMAP_TRACE_EXLIST(ip,c,w)
166 189#endif
167#endif /* __KERNEL__ && XFS_BMAP_TRACE */
168 190
169/* 191/*
170 * Convert inode from non-attributed to attributed. 192 * Convert inode from non-attributed to attributed.
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index eb7b702d0690..416e47e54b83 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -98,8 +98,7 @@ xfs_bmdr_to_bmbt(
98 * This code must be in sync with the routines xfs_bmbt_get_startoff, 98 * This code must be in sync with the routines xfs_bmbt_get_startoff,
99 * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state. 99 * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
100 */ 100 */
101 101STATIC void
102STATIC_INLINE void
103__xfs_bmbt_get_all( 102__xfs_bmbt_get_all(
104 __uint64_t l0, 103 __uint64_t l0,
105 __uint64_t l1, 104 __uint64_t l1,
@@ -335,7 +334,7 @@ xfs_bmbt_disk_set_allf(
335/* 334/*
336 * Set all the fields in a bmap extent record from the uncompressed form. 335 * Set all the fields in a bmap extent record from the uncompressed form.
337 */ 336 */
338void 337STATIC void
339xfs_bmbt_disk_set_all( 338xfs_bmbt_disk_set_all(
340 xfs_bmbt_rec_t *r, 339 xfs_bmbt_rec_t *r,
341 xfs_bmbt_irec_t *s) 340 xfs_bmbt_irec_t *s)
@@ -769,12 +768,6 @@ xfs_bmbt_trace_enter(
769 (void *)a0, (void *)a1, (void *)a2, (void *)a3, 768 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
770 (void *)a4, (void *)a5, (void *)a6, (void *)a7, 769 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
771 (void *)a8, (void *)a9, (void *)a10); 770 (void *)a8, (void *)a9, (void *)a10);
772 ktrace_enter(ip->i_btrace,
773 (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
774 (void *)func, (void *)s, (void *)ip, (void *)cur,
775 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
776 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
777 (void *)a8, (void *)a9, (void *)a10);
778} 771}
779 772
780STATIC void 773STATIC void
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 5549d495947f..0e66c4ea0f85 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -46,20 +46,12 @@ typedef struct xfs_bmdr_block {
46#define BMBT_STARTBLOCK_BITLEN 52 46#define BMBT_STARTBLOCK_BITLEN 52
47#define BMBT_BLOCKCOUNT_BITLEN 21 47#define BMBT_BLOCKCOUNT_BITLEN 21
48 48
49 49typedef struct xfs_bmbt_rec {
50#define BMBT_USE_64 1
51
52typedef struct xfs_bmbt_rec_32
53{
54 __uint32_t l0, l1, l2, l3;
55} xfs_bmbt_rec_32_t;
56typedef struct xfs_bmbt_rec_64
57{
58 __be64 l0, l1; 50 __be64 l0, l1;
59} xfs_bmbt_rec_64_t; 51} xfs_bmbt_rec_t;
60 52
61typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ 53typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */
62typedef xfs_bmbt_rec_64_t xfs_bmbt_rec_t, xfs_bmdr_rec_t; 54typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
63 55
64typedef struct xfs_bmbt_rec_host { 56typedef struct xfs_bmbt_rec_host {
65 __uint64_t l0, l1; 57 __uint64_t l0, l1;
@@ -231,7 +223,6 @@ extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v);
231extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v); 223extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v);
232extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v); 224extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v);
233 225
234extern void xfs_bmbt_disk_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
235extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o, 226extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
236 xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); 227 xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
237 228
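
With the unused 32-bit variant gone, xfs_bmbt_rec is just two big-endian 64-bit words packing the fields whose widths are declared above (1 extent-flag bit + 54-bit startoff + 52-bit startblock + 21-bit blockcount). A sketch of how two of those fields unpack from a host-endian copy; the bit positions follow the BMBT_*_BITLEN constants, while the struct and helper names are invented for illustration:

	#include <stdint.h>

	#define BMBT_EXNTFLAG_BITLEN	1
	#define BMBT_STARTOFF_BITLEN	54
	#define BMBT_BLOCKCOUNT_BITLEN	21

	/* Host-endian view of one record, as in struct xfs_bmbt_rec_host. */
	struct bmbt_rec_host {
		uint64_t l0, l1;
	};

	/* blockcount occupies the low 21 bits of l1 */
	static uint64_t rec_blockcount(const struct bmbt_rec_host *r)
	{
		return r->l1 & ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1);
	}

	/* startoff sits in l0 below the extent flag, above 9 startblock bits */
	static uint64_t rec_startoff(const struct bmbt_rec_host *r)
	{
		return (r->l0 >> 9) & ((1ULL << BMBT_STARTOFF_BITLEN) - 1);
	}
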
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 52b5f14d0c32..96be4b0f2496 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -39,6 +39,7 @@
39#include "xfs_btree_trace.h" 39#include "xfs_btree_trace.h"
40#include "xfs_ialloc.h" 40#include "xfs_ialloc.h"
41#include "xfs_error.h" 41#include "xfs_error.h"
42#include "xfs_trace.h"
42 43
43/* 44/*
44 * Cursor allocation zone. 45 * Cursor allocation zone.
@@ -81,7 +82,7 @@ xfs_btree_check_lblock(
81 XFS_ERRTAG_BTREE_CHECK_LBLOCK, 82 XFS_ERRTAG_BTREE_CHECK_LBLOCK,
82 XFS_RANDOM_BTREE_CHECK_LBLOCK))) { 83 XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
83 if (bp) 84 if (bp)
84 xfs_buftrace("LBTREE ERROR", bp); 85 trace_xfs_btree_corrupt(bp, _RET_IP_);
85 XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW, 86 XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW,
86 mp); 87 mp);
87 return XFS_ERROR(EFSCORRUPTED); 88 return XFS_ERROR(EFSCORRUPTED);
@@ -119,7 +120,7 @@ xfs_btree_check_sblock(
119 XFS_ERRTAG_BTREE_CHECK_SBLOCK, 120 XFS_ERRTAG_BTREE_CHECK_SBLOCK,
120 XFS_RANDOM_BTREE_CHECK_SBLOCK))) { 121 XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
121 if (bp) 122 if (bp)
122 xfs_buftrace("SBTREE ERROR", bp); 123 trace_xfs_btree_corrupt(bp, _RET_IP_);
123 XFS_CORRUPTION_ERROR("xfs_btree_check_sblock", 124 XFS_CORRUPTION_ERROR("xfs_btree_check_sblock",
124 XFS_ERRLEVEL_LOW, cur->bc_mp, block); 125 XFS_ERRLEVEL_LOW, cur->bc_mp, block);
125 return XFS_ERROR(EFSCORRUPTED); 126 return XFS_ERROR(EFSCORRUPTED);
@@ -976,7 +977,7 @@ xfs_btree_get_buf_block(
976 xfs_daddr_t d; 977 xfs_daddr_t d;
977 978
978 /* need to sort out how callers deal with failures first */ 979 /* need to sort out how callers deal with failures first */
979 ASSERT(!(flags & XFS_BUF_TRYLOCK)); 980 ASSERT(!(flags & XBF_TRYLOCK));
980 981
981 d = xfs_btree_ptr_to_daddr(cur, ptr); 982 d = xfs_btree_ptr_to_daddr(cur, ptr);
982 *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, 983 *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
@@ -1007,7 +1008,7 @@ xfs_btree_read_buf_block(
1007 int error; 1008 int error;
1008 1009
1009 /* need to sort out how callers deal with failures first */ 1010 /* need to sort out how callers deal with failures first */
1010 ASSERT(!(flags & XFS_BUF_TRYLOCK)); 1011 ASSERT(!(flags & XBF_TRYLOCK));
1011 1012
1012 d = xfs_btree_ptr_to_daddr(cur, ptr); 1013 d = xfs_btree_ptr_to_daddr(cur, ptr);
1013 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, 1014 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
diff --git a/fs/xfs/xfs_btree_trace.h b/fs/xfs/xfs_btree_trace.h
index b3f5eb3c3c6c..2d8a309873ea 100644
--- a/fs/xfs/xfs_btree_trace.h
+++ b/fs/xfs/xfs_btree_trace.h
@@ -58,8 +58,6 @@ void xfs_btree_trace_argbi(const char *, struct xfs_btree_cur *,
58 struct xfs_buf *, int, int); 58 struct xfs_buf *, int, int);
59void xfs_btree_trace_argbii(const char *, struct xfs_btree_cur *, 59void xfs_btree_trace_argbii(const char *, struct xfs_btree_cur *,
60 struct xfs_buf *, int, int, int); 60 struct xfs_buf *, int, int, int);
61void xfs_btree_trace_argfffi(const char *, struct xfs_btree_cur *,
62 xfs_dfiloff_t, xfs_dfsbno_t, xfs_dfilblks_t, int, int);
63void xfs_btree_trace_argi(const char *, struct xfs_btree_cur *, int, int); 61void xfs_btree_trace_argi(const char *, struct xfs_btree_cur *, int, int);
64void xfs_btree_trace_argipk(const char *, struct xfs_btree_cur *, int, 62void xfs_btree_trace_argipk(const char *, struct xfs_btree_cur *, int,
65 union xfs_btree_ptr, union xfs_btree_key *, int); 63 union xfs_btree_ptr, union xfs_btree_key *, int);
@@ -71,24 +69,10 @@ void xfs_btree_trace_argr(const char *, struct xfs_btree_cur *,
71 union xfs_btree_rec *, int); 69 union xfs_btree_rec *, int);
72void xfs_btree_trace_cursor(const char *, struct xfs_btree_cur *, int, int); 70void xfs_btree_trace_cursor(const char *, struct xfs_btree_cur *, int, int);
73 71
74
75#define XFS_ALLOCBT_TRACE_SIZE 4096 /* size of global trace buffer */
76extern ktrace_t *xfs_allocbt_trace_buf;
77
78#define XFS_INOBT_TRACE_SIZE 4096 /* size of global trace buffer */
79extern ktrace_t *xfs_inobt_trace_buf;
80
81#define XFS_BMBT_TRACE_SIZE 4096 /* size of global trace buffer */
82#define XFS_BMBT_KTRACE_SIZE 32 /* size of per-inode trace buffer */
83extern ktrace_t *xfs_bmbt_trace_buf;
84
85
86#define XFS_BTREE_TRACE_ARGBI(c, b, i) \ 72#define XFS_BTREE_TRACE_ARGBI(c, b, i) \
87 xfs_btree_trace_argbi(__func__, c, b, i, __LINE__) 73 xfs_btree_trace_argbi(__func__, c, b, i, __LINE__)
88#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) \ 74#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) \
89 xfs_btree_trace_argbii(__func__, c, b, i, j, __LINE__) 75 xfs_btree_trace_argbii(__func__, c, b, i, j, __LINE__)
90#define XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j) \
91 xfs_btree_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
92#define XFS_BTREE_TRACE_ARGI(c, i) \ 76#define XFS_BTREE_TRACE_ARGI(c, i) \
93 xfs_btree_trace_argi(__func__, c, i, __LINE__) 77 xfs_btree_trace_argi(__func__, c, i, __LINE__)
94#define XFS_BTREE_TRACE_ARGIPK(c, i, p, k) \ 78#define XFS_BTREE_TRACE_ARGIPK(c, i, p, k) \
@@ -104,7 +88,6 @@ extern ktrace_t *xfs_bmbt_trace_buf;
104#else 88#else
105#define XFS_BTREE_TRACE_ARGBI(c, b, i) 89#define XFS_BTREE_TRACE_ARGBI(c, b, i)
106#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) 90#define XFS_BTREE_TRACE_ARGBII(c, b, i, j)
107#define XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j)
108#define XFS_BTREE_TRACE_ARGI(c, i) 91#define XFS_BTREE_TRACE_ARGI(c, i)
109#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s) 92#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s)
110#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) 93#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 92af4098c7e8..f3c49e69eab9 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -29,6 +29,7 @@
29#include "xfs_buf_item.h" 29#include "xfs_buf_item.h"
30#include "xfs_trans_priv.h" 30#include "xfs_trans_priv.h"
31#include "xfs_error.h" 31#include "xfs_error.h"
32#include "xfs_trace.h"
32 33
33 34
34kmem_zone_t *xfs_buf_item_zone; 35kmem_zone_t *xfs_buf_item_zone;
@@ -164,7 +165,7 @@ xfs_buf_item_size(
164 * is the buf log format structure with the 165 * is the buf log format structure with the
165 * cancel flag in it. 166 * cancel flag in it.
166 */ 167 */
167 xfs_buf_item_trace("SIZE STALE", bip); 168 trace_xfs_buf_item_size_stale(bip);
168 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 169 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
169 return 1; 170 return 1;
170 } 171 }
@@ -206,7 +207,7 @@ xfs_buf_item_size(
206 } 207 }
207 } 208 }
208 209
209 xfs_buf_item_trace("SIZE NORM", bip); 210 trace_xfs_buf_item_size(bip);
210 return nvecs; 211 return nvecs;
211} 212}
212 213
@@ -249,7 +250,7 @@ xfs_buf_item_format(
249 ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); 250 ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
250 vecp->i_addr = (xfs_caddr_t)&bip->bli_format; 251 vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
251 vecp->i_len = base_size; 252 vecp->i_len = base_size;
252 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BFORMAT); 253 vecp->i_type = XLOG_REG_TYPE_BFORMAT;
253 vecp++; 254 vecp++;
254 nvecs = 1; 255 nvecs = 1;
255 256
@@ -259,7 +260,7 @@ xfs_buf_item_format(
259 * is the buf log format structure with the 260 * is the buf log format structure with the
260 * cancel flag in it. 261 * cancel flag in it.
261 */ 262 */
262 xfs_buf_item_trace("FORMAT STALE", bip); 263 trace_xfs_buf_item_format_stale(bip);
263 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 264 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
264 bip->bli_format.blf_size = nvecs; 265 bip->bli_format.blf_size = nvecs;
265 return; 266 return;
@@ -296,14 +297,14 @@ xfs_buf_item_format(
296 buffer_offset = first_bit * XFS_BLI_CHUNK; 297 buffer_offset = first_bit * XFS_BLI_CHUNK;
297 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 298 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
298 vecp->i_len = nbits * XFS_BLI_CHUNK; 299 vecp->i_len = nbits * XFS_BLI_CHUNK;
299 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); 300 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
300 nvecs++; 301 nvecs++;
301 break; 302 break;
302 } else if (next_bit != last_bit + 1) { 303 } else if (next_bit != last_bit + 1) {
303 buffer_offset = first_bit * XFS_BLI_CHUNK; 304 buffer_offset = first_bit * XFS_BLI_CHUNK;
304 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 305 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
305 vecp->i_len = nbits * XFS_BLI_CHUNK; 306 vecp->i_len = nbits * XFS_BLI_CHUNK;
306 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); 307 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
307 nvecs++; 308 nvecs++;
308 vecp++; 309 vecp++;
309 first_bit = next_bit; 310 first_bit = next_bit;
@@ -315,7 +316,7 @@ xfs_buf_item_format(
315 buffer_offset = first_bit * XFS_BLI_CHUNK; 316 buffer_offset = first_bit * XFS_BLI_CHUNK;
316 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 317 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
317 vecp->i_len = nbits * XFS_BLI_CHUNK; 318 vecp->i_len = nbits * XFS_BLI_CHUNK;
318 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); 319 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
 319/* You would think we need to bump the nvecs here too, but we do not; 320/* You would think we need to bump the nvecs here too, but we do not;
320 * this number is used by recovery, and it gets confused by the boundary 321 * this number is used by recovery, and it gets confused by the boundary
321 * split here 322 * split here
@@ -335,7 +336,7 @@ xfs_buf_item_format(
335 /* 336 /*
336 * Check to make sure everything is consistent. 337 * Check to make sure everything is consistent.
337 */ 338 */
338 xfs_buf_item_trace("FORMAT NORM", bip); 339 trace_xfs_buf_item_format(bip);
339 xfs_buf_item_log_check(bip); 340 xfs_buf_item_log_check(bip);
340} 341}
341 342
@@ -355,8 +356,7 @@ xfs_buf_item_pin(
355 ASSERT(atomic_read(&bip->bli_refcount) > 0); 356 ASSERT(atomic_read(&bip->bli_refcount) > 0);
356 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 357 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
357 (bip->bli_flags & XFS_BLI_STALE)); 358 (bip->bli_flags & XFS_BLI_STALE));
358 xfs_buf_item_trace("PIN", bip); 359 trace_xfs_buf_item_pin(bip);
359 xfs_buftrace("XFS_PIN", bp);
360 xfs_bpin(bp); 360 xfs_bpin(bp);
361} 361}
362 362
@@ -383,8 +383,7 @@ xfs_buf_item_unpin(
383 ASSERT(bp != NULL); 383 ASSERT(bp != NULL);
384 ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip); 384 ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip);
385 ASSERT(atomic_read(&bip->bli_refcount) > 0); 385 ASSERT(atomic_read(&bip->bli_refcount) > 0);
386 xfs_buf_item_trace("UNPIN", bip); 386 trace_xfs_buf_item_unpin(bip);
387 xfs_buftrace("XFS_UNPIN", bp);
388 387
389 freed = atomic_dec_and_test(&bip->bli_refcount); 388 freed = atomic_dec_and_test(&bip->bli_refcount);
390 ailp = bip->bli_item.li_ailp; 389 ailp = bip->bli_item.li_ailp;
@@ -395,8 +394,8 @@ xfs_buf_item_unpin(
395 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 394 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
396 ASSERT(XFS_BUF_ISSTALE(bp)); 395 ASSERT(XFS_BUF_ISSTALE(bp));
397 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 396 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
398 xfs_buf_item_trace("UNPIN STALE", bip); 397 trace_xfs_buf_item_unpin_stale(bip);
399 xfs_buftrace("XFS_UNPIN STALE", bp); 398
400 /* 399 /*
401 * If we get called here because of an IO error, we may 400 * If we get called here because of an IO error, we may
402 * or may not have the item on the AIL. xfs_trans_ail_delete() 401 * or may not have the item on the AIL. xfs_trans_ail_delete()
@@ -440,8 +439,8 @@ xfs_buf_item_unpin_remove(
440 if ((atomic_read(&bip->bli_refcount) == 1) && 439 if ((atomic_read(&bip->bli_refcount) == 1) &&
441 (bip->bli_flags & XFS_BLI_STALE)) { 440 (bip->bli_flags & XFS_BLI_STALE)) {
442 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0); 441 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
443 xfs_buf_item_trace("UNPIN REMOVE", bip); 442 trace_xfs_buf_item_unpin_stale(bip);
444 xfs_buftrace("XFS_UNPIN_REMOVE", bp); 443
445 /* 444 /*
446 * yes -- clear the xaction descriptor in-use flag 445 * yes -- clear the xaction descriptor in-use flag
447 * and free the chunk if required. We can safely 446 * and free the chunk if required. We can safely
@@ -468,8 +467,10 @@ xfs_buf_item_unpin_remove(
468/* 467/*
469 * This is called to attempt to lock the buffer associated with this 468 * This is called to attempt to lock the buffer associated with this
470 * buf log item. Don't sleep on the buffer lock. If we can't get 469 * buf log item. Don't sleep on the buffer lock. If we can't get
471 * the lock right away, return 0. If we can get the lock, pull the 470 * the lock right away, return 0. If we can get the lock, take a
472 * buffer from the free list, mark it busy, and return 1. 471 * reference to the buffer. If this is a delayed write buffer that
472 * needs AIL help to be written back, invoke the pushbuf routine
473 * rather than the normal success path.
473 */ 474 */
474STATIC uint 475STATIC uint
475xfs_buf_item_trylock( 476xfs_buf_item_trylock(
@@ -478,24 +479,18 @@ xfs_buf_item_trylock(
478 xfs_buf_t *bp; 479 xfs_buf_t *bp;
479 480
480 bp = bip->bli_buf; 481 bp = bip->bli_buf;
481 482 if (XFS_BUF_ISPINNED(bp))
482 if (XFS_BUF_ISPINNED(bp)) {
483 return XFS_ITEM_PINNED; 483 return XFS_ITEM_PINNED;
484 } 484 if (!XFS_BUF_CPSEMA(bp))
485
486 if (!XFS_BUF_CPSEMA(bp)) {
487 return XFS_ITEM_LOCKED; 485 return XFS_ITEM_LOCKED;
488 }
489 486
490 /* 487 /* take a reference to the buffer. */
491 * Remove the buffer from the free list. Only do this
492 * if it's on the free list. Private buffers like the
493 * superblock buffer are not.
494 */
495 XFS_BUF_HOLD(bp); 488 XFS_BUF_HOLD(bp);
496 489
497 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 490 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
498 xfs_buf_item_trace("TRYLOCK SUCCESS", bip); 491 trace_xfs_buf_item_trylock(bip);
492 if (XFS_BUF_ISDELAYWRITE(bp))
493 return XFS_ITEM_PUSHBUF;
499 return XFS_ITEM_SUCCESS; 494 return XFS_ITEM_SUCCESS;
500} 495}
501 496
@@ -524,7 +519,6 @@ xfs_buf_item_unlock(
524 uint hold; 519 uint hold;
525 520
526 bp = bip->bli_buf; 521 bp = bip->bli_buf;
527 xfs_buftrace("XFS_UNLOCK", bp);
528 522
529 /* 523 /*
530 * Clear the buffer's association with this transaction. 524 * Clear the buffer's association with this transaction.
@@ -547,7 +541,7 @@ xfs_buf_item_unlock(
547 */ 541 */
548 if (bip->bli_flags & XFS_BLI_STALE) { 542 if (bip->bli_flags & XFS_BLI_STALE) {
549 bip->bli_flags &= ~XFS_BLI_LOGGED; 543 bip->bli_flags &= ~XFS_BLI_LOGGED;
550 xfs_buf_item_trace("UNLOCK STALE", bip); 544 trace_xfs_buf_item_unlock_stale(bip);
551 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 545 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
552 if (!aborted) 546 if (!aborted)
553 return; 547 return;
@@ -574,7 +568,7 @@ xfs_buf_item_unlock(
574 * release the buffer at the end of this routine. 568 * release the buffer at the end of this routine.
575 */ 569 */
576 hold = bip->bli_flags & XFS_BLI_HOLD; 570 hold = bip->bli_flags & XFS_BLI_HOLD;
577 xfs_buf_item_trace("UNLOCK", bip); 571 trace_xfs_buf_item_unlock(bip);
578 572
579 /* 573 /*
580 * If the buf item isn't tracking any data, free it. 574 * If the buf item isn't tracking any data, free it.
@@ -618,7 +612,8 @@ xfs_buf_item_committed(
618 xfs_buf_log_item_t *bip, 612 xfs_buf_log_item_t *bip,
619 xfs_lsn_t lsn) 613 xfs_lsn_t lsn)
620{ 614{
621 xfs_buf_item_trace("COMMITTED", bip); 615 trace_xfs_buf_item_committed(bip);
616
622 if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && 617 if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
623 (bip->bli_item.li_lsn != 0)) { 618 (bip->bli_item.li_lsn != 0)) {
624 return bip->bli_item.li_lsn; 619 return bip->bli_item.li_lsn;
@@ -627,11 +622,9 @@ xfs_buf_item_committed(
627} 622}
628 623
629/* 624/*
630 * This is called to asynchronously write the buffer associated with this 625 * The buffer is locked, but is not a delayed write buffer. This happens
631 * buf log item out to disk. The buffer will already have been locked by 626 * if we race with IO completion and hence we don't want to try to write it
632 * a successful call to xfs_buf_item_trylock(). If the buffer still has 627 * again. Just release the buffer.
633 * B_DELWRI set, then get it going out to disk with a call to bawrite().
634 * If not, then just release the buffer.
635 */ 628 */
636STATIC void 629STATIC void
637xfs_buf_item_push( 630xfs_buf_item_push(
@@ -640,20 +633,32 @@ xfs_buf_item_push(
640 xfs_buf_t *bp; 633 xfs_buf_t *bp;
641 634
642 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 635 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
643 xfs_buf_item_trace("PUSH", bip); 636 trace_xfs_buf_item_push(bip);
644 637
645 bp = bip->bli_buf; 638 bp = bip->bli_buf;
639 ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
640 xfs_buf_relse(bp);
641}
646 642
647 if (XFS_BUF_ISDELAYWRITE(bp)) { 643/*
648 int error; 644 * The buffer is locked and is a delayed write buffer. Promote the buffer
649 error = xfs_bawrite(bip->bli_item.li_mountp, bp); 645 * in the delayed write queue as the caller knows that they must invoke
650 if (error) 646 * the xfsbufd to get this buffer written. We have to unlock the buffer
651 xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp, 647 * to allow the xfsbufd to write it, too.
652 "xfs_buf_item_push: pushbuf error %d on bip %p, bp %p", 648 */
653 error, bip, bp); 649STATIC void
654 } else { 650xfs_buf_item_pushbuf(
655 xfs_buf_relse(bp); 651 xfs_buf_log_item_t *bip)
656 } 652{
653 xfs_buf_t *bp;
654
655 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
656 trace_xfs_buf_item_pushbuf(bip);
657
658 bp = bip->bli_buf;
659 ASSERT(XFS_BUF_ISDELAYWRITE(bp));
660 xfs_buf_delwri_promote(bp);
661 xfs_buf_relse(bp);
657} 662}
658 663
659/* ARGSUSED */ 664/* ARGSUSED */
@@ -678,7 +683,7 @@ static struct xfs_item_ops xfs_buf_item_ops = {
678 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) 683 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
679 xfs_buf_item_committed, 684 xfs_buf_item_committed,
680 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push, 685 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push,
681 .iop_pushbuf = NULL, 686 .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_buf_item_pushbuf,
682 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) 687 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
683 xfs_buf_item_committing 688 xfs_buf_item_committing
684}; 689};
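With .iop_pushbuf now wired into the ops table, the AIL pusher gains a third productive outcome from trylock. A hedged sketch of the caller-side dispatch (the real xfsaild loop carries more bookkeeping; IOP_TRYLOCK/IOP_PUSH/IOP_PUSHBUF are assumed to be the usual indirection macros over these ops):

	switch (IOP_TRYLOCK(lip)) {
	case XFS_ITEM_SUCCESS:
		IOP_PUSH(lip);		/* locked and clean: write it back */
		break;
	case XFS_ITEM_PUSHBUF:
		IOP_PUSHBUF(lip);	/* locked delwri buffer: promote it */
		break;
	case XFS_ITEM_PINNED:
		/* pinned by the log: force the log and retry later */
		break;
	case XFS_ITEM_LOCKED:
		/* lock held elsewhere: skip this item for now */
		break;
	}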
@@ -738,9 +743,6 @@ xfs_buf_item_init(
738 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); 743 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
739 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); 744 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
740 bip->bli_format.blf_map_size = map_size; 745 bip->bli_format.blf_map_size = map_size;
741#ifdef XFS_BLI_TRACE
742 bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_NOFS);
743#endif
744 746
745#ifdef XFS_TRANS_DEBUG 747#ifdef XFS_TRANS_DEBUG
746 /* 748 /*
@@ -878,9 +880,6 @@ xfs_buf_item_free(
878 kmem_free(bip->bli_logged); 880 kmem_free(bip->bli_logged);
879#endif /* XFS_TRANS_DEBUG */ 881#endif /* XFS_TRANS_DEBUG */
880 882
881#ifdef XFS_BLI_TRACE
882 ktrace_free(bip->bli_trace);
883#endif
884 kmem_zone_free(xfs_buf_item_zone, bip); 883 kmem_zone_free(xfs_buf_item_zone, bip);
885} 884}
886 885
@@ -897,7 +896,8 @@ xfs_buf_item_relse(
897{ 896{
898 xfs_buf_log_item_t *bip; 897 xfs_buf_log_item_t *bip;
899 898
900 xfs_buftrace("XFS_RELSE", bp); 899 trace_xfs_buf_item_relse(bp, _RET_IP_);
900
901 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); 901 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
902 XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list); 902 XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list);
903 if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) && 903 if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) &&
@@ -994,7 +994,7 @@ xfs_buf_iodone_callbacks(
994 if (XFS_FORCED_SHUTDOWN(mp)) { 994 if (XFS_FORCED_SHUTDOWN(mp)) {
995 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); 995 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
996 XFS_BUF_SUPER_STALE(bp); 996 XFS_BUF_SUPER_STALE(bp);
997 xfs_buftrace("BUF_IODONE_CB", bp); 997 trace_xfs_buf_item_iodone(bp, _RET_IP_);
998 xfs_buf_do_callbacks(bp, lip); 998 xfs_buf_do_callbacks(bp, lip);
999 XFS_BUF_SET_FSPRIVATE(bp, NULL); 999 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1000 XFS_BUF_CLR_IODONE_FUNC(bp); 1000 XFS_BUF_CLR_IODONE_FUNC(bp);
@@ -1030,7 +1030,7 @@ xfs_buf_iodone_callbacks(
1030 XFS_BUF_SET_START(bp); 1030 XFS_BUF_SET_START(bp);
1031 } 1031 }
1032 ASSERT(XFS_BUF_IODONE_FUNC(bp)); 1032 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1033 xfs_buftrace("BUF_IODONE ASYNC", bp); 1033 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1034 xfs_buf_relse(bp); 1034 xfs_buf_relse(bp);
1035 } else { 1035 } else {
1036 /* 1036 /*
@@ -1053,9 +1053,7 @@ xfs_buf_iodone_callbacks(
1053 } 1053 }
1054 return; 1054 return;
1055 } 1055 }
1056#ifdef XFSERRORDEBUG 1056
1057 xfs_buftrace("XFS BUFCB NOERR", bp);
1058#endif
1059 xfs_buf_do_callbacks(bp, lip); 1057 xfs_buf_do_callbacks(bp, lip);
1060 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1058 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1061 XFS_BUF_CLR_IODONE_FUNC(bp); 1059 XFS_BUF_CLR_IODONE_FUNC(bp);
@@ -1081,7 +1079,9 @@ xfs_buf_error_relse(
1081 XFS_BUF_DONE(bp); 1079 XFS_BUF_DONE(bp);
1082 XFS_BUF_UNDELAYWRITE(bp); 1080 XFS_BUF_UNDELAYWRITE(bp);
1083 XFS_BUF_ERROR(bp,0); 1081 XFS_BUF_ERROR(bp,0);
1084 xfs_buftrace("BUF_ERROR_RELSE", bp); 1082
1083 trace_xfs_buf_error_relse(bp, _RET_IP_);
1084
1085 if (! XFS_FORCED_SHUTDOWN(mp)) 1085 if (! XFS_FORCED_SHUTDOWN(mp))
1086 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1086 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1087 /* 1087 /*
@@ -1128,34 +1128,3 @@ xfs_buf_iodone(
1128 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); 1128 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
1129 xfs_buf_item_free(bip); 1129 xfs_buf_item_free(bip);
1130} 1130}
1131
1132#if defined(XFS_BLI_TRACE)
1133void
1134xfs_buf_item_trace(
1135 char *id,
1136 xfs_buf_log_item_t *bip)
1137{
1138 xfs_buf_t *bp;
1139 ASSERT(bip->bli_trace != NULL);
1140
1141 bp = bip->bli_buf;
1142 ktrace_enter(bip->bli_trace,
1143 (void *)id,
1144 (void *)bip->bli_buf,
1145 (void *)((unsigned long)bip->bli_flags),
1146 (void *)((unsigned long)bip->bli_recur),
1147 (void *)((unsigned long)atomic_read(&bip->bli_refcount)),
1148 (void *)((unsigned long)
1149 (0xFFFFFFFF & XFS_BUF_ADDR(bp) >> 32)),
1150 (void *)((unsigned long)(0xFFFFFFFF & XFS_BUF_ADDR(bp))),
1151 (void *)((unsigned long)XFS_BUF_COUNT(bp)),
1152 (void *)((unsigned long)XFS_BUF_BFLAGS(bp)),
1153 XFS_BUF_FSPRIVATE(bp, void *),
1154 XFS_BUF_FSPRIVATE2(bp, void *),
1155 (void *)(unsigned long)XFS_BUF_ISPINNED(bp),
1156 (void *)XFS_BUF_IODONE_FUNC(bp),
1157 (void *)((unsigned long)(XFS_BUF_VALUSEMA(bp))),
1158 (void *)bip->bli_item.li_desc,
1159 (void *)((unsigned long)bip->bli_item.li_flags));
1160}
1161#endif /* XFS_BLI_TRACE */
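The ktrace machinery deleted above is superseded by static tracepoints declared in the new fs/xfs/xfs_trace.h. A hedged sketch of what such a declaration can look like -- the real header shares one event class across the xfs_buf_item_* events and its exact field list may differ; the fields and the XFS_BLI_FLAGS print table used here are the ones visible elsewhere in this diff:

	DECLARE_EVENT_CLASS(xfs_buf_item_class,
		TP_PROTO(struct xfs_buf_log_item *bip),
		TP_ARGS(bip),
		TP_STRUCT__entry(
			__field(unsigned, bli_flags)
			__field(unsigned, bli_recur)
			__field(int, bli_refcount)
		),
		TP_fast_assign(
			__entry->bli_flags = bip->bli_flags;
			__entry->bli_recur = bip->bli_recur;
			__entry->bli_refcount = atomic_read(&bip->bli_refcount);
		),
		TP_printk("flags %s recur %u refcount %d",
			  __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS),
			  __entry->bli_recur, __entry->bli_refcount)
	);
	DEFINE_EVENT(xfs_buf_item_class, xfs_buf_item_pin,
		TP_PROTO(struct xfs_buf_log_item *bip),
		TP_ARGS(bip));

Unlike the always-on ktrace ring buffers, these compile down to no-ops until enabled at runtime through debugfs (e.g. echo 1 > /sys/kernel/debug/tracing/events/xfs/xfs_buf_item_pin/enable).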
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 5a41c348bb1c..217f34af00cb 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -70,22 +70,21 @@ typedef struct xfs_buf_log_format_t {
70#define XFS_BLI_INODE_ALLOC_BUF 0x10 70#define XFS_BLI_INODE_ALLOC_BUF 0x10
71#define XFS_BLI_STALE_INODE 0x20 71#define XFS_BLI_STALE_INODE 0x20
72 72
73#define XFS_BLI_FLAGS \
74 { XFS_BLI_HOLD, "HOLD" }, \
75 { XFS_BLI_DIRTY, "DIRTY" }, \
76 { XFS_BLI_STALE, "STALE" }, \
77 { XFS_BLI_LOGGED, "LOGGED" }, \
78 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
79 { XFS_BLI_STALE_INODE, "STALE_INODE" }
80
73 81
74#ifdef __KERNEL__ 82#ifdef __KERNEL__
75 83
76struct xfs_buf; 84struct xfs_buf;
77struct ktrace;
78struct xfs_mount; 85struct xfs_mount;
79struct xfs_buf_log_item; 86struct xfs_buf_log_item;
80 87
81#if defined(XFS_BLI_TRACE)
82#define XFS_BLI_TRACE_SIZE 32
83
84void xfs_buf_item_trace(char *, struct xfs_buf_log_item *);
85#else
86#define xfs_buf_item_trace(id, bip)
87#endif
88
89/* 88/*
90 * This is the in core log item structure used to track information 89 * This is the in core log item structure used to track information
91 * needed to log buffers. It tracks how many times the lock has been 90 * needed to log buffers. It tracks how many times the lock has been
@@ -97,9 +96,6 @@ typedef struct xfs_buf_log_item {
97 unsigned int bli_flags; /* misc flags */ 96 unsigned int bli_flags; /* misc flags */
98 unsigned int bli_recur; /* lock recursion count */ 97 unsigned int bli_recur; /* lock recursion count */
99 atomic_t bli_refcount; /* cnt of tp refs */ 98 atomic_t bli_refcount; /* cnt of tp refs */
100#ifdef XFS_BLI_TRACE
101 struct ktrace *bli_trace; /* event trace buf */
102#endif
103#ifdef XFS_TRANS_DEBUG 99#ifdef XFS_TRANS_DEBUG
104 char *bli_orig; /* original buffer copy */ 100 char *bli_orig; /* original buffer copy */
105 char *bli_logged; /* bytes logged (bitmap) */ 101 char *bli_logged; /* bytes logged (bitmap) */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 2847bbc1c534..0ca556b4bf31 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -46,6 +46,7 @@
46#include "xfs_dir2_block.h" 46#include "xfs_dir2_block.h"
47#include "xfs_dir2_node.h" 47#include "xfs_dir2_node.h"
48#include "xfs_error.h" 48#include "xfs_error.h"
49#include "xfs_trace.h"
49 50
50/* 51/*
51 * xfs_da_btree.c 52 * xfs_da_btree.c
@@ -1533,8 +1534,8 @@ xfs_da_hashname(const __uint8_t *name, int namelen)
1533enum xfs_dacmp 1534enum xfs_dacmp
1534xfs_da_compname( 1535xfs_da_compname(
1535 struct xfs_da_args *args, 1536 struct xfs_da_args *args,
1536 const char *name, 1537 const unsigned char *name,
1537 int len) 1538 int len)
1538{ 1539{
1539 return (args->namelen == len && memcmp(args->name, name, len) == 0) ? 1540 return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
1540 XFS_CMP_EXACT : XFS_CMP_DIFFERENT; 1541 XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
@@ -2107,7 +2108,7 @@ xfs_da_do_buf(
2107 (be32_to_cpu(free->hdr.magic) != XFS_DIR2_FREE_MAGIC), 2108 (be32_to_cpu(free->hdr.magic) != XFS_DIR2_FREE_MAGIC),
2108 mp, XFS_ERRTAG_DA_READ_BUF, 2109 mp, XFS_ERRTAG_DA_READ_BUF,
2109 XFS_RANDOM_DA_READ_BUF))) { 2110 XFS_RANDOM_DA_READ_BUF))) {
2110 xfs_buftrace("DA READ ERROR", rbp->bps[0]); 2111 trace_xfs_da_btree_corrupt(rbp->bps[0], _RET_IP_);
2111 XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)", 2112 XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)",
2112 XFS_ERRLEVEL_LOW, mp, info); 2113 XFS_ERRLEVEL_LOW, mp, info);
2113 error = XFS_ERROR(EFSCORRUPTED); 2114 error = XFS_ERROR(EFSCORRUPTED);
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 8c536167bf75..fe9f5a8c1d2a 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -125,6 +125,13 @@ typedef struct xfs_da_args {
125#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ 125#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */
126#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ 126#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */
127 127
128#define XFS_DA_OP_FLAGS \
129 { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
130 { XFS_DA_OP_RENAME, "RENAME" }, \
131 { XFS_DA_OP_ADDNAME, "ADDNAME" }, \
132 { XFS_DA_OP_OKNOENT, "OKNOENT" }, \
133 { XFS_DA_OP_CILOOKUP, "CILOOKUP" }
134
128/* 135/*
129 * Structure to describe buffer(s) for a block. 136 * Structure to describe buffer(s) for a block.
130 * This is needed in the directory version 2 format case, when 137 * This is needed in the directory version 2 format case, when
@@ -202,7 +209,8 @@ typedef struct xfs_da_state {
202 */ 209 */
203struct xfs_nameops { 210struct xfs_nameops {
204 xfs_dahash_t (*hashname)(struct xfs_name *); 211 xfs_dahash_t (*hashname)(struct xfs_name *);
205 enum xfs_dacmp (*compname)(struct xfs_da_args *, const char *, int); 212 enum xfs_dacmp (*compname)(struct xfs_da_args *,
213 const unsigned char *, int);
206}; 214};
207 215
208 216
@@ -253,7 +261,7 @@ int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
253 261
254uint xfs_da_hashname(const __uint8_t *name_string, int name_length); 262uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
255enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, 263enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
256 const char *name, int len); 264 const unsigned char *name, int len);
257 265
258 266
259xfs_da_state_t *xfs_da_state_alloc(void); 267xfs_da_state_t *xfs_da_state_alloc(void);
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index ab89a7e94a0f..cd27c9d6c71f 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -43,16 +43,23 @@
43#include "xfs_error.h" 43#include "xfs_error.h"
44#include "xfs_rw.h" 44#include "xfs_rw.h"
45#include "xfs_vnodeops.h" 45#include "xfs_vnodeops.h"
46#include "xfs_trace.h"
47
48
49static int xfs_swap_extents(
50 xfs_inode_t *ip, /* target inode */
51 xfs_inode_t *tip, /* tmp inode */
52 xfs_swapext_t *sxp);
46 53
47/* 54/*
48 * Syssgi interface for swapext 55 * ioctl interface for swapext
49 */ 56 */
50int 57int
51xfs_swapext( 58xfs_swapext(
52 xfs_swapext_t *sxp) 59 xfs_swapext_t *sxp)
53{ 60{
54 xfs_inode_t *ip, *tip; 61 xfs_inode_t *ip, *tip;
55 struct file *file, *target_file; 62 struct file *file, *tmp_file;
56 int error = 0; 63 int error = 0;
57 64
58 /* Pull information for the target fd */ 65 /* Pull information for the target fd */
@@ -67,56 +74,128 @@ xfs_swapext(
67 goto out_put_file; 74 goto out_put_file;
68 } 75 }
69 76
70 target_file = fget((int)sxp->sx_fdtmp); 77 tmp_file = fget((int)sxp->sx_fdtmp);
71 if (!target_file) { 78 if (!tmp_file) {
72 error = XFS_ERROR(EINVAL); 79 error = XFS_ERROR(EINVAL);
73 goto out_put_file; 80 goto out_put_file;
74 } 81 }
75 82
76 if (!(target_file->f_mode & FMODE_WRITE) || 83 if (!(tmp_file->f_mode & FMODE_WRITE) ||
77 (target_file->f_flags & O_APPEND)) { 84 (tmp_file->f_flags & O_APPEND)) {
78 error = XFS_ERROR(EBADF); 85 error = XFS_ERROR(EBADF);
79 goto out_put_target_file; 86 goto out_put_tmp_file;
80 } 87 }
81 88
82 if (IS_SWAPFILE(file->f_path.dentry->d_inode) || 89 if (IS_SWAPFILE(file->f_path.dentry->d_inode) ||
83 IS_SWAPFILE(target_file->f_path.dentry->d_inode)) { 90 IS_SWAPFILE(tmp_file->f_path.dentry->d_inode)) {
84 error = XFS_ERROR(EINVAL); 91 error = XFS_ERROR(EINVAL);
85 goto out_put_target_file; 92 goto out_put_tmp_file;
86 } 93 }
87 94
88 ip = XFS_I(file->f_path.dentry->d_inode); 95 ip = XFS_I(file->f_path.dentry->d_inode);
89 tip = XFS_I(target_file->f_path.dentry->d_inode); 96 tip = XFS_I(tmp_file->f_path.dentry->d_inode);
90 97
91 if (ip->i_mount != tip->i_mount) { 98 if (ip->i_mount != tip->i_mount) {
92 error = XFS_ERROR(EINVAL); 99 error = XFS_ERROR(EINVAL);
93 goto out_put_target_file; 100 goto out_put_tmp_file;
94 } 101 }
95 102
96 if (ip->i_ino == tip->i_ino) { 103 if (ip->i_ino == tip->i_ino) {
97 error = XFS_ERROR(EINVAL); 104 error = XFS_ERROR(EINVAL);
98 goto out_put_target_file; 105 goto out_put_tmp_file;
99 } 106 }
100 107
101 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 108 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
102 error = XFS_ERROR(EIO); 109 error = XFS_ERROR(EIO);
103 goto out_put_target_file; 110 goto out_put_tmp_file;
104 } 111 }
105 112
106 error = xfs_swap_extents(ip, tip, sxp); 113 error = xfs_swap_extents(ip, tip, sxp);
107 114
108 out_put_target_file: 115 out_put_tmp_file:
109 fput(target_file); 116 fput(tmp_file);
110 out_put_file: 117 out_put_file:
111 fput(file); 118 fput(file);
112 out: 119 out:
113 return error; 120 return error;
114} 121}
115 122
116int 123/*
124 * We need to check that the format of the data fork in the temporary inode is
125 * valid for the target inode before doing the swap. This is not a problem with
126 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
127 * data fork depending on the space the attribute fork is taking, so we can get
128 * invalid formats on the target inode.
129 *
130 * E.g. target has space for 7 extents in extent format, temp inode only has
131 * space for 6. If we defragment down to 7 extents, then the tmp format is a
132 * btree, but when swapped it needs to be in extent format. Hence we can't just
133 * blindly swap data forks on attr2 filesystems.
134 *
135 * Note that we check the swap in both directions so that we don't end up with
136 * a corrupt temporary inode, either.
137 *
138 * Note that fixing the way xfs_fsr sets up the attribute fork in the source
139 * inode will prevent this situation from occurring, so all we do here is
140 * reject and log the attempt. Basically we are putting the responsibility on
141 * userspace to get this right.
142 */
143static int
144xfs_swap_extents_check_format(
145 xfs_inode_t *ip, /* target inode */
146 xfs_inode_t *tip) /* tmp inode */
147{
148
149 /* Should never get a local format */
150 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
151 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
152 return EINVAL;
153
154 /*
155 * if the target inode has fewer extents than the temporary inode then
156 * why did userspace call us?
157 */
158 if (ip->i_d.di_nextents < tip->i_d.di_nextents)
159 return EINVAL;
160
161 /*
162 * if the target inode is in extent form and the temp inode is in btree
163 * form then we will end up with the target inode in the wrong format
164 * as we already know there are fewer extents in the temp inode.
165 */
166 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
167 tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
168 return EINVAL;
169
170 /* Check temp in extent form to max in target */
171 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
172 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
173 return EINVAL;
174
175 /* Check target in extent form to max in temp */
176 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
177 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
178 return EINVAL;
179
180 /* Check root block of temp in btree form to max in target */
181 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
182 XFS_IFORK_BOFF(ip) &&
183 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
184 return EINVAL;
185
186 /* Check root block of target in btree form to max in temp */
187 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
188 XFS_IFORK_BOFF(tip) &&
189 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
190 return EINVAL;
191
192 return 0;
193}
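A worked instance of the asymmetry the block comment above describes (numbers illustrative, assuming the usual 16-byte xfs_bmbt_rec_t): a target whose data fork literal area is 112 bytes has if_ext_max = 112 / 16 = 7 in-core extent records, while a temp inode carrying a larger attribute fork may only have 96 / 16 = 6. Defragmenting a 7-extent file therefore leaves the temp inode's data fork in btree format, and a blind swap would hand the extent-format target a btree fork it cannot hold -- precisely the case the extent-vs-btree check above rejects.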
194
195static int
117xfs_swap_extents( 196xfs_swap_extents(
118 xfs_inode_t *ip, 197 xfs_inode_t *ip, /* target inode */
119 xfs_inode_t *tip, 198 xfs_inode_t *tip, /* tmp inode */
120 xfs_swapext_t *sxp) 199 xfs_swapext_t *sxp)
121{ 200{
122 xfs_mount_t *mp; 201 xfs_mount_t *mp;
@@ -160,15 +239,7 @@ xfs_swap_extents(
160 goto out_unlock; 239 goto out_unlock;
161 } 240 }
162 241
163 /* Should never get a local format */
164 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
165 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
166 error = XFS_ERROR(EINVAL);
167 goto out_unlock;
168 }
169
170 if (VN_CACHED(VFS_I(tip)) != 0) { 242 if (VN_CACHED(VFS_I(tip)) != 0) {
171 xfs_inval_cached_trace(tip, 0, -1, 0, -1);
172 error = xfs_flushinval_pages(tip, 0, -1, 243 error = xfs_flushinval_pages(tip, 0, -1,
173 FI_REMAPF_LOCKED); 244 FI_REMAPF_LOCKED);
174 if (error) 245 if (error)
@@ -189,13 +260,15 @@ xfs_swap_extents(
189 goto out_unlock; 260 goto out_unlock;
190 } 261 }
191 262
192 /* 263 trace_xfs_swap_extent_before(ip, 0);
193 * If the target has extended attributes, the tmp file 264 trace_xfs_swap_extent_before(tip, 1);
194 * must also in order to ensure the correct data fork 265
195 * format. 266 /* check inode formats now that data is flushed */
196 */ 267 error = xfs_swap_extents_check_format(ip, tip);
197 if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) { 268 if (error) {
198 error = XFS_ERROR(EINVAL); 269 xfs_fs_cmn_err(CE_NOTE, mp,
270 "%s: inode 0x%llx format is incompatible for exchanging.",
271 __FILE__, ip->i_ino);
199 goto out_unlock; 272 goto out_unlock;
200 } 273 }
201 274
@@ -276,6 +349,16 @@ xfs_swap_extents(
276 *tifp = *tempifp; /* struct copy */ 349 *tifp = *tempifp; /* struct copy */
277 350
278 /* 351 /*
352 * Fix the in-memory data fork values that are dependent on the fork
353 * offset in the inode. We can't assume they remain the same as attr2
354 * has dynamic fork offsets.
355 */
356 ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
357 (uint)sizeof(xfs_bmbt_rec_t);
358 tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
359 (uint)sizeof(xfs_bmbt_rec_t);
360
361 /*
279 * Fix the on-disk inode values 362 * Fix the on-disk inode values
280 */ 363 */
281 tmp = (__uint64_t)ip->i_d.di_nblocks; 364 tmp = (__uint64_t)ip->i_d.di_nblocks;
@@ -347,6 +430,8 @@ xfs_swap_extents(
347 430
348 error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); 431 error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);
349 432
433 trace_xfs_swap_extent_after(ip, 0);
434 trace_xfs_swap_extent_after(tip, 1);
350out: 435out:
351 kmem_free(tempifp); 436 kmem_free(tempifp);
352 return error; 437 return error;
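Ordering note on the xfs_swap_extents() changes above: the format check runs only after both inodes' cached pages have been flushed, because writing back delayed allocations can convert extents and change di_nextents/di_format. A condensed sketch of the sequence as it now stands:

	/* flush dirty data so extent counts and formats are stable */
	error = xfs_flushinval_pages(tip, 0, -1, FI_REMAPF_LOCKED);
	if (error)
		goto out_unlock;

	/* only now is the compatibility check meaningful */
	error = xfs_swap_extents_check_format(ip, tip);
	if (error)
		goto out_unlock;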
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
index 4f55a6306558..20bdd935c121 100644
--- a/fs/xfs/xfs_dfrag.h
+++ b/fs/xfs/xfs_dfrag.h
@@ -48,9 +48,6 @@ typedef struct xfs_swapext
48 */ 48 */
49int xfs_swapext(struct xfs_swapext *sx); 49int xfs_swapext(struct xfs_swapext *sx);
50 50
51int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
52 struct xfs_swapext *sxp);
53
54#endif /* __KERNEL__ */ 51#endif /* __KERNEL__ */
55 52
56#endif /* __XFS_DFRAG_H__ */ 53#endif /* __XFS_DFRAG_H__ */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index bb1d58eb3982..42520f041265 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -40,11 +40,11 @@
40#include "xfs_dir2_leaf.h" 40#include "xfs_dir2_leaf.h"
41#include "xfs_dir2_block.h" 41#include "xfs_dir2_block.h"
42#include "xfs_dir2_node.h" 42#include "xfs_dir2_node.h"
43#include "xfs_dir2_trace.h"
44#include "xfs_error.h" 43#include "xfs_error.h"
45#include "xfs_vnodeops.h" 44#include "xfs_vnodeops.h"
45#include "xfs_trace.h"
46 46
47struct xfs_name xfs_name_dotdot = {"..", 2}; 47struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2};
48 48
49/* 49/*
50 * ASCII case-insensitive (ie. A-Z) support for directories that was 50 * ASCII case-insensitive (ie. A-Z) support for directories that was
@@ -66,8 +66,8 @@ xfs_ascii_ci_hashname(
66STATIC enum xfs_dacmp 66STATIC enum xfs_dacmp
67xfs_ascii_ci_compname( 67xfs_ascii_ci_compname(
68 struct xfs_da_args *args, 68 struct xfs_da_args *args,
69 const char *name, 69 const unsigned char *name,
70 int len) 70 int len)
71{ 71{
72 enum xfs_dacmp result; 72 enum xfs_dacmp result;
73 int i; 73 int i;
@@ -247,7 +247,7 @@ xfs_dir_createname(
247int 247int
248xfs_dir_cilookup_result( 248xfs_dir_cilookup_result(
249 struct xfs_da_args *args, 249 struct xfs_da_args *args,
250 const char *name, 250 const unsigned char *name,
251 int len) 251 int len)
252{ 252{
253 if (args->cmpresult == XFS_CMP_DIFFERENT) 253 if (args->cmpresult == XFS_CMP_DIFFERENT)
@@ -525,7 +525,8 @@ xfs_dir2_grow_inode(
525 xfs_trans_t *tp; 525 xfs_trans_t *tp;
526 xfs_drfsbno_t nblks; 526 xfs_drfsbno_t nblks;
527 527
528 xfs_dir2_trace_args_s("grow_inode", args, space); 528 trace_xfs_dir2_grow_inode(args, space);
529
529 dp = args->dp; 530 dp = args->dp;
530 tp = args->trans; 531 tp = args->trans;
531 mp = dp->i_mount; 532 mp = dp->i_mount;
@@ -703,7 +704,8 @@ xfs_dir2_shrink_inode(
703 xfs_mount_t *mp; 704 xfs_mount_t *mp;
704 xfs_trans_t *tp; 705 xfs_trans_t *tp;
705 706
706 xfs_dir2_trace_args_db("shrink_inode", args, db, bp); 707 trace_xfs_dir2_shrink_inode(args, db);
708
707 dp = args->dp; 709 dp = args->dp;
708 mp = dp->i_mount; 710 mp = dp->i_mount;
709 tp = args->trans; 711 tp = args->trans;
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 1d9ef96f33aa..74a3b1057685 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -100,7 +100,7 @@ extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp,
100extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, 100extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
101 struct xfs_dabuf *bp); 101 struct xfs_dabuf *bp);
102 102
103extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const char *name, 103extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
104 int len); 104 const unsigned char *name, int len);
105 105
106#endif /* __XFS_DIR2_H__ */ 106#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index ab52e9e1c1ee..779a267b0a84 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -36,8 +36,8 @@
36#include "xfs_dir2_data.h" 36#include "xfs_dir2_data.h"
37#include "xfs_dir2_leaf.h" 37#include "xfs_dir2_leaf.h"
38#include "xfs_dir2_block.h" 38#include "xfs_dir2_block.h"
39#include "xfs_dir2_trace.h"
40#include "xfs_error.h" 39#include "xfs_error.h"
40#include "xfs_trace.h"
41 41
42/* 42/*
43 * Local function prototypes. 43 * Local function prototypes.
@@ -57,8 +57,8 @@ static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
57void 57void
58xfs_dir_startup(void) 58xfs_dir_startup(void)
59{ 59{
60 xfs_dir_hash_dot = xfs_da_hashname(".", 1); 60 xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1);
61 xfs_dir_hash_dotdot = xfs_da_hashname("..", 2); 61 xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
62} 62}
63 63
64/* 64/*
@@ -94,7 +94,8 @@ xfs_dir2_block_addname(
94 __be16 *tagp; /* pointer to tag value */ 94 __be16 *tagp; /* pointer to tag value */
95 xfs_trans_t *tp; /* transaction structure */ 95 xfs_trans_t *tp; /* transaction structure */
96 96
97 xfs_dir2_trace_args("block_addname", args); 97 trace_xfs_dir2_block_addname(args);
98
98 dp = args->dp; 99 dp = args->dp;
99 tp = args->trans; 100 tp = args->trans;
100 mp = dp->i_mount; 101 mp = dp->i_mount;
@@ -512,8 +513,9 @@ xfs_dir2_block_getdents(
512 /* 513 /*
513 * If it didn't fit, set the final offset to here & return. 514 * If it didn't fit, set the final offset to here & return.
514 */ 515 */
515 if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff, 516 if (filldir(dirent, (char *)dep->name, dep->namelen,
516 be64_to_cpu(dep->inumber), DT_UNKNOWN)) { 517 cook & 0x7fffffff, be64_to_cpu(dep->inumber),
518 DT_UNKNOWN)) {
517 *offset = cook & 0x7fffffff; 519 *offset = cook & 0x7fffffff;
518 xfs_da_brelse(NULL, bp); 520 xfs_da_brelse(NULL, bp);
519 return 0; 521 return 0;
@@ -590,7 +592,8 @@ xfs_dir2_block_lookup(
590 int error; /* error return value */ 592 int error; /* error return value */
591 xfs_mount_t *mp; /* filesystem mount point */ 593 xfs_mount_t *mp; /* filesystem mount point */
592 594
593 xfs_dir2_trace_args("block_lookup", args); 595 trace_xfs_dir2_block_lookup(args);
596
594 /* 597 /*
595 * Get the buffer, look up the entry. 598 * Get the buffer, look up the entry.
596 * If not found (ENOENT) then return, have no buffer. 599 * If not found (ENOENT) then return, have no buffer.
@@ -747,7 +750,8 @@ xfs_dir2_block_removename(
747 int size; /* shortform size */ 750 int size; /* shortform size */
748 xfs_trans_t *tp; /* transaction pointer */ 751 xfs_trans_t *tp; /* transaction pointer */
749 752
750 xfs_dir2_trace_args("block_removename", args); 753 trace_xfs_dir2_block_removename(args);
754
751 /* 755 /*
752 * Look up the entry in the block. Gets the buffer and entry index. 756 * Look up the entry in the block. Gets the buffer and entry index.
753 * It will always be there, the vnodeops level does a lookup first. 757 * It will always be there, the vnodeops level does a lookup first.
@@ -823,7 +827,8 @@ xfs_dir2_block_replace(
823 int error; /* error return value */ 827 int error; /* error return value */
824 xfs_mount_t *mp; /* filesystem mount point */ 828 xfs_mount_t *mp; /* filesystem mount point */
825 829
826 xfs_dir2_trace_args("block_replace", args); 830 trace_xfs_dir2_block_replace(args);
831
827 /* 832 /*
828 * Lookup the entry in the directory. Get buffer and entry index. 833 * Lookup the entry in the directory. Get buffer and entry index.
829 * This will always succeed since the caller has already done a lookup. 834 * This will always succeed since the caller has already done a lookup.
@@ -897,7 +902,8 @@ xfs_dir2_leaf_to_block(
897 int to; /* block/leaf to index */ 902 int to; /* block/leaf to index */
898 xfs_trans_t *tp; /* transaction pointer */ 903 xfs_trans_t *tp; /* transaction pointer */
899 904
900 xfs_dir2_trace_args_bb("leaf_to_block", args, lbp, dbp); 905 trace_xfs_dir2_leaf_to_block(args);
906
901 dp = args->dp; 907 dp = args->dp;
902 tp = args->trans; 908 tp = args->trans;
903 mp = dp->i_mount; 909 mp = dp->i_mount;
@@ -1044,7 +1050,8 @@ xfs_dir2_sf_to_block(
1044 xfs_trans_t *tp; /* transaction pointer */ 1050 xfs_trans_t *tp; /* transaction pointer */
1045 struct xfs_name name; 1051 struct xfs_name name;
1046 1052
1047 xfs_dir2_trace_args("sf_to_block", args); 1053 trace_xfs_dir2_sf_to_block(args);
1054
1048 dp = args->dp; 1055 dp = args->dp;
1049 tp = args->trans; 1056 tp = args->trans;
1050 mp = dp->i_mount; 1057 mp = dp->i_mount;
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 41ad537c49e9..e2d89854ec9e 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -38,8 +38,8 @@
38#include "xfs_dir2_leaf.h" 38#include "xfs_dir2_leaf.h"
39#include "xfs_dir2_block.h" 39#include "xfs_dir2_block.h"
40#include "xfs_dir2_node.h" 40#include "xfs_dir2_node.h"
41#include "xfs_dir2_trace.h"
42#include "xfs_error.h" 41#include "xfs_error.h"
42#include "xfs_trace.h"
43 43
44/* 44/*
45 * Local function declarations. 45 * Local function declarations.
@@ -80,7 +80,8 @@ xfs_dir2_block_to_leaf(
80 int needscan; /* need to rescan bestfree */ 80 int needscan; /* need to rescan bestfree */
81 xfs_trans_t *tp; /* transaction pointer */ 81 xfs_trans_t *tp; /* transaction pointer */
82 82
83 xfs_dir2_trace_args_b("block_to_leaf", args, dbp); 83 trace_xfs_dir2_block_to_leaf(args);
84
84 dp = args->dp; 85 dp = args->dp;
85 mp = dp->i_mount; 86 mp = dp->i_mount;
86 tp = args->trans; 87 tp = args->trans;
@@ -188,7 +189,8 @@ xfs_dir2_leaf_addname(
188 xfs_trans_t *tp; /* transaction pointer */ 189 xfs_trans_t *tp; /* transaction pointer */
189 xfs_dir2_db_t use_block; /* data block number */ 190 xfs_dir2_db_t use_block; /* data block number */
190 191
191 xfs_dir2_trace_args("leaf_addname", args); 192 trace_xfs_dir2_leaf_addname(args);
193
192 dp = args->dp; 194 dp = args->dp;
193 tp = args->trans; 195 tp = args->trans;
194 mp = dp->i_mount; 196 mp = dp->i_mount;
@@ -1079,7 +1081,7 @@ xfs_dir2_leaf_getdents(
1079 dep = (xfs_dir2_data_entry_t *)ptr; 1081 dep = (xfs_dir2_data_entry_t *)ptr;
1080 length = xfs_dir2_data_entsize(dep->namelen); 1082 length = xfs_dir2_data_entsize(dep->namelen);
1081 1083
1082 if (filldir(dirent, dep->name, dep->namelen, 1084 if (filldir(dirent, (char *)dep->name, dep->namelen,
1083 xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, 1085 xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
1084 be64_to_cpu(dep->inumber), DT_UNKNOWN)) 1086 be64_to_cpu(dep->inumber), DT_UNKNOWN))
1085 break; 1087 break;
@@ -1266,7 +1268,8 @@ xfs_dir2_leaf_lookup(
1266 xfs_dir2_leaf_entry_t *lep; /* leaf entry */ 1268 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1267 xfs_trans_t *tp; /* transaction pointer */ 1269 xfs_trans_t *tp; /* transaction pointer */
1268 1270
1269 xfs_dir2_trace_args("leaf_lookup", args); 1271 trace_xfs_dir2_leaf_lookup(args);
1272
1270 /* 1273 /*
1271 * Look up name in the leaf block, returning both buffers and index. 1274 * Look up name in the leaf block, returning both buffers and index.
1272 */ 1275 */
@@ -1454,7 +1457,8 @@ xfs_dir2_leaf_removename(
1454 xfs_dir2_data_off_t oldbest; /* old value of best free */ 1457 xfs_dir2_data_off_t oldbest; /* old value of best free */
1455 xfs_trans_t *tp; /* transaction pointer */ 1458 xfs_trans_t *tp; /* transaction pointer */
1456 1459
1457 xfs_dir2_trace_args("leaf_removename", args); 1460 trace_xfs_dir2_leaf_removename(args);
1461
1458 /* 1462 /*
1459 * Lookup the leaf entry, get the leaf and data blocks read in. 1463 * Lookup the leaf entry, get the leaf and data blocks read in.
1460 */ 1464 */
@@ -1586,7 +1590,8 @@ xfs_dir2_leaf_replace(
1586 xfs_dir2_leaf_entry_t *lep; /* leaf entry */ 1590 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1587 xfs_trans_t *tp; /* transaction pointer */ 1591 xfs_trans_t *tp; /* transaction pointer */
1588 1592
1589 xfs_dir2_trace_args("leaf_replace", args); 1593 trace_xfs_dir2_leaf_replace(args);
1594
1590 /* 1595 /*
1591 * Look up the entry. 1596 * Look up the entry.
1592 */ 1597 */
@@ -1766,7 +1771,9 @@ xfs_dir2_node_to_leaf(
1766 if (state->path.active > 1) 1771 if (state->path.active > 1)
1767 return 0; 1772 return 0;
1768 args = state->args; 1773 args = state->args;
1769 xfs_dir2_trace_args("node_to_leaf", args); 1774
1775 trace_xfs_dir2_node_to_leaf(args);
1776
1770 mp = state->mp; 1777 mp = state->mp;
1771 dp = args->dp; 1778 dp = args->dp;
1772 tp = args->trans; 1779 tp = args->trans;
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 5a81ccd1045b..78fc4d9ae756 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -37,8 +37,8 @@
37#include "xfs_dir2_leaf.h" 37#include "xfs_dir2_leaf.h"
38#include "xfs_dir2_block.h" 38#include "xfs_dir2_block.h"
39#include "xfs_dir2_node.h" 39#include "xfs_dir2_node.h"
40#include "xfs_dir2_trace.h"
41#include "xfs_error.h" 40#include "xfs_error.h"
41#include "xfs_trace.h"
42 42
43/* 43/*
44 * Function declarations. 44 * Function declarations.
@@ -65,7 +65,7 @@ static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
65/* 65/*
66 * Log entries from a freespace block. 66 * Log entries from a freespace block.
67 */ 67 */
68void 68STATIC void
69xfs_dir2_free_log_bests( 69xfs_dir2_free_log_bests(
70 xfs_trans_t *tp, /* transaction pointer */ 70 xfs_trans_t *tp, /* transaction pointer */
71 xfs_dabuf_t *bp, /* freespace buffer */ 71 xfs_dabuf_t *bp, /* freespace buffer */
@@ -123,7 +123,8 @@ xfs_dir2_leaf_to_node(
123 __be16 *to; /* pointer to freespace entry */ 123 __be16 *to; /* pointer to freespace entry */
124 xfs_trans_t *tp; /* transaction pointer */ 124 xfs_trans_t *tp; /* transaction pointer */
125 125
126 xfs_dir2_trace_args_b("leaf_to_node", args, lbp); 126 trace_xfs_dir2_leaf_to_node(args);
127
127 dp = args->dp; 128 dp = args->dp;
128 mp = dp->i_mount; 129 mp = dp->i_mount;
129 tp = args->trans; 130 tp = args->trans;
@@ -196,7 +197,8 @@ xfs_dir2_leafn_add(
196 xfs_mount_t *mp; /* filesystem mount point */ 197 xfs_mount_t *mp; /* filesystem mount point */
197 xfs_trans_t *tp; /* transaction pointer */ 198 xfs_trans_t *tp; /* transaction pointer */
198 199
199 xfs_dir2_trace_args_sb("leafn_add", args, index, bp); 200 trace_xfs_dir2_leafn_add(args, index);
201
200 dp = args->dp; 202 dp = args->dp;
201 mp = dp->i_mount; 203 mp = dp->i_mount;
202 tp = args->trans; 204 tp = args->trans;
@@ -711,8 +713,8 @@ xfs_dir2_leafn_moveents(
711 int stale; /* count stale leaves copied */ 713 int stale; /* count stale leaves copied */
712 xfs_trans_t *tp; /* transaction pointer */ 714 xfs_trans_t *tp; /* transaction pointer */
713 715
714 xfs_dir2_trace_args_bibii("leafn_moveents", args, bp_s, start_s, bp_d, 716 trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count);
715 start_d, count); 717
716 /* 718 /*
717 * Silently return if nothing to do. 719 * Silently return if nothing to do.
718 */ 720 */
@@ -933,7 +935,8 @@ xfs_dir2_leafn_remove(
933 int needscan; /* need to rescan data frees */ 935 int needscan; /* need to rescan data frees */
934 xfs_trans_t *tp; /* transaction pointer */ 936 xfs_trans_t *tp; /* transaction pointer */
935 937
936 xfs_dir2_trace_args_sb("leafn_remove", args, index, bp); 938 trace_xfs_dir2_leafn_remove(args, index);
939
937 dp = args->dp; 940 dp = args->dp;
938 tp = args->trans; 941 tp = args->trans;
939 mp = dp->i_mount; 942 mp = dp->i_mount;
@@ -1363,7 +1366,8 @@ xfs_dir2_node_addname(
1363 int rval; /* sub-return value */ 1366 int rval; /* sub-return value */
1364 xfs_da_state_t *state; /* btree cursor */ 1367 xfs_da_state_t *state; /* btree cursor */
1365 1368
1366 xfs_dir2_trace_args("node_addname", args); 1369 trace_xfs_dir2_node_addname(args);
1370
1367 /* 1371 /*
1368 * Allocate and initialize the state (btree cursor). 1372 * Allocate and initialize the state (btree cursor).
1369 */ 1373 */
@@ -1822,7 +1826,8 @@ xfs_dir2_node_lookup(
1822 int rval; /* operation return value */ 1826 int rval; /* operation return value */
1823 xfs_da_state_t *state; /* btree cursor */ 1827 xfs_da_state_t *state; /* btree cursor */
1824 1828
1825 xfs_dir2_trace_args("node_lookup", args); 1829 trace_xfs_dir2_node_lookup(args);
1830
1826 /* 1831 /*
1827 * Allocate and initialize the btree cursor. 1832 * Allocate and initialize the btree cursor.
1828 */ 1833 */
@@ -1875,7 +1880,8 @@ xfs_dir2_node_removename(
1875 int rval; /* operation return value */ 1880 int rval; /* operation return value */
1876 xfs_da_state_t *state; /* btree cursor */ 1881 xfs_da_state_t *state; /* btree cursor */
1877 1882
1878 xfs_dir2_trace_args("node_removename", args); 1883 trace_xfs_dir2_node_removename(args);
1884
1879 /* 1885 /*
1880 * Allocate and initialize the btree cursor. 1886 * Allocate and initialize the btree cursor.
1881 */ 1887 */
@@ -1944,7 +1950,8 @@ xfs_dir2_node_replace(
1944 int rval; /* internal return value */ 1950 int rval; /* internal return value */
1945 xfs_da_state_t *state; /* btree cursor */ 1951 xfs_da_state_t *state; /* btree cursor */
1946 1952
1947 xfs_dir2_trace_args("node_replace", args); 1953 trace_xfs_dir2_node_replace(args);
1954
1948 /* 1955 /*
1949 * Allocate and initialize the btree cursor. 1956 * Allocate and initialize the btree cursor.
1950 */ 1957 */
diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h
index dde72db3d695..82dfe7147195 100644
--- a/fs/xfs/xfs_dir2_node.h
+++ b/fs/xfs/xfs_dir2_node.h
@@ -75,8 +75,6 @@ xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
75 return ((db) % XFS_DIR2_MAX_FREE_BESTS(mp)); 75 return ((db) % XFS_DIR2_MAX_FREE_BESTS(mp));
76} 76}
77 77
78extern void xfs_dir2_free_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp,
79 int first, int last);
80extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, 78extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
81 struct xfs_dabuf *lbp); 79 struct xfs_dabuf *lbp);
82extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count); 80extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count);
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index e89734e84646..c1a5945d463a 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -37,7 +37,7 @@
37#include "xfs_dir2_data.h" 37#include "xfs_dir2_data.h"
38#include "xfs_dir2_leaf.h" 38#include "xfs_dir2_leaf.h"
39#include "xfs_dir2_block.h" 39#include "xfs_dir2_block.h"
40#include "xfs_dir2_trace.h" 40#include "xfs_trace.h"
41 41
42/* 42/*
43 * Prototypes for internal functions. 43 * Prototypes for internal functions.
@@ -169,7 +169,8 @@ xfs_dir2_block_to_sf(
169 xfs_dir2_sf_t *sfp; /* shortform structure */ 169 xfs_dir2_sf_t *sfp; /* shortform structure */
170 xfs_ino_t temp; 170 xfs_ino_t temp;
171 171
172 xfs_dir2_trace_args_sb("block_to_sf", args, size, bp); 172 trace_xfs_dir2_block_to_sf(args);
173
173 dp = args->dp; 174 dp = args->dp;
174 mp = dp->i_mount; 175 mp = dp->i_mount;
175 176
@@ -281,7 +282,8 @@ xfs_dir2_sf_addname(
281 xfs_dir2_sf_t *sfp; /* shortform structure */ 282 xfs_dir2_sf_t *sfp; /* shortform structure */
282 xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */ 283 xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */
283 284
284 xfs_dir2_trace_args("sf_addname", args); 285 trace_xfs_dir2_sf_addname(args);
286
285 ASSERT(xfs_dir2_sf_lookup(args) == ENOENT); 287 ASSERT(xfs_dir2_sf_lookup(args) == ENOENT);
286 dp = args->dp; 288 dp = args->dp;
287 ASSERT(dp->i_df.if_flags & XFS_IFINLINE); 289 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
@@ -654,7 +656,8 @@ xfs_dir2_sf_create(
654 xfs_dir2_sf_t *sfp; /* shortform structure */ 656 xfs_dir2_sf_t *sfp; /* shortform structure */
655 int size; /* directory size */ 657 int size; /* directory size */
656 658
657 xfs_dir2_trace_args_i("sf_create", args, pino); 659 trace_xfs_dir2_sf_create(args);
660
658 dp = args->dp; 661 dp = args->dp;
659 662
660 ASSERT(dp != NULL); 663 ASSERT(dp != NULL);
@@ -779,7 +782,7 @@ xfs_dir2_sf_getdents(
779 } 782 }
780 783
781 ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); 784 ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep));
782 if (filldir(dirent, sfep->name, sfep->namelen, 785 if (filldir(dirent, (char *)sfep->name, sfep->namelen,
783 off & 0x7fffffff, ino, DT_UNKNOWN)) { 786 off & 0x7fffffff, ino, DT_UNKNOWN)) {
784 *offset = off & 0x7fffffff; 787 *offset = off & 0x7fffffff;
785 return 0; 788 return 0;
@@ -808,7 +811,8 @@ xfs_dir2_sf_lookup(
808 enum xfs_dacmp cmp; /* comparison result */ 811 enum xfs_dacmp cmp; /* comparison result */
809 xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */ 812 xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */
810 813
811 xfs_dir2_trace_args("sf_lookup", args); 814 trace_xfs_dir2_sf_lookup(args);
815
812 xfs_dir2_sf_check(args); 816 xfs_dir2_sf_check(args);
813 dp = args->dp; 817 dp = args->dp;
814 818
@@ -891,7 +895,8 @@ xfs_dir2_sf_removename(
891 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ 895 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
892 xfs_dir2_sf_t *sfp; /* shortform structure */ 896 xfs_dir2_sf_t *sfp; /* shortform structure */
893 897
894 xfs_dir2_trace_args("sf_removename", args); 898 trace_xfs_dir2_sf_removename(args);
899
895 dp = args->dp; 900 dp = args->dp;
896 901
897 ASSERT(dp->i_df.if_flags & XFS_IFINLINE); 902 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
@@ -982,7 +987,8 @@ xfs_dir2_sf_replace(
982 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ 987 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
983 xfs_dir2_sf_t *sfp; /* shortform structure */ 988 xfs_dir2_sf_t *sfp; /* shortform structure */
984 989
985 xfs_dir2_trace_args("sf_replace", args); 990 trace_xfs_dir2_sf_replace(args);
991
986 dp = args->dp; 992 dp = args->dp;
987 993
988 ASSERT(dp->i_df.if_flags & XFS_IFINLINE); 994 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
@@ -1125,7 +1131,8 @@ xfs_dir2_sf_toino4(
1125 xfs_dir2_sf_entry_t *sfep; /* new sf entry */ 1131 xfs_dir2_sf_entry_t *sfep; /* new sf entry */
1126 xfs_dir2_sf_t *sfp; /* new sf directory */ 1132 xfs_dir2_sf_t *sfp; /* new sf directory */
1127 1133
1128 xfs_dir2_trace_args("sf_toino4", args); 1134 trace_xfs_dir2_sf_toino4(args);
1135
1129 dp = args->dp; 1136 dp = args->dp;
1130 1137
1131 /* 1138 /*
@@ -1202,7 +1209,8 @@ xfs_dir2_sf_toino8(
1202 xfs_dir2_sf_entry_t *sfep; /* new sf entry */ 1209 xfs_dir2_sf_entry_t *sfep; /* new sf entry */
1203 xfs_dir2_sf_t *sfp; /* new sf directory */ 1210 xfs_dir2_sf_t *sfp; /* new sf directory */
1204 1211
1205 xfs_dir2_trace_args("sf_toino8", args); 1212 trace_xfs_dir2_sf_toino8(args);
1213
1206 dp = args->dp; 1214 dp = args->dp;
1207 1215
1208 /* 1216 /*
diff --git a/fs/xfs/xfs_dir2_trace.c b/fs/xfs/xfs_dir2_trace.c
deleted file mode 100644
index 6cc7c0c681ac..000000000000
--- a/fs/xfs/xfs_dir2_trace.c
+++ /dev/null
@@ -1,216 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_inum.h"
22#include "xfs_dir2.h"
23#include "xfs_da_btree.h"
24#include "xfs_bmap_btree.h"
25#include "xfs_dir2_sf.h"
26#include "xfs_attr_sf.h"
27#include "xfs_dinode.h"
28#include "xfs_inode.h"
29#include "xfs_dir2_trace.h"
30
31#ifdef XFS_DIR2_TRACE
32ktrace_t *xfs_dir2_trace_buf;
33
34/*
35 * Enter something in the trace buffers.
36 */
37static void
38xfs_dir2_trace_enter(
39 xfs_inode_t *dp,
40 int type,
41 char *where,
42 char *name,
43 int namelen,
44 void *a0,
45 void *a1,
46 void *a2,
47 void *a3,
48 void *a4,
49 void *a5,
50 void *a6,
51 void *a7)
52{
53 void *n[5];
54
55 ASSERT(xfs_dir2_trace_buf);
56 ASSERT(dp->i_dir_trace);
57 if (name)
58 memcpy(n, name, min((int)sizeof(n), namelen));
59 else
60 memset((char *)n, 0, sizeof(n));
61 ktrace_enter(xfs_dir2_trace_buf,
62 (void *)(long)type, (void *)where,
63 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
64 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
65 (void *)(long)namelen,
66 (void *)n[0], (void *)n[1], (void *)n[2],
67 (void *)n[3], (void *)n[4]);
68 ktrace_enter(dp->i_dir_trace,
69 (void *)(long)type, (void *)where,
70 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
71 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
72 (void *)(long)namelen,
73 (void *)n[0], (void *)n[1], (void *)n[2],
74 (void *)n[3], (void *)n[4]);
75}
76
77void
78xfs_dir2_trace_args(
79 char *where,
80 xfs_da_args_t *args)
81{
82 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS, where,
83 (char *)args->name, (int)args->namelen,
84 (void *)(unsigned long)args->hashval,
85 (void *)((unsigned long)(args->inumber >> 32)),
86 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
87 (void *)args->dp, (void *)args->trans,
88 (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
89 NULL, NULL);
90}
91
92void
93xfs_dir2_trace_args_b(
94 char *where,
95 xfs_da_args_t *args,
96 xfs_dabuf_t *bp)
97{
98 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_B, where,
99 (char *)args->name, (int)args->namelen,
100 (void *)(unsigned long)args->hashval,
101 (void *)((unsigned long)(args->inumber >> 32)),
102 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
103 (void *)args->dp, (void *)args->trans,
104 (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
105 (void *)(bp ? bp->bps[0] : NULL), NULL);
106}
107
108void
109xfs_dir2_trace_args_bb(
110 char *where,
111 xfs_da_args_t *args,
112 xfs_dabuf_t *lbp,
113 xfs_dabuf_t *dbp)
114{
115 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BB, where,
116 (char *)args->name, (int)args->namelen,
117 (void *)(unsigned long)args->hashval,
118 (void *)((unsigned long)(args->inumber >> 32)),
119 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
120 (void *)args->dp, (void *)args->trans,
121 (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
122 (void *)(lbp ? lbp->bps[0] : NULL),
123 (void *)(dbp ? dbp->bps[0] : NULL));
124}
125
126void
127xfs_dir2_trace_args_bibii(
128 char *where,
129 xfs_da_args_t *args,
130 xfs_dabuf_t *bs,
131 int ss,
132 xfs_dabuf_t *bd,
133 int sd,
134 int c)
135{
136 xfs_buf_t *bpbs = bs ? bs->bps[0] : NULL;
137 xfs_buf_t *bpbd = bd ? bd->bps[0] : NULL;
138
139 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BIBII, where,
140 (char *)args->name, (int)args->namelen,
141 (void *)args->dp, (void *)args->trans,
142 (void *)bpbs, (void *)(long)ss, (void *)bpbd, (void *)(long)sd,
143 (void *)(long)c, NULL);
144}
145
146void
147xfs_dir2_trace_args_db(
148 char *where,
149 xfs_da_args_t *args,
150 xfs_dir2_db_t db,
151 xfs_dabuf_t *bp)
152{
153 xfs_buf_t *dbp = bp ? bp->bps[0] : NULL;
154
155 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_DB, where,
156 (char *)args->name, (int)args->namelen,
157 (void *)(unsigned long)args->hashval,
158 (void *)((unsigned long)(args->inumber >> 32)),
159 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
160 (void *)args->dp, (void *)args->trans,
161 (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
162 (void *)(long)db, (void *)dbp);
163}
164
165void
166xfs_dir2_trace_args_i(
167 char *where,
168 xfs_da_args_t *args,
169 xfs_ino_t i)
170{
171 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_I, where,
172 (char *)args->name, (int)args->namelen,
173 (void *)(unsigned long)args->hashval,
174 (void *)((unsigned long)(args->inumber >> 32)),
175 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
176 (void *)args->dp, (void *)args->trans,
177 (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
178 (void *)((unsigned long)(i >> 32)),
179 (void *)((unsigned long)(i & 0xFFFFFFFF)));
180}
181
182void
183xfs_dir2_trace_args_s(
184 char *where,
185 xfs_da_args_t *args,
186 int s)
187{
188 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_S, where,
189 (char *)args->name, (int)args->namelen,
190 (void *)(unsigned long)args->hashval,
191 (void *)((unsigned long)(args->inumber >> 32)),
192 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
193 (void *)args->dp, (void *)args->trans,
194 (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
195 (void *)(long)s, NULL);
196}
197
198void
199xfs_dir2_trace_args_sb(
200 char *where,
201 xfs_da_args_t *args,
202 int s,
203 xfs_dabuf_t *bp)
204{
205 xfs_buf_t *dbp = bp ? bp->bps[0] : NULL;
206
207 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_SB, where,
208 (char *)args->name, (int)args->namelen,
209 (void *)(unsigned long)args->hashval,
210 (void *)((unsigned long)(args->inumber >> 32)),
211 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
212 (void *)args->dp, (void *)args->trans,
213 (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
214 (void *)(long)s, (void *)dbp);
215}
216#endif /* XFS_DIR2_TRACE */
diff --git a/fs/xfs/xfs_dir2_trace.h b/fs/xfs/xfs_dir2_trace.h
deleted file mode 100644
index ca3c754f4822..000000000000
--- a/fs/xfs/xfs_dir2_trace.h
+++ /dev/null
@@ -1,72 +0,0 @@
1/*
2 * Copyright (c) 2000,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_DIR2_TRACE_H__
19#define __XFS_DIR2_TRACE_H__
20
21/*
22 * Tracing for xfs v2 directories.
23 */
24
25#if defined(XFS_DIR2_TRACE)
26
27struct ktrace;
28struct xfs_dabuf;
29struct xfs_da_args;
30
31#define XFS_DIR2_GTRACE_SIZE 4096 /* global buffer */
32#define XFS_DIR2_KTRACE_SIZE 32 /* per-inode buffer */
33extern struct ktrace *xfs_dir2_trace_buf;
34
35#define XFS_DIR2_KTRACE_ARGS 1 /* args only */
36#define XFS_DIR2_KTRACE_ARGS_B 2 /* args + buffer */
37#define XFS_DIR2_KTRACE_ARGS_BB 3 /* args + 2 buffers */
38#define XFS_DIR2_KTRACE_ARGS_DB 4 /* args, db, buffer */
39#define XFS_DIR2_KTRACE_ARGS_I 5 /* args, inum */
40#define XFS_DIR2_KTRACE_ARGS_S 6 /* args, int */
41#define XFS_DIR2_KTRACE_ARGS_SB 7 /* args, int, buffer */
42#define XFS_DIR2_KTRACE_ARGS_BIBII 8 /* args, buf/int/buf/int/int */
43
44void xfs_dir2_trace_args(char *where, struct xfs_da_args *args);
45void xfs_dir2_trace_args_b(char *where, struct xfs_da_args *args,
46 struct xfs_dabuf *bp);
47void xfs_dir2_trace_args_bb(char *where, struct xfs_da_args *args,
48 struct xfs_dabuf *lbp, struct xfs_dabuf *dbp);
49void xfs_dir2_trace_args_bibii(char *where, struct xfs_da_args *args,
50 struct xfs_dabuf *bs, int ss,
51 struct xfs_dabuf *bd, int sd, int c);
52void xfs_dir2_trace_args_db(char *where, struct xfs_da_args *args,
53 xfs_dir2_db_t db, struct xfs_dabuf *bp);
54void xfs_dir2_trace_args_i(char *where, struct xfs_da_args *args, xfs_ino_t i);
55void xfs_dir2_trace_args_s(char *where, struct xfs_da_args *args, int s);
56void xfs_dir2_trace_args_sb(char *where, struct xfs_da_args *args, int s,
57 struct xfs_dabuf *bp);
58
59#else /* XFS_DIR2_TRACE */
60
61#define xfs_dir2_trace_args(where, args)
62#define xfs_dir2_trace_args_b(where, args, bp)
63#define xfs_dir2_trace_args_bb(where, args, lbp, dbp)
64#define xfs_dir2_trace_args_bibii(where, args, bs, ss, bd, sd, c)
65#define xfs_dir2_trace_args_db(where, args, db, bp)
66#define xfs_dir2_trace_args_i(where, args, i)
67#define xfs_dir2_trace_args_s(where, args, s)
68#define xfs_dir2_trace_args_sb(where, args, s, bp)
69
70#endif /* XFS_DIR2_TRACE */
71
72#endif /* __XFS_DIR2_TRACE_H__ */
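Each of the eight deleted xfs_dir2_trace_args* variants maps onto a per-operation tracepoint (trace_xfs_dir2_block_addname() and friends, seen earlier in this diff). A hedged sketch of the shared event class they plausibly hang off in xfs_trace.h, reusing the XFS_DA_OP_FLAGS print table added to xfs_da_btree.h above; the real declaration may carry extra fields:

	DECLARE_EVENT_CLASS(xfs_da_class,
		TP_PROTO(struct xfs_da_args *args),
		TP_ARGS(args),
		TP_STRUCT__entry(
			__dynamic_array(char, name, args->namelen)
			__field(int, namelen)
			__field(xfs_dahash_t, hashval)
			__field(xfs_ino_t, inumber)
			__field(int, op_flags)
		),
		TP_fast_assign(
			if (args->namelen)
				memcpy(__get_str(name), args->name, args->namelen);
			__entry->namelen = args->namelen;
			__entry->hashval = args->hashval;
			__entry->inumber = args->inumber;
			__entry->op_flags = args->op_flags;
		),
		TP_printk("name %.*s namelen %d hashval 0x%x inumber 0x%llx op_flags %s",
			  __entry->namelen, __get_str(name), __entry->namelen,
			  __entry->hashval, __entry->inumber,
			  __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
	);
	DEFINE_EVENT(xfs_da_class, xfs_dir2_block_addname,
		TP_PROTO(struct xfs_da_args *args),
		TP_ARGS(args));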
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 05a4bdd4be39..6f35ed1b39b9 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -82,7 +82,7 @@ xfs_efi_item_format(xfs_efi_log_item_t *efip,
82 82
83 log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format); 83 log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format);
84 log_vector->i_len = size; 84 log_vector->i_len = size;
85 XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFI_FORMAT); 85 log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
86 ASSERT(size >= sizeof(xfs_efi_log_format_t)); 86 ASSERT(size >= sizeof(xfs_efi_log_format_t));
87} 87}
88 88
@@ -406,7 +406,7 @@ xfs_efd_item_format(xfs_efd_log_item_t *efdp,
406 406
407 log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format); 407 log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format);
408 log_vector->i_len = size; 408 log_vector->i_len = size;
409 XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFD_FORMAT); 409 log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
410 ASSERT(size >= sizeof(xfs_efd_log_format_t)); 410 ASSERT(size >= sizeof(xfs_efd_log_format_t));
411} 411}
412 412
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index edf8bdf4141f..390850ee6603 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -34,6 +34,7 @@
34#include "xfs_utils.h" 34#include "xfs_utils.h"
35#include "xfs_mru_cache.h" 35#include "xfs_mru_cache.h"
36#include "xfs_filestream.h" 36#include "xfs_filestream.h"
37#include "xfs_trace.h"
37 38
38#ifdef XFS_FILESTREAMS_TRACE 39#ifdef XFS_FILESTREAMS_TRACE
39 40
@@ -139,6 +140,7 @@ _xfs_filestream_pick_ag(
139 int flags, 140 int flags,
140 xfs_extlen_t minlen) 141 xfs_extlen_t minlen)
141{ 142{
143 int streams, max_streams;
142 int err, trylock, nscan; 144 int err, trylock, nscan;
143 xfs_extlen_t longest, free, minfree, maxfree = 0; 145 xfs_extlen_t longest, free, minfree, maxfree = 0;
144 xfs_agnumber_t ag, max_ag = NULLAGNUMBER; 146 xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
@@ -154,15 +156,15 @@ _xfs_filestream_pick_ag(
154 trylock = XFS_ALLOC_FLAG_TRYLOCK; 156 trylock = XFS_ALLOC_FLAG_TRYLOCK;
155 157
156 for (nscan = 0; 1; nscan++) { 158 for (nscan = 0; 1; nscan++) {
157 159 pag = xfs_perag_get(mp, ag);
158 TRACE_AG_SCAN(mp, ag, xfs_filestream_peek_ag(mp, ag)); 160 TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms));
159
160 pag = mp->m_perag + ag;
161 161
162 if (!pag->pagf_init) { 162 if (!pag->pagf_init) {
163 err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); 163 err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
164 if (err && !trylock) 164 if (err && !trylock) {
165 xfs_perag_put(pag);
165 return err; 166 return err;
167 }
166 } 168 }
167 169
168 /* Might fail sometimes during the 1st pass with trylock set. */ 170 /* Might fail sometimes during the 1st pass with trylock set. */
@@ -172,6 +174,7 @@ _xfs_filestream_pick_ag(
172 /* Keep track of the AG with the most free blocks. */ 174 /* Keep track of the AG with the most free blocks. */
173 if (pag->pagf_freeblks > maxfree) { 175 if (pag->pagf_freeblks > maxfree) {
174 maxfree = pag->pagf_freeblks; 176 maxfree = pag->pagf_freeblks;
177 max_streams = atomic_read(&pag->pagf_fstrms);
175 max_ag = ag; 178 max_ag = ag;
176 } 179 }
177 180
@@ -194,6 +197,8 @@ _xfs_filestream_pick_ag(
194 197
195 /* Break out, retaining the reference on the AG. */ 198 /* Break out, retaining the reference on the AG. */
196 free = pag->pagf_freeblks; 199 free = pag->pagf_freeblks;
200 streams = atomic_read(&pag->pagf_fstrms);
201 xfs_perag_put(pag);
197 *agp = ag; 202 *agp = ag;
198 break; 203 break;
199 } 204 }
@@ -201,6 +206,7 @@ _xfs_filestream_pick_ag(
201 /* Drop the reference on this AG, it's not usable. */ 206 /* Drop the reference on this AG, it's not usable. */
202 xfs_filestream_put_ag(mp, ag); 207 xfs_filestream_put_ag(mp, ag);
203next_ag: 208next_ag:
209 xfs_perag_put(pag);
204 /* Move to the next AG, wrapping to AG 0 if necessary. */ 210 /* Move to the next AG, wrapping to AG 0 if necessary. */
205 if (++ag >= mp->m_sb.sb_agcount) 211 if (++ag >= mp->m_sb.sb_agcount)
206 ag = 0; 212 ag = 0;
@@ -228,6 +234,7 @@ next_ag:
228 if (max_ag != NULLAGNUMBER) { 234 if (max_ag != NULLAGNUMBER) {
229 xfs_filestream_get_ag(mp, max_ag); 235 xfs_filestream_get_ag(mp, max_ag);
230 TRACE_AG_PICK1(mp, max_ag, maxfree); 236 TRACE_AG_PICK1(mp, max_ag, maxfree);
237 streams = max_streams;
231 free = maxfree; 238 free = maxfree;
232 *agp = max_ag; 239 *agp = max_ag;
233 break; 240 break;
@@ -239,16 +246,14 @@ next_ag:
239 return 0; 246 return 0;
240 } 247 }
241 248
242 TRACE_AG_PICK2(mp, startag, *agp, xfs_filestream_peek_ag(mp, *agp), 249 TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags);
243 free, nscan, flags);
244 250
245 return 0; 251 return 0;
246} 252}
247 253
248/* 254/*
249 * Set the allocation group number for a file or a directory, updating inode 255 * Set the allocation group number for a file or a directory, updating inode
250 * references and per-AG references as appropriate. Must be called with the 256 * references and per-AG references as appropriate.
251 * m_peraglock held in read mode.
252 */ 257 */
253static int 258static int
254_xfs_filestream_update_ag( 259_xfs_filestream_update_ag(
@@ -394,9 +399,7 @@ xfs_filestream_init(void)
394 item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item"); 399 item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item");
395 if (!item_zone) 400 if (!item_zone)
396 return -ENOMEM; 401 return -ENOMEM;
397#ifdef XFS_FILESTREAMS_TRACE 402
398 xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_NOFS);
399#endif
400 return 0; 403 return 0;
401} 404}
402 405
@@ -407,9 +410,6 @@ xfs_filestream_init(void)
407void 410void
408xfs_filestream_uninit(void) 411xfs_filestream_uninit(void)
409{ 412{
410#ifdef XFS_FILESTREAMS_TRACE
411 ktrace_free(xfs_filestreams_trace_buf);
412#endif
413 kmem_zone_destroy(item_zone); 413 kmem_zone_destroy(item_zone);
414} 414}
415 415
@@ -455,20 +455,6 @@ xfs_filestream_unmount(
455} 455}
456 456
457/* 457/*
458 * If the mount point's m_perag array is going to be reallocated, all
459 * outstanding cache entries must be flushed to avoid accessing reference count
460 * addresses that have been freed. The call to xfs_filestream_flush() must be
461 * made inside the block that holds the m_peraglock in write mode to do the
462 * reallocation.
463 */
464void
465xfs_filestream_flush(
466 xfs_mount_t *mp)
467{
468 xfs_mru_cache_flush(mp->m_filestream);
469}
470
471/*
472 * Return the AG of the filestream the file or directory belongs to, or 458 * Return the AG of the filestream the file or directory belongs to, or
473 * NULLAGNUMBER otherwise. 459 * NULLAGNUMBER otherwise.
474 */ 460 */
@@ -530,7 +516,6 @@ xfs_filestream_associate(
530 516
531 mp = pip->i_mount; 517 mp = pip->i_mount;
532 cache = mp->m_filestream; 518 cache = mp->m_filestream;
533 down_read(&mp->m_peraglock);
534 519
535 /* 520 /*
536 * We have a problem, Houston. 521 * We have a problem, Houston.
@@ -547,10 +532,8 @@ xfs_filestream_associate(
547 * 532 *
548 * So, if we can't get the iolock without sleeping then just give up 533 * So, if we can't get the iolock without sleeping then just give up
549 */ 534 */
550 if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) { 535 if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL))
551 up_read(&mp->m_peraglock);
552 return 1; 536 return 1;
553 }
554 537
555 /* If the parent directory is already in the cache, use its AG. */ 538 /* If the parent directory is already in the cache, use its AG. */
556 item = xfs_mru_cache_lookup(cache, pip->i_ino); 539 item = xfs_mru_cache_lookup(cache, pip->i_ino);
@@ -605,7 +588,6 @@ exit_did_pick:
605 588
606exit: 589exit:
607 xfs_iunlock(pip, XFS_IOLOCK_EXCL); 590 xfs_iunlock(pip, XFS_IOLOCK_EXCL);
608 up_read(&mp->m_peraglock);
609 return -err; 591 return -err;
610} 592}
611 593
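
The _xfs_filestream_pick_ag() hunks above keep the function's two-pass shape while adding the perag references: a first sweep over the AGs with trylock semantics to avoid blocking, then a blocking second sweep if nothing suitable was found. A stripped-down sketch of that control flow; all names here are hypothetical stand-ins, and the real function also weighs free space and stream counts:

    #include <stdbool.h>
    #include <stdio.h>

    #define NR_AG 8

    /* Stand-in predicate: with trylock set, contended AGs look unusable. */
    static bool ag_usable(int ag, bool trylock)
    {
        return !trylock && (ag % 3 == 0);
    }

    static int pick_ag(int startag)
    {
        bool trylock = true;   /* first pass must not block */
        int ag = startag;

        for (;;) {
            if (ag_usable(ag, trylock))
                return ag;
            if (++ag >= NR_AG)
                ag = 0;                 /* wrap to AG 0, as in the diff */
            if (ag == startag) {
                if (!trylock)
                    return -1;          /* both passes failed */
                trylock = false;        /* retry, allowed to block */
            }
        }
    }

    int main(void)
    {
        printf("picked AG %d\n", pick_ag(5));
        return 0;
    }
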
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index f655f7dc334c..260f757bbc5d 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -79,28 +79,49 @@ extern ktrace_t *xfs_filestreams_trace_buf;
79 * the cache that reference per-ag array elements that have since been 79 * the cache that reference per-ag array elements that have since been
80 * reallocated. 80 * reallocated.
81 */ 81 */
82STATIC_INLINE int 82/*
83 * xfs_filestream_peek_ag is only used in tracing code
84 */
85static inline int
83xfs_filestream_peek_ag( 86xfs_filestream_peek_ag(
84 xfs_mount_t *mp, 87 xfs_mount_t *mp,
85 xfs_agnumber_t agno) 88 xfs_agnumber_t agno)
86{ 89{
87 return atomic_read(&mp->m_perag[agno].pagf_fstrms); 90 struct xfs_perag *pag;
91 int ret;
92
93 pag = xfs_perag_get(mp, agno);
94 ret = atomic_read(&pag->pagf_fstrms);
95 xfs_perag_put(pag);
96 return ret;
88} 97}
89 98
90STATIC_INLINE int 99static inline int
91xfs_filestream_get_ag( 100xfs_filestream_get_ag(
92 xfs_mount_t *mp, 101 xfs_mount_t *mp,
93 xfs_agnumber_t agno) 102 xfs_agnumber_t agno)
94{ 103{
95 return atomic_inc_return(&mp->m_perag[agno].pagf_fstrms); 104 struct xfs_perag *pag;
105 int ret;
106
107 pag = xfs_perag_get(mp, agno);
108 ret = atomic_inc_return(&pag->pagf_fstrms);
109 xfs_perag_put(pag);
110 return ret;
96} 111}
97 112
98STATIC_INLINE int 113static inline int
99xfs_filestream_put_ag( 114xfs_filestream_put_ag(
100 xfs_mount_t *mp, 115 xfs_mount_t *mp,
101 xfs_agnumber_t agno) 116 xfs_agnumber_t agno)
102{ 117{
103 return atomic_dec_return(&mp->m_perag[agno].pagf_fstrms); 118 struct xfs_perag *pag;
119 int ret;
120
121 pag = xfs_perag_get(mp, agno);
122 ret = atomic_dec_return(&pag->pagf_fstrms);
123 xfs_perag_put(pag);
124 return ret;
104} 125}
105 126
106/* allocation selection flags */ 127/* allocation selection flags */
@@ -114,7 +135,6 @@ int xfs_filestream_init(void);
114void xfs_filestream_uninit(void); 135void xfs_filestream_uninit(void);
115int xfs_filestream_mount(struct xfs_mount *mp); 136int xfs_filestream_mount(struct xfs_mount *mp);
116void xfs_filestream_unmount(struct xfs_mount *mp); 137void xfs_filestream_unmount(struct xfs_mount *mp);
117void xfs_filestream_flush(struct xfs_mount *mp);
118xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); 138xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
119int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip); 139int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip);
120void xfs_filestream_deassociate(struct xfs_inode *ip); 140void xfs_filestream_deassociate(struct xfs_inode *ip);
@@ -122,7 +142,7 @@ int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp);
122 142
123 143
124/* filestreams for the inode? */ 144/* filestreams for the inode? */
125STATIC_INLINE int 145static inline int
126xfs_inode_is_filestream( 146xfs_inode_is_filestream(
127 struct xfs_inode *ip) 147 struct xfs_inode *ip)
128{ 148{
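
The inline helpers above capture the conversion this patch makes throughout XFS: instead of taking the global m_peraglock and indexing m_perag[] directly, callers look up a reference-counted per-AG structure and drop it when done. A user-space sketch of the get/put pattern; the bodies are illustrative only, since the kernel implementation looks the structure up in a radix tree:

    #include <stdatomic.h>
    #include <assert.h>

    struct perag {
        atomic_int ref;           /* active references */
        atomic_int pagf_fstrms;   /* filestream count, as in the diff */
    };

    struct mount {
        struct perag *perags;     /* indexed by AG number */
        int agcount;
    };

    static struct perag *perag_get(struct mount *mp, int agno)
    {
        assert(agno >= 0 && agno < mp->agcount);
        atomic_fetch_add(&mp->perags[agno].ref, 1);
        return &mp->perags[agno];
    }

    static void perag_put(struct perag *pag)
    {
        int old = atomic_fetch_sub(&pag->ref, 1);
        assert(old > 0);   /* a put without a matching get is a bug */
    }

    /* Usage mirroring xfs_filestream_peek_ag() after the conversion. */
    static int peek_ag_streams(struct mount *mp, int agno)
    {
        struct perag *pag = perag_get(mp, agno);
        int ret = atomic_load(&pag->pagf_fstrms);

        perag_put(pag);
        return ret;
    }

    int main(void)
    {
        struct perag ags[4] = { 0 };
        struct mount mp = { .perags = ags, .agcount = 4 };

        atomic_store(&ags[2].pagf_fstrms, 3);
        return peek_ag_streams(&mp, 2) == 3 ? 0 : 1;
    }
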
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index f52ac276277e..7cf7220e7d5f 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -292,7 +292,8 @@ typedef struct xfs_bstat {
292 __s32 bs_extents; /* number of extents */ 292 __s32 bs_extents; /* number of extents */
293 __u32 bs_gen; /* generation count */ 293 __u32 bs_gen; /* generation count */
294 __u16 bs_projid; /* project id */ 294 __u16 bs_projid; /* project id */
295 unsigned char bs_pad[14]; /* pad space, unused */ 295 __u16 bs_forkoff; /* inode fork offset in bytes */
296 unsigned char bs_pad[12]; /* pad space, unused */
296 __u32 bs_dmevmask; /* DMIG event mask */ 297 __u32 bs_dmevmask; /* DMIG event mask */
297 __u16 bs_dmstate; /* DMIG state info */ 298 __u16 bs_dmstate; /* DMIG state info */
298 __u16 bs_aextents; /* attribute number of extents */ 299 __u16 bs_aextents; /* attribute number of extents */
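
The xfs_bstat change above carves the new bs_forkoff field out of the existing padding (bs_pad shrinks from 14 to 12 bytes), so field offsets and the structure size are unchanged for existing user-space binaries. A reduced sketch with compile-time checks; the structs here are a cut-down illustration, not the real xfs_bstat:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    struct bstat_old {
        uint16_t      bs_projid;
        unsigned char bs_pad[14];   /* pad space, unused */
        uint32_t      bs_dmevmask;
    };

    struct bstat_new {
        uint16_t      bs_projid;
        uint16_t      bs_forkoff;   /* carved out of the old pad */
        unsigned char bs_pad[12];
        uint32_t      bs_dmevmask;
    };

    /* The ABI contract: same size, and later fields do not move. */
    static_assert(sizeof(struct bstat_old) == sizeof(struct bstat_new),
                  "size must not change");
    static_assert(offsetof(struct bstat_old, bs_dmevmask) ==
                  offsetof(struct bstat_new, bs_dmevmask),
                  "bs_dmevmask must not move");
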
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 2d0b3e1da9e6..37a6f62c57b6 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -45,6 +45,7 @@
45#include "xfs_rtalloc.h" 45#include "xfs_rtalloc.h"
46#include "xfs_rw.h" 46#include "xfs_rw.h"
47#include "xfs_filestream.h" 47#include "xfs_filestream.h"
48#include "xfs_trace.h"
48 49
49/* 50/*
50 * File system operations 51 * File system operations
@@ -166,27 +167,14 @@ xfs_growfs_data_private(
166 } 167 }
167 new = nb - mp->m_sb.sb_dblocks; 168 new = nb - mp->m_sb.sb_dblocks;
168 oagcount = mp->m_sb.sb_agcount; 169 oagcount = mp->m_sb.sb_agcount;
169 if (nagcount > oagcount) {
170 void *new_perag, *old_perag;
171
172 xfs_filestream_flush(mp);
173
174 new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount,
175 KM_MAYFAIL);
176 if (!new_perag)
177 return XFS_ERROR(ENOMEM);
178
179 down_write(&mp->m_peraglock);
180 memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount);
181 old_perag = mp->m_perag;
182 mp->m_perag = new_perag;
183
184 mp->m_flags |= XFS_MOUNT_32BITINODES;
185 nagimax = xfs_initialize_perag(mp, nagcount);
186 up_write(&mp->m_peraglock);
187 170
188 kmem_free(old_perag); 171 /* allocate the new per-ag structures */
172 if (nagcount > oagcount) {
173 error = xfs_initialize_perag(mp, nagcount, &nagimax);
174 if (error)
175 return error;
189 } 176 }
177
190 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); 178 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
191 tp->t_flags |= XFS_TRANS_RESERVE; 179 tp->t_flags |= XFS_TRANS_RESERVE;
192 if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp), 180 if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp),
@@ -195,14 +183,19 @@ xfs_growfs_data_private(
195 return error; 183 return error;
196 } 184 }
197 185
186 /*
187 * Write new AG headers to disk. Non-transactional, but written
188 * synchronously so they are completed prior to the growfs transaction
189 * being logged.
190 */
198 nfree = 0; 191 nfree = 0;
199 for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { 192 for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
200 /* 193 /*
201 * AG freelist header block 194 * AG freelist header block
202 */ 195 */
203 bp = xfs_buf_get(mp->m_ddev_targp, 196 bp = xfs_buf_get(mp->m_ddev_targp,
204 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), 197 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
205 XFS_FSS_TO_BB(mp, 1), 0); 198 XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
206 agf = XFS_BUF_TO_AGF(bp); 199 agf = XFS_BUF_TO_AGF(bp);
207 memset(agf, 0, mp->m_sb.sb_sectsize); 200 memset(agf, 0, mp->m_sb.sb_sectsize);
208 agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); 201 agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
@@ -233,8 +226,8 @@ xfs_growfs_data_private(
233 * AG inode header block 226 * AG inode header block
234 */ 227 */
235 bp = xfs_buf_get(mp->m_ddev_targp, 228 bp = xfs_buf_get(mp->m_ddev_targp,
236 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), 229 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
237 XFS_FSS_TO_BB(mp, 1), 0); 230 XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
238 agi = XFS_BUF_TO_AGI(bp); 231 agi = XFS_BUF_TO_AGI(bp);
239 memset(agi, 0, mp->m_sb.sb_sectsize); 232 memset(agi, 0, mp->m_sb.sb_sectsize);
240 agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); 233 agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
@@ -257,8 +250,9 @@ xfs_growfs_data_private(
257 * BNO btree root block 250 * BNO btree root block
258 */ 251 */
259 bp = xfs_buf_get(mp->m_ddev_targp, 252 bp = xfs_buf_get(mp->m_ddev_targp,
260 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), 253 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
261 BTOBB(mp->m_sb.sb_blocksize), 0); 254 BTOBB(mp->m_sb.sb_blocksize),
255 XBF_LOCK | XBF_MAPPED);
262 block = XFS_BUF_TO_BLOCK(bp); 256 block = XFS_BUF_TO_BLOCK(bp);
263 memset(block, 0, mp->m_sb.sb_blocksize); 257 memset(block, 0, mp->m_sb.sb_blocksize);
264 block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); 258 block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
@@ -278,8 +272,9 @@ xfs_growfs_data_private(
278 * CNT btree root block 272 * CNT btree root block
279 */ 273 */
280 bp = xfs_buf_get(mp->m_ddev_targp, 274 bp = xfs_buf_get(mp->m_ddev_targp,
281 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), 275 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
282 BTOBB(mp->m_sb.sb_blocksize), 0); 276 BTOBB(mp->m_sb.sb_blocksize),
277 XBF_LOCK | XBF_MAPPED);
283 block = XFS_BUF_TO_BLOCK(bp); 278 block = XFS_BUF_TO_BLOCK(bp);
284 memset(block, 0, mp->m_sb.sb_blocksize); 279 memset(block, 0, mp->m_sb.sb_blocksize);
285 block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); 280 block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
@@ -300,8 +295,9 @@ xfs_growfs_data_private(
300 * INO btree root block 295 * INO btree root block
301 */ 296 */
302 bp = xfs_buf_get(mp->m_ddev_targp, 297 bp = xfs_buf_get(mp->m_ddev_targp,
303 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), 298 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
304 BTOBB(mp->m_sb.sb_blocksize), 0); 299 BTOBB(mp->m_sb.sb_blocksize),
300 XBF_LOCK | XBF_MAPPED);
305 block = XFS_BUF_TO_BLOCK(bp); 301 block = XFS_BUF_TO_BLOCK(bp);
306 memset(block, 0, mp->m_sb.sb_blocksize); 302 memset(block, 0, mp->m_sb.sb_blocksize);
307 block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); 303 block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
@@ -344,6 +340,7 @@ xfs_growfs_data_private(
344 be32_add_cpu(&agf->agf_length, new); 340 be32_add_cpu(&agf->agf_length, new);
345 ASSERT(be32_to_cpu(agf->agf_length) == 341 ASSERT(be32_to_cpu(agf->agf_length) ==
346 be32_to_cpu(agi->agi_length)); 342 be32_to_cpu(agi->agi_length));
343
347 xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); 344 xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
348 /* 345 /*
349 * Free the new space. 346 * Free the new space.
@@ -354,6 +351,12 @@ xfs_growfs_data_private(
354 goto error0; 351 goto error0;
355 } 352 }
356 } 353 }
354
355 /*
356 * Update changed superblock fields transactionally. These are not
357 * seen by the rest of the world until the transaction commit applies
358 * them atomically to the superblock.
359 */
357 if (nagcount > oagcount) 360 if (nagcount > oagcount)
358 xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount); 361 xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
359 if (nb > mp->m_sb.sb_dblocks) 362 if (nb > mp->m_sb.sb_dblocks)
@@ -364,9 +367,9 @@ xfs_growfs_data_private(
364 if (dpct) 367 if (dpct)
365 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); 368 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
366 error = xfs_trans_commit(tp, 0); 369 error = xfs_trans_commit(tp, 0);
367 if (error) { 370 if (error)
368 return error; 371 return error;
369 } 372
370 /* New allocation groups fully initialized, so update mount struct */ 373 /* New allocation groups fully initialized, so update mount struct */
371 if (nagimax) 374 if (nagimax)
372 mp->m_maxagi = nagimax; 375 mp->m_maxagi = nagimax;
@@ -376,6 +379,8 @@ xfs_growfs_data_private(
376 mp->m_maxicount = icount << mp->m_sb.sb_inopblog; 379 mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
377 } else 380 } else
378 mp->m_maxicount = 0; 381 mp->m_maxicount = 0;
382
383 /* update secondary superblocks. */
379 for (agno = 1; agno < nagcount; agno++) { 384 for (agno = 1; agno < nagcount; agno++) {
380 error = xfs_read_buf(mp, mp->m_ddev_targp, 385 error = xfs_read_buf(mp, mp->m_ddev_targp,
381 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), 386 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
@@ -611,7 +616,7 @@ xfs_fs_log_dummy(
611 xfs_inode_t *ip; 616 xfs_inode_t *ip;
612 int error; 617 int error;
613 618
614 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); 619 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
615 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); 620 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
616 if (error) { 621 if (error) {
617 xfs_trans_cancel(tp, 0); 622 xfs_trans_cancel(tp, 0);
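
The reworked xfs_growfs_data_private() above follows the publish-after-prepare ordering the new comments spell out: headers for the new AGs are written synchronously first, and only then do the superblock counters change inside the transaction, so the larger filesystem becomes visible atomically at commit. An abstract sketch of that ordering; all names are hypothetical and this is not the XFS transaction API:

    #include <stdbool.h>
    #include <stdio.h>

    struct sb { int agcount; };

    /* Step 1: durable but invisible -- nothing references the new AGs yet. */
    static bool write_new_ag_headers(int first, int last)
    {
        for (int ag = first; ag < last; ag++)
            printf("AG %d headers written synchronously\n", ag);
        return true;
    }

    /* Step 2: publish; a single store stands in for the atomic commit. */
    static void commit_growfs(struct sb *sb, int new_agcount)
    {
        sb->agcount = new_agcount;
    }

    int main(void)
    {
        struct sb sb = { .agcount = 4 };

        if (!write_new_ag_headers(sb.agcount, 6))
            return 1;
        commit_growfs(&sb, 6);   /* readers see 4 AGs or 6, never 5 */
        printf("agcount=%d\n", sb.agcount);
        return 0;
    }
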
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 0785797db828..9d884c127bb9 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -205,7 +205,7 @@ xfs_ialloc_inode_init(
205 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); 205 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
206 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, 206 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
207 mp->m_bsize * blks_per_cluster, 207 mp->m_bsize * blks_per_cluster,
208 XFS_BUF_LOCK); 208 XBF_LOCK);
209 ASSERT(fbuf); 209 ASSERT(fbuf);
210 ASSERT(!XFS_BUF_GETERROR(fbuf)); 210 ASSERT(!XFS_BUF_GETERROR(fbuf));
211 211
@@ -253,6 +253,7 @@ xfs_ialloc_ag_alloc(
253 xfs_agino_t thisino; /* current inode number, for loop */ 253 xfs_agino_t thisino; /* current inode number, for loop */
254 int isaligned = 0; /* inode allocation at stripe unit */ 254 int isaligned = 0; /* inode allocation at stripe unit */
255 /* boundary */ 255 /* boundary */
256 struct xfs_perag *pag;
256 257
257 args.tp = tp; 258 args.tp = tp;
258 args.mp = tp->t_mountp; 259 args.mp = tp->t_mountp;
@@ -382,9 +383,9 @@ xfs_ialloc_ag_alloc(
382 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); 383 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
383 be32_add_cpu(&agi->agi_count, newlen); 384 be32_add_cpu(&agi->agi_count, newlen);
384 be32_add_cpu(&agi->agi_freecount, newlen); 385 be32_add_cpu(&agi->agi_freecount, newlen);
385 down_read(&args.mp->m_peraglock); 386 pag = xfs_perag_get(args.mp, agno);
386 args.mp->m_perag[agno].pagi_freecount += newlen; 387 pag->pagi_freecount += newlen;
387 up_read(&args.mp->m_peraglock); 388 xfs_perag_put(pag);
388 agi->agi_newino = cpu_to_be32(newino); 389 agi->agi_newino = cpu_to_be32(newino);
389 390
390 /* 391 /*
@@ -425,7 +426,7 @@ xfs_ialloc_ag_alloc(
425 return 0; 426 return 0;
426} 427}
427 428
428STATIC_INLINE xfs_agnumber_t 429STATIC xfs_agnumber_t
429xfs_ialloc_next_ag( 430xfs_ialloc_next_ag(
430 xfs_mount_t *mp) 431 xfs_mount_t *mp)
431{ 432{
@@ -486,9 +487,8 @@ xfs_ialloc_ag_select(
486 */ 487 */
487 agno = pagno; 488 agno = pagno;
488 flags = XFS_ALLOC_FLAG_TRYLOCK; 489 flags = XFS_ALLOC_FLAG_TRYLOCK;
489 down_read(&mp->m_peraglock);
490 for (;;) { 490 for (;;) {
491 pag = &mp->m_perag[agno]; 491 pag = xfs_perag_get(mp, agno);
492 if (!pag->pagi_init) { 492 if (!pag->pagi_init) {
493 if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { 493 if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
494 agbp = NULL; 494 agbp = NULL;
@@ -527,7 +527,7 @@ xfs_ialloc_ag_select(
527 agbp = NULL; 527 agbp = NULL;
528 goto nextag; 528 goto nextag;
529 } 529 }
530 up_read(&mp->m_peraglock); 530 xfs_perag_put(pag);
531 return agbp; 531 return agbp;
532 } 532 }
533 } 533 }
@@ -535,22 +535,19 @@ unlock_nextag:
535 if (agbp) 535 if (agbp)
536 xfs_trans_brelse(tp, agbp); 536 xfs_trans_brelse(tp, agbp);
537nextag: 537nextag:
538 xfs_perag_put(pag);
538 /* 539 /*
539 * No point in iterating over the rest, if we're shutting 540 * No point in iterating over the rest, if we're shutting
540 * down. 541 * down.
541 */ 542 */
542 if (XFS_FORCED_SHUTDOWN(mp)) { 543 if (XFS_FORCED_SHUTDOWN(mp))
543 up_read(&mp->m_peraglock);
544 return NULL; 544 return NULL;
545 }
546 agno++; 545 agno++;
547 if (agno >= agcount) 546 if (agno >= agcount)
548 agno = 0; 547 agno = 0;
549 if (agno == pagno) { 548 if (agno == pagno) {
550 if (flags == 0) { 549 if (flags == 0)
551 up_read(&mp->m_peraglock);
552 return NULL; 550 return NULL;
553 }
554 flags = 0; 551 flags = 0;
555 } 552 }
556 } 553 }
@@ -672,6 +669,7 @@ xfs_dialloc(
672 xfs_agnumber_t tagno; /* testing allocation group number */ 669 xfs_agnumber_t tagno; /* testing allocation group number */
673 xfs_btree_cur_t *tcur; /* temp cursor */ 670 xfs_btree_cur_t *tcur; /* temp cursor */
674 xfs_inobt_rec_incore_t trec; /* temp inode allocation record */ 671 xfs_inobt_rec_incore_t trec; /* temp inode allocation record */
672 struct xfs_perag *pag;
675 673
676 674
677 if (*IO_agbp == NULL) { 675 if (*IO_agbp == NULL) {
@@ -771,13 +769,13 @@ nextag:
771 *inop = NULLFSINO; 769 *inop = NULLFSINO;
772 return noroom ? ENOSPC : 0; 770 return noroom ? ENOSPC : 0;
773 } 771 }
774 down_read(&mp->m_peraglock); 772 pag = xfs_perag_get(mp, tagno);
775 if (mp->m_perag[tagno].pagi_inodeok == 0) { 773 if (pag->pagi_inodeok == 0) {
776 up_read(&mp->m_peraglock); 774 xfs_perag_put(pag);
777 goto nextag; 775 goto nextag;
778 } 776 }
779 error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp); 777 error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
780 up_read(&mp->m_peraglock); 778 xfs_perag_put(pag);
781 if (error) 779 if (error)
782 goto nextag; 780 goto nextag;
783 agi = XFS_BUF_TO_AGI(agbp); 781 agi = XFS_BUF_TO_AGI(agbp);
@@ -790,6 +788,7 @@ nextag:
790 */ 788 */
791 agno = tagno; 789 agno = tagno;
792 *IO_agbp = NULL; 790 *IO_agbp = NULL;
791 pag = xfs_perag_get(mp, agno);
793 792
794 restart_pagno: 793 restart_pagno:
795 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); 794 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
@@ -808,7 +807,6 @@ nextag:
808 * If in the same AG as the parent, try to get near the parent. 807 * If in the same AG as the parent, try to get near the parent.
809 */ 808 */
810 if (pagno == agno) { 809 if (pagno == agno) {
811 xfs_perag_t *pag = &mp->m_perag[agno];
812 int doneleft; /* done, to the left */ 810 int doneleft; /* done, to the left */
813 int doneright; /* done, to the right */ 811 int doneright; /* done, to the right */
814 int searchdistance = 10; 812 int searchdistance = 10;
@@ -1006,9 +1004,7 @@ alloc_inode:
1006 goto error0; 1004 goto error0;
1007 be32_add_cpu(&agi->agi_freecount, -1); 1005 be32_add_cpu(&agi->agi_freecount, -1);
1008 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); 1006 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1009 down_read(&mp->m_peraglock); 1007 pag->pagi_freecount--;
1010 mp->m_perag[tagno].pagi_freecount--;
1011 up_read(&mp->m_peraglock);
1012 1008
1013 error = xfs_check_agi_freecount(cur, agi); 1009 error = xfs_check_agi_freecount(cur, agi);
1014 if (error) 1010 if (error)
@@ -1016,12 +1012,14 @@ alloc_inode:
1016 1012
1017 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1013 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1018 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); 1014 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
1015 xfs_perag_put(pag);
1019 *inop = ino; 1016 *inop = ino;
1020 return 0; 1017 return 0;
1021error1: 1018error1:
1022 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 1019 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1023error0: 1020error0:
1024 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 1021 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1022 xfs_perag_put(pag);
1025 return error; 1023 return error;
1026} 1024}
1027 1025
@@ -1052,6 +1050,7 @@ xfs_difree(
1052 xfs_mount_t *mp; /* mount structure for filesystem */ 1050 xfs_mount_t *mp; /* mount structure for filesystem */
1053 int off; /* offset of inode in inode chunk */ 1051 int off; /* offset of inode in inode chunk */
1054 xfs_inobt_rec_incore_t rec; /* btree record */ 1052 xfs_inobt_rec_incore_t rec; /* btree record */
1053 struct xfs_perag *pag;
1055 1054
1056 mp = tp->t_mountp; 1055 mp = tp->t_mountp;
1057 1056
@@ -1088,9 +1087,7 @@ xfs_difree(
1088 /* 1087 /*
1089 * Get the allocation group header. 1088 * Get the allocation group header.
1090 */ 1089 */
1091 down_read(&mp->m_peraglock);
1092 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1090 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1093 up_read(&mp->m_peraglock);
1094 if (error) { 1091 if (error) {
1095 cmn_err(CE_WARN, 1092 cmn_err(CE_WARN,
1096 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.", 1093 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.",
@@ -1157,9 +1154,9 @@ xfs_difree(
1157 be32_add_cpu(&agi->agi_count, -ilen); 1154 be32_add_cpu(&agi->agi_count, -ilen);
1158 be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); 1155 be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
1159 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); 1156 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
1160 down_read(&mp->m_peraglock); 1157 pag = xfs_perag_get(mp, agno);
1161 mp->m_perag[agno].pagi_freecount -= ilen - 1; 1158 pag->pagi_freecount -= ilen - 1;
1162 up_read(&mp->m_peraglock); 1159 xfs_perag_put(pag);
1163 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); 1160 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
1164 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); 1161 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1165 1162
@@ -1188,9 +1185,9 @@ xfs_difree(
1188 */ 1185 */
1189 be32_add_cpu(&agi->agi_freecount, 1); 1186 be32_add_cpu(&agi->agi_freecount, 1);
1190 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); 1187 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1191 down_read(&mp->m_peraglock); 1188 pag = xfs_perag_get(mp, agno);
1192 mp->m_perag[agno].pagi_freecount++; 1189 pag->pagi_freecount++;
1193 up_read(&mp->m_peraglock); 1190 xfs_perag_put(pag);
1194 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); 1191 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
1195 } 1192 }
1196 1193
@@ -1312,9 +1309,7 @@ xfs_imap(
1312 xfs_buf_t *agbp; /* agi buffer */ 1309 xfs_buf_t *agbp; /* agi buffer */
1313 int i; /* temp state */ 1310 int i; /* temp state */
1314 1311
1315 down_read(&mp->m_peraglock);
1316 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1312 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1317 up_read(&mp->m_peraglock);
1318 if (error) { 1313 if (error) {
1319 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1314 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1320 "xfs_ialloc_read_agi() returned " 1315 "xfs_ialloc_read_agi() returned "
@@ -1379,7 +1374,6 @@ xfs_imap(
1379 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); 1374 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
1380 return XFS_ERROR(EINVAL); 1375 return XFS_ERROR(EINVAL);
1381 } 1376 }
1382
1383 return 0; 1377 return 0;
1384} 1378}
1385 1379
@@ -1523,8 +1517,7 @@ xfs_ialloc_read_agi(
1523 return error; 1517 return error;
1524 1518
1525 agi = XFS_BUF_TO_AGI(*bpp); 1519 agi = XFS_BUF_TO_AGI(*bpp);
1526 pag = &mp->m_perag[agno]; 1520 pag = xfs_perag_get(mp, agno);
1527
1528 if (!pag->pagi_init) { 1521 if (!pag->pagi_init) {
1529 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); 1522 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
1530 pag->pagi_count = be32_to_cpu(agi->agi_count); 1523 pag->pagi_count = be32_to_cpu(agi->agi_count);
@@ -1537,6 +1530,7 @@ xfs_ialloc_read_agi(
1537 */ 1530 */
1538 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || 1531 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
1539 XFS_FORCED_SHUTDOWN(mp)); 1532 XFS_FORCED_SHUTDOWN(mp));
1533 xfs_perag_put(pag);
1540 return 0; 1534 return 0;
1541} 1535}
1542 1536
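
A recurring detail in the xfs_ialloc.c hunks: the on-disk AGI counters are big-endian (hence be32_add_cpu()/be32_to_cpu()), while the per-AG structure keeps a host-order mirror (pagi_freecount) that is updated in lock step and asserted against the disk value. A stand-alone sketch of that pattern, using htonl()/ntohl() in place of the kernel helpers:

    #include <arpa/inet.h>
    #include <assert.h>
    #include <stdint.h>

    struct agi_disk { uint32_t agi_freecount; };   /* big-endian on disk */
    struct perag_mem { int32_t pagi_freecount; };  /* host-order mirror */

    static void agi_mod_freecount(struct agi_disk *agi, struct perag_mem *pag,
                                  int32_t delta)
    {
        /* The kernel writes this as be32_add_cpu(&agi->agi_freecount, delta). */
        agi->agi_freecount = htonl(ntohl(agi->agi_freecount) + delta);
        pag->pagi_freecount += delta;

        /* Mirrors the ASSERT that pagi_freecount matches agi_freecount. */
        assert((int32_t)ntohl(agi->agi_freecount) == pag->pagi_freecount);
    }

    int main(void)
    {
        struct agi_disk agi = { .agi_freecount = htonl(64) };
        struct perag_mem pag = { .pagi_freecount = 64 };

        agi_mod_freecount(&agi, &pag, -1);   /* one inode allocated */
        return pag.pagi_freecount == 63 ? 0 : 1;
    }
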
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 80e526489be5..6845db90818f 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -43,7 +43,7 @@
43#include "xfs_inode_item.h" 43#include "xfs_inode_item.h"
44#include "xfs_bmap.h" 44#include "xfs_bmap.h"
45#include "xfs_btree_trace.h" 45#include "xfs_btree_trace.h"
46#include "xfs_dir2_trace.h" 46#include "xfs_trace.h"
47 47
48 48
49/* 49/*
@@ -74,6 +74,8 @@ xfs_inode_alloc(
74 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 74 ASSERT(!spin_is_locked(&ip->i_flags_lock));
75 ASSERT(completion_done(&ip->i_flush)); 75 ASSERT(completion_done(&ip->i_flush));
76 76
77 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
78
77 /* initialise the xfs inode */ 79 /* initialise the xfs inode */
78 ip->i_ino = ino; 80 ip->i_ino = ino;
79 ip->i_mount = mp; 81 ip->i_mount = mp;
@@ -87,30 +89,8 @@ xfs_inode_alloc(
87 ip->i_size = 0; 89 ip->i_size = 0;
88 ip->i_new_size = 0; 90 ip->i_new_size = 0;
89 91
90 /*
91 * Initialize inode's trace buffers.
92 */
93#ifdef XFS_INODE_TRACE
94 ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
95#endif
96#ifdef XFS_BMAP_TRACE
97 ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
98#endif
99#ifdef XFS_BTREE_TRACE
100 ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
101#endif
102#ifdef XFS_RW_TRACE
103 ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
104#endif
105#ifdef XFS_ILOCK_TRACE
106 ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
107#endif
108#ifdef XFS_DIR2_TRACE
109 ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
110#endif
111
112 /* prevent anyone from using this yet */ 92 /* prevent anyone from using this yet */
113 VFS_I(ip)->i_state = I_NEW|I_LOCK; 93 VFS_I(ip)->i_state = I_NEW;
114 94
115 return ip; 95 return ip;
116} 96}
@@ -130,25 +110,6 @@ xfs_inode_free(
130 if (ip->i_afp) 110 if (ip->i_afp)
131 xfs_idestroy_fork(ip, XFS_ATTR_FORK); 111 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
132 112
133#ifdef XFS_INODE_TRACE
134 ktrace_free(ip->i_trace);
135#endif
136#ifdef XFS_BMAP_TRACE
137 ktrace_free(ip->i_xtrace);
138#endif
139#ifdef XFS_BTREE_TRACE
140 ktrace_free(ip->i_btrace);
141#endif
142#ifdef XFS_RW_TRACE
143 ktrace_free(ip->i_rwtrace);
144#endif
145#ifdef XFS_ILOCK_TRACE
146 ktrace_free(ip->i_lock_trace);
147#endif
148#ifdef XFS_DIR2_TRACE
149 ktrace_free(ip->i_dir_trace);
150#endif
151
152 if (ip->i_itemp) { 113 if (ip->i_itemp) {
153 /* 114 /*
154 * Only if we are shutting down the fs will we see an 115 * Only if we are shutting down the fs will we see an
@@ -207,6 +168,7 @@ xfs_iget_cache_hit(
207 * instead of polling for it. 168 * instead of polling for it.
208 */ 169 */
209 if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { 170 if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
171 trace_xfs_iget_skip(ip);
210 XFS_STATS_INC(xs_ig_frecycle); 172 XFS_STATS_INC(xs_ig_frecycle);
211 error = EAGAIN; 173 error = EAGAIN;
212 goto out_error; 174 goto out_error;
@@ -225,16 +187,15 @@ xfs_iget_cache_hit(
225 * Need to carefully get it back into usable state. 187 * Need to carefully get it back into usable state.
226 */ 188 */
227 if (ip->i_flags & XFS_IRECLAIMABLE) { 189 if (ip->i_flags & XFS_IRECLAIMABLE) {
228 xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); 190 trace_xfs_iget_reclaim(ip);
229 191
230 /* 192 /*
231 * We need to set XFS_INEW atomically with clearing the 193 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
232 * reclaimable tag so that we do have an indicator of the 194 * from stomping over us while we recycle the inode. We can't
233 * inode still being initialized. 195 * clear the radix tree reclaimable tag yet as it requires
196 * pag_ici_lock to be held exclusive.
234 */ 197 */
235 ip->i_flags |= XFS_INEW; 198 ip->i_flags |= XFS_IRECLAIM;
236 ip->i_flags &= ~XFS_IRECLAIMABLE;
237 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
238 199
239 spin_unlock(&ip->i_flags_lock); 200 spin_unlock(&ip->i_flags_lock);
240 read_unlock(&pag->pag_ici_lock); 201 read_unlock(&pag->pag_ici_lock);
@@ -251,9 +212,18 @@ xfs_iget_cache_hit(
251 ip->i_flags &= ~XFS_INEW; 212 ip->i_flags &= ~XFS_INEW;
252 ip->i_flags |= XFS_IRECLAIMABLE; 213 ip->i_flags |= XFS_IRECLAIMABLE;
253 __xfs_inode_set_reclaim_tag(pag, ip); 214 __xfs_inode_set_reclaim_tag(pag, ip);
215 trace_xfs_iget_reclaim(ip);
254 goto out_error; 216 goto out_error;
255 } 217 }
256 inode->i_state = I_LOCK|I_NEW; 218
219 write_lock(&pag->pag_ici_lock);
220 spin_lock(&ip->i_flags_lock);
221 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
222 ip->i_flags |= XFS_INEW;
223 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
224 inode->i_state = I_NEW;
225 spin_unlock(&ip->i_flags_lock);
226 write_unlock(&pag->pag_ici_lock);
257 } else { 227 } else {
258 /* If the VFS inode is being torn down, pause and try again. */ 228 /* If the VFS inode is being torn down, pause and try again. */
259 if (!igrab(inode)) { 229 if (!igrab(inode)) {
@@ -270,8 +240,9 @@ xfs_iget_cache_hit(
270 xfs_ilock(ip, lock_flags); 240 xfs_ilock(ip, lock_flags);
271 241
272 xfs_iflags_clear(ip, XFS_ISTALE); 242 xfs_iflags_clear(ip, XFS_ISTALE);
273 xfs_itrace_exit_tag(ip, "xfs_iget.found");
274 XFS_STATS_INC(xs_ig_found); 243 XFS_STATS_INC(xs_ig_found);
244
245 trace_xfs_iget_found(ip);
275 return 0; 246 return 0;
276 247
277out_error: 248out_error:
@@ -290,7 +261,7 @@ xfs_iget_cache_miss(
290 struct xfs_inode **ipp, 261 struct xfs_inode **ipp,
291 xfs_daddr_t bno, 262 xfs_daddr_t bno,
292 int flags, 263 int flags,
293 int lock_flags) __releases(pag->pag_ici_lock) 264 int lock_flags)
294{ 265{
295 struct xfs_inode *ip; 266 struct xfs_inode *ip;
296 int error; 267 int error;
@@ -305,7 +276,7 @@ xfs_iget_cache_miss(
305 if (error) 276 if (error)
306 goto out_destroy; 277 goto out_destroy;
307 278
308 xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); 279 xfs_itrace_entry(ip);
309 280
310 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { 281 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
311 error = ENOENT; 282 error = ENOENT;
@@ -350,6 +321,8 @@ xfs_iget_cache_miss(
350 321
351 write_unlock(&pag->pag_ici_lock); 322 write_unlock(&pag->pag_ici_lock);
352 radix_tree_preload_end(); 323 radix_tree_preload_end();
324
325 trace_xfs_iget_alloc(ip);
353 *ipp = ip; 326 *ipp = ip;
354 return 0; 327 return 0;
355 328
@@ -408,7 +381,7 @@ xfs_iget(
408 return EINVAL; 381 return EINVAL;
409 382
410 /* get the perag structure and ensure that it's inode capable */ 383 /* get the perag structure and ensure that it's inode capable */
411 pag = xfs_get_perag(mp, ino); 384 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
412 if (!pag->pagi_inodeok) 385 if (!pag->pagi_inodeok)
413 return EINVAL; 386 return EINVAL;
414 ASSERT(pag->pag_ici_init); 387 ASSERT(pag->pag_ici_init);
@@ -432,7 +405,7 @@ again:
432 if (error) 405 if (error)
433 goto out_error_or_again; 406 goto out_error_or_again;
434 } 407 }
435 xfs_put_perag(mp, pag); 408 xfs_perag_put(pag);
436 409
437 *ipp = ip; 410 *ipp = ip;
438 411
@@ -451,7 +424,7 @@ out_error_or_again:
451 delay(1); 424 delay(1);
452 goto again; 425 goto again;
453 } 426 }
454 xfs_put_perag(mp, pag); 427 xfs_perag_put(pag);
455 return error; 428 return error;
456} 429}
457 430
@@ -511,19 +484,23 @@ xfs_ireclaim(
511{ 484{
512 struct xfs_mount *mp = ip->i_mount; 485 struct xfs_mount *mp = ip->i_mount;
513 struct xfs_perag *pag; 486 struct xfs_perag *pag;
487 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
514 488
515 XFS_STATS_INC(xs_ig_reclaims); 489 XFS_STATS_INC(xs_ig_reclaims);
516 490
517 /* 491 /*
518 * Remove the inode from the per-AG radix tree. It doesn't matter 492 * Remove the inode from the per-AG radix tree.
519 * if it was never added to it because radix_tree_delete can deal 493 *
520 * with that case just fine. 494 * Because radix_tree_delete won't complain even if the item was never
495 * added to the tree, assert that it's been there before to catch
496 * problems with the inode lifetime early on.
521 */ 497 */
522 pag = xfs_get_perag(mp, ip->i_ino); 498 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
523 write_lock(&pag->pag_ici_lock); 499 write_lock(&pag->pag_ici_lock);
524 radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino)); 500 if (!radix_tree_delete(&pag->pag_ici_root, agino))
501 ASSERT(0);
525 write_unlock(&pag->pag_ici_lock); 502 write_unlock(&pag->pag_ici_lock);
526 xfs_put_perag(mp, pag); 503 xfs_perag_put(pag);
527 504
528 /* 505 /*
529 * Here we do an (almost) spurious inode lock in order to coordinate 506 * Here we do an (almost) spurious inode lock in order to coordinate
@@ -636,7 +613,7 @@ xfs_ilock(
636 else if (lock_flags & XFS_ILOCK_SHARED) 613 else if (lock_flags & XFS_ILOCK_SHARED)
637 mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); 614 mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
638 615
639 xfs_ilock_trace(ip, 1, lock_flags, (inst_t *)__return_address); 616 trace_xfs_ilock(ip, lock_flags, _RET_IP_);
640} 617}
641 618
642/* 619/*
@@ -681,7 +658,7 @@ xfs_ilock_nowait(
681 if (!mrtryaccess(&ip->i_lock)) 658 if (!mrtryaccess(&ip->i_lock))
682 goto out_undo_iolock; 659 goto out_undo_iolock;
683 } 660 }
684 xfs_ilock_trace(ip, 2, lock_flags, (inst_t *)__return_address); 661 trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
685 return 1; 662 return 1;
686 663
687 out_undo_iolock: 664 out_undo_iolock:
@@ -743,7 +720,7 @@ xfs_iunlock(
743 xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp, 720 xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp,
744 (xfs_log_item_t*)(ip->i_itemp)); 721 (xfs_log_item_t*)(ip->i_itemp));
745 } 722 }
746 xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address); 723 trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
747} 724}
748 725
749/* 726/*
@@ -762,6 +739,8 @@ xfs_ilock_demote(
762 mrdemote(&ip->i_lock); 739 mrdemote(&ip->i_lock);
763 if (lock_flags & XFS_IOLOCK_EXCL) 740 if (lock_flags & XFS_IOLOCK_EXCL)
764 mrdemote(&ip->i_iolock); 741 mrdemote(&ip->i_iolock);
742
743 trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
765} 744}
766 745
767#ifdef DEBUG 746#ifdef DEBUG
@@ -792,52 +771,3 @@ xfs_isilocked(
792 return 1; 771 return 1;
793} 772}
794#endif 773#endif
795
796#ifdef XFS_INODE_TRACE
797
798#define KTRACE_ENTER(ip, vk, s, line, ra) \
799 ktrace_enter((ip)->i_trace, \
800/* 0 */ (void *)(__psint_t)(vk), \
801/* 1 */ (void *)(s), \
802/* 2 */ (void *)(__psint_t) line, \
803/* 3 */ (void *)(__psint_t)atomic_read(&VFS_I(ip)->i_count), \
804/* 4 */ (void *)(ra), \
805/* 5 */ NULL, \
806/* 6 */ (void *)(__psint_t)current_cpu(), \
807/* 7 */ (void *)(__psint_t)current_pid(), \
808/* 8 */ (void *)__return_address, \
809/* 9 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL)
810
811/*
812 * Vnode tracing code.
813 */
814void
815_xfs_itrace_entry(xfs_inode_t *ip, const char *func, inst_t *ra)
816{
817 KTRACE_ENTER(ip, INODE_KTRACE_ENTRY, func, 0, ra);
818}
819
820void
821_xfs_itrace_exit(xfs_inode_t *ip, const char *func, inst_t *ra)
822{
823 KTRACE_ENTER(ip, INODE_KTRACE_EXIT, func, 0, ra);
824}
825
826void
827xfs_itrace_hold(xfs_inode_t *ip, char *file, int line, inst_t *ra)
828{
829 KTRACE_ENTER(ip, INODE_KTRACE_HOLD, file, line, ra);
830}
831
832void
833_xfs_itrace_ref(xfs_inode_t *ip, char *file, int line, inst_t *ra)
834{
835 KTRACE_ENTER(ip, INODE_KTRACE_REF, file, line, ra);
836}
837
838void
839xfs_itrace_rele(xfs_inode_t *ip, char *file, int line, inst_t *ra)
840{
841 KTRACE_ENTER(ip, INODE_KTRACE_RELE, file, line, ra);
842}
843#endif /* XFS_INODE_TRACE */
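
The xfs_iget_cache_hit() hunks above reorder the reclaim handshake: XFS_IRECLAIM is set first, under i_flags_lock, to stop xfs_reclaim_inode() touching the inode while it is re-initialised, and only afterwards, with the per-AG lock held exclusively, are the reclaim flags cleared, XFS_INEW set, and the radix-tree tag dropped. A condensed user-space sketch of that ordering; pthread locks stand in for i_flags_lock and pag_ici_lock, and only the flag names come from the diff:

    #include <pthread.h>

    #define XFS_INEW         0x1u
    #define XFS_IRECLAIMABLE 0x2u
    #define XFS_IRECLAIM     0x4u

    struct inode_sketch {
        pthread_spinlock_t flags_lock;   /* stands in for i_flags_lock */
        unsigned int       flags;
    };

    static pthread_rwlock_t pag_ici_lock = PTHREAD_RWLOCK_INITIALIZER;

    static void recycle(struct inode_sketch *ip)
    {
        /* Step 1: fence off the reclaimer before dropping the locks. */
        pthread_spin_lock(&ip->flags_lock);
        ip->flags |= XFS_IRECLAIM;
        pthread_spin_unlock(&ip->flags_lock);

        /* ... re-initialise the VFS inode here with no locks held ... */

        /* Step 2: clearing reclaimable state needs the AG lock exclusive. */
        pthread_rwlock_wrlock(&pag_ici_lock);
        pthread_spin_lock(&ip->flags_lock);
        ip->flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
        ip->flags |= XFS_INEW;
        pthread_spin_unlock(&ip->flags_lock);
        pthread_rwlock_unlock(&pag_ici_lock);
    }

    int main(void)
    {
        struct inode_sketch ip = { .flags = XFS_IRECLAIMABLE };

        pthread_spin_init(&ip.flags_lock, PTHREAD_PROCESS_PRIVATE);
        recycle(&ip);
        pthread_spin_destroy(&ip.flags_lock);
        return ip.flags == XFS_INEW ? 0 : 1;   /* link with -lpthread */
    }
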
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b92a4fa2a0a1..0ffd56447045 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -47,10 +47,10 @@
47#include "xfs_rw.h" 47#include "xfs_rw.h"
48#include "xfs_error.h" 48#include "xfs_error.h"
49#include "xfs_utils.h" 49#include "xfs_utils.h"
50#include "xfs_dir2_trace.h"
51#include "xfs_quota.h" 50#include "xfs_quota.h"
52#include "xfs_filestream.h" 51#include "xfs_filestream.h"
53#include "xfs_vnodeops.h" 52#include "xfs_vnodeops.h"
53#include "xfs_trace.h"
54 54
55kmem_zone_t *xfs_ifork_zone; 55kmem_zone_t *xfs_ifork_zone;
56kmem_zone_t *xfs_inode_zone; 56kmem_zone_t *xfs_inode_zone;
@@ -151,7 +151,7 @@ xfs_imap_to_bp(
151 "an error %d on %s. Returning error.", 151 "an error %d on %s. Returning error.",
152 error, mp->m_fsname); 152 error, mp->m_fsname);
153 } else { 153 } else {
154 ASSERT(buf_flags & XFS_BUF_TRYLOCK); 154 ASSERT(buf_flags & XBF_TRYLOCK);
155 } 155 }
156 return error; 156 return error;
157 } 157 }
@@ -239,7 +239,7 @@ xfs_inotobp(
239 if (error) 239 if (error)
240 return error; 240 return error;
241 241
242 error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags); 242 error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags);
243 if (error) 243 if (error)
244 return error; 244 return error;
245 245
@@ -285,7 +285,7 @@ xfs_itobp(
285 return error; 285 return error;
286 286
287 if (!bp) { 287 if (!bp) {
288 ASSERT(buf_flags & XFS_BUF_TRYLOCK); 288 ASSERT(buf_flags & XBF_TRYLOCK);
289 ASSERT(tp == NULL); 289 ASSERT(tp == NULL);
290 *bpp = NULL; 290 *bpp = NULL;
291 return EAGAIN; 291 return EAGAIN;
@@ -807,7 +807,7 @@ xfs_iread(
807 * Get pointers to the on-disk inode and the buffer containing it. 807 * Get pointers to the on-disk inode and the buffer containing it.
808 */ 808 */
809 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 809 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp,
810 XFS_BUF_LOCK, iget_flags); 810 XBF_LOCK, iget_flags);
811 if (error) 811 if (error)
812 return error; 812 return error;
813 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 813 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -1291,42 +1291,6 @@ xfs_file_last_byte(
1291 return last_byte; 1291 return last_byte;
1292} 1292}
1293 1293
1294#if defined(XFS_RW_TRACE)
1295STATIC void
1296xfs_itrunc_trace(
1297 int tag,
1298 xfs_inode_t *ip,
1299 int flag,
1300 xfs_fsize_t new_size,
1301 xfs_off_t toss_start,
1302 xfs_off_t toss_finish)
1303{
1304 if (ip->i_rwtrace == NULL) {
1305 return;
1306 }
1307
1308 ktrace_enter(ip->i_rwtrace,
1309 (void*)((long)tag),
1310 (void*)ip,
1311 (void*)(unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff),
1312 (void*)(unsigned long)(ip->i_d.di_size & 0xffffffff),
1313 (void*)((long)flag),
1314 (void*)(unsigned long)((new_size >> 32) & 0xffffffff),
1315 (void*)(unsigned long)(new_size & 0xffffffff),
1316 (void*)(unsigned long)((toss_start >> 32) & 0xffffffff),
1317 (void*)(unsigned long)(toss_start & 0xffffffff),
1318 (void*)(unsigned long)((toss_finish >> 32) & 0xffffffff),
1319 (void*)(unsigned long)(toss_finish & 0xffffffff),
1320 (void*)(unsigned long)current_cpu(),
1321 (void*)(unsigned long)current_pid(),
1322 (void*)NULL,
1323 (void*)NULL,
1324 (void*)NULL);
1325}
1326#else
1327#define xfs_itrunc_trace(tag, ip, flag, new_size, toss_start, toss_finish)
1328#endif
1329
1330/* 1294/*
1331 * Start the truncation of the file to new_size. The new size 1295 * Start the truncation of the file to new_size. The new size
1332 * must be smaller than the current size. This routine will 1296 * must be smaller than the current size. This routine will
@@ -1409,8 +1373,7 @@ xfs_itruncate_start(
1409 return 0; 1373 return 0;
1410 } 1374 }
1411 last_byte = xfs_file_last_byte(ip); 1375 last_byte = xfs_file_last_byte(ip);
1412 xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start, 1376 trace_xfs_itruncate_start(ip, flags, new_size, toss_start, last_byte);
1413 last_byte);
1414 if (last_byte > toss_start) { 1377 if (last_byte > toss_start) {
1415 if (flags & XFS_ITRUNC_DEFINITE) { 1378 if (flags & XFS_ITRUNC_DEFINITE) {
1416 xfs_tosspages(ip, toss_start, 1379 xfs_tosspages(ip, toss_start,
@@ -1514,7 +1477,8 @@ xfs_itruncate_finish(
1514 new_size = 0LL; 1477 new_size = 0LL;
1515 } 1478 }
1516 first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); 1479 first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1517 xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0); 1480 trace_xfs_itruncate_finish_start(ip, new_size);
1481
1518 /* 1482 /*
1519 * The first thing we do is set the size to new_size permanently 1483 * The first thing we do is set the size to new_size permanently
1520 * on disk. This way we don't have to worry about anyone ever 1484 * on disk. This way we don't have to worry about anyone ever
@@ -1731,7 +1695,7 @@ xfs_itruncate_finish(
1731 ASSERT((new_size != 0) || 1695 ASSERT((new_size != 0) ||
1732 (fork == XFS_ATTR_FORK) || 1696 (fork == XFS_ATTR_FORK) ||
1733 (ip->i_d.di_nextents == 0)); 1697 (ip->i_d.di_nextents == 0));
1734 xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0); 1698 trace_xfs_itruncate_finish_end(ip, new_size);
1735 return 0; 1699 return 0;
1736} 1700}
1737 1701
@@ -1787,7 +1751,7 @@ xfs_iunlink(
1787 * Here we put the head pointer into our next pointer, 1751 * Here we put the head pointer into our next pointer,
1788 * and then we fall through to point the head at us. 1752 * and then we fall through to point the head at us.
1789 */ 1753 */
1790 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 1754 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1791 if (error) 1755 if (error)
1792 return error; 1756 return error;
1793 1757
@@ -1869,7 +1833,7 @@ xfs_iunlink_remove(
1869 * of dealing with the buffer when there is no need to 1833 * of dealing with the buffer when there is no need to
1870 * change it. 1834 * change it.
1871 */ 1835 */
1872 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 1836 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1873 if (error) { 1837 if (error) {
1874 cmn_err(CE_WARN, 1838 cmn_err(CE_WARN,
1875 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1839 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -1931,7 +1895,7 @@ xfs_iunlink_remove(
1931 * Now last_ibp points to the buffer previous to us on 1895 * Now last_ibp points to the buffer previous to us on
1932 * the unlinked list. Pull us from the list. 1896 * the unlinked list. Pull us from the list.
1933 */ 1897 */
1934 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 1898 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1935 if (error) { 1899 if (error) {
1936 cmn_err(CE_WARN, 1900 cmn_err(CE_WARN,
1937 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1901 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -1982,8 +1946,9 @@ xfs_ifree_cluster(
1982 xfs_inode_t *ip, **ip_found; 1946 xfs_inode_t *ip, **ip_found;
1983 xfs_inode_log_item_t *iip; 1947 xfs_inode_log_item_t *iip;
1984 xfs_log_item_t *lip; 1948 xfs_log_item_t *lip;
1985 xfs_perag_t *pag = xfs_get_perag(mp, inum); 1949 struct xfs_perag *pag;
1986 1950
1951 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
1987 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { 1952 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
1988 blks_per_cluster = 1; 1953 blks_per_cluster = 1;
1989 ninodes = mp->m_sb.sb_inopblock; 1954 ninodes = mp->m_sb.sb_inopblock;
@@ -2075,7 +2040,7 @@ xfs_ifree_cluster(
2075 2040
2076 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 2041 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2077 mp->m_bsize * blks_per_cluster, 2042 mp->m_bsize * blks_per_cluster,
2078 XFS_BUF_LOCK); 2043 XBF_LOCK);
2079 2044
2080 pre_flushed = 0; 2045 pre_flushed = 0;
2081 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 2046 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
@@ -2124,7 +2089,7 @@ xfs_ifree_cluster(
2124 } 2089 }
2125 2090
2126 kmem_free(ip_found); 2091 kmem_free(ip_found);
2127 xfs_put_perag(mp, pag); 2092 xfs_perag_put(pag);
2128} 2093}
2129 2094
2130/* 2095/*
@@ -2186,7 +2151,7 @@ xfs_ifree(
2186 2151
2187 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2152 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2188 2153
2189 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 2154 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK);
2190 if (error) 2155 if (error)
2191 return error; 2156 return error;
2192 2157
@@ -2474,72 +2439,31 @@ xfs_idestroy_fork(
2474} 2439}
2475 2440
2476/* 2441/*
2477 * Increment the pin count of the given buffer. 2442 * This is called to unpin an inode. The caller must have the inode locked
2478 * This value is protected by ipinlock spinlock in the mount structure. 2443 * in at least shared mode so that the buffer cannot be subsequently pinned
2479 */ 2444 * once someone is waiting for it to be unpinned.
2480void
2481xfs_ipin(
2482 xfs_inode_t *ip)
2483{
2484 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2485
2486 atomic_inc(&ip->i_pincount);
2487}
2488
2489/*
2490 * Decrement the pin count of the given inode, and wake up
2491 * anyone in xfs_iwait_unpin() if the count goes to 0. The
2492 * inode must have been previously pinned with a call to xfs_ipin().
2493 */ 2445 */
2494void 2446static void
2495xfs_iunpin( 2447xfs_iunpin_nowait(
2496 xfs_inode_t *ip) 2448 struct xfs_inode *ip)
2497{
2498 ASSERT(atomic_read(&ip->i_pincount) > 0);
2499
2500 if (atomic_dec_and_test(&ip->i_pincount))
2501 wake_up(&ip->i_ipin_wait);
2502}
2503
2504/*
2505 * This is called to unpin an inode. It can be directed to wait or to return
2506 * immediately without waiting for the inode to be unpinned. The caller must
2507 * have the inode locked in at least shared mode so that the buffer cannot be
2508 * subsequently pinned once someone is waiting for it to be unpinned.
2509 */
2510STATIC void
2511__xfs_iunpin_wait(
2512 xfs_inode_t *ip,
2513 int wait)
2514{ 2449{
2515 xfs_inode_log_item_t *iip = ip->i_itemp;
2516
2517 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2518 if (atomic_read(&ip->i_pincount) == 0)
2519 return;
2520 2451
2521 /* Give the log a push to start the unpinning I/O */ 2452 /* Give the log a push to start the unpinning I/O */
2522 xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ? 2453 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2523 iip->ili_last_lsn : 0, XFS_LOG_FORCE);
2524 if (wait)
2525 wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
2526}
2527 2454
2528static inline void
2529xfs_iunpin_wait(
2530 xfs_inode_t *ip)
2531{
2532 __xfs_iunpin_wait(ip, 1);
2533} 2455}
2534 2456
2535static inline void 2457void
2536xfs_iunpin_nowait( 2458xfs_iunpin_wait(
2537 xfs_inode_t *ip) 2459 struct xfs_inode *ip)
2538{ 2460{
2539 __xfs_iunpin_wait(ip, 0); 2461 if (xfs_ipincount(ip)) {
2462 xfs_iunpin_nowait(ip);
2463 wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
2464 }
2540} 2465}
2541 2466
2542
2543/* 2467/*
2544 * xfs_iextents_copy() 2468 * xfs_iextents_copy()
2545 * 2469 *
@@ -2711,7 +2635,7 @@ xfs_iflush_cluster(
2711 xfs_buf_t *bp) 2635 xfs_buf_t *bp)
2712{ 2636{
2713 xfs_mount_t *mp = ip->i_mount; 2637 xfs_mount_t *mp = ip->i_mount;
2714 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); 2638 struct xfs_perag *pag;
2715 unsigned long first_index, mask; 2639 unsigned long first_index, mask;
2716 unsigned long inodes_per_cluster; 2640 unsigned long inodes_per_cluster;
2717 int ilist_size; 2641 int ilist_size;
@@ -2722,6 +2646,7 @@ xfs_iflush_cluster(
2722 int bufwasdelwri; 2646 int bufwasdelwri;
2723 int i; 2647 int i;
2724 2648
2649 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2725 ASSERT(pag->pagi_inodeok); 2650 ASSERT(pag->pagi_inodeok);
2726 ASSERT(pag->pag_ici_init); 2651 ASSERT(pag->pag_ici_init);
2727 2652
@@ -2729,7 +2654,7 @@ xfs_iflush_cluster(
2729 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2654 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
2730 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); 2655 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
2731 if (!ilist) 2656 if (!ilist)
2732 return 0; 2657 goto out_put;
2733 2658
2734 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2659 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2735 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2660 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
@@ -2798,6 +2723,8 @@ xfs_iflush_cluster(
2798out_free: 2723out_free:
2799 read_unlock(&pag->pag_ici_lock); 2724 read_unlock(&pag->pag_ici_lock);
2800 kmem_free(ilist); 2725 kmem_free(ilist);
2726out_put:
2727 xfs_perag_put(pag);
2801 return 0; 2728 return 0;
2802 2729
2803 2730
@@ -2841,6 +2768,7 @@ cluster_corrupt_out:
2841 */ 2768 */
2842 xfs_iflush_abort(iq); 2769 xfs_iflush_abort(iq);
2843 kmem_free(ilist); 2770 kmem_free(ilist);
2771 xfs_perag_put(pag);
2844 return XFS_ERROR(EFSCORRUPTED); 2772 return XFS_ERROR(EFSCORRUPTED);
2845} 2773}
2846 2774
@@ -2863,8 +2791,6 @@ xfs_iflush(
2863 xfs_dinode_t *dip; 2791 xfs_dinode_t *dip;
2864 xfs_mount_t *mp; 2792 xfs_mount_t *mp;
2865 int error; 2793 int error;
2866 int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
2867 enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
2868 2794
2869 XFS_STATS_INC(xs_iflush_count); 2795 XFS_STATS_INC(xs_iflush_count);
2870 2796
@@ -2877,15 +2803,6 @@ xfs_iflush(
2877 mp = ip->i_mount; 2803 mp = ip->i_mount;
2878 2804
2879 /* 2805 /*
2880 * If the inode isn't dirty, then just release the inode
2881 * flush lock and do nothing.
2882 */
2883 if (xfs_inode_clean(ip)) {
2884 xfs_ifunlock(ip);
2885 return 0;
2886 }
2887
2888 /*
2889 * We can't flush the inode until it is unpinned, so wait for it if we 2806 * We can't flush the inode until it is unpinned, so wait for it if we
2890 * are allowed to block. We know no one new can pin it, because we are 2807 * are allowed to block. We know no one new can pin it, because we are
2891 * holding the inode lock shared and you need to hold it exclusively to 2808 * holding the inode lock shared and you need to hold it exclusively to
@@ -2896,7 +2813,7 @@ xfs_iflush(
2896 * in the same cluster are dirty, they will probably write the inode 2813 * in the same cluster are dirty, they will probably write the inode
2897 * out for us if they occur after the log force completes. 2814 * out for us if they occur after the log force completes.
2898 */ 2815 */
2899 if (noblock && xfs_ipincount(ip)) { 2816 if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
2900 xfs_iunpin_nowait(ip); 2817 xfs_iunpin_nowait(ip);
2901 xfs_ifunlock(ip); 2818 xfs_ifunlock(ip);
2902 return EAGAIN; 2819 return EAGAIN;
@@ -2904,6 +2821,19 @@ xfs_iflush(
2904 xfs_iunpin_wait(ip); 2821 xfs_iunpin_wait(ip);
2905 2822
2906 /* 2823 /*
2824 * For stale inodes we cannot rely on the backing buffer remaining
2825 * stale in cache for the remaining life of the stale inode and so
2826 * xfs_itobp() below may give us a buffer that no longer contains
2827 * inodes. We have to check this after ensuring the inode is
2828 * unpinned so that it is safe to reclaim the stale inode after the
2829 * flush call.
2830 */
2831 if (xfs_iflags_test(ip, XFS_ISTALE)) {
2832 xfs_ifunlock(ip);
2833 return 0;
2834 }
2835
2836 /*
2907 * This may have been unpinned because the filesystem is shutting 2837 * This may have been unpinned because the filesystem is shutting
2908 * down forcibly. If that's the case we must not write this inode 2838 * down forcibly. If that's the case we must not write this inode
2909 * to disk, because the log record didn't make it to disk! 2839 * to disk, because the log record didn't make it to disk!
@@ -2917,60 +2847,10 @@ xfs_iflush(
2917 } 2847 }
2918 2848
2919 /* 2849 /*
2920 * Decide how buffer will be flushed out. This is done before
2921 * the call to xfs_iflush_int because this field is zeroed by it.
2922 */
2923 if (iip != NULL && iip->ili_format.ilf_fields != 0) {
2924 /*
2925 * Flush out the inode buffer according to the directions
2926 * of the caller. In the cases where the caller has given
2927 * us a choice choose the non-delwri case. This is because
2928 * the inode is in the AIL and we need to get it out soon.
2929 */
2930 switch (flags) {
2931 case XFS_IFLUSH_SYNC:
2932 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
2933 flags = 0;
2934 break;
2935 case XFS_IFLUSH_ASYNC_NOBLOCK:
2936 case XFS_IFLUSH_ASYNC:
2937 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
2938 flags = INT_ASYNC;
2939 break;
2940 case XFS_IFLUSH_DELWRI:
2941 flags = INT_DELWRI;
2942 break;
2943 default:
2944 ASSERT(0);
2945 flags = 0;
2946 break;
2947 }
2948 } else {
2949 switch (flags) {
2950 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
2951 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
2952 case XFS_IFLUSH_DELWRI:
2953 flags = INT_DELWRI;
2954 break;
2955 case XFS_IFLUSH_ASYNC_NOBLOCK:
2956 case XFS_IFLUSH_ASYNC:
2957 flags = INT_ASYNC;
2958 break;
2959 case XFS_IFLUSH_SYNC:
2960 flags = 0;
2961 break;
2962 default:
2963 ASSERT(0);
2964 flags = 0;
2965 break;
2966 }
2967 }
2968
2969 /*
2970 * Get the buffer containing the on-disk inode. 2850 * Get the buffer containing the on-disk inode.
2971 */ 2851 */
2972 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 2852 error = xfs_itobp(mp, NULL, ip, &dip, &bp,
2973 noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK); 2853 (flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK);
2974 if (error || !bp) { 2854 if (error || !bp) {
2975 xfs_ifunlock(ip); 2855 xfs_ifunlock(ip);
2976 return error; 2856 return error;
@@ -2988,7 +2868,7 @@ xfs_iflush(
2988 * get stuck waiting in the write for too long. 2868 * get stuck waiting in the write for too long.
2989 */ 2869 */
2990 if (XFS_BUF_ISPINNED(bp)) 2870 if (XFS_BUF_ISPINNED(bp))
2991 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 2871 xfs_log_force(mp, 0);
2992 2872
2993 /* 2873 /*
2994 * inode clustering: 2874 * inode clustering:
@@ -2998,13 +2878,10 @@ xfs_iflush(
2998 if (error) 2878 if (error)
2999 goto cluster_corrupt_out; 2879 goto cluster_corrupt_out;
3000 2880
3001 if (flags & INT_DELWRI) { 2881 if (flags & SYNC_WAIT)
3002 xfs_bdwrite(mp, bp);
3003 } else if (flags & INT_ASYNC) {
3004 error = xfs_bawrite(mp, bp);
3005 } else {
3006 error = xfs_bwrite(mp, bp); 2882 error = xfs_bwrite(mp, bp);
3007 } 2883 else
2884 xfs_bdwrite(mp, bp);
3008 return error; 2885 return error;
3009 2886
3010corrupt_out: 2887corrupt_out:
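[Annotation] Together with the removal of the XFS_IFLUSH_* constants from xfs_inode.h further down, this hunk reduces the xfs_iflush() interface to the generic sync flags: 0 means a delayed write via xfs_bdwrite(), SYNC_WAIT means a blocking xfs_bwrite(). Hypothetical call sites under the new convention (not lines from this patch):

	/* blocking flush: wait for unpin, lock the buffer, write synchronously */
	error = xfs_iflush(ip, SYNC_WAIT);

	/* non-blocking flush: trylock the buffer, queue a delayed write */
	error = xfs_iflush(ip, 0);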
@@ -3039,16 +2916,6 @@ xfs_iflush_int(
3039 iip = ip->i_itemp; 2916 iip = ip->i_itemp;
3040 mp = ip->i_mount; 2917 mp = ip->i_mount;
3041 2918
3042
3043 /*
3044 * If the inode isn't dirty, then just release the inode
3045 * flush lock and do nothing.
3046 */
3047 if (xfs_inode_clean(ip)) {
3048 xfs_ifunlock(ip);
3049 return 0;
3050 }
3051
3052 /* set *dip = inode's place in the buffer */ 2919 /* set *dip = inode's place in the buffer */
3053 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 2920 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
3054 2921
@@ -3252,23 +3119,6 @@ corrupt_out:
3252 return XFS_ERROR(EFSCORRUPTED); 3119 return XFS_ERROR(EFSCORRUPTED);
3253} 3120}
3254 3121
3255
3256
3257#ifdef XFS_ILOCK_TRACE
3258void
3259xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra)
3260{
3261 ktrace_enter(ip->i_lock_trace,
3262 (void *)ip,
3263 (void *)(unsigned long)lock, /* 1 = LOCK, 3=UNLOCK, etc */
3264 (void *)(unsigned long)lockflags, /* XFS_ILOCK_EXCL etc */
3265 (void *)ra, /* caller of ilock */
3266 (void *)(unsigned long)current_cpu(),
3267 (void *)(unsigned long)current_pid(),
3268 NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL);
3269}
3270#endif
3271
3272/* 3122/*
3273 * Return a pointer to the extent record at file index idx. 3123 * Return a pointer to the extent record at file index idx.
3274 */ 3124 */
@@ -3300,13 +3150,17 @@ xfs_iext_get_ext(
3300 */ 3150 */
3301void 3151void
3302xfs_iext_insert( 3152xfs_iext_insert(
3303 xfs_ifork_t *ifp, /* inode fork pointer */ 3153 xfs_inode_t *ip, /* incore inode pointer */
3304 xfs_extnum_t idx, /* starting index of new items */ 3154 xfs_extnum_t idx, /* starting index of new items */
3305 xfs_extnum_t count, /* number of inserted items */ 3155 xfs_extnum_t count, /* number of inserted items */
3306 xfs_bmbt_irec_t *new) /* items to insert */ 3156 xfs_bmbt_irec_t *new, /* items to insert */
3157 int state) /* type of extent conversion */
3307{ 3158{
3159 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
3308 xfs_extnum_t i; /* extent record index */ 3160 xfs_extnum_t i; /* extent record index */
3309 3161
3162 trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
3163
3310 ASSERT(ifp->if_flags & XFS_IFEXTENTS); 3164 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
3311 xfs_iext_add(ifp, idx, count); 3165 xfs_iext_add(ifp, idx, count);
3312 for (i = idx; i < idx + count; i++, new++) 3166 for (i = idx; i < idx + count; i++, new++)
@@ -3549,13 +3403,17 @@ xfs_iext_add_indirect_multi(
3549 */ 3403 */
3550void 3404void
3551xfs_iext_remove( 3405xfs_iext_remove(
3552 xfs_ifork_t *ifp, /* inode fork pointer */ 3406 xfs_inode_t *ip, /* incore inode pointer */
3553 xfs_extnum_t idx, /* index to begin removing exts */ 3407 xfs_extnum_t idx, /* index to begin removing exts */
3554 int ext_diff) /* number of extents to remove */ 3408 int ext_diff, /* number of extents to remove */
3409 int state) /* type of extent conversion */
3555{ 3410{
3411 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
3556 xfs_extnum_t nextents; /* number of extents in file */ 3412 xfs_extnum_t nextents; /* number of extents in file */
3557 int new_size; /* size of extents after removal */ 3413 int new_size; /* size of extents after removal */
3558 3414
3415 trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
3416
3559 ASSERT(ext_diff > 0); 3417 ASSERT(ext_diff > 0);
3560 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3418 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3561 new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); 3419 new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
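[Annotation] Both extent helpers now take the inode instead of a fork pointer, derive the fork from the BMAP_ATTRFORK bit of the new state argument, and emit a tracepoint on every update. A hypothetical caller under the new signatures (the extent record setup is assumed):

	xfs_bmbt_irec_t	new;	/* assume br_startoff etc. are filled in */

	/* insert one record into the attribute fork: the helper resolves
	 * ip->i_afp from BMAP_ATTRFORK and fires trace_xfs_iext_insert() */
	xfs_iext_insert(ip, idx, 1, &new, BMAP_ATTRFORK);

	/* remove one record from the data fork (state 0 selects ip->i_df) */
	xfs_iext_remove(ip, idx, 1, 0);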
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 41555de1d1db..9965e40a4615 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -213,7 +213,6 @@ typedef struct xfs_icdinode {
213 213
214struct bhv_desc; 214struct bhv_desc;
215struct cred; 215struct cred;
216struct ktrace;
217struct xfs_buf; 216struct xfs_buf;
218struct xfs_bmap_free; 217struct xfs_bmap_free;
219struct xfs_bmbt_irec; 218struct xfs_bmbt_irec;
@@ -222,13 +221,6 @@ struct xfs_mount;
222struct xfs_trans; 221struct xfs_trans;
223struct xfs_dquot; 222struct xfs_dquot;
224 223
225#if defined(XFS_ILOCK_TRACE)
226#define XFS_ILOCK_KTRACE_SIZE 32
227extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *);
228#else
229#define xfs_ilock_trace(i,n,f,ra)
230#endif
231
232typedef struct dm_attrs_s { 224typedef struct dm_attrs_s {
233 __uint32_t da_dmevmask; /* DMIG event mask */ 225 __uint32_t da_dmevmask; /* DMIG event mask */
234 __uint16_t da_dmstate; /* DMIG state info */ 226 __uint16_t da_dmstate; /* DMIG state info */
@@ -271,26 +263,6 @@ typedef struct xfs_inode {
271 263
272 /* VFS inode */ 264 /* VFS inode */
273 struct inode i_vnode; /* embedded VFS inode */ 265 struct inode i_vnode; /* embedded VFS inode */
274
275 /* Trace buffers per inode. */
276#ifdef XFS_INODE_TRACE
277 struct ktrace *i_trace; /* general inode trace */
278#endif
279#ifdef XFS_BMAP_TRACE
280 struct ktrace *i_xtrace; /* inode extent list trace */
281#endif
282#ifdef XFS_BTREE_TRACE
283 struct ktrace *i_btrace; /* inode bmap btree trace */
284#endif
285#ifdef XFS_RW_TRACE
286 struct ktrace *i_rwtrace; /* inode read/write trace */
287#endif
288#ifdef XFS_ILOCK_TRACE
289 struct ktrace *i_lock_trace; /* inode lock/unlock trace */
290#endif
291#ifdef XFS_DIR2_TRACE
292 struct ktrace *i_dir_trace; /* inode directory trace */
293#endif
294} xfs_inode_t; 266} xfs_inode_t;
295 267
296#define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \ 268#define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \
@@ -406,6 +378,14 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
406#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ 378#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
407 | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED) 379 | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)
408 380
381#define XFS_LOCK_FLAGS \
382 { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \
383 { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \
384 { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \
385 { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \
386 { XFS_IUNLOCK_NONOTIFY, "IUNLOCK_NONOTIFY" }
387
388
409/* 389/*
410 * Flags for lockdep annotations. 390 * Flags for lockdep annotations.
411 * 391 *
@@ -440,21 +420,15 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
440#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) 420#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
441 421
442/* 422/*
443 * Flags for xfs_iflush()
444 */
445#define XFS_IFLUSH_DELWRI_ELSE_SYNC 1
446#define XFS_IFLUSH_DELWRI_ELSE_ASYNC 2
447#define XFS_IFLUSH_SYNC 3
448#define XFS_IFLUSH_ASYNC 4
449#define XFS_IFLUSH_DELWRI 5
450#define XFS_IFLUSH_ASYNC_NOBLOCK 6
451
452/*
453 * Flags for xfs_itruncate_start(). 423 * Flags for xfs_itruncate_start().
454 */ 424 */
455#define XFS_ITRUNC_DEFINITE 0x1 425#define XFS_ITRUNC_DEFINITE 0x1
456#define XFS_ITRUNC_MAYBE 0x2 426#define XFS_ITRUNC_MAYBE 0x2
457 427
428#define XFS_ITRUNC_FLAGS \
429 { XFS_ITRUNC_DEFINITE, "DEFINITE" }, \
430 { XFS_ITRUNC_MAYBE, "MAYBE" }
431
458/* 432/*
459 * For multiple groups support: if S_ISGID bit is set in the parent 433 * For multiple groups support: if S_ISGID bit is set in the parent
460 * directory, group of new file is set to that of the parent, and 434 * directory, group of new file is set to that of the parent, and
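[Annotation] The new XFS_LOCK_FLAGS and XFS_ITRUNC_FLAGS tables are { mask, "name" } pairs in the format consumed by the tracing macro __print_flags(). A fragment showing how a tracepoint's TP_printk() can decode lock flags with such a table (illustrative; the real event definitions live in the xfs_trace.h added by this series):

	TP_printk("dev %d:%d ino 0x%llx flags %s",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->ino,
		  __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS))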
@@ -497,58 +471,26 @@ int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
497int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); 471int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
498 472
499void xfs_iext_realloc(xfs_inode_t *, int, int); 473void xfs_iext_realloc(xfs_inode_t *, int, int);
500void xfs_ipin(xfs_inode_t *); 474void xfs_iunpin_wait(xfs_inode_t *);
501void xfs_iunpin(xfs_inode_t *);
502int xfs_iflush(xfs_inode_t *, uint); 475int xfs_iflush(xfs_inode_t *, uint);
503void xfs_ichgtime(xfs_inode_t *, int); 476void xfs_ichgtime(xfs_inode_t *, int);
504void xfs_lock_inodes(xfs_inode_t **, int, uint); 477void xfs_lock_inodes(xfs_inode_t **, int, uint);
505void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 478void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
506 479
507void xfs_synchronize_times(xfs_inode_t *); 480void xfs_synchronize_times(xfs_inode_t *);
481void xfs_mark_inode_dirty(xfs_inode_t *);
508void xfs_mark_inode_dirty_sync(xfs_inode_t *); 482void xfs_mark_inode_dirty_sync(xfs_inode_t *);
509 483
510#if defined(XFS_INODE_TRACE)
511
512#define INODE_TRACE_SIZE 16 /* number of trace entries */
513#define INODE_KTRACE_ENTRY 1
514#define INODE_KTRACE_EXIT 2
515#define INODE_KTRACE_HOLD 3
516#define INODE_KTRACE_REF 4
517#define INODE_KTRACE_RELE 5
518
519extern void _xfs_itrace_entry(struct xfs_inode *, const char *, inst_t *);
520extern void _xfs_itrace_exit(struct xfs_inode *, const char *, inst_t *);
521extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *);
522extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *);
523extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *);
524#define xfs_itrace_entry(ip) \
525 _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address)
526#define xfs_itrace_exit(ip) \
527 _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address)
528#define xfs_itrace_exit_tag(ip, tag) \
529 _xfs_itrace_exit(ip, tag, (inst_t *)__return_address)
530#define xfs_itrace_ref(ip) \
531 _xfs_itrace_ref(ip, __FILE__, __LINE__, (inst_t *)__return_address)
532
533#else
534#define xfs_itrace_entry(a)
535#define xfs_itrace_exit(a)
536#define xfs_itrace_exit_tag(a, b)
537#define xfs_itrace_hold(a, b, c, d)
538#define xfs_itrace_ref(a)
539#define xfs_itrace_rele(a, b, c, d)
540#endif
541
542#define IHOLD(ip) \ 484#define IHOLD(ip) \
543do { \ 485do { \
544 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ 486 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
545 atomic_inc(&(VFS_I(ip)->i_count)); \ 487 atomic_inc(&(VFS_I(ip)->i_count)); \
546 xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \ 488 trace_xfs_ihold(ip, _THIS_IP_); \
547} while (0) 489} while (0)
548 490
549#define IRELE(ip) \ 491#define IRELE(ip) \
550do { \ 492do { \
551 xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \ 493 trace_xfs_irele(ip, _THIS_IP_); \
552 iput(VFS_I(ip)); \ 494 iput(VFS_I(ip)); \
553} while (0) 495} while (0)
554 496
@@ -577,11 +519,11 @@ int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
577int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int); 519int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int);
578 520
579xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t); 521xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t);
580void xfs_iext_insert(xfs_ifork_t *, xfs_extnum_t, xfs_extnum_t, 522void xfs_iext_insert(xfs_inode_t *, xfs_extnum_t, xfs_extnum_t,
581 xfs_bmbt_irec_t *); 523 xfs_bmbt_irec_t *, int);
582void xfs_iext_add(xfs_ifork_t *, xfs_extnum_t, int); 524void xfs_iext_add(xfs_ifork_t *, xfs_extnum_t, int);
583void xfs_iext_add_indirect_multi(xfs_ifork_t *, int, xfs_extnum_t, int); 525void xfs_iext_add_indirect_multi(xfs_ifork_t *, int, xfs_extnum_t, int);
584void xfs_iext_remove(xfs_ifork_t *, xfs_extnum_t, int); 526void xfs_iext_remove(xfs_inode_t *, xfs_extnum_t, int, int);
585void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int); 527void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int);
586void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int); 528void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int);
587void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int); 529void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 9794b876d6ff..7bfea8540159 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -41,6 +41,7 @@
41#include "xfs_ialloc.h" 41#include "xfs_ialloc.h"
42#include "xfs_rw.h" 42#include "xfs_rw.h"
43#include "xfs_error.h" 43#include "xfs_error.h"
44#include "xfs_trace.h"
44 45
45 46
46kmem_zone_t *xfs_ili_zone; /* inode log item zone */ 47kmem_zone_t *xfs_ili_zone; /* inode log item zone */
@@ -227,7 +228,7 @@ xfs_inode_item_format(
227 228
228 vecp->i_addr = (xfs_caddr_t)&iip->ili_format; 229 vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
229 vecp->i_len = sizeof(xfs_inode_log_format_t); 230 vecp->i_len = sizeof(xfs_inode_log_format_t);
230 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IFORMAT); 231 vecp->i_type = XLOG_REG_TYPE_IFORMAT;
231 vecp++; 232 vecp++;
232 nvecs = 1; 233 nvecs = 1;
233 234
@@ -278,7 +279,7 @@ xfs_inode_item_format(
278 279
279 vecp->i_addr = (xfs_caddr_t)&ip->i_d; 280 vecp->i_addr = (xfs_caddr_t)&ip->i_d;
280 vecp->i_len = sizeof(struct xfs_icdinode); 281 vecp->i_len = sizeof(struct xfs_icdinode);
281 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE); 282 vecp->i_type = XLOG_REG_TYPE_ICORE;
282 vecp++; 283 vecp++;
283 nvecs++; 284 nvecs++;
284 iip->ili_format.ilf_fields |= XFS_ILOG_CORE; 285 iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
@@ -335,7 +336,7 @@ xfs_inode_item_format(
335 vecp->i_addr = 336 vecp->i_addr =
336 (char *)(ip->i_df.if_u1.if_extents); 337 (char *)(ip->i_df.if_u1.if_extents);
337 vecp->i_len = ip->i_df.if_bytes; 338 vecp->i_len = ip->i_df.if_bytes;
338 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT); 339 vecp->i_type = XLOG_REG_TYPE_IEXT;
339 } else 340 } else
340#endif 341#endif
341 { 342 {
@@ -354,7 +355,7 @@ xfs_inode_item_format(
354 vecp->i_addr = (xfs_caddr_t)ext_buffer; 355 vecp->i_addr = (xfs_caddr_t)ext_buffer;
355 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, 356 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
356 XFS_DATA_FORK); 357 XFS_DATA_FORK);
357 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT); 358 vecp->i_type = XLOG_REG_TYPE_IEXT;
358 } 359 }
359 ASSERT(vecp->i_len <= ip->i_df.if_bytes); 360 ASSERT(vecp->i_len <= ip->i_df.if_bytes);
360 iip->ili_format.ilf_dsize = vecp->i_len; 361 iip->ili_format.ilf_dsize = vecp->i_len;
@@ -372,7 +373,7 @@ xfs_inode_item_format(
372 ASSERT(ip->i_df.if_broot != NULL); 373 ASSERT(ip->i_df.if_broot != NULL);
373 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot; 374 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot;
374 vecp->i_len = ip->i_df.if_broot_bytes; 375 vecp->i_len = ip->i_df.if_broot_bytes;
375 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IBROOT); 376 vecp->i_type = XLOG_REG_TYPE_IBROOT;
376 vecp++; 377 vecp++;
377 nvecs++; 378 nvecs++;
378 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; 379 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
@@ -398,7 +399,7 @@ xfs_inode_item_format(
398 ASSERT((ip->i_df.if_real_bytes == 0) || 399 ASSERT((ip->i_df.if_real_bytes == 0) ||
399 (ip->i_df.if_real_bytes == data_bytes)); 400 (ip->i_df.if_real_bytes == data_bytes));
400 vecp->i_len = (int)data_bytes; 401 vecp->i_len = (int)data_bytes;
401 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ILOCAL); 402 vecp->i_type = XLOG_REG_TYPE_ILOCAL;
402 vecp++; 403 vecp++;
403 nvecs++; 404 nvecs++;
404 iip->ili_format.ilf_dsize = (unsigned)data_bytes; 405 iip->ili_format.ilf_dsize = (unsigned)data_bytes;
@@ -476,7 +477,7 @@ xfs_inode_item_format(
476 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, 477 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
477 XFS_ATTR_FORK); 478 XFS_ATTR_FORK);
478#endif 479#endif
479 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_EXT); 480 vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
480 iip->ili_format.ilf_asize = vecp->i_len; 481 iip->ili_format.ilf_asize = vecp->i_len;
481 vecp++; 482 vecp++;
482 nvecs++; 483 nvecs++;
@@ -491,7 +492,7 @@ xfs_inode_item_format(
491 ASSERT(ip->i_afp->if_broot != NULL); 492 ASSERT(ip->i_afp->if_broot != NULL);
492 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot; 493 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot;
493 vecp->i_len = ip->i_afp->if_broot_bytes; 494 vecp->i_len = ip->i_afp->if_broot_bytes;
494 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_BROOT); 495 vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
495 vecp++; 496 vecp++;
496 nvecs++; 497 nvecs++;
497 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; 498 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
@@ -515,7 +516,7 @@ xfs_inode_item_format(
515 ASSERT((ip->i_afp->if_real_bytes == 0) || 516 ASSERT((ip->i_afp->if_real_bytes == 0) ||
516 (ip->i_afp->if_real_bytes == data_bytes)); 517 (ip->i_afp->if_real_bytes == data_bytes));
517 vecp->i_len = (int)data_bytes; 518 vecp->i_len = (int)data_bytes;
518 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_LOCAL); 519 vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL;
519 vecp++; 520 vecp++;
520 nvecs++; 521 nvecs++;
521 iip->ili_format.ilf_asize = (unsigned)data_bytes; 522 iip->ili_format.ilf_asize = (unsigned)data_bytes;
@@ -534,23 +535,23 @@ xfs_inode_item_format(
534 535
535/* 536/*
536 * This is called to pin the inode associated with the inode log 537 * This is called to pin the inode associated with the inode log
537 * item in memory so it cannot be written out. Do this by calling 538 * item in memory so it cannot be written out.
538 * xfs_ipin() to bump the pin count in the inode while holding the
539 * inode pin lock.
540 */ 539 */
541STATIC void 540STATIC void
542xfs_inode_item_pin( 541xfs_inode_item_pin(
543 xfs_inode_log_item_t *iip) 542 xfs_inode_log_item_t *iip)
544{ 543{
545 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); 544 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
546 xfs_ipin(iip->ili_inode); 545
546 atomic_inc(&iip->ili_inode->i_pincount);
547} 547}
548 548
549 549
550/* 550/*
551 * This is called to unpin the inode associated with the inode log 551 * This is called to unpin the inode associated with the inode log
552 * item which was previously pinned with a call to xfs_inode_item_pin(). 552 * item which was previously pinned with a call to xfs_inode_item_pin().
553 * Just call xfs_iunpin() on the inode to do this. 553 *
554 * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
554 */ 555 */
555/* ARGSUSED */ 556/* ARGSUSED */
556STATIC void 557STATIC void
@@ -558,7 +559,11 @@ xfs_inode_item_unpin(
558 xfs_inode_log_item_t *iip, 559 xfs_inode_log_item_t *iip,
559 int stale) 560 int stale)
560{ 561{
561 xfs_iunpin(iip->ili_inode); 562 struct xfs_inode *ip = iip->ili_inode;
563
564 ASSERT(atomic_read(&ip->i_pincount) > 0);
565 if (atomic_dec_and_test(&ip->i_pincount))
566 wake_up(&ip->i_ipin_wait);
562} 567}
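[Annotation] With xfs_ipin()/xfs_iunpin() gone, pinning is a bare atomic counter paired with a waitqueue; the wake_up() above services waiters blocked roughly like this (a sketch of the pattern only, since the reworked xfs_iunpin_wait() is not part of this hunk):

	void
	example_iunpin_wait(
		xfs_inode_t	*ip)
	{
		/* sleep until the final unpin drops i_pincount to zero */
		if (atomic_read(&ip->i_pincount))
			wait_event(ip->i_ipin_wait,
				   atomic_read(&ip->i_pincount) == 0);
	}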
563 568
564/* ARGSUSED */ 569/* ARGSUSED */
@@ -567,7 +572,7 @@ xfs_inode_item_unpin_remove(
567 xfs_inode_log_item_t *iip, 572 xfs_inode_log_item_t *iip,
568 xfs_trans_t *tp) 573 xfs_trans_t *tp)
569{ 574{
570 xfs_iunpin(iip->ili_inode); 575 xfs_inode_item_unpin(iip, 0);
571} 576}
572 577
573/* 578/*
@@ -601,33 +606,20 @@ xfs_inode_item_trylock(
601 606
602 if (!xfs_iflock_nowait(ip)) { 607 if (!xfs_iflock_nowait(ip)) {
603 /* 608 /*
604 * If someone else isn't already trying to push the inode 609 * The inode has already been flushed to the backing buffer;
605 * buffer, we get to do it. 610 * leave it locked in shared mode and the pushbuf routine will
611 * unlock it.
606 */ 612 */
607 if (iip->ili_pushbuf_flag == 0) { 613 return XFS_ITEM_PUSHBUF;
608 iip->ili_pushbuf_flag = 1;
609#ifdef DEBUG
610 iip->ili_push_owner = current_pid();
611#endif
612 /*
613 * Inode is left locked in shared mode.
614 * Pushbuf routine gets to unlock it.
615 */
616 return XFS_ITEM_PUSHBUF;
617 } else {
618 /*
619 * We hold the AIL lock, so we must specify the
620 * NONOTIFY flag so that we won't double trip.
621 */
622 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
623 return XFS_ITEM_FLUSHING;
624 }
625 /* NOTREACHED */
626 } 614 }
627 615
628 /* Stale items should force out the iclog */ 616 /* Stale items should force out the iclog */
629 if (ip->i_flags & XFS_ISTALE) { 617 if (ip->i_flags & XFS_ISTALE) {
630 xfs_ifunlock(ip); 618 xfs_ifunlock(ip);
619 /*
620 * We hold the AIL lock - notify the unlock routine of this
621 * so it doesn't try to get the lock again.
622 */
631 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); 623 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
632 return XFS_ITEM_PINNED; 624 return XFS_ITEM_PINNED;
633 } 625 }
@@ -745,11 +737,8 @@ xfs_inode_item_committed(
745 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK 737 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
746 * failed to get the inode flush lock but did get the inode locked SHARED. 738 * failed to get the inode flush lock but did get the inode locked SHARED.
747 * Here we're trying to see if the inode buffer is incore, and if so whether it's 739 * Here we're trying to see if the inode buffer is incore, and if so whether it's
748 * marked delayed write. If that's the case, we'll initiate a bawrite on that 740 * marked delayed write. If that's the case, we'll promote it and that will
749 * buffer to expedite the process. 741 * allow the caller to write the buffer by triggering the xfsbufd to run.
750 *
751 * We aren't holding the AIL lock (or the flush lock) when this gets called,
752 * so it is inherently race-y.
753 */ 742 */
754STATIC void 743STATIC void
755xfs_inode_item_pushbuf( 744xfs_inode_item_pushbuf(
@@ -758,80 +747,30 @@ xfs_inode_item_pushbuf(
758 xfs_inode_t *ip; 747 xfs_inode_t *ip;
759 xfs_mount_t *mp; 748 xfs_mount_t *mp;
760 xfs_buf_t *bp; 749 xfs_buf_t *bp;
761 uint dopush;
762 750
763 ip = iip->ili_inode; 751 ip = iip->ili_inode;
764
765 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 752 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
766 753
767 /* 754 /*
768 * The ili_pushbuf_flag keeps others from
769 * trying to duplicate our effort.
770 */
771 ASSERT(iip->ili_pushbuf_flag != 0);
772 ASSERT(iip->ili_push_owner == current_pid());
773
774 /*
775 * If a flush is not in progress anymore, chances are that the 755 * If a flush is not in progress anymore, chances are that the
776 * inode was taken off the AIL. So, just get out. 756 * inode was taken off the AIL. So, just get out.
777 */ 757 */
778 if (completion_done(&ip->i_flush) || 758 if (completion_done(&ip->i_flush) ||
779 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { 759 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
780 iip->ili_pushbuf_flag = 0;
781 xfs_iunlock(ip, XFS_ILOCK_SHARED); 760 xfs_iunlock(ip, XFS_ILOCK_SHARED);
782 return; 761 return;
783 } 762 }
784 763
785 mp = ip->i_mount; 764 mp = ip->i_mount;
786 bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno, 765 bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
787 iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK); 766 iip->ili_format.ilf_len, XBF_TRYLOCK);
788 767
789 if (bp != NULL) {
790 if (XFS_BUF_ISDELAYWRITE(bp)) {
791 /*
792 * We were racing with iflush because we don't hold
793 * the AIL lock or the flush lock. However, at this point,
794 * we have the buffer, and we know that it's dirty.
795 * So, it's possible that iflush raced with us, and
796 * this item is already taken off the AIL.
797 * If not, we can flush it async.
798 */
799 dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
800 !completion_done(&ip->i_flush));
801 iip->ili_pushbuf_flag = 0;
802 xfs_iunlock(ip, XFS_ILOCK_SHARED);
803 xfs_buftrace("INODE ITEM PUSH", bp);
804 if (XFS_BUF_ISPINNED(bp)) {
805 xfs_log_force(mp, (xfs_lsn_t)0,
806 XFS_LOG_FORCE);
807 }
808 if (dopush) {
809 int error;
810 error = xfs_bawrite(mp, bp);
811 if (error)
812 xfs_fs_cmn_err(CE_WARN, mp,
813 "xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p",
814 error, iip, bp);
815 } else {
816 xfs_buf_relse(bp);
817 }
818 } else {
819 iip->ili_pushbuf_flag = 0;
820 xfs_iunlock(ip, XFS_ILOCK_SHARED);
821 xfs_buf_relse(bp);
822 }
823 return;
824 }
825 /*
826 * We have to be careful about resetting pushbuf flag too early (above).
827 * Even though in theory we can do it as soon as we have the buflock,
828 * we don't want others to be doing work needlessly. They'll come to
829 * this function thinking that pushing the buffer is their
830 * responsibility only to find that the buffer is still locked by
831 * another doing the same thing
832 */
833 iip->ili_pushbuf_flag = 0;
834 xfs_iunlock(ip, XFS_ILOCK_SHARED); 768 xfs_iunlock(ip, XFS_ILOCK_SHARED);
769 if (!bp)
770 return;
771 if (XFS_BUF_ISDELAYWRITE(bp))
772 xfs_buf_delwri_promote(bp);
773 xfs_buf_relse(bp);
835 return; 774 return;
836} 775}
837 776
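[Annotation] The ili_pushbuf_flag/ili_push_owner serialisation disappears because the AIL push no longer issues I/O itself: it only promotes an already-queued delayed-write buffer so that xfsbufd writes it on its next pass. The heart of the new path, with the invariant made explicit in comments:

	bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
			iip->ili_format.ilf_len, XBF_TRYLOCK);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	if (!bp)
		return;			/* buffer not incore: nothing to push */
	if (XFS_BUF_ISDELAYWRITE(bp))
		xfs_buf_delwri_promote(bp);	/* reorder only, never write */
	xfs_buf_relse(bp);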
@@ -864,10 +803,14 @@ xfs_inode_item_push(
864 iip->ili_format.ilf_fields != 0); 803 iip->ili_format.ilf_fields != 0);
865 804
866 /* 805 /*
867 * Write out the inode. The completion routine ('iflush_done') will 806 * Push the inode to its backing buffer. This will not remove the
868 * pull it from the AIL, mark it clean, unlock the flush lock. 807 * inode from the AIL - a further push will be required to trigger a
 808 * buffer push. However, this allows all the dirty inodes to be pushed
 809 * to the buffer before it is pushed to disk. The buffer IO completion
 810 * will pull the inode from the AIL, mark it clean and unlock the flush
811 * lock.
869 */ 812 */
870 (void) xfs_iflush(ip, XFS_IFLUSH_ASYNC); 813 (void) xfs_iflush(ip, 0);
871 xfs_iunlock(ip, XFS_ILOCK_SHARED); 814 xfs_iunlock(ip, XFS_ILOCK_SHARED);
872 815
873 return; 816 return;
@@ -931,7 +874,6 @@ xfs_inode_item_init(
931 /* 874 /*
932 We have zeroed memory. No need ... 875 We have zeroed memory. No need ...
933 iip->ili_extents_buf = NULL; 876 iip->ili_extents_buf = NULL;
934 iip->ili_pushbuf_flag = 0;
935 */ 877 */
936 878
937 iip->ili_format.ilf_type = XFS_LI_INODE; 879 iip->ili_format.ilf_type = XFS_LI_INODE;
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 65bae4c9b8bf..9a467958ecdd 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -127,7 +127,7 @@ static inline int xfs_ilog_fdata(int w)
127#ifdef __KERNEL__ 127#ifdef __KERNEL__
128 128
129struct xfs_buf; 129struct xfs_buf;
130struct xfs_bmbt_rec_64; 130struct xfs_bmbt_rec;
131struct xfs_inode; 131struct xfs_inode;
132struct xfs_mount; 132struct xfs_mount;
133 133
@@ -140,16 +140,10 @@ typedef struct xfs_inode_log_item {
140 unsigned short ili_flags; /* misc flags */ 140 unsigned short ili_flags; /* misc flags */
141 unsigned short ili_logged; /* flushed logged data */ 141 unsigned short ili_logged; /* flushed logged data */
142 unsigned int ili_last_fields; /* fields when flushed */ 142 unsigned int ili_last_fields; /* fields when flushed */
143 struct xfs_bmbt_rec_64 *ili_extents_buf; /* array of logged 143 struct xfs_bmbt_rec *ili_extents_buf; /* array of logged
144 data exts */ 144 data exts */
145 struct xfs_bmbt_rec_64 *ili_aextents_buf; /* array of logged 145 struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged
146 attr exts */ 146 attr exts */
147 unsigned int ili_pushbuf_flag; /* one bit used in push_ail */
148
149#ifdef DEBUG
150 uint64_t ili_push_owner; /* one who sets pushbuf_flag
151 above gets to push the buf */
152#endif
153#ifdef XFS_TRANS_DEBUG 147#ifdef XFS_TRANS_DEBUG
154 int ili_root_size; 148 int ili_root_size;
155 char *ili_orig_root; 149 char *ili_orig_root;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 67ae5555a30a..0b65039951a0 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -47,72 +47,8 @@
47#include "xfs_trans_space.h" 47#include "xfs_trans_space.h"
48#include "xfs_utils.h" 48#include "xfs_utils.h"
49#include "xfs_iomap.h" 49#include "xfs_iomap.h"
50#include "xfs_trace.h"
50 51
51#if defined(XFS_RW_TRACE)
52void
53xfs_iomap_enter_trace(
54 int tag,
55 xfs_inode_t *ip,
56 xfs_off_t offset,
57 ssize_t count)
58{
59 if (!ip->i_rwtrace)
60 return;
61
62 ktrace_enter(ip->i_rwtrace,
63 (void *)((unsigned long)tag),
64 (void *)ip,
65 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
66 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
67 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
68 (void *)((unsigned long)(offset & 0xffffffff)),
69 (void *)((unsigned long)count),
70 (void *)((unsigned long)((ip->i_new_size >> 32) & 0xffffffff)),
71 (void *)((unsigned long)(ip->i_new_size & 0xffffffff)),
72 (void *)((unsigned long)current_pid()),
73 (void *)NULL,
74 (void *)NULL,
75 (void *)NULL,
76 (void *)NULL,
77 (void *)NULL,
78 (void *)NULL);
79}
80
81void
82xfs_iomap_map_trace(
83 int tag,
84 xfs_inode_t *ip,
85 xfs_off_t offset,
86 ssize_t count,
87 xfs_iomap_t *iomapp,
88 xfs_bmbt_irec_t *imapp,
89 int flags)
90{
91 if (!ip->i_rwtrace)
92 return;
93
94 ktrace_enter(ip->i_rwtrace,
95 (void *)((unsigned long)tag),
96 (void *)ip,
97 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
98 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
99 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
100 (void *)((unsigned long)(offset & 0xffffffff)),
101 (void *)((unsigned long)count),
102 (void *)((unsigned long)flags),
103 (void *)((unsigned long)((iomapp->iomap_offset >> 32) & 0xffffffff)),
104 (void *)((unsigned long)(iomapp->iomap_offset & 0xffffffff)),
105 (void *)((unsigned long)(iomapp->iomap_delta)),
106 (void *)((unsigned long)(iomapp->iomap_bsize)),
107 (void *)((unsigned long)(iomapp->iomap_bn)),
108 (void *)(__psint_t)(imapp->br_startoff),
109 (void *)((unsigned long)(imapp->br_blockcount)),
110 (void *)(__psint_t)(imapp->br_startblock));
111}
112#else
113#define xfs_iomap_enter_trace(tag, io, offset, count)
114#define xfs_iomap_map_trace(tag, io, offset, count, iomapp, imapp, flags)
115#endif
116 52
117#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 53#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
118 << mp->m_writeio_log) 54 << mp->m_writeio_log)
@@ -187,21 +123,20 @@ xfs_iomap(
187 if (XFS_FORCED_SHUTDOWN(mp)) 123 if (XFS_FORCED_SHUTDOWN(mp))
188 return XFS_ERROR(EIO); 124 return XFS_ERROR(EIO);
189 125
126 trace_xfs_iomap_enter(ip, offset, count, flags, NULL);
127
190 switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) { 128 switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) {
191 case BMAPI_READ: 129 case BMAPI_READ:
192 xfs_iomap_enter_trace(XFS_IOMAP_READ_ENTER, ip, offset, count);
193 lockmode = xfs_ilock_map_shared(ip); 130 lockmode = xfs_ilock_map_shared(ip);
194 bmapi_flags = XFS_BMAPI_ENTIRE; 131 bmapi_flags = XFS_BMAPI_ENTIRE;
195 break; 132 break;
196 case BMAPI_WRITE: 133 case BMAPI_WRITE:
197 xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, ip, offset, count);
198 lockmode = XFS_ILOCK_EXCL; 134 lockmode = XFS_ILOCK_EXCL;
199 if (flags & BMAPI_IGNSTATE) 135 if (flags & BMAPI_IGNSTATE)
200 bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE; 136 bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
201 xfs_ilock(ip, lockmode); 137 xfs_ilock(ip, lockmode);
202 break; 138 break;
203 case BMAPI_ALLOCATE: 139 case BMAPI_ALLOCATE:
204 xfs_iomap_enter_trace(XFS_IOMAP_ALLOC_ENTER, ip, offset, count);
205 lockmode = XFS_ILOCK_SHARED; 140 lockmode = XFS_ILOCK_SHARED;
206 bmapi_flags = XFS_BMAPI_ENTIRE; 141 bmapi_flags = XFS_BMAPI_ENTIRE;
207 142
@@ -237,8 +172,7 @@ xfs_iomap(
237 if (nimaps && 172 if (nimaps &&
238 (imap.br_startblock != HOLESTARTBLOCK) && 173 (imap.br_startblock != HOLESTARTBLOCK) &&
239 (imap.br_startblock != DELAYSTARTBLOCK)) { 174 (imap.br_startblock != DELAYSTARTBLOCK)) {
240 xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip, 175 trace_xfs_iomap_found(ip, offset, count, flags, &imap);
241 offset, count, iomapp, &imap, flags);
242 break; 176 break;
243 } 177 }
244 178
@@ -250,8 +184,7 @@ xfs_iomap(
250 &imap, &nimaps); 184 &imap, &nimaps);
251 } 185 }
252 if (!error) { 186 if (!error) {
253 xfs_iomap_map_trace(XFS_IOMAP_ALLOC_MAP, ip, 187 trace_xfs_iomap_alloc(ip, offset, count, flags, &imap);
254 offset, count, iomapp, &imap, flags);
255 } 188 }
256 iomap_flags = IOMAP_NEW; 189 iomap_flags = IOMAP_NEW;
257 break; 190 break;
@@ -261,8 +194,7 @@ xfs_iomap(
261 lockmode = 0; 194 lockmode = 0;
262 195
263 if (nimaps && !isnullstartblock(imap.br_startblock)) { 196 if (nimaps && !isnullstartblock(imap.br_startblock)) {
264 xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip, 197 trace_xfs_iomap_found(ip, offset, count, flags, &imap);
265 offset, count, iomapp, &imap, flags);
266 break; 198 break;
267 } 199 }
268 200
@@ -623,8 +555,7 @@ retry:
623 * delalloc blocks and retry without EOF preallocation. 555 * delalloc blocks and retry without EOF preallocation.
624 */ 556 */
625 if (nimaps == 0) { 557 if (nimaps == 0) {
626 xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE, 558 trace_xfs_delalloc_enospc(ip, offset, count);
627 ip, offset, count);
628 if (flushed) 559 if (flushed)
629 return XFS_ERROR(ENOSPC); 560 return XFS_ERROR(ENOSPC);
630 561
@@ -837,7 +768,7 @@ xfs_iomap_write_unwritten(
837 int committed; 768 int committed;
838 int error; 769 int error;
839 770
840 xfs_iomap_enter_trace(XFS_IOMAP_UNWRITTEN, ip, offset, count); 771 trace_xfs_unwritten_convert(ip, offset, count);
841 772
842 offset_fsb = XFS_B_TO_FSBT(mp, offset); 773 offset_fsb = XFS_B_TO_FSBT(mp, offset);
843 count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); 774 count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
@@ -860,8 +791,15 @@ xfs_iomap_write_unwritten(
860 * set up a transaction to convert the range of extents 791 * set up a transaction to convert the range of extents
861 * from unwritten to real. Do allocations in a loop until 792 * from unwritten to real. Do allocations in a loop until
862 * we have covered the range passed in. 793 * we have covered the range passed in.
794 *
795 * Note that we open code the transaction allocation here
796 * to pass KM_NOFS--we can't risk to recursing back into
797 * the filesystem here as we might be asked to write out
798 * the same inode that we complete here and might deadlock
799 * on the iolock.
863 */ 800 */
864 tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); 801 xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
802 tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
865 tp->t_flags |= XFS_TRANS_RESERVE; 803 tp->t_flags |= XFS_TRANS_RESERVE;
866 error = xfs_trans_reserve(tp, resblks, 804 error = xfs_trans_reserve(tp, resblks,
867 XFS_WRITE_LOG_RES(mp), 0, 805 XFS_WRITE_LOG_RES(mp), 0,
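[Annotation] The open coding matters because xfs_trans_alloc() allocates with KM_SLEEP, which can recurse into the filesystem under memory pressure; since this path may be completing I/O on the very inode it would then be asked to write back, that recursion can deadlock on the iolock. Roughly what the bypassed wrapper does, assuming the conventional helper split (a sketch, not quoted from this patch):

	/* xfs_trans_alloc(), approximately: */
	xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);	/* block while frozen */
	return _xfs_trans_alloc(mp, type, KM_SLEEP);	/* may recurse into FS */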
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index fdcf7b82747f..174f29990991 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -43,6 +43,14 @@ typedef enum {
43 BMAPI_TRYLOCK = (1 << 7), /* non-blocking request */ 43 BMAPI_TRYLOCK = (1 << 7), /* non-blocking request */
44} bmapi_flags_t; 44} bmapi_flags_t;
45 45
46#define BMAPI_FLAGS \
47 { BMAPI_READ, "READ" }, \
48 { BMAPI_WRITE, "WRITE" }, \
49 { BMAPI_ALLOCATE, "ALLOCATE" }, \
50 { BMAPI_IGNSTATE, "IGNSTATE" }, \
51 { BMAPI_DIRECT, "DIRECT" }, \
52 { BMAPI_MMAP, "MMAP" }, \
53 { BMAPI_TRYLOCK, "TRYLOCK" }
46 54
47/* 55/*
48 * xfs_iomap_t: File system I/O map 56 * xfs_iomap_t: File system I/O map
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 62efab2f3839..b1b801e4a28e 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -106,6 +106,7 @@ xfs_bulkstat_one_iget(
106 buf->bs_dmevmask = dic->di_dmevmask; 106 buf->bs_dmevmask = dic->di_dmevmask;
107 buf->bs_dmstate = dic->di_dmstate; 107 buf->bs_dmstate = dic->di_dmstate;
108 buf->bs_aextents = dic->di_anextents; 108 buf->bs_aextents = dic->di_anextents;
109 buf->bs_forkoff = XFS_IFORK_BOFF(ip);
109 110
110 switch (dic->di_format) { 111 switch (dic->di_format) {
111 case XFS_DINODE_FMT_DEV: 112 case XFS_DINODE_FMT_DEV:
@@ -176,6 +177,7 @@ xfs_bulkstat_one_dinode(
176 buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask); 177 buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask);
177 buf->bs_dmstate = be16_to_cpu(dic->di_dmstate); 178 buf->bs_dmstate = be16_to_cpu(dic->di_dmstate);
178 buf->bs_aextents = be16_to_cpu(dic->di_anextents); 179 buf->bs_aextents = be16_to_cpu(dic->di_anextents);
180 buf->bs_forkoff = XFS_DFORK_BOFF(dic);
179 181
180 switch (dic->di_format) { 182 switch (dic->di_format) {
181 case XFS_DINODE_FMT_DEV: 183 case XFS_DINODE_FMT_DEV:
@@ -408,8 +410,10 @@ xfs_bulkstat(
408 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog); 410 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
409 nimask = ~(nicluster - 1); 411 nimask = ~(nicluster - 1);
410 nbcluster = nicluster >> mp->m_sb.sb_inopblog; 412 nbcluster = nicluster >> mp->m_sb.sb_inopblog;
411 irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4, 413 irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
412 KM_SLEEP | KM_MAYFAIL | KM_LARGE); 414 if (!irbuf)
415 return ENOMEM;
416
413 nirbuf = irbsize / sizeof(*irbuf); 417 nirbuf = irbsize / sizeof(*irbuf);
414 418
415 /* 419 /*
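[Annotation] kmem_zalloc_greedy() loses its KM_* flags argument and may now fail outright, returning NULL; its memory must be released with kmem_free_large() (see the matching hunk below) rather than kmem_free(), presumably because the allocation is now vmalloc-backed. The new contract as a usage sketch:

	/* try for up to maxsize bytes, backing off toward minsize */
	irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
	if (!irbuf)
		return ENOMEM;		/* allocation failure is now possible */
	/* ... use irbuf[0 .. irbsize / sizeof(*irbuf) - 1] ... */
	kmem_free_large(irbuf);		/* not kmem_free(): different allocator */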
@@ -420,9 +424,7 @@ xfs_bulkstat(
420 while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { 424 while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
421 cond_resched(); 425 cond_resched();
422 bp = NULL; 426 bp = NULL;
423 down_read(&mp->m_peraglock);
424 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 427 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
425 up_read(&mp->m_peraglock);
426 if (error) { 428 if (error) {
427 /* 429 /*
428 * Skip this allocation group and go to the next one. 430 * Skip this allocation group and go to the next one.
@@ -729,7 +731,7 @@ xfs_bulkstat(
729 /* 731 /*
730 * Done, we're either out of filesystem or space to put the data. 732 * Done, we're either out of filesystem or space to put the data.
731 */ 733 */
732 kmem_free(irbuf); 734 kmem_free_large(irbuf);
733 *ubcountp = ubelem; 735 *ubcountp = ubelem;
734 /* 736 /*
735 * Found some inodes, return them now and return the error next time. 737 * Found some inodes, return them now and return the error next time.
@@ -849,9 +851,7 @@ xfs_inumbers(
849 agbp = NULL; 851 agbp = NULL;
850 while (left > 0 && agno < mp->m_sb.sb_agcount) { 852 while (left > 0 && agno < mp->m_sb.sb_agcount) {
851 if (agbp == NULL) { 853 if (agbp == NULL) {
852 down_read(&mp->m_peraglock);
853 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 854 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
854 up_read(&mp->m_peraglock);
855 if (error) { 855 if (error) {
856 /* 856 /*
857 * If we can't read the AGI of this ag, 857 * If we can't read the AGI of this ag,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 9dbdff3ea484..e8fba92d7cd9 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -40,6 +40,7 @@
40#include "xfs_dinode.h" 40#include "xfs_dinode.h"
41#include "xfs_inode.h" 41#include "xfs_inode.h"
42#include "xfs_rw.h" 42#include "xfs_rw.h"
43#include "xfs_trace.h"
43 44
44kmem_zone_t *xfs_log_ticket_zone; 45kmem_zone_t *xfs_log_ticket_zone;
45 46
@@ -49,7 +50,6 @@ kmem_zone_t *xfs_log_ticket_zone;
49 (off) += (bytes);} 50 (off) += (bytes);}
50 51
51/* Local miscellaneous function prototypes */ 52/* Local miscellaneous function prototypes */
52STATIC int xlog_bdstrat_cb(struct xfs_buf *);
53STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket, 53STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket,
54 xlog_in_core_t **, xfs_lsn_t *); 54 xlog_in_core_t **, xfs_lsn_t *);
55STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, 55STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
@@ -60,7 +60,7 @@ STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
60STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 60STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
61STATIC void xlog_dealloc_log(xlog_t *log); 61STATIC void xlog_dealloc_log(xlog_t *log);
62STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[], 62STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
63 int nentries, xfs_log_ticket_t tic, 63 int nentries, struct xlog_ticket *tic,
64 xfs_lsn_t *start_lsn, 64 xfs_lsn_t *start_lsn,
65 xlog_in_core_t **commit_iclog, 65 xlog_in_core_t **commit_iclog,
66 uint flags); 66 uint flags);
@@ -79,11 +79,6 @@ STATIC int xlog_state_release_iclog(xlog_t *log,
79STATIC void xlog_state_switch_iclogs(xlog_t *log, 79STATIC void xlog_state_switch_iclogs(xlog_t *log,
80 xlog_in_core_t *iclog, 80 xlog_in_core_t *iclog,
81 int eventual_size); 81 int eventual_size);
82STATIC int xlog_state_sync(xlog_t *log,
83 xfs_lsn_t lsn,
84 uint flags,
85 int *log_flushed);
86STATIC int xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed);
87STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); 82STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
88 83
89/* local functions to manipulate grant head */ 84/* local functions to manipulate grant head */
@@ -122,85 +117,6 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
122 117
123STATIC int xlog_iclogs_empty(xlog_t *log); 118STATIC int xlog_iclogs_empty(xlog_t *log);
124 119
125#if defined(XFS_LOG_TRACE)
126
127#define XLOG_TRACE_LOGGRANT_SIZE 2048
128#define XLOG_TRACE_ICLOG_SIZE 256
129
130void
131xlog_trace_loggrant_alloc(xlog_t *log)
132{
133 log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS);
134}
135
136void
137xlog_trace_loggrant_dealloc(xlog_t *log)
138{
139 ktrace_free(log->l_grant_trace);
140}
141
142void
143xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
144{
145 unsigned long cnts;
146
147 /* ticket counts are 1 byte each */
148 cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8;
149
150 ktrace_enter(log->l_grant_trace,
151 (void *)tic,
152 (void *)log->l_reserve_headq,
153 (void *)log->l_write_headq,
154 (void *)((unsigned long)log->l_grant_reserve_cycle),
155 (void *)((unsigned long)log->l_grant_reserve_bytes),
156 (void *)((unsigned long)log->l_grant_write_cycle),
157 (void *)((unsigned long)log->l_grant_write_bytes),
158 (void *)((unsigned long)log->l_curr_cycle),
159 (void *)((unsigned long)log->l_curr_block),
160 (void *)((unsigned long)CYCLE_LSN(log->l_tail_lsn)),
161 (void *)((unsigned long)BLOCK_LSN(log->l_tail_lsn)),
162 (void *)string,
163 (void *)((unsigned long)tic->t_trans_type),
164 (void *)cnts,
165 (void *)((unsigned long)tic->t_curr_res),
166 (void *)((unsigned long)tic->t_unit_res));
167}
168
169void
170xlog_trace_iclog_alloc(xlog_in_core_t *iclog)
171{
172 iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS);
173}
174
175void
176xlog_trace_iclog_dealloc(xlog_in_core_t *iclog)
177{
178 ktrace_free(iclog->ic_trace);
179}
180
181void
182xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
183{
184 ktrace_enter(iclog->ic_trace,
185 (void *)((unsigned long)state),
186 (void *)((unsigned long)current_pid()),
187 (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL,
188 (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL,
189 (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL,
190 (void *)NULL, (void *)NULL);
191}
192#else
193
194#define xlog_trace_loggrant_alloc(log)
195#define xlog_trace_loggrant_dealloc(log)
196#define xlog_trace_loggrant(log,tic,string)
197
198#define xlog_trace_iclog_alloc(iclog)
199#define xlog_trace_iclog_dealloc(iclog)
200#define xlog_trace_iclog(iclog,state)
201
202#endif /* XFS_LOG_TRACE */
203
204 120
205static void 121static void
206xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) 122xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
@@ -327,14 +243,14 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
327 * out when the next write occurs. 243 * out when the next write occurs.
328 */ 244 */
329xfs_lsn_t 245xfs_lsn_t
330xfs_log_done(xfs_mount_t *mp, 246xfs_log_done(
331 xfs_log_ticket_t xtic, 247 struct xfs_mount *mp,
332 void **iclog, 248 struct xlog_ticket *ticket,
333 uint flags) 249 struct xlog_in_core **iclog,
250 uint flags)
334{ 251{
335 xlog_t *log = mp->m_log; 252 struct log *log = mp->m_log;
336 xlog_ticket_t *ticket = (xfs_log_ticket_t) xtic; 253 xfs_lsn_t lsn = 0;
337 xfs_lsn_t lsn = 0;
338 254
339 if (XLOG_FORCED_SHUTDOWN(log) || 255 if (XLOG_FORCED_SHUTDOWN(log) ||
340 /* 256 /*
@@ -342,8 +258,7 @@ xfs_log_done(xfs_mount_t *mp,
342 * If we get an error, just continue and give back the log ticket. 258 * If we get an error, just continue and give back the log ticket.
343 */ 259 */
344 (((ticket->t_flags & XLOG_TIC_INITED) == 0) && 260 (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
345 (xlog_commit_record(mp, ticket, 261 (xlog_commit_record(mp, ticket, iclog, &lsn)))) {
346 (xlog_in_core_t **)iclog, &lsn)))) {
347 lsn = (xfs_lsn_t) -1; 262 lsn = (xfs_lsn_t) -1;
348 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { 263 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
349 flags |= XFS_LOG_REL_PERM_RESERV; 264 flags |= XFS_LOG_REL_PERM_RESERV;
@@ -353,15 +268,17 @@ xfs_log_done(xfs_mount_t *mp,
353 268
354 if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 || 269 if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 ||
355 (flags & XFS_LOG_REL_PERM_RESERV)) { 270 (flags & XFS_LOG_REL_PERM_RESERV)) {
271 trace_xfs_log_done_nonperm(log, ticket);
272
356 /* 273 /*
357 * Release ticket if not permanent reservation or a specific 274 * Release ticket if not permanent reservation or a specific
358 * request has been made to release a permanent reservation. 275 * request has been made to release a permanent reservation.
359 */ 276 */
360 xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)");
361 xlog_ungrant_log_space(log, ticket); 277 xlog_ungrant_log_space(log, ticket);
362 xfs_log_ticket_put(ticket); 278 xfs_log_ticket_put(ticket);
363 } else { 279 } else {
364 xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)"); 280 trace_xfs_log_done_perm(log, ticket);
281
365 xlog_regrant_reserve_log_space(log, ticket); 282 xlog_regrant_reserve_log_space(log, ticket);
366 /* If this ticket was a permanent reservation and we aren't 283 /* If this ticket was a permanent reservation and we aren't
367 * trying to release it, reset the inited flags; so next time 284 * trying to release it, reset the inited flags; so next time
@@ -371,67 +288,8 @@ xfs_log_done(xfs_mount_t *mp,
371 } 288 }
372 289
373 return lsn; 290 return lsn;
374} /* xfs_log_done */
375
376
377/*
378 * Force the in-core log to disk. If flags == XFS_LOG_SYNC,
379 * the force is done synchronously.
380 *
381 * Asynchronous forces are implemented by setting the WANT_SYNC
382 * bit in the appropriate in-core log and then returning.
383 *
384 * Synchronous forces are implemented with a signal variable. All callers
385 * to force a given lsn to disk will wait on a the sv attached to the
386 * specific in-core log. When given in-core log finally completes its
387 * write to disk, that thread will wake up all threads waiting on the
388 * sv.
389 */
390int
391_xfs_log_force(
392 xfs_mount_t *mp,
393 xfs_lsn_t lsn,
394 uint flags,
395 int *log_flushed)
396{
397 xlog_t *log = mp->m_log;
398 int dummy;
399
400 if (!log_flushed)
401 log_flushed = &dummy;
402
403 ASSERT(flags & XFS_LOG_FORCE);
404
405 XFS_STATS_INC(xs_log_force);
406
407 if (log->l_flags & XLOG_IO_ERROR)
408 return XFS_ERROR(EIO);
409 if (lsn == 0)
410 return xlog_state_sync_all(log, flags, log_flushed);
411 else
412 return xlog_state_sync(log, lsn, flags, log_flushed);
413} /* _xfs_log_force */
414
415/*
416 * Wrapper for _xfs_log_force(), to be used when caller doesn't care
417 * about errors or whether the log was flushed or not. This is the normal
418 * interface to use when trying to unpin items or move the log forward.
419 */
420void
421xfs_log_force(
422 xfs_mount_t *mp,
423 xfs_lsn_t lsn,
424 uint flags)
425{
426 int error;
427 error = _xfs_log_force(mp, lsn, flags, NULL);
428 if (error) {
429 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
430 "error %d returned.", error);
431 }
432} 291}
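[Annotation] The removed pair is replaced by simpler entry points keyed only on flags (0 or XFS_LOG_SYNC), with separate *_lsn variants for forcing up to a target LSN. Reconstructed prototypes matching the call sites visible in this patch; hedged, since the header hunk is not quoted here:

	void xfs_log_force(struct xfs_mount *mp, uint flags);
	int  _xfs_log_force(struct xfs_mount *mp, uint flags, int *log_flushed);

	/* e.g. the pinned-buffer case in xfs_iflush() above now does: */
	if (XFS_BUF_ISPINNED(bp))
		xfs_log_force(mp, 0);		/* asynchronous force of all */

	/* while unmount forces everything synchronously: */
	error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);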
433 292
434
435/* 293/*
436 * Attaches a new iclog I/O completion callback routine during 294 * Attaches a new iclog I/O completion callback routine during
437 * transaction commit. If the log is in error state, a non-zero 295 * transaction commit. If the log is in error state, a non-zero
@@ -439,11 +297,11 @@ xfs_log_force(
439 * executing the callback at an appropriate time. 297 * executing the callback at an appropriate time.
440 */ 298 */
441int 299int
442xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ 300xfs_log_notify(
443 void *iclog_hndl, /* iclog to hang callback off */ 301 struct xfs_mount *mp,
444 xfs_log_callback_t *cb) 302 struct xlog_in_core *iclog,
303 xfs_log_callback_t *cb)
445{ 304{
446 xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl;
447 int abortflg; 305 int abortflg;
448 306
449 spin_lock(&iclog->ic_callback_lock); 307 spin_lock(&iclog->ic_callback_lock);
@@ -457,16 +315,14 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */
457 } 315 }
458 spin_unlock(&iclog->ic_callback_lock); 316 spin_unlock(&iclog->ic_callback_lock);
459 return abortflg; 317 return abortflg;
460} /* xfs_log_notify */ 318}
461 319
462int 320int
463xfs_log_release_iclog(xfs_mount_t *mp, 321xfs_log_release_iclog(
464 void *iclog_hndl) 322 struct xfs_mount *mp,
323 struct xlog_in_core *iclog)
465{ 324{
466 xlog_t *log = mp->m_log; 325 if (xlog_state_release_iclog(mp->m_log, iclog)) {
467 xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl;
468
469 if (xlog_state_release_iclog(log, iclog)) {
470 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 326 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
471 return EIO; 327 return EIO;
472 } 328 }
@@ -485,17 +341,18 @@ xfs_log_release_iclog(xfs_mount_t *mp,
485 * reservation, we prevent over allocation problems. 341 * reservation, we prevent over allocation problems.
486 */ 342 */
487int 343int
488xfs_log_reserve(xfs_mount_t *mp, 344xfs_log_reserve(
489 int unit_bytes, 345 struct xfs_mount *mp,
490 int cnt, 346 int unit_bytes,
491 xfs_log_ticket_t *ticket, 347 int cnt,
492 __uint8_t client, 348 struct xlog_ticket **ticket,
493 uint flags, 349 __uint8_t client,
494 uint t_type) 350 uint flags,
351 uint t_type)
495{ 352{
496 xlog_t *log = mp->m_log; 353 struct log *log = mp->m_log;
497 xlog_ticket_t *internal_ticket; 354 struct xlog_ticket *internal_ticket;
498 int retval = 0; 355 int retval = 0;
499 356
500 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); 357 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
501 ASSERT((flags & XFS_LOG_NOSLEEP) == 0); 358 ASSERT((flags & XFS_LOG_NOSLEEP) == 0);
@@ -505,10 +362,13 @@ xfs_log_reserve(xfs_mount_t *mp,
505 362
506 XFS_STATS_INC(xs_try_logspace); 363 XFS_STATS_INC(xs_try_logspace);
507 364
365
508 if (*ticket != NULL) { 366 if (*ticket != NULL) {
509 ASSERT(flags & XFS_LOG_PERM_RESERV); 367 ASSERT(flags & XFS_LOG_PERM_RESERV);
510 internal_ticket = (xlog_ticket_t *)*ticket; 368 internal_ticket = *ticket;
511 xlog_trace_loggrant(log, internal_ticket, "xfs_log_reserve: existing ticket (permanent trans)"); 369
370 trace_xfs_log_reserve(log, internal_ticket);
371
512 xlog_grant_push_ail(mp, internal_ticket->t_unit_res); 372 xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
513 retval = xlog_regrant_write_log_space(log, internal_ticket); 373 retval = xlog_regrant_write_log_space(log, internal_ticket);
514 } else { 374 } else {
@@ -519,10 +379,9 @@ xfs_log_reserve(xfs_mount_t *mp,
519 return XFS_ERROR(ENOMEM); 379 return XFS_ERROR(ENOMEM);
520 internal_ticket->t_trans_type = t_type; 380 internal_ticket->t_trans_type = t_type;
521 *ticket = internal_ticket; 381 *ticket = internal_ticket;
522 xlog_trace_loggrant(log, internal_ticket, 382
523 (internal_ticket->t_flags & XLOG_TIC_PERM_RESERV) ? 383 trace_xfs_log_reserve(log, internal_ticket);
524 "xfs_log_reserve: create new ticket (permanent trans)" : 384
525 "xfs_log_reserve: create new ticket");
526 xlog_grant_push_ail(mp, 385 xlog_grant_push_ail(mp,
527 (internal_ticket->t_unit_res * 386 (internal_ticket->t_unit_res *
528 internal_ticket->t_cnt)); 387 internal_ticket->t_cnt));
@@ -658,7 +517,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
658 xlog_in_core_t *first_iclog; 517 xlog_in_core_t *first_iclog;
659#endif 518#endif
660 xfs_log_iovec_t reg[1]; 519 xfs_log_iovec_t reg[1];
661 xfs_log_ticket_t tic = NULL; 520 xlog_ticket_t *tic = NULL;
662 xfs_lsn_t lsn; 521 xfs_lsn_t lsn;
663 int error; 522 int error;
664 523
@@ -676,7 +535,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
676 if (mp->m_flags & XFS_MOUNT_RDONLY) 535 if (mp->m_flags & XFS_MOUNT_RDONLY)
677 return 0; 536 return 0;
678 537
679 error = _xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC, NULL); 538 error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
680 ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); 539 ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
681 540
682#ifdef DEBUG 541#ifdef DEBUG
@@ -692,7 +551,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
692 if (! (XLOG_FORCED_SHUTDOWN(log))) { 551 if (! (XLOG_FORCED_SHUTDOWN(log))) {
693 reg[0].i_addr = (void*)&magic; 552 reg[0].i_addr = (void*)&magic;
694 reg[0].i_len = sizeof(magic); 553 reg[0].i_len = sizeof(magic);
695 XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_UNMOUNT); 554 reg[0].i_type = XLOG_REG_TYPE_UNMOUNT;
696 555
697 error = xfs_log_reserve(mp, 600, 1, &tic, 556 error = xfs_log_reserve(mp, 600, 1, &tic,
698 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); 557 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
@@ -734,7 +593,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
734 spin_unlock(&log->l_icloglock); 593 spin_unlock(&log->l_icloglock);
735 } 594 }
736 if (tic) { 595 if (tic) {
737 xlog_trace_loggrant(log, tic, "unmount rec"); 596 trace_xfs_log_umount_write(log, tic);
738 xlog_ungrant_log_space(log, tic); 597 xlog_ungrant_log_space(log, tic);
739 xfs_log_ticket_put(tic); 598 xfs_log_ticket_put(tic);
740 } 599 }
@@ -795,24 +654,24 @@ xfs_log_unmount(xfs_mount_t *mp)
795 * transaction occur with one call to xfs_log_write(). 654 * transaction occur with one call to xfs_log_write().
796 */ 655 */
797int 656int
798xfs_log_write(xfs_mount_t * mp, 657xfs_log_write(
799 xfs_log_iovec_t reg[], 658 struct xfs_mount *mp,
800 int nentries, 659 struct xfs_log_iovec reg[],
801 xfs_log_ticket_t tic, 660 int nentries,
802 xfs_lsn_t *start_lsn) 661 struct xlog_ticket *tic,
662 xfs_lsn_t *start_lsn)
803{ 663{
804 int error; 664 struct log *log = mp->m_log;
805 xlog_t *log = mp->m_log; 665 int error;
806 666
807 if (XLOG_FORCED_SHUTDOWN(log)) 667 if (XLOG_FORCED_SHUTDOWN(log))
808 return XFS_ERROR(EIO); 668 return XFS_ERROR(EIO);
809 669
810 if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) { 670 error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0);
671 if (error)
811 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 672 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
812 }
813 return error; 673 return error;
814} /* xfs_log_write */ 674}
815
816 675
817void 676void
818xfs_log_move_tail(xfs_mount_t *mp, 677xfs_log_move_tail(xfs_mount_t *mp,
@@ -1030,7 +889,6 @@ xlog_iodone(xfs_buf_t *bp)
1030 xfs_fs_cmn_err(CE_WARN, l->l_mp, 889 xfs_fs_cmn_err(CE_WARN, l->l_mp,
1031 "xlog_iodone: Barriers are no longer supported" 890 "xlog_iodone: Barriers are no longer supported"
1032 " by device. Disabling barriers\n"); 891 " by device. Disabling barriers\n");
1033 xfs_buftrace("XLOG_IODONE BARRIERS OFF", bp);
1034 } 892 }
1035 893
1036 /* 894 /*
@@ -1063,38 +921,6 @@ xlog_iodone(xfs_buf_t *bp)
1063} /* xlog_iodone */ 921} /* xlog_iodone */
1064 922
1065/* 923/*
1066 * The bdstrat callback function for log bufs. This gives us a central
1067 * place to trap bufs in case we get hit by a log I/O error and need to
1068 * shutdown. Actually, in practice, even when we didn't get a log error,
1069 * we transition the iclogs to IOERROR state *after* flushing all existing
1070 * iclogs to disk. This is because we don't want any more new transactions to be
1071 * started or completed afterwards.
1072 */
1073STATIC int
1074xlog_bdstrat_cb(struct xfs_buf *bp)
1075{
1076 xlog_in_core_t *iclog;
1077
1078 iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
1079
1080 if ((iclog->ic_state & XLOG_STATE_IOERROR) == 0) {
1081 /* note for irix bstrat will need struct bdevsw passed
1082 * Fix the following macro if the code ever is merged
1083 */
1084 XFS_bdstrat(bp);
1085 return 0;
1086 }
1087
1088 xfs_buftrace("XLOG__BDSTRAT IOERROR", bp);
1089 XFS_BUF_ERROR(bp, EIO);
1090 XFS_BUF_STALE(bp);
1091 xfs_biodone(bp);
1092 return XFS_ERROR(EIO);
1093
1094
1095}
1096
1097/*
1098 * Return size of each in-core log record buffer. 924 * Return size of each in-core log record buffer.
1099 * 925 *
1100 * All machines get 8 x 32kB buffers by default, unless tuned otherwise. 926 * All machines get 8 x 32kB buffers by default, unless tuned otherwise.
@@ -1236,7 +1062,6 @@ xlog_alloc_log(xfs_mount_t *mp,
1236 if (!bp) 1062 if (!bp)
1237 goto out_free_log; 1063 goto out_free_log;
1238 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); 1064 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
1239 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
1240 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); 1065 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1241 ASSERT(XFS_BUF_ISBUSY(bp)); 1066 ASSERT(XFS_BUF_ISBUSY(bp));
1242 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 1067 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
@@ -1246,7 +1071,6 @@ xlog_alloc_log(xfs_mount_t *mp,
1246 spin_lock_init(&log->l_grant_lock); 1071 spin_lock_init(&log->l_grant_lock);
1247 sv_init(&log->l_flush_wait, 0, "flush_wait"); 1072 sv_init(&log->l_flush_wait, 0, "flush_wait");
1248 1073
1249 xlog_trace_loggrant_alloc(log);
1250 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ 1074 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1251 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); 1075 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
1252 1076
@@ -1275,7 +1099,6 @@ xlog_alloc_log(xfs_mount_t *mp,
1275 if (!XFS_BUF_CPSEMA(bp)) 1099 if (!XFS_BUF_CPSEMA(bp))
1276 ASSERT(0); 1100 ASSERT(0);
1277 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); 1101 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
1278 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
1279 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); 1102 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1280 iclog->ic_bp = bp; 1103 iclog->ic_bp = bp;
1281 iclog->ic_data = bp->b_addr; 1104 iclog->ic_data = bp->b_addr;
@@ -1305,8 +1128,6 @@ xlog_alloc_log(xfs_mount_t *mp,
1305 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); 1128 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force");
1306 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); 1129 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write");
1307 1130
1308 xlog_trace_iclog_alloc(iclog);
1309
1310 iclogp = &iclog->ic_next; 1131 iclogp = &iclog->ic_next;
1311 } 1132 }
1312 *iclogp = log->l_iclog; /* complete ring */ 1133 *iclogp = log->l_iclog; /* complete ring */
@@ -1321,13 +1142,11 @@ out_free_iclog:
1321 sv_destroy(&iclog->ic_force_wait); 1142 sv_destroy(&iclog->ic_force_wait);
1322 sv_destroy(&iclog->ic_write_wait); 1143 sv_destroy(&iclog->ic_write_wait);
1323 xfs_buf_free(iclog->ic_bp); 1144 xfs_buf_free(iclog->ic_bp);
1324 xlog_trace_iclog_dealloc(iclog);
1325 } 1145 }
1326 kmem_free(iclog); 1146 kmem_free(iclog);
1327 } 1147 }
1328 spinlock_destroy(&log->l_icloglock); 1148 spinlock_destroy(&log->l_icloglock);
1329 spinlock_destroy(&log->l_grant_lock); 1149 spinlock_destroy(&log->l_grant_lock);
1330 xlog_trace_loggrant_dealloc(log);
1331 xfs_buf_free(log->l_xbuf); 1150 xfs_buf_free(log->l_xbuf);
1332out_free_log: 1151out_free_log:
1333 kmem_free(log); 1152 kmem_free(log);
@@ -1351,7 +1170,7 @@ xlog_commit_record(xfs_mount_t *mp,
1351 1170
1352 reg[0].i_addr = NULL; 1171 reg[0].i_addr = NULL;
1353 reg[0].i_len = 0; 1172 reg[0].i_len = 0;
1354 XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_COMMIT); 1173 reg[0].i_type = XLOG_REG_TYPE_COMMIT;
1355 1174
1356 ASSERT_ALWAYS(iclog); 1175 ASSERT_ALWAYS(iclog);
1357 if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp, 1176 if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
@@ -1426,6 +1245,37 @@ xlog_grant_push_ail(xfs_mount_t *mp,
1426 xfs_trans_ail_push(log->l_ailp, threshold_lsn); 1245 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1427} /* xlog_grant_push_ail */ 1246} /* xlog_grant_push_ail */
1428 1247
1248/*
1249 * The bdstrat callback function for log bufs. This gives us a central
1250 * place to trap bufs in case we get hit by a log I/O error and need to
1251 * shutdown. Actually, in practice, even when we didn't get a log error,
1252 * we transition the iclogs to IOERROR state *after* flushing all existing
 1253 * iclogs to disk. This is because we don't want any more new transactions to be
1254 * started or completed afterwards.
1255 */
1256STATIC int
1257xlog_bdstrat(
1258 struct xfs_buf *bp)
1259{
1260 struct xlog_in_core *iclog;
1261
1262 iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
1263 if (iclog->ic_state & XLOG_STATE_IOERROR) {
1264 XFS_BUF_ERROR(bp, EIO);
1265 XFS_BUF_STALE(bp);
1266 xfs_biodone(bp);
1267 /*
1268 * It would seem logical to return EIO here, but we rely on
1269 * the log state machine to propagate I/O errors instead of
1270 * doing it here.
1271 */
1272 return 0;
1273 }
1274
1275 bp->b_flags |= _XBF_RUN_QUEUES;
1276 xfs_buf_iorequest(bp);
1277 return 0;
1278}
1429 1279
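The reintroduced xlog_bdstrat() returns 0 even after failing the buffer: as its comment says, the error is propagated by completing the buffer with EIO and letting the log state machine react, not through the return value. A self-contained userspace analogue of that gate pattern (all names hypothetical):

#include <errno.h>
#include <stdbool.h>

struct buf { int error; bool done; };

static int submit(struct buf *bp, bool shutdown)
{
	if (shutdown) {
		bp->error = EIO;	/* fail the buffer ... */
		bp->done = true;	/* ... and complete it immediately */
		return 0;		/* the completion state carries the error */
	}
	/* queue the real I/O here */
	return 0;
}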
1430/* 1280/*
1431 * Flush out the in-core log (iclog) to the on-disk log in an asynchronous 1281 * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
@@ -1524,6 +1374,7 @@ xlog_sync(xlog_t *log,
1524 XFS_BUF_ZEROFLAGS(bp); 1374 XFS_BUF_ZEROFLAGS(bp);
1525 XFS_BUF_BUSY(bp); 1375 XFS_BUF_BUSY(bp);
1526 XFS_BUF_ASYNC(bp); 1376 XFS_BUF_ASYNC(bp);
1377 bp->b_flags |= XBF_LOG_BUFFER;
1527 /* 1378 /*
1528 * Do an ordered write for the log block. 1379 * Do an ordered write for the log block.
1529 * It's unnecessary to flush the first split block in the log wrap case. 1380 * It's unnecessary to flush the first split block in the log wrap case.
@@ -1544,7 +1395,7 @@ xlog_sync(xlog_t *log,
1544 */ 1395 */
1545 XFS_BUF_WRITE(bp); 1396 XFS_BUF_WRITE(bp);
1546 1397
1547 if ((error = XFS_bwrite(bp))) { 1398 if ((error = xlog_bdstrat(bp))) {
1548 xfs_ioerror_alert("xlog_sync", log->l_mp, bp, 1399 xfs_ioerror_alert("xlog_sync", log->l_mp, bp,
1549 XFS_BUF_ADDR(bp)); 1400 XFS_BUF_ADDR(bp));
1550 return error; 1401 return error;
@@ -1561,6 +1412,7 @@ xlog_sync(xlog_t *log,
1561 XFS_BUF_ZEROFLAGS(bp); 1412 XFS_BUF_ZEROFLAGS(bp);
1562 XFS_BUF_BUSY(bp); 1413 XFS_BUF_BUSY(bp);
1563 XFS_BUF_ASYNC(bp); 1414 XFS_BUF_ASYNC(bp);
1415 bp->b_flags |= XBF_LOG_BUFFER;
1564 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) 1416 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
1565 XFS_BUF_ORDERED(bp); 1417 XFS_BUF_ORDERED(bp);
1566 dptr = XFS_BUF_PTR(bp); 1418 dptr = XFS_BUF_PTR(bp);
@@ -1583,7 +1435,7 @@ xlog_sync(xlog_t *log,
1583 /* account for internal log which doesn't start at block #0 */ 1435 /* account for internal log which doesn't start at block #0 */
1584 XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); 1436 XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
1585 XFS_BUF_WRITE(bp); 1437 XFS_BUF_WRITE(bp);
1586 if ((error = XFS_bwrite(bp))) { 1438 if ((error = xlog_bdstrat(bp))) {
1587 xfs_ioerror_alert("xlog_sync (split)", log->l_mp, 1439 xfs_ioerror_alert("xlog_sync (split)", log->l_mp,
1588 bp, XFS_BUF_ADDR(bp)); 1440 bp, XFS_BUF_ADDR(bp));
1589 return error; 1441 return error;
@@ -1607,7 +1459,6 @@ xlog_dealloc_log(xlog_t *log)
1607 sv_destroy(&iclog->ic_force_wait); 1459 sv_destroy(&iclog->ic_force_wait);
1608 sv_destroy(&iclog->ic_write_wait); 1460 sv_destroy(&iclog->ic_write_wait);
1609 xfs_buf_free(iclog->ic_bp); 1461 xfs_buf_free(iclog->ic_bp);
1610 xlog_trace_iclog_dealloc(iclog);
1611 next_iclog = iclog->ic_next; 1462 next_iclog = iclog->ic_next;
1612 kmem_free(iclog); 1463 kmem_free(iclog);
1613 iclog = next_iclog; 1464 iclog = next_iclog;
@@ -1616,7 +1467,6 @@ xlog_dealloc_log(xlog_t *log)
1616 spinlock_destroy(&log->l_grant_lock); 1467 spinlock_destroy(&log->l_grant_lock);
1617 1468
1618 xfs_buf_free(log->l_xbuf); 1469 xfs_buf_free(log->l_xbuf);
1619 xlog_trace_loggrant_dealloc(log);
1620 log->l_mp->m_log = NULL; 1470 log->l_mp->m_log = NULL;
1621 kmem_free(log); 1471 kmem_free(log);
1622} /* xlog_dealloc_log */ 1472} /* xlog_dealloc_log */
@@ -1790,16 +1640,16 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1790 * bytes have been written out. 1640 * bytes have been written out.
1791 */ 1641 */
1792STATIC int 1642STATIC int
1793xlog_write(xfs_mount_t * mp, 1643xlog_write(
1794 xfs_log_iovec_t reg[], 1644 struct xfs_mount *mp,
1795 int nentries, 1645 struct xfs_log_iovec reg[],
1796 xfs_log_ticket_t tic, 1646 int nentries,
1797 xfs_lsn_t *start_lsn, 1647 struct xlog_ticket *ticket,
1798 xlog_in_core_t **commit_iclog, 1648 xfs_lsn_t *start_lsn,
1799 uint flags) 1649 struct xlog_in_core **commit_iclog,
1650 uint flags)
1800{ 1651{
1801 xlog_t *log = mp->m_log; 1652 xlog_t *log = mp->m_log;
1802 xlog_ticket_t *ticket = (xlog_ticket_t *)tic;
1803 xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */ 1653 xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */
1804 xlog_op_header_t *logop_head; /* ptr to log operation header */ 1654 xlog_op_header_t *logop_head; /* ptr to log operation header */
1805 __psint_t ptr; /* copy address into data region */ 1655 __psint_t ptr; /* copy address into data region */
@@ -1913,7 +1763,7 @@ xlog_write(xfs_mount_t * mp,
1913 default: 1763 default:
1914 xfs_fs_cmn_err(CE_WARN, mp, 1764 xfs_fs_cmn_err(CE_WARN, mp,
1915 "Bad XFS transaction clientid 0x%x in ticket 0x%p", 1765 "Bad XFS transaction clientid 0x%x in ticket 0x%p",
1916 logop_head->oh_clientid, tic); 1766 logop_head->oh_clientid, ticket);
1917 return XFS_ERROR(EIO); 1767 return XFS_ERROR(EIO);
1918 } 1768 }
1919 1769
@@ -2414,7 +2264,6 @@ restart:
2414 2264
2415 iclog = log->l_iclog; 2265 iclog = log->l_iclog;
2416 if (iclog->ic_state != XLOG_STATE_ACTIVE) { 2266 if (iclog->ic_state != XLOG_STATE_ACTIVE) {
2417 xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH);
2418 XFS_STATS_INC(xs_log_noiclogs); 2267 XFS_STATS_INC(xs_log_noiclogs);
2419 2268
2420 /* Wait for log writes to have flushed */ 2269 /* Wait for log writes to have flushed */
@@ -2520,13 +2369,15 @@ xlog_grant_log_space(xlog_t *log,
2520 2369
2521 /* Is there space or do we need to sleep? */ 2370 /* Is there space or do we need to sleep? */
2522 spin_lock(&log->l_grant_lock); 2371 spin_lock(&log->l_grant_lock);
2523 xlog_trace_loggrant(log, tic, "xlog_grant_log_space: enter"); 2372
2373 trace_xfs_log_grant_enter(log, tic);
2524 2374
2525 /* something is already sleeping; insert new transaction at end */ 2375 /* something is already sleeping; insert new transaction at end */
2526 if (log->l_reserve_headq) { 2376 if (log->l_reserve_headq) {
2527 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2377 xlog_ins_ticketq(&log->l_reserve_headq, tic);
2528 xlog_trace_loggrant(log, tic, 2378
2529 "xlog_grant_log_space: sleep 1"); 2379 trace_xfs_log_grant_sleep1(log, tic);
2380
2530 /* 2381 /*
2531 * Gotta check this before going to sleep, while we're 2382 * Gotta check this before going to sleep, while we're
2532 * holding the grant lock. 2383 * holding the grant lock.
@@ -2540,8 +2391,7 @@ xlog_grant_log_space(xlog_t *log,
2540 * If we got an error, and the filesystem is shutting down, 2391 * If we got an error, and the filesystem is shutting down,
2541 * we'll catch it down below. So just continue... 2392 * we'll catch it down below. So just continue...
2542 */ 2393 */
2543 xlog_trace_loggrant(log, tic, 2394 trace_xfs_log_grant_wake1(log, tic);
2544 "xlog_grant_log_space: wake 1");
2545 spin_lock(&log->l_grant_lock); 2395 spin_lock(&log->l_grant_lock);
2546 } 2396 }
2547 if (tic->t_flags & XFS_LOG_PERM_RESERV) 2397 if (tic->t_flags & XFS_LOG_PERM_RESERV)
@@ -2558,8 +2408,9 @@ redo:
2558 if (free_bytes < need_bytes) { 2408 if (free_bytes < need_bytes) {
2559 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2409 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2560 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2410 xlog_ins_ticketq(&log->l_reserve_headq, tic);
2561 xlog_trace_loggrant(log, tic, 2411
2562 "xlog_grant_log_space: sleep 2"); 2412 trace_xfs_log_grant_sleep2(log, tic);
2413
2563 spin_unlock(&log->l_grant_lock); 2414 spin_unlock(&log->l_grant_lock);
2564 xlog_grant_push_ail(log->l_mp, need_bytes); 2415 xlog_grant_push_ail(log->l_mp, need_bytes);
2565 spin_lock(&log->l_grant_lock); 2416 spin_lock(&log->l_grant_lock);
@@ -2571,8 +2422,8 @@ redo:
2571 if (XLOG_FORCED_SHUTDOWN(log)) 2422 if (XLOG_FORCED_SHUTDOWN(log))
2572 goto error_return; 2423 goto error_return;
2573 2424
2574 xlog_trace_loggrant(log, tic, 2425 trace_xfs_log_grant_wake2(log, tic);
2575 "xlog_grant_log_space: wake 2"); 2426
2576 goto redo; 2427 goto redo;
2577 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2428 } else if (tic->t_flags & XLOG_TIC_IN_Q)
2578 xlog_del_ticketq(&log->l_reserve_headq, tic); 2429 xlog_del_ticketq(&log->l_reserve_headq, tic);
@@ -2592,7 +2443,7 @@ redo:
2592 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); 2443 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2593 } 2444 }
2594#endif 2445#endif
2595 xlog_trace_loggrant(log, tic, "xlog_grant_log_space: exit"); 2446 trace_xfs_log_grant_exit(log, tic);
2596 xlog_verify_grant_head(log, 1); 2447 xlog_verify_grant_head(log, 1);
2597 spin_unlock(&log->l_grant_lock); 2448 spin_unlock(&log->l_grant_lock);
2598 return 0; 2449 return 0;
@@ -2600,7 +2451,9 @@ redo:
2600 error_return: 2451 error_return:
2601 if (tic->t_flags & XLOG_TIC_IN_Q) 2452 if (tic->t_flags & XLOG_TIC_IN_Q)
2602 xlog_del_ticketq(&log->l_reserve_headq, tic); 2453 xlog_del_ticketq(&log->l_reserve_headq, tic);
2603 xlog_trace_loggrant(log, tic, "xlog_grant_log_space: err_ret"); 2454
2455 trace_xfs_log_grant_error(log, tic);
2456
2604 /* 2457 /*
2605 * If we are failing, make sure the ticket doesn't have any 2458 * If we are failing, make sure the ticket doesn't have any
2606 * current reservations. We don't want to add this back when 2459 * current reservations. We don't want to add this back when
@@ -2640,7 +2493,8 @@ xlog_regrant_write_log_space(xlog_t *log,
2640#endif 2493#endif
2641 2494
2642 spin_lock(&log->l_grant_lock); 2495 spin_lock(&log->l_grant_lock);
2643 xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: enter"); 2496
2497 trace_xfs_log_regrant_write_enter(log, tic);
2644 2498
2645 if (XLOG_FORCED_SHUTDOWN(log)) 2499 if (XLOG_FORCED_SHUTDOWN(log))
2646 goto error_return; 2500 goto error_return;
@@ -2669,8 +2523,8 @@ xlog_regrant_write_log_space(xlog_t *log,
2669 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2523 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2670 xlog_ins_ticketq(&log->l_write_headq, tic); 2524 xlog_ins_ticketq(&log->l_write_headq, tic);
2671 2525
2672 xlog_trace_loggrant(log, tic, 2526 trace_xfs_log_regrant_write_sleep1(log, tic);
2673 "xlog_regrant_write_log_space: sleep 1"); 2527
2674 spin_unlock(&log->l_grant_lock); 2528 spin_unlock(&log->l_grant_lock);
2675 xlog_grant_push_ail(log->l_mp, need_bytes); 2529 xlog_grant_push_ail(log->l_mp, need_bytes);
2676 spin_lock(&log->l_grant_lock); 2530 spin_lock(&log->l_grant_lock);
@@ -2685,8 +2539,7 @@ xlog_regrant_write_log_space(xlog_t *log,
2685 if (XLOG_FORCED_SHUTDOWN(log)) 2539 if (XLOG_FORCED_SHUTDOWN(log))
2686 goto error_return; 2540 goto error_return;
2687 2541
2688 xlog_trace_loggrant(log, tic, 2542 trace_xfs_log_regrant_write_wake1(log, tic);
2689 "xlog_regrant_write_log_space: wake 1");
2690 } 2543 }
2691 } 2544 }
2692 2545
@@ -2704,6 +2557,8 @@ redo:
2704 spin_lock(&log->l_grant_lock); 2557 spin_lock(&log->l_grant_lock);
2705 2558
2706 XFS_STATS_INC(xs_sleep_logspace); 2559 XFS_STATS_INC(xs_sleep_logspace);
2560 trace_xfs_log_regrant_write_sleep2(log, tic);
2561
2707 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); 2562 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2708 2563
2709 /* If we're shutting down, this tic is already off the queue */ 2564 /* If we're shutting down, this tic is already off the queue */
@@ -2711,8 +2566,7 @@ redo:
2711 if (XLOG_FORCED_SHUTDOWN(log)) 2566 if (XLOG_FORCED_SHUTDOWN(log))
2712 goto error_return; 2567 goto error_return;
2713 2568
2714 xlog_trace_loggrant(log, tic, 2569 trace_xfs_log_regrant_write_wake2(log, tic);
2715 "xlog_regrant_write_log_space: wake 2");
2716 goto redo; 2570 goto redo;
2717 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2571 } else if (tic->t_flags & XLOG_TIC_IN_Q)
2718 xlog_del_ticketq(&log->l_write_headq, tic); 2572 xlog_del_ticketq(&log->l_write_headq, tic);
@@ -2727,7 +2581,8 @@ redo:
2727 } 2581 }
2728#endif 2582#endif
2729 2583
2730 xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: exit"); 2584 trace_xfs_log_regrant_write_exit(log, tic);
2585
2731 xlog_verify_grant_head(log, 1); 2586 xlog_verify_grant_head(log, 1);
2732 spin_unlock(&log->l_grant_lock); 2587 spin_unlock(&log->l_grant_lock);
2733 return 0; 2588 return 0;
@@ -2736,7 +2591,9 @@ redo:
2736 error_return: 2591 error_return:
2737 if (tic->t_flags & XLOG_TIC_IN_Q) 2592 if (tic->t_flags & XLOG_TIC_IN_Q)
2738 xlog_del_ticketq(&log->l_reserve_headq, tic); 2593 xlog_del_ticketq(&log->l_reserve_headq, tic);
2739 xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: err_ret"); 2594
2595 trace_xfs_log_regrant_write_error(log, tic);
2596
2740 /* 2597 /*
2741 * If we are failing, make sure the ticket doesn't have any 2598 * If we are failing, make sure the ticket doesn't have any
2742 * current reservations. We don't want to add this back when 2599 * current reservations. We don't want to add this back when
@@ -2760,8 +2617,8 @@ STATIC void
2760xlog_regrant_reserve_log_space(xlog_t *log, 2617xlog_regrant_reserve_log_space(xlog_t *log,
2761 xlog_ticket_t *ticket) 2618 xlog_ticket_t *ticket)
2762{ 2619{
2763 xlog_trace_loggrant(log, ticket, 2620 trace_xfs_log_regrant_reserve_enter(log, ticket);
2764 "xlog_regrant_reserve_log_space: enter"); 2621
2765 if (ticket->t_cnt > 0) 2622 if (ticket->t_cnt > 0)
2766 ticket->t_cnt--; 2623 ticket->t_cnt--;
2767 2624
@@ -2769,8 +2626,9 @@ xlog_regrant_reserve_log_space(xlog_t *log,
2769 xlog_grant_sub_space(log, ticket->t_curr_res); 2626 xlog_grant_sub_space(log, ticket->t_curr_res);
2770 ticket->t_curr_res = ticket->t_unit_res; 2627 ticket->t_curr_res = ticket->t_unit_res;
2771 xlog_tic_reset_res(ticket); 2628 xlog_tic_reset_res(ticket);
2772 xlog_trace_loggrant(log, ticket, 2629
2773 "xlog_regrant_reserve_log_space: sub current res"); 2630 trace_xfs_log_regrant_reserve_sub(log, ticket);
2631
2774 xlog_verify_grant_head(log, 1); 2632 xlog_verify_grant_head(log, 1);
2775 2633
2776 /* just return if we still have some of the pre-reserved space */ 2634 /* just return if we still have some of the pre-reserved space */
@@ -2780,8 +2638,9 @@ xlog_regrant_reserve_log_space(xlog_t *log,
2780 } 2638 }
2781 2639
2782 xlog_grant_add_space_reserve(log, ticket->t_unit_res); 2640 xlog_grant_add_space_reserve(log, ticket->t_unit_res);
2783 xlog_trace_loggrant(log, ticket, 2641
2784 "xlog_regrant_reserve_log_space: exit"); 2642 trace_xfs_log_regrant_reserve_exit(log, ticket);
2643
2785 xlog_verify_grant_head(log, 0); 2644 xlog_verify_grant_head(log, 0);
2786 spin_unlock(&log->l_grant_lock); 2645 spin_unlock(&log->l_grant_lock);
2787 ticket->t_curr_res = ticket->t_unit_res; 2646 ticket->t_curr_res = ticket->t_unit_res;
@@ -2811,11 +2670,11 @@ xlog_ungrant_log_space(xlog_t *log,
2811 ticket->t_cnt--; 2670 ticket->t_cnt--;
2812 2671
2813 spin_lock(&log->l_grant_lock); 2672 spin_lock(&log->l_grant_lock);
2814 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: enter"); 2673 trace_xfs_log_ungrant_enter(log, ticket);
2815 2674
2816 xlog_grant_sub_space(log, ticket->t_curr_res); 2675 xlog_grant_sub_space(log, ticket->t_curr_res);
2817 2676
2818 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: sub current"); 2677 trace_xfs_log_ungrant_sub(log, ticket);
2819 2678
2820 /* If this is a permanent reservation ticket, we may be able to free 2679 /* If this is a permanent reservation ticket, we may be able to free
2821 * up more space based on the remaining count. 2680 * up more space based on the remaining count.
@@ -2825,7 +2684,8 @@ xlog_ungrant_log_space(xlog_t *log,
2825 xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); 2684 xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt);
2826 } 2685 }
2827 2686
2828 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: exit"); 2687 trace_xfs_log_ungrant_exit(log, ticket);
2688
2829 xlog_verify_grant_head(log, 1); 2689 xlog_verify_grant_head(log, 1);
2830 spin_unlock(&log->l_grant_lock); 2690 spin_unlock(&log->l_grant_lock);
2831 xfs_log_move_tail(log->l_mp, 1); 2691 xfs_log_move_tail(log->l_mp, 1);
@@ -2927,7 +2787,6 @@ xlog_state_switch_iclogs(xlog_t *log,
2927 log->l_iclog = iclog->ic_next; 2787 log->l_iclog = iclog->ic_next;
2928} /* xlog_state_switch_iclogs */ 2788} /* xlog_state_switch_iclogs */
2929 2789
2930
2931/* 2790/*
2932 * Write out all data in the in-core log as of this exact moment in time. 2791 * Write out all data in the in-core log as of this exact moment in time.
2933 * 2792 *
@@ -2955,11 +2814,17 @@ xlog_state_switch_iclogs(xlog_t *log,
2955 * b) when we return from flushing out this iclog, it is still 2814 * b) when we return from flushing out this iclog, it is still
2956 * not in the active nor dirty state. 2815 * not in the active nor dirty state.
2957 */ 2816 */
2958STATIC int 2817int
2959xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) 2818_xfs_log_force(
2819 struct xfs_mount *mp,
2820 uint flags,
2821 int *log_flushed)
2960{ 2822{
2961 xlog_in_core_t *iclog; 2823 struct log *log = mp->m_log;
2962 xfs_lsn_t lsn; 2824 struct xlog_in_core *iclog;
2825 xfs_lsn_t lsn;
2826
2827 XFS_STATS_INC(xs_log_force);
2963 2828
2964 spin_lock(&log->l_icloglock); 2829 spin_lock(&log->l_icloglock);
2965 2830
@@ -3005,7 +2870,9 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
3005 2870
3006 if (xlog_state_release_iclog(log, iclog)) 2871 if (xlog_state_release_iclog(log, iclog))
3007 return XFS_ERROR(EIO); 2872 return XFS_ERROR(EIO);
3008 *log_flushed = 1; 2873
2874 if (log_flushed)
2875 *log_flushed = 1;
3009 spin_lock(&log->l_icloglock); 2876 spin_lock(&log->l_icloglock);
3010 if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn && 2877 if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn &&
3011 iclog->ic_state != XLOG_STATE_DIRTY) 2878 iclog->ic_state != XLOG_STATE_DIRTY)
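The new "if (log_flushed)" guards make the flush-state pointer optional: callers that do not care can pass NULL instead of a dummy variable. The convention in isolation (illustrative names):

static int force(int *log_flushed)
{
	/* ... do the work ... */
	if (log_flushed)
		*log_flushed = 1;	/* report back only when asked */
	return 0;
}

A caller that ignores the flush state simply writes force(NULL), which is exactly what the new wrappers below do with _xfs_log_force() and _xfs_log_force_lsn().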
@@ -3049,19 +2916,37 @@ maybe_sleep:
3049 */ 2916 */
3050 if (iclog->ic_state & XLOG_STATE_IOERROR) 2917 if (iclog->ic_state & XLOG_STATE_IOERROR)
3051 return XFS_ERROR(EIO); 2918 return XFS_ERROR(EIO);
3052 *log_flushed = 1; 2919 if (log_flushed)
3053 2920 *log_flushed = 1;
3054 } else { 2921 } else {
3055 2922
3056no_sleep: 2923no_sleep:
3057 spin_unlock(&log->l_icloglock); 2924 spin_unlock(&log->l_icloglock);
3058 } 2925 }
3059 return 0; 2926 return 0;
3060} /* xlog_state_sync_all */ 2927}
3061 2928
2929/*
2930 * Wrapper for _xfs_log_force(), to be used when caller doesn't care
2931 * about errors or whether the log was flushed or not. This is the normal
2932 * interface to use when trying to unpin items or move the log forward.
2933 */
2934void
2935xfs_log_force(
2936 xfs_mount_t *mp,
2937 uint flags)
2938{
2939 int error;
2940
2941 error = _xfs_log_force(mp, flags, NULL);
2942 if (error) {
2943 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
2944 "error %d returned.", error);
2945 }
2946}
3062 2947
3063/* 2948/*
3064 * Used by code which implements synchronous log forces. 2949 * Force the in-core log to disk for a specific LSN.
3065 * 2950 *
3066 * Find in-core log with lsn. 2951 * Find in-core log with lsn.
3067 * If it is in the DIRTY state, just return. 2952 * If it is in the DIRTY state, just return.
@@ -3069,109 +2954,142 @@ no_sleep:
3069 * state and go to sleep or return. 2954 * state and go to sleep or return.
3070 * If it is in any other state, go to sleep or return. 2955 * If it is in any other state, go to sleep or return.
3071 * 2956 *
3072 * If filesystem activity goes to zero, the iclog will get flushed only by 2957 * Synchronous forces are implemented with a signal variable. All callers
3073 * bdflush(). 2958 * to force a given lsn to disk will wait on the sv attached to the
 2959 * specific in-core log. When the given in-core log finally completes its
2960 * write to disk, that thread will wake up all threads waiting on the
2961 * sv.
3074 */ 2962 */
3075STATIC int 2963int
3076xlog_state_sync(xlog_t *log, 2964_xfs_log_force_lsn(
3077 xfs_lsn_t lsn, 2965 struct xfs_mount *mp,
3078 uint flags, 2966 xfs_lsn_t lsn,
3079 int *log_flushed) 2967 uint flags,
2968 int *log_flushed)
3080{ 2969{
3081 xlog_in_core_t *iclog; 2970 struct log *log = mp->m_log;
3082 int already_slept = 0; 2971 struct xlog_in_core *iclog;
2972 int already_slept = 0;
3083 2973
3084try_again: 2974 ASSERT(lsn != 0);
3085 spin_lock(&log->l_icloglock);
3086 iclog = log->l_iclog;
3087 2975
3088 if (iclog->ic_state & XLOG_STATE_IOERROR) { 2976 XFS_STATS_INC(xs_log_force);
3089 spin_unlock(&log->l_icloglock);
3090 return XFS_ERROR(EIO);
3091 }
3092
3093 do {
3094 if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
3095 iclog = iclog->ic_next;
3096 continue;
3097 }
3098 2977
3099 if (iclog->ic_state == XLOG_STATE_DIRTY) { 2978try_again:
2979 spin_lock(&log->l_icloglock);
2980 iclog = log->l_iclog;
2981 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3100 spin_unlock(&log->l_icloglock); 2982 spin_unlock(&log->l_icloglock);
3101 return 0; 2983 return XFS_ERROR(EIO);
3102 } 2984 }
3103 2985
3104 if (iclog->ic_state == XLOG_STATE_ACTIVE) { 2986 do {
3105 /* 2987 if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
3106 * We sleep here if we haven't already slept (e.g. 2988 iclog = iclog->ic_next;
3107 * this is the first time we've looked at the correct 2989 continue;
3108 * iclog buf) and the buffer before us is going to 2990 }
3109 * be sync'ed. The reason for this is that if we 2991
3110 * are doing sync transactions here, by waiting for 2992 if (iclog->ic_state == XLOG_STATE_DIRTY) {
3111 * the previous I/O to complete, we can allow a few 2993 spin_unlock(&log->l_icloglock);
3112 * more transactions into this iclog before we close 2994 return 0;
3113 * it down. 2995 }
3114 * 2996
3115 * Otherwise, we mark the buffer WANT_SYNC, and bump 2997 if (iclog->ic_state == XLOG_STATE_ACTIVE) {
3116 * up the refcnt so we can release the log (which drops 2998 /*
3117 * the ref count). The state switch keeps new transaction 2999 * We sleep here if we haven't already slept (e.g.
3118 * commits from using this buffer. When the current commits 3000 * this is the first time we've looked at the correct
3119 * finish writing into the buffer, the refcount will drop to 3001 * iclog buf) and the buffer before us is going to
3120 * zero and the buffer will go out then. 3002 * be sync'ed. The reason for this is that if we
3121 */ 3003 * are doing sync transactions here, by waiting for
3122 if (!already_slept && 3004 * the previous I/O to complete, we can allow a few
3123 (iclog->ic_prev->ic_state & (XLOG_STATE_WANT_SYNC | 3005 * more transactions into this iclog before we close
3124 XLOG_STATE_SYNCING))) { 3006 * it down.
3125 ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); 3007 *
3126 XFS_STATS_INC(xs_log_force_sleep); 3008 * Otherwise, we mark the buffer WANT_SYNC, and bump
3127 sv_wait(&iclog->ic_prev->ic_write_wait, PSWP, 3009 * up the refcnt so we can release the log (which
3128 &log->l_icloglock, s); 3010 * drops the ref count). The state switch keeps new
3129 *log_flushed = 1; 3011 * transaction commits from using this buffer. When
3130 already_slept = 1; 3012 * the current commits finish writing into the buffer,
3131 goto try_again; 3013 * the refcount will drop to zero and the buffer will
3132 } else { 3014 * go out then.
3015 */
3016 if (!already_slept &&
3017 (iclog->ic_prev->ic_state &
3018 (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
3019 ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
3020
3021 XFS_STATS_INC(xs_log_force_sleep);
3022
3023 sv_wait(&iclog->ic_prev->ic_write_wait,
3024 PSWP, &log->l_icloglock, s);
3025 if (log_flushed)
3026 *log_flushed = 1;
3027 already_slept = 1;
3028 goto try_again;
3029 }
3133 atomic_inc(&iclog->ic_refcnt); 3030 atomic_inc(&iclog->ic_refcnt);
3134 xlog_state_switch_iclogs(log, iclog, 0); 3031 xlog_state_switch_iclogs(log, iclog, 0);
3135 spin_unlock(&log->l_icloglock); 3032 spin_unlock(&log->l_icloglock);
3136 if (xlog_state_release_iclog(log, iclog)) 3033 if (xlog_state_release_iclog(log, iclog))
3137 return XFS_ERROR(EIO); 3034 return XFS_ERROR(EIO);
3138 *log_flushed = 1; 3035 if (log_flushed)
3036 *log_flushed = 1;
3139 spin_lock(&log->l_icloglock); 3037 spin_lock(&log->l_icloglock);
3140 } 3038 }
3141 }
3142 3039
3143 if ((flags & XFS_LOG_SYNC) && /* sleep */ 3040 if ((flags & XFS_LOG_SYNC) && /* sleep */
3144 !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { 3041 !(iclog->ic_state &
3042 (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
3043 /*
3044 * Don't wait on completion if we know that we've
3045 * gotten a log write error.
3046 */
3047 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3048 spin_unlock(&log->l_icloglock);
3049 return XFS_ERROR(EIO);
3050 }
3051 XFS_STATS_INC(xs_log_force_sleep);
3052 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
3053 /*
3054 * No need to grab the log lock here since we're
3055 * only deciding whether or not to return EIO
3056 * and the memory read should be atomic.
3057 */
3058 if (iclog->ic_state & XLOG_STATE_IOERROR)
3059 return XFS_ERROR(EIO);
3145 3060
3146 /* 3061 if (log_flushed)
3147 * Don't wait on completion if we know that we've 3062 *log_flushed = 1;
3148 * gotten a log write error. 3063 } else { /* just return */
3149 */
3150 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3151 spin_unlock(&log->l_icloglock); 3064 spin_unlock(&log->l_icloglock);
3152 return XFS_ERROR(EIO);
3153 } 3065 }
3154 XFS_STATS_INC(xs_log_force_sleep);
3155 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
3156 /*
3157 * No need to grab the log lock here since we're
3158 * only deciding whether or not to return EIO
3159 * and the memory read should be atomic.
3160 */
3161 if (iclog->ic_state & XLOG_STATE_IOERROR)
3162 return XFS_ERROR(EIO);
3163 *log_flushed = 1;
3164 } else { /* just return */
3165 spin_unlock(&log->l_icloglock);
3166 }
3167 return 0;
3168 3066
3169 } while (iclog != log->l_iclog); 3067 return 0;
3068 } while (iclog != log->l_iclog);
3069
3070 spin_unlock(&log->l_icloglock);
3071 return 0;
3072}
3170 3073
3171 spin_unlock(&log->l_icloglock); 3074/*
3172 return 0; 3075 * Wrapper for _xfs_log_force_lsn(), to be used when caller doesn't care
3173} /* xlog_state_sync */ 3076 * about errors or whether the log was flushed or not. This is the normal
3077 * interface to use when trying to unpin items or move the log forward.
3078 */
3079void
3080xfs_log_force_lsn(
3081 xfs_mount_t *mp,
3082 xfs_lsn_t lsn,
3083 uint flags)
3084{
3085 int error;
3174 3086
3087 error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
3088 if (error) {
 3089 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force_lsn: "
3090 "error %d returned.", error);
3091 }
3092}
3175 3093
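The "signal variable" scheme described in the comment above maps onto the familiar condition-variable pattern. A self-contained pthread analogue of the sv_wait()/wake-everyone behaviour (hypothetical names; the kernel code uses sv_t, not pthreads):

#include <pthread.h>
#include <stdbool.h>

struct iclog_like {
	pthread_mutex_t lock;
	pthread_cond_t  write_done;
	bool            synced;
};

static void wait_for_sync(struct iclog_like *ic)
{
	pthread_mutex_lock(&ic->lock);
	while (!ic->synced)	/* re-check: wakeups may be spurious */
		pthread_cond_wait(&ic->write_done, &ic->lock);
	pthread_mutex_unlock(&ic->lock);
}

static void io_complete(struct iclog_like *ic)
{
	pthread_mutex_lock(&ic->lock);
	ic->synced = true;
	pthread_cond_broadcast(&ic->write_done);	/* wake all waiters */
	pthread_mutex_unlock(&ic->lock);
}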
3176/* 3094/*
3177 * Called when we want to mark the current iclog as being ready to sync to 3095 * Called when we want to mark the current iclog as being ready to sync to
@@ -3536,7 +3454,6 @@ xfs_log_force_umount(
3536 xlog_ticket_t *tic; 3454 xlog_ticket_t *tic;
3537 xlog_t *log; 3455 xlog_t *log;
3538 int retval; 3456 int retval;
3539 int dummy;
3540 3457
3541 log = mp->m_log; 3458 log = mp->m_log;
3542 3459
@@ -3610,13 +3527,14 @@ xfs_log_force_umount(
3610 } 3527 }
3611 spin_unlock(&log->l_grant_lock); 3528 spin_unlock(&log->l_grant_lock);
3612 3529
3613 if (! (log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { 3530 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3614 ASSERT(!logerror); 3531 ASSERT(!logerror);
3615 /* 3532 /*
3616 * Force the incore logs to disk before shutting the 3533 * Force the incore logs to disk before shutting the
3617 * log down completely. 3534 * log down completely.
3618 */ 3535 */
3619 xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC, &dummy); 3536 _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
3537
3620 spin_lock(&log->l_icloglock); 3538 spin_lock(&log->l_icloglock);
3621 retval = xlog_state_ioerror(log); 3539 retval = xlog_state_ioerror(log);
3622 spin_unlock(&log->l_icloglock); 3540 spin_unlock(&log->l_icloglock);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index d0c9baa50b1a..97a24c7795a4 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -70,14 +70,8 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
70 * Flags to xfs_log_force() 70 * Flags to xfs_log_force()
71 * 71 *
72 * XFS_LOG_SYNC: Synchronous force in-core log to disk 72 * XFS_LOG_SYNC: Synchronous force in-core log to disk
73 * XFS_LOG_FORCE: Start in-core log write now.
74 * XFS_LOG_URGE: Start write within some window of time.
75 *
76 * Note: Either XFS_LOG_FORCE or XFS_LOG_URGE must be set.
77 */ 73 */
78#define XFS_LOG_SYNC 0x1 74#define XFS_LOG_SYNC 0x1
79#define XFS_LOG_FORCE 0x2
80#define XFS_LOG_URGE 0x4
81 75
82#endif /* __KERNEL__ */ 76#endif /* __KERNEL__ */
83 77
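With XFS_LOG_FORCE and XFS_LOG_URGE gone, forcing is implied by calling a force function at all, and XFS_LOG_SYNC is the only remaining modifier. A typical call-site conversion, taken from the xfs_log_unmount_write() hunk earlier in this diff:

	/* before: an LSN argument plus an explicit force flag */
	error = _xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC, NULL);

	/* after: the call itself means "force" */
	error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);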
@@ -110,16 +104,12 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
110#define XLOG_REG_TYPE_TRANSHDR 19 104#define XLOG_REG_TYPE_TRANSHDR 19
111#define XLOG_REG_TYPE_MAX 19 105#define XLOG_REG_TYPE_MAX 19
112 106
113#define XLOG_VEC_SET_TYPE(vecp, t) ((vecp)->i_type = (t))
114
115typedef struct xfs_log_iovec { 107typedef struct xfs_log_iovec {
116 xfs_caddr_t i_addr; /* beginning address of region */ 108 xfs_caddr_t i_addr; /* beginning address of region */
117 int i_len; /* length in bytes of region */ 109 int i_len; /* length in bytes of region */
118 uint i_type; /* type of region */ 110 uint i_type; /* type of region */
119} xfs_log_iovec_t; 111} xfs_log_iovec_t;
120 112
121typedef void* xfs_log_ticket_t;
122
123/* 113/*
124 * Structure used to pass callback function and the function's argument 114 * Structure used to pass callback function and the function's argument
125 * to the log manager. 115 * to the log manager.
@@ -134,18 +124,25 @@ typedef struct xfs_log_callback {
134#ifdef __KERNEL__ 124#ifdef __KERNEL__
135/* Log manager interfaces */ 125/* Log manager interfaces */
136struct xfs_mount; 126struct xfs_mount;
127struct xlog_in_core;
137struct xlog_ticket; 128struct xlog_ticket;
129
138xfs_lsn_t xfs_log_done(struct xfs_mount *mp, 130xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
139 xfs_log_ticket_t ticket, 131 struct xlog_ticket *ticket,
140 void **iclog, 132 struct xlog_in_core **iclog,
141 uint flags); 133 uint flags);
142int _xfs_log_force(struct xfs_mount *mp, 134int _xfs_log_force(struct xfs_mount *mp,
143 xfs_lsn_t lsn,
144 uint flags, 135 uint flags,
145 int *log_forced); 136 int *log_forced);
146void xfs_log_force(struct xfs_mount *mp, 137void xfs_log_force(struct xfs_mount *mp,
147 xfs_lsn_t lsn,
148 uint flags); 138 uint flags);
139int _xfs_log_force_lsn(struct xfs_mount *mp,
140 xfs_lsn_t lsn,
141 uint flags,
142 int *log_forced);
143void xfs_log_force_lsn(struct xfs_mount *mp,
144 xfs_lsn_t lsn,
145 uint flags);
149int xfs_log_mount(struct xfs_mount *mp, 146int xfs_log_mount(struct xfs_mount *mp,
150 struct xfs_buftarg *log_target, 147 struct xfs_buftarg *log_target,
151 xfs_daddr_t start_block, 148 xfs_daddr_t start_block,
@@ -154,21 +151,21 @@ int xfs_log_mount_finish(struct xfs_mount *mp);
154void xfs_log_move_tail(struct xfs_mount *mp, 151void xfs_log_move_tail(struct xfs_mount *mp,
155 xfs_lsn_t tail_lsn); 152 xfs_lsn_t tail_lsn);
156int xfs_log_notify(struct xfs_mount *mp, 153int xfs_log_notify(struct xfs_mount *mp,
157 void *iclog, 154 struct xlog_in_core *iclog,
158 xfs_log_callback_t *callback_entry); 155 xfs_log_callback_t *callback_entry);
159int xfs_log_release_iclog(struct xfs_mount *mp, 156int xfs_log_release_iclog(struct xfs_mount *mp,
160 void *iclog_hndl); 157 struct xlog_in_core *iclog);
161int xfs_log_reserve(struct xfs_mount *mp, 158int xfs_log_reserve(struct xfs_mount *mp,
162 int length, 159 int length,
163 int count, 160 int count,
164 xfs_log_ticket_t *ticket, 161 struct xlog_ticket **ticket,
165 __uint8_t clientid, 162 __uint8_t clientid,
166 uint flags, 163 uint flags,
167 uint t_type); 164 uint t_type);
168int xfs_log_write(struct xfs_mount *mp, 165int xfs_log_write(struct xfs_mount *mp,
169 xfs_log_iovec_t region[], 166 xfs_log_iovec_t region[],
170 int nentries, 167 int nentries,
171 xfs_log_ticket_t ticket, 168 struct xlog_ticket *ticket,
172 xfs_lsn_t *start_lsn); 169 xfs_lsn_t *start_lsn);
173int xfs_log_unmount_write(struct xfs_mount *mp); 170int xfs_log_unmount_write(struct xfs_mount *mp);
174void xfs_log_unmount(struct xfs_mount *mp); 171void xfs_log_unmount(struct xfs_mount *mp);
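Dropping the opaque xfs_log_ticket_t (a void * typedef) in favour of struct xlog_ticket * costs nothing at runtime but restores type checking: a forward declaration keeps the layout private while letting the compiler reject a mismatched pointer. Sketch of the difference (hypothetical prototypes):

	struct xlog_ticket;				/* opaque: layout stays private */

	void old_api(void *ticket);			/* silently accepts any pointer */
	void new_api(struct xlog_ticket *ticket);	/* wrong type is a compile error */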
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 679c7c4926a2..fd02a18facd5 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -19,7 +19,6 @@
19#define __XFS_LOG_PRIV_H__ 19#define __XFS_LOG_PRIV_H__
20 20
21struct xfs_buf; 21struct xfs_buf;
22struct ktrace;
23struct log; 22struct log;
24struct xlog_ticket; 23struct xlog_ticket;
25struct xfs_buf_cancel; 24struct xfs_buf_cancel;
@@ -135,6 +134,12 @@ static inline uint xlog_get_client_id(__be32 i)
135#define XLOG_TIC_INITED 0x1 /* has been initialized */ 134#define XLOG_TIC_INITED 0x1 /* has been initialized */
136#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ 135#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
137#define XLOG_TIC_IN_Q 0x4 136#define XLOG_TIC_IN_Q 0x4
137
138#define XLOG_TIC_FLAGS \
139 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
140 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \
141 { XLOG_TIC_IN_Q, "XLOG_TIC_IN_Q" }
142
138#endif /* __KERNEL__ */ 143#endif /* __KERNEL__ */
139 144
140#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ 145#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */
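The XLOG_TIC_FLAGS table added above serves the tracing code: each { value, "name" } pair has the shape of struct trace_print_flags, so a tracepoint can render t_flags symbolically. A fragment of how such a table is typically consumed (illustrative; the actual consumer would sit in fs/xfs/xfs_trace.h):

	TP_printk("tic flags %s",
		  __print_flags(__entry->t_flags, "|", XLOG_TIC_FLAGS))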
@@ -361,9 +366,6 @@ typedef struct xlog_in_core {
361 int ic_bwritecnt; 366 int ic_bwritecnt;
362 unsigned short ic_state; 367 unsigned short ic_state;
363 char *ic_datap; /* pointer to iclog data */ 368 char *ic_datap; /* pointer to iclog data */
364#ifdef XFS_LOG_TRACE
365 struct ktrace *ic_trace;
366#endif
367 369
368 /* Callback structures need their own cacheline */ 370 /* Callback structures need their own cacheline */
369 spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; 371 spinlock_t ic_callback_lock ____cacheline_aligned_in_smp;
@@ -429,10 +431,6 @@ typedef struct log {
429 int l_grant_write_cycle; 431 int l_grant_write_cycle;
430 int l_grant_write_bytes; 432 int l_grant_write_bytes;
431 433
432#ifdef XFS_LOG_TRACE
433 struct ktrace *l_grant_trace;
434#endif
435
436 /* The following field are used for debugging; need to hold icloglock */ 434 /* The following field are used for debugging; need to hold icloglock */
437#ifdef DEBUG 435#ifdef DEBUG
438 char *l_iclog_bak[XLOG_MAX_ICLOGS]; 436 char *l_iclog_bak[XLOG_MAX_ICLOGS];
@@ -445,23 +443,12 @@ typedef struct log {
445 443
446/* common routines */ 444/* common routines */
447extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); 445extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
448extern int xlog_find_tail(xlog_t *log,
449 xfs_daddr_t *head_blk,
450 xfs_daddr_t *tail_blk);
451extern int xlog_recover(xlog_t *log); 446extern int xlog_recover(xlog_t *log);
452extern int xlog_recover_finish(xlog_t *log); 447extern int xlog_recover_finish(xlog_t *log);
453extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 448extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
454extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
455extern void xlog_put_bp(struct xfs_buf *);
456 449
457extern kmem_zone_t *xfs_log_ticket_zone; 450extern kmem_zone_t *xfs_log_ticket_zone;
458 451
459/* iclog tracing */
460#define XLOG_TRACE_GRAB_FLUSH 1
461#define XLOG_TRACE_REL_FLUSH 2
462#define XLOG_TRACE_SLEEP_FLUSH 3
463#define XLOG_TRACE_WAKE_FLUSH 4
464
465/* 452/*
466 * Unmount record type is used as a pseudo transaction type for the ticket. 453 * Unmount record type is used as a pseudo transaction type for the ticket.
467 * Its value must be outside the range of XFS_TRANS_* values. 454 * Its value must be outside the range of XFS_TRANS_* values.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index fb17f8226b09..22e6efdc17ea 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -46,11 +46,10 @@
46#include "xfs_quota.h" 46#include "xfs_quota.h"
47#include "xfs_rw.h" 47#include "xfs_rw.h"
48#include "xfs_utils.h" 48#include "xfs_utils.h"
49#include "xfs_trace.h"
49 50
50STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); 51STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
51STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); 52STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
52STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q,
53 xlog_recover_item_t *item);
54#if defined(DEBUG) 53#if defined(DEBUG)
55STATIC void xlog_recover_check_summary(xlog_t *); 54STATIC void xlog_recover_check_summary(xlog_t *);
56#else 55#else
@@ -67,7 +66,7 @@ STATIC void xlog_recover_check_summary(xlog_t *);
67 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) 66 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) )
68#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) 67#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask)
69 68
70xfs_buf_t * 69STATIC xfs_buf_t *
71xlog_get_bp( 70xlog_get_bp(
72 xlog_t *log, 71 xlog_t *log,
73 int nbblks) 72 int nbblks)
@@ -87,7 +86,7 @@ xlog_get_bp(
87 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); 86 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
88} 87}
89 88
90void 89STATIC void
91xlog_put_bp( 90xlog_put_bp(
92 xfs_buf_t *bp) 91 xfs_buf_t *bp)
93{ 92{
@@ -225,16 +224,10 @@ xlog_header_check_dump(
225 xfs_mount_t *mp, 224 xfs_mount_t *mp,
226 xlog_rec_header_t *head) 225 xlog_rec_header_t *head)
227{ 226{
228 int b; 227 cmn_err(CE_DEBUG, "%s: SB : uuid = %pU, fmt = %d\n",
229 228 __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
230 cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__); 229 cmn_err(CE_DEBUG, " log : uuid = %pU, fmt = %d\n",
231 for (b = 0; b < 16; b++) 230 &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
232 cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&mp->m_sb.sb_uuid)[b]);
233 cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
234 cmn_err(CE_DEBUG, " log : uuid = ");
235 for (b = 0; b < 16; b++)
236 cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&head->h_fs_uuid)[b]);
237 cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt));
238} 231}
239#else 232#else
240#define xlog_header_check_dump(mp, head) 233#define xlog_header_check_dump(mp, head)
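The rewritten dump leans on the kernel's %pU printf extension, which dereferences its argument as a 16-byte UUID and prints the canonical 8-4-4-4-12 form, replacing the removed 16-iteration byte loops. Minimal usage sketch (values hypothetical):

	u8 uuid[16] = { 0 };

	pr_debug("uuid = %pU\n", uuid);	/* e.g. 00000000-0000-0000-0000-000000000000 */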
@@ -810,7 +803,7 @@ xlog_find_head(
810 * We could speed up search by using current head_blk buffer, but it is not 803 * We could speed up search by using current head_blk buffer, but it is not
811 * available. 804 * available.
812 */ 805 */
813int 806STATIC int
814xlog_find_tail( 807xlog_find_tail(
815 xlog_t *log, 808 xlog_t *log,
816 xfs_daddr_t *head_blk, 809 xfs_daddr_t *head_blk,
@@ -1372,36 +1365,45 @@ xlog_clear_stale_blocks(
1372 1365
1373STATIC xlog_recover_t * 1366STATIC xlog_recover_t *
1374xlog_recover_find_tid( 1367xlog_recover_find_tid(
1375 xlog_recover_t *q, 1368 struct hlist_head *head,
1376 xlog_tid_t tid) 1369 xlog_tid_t tid)
1377{ 1370{
1378 xlog_recover_t *p = q; 1371 xlog_recover_t *trans;
1372 struct hlist_node *n;
1379 1373
1380 while (p != NULL) { 1374 hlist_for_each_entry(trans, n, head, r_list) {
1381 if (p->r_log_tid == tid) 1375 if (trans->r_log_tid == tid)
1382 break; 1376 return trans;
1383 p = p->r_next;
1384 } 1377 }
1385 return p; 1378 return NULL;
1386} 1379}
1387 1380
1388STATIC void 1381STATIC void
1389xlog_recover_put_hashq( 1382xlog_recover_new_tid(
1390 xlog_recover_t **q, 1383 struct hlist_head *head,
1391 xlog_recover_t *trans) 1384 xlog_tid_t tid,
1385 xfs_lsn_t lsn)
1392{ 1386{
1393 trans->r_next = *q; 1387 xlog_recover_t *trans;
1394 *q = trans; 1388
1389 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1390 trans->r_log_tid = tid;
1391 trans->r_lsn = lsn;
1392 INIT_LIST_HEAD(&trans->r_itemq);
1393
1394 INIT_HLIST_NODE(&trans->r_list);
1395 hlist_add_head(&trans->r_list, head);
1395} 1396}
1396 1397
1397STATIC void 1398STATIC void
1398xlog_recover_add_item( 1399xlog_recover_add_item(
1399 xlog_recover_item_t **itemq) 1400 struct list_head *head)
1400{ 1401{
1401 xlog_recover_item_t *item; 1402 xlog_recover_item_t *item;
1402 1403
1403 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); 1404 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1404 xlog_recover_insert_item_backq(itemq, item); 1405 INIT_LIST_HEAD(&item->ri_list);
1406 list_add_tail(&item->ri_list, head);
1405} 1407}
1406 1408
1407STATIC int 1409STATIC int
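The recovery transaction lookup now uses an open hash of hlist_heads instead of a hand-rolled singly linked queue. The core pattern, reduced to a sketch (names hypothetical; note the four-argument hlist_for_each_entry() of this kernel era, which carries an explicit node cursor):

#include <linux/list.h>

#define NR_BUCKETS 16
static struct hlist_head buckets[NR_BUCKETS];

struct entry {
	struct hlist_node link;
	unsigned int tid;
};

static struct entry *find(unsigned int tid)
{
	struct entry *e;
	struct hlist_node *n;

	hlist_for_each_entry(e, n, &buckets[tid % NR_BUCKETS], link) {
		if (e->tid == tid)
			return e;
	}
	return NULL;
}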
@@ -1414,8 +1416,7 @@ xlog_recover_add_to_cont_trans(
1414 xfs_caddr_t ptr, old_ptr; 1416 xfs_caddr_t ptr, old_ptr;
1415 int old_len; 1417 int old_len;
1416 1418
1417 item = trans->r_itemq; 1419 if (list_empty(&trans->r_itemq)) {
1418 if (item == NULL) {
1419 /* finish copying rest of trans header */ 1420 /* finish copying rest of trans header */
1420 xlog_recover_add_item(&trans->r_itemq); 1421 xlog_recover_add_item(&trans->r_itemq);
1421 ptr = (xfs_caddr_t) &trans->r_theader + 1422 ptr = (xfs_caddr_t) &trans->r_theader +
@@ -1423,7 +1424,8 @@ xlog_recover_add_to_cont_trans(
1423 memcpy(ptr, dp, len); /* d, s, l */ 1424 memcpy(ptr, dp, len); /* d, s, l */
1424 return 0; 1425 return 0;
1425 } 1426 }
1426 item = item->ri_prev; 1427 /* take the tail entry */
1428 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1427 1429
1428 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; 1430 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1429 old_len = item->ri_buf[item->ri_cnt-1].i_len; 1431 old_len = item->ri_buf[item->ri_cnt-1].i_len;
@@ -1460,8 +1462,7 @@ xlog_recover_add_to_trans(
1460 1462
1461 if (!len) 1463 if (!len)
1462 return 0; 1464 return 0;
1463 item = trans->r_itemq; 1465 if (list_empty(&trans->r_itemq)) {
1464 if (item == NULL) {
1465 /* we need to catch log corruptions here */ 1466 /* we need to catch log corruptions here */
1466 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { 1467 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1467 xlog_warn("XFS: xlog_recover_add_to_trans: " 1468 xlog_warn("XFS: xlog_recover_add_to_trans: "
@@ -1479,12 +1480,15 @@ xlog_recover_add_to_trans(
1479 memcpy(ptr, dp, len); 1480 memcpy(ptr, dp, len);
1480 in_f = (xfs_inode_log_format_t *)ptr; 1481 in_f = (xfs_inode_log_format_t *)ptr;
1481 1482
1482 if (item->ri_prev->ri_total != 0 && 1483 /* take the tail entry */
1483 item->ri_prev->ri_total == item->ri_prev->ri_cnt) { 1484 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1485 if (item->ri_total != 0 &&
1486 item->ri_total == item->ri_cnt) {
1487 /* tail item is in use, get a new one */
1484 xlog_recover_add_item(&trans->r_itemq); 1488 xlog_recover_add_item(&trans->r_itemq);
1489 item = list_entry(trans->r_itemq.prev,
1490 xlog_recover_item_t, ri_list);
1485 } 1491 }
1486 item = trans->r_itemq;
1487 item = item->ri_prev;
1488 1492
1489 if (item->ri_total == 0) { /* first region to be added */ 1493 if (item->ri_total == 0) { /* first region to be added */
1490 if (in_f->ilf_size == 0 || 1494 if (in_f->ilf_size == 0 ||
@@ -1509,96 +1513,29 @@ xlog_recover_add_to_trans(
1509 return 0; 1513 return 0;
1510} 1514}
1511 1515
1512STATIC void 1516/*
1513xlog_recover_new_tid( 1517 * Sort the log items in the transaction. Cancelled buffers need
1514 xlog_recover_t **q, 1518 * to be put first so they are processed before any items that might
1515 xlog_tid_t tid, 1519 * modify the buffers. If they are cancelled, then the modifications
1516 xfs_lsn_t lsn) 1520 * don't need to be replayed.
1517{ 1521 */
1518 xlog_recover_t *trans;
1519
1520 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1521 trans->r_log_tid = tid;
1522 trans->r_lsn = lsn;
1523 xlog_recover_put_hashq(q, trans);
1524}
1525
1526STATIC int
1527xlog_recover_unlink_tid(
1528 xlog_recover_t **q,
1529 xlog_recover_t *trans)
1530{
1531 xlog_recover_t *tp;
1532 int found = 0;
1533
1534 ASSERT(trans != NULL);
1535 if (trans == *q) {
1536 *q = (*q)->r_next;
1537 } else {
1538 tp = *q;
1539 while (tp) {
1540 if (tp->r_next == trans) {
1541 found = 1;
1542 break;
1543 }
1544 tp = tp->r_next;
1545 }
1546 if (!found) {
1547 xlog_warn(
1548 "XFS: xlog_recover_unlink_tid: trans not found");
1549 ASSERT(0);
1550 return XFS_ERROR(EIO);
1551 }
1552 tp->r_next = tp->r_next->r_next;
1553 }
1554 return 0;
1555}
1556
1557STATIC void
1558xlog_recover_insert_item_backq(
1559 xlog_recover_item_t **q,
1560 xlog_recover_item_t *item)
1561{
1562 if (*q == NULL) {
1563 item->ri_prev = item->ri_next = item;
1564 *q = item;
1565 } else {
1566 item->ri_next = *q;
1567 item->ri_prev = (*q)->ri_prev;
1568 (*q)->ri_prev = item;
1569 item->ri_prev->ri_next = item;
1570 }
1571}
1572
1573STATIC void
1574xlog_recover_insert_item_frontq(
1575 xlog_recover_item_t **q,
1576 xlog_recover_item_t *item)
1577{
1578 xlog_recover_insert_item_backq(q, item);
1579 *q = item;
1580}
1581
1582STATIC int 1522STATIC int
1583xlog_recover_reorder_trans( 1523xlog_recover_reorder_trans(
1584 xlog_recover_t *trans) 1524 xlog_recover_t *trans)
1585{ 1525{
1586 xlog_recover_item_t *first_item, *itemq, *itemq_next; 1526 xlog_recover_item_t *item, *n;
1587 xfs_buf_log_format_t *buf_f; 1527 LIST_HEAD(sort_list);
1588 ushort flags = 0;
1589 1528
1590 first_item = itemq = trans->r_itemq; 1529 list_splice_init(&trans->r_itemq, &sort_list);
1591 trans->r_itemq = NULL; 1530 list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1592 do { 1531 xfs_buf_log_format_t *buf_f;
1593 itemq_next = itemq->ri_next;
1594 buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr;
1595 1532
1596 switch (ITEM_TYPE(itemq)) { 1533 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
1534
1535 switch (ITEM_TYPE(item)) {
1597 case XFS_LI_BUF: 1536 case XFS_LI_BUF:
1598 flags = buf_f->blf_flags; 1537 if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) {
1599 if (!(flags & XFS_BLI_CANCEL)) { 1538 list_move(&item->ri_list, &trans->r_itemq);
1600 xlog_recover_insert_item_frontq(&trans->r_itemq,
1601 itemq);
1602 break; 1539 break;
1603 } 1540 }
1604 case XFS_LI_INODE: 1541 case XFS_LI_INODE:
@@ -1606,7 +1543,7 @@ xlog_recover_reorder_trans(
1606 case XFS_LI_QUOTAOFF: 1543 case XFS_LI_QUOTAOFF:
1607 case XFS_LI_EFD: 1544 case XFS_LI_EFD:
1608 case XFS_LI_EFI: 1545 case XFS_LI_EFI:
1609 xlog_recover_insert_item_backq(&trans->r_itemq, itemq); 1546 list_move_tail(&item->ri_list, &trans->r_itemq);
1610 break; 1547 break;
1611 default: 1548 default:
1612 xlog_warn( 1549 xlog_warn(
@@ -1614,8 +1551,8 @@ xlog_recover_reorder_trans(
1614 ASSERT(0); 1551 ASSERT(0);
1615 return XFS_ERROR(EIO); 1552 return XFS_ERROR(EIO);
1616 } 1553 }
1617 itemq = itemq_next; 1554 }
1618 } while (first_item != itemq); 1555 ASSERT(list_empty(&sort_list));
1619 return 0; 1556 return 0;
1620} 1557}
1621 1558
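The rewritten reorder is the standard splice-and-repartition idiom for list_head lists: splice the whole queue onto a local list, then move every node back to either end of the original. Its skeleton, with a hypothetical front_item() predicate standing in for the ITEM_TYPE() switch above:

static void reorder(struct list_head *itemq)
{
	xlog_recover_item_t *item, *n;
	LIST_HEAD(sort_list);

	list_splice_init(itemq, &sort_list);
	list_for_each_entry_safe(item, n, &sort_list, ri_list) {
		if (front_item(item))
			list_move(&item->ri_list, itemq);	/* to the head */
		else
			list_move_tail(&item->ri_list, itemq);	/* to the tail */
	}
}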
@@ -2206,6 +2143,7 @@ xlog_recover_do_buffer_trans(
2206 xfs_daddr_t blkno; 2143 xfs_daddr_t blkno;
2207 int len; 2144 int len;
2208 ushort flags; 2145 ushort flags;
2146 uint buf_flags;
2209 2147
2210 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr; 2148 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
2211 2149
@@ -2246,12 +2184,11 @@ xlog_recover_do_buffer_trans(
2246 } 2184 }
2247 2185
2248 mp = log->l_mp; 2186 mp = log->l_mp;
2249 if (flags & XFS_BLI_INODE_BUF) { 2187 buf_flags = XBF_LOCK;
2250 bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len, 2188 if (!(flags & XFS_BLI_INODE_BUF))
2251 XFS_BUF_LOCK); 2189 buf_flags |= XBF_MAPPED;
2252 } else { 2190
2253 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0); 2191 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
2254 }
2255 if (XFS_BUF_ISERROR(bp)) { 2192 if (XFS_BUF_ISERROR(bp)) {
2256 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, 2193 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
2257 bp, blkno); 2194 bp, blkno);
@@ -2350,8 +2287,8 @@ xlog_recover_do_inode_trans(
2350 goto error; 2287 goto error;
2351 } 2288 }
2352 2289
2353 bp = xfs_buf_read_flags(mp->m_ddev_targp, in_f->ilf_blkno, 2290 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
2354 in_f->ilf_len, XFS_BUF_LOCK); 2291 XBF_LOCK);
2355 if (XFS_BUF_ISERROR(bp)) { 2292 if (XFS_BUF_ISERROR(bp)) {
2356 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, 2293 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
2357 bp, in_f->ilf_blkno); 2294 bp, in_f->ilf_blkno);
@@ -2819,14 +2756,13 @@ xlog_recover_do_trans(
2819 int pass) 2756 int pass)
2820{ 2757{
2821 int error = 0; 2758 int error = 0;
2822 xlog_recover_item_t *item, *first_item; 2759 xlog_recover_item_t *item;
2823 2760
2824 error = xlog_recover_reorder_trans(trans); 2761 error = xlog_recover_reorder_trans(trans);
2825 if (error) 2762 if (error)
2826 return error; 2763 return error;
2827 2764
2828 first_item = item = trans->r_itemq; 2765 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2829 do {
2830 switch (ITEM_TYPE(item)) { 2766 switch (ITEM_TYPE(item)) {
2831 case XFS_LI_BUF: 2767 case XFS_LI_BUF:
2832 error = xlog_recover_do_buffer_trans(log, item, pass); 2768 error = xlog_recover_do_buffer_trans(log, item, pass);
@@ -2859,8 +2795,7 @@ xlog_recover_do_trans(
2859 2795
2860 if (error) 2796 if (error)
2861 return error; 2797 return error;
2862 item = item->ri_next; 2798 }
2863 } while (first_item != item);
2864 2799
2865 return 0; 2800 return 0;
2866} 2801}
@@ -2874,21 +2809,18 @@ STATIC void
2874xlog_recover_free_trans( 2809xlog_recover_free_trans(
2875 xlog_recover_t *trans) 2810 xlog_recover_t *trans)
2876{ 2811{
2877 xlog_recover_item_t *first_item, *item, *free_item; 2812 xlog_recover_item_t *item, *n;
2878 int i; 2813 int i;
2879 2814
2880 item = first_item = trans->r_itemq; 2815 list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
2881 do { 2816 /* Free the regions in the item. */
2882 free_item = item; 2817 list_del(&item->ri_list);
2883 item = item->ri_next; 2818 for (i = 0; i < item->ri_cnt; i++)
2884 /* Free the regions in the item. */ 2819 kmem_free(item->ri_buf[i].i_addr);
2885 for (i = 0; i < free_item->ri_cnt; i++) {
2886 kmem_free(free_item->ri_buf[i].i_addr);
2887 }
2888 /* Free the item itself */ 2820 /* Free the item itself */
2889 kmem_free(free_item->ri_buf); 2821 kmem_free(item->ri_buf);
2890 kmem_free(free_item); 2822 kmem_free(item);
2891 } while (first_item != item); 2823 }
2892 /* Free the transaction recover structure */ 2824 /* Free the transaction recover structure */
2893 kmem_free(trans); 2825 kmem_free(trans);
2894} 2826}
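The _safe iterator is what makes this free loop legal: it samples the next pointer before the body runs, so kmem_free() of the current node cannot break the walk. The idiom in miniature (item and n are the two cursors, as in the code above):

	list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
		list_del(&item->ri_list);
		kmem_free(item);	/* safe: 'n' was read before the free */
	}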
@@ -2896,14 +2828,12 @@ xlog_recover_free_trans(
2896STATIC int 2828STATIC int
2897xlog_recover_commit_trans( 2829xlog_recover_commit_trans(
2898 xlog_t *log, 2830 xlog_t *log,
2899 xlog_recover_t **q,
2900 xlog_recover_t *trans, 2831 xlog_recover_t *trans,
2901 int pass) 2832 int pass)
2902{ 2833{
2903 int error; 2834 int error;
2904 2835
2905 if ((error = xlog_recover_unlink_tid(q, trans))) 2836 hlist_del(&trans->r_list);
2906 return error;
2907 if ((error = xlog_recover_do_trans(log, trans, pass))) 2837 if ((error = xlog_recover_do_trans(log, trans, pass)))
2908 return error; 2838 return error;
2909 xlog_recover_free_trans(trans); /* no error */ 2839 xlog_recover_free_trans(trans); /* no error */
@@ -2931,7 +2861,7 @@ xlog_recover_unmount_trans(
2931STATIC int 2861STATIC int
2932xlog_recover_process_data( 2862xlog_recover_process_data(
2933 xlog_t *log, 2863 xlog_t *log,
2934 xlog_recover_t *rhash[], 2864 struct hlist_head rhash[],
2935 xlog_rec_header_t *rhead, 2865 xlog_rec_header_t *rhead,
2936 xfs_caddr_t dp, 2866 xfs_caddr_t dp,
2937 int pass) 2867 int pass)
@@ -2965,7 +2895,7 @@ xlog_recover_process_data(
2965 } 2895 }
2966 tid = be32_to_cpu(ohead->oh_tid); 2896 tid = be32_to_cpu(ohead->oh_tid);
2967 hash = XLOG_RHASH(tid); 2897 hash = XLOG_RHASH(tid);
2968 trans = xlog_recover_find_tid(rhash[hash], tid); 2898 trans = xlog_recover_find_tid(&rhash[hash], tid);
2969 if (trans == NULL) { /* not found; add new tid */ 2899 if (trans == NULL) { /* not found; add new tid */
2970 if (ohead->oh_flags & XLOG_START_TRANS) 2900 if (ohead->oh_flags & XLOG_START_TRANS)
2971 xlog_recover_new_tid(&rhash[hash], tid, 2901 xlog_recover_new_tid(&rhash[hash], tid,
@@ -2983,7 +2913,7 @@ xlog_recover_process_data(
2983 switch (flags) { 2913 switch (flags) {
2984 case XLOG_COMMIT_TRANS: 2914 case XLOG_COMMIT_TRANS:
2985 error = xlog_recover_commit_trans(log, 2915 error = xlog_recover_commit_trans(log,
2986 &rhash[hash], trans, pass); 2916 trans, pass);
2987 break; 2917 break;
2988 case XLOG_UNMOUNT_TRANS: 2918 case XLOG_UNMOUNT_TRANS:
2989 error = xlog_recover_unmount_trans(trans); 2919 error = xlog_recover_unmount_trans(trans);
@@ -3216,7 +3146,7 @@ xlog_recover_process_one_iunlink(
3216 /* 3146 /*
3217 * Get the on disk inode to find the next inode in the bucket. 3147 * Get the on disk inode to find the next inode in the bucket.
3218 */ 3148 */
3219 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK); 3149 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK);
3220 if (error) 3150 if (error)
3221 goto fail_iput; 3151 goto fail_iput;
3222 3152
@@ -3517,12 +3447,12 @@ xlog_do_recovery_pass(
3517{ 3447{
3518 xlog_rec_header_t *rhead; 3448 xlog_rec_header_t *rhead;
3519 xfs_daddr_t blk_no; 3449 xfs_daddr_t blk_no;
3520 xfs_caddr_t bufaddr, offset; 3450 xfs_caddr_t offset;
3521 xfs_buf_t *hbp, *dbp; 3451 xfs_buf_t *hbp, *dbp;
3522 int error = 0, h_size; 3452 int error = 0, h_size;
3523 int bblks, split_bblks; 3453 int bblks, split_bblks;
3524 int hblks, split_hblks, wrapped_hblks; 3454 int hblks, split_hblks, wrapped_hblks;
3525 xlog_recover_t *rhash[XLOG_RHASH_SIZE]; 3455 struct hlist_head rhash[XLOG_RHASH_SIZE];
3526 3456
3527 ASSERT(head_blk != tail_blk); 3457 ASSERT(head_blk != tail_blk);
3528 3458
@@ -3610,7 +3540,7 @@ xlog_do_recovery_pass(
3610 /* 3540 /*
3611 * Check for header wrapping around physical end-of-log 3541 * Check for header wrapping around physical end-of-log
3612 */ 3542 */
3613 offset = NULL; 3543 offset = XFS_BUF_PTR(hbp);
3614 split_hblks = 0; 3544 split_hblks = 0;
3615 wrapped_hblks = 0; 3545 wrapped_hblks = 0;
3616 if (blk_no + hblks <= log->l_logBBsize) { 3546 if (blk_no + hblks <= log->l_logBBsize) {
@@ -3646,9 +3576,8 @@ xlog_do_recovery_pass(
3646 * - order is important. 3576 * - order is important.
3647 */ 3577 */
3648 wrapped_hblks = hblks - split_hblks; 3578 wrapped_hblks = hblks - split_hblks;
3649 bufaddr = XFS_BUF_PTR(hbp);
3650 error = XFS_BUF_SET_PTR(hbp, 3579 error = XFS_BUF_SET_PTR(hbp,
3651 bufaddr + BBTOB(split_hblks), 3580 offset + BBTOB(split_hblks),
3652 BBTOB(hblks - split_hblks)); 3581 BBTOB(hblks - split_hblks));
3653 if (error) 3582 if (error)
3654 goto bread_err2; 3583 goto bread_err2;
@@ -3658,14 +3587,10 @@ xlog_do_recovery_pass(
3658 if (error) 3587 if (error)
3659 goto bread_err2; 3588 goto bread_err2;
3660 3589
3661 error = XFS_BUF_SET_PTR(hbp, bufaddr, 3590 error = XFS_BUF_SET_PTR(hbp, offset,
3662 BBTOB(hblks)); 3591 BBTOB(hblks));
3663 if (error) 3592 if (error)
3664 goto bread_err2; 3593 goto bread_err2;
3665
3666 if (!offset)
3667 offset = xlog_align(log, 0,
3668 wrapped_hblks, hbp);
3669 } 3594 }
3670 rhead = (xlog_rec_header_t *)offset; 3595 rhead = (xlog_rec_header_t *)offset;
3671 error = xlog_valid_rec_header(log, rhead, 3596 error = xlog_valid_rec_header(log, rhead,
@@ -3685,7 +3610,7 @@ xlog_do_recovery_pass(
3685 } else { 3610 } else {
3686 /* This log record is split across the 3611 /* This log record is split across the
3687 * physical end of log */ 3612 * physical end of log */
3688 offset = NULL; 3613 offset = XFS_BUF_PTR(dbp);
3689 split_bblks = 0; 3614 split_bblks = 0;
3690 if (blk_no != log->l_logBBsize) { 3615 if (blk_no != log->l_logBBsize) {
3691 /* some data is before the physical 3616 /* some data is before the physical
@@ -3714,9 +3639,8 @@ xlog_do_recovery_pass(
3714 * _first_, then the log start (LR header end) 3639 * _first_, then the log start (LR header end)
3715 * - order is important. 3640 * - order is important.
3716 */ 3641 */
3717 bufaddr = XFS_BUF_PTR(dbp);
3718 error = XFS_BUF_SET_PTR(dbp, 3642 error = XFS_BUF_SET_PTR(dbp,
3719 bufaddr + BBTOB(split_bblks), 3643 offset + BBTOB(split_bblks),
3720 BBTOB(bblks - split_bblks)); 3644 BBTOB(bblks - split_bblks));
3721 if (error) 3645 if (error)
3722 goto bread_err2; 3646 goto bread_err2;
@@ -3727,13 +3651,9 @@ xlog_do_recovery_pass(
3727 if (error) 3651 if (error)
3728 goto bread_err2; 3652 goto bread_err2;
3729 3653
3730 error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size); 3654 error = XFS_BUF_SET_PTR(dbp, offset, h_size);
3731 if (error) 3655 if (error)
3732 goto bread_err2; 3656 goto bread_err2;
3733
3734 if (!offset)
3735 offset = xlog_align(log, wrapped_hblks,
3736 bblks - split_bblks, dbp);
3737 } 3657 }
3738 xlog_unpack_data(rhead, offset, log); 3658 xlog_unpack_data(rhead, offset, log);
3739 if ((error = xlog_recover_process_data(log, rhash, 3659 if ((error = xlog_recover_process_data(log, rhash,
@@ -3993,8 +3913,7 @@ xlog_recover_finish(
3993 * case the unlink transactions would have problems 3913 * case the unlink transactions would have problems
3994 * pushing the EFIs out of the way. 3914 * pushing the EFIs out of the way.
3995 */ 3915 */
3996 xfs_log_force(log->l_mp, (xfs_lsn_t)0, 3916 xfs_log_force(log->l_mp, XFS_LOG_SYNC);
3997 (XFS_LOG_FORCE | XFS_LOG_SYNC));
3998 3917
3999 xlog_recover_process_iunlinks(log); 3918 xlog_recover_process_iunlinks(log);
4000 3919
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
index b22545555301..75d749207258 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/xfs_log_recover.h
@@ -35,22 +35,21 @@
  * item headers are in ri_buf[0].  Additional buffers follow.
  */
 typedef struct xlog_recover_item {
-	struct xlog_recover_item *ri_next;
-	struct xlog_recover_item *ri_prev;
-	int			 ri_type;
-	int			 ri_cnt;	/* count of regions found */
-	int			 ri_total;	/* total regions */
-	xfs_log_iovec_t		 *ri_buf;	/* ptr to regions buffer */
+	struct list_head	ri_list;
+	int			ri_type;
+	int			ri_cnt;	/* count of regions found */
+	int			ri_total;	/* total regions */
+	xfs_log_iovec_t		*ri_buf;	/* ptr to regions buffer */
 } xlog_recover_item_t;
 
 struct xlog_tid;
 typedef struct xlog_recover {
-	struct xlog_recover *r_next;
+	struct hlist_node	r_list;
 	xlog_tid_t		r_log_tid;	/* log's transaction id */
 	xfs_trans_header_t	r_theader;	/* trans header for partial */
 	int			r_state;	/* not needed */
 	xfs_lsn_t		r_lsn;		/* xact lsn */
-	xlog_recover_item_t *r_itemq;	/* q for items */
+	struct list_head	r_itemq;	/* q for items */
 } xlog_recover_t;
 
 #define ITEM_TYPE(i)	(*(ushort *)(i)->ri_buf[0].i_addr)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 8b6c9e807efb..e79b56b4bca6 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -44,6 +44,8 @@
 #include "xfs_quota.h"
 #include "xfs_fsops.h"
 #include "xfs_utils.h"
+#include "xfs_trace.h"
+
 
 STATIC void	xfs_unmountfs_wait(xfs_mount_t *);
 
@@ -199,6 +201,38 @@ xfs_uuid_unmount(
 
 
 /*
+ * Reference counting access wrappers to the perag structures.
+ */
+struct xfs_perag *
+xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
+{
+	struct xfs_perag	*pag;
+	int			ref = 0;
+
+	spin_lock(&mp->m_perag_lock);
+	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+	if (pag) {
+		ASSERT(atomic_read(&pag->pag_ref) >= 0);
+		/* catch leaks in the positive direction during testing */
+		ASSERT(atomic_read(&pag->pag_ref) < 1000);
+		ref = atomic_inc_return(&pag->pag_ref);
+	}
+	spin_unlock(&mp->m_perag_lock);
+	trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
+	return pag;
+}
+
+void
+xfs_perag_put(struct xfs_perag *pag)
+{
+	int	ref;
+
+	ASSERT(atomic_read(&pag->pag_ref) > 0);
+	ref = atomic_dec_return(&pag->pag_ref);
+	trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
+}
+
+/*
  * Free up the resources associated with a mount structure.  Assume that
  * the structure was initially zeroed, so we can tell which fields got
  * initialized.
@@ -207,13 +241,16 @@ STATIC void
 xfs_free_perag(
 	xfs_mount_t	*mp)
 {
-	if (mp->m_perag) {
-		int	agno;
+	xfs_agnumber_t	agno;
+	struct xfs_perag *pag;
 
-		for (agno = 0; agno < mp->m_maxagi; agno++)
-			if (mp->m_perag[agno].pagb_list)
-				kmem_free(mp->m_perag[agno].pagb_list);
-		kmem_free(mp->m_perag);
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		spin_lock(&mp->m_perag_lock);
+		pag = radix_tree_delete(&mp->m_perag_tree, agno);
+		ASSERT(pag);
+		ASSERT(atomic_read(&pag->pag_ref) == 0);
+		spin_unlock(&mp->m_perag_lock);
+		kmem_free(pag);
 	}
 }
 
@@ -387,22 +424,57 @@ xfs_initialize_perag_icache(
 	}
 }
 
-xfs_agnumber_t
+int
 xfs_initialize_perag(
 	xfs_mount_t	*mp,
-	xfs_agnumber_t	agcount)
+	xfs_agnumber_t	agcount,
+	xfs_agnumber_t	*maxagi)
 {
 	xfs_agnumber_t	index, max_metadata;
+	xfs_agnumber_t	first_initialised = 0;
 	xfs_perag_t	*pag;
 	xfs_agino_t	agino;
 	xfs_ino_t	ino;
 	xfs_sb_t	*sbp = &mp->m_sb;
 	xfs_ino_t	max_inum = XFS_MAXINUMBER_32;
+	int		error = -ENOMEM;
 
 	/* Check to see if the filesystem can overflow 32 bit inodes */
 	agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
 	ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
 
+	/*
+	 * Walk the current per-ag tree so we don't try to initialise AGs
+	 * that already exist (growfs case). Allocate and insert all the
+	 * AGs we don't find ready for initialisation.
+	 */
+	for (index = 0; index < agcount; index++) {
+		pag = xfs_perag_get(mp, index);
+		if (pag) {
+			xfs_perag_put(pag);
+			continue;
+		}
+		if (!first_initialised)
+			first_initialised = index;
+		pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
+		if (!pag)
+			goto out_unwind;
+		if (radix_tree_preload(GFP_NOFS))
+			goto out_unwind;
+		spin_lock(&mp->m_perag_lock);
+		if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
+			BUG();
+			spin_unlock(&mp->m_perag_lock);
+			radix_tree_preload_end();
+			error = -EEXIST;
+			goto out_unwind;
+		}
+		pag->pag_agno = index;
+		pag->pag_mount = mp;
+		spin_unlock(&mp->m_perag_lock);
+		radix_tree_preload_end();
+	}
+
 	/* Clear the mount flag if no inode can overflow 32 bits
 	 * on this filesystem, or if specifically requested..
 	 */
@@ -436,21 +508,33 @@ xfs_initialize_perag(
 			}
 
 			/* This ag is preferred for inodes */
-			pag = &mp->m_perag[index];
+			pag = xfs_perag_get(mp, index);
 			pag->pagi_inodeok = 1;
 			if (index < max_metadata)
 				pag->pagf_metadata = 1;
 			xfs_initialize_perag_icache(pag);
+			xfs_perag_put(pag);
 		}
 	} else {
 		/* Setup default behavior for smaller filesystems */
 		for (index = 0; index < agcount; index++) {
-			pag = &mp->m_perag[index];
+			pag = xfs_perag_get(mp, index);
 			pag->pagi_inodeok = 1;
 			xfs_initialize_perag_icache(pag);
+			xfs_perag_put(pag);
 		}
 	}
-	return index;
+	if (maxagi)
+		*maxagi = index;
+	return 0;
+
+out_unwind:
+	kmem_free(pag);
+	for (; index > first_initialised; index--) {
+		pag = radix_tree_delete(&mp->m_perag_tree, index);
+		kmem_free(pag);
+	}
+	return error;
 }
 
 void
@@ -581,10 +665,10 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 	 * access to the superblock.
 	 */
 	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
-	extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED;
+	extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED;
 
-	bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR,
-				BTOBB(sector_size), extra_flags);
+	bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size),
+			  extra_flags);
 	if (!bp || XFS_BUF_ISERROR(bp)) {
 		xfs_fs_mount_cmn_err(flags, "SB read failed");
 		error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
@@ -624,8 +708,8 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 		XFS_BUF_UNMANAGE(bp);
 		xfs_buf_relse(bp);
 		sector_size = mp->m_sb.sb_sectsize;
-		bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR,
-					BTOBB(sector_size), extra_flags);
+		bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR,
+				  BTOBB(sector_size), extra_flags);
 		if (!bp || XFS_BUF_ISERROR(bp)) {
 			xfs_fs_mount_cmn_err(flags, "SB re-read failed");
 			error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
@@ -729,12 +813,13 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
 		error = xfs_ialloc_pagi_init(mp, NULL, index);
 		if (error)
 			return error;
-		pag = &mp->m_perag[index];
+		pag = xfs_perag_get(mp, index);
 		ifree += pag->pagi_freecount;
 		ialloc += pag->pagi_count;
 		bfree += pag->pagf_freeblks;
 		bfreelst += pag->pagf_flcount;
 		btree += pag->pagf_btreeblks;
+		xfs_perag_put(pag);
 	}
 	/*
 	 * Overwrite incore superblock counters with just-read data
@@ -1006,6 +1091,24 @@ xfs_mount_reset_sbqflags(
 	return xfs_trans_commit(tp, 0);
 }
 
+__uint64_t
+xfs_default_resblks(xfs_mount_t *mp)
+{
+	__uint64_t resblks;
+
+	/*
+	 * We default to 5% or 8192 fsbs of space reserved, whichever is
+	 * smaller.  This is intended to cover concurrent allocation
+	 * transactions when we initially hit enospc. These each require a 4
+	 * block reservation. Hence by default we cover roughly 2000 concurrent
+	 * allocation reservations.
+	 */
+	resblks = mp->m_sb.sb_dblocks;
+	do_div(resblks, 20);
+	resblks = min_t(__uint64_t, resblks, 8192);
+	return resblks;
+}
+
 /*
  * This function does the following on an initial mount of a file system:
  *	- reads the superblock from disk and init the mount struct
@@ -1150,13 +1253,13 @@ xfs_mountfs(
 	/*
 	 * Allocate and initialize the per-ag data.
 	 */
-	init_rwsem(&mp->m_peraglock);
-	mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t),
-				  KM_MAYFAIL);
-	if (!mp->m_perag)
+	spin_lock_init(&mp->m_perag_lock);
+	INIT_RADIX_TREE(&mp->m_perag_tree, GFP_NOFS);
+	error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
+	if (error) {
+		cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error);
 		goto out_remove_uuid;
-
-	mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
+	}
 
 	if (!sbp->sb_logblocks) {
 		cmn_err(CE_WARN, "XFS: no log defined");
@@ -1317,17 +1420,16 @@ xfs_mountfs(
 	 * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
 	 * are not allowed to use this reserved space.
 	 *
-	 * We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
 	 * This may drive us straight to ENOSPC on mount, but that implies
 	 * we were already there on the last unmount. Warn if this occurs.
 	 */
-	resblks = mp->m_sb.sb_dblocks;
-	do_div(resblks, 20);
-	resblks = min_t(__uint64_t, resblks, 1024);
-	error = xfs_reserve_blocks(mp, &resblks, NULL);
-	if (error)
-		cmn_err(CE_WARN, "XFS: Unable to allocate reserve blocks. "
-				"Continuing without a reserve pool.");
+	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+		resblks = xfs_default_resblks(mp);
+		error = xfs_reserve_blocks(mp, &resblks, NULL);
+		if (error)
+			cmn_err(CE_WARN, "XFS: Unable to allocate reserve "
+				"blocks. Continuing without a reserve pool.");
+	}
 
 	return 0;
 
@@ -1370,8 +1472,19 @@ xfs_unmountfs(
 	 * push out the iclog we will never get that unlocked. hence we
 	 * need to force the log first.
 	 */
-	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
-	xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC);
+	xfs_log_force(mp, XFS_LOG_SYNC);
+
+	/*
+	 * Do a delwri reclaim pass first so that as many dirty inodes are
+	 * queued up for IO as possible. Then flush the buffers before making
+	 * a synchronous path to catch all the remaining inodes are reclaimed.
+	 * This makes the reclaim process as quick as possible by avoiding
+	 * synchronous writeout and blocking on inodes already in the delwri
+	 * state as much as possible.
+	 */
+	xfs_reclaim_inodes(mp, 0);
+	XFS_bflush(mp->m_ddev_targp);
+	xfs_reclaim_inodes(mp, SYNC_WAIT);
 
 	xfs_qm_unmount(mp);
 
@@ -1380,7 +1493,7 @@
 	 * that nothing is pinned.  This is important because bflush()
 	 * will skip pinned buffers.
 	 */
-	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+	xfs_log_force(mp, XFS_LOG_SYNC);
 
 	xfs_binval(mp->m_ddev_targp);
 	if (mp->m_rtdev_targp) {
@@ -1471,7 +1584,7 @@ xfs_log_sbcount(
 	if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
 		return 0;
 
-	tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT);
+	tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
 	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
 					XFS_DEFAULT_LOG_COUNT);
 	if (error) {
@@ -1546,15 +1659,14 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
 	xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
 
 	/* find modified range */
+	f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
+	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+	last = xfs_sb_info[f + 1].offset - 1;
 
 	f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
 	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
 	first = xfs_sb_info[f].offset;
 
-	f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
-	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
-	last = xfs_sb_info[f + 1].offset - 1;
-
 	xfs_trans_log_buf(tp, bp, first, last);
 }
 
@@ -1618,26 +1730,30 @@ xfs_mod_incore_sb_unlocked(
 				lcounter += rem;
 			}
 		} else {				/* Taking blocks away */
-
 			lcounter += delta;
+			if (lcounter >= 0) {
+				mp->m_sb.sb_fdblocks = lcounter +
+							XFS_ALLOC_SET_ASIDE(mp);
+				return 0;
+			}
 
 			/*
-			 * If were out of blocks, use any available reserved blocks if
-			 * were allowed to.
+			 * We are out of blocks, use any available reserved
+			 * blocks if were allowed to.
 			 */
+			if (!rsvd)
+				return XFS_ERROR(ENOSPC);
 
-			if (lcounter < 0) {
-				if (rsvd) {
-					lcounter = (long long)mp->m_resblks_avail + delta;
-					if (lcounter < 0) {
-						return XFS_ERROR(ENOSPC);
-					}
-					mp->m_resblks_avail = lcounter;
-					return 0;
-				} else {	/* not reserved */
-					return XFS_ERROR(ENOSPC);
-				}
+			lcounter = (long long)mp->m_resblks_avail + delta;
+			if (lcounter >= 0) {
+				mp->m_resblks_avail = lcounter;
+				return 0;
 			}
+			printk_once(KERN_WARNING
+				"Filesystem \"%s\": reserve blocks depleted! "
+				"Consider increasing reserve pool size.",
+				mp->m_fsname);
+			return XFS_ERROR(ENOSPC);
 		}
 
 		mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
@@ -1885,7 +2001,7 @@ xfs_getsb(
 
 	ASSERT(mp->m_sb_bp != NULL);
 	bp = mp->m_sb_bp;
-	if (flags & XFS_BUF_TRYLOCK) {
+	if (flags & XBF_TRYLOCK) {
 		if (!XFS_BUF_CPSEMA(bp)) {
 			return NULL;
 		}
@@ -1945,6 +2061,26 @@ xfs_mount_log_sb(
 	return error;
 }
 
+/*
+ * If the underlying (data/log/rt) device is readonly, there are some
+ * operations that cannot proceed.
+ */
+int
+xfs_dev_is_read_only(
+	struct xfs_mount	*mp,
+	char			*message)
+{
+	if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
+	    xfs_readonly_buftarg(mp->m_logdev_targp) ||
+	    (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
+		cmn_err(CE_NOTE,
+			"XFS: %s required on read-only device.", message);
+		cmn_err(CE_NOTE,
+			"XFS: write access unavailable, cannot proceed.");
+		return EROFS;
+	}
+	return 0;
+}
 
 #ifdef HAVE_PERCPU_SB
 /*
@@ -2123,7 +2259,7 @@ xfs_icsb_destroy_counters(
 	mutex_destroy(&mp->m_icsb_mutex);
 }
 
-STATIC_INLINE void
+STATIC void
 xfs_icsb_lock_cntr(
 	xfs_icsb_cnts_t	*icsbp)
 {
@@ -2132,7 +2268,7 @@ xfs_icsb_lock_cntr(
 	}
 }
 
-STATIC_INLINE void
+STATIC void
 xfs_icsb_unlock_cntr(
 	xfs_icsb_cnts_t	*icsbp)
 {
@@ -2140,7 +2276,7 @@ xfs_icsb_unlock_cntr(
 }
 
 
-STATIC_INLINE void
+STATIC void
 xfs_icsb_lock_all_counters(
 	xfs_mount_t	*mp)
 {
@@ -2153,7 +2289,7 @@ xfs_icsb_lock_all_counters(
 	}
 }
 
-STATIC_INLINE void
+STATIC void
 xfs_icsb_unlock_all_counters(
 	xfs_mount_t	*mp)
 {
@@ -2389,12 +2525,12 @@ xfs_icsb_modify_counters(
 {
 	xfs_icsb_cnts_t	*icsbp;
 	long long	lcounter;	/* long counter for 64 bit fields */
-	int		cpu, ret = 0;
+	int		ret = 0;
 
 	might_sleep();
 again:
-	cpu = get_cpu();
-	icsbp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu);
+	preempt_disable();
+	icsbp = this_cpu_ptr(mp->m_sb_cnts);
 
 	/*
 	 * if the counter is disabled, go to slow path
@@ -2438,11 +2574,11 @@ again:
 		break;
 	}
 	xfs_icsb_unlock_cntr(icsbp);
-	put_cpu();
+	preempt_enable();
 	return 0;
 
 slow_path:
-	put_cpu();
+	preempt_enable();
 
 	/*
 	 * serialise with a mutex so we don't burn lots of cpu on
@@ -2490,7 +2626,7 @@ slow_path:
 
 balance_counter:
 	xfs_icsb_unlock_cntr(icsbp);
-	put_cpu();
+	preempt_enable();
 
 	/*
 	 * We may have multiple threads here if multiple per-cpu
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a6c023bc0fb2..4fa0bc7b983e 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -78,7 +78,8 @@ typedef int (*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
 typedef int	(*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
 			struct xfs_inode *, dm_right_t,
 			struct xfs_inode *, dm_right_t,
-			const char *, const char *, mode_t, int, int);
+			const unsigned char *, const unsigned char *,
+			mode_t, int, int);
 typedef int	(*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
 			char *, char *);
 typedef void	(*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
@@ -93,6 +94,9 @@ typedef struct xfs_dmops {
 	xfs_send_unmount_t	xfs_send_unmount;
 } xfs_dmops_t;
 
+#define XFS_DMAPI_UNMOUNT_FLAGS(mp) \
+	(((mp)->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ? 0 : DM_FLAGS_UNWANTED)
+
 #define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \
 	(*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock)
 #define XFS_SEND_MMAP(mp, vma,fl) \
@@ -101,12 +105,24 @@
 	(*(mp)->m_dm_ops->xfs_send_destroy)(ip,right)
 #define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
 	(*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl)
-#define XFS_SEND_PREUNMOUNT(mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
-	(*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT,mp,b1,r1,b2,r2,n1,n2,mode,rval,fl)
 #define XFS_SEND_MOUNT(mp,right,path,name) \
 	(*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name)
-#define XFS_SEND_UNMOUNT(mp, ip,right,mode,rval,fl) \
-	(*(mp)->m_dm_ops->xfs_send_unmount)(mp,ip,right,mode,rval,fl)
+#define XFS_SEND_PREUNMOUNT(mp) \
+do { \
+	if (mp->m_flags & XFS_MOUNT_DMAPI) { \
+		(*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT, mp, \
+			(mp)->m_rootip, DM_RIGHT_NULL, \
+			(mp)->m_rootip, DM_RIGHT_NULL, \
+			NULL, NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \
+	} \
+} while (0)
+#define XFS_SEND_UNMOUNT(mp) \
+do { \
+	if (mp->m_flags & XFS_MOUNT_DMAPI) { \
+		(*(mp)->m_dm_ops->xfs_send_unmount)(mp, (mp)->m_rootip, \
+			DM_RIGHT_NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \
+	} \
+} while (0)
 
 
 #ifdef HAVE_PERCPU_SB
@@ -192,8 +208,8 @@ typedef struct xfs_mount {
 	uint			m_ag_maxlevels;	/* XFS_AG_MAXLEVELS */
 	uint			m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
 	uint			m_in_maxlevels;	/* max inobt btree levels. */
-	struct xfs_perag	*m_perag;	/* per-ag accounting info */
-	struct rw_semaphore	m_peraglock;	/* lock for m_perag (pointer) */
+	struct radix_tree_root	m_perag_tree;	/* per-ag accounting info */
+	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
 	struct mutex		m_growlock;	/* growfs mutex */
 	int			m_fixedfsid[2];	/* unchanged for life of FS */
 	uint			m_dmevmask;	/* DMI events for this FS */
@@ -209,6 +225,7 @@ typedef struct xfs_mount {
 	__uint64_t		m_maxioffset;	/* maximum inode offset */
 	__uint64_t		m_resblks;	/* total reserved blocks */
 	__uint64_t		m_resblks_avail;/* available reserved blocks */
+	__uint64_t		m_resblks_save;	/* reserved blks @ remount,ro */
 	int			m_dalign;	/* stripe unit */
 	int			m_swidth;	/* stripe width */
 	int			m_sinoalign;	/* stripe unit inode alignment */
@@ -228,7 +245,7 @@ typedef struct xfs_mount {
 	struct xfs_qmops	*m_qm_ops;	/* vector of XQM ops */
 	atomic_t		m_active_trans;	/* number trans frozen */
 #ifdef HAVE_PERCPU_SB
-	xfs_icsb_cnts_t		*m_sb_cnts;	/* per-cpu superblock counters */
+	xfs_icsb_cnts_t	__percpu *m_sb_cnts;	/* per-cpu superblock counters */
 	unsigned long		m_icsb_counters; /* disabled per-cpu counters */
 	struct notifier_block	m_icsb_notifier; /* hotplug cpu notifier */
 	struct mutex		m_icsb_mutex;	/* balancer sync lock */
@@ -369,31 +386,22 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
 }
 
 /*
- * perag get/put wrappers for eventual ref counting
+ * perag get/put wrappers for ref counting
  */
-static inline xfs_perag_t *
-xfs_get_perag(struct xfs_mount *mp, xfs_ino_t ino)
-{
-	return &mp->m_perag[XFS_INO_TO_AGNO(mp, ino)];
-}
-
-static inline void
-xfs_put_perag(struct xfs_mount *mp, xfs_perag_t *pag)
-{
-	/* nothing to see here, move along */
-}
+struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
+void	xfs_perag_put(struct xfs_perag *pag);
 
 /*
  * Per-cpu superblock locking functions
  */
 #ifdef HAVE_PERCPU_SB
-STATIC_INLINE void
+static inline void
 xfs_icsb_lock(xfs_mount_t *mp)
 {
 	mutex_lock(&mp->m_icsb_mutex);
 }
 
-STATIC_INLINE void
+static inline void
 xfs_icsb_unlock(xfs_mount_t *mp)
 {
 	mutex_unlock(&mp->m_icsb_mutex);
@@ -413,6 +421,7 @@ typedef struct xfs_mod_sb {
 } xfs_mod_sb_t;
 
 extern int	xfs_log_sbcount(xfs_mount_t *, uint);
+extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
 extern int	xfs_mountfs(xfs_mount_t *mp);
 
 extern void	xfs_unmountfs(xfs_mount_t *);
@@ -427,6 +436,8 @@ extern void xfs_freesb(xfs_mount_t *);
 extern int	xfs_fs_writable(xfs_mount_t *);
 extern int	xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
 
+extern int	xfs_dev_is_read_only(struct xfs_mount *, char *);
+
 extern int	xfs_dmops_get(struct xfs_mount *);
 extern void	xfs_dmops_put(struct xfs_mount *);
 
@@ -435,7 +446,8 @@ extern struct xfs_dmops xfs_dmcore_xfs;
 #endif	/* __KERNEL__ */
 
 extern void	xfs_mod_sb(struct xfs_trans *, __int64_t);
-extern xfs_agnumber_t	xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t);
+extern int	xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
+					xfs_agnumber_t *);
 extern void	xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
 extern void	xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
 
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 4b0613d99faa..45ce15dc5b2b 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -398,7 +398,7 @@ exit:
  * guaranteed that all the free functions for all the elements have finished
  * executing and the reaper is not running.
  */
-void
+static void
 xfs_mru_cache_flush(
 	xfs_mru_cache_t		*mru)
 {
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index 5d439f34b0c9..36dd3ec8b4eb 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -42,7 +42,6 @@ void xfs_mru_cache_uninit(void);
 int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms,
 			     unsigned int grp_count,
 			     xfs_mru_cache_free_func_t free_func);
-void xfs_mru_cache_flush(xfs_mru_cache_t *mru);
 void xfs_mru_cache_destroy(struct xfs_mru_cache *mru);
 int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
 				void *value);
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 3ec91ac74c2a..fdcab3f81dde 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -92,6 +92,14 @@ typedef struct xfs_dqblk {
 
 #define XFS_DQ_ALLTYPES		(XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
 
+#define XFS_DQ_FLAGS \
+	{ XFS_DQ_USER,		"USER" }, \
+	{ XFS_DQ_PROJ,		"PROJ" }, \
+	{ XFS_DQ_GROUP,		"GROUP" }, \
+	{ XFS_DQ_DIRTY,		"DIRTY" }, \
+	{ XFS_DQ_WANT,		"WANT" }, \
+	{ XFS_DQ_INACTIVE,	"INACTIVE" }
+
 /*
  * In the worst case, when both user and group quotas are on,
  * we can have a max of three dquots changing in a single transaction.
@@ -215,16 +223,9 @@ typedef struct xfs_qoff_logformat {
 #define XFS_QMOPT_RES_INOS	0x0800000
 
 /*
- * flags for dqflush and dqflush_all.
- */
-#define XFS_QMOPT_SYNC		0x1000000
-#define XFS_QMOPT_ASYNC		0x2000000
-#define XFS_QMOPT_DELWRI	0x4000000
-
-/*
  * flags for dqalloc.
  */
-#define XFS_QMOPT_INHERIT	0x8000000
+#define XFS_QMOPT_INHERIT	0x1000000
 
 /*
  * flags to xfs_trans_mod_dquot.
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index b81deea0ce19..fc1cda23b817 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -39,6 +39,7 @@
 #include "xfs_utils.h"
 #include "xfs_trans_space.h"
 #include "xfs_vnodeops.h"
+#include "xfs_trace.h"
 
 
 /*
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 385f6dceba5d..6be05f756d59 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -45,6 +45,7 @@
 #include "xfs_inode_item.h"
 #include "xfs_trans_space.h"
 #include "xfs_utils.h"
+#include "xfs_trace.h"
 
 
 /*
@@ -1516,6 +1517,8 @@ xfs_rtfree_range(
 	 */
 	error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
 		&postblock);
+	if (error)
+		return error;
 	/*
 	 * If there are blocks not being freed at the front of the
 	 * old extent, add summary data for them to be allocated.
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 3f816ad7ff19..e336742a58a4 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -44,48 +44,7 @@
 #include "xfs_error.h"
 #include "xfs_buf_item.h"
 #include "xfs_rw.h"
-
-/*
- * This is a subroutine for xfs_write() and other writers (xfs_ioctl)
- * which clears the setuid and setgid bits when a file is written.
- */
-int
-xfs_write_clear_setuid(
-	xfs_inode_t	*ip)
-{
-	xfs_mount_t	*mp;
-	xfs_trans_t	*tp;
-	int		error;
-
-	mp = ip->i_mount;
-	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
-	if ((error = xfs_trans_reserve(tp, 0,
-				      XFS_WRITEID_LOG_RES(mp),
-				      0, 0, 0))) {
-		xfs_trans_cancel(tp, 0);
-		return error;
-	}
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
-	ip->i_d.di_mode &= ~S_ISUID;
-
-	/*
-	 * Note that we don't have to worry about mandatory
-	 * file locking being disabled here because we only
-	 * clear the S_ISGID bit if the Group execute bit is
-	 * on, but if it was on then mandatory locking wouldn't
-	 * have been enabled.
-	 */
-	if (ip->i_d.di_mode & S_IXGRP) {
-		ip->i_d.di_mode &= ~S_ISGID;
-	}
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return 0;
-}
+#include "xfs_trace.h"
 
 /*
  * Force a shutdown of the filesystem instantly while keeping
@@ -152,90 +111,6 @@ xfs_do_force_shutdown(
 	}
 }
 
-
-/*
- * Called when we want to stop a buffer from getting written or read.
- * We attach the EIO error, muck with its flags, and call biodone
- * so that the proper iodone callbacks get called.
- */
-int
-xfs_bioerror(
-	xfs_buf_t *bp)
-{
-
-#ifdef XFSERRORDEBUG
-	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
-#endif
-
-	/*
-	 * No need to wait until the buffer is unpinned.
-	 * We aren't flushing it.
-	 */
-	xfs_buftrace("XFS IOERROR", bp);
-	XFS_BUF_ERROR(bp, EIO);
-	/*
-	 * We're calling biodone, so delete B_DONE flag. Either way
-	 * we have to call the iodone callback, and calling biodone
-	 * probably is the best way since it takes care of
-	 * GRIO as well.
-	 */
-	XFS_BUF_UNREAD(bp);
-	XFS_BUF_UNDELAYWRITE(bp);
-	XFS_BUF_UNDONE(bp);
-	XFS_BUF_STALE(bp);
-
-	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
-	xfs_biodone(bp);
-
-	return (EIO);
-}
-
-/*
- * Same as xfs_bioerror, except that we are releasing the buffer
- * here ourselves, and avoiding the biodone call.
- * This is meant for userdata errors; metadata bufs come with
- * iodone functions attached, so that we can track down errors.
- */
-int
-xfs_bioerror_relse(
-	xfs_buf_t *bp)
-{
-	int64_t fl;
-
-	ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks);
-	ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone);
-
-	xfs_buftrace("XFS IOERRELSE", bp);
-	fl = XFS_BUF_BFLAGS(bp);
-	/*
-	 * No need to wait until the buffer is unpinned.
-	 * We aren't flushing it.
-	 *
-	 * chunkhold expects B_DONE to be set, whether
-	 * we actually finish the I/O or not. We don't want to
-	 * change that interface.
-	 */
-	XFS_BUF_UNREAD(bp);
-	XFS_BUF_UNDELAYWRITE(bp);
-	XFS_BUF_DONE(bp);
-	XFS_BUF_STALE(bp);
-	XFS_BUF_CLR_IODONE_FUNC(bp);
-	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
-	if (!(fl & XFS_B_ASYNC)) {
-		/*
-		 * Mark b_error and B_ERROR _both_.
-		 * Lot's of chunkcache code assumes that.
-		 * There's no reason to mark error for
-		 * ASYNC buffers.
-		 */
-		XFS_BUF_ERROR(bp, EIO);
-		XFS_BUF_FINISH_IOWAIT(bp);
-	} else {
-		xfs_buf_relse(bp);
-	}
-	return (EIO);
-}
-
 /*
  * Prints out an ALERT message about I/O error.
  */
@@ -277,10 +152,10 @@ xfs_read_buf(
 	xfs_buf_t	 *bp;
 	int		 error;
 
-	if (flags)
-		bp = xfs_buf_read_flags(target, blkno, len, flags);
-	else
-		bp = xfs_buf_read(target, blkno, len, flags);
+	if (!flags)
+		flags = XBF_LOCK | XBF_MAPPED;
+
+	bp = xfs_buf_read(target, blkno, len, flags);
 	if (!bp)
 		return XFS_ERROR(EIO);
 	error = XFS_BUF_GETERROR(bp);
@@ -307,32 +182,23 @@ xfs_read_buf(
 }
 
 /*
- * Wrapper around bwrite() so that we can trap
- * write errors, and act accordingly.
+ * helper function to extract extent size hint from inode
  */
-int
-xfs_bwrite(
-	struct xfs_mount *mp,
-	struct xfs_buf	 *bp)
+xfs_extlen_t
+xfs_get_extsz_hint(
+	struct xfs_inode	*ip)
 {
-	int		error;
+	xfs_extlen_t		extsz;
 
-	/*
-	 * XXXsup how does this work for quotas.
-	 */
-	XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
-	bp->b_mount = mp;
-	XFS_BUF_WRITE(bp);
-
-	if ((error = XFS_bwrite(bp))) {
-		ASSERT(mp);
-		/*
-		 * Cannot put a buftrace here since if the buffer is not
-		 * B_HOLD then we will brelse() the buffer before returning
-		 * from bwrite and we could be tracing a buffer that has
-		 * been reused.
-		 */
-		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+	if (unlikely(XFS_IS_REALTIME_INODE(ip))) {
+		extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
+				? ip->i_d.di_extsize
+				: ip->i_mount->m_sb.sb_rextsize;
+		ASSERT(extsz);
+	} else {
+		extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
+				? ip->i_d.di_extsize : 0;
 	}
-	return (error);
+
+	return extsz;
 }
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index f5e4874c37d8..11c41ec6ed75 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -37,44 +37,13 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
 }
 
 /*
- * Flags for xfs_free_eofblocks
- */
-#define XFS_FREE_EOF_LOCK	(1<<0)
-#define XFS_FREE_EOF_NOLOCK	(1<<1)
-
-
-/*
- * helper function to extract extent size hint from inode
- */
-STATIC_INLINE xfs_extlen_t
-xfs_get_extsz_hint(
-	xfs_inode_t	*ip)
-{
-	xfs_extlen_t	extsz;
-
-	if (unlikely(XFS_IS_REALTIME_INODE(ip))) {
-		extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
-				? ip->i_d.di_extsize
-				: ip->i_mount->m_sb.sb_rextsize;
-		ASSERT(extsz);
-	} else {
-		extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
-				? ip->i_d.di_extsize : 0;
-	}
-	return extsz;
-}
-
-/*
  * Prototypes for functions in xfs_rw.c.
  */
-extern int xfs_write_clear_setuid(struct xfs_inode *ip);
-extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
-extern int xfs_bioerror(struct xfs_buf *bp);
-extern int xfs_bioerror_relse(struct xfs_buf *bp);
 extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
 			xfs_daddr_t blkno, int len, uint flags,
 			struct xfs_buf **bpp);
 extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
 			xfs_buf_t *bp, xfs_daddr_t blkno);
+extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
 
 #endif	/* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 66b849358e62..f73e358bae8d 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -236,19 +236,20 @@ xfs_trans_alloc(
 	uint		type)
 {
 	xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
-	return _xfs_trans_alloc(mp, type);
+	return _xfs_trans_alloc(mp, type, KM_SLEEP);
 }
 
 xfs_trans_t *
 _xfs_trans_alloc(
 	xfs_mount_t	*mp,
-	uint		type)
+	uint		type,
+	uint		memflags)
 {
 	xfs_trans_t	*tp;
 
 	atomic_inc(&mp->m_active_trans);
 
-	tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+	tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
 	tp->t_magic = XFS_TRANS_MAGIC;
 	tp->t_type = type;
 	tp->t_mountp = mp;
@@ -795,7 +796,7 @@ _xfs_trans_commit(
 	int			sync;
 #define	XFS_TRANS_LOGVEC_COUNT	16
 	xfs_log_iovec_t		log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
-	void			*commit_iclog;
+	struct xlog_in_core	*commit_iclog;
 	int			shutdown;
 
 	commit_lsn = -1;
@@ -980,9 +981,8 @@ shut_us_down:
 	 */
 	if (sync) {
 		if (!error) {
-			error = _xfs_log_force(mp, commit_lsn,
-				      XFS_LOG_FORCE | XFS_LOG_SYNC,
-				      log_flushed);
+			error = _xfs_log_force_lsn(mp, commit_lsn,
+				      XFS_LOG_SYNC, log_flushed);
 		}
 		XFS_STATS_INC(xs_trans_sync);
 	} else {
@@ -1120,7 +1120,7 @@ xfs_trans_fill_vecs(
 	tp->t_header.th_num_items = nitems;
 	log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
 	log_vector->i_len = sizeof(xfs_trans_header_t);
-	XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_TRANSHDR);
+	log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
 }
 
 
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index ed47fc77759c..79c8bab9dfff 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -100,6 +100,49 @@ typedef struct xfs_trans_header {
 #define	XFS_TRANS_TYPE_MAX		41
 /* new transaction types need to be reflected in xfs_logprint(8) */
 
+#define XFS_TRANS_TYPES \
+	{ XFS_TRANS_SETATTR_NOT_SIZE,	"SETATTR_NOT_SIZE" }, \
+	{ XFS_TRANS_SETATTR_SIZE,	"SETATTR_SIZE" }, \
+	{ XFS_TRANS_INACTIVE,		"INACTIVE" }, \
+	{ XFS_TRANS_CREATE,		"CREATE" }, \
+	{ XFS_TRANS_CREATE_TRUNC,	"CREATE_TRUNC" }, \
+	{ XFS_TRANS_TRUNCATE_FILE,	"TRUNCATE_FILE" }, \
+	{ XFS_TRANS_REMOVE,		"REMOVE" }, \
+	{ XFS_TRANS_LINK,		"LINK" }, \
+	{ XFS_TRANS_RENAME,		"RENAME" }, \
+	{ XFS_TRANS_MKDIR,		"MKDIR" }, \
+	{ XFS_TRANS_RMDIR,		"RMDIR" }, \
+	{ XFS_TRANS_SYMLINK,		"SYMLINK" }, \
+	{ XFS_TRANS_SET_DMATTRS,	"SET_DMATTRS" }, \
+	{ XFS_TRANS_GROWFS,		"GROWFS" }, \
+	{ XFS_TRANS_STRAT_WRITE,	"STRAT_WRITE" }, \
+	{ XFS_TRANS_DIOSTRAT,		"DIOSTRAT" }, \
+	{ XFS_TRANS_WRITEID,		"WRITEID" }, \
+	{ XFS_TRANS_ADDAFORK,		"ADDAFORK" }, \
+	{ XFS_TRANS_ATTRINVAL,		"ATTRINVAL" }, \
+	{ XFS_TRANS_ATRUNCATE,		"ATRUNCATE" }, \
+	{ XFS_TRANS_ATTR_SET,		"ATTR_SET" }, \
+	{ XFS_TRANS_ATTR_RM,		"ATTR_RM" }, \
+	{ XFS_TRANS_ATTR_FLAG,		"ATTR_FLAG" }, \
+	{ XFS_TRANS_CLEAR_AGI_BUCKET,	"CLEAR_AGI_BUCKET" }, \
+	{ XFS_TRANS_QM_SBCHANGE,	"QM_SBCHANGE" }, \
+	{ XFS_TRANS_QM_QUOTAOFF,	"QM_QUOTAOFF" }, \
+	{ XFS_TRANS_QM_DQALLOC,		"QM_DQALLOC" }, \
+	{ XFS_TRANS_QM_SETQLIM,		"QM_SETQLIM" }, \
+	{ XFS_TRANS_QM_DQCLUSTER,	"QM_DQCLUSTER" }, \
+	{ XFS_TRANS_QM_QINOCREATE,	"QM_QINOCREATE" }, \
+	{ XFS_TRANS_QM_QUOTAOFF_END,	"QM_QOFF_END" }, \
+	{ XFS_TRANS_SB_UNIT,		"SB_UNIT" }, \
+	{ XFS_TRANS_FSYNC_TS,		"FSYNC_TS" }, \
+	{ XFS_TRANS_GROWFSRT_ALLOC,	"GROWFSRT_ALLOC" }, \
+	{ XFS_TRANS_GROWFSRT_ZERO,	"GROWFSRT_ZERO" }, \
+	{ XFS_TRANS_GROWFSRT_FREE,	"GROWFSRT_FREE" }, \
+	{ XFS_TRANS_SWAPEXT,		"SWAPEXT" }, \
+	{ XFS_TRANS_SB_COUNT,		"SB_COUNT" }, \
+	{ XFS_TRANS_DUMMY1,		"DUMMY1" }, \
+	{ XFS_TRANS_DUMMY2,		"DUMMY2" }, \
+	{ XLOG_UNMOUNT_REC_TYPE,	"UNMOUNT" }
+
 /*
  * This structure is used to track log items associated with
  * a transaction.  It points to the log item and keeps some
@@ -782,6 +825,10 @@ typedef struct xfs_log_item {
 #define	XFS_LI_IN_AIL	0x1
 #define XFS_LI_ABORTED	0x2
 
+#define XFS_LI_FLAGS \
+	{ XFS_LI_IN_AIL,	"IN_AIL" }, \
+	{ XFS_LI_ABORTED,	"ABORTED" }
+
 typedef struct xfs_item_ops {
 	uint (*iop_size)(xfs_log_item_t *);
 	void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
@@ -814,8 +861,7 @@ typedef struct xfs_item_ops {
 #define XFS_ITEM_SUCCESS	0
 #define XFS_ITEM_PINNED		1
 #define XFS_ITEM_LOCKED		2
-#define XFS_ITEM_FLUSHING	3
-#define XFS_ITEM_PUSHBUF	4
+#define XFS_ITEM_PUSHBUF	3
 
 /*
  * This structure is used to maintain a list of block ranges that have been
@@ -864,7 +910,7 @@ typedef struct xfs_trans {
 	unsigned int		t_blk_res_used;	/* # of resvd blocks used */
 	unsigned int		t_rtx_res;	/* # of rt extents resvd */
 	unsigned int		t_rtx_res_used;	/* # of resvd rt extents used */
-	xfs_log_ticket_t	t_ticket;	/* log mgr ticket */
+	struct xlog_ticket	*t_ticket;	/* log mgr ticket */
 	xfs_lsn_t		t_lsn;		/* log seq num of start of
 						 * transaction. */
 	xfs_lsn_t		t_commit_lsn;	/* log seq num of end of
@@ -924,7 +970,7 @@ typedef struct xfs_trans {
  * XFS transaction mechanism exported interfaces.
  */
 xfs_trans_t	*xfs_trans_alloc(struct xfs_mount *, uint);
-xfs_trans_t	*_xfs_trans_alloc(struct xfs_mount *, uint);
+xfs_trans_t	*_xfs_trans_alloc(struct xfs_mount *, uint, uint);
 xfs_trans_t	*xfs_trans_dup(xfs_trans_t *);
 int		xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
 				  uint, uint);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 2ffc570679be..e799824f7245 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -237,14 +237,15 @@ out:
237} 237}
238 238
239/* 239/*
240 * Function that does the work of pushing on the AIL 240 * xfsaild_push does the work of pushing on the AIL. Returning a timeout of
241 * zero indicates that the caller should sleep until woken.
241 */ 242 */
242long 243long
243xfsaild_push( 244xfsaild_push(
244 struct xfs_ail *ailp, 245 struct xfs_ail *ailp,
245 xfs_lsn_t *last_lsn) 246 xfs_lsn_t *last_lsn)
246{ 247{
247 long tout = 1000; /* milliseconds */ 248 long tout = 0;
248 xfs_lsn_t last_pushed_lsn = *last_lsn; 249 xfs_lsn_t last_pushed_lsn = *last_lsn;
249 xfs_lsn_t target = ailp->xa_target; 250 xfs_lsn_t target = ailp->xa_target;
250 xfs_lsn_t lsn; 251 xfs_lsn_t lsn;
@@ -252,6 +253,7 @@ xfsaild_push(
252 int flush_log, count, stuck; 253 int flush_log, count, stuck;
253 xfs_mount_t *mp = ailp->xa_mount; 254 xfs_mount_t *mp = ailp->xa_mount;
254 struct xfs_ail_cursor *cur = &ailp->xa_cursors; 255 struct xfs_ail_cursor *cur = &ailp->xa_cursors;
256 int push_xfsbufd = 0;
255 257
256 spin_lock(&ailp->xa_lock); 258 spin_lock(&ailp->xa_lock);
257 xfs_trans_ail_cursor_init(ailp, cur); 259 xfs_trans_ail_cursor_init(ailp, cur);
@@ -262,7 +264,7 @@ xfsaild_push(
262 */ 264 */
263 xfs_trans_ail_cursor_done(ailp, cur); 265 xfs_trans_ail_cursor_done(ailp, cur);
264 spin_unlock(&ailp->xa_lock); 266 spin_unlock(&ailp->xa_lock);
265 last_pushed_lsn = 0; 267 *last_lsn = 0;
266 return tout; 268 return tout;
267 } 269 }
268 270
@@ -279,7 +281,6 @@ xfsaild_push(
279 * prevents us from spinning when we can't do anything or there is 281 * prevents us from spinning when we can't do anything or there is
280 * lots of contention on the AIL lists. 282 * lots of contention on the AIL lists.
281 */ 283 */
282 tout = 10;
283 lsn = lip->li_lsn; 284 lsn = lip->li_lsn;
284 flush_log = stuck = count = 0; 285 flush_log = stuck = count = 0;
285 while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) { 286 while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) {
@@ -308,6 +309,7 @@ xfsaild_push(
308 XFS_STATS_INC(xs_push_ail_pushbuf); 309 XFS_STATS_INC(xs_push_ail_pushbuf);
309 IOP_PUSHBUF(lip); 310 IOP_PUSHBUF(lip);
310 last_pushed_lsn = lsn; 311 last_pushed_lsn = lsn;
312 push_xfsbufd = 1;
311 break; 313 break;
312 314
313 case XFS_ITEM_PINNED: 315 case XFS_ITEM_PINNED:
@@ -322,12 +324,6 @@ xfsaild_push(
322 stuck++; 324 stuck++;
323 break; 325 break;
324 326
325 case XFS_ITEM_FLUSHING:
326 XFS_STATS_INC(xs_push_ail_flushing);
327 last_pushed_lsn = lsn;
328 stuck++;
329 break;
330
331 default: 327 default:
332 ASSERT(0); 328 ASSERT(0);
333 break; 329 break;
@@ -371,19 +367,24 @@ xfsaild_push(
371 * move forward in the AIL. 367 * move forward in the AIL.
372 */ 368 */
373 XFS_STATS_INC(xs_push_ail_flush); 369 XFS_STATS_INC(xs_push_ail_flush);
374 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 370 xfs_log_force(mp, 0);
371 }
372
373 if (push_xfsbufd) {
374 /* we've got delayed write buffers to flush */
375 wake_up_process(mp->m_ddev_targp->bt_task);
375 } 376 }
376 377
377 if (!count) { 378 if (!count) {
378 /* We're past our target or empty, so idle */ 379 /* We're past our target or empty, so idle */
379 tout = 1000; 380 last_pushed_lsn = 0;
380 } else if (XFS_LSN_CMP(lsn, target) >= 0) { 381 } else if (XFS_LSN_CMP(lsn, target) >= 0) {
381 /* 382 /*
382 * We reached the target so wait a bit longer for I/O to 383 * We reached the target so wait a bit longer for I/O to
383 * complete and remove pushed items from the AIL before we 384 * complete and remove pushed items from the AIL before we
384 * start the next scan from the start of the AIL. 385 * start the next scan from the start of the AIL.
385 */ 386 */
386 tout += 20; 387 tout = 50;
387 last_pushed_lsn = 0; 388 last_pushed_lsn = 0;
388 } else if ((stuck * 100) / count > 90) { 389 } else if ((stuck * 100) / count > 90) {
389 /* 390 /*
@@ -395,11 +396,14 @@ xfsaild_push(
395 * Backoff a bit more to allow some I/O to complete before 396 * Backoff a bit more to allow some I/O to complete before
396 * continuing from where we were. 397 * continuing from where we were.
397 */ 398 */
398 tout += 10; 399 tout = 20;
400 } else {
401 /* more to do, but wait a short while before continuing */
402 tout = 10;
399 } 403 }
400 *last_lsn = last_pushed_lsn; 404 *last_lsn = last_pushed_lsn;
401 return tout; 405 return tout;
402} /* xfsaild_push */ 406}
403 407
404 408
405/* 409/*
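
Taken together, the xfsaild_push() changes above replace the fixed one-second wakeup with a small timeout ladder: 0 to sleep until woken, 50ms after reaching the target, 20ms when more than 90% of the scanned items were stuck, and 10ms otherwise. A standalone sketch of that ladder, with pick_timeout() as a hypothetical helper name:

#include <stdio.h>

/*
 * Sketch of the timeout selection xfsaild_push() ends up with after
 * this patch. pick_timeout() is an illustrative name only.
 */
static long pick_timeout(int count, int reached_target, int stuck)
{
	if (!count)
		return 0;	/* AIL empty or past target: idle until woken */
	if (reached_target)
		return 50;	/* wait for I/O to complete, then rescan */
	if ((stuck * 100) / count > 90)
		return 20;	/* mostly stuck: back off a little longer */
	return 10;		/* more to do: short pause, then continue */
}

int main(void)
{
	printf("%ld %ld %ld %ld\n",
	       pick_timeout(0, 0, 0),	/* 0  */
	       pick_timeout(8, 1, 0),	/* 50 */
	       pick_timeout(10, 0, 10),	/* 20 */
	       pick_timeout(10, 0, 1));	/* 10 */
	return 0;
}
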
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 218829e6a152..fb586360d1c9 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -38,6 +38,7 @@
38#include "xfs_trans_priv.h" 38#include "xfs_trans_priv.h"
39#include "xfs_error.h" 39#include "xfs_error.h"
40#include "xfs_rw.h" 40#include "xfs_rw.h"
41#include "xfs_trace.h"
41 42
42 43
43STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *, 44STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *,
@@ -45,6 +46,65 @@ STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *,
45STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *, 46STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *,
46 xfs_daddr_t, int); 47 xfs_daddr_t, int);
47 48
49/*
50 * Add the locked buffer to the transaction.
51 *
52 * The buffer must be locked, and it cannot be associated with any
53 * transaction.
54 *
55 * If the buffer does not yet have a buf log item associated with it,
56 * then allocate one for it. Then add the buf item to the transaction.
57 */
58STATIC void
59_xfs_trans_bjoin(
60 struct xfs_trans *tp,
61 struct xfs_buf *bp,
62 int reset_recur)
63{
64 struct xfs_buf_log_item *bip;
65
66 ASSERT(XFS_BUF_ISBUSY(bp));
67 ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
68
69 /*
70 * The xfs_buf_log_item pointer is stored in b_fsprivate. If
71 * it doesn't have one yet, then allocate one and initialize it.
72 * The checks to see if one is there are in xfs_buf_item_init().
73 */
74 xfs_buf_item_init(bp, tp->t_mountp);
75 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
76 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
77 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
78 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
79 if (reset_recur)
80 bip->bli_recur = 0;
81
82 /*
83 * Take a reference for this transaction on the buf item.
84 */
85 atomic_inc(&bip->bli_refcount);
86
87 /*
88 * Get a log_item_desc to point at the new item.
89 */
90 (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
91
92 /*
93 * Initialize b_fsprivate2 so we can find it with incore_match()
94 * in xfs_trans_get_buf() and friends above.
95 */
96 XFS_BUF_SET_FSPRIVATE2(bp, tp);
97
98}
99
100void
101xfs_trans_bjoin(
102 struct xfs_trans *tp,
103 struct xfs_buf *bp)
104{
105 _xfs_trans_bjoin(tp, bp, 0);
106 trace_xfs_trans_bjoin(bp->b_fspriv);
107}
48 108
49/* 109/*
50 * Get and lock the buffer for the caller if it is not already 110 * Get and lock the buffer for the caller if it is not already
@@ -74,16 +134,14 @@ xfs_trans_get_buf(xfs_trans_t *tp,
74 xfs_buf_log_item_t *bip; 134 xfs_buf_log_item_t *bip;
75 135
76 if (flags == 0) 136 if (flags == 0)
77 flags = XFS_BUF_LOCK | XFS_BUF_MAPPED; 137 flags = XBF_LOCK | XBF_MAPPED;
78 138
79 /* 139 /*
80 * Default to a normal get_buf() call if the tp is NULL. 140 * Default to a normal get_buf() call if the tp is NULL.
81 */ 141 */
82 if (tp == NULL) { 142 if (tp == NULL)
83 bp = xfs_buf_get_flags(target_dev, blkno, len, 143 return xfs_buf_get(target_dev, blkno, len,
84 flags | BUF_BUSY); 144 flags | XBF_DONT_BLOCK);
85 return(bp);
86 }
87 145
88 /* 146 /*
89 * If we find the buffer in the cache with this transaction 147 * If we find the buffer in the cache with this transaction
@@ -98,79 +156,43 @@ xfs_trans_get_buf(xfs_trans_t *tp,
98 } 156 }
99 if (bp != NULL) { 157 if (bp != NULL) {
100 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 158 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
101 if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) { 159 if (XFS_FORCED_SHUTDOWN(tp->t_mountp))
102 xfs_buftrace("TRANS GET RECUR SHUT", bp);
103 XFS_BUF_SUPER_STALE(bp); 160 XFS_BUF_SUPER_STALE(bp);
104 } 161
105 /* 162 /*
106 * If the buffer is stale then it was binval'ed 163 * If the buffer is stale then it was binval'ed
107 * since last read. This doesn't matter since the 164 * since last read. This doesn't matter since the
108 * caller isn't allowed to use the data anyway. 165 * caller isn't allowed to use the data anyway.
109 */ 166 */
110 else if (XFS_BUF_ISSTALE(bp)) { 167 else if (XFS_BUF_ISSTALE(bp))
111 xfs_buftrace("TRANS GET RECUR STALE", bp);
112 ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); 168 ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
113 } 169
114 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); 170 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
115 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 171 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
116 ASSERT(bip != NULL); 172 ASSERT(bip != NULL);
117 ASSERT(atomic_read(&bip->bli_refcount) > 0); 173 ASSERT(atomic_read(&bip->bli_refcount) > 0);
118 bip->bli_recur++; 174 bip->bli_recur++;
119 xfs_buftrace("TRANS GET RECUR", bp); 175 trace_xfs_trans_get_buf_recur(bip);
120 xfs_buf_item_trace("GET RECUR", bip);
121 return (bp); 176 return (bp);
122 } 177 }
123 178
124 /* 179 /*
125 * We always specify the BUF_BUSY flag within a transaction so 180 * We always specify the XBF_DONT_BLOCK flag within a transaction
126 * that get_buf does not try to push out a delayed write buffer 181 * so that get_buf does not try to push out a delayed write buffer
127 * which might cause another transaction to take place (if the 182 * which might cause another transaction to take place (if the
128 * buffer was delayed alloc). Such recursive transactions can 183 * buffer was delayed alloc). Such recursive transactions can
129 * easily deadlock with our current transaction as well as cause 184 * easily deadlock with our current transaction as well as cause
130 * us to run out of stack space. 185 * us to run out of stack space.
131 */ 186 */
132 bp = xfs_buf_get_flags(target_dev, blkno, len, flags | BUF_BUSY); 187 bp = xfs_buf_get(target_dev, blkno, len, flags | XBF_DONT_BLOCK);
133 if (bp == NULL) { 188 if (bp == NULL) {
134 return NULL; 189 return NULL;
135 } 190 }
136 191
137 ASSERT(!XFS_BUF_GETERROR(bp)); 192 ASSERT(!XFS_BUF_GETERROR(bp));
138 193
139 /* 194 _xfs_trans_bjoin(tp, bp, 1);
140 * The xfs_buf_log_item pointer is stored in b_fsprivate. If 195 trace_xfs_trans_get_buf(bp->b_fspriv);
141 * it doesn't have one yet, then allocate one and initialize it.
142 * The checks to see if one is there are in xfs_buf_item_init().
143 */
144 xfs_buf_item_init(bp, tp->t_mountp);
145
146 /*
147 * Set the recursion count for the buffer within this transaction
148 * to 0.
149 */
150 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
151 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
152 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
153 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
154 bip->bli_recur = 0;
155
156 /*
157 * Take a reference for this transaction on the buf item.
158 */
159 atomic_inc(&bip->bli_refcount);
160
161 /*
162 * Get a log_item_desc to point at the new item.
163 */
164 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
165
166 /*
167 * Initialize b_fsprivate2 so we can find it with incore_match()
168 * above.
169 */
170 XFS_BUF_SET_FSPRIVATE2(bp, tp);
171
172 xfs_buftrace("TRANS GET", bp);
173 xfs_buf_item_trace("GET", bip);
174 return (bp); 196 return (bp);
175} 197}
176 198
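
Two flag rules are visible in the rewritten xfs_trans_get_buf() above: a zero flags argument defaults to XBF_LOCK | XBF_MAPPED, and XBF_DONT_BLOCK is always OR'ed in under a transaction so get_buf never pushes out a delayed-write buffer (which could start a recursive transaction and deadlock). A hedged sketch of just that flag handling; the flag values and trans_buf_flags() are stand-ins, not the kernel definitions:

#include <stdio.h>

#define XBF_LOCK	(1 << 0)
#define XBF_MAPPED	(1 << 1)
#define XBF_DONT_BLOCK	(1 << 2)

/* Mirrors the defaulting and OR-in seen in the hunk above. */
static unsigned int trans_buf_flags(unsigned int flags, int in_transaction)
{
	if (flags == 0)
		flags = XBF_LOCK | XBF_MAPPED;
	if (in_transaction)
		flags |= XBF_DONT_BLOCK;
	return flags;
}

int main(void)
{
	printf("0x%x\n", trans_buf_flags(0, 1));	/* 0x7 */
	printf("0x%x\n", trans_buf_flags(XBF_LOCK, 0));	/* 0x1 */
	return 0;
}
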
@@ -210,49 +232,16 @@ xfs_trans_getsb(xfs_trans_t *tp,
210 ASSERT(bip != NULL); 232 ASSERT(bip != NULL);
211 ASSERT(atomic_read(&bip->bli_refcount) > 0); 233 ASSERT(atomic_read(&bip->bli_refcount) > 0);
212 bip->bli_recur++; 234 bip->bli_recur++;
213 xfs_buf_item_trace("GETSB RECUR", bip); 235 trace_xfs_trans_getsb_recur(bip);
214 return (bp); 236 return (bp);
215 } 237 }
216 238
217 bp = xfs_getsb(mp, flags); 239 bp = xfs_getsb(mp, flags);
218 if (bp == NULL) { 240 if (bp == NULL)
219 return NULL; 241 return NULL;
220 }
221
222 /*
223 * The xfs_buf_log_item pointer is stored in b_fsprivate. If
224 * it doesn't have one yet, then allocate one and initialize it.
225 * The checks to see if one is there are in xfs_buf_item_init().
226 */
227 xfs_buf_item_init(bp, mp);
228
229 /*
230 * Set the recursion count for the buffer within this transaction
231 * to 0.
232 */
233 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
234 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
235 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
236 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
237 bip->bli_recur = 0;
238 242
239 /* 243 _xfs_trans_bjoin(tp, bp, 1);
240 * Take a reference for this transaction on the buf item. 244 trace_xfs_trans_getsb(bp->b_fspriv);
241 */
242 atomic_inc(&bip->bli_refcount);
243
244 /*
245 * Get a log_item_desc to point at the new item.
246 */
247 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
248
249 /*
250 * Initialize b_fsprivate2 so we can find it with incore_match()
251 * above.
252 */
253 XFS_BUF_SET_FSPRIVATE2(bp, tp);
254
255 xfs_buf_item_trace("GETSB", bip);
256 return (bp); 245 return (bp);
257} 246}
258 247
@@ -296,15 +285,15 @@ xfs_trans_read_buf(
296 int error; 285 int error;
297 286
298 if (flags == 0) 287 if (flags == 0)
299 flags = XFS_BUF_LOCK | XFS_BUF_MAPPED; 288 flags = XBF_LOCK | XBF_MAPPED;
300 289
301 /* 290 /*
302 * Default to a normal get_buf() call if the tp is NULL. 291 * Default to a normal get_buf() call if the tp is NULL.
303 */ 292 */
304 if (tp == NULL) { 293 if (tp == NULL) {
305 bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY); 294 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
306 if (!bp) 295 if (!bp)
307 return (flags & XFS_BUF_TRYLOCK) ? 296 return (flags & XBF_TRYLOCK) ?
308 EAGAIN : XFS_ERROR(ENOMEM); 297 EAGAIN : XFS_ERROR(ENOMEM);
309 298
310 if (XFS_BUF_GETERROR(bp) != 0) { 299 if (XFS_BUF_GETERROR(bp) != 0) {
@@ -350,7 +339,7 @@ xfs_trans_read_buf(
350 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 339 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
351 ASSERT((XFS_BUF_ISERROR(bp)) == 0); 340 ASSERT((XFS_BUF_ISERROR(bp)) == 0);
352 if (!(XFS_BUF_ISDONE(bp))) { 341 if (!(XFS_BUF_ISDONE(bp))) {
353 xfs_buftrace("READ_BUF_INCORE !DONE", bp); 342 trace_xfs_trans_read_buf_io(bp, _RET_IP_);
354 ASSERT(!XFS_BUF_ISASYNC(bp)); 343 ASSERT(!XFS_BUF_ISASYNC(bp));
355 XFS_BUF_READ(bp); 344 XFS_BUF_READ(bp);
356 xfsbdstrat(tp->t_mountp, bp); 345 xfsbdstrat(tp->t_mountp, bp);
@@ -375,7 +364,7 @@ xfs_trans_read_buf(
375 * brelse it either. Just get out. 364 * brelse it either. Just get out.
376 */ 365 */
377 if (XFS_FORCED_SHUTDOWN(mp)) { 366 if (XFS_FORCED_SHUTDOWN(mp)) {
378 xfs_buftrace("READ_BUF_INCORE XFSSHUTDN", bp); 367 trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
379 *bpp = NULL; 368 *bpp = NULL;
380 return XFS_ERROR(EIO); 369 return XFS_ERROR(EIO);
381 } 370 }
@@ -385,27 +374,26 @@ xfs_trans_read_buf(
385 bip->bli_recur++; 374 bip->bli_recur++;
386 375
387 ASSERT(atomic_read(&bip->bli_refcount) > 0); 376 ASSERT(atomic_read(&bip->bli_refcount) > 0);
388 xfs_buf_item_trace("READ RECUR", bip); 377 trace_xfs_trans_read_buf_recur(bip);
389 *bpp = bp; 378 *bpp = bp;
390 return 0; 379 return 0;
391 } 380 }
392 381
393 /* 382 /*
394 * We always specify the BUF_BUSY flag within a transaction so 383 * We always specify the XBF_DONT_BLOCK flag within a transaction
395 * that get_buf does not try to push out a delayed write buffer 384 * so that get_buf does not try to push out a delayed write buffer
396 * which might cause another transaction to take place (if the 385 * which might cause another transaction to take place (if the
397 * buffer was delayed alloc). Such recursive transactions can 386 * buffer was delayed alloc). Such recursive transactions can
398 * easily deadlock with our current transaction as well as cause 387 * easily deadlock with our current transaction as well as cause
399 * us to run out of stack space. 388 * us to run out of stack space.
400 */ 389 */
401 bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY); 390 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
402 if (bp == NULL) { 391 if (bp == NULL) {
403 *bpp = NULL; 392 *bpp = NULL;
404 return 0; 393 return 0;
405 } 394 }
406 if (XFS_BUF_GETERROR(bp) != 0) { 395 if (XFS_BUF_GETERROR(bp) != 0) {
407 XFS_BUF_SUPER_STALE(bp); 396 XFS_BUF_SUPER_STALE(bp);
408 xfs_buftrace("READ ERROR", bp);
409 error = XFS_BUF_GETERROR(bp); 397 error = XFS_BUF_GETERROR(bp);
410 398
411 xfs_ioerror_alert("xfs_trans_read_buf", mp, 399 xfs_ioerror_alert("xfs_trans_read_buf", mp,
@@ -431,41 +419,9 @@ xfs_trans_read_buf(
431 if (XFS_FORCED_SHUTDOWN(mp)) 419 if (XFS_FORCED_SHUTDOWN(mp))
432 goto shutdown_abort; 420 goto shutdown_abort;
433 421
434 /* 422 _xfs_trans_bjoin(tp, bp, 1);
435 * The xfs_buf_log_item pointer is stored in b_fsprivate. If 423 trace_xfs_trans_read_buf(bp->b_fspriv);
436 * it doesn't have one yet, then allocate one and initialize it.
437 * The checks to see if one is there are in xfs_buf_item_init().
438 */
439 xfs_buf_item_init(bp, tp->t_mountp);
440
441 /*
442 * Set the recursion count for the buffer within this transaction
443 * to 0.
444 */
445 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
446 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
447 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
448 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
449 bip->bli_recur = 0;
450
451 /*
452 * Take a reference for this transaction on the buf item.
453 */
454 atomic_inc(&bip->bli_refcount);
455
456 /*
457 * Get a log_item_desc to point at the new item.
458 */
459 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
460 424
461 /*
462 * Initialize b_fsprivate2 so we can find it with incore_match()
463 * above.
464 */
465 XFS_BUF_SET_FSPRIVATE2(bp, tp);
466
467 xfs_buftrace("TRANS READ", bp);
468 xfs_buf_item_trace("READ", bip);
469 *bpp = bp; 425 *bpp = bp;
470 return 0; 426 return 0;
471 427
@@ -480,10 +436,10 @@ shutdown_abort:
480 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) 436 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
481 cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp); 437 cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp);
482#endif 438#endif
483 ASSERT((XFS_BUF_BFLAGS(bp) & (XFS_B_STALE|XFS_B_DELWRI)) != 439 ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) !=
484 (XFS_B_STALE|XFS_B_DELWRI)); 440 (XBF_STALE|XBF_DELWRI));
485 441
486 xfs_buftrace("READ_BUF XFSSHUTDN", bp); 442 trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
487 xfs_buf_relse(bp); 443 xfs_buf_relse(bp);
488 *bpp = NULL; 444 *bpp = NULL;
489 return XFS_ERROR(EIO); 445 return XFS_ERROR(EIO);
@@ -549,13 +505,14 @@ xfs_trans_brelse(xfs_trans_t *tp,
549 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); 505 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
550 ASSERT(lidp != NULL); 506 ASSERT(lidp != NULL);
551 507
508 trace_xfs_trans_brelse(bip);
509
552 /* 510 /*
553 * If the release is just for a recursive lock, 511 * If the release is just for a recursive lock,
554 * then decrement the count and return. 512 * then decrement the count and return.
555 */ 513 */
556 if (bip->bli_recur > 0) { 514 if (bip->bli_recur > 0) {
557 bip->bli_recur--; 515 bip->bli_recur--;
558 xfs_buf_item_trace("RELSE RECUR", bip);
559 return; 516 return;
560 } 517 }
561 518
@@ -563,10 +520,8 @@ xfs_trans_brelse(xfs_trans_t *tp,
563 * If the buffer is dirty within this transaction, we can't 520 * If the buffer is dirty within this transaction, we can't
564 * release it until we commit. 521 * release it until we commit.
565 */ 522 */
566 if (lidp->lid_flags & XFS_LID_DIRTY) { 523 if (lidp->lid_flags & XFS_LID_DIRTY)
567 xfs_buf_item_trace("RELSE DIRTY", bip);
568 return; 524 return;
569 }
570 525
571 /* 526 /*
572 * If the buffer has been invalidated, then we can't release 527 * If the buffer has been invalidated, then we can't release
@@ -574,13 +529,10 @@ xfs_trans_brelse(xfs_trans_t *tp,
574 * as part of this transaction. This prevents us from pulling 529 * as part of this transaction. This prevents us from pulling
575 * the item from the AIL before we should. 530 * the item from the AIL before we should.
576 */ 531 */
577 if (bip->bli_flags & XFS_BLI_STALE) { 532 if (bip->bli_flags & XFS_BLI_STALE)
578 xfs_buf_item_trace("RELSE STALE", bip);
579 return; 533 return;
580 }
581 534
582 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); 535 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
583 xfs_buf_item_trace("RELSE", bip);
584 536
585 /* 537 /*
586 * Free up the log item descriptor tracking the released item. 538 * Free up the log item descriptor tracking the released item.
@@ -634,53 +586,6 @@ xfs_trans_brelse(xfs_trans_t *tp,
634} 586}
635 587
636/* 588/*
637 * Add the locked buffer to the transaction.
638 * The buffer must be locked, and it cannot be associated with any
639 * transaction.
640 *
641 * If the buffer does not yet have a buf log item associated with it,
642 * then allocate one for it. Then add the buf item to the transaction.
643 */
644void
645xfs_trans_bjoin(xfs_trans_t *tp,
646 xfs_buf_t *bp)
647{
648 xfs_buf_log_item_t *bip;
649
650 ASSERT(XFS_BUF_ISBUSY(bp));
651 ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
652
653 /*
654 * The xfs_buf_log_item pointer is stored in b_fsprivate. If
655 * it doesn't have one yet, then allocate one and initialize it.
656 * The checks to see if one is there are in xfs_buf_item_init().
657 */
658 xfs_buf_item_init(bp, tp->t_mountp);
659 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
660 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
661 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
662 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
663
664 /*
665 * Take a reference for this transaction on the buf item.
666 */
667 atomic_inc(&bip->bli_refcount);
668
669 /*
670 * Get a log_item_desc to point at the new item.
671 */
672 (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
673
674 /*
675 * Initialize b_fsprivate2 so we can find it with incore_match()
676 * in xfs_trans_get_buf() and friends above.
677 */
678 XFS_BUF_SET_FSPRIVATE2(bp, tp);
679
680 xfs_buf_item_trace("BJOIN", bip);
681}
682
683/*
684 * Mark the buffer as not needing to be unlocked when the buf item's 589 * Mark the buffer as not needing to be unlocked when the buf item's
685 * IOP_UNLOCK() routine is called. The buffer must already be locked 590 * IOP_UNLOCK() routine is called. The buffer must already be locked
686 * and associated with the given transaction. 591 * and associated with the given transaction.
@@ -701,7 +606,7 @@ xfs_trans_bhold(xfs_trans_t *tp,
701 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 606 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
702 ASSERT(atomic_read(&bip->bli_refcount) > 0); 607 ASSERT(atomic_read(&bip->bli_refcount) > 0);
703 bip->bli_flags |= XFS_BLI_HOLD; 608 bip->bli_flags |= XFS_BLI_HOLD;
704 xfs_buf_item_trace("BHOLD", bip); 609 trace_xfs_trans_bhold(bip);
705} 610}
706 611
707/* 612/*
@@ -724,7 +629,8 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
724 ASSERT(atomic_read(&bip->bli_refcount) > 0); 629 ASSERT(atomic_read(&bip->bli_refcount) > 0);
725 ASSERT(bip->bli_flags & XFS_BLI_HOLD); 630 ASSERT(bip->bli_flags & XFS_BLI_HOLD);
726 bip->bli_flags &= ~XFS_BLI_HOLD; 631 bip->bli_flags &= ~XFS_BLI_HOLD;
727 xfs_buf_item_trace("BHOLD RELEASE", bip); 632
633 trace_xfs_trans_bhold_release(bip);
728} 634}
729 635
730/* 636/*
@@ -770,6 +676,8 @@ xfs_trans_log_buf(xfs_trans_t *tp,
770 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); 676 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
771 bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))xfs_buf_iodone; 677 bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))xfs_buf_iodone;
772 678
679 trace_xfs_trans_log_buf(bip);
680
773 /* 681 /*
774 * If we invalidated the buffer within this transaction, then 682 * If we invalidated the buffer within this transaction, then
775 * cancel the invalidation now that we're dirtying the buffer 683 * cancel the invalidation now that we're dirtying the buffer
@@ -777,7 +685,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
777 * because we have a reference to the buffer this entire time. 685 * because we have a reference to the buffer this entire time.
778 */ 686 */
779 if (bip->bli_flags & XFS_BLI_STALE) { 687 if (bip->bli_flags & XFS_BLI_STALE) {
780 xfs_buf_item_trace("BLOG UNSTALE", bip);
781 bip->bli_flags &= ~XFS_BLI_STALE; 688 bip->bli_flags &= ~XFS_BLI_STALE;
782 ASSERT(XFS_BUF_ISSTALE(bp)); 689 ASSERT(XFS_BUF_ISSTALE(bp));
783 XFS_BUF_UNSTALE(bp); 690 XFS_BUF_UNSTALE(bp);
@@ -792,7 +699,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
792 lidp->lid_flags &= ~XFS_LID_BUF_STALE; 699 lidp->lid_flags &= ~XFS_LID_BUF_STALE;
793 bip->bli_flags |= XFS_BLI_LOGGED; 700 bip->bli_flags |= XFS_BLI_LOGGED;
794 xfs_buf_item_log(bip, first, last); 701 xfs_buf_item_log(bip, first, last);
795 xfs_buf_item_trace("BLOG", bip);
796} 702}
797 703
798 704
@@ -831,6 +737,8 @@ xfs_trans_binval(
831 ASSERT(lidp != NULL); 737 ASSERT(lidp != NULL);
832 ASSERT(atomic_read(&bip->bli_refcount) > 0); 738 ASSERT(atomic_read(&bip->bli_refcount) > 0);
833 739
740 trace_xfs_trans_binval(bip);
741
834 if (bip->bli_flags & XFS_BLI_STALE) { 742 if (bip->bli_flags & XFS_BLI_STALE) {
835 /* 743 /*
836 * If the buffer is already invalidated, then 744 * If the buffer is already invalidated, then
@@ -843,8 +751,6 @@ xfs_trans_binval(
843 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 751 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
844 ASSERT(lidp->lid_flags & XFS_LID_DIRTY); 752 ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
845 ASSERT(tp->t_flags & XFS_TRANS_DIRTY); 753 ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
846 xfs_buftrace("XFS_BINVAL RECUR", bp);
847 xfs_buf_item_trace("BINVAL RECUR", bip);
848 return; 754 return;
849 } 755 }
850 756
@@ -878,8 +784,6 @@ xfs_trans_binval(
878 (bip->bli_format.blf_map_size * sizeof(uint))); 784 (bip->bli_format.blf_map_size * sizeof(uint)));
879 lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE; 785 lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE;
880 tp->t_flags |= XFS_TRANS_DIRTY; 786 tp->t_flags |= XFS_TRANS_DIRTY;
881 xfs_buftrace("XFS_BINVAL", bp);
882 xfs_buf_item_trace("BINVAL", bip);
883} 787}
884 788
885/* 789/*
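
The overall shape of the xfs_trans_buf.c refactoring: the join logic that was open-coded in xfs_trans_get_buf(), xfs_trans_getsb() and xfs_trans_read_buf() collapses into _xfs_trans_bjoin(), and each caller keeps only its own tracepoint. A simplified standalone sketch of that pattern; struct buf and the trace strings are stand-ins for the kernel types:

#include <stdio.h>

struct buf { int recur; int joined; };

/* Common join path; reset_recur distinguishes fresh gets from rejoins. */
static void join_common(struct buf *bp, int reset_recur)
{
	if (reset_recur)
		bp->recur = 0;	/* fresh join: reset recursion count */
	bp->joined = 1;		/* take the transaction reference */
}

static void trans_bjoin(struct buf *bp)
{
	join_common(bp, 0);	/* external join keeps the recursion count */
	puts("trace: bjoin");
}

static struct buf *trans_get_buf(struct buf *bp)
{
	join_common(bp, 1);	/* get paths reset the recursion count */
	puts("trace: get_buf");
	return bp;
}

int main(void)
{
	struct buf b = { .recur = 3 };

	trans_bjoin(&b);	/* recur stays 3 */
	trans_get_buf(&b);	/* recur reset to 0 */
	printf("recur=%d joined=%d\n", b.recur, b.joined);
	return 0;
}
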
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index d725428c9df6..b09904555d07 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -151,8 +151,8 @@ typedef enum {
151} xfs_btnum_t; 151} xfs_btnum_t;
152 152
153struct xfs_name { 153struct xfs_name {
154 const char *name; 154 const unsigned char *name;
155 int len; 155 int len;
156}; 156};
157 157
158#endif /* __XFS_TYPES_H__ */ 158#endif /* __XFS_TYPES_H__ */
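
One plausible reading of the const unsigned char switch in struct xfs_name: name bytes feed byte-wise hash functions, and a plain char (signed on x86) sign-extends bytes >= 0x80 before the arithmetic. A toy demonstration, assuming a signed-char ABI; the toy_hash_*() functions are illustrative, not the real xfs_da_hashname():

#include <stdio.h>

static unsigned int toy_hash_signed(const char *p, int len)
{
	unsigned int h = 0;
	while (len--)
		h = h * 31 + *p++;	/* *p sign-extends on signed-char ABIs */
	return h;
}

static unsigned int toy_hash_unsigned(const unsigned char *p, int len)
{
	unsigned int h = 0;
	while (len--)
		h = h * 31 + *p++;	/* always 0..255 */
	return h;
}

int main(void)
{
	const char name[] = "\xc3\xa9";	/* "é" in UTF-8 */

	printf("signed:   %u\n", toy_hash_signed(name, 2));
	printf("unsigned: %u\n",
	       toy_hash_unsigned((const unsigned char *)name, 2));
	return 0;
}
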
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index b572f7e840e0..9d376be0ea38 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -53,6 +53,7 @@
53#include "xfs_log_priv.h" 53#include "xfs_log_priv.h"
54#include "xfs_filestream.h" 54#include "xfs_filestream.h"
55#include "xfs_vnodeops.h" 55#include "xfs_vnodeops.h"
56#include "xfs_trace.h"
56 57
57int 58int
58xfs_setattr( 59xfs_setattr(
@@ -69,7 +70,6 @@ xfs_setattr(
69 uint commit_flags=0; 70 uint commit_flags=0;
70 uid_t uid=0, iuid=0; 71 uid_t uid=0, iuid=0;
71 gid_t gid=0, igid=0; 72 gid_t gid=0, igid=0;
72 int timeflags = 0;
73 struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; 73 struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2;
74 int need_iolock = 1; 74 int need_iolock = 1;
75 75
@@ -134,16 +134,13 @@ xfs_setattr(
134 if (flags & XFS_ATTR_NOLOCK) 134 if (flags & XFS_ATTR_NOLOCK)
135 need_iolock = 0; 135 need_iolock = 0;
136 if (!(mask & ATTR_SIZE)) { 136 if (!(mask & ATTR_SIZE)) {
137 if ((mask != (ATTR_CTIME|ATTR_ATIME|ATTR_MTIME)) || 137 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
138 (mp->m_flags & XFS_MOUNT_WSYNC)) { 138 commit_flags = 0;
139 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); 139 code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp),
140 commit_flags = 0; 140 0, 0, 0);
141 if ((code = xfs_trans_reserve(tp, 0, 141 if (code) {
142 XFS_ICHANGE_LOG_RES(mp), 0, 142 lock_flags = 0;
143 0, 0))) { 143 goto error_return;
144 lock_flags = 0;
145 goto error_return;
146 }
147 } 144 }
148 } else { 145 } else {
149 if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) && 146 if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
@@ -259,7 +256,7 @@ xfs_setattr(
259 iattr->ia_size > ip->i_d.di_size) { 256 iattr->ia_size > ip->i_d.di_size) {
260 code = xfs_flush_pages(ip, 257 code = xfs_flush_pages(ip,
261 ip->i_d.di_size, iattr->ia_size, 258 ip->i_d.di_size, iattr->ia_size,
262 XFS_B_ASYNC, FI_NONE); 259 XBF_ASYNC, FI_NONE);
263 } 260 }
264 261
265 /* wait for all I/O to complete */ 262 /* wait for all I/O to complete */
@@ -294,15 +291,23 @@ xfs_setattr(
294 * or we are explicitly asked to change it. This handles 291 * or we are explicitly asked to change it. This handles
295 * the semantic difference between truncate() and ftruncate() 292 * the semantic difference between truncate() and ftruncate()
296 * as implemented in the VFS. 293 * as implemented in the VFS.
294 *
295 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME
296 * is a special case where we need to update the times despite
297 * not having these flags set. For all other operations the
298 * VFS set these flags explicitly if it wants a timestamp
299 * update.
297 */ 300 */
298 if (iattr->ia_size != ip->i_size || (mask & ATTR_CTIME)) 301 if (iattr->ia_size != ip->i_size &&
299 timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 302 (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
303 iattr->ia_ctime = iattr->ia_mtime =
304 current_fs_time(inode->i_sb);
305 mask |= ATTR_CTIME | ATTR_MTIME;
306 }
300 307
301 if (iattr->ia_size > ip->i_size) { 308 if (iattr->ia_size > ip->i_size) {
302 ip->i_d.di_size = iattr->ia_size; 309 ip->i_d.di_size = iattr->ia_size;
303 ip->i_size = iattr->ia_size; 310 ip->i_size = iattr->ia_size;
304 if (!(flags & XFS_ATTR_DMI))
305 xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
306 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 311 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
307 } else if (iattr->ia_size <= ip->i_size || 312 } else if (iattr->ia_size <= ip->i_size ||
308 (iattr->ia_size == 0 && ip->i_d.di_nextents)) { 313 (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
@@ -373,9 +378,6 @@ xfs_setattr(
373 ip->i_d.di_gid = gid; 378 ip->i_d.di_gid = gid;
374 inode->i_gid = gid; 379 inode->i_gid = gid;
375 } 380 }
376
377 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
378 timeflags |= XFS_ICHGTIME_CHG;
379 } 381 }
380 382
381 /* 383 /*
@@ -392,51 +394,37 @@ xfs_setattr(
392 394
393 inode->i_mode &= S_IFMT; 395 inode->i_mode &= S_IFMT;
394 inode->i_mode |= mode & ~S_IFMT; 396 inode->i_mode |= mode & ~S_IFMT;
395
396 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
397 timeflags |= XFS_ICHGTIME_CHG;
398 } 397 }
399 398
400 /* 399 /*
401 * Change file access or modified times. 400 * Change file access or modified times.
402 */ 401 */
403 if (mask & (ATTR_ATIME|ATTR_MTIME)) { 402 if (mask & ATTR_ATIME) {
404 if (mask & ATTR_ATIME) { 403 inode->i_atime = iattr->ia_atime;
405 inode->i_atime = iattr->ia_atime; 404 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
406 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; 405 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
407 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; 406 ip->i_update_core = 1;
408 ip->i_update_core = 1;
409 }
410 if (mask & ATTR_MTIME) {
411 inode->i_mtime = iattr->ia_mtime;
412 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
413 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
414 timeflags &= ~XFS_ICHGTIME_MOD;
415 timeflags |= XFS_ICHGTIME_CHG;
416 }
417 if (tp && (mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)))
418 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
419 } 407 }
420 408 if (mask & ATTR_CTIME) {
421 /*
422 * Change file inode change time only if ATTR_CTIME set
423 * AND we have been called by a DMI function.
424 */
425
426 if ((flags & XFS_ATTR_DMI) && (mask & ATTR_CTIME)) {
427 inode->i_ctime = iattr->ia_ctime; 409 inode->i_ctime = iattr->ia_ctime;
428 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; 410 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
429 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; 411 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
430 ip->i_update_core = 1; 412 ip->i_update_core = 1;
431 timeflags &= ~XFS_ICHGTIME_CHG; 413 }
414 if (mask & ATTR_MTIME) {
415 inode->i_mtime = iattr->ia_mtime;
416 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
417 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
418 ip->i_update_core = 1;
432 } 419 }
433 420
434 /* 421 /*
435 * Send out timestamp changes that need to be set to the 422 * And finally, log the inode core if any attribute in it
436 * current time. Not done when called by a DMI function. 423 * has been changed.
437 */ 424 */
438 if (timeflags && !(flags & XFS_ATTR_DMI)) 425 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE|
439 xfs_ichgtime(ip, timeflags); 426 ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
427 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
440 428
441 XFS_STATS_INC(xs_ig_attrchg); 429 XFS_STATS_INC(xs_ig_attrchg);
442 430
@@ -451,12 +439,10 @@ xfs_setattr(
451 * mix so this probably isn't worth the trouble to optimize. 439 * mix so this probably isn't worth the trouble to optimize.
452 */ 440 */
453 code = 0; 441 code = 0;
454 if (tp) { 442 if (mp->m_flags & XFS_MOUNT_WSYNC)
455 if (mp->m_flags & XFS_MOUNT_WSYNC) 443 xfs_trans_set_sync(tp);
456 xfs_trans_set_sync(tp);
457 444
458 code = xfs_trans_commit(tp, commit_flags); 445 code = xfs_trans_commit(tp, commit_flags);
459 }
460 446
461 xfs_iunlock(ip, lock_flags); 447 xfs_iunlock(ip, lock_flags);
462 448
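
The timestamp rework in xfs_setattr() above boils down to one rule: a size change without ATTR_CTIME/ATTR_MTIME (the plain truncate() case) stamps both times itself, while every other caller relies on the VFS passing the flags explicitly, with a single ILOG_CORE at the end. A standalone sketch of that rule; fixup_time_mask() is a hypothetical name and time(NULL) stands in for current_fs_time():

#include <stdio.h>
#include <time.h>

#define ATTR_SIZE	(1 << 0)
#define ATTR_CTIME	(1 << 1)
#define ATTR_MTIME	(1 << 2)

static unsigned int fixup_time_mask(unsigned int mask, int size_changed,
				    time_t *ctime, time_t *mtime)
{
	/* truncate() special case: stamp ctime/mtime ourselves */
	if (size_changed && !(mask & (ATTR_CTIME | ATTR_MTIME))) {
		*ctime = *mtime = time(NULL);
		mask |= ATTR_CTIME | ATTR_MTIME;
	}
	return mask;
}

int main(void)
{
	time_t c = 0, m = 0;
	unsigned int mask = fixup_time_mask(ATTR_SIZE, 1, &c, &m);

	printf("mask=0x%x ctime=%ld\n", mask, (long)c);
	return 0;
}
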
@@ -538,9 +524,8 @@ xfs_readlink_bmap(
538 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); 524 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
539 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); 525 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
540 526
541 bp = xfs_buf_read_flags(mp->m_ddev_targp, d, BTOBB(byte_cnt), 527 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt),
542 XBF_LOCK | XBF_MAPPED | 528 XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK);
543 XBF_DONT_BLOCK);
544 error = XFS_BUF_GETERROR(bp); 529 error = XFS_BUF_GETERROR(bp);
545 if (error) { 530 if (error) {
546 xfs_ioerror_alert("xfs_readlink", 531 xfs_ioerror_alert("xfs_readlink",
@@ -599,114 +584,9 @@ xfs_readlink(
599} 584}
600 585
601/* 586/*
602 * xfs_fsync 587 * Flags for xfs_free_eofblocks
603 *
604 * This is called to sync the inode and its data out to disk. We need to hold
605 * the I/O lock while flushing the data, and the inode lock while flushing the
606 * inode. The inode lock CANNOT be held while flushing the data, so acquire
607 * after we're done with that.
608 */ 588 */
609int 589#define XFS_FREE_EOF_TRYLOCK (1<<0)
610xfs_fsync(
611 xfs_inode_t *ip)
612{
613 xfs_trans_t *tp;
614 int error = 0;
615 int log_flushed = 0, changed = 1;
616
617 xfs_itrace_entry(ip);
618
619 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
620 return XFS_ERROR(EIO);
621
622 /*
623 * We always need to make sure that the required inode state is safe on
624 * disk. The inode might be clean but we still might need to force the
625 * log because of committed transactions that haven't hit the disk yet.
626 * Likewise, there could be unflushed non-transactional changes to the
627 * inode core that have to go to disk and this requires us to issue
628 * a synchronous transaction to capture these changes correctly.
629 *
630 * This code relies on the assumption that if the update_* fields
631 * of the inode are clear and the inode is unpinned then it is clean
632 * and no action is required.
633 */
634 xfs_ilock(ip, XFS_ILOCK_SHARED);
635
636 if (!ip->i_update_core) {
637 /*
638 * Timestamps/size haven't changed since last inode flush or
639 * inode transaction commit. That means either nothing got
640 * written or a transaction committed which caught the updates.
641 * If the latter happened and the transaction hasn't hit the
642 * disk yet, the inode will still be pinned. If it is,
643 * force the log.
644 */
645
646 xfs_iunlock(ip, XFS_ILOCK_SHARED);
647
648 if (xfs_ipincount(ip)) {
649 error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
650 XFS_LOG_FORCE | XFS_LOG_SYNC,
651 &log_flushed);
652 } else {
653 /*
654 * If the inode is not pinned and nothing has changed
655 * we don't need to flush the cache.
656 */
657 changed = 0;
658 }
659 } else {
660 /*
661 * Kick off a transaction to log the inode core to get the
662 * updates. The sync transaction will also force the log.
663 */
664 xfs_iunlock(ip, XFS_ILOCK_SHARED);
665 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
666 error = xfs_trans_reserve(tp, 0,
667 XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
668 if (error) {
669 xfs_trans_cancel(tp, 0);
670 return error;
671 }
672 xfs_ilock(ip, XFS_ILOCK_EXCL);
673
674 /*
675 * Note - it's possible that we might have pushed ourselves out
676 * of the way during trans_reserve which would flush the inode.
677 * But there's no guarantee that the inode buffer has actually
678 * gone out yet (it's delwri). Plus the buffer could be pinned
679 * anyway if it's part of an inode in another recent
680 * transaction. So we play it safe and fire off the
681 * transaction anyway.
682 */
683 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
684 xfs_trans_ihold(tp, ip);
685 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
686 xfs_trans_set_sync(tp);
687 error = _xfs_trans_commit(tp, 0, &log_flushed);
688
689 xfs_iunlock(ip, XFS_ILOCK_EXCL);
690 }
691
692 if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
693 /*
694 * If the log write didn't issue an ordered tag we need
695 * to flush the disk cache for the data device now.
696 */
697 if (!log_flushed)
698 xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
699
700 /*
701 * If this inode is on the RT dev we need to flush that
702 * cache as well.
703 */
704 if (XFS_IS_REALTIME_INODE(ip))
705 xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
706 }
707
708 return error;
709}
710 590
711/* 591/*
712 * This is called by xfs_inactive to free any blocks beyond eof 592 * This is called by xfs_inactive to free any blocks beyond eof
@@ -726,7 +606,6 @@ xfs_free_eofblocks(
726 xfs_filblks_t map_len; 606 xfs_filblks_t map_len;
727 int nimaps; 607 int nimaps;
728 xfs_bmbt_irec_t imap; 608 xfs_bmbt_irec_t imap;
729 int use_iolock = (flags & XFS_FREE_EOF_LOCK);
730 609
731 /* 610 /*
732 * Figure out if there are any blocks beyond the end 611 * Figure out if there are any blocks beyond the end
@@ -768,14 +647,19 @@ xfs_free_eofblocks(
768 * cache and we can't 647 * cache and we can't
769 * do that within a transaction. 648 * do that within a transaction.
770 */ 649 */
771 if (use_iolock) 650 if (flags & XFS_FREE_EOF_TRYLOCK) {
651 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
652 xfs_trans_cancel(tp, 0);
653 return 0;
654 }
655 } else {
772 xfs_ilock(ip, XFS_IOLOCK_EXCL); 656 xfs_ilock(ip, XFS_IOLOCK_EXCL);
657 }
773 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 658 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
774 ip->i_size); 659 ip->i_size);
775 if (error) { 660 if (error) {
776 xfs_trans_cancel(tp, 0); 661 xfs_trans_cancel(tp, 0);
777 if (use_iolock) 662 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
778 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
779 return error; 663 return error;
780 } 664 }
781 665
@@ -812,8 +696,7 @@ xfs_free_eofblocks(
812 error = xfs_trans_commit(tp, 696 error = xfs_trans_commit(tp,
813 XFS_TRANS_RELEASE_LOG_RES); 697 XFS_TRANS_RELEASE_LOG_RES);
814 } 698 }
815 xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL) 699 xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
816 : XFS_ILOCK_EXCL));
817 } 700 }
818 return error; 701 return error;
819} 702}
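
The new XFS_FREE_EOF_TRYLOCK path above bails out rather than blocking on the iolock, since blocking there from xfs_release() can deadlock against mmap_sem; xfs_inactive() still takes the blocking path because it holds the last reference. A pthreads sketch of the same shape, with illustrative names:

#include <pthread.h>
#include <stdio.h>

static int free_eofblocks(pthread_mutex_t *iolock, int trylock)
{
	if (trylock) {
		if (pthread_mutex_trylock(iolock) != 0)
			return 0;	/* can't get it now: skip, retry later */
	} else {
		pthread_mutex_lock(iolock);	/* last reference: may block */
	}

	/* ... truncate blocks past EOF here ... */

	pthread_mutex_unlock(iolock);
	return 0;
}

int main(void)
{
	pthread_mutex_t iolock = PTHREAD_MUTEX_INITIALIZER;

	free_eofblocks(&iolock, 1);	/* xfs_release(): opportunistic */
	free_eofblocks(&iolock, 0);	/* xfs_inactive(): must succeed */
	return 0;
}
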
@@ -1103,7 +986,7 @@ xfs_release(
1103 */ 986 */
1104 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); 987 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1105 if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) 988 if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
1106 xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE); 989 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
1107 } 990 }
1108 991
1109 if (ip->i_d.di_nlink != 0) { 992 if (ip->i_d.di_nlink != 0) {
@@ -1113,7 +996,17 @@ xfs_release(
1113 (ip->i_df.if_flags & XFS_IFEXTENTS)) && 996 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
1114 (!(ip->i_d.di_flags & 997 (!(ip->i_d.di_flags &
1115 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { 998 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
1116 error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK); 999
1000 /*
1001 * If we can't get the iolock just skip truncating
1002 * the blocks past EOF because we could deadlock
1003 * with the mmap_sem otherwise. We'll get another
1004 * chance to drop them once the last reference to
1005 * the inode is dropped, so we'll never leak blocks
1006 * permanently.
1007 */
1008 error = xfs_free_eofblocks(mp, ip,
1009 XFS_FREE_EOF_TRYLOCK);
1117 if (error) 1010 if (error)
1118 return error; 1011 return error;
1119 } 1012 }
@@ -1184,7 +1077,7 @@ xfs_inactive(
1184 (!(ip->i_d.di_flags & 1077 (!(ip->i_d.di_flags &
1185 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || 1078 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
1186 (ip->i_delayed_blks != 0)))) { 1079 (ip->i_delayed_blks != 0)))) {
1187 error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK); 1080 error = xfs_free_eofblocks(mp, ip, 0);
1188 if (error) 1081 if (error)
1189 return VN_INACTIVE_CACHE; 1082 return VN_INACTIVE_CACHE;
1190 } 1083 }
@@ -1380,7 +1273,6 @@ xfs_lookup(
1380 if (error) 1273 if (error)
1381 goto out_free_name; 1274 goto out_free_name;
1382 1275
1383 xfs_itrace_ref(*ipp);
1384 return 0; 1276 return 0;
1385 1277
1386out_free_name: 1278out_free_name:
@@ -1526,7 +1418,6 @@ xfs_create(
1526 * At this point, we've gotten a newly allocated inode. 1418 * At this point, we've gotten a newly allocated inode.
1527 * It is locked (and joined to the transaction). 1419 * It is locked (and joined to the transaction).
1528 */ 1420 */
1529 xfs_itrace_ref(ip);
1530 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 1421 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1531 1422
1532 /* 1423 /*
@@ -1986,9 +1877,6 @@ xfs_remove(
1986 if (!is_dir && link_zero && xfs_inode_is_filestream(ip)) 1877 if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
1987 xfs_filestream_deassociate(ip); 1878 xfs_filestream_deassociate(ip);
1988 1879
1989 xfs_itrace_exit(ip);
1990 xfs_itrace_exit(dp);
1991
1992 std_return: 1880 std_return:
1993 if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { 1881 if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
1994 XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL, 1882 XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL,
@@ -2201,7 +2089,8 @@ xfs_symlink(
2201 if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) { 2089 if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
2202 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp, 2090 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
2203 DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, 2091 DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2204 link_name->name, target_path, 0, 0, 0); 2092 link_name->name,
2093 (unsigned char *)target_path, 0, 0, 0);
2205 if (error) 2094 if (error)
2206 return error; 2095 return error;
2207 } 2096 }
@@ -2285,7 +2174,6 @@ xfs_symlink(
2285 goto error_return; 2174 goto error_return;
2286 goto error1; 2175 goto error1;
2287 } 2176 }
2288 xfs_itrace_ref(ip);
2289 2177
2290 /* 2178 /*
2291 * An error after we've joined dp to the transaction will result in the 2179 * An error after we've joined dp to the transaction will result in the
@@ -2398,7 +2286,8 @@ std_return:
2398 dp, DM_RIGHT_NULL, 2286 dp, DM_RIGHT_NULL,
2399 error ? NULL : ip, 2287 error ? NULL : ip,
2400 DM_RIGHT_NULL, link_name->name, 2288 DM_RIGHT_NULL, link_name->name,
2401 target_path, 0, error, 0); 2289 (unsigned char *)target_path,
2290 0, error, 0);
2402 } 2291 }
2403 2292
2404 if (!error) 2293 if (!error)
@@ -2456,46 +2345,6 @@ xfs_set_dmattrs(
2456 return error; 2345 return error;
2457} 2346}
2458 2347
2459int
2460xfs_reclaim(
2461 xfs_inode_t *ip)
2462{
2463
2464 xfs_itrace_entry(ip);
2465
2466 ASSERT(!VN_MAPPED(VFS_I(ip)));
2467
2468 /* bad inode, get out here ASAP */
2469 if (is_bad_inode(VFS_I(ip))) {
2470 xfs_ireclaim(ip);
2471 return 0;
2472 }
2473
2474 xfs_ioend_wait(ip);
2475
2476 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
2477
2478 /*
2479 * If we have nothing to flush with this inode then complete the
2480 * teardown now, otherwise break the link between the xfs inode and the
2481 * linux inode and clean up the xfs inode later. This avoids flushing
2482 * the inode to disk during the delete operation itself.
2483 *
2484 * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
2485 * first to ensure that xfs_iunpin() will never see an xfs inode
2486 * that has a linux inode being reclaimed. Synchronisation is provided
2487 * by the i_flags_lock.
2488 */
2489 if (!ip->i_update_core && (ip->i_itemp == NULL)) {
2490 xfs_ilock(ip, XFS_ILOCK_EXCL);
2491 xfs_iflock(ip);
2492 xfs_iflags_set(ip, XFS_IRECLAIMABLE);
2493 return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
2494 }
2495 xfs_inode_set_reclaim_tag(ip);
2496 return 0;
2497}
2498
2499/* 2348/*
2500 * xfs_alloc_file_space() 2349 * xfs_alloc_file_space()
2501 * This routine allocates disk space for the given file. 2350 * This routine allocates disk space for the given file.
@@ -2868,7 +2717,6 @@ xfs_free_file_space(
2868 ioffset = offset & ~(rounding - 1); 2717 ioffset = offset & ~(rounding - 1);
2869 2718
2870 if (VN_CACHED(VFS_I(ip)) != 0) { 2719 if (VN_CACHED(VFS_I(ip)) != 0) {
2871 xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1);
2872 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED); 2720 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
2873 if (error) 2721 if (error)
2874 goto out_unlock_iolock; 2722 goto out_unlock_iolock;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index a9e102de71a1..d8dfa8d0dadd 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -21,7 +21,6 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
21#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ 21#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */
22 22
23int xfs_readlink(struct xfs_inode *ip, char *link); 23int xfs_readlink(struct xfs_inode *ip, char *link);
24int xfs_fsync(struct xfs_inode *ip);
25int xfs_release(struct xfs_inode *ip); 24int xfs_release(struct xfs_inode *ip);
26int xfs_inactive(struct xfs_inode *ip); 25int xfs_inactive(struct xfs_inode *ip);
27int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, 26int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
@@ -38,31 +37,18 @@ int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
38 const char *target_path, mode_t mode, struct xfs_inode **ipp, 37 const char *target_path, mode_t mode, struct xfs_inode **ipp,
39 cred_t *credp); 38 cred_t *credp);
40int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); 39int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
41int xfs_reclaim(struct xfs_inode *ip);
42int xfs_change_file_space(struct xfs_inode *ip, int cmd, 40int xfs_change_file_space(struct xfs_inode *ip, int cmd,
43 xfs_flock64_t *bf, xfs_off_t offset, int attr_flags); 41 xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
44int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, 42int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
45 struct xfs_inode *src_ip, struct xfs_inode *target_dp, 43 struct xfs_inode *src_ip, struct xfs_inode *target_dp,
46 struct xfs_name *target_name, struct xfs_inode *target_ip); 44 struct xfs_name *target_name, struct xfs_inode *target_ip);
47int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value, 45int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
48 int *valuelenp, int flags); 46 unsigned char *value, int *valuelenp, int flags);
49int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value, 47int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
50 int valuelen, int flags); 48 unsigned char *value, int valuelen, int flags);
51int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags); 49int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
52int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, 50int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
53 int flags, struct attrlist_cursor_kern *cursor); 51 int flags, struct attrlist_cursor_kern *cursor);
54ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb,
55 const struct iovec *iovp, unsigned int segs,
56 loff_t *offset, int ioflags);
57ssize_t xfs_splice_read(struct xfs_inode *ip, struct file *infilp,
58 loff_t *ppos, struct pipe_inode_info *pipe, size_t count,
59 int flags, int ioflags);
60ssize_t xfs_splice_write(struct xfs_inode *ip,
61 struct pipe_inode_info *pipe, struct file *outfilp,
62 loff_t *ppos, size_t count, int flags, int ioflags);
63ssize_t xfs_write(struct xfs_inode *xip, struct kiocb *iocb,
64 const struct iovec *iovp, unsigned int nsegs,
65 loff_t *offset, int ioflags);
66int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, 52int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
67 int flags, struct xfs_iomap *iomapp, int *niomaps); 53 int flags, struct xfs_iomap *iomapp, int *niomaps);
68void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first, 54void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first,
@@ -73,4 +59,6 @@ int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
73 xfs_off_t last, uint64_t flags, int fiopt); 59 xfs_off_t last, uint64_t flags, int fiopt);
74int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last); 60int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
75 61
62int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
63
76#endif /* _XFS_VNODEOPS_H */ 64#endif /* _XFS_VNODEOPS_H */