author    Ingo Molnar <mingo@elte.hu>  2010-04-02 14:02:55 -0400
committer Ingo Molnar <mingo@elte.hu>  2010-04-02 14:03:08 -0400
commit    c9494727cf293ae2ec66af57547a3e79c724fec2 (patch)
tree      44ae197b64fa7530ee695a90ad31326dda06f1e1 /fs
parent    6427462bfa50f50dc6c088c07037264fcc73eca1 (diff)
parent    42be79e37e264557f12860fa4cc84b4de3685954 (diff)
Merge branch 'linus' into sched/core
Merge reason: update to latest upstream

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/fid.c | 2
-rw-r--r--  fs/9p/v9fs.c | 8
-rw-r--r--  fs/9p/v9fs.h | 23
-rw-r--r--  fs/9p/vfs_dir.c | 13
-rw-r--r--  fs/9p/vfs_file.c | 8
-rw-r--r--  fs/9p/vfs_inode.c | 48
-rw-r--r--  fs/Kconfig | 2
-rw-r--r--  fs/Makefile | 2
-rw-r--r--  fs/adfs/adfs.h | 2
-rw-r--r--  fs/adfs/inode.c | 5
-rw-r--r--  fs/affs/affs.h | 3
-rw-r--r--  fs/affs/bitmap.c | 2
-rw-r--r--  fs/affs/inode.c | 2
-rw-r--r--  fs/afs/internal.h | 1
-rw-r--r--  fs/afs/security.c | 5
-rw-r--r--  fs/afs/super.c | 1
-rw-r--r--  fs/afs/write.c | 21
-rw-r--r--  fs/anon_inodes.c | 1
-rw-r--r--  fs/attr.c | 13
-rw-r--r--  fs/autofs4/autofs_i.h | 7
-rw-r--r--  fs/autofs4/dev-ioctl.c | 11
-rw-r--r--  fs/autofs4/expire.c | 6
-rw-r--r--  fs/autofs4/inode.c | 63
-rw-r--r--  fs/autofs4/root.c | 474
-rw-r--r--  fs/bfs/inode.c | 5
-rw-r--r--  fs/binfmt_aout.c | 52
-rw-r--r--  fs/binfmt_elf.c | 151
-rw-r--r--  fs/binfmt_elf_fdpic.c | 183
-rw-r--r--  fs/binfmt_flat.c | 2
-rw-r--r--  fs/bio.c | 11
-rw-r--r--  fs/btrfs/btrfs_inode.h | 5
-rw-r--r--  fs/btrfs/compression.c | 2
-rw-r--r--  fs/btrfs/ctree.h | 15
-rw-r--r--  fs/btrfs/disk-io.c | 19
-rw-r--r--  fs/btrfs/export.c | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 11
-rw-r--r--  fs/btrfs/extent_io.c | 83
-rw-r--r--  fs/btrfs/extent_io.h | 10
-rw-r--r--  fs/btrfs/extent_map.c | 2
-rw-r--r--  fs/btrfs/file.c | 23
-rw-r--r--  fs/btrfs/free-space-cache.c | 4
-rw-r--r--  fs/btrfs/inode.c | 143
-rw-r--r--  fs/btrfs/ioctl.c | 706
-rw-r--r--  fs/btrfs/ioctl.h | 111
-rw-r--r--  fs/btrfs/ordered-data.c | 41
-rw-r--r--  fs/btrfs/ordered-data.h | 9
-rw-r--r--  fs/btrfs/ref-cache.h | 2
-rw-r--r--  fs/btrfs/relocation.c | 8
-rw-r--r--  fs/btrfs/super.c | 243
-rw-r--r--  fs/btrfs/sysfs.c | 4
-rw-r--r--  fs/btrfs/transaction.c | 7
-rw-r--r--  fs/btrfs/tree-log.c | 2
-rw-r--r--  fs/btrfs/volumes.c | 39
-rw-r--r--  fs/buffer.c | 15
-rw-r--r--  fs/ceph/Kconfig | 27
-rw-r--r--  fs/ceph/Makefile | 39
-rw-r--r--  fs/ceph/README | 20
-rw-r--r--  fs/ceph/addr.c | 1194
-rw-r--r--  fs/ceph/armor.c | 99
-rw-r--r--  fs/ceph/auth.c | 257
-rw-r--r--  fs/ceph/auth.h | 84
-rw-r--r--  fs/ceph/auth_none.c | 121
-rw-r--r--  fs/ceph/auth_none.h | 28
-rw-r--r--  fs/ceph/auth_x.c | 679
-rw-r--r--  fs/ceph/auth_x.h | 49
-rw-r--r--  fs/ceph/auth_x_protocol.h | 90
-rw-r--r--  fs/ceph/buffer.c | 78
-rw-r--r--  fs/ceph/buffer.h | 39
-rw-r--r--  fs/ceph/caps.c | 2932
-rw-r--r--  fs/ceph/ceph_debug.h | 37
-rw-r--r--  fs/ceph/ceph_frag.c | 21
-rw-r--r--  fs/ceph/ceph_frag.h | 109
-rw-r--r--  fs/ceph/ceph_fs.c | 74
-rw-r--r--  fs/ceph/ceph_fs.h | 650
-rw-r--r--  fs/ceph/ceph_hash.c | 118
-rw-r--r--  fs/ceph/ceph_hash.h | 13
-rw-r--r--  fs/ceph/ceph_strings.c | 176
-rw-r--r--  fs/ceph/crush/crush.c | 151
-rw-r--r--  fs/ceph/crush/crush.h | 180
-rw-r--r--  fs/ceph/crush/hash.c | 149
-rw-r--r--  fs/ceph/crush/hash.h | 17
-rw-r--r--  fs/ceph/crush/mapper.c | 596
-rw-r--r--  fs/ceph/crush/mapper.h | 20
-rw-r--r--  fs/ceph/crypto.c | 408
-rw-r--r--  fs/ceph/crypto.h | 48
-rw-r--r--  fs/ceph/debugfs.c | 483
-rw-r--r--  fs/ceph/decode.h | 194
-rw-r--r--  fs/ceph/dir.c | 1222
-rw-r--r--  fs/ceph/export.c | 223
-rw-r--r--  fs/ceph/file.c | 937
-rw-r--r--  fs/ceph/inode.c | 1766
-rw-r--r--  fs/ceph/ioctl.c | 160
-rw-r--r--  fs/ceph/ioctl.h | 40
-rw-r--r--  fs/ceph/mds_client.c | 3042
-rw-r--r--  fs/ceph/mds_client.h | 335
-rw-r--r--  fs/ceph/mdsmap.c | 174
-rw-r--r--  fs/ceph/mdsmap.h | 54
-rw-r--r--  fs/ceph/messenger.c | 2239
-rw-r--r--  fs/ceph/messenger.h | 255
-rw-r--r--  fs/ceph/mon_client.c | 834
-rw-r--r--  fs/ceph/mon_client.h | 119
-rw-r--r--  fs/ceph/msgpool.c | 186
-rw-r--r--  fs/ceph/msgpool.h | 27
-rw-r--r--  fs/ceph/msgr.h | 158
-rw-r--r--  fs/ceph/osd_client.c | 1550
-rw-r--r--  fs/ceph/osd_client.h | 166
-rw-r--r--  fs/ceph/osdmap.c | 1022
-rw-r--r--  fs/ceph/osdmap.h | 125
-rw-r--r--  fs/ceph/pagelist.c | 54
-rw-r--r--  fs/ceph/pagelist.h | 54
-rw-r--r--  fs/ceph/rados.h | 374
-rw-r--r--  fs/ceph/snap.c | 906
-rw-r--r--  fs/ceph/super.c | 1030
-rw-r--r--  fs/ceph/super.h | 901
-rw-r--r--  fs/ceph/types.h | 29
-rw-r--r--  fs/ceph/xattr.c | 844
-rw-r--r--  fs/cifs/CHANGES | 3
-rw-r--r--  fs/cifs/asn1.c | 2
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 2
-rw-r--r--  fs/cifs/cifsfs.c | 3
-rw-r--r--  fs/cifs/cifsfs.h | 3
-rw-r--r--  fs/cifs/cifsglob.h | 3
-rw-r--r--  fs/cifs/cifspdu.h | 6
-rw-r--r--  fs/cifs/cifsproto.h | 13
-rw-r--r--  fs/cifs/cifssmb.c | 497
-rw-r--r--  fs/cifs/connect.c | 8
-rw-r--r--  fs/cifs/dir.c | 2
-rw-r--r--  fs/cifs/file.c | 11
-rw-r--r--  fs/cifs/inode.c | 300
-rw-r--r--  fs/cifs/misc.c | 2
-rw-r--r--  fs/cifs/xattr.c | 8
-rw-r--r--  fs/compat.c | 18
-rw-r--r--  fs/compat_binfmt_elf.c | 2
-rw-r--r--  fs/compat_ioctl.c | 4
-rw-r--r--  fs/dcache.c | 70
-rw-r--r--  fs/debugfs/inode.c | 2
-rw-r--r--  fs/dlm/lockspace.c | 2
-rw-r--r--  fs/dlm/member.c | 2
-rw-r--r--  fs/exec.c | 50
-rw-r--r--  fs/exofs/common.h | 39
-rw-r--r--  fs/exofs/exofs.h | 55
-rw-r--r--  fs/exofs/inode.c | 198
-rw-r--r--  fs/exofs/ios.c | 575
-rw-r--r--  fs/exofs/super.c | 121
-rw-r--r--  fs/ext2/balloc.c | 12
-rw-r--r--  fs/ext2/ext2.h | 2
-rw-r--r--  fs/ext2/file.c | 5
-rw-r--r--  fs/ext2/ialloc.c | 14
-rw-r--r--  fs/ext2/inode.c | 18
-rw-r--r--  fs/ext2/namei.c | 51
-rw-r--r--  fs/ext2/super.c | 2
-rw-r--r--  fs/ext2/xattr.c | 10
-rw-r--r--  fs/ext3/balloc.c | 11
-rw-r--r--  fs/ext3/file.c | 7
-rw-r--r--  fs/ext3/ialloc.c | 20
-rw-r--r--  fs/ext3/inode.c | 47
-rw-r--r--  fs/ext3/namei.c | 24
-rw-r--r--  fs/ext3/super.c | 248
-rw-r--r--  fs/ext3/xattr.c | 22
-rw-r--r--  fs/ext4/balloc.c | 35
-rw-r--r--  fs/ext4/block_validity.c | 4
-rw-r--r--  fs/ext4/dir.c | 14
-rw-r--r--  fs/ext4/ext4.h | 110
-rw-r--r--  fs/ext4/ext4_jbd2.c | 4
-rw-r--r--  fs/ext4/ext4_jbd2.h | 24
-rw-r--r--  fs/ext4/extents.c | 260
-rw-r--r--  fs/ext4/file.c | 13
-rw-r--r--  fs/ext4/fsync.c | 2
-rw-r--r--  fs/ext4/ialloc.c | 52
-rw-r--r--  fs/ext4/inode.c | 496
-rw-r--r--  fs/ext4/ioctl.c | 12
-rw-r--r--  fs/ext4/mballoc.c | 81
-rw-r--r--  fs/ext4/mballoc.h | 9
-rw-r--r--  fs/ext4/migrate.c | 35
-rw-r--r--  fs/ext4/move_extent.c | 36
-rw-r--r--  fs/ext4/namei.c | 86
-rw-r--r--  fs/ext4/resize.c | 102
-rw-r--r--  fs/ext4/super.c | 390
-rw-r--r--  fs/ext4/xattr.c | 64
-rw-r--r--  fs/fat/inode.c | 11
-rw-r--r--  fs/fat/namei_vfat.c | 33
-rw-r--r--  fs/fcntl.c | 2
-rw-r--r--  fs/file.c | 2
-rw-r--r--  fs/file_table.c | 2
-rw-r--r--  fs/fs-writeback.c | 22
-rw-r--r--  fs/fscache/Kconfig | 1
-rw-r--r--  fs/fscache/object.c | 6
-rw-r--r--  fs/fscache/operation.c | 4
-rw-r--r--  fs/fscache/page.c | 1
-rw-r--r--  fs/fuse/dev.c | 30
-rw-r--r--  fs/fuse/inode.c | 2
-rw-r--r--  fs/gfs2/Kconfig | 1
-rw-r--r--  fs/gfs2/aops.c | 4
-rw-r--r--  fs/gfs2/file.c | 2
-rw-r--r--  fs/gfs2/glock.c | 75
-rw-r--r--  fs/gfs2/glock.h | 7
-rw-r--r--  fs/gfs2/glops.c | 16
-rw-r--r--  fs/gfs2/incore.h | 7
-rw-r--r--  fs/gfs2/inode.c | 6
-rw-r--r--  fs/gfs2/lock_dlm.c | 5
-rw-r--r--  fs/gfs2/log.c | 3
-rw-r--r--  fs/gfs2/lops.c | 4
-rw-r--r--  fs/gfs2/main.c | 28
-rw-r--r--  fs/gfs2/meta_io.c | 46
-rw-r--r--  fs/gfs2/meta_io.h | 12
-rw-r--r--  fs/gfs2/ops_fstype.c | 6
-rw-r--r--  fs/gfs2/ops_inode.c | 113
-rw-r--r--  fs/gfs2/quota.c | 9
-rw-r--r--  fs/gfs2/quota.h | 2
-rw-r--r--  fs/gfs2/super.c | 34
-rw-r--r--  fs/gfs2/sys.c | 8
-rw-r--r--  fs/gfs2/util.c | 1
-rw-r--r--  fs/gfs2/util.h | 1
-rw-r--r--  fs/hfs/hfs_fs.h | 2
-rw-r--r--  fs/hfs/inode.c | 2
-rw-r--r--  fs/hfsplus/super.c | 3
-rw-r--r--  fs/hpfs/anode.c | 2
-rw-r--r--  fs/hpfs/dentry.c | 14
-rw-r--r--  fs/hpfs/dir.c | 14
-rw-r--r--  fs/hpfs/dnode.c | 21
-rw-r--r--  fs/hpfs/ea.c | 7
-rw-r--r--  fs/hpfs/hpfs_fn.h | 30
-rw-r--r--  fs/hpfs/inode.c | 4
-rw-r--r--  fs/hpfs/map.c | 6
-rw-r--r--  fs/hpfs/name.c | 21
-rw-r--r--  fs/hpfs/namei.c | 75
-rw-r--r--  fs/hppfs/hppfs.c | 2
-rw-r--r--  fs/inode.c | 4
-rw-r--r--  fs/internal.h | 2
-rw-r--r--  fs/jbd/commit.c | 10
-rw-r--r--  fs/jbd/transaction.c | 45
-rw-r--r--  fs/jbd2/checkpoint.c | 1
-rw-r--r--  fs/jbd2/commit.c | 13
-rw-r--r--  fs/jbd2/journal.c | 132
-rw-r--r--  fs/jbd2/transaction.c | 43
-rw-r--r--  fs/jffs2/readinode.c | 2
-rw-r--r--  fs/jfs/acl.c | 26
-rw-r--r--  fs/jfs/file.c | 31
-rw-r--r--  fs/jfs/inode.c | 14
-rw-r--r--  fs/jfs/jfs_acl.h | 7
-rw-r--r--  fs/jfs/jfs_dtree.c | 28
-rw-r--r--  fs/jfs/jfs_extent.c | 16
-rw-r--r--  fs/jfs/jfs_inode.c | 8
-rw-r--r--  fs/jfs/jfs_inode.h | 3
-rw-r--r--  fs/jfs/jfs_xtree.c | 21
-rw-r--r--  fs/jfs/namei.c | 23
-rw-r--r--  fs/jfs/super.c | 6
-rw-r--r--  fs/jfs/xattr.c | 17
-rw-r--r--  fs/libfs.c | 77
-rw-r--r--  fs/lockd/host.c | 2
-rw-r--r--  fs/lockd/mon.c | 12
-rw-r--r--  fs/lockd/svc.c | 2
-rw-r--r--  fs/locks.c | 7
-rw-r--r--  fs/logfs/Kconfig | 17
-rw-r--r--  fs/logfs/Makefile | 13
-rw-r--r--  fs/logfs/compr.c | 95
-rw-r--r--  fs/logfs/dev_bdev.c | 332
-rw-r--r--  fs/logfs/dev_mtd.c | 254
-rw-r--r--  fs/logfs/dir.c | 827
-rw-r--r--  fs/logfs/file.c | 263
-rw-r--r--  fs/logfs/gc.c | 730
-rw-r--r--  fs/logfs/inode.c | 417
-rw-r--r--  fs/logfs/journal.c | 890
-rw-r--r--  fs/logfs/logfs.h | 725
-rw-r--r--  fs/logfs/logfs_abi.h | 629
-rw-r--r--  fs/logfs/readwrite.c | 2257
-rw-r--r--  fs/logfs/segment.c | 935
-rw-r--r--  fs/logfs/super.c | 649
-rw-r--r--  fs/minix/inode.c | 8
-rw-r--r--  fs/mpage.c | 2
-rw-r--r--  fs/namei.c | 575
-rw-r--r--  fs/namespace.c | 53
-rw-r--r--  fs/nfs/callback.c | 2
-rw-r--r--  fs/nfs/callback.h | 8
-rw-r--r--  fs/nfs/callback_proc.c | 165
-rw-r--r--  fs/nfs/callback_xdr.c | 106
-rw-r--r--  fs/nfs/client.c | 48
-rw-r--r--  fs/nfs/delegation.h | 6
-rw-r--r--  fs/nfs/dir.c | 4
-rw-r--r--  fs/nfs/dns_resolve.c | 18
-rw-r--r--  fs/nfs/file.c | 33
-rw-r--r--  fs/nfs/inode.c | 104
-rw-r--r--  fs/nfs/internal.h | 2
-rw-r--r--  fs/nfs/iostat.h | 4
-rw-r--r--  fs/nfs/nfs3proc.c | 9
-rw-r--r--  fs/nfs/nfs4_fs.h | 2
-rw-r--r--  fs/nfs/nfs4proc.c | 112
-rw-r--r--  fs/nfs/nfs4renewd.c | 24
-rw-r--r--  fs/nfs/nfs4state.c | 118
-rw-r--r--  fs/nfs/nfs4xdr.c | 12
-rw-r--r--  fs/nfs/pagelist.c | 23
-rw-r--r--  fs/nfs/proc.c | 41
-rw-r--r--  fs/nfs/super.c | 25
-rw-r--r--  fs/nfs/symlink.c | 2
-rw-r--r--  fs/nfs/write.c | 247
-rw-r--r--  fs/nfsctl.c | 5
-rw-r--r--  fs/nfsd/nfs4callback.c | 5
-rw-r--r--  fs/nfsd/nfs4recover.c | 4
-rw-r--r--  fs/nfsd/nfs4state.c | 6
-rw-r--r--  fs/nfsd/nfs4xdr.c | 16
-rw-r--r--  fs/nfsd/nfsctl.c | 24
-rw-r--r--  fs/nfsd/vfs.c | 161
-rw-r--r--  fs/nilfs2/alloc.h | 2
-rw-r--r--  fs/nilfs2/dat.c | 5
-rw-r--r--  fs/nilfs2/dir.c | 16
-rw-r--r--  fs/nilfs2/gcinode.c | 4
-rw-r--r--  fs/nilfs2/ioctl.c | 66
-rw-r--r--  fs/nilfs2/namei.c | 13
-rw-r--r--  fs/nilfs2/nilfs.h | 4
-rw-r--r--  fs/nilfs2/page.c | 4
-rw-r--r--  fs/nilfs2/recovery.c | 41
-rw-r--r--  fs/nilfs2/segbuf.c | 36
-rw-r--r--  fs/nilfs2/segbuf.h | 5
-rw-r--r--  fs/nilfs2/segment.c | 143
-rw-r--r--  fs/nilfs2/segment.h | 6
-rw-r--r--  fs/nilfs2/sufile.c | 2
-rw-r--r--  fs/nilfs2/super.c | 19
-rw-r--r--  fs/nilfs2/the_nilfs.c | 40
-rw-r--r--  fs/nilfs2/the_nilfs.h | 3
-rw-r--r--  fs/notify/inotify/inotify_user.c | 59
-rw-r--r--  fs/ntfs/ChangeLog | 1702
-rw-r--r--  fs/ntfs/dir.c | 2
-rw-r--r--  fs/ntfs/file.c | 2
-rw-r--r--  fs/ntfs/inode.c | 2
-rw-r--r--  fs/ntfs/inode.h | 4
-rw-r--r--  fs/ntfs/super.c | 33
-rw-r--r--  fs/ocfs2/Makefile | 1
-rw-r--r--  fs/ocfs2/acl.c | 77
-rw-r--r--  fs/ocfs2/alloc.c | 18
-rw-r--r--  fs/ocfs2/aops.c | 16
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 3
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 7
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 4
-rw-r--r--  fs/ocfs2/dir.c | 39
-rw-r--r--  fs/ocfs2/dlm/Makefile | 3
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 4
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 2
-rw-r--r--  fs/ocfs2/dlmfs/Makefile | 5
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c (renamed from fs/ocfs2/dlm/dlmfs.c) | 127
-rw-r--r--  fs/ocfs2/dlmfs/dlmfsver.c (renamed from fs/ocfs2/dlm/dlmfsver.c) | 0
-rw-r--r--  fs/ocfs2/dlmfs/dlmfsver.h (renamed from fs/ocfs2/dlm/dlmfsver.h) | 0
-rw-r--r--  fs/ocfs2/dlmfs/userdlm.c (renamed from fs/ocfs2/dlm/userdlm.c) | 308
-rw-r--r--  fs/ocfs2/dlmfs/userdlm.h (renamed from fs/ocfs2/dlm/userdlm.h) | 16
-rw-r--r--  fs/ocfs2/dlmglue.c | 286
-rw-r--r--  fs/ocfs2/extent_map.c | 2
-rw-r--r--  fs/ocfs2/file.c | 33
-rw-r--r--  fs/ocfs2/inode.c | 21
-rw-r--r--  fs/ocfs2/ioctl.h | 6
-rw-r--r--  fs/ocfs2/localalloc.c | 12
-rw-r--r--  fs/ocfs2/locks.c | 2
-rw-r--r--  fs/ocfs2/namei.c | 80
-rw-r--r--  fs/ocfs2/ocfs2.h | 34
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 57
-rw-r--r--  fs/ocfs2/ocfs2_ioctl.h | 79
-rw-r--r--  fs/ocfs2/ocfs2_lockingver.h | 2
-rw-r--r--  fs/ocfs2/quota_global.c | 7
-rw-r--r--  fs/ocfs2/quota_local.c | 2
-rw-r--r--  fs/ocfs2/refcounttree.c | 9
-rw-r--r--  fs/ocfs2/stack_o2cb.c | 37
-rw-r--r--  fs/ocfs2/stack_user.c | 49
-rw-r--r--  fs/ocfs2/stackglue.c | 98
-rw-r--r--  fs/ocfs2/stackglue.h | 95
-rw-r--r--  fs/ocfs2/suballoc.c | 300
-rw-r--r--  fs/ocfs2/suballoc.h | 6
-rw-r--r--  fs/ocfs2/super.c | 10
-rw-r--r--  fs/ocfs2/xattr.c | 2186
-rw-r--r--  fs/omfs/inode.c | 10
-rw-r--r--  fs/open.c | 7
-rw-r--r--  fs/partitions/msdos.c | 85
-rw-r--r--  fs/pnode.c | 28
-rw-r--r--  fs/pnode.h | 5
-rw-r--r--  fs/proc/array.c | 4
-rw-r--r--  fs/proc/base.c | 15
-rw-r--r--  fs/proc/generic.c | 38
-rw-r--r--  fs/proc/kcore.c | 2
-rw-r--r--  fs/proc/kmsg.c | 14
-rw-r--r--  fs/proc/root.c | 6
-rw-r--r--  fs/proc/task_mmu.c | 13
-rw-r--r--  fs/qnx4/inode.c | 3
-rw-r--r--  fs/quota/Kconfig | 5
-rw-r--r--  fs/quota/Makefile | 2
-rw-r--r--  fs/quota/compat.c | 118
-rw-r--r--  fs/quota/dquot.c | 412
-rw-r--r--  fs/quota/netlink.c | 95
-rw-r--r--  fs/quota/quota.c | 735
-rw-r--r--  fs/read_write.c | 2
-rw-r--r--  fs/reiserfs/bitmap.c | 12
-rw-r--r--  fs/reiserfs/file.c | 2
-rw-r--r--  fs/reiserfs/inode.c | 24
-rw-r--r--  fs/reiserfs/journal.c | 15
-rw-r--r--  fs/reiserfs/namei.c | 23
-rw-r--r--  fs/reiserfs/stree.c | 20
-rw-r--r--  fs/reiserfs/super.c | 15
-rw-r--r--  fs/reiserfs/xattr.c | 4
-rw-r--r--  fs/reiserfs/xattr_security.c | 2
-rw-r--r--  fs/select.c | 19
-rw-r--r--  fs/seq_file.c | 130
-rw-r--r--  fs/squashfs/Makefile | 2
-rw-r--r--  fs/squashfs/block.c | 76
-rw-r--r--  fs/squashfs/cache.c | 1
-rw-r--r--  fs/squashfs/decompressor.c | 68
-rw-r--r--  fs/squashfs/decompressor.h | 55
-rw-r--r--  fs/squashfs/dir.c | 1
-rw-r--r--  fs/squashfs/export.c | 1
-rw-r--r--  fs/squashfs/file.c | 1
-rw-r--r--  fs/squashfs/fragment.c | 1
-rw-r--r--  fs/squashfs/id.c | 1
-rw-r--r--  fs/squashfs/inode.c | 1
-rw-r--r--  fs/squashfs/namei.c | 1
-rw-r--r--  fs/squashfs/squashfs.h | 8
-rw-r--r--  fs/squashfs/squashfs_fs.h | 6
-rw-r--r--  fs/squashfs/squashfs_fs_sb.h | 40
-rw-r--r--  fs/squashfs/super.c | 49
-rw-r--r--  fs/squashfs/symlink.c | 1
-rw-r--r--  fs/squashfs/zlib_wrapper.c | 150
-rw-r--r--  fs/super.c | 21
-rw-r--r--  fs/sync.c | 14
-rw-r--r--  fs/sysfs/bin.c | 50
-rw-r--r--  fs/sysfs/dir.c | 132
-rw-r--r--  fs/sysfs/file.c | 47
-rw-r--r--  fs/sysfs/inode.c | 13
-rw-r--r--  fs/sysfs/mount.c | 4
-rw-r--r--  fs/sysfs/symlink.c | 38
-rw-r--r--  fs/sysfs/sysfs.h | 17
-rw-r--r--  fs/sysv/inode.c | 10
-rw-r--r--  fs/sysv/sysv.h | 2
-rw-r--r--  fs/ubifs/dir.c | 2
-rw-r--r--  fs/ubifs/file.c | 8
-rw-r--r--  fs/ubifs/super.c | 2
-rw-r--r--  fs/udf/balloc.c | 86
-rw-r--r--  fs/udf/dir.c | 4
-rw-r--r--  fs/udf/file.c | 28
-rw-r--r--  fs/udf/ialloc.c | 14
-rw-r--r--  fs/udf/inode.c | 48
-rw-r--r--  fs/udf/namei.c | 37
-rw-r--r--  fs/udf/symlink.c | 10
-rw-r--r--  fs/udf/udfdecl.h | 2
-rw-r--r--  fs/ufs/balloc.c | 24
-rw-r--r--  fs/ufs/dir.c | 10
-rw-r--r--  fs/ufs/file.c | 3
-rw-r--r--  fs/ufs/ialloc.c | 11
-rw-r--r--  fs/ufs/inode.c | 9
-rw-r--r--  fs/ufs/namei.c | 18
-rw-r--r--  fs/ufs/super.c | 9
-rw-r--r--  fs/ufs/truncate.c | 10
-rw-r--r--  fs/ufs/ufs.h | 6
-rw-r--r--  fs/ufs/ufs_fs.h | 15
-rw-r--r--  fs/xfs/Makefile | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 228
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 81
-rw-r--r--  fs/xfs/linux-2.6/xfs_export.c | 20
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 854
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 10
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.c | 796
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.h | 29
-rw-r--r--  fs/xfs/linux-2.6/xfs_quotaops.c | 19
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 8
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 10
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.c | 16
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h | 22
-rw-r--r--  fs/xfs/xfs_bmap.c | 220
-rw-r--r--  fs/xfs/xfs_fs.h | 3
-rw-r--r--  fs/xfs/xfs_iget.c | 19
-rw-r--r--  fs/xfs/xfs_inode.c | 68
-rw-r--r--  fs/xfs/xfs_inode.h | 3
-rw-r--r--  fs/xfs/xfs_inode_item.c | 18
-rw-r--r--  fs/xfs/xfs_itable.c | 2
-rw-r--r--  fs/xfs/xfs_log.c | 106
-rw-r--r--  fs/xfs/xfs_log.h | 16
-rw-r--r--  fs/xfs/xfs_mount.c | 69
-rw-r--r--  fs/xfs/xfs_mount.h | 4
-rw-r--r--  fs/xfs/xfs_trans.c | 2
-rw-r--r--  fs/xfs/xfs_trans.h | 2
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 216
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 107
-rw-r--r--  fs/xfs/xfs_vnodeops.h | 15
477 files changed, 49095 insertions, 10390 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 14d944204571..08b2eb157048 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -151,7 +151,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 	if (access == V9FS_ACCESS_SINGLE)
 		return ERR_PTR(-EPERM);
 
-	if (v9fs_extended(v9ses))
+	if (v9fs_proto_dotu(v9ses))
 		uname = NULL;
 	else
 		uname = v9ses->uname;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 7d6c2139891d..6c7f6a251115 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -241,7 +241,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	list_add(&v9ses->slist, &v9fs_sessionlist);
 	spin_unlock(&v9fs_sessionlist_lock);
 
-	v9ses->flags = V9FS_EXTENDED | V9FS_ACCESS_USER;
+	v9ses->flags = V9FS_PROTO_2000U | V9FS_ACCESS_USER;
 	strcpy(v9ses->uname, V9FS_DEFUSER);
 	strcpy(v9ses->aname, V9FS_DEFANAME);
 	v9ses->uid = ~0;
@@ -262,13 +262,13 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		goto error;
 	}
 
-	if (!v9ses->clnt->dotu)
-		v9ses->flags &= ~V9FS_EXTENDED;
+	if (!p9_is_proto_dotu(v9ses->clnt))
+		v9ses->flags &= ~V9FS_PROTO_2000U;
 
 	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
 	/* for legacy mode, fall back to V9FS_ACCESS_ANY */
-	if (!v9fs_extended(v9ses) &&
+	if (!v9fs_proto_dotu(v9ses) &&
 	    ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
 
 		v9ses->flags &= ~V9FS_ACCESS_MASK;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 019f4ccb70c1..6b801d1ddf4b 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -23,7 +23,8 @@
 
 /**
  * enum p9_session_flags - option flags for each 9P session
- * @V9FS_EXTENDED: whether or not to use 9P2000.u extensions
+ * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions
+ * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions
  * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
  * @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
  * @V9FS_ACCESS_ANY: use a single attach for all users
@@ -32,11 +33,12 @@
  * Session flags reflect options selected by users at mount time
  */
 enum p9_session_flags {
-	V9FS_EXTENDED		= 0x01,
-	V9FS_ACCESS_SINGLE	= 0x02,
-	V9FS_ACCESS_USER	= 0x04,
-	V9FS_ACCESS_ANY		= 0x06,
-	V9FS_ACCESS_MASK	= 0x06,
+	V9FS_PROTO_2000U	= 0x01,
+	V9FS_PROTO_2000L	= 0x02,
+	V9FS_ACCESS_SINGLE	= 0x04,
+	V9FS_ACCESS_USER	= 0x08,
+	V9FS_ACCESS_ANY		= 0x0C,
+	V9FS_ACCESS_MASK	= 0x0C,
 };
 
 /* possible values of ->cache */
@@ -121,7 +123,12 @@ static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
 	return (inode->i_sb->s_fs_info);
 }
 
-static inline int v9fs_extended(struct v9fs_session_info *v9ses)
+static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses)
 {
-	return v9ses->flags & V9FS_EXTENDED;
+	return v9ses->flags & V9FS_PROTO_2000U;
+}
+
+static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
+{
+	return v9ses->flags & V9FS_PROTO_2000L;
 }
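
The mechanical rename above hides a semantic split: code that used to ask a single question ("is this an extended session?") can now distinguish the two protocol variants. A minimal caller-side sketch, not from the patch itself (the function name is invented; v9fs_inode2v9ses() and both helpers appear in the surrounding diffs):

	/* Hypothetical example: branch on the negotiated 9P variant. */
	static void example_pick_semantics(struct inode *inode)
	{
		struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);

		if (v9fs_proto_dotl(v9ses)) {
			/* 9P2000.L extensions negotiated */
		} else if (v9fs_proto_dotu(v9ses)) {
			/* 9P2000.u Unix extensions (symlinks, devices, ...) */
		} else {
			/* plain 9P2000: legacy fallback paths */
		}
	}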
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 15cce53bf61e..d8a3afe4ff72 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -76,6 +76,15 @@ static inline int dt_type(struct p9_wstat *mistat)
 	return rettype;
 }
 
+static void p9stat_init(struct p9_wstat *stbuf)
+{
+	stbuf->name = NULL;
+	stbuf->uid = NULL;
+	stbuf->gid = NULL;
+	stbuf->muid = NULL;
+	stbuf->extension = NULL;
+}
+
 /**
  * v9fs_dir_readdir - read a directory
  * @filp: opened file structure
@@ -131,11 +140,11 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		rdir->head = 0;
 		rdir->tail = err;
 	}
-
 	while (rdir->head < rdir->tail) {
+		p9stat_init(&st);
 		err = p9stat_read(rdir->buf + rdir->head,
 				  buflen - rdir->head, &st,
-				  fid->clnt->dotu);
+				  fid->clnt->proto_version);
 		if (err) {
 			P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
 			err = -EIO;
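
p9stat_init() earns its keep on the error path: p9stat_read() can fail after allocating only some of the struct's string members, and the cleanup that follows releases them unconditionally. With every pointer pre-set to NULL, releasing an untouched field is harmless because kfree(NULL) is a no-op. Sketched outside the patch context (p9stat_free() is the existing net/9p helper that frees those strings):

	struct p9_wstat st;

	p9stat_init(&st);			/* all string pointers start out NULL */
	err = p9stat_read(buf, buflen, &st, proto_version);
	p9stat_free(&st);			/* kfree(NULL) for anything never filled in */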
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 74a0461a9ac0..df52d488d2a6 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -61,7 +61,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 
 	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file);
 	v9ses = v9fs_inode2v9ses(inode);
-	omode = v9fs_uflags2omode(file->f_flags, v9fs_extended(v9ses));
+	omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses));
 	fid = file->private_data;
 	if (!fid) {
 		fid = v9fs_fid_clone(file->f_path.dentry);
@@ -77,7 +77,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 			i_size_write(inode, 0);
 			inode->i_blocks = 0;
 		}
-		if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses)))
+		if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses)))
 			generic_file_llseek(file, 0, SEEK_END);
 	}
 
@@ -114,7 +114,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
 	P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
 
 	/* No mandatory locks */
-	if (__mandatory_lock(inode))
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
@@ -215,7 +215,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	struct p9_fid *fid;
 	struct p9_client *clnt;
 	struct inode *inode = filp->f_path.dentry->d_inode;
-	int origin = *offset;
+	loff_t origin = *offset;
 	unsigned long pg_start, pg_end;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
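
The one-word type change in v9fs_file_write() is a genuine fix, not churn: loff_t is a signed 64-bit type, and funnelling *offset through an int silently truncates file positions past 2 GiB before pg_start/pg_end are computed. A contrived illustration of the failure mode (values invented for the example):

	loff_t off = 3LL << 30;		/* a 3 GiB file position */
	int bad = off;			/* truncated: high bits are lost */
	loff_t good = off;		/* full 64-bit offset preserved */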
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index a407fa3388c0..5fe45d692c9f 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -60,7 +60,7 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
 	res = mode & 0777;
 	if (S_ISDIR(mode))
 		res |= P9_DMDIR;
-	if (v9fs_extended(v9ses)) {
+	if (v9fs_proto_dotu(v9ses)) {
 		if (S_ISLNK(mode))
 			res |= P9_DMSYMLINK;
 		if (v9ses->nodev == 0) {
@@ -102,21 +102,21 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
 
 	if ((mode & P9_DMDIR) == P9_DMDIR)
 		res |= S_IFDIR;
-	else if ((mode & P9_DMSYMLINK) && (v9fs_extended(v9ses)))
+	else if ((mode & P9_DMSYMLINK) && (v9fs_proto_dotu(v9ses)))
 		res |= S_IFLNK;
-	else if ((mode & P9_DMSOCKET) && (v9fs_extended(v9ses))
+	else if ((mode & P9_DMSOCKET) && (v9fs_proto_dotu(v9ses))
 		 && (v9ses->nodev == 0))
 		res |= S_IFSOCK;
-	else if ((mode & P9_DMNAMEDPIPE) && (v9fs_extended(v9ses))
+	else if ((mode & P9_DMNAMEDPIPE) && (v9fs_proto_dotu(v9ses))
 		 && (v9ses->nodev == 0))
 		res |= S_IFIFO;
-	else if ((mode & P9_DMDEVICE) && (v9fs_extended(v9ses))
+	else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses))
 		 && (v9ses->nodev == 0))
 		res |= S_IFBLK;
 	else
 		res |= S_IFREG;
 
-	if (v9fs_extended(v9ses)) {
+	if (v9fs_proto_dotu(v9ses)) {
 		if ((mode & P9_DMSETUID) == P9_DMSETUID)
 			res |= S_ISUID;
 
@@ -265,7 +265,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 	case S_IFBLK:
 	case S_IFCHR:
 	case S_IFSOCK:
-		if (!v9fs_extended(v9ses)) {
+		if (!v9fs_proto_dotu(v9ses)) {
 			P9_DPRINTK(P9_DEBUG_ERROR,
 				   "special files without extended mode\n");
 			err = -EINVAL;
@@ -278,7 +278,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		inode->i_fop = &v9fs_file_operations;
 		break;
 	case S_IFLNK:
-		if (!v9fs_extended(v9ses)) {
+		if (!v9fs_proto_dotu(v9ses)) {
 			P9_DPRINTK(P9_DEBUG_ERROR,
 				   "extended modes used w/o 9P2000.u\n");
 			err = -EINVAL;
@@ -288,7 +288,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		break;
 	case S_IFDIR:
 		inc_nlink(inode);
-		if (v9fs_extended(v9ses))
+		if (v9fs_proto_dotu(v9ses))
 			inode->i_op = &v9fs_dir_inode_operations_ext;
 		else
 			inode->i_op = &v9fs_dir_inode_operations;
@@ -575,7 +575,8 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 		flags = O_RDWR;
 
 	fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
-			v9fs_uflags2omode(flags, v9fs_extended(v9ses)));
+			v9fs_uflags2omode(flags,
+					v9fs_proto_dotu(v9ses)));
 	if (IS_ERR(fid)) {
 		err = PTR_ERR(fid);
 		fid = NULL;
@@ -858,7 +859,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	if (iattr->ia_valid & ATTR_SIZE)
 		wstat.length = iattr->ia_size;
 
-	if (v9fs_extended(v9ses)) {
+	if (v9fs_proto_dotu(v9ses)) {
 		if (iattr->ia_valid & ATTR_UID)
 			wstat.n_uid = iattr->ia_uid;
 
@@ -886,6 +887,8 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 		  struct super_block *sb)
 {
 	char ext[32];
+	char tag_name[14];
+	unsigned int i_nlink;
 	struct v9fs_session_info *v9ses = sb->s_fs_info;
 
 	inode->i_nlink = 1;
@@ -897,11 +900,26 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 	inode->i_uid = v9ses->dfltuid;
 	inode->i_gid = v9ses->dfltgid;
 
-	if (v9fs_extended(v9ses)) {
+	if (v9fs_proto_dotu(v9ses)) {
 		inode->i_uid = stat->n_uid;
 		inode->i_gid = stat->n_gid;
 	}
-
+	if ((S_ISREG(inode->i_mode)) || (S_ISDIR(inode->i_mode))) {
+		if (v9fs_proto_dotu(v9ses) && (stat->extension[0] != '\0')) {
+			/*
+			 * Hadlink support got added later to
+			 * to the .u extension. So there can be
+			 * server out there that doesn't support
+			 * this even with .u extension. So check
+			 * for non NULL stat->extension
+			 */
+			strncpy(ext, stat->extension, sizeof(ext));
+			/* HARDLINKCOUNT %u */
+			sscanf(ext, "%13s %u", tag_name, &i_nlink);
+			if (!strncmp(tag_name, "HARDLINKCOUNT", 13))
+				inode->i_nlink = i_nlink;
+		}
+	}
 	inode->i_mode = p9mode2unixmode(v9ses, stat->mode);
 	if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) {
 		char type = 0;
@@ -976,7 +994,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 	if (IS_ERR(fid))
 		return PTR_ERR(fid);
 
-	if (!v9fs_extended(v9ses))
+	if (!v9fs_proto_dotu(v9ses))
 		return -EBADF;
 
 	st = p9_client_stat(fid);
@@ -1066,7 +1084,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
 	struct p9_fid *fid;
 
 	v9ses = v9fs_inode2v9ses(dir);
-	if (!v9fs_extended(v9ses)) {
+	if (!v9fs_proto_dotu(v9ses)) {
 		P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n");
 		return -EPERM;
 	}
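
The new i_nlink logic above leans on a 9P2000.u convention: servers may piggyback extra metadata in the stat extension string. A server advertising three links would send a string of the form "HARDLINKCOUNT 3" (the value here is an invented example); "%13s %u" splits it into tag and count, and the strncmp() guard keeps any other extension payload from clobbering the link count. The same parse, modelled stand-alone:

	char ext[32] = "HARDLINKCOUNT 3";	/* invented sample of stat->extension */
	char tag_name[14];
	unsigned int nlink;

	/* Both conversions must succeed and the tag must match exactly. */
	int matched = sscanf(ext, "%13s %u", tag_name, &nlink) == 2 &&
		      !strncmp(tag_name, "HARDLINKCOUNT", 13);
	/* when 'matched' is set, the hunk above assigns nlink to inode->i_nlink */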
diff --git a/fs/Kconfig b/fs/Kconfig
index 64d44efad7a5..5f85b5947613 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -177,6 +177,7 @@ source "fs/efs/Kconfig"
 source "fs/jffs2/Kconfig"
 # UBIFS File system configuration
 source "fs/ubifs/Kconfig"
+source "fs/logfs/Kconfig"
 source "fs/cramfs/Kconfig"
 source "fs/squashfs/Kconfig"
 source "fs/freevxfs/Kconfig"
@@ -234,6 +235,7 @@ config NFS_COMMON
 
 source "net/sunrpc/Kconfig"
 source "fs/smbfs/Kconfig"
+source "fs/ceph/Kconfig"
 source "fs/cifs/Kconfig"
 source "fs/ncpfs/Kconfig"
 source "fs/coda/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index af6d04700d9c..97f340f14ba2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -99,6 +99,7 @@ obj-$(CONFIG_NTFS_FS) += ntfs/
 obj-$(CONFIG_UFS_FS) += ufs/
 obj-$(CONFIG_EFS_FS) += efs/
 obj-$(CONFIG_JFFS2_FS) += jffs2/
+obj-$(CONFIG_LOGFS) += logfs/
 obj-$(CONFIG_UBIFS_FS) += ubifs/
 obj-$(CONFIG_AFFS_FS) += affs/
 obj-$(CONFIG_ROMFS_FS) += romfs/
@@ -124,3 +125,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
 obj-$(CONFIG_BTRFS_FS) += btrfs/
 obj-$(CONFIG_GFS2_FS) += gfs2/
 obj-$(CONFIG_EXOFS_FS) += exofs/
+obj-$(CONFIG_CEPH_FS) += ceph/
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 9cc18775b832..2ff622f6f547 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -121,7 +121,7 @@ struct adfs_discmap {
 
 /* Inode stuff */
 struct inode *adfs_iget(struct super_block *sb, struct object_info *obj);
-int adfs_write_inode(struct inode *inode,int unused);
+int adfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 int adfs_notify_change(struct dentry *dentry, struct iattr *attr);
 
 /* map.c */
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 3f57ce4bee5d..0f5e30978135 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -9,6 +9,7 @@
  */
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include <linux/writeback.h>
 #include "adfs.h"
 
 /*
@@ -360,7 +361,7 @@ out:
  * The adfs-specific inode data has already been updated by
  * adfs_notify_change()
  */
-int adfs_write_inode(struct inode *inode, int wait)
+int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct super_block *sb = inode->i_sb;
 	struct object_info obj;
@@ -375,7 +376,7 @@ int adfs_write_inode(struct inode *inode, int wait)
 	obj.attr	= ADFS_I(inode)->attr;
 	obj.size	= inode->i_size;
 
-	ret = adfs_dir_update(sb, &obj, wait);
+	ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL);
 	unlock_kernel();
 	return ret;
 }
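
adfs is the first of several identical conversions in this merge (affs follows immediately, and afs drops its method outright): the VFS now hands ->write_inode() the writeback_control it already tracks instead of a pre-digested int, and the old flag is recovered with a sync_mode test. The shape of the conversion, as a generic sketch ("foofs" is a placeholder, not a real filesystem):

	int foofs_write_inode(struct inode *inode, struct writeback_control *wbc)
	{
		int wait = wbc->sync_mode == WB_SYNC_ALL;	/* the old 'wait' argument */

		/* ...write the inode out, blocking only when 'wait' is set... */
		return 0;
	}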
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 0e40caaba456..861dae68ac12 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -175,7 +175,8 @@ extern void affs_delete_inode(struct inode *inode);
 extern void affs_clear_inode(struct inode *inode);
 extern struct inode *affs_iget(struct super_block *sb,
 			unsigned long ino);
-extern int affs_write_inode(struct inode *inode, int);
+extern int affs_write_inode(struct inode *inode,
+			struct writeback_control *wbc);
 extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type);
 
 /* file.c */
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index dc5ef14bdc1c..8306d53307ed 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -128,7 +128,7 @@ err_range:
 /*
  * Allocate a block in the given allocation zone.
  * Since we have to byte-swap the bitmap on little-endian
- * machines, this is rather expensive. Therefor we will
+ * machines, this is rather expensive. Therefore we will
  * preallocate up to 16 blocks from the same word, if
  * possible. We are not doing preallocations in the
  * header zone, though.
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 3c4ec7d864c4..c9744d771d98 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -166,7 +166,7 @@ bad_inode:
166} 166}
167 167
168int 168int
169affs_write_inode(struct inode *inode, int unused) 169affs_write_inode(struct inode *inode, struct writeback_control *wbc)
170{ 170{
171 struct super_block *sb = inode->i_sb; 171 struct super_block *sb = inode->i_sb;
172 struct buffer_head *bh; 172 struct buffer_head *bh;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6ece2a13bf71..c54dad4e6063 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -733,7 +733,6 @@ extern int afs_write_end(struct file *file, struct address_space *mapping,
733 struct page *page, void *fsdata); 733 struct page *page, void *fsdata);
734extern int afs_writepage(struct page *, struct writeback_control *); 734extern int afs_writepage(struct page *, struct writeback_control *);
735extern int afs_writepages(struct address_space *, struct writeback_control *); 735extern int afs_writepages(struct address_space *, struct writeback_control *);
736extern int afs_write_inode(struct inode *, int);
737extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); 736extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
738extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, 737extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
739 unsigned long, loff_t); 738 unsigned long, loff_t);
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3ef504370034..bb4ed144d0e4 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -189,8 +189,9 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
 	if (!permits)
 		goto out_unlock;
 
-	memcpy(permits->permits, xpermits->permits,
-	       count * sizeof(struct afs_permit));
+	if (xpermits)
+		memcpy(permits->permits, xpermits->permits,
+		       count * sizeof(struct afs_permit));
 
 	_debug("key %x access %x",
 	       key_serial(key), vnode->status.caller_access);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index e1ea1c240b6a..14f6431598ad 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -48,7 +48,6 @@ struct file_system_type afs_fs_type = {
 static const struct super_operations afs_super_ops = {
 	.statfs		= afs_statfs,
 	.alloc_inode	= afs_alloc_inode,
-	.write_inode	= afs_write_inode,
 	.destroy_inode	= afs_destroy_inode,
 	.clear_inode	= afs_clear_inode,
 	.put_super	= afs_put_super,
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 5e15a21dbf9f..3bed54a294d4 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -585,27 +585,6 @@ int afs_writepages(struct address_space *mapping,
 }
 
 /*
- * write an inode back
- */
-int afs_write_inode(struct inode *inode, int sync)
-{
-	struct afs_vnode *vnode = AFS_FS_I(inode);
-	int ret;
-
-	_enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
-
-	ret = 0;
-	if (sync) {
-		ret = filemap_fdatawait(inode->i_mapping);
-		if (ret < 0)
-			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-	}
-
-	_leave(" = %d", ret);
-	return ret;
-}
-
-/*
  * completion of write to server
  */
 void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 9f0bf13291e5..2de009565d8e 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -209,6 +209,7 @@ static struct inode *anon_inode_mkinode(void)
 	inode->i_mode = S_IRUSR | S_IWUSR;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
+	inode->i_flags |= S_PRIVATE;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	return inode;
 }
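
S_PRIVATE marks the anonymous inode as kernel-internal, which is the flag the security layer keys off: LSM inode hooks bail out early on private inodes rather than trying to label or mediate them. The guard is typically spelled along these lines (paraphrasing the recurring pattern in security/, not quoting this patch):

	if (IS_PRIVATE(inode))	/* S_PRIVATE: not a userspace-visible object */
		return 0;	/* skip LSM mediation */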
diff --git a/fs/attr.c b/fs/attr.c
index 96d394bdaddf..0815e93bb487 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -12,7 +12,6 @@
 #include <linux/capability.h>
 #include <linux/fsnotify.h>
 #include <linux/fcntl.h>
-#include <linux/quotaops.h>
 #include <linux/security.h>
 
 /* Taken over from the old code... */
@@ -82,7 +81,7 @@ int inode_newsize_ok(const struct inode *inode, loff_t offset)
 	if (inode->i_size < offset) {
 		unsigned long limit;
 
-		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+		limit = rlimit(RLIMIT_FSIZE);
 		if (limit != RLIM_INFINITY && offset > limit)
 			goto out_sig;
 		if (offset > inode->i_sb->s_maxbytes)
@@ -212,14 +211,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 		error = inode->i_op->setattr(dentry, attr);
 	} else {
 		error = inode_change_ok(inode, attr);
-		if (!error) {
-			if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
-			    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid))
-				error = vfs_dq_transfer(inode, attr) ?
-					-EDQUOT : 0;
-			if (!error)
-				error = inode_setattr(inode, attr);
-		}
+		if (!error)
+			error = inode_setattr(inode, attr);
 	}
 
 	if (ia_valid & ATTR_SIZE)
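
rlimit(), used in the hunk above, is a then-new accessor that reads the calling task's current (soft) limit; it is shorthand for roughly the chain it replaces (a sketch of the helper's effect; see this era's include/linux/sched.h for the real, ACCESS_ONCE-based definition):

	static inline unsigned long rlimit(unsigned int limit)
	{
		return current->signal->rlim[limit].rlim_cur;
	}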
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 0118d67221b2..3d283abf67d7 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -60,11 +60,6 @@ do { \
 		current->pid, __func__, ##args);	\
 } while (0)
 
-struct rehash_entry {
-	struct task_struct *task;
-	struct list_head list;
-};
-
 /* Unified info structure. This is pointed to by both the dentry and
    inode structures. Each file in the filesystem has an instance of this
    structure. It holds a reference to the dentry, so dentries are never
@@ -81,7 +76,6 @@ struct autofs_info {
 
 	struct list_head active;
 	int active_count;
-	struct list_head rehash_list;
 
 	struct list_head expiring;
 
@@ -104,7 +98,6 @@ struct autofs_info {
 #define AUTOFS_INF_EXPIRING	(1<<0) /* dentry is in the process of expiring */
 #define AUTOFS_INF_MOUNTPOINT	(1<<1) /* mountpoint status for direct expire */
 #define AUTOFS_INF_PENDING	(1<<2) /* dentry pending mount */
-#define AUTOFS_INF_REHASH	(1<<3) /* dentry in transit to ->lookup() */
 
 struct autofs_wait_queue {
 	wait_queue_head_t queue;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 00bf8fcb245f..c8a80dffb455 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -544,10 +544,9 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 			goto out;
 		devid = new_encode_dev(path.mnt->mnt_sb->s_dev);
 		err = 0;
-		if (path.dentry->d_inode &&
-		    path.mnt->mnt_root == path.dentry) {
+		if (path.mnt->mnt_root == path.dentry) {
 			err = 1;
-			magic = path.dentry->d_inode->i_sb->s_magic;
+			magic = path.mnt->mnt_sb->s_magic;
 		}
 	} else {
 		dev_t dev = sbi->sb->s_dev;
@@ -560,10 +559,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 
 		err = have_submounts(path.dentry);
 
-		if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) {
-			if (follow_down(&path))
-				magic = path.mnt->mnt_sb->s_magic;
-		}
+		if (follow_down(&path))
+			magic = path.mnt->mnt_sb->s_magic;
 	}
 
 	param->ismountpoint.out.devid = devid;
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 74bc9aa6df31..a796c9417fb1 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -279,7 +279,6 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 			root->d_mounted--;
 		}
 		ino->flags |= AUTOFS_INF_EXPIRING;
-		autofs4_add_expiring(root);
 		init_completion(&ino->expire_complete);
 		spin_unlock(&sbi->fs_lock);
 		return root;
@@ -407,7 +406,6 @@ found:
 		expired, (int)expired->d_name.len, expired->d_name.name);
 	ino = autofs4_dentry_ino(expired);
 	ino->flags |= AUTOFS_INF_EXPIRING;
-	autofs4_add_expiring(expired);
 	init_completion(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 	spin_lock(&dcache_lock);
@@ -435,7 +433,7 @@ int autofs4_expire_wait(struct dentry *dentry)
 
 	DPRINTK("expire done status=%d", status);
 
-	if (d_unhashed(dentry) && IS_DEADDIR(dentry->d_inode))
+	if (d_unhashed(dentry))
 		return -EAGAIN;
 
 	return status;
@@ -475,7 +473,6 @@ int autofs4_expire_run(struct super_block *sb,
 	spin_lock(&sbi->fs_lock);
 	ino = autofs4_dentry_ino(dentry);
 	ino->flags &= ~AUTOFS_INF_EXPIRING;
-	autofs4_del_expiring(dentry);
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 
@@ -506,7 +503,6 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 		ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
 	}
 	ino->flags &= ~AUTOFS_INF_EXPIRING;
-	autofs4_del_expiring(dentry);
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 	dput(dentry);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index d0a3de247458..821b2b955dac 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -49,7 +49,6 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
 	ino->dentry = NULL;
 	ino->size = 0;
 	INIT_LIST_HEAD(&ino->active);
-	INIT_LIST_HEAD(&ino->rehash_list);
 	ino->active_count = 0;
 	INIT_LIST_HEAD(&ino->expiring);
 	atomic_set(&ino->count, 0);
@@ -97,63 +96,6 @@ void autofs4_free_ino(struct autofs_info *ino)
 	kfree(ino);
 }
 
-/*
- * Deal with the infamous "Busy inodes after umount ..." message.
- *
- * Clean up the dentry tree. This happens with autofs if the user
- * space program goes away due to a SIGKILL, SIGSEGV etc.
- */
-static void autofs4_force_release(struct autofs_sb_info *sbi)
-{
-	struct dentry *this_parent = sbi->sb->s_root;
-	struct list_head *next;
-
-	if (!sbi->sb->s_root)
-		return;
-
-	spin_lock(&dcache_lock);
-repeat:
-	next = this_parent->d_subdirs.next;
-resume:
-	while (next != &this_parent->d_subdirs) {
-		struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
-
-		/* Negative dentry - don`t care */
-		if (!simple_positive(dentry)) {
-			next = next->next;
-			continue;
-		}
-
-		if (!list_empty(&dentry->d_subdirs)) {
-			this_parent = dentry;
-			goto repeat;
-		}
-
-		next = next->next;
-		spin_unlock(&dcache_lock);
-
-		DPRINTK("dentry %p %.*s",
-			dentry, (int)dentry->d_name.len, dentry->d_name.name);
-
-		dput(dentry);
-		spin_lock(&dcache_lock);
-	}
-
-	if (this_parent != sbi->sb->s_root) {
-		struct dentry *dentry = this_parent;
-
-		next = this_parent->d_u.d_child.next;
-		this_parent = this_parent->d_parent;
-		spin_unlock(&dcache_lock);
-		DPRINTK("parent dentry %p %.*s",
-			dentry, (int)dentry->d_name.len, dentry->d_name.name);
-		dput(dentry);
-		spin_lock(&dcache_lock);
-		goto resume;
-	}
-	spin_unlock(&dcache_lock);
-}
-
 void autofs4_kill_sb(struct super_block *sb)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(sb);
@@ -170,15 +112,12 @@ void autofs4_kill_sb(struct super_block *sb)
 	/* Free wait queues, close pipe */
 	autofs4_catatonic_mode(sbi);
 
-	/* Clean up and release dangling references */
-	autofs4_force_release(sbi);
-
 	sb->s_fs_info = NULL;
 	kfree(sbi);
 
 out_kill_sb:
 	DPRINTK("shutting down");
-	kill_anon_super(sb);
+	kill_litter_super(sb);
 }
 
 static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 30cc9ddf4b70..a015b49891df 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -104,99 +104,6 @@ static void autofs4_del_active(struct dentry *dentry)
 	return;
 }
 
-static void autofs4_add_rehash_entry(struct autofs_info *ino,
-				     struct rehash_entry *entry)
-{
-	entry->task = current;
-	INIT_LIST_HEAD(&entry->list);
-	list_add(&entry->list, &ino->rehash_list);
-	return;
-}
-
-static void autofs4_remove_rehash_entry(struct autofs_info *ino)
-{
-	struct list_head *head = &ino->rehash_list;
-	struct rehash_entry *entry;
-	list_for_each_entry(entry, head, list) {
-		if (entry->task == current) {
-			list_del(&entry->list);
-			kfree(entry);
-			break;
-		}
-	}
-	return;
-}
-
-static void autofs4_remove_rehash_entrys(struct autofs_info *ino)
-{
-	struct autofs_sb_info *sbi = ino->sbi;
-	struct rehash_entry *entry, *next;
-	struct list_head *head;
-
-	spin_lock(&sbi->fs_lock);
-	spin_lock(&sbi->lookup_lock);
-	if (!(ino->flags & AUTOFS_INF_REHASH)) {
-		spin_unlock(&sbi->lookup_lock);
-		spin_unlock(&sbi->fs_lock);
-		return;
-	}
-	ino->flags &= ~AUTOFS_INF_REHASH;
-	head = &ino->rehash_list;
-	list_for_each_entry_safe(entry, next, head, list) {
-		list_del(&entry->list);
-		kfree(entry);
-	}
-	spin_unlock(&sbi->lookup_lock);
-	spin_unlock(&sbi->fs_lock);
-	dput(ino->dentry);
-
-	return;
-}
-
-static void autofs4_revalidate_drop(struct dentry *dentry,
-				    struct rehash_entry *entry)
-{
-	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
-	/*
-	 * Add to the active list so we can pick this up in
-	 * ->lookup(). Also add an entry to a rehash list so
-	 * we know when there are no dentrys in flight so we
-	 * know when we can rehash the dentry.
-	 */
-	spin_lock(&sbi->lookup_lock);
-	if (list_empty(&ino->active))
-		list_add(&ino->active, &sbi->active_list);
-	autofs4_add_rehash_entry(ino, entry);
-	spin_unlock(&sbi->lookup_lock);
-	if (!(ino->flags & AUTOFS_INF_REHASH)) {
-		ino->flags |= AUTOFS_INF_REHASH;
-		dget(dentry);
-		spin_lock(&dentry->d_lock);
-		__d_drop(dentry);
-		spin_unlock(&dentry->d_lock);
-	}
-	return;
-}
-
-static void autofs4_revalidate_rehash(struct dentry *dentry)
-{
-	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
-	if (ino->flags & AUTOFS_INF_REHASH) {
-		spin_lock(&sbi->lookup_lock);
-		autofs4_remove_rehash_entry(ino);
-		if (list_empty(&ino->rehash_list)) {
-			spin_unlock(&sbi->lookup_lock);
-			ino->flags &= ~AUTOFS_INF_REHASH;
-			d_rehash(dentry);
-			dput(ino->dentry);
-		} else
-			spin_unlock(&sbi->lookup_lock);
-	}
-	return;
-}
-
 static unsigned int autofs4_need_mount(unsigned int flags)
 {
 	unsigned int res = 0;
@@ -236,7 +143,7 @@ out:
 	return dcache_dir_open(inode, file);
 }
 
-static int try_to_fill_dentry(struct dentry *dentry)
+static int try_to_fill_dentry(struct dentry *dentry, int flags)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -249,17 +156,55 @@ static int try_to_fill_dentry(struct dentry *dentry)
 	 * Wait for a pending mount, triggering one if there
 	 * isn't one already
 	 */
-	DPRINTK("waiting for mount name=%.*s",
-		dentry->d_name.len, dentry->d_name.name);
+	if (dentry->d_inode == NULL) {
+		DPRINTK("waiting for mount name=%.*s",
+			dentry->d_name.len, dentry->d_name.name);
 
-	status = autofs4_wait(sbi, dentry, NFY_MOUNT);
+		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
 
-	DPRINTK("mount done status=%d", status);
+		DPRINTK("mount done status=%d", status);
 
-	/* Update expiry counter */
-	ino->last_used = jiffies;
+		/* Turn this into a real negative dentry? */
+		if (status == -ENOENT) {
+			spin_lock(&sbi->fs_lock);
+			ino->flags &= ~AUTOFS_INF_PENDING;
+			spin_unlock(&sbi->fs_lock);
+			return status;
+		} else if (status) {
+			/* Return a negative dentry, but leave it "pending" */
+			return status;
+		}
+	/* Trigger mount for path component or follow link */
+	} else if (ino->flags & AUTOFS_INF_PENDING ||
+			autofs4_need_mount(flags) ||
+			current->link_count) {
+		DPRINTK("waiting for mount name=%.*s",
+			dentry->d_name.len, dentry->d_name.name);
 
-	return status;
+		spin_lock(&sbi->fs_lock);
+		ino->flags |= AUTOFS_INF_PENDING;
+		spin_unlock(&sbi->fs_lock);
+		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
+
+		DPRINTK("mount done status=%d", status);
+
+		if (status) {
+			spin_lock(&sbi->fs_lock);
+			ino->flags &= ~AUTOFS_INF_PENDING;
+			spin_unlock(&sbi->fs_lock);
+			return status;
+		}
+	}
+
+	/* Initialize expiry counter after successful mount */
+	if (ino)
+		ino->last_used = jiffies;
+
+	spin_lock(&sbi->fs_lock);
+	ino->flags &= ~AUTOFS_INF_PENDING;
+	spin_unlock(&sbi->fs_lock);
+
+	return 0;
 }
 
 /* For autofs direct mounts the follow link triggers the mount */
@@ -313,16 +258,10 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 	 */
 	if (ino->flags & AUTOFS_INF_PENDING ||
 	    (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
-		ino->flags |= AUTOFS_INF_PENDING;
 		spin_unlock(&dcache_lock);
 		spin_unlock(&sbi->fs_lock);
 
-		status = try_to_fill_dentry(dentry);
-
-		spin_lock(&sbi->fs_lock);
-		ino->flags &= ~AUTOFS_INF_PENDING;
-		spin_unlock(&sbi->fs_lock);
-
+		status = try_to_fill_dentry(dentry, 0);
 		if (status)
 			goto out_error;
 
@@ -361,47 +300,18 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
 	struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
-	struct rehash_entry *entry;
+	int oz_mode = autofs4_oz_mode(sbi);
 	int flags = nd ? nd->flags : 0;
-	unsigned int mutex_aquired;
+	int status = 1;
 
-	DPRINTK("name = %.*s oz_mode = %d",
-		dentry->d_name.len, dentry->d_name.name, oz_mode);
-
-	/* Daemon never causes a mount to trigger */
-	if (autofs4_oz_mode(sbi))
-		return 1;
-
-	entry = kmalloc(sizeof(struct rehash_entry), GFP_KERNEL);
-	if (!entry)
-		return -ENOMEM;
-
-	mutex_aquired = mutex_trylock(&dir->i_mutex);
-
-	spin_lock(&sbi->fs_lock);
-	spin_lock(&dcache_lock);
 	/* Pending dentry */
+	spin_lock(&sbi->fs_lock);
 	if (autofs4_ispending(dentry)) {
-		int status;
-
-		/*
-		 * We can only unhash and send this to ->lookup() if
-		 * the directory mutex is held over d_revalidate() and
-		 * ->lookup(). This prevents the VFS from incorrectly
-		 * seeing the dentry as non-existent.
-		 */
-		ino->flags |= AUTOFS_INF_PENDING;
-		if (!mutex_aquired) {
-			autofs4_revalidate_drop(dentry, entry);
-			spin_unlock(&dcache_lock);
-			spin_unlock(&sbi->fs_lock);
-			return 0;
-		}
-		spin_unlock(&dcache_lock);
+		/* The daemon never causes a mount to trigger */
 		spin_unlock(&sbi->fs_lock);
-		mutex_unlock(&dir->i_mutex);
-		kfree(entry);
+
+		if (oz_mode)
+			return 1;
 
 		/*
 		 * If the directory has gone away due to an expire
@@ -415,82 +325,45 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 		 * A zero status is success otherwise we have a
 		 * negative error code.
 		 */
-		status = try_to_fill_dentry(dentry);
-
-		spin_lock(&sbi->fs_lock);
-		ino->flags &= ~AUTOFS_INF_PENDING;
-		spin_unlock(&sbi->fs_lock);
-
+		status = try_to_fill_dentry(dentry, flags);
 		if (status == 0)
 			return 1;
 
 		return status;
 	}
+	spin_unlock(&sbi->fs_lock);
+
+	/* Negative dentry.. invalidate if "old" */
+	if (dentry->d_inode == NULL)
+		return 0;
 
 	/* Check for a non-mountpoint directory with no contents */
+	spin_lock(&dcache_lock);
 	if (S_ISDIR(dentry->d_inode->i_mode) &&
 	    !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
 		DPRINTK("dentry=%p %.*s, emptydir",
 			dentry, dentry->d_name.len, dentry->d_name.name);
+		spin_unlock(&dcache_lock);
 
-		if (autofs4_need_mount(flags) || current->link_count) {
-			int status;
-
-			/*
-			 * We can only unhash and send this to ->lookup() if
-			 * the directory mutex is held over d_revalidate() and
-			 * ->lookup(). This prevents the VFS from incorrectly
-			 * seeing the dentry as non-existent.
-			 */
-			ino->flags |= AUTOFS_INF_PENDING;
-			if (!mutex_aquired) {
-				autofs4_revalidate_drop(dentry, entry);
-				spin_unlock(&dcache_lock);
-				spin_unlock(&sbi->fs_lock);
-				return 0;
-			}
-			spin_unlock(&dcache_lock);
-			spin_unlock(&sbi->fs_lock);
-			mutex_unlock(&dir->i_mutex);
-			kfree(entry);
-
-			/*
-			 * A zero status is success otherwise we have a
-			 * negative error code.
-			 */
-			status = try_to_fill_dentry(dentry);
-
-			spin_lock(&sbi->fs_lock);
-			ino->flags &= ~AUTOFS_INF_PENDING;
-			spin_unlock(&sbi->fs_lock);
+		/* The daemon never causes a mount to trigger */
+		if (oz_mode)
+			return 1;
 
-			if (status == 0)
-				return 1;
+		/*
+		 * A zero status is success otherwise we have a
+		 * negative error code.
+		 */
+		status = try_to_fill_dentry(dentry, flags);
+		if (status == 0)
+			return 1;
 
 		return status;
-		}
 	}
 	spin_unlock(&dcache_lock);
-	spin_unlock(&sbi->fs_lock);
-
-	if (mutex_aquired)
-		mutex_unlock(&dir->i_mutex);
-
-	kfree(entry);
 
 	return 1;
 }
 
-static void autofs4_free_rehash_entrys(struct autofs_info *inf)
-{
-	struct list_head *head = &inf->rehash_list;
-	struct rehash_entry *entry, *next;
-	list_for_each_entry_safe(entry, next, head, list) {
-		list_del(&entry->list);
-		kfree(entry);
-	}
-}
-
 void autofs4_dentry_release(struct dentry *de)
 {
 	struct autofs_info *inf;
@@ -509,8 +382,6 @@ void autofs4_dentry_release(struct dentry *de)
 		list_del(&inf->active);
 		if (!list_empty(&inf->expiring))
 			list_del(&inf->expiring);
-		if (!list_empty(&inf->rehash_list))
-			autofs4_free_rehash_entrys(inf);
 		spin_unlock(&sbi->lookup_lock);
 	}
 
@@ -543,7 +414,6 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 	const unsigned char *str = name->name;
 	struct list_head *p, *head;
 
-restart:
 	spin_lock(&dcache_lock);
 	spin_lock(&sbi->lookup_lock);
 	head = &sbi->active_list;
@@ -561,19 +431,6 @@ restart:
 		if (atomic_read(&active->d_count) == 0)
 			goto next;
 
-		if (active->d_inode && IS_DEADDIR(active->d_inode)) {
-			if (!list_empty(&ino->rehash_list)) {
-				dget(active);
-				spin_unlock(&active->d_lock);
-				spin_unlock(&sbi->lookup_lock);
-				spin_unlock(&dcache_lock);
-				autofs4_remove_rehash_entrys(ino);
-				dput(active);
-				goto restart;
-			}
-			goto next;
-		}
-
 		qstr = &active->d_name;
 
 		if (active->d_name.hash != hash)
@@ -586,11 +443,13 @@ restart:
 		if (memcmp(qstr->name, str, len))
 			goto next;
 
-		dget(active);
-		spin_unlock(&active->d_lock);
-		spin_unlock(&sbi->lookup_lock);
-		spin_unlock(&dcache_lock);
-		return active;
+		if (d_unhashed(active)) {
+			dget(active);
+			spin_unlock(&active->d_lock);
+			spin_unlock(&sbi->lookup_lock);
+			spin_unlock(&dcache_lock);
+			return active;
+		}
 next:
 		spin_unlock(&active->d_lock);
 	}
@@ -639,11 +498,13 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 		if (memcmp(qstr->name, str, len))
 			goto next;
 
-		dget(expiring);
-		spin_unlock(&expiring->d_lock);
-		spin_unlock(&sbi->lookup_lock);
-		spin_unlock(&dcache_lock);
-		return expiring;
+		if (d_unhashed(expiring)) {
+			dget(expiring);
+			spin_unlock(&expiring->d_lock);
+			spin_unlock(&sbi->lookup_lock);
+			spin_unlock(&dcache_lock);
+			return expiring;
+		}
 next:
 		spin_unlock(&expiring->d_lock);
 	}
@@ -653,48 +514,6 @@ next:
 	return NULL;
 }
 
-static struct autofs_info *init_new_dentry(struct autofs_sb_info *sbi,
-					   struct dentry *dentry, int oz_mode)
-{
-	struct autofs_info *ino;
-
-	/*
-	 * Mark the dentry incomplete but don't hash it. We do this
-	 * to serialize our inode creation operations (symlink and
-	 * mkdir) which prevents deadlock during the callback to
-	 * the daemon. Subsequent user space lookups for the same
-	 * dentry are placed on the wait queue while the daemon
-	 * itself is allowed passage unresticted so the create
-	 * operation itself can then hash the dentry. Finally,
-	 * we check for the hashed dentry and return the newly
-	 * hashed dentry.
-	 */
-	dentry->d_op = &autofs4_root_dentry_operations;
-
-	/*
-	 * And we need to ensure that the same dentry is used for
-	 * all following lookup calls until it is hashed so that
-	 * the dentry flags are persistent throughout the request.
-	 */
-	ino = autofs4_init_ino(NULL, sbi, 0555);
-	if (!ino)
-		return ERR_PTR(-ENOMEM);
-
-	dentry->d_fsdata = ino;
-	ino->dentry = dentry;
-
-	/*
-	 * Only set the mount pending flag for new dentrys not created
-	 * by the daemon.
-	 */
-	if (!oz_mode)
-		ino->flags |= AUTOFS_INF_PENDING;
-
-	d_instantiate(dentry, NULL);
-
-	return ino;
-}
-
 /* Lookups in the root directory */
 static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
@@ -702,7 +521,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	struct autofs_info *ino;
 	struct dentry *expiring, *active;
 	int oz_mode;
-	int status = 0;
 
 	DPRINTK("name = %.*s",
 		dentry->d_name.len, dentry->d_name.name);
@@ -717,26 +535,44 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
 		 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
 
-	spin_lock(&sbi->fs_lock);
 	active = autofs4_lookup_active(dentry);
 	if (active) {
 		dentry = active;
 		ino = autofs4_dentry_ino(dentry);
-		/* If this came from revalidate, rehash it */
-		autofs4_revalidate_rehash(dentry);
-		spin_unlock(&sbi->fs_lock);
 	} else {
-		spin_unlock(&sbi->fs_lock);
-		ino = init_new_dentry(sbi, dentry, oz_mode);
-		if (IS_ERR(ino))
-			return (struct dentry *) ino;
-	}
+		/*
+		 * Mark the dentry incomplete but don't hash it. We do this
+		 * to serialize our inode creation operations (symlink and
+		 * mkdir) which prevents deadlock during the callback to
+		 * the daemon. Subsequent user space lookups for the same
+		 * dentry are placed on the wait queue while the daemon
+		 * itself is allowed passage unresticted so the create
+		 * operation itself can then hash the dentry. Finally,
+		 * we check for the hashed dentry and return the newly
+		 * hashed dentry.
+		 */
+		dentry->d_op = &autofs4_root_dentry_operations;
+
+		/*
+		 * And we need to ensure that the same dentry is used for
+		 * all following lookup calls until it is hashed so that
+		 * the dentry flags are persistent throughout the request.
+		 */
+		ino = autofs4_init_ino(NULL, sbi, 0555);
+		if (!ino)
+			return ERR_PTR(-ENOMEM);
 
-	autofs4_add_active(dentry);
+		dentry->d_fsdata = ino;
+		ino->dentry = dentry;
+
+		autofs4_add_active(dentry);
+
+		d_instantiate(dentry, NULL);
+	}
 
 	if (!oz_mode) {
-		expiring = autofs4_lookup_expiring(dentry);
 		mutex_unlock(&dir->i_mutex);
+		expiring = autofs4_lookup_expiring(dentry);
 		if (expiring) {
 			/*
 			 * If we are racing with expire the request might not
@@ -744,22 +580,23 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 			 * so it must have been successful, so just wait for it.
 			 */
 			autofs4_expire_wait(expiring);
+			autofs4_del_expiring(expiring);
 			dput(expiring);
 		}
-		status = try_to_fill_dentry(dentry);
-		mutex_lock(&dir->i_mutex);
+
 		spin_lock(&sbi->fs_lock);
-		ino->flags &= ~AUTOFS_INF_PENDING;
+		ino->flags |= AUTOFS_INF_PENDING;
 		spin_unlock(&sbi->fs_lock);
+		if (dentry->d_op && dentry->d_op->d_revalidate)
+			(dentry->d_op->d_revalidate)(dentry, nd);
+		mutex_lock(&dir->i_mutex);
 	}
 
-	autofs4_del_active(dentry);
-
 	/*
-	 * If we had a mount fail, check if we had to handle
+	 * If we are still pending, check if we had to handle
 	 * a signal. If so we can force a restart..
 	 */
-	if (status) {
+	if (ino->flags & AUTOFS_INF_PENDING) {
 		/* See if we were interrupted */
 		if (signal_pending(current)) {
 			sigset_t *sigset = &current->pending.signal;
@@ -771,46 +608,43 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 				return ERR_PTR(-ERESTARTNOINTR);
 			}
 		}
-	}
-
-	/*
-	 * User space can (and has done in the past) remove and re-create
-	 * this directory during the callback. This can leave us with an
-	 * unhashed dentry, but a successful mount! So we need to
-	 * perform another cached lookup in case the dentry now exists.
-	 */
-	if (!oz_mode && !have_submounts(dentry)) {
-		struct dentry *new;
-		new = d_lookup(dentry->d_parent, &dentry->d_name);
-		if (new) {
-			if (active)
-				dput(active);
-			return new;
-		} else {
-			if (!status)
-				status = -ENOENT;
+		if (!oz_mode) {
+			spin_lock(&sbi->fs_lock);
+			ino->flags &= ~AUTOFS_INF_PENDING;
+			spin_unlock(&sbi->fs_lock);
 		}
 	}
 
 	/*
-	 * If we had a mount failure, return status to user space.
-	 * If the mount succeeded and we used a dentry from the active queue
-	 * return it.
+	 * If this dentry is unhashed, then we shouldn't honour this
+	 * lookup. Returning ENOENT here doesn't do the right thing
+	 * for all system calls, but it should be OK for the operations
+	 * we permit from an autofs.
 	 */
-	if (status) {
-		dentry = ERR_PTR(status);
-		if (active)
-			dput(active);
-		return dentry;
-	} else {
+	if (!oz_mode && d_unhashed(dentry)) {
 		/*
-		 * Valid successful mount, return active dentry or NULL
-		 * for a new dentry.
+		 * A user space application can (and has done in the past)
+		 * remove and re-create this directory during the callback.
+		 * This can leave us with an unhashed dentry, but a
+		 * successful mount! So we need to perform another
+		 * cached lookup in case the dentry now exists.
 		 */
+		struct dentry *parent = dentry->d_parent;
+		struct dentry *new = d_lookup(parent, &dentry->d_name);
+		if (new != NULL)
+			dentry = new;
+		else
+			dentry = ERR_PTR(-ENOENT);
+
 		if (active)
-			return active;
+			dput(active);
+
+		return dentry;
 	}
 
+	if (active)
+		return active;
+
 	return NULL;
 }
 
@@ -834,6 +668,8 @@ static int autofs4_dir_symlink(struct inode *dir,
 	if (!ino)
 		return -ENOMEM;
 
+	autofs4_del_active(dentry);
+
 	ino->size = strlen(symname);
 	cp = kmalloc(ino->size + 1, GFP_KERNEL);
 	if (!cp) {
@@ -910,6 +746,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 	dir->i_mtime = CURRENT_TIME;
 
 	spin_lock(&dcache_lock);
+	autofs4_add_expiring(dentry);
 	spin_lock(&dentry->d_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
@@ -935,6 +772,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 		spin_unlock(&dcache_lock);
 		return -ENOTEMPTY;
 	}
+	autofs4_add_expiring(dentry);
 	spin_lock(&dentry->d_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
@@ -972,6 +810,8 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (!ino)
 		return -ENOMEM;
 
+	autofs4_del_active(dentry);
+
 	inode = autofs4_get_inode(dir->i_sb, ino);
 	if (!inode) {
 		if (!dentry->d_fsdata)
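A recurring theme in the last four autofs4 hunks is keeping the superblock's active and expiring lists in step with dentry lifetime: the create paths (symlink, mkdir) call autofs4_del_active() once the entry really exists, while the delete paths (unlink, rmdir) call autofs4_add_expiring() before unhashing. A hedged sketch of that paired bookkeeping, with a plain mutex and an invented singly-linked list standing in for the kernel primitives:

	#include <pthread.h>
	#include <stddef.h>

	struct node { struct node *next; };

	struct state {
		pthread_mutex_t lookup_lock;	/* stand-in for sbi->lookup_lock */
		struct node *active;		/* stand-in for sbi->active_list */
		struct node *expiring;		/* stand-in for sbi->expiring_list */
	};

	/* On create (symlink/mkdir): the entry is no longer "in flight". */
	static void del_active(struct state *s, struct node *n)
	{
		struct node **pp;

		pthread_mutex_lock(&s->lookup_lock);
		for (pp = &s->active; *pp; pp = &(*pp)->next) {
			if (*pp == n) {
				*pp = n->next;	/* unlink from active list */
				break;
			}
		}
		pthread_mutex_unlock(&s->lookup_lock);
	}

	/* On delete (unlink/rmdir): park the entry so a racing lookup can
	 * find it and wait for the expire to finish. */
	static void add_expiring(struct state *s, struct node *n)
	{
		pthread_mutex_lock(&s->lookup_lock);
		n->next = s->expiring;
		s->expiring = n;
		pthread_mutex_unlock(&s->lookup_lock);
	}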
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 8f3d9fd89604..f22a7d3dc362 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -15,6 +15,7 @@
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
+#include <linux/writeback.h>
 #include <asm/uaccess.h>
 #include "bfs.h"
 
@@ -98,7 +99,7 @@ error:
 	return ERR_PTR(-EIO);
 }
 
-static int bfs_write_inode(struct inode *inode, int wait)
+static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct bfs_sb_info *info = BFS_SB(inode->i_sb);
 	unsigned int ino = (u16)inode->i_ino;
@@ -147,7 +148,7 @@ static int bfs_write_inode(struct inode *inode, int wait)
 	di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
 
 	mark_buffer_dirty(bh);
-	if (wait) {
+	if (wbc->sync_mode == WB_SYNC_ALL) {
 		sync_dirty_buffer(bh);
 		if (buffer_req(bh) && !buffer_uptodate(bh))
 			err = -EIO;
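The bfs change illustrates the tree-wide ->write_inode prototype switch: the bare "int wait" flag is replaced by a struct writeback_control, and the need to block is derived from wbc->sync_mode. A hedged sketch of how a filesystem adapts (the myfs_* names are hypothetical placeholders, not a real driver):

	#include <linux/fs.h>
	#include <linux/writeback.h>
	#include <linux/buffer_head.h>

	/* Hypothetical helpers standing in for a real filesystem's inode I/O. */
	struct buffer_head *myfs_find_inode_buffer(struct inode *inode);
	void myfs_update_raw_inode(struct inode *inode, struct buffer_head *bh);

	static int myfs_write_inode(struct inode *inode, struct writeback_control *wbc)
	{
		struct buffer_head *bh = myfs_find_inode_buffer(inode);
		int err = 0;

		if (!bh)
			return -EIO;

		myfs_update_raw_inode(inode, bh);
		mark_buffer_dirty(bh);

		/* The old "int wait" argument is now derived from the wbc:
		 * only WB_SYNC_ALL (data-integrity) writeback must block. */
		if (wbc->sync_mode == WB_SYNC_ALL) {
			sync_dirty_buffer(bh);
			if (buffer_req(bh) && !buffer_uptodate(bh))
				err = -EIO;
		}
		brelse(bh);
		return err;
	}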
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index fdd397099172..9b6aef0f75e5 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -24,6 +24,7 @@
 #include <linux/binfmts.h>
 #include <linux/personality.h>
 #include <linux/init.h>
+#include <linux/coredump.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -60,26 +61,6 @@ static int set_brk(unsigned long start, unsigned long end)
 }
 
 /*
- * These are the only things you should do on a core-file: use only these
- * macros to write out all the necessary info.
- */
-
-static int dump_write(struct file *file, const void *addr, int nr)
-{
-	return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
-
-#define DUMP_WRITE(addr, nr) \
-	if (!dump_write(file, (void *)(addr), (nr))) \
-		goto end_coredump;
-
-#define DUMP_SEEK(offset) \
-if (file->f_op->llseek) { \
-	if (file->f_op->llseek(file,(offset),0) != (offset)) \
-		goto end_coredump; \
-} else file->f_pos = (offset)
-
-/*
  * Routine writes a core dump image in the current directory.
  * Currently only a stub-function.
  *
@@ -94,14 +75,16 @@ static int aout_core_dump(struct coredump_params *cprm)
 	struct file *file = cprm->file;
 	mm_segment_t fs;
 	int has_dumped = 0;
-	unsigned long dump_start, dump_size;
+	void __user *dump_start;
+	int dump_size;
 	struct user dump;
 #ifdef __alpha__
-# define START_DATA(u)	(u.start_data)
+# define START_DATA(u)	((void __user *)u.start_data)
 #else
-# define START_DATA(u)	((u.u_tsize << PAGE_SHIFT) + u.start_code)
+# define START_DATA(u)	((void __user *)((u.u_tsize << PAGE_SHIFT) + \
+				 u.start_code))
 #endif
-# define START_STACK(u)	(u.start_stack)
+# define START_STACK(u)	((void __user *)u.start_stack)
 
 	fs = get_fs();
 	set_fs(KERNEL_DS);
@@ -123,33 +106,38 @@ static int aout_core_dump(struct coredump_params *cprm)
 
 /* make sure we actually have a data and stack area to dump */
 	set_fs(USER_DS);
-	if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
+	if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
 		dump.u_dsize = 0;
-	if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
+	if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
 		dump.u_ssize = 0;
 
 	set_fs(KERNEL_DS);
 /* struct user */
-	DUMP_WRITE(&dump,sizeof(dump));
+	if (!dump_write(file, &dump, sizeof(dump)))
+		goto end_coredump;
 /* Now dump all of the user data. Include malloced stuff as well */
-	DUMP_SEEK(PAGE_SIZE);
+	if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(dump)))
+		goto end_coredump;
 /* now we start writing out the user space info */
 	set_fs(USER_DS);
 /* Dump the data area */
 	if (dump.u_dsize != 0) {
 		dump_start = START_DATA(dump);
 		dump_size = dump.u_dsize << PAGE_SHIFT;
-		DUMP_WRITE(dump_start,dump_size);
+		if (!dump_write(file, dump_start, dump_size))
+			goto end_coredump;
 	}
 /* Now prepare to dump the stack area */
 	if (dump.u_ssize != 0) {
 		dump_start = START_STACK(dump);
 		dump_size = dump.u_ssize << PAGE_SHIFT;
-		DUMP_WRITE(dump_start,dump_size);
+		if (!dump_write(file, dump_start, dump_size))
+			goto end_coredump;
 	}
 /* Finally dump the task struct. Not be used by gdb, but could be useful */
 	set_fs(KERNEL_DS);
-	DUMP_WRITE(current,sizeof(*current));
+	if (!dump_write(file, current, sizeof(*current)))
+		goto end_coredump;
 end_coredump:
 	set_fs(fs);
 	return has_dumped;
@@ -247,7 +235,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	 * size limits imposed on them by creating programs with large
 	 * arrays in the data or bss.
 	 */
-	rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
+	rlim = rlimit(RLIMIT_DATA);
 	if (rlim >= RLIM_INFINITY)
 		rlim = ~0;
 	if (ex.a_data + ex.a_bss > rlim)
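The per-binfmt dump_write()/DUMP_WRITE copies deleted above are consolidated behind <linux/coredump.h>. A simplified sketch of what those shared helpers look like, reconstructed from the pattern visible in the removed binfmt_elf hunk below (not the exact upstream file):

	/* A short write means the dump is truncated: report failure. */
	static int dump_write(struct file *file, const void *addr, int nr)
	{
		return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
	}

	static int dump_seek(struct file *file, loff_t off)
	{
		/* Seekable files skip holes cheaply; pipes must emit zeroes. */
		if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
			if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
				return 0;
		} else {
			char *buf = (char *)get_zeroed_page(GFP_KERNEL);
			if (!buf)
				return 0;
			while (off > 0) {
				unsigned long n = min_t(loff_t, off, PAGE_SIZE);
				if (!dump_write(file, buf, n))
					return 0;
				off -= n;
			}
			free_page((unsigned long)buf);
		}
		return 1;
	}

Note that the shared dump_seek() seeks relative to the current position (SEEK_CUR), which is why callers above pass deltas such as PAGE_SIZE - sizeof(dump) rather than absolute offsets.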
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index fd5b2ea5d299..535e763ab1a6 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -31,6 +31,7 @@
 #include <linux/random.h>
 #include <linux/elf.h>
 #include <linux/utsname.h>
+#include <linux/coredump.h>
 #include <asm/uaccess.h>
 #include <asm/param.h>
 #include <asm/page.h>
@@ -1085,36 +1086,6 @@ out:
  * Modelled on fs/exec.c:aout_core_dump()
  * Jeremy Fitzhardinge <jeremy@sw.oz.au>
  */
-/*
- * These are the only things you should do on a core-file: use only these
- * functions to write out all the necessary info.
- */
-static int dump_write(struct file *file, const void *addr, int nr)
-{
-	return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
-
-static int dump_seek(struct file *file, loff_t off)
-{
-	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
-		if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
-			return 0;
-	} else {
-		char *buf = (char *)get_zeroed_page(GFP_KERNEL);
-		if (!buf)
-			return 0;
-		while (off > 0) {
-			unsigned long n = off;
-			if (n > PAGE_SIZE)
-				n = PAGE_SIZE;
-			if (!dump_write(file, buf, n))
-				return 0;
-			off -= n;
-		}
-		free_page((unsigned long)buf);
-	}
-	return 1;
-}
 
 /*
  * Decide what to dump of a segment, part, all or none.
@@ -1249,11 +1220,6 @@ static int writenote(struct memelfnote *men, struct file *file,
 }
 #undef DUMP_WRITE
 
-#define DUMP_WRITE(addr, nr)				\
-	if ((size += (nr)) > cprm->limit ||		\
-	    !dump_write(cprm->file, (addr), (nr)))	\
-		goto end_coredump;
-
 static void fill_elf_header(struct elfhdr *elf, int segs,
 			    u16 machine, u32 flags, u8 osabi)
 {
@@ -1872,6 +1838,34 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
 	return gate_vma;
 }
 
+static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
+			     elf_addr_t e_shoff, int segs)
+{
+	elf->e_shoff = e_shoff;
+	elf->e_shentsize = sizeof(*shdr4extnum);
+	elf->e_shnum = 1;
+	elf->e_shstrndx = SHN_UNDEF;
+
+	memset(shdr4extnum, 0, sizeof(*shdr4extnum));
+
+	shdr4extnum->sh_type = SHT_NULL;
+	shdr4extnum->sh_size = elf->e_shnum;
+	shdr4extnum->sh_link = elf->e_shstrndx;
+	shdr4extnum->sh_info = segs;
+}
+
+static size_t elf_core_vma_data_size(struct vm_area_struct *gate_vma,
+				     unsigned long mm_flags)
+{
+	struct vm_area_struct *vma;
+	size_t size = 0;
+
+	for (vma = first_vma(current, gate_vma); vma != NULL;
+	     vma = next_vma(vma, gate_vma))
+		size += vma_dump_size(vma, mm_flags);
+	return size;
+}
+
 /*
  * Actual dumper
  *
1876 * Actual dumper 1870 * Actual dumper
1877 * 1871 *
@@ -1888,8 +1882,11 @@ static int elf_core_dump(struct coredump_params *cprm)
 	struct vm_area_struct *vma, *gate_vma;
 	struct elfhdr *elf = NULL;
 	loff_t offset = 0, dataoff, foffset;
-	unsigned long mm_flags;
 	struct elf_note_info info;
+	struct elf_phdr *phdr4note = NULL;
+	struct elf_shdr *shdr4extnum = NULL;
+	Elf_Half e_phnum;
+	elf_addr_t e_shoff;
 
 	/*
 	 * We no longer stop all VM operations.
@@ -1912,20 +1909,25 @@ static int elf_core_dump(struct coredump_params *cprm)
 	 * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
 	 */
 	segs = current->mm->map_count;
-#ifdef ELF_CORE_EXTRA_PHDRS
-	segs += ELF_CORE_EXTRA_PHDRS;
-#endif
+	segs += elf_core_extra_phdrs();
 
 	gate_vma = get_gate_vma(current);
 	if (gate_vma != NULL)
 		segs++;
 
+	/* for notes section */
+	segs++;
+
+	/* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid
+	 * this, kernel supports extended numbering. Have a look at
+	 * include/linux/elf.h for further information. */
+	e_phnum = segs > PN_XNUM ? PN_XNUM : segs;
+
 	/*
 	 * Collect all the non-memory information about the process for the
 	 * notes.  This also sets up the file header.
 	 */
-	if (!fill_note_info(elf, segs + 1, /* including notes section */
-			    &info, cprm->signr, cprm->regs))
+	if (!fill_note_info(elf, e_phnum, &info, cprm->signr, cprm->regs))
 		goto cleanup;
 
 	has_dumped = 1;
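The PN_XNUM dance above is the writer side of ELF extended numbering: when a core would need 0xffff or more program headers, e_phnum is clamped to PN_XNUM and the real count is parked in sh_info of a single SHT_NULL section header (written by fill_extnum_info()). A small userspace sketch of the reader side, under the assumption of an ELF64 core and glibc's <elf.h>:

	#include <elf.h>
	#include <stddef.h>

	/* Recover the true program-header count from a core file. */
	static size_t real_phnum(const Elf64_Ehdr *eh, const Elf64_Shdr *sh0)
	{
		if (eh->e_phnum != PN_XNUM)
			return eh->e_phnum;	/* small file: count fits */
		return sh0->sh_info;		/* extended numbering in use */
	}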
@@ -1934,31 +1936,47 @@ static int elf_core_dump(struct coredump_params *cprm)
 	fs = get_fs();
 	set_fs(KERNEL_DS);
 
-	DUMP_WRITE(elf, sizeof(*elf));
 	offset += sizeof(*elf);				/* Elf header */
-	offset += (segs + 1) * sizeof(struct elf_phdr);	/* Program headers */
+	offset += segs * sizeof(struct elf_phdr);	/* Program headers */
 	foffset = offset;
 
 	/* Write notes phdr entry */
 	{
-		struct elf_phdr phdr;
 		size_t sz = get_note_info_size(&info);
 
 		sz += elf_coredump_extra_notes_size();
 
-		fill_elf_note_phdr(&phdr, sz, offset);
+		phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
+		if (!phdr4note)
+			goto end_coredump;
+
+		fill_elf_note_phdr(phdr4note, sz, offset);
 		offset += sz;
-		DUMP_WRITE(&phdr, sizeof(phdr));
 	}
 
 	dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
 
-	/*
-	 * We must use the same mm->flags while dumping core to avoid
-	 * inconsistency between the program headers and bodies, otherwise an
-	 * unusable core file can be generated.
-	 */
-	mm_flags = current->mm->flags;
+	offset += elf_core_vma_data_size(gate_vma, cprm->mm_flags);
+	offset += elf_core_extra_data_size();
+	e_shoff = offset;
+
+	if (e_phnum == PN_XNUM) {
+		shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL);
+		if (!shdr4extnum)
+			goto end_coredump;
+		fill_extnum_info(elf, shdr4extnum, e_shoff, segs);
+	}
+
+	offset = dataoff;
+
+	size += sizeof(*elf);
+	if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf)))
+		goto end_coredump;
+
+	size += sizeof(*phdr4note);
+	if (size > cprm->limit
+	    || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note)))
+		goto end_coredump;
 
 	/* Write program headers for segments dump */
 	for (vma = first_vma(current, gate_vma); vma != NULL;
@@ -1969,7 +1987,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 		phdr.p_offset = offset;
 		phdr.p_vaddr = vma->vm_start;
 		phdr.p_paddr = 0;
-		phdr.p_filesz = vma_dump_size(vma, mm_flags);
+		phdr.p_filesz = vma_dump_size(vma, cprm->mm_flags);
 		phdr.p_memsz = vma->vm_end - vma->vm_start;
 		offset += phdr.p_filesz;
 		phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
@@ -1979,12 +1997,14 @@ static int elf_core_dump(struct coredump_params *cprm)
 			phdr.p_flags |= PF_X;
 		phdr.p_align = ELF_EXEC_PAGESIZE;
 
-		DUMP_WRITE(&phdr, sizeof(phdr));
+		size += sizeof(phdr);
+		if (size > cprm->limit
+		    || !dump_write(cprm->file, &phdr, sizeof(phdr)))
+			goto end_coredump;
 	}
 
-#ifdef ELF_CORE_WRITE_EXTRA_PHDRS
-	ELF_CORE_WRITE_EXTRA_PHDRS;
-#endif
+	if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit))
+		goto end_coredump;
 
 	/* write out the notes section */
 	if (!write_note_info(&info, cprm->file, &foffset))
@@ -2002,7 +2022,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 		unsigned long addr;
 		unsigned long end;
 
-		end = vma->vm_start + vma_dump_size(vma, mm_flags);
+		end = vma->vm_start + vma_dump_size(vma, cprm->mm_flags);
 
 		for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
 			struct page *page;
@@ -2023,15 +2043,24 @@ static int elf_core_dump(struct coredump_params *cprm)
 		}
 	}
 
-#ifdef ELF_CORE_WRITE_EXTRA_DATA
-	ELF_CORE_WRITE_EXTRA_DATA;
-#endif
+	if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit))
+		goto end_coredump;
+
+	if (e_phnum == PN_XNUM) {
+		size += sizeof(*shdr4extnum);
+		if (size > cprm->limit
+		    || !dump_write(cprm->file, shdr4extnum,
+				   sizeof(*shdr4extnum)))
+			goto end_coredump;
+	}
 
 end_coredump:
 	set_fs(fs);
 
 cleanup:
 	free_note_info(&info);
+	kfree(shdr4extnum);
+	kfree(phdr4note);
 	kfree(elf);
 out:
 	return has_dumped;
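The restructured dumper above computes every file offset before writing a single byte: ELF header, program headers, the notes segment, the page-aligned data area, then the optional extended-numbering section header at e_shoff. A small sketch of that arithmetic, with invented example sizes and assuming an ELF64 layout:

	#include <stdio.h>

	#define ELF_EXEC_PAGESIZE 4096UL

	static unsigned long roundup_to(unsigned long v, unsigned long a)
	{
		return (v + a - 1) / a * a;
	}

	int main(void)
	{
		unsigned long ehdr = 64, phdr = 56;	/* ELF64 header sizes */
		unsigned long segs = 3, notes = 1000, vma_data = 3 * 8192;

		unsigned long offset = ehdr + segs * phdr;	/* headers */
		unsigned long foffset = offset;		/* notes start here */
		offset += notes;
		unsigned long dataoff = roundup_to(offset, ELF_EXEC_PAGESIZE);
		unsigned long e_shoff = dataoff + vma_data;	/* extnum shdr */

		printf("notes at %lu, data at %lu, e_shoff %lu\n",
		       foffset, dataoff, e_shoff);
		return 0;
	}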
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 18d77297ccc8..7ab23e006e4c 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -34,6 +34,7 @@
 #include <linux/elf.h>
 #include <linux/elf-fdpic.h>
 #include <linux/elfcore.h>
+#include <linux/coredump.h>
 
 #include <asm/uaccess.h>
 #include <asm/param.h>
@@ -1216,26 +1217,6 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 #ifdef CONFIG_ELF_CORE
 
 /*
- * These are the only things you should do on a core-file: use only these
- * functions to write out all the necessary info.
- */
-static int dump_write(struct file *file, const void *addr, int nr)
-{
-	return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
-
-static int dump_seek(struct file *file, loff_t off)
-{
-	if (file->f_op->llseek) {
-		if (file->f_op->llseek(file, off, SEEK_SET) != off)
-			return 0;
-	} else {
-		file->f_pos = off;
-	}
-	return 1;
-}
-
-/*
  * Decide whether a segment is worth dumping; default is yes to be
  * sure (missing info is worse than too much; etc).
  * Personally I'd include everything, and use the coredump limit...
@@ -1313,35 +1294,35 @@ static int notesize(struct memelfnote *en)
 
 /* #define DEBUG */
 
-#define DUMP_WRITE(addr, nr)	\
-	do { if (!dump_write(file, (addr), (nr))) return 0; } while(0)
-#define DUMP_SEEK(off)	\
-	do { if (!dump_seek(file, (off))) return 0; } while(0)
+#define DUMP_WRITE(addr, nr, foffset)	\
+	do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0)
 
-static int writenote(struct memelfnote *men, struct file *file)
+static int alignfile(struct file *file, loff_t *foffset)
 {
-	struct elf_note en;
+	static const char buf[4] = { 0, };
+	DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset);
+	return 1;
+}
 
+static int writenote(struct memelfnote *men, struct file *file,
+			loff_t *foffset)
+{
+	struct elf_note en;
 	en.n_namesz = strlen(men->name) + 1;
 	en.n_descsz = men->datasz;
 	en.n_type = men->type;
 
-	DUMP_WRITE(&en, sizeof(en));
-	DUMP_WRITE(men->name, en.n_namesz);
-	/* XXX - cast from long long to long to avoid need for libgcc.a */
-	DUMP_SEEK(roundup((unsigned long)file->f_pos, 4));	/* XXX */
-	DUMP_WRITE(men->data, men->datasz);
-	DUMP_SEEK(roundup((unsigned long)file->f_pos, 4));	/* XXX */
+	DUMP_WRITE(&en, sizeof(en), foffset);
+	DUMP_WRITE(men->name, en.n_namesz, foffset);
+	if (!alignfile(file, foffset))
+		return 0;
+	DUMP_WRITE(men->data, men->datasz, foffset);
+	if (!alignfile(file, foffset))
+		return 0;
 
 	return 1;
 }
 #undef DUMP_WRITE
-#undef DUMP_SEEK
-
-#define DUMP_WRITE(addr, nr)	\
-	if ((size += (nr)) > cprm->limit || \
-	    !dump_write(cprm->file, (addr), (nr))) \
-		goto end_coredump;
 
 static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs)
 {
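alignfile() above pads the note stream to a 4-byte boundary while *foffset tracks how many bytes have actually been emitted, because the old code trusted file->f_pos, which is meaningless when the core goes to a pipe. A stand-alone userspace sketch of the same padding arithmetic:

	#include <stdio.h>

	/* Pad a byte stream to a 4-byte boundary, tracking our own offset
	 * instead of trusting the file position (a pipe doesn't have one). */
	static int alignfile(FILE *f, long long *foffset)
	{
		static const char buf[4];	/* zeroes */
		size_t pad = (size_t)(((*foffset + 3) & ~3LL) - *foffset);

		if (fwrite(buf, 1, pad, f) != pad)
			return 0;
		*foffset += pad;
		return 1;
	}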
@@ -1393,7 +1374,7 @@ static inline void fill_note(struct memelfnote *note, const char *name, int type
 
 /*
  * fill up all the fields in prstatus from the given task struct, except
- * registers which need to be filled up seperately.
+ * registers which need to be filled up separately.
  */
 static void fill_prstatus(struct elf_prstatus *prstatus,
 			  struct task_struct *p, long signr)
@@ -1524,6 +1505,22 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
 	return sz;
 }
 
+static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
+			     elf_addr_t e_shoff, int segs)
+{
+	elf->e_shoff = e_shoff;
+	elf->e_shentsize = sizeof(*shdr4extnum);
+	elf->e_shnum = 1;
+	elf->e_shstrndx = SHN_UNDEF;
+
+	memset(shdr4extnum, 0, sizeof(*shdr4extnum));
+
+	shdr4extnum->sh_type = SHT_NULL;
+	shdr4extnum->sh_size = elf->e_shnum;
+	shdr4extnum->sh_link = elf->e_shstrndx;
+	shdr4extnum->sh_info = segs;
+}
+
 /*
  * dump the segments for an MMU process
  */
@@ -1552,7 +1549,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
 			err = -EIO;
 			kunmap(page);
 			page_cache_release(page);
-		} else if (!dump_seek(file, file->f_pos + PAGE_SIZE))
+		} else if (!dump_seek(file, PAGE_SIZE))
 			err = -EFBIG;
 		if (err)
 			goto out;
@@ -1588,6 +1585,17 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
 }
 #endif
 
+static size_t elf_core_vma_data_size(unsigned long mm_flags)
+{
+	struct vm_area_struct *vma;
+	size_t size = 0;
+
+	for (vma = current->mm->mmap; vma; vma = vma->vm_next)
+		if (maydump(vma, mm_flags))
+			size += vma->vm_end - vma->vm_start;
+	return size;
+}
+
 /*
  * Actual dumper
  *
@@ -1605,7 +1613,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 	int i;
 	struct vm_area_struct *vma;
 	struct elfhdr *elf = NULL;
-	loff_t offset = 0, dataoff;
+	loff_t offset = 0, dataoff, foffset;
 	int numnote;
 	struct memelfnote *notes = NULL;
 	struct elf_prstatus *prstatus = NULL;	/* NT_PRSTATUS */
@@ -1618,7 +1626,10 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 #endif
 	int thread_status_size = 0;
 	elf_addr_t *auxv;
-	unsigned long mm_flags;
+	struct elf_phdr *phdr4note = NULL;
+	struct elf_shdr *shdr4extnum = NULL;
+	Elf_Half e_phnum;
+	elf_addr_t e_shoff;
 
 	/*
 	 * We no longer stop all VM operations.
@@ -1683,12 +1694,18 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 	elf_core_copy_regs(&prstatus->pr_reg, cprm->regs);
 
 	segs = current->mm->map_count;
-#ifdef ELF_CORE_EXTRA_PHDRS
-	segs += ELF_CORE_EXTRA_PHDRS;
-#endif
+	segs += elf_core_extra_phdrs();
+
+	/* for notes section */
+	segs++;
+
+	/* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid
+	 * this, kernel supports extended numbering. Have a look at
+	 * include/linux/elf.h for further information. */
+	e_phnum = segs > PN_XNUM ? PN_XNUM : segs;
 
 	/* Set up header */
-	fill_elf_fdpic_header(elf, segs + 1);	/* including notes section */
+	fill_elf_fdpic_header(elf, e_phnum);
 
 	has_dumped = 1;
 	current->flags |= PF_DUMPCORE;
@@ -1727,13 +1744,12 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 	fs = get_fs();
 	set_fs(KERNEL_DS);
 
-	DUMP_WRITE(elf, sizeof(*elf));
 	offset += sizeof(*elf);				/* Elf header */
-	offset += (segs+1) * sizeof(struct elf_phdr);	/* Program headers */
+	offset += segs * sizeof(struct elf_phdr);	/* Program headers */
+	foffset = offset;
 
 	/* Write notes phdr entry */
 	{
-		struct elf_phdr phdr;
 		int sz = 0;
 
 		for (i = 0; i < numnote; i++)
@@ -1741,20 +1757,38 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 
 		sz += thread_status_size;
 
-		fill_elf_note_phdr(&phdr, sz, offset);
+		phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
+		if (!phdr4note)
+			goto end_coredump;
+
+		fill_elf_note_phdr(phdr4note, sz, offset);
 		offset += sz;
-		DUMP_WRITE(&phdr, sizeof(phdr));
 	}
 
 	/* Page-align dumped data */
 	dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
 
-	/*
-	 * We must use the same mm->flags while dumping core to avoid
-	 * inconsistency between the program headers and bodies, otherwise an
-	 * unusable core file can be generated.
-	 */
-	mm_flags = current->mm->flags;
+	offset += elf_core_vma_data_size(cprm->mm_flags);
+	offset += elf_core_extra_data_size();
+	e_shoff = offset;
+
+	if (e_phnum == PN_XNUM) {
+		shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL);
+		if (!shdr4extnum)
+			goto end_coredump;
+		fill_extnum_info(elf, shdr4extnum, e_shoff, segs);
+	}
+
+	offset = dataoff;
+
+	size += sizeof(*elf);
+	if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf)))
+		goto end_coredump;
+
+	size += sizeof(*phdr4note);
+	if (size > cprm->limit
+	    || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note)))
+		goto end_coredump;
 
 	/* write program headers for segments dump */
 	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
@@ -1767,7 +1801,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 		phdr.p_offset = offset;
 		phdr.p_vaddr = vma->vm_start;
 		phdr.p_paddr = 0;
-		phdr.p_filesz = maydump(vma, mm_flags) ? sz : 0;
+		phdr.p_filesz = maydump(vma, cprm->mm_flags) ? sz : 0;
 		phdr.p_memsz = sz;
 		offset += phdr.p_filesz;
 		phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
@@ -1777,16 +1811,18 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 			phdr.p_flags |= PF_X;
 		phdr.p_align = ELF_EXEC_PAGESIZE;
 
-		DUMP_WRITE(&phdr, sizeof(phdr));
+		size += sizeof(phdr);
+		if (size > cprm->limit
+		    || !dump_write(cprm->file, &phdr, sizeof(phdr)))
+			goto end_coredump;
 	}
 
-#ifdef ELF_CORE_WRITE_EXTRA_PHDRS
-	ELF_CORE_WRITE_EXTRA_PHDRS;
-#endif
+	if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit))
+		goto end_coredump;
 
 	/* write out the notes section */
 	for (i = 0; i < numnote; i++)
-		if (!writenote(notes + i, cprm->file))
+		if (!writenote(notes + i, cprm->file, &foffset))
 			goto end_coredump;
 
 	/* write out the thread status notes section */
@@ -1795,20 +1831,27 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 			list_entry(t, struct elf_thread_status, list);
 
 		for (i = 0; i < tmp->num_notes; i++)
-			if (!writenote(&tmp->notes[i], cprm->file))
+			if (!writenote(&tmp->notes[i], cprm->file, &foffset))
 				goto end_coredump;
 	}
 
-	if (!dump_seek(cprm->file, dataoff))
+	if (!dump_seek(cprm->file, dataoff - foffset))
 		goto end_coredump;
 
 	if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit,
-				    mm_flags) < 0)
+				    cprm->mm_flags) < 0)
 		goto end_coredump;
 
-#ifdef ELF_CORE_WRITE_EXTRA_DATA
-	ELF_CORE_WRITE_EXTRA_DATA;
-#endif
+	if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit))
+		goto end_coredump;
+
+	if (e_phnum == PN_XNUM) {
+		size += sizeof(*shdr4extnum);
+		if (size > cprm->limit
+		    || !dump_write(cprm->file, shdr4extnum,
+				   sizeof(*shdr4extnum)))
+			goto end_coredump;
+	}
 
 	if (cprm->file->f_pos != offset) {
 		/* Sanity check */
@@ -1826,7 +1869,7 @@ cleanup:
 		list_del(tmp);
 		kfree(list_entry(tmp, struct elf_thread_status, list));
 	}
-
+	kfree(phdr4note);
 	kfree(elf);
 	kfree(prstatus);
 	kfree(psinfo);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 42c6b4a54445..e0e769bdca59 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -501,7 +501,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 	 * size limits imposed on them by creating programs with large
 	 * arrays in the data or bss.
 	 */
-	rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
+	rlim = rlimit(RLIMIT_DATA);
 	if (rlim >= RLIM_INFINITY)
 		rlim = ~0;
 	if (data_len + bss_len > rlim) {
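Both binfmt_aout and binfmt_flat switch from open-coding the signal-struct array lookup to the rlimit() accessor. The helper's shape is roughly the following sketch (the real definition lives in sched.h and may add access annotations):

	#include <linux/sched.h>

	static inline unsigned long rlimit(unsigned int limit)
	{
		return current->signal->rlim[limit].rlim_cur;
	}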
diff --git a/fs/bio.c b/fs/bio.c
index 0bda289f86fc..e1f922184b45 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -264,13 +264,12 @@ EXPORT_SYMBOL(bio_init);
  * bio_alloc_bioset - allocate a bio for I/O
  * @gfp_mask:   the GFP_ mask given to the slab allocator
  * @nr_iovecs:	number of iovecs to pre-allocate
- * @bs:		the bio_set to allocate from. If %NULL, just use kmalloc
+ * @bs:		the bio_set to allocate from.
  *
  * Description:
- *   bio_alloc_bioset will first try its own mempool to satisfy the allocation.
+ *   bio_alloc_bioset will try its own mempool to satisfy the allocation.
  *   If %__GFP_WAIT is set then we will block on the internal pool waiting
- *   for a &struct bio to become free. If a %NULL @bs is passed in, we will
- *   fall back to just using @kmalloc to allocate the required memory.
+ *   for a &struct bio to become free.
  *
  * Note that the caller must set ->bi_destructor on successful return
  * of a bio, to do the appropriate freeing of the bio once the reference
@@ -555,7 +554,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 		.bi_rw = bio->bi_rw,
 	};
 
-	if (q->merge_bvec_fn(q, &bvm, prev) != prev->bv_len) {
+	if (q->merge_bvec_fn(q, &bvm, prev) < len) {
 		prev->bv_len -= len;
 		return 0;
 	}
@@ -608,7 +607,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 		 * merge_bvec_fn() returns number of bytes it can accept
 		 * at this offset
 		 */
-		if (q->merge_bvec_fn(q, &bvm, bvec) != bvec->bv_len) {
+		if (q->merge_bvec_fn(q, &bvm, bvec) < len) {
 			bvec->bv_page = NULL;
 			bvec->bv_len = 0;
 			bvec->bv_offset = 0;
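The two bio hunks relax the check on q->merge_bvec_fn: instead of demanding that the driver accept exactly the vector's current bv_len, the caller now only requires that the driver accept at least the len being added, since a driver may legitimately offer more room than asked for. A hedged sketch of a merge_bvec_fn illustrating the contract (invented driver with an invented 64 KiB boundary rule):

	/* Sketch: a merge_bvec_fn returns how many bytes it can accept at
	 * bvm->bi_sector + bvm->bi_size; the caller only needs >= len. */
	static int toy_merge_bvec(struct request_queue *q,
				  struct bvec_merge_data *bvm,
				  struct bio_vec *biovec)
	{
		unsigned int boundary = 64 * 1024;	/* invented constraint */
		unsigned int done = bvm->bi_size;
		unsigned int room = boundary - (done % boundary);

		return room < biovec->bv_len ? room : biovec->bv_len;
	}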
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3f1f50d9d916..7a4dee199832 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -153,6 +153,11 @@ struct btrfs_inode {
153 unsigned ordered_data_close:1; 153 unsigned ordered_data_close:1;
154 unsigned dummy_inode:1; 154 unsigned dummy_inode:1;
155 155
156 /*
157 * always compress this one file
158 */
159 unsigned force_compress:1;
160
156 struct inode vfs_inode; 161 struct inode vfs_inode;
157}; 162};
158 163
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index a11a32058b50..28b92a7218ab 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -478,7 +478,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
478 goto next; 478 goto next;
479 } 479 }
480 480
481 page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS); 481 page = alloc_page(mapping_gfp_mask(mapping) & ~__GFP_FS);
482 if (!page) 482 if (!page)
483 break; 483 break;
484 484
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2aa8ec6a0981..0af2e3868573 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -373,11 +373,13 @@ struct btrfs_super_block {
373 * ones specified below then we will fail to mount 373 * ones specified below then we will fail to mount
374 */ 374 */
375#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) 375#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
376#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (2ULL << 0)
376 377
377#define BTRFS_FEATURE_COMPAT_SUPP 0ULL 378#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
378#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL 379#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
379#define BTRFS_FEATURE_INCOMPAT_SUPP \ 380#define BTRFS_FEATURE_INCOMPAT_SUPP \
380 BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF 381 (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
382 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)
381 383
382/* 384/*
383 * A leaf is full of items. offset and size tell us where to find 385 * A leaf is full of items. offset and size tell us where to find
@@ -1182,7 +1184,6 @@ struct btrfs_root {
 #define BTRFS_INODE_NOATIME		(1 << 9)
 #define BTRFS_INODE_DIRSYNC		(1 << 10)
 
-
 /* some macros to generate set/get funcs for the struct fields.  This
  * assumes there is a lefoo_to_cpu for every type, so lets make a simple
  * one for u8:
@@ -1842,7 +1843,7 @@ BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
 BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
 			 compat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
-			 compat_flags, 64);
+			 compat_ro_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
 			 incompat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
@@ -2310,7 +2311,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+			      struct extent_state **cached_state);
 int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -2326,7 +2328,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_delete_inode(struct inode *inode);
 void btrfs_put_inode(struct inode *inode);
-int btrfs_write_inode(struct inode *inode, int wait);
+int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 void btrfs_dirty_inode(struct inode *inode);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
@@ -2335,7 +2337,7 @@ int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root);
+			 struct btrfs_root *root, int *was_new);
 int btrfs_commit_write(struct file *file, struct page *page,
 		       unsigned from, unsigned to);
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2386,7 +2388,6 @@ void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
2386ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); 2388ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
2387 2389
2388/* super.c */ 2390/* super.c */
2389u64 btrfs_parse_size(char *str);
2390int btrfs_parse_options(struct btrfs_root *root, char *options); 2391int btrfs_parse_options(struct btrfs_root *root, char *options);
2391int btrfs_sync_fs(struct super_block *sb, int wait); 2392int btrfs_sync_fs(struct super_block *sb, int wait);
2392 2393
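Two prototype changes above drive most of the churn in the files that follow: btrfs_iget() gains an optional out-parameter reporting whether the inode was freshly read in, and btrfs_set_extent_delalloc() now threads a cached extent_state through to the extent tree. A minimal sketch of an adapted call site (not part of the patch; the surrounding variables are assumed to be set up as elsewhere in this tree):

	int was_new = 0;
	struct inode *inode;

	/* callers that do not care may simply pass NULL, as the hunks below do */
	inode = btrfs_iget(sb, &location, root, &was_new);

	/* no cached extent state on hand: the new argument may be NULL too */
	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, NULL);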
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2b59201b955c..11d0ad30e203 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -263,13 +263,15 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 static int verify_parent_transid(struct extent_io_tree *io_tree,
 				 struct extent_buffer *eb, u64 parent_transid)
 {
+	struct extent_state *cached_state = NULL;
 	int ret;
 
 	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
 		return 0;
 
-	lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
-	if (extent_buffer_uptodate(io_tree, eb) &&
+	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
+			 0, &cached_state, GFP_NOFS);
+	if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
 	    btrfs_header_generation(eb) == parent_transid) {
 		ret = 0;
 		goto out;
@@ -282,10 +284,10 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 		       (unsigned long long)btrfs_header_generation(eb));
 	}
 	ret = 1;
-	clear_extent_buffer_uptodate(io_tree, eb);
+	clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
 out:
-	unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
-		      GFP_NOFS);
+	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
+			     &cached_state, GFP_NOFS);
 	return ret;
 }
 
@@ -901,7 +903,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->highest_objectid = 0;
 	root->name = NULL;
 	root->in_sysfs = 0;
-	root->inode_tree.rb_node = NULL;
+	root->inode_tree = RB_ROOT;
 
 	INIT_LIST_HEAD(&root->dirty_list);
 	INIT_LIST_HEAD(&root->orphan_list);
@@ -1673,7 +1675,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	insert_inode_hash(fs_info->btree_inode);
 
 	spin_lock_init(&fs_info->block_group_cache_lock);
-	fs_info->block_group_cache_tree.rb_node = NULL;
+	fs_info->block_group_cache_tree = RB_ROOT;
 
 	extent_io_tree_init(&fs_info->freed_extents[0],
 			    fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2497,7 +2499,8 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
 	int ret;
 	struct inode *btree_inode = buf->first_page->mapping->host;
 
-	ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
+	ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
+				     NULL);
 	if (!ret)
 		return ret;
 
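The verify_parent_transid() hunks above are the template repeated throughout this patch: a lock_extent()/unlock_extent() pair becomes lock_extent_bits() plus unlock_extent_cached(), with an extent_state pointer carried between them so the unlock can reuse the cached tree node instead of searching the extent tree a second time. The bare pattern, sketched using only the signatures this patch declares in extent_io.h:

	struct extent_state *cached_state = NULL;

	lock_extent_bits(io_tree, start, end, 0, &cached_state, GFP_NOFS);
	/* ... operate on the locked range [start, end] ... */
	unlock_extent_cached(io_tree, start, end, &cached_state, GFP_NOFS);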
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ba5c3fd5ab8c..951ef09b82f4 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -95,7 +95,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
 
-	inode = btrfs_iget(sb, &key, root);
+	inode = btrfs_iget(sb, &key, root, NULL);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto fail;
@@ -223,7 +223,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
-	dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+	dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
 	if (!IS_ERR(dentry))
 		dentry->d_op = &btrfs_dentry_operations;
 	return dentry;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 559f72489b3b..1727b26fb194 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6561,6 +6561,7 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
 	struct btrfs_key key;
 	struct inode *inode = NULL;
 	struct btrfs_file_extent_item *fi;
+	struct extent_state *cached_state = NULL;
 	u64 num_bytes;
 	u64 skip_objectid = 0;
 	u32 nritems;
@@ -6589,12 +6590,14 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
 		}
 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
 
-		lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
-			    key.offset + num_bytes - 1, GFP_NOFS);
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
+				 key.offset + num_bytes - 1, 0, &cached_state,
+				 GFP_NOFS);
 		btrfs_drop_extent_cache(inode, key.offset,
 					key.offset + num_bytes - 1, 1);
-		unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
-			      key.offset + num_bytes - 1, GFP_NOFS);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
+				     key.offset + num_bytes - 1, &cached_state,
+				     GFP_NOFS);
 		cond_resched();
 	}
 	iput(inode);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b177ed319612..c99121ac5d6b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -104,8 +104,8 @@ void extent_io_exit(void)
 void extent_io_tree_init(struct extent_io_tree *tree,
 			 struct address_space *mapping, gfp_t mask)
 {
-	tree->state.rb_node = NULL;
-	tree->buffer.rb_node = NULL;
+	tree->state = RB_ROOT;
+	tree->buffer = RB_ROOT;
 	tree->ops = NULL;
 	tree->dirty_bytes = 0;
 	spin_lock_init(&tree->lock);
@@ -513,7 +513,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	u64 last_end;
 	int err;
 	int set = 0;
+	int clear = 0;
 
+	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+		clear = 1;
 again:
 	if (!prealloc && (mask & __GFP_WAIT)) {
 		prealloc = alloc_extent_state(mask);
@@ -524,14 +527,20 @@ again:
 	spin_lock(&tree->lock);
 	if (cached_state) {
 		cached = *cached_state;
-		*cached_state = NULL;
-		cached_state = NULL;
+
+		if (clear) {
+			*cached_state = NULL;
+			cached_state = NULL;
+		}
+
 		if (cached && cached->tree && cached->start == start) {
-			atomic_dec(&cached->refs);
+			if (clear)
+				atomic_dec(&cached->refs);
 			state = cached;
 			goto hit_next;
 		}
-		free_extent_state(cached);
+		if (clear)
+			free_extent_state(cached);
 	}
 	/*
 	 * this search will find the extents that end after
@@ -946,11 +955,11 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 }
 
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
-			gfp_t mask)
+			struct extent_state **cached_state, gfp_t mask)
 {
 	return set_extent_bit(tree, start, end,
 			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
-			      0, NULL, NULL, mask);
+			      0, NULL, cached_state, mask);
 }
 
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -984,10 +993,11 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 }
 
 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
-				 u64 end, gfp_t mask)
+				 u64 end, struct extent_state **cached_state,
+				 gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
-				NULL, mask);
+				cached_state, mask);
 }
 
 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -1171,7 +1181,8 @@ out:
  * 1 is returned if we find something, 0 if nothing was in the tree
 */
 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
-					u64 *start, u64 *end, u64 max_bytes)
+					u64 *start, u64 *end, u64 max_bytes,
+					struct extent_state **cached_state)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1203,8 +1214,11 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
 			*end = state->end;
 			goto out;
 		}
-		if (!found)
+		if (!found) {
 			*start = state->start;
+			*cached_state = state;
+			atomic_inc(&state->refs);
+		}
 		found++;
 		*end = state->end;
 		cur_start = state->end + 1;
@@ -1336,10 +1350,11 @@ again:
 	delalloc_start = *start;
 	delalloc_end = 0;
 	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
-				    max_bytes);
+				    max_bytes, &cached_state);
 	if (!found || delalloc_end <= *start) {
 		*start = delalloc_start;
 		*end = delalloc_end;
+		free_extent_state(cached_state);
 		return found;
 	}
 
@@ -1722,7 +1737,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 		}
 
 		if (!uptodate) {
-			clear_extent_uptodate(tree, start, end, GFP_NOFS);
+			clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
 			ClearPageUptodate(page);
 			SetPageError(page);
 		}
@@ -1750,7 +1765,8 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 static void end_bio_extent_readpage(struct bio *bio, int err)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec = bio->bi_io_vec;
 	struct extent_io_tree *tree;
 	u64 start;
 	u64 end;
@@ -1773,7 +1789,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		else
 			whole_page = 0;
 
-		if (--bvec >= bio->bi_io_vec)
+		if (++bvec <= bvec_end)
 			prefetchw(&bvec->bv_page->flags);
 
 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
@@ -1818,7 +1834,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			}
 			check_page_locked(tree, page);
 		}
-	} while (bvec >= bio->bi_io_vec);
+	} while (bvec <= bvec_end);
 
 	bio_put(bio);
 }
@@ -2704,6 +2720,7 @@ int extent_readpages(struct extent_io_tree *tree,
 int extent_invalidatepage(struct extent_io_tree *tree,
 			  struct page *page, unsigned long offset)
 {
+	struct extent_state *cached_state = NULL;
 	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
@@ -2712,12 +2729,12 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 	if (start > end)
 		return 0;
 
-	lock_extent(tree, start, end, GFP_NOFS);
+	lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
 	wait_on_page_writeback(page);
 	clear_extent_bit(tree, start, end,
 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
 			 EXTENT_DO_ACCOUNTING,
-			 1, 1, NULL, GFP_NOFS);
+			 1, 1, &cached_state, GFP_NOFS);
 	return 0;
 }
 
@@ -2920,16 +2937,17 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
 		     get_extent_t *get_extent)
 {
 	struct inode *inode = mapping->host;
+	struct extent_state *cached_state = NULL;
 	u64 start = iblock << inode->i_blkbits;
 	sector_t sector = 0;
 	size_t blksize = (1 << inode->i_blkbits);
 	struct extent_map *em;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
-		    GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
+			 0, &cached_state, GFP_NOFS);
 	em = get_extent(inode, NULL, 0, start, blksize, 0);
-	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
-		      GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start,
+			     start + blksize - 1, &cached_state, GFP_NOFS);
 	if (!em || IS_ERR(em))
 		return 0;
 
@@ -2951,6 +2969,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	u32 flags = 0;
 	u64 disko = 0;
 	struct extent_map *em = NULL;
+	struct extent_state *cached_state = NULL;
 	int end = 0;
 	u64 em_start = 0, em_len = 0;
 	unsigned long emflags;
@@ -2959,8 +2978,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	if (len == 0)
 		return -EINVAL;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
-		    GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
+			 &cached_state, GFP_NOFS);
 	em = get_extent(inode, NULL, 0, off, max - off, 0);
 	if (!em)
 		goto out;
@@ -3023,8 +3042,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 out_free:
 	free_extent_map(em);
 out:
-	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
-		      GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
+			     &cached_state, GFP_NOFS);
 	return ret;
 }
 
@@ -3264,7 +3283,8 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 }
 
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-				 struct extent_buffer *eb)
+				 struct extent_buffer *eb,
+				 struct extent_state **cached_state)
 {
 	unsigned long i;
 	struct page *page;
@@ -3274,7 +3294,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
 	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-			      GFP_NOFS);
+			      cached_state, GFP_NOFS);
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		if (page)
@@ -3334,7 +3354,8 @@ int extent_range_uptodate(struct extent_io_tree *tree,
 }
 
 int extent_buffer_uptodate(struct extent_io_tree *tree,
-			   struct extent_buffer *eb)
+			   struct extent_buffer *eb,
+			   struct extent_state *cached_state)
 {
 	int ret = 0;
 	unsigned long num_pages;
@@ -3346,7 +3367,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 		return 1;
 
 	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-			     EXTENT_UPTODATE, 1, NULL);
+			     EXTENT_UPTODATE, 1, cached_state);
 	if (ret)
 		return ret;
 
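One behavioral fix rides along in end_bio_extent_readpage() above: the completion loop now walks the bio's segment array front to back instead of back to front, so pages are completed in ascending file-offset order and the prefetch warms the page that will actually be processed next. A stripped-down sketch of the new loop shape, using the fields exactly as the hunks above do:

	struct bio_vec *bvec = bio->bi_io_vec;	/* first segment */
	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;

	do {
		/* ... finish up bvec->bv_page ... */
		if (++bvec <= bvec_end)
			prefetchw(&bvec->bv_page->flags);	/* next, not previous */
	} while (bvec <= bvec_end);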
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 36de250a7b2b..bbab4813646f 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -163,6 +163,8 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		     int bits, struct extent_state **cached, gfp_t mask);
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
+			 struct extent_state **cached, gfp_t mask);
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 		    gfp_t mask);
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
@@ -196,7 +198,7 @@ int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
 int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
 				  u64 end, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
-			gfp_t mask);
+			struct extent_state **cached_state, gfp_t mask);
 int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -281,9 +283,11 @@ int test_extent_buffer_dirty(struct extent_io_tree *tree,
 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
 			       struct extent_buffer *eb);
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-				 struct extent_buffer *eb);
+				 struct extent_buffer *eb,
+				 struct extent_state **cached_state);
 int extent_buffer_uptodate(struct extent_io_tree *tree,
-			   struct extent_buffer *eb);
+			   struct extent_buffer *eb,
+			   struct extent_state *cached_state);
 int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
 		      unsigned long min_len, char **token, char **map,
 		      unsigned long *map_start,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 428fcac45f90..28d87ba60ce8 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -35,7 +35,7 @@ void extent_map_exit(void)
 */
 void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
 {
-	tree->map.rb_node = NULL;
+	tree->map = RB_ROOT;
 	rwlock_init(&tree->lock);
 }
 
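The rb_node assignments replaced here and in the other files are a cleanup with no behavioral change: RB_ROOT is the rbtree header's compound-literal initializer for an empty tree, so assigning it zeroes the same single field the old code set by hand. From include/linux/rbtree.h:

	#define RB_ROOT	(struct rb_root) { NULL, }

	tree->map.rb_node = NULL;	/* old, open-coded form */
	tree->map = RB_ROOT;		/* new, equivalent whole-struct assignment */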
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 6ed434ac037f..ee3323c7fc1c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -123,7 +123,8 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
 	end_of_last_block = start_pos + num_bytes - 1;
-	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
+					NULL);
 	if (err)
 		return err;
 
@@ -753,6 +754,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 			 loff_t pos, unsigned long first_index,
 			 unsigned long last_index, size_t write_bytes)
 {
+	struct extent_state *cached_state = NULL;
 	int i;
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
 	struct inode *inode = fdentry(file)->d_inode;
@@ -781,16 +783,18 @@ again:
 	}
 	if (start_pos < inode->i_size) {
 		struct btrfs_ordered_extent *ordered;
-		lock_extent(&BTRFS_I(inode)->io_tree,
-			    start_pos, last_pos - 1, GFP_NOFS);
+		lock_extent_bits(&BTRFS_I(inode)->io_tree,
+				 start_pos, last_pos - 1, 0, &cached_state,
+				 GFP_NOFS);
 		ordered = btrfs_lookup_first_ordered_extent(inode,
 							    last_pos - 1);
 		if (ordered &&
 		    ordered->file_offset + ordered->len > start_pos &&
 		    ordered->file_offset < last_pos) {
 			btrfs_put_ordered_extent(ordered);
-			unlock_extent(&BTRFS_I(inode)->io_tree,
-				      start_pos, last_pos - 1, GFP_NOFS);
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     start_pos, last_pos - 1,
+					     &cached_state, GFP_NOFS);
 			for (i = 0; i < num_pages; i++) {
 				unlock_page(pages[i]);
 				page_cache_release(pages[i]);
@@ -802,12 +806,13 @@ again:
 		if (ordered)
 			btrfs_put_ordered_extent(ordered);
 
-		clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
 				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
-				  EXTENT_DO_ACCOUNTING,
+				  EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
 				  GFP_NOFS);
-		unlock_extent(&BTRFS_I(inode)->io_tree,
-			      start_pos, last_pos - 1, GFP_NOFS);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+				     start_pos, last_pos - 1, &cached_state,
+				     GFP_NOFS);
 	}
 	for (i = 0; i < num_pages; i++) {
 		clear_page_dirty_for_io(pages[i]);
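prepare_pages() also trades clear_extent_bits() for clear_extent_bit(): the former is the convenience wrapper without a cached-state argument, so threading the state through requires the full-argument variant. The relationship, sketched with the wake and delete flags as the hunk above passes them:

	/* wrapper, no cached state: */
	clear_extent_bits(tree, start, end, bits, mask);

	/* full form, reusing the cached state (wake = 0, delete = 0): */
	clear_extent_bit(tree, start, end, bits, 0, 0, &cached_state, mask);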
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cb2849f03251..dd831ed31eea 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -870,7 +870,7 @@ __btrfs_return_cluster_to_free_space(
 		tree_insert_offset(&block_group->free_space_offset,
 				   entry->offset, &entry->offset_index, 0);
 	}
-	cluster->root.rb_node = NULL;
+	cluster->root = RB_ROOT;
 
 out:
 	spin_unlock(&cluster->lock);
@@ -1355,7 +1355,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
 {
 	spin_lock_init(&cluster->lock);
 	spin_lock_init(&cluster->refill_lock);
-	cluster->root.rb_node = NULL;
+	cluster->root = RB_ROOT;
 	cluster->max_size = 0;
 	cluster->points_to_bitmap = false;
 	INIT_LIST_HEAD(&cluster->block_group_list);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4deb280f8969..02bb099845fd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -379,7 +379,8 @@ again:
 	 * change at any time if we discover bad compression ratios.
 	 */
 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
-	    btrfs_test_opt(root, COMPRESS)) {
+	    (btrfs_test_opt(root, COMPRESS) ||
+	     (BTRFS_I(inode)->force_compress))) {
 		WARN_ON(pages);
 		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
 
@@ -483,8 +484,10 @@ again:
 		nr_pages_ret = 0;
 
 		/* flag the file so we don't compress in the future */
-		if (!btrfs_test_opt(root, FORCE_COMPRESS))
+		if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
+		    !(BTRFS_I(inode)->force_compress)) {
 			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+		}
 	}
 	if (will_compress) {
 		*num_added += 1;
@@ -570,8 +573,8 @@ retry:
 			unsigned long nr_written = 0;
 
 			lock_extent(io_tree, async_extent->start,
-					 async_extent->start +
-					 async_extent->ram_size - 1, GFP_NOFS);
+				    async_extent->start +
+				    async_extent->ram_size - 1, GFP_NOFS);
 
 			/* allocate blocks */
 			ret = cow_file_range(inode, async_cow->locked_page,
@@ -1211,7 +1214,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 	else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
 					 page_started, 0, nr_written);
-	else if (!btrfs_test_opt(root, COMPRESS))
+	else if (!btrfs_test_opt(root, COMPRESS) &&
+		 !(BTRFS_I(inode)->force_compress))
 		ret = cow_file_range(inode, locked_page, start, end,
 				     page_started, nr_written, 1);
 	else
@@ -1508,12 +1512,13 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+			      struct extent_state **cached_state)
 {
 	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
 		WARN_ON(1);
 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
-				   GFP_NOFS);
+				   cached_state, GFP_NOFS);
 }
 
 /* see btrfs_writepage_start_hook for details on why this is required */
@@ -1526,6 +1531,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
 {
 	struct btrfs_writepage_fixup *fixup;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	struct page *page;
 	struct inode *inode;
 	u64 page_start;
@@ -1544,7 +1550,8 @@ again:
 	page_start = page_offset(page);
 	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
+			 &cached_state, GFP_NOFS);
 
 	/* already ordered? We're done */
 	if (PagePrivate2(page))
@@ -1552,17 +1559,18 @@ again:
 
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
-		unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
-			      page_end, GFP_NOFS);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
+				     page_end, &cached_state, GFP_NOFS);
 		unlock_page(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		goto again;
 	}
 
-	btrfs_set_extent_delalloc(inode, page_start, page_end);
+	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
 	ClearPageChecked(page);
 out:
-	unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
+			     &cached_state, GFP_NOFS);
 out_page:
 	unlock_page(page);
 	page_cache_release(page);
@@ -1691,14 +1699,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_ordered_extent *ordered_extent = NULL;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_state *cached_state = NULL;
 	int compressed = 0;
 	int ret;
 
-	ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
+	ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
+					     end - start + 1);
 	if (!ret)
 		return 0;
-
-	ordered_extent = btrfs_lookup_ordered_extent(inode, start);
 	BUG_ON(!ordered_extent);
 
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
@@ -1713,9 +1721,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 		goto out;
 	}
 
-	lock_extent(io_tree, ordered_extent->file_offset,
+	lock_extent_bits(io_tree, ordered_extent->file_offset,
 		    ordered_extent->file_offset + ordered_extent->len - 1,
-		    GFP_NOFS);
+		    0, &cached_state, GFP_NOFS);
 
 	trans = btrfs_join_transaction(root, 1);
 
@@ -1742,9 +1750,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 						ordered_extent->len);
 		BUG_ON(ret);
 	}
-	unlock_extent(io_tree, ordered_extent->file_offset,
-		      ordered_extent->file_offset + ordered_extent->len - 1,
-		      GFP_NOFS);
+	unlock_extent_cached(io_tree, ordered_extent->file_offset,
+			     ordered_extent->file_offset +
+			     ordered_extent->len - 1, &cached_state, GFP_NOFS);
+
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
 			  &ordered_extent->list);
 
@@ -2153,7 +2162,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		found_key.objectid = found_key.offset;
 		found_key.type = BTRFS_INODE_ITEM_KEY;
 		found_key.offset = 0;
-		inode = btrfs_iget(root->fs_info->sb, &found_key, root);
+		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
 		if (IS_ERR(inode))
 			break;
 
@@ -3081,6 +3090,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	char *kaddr;
 	u32 blocksize = root->sectorsize;
 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
@@ -3127,12 +3137,14 @@ again:
 	}
 	wait_on_page_writeback(page);
 
-	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
+			 GFP_NOFS);
 	set_page_extent_mapped(page);
 
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		unlock_page(page);
 		page_cache_release(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
@@ -3140,13 +3152,15 @@ again:
 		goto again;
 	}
 
-	clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
 			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
-			  GFP_NOFS);
+			  0, 0, &cached_state, GFP_NOFS);
 
-	ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+					&cached_state);
 	if (ret) {
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		goto out_unlock;
 	}
 
@@ -3159,7 +3173,8 @@ again:
 	}
 	ClearPageChecked(page);
 	set_page_dirty(page);
-	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
+			     GFP_NOFS);
 
 out_unlock:
 	if (ret)
@@ -3177,6 +3192,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map *em;
+	struct extent_state *cached_state = NULL;
 	u64 mask = root->sectorsize - 1;
 	u64 hole_start = (inode->i_size + mask) & ~mask;
 	u64 block_end = (size + mask) & ~mask;
@@ -3192,11 +3208,13 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 		struct btrfs_ordered_extent *ordered;
 		btrfs_wait_ordered_range(inode, hole_start,
 					 block_end - hole_start);
-		lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+		lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
+				 &cached_state, GFP_NOFS);
 		ordered = btrfs_lookup_ordered_extent(inode, hole_start);
 		if (!ordered)
 			break;
-		unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+		unlock_extent_cached(io_tree, hole_start, block_end - 1,
+				     &cached_state, GFP_NOFS);
 		btrfs_put_ordered_extent(ordered);
 	}
 
@@ -3241,7 +3259,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 		break;
 	}
 
-	unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
+			     GFP_NOFS);
 	return err;
 }
 
@@ -3639,6 +3658,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 	bi->index_cnt = (u64)-1;
 	bi->last_unlink_trans = 0;
 	bi->ordered_data_close = 0;
+	bi->force_compress = 0;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
@@ -3687,7 +3707,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
 * Returns in *is_new if the inode was read from disk
 */
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root)
+			 struct btrfs_root *root, int *new)
 {
 	struct inode *inode;
 
@@ -3702,6 +3722,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 
 		inode_tree_add(inode);
 		unlock_new_inode(inode);
+		if (new)
+			*new = 1;
 	}
 
 	return inode;
@@ -3754,7 +3776,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 		return NULL;
 
 	if (location.type == BTRFS_INODE_ITEM_KEY) {
-		inode = btrfs_iget(dir->i_sb, &location, root);
+		inode = btrfs_iget(dir->i_sb, &location, root, NULL);
 		return inode;
 	}
 
@@ -3769,7 +3791,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 		else
 			inode = new_simple_dir(dir->i_sb, &location, sub_root);
 	} else {
-		inode = btrfs_iget(dir->i_sb, &location, sub_root);
+		inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
 	}
 	srcu_read_unlock(&root->fs_info->subvol_srcu, index);
 
@@ -3968,7 +3990,7 @@ err:
 	return ret;
 }
 
-int btrfs_write_inode(struct inode *inode, int wait)
+int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
@@ -3977,7 +3999,7 @@ int btrfs_write_inode(struct inode *inode, int wait)
 	if (root->fs_info->btree_inode == inode)
 		return 0;
 
-	if (wait) {
+	if (wbc->sync_mode == WB_SYNC_ALL) {
 		trans = btrfs_join_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
 		ret = btrfs_commit_transaction(trans, root);
@@ -4501,7 +4523,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
 	if (err) {
 		err = -ENOSPC;
-		goto out_unlock;
+		goto out_fail;
 	}
 
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
@@ -4979,6 +5001,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 {
 	struct extent_io_tree *tree;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	u64 page_start = page_offset(page);
 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
@@ -4997,7 +5020,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 		btrfs_releasepage(page, GFP_NOFS);
 		return;
 	}
-	lock_extent(tree, page_start, page_end, GFP_NOFS);
+	lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
+			 GFP_NOFS);
 	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
 					   page_offset(page));
 	if (ordered) {
@@ -5008,7 +5032,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 		clear_extent_bit(tree, page_start, page_end,
 				 EXTENT_DIRTY | EXTENT_DELALLOC |
 				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
-				 NULL, GFP_NOFS);
+				 &cached_state, GFP_NOFS);
 		/*
 		 * whoever cleared the private bit is responsible
 		 * for the finish_ordered_io
@@ -5018,11 +5042,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 					     page_start, page_end);
 		}
 		btrfs_put_ordered_extent(ordered);
-		lock_extent(tree, page_start, page_end, GFP_NOFS);
+		cached_state = NULL;
+		lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
+				 GFP_NOFS);
 	}
 	clear_extent_bit(tree, page_start, page_end,
 		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
-		 EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS);
+		 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
 	__btrfs_releasepage(page, GFP_NOFS);
 
 	ClearPageChecked(page);
@@ -5055,6 +5081,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	char *kaddr;
 	unsigned long zero_start;
 	loff_t size;
@@ -5093,7 +5120,8 @@ again:
 	}
 	wait_on_page_writeback(page);
 
-	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
+			 GFP_NOFS);
 	set_page_extent_mapped(page);
 
 	/*
@@ -5102,7 +5130,8 @@ again:
 	 */
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		unlock_page(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		btrfs_put_ordered_extent(ordered);
@@ -5116,13 +5145,15 @@ again:
 	 * is probably a better way to do this, but for now keep consistent with
 	 * prepare_pages in the normal write path.
	 */
-	clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
 			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
-			  GFP_NOFS);
+			  0, 0, &cached_state, GFP_NOFS);
 
-	ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+					&cached_state);
 	if (ret) {
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		ret = VM_FAULT_SIGBUS;
 		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
 		goto out_unlock;
@@ -5148,7 +5179,7 @@ again:
 	BTRFS_I(inode)->last_trans = root->fs_info->generation;
 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
 
-	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
 
 out_unlock:
 	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
@@ -5827,6 +5858,7 @@ stop_trans:
 static long btrfs_fallocate(struct inode *inode, int mode,
 			    loff_t offset, loff_t len)
 {
+	struct extent_state *cached_state = NULL;
 	u64 cur_offset;
 	u64 last_byte;
 	u64 alloc_start;
@@ -5865,16 +5897,17 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		/* the extent lock is ordered inside the running
		 * transaction
		 */
-		lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
-			    GFP_NOFS);
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
+				 locked_end, 0, &cached_state, GFP_NOFS);
 		ordered = btrfs_lookup_first_ordered_extent(inode,
 							    alloc_end - 1);
 		if (ordered &&
 		    ordered->file_offset + ordered->len > alloc_start &&
 		    ordered->file_offset < alloc_end) {
 			btrfs_put_ordered_extent(ordered);
-			unlock_extent(&BTRFS_I(inode)->io_tree,
-				      alloc_start, locked_end, GFP_NOFS);
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     alloc_start, locked_end,
+					     &cached_state, GFP_NOFS);
			/*
			 * we can't wait on the range with the transaction
			 * running or with the extent lock held
@@ -5916,8 +5949,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 			break;
 		}
 	}
-	unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
-		      GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+			     &cached_state, GFP_NOFS);
 
 	btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
 				       alloc_end - alloc_start);
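Buried in the inode.c hunks is the ->write_inode conversion: the VFS now hands the filesystem the whole writeback_control instead of a bare integer, and the old wait flag maps onto the sync mode, as a sketch of the changed test:

	/* before */
	if (wait)
		/* join and commit the transaction */

	/* after: WB_SYNC_ALL is the data-integrity writeback mode */
	if (wbc->sync_mode == WB_SYNC_ALL)
		/* join and commit the transaction */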
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 645a17927a8f..2845c6ceecd2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -48,6 +48,7 @@
 #include "print-tree.h"
 #include "volumes.h"
 #include "locking.h"
+#include "ctree.h"
 
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -474,7 +475,79 @@ out_unlock:
 	return error;
 }
 
-static int btrfs_defrag_file(struct file *file)
+static int should_defrag_range(struct inode *inode, u64 start, u64 len,
+			       int thresh, u64 *last_len, u64 *skip,
+			       u64 *defrag_end)
+{
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map *em = NULL;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	int ret = 1;
+
+
+	if (thresh == 0)
+		thresh = 256 * 1024;
+
+	/*
+	 * make sure that once we start defragging an extent, we keep on
+	 * defragging it
+	 */
+	if (start < *defrag_end)
+		return 1;
+
+	*skip = 0;
+
+	/*
+	 * hopefully we have this extent in the tree already, try without
+	 * the full extent lock
+	 */
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, start, len);
+	read_unlock(&em_tree->lock);
+
+	if (!em) {
+		/* get the big lock and read metadata off disk */
+		lock_extent(io_tree, start, start + len - 1, GFP_NOFS);
+		em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+		unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
+
+		if (!em)
+			return 0;
+	}
+
+	/* this will cover holes, and inline extents */
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE)
+		ret = 0;
+
+	/*
+	 * we hit a real extent, if it is big don't bother defragging it again
+	 */
+	if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh)
+		ret = 0;
+
+	/*
+	 * last_len ends up being a counter of how many bytes we've defragged.
+	 * every time we choose not to defrag an extent, we reset *last_len
+	 * so that the next tiny extent will force a defrag.
+	 *
+	 * The end result of this is that tiny extents before a single big
+	 * extent will force at least part of that big extent to be defragged.
+	 */
+	if (ret) {
+		*last_len += len;
+		*defrag_end = extent_map_end(em);
+	} else {
+		*last_len = 0;
+		*skip = extent_map_end(em);
+		*defrag_end = 0;
+	}
+
+	free_extent_map(em);
+	return ret;
+}
+
+static int btrfs_defrag_file(struct file *file,
+			     struct btrfs_ioctl_defrag_range_args *range)
 {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -486,37 +559,96 @@ static int btrfs_defrag_file(struct file *file)
 	unsigned long total_read = 0;
 	u64 page_start;
 	u64 page_end;
+	u64 last_len = 0;
+	u64 skip = 0;
+	u64 defrag_end = 0;
 	unsigned long i;
 	int ret;
 
-	ret = btrfs_check_data_free_space(root, inode, inode->i_size);
-	if (ret)
-		return -ENOSPC;
+	if (inode->i_size == 0)
+		return 0;
+
+	if (range->start + range->len > range->start) {
+		last_index = min_t(u64, inode->i_size - 1,
+			 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
+	} else {
+		last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
+	}
+
+	i = range->start >> PAGE_CACHE_SHIFT;
+	while (i <= last_index) {
+		if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
+					PAGE_CACHE_SIZE,
+					range->extent_thresh,
+					&last_len, &skip,
+					&defrag_end)) {
+			unsigned long next;
+			/*
+			 * the should_defrag function tells us how much to
+			 * skip; bump our counter by the suggested amount
+			 */
+			next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+			i = max(i + 1, next);
+			continue;
+		}
 
-	mutex_lock(&inode->i_mutex);
-	last_index = inode->i_size >> PAGE_CACHE_SHIFT;
-	for (i = 0; i <= last_index; i++) {
 		if (total_read % ra_pages == 0) {
 			btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
 				       min(last_index, i + ra_pages - 1));
 		}
 		total_read++;
+		mutex_lock(&inode->i_mutex);
+		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
+			BTRFS_I(inode)->force_compress = 1;
+
+		ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+		if (ret) {
+			ret = -ENOSPC;
+			break;
+		}
+
+		ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+		if (ret) {
+			btrfs_free_reserved_data_space(root, inode,
+						       PAGE_CACHE_SIZE);
+			ret = -ENOSPC;
+			break;
+		}
again:
+		if (inode->i_size == 0 ||
+		    i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
+			ret = 0;
+			goto err_reservations;
+		}
+
 		page = grab_cache_page(inode->i_mapping, i);
 		if (!page)
-			goto out_unlock;
+			goto err_reservations;
+
 		if (!PageUptodate(page)) {
 			btrfs_readpage(NULL, page);
 			lock_page(page);
 			if (!PageUptodate(page)) {
 				unlock_page(page);
 				page_cache_release(page);
-				goto out_unlock;
+				goto err_reservations;
 			}
 		}
 
+		if (page->mapping != inode->i_mapping) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto again;
+		}
+
 		wait_on_page_writeback(page);
 
+		if (PageDirty(page)) {
+			btrfs_free_reserved_data_space(root, inode,
+						       PAGE_CACHE_SIZE);
+			goto loop_unlock;
+		}
+
 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
 		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
@@ -537,18 +669,54 @@ again:
 		 * page if it is dirtied again later
		 */
 		clear_page_dirty_for_io(page);
+		clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
+				  page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
+				  EXTENT_DO_ACCOUNTING, GFP_NOFS);
 
-		btrfs_set_extent_delalloc(inode, page_start, page_end);
+		btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
+		ClearPageChecked(page);
 		set_page_dirty(page);
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+
+loop_unlock:
 		unlock_page(page);
 		page_cache_release(page);
+		mutex_unlock(&inode->i_mutex);
+
+		btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 		balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
+		i++;
+	}
+
+	if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
+		filemap_flush(inode->i_mapping);
+
+	if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+		/* the filemap_flush will queue IO into the worker threads, but
+		 * we have to make sure the IO is actually started and that
+		 * ordered extents get created before we return
		 */
+		atomic_inc(&root->fs_info->async_submit_draining);
+		while (atomic_read(&root->fs_info->nr_async_submits) ||
+		       atomic_read(&root->fs_info->async_delalloc_pages)) {
+			wait_event(root->fs_info->async_submit_wait,
+				   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
+				    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
+		}
+		atomic_dec(&root->fs_info->async_submit_draining);
+
+		mutex_lock(&inode->i_mutex);
+		BTRFS_I(inode)->force_compress = 0;
710 mutex_unlock(&inode->i_mutex);
547 } 711 }
548 712
549out_unlock:
550 mutex_unlock(&inode->i_mutex);
551 return 0; 713 return 0;
714
715err_reservations:
716 mutex_unlock(&inode->i_mutex);
717 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
718 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
719 return ret;
552} 720}
553 721
554static noinline int btrfs_ioctl_resize(struct btrfs_root *root, 722static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
@@ -608,7 +776,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
608 mod = 1; 776 mod = 1;
609 sizestr++; 777 sizestr++;
610 } 778 }
611 new_size = btrfs_parse_size(sizestr); 779 new_size = memparse(sizestr, NULL);
612 if (new_size == 0) { 780 if (new_size == 0) {
613 ret = -EINVAL; 781 ret = -EINVAL;
614 goto out_unlock; 782 goto out_unlock;
@@ -743,6 +911,327 @@ out:
743 return ret; 911 return ret;
744} 912}
745 913
914static noinline int key_in_sk(struct btrfs_key *key,
915 struct btrfs_ioctl_search_key *sk)
916{
917 struct btrfs_key test;
918 int ret;
919
920 test.objectid = sk->min_objectid;
921 test.type = sk->min_type;
922 test.offset = sk->min_offset;
923
924 ret = btrfs_comp_cpu_keys(key, &test);
925 if (ret < 0)
926 return 0;
927
928 test.objectid = sk->max_objectid;
929 test.type = sk->max_type;
930 test.offset = sk->max_offset;
931
932 ret = btrfs_comp_cpu_keys(key, &test);
933 if (ret > 0)
934 return 0;
935 return 1;
936}
937
938static noinline int copy_to_sk(struct btrfs_root *root,
939 struct btrfs_path *path,
940 struct btrfs_key *key,
941 struct btrfs_ioctl_search_key *sk,
942 char *buf,
943 unsigned long *sk_offset,
944 int *num_found)
945{
946 u64 found_transid;
947 struct extent_buffer *leaf;
948 struct btrfs_ioctl_search_header sh;
949 unsigned long item_off;
950 unsigned long item_len;
951 int nritems;
952 int i;
953 int slot;
954 int found = 0;
955 int ret = 0;
956
957 leaf = path->nodes[0];
958 slot = path->slots[0];
959 nritems = btrfs_header_nritems(leaf);
960
961 if (btrfs_header_generation(leaf) > sk->max_transid) {
962 i = nritems;
963 goto advance_key;
964 }
965 found_transid = btrfs_header_generation(leaf);
966
967 for (i = slot; i < nritems; i++) {
968 item_off = btrfs_item_ptr_offset(leaf, i);
969 item_len = btrfs_item_size_nr(leaf, i);
970
971 if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
972 item_len = 0;
973
974 if (sizeof(sh) + item_len + *sk_offset >
975 BTRFS_SEARCH_ARGS_BUFSIZE) {
976 ret = 1;
977 goto overflow;
978 }
979
980 btrfs_item_key_to_cpu(leaf, key, i);
981 if (!key_in_sk(key, sk))
982 continue;
983
984 sh.objectid = key->objectid;
985 sh.offset = key->offset;
986 sh.type = key->type;
987 sh.len = item_len;
988 sh.transid = found_transid;
989
990 /* copy search result header */
991 memcpy(buf + *sk_offset, &sh, sizeof(sh));
992 *sk_offset += sizeof(sh);
993
994 if (item_len) {
995 char *p = buf + *sk_offset;
996 /* copy the item */
997 read_extent_buffer(leaf, p,
998 item_off, item_len);
999 *sk_offset += item_len;
1000 }
1001 found++;
1002
1003 if (*num_found >= sk->nr_items)
1004 break;
1005 }
1006advance_key:
1007 ret = 0;
1008 if (key->offset < (u64)-1 && key->offset < sk->max_offset)
1009 key->offset++;
1010 else if (key->type < (u8)-1 && key->type < sk->max_type) {
1011 key->offset = 0;
1012 key->type++;
1013 } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
1014 key->offset = 0;
1015 key->type = 0;
1016 key->objectid++;
1017 } else
1018 ret = 1;
1019overflow:
1020 *num_found += found;
1021 return ret;
1022}
1023
1024static noinline int search_ioctl(struct inode *inode,
1025 struct btrfs_ioctl_search_args *args)
1026{
1027 struct btrfs_root *root;
1028 struct btrfs_key key;
1029 struct btrfs_key max_key;
1030 struct btrfs_path *path;
1031 struct btrfs_ioctl_search_key *sk = &args->key;
1032 struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
1033 int ret;
1034 int num_found = 0;
1035 unsigned long sk_offset = 0;
1036
1037 path = btrfs_alloc_path();
1038 if (!path)
1039 return -ENOMEM;
1040
1041 if (sk->tree_id == 0) {
1042 /* search the root of the inode that was passed */
1043 root = BTRFS_I(inode)->root;
1044 } else {
1045 key.objectid = sk->tree_id;
1046 key.type = BTRFS_ROOT_ITEM_KEY;
1047 key.offset = (u64)-1;
1048 root = btrfs_read_fs_root_no_name(info, &key);
1049 if (IS_ERR(root)) {
1050 printk(KERN_ERR "could not find root %llu\n",
1051 sk->tree_id);
1052 btrfs_free_path(path);
1053 return -ENOENT;
1054 }
1055 }
1056
1057 key.objectid = sk->min_objectid;
1058 key.type = sk->min_type;
1059 key.offset = sk->min_offset;
1060
1061 max_key.objectid = sk->max_objectid;
1062 max_key.type = sk->max_type;
1063 max_key.offset = sk->max_offset;
1064
1065 path->keep_locks = 1;
1066
1067 while (1) {
1068 ret = btrfs_search_forward(root, &key, &max_key, path, 0,
1069 sk->min_transid);
1070 if (ret != 0) {
1071 if (ret > 0)
1072 ret = 0;
1073 goto err;
1074 }
1075 ret = copy_to_sk(root, path, &key, sk, args->buf,
1076 &sk_offset, &num_found);
1077 btrfs_release_path(root, path);
1078 if (ret || num_found >= sk->nr_items)
1079 break;
1080
1081 }
1082 ret = 0;
1083err:
1084 sk->nr_items = num_found;
1085 btrfs_free_path(path);
1086 return ret;
1087}
1088
1089static noinline int btrfs_ioctl_tree_search(struct file *file,
1090 void __user *argp)
1091{
1092 struct btrfs_ioctl_search_args *args;
1093 struct inode *inode;
1094 int ret;
1095
1096 if (!capable(CAP_SYS_ADMIN))
1097 return -EPERM;
1098
1099 args = kmalloc(sizeof(*args), GFP_KERNEL);
1100 if (!args)
1101 return -ENOMEM;
1102
1103 if (copy_from_user(args, argp, sizeof(*args))) {
1104 kfree(args);
1105 return -EFAULT;
1106 }
1107 inode = fdentry(file)->d_inode;
1108 ret = search_ioctl(inode, args);
1109 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1110 ret = -EFAULT;
1111 kfree(args);
1112 return ret;
1113}
1114
1115/*
1116 * Search INODE_REFs to identify the path name of the 'dirid' directory
1117 * in a 'tree_id' tree, and copy that path name into 'name'.
1118 */
1119static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
1120 u64 tree_id, u64 dirid, char *name)
1121{
1122 struct btrfs_root *root;
1123 struct btrfs_key key;
1124 char *ptr;
1125 int ret = -1;
1126 int slot;
1127 int len;
1128 int total_len = 0;
1129 struct btrfs_inode_ref *iref;
1130 struct extent_buffer *l;
1131 struct btrfs_path *path;
1132
1133 if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
1134 name[0] = '\0';
1135 return 0;
1136 }
1137
1138 path = btrfs_alloc_path();
1139 if (!path)
1140 return -ENOMEM;
1141
1142 ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX];
1143
1144 key.objectid = tree_id;
1145 key.type = BTRFS_ROOT_ITEM_KEY;
1146 key.offset = (u64)-1;
1147 root = btrfs_read_fs_root_no_name(info, &key);
1148 if (IS_ERR(root)) {
1149 printk(KERN_ERR "could not find root %llu\n", tree_id);
1150 ret = -ENOENT;
1151 goto out;
1152 }
1153
1154 key.objectid = dirid;
1155 key.type = BTRFS_INODE_REF_KEY;
1156 key.offset = (u64)-1;
1157
1158 while (1) {
1159 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1160 if (ret < 0)
1161 goto out;
1162
1163 l = path->nodes[0];
1164 slot = path->slots[0];
1165 if (ret > 0 && slot > 0)
1166 slot--;
1167 btrfs_item_key_to_cpu(l, &key, slot);
1168
1169 if (ret > 0 && (key.objectid != dirid ||
1170 key.type != BTRFS_INODE_REF_KEY)) {
1171 ret = -ENOENT;
1172 goto out;
1173 }
1174
1175 iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
1176 len = btrfs_inode_ref_name_len(l, iref);
1177 ptr -= len + 1;
1178 total_len += len + 1;
1179 if (ptr < name)
1180 goto out;
1181
1182 *(ptr + len) = '/';
1183 read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
1184
1185 if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
1186 break;
1187
1188 btrfs_release_path(root, path);
1189 key.objectid = key.offset;
1190 key.offset = (u64)-1;
1191 dirid = key.objectid;
1192
1193 }
1194 if (ptr < name)
1195 goto out;
1196 memcpy(name, ptr, total_len);
1197 name[total_len] = '\0';
1198 ret = 0;
1199out:
1200 btrfs_free_path(path);
1201 return ret;
1202}
1203
1204static noinline int btrfs_ioctl_ino_lookup(struct file *file,
1205 void __user *argp)
1206{
1207 struct btrfs_ioctl_ino_lookup_args *args;
1208 struct inode *inode;
1209 int ret;
1210
1211 if (!capable(CAP_SYS_ADMIN))
1212 return -EPERM;
1213
1214 args = memdup_user(argp, sizeof(*args));
1215 /* memdup_user covers both an alloc failure and a bad user pointer */
1216 if (IS_ERR(args))
1217 return PTR_ERR(args);
1218
1219 inode = fdentry(file)->d_inode;
1220
1221 if (args->treeid == 0)
1222 args->treeid = BTRFS_I(inode)->root->root_key.objectid;
1223
1224 ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
1225 args->treeid, args->objectid,
1226 args->name);
1227
1228 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1229 ret = -EFAULT;
1230
1231 kfree(args);
1232 return ret;
1233}
1234
746static noinline int btrfs_ioctl_snap_destroy(struct file *file, 1235static noinline int btrfs_ioctl_snap_destroy(struct file *file,
747 void __user *arg) 1236 void __user *arg)
748{ 1237{
@@ -849,10 +1338,11 @@ out:
849 return err; 1338 return err;
850} 1339}
851 1340
852static int btrfs_ioctl_defrag(struct file *file) 1341static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
853{ 1342{
854 struct inode *inode = fdentry(file)->d_inode; 1343 struct inode *inode = fdentry(file)->d_inode;
855 struct btrfs_root *root = BTRFS_I(inode)->root; 1344 struct btrfs_root *root = BTRFS_I(inode)->root;
1345 struct btrfs_ioctl_defrag_range_args *range;
856 int ret; 1346 int ret;
857 1347
858 ret = mnt_want_write(file->f_path.mnt); 1348 ret = mnt_want_write(file->f_path.mnt);
@@ -873,7 +1363,30 @@ static int btrfs_ioctl_defrag(struct file *file)
873 ret = -EINVAL; 1363 ret = -EINVAL;
874 goto out; 1364 goto out;
875 } 1365 }
876 btrfs_defrag_file(file); 1366
1367 range = kzalloc(sizeof(*range), GFP_KERNEL);
1368 if (!range) {
1369 ret = -ENOMEM;
1370 goto out;
1371 }
1372
1373 if (argp) {
1374 if (copy_from_user(range, argp, sizeof(*range))) {
1375 ret = -EFAULT;
1376 kfree(range);
1377 goto out;
1378 }
1379 /* compression requires us to start the IO */
1380 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
1381 range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
1382 range->extent_thresh = (u32)-1;
1383 }
1384 } else {
1385 /* the rest are all set to zero by kzalloc */
1386 range->len = (u64)-1;
1387 }
1388 btrfs_defrag_file(file, range);
1389 kfree(range);
877 break; 1390 break;
878 } 1391 }
879out: 1392out:
@@ -1274,6 +1787,157 @@ out:
1274 return ret; 1787 return ret;
1275} 1788}
1276 1789
1790static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
1791{
1792 struct inode *inode = fdentry(file)->d_inode;
1793 struct btrfs_root *root = BTRFS_I(inode)->root;
1794 struct btrfs_root *new_root;
1795 struct btrfs_dir_item *di;
1796 struct btrfs_trans_handle *trans;
1797 struct btrfs_path *path;
1798 struct btrfs_key location;
1799 struct btrfs_disk_key disk_key;
1800 struct btrfs_super_block *disk_super;
1801 u64 features;
1802 u64 objectid = 0;
1803 u64 dir_id;
1804
1805 if (!capable(CAP_SYS_ADMIN))
1806 return -EPERM;
1807
1808 if (copy_from_user(&objectid, argp, sizeof(objectid)))
1809 return -EFAULT;
1810
1811 if (!objectid)
1812 objectid = root->root_key.objectid;
1813
1814 location.objectid = objectid;
1815 location.type = BTRFS_ROOT_ITEM_KEY;
1816 location.offset = (u64)-1;
1817
1818 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
1819 if (IS_ERR(new_root))
1820 return PTR_ERR(new_root);
1821
1822 if (btrfs_root_refs(&new_root->root_item) == 0)
1823 return -ENOENT;
1824
1825 path = btrfs_alloc_path();
1826 if (!path)
1827 return -ENOMEM;
1828 path->leave_spinning = 1;
1829
1830 trans = btrfs_start_transaction(root, 1);
1831 if (!trans) {
1832 btrfs_free_path(path);
1833 return -ENOMEM;
1834 }
1835
1836 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
1837 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
1838 dir_id, "default", 7, 1);
1839 if (!di) {
1840 btrfs_free_path(path);
1841 btrfs_end_transaction(trans, root);
1842 printk(KERN_ERR "Umm, you don't have the default dir item, "
1843 "this isn't going to work\n");
1844 return -ENOENT;
1845 }
1846
1847 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
1848 btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
1849 btrfs_mark_buffer_dirty(path->nodes[0]);
1850 btrfs_free_path(path);
1851
1852 disk_super = &root->fs_info->super_copy;
1853 features = btrfs_super_incompat_flags(disk_super);
1854 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
1855 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
1856 btrfs_set_super_incompat_flags(disk_super, features);
1857 }
1858 btrfs_end_transaction(trans, root);
1859
1860 return 0;
1861}
1862
1863long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
1864{
1865 struct btrfs_ioctl_space_args space_args;
1866 struct btrfs_ioctl_space_info space;
1867 struct btrfs_ioctl_space_info *dest;
1868 struct btrfs_ioctl_space_info *dest_orig;
1869 struct btrfs_ioctl_space_info *user_dest;
1870 struct btrfs_space_info *info;
1871 int alloc_size;
1872 int ret = 0;
1873 int slot_count = 0;
1874
1875 if (copy_from_user(&space_args,
1876 (struct btrfs_ioctl_space_args __user *)arg,
1877 sizeof(space_args)))
1878 return -EFAULT;
1879
1880 /* first we count slots */
1881 rcu_read_lock();
1882 list_for_each_entry_rcu(info, &root->fs_info->space_info, list)
1883 slot_count++;
1884 rcu_read_unlock();
1885
1886 /* space_slots == 0 means they are asking for a count */
1887 if (space_args.space_slots == 0) {
1888 space_args.total_spaces = slot_count;
1889 goto out;
1890 }
1891 alloc_size = sizeof(*dest) * slot_count;
1892 /* we generally have at most 6 or so space infos, one for each raid
1893 * level. So, a whole page should be more than enough for everyone
1894 */
1895 if (alloc_size > PAGE_CACHE_SIZE)
1896 return -ENOMEM;
1897
1898 space_args.total_spaces = 0;
1899 dest = kmalloc(alloc_size, GFP_NOFS);
1900 if (!dest)
1901 return -ENOMEM;
1902 dest_orig = dest;
1903
1904 /* now we have a buffer to copy into */
1905 rcu_read_lock();
1906 list_for_each_entry_rcu(info, &root->fs_info->space_info, list) {
1907 /* make sure we don't copy more than we allocated
1908 * in our buffer
1909 */
1910 if (slot_count == 0)
1911 break;
1912 slot_count--;
1913
1914 /* make sure userland has enough room in their buffer */
1915 if (space_args.total_spaces >= space_args.space_slots)
1916 break;
1917
1918 space.flags = info->flags;
1919 space.total_bytes = info->total_bytes;
1920 space.used_bytes = info->bytes_used;
1921 memcpy(dest, &space, sizeof(space));
1922 dest++;
1923 space_args.total_spaces++;
1924 }
1925 rcu_read_unlock();
1926
1927 user_dest = (struct btrfs_ioctl_space_info __user *)
1928 (arg + sizeof(struct btrfs_ioctl_space_args));
1929
1930 if (copy_to_user(user_dest, dest_orig, alloc_size))
1931 ret = -EFAULT;
1932
1933 kfree(dest_orig);
1934out:
1935 if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
1936 ret = -EFAULT;
1937
1938 return ret;
1939}
1940
1277/* 1941/*
1278 * there are many ways the trans_start and trans_end ioctls can lead 1942 * there are many ways the trans_start and trans_end ioctls can lead
1279 * to deadlocks. They should only be used by applications that 1943 * to deadlocks. They should only be used by applications that
@@ -1320,8 +1984,12 @@ long btrfs_ioctl(struct file *file, unsigned int
1320 return btrfs_ioctl_snap_create(file, argp, 1); 1984 return btrfs_ioctl_snap_create(file, argp, 1);
1321 case BTRFS_IOC_SNAP_DESTROY: 1985 case BTRFS_IOC_SNAP_DESTROY:
1322 return btrfs_ioctl_snap_destroy(file, argp); 1986 return btrfs_ioctl_snap_destroy(file, argp);
1987 case BTRFS_IOC_DEFAULT_SUBVOL:
1988 return btrfs_ioctl_default_subvol(file, argp);
1323 case BTRFS_IOC_DEFRAG: 1989 case BTRFS_IOC_DEFRAG:
1324 return btrfs_ioctl_defrag(file); 1990 return btrfs_ioctl_defrag(file, NULL);
1991 case BTRFS_IOC_DEFRAG_RANGE:
1992 return btrfs_ioctl_defrag(file, argp);
1325 case BTRFS_IOC_RESIZE: 1993 case BTRFS_IOC_RESIZE:
1326 return btrfs_ioctl_resize(root, argp); 1994 return btrfs_ioctl_resize(root, argp);
1327 case BTRFS_IOC_ADD_DEV: 1995 case BTRFS_IOC_ADD_DEV:
@@ -1338,6 +2006,12 @@ long btrfs_ioctl(struct file *file, unsigned int
1338 return btrfs_ioctl_trans_start(file); 2006 return btrfs_ioctl_trans_start(file);
1339 case BTRFS_IOC_TRANS_END: 2007 case BTRFS_IOC_TRANS_END:
1340 return btrfs_ioctl_trans_end(file); 2008 return btrfs_ioctl_trans_end(file);
2009 case BTRFS_IOC_TREE_SEARCH:
2010 return btrfs_ioctl_tree_search(file, argp);
2011 case BTRFS_IOC_INO_LOOKUP:
2012 return btrfs_ioctl_ino_lookup(file, argp);
2013 case BTRFS_IOC_SPACE_INFO:
2014 return btrfs_ioctl_space_info(root, argp);
1341 case BTRFS_IOC_SYNC: 2015 case BTRFS_IOC_SYNC:
1342 btrfs_sync_fs(file->f_dentry->d_sb, 1); 2016 btrfs_sync_fs(file->f_dentry->d_sb, 1);
1343 return 0; 2017 return 0;
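
The copy_to_sk()/search_ioctl() hunks above page through the tree by advancing the (objectid, type, offset) triple lexicographically: carry into the next field only when the current one is saturated or has reached the caller's per-field maximum, so a repeated BTRFS_IOC_TREE_SEARCH resumes exactly where the last buffer filled up. A minimal stand-alone sketch of that advance rule, using plain stand-in types rather than the kernel's structs:

#include <stdint.h>
#include <stdio.h>

struct demo_key {		/* stand-in for struct btrfs_key */
	uint64_t objectid;
	uint8_t  type;
	uint64_t offset;
};

/* returns 0 if the key advanced, 1 if the search space is exhausted */
static int advance_key(struct demo_key *k, const struct demo_key *max)
{
	if (k->offset < UINT64_MAX && k->offset < max->offset) {
		k->offset++;
	} else if (k->type < UINT8_MAX && k->type < max->type) {
		k->offset = 0;
		k->type++;
	} else if (k->objectid < UINT64_MAX && k->objectid < max->objectid) {
		k->offset = 0;
		k->type = 0;
		k->objectid++;
	} else {
		return 1;
	}
	return 0;
}

int main(void)
{
	struct demo_key k   = { 5, 1, UINT64_MAX };
	struct demo_key max = { 6, 2, UINT64_MAX };

	/* offset is saturated, so the increment carries into type */
	advance_key(&k, &max);
	printf("%llu %u %llu\n", (unsigned long long)k.objectid,
	       k.type, (unsigned long long)k.offset);	/* prints: 5 2 0 */
	return 0;
}
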
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index bc49914475eb..424694aa517f 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,12 +30,114 @@ struct btrfs_ioctl_vol_args {
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_INO_LOOKUP_PATH_MAX 4080
34struct btrfs_ioctl_ino_lookup_args {
35 __u64 treeid;
36 __u64 objectid;
37 char name[BTRFS_INO_LOOKUP_PATH_MAX];
38};
39
40struct btrfs_ioctl_search_key {
41 /* which root are we searching. 0 is the tree of tree roots */
42 __u64 tree_id;
43
44 /* keys returned will be >= min and <= max */
45 __u64 min_objectid;
46 __u64 max_objectid;
47
48 /* keys returned will be >= min and <= max */
49 __u64 min_offset;
50 __u64 max_offset;
51
52 /* max and min transids to search for */
53 __u64 min_transid;
54 __u64 max_transid;
55
56 /* keys returned will be >= min and <= max */
57 __u32 min_type;
58 __u32 max_type;
59
60 /*
61 * how many items did userland ask for, and how many are we
62 * returning
63 */
64 __u32 nr_items;
65
66 /* align to 64 bits */
67 __u32 unused;
68
69 /* some extra for later */
70 __u64 unused1;
71 __u64 unused2;
72 __u64 unused3;
73 __u64 unused4;
74};
75
76struct btrfs_ioctl_search_header {
77 __u64 transid;
78 __u64 objectid;
79 __u64 offset;
80 __u32 type;
81 __u32 len;
82};
83
84#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
85/*
86 * the buf is an array of search headers where
87 * each header is followed by the actual item.
88 * The type field is expanded to 32 bits for alignment.
89 */
90struct btrfs_ioctl_search_args {
91 struct btrfs_ioctl_search_key key;
92 char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
93};
94
33struct btrfs_ioctl_clone_range_args { 95struct btrfs_ioctl_clone_range_args {
34 __s64 src_fd; 96 __s64 src_fd;
35 __u64 src_offset, src_length; 97 __u64 src_offset, src_length;
36 __u64 dest_offset; 98 __u64 dest_offset;
37}; 99};
38 100
101/* flags for the defrag range ioctl */
102#define BTRFS_DEFRAG_RANGE_COMPRESS 1
103#define BTRFS_DEFRAG_RANGE_START_IO 2
104
105struct btrfs_ioctl_defrag_range_args {
106 /* start of the defrag operation */
107 __u64 start;
108
109 /* number of bytes to defrag, use (u64)-1 to say all */
110 __u64 len;
111
112 /*
113 * flags for the operation, which can include turning
114 * on compression for this one defrag
115 */
116 __u64 flags;
117
118 /*
119 * any extent bigger than this will be considered
120 * already defragged. Use 0 to take the kernel default
121 * Use 1 to say every single extent must be rewritten
122 */
123 __u32 extent_thresh;
124
125 /* spare for later */
126 __u32 unused[5];
127};
128
129struct btrfs_ioctl_space_info {
130 __u64 flags;
131 __u64 total_bytes;
132 __u64 used_bytes;
133};
134
135struct btrfs_ioctl_space_args {
136 __u64 space_slots;
137 __u64 total_spaces;
138 struct btrfs_ioctl_space_info spaces[0];
139};
140
39#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 141#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
40 struct btrfs_ioctl_vol_args) 142 struct btrfs_ioctl_vol_args)
41#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 143#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -67,4 +169,13 @@ struct btrfs_ioctl_clone_range_args {
67 struct btrfs_ioctl_vol_args) 169 struct btrfs_ioctl_vol_args)
68#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ 170#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
69 struct btrfs_ioctl_vol_args) 171 struct btrfs_ioctl_vol_args)
172#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
173 struct btrfs_ioctl_defrag_range_args)
174#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
175 struct btrfs_ioctl_search_args)
176#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
177 struct btrfs_ioctl_ino_lookup_args)
178#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
179#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
180 struct btrfs_ioctl_space_args)
70#endif 181#endif
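
For reference, a hedged user-space sketch of driving the new BTRFS_IOC_DEFRAG_RANGE ioctl defined above. It assumes a local copy of this header is available as "ioctl.h", together with <linux/types.h> and <linux/ioctl.h> for the __u64 and _IOW definitions it relies on:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/types.h>
#include <linux/ioctl.h>
#include "ioctl.h"		/* assumed local copy of the header above */

int main(int argc, char **argv)
{
	struct btrfs_ioctl_defrag_range_args range;
	int fd, ret;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&range, 0, sizeof(range));
	range.start = 0;
	range.len = (__u64)-1;			/* whole file */
	range.flags = BTRFS_DEFRAG_RANGE_START_IO;
	range.extent_thresh = 0;		/* take the kernel default */

	ret = ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range);
	if (ret < 0)
		perror("BTRFS_IOC_DEFRAG_RANGE");
	close(fd);
	return ret < 0;
}
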
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5c2a9e78a949..a8ffecd0b491 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -174,7 +174,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
174 if (!entry) 174 if (!entry)
175 return -ENOMEM; 175 return -ENOMEM;
176 176
177 mutex_lock(&tree->mutex);
178 entry->file_offset = file_offset; 177 entry->file_offset = file_offset;
179 entry->start = start; 178 entry->start = start;
180 entry->len = len; 179 entry->len = len;
@@ -190,16 +189,17 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
190 INIT_LIST_HEAD(&entry->list); 189 INIT_LIST_HEAD(&entry->list);
191 INIT_LIST_HEAD(&entry->root_extent_list); 190 INIT_LIST_HEAD(&entry->root_extent_list);
192 191
192 spin_lock(&tree->lock);
193 node = tree_insert(&tree->tree, file_offset, 193 node = tree_insert(&tree->tree, file_offset,
194 &entry->rb_node); 194 &entry->rb_node);
195 BUG_ON(node); 195 BUG_ON(node);
196 spin_unlock(&tree->lock);
196 197
197 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 198 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
198 list_add_tail(&entry->root_extent_list, 199 list_add_tail(&entry->root_extent_list,
199 &BTRFS_I(inode)->root->fs_info->ordered_extents); 200 &BTRFS_I(inode)->root->fs_info->ordered_extents);
200 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 201 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
201 202
202 mutex_unlock(&tree->mutex);
203 BUG_ON(node); 203 BUG_ON(node);
204 return 0; 204 return 0;
205} 205}
@@ -216,9 +216,9 @@ int btrfs_add_ordered_sum(struct inode *inode,
216 struct btrfs_ordered_inode_tree *tree; 216 struct btrfs_ordered_inode_tree *tree;
217 217
218 tree = &BTRFS_I(inode)->ordered_tree; 218 tree = &BTRFS_I(inode)->ordered_tree;
219 mutex_lock(&tree->mutex); 219 spin_lock(&tree->lock);
220 list_add_tail(&sum->list, &entry->list); 220 list_add_tail(&sum->list, &entry->list);
221 mutex_unlock(&tree->mutex); 221 spin_unlock(&tree->lock);
222 return 0; 222 return 0;
223} 223}
224 224
@@ -232,15 +232,16 @@ int btrfs_add_ordered_sum(struct inode *inode,
232 * to make sure this function only returns 1 once for a given ordered extent. 232 * to make sure this function only returns 1 once for a given ordered extent.
233 */ 233 */
234int btrfs_dec_test_ordered_pending(struct inode *inode, 234int btrfs_dec_test_ordered_pending(struct inode *inode,
235 struct btrfs_ordered_extent **cached,
235 u64 file_offset, u64 io_size) 236 u64 file_offset, u64 io_size)
236{ 237{
237 struct btrfs_ordered_inode_tree *tree; 238 struct btrfs_ordered_inode_tree *tree;
238 struct rb_node *node; 239 struct rb_node *node;
239 struct btrfs_ordered_extent *entry; 240 struct btrfs_ordered_extent *entry = NULL;
240 int ret; 241 int ret;
241 242
242 tree = &BTRFS_I(inode)->ordered_tree; 243 tree = &BTRFS_I(inode)->ordered_tree;
243 mutex_lock(&tree->mutex); 244 spin_lock(&tree->lock);
244 node = tree_search(tree, file_offset); 245 node = tree_search(tree, file_offset);
245 if (!node) { 246 if (!node) {
246 ret = 1; 247 ret = 1;
@@ -264,7 +265,11 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
264 else 265 else
265 ret = 1; 266 ret = 1;
266out: 267out:
267 mutex_unlock(&tree->mutex); 268 if (!ret && cached && entry) {
269 *cached = entry;
270 atomic_inc(&entry->refs);
271 }
272 spin_unlock(&tree->lock);
268 return ret == 0; 273 return ret == 0;
269} 274}
270 275
@@ -291,7 +296,7 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
291 296
292/* 297/*
293 * remove an ordered extent from the tree. No references are dropped 298 * remove an ordered extent from the tree. No references are dropped
294 * and you must wake_up entry->wait. You must hold the tree mutex 299 * and you must wake_up entry->wait. You must hold the tree lock
295 * while you call this function. 300 * while you call this function.
296 */ 301 */
297static int __btrfs_remove_ordered_extent(struct inode *inode, 302static int __btrfs_remove_ordered_extent(struct inode *inode,
@@ -340,9 +345,9 @@ int btrfs_remove_ordered_extent(struct inode *inode,
340 int ret; 345 int ret;
341 346
342 tree = &BTRFS_I(inode)->ordered_tree; 347 tree = &BTRFS_I(inode)->ordered_tree;
343 mutex_lock(&tree->mutex); 348 spin_lock(&tree->lock);
344 ret = __btrfs_remove_ordered_extent(inode, entry); 349 ret = __btrfs_remove_ordered_extent(inode, entry);
345 mutex_unlock(&tree->mutex); 350 spin_unlock(&tree->lock);
346 wake_up(&entry->wait); 351 wake_up(&entry->wait);
347 352
348 return ret; 353 return ret;
@@ -567,7 +572,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
567 struct btrfs_ordered_extent *entry = NULL; 572 struct btrfs_ordered_extent *entry = NULL;
568 573
569 tree = &BTRFS_I(inode)->ordered_tree; 574 tree = &BTRFS_I(inode)->ordered_tree;
570 mutex_lock(&tree->mutex); 575 spin_lock(&tree->lock);
571 node = tree_search(tree, file_offset); 576 node = tree_search(tree, file_offset);
572 if (!node) 577 if (!node)
573 goto out; 578 goto out;
@@ -578,7 +583,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
578 if (entry) 583 if (entry)
579 atomic_inc(&entry->refs); 584 atomic_inc(&entry->refs);
580out: 585out:
581 mutex_unlock(&tree->mutex); 586 spin_unlock(&tree->lock);
582 return entry; 587 return entry;
583} 588}
584 589
@@ -594,7 +599,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
594 struct btrfs_ordered_extent *entry = NULL; 599 struct btrfs_ordered_extent *entry = NULL;
595 600
596 tree = &BTRFS_I(inode)->ordered_tree; 601 tree = &BTRFS_I(inode)->ordered_tree;
597 mutex_lock(&tree->mutex); 602 spin_lock(&tree->lock);
598 node = tree_search(tree, file_offset); 603 node = tree_search(tree, file_offset);
599 if (!node) 604 if (!node)
600 goto out; 605 goto out;
@@ -602,7 +607,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
602 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 607 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
603 atomic_inc(&entry->refs); 608 atomic_inc(&entry->refs);
604out: 609out:
605 mutex_unlock(&tree->mutex); 610 spin_unlock(&tree->lock);
606 return entry; 611 return entry;
607} 612}
608 613
@@ -629,7 +634,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
629 else 634 else
630 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); 635 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
631 636
632 mutex_lock(&tree->mutex); 637 spin_lock(&tree->lock);
633 disk_i_size = BTRFS_I(inode)->disk_i_size; 638 disk_i_size = BTRFS_I(inode)->disk_i_size;
634 639
635 /* truncate file */ 640 /* truncate file */
@@ -735,7 +740,7 @@ out:
735 */ 740 */
736 if (ordered) 741 if (ordered)
737 __btrfs_remove_ordered_extent(inode, ordered); 742 __btrfs_remove_ordered_extent(inode, ordered);
738 mutex_unlock(&tree->mutex); 743 spin_unlock(&tree->lock);
739 if (ordered) 744 if (ordered)
740 wake_up(&ordered->wait); 745 wake_up(&ordered->wait);
741 return ret; 746 return ret;
@@ -762,7 +767,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
762 if (!ordered) 767 if (!ordered)
763 return 1; 768 return 1;
764 769
765 mutex_lock(&tree->mutex); 770 spin_lock(&tree->lock);
766 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { 771 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
767 if (disk_bytenr >= ordered_sum->bytenr) { 772 if (disk_bytenr >= ordered_sum->bytenr) {
768 num_sectors = ordered_sum->len / sectorsize; 773 num_sectors = ordered_sum->len / sectorsize;
@@ -777,7 +782,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
777 } 782 }
778 } 783 }
779out: 784out:
780 mutex_unlock(&tree->mutex); 785 spin_unlock(&tree->lock);
781 btrfs_put_ordered_extent(ordered); 786 btrfs_put_ordered_extent(ordered);
782 return ret; 787 return ret;
783} 788}
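
The btrfs_dec_test_ordered_pending() change above hands a referenced entry back through 'cached', taking that reference while tree->lock is still held so the entry cannot be freed between dropping the lock and the caller using it. A generic user-space sketch of the same take-ref-under-the-lock idiom, with a pthread spinlock and C11 atomics standing in for the kernel primitives:

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct entry {
	atomic_int refs;
	/* ... payload ... */
};

struct tree {
	pthread_spinlock_t lock;
	struct entry *cached;	/* toy stand-in for the rb-tree lookup */
};

/* lookup that returns a referenced entry, mirroring the 'cached' idiom */
static struct entry *lookup_get(struct tree *t)
{
	struct entry *e;

	pthread_spin_lock(&t->lock);
	e = t->cached;
	if (e)
		atomic_fetch_add(&e->refs, 1);	/* ref taken under the lock */
	pthread_spin_unlock(&t->lock);
	return e;	/* caller must put_entry() when done */
}

static void put_entry(struct entry *e)
{
	if (e && atomic_fetch_sub(&e->refs, 1) == 1)
		free(e);
}

int main(void)
{
	struct tree t;
	struct entry *e = calloc(1, sizeof(*e));

	if (!e)
		return 1;
	atomic_init(&e->refs, 1);		/* the tree's own reference */
	pthread_spin_init(&t.lock, PTHREAD_PROCESS_PRIVATE);
	t.cached = e;

	put_entry(lookup_get(&t));		/* refs: 1 -> 2 -> 1 */
	put_entry(e);				/* refs: 1 -> 0, freed */
	return 0;
}
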
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 1fe1282ef47c..c82f76a9f040 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -21,7 +21,7 @@
21 21
22/* one of these per inode */ 22/* one of these per inode */
23struct btrfs_ordered_inode_tree { 23struct btrfs_ordered_inode_tree {
24 struct mutex mutex; 24 spinlock_t lock;
25 struct rb_root tree; 25 struct rb_root tree;
26 struct rb_node *last; 26 struct rb_node *last;
27}; 27};
@@ -128,8 +128,8 @@ static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
128static inline void 128static inline void
129btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) 129btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
130{ 130{
131 mutex_init(&t->mutex); 131 spin_lock_init(&t->lock);
132 t->tree.rb_node = NULL; 132 t->tree = RB_ROOT;
133 t->last = NULL; 133 t->last = NULL;
134} 134}
135 135
@@ -137,7 +137,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
137int btrfs_remove_ordered_extent(struct inode *inode, 137int btrfs_remove_ordered_extent(struct inode *inode,
138 struct btrfs_ordered_extent *entry); 138 struct btrfs_ordered_extent *entry);
139int btrfs_dec_test_ordered_pending(struct inode *inode, 139int btrfs_dec_test_ordered_pending(struct inode *inode,
140 u64 file_offset, u64 io_size); 140 struct btrfs_ordered_extent **cached,
141 u64 file_offset, u64 io_size);
141int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 142int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
142 u64 start, u64 len, u64 disk_len, int type); 143
143int btrfs_add_ordered_sum(struct inode *inode, 144int btrfs_add_ordered_sum(struct inode *inode,
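
Several hunks in this series (here, and in ref-cache.h, relocation.c, and transaction.c below) replace the open-coded 'rb_node = NULL' with RB_ROOT, the canonical empty-tree initializer from include/linux/rbtree.h. The two forms are equivalent; a tiny kernel-style sketch of the before and after:

#include <linux/rbtree.h>

/* before: open-coded empty tree */
static inline void demo_init_old(struct rb_root *root)
{
	root->rb_node = NULL;
}

/* after: the canonical initializer, same effect but grep-able */
static inline void demo_init_new(struct rb_root *root)
{
	*root = RB_ROOT;	/* #define RB_ROOT (struct rb_root) { NULL, } */
}
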
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index bc283ad2db73..e2a55cb2072b 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -52,7 +52,7 @@ static inline size_t btrfs_leaf_ref_size(int nr_extents)
52 52
53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree) 53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
54{ 54{
55 tree->root.rb_node = NULL; 55 tree->root = RB_ROOT;
56 INIT_LIST_HEAD(&tree->list); 56 INIT_LIST_HEAD(&tree->list);
57 spin_lock_init(&tree->lock); 57 spin_lock_init(&tree->lock);
58} 58}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ab7ab5318745..0b23942cbc0d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -170,14 +170,14 @@ struct async_merge {
170 170
171static void mapping_tree_init(struct mapping_tree *tree) 171static void mapping_tree_init(struct mapping_tree *tree)
172{ 172{
173 tree->rb_root.rb_node = NULL; 173 tree->rb_root = RB_ROOT;
174 spin_lock_init(&tree->lock); 174 spin_lock_init(&tree->lock);
175} 175}
176 176
177static void backref_cache_init(struct backref_cache *cache) 177static void backref_cache_init(struct backref_cache *cache)
178{ 178{
179 int i; 179 int i;
180 cache->rb_root.rb_node = NULL; 180 cache->rb_root = RB_ROOT;
181 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 181 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
182 INIT_LIST_HEAD(&cache->pending[i]); 182 INIT_LIST_HEAD(&cache->pending[i]);
183 spin_lock_init(&cache->lock); 183 spin_lock_init(&cache->lock);
@@ -2659,7 +2659,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2659 EXTENT_BOUNDARY, GFP_NOFS); 2659 EXTENT_BOUNDARY, GFP_NOFS);
2660 nr++; 2660 nr++;
2661 } 2661 }
2662 btrfs_set_extent_delalloc(inode, page_start, page_end); 2662 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2663 2663
2664 set_page_dirty(page); 2664 set_page_dirty(page);
2665 dirty_page++; 2665 dirty_page++;
@@ -3487,7 +3487,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3487 key.objectid = objectid; 3487 key.objectid = objectid;
3488 key.type = BTRFS_INODE_ITEM_KEY; 3488 key.type = BTRFS_INODE_ITEM_KEY;
3489 key.offset = 0; 3489 key.offset = 0;
3490 inode = btrfs_iget(root->fs_info->sb, &key, root); 3490 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
3491 BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); 3491 BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
3492 BTRFS_I(inode)->index_cnt = group->key.objectid; 3492 BTRFS_I(inode)->index_cnt = group->key.objectid;
3493 3493
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8a1ea6e64575..9ac612e6ca60 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -63,10 +63,10 @@ static void btrfs_put_super(struct super_block *sb)
63} 63}
64 64
65enum { 65enum {
66 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 66 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
67 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 67 Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start,
68 Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, 68 Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool,
69 Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio, 69 Opt_noacl, Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio,
70 Opt_flushoncommit, 70 Opt_flushoncommit,
71 Opt_discard, Opt_err, 71 Opt_discard, Opt_err,
72}; 72};
@@ -74,6 +74,7 @@ enum {
74static match_table_t tokens = { 74static match_table_t tokens = {
75 {Opt_degraded, "degraded"}, 75 {Opt_degraded, "degraded"},
76 {Opt_subvol, "subvol=%s"}, 76 {Opt_subvol, "subvol=%s"},
77 {Opt_subvolid, "subvolid=%d"},
77 {Opt_device, "device=%s"}, 78 {Opt_device, "device=%s"},
78 {Opt_nodatasum, "nodatasum"}, 79 {Opt_nodatasum, "nodatasum"},
79 {Opt_nodatacow, "nodatacow"}, 80 {Opt_nodatacow, "nodatacow"},
@@ -95,31 +96,6 @@ static match_table_t tokens = {
95 {Opt_err, NULL}, 96 {Opt_err, NULL},
96}; 97};
97 98
98u64 btrfs_parse_size(char *str)
99{
100 u64 res;
101 int mult = 1;
102 char *end;
103 char last;
104
105 res = simple_strtoul(str, &end, 10);
106
107 last = end[0];
108 if (isalpha(last)) {
109 last = tolower(last);
110 switch (last) {
111 case 'g':
112 mult *= 1024;
113 case 'm':
114 mult *= 1024;
115 case 'k':
116 mult *= 1024;
117 }
118 res = res * mult;
119 }
120 return res;
121}
122
123/* 99/*
124 * Regular mount options parser. Everything that is needed only when 100 * Regular mount options parser. Everything that is needed only when
125 * reading in a new superblock is parsed here. 101 * reading in a new superblock is parsed here.
@@ -128,7 +104,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
128{ 104{
129 struct btrfs_fs_info *info = root->fs_info; 105 struct btrfs_fs_info *info = root->fs_info;
130 substring_t args[MAX_OPT_ARGS]; 106 substring_t args[MAX_OPT_ARGS];
131 char *p, *num; 107 char *p, *num, *orig;
132 int intarg; 108 int intarg;
133 int ret = 0; 109 int ret = 0;
134 110
@@ -143,6 +119,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
143 if (!options) 119 if (!options)
144 return -ENOMEM; 120 return -ENOMEM;
145 121
122 orig = options;
146 123
147 while ((p = strsep(&options, ",")) != NULL) { 124 while ((p = strsep(&options, ",")) != NULL) {
148 int token; 125 int token;
@@ -156,6 +133,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
156 btrfs_set_opt(info->mount_opt, DEGRADED); 133 btrfs_set_opt(info->mount_opt, DEGRADED);
157 break; 134 break;
158 case Opt_subvol: 135 case Opt_subvol:
136 case Opt_subvolid:
159 case Opt_device: 137 case Opt_device:
160 /* 138 /*
161 * These are parsed by btrfs_parse_early_options 139 * These are parsed by btrfs_parse_early_options
@@ -213,7 +191,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
213 case Opt_max_extent: 191 case Opt_max_extent:
214 num = match_strdup(&args[0]); 192 num = match_strdup(&args[0]);
215 if (num) { 193 if (num) {
216 info->max_extent = btrfs_parse_size(num); 194 info->max_extent = memparse(num, NULL);
217 kfree(num); 195 kfree(num);
218 196
219 info->max_extent = max_t(u64, 197 info->max_extent = max_t(u64,
@@ -225,7 +203,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
225 case Opt_max_inline: 203 case Opt_max_inline:
226 num = match_strdup(&args[0]); 204 num = match_strdup(&args[0]);
227 if (num) { 205 if (num) {
228 info->max_inline = btrfs_parse_size(num); 206 info->max_inline = memparse(num, NULL);
229 kfree(num); 207 kfree(num);
230 208
231 if (info->max_inline) { 209 if (info->max_inline) {
@@ -240,7 +218,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
240 case Opt_alloc_start: 218 case Opt_alloc_start:
241 num = match_strdup(&args[0]); 219 num = match_strdup(&args[0]);
242 if (num) { 220 if (num) {
243 info->alloc_start = btrfs_parse_size(num); 221 info->alloc_start = memparse(num, NULL);
244 kfree(num); 222 kfree(num);
245 printk(KERN_INFO 223 printk(KERN_INFO
246 "btrfs: allocations start at %llu\n", 224 "btrfs: allocations start at %llu\n",
@@ -280,7 +258,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
280 } 258 }
281 } 259 }
282out: 260out:
283 kfree(options); 261 kfree(orig);
284 return ret; 262 return ret;
285} 263}
286 264
@@ -291,12 +269,13 @@ out:
291 * only when we need to allocate a new super block. 269 * only when we need to allocate a new super block.
292 */ 270 */
293static int btrfs_parse_early_options(const char *options, fmode_t flags, 271static int btrfs_parse_early_options(const char *options, fmode_t flags,
294 void *holder, char **subvol_name, 272 void *holder, char **subvol_name, u64 *subvol_objectid,
295 struct btrfs_fs_devices **fs_devices) 273 struct btrfs_fs_devices **fs_devices)
296{ 274{
297 substring_t args[MAX_OPT_ARGS]; 275 substring_t args[MAX_OPT_ARGS];
298 char *opts, *p; 276 char *opts, *p;
299 int error = 0; 277 int error = 0;
278 int intarg;
300 279
301 if (!options) 280 if (!options)
302 goto out; 281 goto out;
@@ -319,6 +298,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
319 case Opt_subvol: 298 case Opt_subvol:
320 *subvol_name = match_strdup(&args[0]); 299 *subvol_name = match_strdup(&args[0]);
321 break; 300 break;
301 case Opt_subvolid:
302 intarg = 0;
303 error = match_int(&args[0], &intarg);
304 if (!error) {
305 /* we want the original fs_tree */
306 if (!intarg)
307 *subvol_objectid =
308 BTRFS_FS_TREE_OBJECTID;
309 else
310 *subvol_objectid = intarg;
311 }
312 break;
322 case Opt_device: 313 case Opt_device:
323 error = btrfs_scan_one_device(match_strdup(&args[0]), 314 error = btrfs_scan_one_device(match_strdup(&args[0]),
324 flags, holder, fs_devices); 315 flags, holder, fs_devices);
@@ -346,6 +337,110 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
346 return error; 337 return error;
347} 338}
348 339
340static struct dentry *get_default_root(struct super_block *sb,
341 u64 subvol_objectid)
342{
343 struct btrfs_root *root = sb->s_fs_info;
344 struct btrfs_root *new_root;
345 struct btrfs_dir_item *di;
346 struct btrfs_path *path;
347 struct btrfs_key location;
348 struct inode *inode;
349 struct dentry *dentry;
350 u64 dir_id;
351 int new = 0;
352
353 /*
354 * We have a specific subvol we want to mount, just setup location and
355 * go look up the root.
356 */
357 if (subvol_objectid) {
358 location.objectid = subvol_objectid;
359 location.type = BTRFS_ROOT_ITEM_KEY;
360 location.offset = (u64)-1;
361 goto find_root;
362 }
363
364 path = btrfs_alloc_path();
365 if (!path)
366 return ERR_PTR(-ENOMEM);
367 path->leave_spinning = 1;
368
369 /*
370 * Find the "default" dir item which points to the root item that we
371 * will mount by default if we haven't been given a specific subvolume
372 * to mount.
373 */
374 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
375 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
376 if (!di) {
377 /*
378 * Ok the default dir item isn't there. This is weird since
379 * it's always been there, but don't freak out, just try to
380 * mount the root-most subvolume.
381 */
382 btrfs_free_path(path);
383 dir_id = BTRFS_FIRST_FREE_OBJECTID;
384 new_root = root->fs_info->fs_root;
385 goto setup_root;
386 }
387
388 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
389 btrfs_free_path(path);
390
391find_root:
392 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
393 if (IS_ERR(new_root))
394 return ERR_CAST(new_root);
395
396 if (btrfs_root_refs(&new_root->root_item) == 0)
397 return ERR_PTR(-ENOENT);
398
399 dir_id = btrfs_root_dirid(&new_root->root_item);
400setup_root:
401 location.objectid = dir_id;
402 location.type = BTRFS_INODE_ITEM_KEY;
403 location.offset = 0;
404
405 inode = btrfs_iget(sb, &location, new_root, &new);
406 if (IS_ERR(inode))
407 return ERR_CAST(inode);
408
409 /*
410 * If we're just mounting the root-most subvol, put the inode and return
411 * a reference to the dentry. We will have already gotten a reference
412 * to the inode in btrfs_fill_super so we're good to go.
413 */
414 if (!new && sb->s_root->d_inode == inode) {
415 iput(inode);
416 return dget(sb->s_root);
417 }
418
419 if (new) {
420 const struct qstr name = { .name = "/", .len = 1 };
421
422 /*
423 * New inode, we need to make the dentry a sibling of s_root so
424 * everything gets cleaned up properly on unmount.
425 */
426 dentry = d_alloc(sb->s_root, &name);
427 if (!dentry) {
428 iput(inode);
429 return ERR_PTR(-ENOMEM);
430 }
431 d_splice_alias(inode, dentry);
432 } else {
433 /*
434 * We found the inode in cache, just find a dentry for it and
435 * put the reference to the inode we just got.
436 */
437 dentry = d_find_alias(inode);
438 iput(inode);
439 }
440
441 return dentry;
442}
443
349static int btrfs_fill_super(struct super_block *sb, 444static int btrfs_fill_super(struct super_block *sb,
350 struct btrfs_fs_devices *fs_devices, 445 struct btrfs_fs_devices *fs_devices,
351 void *data, int silent) 446 void *data, int silent)
@@ -379,7 +474,7 @@ static int btrfs_fill_super(struct super_block *sb,
379 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 474 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
380 key.type = BTRFS_INODE_ITEM_KEY; 475 key.type = BTRFS_INODE_ITEM_KEY;
381 key.offset = 0; 476 key.offset = 0;
382 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root); 477 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
383 if (IS_ERR(inode)) { 478 if (IS_ERR(inode)) {
384 err = PTR_ERR(inode); 479 err = PTR_ERR(inode);
385 goto fail_close; 480 goto fail_close;
@@ -391,12 +486,6 @@ static int btrfs_fill_super(struct super_block *sb,
391 err = -ENOMEM; 486 err = -ENOMEM;
392 goto fail_close; 487 goto fail_close;
393 } 488 }
394#if 0
395 /* this does the super kobj at the same time */
396 err = btrfs_sysfs_add_super(tree_root->fs_info);
397 if (err)
398 goto fail_close;
399#endif
400 489
401 sb->s_root = root_dentry; 490 sb->s_root = root_dentry;
402 491
@@ -488,19 +577,22 @@ static int btrfs_test_super(struct super_block *s, void *data)
488static int btrfs_get_sb(struct file_system_type *fs_type, int flags, 577static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
489 const char *dev_name, void *data, struct vfsmount *mnt) 578 const char *dev_name, void *data, struct vfsmount *mnt)
490{ 579{
491 char *subvol_name = NULL;
492 struct block_device *bdev = NULL; 580 struct block_device *bdev = NULL;
493 struct super_block *s; 581 struct super_block *s;
494 struct dentry *root; 582 struct dentry *root;
495 struct btrfs_fs_devices *fs_devices = NULL; 583 struct btrfs_fs_devices *fs_devices = NULL;
496 fmode_t mode = FMODE_READ; 584 fmode_t mode = FMODE_READ;
585 char *subvol_name = NULL;
586 u64 subvol_objectid = 0;
497 int error = 0; 587 int error = 0;
588 int found = 0;
498 589
499 if (!(flags & MS_RDONLY)) 590 if (!(flags & MS_RDONLY))
500 mode |= FMODE_WRITE; 591 mode |= FMODE_WRITE;
501 592
502 error = btrfs_parse_early_options(data, mode, fs_type, 593 error = btrfs_parse_early_options(data, mode, fs_type,
503 &subvol_name, &fs_devices); 594 &subvol_name, &subvol_objectid,
595 &fs_devices);
504 if (error) 596 if (error)
505 return error; 597 return error;
506 598
@@ -529,6 +621,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
529 goto error_close_devices; 621 goto error_close_devices;
530 } 622 }
531 623
624 found = 1;
532 btrfs_close_devices(fs_devices); 625 btrfs_close_devices(fs_devices);
533 } else { 626 } else {
534 char b[BDEVNAME_SIZE]; 627 char b[BDEVNAME_SIZE];
@@ -546,25 +639,35 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
546 s->s_flags |= MS_ACTIVE; 639 s->s_flags |= MS_ACTIVE;
547 } 640 }
548 641
549 if (!strcmp(subvol_name, ".")) 642 root = get_default_root(s, subvol_objectid);
550 root = dget(s->s_root); 643 if (IS_ERR(root)) {
551 else { 644 error = PTR_ERR(root);
552 mutex_lock(&s->s_root->d_inode->i_mutex); 645 deactivate_locked_super(s);
553 root = lookup_one_len(subvol_name, s->s_root, 646 goto error;
647 }
648 /* if they gave us a subvolume name bind mount into that */
649 if (strcmp(subvol_name, ".")) {
650 struct dentry *new_root;
651 mutex_lock(&root->d_inode->i_mutex);
652 new_root = lookup_one_len(subvol_name, root,
554 strlen(subvol_name)); 653 strlen(subvol_name));
555 mutex_unlock(&s->s_root->d_inode->i_mutex); 654 mutex_unlock(&root->d_inode->i_mutex);
556 655
557 if (IS_ERR(root)) { 656 if (IS_ERR(new_root)) {
558 deactivate_locked_super(s); 657 deactivate_locked_super(s);
559 error = PTR_ERR(root); 658 error = PTR_ERR(new_root);
560 goto error_free_subvol_name; 659 dput(root);
660 goto error_close_devices;
561 } 661 }
562 if (!root->d_inode) { 662 if (!new_root->d_inode) {
563 dput(root); 663 dput(root);
664 dput(new_root);
564 deactivate_locked_super(s); 665 deactivate_locked_super(s);
565 error = -ENXIO; 666 error = -ENXIO;
566 goto error_free_subvol_name; 667 goto error_close_devices;
567 } 668 }
669 dput(root);
670 root = new_root;
568 } 671 }
569 672
570 mnt->mnt_sb = s; 673 mnt->mnt_sb = s;
@@ -579,6 +682,7 @@ error_close_devices:
579 btrfs_close_devices(fs_devices); 682 btrfs_close_devices(fs_devices);
580error_free_subvol_name: 683error_free_subvol_name:
581 kfree(subvol_name); 684 kfree(subvol_name);
685error:
582 return error; 686 return error;
583} 687}
584 688
@@ -623,14 +727,37 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
623{ 727{
624 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 728 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
625 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 729 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
730 struct list_head *head = &root->fs_info->space_info;
731 struct btrfs_space_info *found;
732 u64 total_used = 0;
733 u64 data_used = 0;
626 int bits = dentry->d_sb->s_blocksize_bits; 734 int bits = dentry->d_sb->s_blocksize_bits;
627 __be32 *fsid = (__be32 *)root->fs_info->fsid; 735 __be32 *fsid = (__be32 *)root->fs_info->fsid;
628 736
737 rcu_read_lock();
738 list_for_each_entry_rcu(found, head, list) {
739 if (found->flags & (BTRFS_BLOCK_GROUP_DUP|
740 BTRFS_BLOCK_GROUP_RAID10|
741 BTRFS_BLOCK_GROUP_RAID1)) {
742 total_used += found->bytes_used;
743 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
744 data_used += found->bytes_used;
745 else
746 data_used += found->total_bytes;
747 }
748
749 total_used += found->bytes_used;
750 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
751 data_used += found->bytes_used;
752 else
753 data_used += found->total_bytes;
754 }
755 rcu_read_unlock();
756
629 buf->f_namelen = BTRFS_NAME_LEN; 757 buf->f_namelen = BTRFS_NAME_LEN;
630 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 758 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
631 buf->f_bfree = buf->f_blocks - 759 buf->f_bfree = buf->f_blocks - (total_used >> bits);
632 (btrfs_super_bytes_used(disk_super) >> bits); 760 buf->f_bavail = buf->f_blocks - (data_used >> bits);
633 buf->f_bavail = buf->f_bfree;
634 buf->f_bsize = dentry->d_sb->s_blocksize; 761 buf->f_bsize = dentry->d_sb->s_blocksize;
635 buf->f_type = BTRFS_SUPER_MAGIC; 762 buf->f_type = BTRFS_SUPER_MAGIC;
636 763
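
The removed btrfs_parse_size() is superseded by memparse(), the shared helper from lib/cmdline.c; besides K/M/G it also understands T/P/E suffixes and hex input. A user-space stand-in that mirrors its behavior, for illustration only:

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>

/* user-space approximation of the kernel's
 * unsigned long long memparse(const char *ptr, char **retptr) */
static unsigned long long my_memparse(const char *ptr, char **retptr)
{
	char *end;
	unsigned long long ret = strtoull(ptr, &end, 0);	/* base 0: hex ok */

	switch (tolower((unsigned char)*end)) {
	case 'e': ret <<= 10;	/* fall through */
	case 'p': ret <<= 10;	/* fall through */
	case 't': ret <<= 10;	/* fall through */
	case 'g': ret <<= 10;	/* fall through */
	case 'm': ret <<= 10;	/* fall through */
	case 'k': ret <<= 10;
		end++;
	default:
		break;
	}
	if (retptr)
		*retptr = end;
	return ret;
}

int main(void)
{
	printf("%llu\n", my_memparse("10G", NULL));	/* 10737418240 */
	return 0;
}
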
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index a240b6fa81df..4ce16ef702a3 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -164,12 +164,12 @@ static void btrfs_root_release(struct kobject *kobj)
164 complete(&root->kobj_unregister); 164 complete(&root->kobj_unregister);
165} 165}
166 166
167static struct sysfs_ops btrfs_super_attr_ops = { 167static const struct sysfs_ops btrfs_super_attr_ops = {
168 .show = btrfs_super_attr_show, 168 .show = btrfs_super_attr_show,
169 .store = btrfs_super_attr_store, 169 .store = btrfs_super_attr_store,
170}; 170};
171 171
172static struct sysfs_ops btrfs_root_attr_ops = { 172static const struct sysfs_ops btrfs_root_attr_ops = {
173 .show = btrfs_root_attr_show, 173 .show = btrfs_root_attr_show,
174 .store = btrfs_root_attr_store, 174 .store = btrfs_root_attr_store,
175}; 175};
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b2acc79f1b34..2d654c1c794d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -69,7 +69,7 @@ static noinline int join_transaction(struct btrfs_root *root)
69 cur_trans->commit_done = 0; 69 cur_trans->commit_done = 0;
70 cur_trans->start_time = get_seconds(); 70 cur_trans->start_time = get_seconds();
71 71
72 cur_trans->delayed_refs.root.rb_node = NULL; 72 cur_trans->delayed_refs.root = RB_ROOT;
73 cur_trans->delayed_refs.num_entries = 0; 73 cur_trans->delayed_refs.num_entries = 0;
74 cur_trans->delayed_refs.num_heads_ready = 0; 74 cur_trans->delayed_refs.num_heads_ready = 0;
75 cur_trans->delayed_refs.num_heads = 0; 75 cur_trans->delayed_refs.num_heads = 0;
@@ -997,13 +997,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
997 997
998 mutex_unlock(&root->fs_info->trans_mutex); 998 mutex_unlock(&root->fs_info->trans_mutex);
999 999
1000 if (flush_on_commit) { 1000 if (flush_on_commit || snap_pending) {
1001 btrfs_start_delalloc_inodes(root, 1); 1001 btrfs_start_delalloc_inodes(root, 1);
1002 ret = btrfs_wait_ordered_extents(root, 0, 1); 1002 ret = btrfs_wait_ordered_extents(root, 0, 1);
1003 BUG_ON(ret); 1003 BUG_ON(ret);
1004 } else if (snap_pending) {
1005 ret = btrfs_wait_ordered_extents(root, 0, 1);
1006 BUG_ON(ret);
1007 } 1004 }
1008 1005
1009 /* 1006 /*
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4a9434b622ec..1255fcc8ade5 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -445,7 +445,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
445 key.objectid = objectid; 445 key.objectid = objectid;
446 key.type = BTRFS_INODE_ITEM_KEY; 446 key.type = BTRFS_INODE_ITEM_KEY;
447 key.offset = 0; 447 key.offset = 0;
448 inode = btrfs_iget(root->fs_info->sb, &key, root); 448 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
449 if (IS_ERR(inode)) { 449 if (IS_ERR(inode)) {
450 inode = NULL; 450 inode = NULL;
451 } else if (is_bad_inode(inode)) { 451 } else if (is_bad_inode(inode)) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 41ecbb2347f2..9df8e3f1ccab 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -256,13 +256,13 @@ loop_lock:
256 wake_up(&fs_info->async_submit_wait); 256 wake_up(&fs_info->async_submit_wait);
257 257
258 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 258 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
259 submit_bio(cur->bi_rw, cur);
260 num_run++;
261 batch_run++;
262 259
263 if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) 260 if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
264 num_sync_run++; 261 num_sync_run++;
265 262
263 submit_bio(cur->bi_rw, cur);
264 num_run++;
265 batch_run++;
266 if (need_resched()) { 266 if (need_resched()) {
267 if (num_sync_run) { 267 if (num_sync_run) {
268 blk_run_backing_dev(bdi, NULL); 268 blk_run_backing_dev(bdi, NULL);
@@ -325,16 +325,6 @@ loop_lock:
325 num_sync_run = 0; 325 num_sync_run = 0;
326 blk_run_backing_dev(bdi, NULL); 326 blk_run_backing_dev(bdi, NULL);
327 } 327 }
328
329 cond_resched();
330 if (again)
331 goto loop;
332
333 spin_lock(&device->io_lock);
334 if (device->pending_bios.head || device->pending_sync_bios.head)
335 goto loop_lock;
336 spin_unlock(&device->io_lock);
337
338 /* 328 /*
339 * IO has already been through a long path to get here. Checksumming, 329 * IO has already been through a long path to get here. Checksumming,
340 * async helper threads, perhaps compression. We've done a pretty 330 * async helper threads, perhaps compression. We've done a pretty
@@ -346,6 +336,16 @@ loop_lock:
346 * cared about found its way down here. 336 * cared about found its way down here.
347 */ 337 */
348 blk_run_backing_dev(bdi, NULL); 338 blk_run_backing_dev(bdi, NULL);
339
340 cond_resched();
341 if (again)
342 goto loop;
343
344 spin_lock(&device->io_lock);
345 if (device->pending_bios.head || device->pending_sync_bios.head)
346 goto loop_lock;
347 spin_unlock(&device->io_lock);
348
349done: 349done:
350 return 0; 350 return 0;
351} 351}
@@ -365,6 +365,7 @@ static noinline int device_list_add(const char *path,
365 struct btrfs_device *device; 365 struct btrfs_device *device;
366 struct btrfs_fs_devices *fs_devices; 366 struct btrfs_fs_devices *fs_devices;
367 u64 found_transid = btrfs_super_generation(disk_super); 367 u64 found_transid = btrfs_super_generation(disk_super);
368 char *name;
368 369
369 fs_devices = find_fsid(disk_super->fsid); 370 fs_devices = find_fsid(disk_super->fsid);
370 if (!fs_devices) { 371 if (!fs_devices) {
@@ -411,6 +412,12 @@ static noinline int device_list_add(const char *path,
411 412
412 device->fs_devices = fs_devices; 413 device->fs_devices = fs_devices;
413 fs_devices->num_devices++; 414 fs_devices->num_devices++;
415 } else if (strcmp(device->name, path)) {
416 name = kstrdup(path, GFP_NOFS);
417 if (!name)
418 return -ENOMEM;
419 kfree(device->name);
420 device->name = name;
414 } 421 }
415 422
416 if (found_transid > fs_devices->latest_trans) { 423 if (found_transid > fs_devices->latest_trans) {
@@ -592,7 +599,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
592 goto error_close; 599 goto error_close;
593 600
594 disk_super = (struct btrfs_super_block *)bh->b_data; 601 disk_super = (struct btrfs_super_block *)bh->b_data;
595 devid = le64_to_cpu(disk_super->dev_item.devid); 602 devid = btrfs_stack_device_id(&disk_super->dev_item);
596 if (devid != device->devid) 603 if (devid != device->devid)
597 goto error_brelse; 604 goto error_brelse;
598 605
@@ -694,7 +701,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
694 goto error_close; 701 goto error_close;
695 } 702 }
696 disk_super = (struct btrfs_super_block *)bh->b_data; 703 disk_super = (struct btrfs_super_block *)bh->b_data;
697 devid = le64_to_cpu(disk_super->dev_item.devid); 704 devid = btrfs_stack_device_id(&disk_super->dev_item);
698 transid = btrfs_super_generation(disk_super); 705 transid = btrfs_super_generation(disk_super);
699 if (disk_super->label[0]) 706 if (disk_super->label[0])
700 printk(KERN_INFO "device label %s ", disk_super->label); 707 printk(KERN_INFO "device label %s ", disk_super->label);
@@ -1187,7 +1194,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1187 goto error_close; 1194 goto error_close;
1188 } 1195 }
1189 disk_super = (struct btrfs_super_block *)bh->b_data; 1196 disk_super = (struct btrfs_super_block *)bh->b_data;
1190 devid = le64_to_cpu(disk_super->dev_item.devid); 1197 devid = btrfs_stack_device_id(&disk_super->dev_item);
1191 dev_uuid = disk_super->dev_item.uuid; 1198 dev_uuid = disk_super->dev_item.uuid;
1192 device = btrfs_find_device(root, devid, dev_uuid, 1199 device = btrfs_find_device(root, devid, dev_uuid,
1193 disk_super->fsid); 1200 disk_super->fsid);
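
Among the volumes.c hunks above, device_list_add() now refreshes a device's recorded path when a rescan finds it under a new name, and the raw le64_to_cpu() reads are replaced by the btrfs_stack_device_id() accessor. The name update allocates before it frees, so an allocation failure leaves the old name intact. A userspace sketch of that ordering, with strdup standing in for kstrdup(..., GFP_NOFS):

	#include <errno.h>
	#include <stdlib.h>
	#include <string.h>

	static int update_name(char **namep, const char *path)
	{
		char *name;

		if (strcmp(*namep, path) == 0)
			return 0;		/* path unchanged, nothing to do */
		name = strdup(path);		/* duplicate first... */
		if (!name)
			return -ENOMEM;		/* ...so failure keeps the old name valid */
		free(*namep);			/* only now drop the old string */
		*namep = name;
		return 0;
	}
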
diff --git a/fs/buffer.c b/fs/buffer.c
index 6fa530256bfd..c9c266db0624 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2893,7 +2893,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2893 2893
2894 /* 2894 /*
2895 * The page straddles i_size. It must be zeroed out on each and every 2895 * The page straddles i_size. It must be zeroed out on each and every
2896 * writepage invokation because it may be mmapped. "A file is mapped 2896 * writepage invocation because it may be mmapped. "A file is mapped
2897 * in multiples of the page size. For a file that is not a multiple of 2897 * in multiples of the page size. For a file that is not a multiple of
2898 * the page size, the remaining memory is zeroed when mapped, and 2898 * the page size, the remaining memory is zeroed when mapped, and
2899 * writes to that region are not written out to the file." 2899 * writes to that region are not written out to the file."
@@ -3265,7 +3265,7 @@ static void recalc_bh_state(void)
3265 3265
3266struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) 3266struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3267{ 3267{
3268 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); 3268 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3269 if (ret) { 3269 if (ret) {
3270 INIT_LIST_HEAD(&ret->b_assoc_buffers); 3270 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3271 get_cpu_var(bh_accounting).nr++; 3271 get_cpu_var(bh_accounting).nr++;
@@ -3352,15 +3352,6 @@ int bh_submit_read(struct buffer_head *bh)
3352} 3352}
3353EXPORT_SYMBOL(bh_submit_read); 3353EXPORT_SYMBOL(bh_submit_read);
3354 3354
3355static void
3356init_buffer_head(void *data)
3357{
3358 struct buffer_head *bh = data;
3359
3360 memset(bh, 0, sizeof(*bh));
3361 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3362}
3363
3364void __init buffer_init(void) 3355void __init buffer_init(void)
3365{ 3356{
3366 int nrpages; 3357 int nrpages;
@@ -3369,7 +3360,7 @@ void __init buffer_init(void)
3369 sizeof(struct buffer_head), 0, 3360 sizeof(struct buffer_head), 0,
3370 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| 3361 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3371 SLAB_MEM_SPREAD), 3362 SLAB_MEM_SPREAD),
3372 init_buffer_head); 3363 NULL);
3373 3364
3374 /* 3365 /*
3375 * Limit the bh occupancy to 10% of ZONE_NORMAL 3366 * Limit the bh occupancy to 10% of ZONE_NORMAL
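
The buffer.c hunks replace the slab constructor with kmem_cache_zalloc(): rather than a ctor zeroing each buffer_head when the slab instantiates it, every allocation now returns zeroed memory outright (alloc_buffer_head() still does the INIT_LIST_HEAD). A userspace sketch of the before/after shape, with malloc/calloc standing in for the slab APIs:

	#include <stdlib.h>
	#include <string.h>

	struct obj { long state; void *assoc; };

	/* old shape: allocate, then a constructor zeroes the object */
	static struct obj *alloc_with_ctor(void)
	{
		struct obj *o = malloc(sizeof(*o));
		if (o)
			memset(o, 0, sizeof(*o));	/* what init_buffer_head() did */
		return o;
	}

	/* new shape: the allocation itself hands back zeroed memory */
	static struct obj *alloc_zeroed(void)
	{
		return calloc(1, sizeof(struct obj));
	}
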
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
new file mode 100644
index 000000000000..04b8280582a9
--- /dev/null
+++ b/fs/ceph/Kconfig
@@ -0,0 +1,27 @@
1config CEPH_FS
2 tristate "Ceph distributed file system (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL
4 select LIBCRC32C
5	select CRYPTO_AES
6 help
7 Choose Y or M here to include support for mounting the
8 experimental Ceph distributed file system. Ceph is an extremely
9 scalable file system designed to provide high performance,
10 reliable access to petabytes of storage.
11
12 More information at http://ceph.newdream.net/.
13
14 If unsure, say N.
15
16config CEPH_FS_PRETTYDEBUG
17 bool "Include file:line in ceph debug output"
18 depends on CEPH_FS
19 default n
20 help
21 If you say Y here, debug output will include a filename and
22	  line to aid debugging. This increases kernel size and slows
23 execution slightly when debug call sites are enabled (e.g.,
24 via CONFIG_DYNAMIC_DEBUG).
25
26 If unsure, say N.
27
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
new file mode 100644
index 000000000000..6a660e610be8
--- /dev/null
+++ b/fs/ceph/Makefile
@@ -0,0 +1,39 @@
1#
2# Makefile for CEPH filesystem.
3#
4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_FS) += ceph.o
8
9ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \
12 mds_client.o mdsmap.o \
13 mon_client.o \
14 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
15 debugfs.o \
16 auth.o auth_none.o \
17 crypto.o armor.o \
18 auth_x.o \
19 ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
20
21else
22# Otherwise we were called directly from the command
23# line; invoke the kernel build system.
24
25KERNELDIR ?= /lib/modules/$(shell uname -r)/build
26PWD := $(shell pwd)
27
28default: all
29
30all:
31 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
32
33modules_install:
34 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
35
36clean:
37 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
38
39endif
diff --git a/fs/ceph/README b/fs/ceph/README
new file mode 100644
index 000000000000..18352fab37c0
--- /dev/null
+++ b/fs/ceph/README
@@ -0,0 +1,20 @@
1#
2# The following files are shared by (and manually synchronized
3# between) the Ceph userland and kernel client.
4#
5# userland kernel
6src/include/ceph_fs.h fs/ceph/ceph_fs.h
7src/include/ceph_fs.cc fs/ceph/ceph_fs.c
8src/include/msgr.h fs/ceph/msgr.h
9src/include/rados.h fs/ceph/rados.h
10src/include/ceph_strings.cc fs/ceph/ceph_strings.c
11src/include/ceph_frag.h fs/ceph/ceph_frag.h
12src/include/ceph_frag.cc fs/ceph/ceph_frag.c
13src/include/ceph_hash.h fs/ceph/ceph_hash.h
14src/include/ceph_hash.cc fs/ceph/ceph_hash.c
15src/crush/crush.c fs/ceph/crush/crush.c
16src/crush/crush.h fs/ceph/crush/crush.h
17src/crush/mapper.c fs/ceph/crush/mapper.c
18src/crush/mapper.h fs/ceph/crush/mapper.h
19src/crush/hash.h fs/ceph/crush/hash.h
20src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
new file mode 100644
index 000000000000..ce8ef6107727
--- /dev/null
+++ b/fs/ceph/addr.c
@@ -0,0 +1,1194 @@
1#include "ceph_debug.h"
2
3#include <linux/backing-dev.h>
4#include <linux/fs.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/writeback.h> /* generic_writepages */
8#include <linux/pagevec.h>
9#include <linux/task_io_accounting_ops.h>
10
11#include "super.h"
12#include "osd_client.h"
13
14/*
15 * Ceph address space ops.
16 *
17 * There are a few funny things going on here.
18 *
19 * The page->private field is used to reference a struct
20 * ceph_snap_context for _every_ dirty page. This indicates which
21 * snapshot the page was logically dirtied in, and thus which snap
22 * context needs to be associated with the osd write during writeback.
23 *
24 * Similarly, struct ceph_inode_info maintains a set of counters to
25 * count dirty pages on the inode. In the absence of snapshots,
26 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
27 *
28 * When a snapshot is taken (that is, when the client receives
29 * notification that a snapshot was taken), each inode with caps and
30 * with dirty pages (dirty pages implies there is a cap) gets a new
31 * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
32 * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
33 * moved to capsnap->dirty. (Unless a sync write is currently in
34 * progress. In that case, the capsnap is said to be "pending", new
35 * writes cannot start, and the capsnap isn't "finalized" until the
36 * write completes (or fails) and a final size/mtime for the inode for
37 * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
38 *
39 * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
40 * we look for the first capsnap in i_cap_snaps and write out pages in
41 * that snap context _only_. Then we move on to the next capsnap,
42 * eventually reaching the "live" or "head" context (i.e., pages that
43 * are not yet snapped) and are writing the most recently dirtied
44 * pages.
45 *
46 * Invalidate and so forth must take care to ensure the dirty page
47 * accounting is preserved.
48 */
49
50#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
51#define CONGESTION_OFF_THRESH(congestion_kb) \
52 (CONGESTION_ON_THRESH(congestion_kb) - \
53 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
54
55
56
57/*
58 * Dirty a page. Optimistically adjust accounting, on the assumption
59 * that we won't race with invalidate. If we do, readjust.
60 */
61static int ceph_set_page_dirty(struct page *page)
62{
63 struct address_space *mapping = page->mapping;
64 struct inode *inode;
65 struct ceph_inode_info *ci;
66 int undo = 0;
67 struct ceph_snap_context *snapc;
68
69 if (unlikely(!mapping))
70 return !TestSetPageDirty(page);
71
72 if (TestSetPageDirty(page)) {
73 dout("%p set_page_dirty %p idx %lu -- already dirty\n",
74 mapping->host, page, page->index);
75 return 0;
76 }
77
78 inode = mapping->host;
79 ci = ceph_inode(inode);
80
81 /*
82 * Note that we're grabbing a snapc ref here without holding
83 * any locks!
84 */
85 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
86
87 /* dirty the head */
88 spin_lock(&inode->i_lock);
89 if (ci->i_wrbuffer_ref_head == 0)
90 ci->i_head_snapc = ceph_get_snap_context(snapc);
91 ++ci->i_wrbuffer_ref_head;
92 if (ci->i_wrbuffer_ref == 0)
93 igrab(inode);
94 ++ci->i_wrbuffer_ref;
95 dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
96 "snapc %p seq %lld (%d snaps)\n",
97 mapping->host, page, page->index,
98 ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
99 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
100 snapc, snapc->seq, snapc->num_snaps);
101 spin_unlock(&inode->i_lock);
102
103 /* now adjust page */
104 spin_lock_irq(&mapping->tree_lock);
105 if (page->mapping) { /* Race with truncate? */
106 WARN_ON_ONCE(!PageUptodate(page));
107
108 if (mapping_cap_account_dirty(mapping)) {
109 __inc_zone_page_state(page, NR_FILE_DIRTY);
110 __inc_bdi_stat(mapping->backing_dev_info,
111 BDI_RECLAIMABLE);
112 task_io_account_write(PAGE_CACHE_SIZE);
113 }
114 radix_tree_tag_set(&mapping->page_tree,
115 page_index(page), PAGECACHE_TAG_DIRTY);
116
117 /*
118 * Reference snap context in page->private. Also set
119 * PagePrivate so that we get invalidatepage callback.
120 */
121 page->private = (unsigned long)snapc;
122 SetPagePrivate(page);
123 } else {
124 dout("ANON set_page_dirty %p (raced truncate?)\n", page);
125 undo = 1;
126 }
127
128 spin_unlock_irq(&mapping->tree_lock);
129
130 if (undo)
131 /* whoops, we failed to dirty the page */
132 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
133
134 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
135
136 BUG_ON(!PageDirty(page));
137 return 1;
138}
139
140/*
141 * If we are truncating the full page (i.e. offset == 0), adjust the
142 * dirty page counters appropriately. Only called if there is private
143 * data on the page.
144 */
145static void ceph_invalidatepage(struct page *page, unsigned long offset)
146{
147 struct inode *inode;
148 struct ceph_inode_info *ci;
149 struct ceph_snap_context *snapc = (void *)page->private;
150
151 BUG_ON(!PageLocked(page));
152 BUG_ON(!page->private);
153 BUG_ON(!PagePrivate(page));
154 BUG_ON(!page->mapping);
155
156 inode = page->mapping->host;
157
158 /*
159 * We can get non-dirty pages here due to races between
160 * set_page_dirty and truncate_complete_page; just spit out a
161 * warning, in case we end up with accounting problems later.
162 */
163 if (!PageDirty(page))
164 pr_err("%p invalidatepage %p page not dirty\n", inode, page);
165
166 if (offset == 0)
167 ClearPageChecked(page);
168
169 ci = ceph_inode(inode);
170 if (offset == 0) {
171 dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
172 inode, page, page->index, offset);
173 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
174 ceph_put_snap_context(snapc);
175 page->private = 0;
176 ClearPagePrivate(page);
177 } else {
178 dout("%p invalidatepage %p idx %lu partial dirty page\n",
179 inode, page, page->index);
180 }
181}
182
183/* just a sanity check */
184static int ceph_releasepage(struct page *page, gfp_t g)
185{
186 struct inode *inode = page->mapping ? page->mapping->host : NULL;
187 dout("%p releasepage %p idx %lu\n", inode, page, page->index);
188 WARN_ON(PageDirty(page));
189 WARN_ON(page->private);
190 WARN_ON(PagePrivate(page));
191 return 0;
192}
193
194/*
195 * read a single page, without unlocking it.
196 */
197static int readpage_nounlock(struct file *filp, struct page *page)
198{
199 struct inode *inode = filp->f_dentry->d_inode;
200 struct ceph_inode_info *ci = ceph_inode(inode);
201 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
202 int err = 0;
203 u64 len = PAGE_CACHE_SIZE;
204
205 dout("readpage inode %p file %p page %p index %lu\n",
206 inode, filp, page, page->index);
207 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
208 page->index << PAGE_CACHE_SHIFT, &len,
209 ci->i_truncate_seq, ci->i_truncate_size,
210 &page, 1);
211 if (err == -ENOENT)
212 err = 0;
213 if (err < 0) {
214 SetPageError(page);
215 goto out;
216 } else if (err < PAGE_CACHE_SIZE) {
217 /* zero fill remainder of page */
218 zero_user_segment(page, err, PAGE_CACHE_SIZE);
219 }
220 SetPageUptodate(page);
221
222out:
223 return err < 0 ? err : 0;
224}
225
226static int ceph_readpage(struct file *filp, struct page *page)
227{
228 int r = readpage_nounlock(filp, page);
229 unlock_page(page);
230 return r;
231}
232
233/*
234 * Build a vector of contiguous pages from the provided page list.
235 */
236static struct page **page_vector_from_list(struct list_head *page_list,
237 unsigned *nr_pages)
238{
239 struct page **pages;
240 struct page *page;
241 int next_index, contig_pages = 0;
242
243 /* build page vector */
244 pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
245 if (!pages)
246 return ERR_PTR(-ENOMEM);
247
248 BUG_ON(list_empty(page_list));
249 next_index = list_entry(page_list->prev, struct page, lru)->index;
250 list_for_each_entry_reverse(page, page_list, lru) {
251 if (page->index == next_index) {
252 dout("readpages page %d %p\n", contig_pages, page);
253 pages[contig_pages] = page;
254 contig_pages++;
255 next_index++;
256 } else {
257 break;
258 }
259 }
260 *nr_pages = contig_pages;
261 return pages;
262}
263
264/*
265 * Read multiple pages. Leave pages we don't read + unlock in page_list;
266 * the caller (VM) cleans them up.
267 */
268static int ceph_readpages(struct file *file, struct address_space *mapping,
269 struct list_head *page_list, unsigned nr_pages)
270{
271 struct inode *inode = file->f_dentry->d_inode;
272 struct ceph_inode_info *ci = ceph_inode(inode);
273 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
274 int rc = 0;
275 struct page **pages;
276 struct pagevec pvec;
277 loff_t offset;
278 u64 len;
279
280 dout("readpages %p file %p nr_pages %d\n",
281 inode, file, nr_pages);
282
283 pages = page_vector_from_list(page_list, &nr_pages);
284 if (IS_ERR(pages))
285 return PTR_ERR(pages);
286
287 /* guess read extent */
288 offset = pages[0]->index << PAGE_CACHE_SHIFT;
289 len = nr_pages << PAGE_CACHE_SHIFT;
290 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
291 offset, &len,
292 ci->i_truncate_seq, ci->i_truncate_size,
293 pages, nr_pages);
294 if (rc == -ENOENT)
295 rc = 0;
296 if (rc < 0)
297 goto out;
298
299 /* set uptodate and add to lru in pagevec-sized chunks */
300 pagevec_init(&pvec, 0);
301 for (; !list_empty(page_list) && len > 0;
302 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
303 struct page *page =
304 list_entry(page_list->prev, struct page, lru);
305
306 list_del(&page->lru);
307
308 if (rc < (int)PAGE_CACHE_SIZE) {
309 /* zero (remainder of) page */
310 int s = rc < 0 ? 0 : rc;
311 zero_user_segment(page, s, PAGE_CACHE_SIZE);
312 }
313
314 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
315 page_cache_release(page);
316 dout("readpages %p add_to_page_cache failed %p\n",
317 inode, page);
318 continue;
319 }
320 dout("readpages %p adding %p idx %lu\n", inode, page,
321 page->index);
322 flush_dcache_page(page);
323 SetPageUptodate(page);
324 unlock_page(page);
325 if (pagevec_add(&pvec, page) == 0)
326 pagevec_lru_add_file(&pvec); /* add to lru */
327 }
328 pagevec_lru_add_file(&pvec);
329 rc = 0;
330
331out:
332 kfree(pages);
333 return rc;
334}
335
336/*
337 * Get ref for the oldest snapc for an inode with dirty data... that is, the
338 * only snap context we are allowed to write back.
339 *
340 * Caller holds i_lock.
341 */
342static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
343 u64 *snap_size)
344{
345 struct ceph_inode_info *ci = ceph_inode(inode);
346 struct ceph_snap_context *snapc = NULL;
347 struct ceph_cap_snap *capsnap = NULL;
348
349 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
350 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
351 capsnap->context, capsnap->dirty_pages);
352 if (capsnap->dirty_pages) {
353 snapc = ceph_get_snap_context(capsnap->context);
354 if (snap_size)
355 *snap_size = capsnap->size;
356 break;
357 }
358 }
359 if (!snapc && ci->i_snap_realm) {
360 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
361 dout(" head snapc %p has %d dirty pages\n",
362 snapc, ci->i_wrbuffer_ref_head);
363 }
364 return snapc;
365}
366
367static struct ceph_snap_context *get_oldest_context(struct inode *inode,
368 u64 *snap_size)
369{
370 struct ceph_snap_context *snapc = NULL;
371
372 spin_lock(&inode->i_lock);
373 snapc = __get_oldest_context(inode, snap_size);
374 spin_unlock(&inode->i_lock);
375 return snapc;
376}
377
378/*
379 * Write a single page, but leave the page locked.
380 *
381 * If we get a write error, set the page error bit, but still adjust the
382 * dirty page accounting (i.e., page is no longer dirty).
383 */
384static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
385{
386 struct inode *inode;
387 struct ceph_inode_info *ci;
388 struct ceph_client *client;
389 struct ceph_osd_client *osdc;
390 loff_t page_off = page->index << PAGE_CACHE_SHIFT;
391 int len = PAGE_CACHE_SIZE;
392 loff_t i_size;
393 int err = 0;
394 struct ceph_snap_context *snapc;
395 u64 snap_size = 0;
396 long writeback_stat;
397
398 dout("writepage %p idx %lu\n", page, page->index);
399
400 if (!page->mapping || !page->mapping->host) {
401 dout("writepage %p - no mapping\n", page);
402 return -EFAULT;
403 }
404 inode = page->mapping->host;
405 ci = ceph_inode(inode);
406 client = ceph_inode_to_client(inode);
407 osdc = &client->osdc;
408
409 /* verify this is a writeable snap context */
410 snapc = (void *)page->private;
411 if (snapc == NULL) {
412 dout("writepage %p page %p not dirty?\n", inode, page);
413 goto out;
414 }
415 if (snapc != get_oldest_context(inode, &snap_size)) {
416 dout("writepage %p page %p snapc %p not writeable - noop\n",
417 inode, page, (void *)page->private);
418 /* we should only noop if called by kswapd */
419 WARN_ON((current->flags & PF_MEMALLOC) == 0);
420 goto out;
421 }
422
423 /* is this a partial page at end of file? */
424 if (snap_size)
425 i_size = snap_size;
426 else
427 i_size = i_size_read(inode);
428 if (i_size < page_off + len)
429 len = i_size - page_off;
430
431 dout("writepage %p page %p index %lu on %llu~%u\n",
432 inode, page, page->index, page_off, len);
433
434 writeback_stat = atomic_long_inc_return(&client->writeback_count);
435 if (writeback_stat >
436 CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
437 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
438
439 set_page_writeback(page);
440 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
441 &ci->i_layout, snapc,
442 page_off, len,
443 ci->i_truncate_seq, ci->i_truncate_size,
444 &inode->i_mtime,
445 &page, 1, 0, 0, true);
446 if (err < 0) {
447 dout("writepage setting page/mapping error %d %p\n", err, page);
448 SetPageError(page);
449 mapping_set_error(&inode->i_data, err);
450 if (wbc)
451 wbc->pages_skipped++;
452 } else {
453 dout("writepage cleaned page %p\n", page);
454 err = 0; /* vfs expects us to return 0 */
455 }
456 page->private = 0;
457 ClearPagePrivate(page);
458 end_page_writeback(page);
459 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
460 ceph_put_snap_context(snapc);
461out:
462 return err;
463}
464
465static int ceph_writepage(struct page *page, struct writeback_control *wbc)
466{
467 int err;
468 struct inode *inode = page->mapping->host;
469 BUG_ON(!inode);
470 igrab(inode);
471 err = writepage_nounlock(page, wbc);
472 unlock_page(page);
473 iput(inode);
474 return err;
475}
476
477
478/*
479 * lame release_pages helper. release_pages() isn't exported to
480 * modules.
481 */
482static void ceph_release_pages(struct page **pages, int num)
483{
484 struct pagevec pvec;
485 int i;
486
487 pagevec_init(&pvec, 0);
488 for (i = 0; i < num; i++) {
489 if (pagevec_add(&pvec, pages[i]) == 0)
490 pagevec_release(&pvec);
491 }
492 pagevec_release(&pvec);
493}
494
495
496/*
497 * async writeback completion handler.
498 *
499 * If we get an error, set the mapping error bit, but not the individual
500 * page error bits.
501 */
502static void writepages_finish(struct ceph_osd_request *req,
503 struct ceph_msg *msg)
504{
505 struct inode *inode = req->r_inode;
506 struct ceph_osd_reply_head *replyhead;
507 struct ceph_osd_op *op;
508 struct ceph_inode_info *ci = ceph_inode(inode);
509 unsigned wrote;
510 struct page *page;
511 int i;
512 struct ceph_snap_context *snapc = req->r_snapc;
513 struct address_space *mapping = inode->i_mapping;
514 struct writeback_control *wbc = req->r_wbc;
515 __s32 rc = -EIO;
516 u64 bytes = 0;
517 struct ceph_client *client = ceph_inode_to_client(inode);
518 long writeback_stat;
519 unsigned issued = __ceph_caps_issued(ci, NULL);
520
521 /* parse reply */
522 replyhead = msg->front.iov_base;
523 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
524 op = (void *)(replyhead + 1);
525 rc = le32_to_cpu(replyhead->result);
526 bytes = le64_to_cpu(op->extent.length);
527
528 if (rc >= 0) {
529 /*
530 * Assume we wrote the pages we originally sent. The
531 * osd might reply with fewer pages if our writeback
532 * raced with a truncation and was adjusted at the osd,
533 * so don't believe the reply.
534 */
535 wrote = req->r_num_pages;
536 } else {
537 wrote = 0;
538 mapping_set_error(mapping, rc);
539 }
540 dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
541 inode, rc, bytes, wrote);
542
543 /* clean all pages */
544 for (i = 0; i < req->r_num_pages; i++) {
545 page = req->r_pages[i];
546 BUG_ON(!page);
547 WARN_ON(!PageUptodate(page));
548
549 writeback_stat =
550 atomic_long_dec_return(&client->writeback_count);
551 if (writeback_stat <
552 CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
553 clear_bdi_congested(&client->backing_dev_info,
554 BLK_RW_ASYNC);
555
556 if (i >= wrote) {
557 dout("inode %p skipping page %p\n", inode, page);
558 wbc->pages_skipped++;
559 }
560 page->private = 0;
561 ClearPagePrivate(page);
562 ceph_put_snap_context(snapc);
563 dout("unlocking %d %p\n", i, page);
564 end_page_writeback(page);
565
566 /*
567 * We lost the cache cap, need to truncate the page before
568 * it is unlocked, otherwise we'd truncate it later in the
569 * page truncation thread, possibly losing some data that
570 * raced its way in
571 */
572 if ((issued & CEPH_CAP_FILE_CACHE) == 0)
573 generic_error_remove_page(inode->i_mapping, page);
574
575 unlock_page(page);
576 }
577 dout("%p wrote+cleaned %d pages\n", inode, wrote);
578 ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
579
580 ceph_release_pages(req->r_pages, req->r_num_pages);
581 if (req->r_pages_from_pool)
582 mempool_free(req->r_pages,
583 ceph_client(inode->i_sb)->wb_pagevec_pool);
584 else
585 kfree(req->r_pages);
586 ceph_osdc_put_request(req);
587}
588
589/*
590 * allocate a page vec, either directly, or if necessary, via the
591 * mempool. we avoid the mempool if we can because req->r_num_pages
592 * may be less than the maximum write size.
593 */
594static void alloc_page_vec(struct ceph_client *client,
595 struct ceph_osd_request *req)
596{
597 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
598 GFP_NOFS);
599 if (!req->r_pages) {
600 req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
601 req->r_pages_from_pool = 1;
602 WARN_ON(!req->r_pages);
603 }
604}
605
606/*
607 * initiate async writeback
608 */
609static int ceph_writepages_start(struct address_space *mapping,
610 struct writeback_control *wbc)
611{
612 struct inode *inode = mapping->host;
613 struct backing_dev_info *bdi = mapping->backing_dev_info;
614 struct ceph_inode_info *ci = ceph_inode(inode);
615 struct ceph_client *client;
616 pgoff_t index, start, end;
617 int range_whole = 0;
618 int should_loop = 1;
619 pgoff_t max_pages = 0, max_pages_ever = 0;
620 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL;
621 struct pagevec pvec;
622 int done = 0;
623 int rc = 0;
624 unsigned wsize = 1 << inode->i_blkbits;
625 struct ceph_osd_request *req = NULL;
626 int do_sync;
627 u64 snap_size = 0;
628
629 /*
630 * Include a 'sync' in the OSD request if this is a data
631 * integrity write (e.g., O_SYNC write or fsync()), or if our
632 * cap is being revoked.
633 */
634 do_sync = wbc->sync_mode == WB_SYNC_ALL;
635 if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
636 do_sync = 1;
637 dout("writepages_start %p dosync=%d (mode=%s)\n",
638 inode, do_sync,
639 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
640 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
641
642 client = ceph_inode_to_client(inode);
643 if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
644		pr_warning("writepages_start %p on forced umount\n", inode);
645 return -EIO; /* we're in a forced umount, don't write! */
646 }
647 if (client->mount_args->wsize && client->mount_args->wsize < wsize)
648 wsize = client->mount_args->wsize;
649 if (wsize < PAGE_CACHE_SIZE)
650 wsize = PAGE_CACHE_SIZE;
651 max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
652
653 pagevec_init(&pvec, 0);
654
655	/* bail early if a nonblocking caller hits a congested backing dev */
656 if (wbc->nonblocking && bdi_write_congested(bdi)) {
657 dout(" writepages congested\n");
658 wbc->encountered_congestion = 1;
659 goto out_final;
660 }
661
662 /* where to start/end? */
663 if (wbc->range_cyclic) {
664 start = mapping->writeback_index; /* Start from prev offset */
665 end = -1;
666 dout(" cyclic, start at %lu\n", start);
667 } else {
668 start = wbc->range_start >> PAGE_CACHE_SHIFT;
669 end = wbc->range_end >> PAGE_CACHE_SHIFT;
670 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
671 range_whole = 1;
672 should_loop = 0;
673 dout(" not cyclic, %lu to %lu\n", start, end);
674 }
675 index = start;
676
677retry:
678 /* find oldest snap context with dirty data */
679 ceph_put_snap_context(snapc);
680 snapc = get_oldest_context(inode, &snap_size);
681 if (!snapc) {
682 /* hmm, why does writepages get called when there
683 is no dirty data? */
684 dout(" no snap context with dirty data?\n");
685 goto out;
686 }
687 dout(" oldest snapc is %p seq %lld (%d snaps)\n",
688 snapc, snapc->seq, snapc->num_snaps);
689 if (last_snapc && snapc != last_snapc) {
690 /* if we switched to a newer snapc, restart our scan at the
691 * start of the original file range. */
692 dout(" snapc differs from last pass, restarting at %lu\n",
693 index);
694 index = start;
695 }
696 last_snapc = snapc;
697
698 while (!done && index <= end) {
699 unsigned i;
700 int first;
701 pgoff_t next;
702 int pvec_pages, locked_pages;
703 struct page *page;
704 int want;
705 u64 offset, len;
706 struct ceph_osd_request_head *reqhead;
707 struct ceph_osd_op *op;
708 long writeback_stat;
709
710 next = 0;
711 locked_pages = 0;
712 max_pages = max_pages_ever;
713
714get_more_pages:
715 first = -1;
716 want = min(end - index,
717 min((pgoff_t)PAGEVEC_SIZE,
718 max_pages - (pgoff_t)locked_pages) - 1)
719 + 1;
720 pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
721 PAGECACHE_TAG_DIRTY,
722 want);
723 dout("pagevec_lookup_tag got %d\n", pvec_pages);
724 if (!pvec_pages && !locked_pages)
725 break;
726 for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
727 page = pvec.pages[i];
728 dout("? %p idx %lu\n", page, page->index);
729 if (locked_pages == 0)
730 lock_page(page); /* first page */
731 else if (!trylock_page(page))
732 break;
733
734 /* only dirty pages, or our accounting breaks */
735 if (unlikely(!PageDirty(page)) ||
736 unlikely(page->mapping != mapping)) {
737 dout("!dirty or !mapping %p\n", page);
738 unlock_page(page);
739 break;
740 }
741 if (!wbc->range_cyclic && page->index > end) {
742 dout("end of range %p\n", page);
743 done = 1;
744 unlock_page(page);
745 break;
746 }
747 if (next && (page->index != next)) {
748 dout("not consecutive %p\n", page);
749 unlock_page(page);
750 break;
751 }
752 if (wbc->sync_mode != WB_SYNC_NONE) {
753 dout("waiting on writeback %p\n", page);
754 wait_on_page_writeback(page);
755 }
756 if ((snap_size && page_offset(page) > snap_size) ||
757 (!snap_size &&
758 page_offset(page) > i_size_read(inode))) {
759 dout("%p page eof %llu\n", page, snap_size ?
760 snap_size : i_size_read(inode));
761 done = 1;
762 unlock_page(page);
763 break;
764 }
765 if (PageWriteback(page)) {
766 dout("%p under writeback\n", page);
767 unlock_page(page);
768 break;
769 }
770
771 /* only if matching snap context */
772 if (snapc != (void *)page->private) {
773 dout("page snapc %p != oldest %p\n",
774 (void *)page->private, snapc);
775 unlock_page(page);
776 if (!locked_pages)
777 continue; /* keep looking for snap */
778 break;
779 }
780
781 if (!clear_page_dirty_for_io(page)) {
782 dout("%p !clear_page_dirty_for_io\n", page);
783 unlock_page(page);
784 break;
785 }
786
787 /* ok */
788 if (locked_pages == 0) {
789 /* prepare async write request */
790 offset = page->index << PAGE_CACHE_SHIFT;
791 len = wsize;
792 req = ceph_osdc_new_request(&client->osdc,
793 &ci->i_layout,
794 ceph_vino(inode),
795 offset, &len,
796 CEPH_OSD_OP_WRITE,
797 CEPH_OSD_FLAG_WRITE |
798 CEPH_OSD_FLAG_ONDISK,
799 snapc, do_sync,
800 ci->i_truncate_seq,
801 ci->i_truncate_size,
802 &inode->i_mtime, true, 1);
803 max_pages = req->r_num_pages;
804
805 alloc_page_vec(client, req);
806 req->r_callback = writepages_finish;
807 req->r_inode = inode;
808 req->r_wbc = wbc;
809 }
810
811 /* note position of first page in pvec */
812 if (first < 0)
813 first = i;
814 dout("%p will write page %p idx %lu\n",
815 inode, page, page->index);
816
817 writeback_stat = atomic_long_inc_return(&client->writeback_count);
818 if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
819 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
820 }
821
822 set_page_writeback(page);
823 req->r_pages[locked_pages] = page;
824 locked_pages++;
825 next = page->index + 1;
826 }
827
828 /* did we get anything? */
829 if (!locked_pages)
830 goto release_pvec_pages;
831 if (i) {
832 int j;
833 BUG_ON(!locked_pages || first < 0);
834
835 if (pvec_pages && i == pvec_pages &&
836 locked_pages < max_pages) {
837 dout("reached end pvec, trying for more\n");
838 pagevec_reinit(&pvec);
839 goto get_more_pages;
840 }
841
842 /* shift unused pages over in the pvec... we
843 * will need to release them below. */
844 for (j = i; j < pvec_pages; j++) {
845 dout(" pvec leftover page %p\n",
846 pvec.pages[j]);
847 pvec.pages[j-i+first] = pvec.pages[j];
848 }
849 pvec.nr -= i-first;
850 }
851
852 /* submit the write */
853 offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
854 len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
855 (u64)locked_pages << PAGE_CACHE_SHIFT);
856 dout("writepages got %d pages at %llu~%llu\n",
857 locked_pages, offset, len);
858
859 /* revise final length, page count */
860 req->r_num_pages = locked_pages;
861 reqhead = req->r_request->front.iov_base;
862 op = (void *)(reqhead + 1);
863 op->extent.length = cpu_to_le64(len);
864 op->payload_len = cpu_to_le32(len);
865 req->r_request->hdr.data_len = cpu_to_le32(len);
866
867 ceph_osdc_start_request(&client->osdc, req, true);
868 req = NULL;
869
870 /* continue? */
871 index = next;
872 wbc->nr_to_write -= locked_pages;
873 if (wbc->nr_to_write <= 0)
874 done = 1;
875
876release_pvec_pages:
877 dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
878 pvec.nr ? pvec.pages[0] : NULL);
879 pagevec_release(&pvec);
880
881 if (locked_pages && !done)
882 goto retry;
883 }
884
885 if (should_loop && !done) {
886 /* more to do; loop back to beginning of file */
887 dout("writepages looping back to beginning of file\n");
888 should_loop = 0;
889 index = 0;
890 goto retry;
891 }
892
893 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
894 mapping->writeback_index = index;
895
896out:
897 if (req)
898 ceph_osdc_put_request(req);
899 if (rc > 0)
900 rc = 0; /* vfs expects us to return 0 */
901 ceph_put_snap_context(snapc);
902 dout("writepages done, rc = %d\n", rc);
903out_final:
904 return rc;
905}
906
907
908
909/*
910 * See if a given @snapc is either writeable, or already written.
911 */
912static int context_is_writeable_or_written(struct inode *inode,
913 struct ceph_snap_context *snapc)
914{
915 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
916 return !oldest || snapc->seq <= oldest->seq;
917}
918
919/*
920 * We are only allowed to write into/dirty the page if the page is
921 * clean, or already dirty within the same snap context.
922 *
923 * called with page locked.
924 * return success with page locked,
925 * or any failure (incl -EAGAIN) with page unlocked.
926 */
927static int ceph_update_writeable_page(struct file *file,
928 loff_t pos, unsigned len,
929 struct page *page)
930{
931 struct inode *inode = file->f_dentry->d_inode;
932 struct ceph_inode_info *ci = ceph_inode(inode);
933 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
934 loff_t page_off = pos & PAGE_CACHE_MASK;
935 int pos_in_page = pos & ~PAGE_CACHE_MASK;
936 int end_in_page = pos_in_page + len;
937 loff_t i_size;
938 struct ceph_snap_context *snapc;
939 int r;
940
941retry_locked:
942	/* writepages currently holds the page lock; wait here in case that changes later */
943 wait_on_page_writeback(page);
944
945 /* check snap context */
946 BUG_ON(!ci->i_snap_realm);
947 down_read(&mdsc->snap_rwsem);
948 BUG_ON(!ci->i_snap_realm->cached_context);
949 if (page->private &&
950 (void *)page->private != ci->i_snap_realm->cached_context) {
951 /*
952 * this page is already dirty in another (older) snap
953 * context! is it writeable now?
954 */
955 snapc = get_oldest_context(inode, NULL);
956 up_read(&mdsc->snap_rwsem);
957
958 if (snapc != (void *)page->private) {
959 dout(" page %p snapc %p not current or oldest\n",
960 page, (void *)page->private);
961 /*
962 * queue for writeback, and wait for snapc to
963 * be writeable or written
964 */
965 snapc = ceph_get_snap_context((void *)page->private);
966 unlock_page(page);
967 ceph_queue_writeback(inode);
968 r = wait_event_interruptible(ci->i_cap_wq,
969 context_is_writeable_or_written(inode, snapc));
970 ceph_put_snap_context(snapc);
971 if (r == -ERESTARTSYS)
972 return r;
973 return -EAGAIN;
974 }
975
976 /* yay, writeable, do it now (without dropping page lock) */
977 dout(" page %p snapc %p not current, but oldest\n",
978 page, snapc);
979 if (!clear_page_dirty_for_io(page))
980 goto retry_locked;
981 r = writepage_nounlock(page, NULL);
982 if (r < 0)
983 goto fail_nosnap;
984 goto retry_locked;
985 }
986
987 if (PageUptodate(page)) {
988 dout(" page %p already uptodate\n", page);
989 return 0;
990 }
991
992 /* full page? */
993 if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
994 return 0;
995
996 /* past end of file? */
997 i_size = inode->i_size; /* caller holds i_mutex */
998
999 if (i_size + len > inode->i_sb->s_maxbytes) {
1000 /* file is too big */
1001 r = -EINVAL;
1002 goto fail;
1003 }
1004
1005 if (page_off >= i_size ||
1006 (pos_in_page == 0 && (pos+len) >= i_size &&
1007 end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
1008 dout(" zeroing %p 0 - %d and %d - %d\n",
1009 page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
1010 zero_user_segments(page,
1011 0, pos_in_page,
1012 end_in_page, PAGE_CACHE_SIZE);
1013 return 0;
1014 }
1015
1016 /* we need to read it. */
1017 up_read(&mdsc->snap_rwsem);
1018 r = readpage_nounlock(file, page);
1019 if (r < 0)
1020 goto fail_nosnap;
1021 goto retry_locked;
1022
1023fail:
1024 up_read(&mdsc->snap_rwsem);
1025fail_nosnap:
1026 unlock_page(page);
1027 return r;
1028}
1029
1030/*
1031 * We are only allowed to write into/dirty the page if the page is
1032 * clean, or already dirty within the same snap context.
1033 */
1034static int ceph_write_begin(struct file *file, struct address_space *mapping,
1035 loff_t pos, unsigned len, unsigned flags,
1036 struct page **pagep, void **fsdata)
1037{
1038 struct inode *inode = file->f_dentry->d_inode;
1039 struct page *page;
1040 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1041 int r;
1042
1043 do {
1044 /* get a page */
1045 page = grab_cache_page_write_begin(mapping, index, 0);
1046 if (!page)
1047 return -ENOMEM;
1048 *pagep = page;
1049
1050 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1051 inode, page, (int)pos, (int)len);
1052
1053 r = ceph_update_writeable_page(file, pos, len, page);
1054 } while (r == -EAGAIN);
1055
1056 return r;
1057}
1058
1059/*
1060 * we don't do anything in here that simple_write_end doesn't do
1061 * except adjust dirty page accounting and drop read lock on
1062 * mdsc->snap_rwsem.
1063 */
1064static int ceph_write_end(struct file *file, struct address_space *mapping,
1065 loff_t pos, unsigned len, unsigned copied,
1066 struct page *page, void *fsdata)
1067{
1068 struct inode *inode = file->f_dentry->d_inode;
1069 struct ceph_client *client = ceph_inode_to_client(inode);
1070 struct ceph_mds_client *mdsc = &client->mdsc;
1071 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1072 int check_cap = 0;
1073
1074 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
1075 inode, page, (int)pos, (int)copied, (int)len);
1076
1077 /* zero the stale part of the page if we did a short copy */
1078 if (copied < len)
1079 zero_user_segment(page, from+copied, len);
1080
1081 /* did file size increase? */
1082	/* (no need for i_size_read(); the caller holds i_mutex) */
1083 if (pos+copied > inode->i_size)
1084 check_cap = ceph_inode_set_size(inode, pos+copied);
1085
1086 if (!PageUptodate(page))
1087 SetPageUptodate(page);
1088
1089 set_page_dirty(page);
1090
1091 unlock_page(page);
1092 up_read(&mdsc->snap_rwsem);
1093 page_cache_release(page);
1094
1095 if (check_cap)
1096 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
1097
1098 return copied;
1099}
1100
1101/*
1102 * we set .direct_IO to indicate direct io is supported, but since we
1103 * intercept O_DIRECT reads and writes early, this function should
1104 * never get called.
1105 */
1106static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
1107 const struct iovec *iov,
1108 loff_t pos, unsigned long nr_segs)
1109{
1110 WARN_ON(1);
1111 return -EINVAL;
1112}
1113
1114const struct address_space_operations ceph_aops = {
1115 .readpage = ceph_readpage,
1116 .readpages = ceph_readpages,
1117 .writepage = ceph_writepage,
1118 .writepages = ceph_writepages_start,
1119 .write_begin = ceph_write_begin,
1120 .write_end = ceph_write_end,
1121 .set_page_dirty = ceph_set_page_dirty,
1122 .invalidatepage = ceph_invalidatepage,
1123 .releasepage = ceph_releasepage,
1124 .direct_IO = ceph_direct_io,
1125};
1126
1127
1128/*
1129 * vm ops
1130 */
1131
1132/*
1133 * Reuse write_begin here for simplicity.
1134 */
1135static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1136{
1137 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1138 struct page *page = vmf->page;
1139 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1140 loff_t off = page->index << PAGE_CACHE_SHIFT;
1141 loff_t size, len;
1142 int ret;
1143
1144 size = i_size_read(inode);
1145 if (off + PAGE_CACHE_SIZE <= size)
1146 len = PAGE_CACHE_SIZE;
1147 else
1148 len = size & ~PAGE_CACHE_MASK;
1149
1150 dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
1151 off, len, page, page->index);
1152
1153 lock_page(page);
1154
1155 ret = VM_FAULT_NOPAGE;
1156 if ((off > size) ||
1157 (page->mapping != inode->i_mapping))
1158 goto out;
1159
1160 ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
1161 if (ret == 0) {
1162 /* success. we'll keep the page locked. */
1163 set_page_dirty(page);
1164 up_read(&mdsc->snap_rwsem);
1165 ret = VM_FAULT_LOCKED;
1166 } else {
1167 if (ret == -ENOMEM)
1168 ret = VM_FAULT_OOM;
1169 else
1170 ret = VM_FAULT_SIGBUS;
1171 }
1172out:
1173 dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
1174 if (ret != VM_FAULT_LOCKED)
1175 unlock_page(page);
1176 return ret;
1177}
1178
1179static struct vm_operations_struct ceph_vmops = {
1180 .fault = filemap_fault,
1181 .page_mkwrite = ceph_page_mkwrite,
1182};
1183
1184int ceph_mmap(struct file *file, struct vm_area_struct *vma)
1185{
1186 struct address_space *mapping = file->f_mapping;
1187
1188 if (!mapping->a_ops->readpage)
1189 return -ENOEXEC;
1190 file_accessed(file);
1191 vma->vm_ops = &ceph_vmops;
1192 vma->vm_flags |= VM_CAN_NONLINEAR;
1193 return 0;
1194}
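
A worked example of the congestion thresholds defined near the top of addr.c: with 4 KB pages (PAGE_SHIFT == 12) the shift converts a kilobyte budget into a page count, and writeback backs off at 75% of the "on" threshold. The congestion_kb value of 8192 below is an illustrative assumption, not a ceph default:

	#include <assert.h>

	#define PAGE_SHIFT 12
	#define CONGESTION_ON_THRESH(kb)  ((kb) >> (PAGE_SHIFT - 10))
	#define CONGESTION_OFF_THRESH(kb) \
		(CONGESTION_ON_THRESH(kb) - (CONGESTION_ON_THRESH(kb) >> 2))

	int main(void)
	{
		/* 8192 KB of dirty data / 4 KB per page = 2048 pages to go congested */
		assert(CONGESTION_ON_THRESH(8192) == 2048);
		/* back off once we drop to 2048 - 512 = 1536 pages */
		assert(CONGESTION_OFF_THRESH(8192) == 1536);
		return 0;
	}
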
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
new file mode 100644
index 000000000000..67b2c030924b
--- /dev/null
+++ b/fs/ceph/armor.c
@@ -0,0 +1,99 @@
1
2#include <linux/errno.h>
3
4/*
5 * base64 encode/decode.
6 */
7
8const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
9
10static int encode_bits(int c)
11{
12 return pem_key[c];
13}
14
15static int decode_bits(char c)
16{
17 if (c >= 'A' && c <= 'Z')
18 return c - 'A';
19 if (c >= 'a' && c <= 'z')
20 return c - 'a' + 26;
21 if (c >= '0' && c <= '9')
22 return c - '0' + 52;
23 if (c == '+')
24 return 62;
25 if (c == '/')
26 return 63;
27 if (c == '=')
28 return 0; /* just non-negative, please */
29 return -EINVAL;
30}
31
32int ceph_armor(char *dst, const char *src, const char *end)
33{
34 int olen = 0;
35 int line = 0;
36
37 while (src < end) {
38 unsigned char a, b, c;
39
40 a = *src++;
41 *dst++ = encode_bits(a >> 2);
42 if (src < end) {
43 b = *src++;
44 *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
45 if (src < end) {
46 c = *src++;
47 *dst++ = encode_bits(((b & 15) << 2) |
48 (c >> 6));
49 *dst++ = encode_bits(c & 63);
50 } else {
51 *dst++ = encode_bits((b & 15) << 2);
52 *dst++ = '=';
53 }
54 } else {
55 *dst++ = encode_bits(((a & 3) << 4));
56 *dst++ = '=';
57 *dst++ = '=';
58 }
59 olen += 4;
60 line += 4;
61 if (line == 64) {
62 line = 0;
63 *(dst++) = '\n';
64 olen++;
65 }
66 }
67 return olen;
68}
69
70int ceph_unarmor(char *dst, const char *src, const char *end)
71{
72 int olen = 0;
73
74 while (src < end) {
75 int a, b, c, d;
76
77 if (src < end && src[0] == '\n')
78 src++;
79 if (src + 4 > end)
80 return -EINVAL;
81 a = decode_bits(src[0]);
82 b = decode_bits(src[1]);
83 c = decode_bits(src[2]);
84 d = decode_bits(src[3]);
85 if (a < 0 || b < 0 || c < 0 || d < 0)
86 return -EINVAL;
87
88 *dst++ = (a << 2) | (b >> 4);
89 if (src[2] == '=')
90 return olen + 1;
91 *dst++ = ((b & 15) << 4) | (c >> 2);
92 if (src[3] == '=')
93 return olen + 2;
94 *dst++ = ((c & 3) << 6) | d;
95 olen += 3;
96 src += 4;
97 }
98 return olen;
99}
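
A quick round-trip exercise of the armor helpers above; a userspace sketch that assumes ceph_armor() and ceph_unarmor() are compiled in as-is ("ceph" encodes to the 8-byte armor "Y2VwaA=="):

	#include <assert.h>
	#include <string.h>

	int ceph_armor(char *dst, const char *src, const char *end);
	int ceph_unarmor(char *dst, const char *src, const char *end);

	int main(void)
	{
		const char msg[] = "ceph";
		char enc[16], dec[8];
		int elen, dlen;

		elen = ceph_armor(enc, msg, msg + 4);
		assert(elen == 8 && memcmp(enc, "Y2VwaA==", 8) == 0);

		dlen = ceph_unarmor(dec, enc, enc + elen);
		assert(dlen == 4 && memcmp(dec, msg, 4) == 0);
		return 0;
	}
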
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
new file mode 100644
index 000000000000..abb204fea6c7
--- /dev/null
+++ b/fs/ceph/auth.c
@@ -0,0 +1,257 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/err.h>
5
6#include "types.h"
7#include "auth_none.h"
8#include "auth_x.h"
9#include "decode.h"
10#include "super.h"
11
12#include "messenger.h"
13
14/*
15 * get protocol handler
16 */
17static u32 supported_protocols[] = {
18 CEPH_AUTH_NONE,
19 CEPH_AUTH_CEPHX
20};
21
22int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
23{
24 switch (protocol) {
25 case CEPH_AUTH_NONE:
26 return ceph_auth_none_init(ac);
27 case CEPH_AUTH_CEPHX:
28 return ceph_x_init(ac);
29 default:
30 return -ENOENT;
31 }
32}
33
34/*
35 * setup, teardown.
36 */
37struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
38{
39 struct ceph_auth_client *ac;
40 int ret;
41
42 dout("auth_init name '%s' secret '%s'\n", name, secret);
43
44 ret = -ENOMEM;
45 ac = kzalloc(sizeof(*ac), GFP_NOFS);
46 if (!ac)
47 goto out;
48
49 ac->negotiating = true;
50 if (name)
51 ac->name = name;
52 else
53 ac->name = CEPH_AUTH_NAME_DEFAULT;
54 dout("auth_init name %s secret %s\n", ac->name, secret);
55 ac->secret = secret;
56 return ac;
57
58out:
59 return ERR_PTR(ret);
60}
61
62void ceph_auth_destroy(struct ceph_auth_client *ac)
63{
64 dout("auth_destroy %p\n", ac);
65 if (ac->ops)
66 ac->ops->destroy(ac);
67 kfree(ac);
68}
69
70/*
71 * Reset occurs when reconnecting to the monitor.
72 */
73void ceph_auth_reset(struct ceph_auth_client *ac)
74{
75 dout("auth_reset %p\n", ac);
76 if (ac->ops && !ac->negotiating)
77 ac->ops->reset(ac);
78 ac->negotiating = true;
79}
80
81int ceph_entity_name_encode(const char *name, void **p, void *end)
82{
83 int len = strlen(name);
84
85 if (*p + 2*sizeof(u32) + len > end)
86 return -ERANGE;
87 ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
88 ceph_encode_32(p, len);
89 ceph_encode_copy(p, name, len);
90 return 0;
91}
92
93/*
94 * Initiate protocol negotiation with monitor. Include entity name
95 * and list supported protocols.
96 */
97int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
98{
99 struct ceph_mon_request_header *monhdr = buf;
100 void *p = monhdr + 1, *end = buf + len, *lenp;
101 int i, num;
102 int ret;
103
104 dout("auth_build_hello\n");
105 monhdr->have_version = 0;
106 monhdr->session_mon = cpu_to_le16(-1);
107 monhdr->session_mon_tid = 0;
108
109 ceph_encode_32(&p, 0); /* no protocol, yet */
110
111 lenp = p;
112 p += sizeof(u32);
113
114 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
115 ceph_encode_8(&p, 1);
116 num = ARRAY_SIZE(supported_protocols);
117 ceph_encode_32(&p, num);
118 ceph_decode_need(&p, end, num * sizeof(u32), bad);
119 for (i = 0; i < num; i++)
120 ceph_encode_32(&p, supported_protocols[i]);
121
122 ret = ceph_entity_name_encode(ac->name, &p, end);
123 if (ret < 0)
124 return ret;
125 ceph_decode_need(&p, end, sizeof(u64), bad);
126 ceph_encode_64(&p, ac->global_id);
127
128 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
129 return p - buf;
130
131bad:
132 return -ERANGE;
133}
134
135int ceph_build_auth_request(struct ceph_auth_client *ac,
136 void *msg_buf, size_t msg_len)
137{
138 struct ceph_mon_request_header *monhdr = msg_buf;
139 void *p = monhdr + 1;
140 void *end = msg_buf + msg_len;
141 int ret;
142
143 monhdr->have_version = 0;
144 monhdr->session_mon = cpu_to_le16(-1);
145 monhdr->session_mon_tid = 0;
146
147 ceph_encode_32(&p, ac->protocol);
148
149 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
150 if (ret < 0) {
151 pr_err("error %d building request\n", ret);
152 return ret;
153 }
154 dout(" built request %d bytes\n", ret);
155 ceph_encode_32(&p, ret);
156 return p + ret - msg_buf;
157}
158
159/*
160 * Handle auth message from monitor.
161 */
162int ceph_handle_auth_reply(struct ceph_auth_client *ac,
163 void *buf, size_t len,
164 void *reply_buf, size_t reply_len)
165{
166 void *p = buf;
167 void *end = buf + len;
168 int protocol;
169 s32 result;
170 u64 global_id;
171 void *payload, *payload_end;
172 int payload_len;
173 char *result_msg;
174 int result_msg_len;
175 int ret = -EINVAL;
176
177 dout("handle_auth_reply %p %p\n", p, end);
178 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
179 protocol = ceph_decode_32(&p);
180 result = ceph_decode_32(&p);
181 global_id = ceph_decode_64(&p);
182 payload_len = ceph_decode_32(&p);
183 payload = p;
184 p += payload_len;
185 ceph_decode_need(&p, end, sizeof(u32), bad);
186 result_msg_len = ceph_decode_32(&p);
187 result_msg = p;
188 p += result_msg_len;
189 if (p != end)
190 goto bad;
191
192 dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
193 result_msg, global_id, payload_len);
194
195 payload_end = payload + payload_len;
196
197 if (global_id && ac->global_id != global_id) {
198 dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
199 ac->global_id = global_id;
200 }
201
202 if (ac->negotiating) {
203 /* server does not support our protocols? */
204 if (!protocol && result < 0) {
205 ret = result;
206 goto out;
207 }
208 /* set up (new) protocol handler? */
209 if (ac->protocol && ac->protocol != protocol) {
210 ac->ops->destroy(ac);
211 ac->protocol = 0;
212 ac->ops = NULL;
213 }
214 if (ac->protocol != protocol) {
215 ret = ceph_auth_init_protocol(ac, protocol);
216 if (ret) {
217 pr_err("error %d on auth protocol %d init\n",
218 ret, protocol);
219 goto out;
220 }
221 }
222
223 ac->negotiating = false;
224 }
225
226 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
227 if (ret == -EAGAIN) {
228 return ceph_build_auth_request(ac, reply_buf, reply_len);
229 } else if (ret) {
230 pr_err("authentication error %d\n", ret);
231 return ret;
232 }
233 return 0;
234
235bad:
236 pr_err("failed to decode auth msg\n");
237out:
238 return ret;
239}
240
241int ceph_build_auth(struct ceph_auth_client *ac,
242 void *msg_buf, size_t msg_len)
243{
244 if (!ac->protocol)
245 return ceph_auth_build_hello(ac, msg_buf, msg_len);
246 BUG_ON(!ac->ops);
247 if (!ac->ops->is_authenticated(ac))
248 return ceph_build_auth_request(ac, msg_buf, msg_len);
249 return 0;
250}
251
252int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
253{
254 if (!ac->ops)
255 return 0;
256 return ac->ops->is_authenticated(ac);
257}
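
Taken together, ceph_build_auth(), ceph_handle_auth_reply(), and ceph_auth_is_authenticated() drive a simple request/reply loop against the monitor: send a hello, then keep exchanging protocol-specific requests until the handler stops asking for another round. A hedged sketch of that loop in plain C; send_to_mon() and recv_from_mon() are hypothetical transport stubs, not kernel API:

	#include <stddef.h>

	struct ceph_auth_client;
	int ceph_build_auth(struct ceph_auth_client *ac, void *buf, size_t len);
	int ceph_handle_auth_reply(struct ceph_auth_client *ac, void *buf,
				   size_t len, void *reply, size_t reply_len);
	int ceph_auth_is_authenticated(struct ceph_auth_client *ac);

	int send_to_mon(const void *buf, int len);	/* hypothetical transport */
	int recv_from_mon(void *buf, int len);		/* hypothetical transport */

	static int authenticate(struct ceph_auth_client *ac)
	{
		char out[4096], in[4096];
		int len;

		while (!ceph_auth_is_authenticated(ac)) {
			/* hello on the first pass, protocol request afterwards */
			len = ceph_build_auth(ac, out, sizeof(out));
			if (len < 0)
				return len;
			if (len > 0 && send_to_mon(out, len) < 0)
				return -1;

			len = recv_from_mon(in, sizeof(in));
			if (len < 0)
				return len;

			/* a positive return means a follow-up request was built */
			len = ceph_handle_auth_reply(ac, in, len, out, sizeof(out));
			if (len < 0)
				return len;
			if (len > 0 && send_to_mon(out, len) < 0)
				return -1;
		}
		return 0;
	}
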
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
new file mode 100644
index 000000000000..ca4f57cfb267
--- /dev/null
+++ b/fs/ceph/auth.h
@@ -0,0 +1,84 @@
1#ifndef _FS_CEPH_AUTH_H
2#define _FS_CEPH_AUTH_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * Abstract interface for communicating with the authentication module.
9 * There is some handshake that takes place between us and the monitor
10 * to acquire the necessary keys. These are used to generate an
11 * 'authorizer' that we use when connecting to a service (mds, osd).
12 */
13
14struct ceph_auth_client;
15struct ceph_authorizer;
16
17struct ceph_auth_client_ops {
18 /*
19 * true if we are authenticated and can connect to
20 * services.
21 */
22 int (*is_authenticated)(struct ceph_auth_client *ac);
23
24 /*
25 * build requests and process replies during monitor
26 * handshake. if handle_reply returns -EAGAIN, we build
27 * another request.
28 */
29 int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
30 int (*handle_reply)(struct ceph_auth_client *ac, int result,
31 void *buf, void *end);
32
33 /*
34 * Create authorizer for connecting to a service, and verify
35 * the response to authenticate the service.
36 */
37 int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
38 struct ceph_authorizer **a,
39 void **buf, size_t *len,
40 void **reply_buf, size_t *reply_len);
41 int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
42 struct ceph_authorizer *a, size_t len);
43 void (*destroy_authorizer)(struct ceph_auth_client *ac,
44 struct ceph_authorizer *a);
45 void (*invalidate_authorizer)(struct ceph_auth_client *ac,
46 int peer_type);
47
48 /* reset when we (re)connect to a monitor */
49 void (*reset)(struct ceph_auth_client *ac);
50
51 void (*destroy)(struct ceph_auth_client *ac);
52};
53
54struct ceph_auth_client {
55 u32 protocol; /* CEPH_AUTH_* */
56 void *private; /* for use by protocol implementation */
57 const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
58
59 bool negotiating; /* true if negotiating protocol */
60 const char *name; /* entity name */
61 u64 global_id; /* our unique id in system */
62 const char *secret; /* our secret key */
63 unsigned want_keys; /* which services we want */
64};
65
66extern struct ceph_auth_client *ceph_auth_init(const char *name,
67 const char *secret);
68extern void ceph_auth_destroy(struct ceph_auth_client *ac);
69
70extern void ceph_auth_reset(struct ceph_auth_client *ac);
71
72extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
73 void *buf, size_t len);
74extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
75 void *buf, size_t len,
76 void *reply_buf, size_t reply_len);
77extern int ceph_entity_name_encode(const char *name, void **p, void *end);
78
79extern int ceph_build_auth(struct ceph_auth_client *ac,
80 void *msg_buf, size_t msg_len);
81
82extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
83
84#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
new file mode 100644
index 000000000000..b4ef6f0a6c85
--- /dev/null
+++ b/fs/ceph/auth_none.c
@@ -0,0 +1,121 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7
8#include "auth_none.h"
9#include "auth.h"
10#include "decode.h"
11
12static void reset(struct ceph_auth_client *ac)
13{
14 struct ceph_auth_none_info *xi = ac->private;
15
16 xi->starting = true;
17 xi->built_authorizer = false;
18}
19
20static void destroy(struct ceph_auth_client *ac)
21{
22 kfree(ac->private);
23 ac->private = NULL;
24}
25
26static int is_authenticated(struct ceph_auth_client *ac)
27{
28 struct ceph_auth_none_info *xi = ac->private;
29
30 return !xi->starting;
31}
32
33/*
34 * the generic auth code decodes the global_id, and we carry no actual
35 * authentication state, so nothing happens here.
36 */
37static int handle_reply(struct ceph_auth_client *ac, int result,
38 void *buf, void *end)
39{
40 struct ceph_auth_none_info *xi = ac->private;
41
42 xi->starting = false;
43 return result;
44}
45
46/*
47 * build an 'authorizer' with our entity_name and global_id. we can
48 * reuse a single static copy since it is identical for all services
49 * we connect to.
50 */
51static int ceph_auth_none_create_authorizer(
52 struct ceph_auth_client *ac, int peer_type,
53 struct ceph_authorizer **a,
54 void **buf, size_t *len,
55 void **reply_buf, size_t *reply_len)
56{
57 struct ceph_auth_none_info *ai = ac->private;
58 struct ceph_none_authorizer *au = &ai->au;
59 void *p, *end;
60 int ret;
61
62 if (!ai->built_authorizer) {
63 p = au->buf;
64 end = p + sizeof(au->buf);
65 ceph_encode_8(&p, 1);
66 ret = ceph_entity_name_encode(ac->name, &p, end - 8);
67 if (ret < 0)
68 goto bad;
69 ceph_decode_need(&p, end, sizeof(u64), bad2);
70 ceph_encode_64(&p, ac->global_id);
71 au->buf_len = p - (void *)au->buf;
72 ai->built_authorizer = true;
73 dout("built authorizer len %d\n", au->buf_len);
74 }
75
76 *a = (struct ceph_authorizer *)au;
77 *buf = au->buf;
78 *len = au->buf_len;
79 *reply_buf = au->reply_buf;
80 *reply_len = sizeof(au->reply_buf);
81 return 0;
82
83bad2:
84 ret = -ERANGE;
85bad:
86 return ret;
87}
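/*
 * The resulting wire layout, as encoded above: a version byte (1), the
 * encoded entity name, then the 64-bit global_id. It is identical for
 * every service, which is why a single static copy suffices.
 */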
88
89static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
90 struct ceph_authorizer *a)
91{
92 /* nothing to do */
93}
94
95static const struct ceph_auth_client_ops ceph_auth_none_ops = {
96 .reset = reset,
97 .destroy = destroy,
98 .is_authenticated = is_authenticated,
99 .handle_reply = handle_reply,
100 .create_authorizer = ceph_auth_none_create_authorizer,
101 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
102};
103
104int ceph_auth_none_init(struct ceph_auth_client *ac)
105{
106 struct ceph_auth_none_info *xi;
107
108 dout("ceph_auth_none_init %p\n", ac);
109 xi = kzalloc(sizeof(*xi), GFP_NOFS);
110 if (!xi)
111 return -ENOMEM;
112
113 xi->starting = true;
114 xi->built_authorizer = false;
115
116 ac->protocol = CEPH_AUTH_NONE;
117 ac->private = xi;
118 ac->ops = &ceph_auth_none_ops;
119 return 0;
120}
121
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
new file mode 100644
index 000000000000..56c05533a31c
--- /dev/null
+++ b/fs/ceph/auth_none.h
@@ -0,0 +1,28 @@
1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H
3
4#include "auth.h"
5
6/*
7 * null security mode.
8 *
9 * we use a single static authorizer that simply encodes our entity name
10 * and global id.
11 */
12
13struct ceph_none_authorizer {
14 char buf[128];
15 int buf_len;
16 char reply_buf[0];
17};
18
19struct ceph_auth_none_info {
20 bool starting;
21 bool built_authorizer;
22 struct ceph_none_authorizer au; /* we only need one; it's static */
23};
24
25extern int ceph_auth_none_init(struct ceph_auth_client *ac);
26
27#endif
28
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
new file mode 100644
index 000000000000..8d8a84964763
--- /dev/null
+++ b/fs/ceph/auth_x.c
@@ -0,0 +1,679 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7
8#include "auth_x.h"
9#include "auth_x_protocol.h"
10#include "crypto.h"
11#include "auth.h"
12#include "decode.h"
13
14struct kmem_cache *ceph_x_ticketbuf_cachep;
15
16#define TEMP_TICKET_BUF_LEN 256
17
18static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
19
20static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
21{
22 struct ceph_x_info *xi = ac->private;
23 int need;
24
25 ceph_x_validate_tickets(ac, &need);
26 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
27 ac->want_keys, need, xi->have_keys);
28 return (ac->want_keys & xi->have_keys) == ac->want_keys;
29}
30
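/*
 * Worst-case on-wire size of an encrypted ilen-byte payload: the
 * ceph_x_encrypt_header sealed along with the payload, headroom for
 * block-cipher padding (assuming 16-byte blocks, as with AES), plus
 * the u32 length prefix that ceph_x_encrypt() writes in front of the
 * ciphertext.
 */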
31static int ceph_x_encrypt_buflen(int ilen)
32{
33 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
34 sizeof(u32);
35}
36
37static int ceph_x_encrypt(struct ceph_crypto_key *secret,
38 void *ibuf, int ilen, void *obuf, size_t olen)
39{
40 struct ceph_x_encrypt_header head = {
41 .struct_v = 1,
42 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
43 };
44 size_t len = olen - sizeof(u32);
45 int ret;
46
47 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
48 &head, sizeof(head), ibuf, ilen);
49 if (ret)
50 return ret;
51 ceph_encode_32(&obuf, len);
52 return len + sizeof(u32);
53}
54
55static int ceph_x_decrypt(struct ceph_crypto_key *secret,
56 void **p, void *end, void *obuf, size_t olen)
57{
58 struct ceph_x_encrypt_header head;
59 size_t head_len = sizeof(head);
60 int len, ret;
61
62 len = ceph_decode_32(p);
63 if (*p + len > end)
64 return -EINVAL;
65
66 dout("ceph_x_decrypt len %d\n", len);
67 ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
68 *p, len);
69 if (ret)
70 return ret;
71 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
72 return -EPERM;
73 *p += len;
74 return olen;
75}
76
77/*
78 * get existing (or insert new) ticket handler
79 */
80struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac,
81 int service)
82{
83 struct ceph_x_ticket_handler *th;
84 struct ceph_x_info *xi = ac->private;
85 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
86
87 while (*p) {
88 parent = *p;
89 th = rb_entry(parent, struct ceph_x_ticket_handler, node);
90 if (service < th->service)
91 p = &(*p)->rb_left;
92 else if (service > th->service)
93 p = &(*p)->rb_right;
94 else
95 return th;
96 }
97
98 /* add it */
99 th = kzalloc(sizeof(*th), GFP_NOFS);
100 if (!th)
101 return ERR_PTR(-ENOMEM);
102 th->service = service;
103 rb_link_node(&th->node, parent, p);
104 rb_insert_color(&th->node, &xi->ticket_handlers);
105 return th;
106}
107
108static void remove_ticket_handler(struct ceph_auth_client *ac,
109 struct ceph_x_ticket_handler *th)
110{
111 struct ceph_x_info *xi = ac->private;
112
113 dout("remove_ticket_handler %p %d\n", th, th->service);
114 rb_erase(&th->node, &xi->ticket_handlers);
115 ceph_crypto_key_destroy(&th->session_key);
116 if (th->ticket_blob)
117 ceph_buffer_put(th->ticket_blob);
118 kfree(th);
119}
120
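/*
 * Ticket reply layout, as parsed below:
 *
 *   u8  struct_v (1)
 *   u32 number of tickets
 *   per ticket:
 *     u32 service type
 *     u8  struct_v (1)
 *     { encrypted with our shared secret }:
 *         u8 struct_v, new session key, ceph_timespec validity
 *     u8  is_enc
 *     service ticket blob, encrypted with the old session key when
 *     is_enc is set; the blob itself carries a u8 struct_v, a u64
 *     secret_id and an opaque ticket buffer.
 */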
121static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
122 struct ceph_crypto_key *secret,
123 void *buf, void *end)
124{
125 struct ceph_x_info *xi = ac->private;
126 int num;
127 void *p = buf;
128 int ret;
129 char *dbuf;
130 char *ticket_buf;
131 u8 struct_v;
132
133 dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS | GFP_ATOMIC);
134 if (!dbuf)
135 return -ENOMEM;
136
137 ret = -ENOMEM;
138 ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep,
139 GFP_NOFS | GFP_ATOMIC);
140 if (!ticket_buf)
141 goto out_dbuf;
142
143 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
144 struct_v = ceph_decode_8(&p);
145 if (struct_v != 1)
146 goto bad;
147 num = ceph_decode_32(&p);
148 dout("%d tickets\n", num);
149 while (num--) {
150 int type;
151 u8 struct_v;
152 struct ceph_x_ticket_handler *th;
153 void *dp, *dend;
154 int dlen;
155 char is_enc;
156 struct timespec validity;
157 struct ceph_crypto_key old_key;
158 void *tp, *tpend;
159 struct ceph_timespec new_validity;
160 struct ceph_crypto_key new_session_key;
161 struct ceph_buffer *new_ticket_blob;
162 unsigned long new_expires, new_renew_after;
163 u64 new_secret_id;
164
165 ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
166
167 type = ceph_decode_32(&p);
168 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
169
170 struct_v = ceph_decode_8(&p);
171 if (struct_v != 1)
172 goto bad;
173
174 th = get_ticket_handler(ac, type);
175 if (IS_ERR(th)) {
176 ret = PTR_ERR(th);
177 goto out;
178 }
179
180 /* blob for me */
181 dlen = ceph_x_decrypt(secret, &p, end, dbuf,
182 TEMP_TICKET_BUF_LEN);
183 if (dlen <= 0) {
184 ret = dlen;
185 goto out;
186 }
187 dout(" decrypted %d bytes\n", dlen);
188 dend = dbuf + dlen;
189 dp = dbuf;
190
191 struct_v = ceph_decode_8(&dp);
192 if (struct_v != 1)
193 goto bad;
194
195 memcpy(&old_key, &th->session_key, sizeof(old_key));
196 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
197 if (ret)
198 goto out;
199
200 ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
201 ceph_decode_timespec(&validity, &new_validity);
202 new_expires = get_seconds() + validity.tv_sec;
203 new_renew_after = new_expires - (validity.tv_sec / 4);
204 dout(" expires=%lu renew_after=%lu\n", new_expires,
205 new_renew_after);
206
207 /* ticket blob for service */
208 ceph_decode_8_safe(&p, end, is_enc, bad);
209 tp = ticket_buf;
210 if (is_enc) {
211 /* encrypted */
212 dout(" encrypted ticket\n");
213 dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
214 TEMP_TICKET_BUF_LEN);
215 if (dlen < 0) {
216 ret = dlen;
217 goto out;
218 }
219 dlen = ceph_decode_32(&tp);
220 } else {
221 /* unencrypted */
222 ceph_decode_32_safe(&p, end, dlen, bad);
223 ceph_decode_need(&p, end, dlen, bad);
224 ceph_decode_copy(&p, ticket_buf, dlen);
225 }
226 tpend = tp + dlen;
227 dout(" ticket blob is %d bytes\n", dlen);
228 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
229 struct_v = ceph_decode_8(&tp);
230 new_secret_id = ceph_decode_64(&tp);
231 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
232 if (ret)
233 goto out;
234
235 /* all is well, update our ticket */
236 ceph_crypto_key_destroy(&th->session_key);
237 if (th->ticket_blob)
238 ceph_buffer_put(th->ticket_blob);
239 th->session_key = new_session_key;
240 th->ticket_blob = new_ticket_blob;
241 th->validity = new_validity;
242 th->secret_id = new_secret_id;
243 th->expires = new_expires;
244 th->renew_after = new_renew_after;
245 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
246 type, ceph_entity_type_name(type), th->secret_id,
247 (int)th->ticket_blob->vec.iov_len);
248 xi->have_keys |= th->service;
249 }
250
251 ret = 0;
252out:
253 kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf);
254out_dbuf:
255 kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf);
256 return ret;
257
258bad:
259 ret = -EINVAL;
260 goto out;
261}
262
263static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
264 struct ceph_x_ticket_handler *th,
265 struct ceph_x_authorizer *au)
266{
267 int maxlen;
268 struct ceph_x_authorize_a *msg_a;
269 struct ceph_x_authorize_b msg_b;
270 void *p, *end;
271 int ret;
272 int ticket_blob_len =
273 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
274
275 dout("build_authorizer for %s %p\n",
276 ceph_entity_type_name(th->service), au);
277
278 maxlen = sizeof(*msg_a) + sizeof(msg_b) +
279 ceph_x_encrypt_buflen(ticket_blob_len);
280 dout(" need len %d\n", maxlen);
281 if (au->buf && au->buf->alloc_len < maxlen) {
282 ceph_buffer_put(au->buf);
283 au->buf = NULL;
284 }
285 if (!au->buf) {
286 au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
287 if (!au->buf)
288 return -ENOMEM;
289 }
290 au->service = th->service;
291
292 msg_a = au->buf->vec.iov_base;
293 msg_a->struct_v = 1;
294 msg_a->global_id = cpu_to_le64(ac->global_id);
295 msg_a->service_id = cpu_to_le32(th->service);
296 msg_a->ticket_blob.struct_v = 1;
297 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
298 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
299 if (ticket_blob_len) {
300 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
301 th->ticket_blob->vec.iov_len);
302 }
303 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
304 le64_to_cpu(msg_a->ticket_blob.secret_id));
305
306 p = msg_a + 1;
307 p += ticket_blob_len;
308 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
309
310 get_random_bytes(&au->nonce, sizeof(au->nonce));
311 msg_b.struct_v = 1;
312 msg_b.nonce = cpu_to_le64(au->nonce);
313 ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
314 p, end - p);
315 if (ret < 0)
316 goto out_buf;
317 p += ret;
318 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
319 dout(" built authorizer nonce %llx len %d\n", au->nonce,
320 (int)au->buf->vec.iov_len);
321 BUG_ON(au->buf->vec.iov_len > maxlen);
322 return 0;
323
324out_buf:
325 ceph_buffer_put(au->buf);
326 au->buf = NULL;
327 return ret;
328}
329
330static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
331 void **p, void *end)
332{
333 ceph_decode_need(p, end, 1 + sizeof(u64), bad);
334 ceph_encode_8(p, 1);
335 ceph_encode_64(p, th->secret_id);
336 if (th->ticket_blob) {
337 const char *buf = th->ticket_blob->vec.iov_base;
338 u32 len = th->ticket_blob->vec.iov_len;
339
340 ceph_encode_32_safe(p, end, len, bad);
341 ceph_encode_copy_safe(p, end, buf, len, bad);
342 } else {
343 ceph_encode_32_safe(p, end, 0, bad);
344 }
345
346 return 0;
347bad:
348 return -ERANGE;
349}
350
351static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
352{
353 int want = ac->want_keys;
354 struct ceph_x_info *xi = ac->private;
355 int service;
356
357 *pneed = ac->want_keys & ~(xi->have_keys);
358
359 for (service = 1; service <= want; service <<= 1) {
360 struct ceph_x_ticket_handler *th;
361
362 if (!(ac->want_keys & service))
363 continue;
364
365 if (*pneed & service)
366 continue;
367
368 th = get_ticket_handler(ac, service);
369
370		if (IS_ERR(th)) {
371 *pneed |= service;
372 continue;
373 }
374
375 if (get_seconds() >= th->renew_after)
376 *pneed |= service;
377 if (get_seconds() >= th->expires)
378 xi->have_keys &= ~service;
379 }
380}
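/*
 * For example, with several CEPH_ENTITY_TYPE_* bits set in want_keys,
 * the loop above probes each candidate bit in turn (service = 1, 2,
 * 4, ...); a ticket past its renew_after time sets that service's bit
 * in *pneed, and one past its expiry is also dropped from have_keys.
 */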
381
382
383static int ceph_x_build_request(struct ceph_auth_client *ac,
384 void *buf, void *end)
385{
386 struct ceph_x_info *xi = ac->private;
387 int need;
388 struct ceph_x_request_header *head = buf;
389 int ret;
390 struct ceph_x_ticket_handler *th =
391 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
392
393 ceph_x_validate_tickets(ac, &need);
394
395 dout("build_request want %x have %x need %x\n",
396 ac->want_keys, xi->have_keys, need);
397
398 if (need & CEPH_ENTITY_TYPE_AUTH) {
399 struct ceph_x_authenticate *auth = (void *)(head + 1);
400 void *p = auth + 1;
401 struct ceph_x_challenge_blob tmp;
402 char tmp_enc[40];
403 u64 *u;
404
405 if (p > end)
406 return -ERANGE;
407
408 dout(" get_auth_session_key\n");
409 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
410
411 /* encrypt and hash */
412 get_random_bytes(&auth->client_challenge, sizeof(u64));
413 tmp.client_challenge = auth->client_challenge;
414 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
415 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
416 tmp_enc, sizeof(tmp_enc));
417 if (ret < 0)
418 return ret;
419
420 auth->struct_v = 1;
421 auth->key = 0;
422 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
423 auth->key ^= *u;
424 dout(" server_challenge %llx client_challenge %llx key %llx\n",
425 xi->server_challenge, le64_to_cpu(auth->client_challenge),
426 le64_to_cpu(auth->key));
427
428		/* now encode the old ticket, if one exists */
429 ret = ceph_x_encode_ticket(th, &p, end);
430 if (ret < 0)
431 return ret;
432
433 return p - buf;
434 }
435
436 if (need) {
437 void *p = head + 1;
438 struct ceph_x_service_ticket_request *req;
439
440 if (p > end)
441 return -ERANGE;
442 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
443
444 BUG_ON(!th);
445 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
446 if (ret)
447 return ret;
448 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
449 xi->auth_authorizer.buf->vec.iov_len);
450
451 req = p;
452 req->keys = cpu_to_le32(need);
453 p += sizeof(*req);
454 return p - buf;
455 }
456
457 return 0;
458}
459
460static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
461 void *buf, void *end)
462{
463 struct ceph_x_info *xi = ac->private;
464 struct ceph_x_reply_header *head = buf;
465 struct ceph_x_ticket_handler *th;
466 int len = end - buf;
467 int op;
468 int ret;
469
470 if (result)
471 return result; /* XXX hmm? */
472
473 if (xi->starting) {
474 /* it's a hello */
475 struct ceph_x_server_challenge *sc = buf;
476
477 if (len != sizeof(*sc))
478 return -EINVAL;
479 xi->server_challenge = le64_to_cpu(sc->server_challenge);
480 dout("handle_reply got server challenge %llx\n",
481 xi->server_challenge);
482 xi->starting = false;
483 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
484 return -EAGAIN;
485 }
486
487 op = le32_to_cpu(head->op);
488 result = le32_to_cpu(head->result);
489 dout("handle_reply op %d result %d\n", op, result);
490 switch (op) {
491 case CEPHX_GET_AUTH_SESSION_KEY:
492 /* verify auth key */
493 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
494 buf + sizeof(*head), end);
495 break;
496
497 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
498 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
499 BUG_ON(!th);
500 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
501 buf + sizeof(*head), end);
502 break;
503
504 default:
505 return -EINVAL;
506 }
507 if (ret)
508 return ret;
509 if (ac->want_keys == xi->have_keys)
510 return 0;
511 return -EAGAIN;
512}
513
514static int ceph_x_create_authorizer(
515 struct ceph_auth_client *ac, int peer_type,
516 struct ceph_authorizer **a,
517 void **buf, size_t *len,
518 void **reply_buf, size_t *reply_len)
519{
520 struct ceph_x_authorizer *au;
521 struct ceph_x_ticket_handler *th;
522 int ret;
523
524 th = get_ticket_handler(ac, peer_type);
525 if (IS_ERR(th))
526 return PTR_ERR(th);
527
528 au = kzalloc(sizeof(*au), GFP_NOFS);
529 if (!au)
530 return -ENOMEM;
531
532 ret = ceph_x_build_authorizer(ac, th, au);
533 if (ret) {
534 kfree(au);
535 return ret;
536 }
537
538 *a = (struct ceph_authorizer *)au;
539 *buf = au->buf->vec.iov_base;
540 *len = au->buf->vec.iov_len;
541 *reply_buf = au->reply_buf;
542 *reply_len = sizeof(au->reply_buf);
543 return 0;
544}
545
546static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
547 struct ceph_authorizer *a, size_t len)
548{
549 struct ceph_x_authorizer *au = (void *)a;
550 struct ceph_x_ticket_handler *th;
551 int ret = 0;
552 struct ceph_x_authorize_reply reply;
553 void *p = au->reply_buf;
554 void *end = p + sizeof(au->reply_buf);
555
556 th = get_ticket_handler(ac, au->service);
557	if (IS_ERR(th))
558 return -EIO; /* hrm! */
559 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
560 if (ret < 0)
561 return ret;
562 if (ret != sizeof(reply))
563 return -EPERM;
564
565 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
566 ret = -EPERM;
567 else
568 ret = 0;
569 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
570 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
571 return ret;
572}
573
574static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
575 struct ceph_authorizer *a)
576{
577 struct ceph_x_authorizer *au = (void *)a;
578
579 ceph_buffer_put(au->buf);
580 kfree(au);
581}
582
583
584static void ceph_x_reset(struct ceph_auth_client *ac)
585{
586 struct ceph_x_info *xi = ac->private;
587
588 dout("reset\n");
589 xi->starting = true;
590 xi->server_challenge = 0;
591}
592
593static void ceph_x_destroy(struct ceph_auth_client *ac)
594{
595 struct ceph_x_info *xi = ac->private;
596 struct rb_node *p;
597
598 dout("ceph_x_destroy %p\n", ac);
599 ceph_crypto_key_destroy(&xi->secret);
600
601 while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
602 struct ceph_x_ticket_handler *th =
603 rb_entry(p, struct ceph_x_ticket_handler, node);
604 remove_ticket_handler(ac, th);
605 }
606
607 kmem_cache_destroy(ceph_x_ticketbuf_cachep);
608
609 kfree(ac->private);
610 ac->private = NULL;
611}
612
613static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
614 int peer_type)
615{
616 struct ceph_x_ticket_handler *th;
617
618 th = get_ticket_handler(ac, peer_type);
619 if (th && !IS_ERR(th))
620 remove_ticket_handler(ac, th);
621}
622
623
624static const struct ceph_auth_client_ops ceph_x_ops = {
625 .is_authenticated = ceph_x_is_authenticated,
626 .build_request = ceph_x_build_request,
627 .handle_reply = ceph_x_handle_reply,
628 .create_authorizer = ceph_x_create_authorizer,
629 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
630 .destroy_authorizer = ceph_x_destroy_authorizer,
631 .invalidate_authorizer = ceph_x_invalidate_authorizer,
632 .reset = ceph_x_reset,
633 .destroy = ceph_x_destroy,
634};
635
636
637int ceph_x_init(struct ceph_auth_client *ac)
638{
639 struct ceph_x_info *xi;
640 int ret;
641
642 dout("ceph_x_init %p\n", ac);
643 xi = kzalloc(sizeof(*xi), GFP_NOFS);
644 if (!xi)
645 return -ENOMEM;
646
647 ret = -ENOMEM;
648 ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf",
649 TEMP_TICKET_BUF_LEN, 8,
650 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
651 NULL);
652 if (!ceph_x_ticketbuf_cachep)
653 goto done_nomem;
654 ret = -EINVAL;
655 if (!ac->secret) {
656 pr_err("no secret set (for auth_x protocol)\n");
657 goto done_nomem;
658 }
659
660 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
661 if (ret)
662 goto done_nomem;
663
664 xi->starting = true;
665 xi->ticket_handlers = RB_ROOT;
666
667 ac->protocol = CEPH_AUTH_CEPHX;
668 ac->private = xi;
669 ac->ops = &ceph_x_ops;
670 return 0;
671
672done_nomem:
673 kfree(xi);
674 if (ceph_x_ticketbuf_cachep)
675 kmem_cache_destroy(ceph_x_ticketbuf_cachep);
676 return ret;
677}
678
679
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
new file mode 100644
index 000000000000..ff6f8180e681
--- /dev/null
+++ b/fs/ceph/auth_x.h
@@ -0,0 +1,49 @@
1#ifndef _FS_CEPH_AUTH_X_H
2#define _FS_CEPH_AUTH_X_H
3
4#include <linux/rbtree.h>
5
6#include "crypto.h"
7#include "auth.h"
8#include "auth_x_protocol.h"
9
10/*
11 * Handle ticket for a single service.
12 */
13struct ceph_x_ticket_handler {
14 struct rb_node node;
15 unsigned service;
16
17 struct ceph_crypto_key session_key;
18 struct ceph_timespec validity;
19
20 u64 secret_id;
21 struct ceph_buffer *ticket_blob;
22
23 unsigned long renew_after, expires;
24};
25
26
27struct ceph_x_authorizer {
28 struct ceph_buffer *buf;
29 unsigned service;
30 u64 nonce;
31 char reply_buf[128]; /* big enough for encrypted blob */
32};
33
34struct ceph_x_info {
35 struct ceph_crypto_key secret;
36
37 bool starting;
38 u64 server_challenge;
39
40 unsigned have_keys;
41 struct rb_root ticket_handlers;
42
43 struct ceph_x_authorizer auth_authorizer;
44};
45
46extern int ceph_x_init(struct ceph_auth_client *ac);
47
48#endif
49
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
new file mode 100644
index 000000000000..671d30576c4f
--- /dev/null
+++ b/fs/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
1#ifndef __FS_CEPH_AUTH_X_PROTOCOL
2#define __FS_CEPH_AUTH_X_PROTOCOL
3
4#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
5#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
6#define CEPHX_GET_ROTATING_KEY 0x0400
7
8/* common bits */
9struct ceph_x_ticket_blob {
10 __u8 struct_v;
11 __le64 secret_id;
12 __le32 blob_len;
13 char blob[];
14} __attribute__ ((packed));
15
16
17/* common request/reply headers */
18struct ceph_x_request_header {
19 __le16 op;
20} __attribute__ ((packed));
21
22struct ceph_x_reply_header {
23 __le16 op;
24 __le32 result;
25} __attribute__ ((packed));
26
27
28/* authenticate handshake */
29
30/* initial hello (no reply header) */
31struct ceph_x_server_challenge {
32 __u8 struct_v;
33 __le64 server_challenge;
34} __attribute__ ((packed));
35
36struct ceph_x_authenticate {
37 __u8 struct_v;
38 __le64 client_challenge;
39 __le64 key;
40 /* ticket blob */
41} __attribute__ ((packed));
42
43struct ceph_x_service_ticket_request {
44 __u8 struct_v;
45 __le32 keys;
46} __attribute__ ((packed));
47
48struct ceph_x_challenge_blob {
49 __le64 server_challenge;
50 __le64 client_challenge;
51} __attribute__ ((packed));
52
53
54
55/* authorize handshake */
56
57/*
58 * The authorizer consists of two pieces:
59 * a - service id, ticket blob
60 * b - encrypted with session key
61 */
62struct ceph_x_authorize_a {
63 __u8 struct_v;
64 __le64 global_id;
65 __le32 service_id;
66 struct ceph_x_ticket_blob ticket_blob;
67} __attribute__ ((packed));
68
69struct ceph_x_authorize_b {
70 __u8 struct_v;
71 __le64 nonce;
72} __attribute__ ((packed));
73
74struct ceph_x_authorize_reply {
75 __u8 struct_v;
76 __le64 nonce_plus_one;
77} __attribute__ ((packed));
78
79
80/*
81 * encryption bundle
82 */
83#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
84
85struct ceph_x_encrypt_header {
86 __u8 struct_v;
87 __le64 magic;
88} __attribute__ ((packed));
89
90#endif
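Together with ceph_x_build_authorizer() in auth_x.c, the authorize structs
above determine the complete on-wire authorizer. A sketch of the layout as
that function assembles it:

	struct ceph_x_authorize_a	plaintext: struct_v, global_id,
					service_id, ticket blob header
	<blob bytes>			ticket_blob.blob_len bytes
	u32 length			written by ceph_x_encrypt()
	ciphertext of:
	    struct ceph_x_encrypt_header	struct_v, CEPHX_ENC_MAGIC
	    struct ceph_x_authorize_b		struct_v, random nonce

The service proves possession of the session key by replying with a
ceph_x_authorize_reply carrying nonce + 1, which
ceph_x_verify_authorizer_reply() checks.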
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
new file mode 100644
index 000000000000..b98086c7aeba
--- /dev/null
+++ b/fs/ceph/buffer.c
@@ -0,0 +1,78 @@
1
2#include "ceph_debug.h"
3#include "buffer.h"
4#include "decode.h"
5
6struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
7{
8 struct ceph_buffer *b;
9
10 b = kmalloc(sizeof(*b), gfp);
11 if (!b)
12 return NULL;
13
14 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
15 if (b->vec.iov_base) {
16 b->is_vmalloc = false;
17 } else {
18 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
19 if (!b->vec.iov_base) {
20 kfree(b);
21 return NULL;
22 }
23 b->is_vmalloc = true;
24 }
25
26 kref_init(&b->kref);
27 b->alloc_len = len;
28 b->vec.iov_len = len;
29 dout("buffer_new %p\n", b);
30 return b;
31}
32
33void ceph_buffer_release(struct kref *kref)
34{
35 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
36
37 dout("buffer_release %p\n", b);
38 if (b->vec.iov_base) {
39 if (b->is_vmalloc)
40 vfree(b->vec.iov_base);
41 else
42 kfree(b->vec.iov_base);
43 }
44 kfree(b);
45}
46
47int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
48{
49 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
50 if (b->vec.iov_base) {
51 b->is_vmalloc = false;
52 } else {
53 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
54 b->is_vmalloc = true;
55 }
56 if (!b->vec.iov_base)
57 return -ENOMEM;
58 b->alloc_len = len;
59 b->vec.iov_len = len;
60 return 0;
61}
62
63int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
64{
65 size_t len;
66
67 ceph_decode_need(p, end, sizeof(u32), bad);
68 len = ceph_decode_32(p);
69 dout("decode_buffer len %d\n", (int)len);
70 ceph_decode_need(p, end, len, bad);
71 *b = ceph_buffer_new(len, GFP_NOFS);
72 if (!*b)
73 return -ENOMEM;
74 ceph_decode_copy(p, (*b)->vec.iov_base, len);
75 return 0;
76bad:
77 return -EINVAL;
78}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
new file mode 100644
index 000000000000..58d19014068f
--- /dev/null
+++ b/fs/ceph/buffer.h
@@ -0,0 +1,39 @@
1#ifndef __FS_CEPH_BUFFER_H
2#define __FS_CEPH_BUFFER_H
3
4#include <linux/kref.h>
5#include <linux/mm.h>
6#include <linux/vmalloc.h>
7#include <linux/types.h>
8#include <linux/uio.h>
9
10/*
11 * a simple reference counted buffer.
12 *
13 * use kmalloc for small sizes (<= one page), vmalloc for larger
14 * sizes.
15 */
16struct ceph_buffer {
17 struct kref kref;
18 struct kvec vec;
19 size_t alloc_len;
20 bool is_vmalloc;
21};
22
23extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
24extern void ceph_buffer_release(struct kref *kref);
25
26static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
27{
28 kref_get(&b->kref);
29 return b;
30}
31
32static inline void ceph_buffer_put(struct ceph_buffer *b)
33{
34 kref_put(&b->kref, ceph_buffer_release);
35}
36
37extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
38
39#endif
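A minimal usage sketch (hypothetical caller; data and data_len are
illustrative, and the buffer is created holding one reference):

	struct ceph_buffer *b = ceph_buffer_new(data_len, GFP_NOFS);
	if (!b)
		return -ENOMEM;
	memcpy(b->vec.iov_base, data, data_len);
	ceph_buffer_get(b);	/* e.g. attach to an outgoing message */
	ceph_buffer_put(b);	/* drop the message's reference */
	ceph_buffer_put(b);	/* final put frees via ceph_buffer_release */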
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
new file mode 100644
index 000000000000..7d0a0d0adc18
--- /dev/null
+++ b/fs/ceph/caps.c
@@ -0,0 +1,2932 @@
1#include "ceph_debug.h"
2
3#include <linux/fs.h>
4#include <linux/kernel.h>
5#include <linux/sched.h>
6#include <linux/vmalloc.h>
7#include <linux/wait.h>
8#include <linux/writeback.h>
9
10#include "super.h"
11#include "decode.h"
12#include "messenger.h"
13
14/*
15 * Capability management
16 *
17 * The Ceph metadata servers control client access to inode metadata
18 * and file data by issuing capabilities, granting clients permission
19 * to read and/or write both inode fields and file data to OSDs
20 * (storage nodes). Each capability consists of a set of bits
21 * indicating which operations are allowed.
22 *
23 * If the client holds a *_SHARED cap, the client has a coherent value
24 * that can be safely read from the cached inode.
25 *
26 * In the case of *_EXCL (exclusive) or FILE_WR capabilities, the
27 * client is allowed to change inode attributes (e.g., file size,
28 * mtime), note its dirty state in the ceph_cap, and asynchronously
29 * flush that metadata change to the MDS.
30 *
31 * In the event of a conflicting operation (perhaps by another
32 * client), the MDS will revoke the conflicting client capabilities.
33 *
34 * In order for a client to cache an inode, it must hold a capability
35 * with at least one MDS server. When inodes are released, release
36 * notifications are batched and periodically sent en masse to the MDS
37 * cluster to release server state.
38 */
39
40
41/*
42 * Generate readable cap strings for debugging output.
43 */
44#define MAX_CAP_STR 20
45static char cap_str[MAX_CAP_STR][40];
46static DEFINE_SPINLOCK(cap_str_lock);
47static int last_cap_str;
48
49static char *gcap_string(char *s, int c)
50{
51 if (c & CEPH_CAP_GSHARED)
52 *s++ = 's';
53 if (c & CEPH_CAP_GEXCL)
54 *s++ = 'x';
55 if (c & CEPH_CAP_GCACHE)
56 *s++ = 'c';
57 if (c & CEPH_CAP_GRD)
58 *s++ = 'r';
59 if (c & CEPH_CAP_GWR)
60 *s++ = 'w';
61 if (c & CEPH_CAP_GBUFFER)
62 *s++ = 'b';
63 if (c & CEPH_CAP_GLAZYIO)
64 *s++ = 'l';
65 return s;
66}
67
68const char *ceph_cap_string(int caps)
69{
70 int i;
71 char *s;
72 int c;
73
74 spin_lock(&cap_str_lock);
75 i = last_cap_str++;
76 if (last_cap_str == MAX_CAP_STR)
77 last_cap_str = 0;
78 spin_unlock(&cap_str_lock);
79
80 s = cap_str[i];
81
82 if (caps & CEPH_CAP_PIN)
83 *s++ = 'p';
84
85 c = (caps >> CEPH_CAP_SAUTH) & 3;
86 if (c) {
87 *s++ = 'A';
88 s = gcap_string(s, c);
89 }
90
91 c = (caps >> CEPH_CAP_SLINK) & 3;
92 if (c) {
93 *s++ = 'L';
94 s = gcap_string(s, c);
95 }
96
97 c = (caps >> CEPH_CAP_SXATTR) & 3;
98 if (c) {
99 *s++ = 'X';
100 s = gcap_string(s, c);
101 }
102
103 c = caps >> CEPH_CAP_SFILE;
104 if (c) {
105 *s++ = 'F';
106 s = gcap_string(s, c);
107 }
108
109 if (s == cap_str[i])
110 *s++ = '-';
111 *s = 0;
112 return cap_str[i];
113}
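/*
 * For example, caps with PIN set, GSHARED|GEXCL in the AUTH field and
 * GSHARED in the FILE field render as "pAsxFs"; caps == 0 renders
 * as "-".
 */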
114
115/*
116 * Cap reservations
117 *
118 * Maintain a global pool of preallocated struct ceph_cap objects, referenced
119 * by struct ceph_cap_reservation contexts. This ensures that we preallocate
120 * memory needed to successfully process an MDS response. (If an MDS
121 * sends us cap information and we fail to process it, we will have
122 * problems due to the client and MDS being out of sync.)
123 *
124 * Reservations are 'owned' by a ceph_cap_reservation context.
125 */
126static spinlock_t caps_list_lock;
127static struct list_head caps_list; /* unused (reserved or unreserved) */
128static int caps_total_count; /* total caps allocated */
129static int caps_use_count; /* in use */
130static int caps_reserve_count; /* unused, reserved */
131static int caps_avail_count; /* unused, unreserved */
132static int caps_min_count; /* keep at least this many (unreserved) */
133
134void __init ceph_caps_init(void)
135{
136 INIT_LIST_HEAD(&caps_list);
137 spin_lock_init(&caps_list_lock);
138}
139
140void ceph_caps_finalize(void)
141{
142 struct ceph_cap *cap;
143
144 spin_lock(&caps_list_lock);
145 while (!list_empty(&caps_list)) {
146 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
147 list_del(&cap->caps_item);
148 kmem_cache_free(ceph_cap_cachep, cap);
149 }
150 caps_total_count = 0;
151 caps_avail_count = 0;
152 caps_use_count = 0;
153 caps_reserve_count = 0;
154 caps_min_count = 0;
155 spin_unlock(&caps_list_lock);
156}
157
158void ceph_adjust_min_caps(int delta)
159{
160 spin_lock(&caps_list_lock);
161 caps_min_count += delta;
162 BUG_ON(caps_min_count < 0);
163 spin_unlock(&caps_list_lock);
164}
165
166int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
167{
168 int i;
169 struct ceph_cap *cap;
170 int have;
171 int alloc = 0;
172 LIST_HEAD(newcaps);
173 int ret = 0;
174
175 dout("reserve caps ctx=%p need=%d\n", ctx, need);
176
177 /* first reserve any caps that are already allocated */
178 spin_lock(&caps_list_lock);
179 if (caps_avail_count >= need)
180 have = need;
181 else
182 have = caps_avail_count;
183 caps_avail_count -= have;
184 caps_reserve_count += have;
185 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
186 caps_avail_count);
187 spin_unlock(&caps_list_lock);
188
189 for (i = have; i < need; i++) {
190 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
191 if (!cap) {
192 ret = -ENOMEM;
193 goto out_alloc_count;
194 }
195 list_add(&cap->caps_item, &newcaps);
196 alloc++;
197 }
198 BUG_ON(have + alloc != need);
199
200 spin_lock(&caps_list_lock);
201 caps_total_count += alloc;
202 caps_reserve_count += alloc;
203 list_splice(&newcaps, &caps_list);
204
205 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
206 caps_avail_count);
207 spin_unlock(&caps_list_lock);
208
209 ctx->count = need;
210 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
211 ctx, caps_total_count, caps_use_count, caps_reserve_count,
212 caps_avail_count);
213 return 0;
214
215out_alloc_count:
216 /* we didn't manage to reserve as much as we needed */
217 pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
218 ctx, need, have);
219 return ret;
220}
221
222int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
223{
224 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
225 if (ctx->count) {
226 spin_lock(&caps_list_lock);
227 BUG_ON(caps_reserve_count < ctx->count);
228 caps_reserve_count -= ctx->count;
229 caps_avail_count += ctx->count;
230 ctx->count = 0;
231 dout("unreserve caps %d = %d used + %d resv + %d avail\n",
232 caps_total_count, caps_use_count, caps_reserve_count,
233 caps_avail_count);
234 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
235 caps_avail_count);
236 spin_unlock(&caps_list_lock);
237 }
238 return 0;
239}
240
241static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
242{
243 struct ceph_cap *cap = NULL;
244
245 /* temporary, until we do something about cap import/export */
246 if (!ctx)
247 return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
248
249 spin_lock(&caps_list_lock);
250 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
251 ctx, ctx->count, caps_total_count, caps_use_count,
252 caps_reserve_count, caps_avail_count);
253 BUG_ON(!ctx->count);
254 BUG_ON(ctx->count > caps_reserve_count);
255 BUG_ON(list_empty(&caps_list));
256
257 ctx->count--;
258 caps_reserve_count--;
259 caps_use_count++;
260
261 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
262 list_del(&cap->caps_item);
263
264 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
265 caps_avail_count);
266 spin_unlock(&caps_list_lock);
267 return cap;
268}
269
270void ceph_put_cap(struct ceph_cap *cap)
271{
272 spin_lock(&caps_list_lock);
273 dout("put_cap %p %d = %d used + %d resv + %d avail\n",
274 cap, caps_total_count, caps_use_count,
275 caps_reserve_count, caps_avail_count);
276 caps_use_count--;
277 /*
278	 * Keep some preallocated caps around (caps_min_count), to
279 * avoid lots of free/alloc churn.
280 */
281 if (caps_avail_count >= caps_reserve_count + caps_min_count) {
282 caps_total_count--;
283 kmem_cache_free(ceph_cap_cachep, cap);
284 } else {
285 caps_avail_count++;
286 list_add(&cap->caps_item, &caps_list);
287 }
288
289 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
290 caps_avail_count);
291 spin_unlock(&caps_list_lock);
292}
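/*
 * Lifecycle sketch (hypothetical caller, e.g. while preparing to
 * process an MDS reply):
 *
 *	struct ceph_cap_reservation ctx = { .count = 0 };
 *
 *	ceph_reserve_caps(&ctx, need);     preallocate 'need' caps
 *	...pass &ctx to ceph_add_cap(), which draws from the
 *	   reservation via get_cap() and so cannot hit ENOMEM...
 *	ceph_unreserve_caps(&ctx);         return any unused caps
 */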
293
294void ceph_reservation_status(struct ceph_client *client,
295 int *total, int *avail, int *used, int *reserved,
296 int *min)
297{
298 if (total)
299 *total = caps_total_count;
300 if (avail)
301 *avail = caps_avail_count;
302 if (used)
303 *used = caps_use_count;
304 if (reserved)
305 *reserved = caps_reserve_count;
306 if (min)
307 *min = caps_min_count;
308}
309
310/*
311 * Find ceph_cap for given mds, if any.
312 *
313 * Called with i_lock held.
314 */
315static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
316{
317 struct ceph_cap *cap;
318 struct rb_node *n = ci->i_caps.rb_node;
319
320 while (n) {
321 cap = rb_entry(n, struct ceph_cap, ci_node);
322 if (mds < cap->mds)
323 n = n->rb_left;
324 else if (mds > cap->mds)
325 n = n->rb_right;
326 else
327 return cap;
328 }
329 return NULL;
330}
331
332/*
333 * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
334 * -1.
335 */
336static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
337{
338 struct ceph_cap *cap;
339 int mds = -1;
340 struct rb_node *p;
341
342 /* prefer mds with WR|WRBUFFER|EXCL caps */
343 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
344 cap = rb_entry(p, struct ceph_cap, ci_node);
345 mds = cap->mds;
346 if (mseq)
347 *mseq = cap->mseq;
348 if (cap->issued & (CEPH_CAP_FILE_WR |
349 CEPH_CAP_FILE_BUFFER |
350 CEPH_CAP_FILE_EXCL))
351 break;
352 }
353 return mds;
354}
355
356int ceph_get_cap_mds(struct inode *inode)
357{
358 int mds;
359 spin_lock(&inode->i_lock);
360 mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
361 spin_unlock(&inode->i_lock);
362 return mds;
363}
364
365/*
366 * Called under i_lock.
367 */
368static void __insert_cap_node(struct ceph_inode_info *ci,
369 struct ceph_cap *new)
370{
371 struct rb_node **p = &ci->i_caps.rb_node;
372 struct rb_node *parent = NULL;
373 struct ceph_cap *cap = NULL;
374
375 while (*p) {
376 parent = *p;
377 cap = rb_entry(parent, struct ceph_cap, ci_node);
378 if (new->mds < cap->mds)
379 p = &(*p)->rb_left;
380 else if (new->mds > cap->mds)
381 p = &(*p)->rb_right;
382 else
383 BUG();
384 }
385
386 rb_link_node(&new->ci_node, parent, p);
387 rb_insert_color(&new->ci_node, &ci->i_caps);
388}
389
390/*
391 * (re)set cap hold timeouts, which control the delayed release
392 * of unused caps back to the MDS. Should be called on cap use.
393 */
394static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
395 struct ceph_inode_info *ci)
396{
397 struct ceph_mount_args *ma = mdsc->client->mount_args;
398
399 ci->i_hold_caps_min = round_jiffies(jiffies +
400 ma->caps_wanted_delay_min * HZ);
401 ci->i_hold_caps_max = round_jiffies(jiffies +
402 ma->caps_wanted_delay_max * HZ);
403 dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
404 ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
405}
406
407/*
408 * (Re)queue the inode at the end of the delayed cap release list.
409 *
410 * If I_FLUSH is set, leave the inode at the front of the list.
411 *
412 * Caller holds i_lock
413 * -> we take mdsc->cap_delay_lock
414 */
415static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
416 struct ceph_inode_info *ci)
417{
418 __cap_set_timeouts(mdsc, ci);
419 dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
420 ci->i_ceph_flags, ci->i_hold_caps_max);
421 if (!mdsc->stopping) {
422 spin_lock(&mdsc->cap_delay_lock);
423 if (!list_empty(&ci->i_cap_delay_list)) {
424 if (ci->i_ceph_flags & CEPH_I_FLUSH)
425 goto no_change;
426 list_del_init(&ci->i_cap_delay_list);
427 }
428 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
429no_change:
430 spin_unlock(&mdsc->cap_delay_lock);
431 }
432}
433
434/*
435 * Queue an inode for immediate writeback. Mark inode with I_FLUSH,
436 * indicating we should send a cap message to flush dirty metadata
437 * asap, and move to the front of the delayed cap list.
438 */
439static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
440 struct ceph_inode_info *ci)
441{
442 dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
443 spin_lock(&mdsc->cap_delay_lock);
444 ci->i_ceph_flags |= CEPH_I_FLUSH;
445 if (!list_empty(&ci->i_cap_delay_list))
446 list_del_init(&ci->i_cap_delay_list);
447 list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
448 spin_unlock(&mdsc->cap_delay_lock);
449}
450
451/*
452 * Cancel delayed work on cap.
453 *
454 * Caller must hold i_lock.
455 */
456static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
457 struct ceph_inode_info *ci)
458{
459 dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
460 if (list_empty(&ci->i_cap_delay_list))
461 return;
462 spin_lock(&mdsc->cap_delay_lock);
463 list_del_init(&ci->i_cap_delay_list);
464 spin_unlock(&mdsc->cap_delay_lock);
465}
466
467/*
468 * Common issue checks for add_cap, handle_cap_grant.
469 */
470static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
471 unsigned issued)
472{
473 unsigned had = __ceph_caps_issued(ci, NULL);
474
475 /*
476 * Each time we receive FILE_CACHE anew, we increment
477 * i_rdcache_gen.
478 */
479 if ((issued & CEPH_CAP_FILE_CACHE) &&
480 (had & CEPH_CAP_FILE_CACHE) == 0)
481 ci->i_rdcache_gen++;
482
483 /*
484 * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
485 * don't know what happened to this directory while we didn't
486 * have the cap.
487 */
488 if ((issued & CEPH_CAP_FILE_SHARED) &&
489 (had & CEPH_CAP_FILE_SHARED) == 0) {
490 ci->i_shared_gen++;
491 if (S_ISDIR(ci->vfs_inode.i_mode)) {
492 dout(" marking %p NOT complete\n", &ci->vfs_inode);
493 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
494 }
495 }
496}
497
498/*
499 * Add a capability under the given MDS session.
500 *
501 * Caller should hold session snap_rwsem (read) and s_mutex.
502 *
503 * @fmode is the open file mode, if we are opening a file, otherwise
504 * it is < 0. (This is so we can atomically add the cap and add an
505 * open file reference to it.)
506 */
507int ceph_add_cap(struct inode *inode,
508 struct ceph_mds_session *session, u64 cap_id,
509 int fmode, unsigned issued, unsigned wanted,
510 unsigned seq, unsigned mseq, u64 realmino, int flags,
511 struct ceph_cap_reservation *caps_reservation)
512{
513 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
514 struct ceph_inode_info *ci = ceph_inode(inode);
515 struct ceph_cap *new_cap = NULL;
516 struct ceph_cap *cap;
517 int mds = session->s_mds;
518 int actual_wanted;
519
520 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
521 session->s_mds, cap_id, ceph_cap_string(issued), seq);
522
523 /*
524 * If we are opening the file, include file mode wanted bits
525 * in wanted.
526 */
527 if (fmode >= 0)
528 wanted |= ceph_caps_for_mode(fmode);
529
530retry:
531 spin_lock(&inode->i_lock);
532 cap = __get_cap_for_mds(ci, mds);
533 if (!cap) {
534 if (new_cap) {
535 cap = new_cap;
536 new_cap = NULL;
537 } else {
538 spin_unlock(&inode->i_lock);
539 new_cap = get_cap(caps_reservation);
540 if (new_cap == NULL)
541 return -ENOMEM;
542 goto retry;
543 }
544
545 cap->issued = 0;
546 cap->implemented = 0;
547 cap->mds = mds;
548 cap->mds_wanted = 0;
549
550 cap->ci = ci;
551 __insert_cap_node(ci, cap);
552
553 /* clear out old exporting info? (i.e. on cap import) */
554 if (ci->i_cap_exporting_mds == mds) {
555 ci->i_cap_exporting_issued = 0;
556 ci->i_cap_exporting_mseq = 0;
557 ci->i_cap_exporting_mds = -1;
558 }
559
560 /* add to session cap list */
561 cap->session = session;
562 spin_lock(&session->s_cap_lock);
563 list_add_tail(&cap->session_caps, &session->s_caps);
564 session->s_nr_caps++;
565 spin_unlock(&session->s_cap_lock);
566 }
567
568 if (!ci->i_snap_realm) {
569 /*
570 * add this inode to the appropriate snap realm
571 */
572 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
573 realmino);
574 if (realm) {
575 ceph_get_snap_realm(mdsc, realm);
576 spin_lock(&realm->inodes_with_caps_lock);
577 ci->i_snap_realm = realm;
578 list_add(&ci->i_snap_realm_item,
579 &realm->inodes_with_caps);
580 spin_unlock(&realm->inodes_with_caps_lock);
581 } else {
582 pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
583 realmino);
584 }
585 }
586
587 __check_cap_issue(ci, cap, issued);
588
589 /*
590 * If we are issued caps we don't want, or the mds' wanted
591 * value appears to be off, queue a check so we'll release
592 * later and/or update the mds wanted value.
593 */
594 actual_wanted = __ceph_caps_wanted(ci);
595 if ((wanted & ~actual_wanted) ||
596 (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
597 dout(" issued %s, mds wanted %s, actual %s, queueing\n",
598 ceph_cap_string(issued), ceph_cap_string(wanted),
599 ceph_cap_string(actual_wanted));
600 __cap_delay_requeue(mdsc, ci);
601 }
602
603 if (flags & CEPH_CAP_FLAG_AUTH)
604 ci->i_auth_cap = cap;
605 else if (ci->i_auth_cap == cap)
606 ci->i_auth_cap = NULL;
607
608 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
609 inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
610 ceph_cap_string(issued|cap->issued), seq, mds);
611 cap->cap_id = cap_id;
612 cap->issued = issued;
613 cap->implemented |= issued;
614 cap->mds_wanted |= wanted;
615 cap->seq = seq;
616 cap->issue_seq = seq;
617 cap->mseq = mseq;
618 cap->cap_gen = session->s_cap_gen;
619
620 if (fmode >= 0)
621 __ceph_get_fmode(ci, fmode);
622 spin_unlock(&inode->i_lock);
623 wake_up(&ci->i_cap_wq);
624 return 0;
625}
626
627/*
628 * Return true if cap has not timed out and belongs to the current
629 * generation of the MDS session (i.e. has not gone 'stale' due to
630 * us losing touch with the mds).
631 */
632static int __cap_is_valid(struct ceph_cap *cap)
633{
634 unsigned long ttl;
635 u32 gen;
636
637 spin_lock(&cap->session->s_cap_lock);
638 gen = cap->session->s_cap_gen;
639 ttl = cap->session->s_cap_ttl;
640 spin_unlock(&cap->session->s_cap_lock);
641
642 if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
643 dout("__cap_is_valid %p cap %p issued %s "
644 "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
645 cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
646 return 0;
647 }
648
649 return 1;
650}
651
652/*
653 * Return set of valid cap bits issued to us. Note that caps time
654 * out, and may be invalidated in bulk if the client session times out
655 * and session->s_cap_gen is bumped.
656 */
657int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
658{
659 int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
660 struct ceph_cap *cap;
661 struct rb_node *p;
662
663 if (implemented)
664 *implemented = 0;
665 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
666 cap = rb_entry(p, struct ceph_cap, ci_node);
667 if (!__cap_is_valid(cap))
668 continue;
669 dout("__ceph_caps_issued %p cap %p issued %s\n",
670 &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
671 have |= cap->issued;
672 if (implemented)
673 *implemented |= cap->implemented;
674 }
675 return have;
676}
677
678/*
679 * Get cap bits issued by caps other than @ocap
680 */
681int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
682{
683 int have = ci->i_snap_caps;
684 struct ceph_cap *cap;
685 struct rb_node *p;
686
687 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
688 cap = rb_entry(p, struct ceph_cap, ci_node);
689 if (cap == ocap)
690 continue;
691 if (!__cap_is_valid(cap))
692 continue;
693 have |= cap->issued;
694 }
695 return have;
696}
697
698/*
699 * Move a cap to the end of the LRU (oldest caps at list head, newest
700 * at list tail).
701 */
702static void __touch_cap(struct ceph_cap *cap)
703{
704 struct ceph_mds_session *s = cap->session;
705
706 spin_lock(&s->s_cap_lock);
707 if (s->s_cap_iterator == NULL) {
708 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
709 s->s_mds);
710 list_move_tail(&cap->session_caps, &s->s_caps);
711 } else {
712 dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
713 &cap->ci->vfs_inode, cap, s->s_mds);
714 }
715 spin_unlock(&s->s_cap_lock);
716}
717
718/*
719 * Check if we hold the given mask. If so, move the cap(s) to the
720 * end of their respective LRUs. (This is the preferred way for
721 * callers to check for caps they want.)
722 */
723int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
724{
725 struct ceph_cap *cap;
726 struct rb_node *p;
727 int have = ci->i_snap_caps;
728
729 if ((have & mask) == mask) {
730 dout("__ceph_caps_issued_mask %p snap issued %s"
731 " (mask %s)\n", &ci->vfs_inode,
732 ceph_cap_string(have),
733 ceph_cap_string(mask));
734 return 1;
735 }
736
737 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
738 cap = rb_entry(p, struct ceph_cap, ci_node);
739 if (!__cap_is_valid(cap))
740 continue;
741 if ((cap->issued & mask) == mask) {
742 dout("__ceph_caps_issued_mask %p cap %p issued %s"
743 " (mask %s)\n", &ci->vfs_inode, cap,
744 ceph_cap_string(cap->issued),
745 ceph_cap_string(mask));
746 if (touch)
747 __touch_cap(cap);
748 return 1;
749 }
750
751 /* does a combination of caps satisfy mask? */
752 have |= cap->issued;
753 if ((have & mask) == mask) {
754 dout("__ceph_caps_issued_mask %p combo issued %s"
755 " (mask %s)\n", &ci->vfs_inode,
756 ceph_cap_string(cap->issued),
757 ceph_cap_string(mask));
758 if (touch) {
759 struct rb_node *q;
760
761			/* touch this + preceding caps */
762 __touch_cap(cap);
763 for (q = rb_first(&ci->i_caps); q != p;
764 q = rb_next(q)) {
765 cap = rb_entry(q, struct ceph_cap,
766 ci_node);
767 if (!__cap_is_valid(cap))
768 continue;
769 __touch_cap(cap);
770 }
771 }
772 return 1;
773 }
774 }
775
776 return 0;
777}
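/*
 * For example, if mds0 issues FILE_RD and mds1 issues FILE_CACHE, a
 * mask of FILE_RD|FILE_CACHE is satisfied only by the combination; in
 * that case the matching cap and all caps preceding it are touched so
 * that none of them ages off its session's list.
 */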
778
779/*
780 * Return true if mask caps are currently being revoked by an MDS.
781 */
782int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
783{
784 struct inode *inode = &ci->vfs_inode;
785 struct ceph_cap *cap;
786 struct rb_node *p;
787 int ret = 0;
788
789 spin_lock(&inode->i_lock);
790 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
791 cap = rb_entry(p, struct ceph_cap, ci_node);
792 if (__cap_is_valid(cap) &&
793 (cap->implemented & ~cap->issued & mask)) {
794 ret = 1;
795 break;
796 }
797 }
798 spin_unlock(&inode->i_lock);
799 dout("ceph_caps_revoking %p %s = %d\n", inode,
800 ceph_cap_string(mask), ret);
801 return ret;
802}
803
804int __ceph_caps_used(struct ceph_inode_info *ci)
805{
806 int used = 0;
807 if (ci->i_pin_ref)
808 used |= CEPH_CAP_PIN;
809 if (ci->i_rd_ref)
810 used |= CEPH_CAP_FILE_RD;
811 if (ci->i_rdcache_ref || ci->i_rdcache_gen)
812 used |= CEPH_CAP_FILE_CACHE;
813 if (ci->i_wr_ref)
814 used |= CEPH_CAP_FILE_WR;
815 if (ci->i_wrbuffer_ref)
816 used |= CEPH_CAP_FILE_BUFFER;
817 return used;
818}
819
820/*
821 * wanted, by virtue of open file modes
822 */
823int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
824{
825 int want = 0;
826 int mode;
827 for (mode = 0; mode < 4; mode++)
828 if (ci->i_nr_by_mode[mode])
829 want |= ceph_caps_for_mode(mode);
830 return want;
831}
832
833/*
834 * Return caps we have registered with the MDS(s) as 'wanted'.
835 */
836int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
837{
838 struct ceph_cap *cap;
839 struct rb_node *p;
840 int mds_wanted = 0;
841
842 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
843 cap = rb_entry(p, struct ceph_cap, ci_node);
844 if (!__cap_is_valid(cap))
845 continue;
846 mds_wanted |= cap->mds_wanted;
847 }
848 return mds_wanted;
849}
850
851/*
852 * called under i_lock
853 */
854static int __ceph_is_any_caps(struct ceph_inode_info *ci)
855{
856 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
857}
858
859/*
860 * caller should hold i_lock.
861 * caller will not hold session s_mutex if called from destroy_inode.
862 */
863void __ceph_remove_cap(struct ceph_cap *cap)
864{
865 struct ceph_mds_session *session = cap->session;
866 struct ceph_inode_info *ci = cap->ci;
867 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
868
869 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
870
871 /* remove from inode list */
872 rb_erase(&cap->ci_node, &ci->i_caps);
873 cap->ci = NULL;
874 if (ci->i_auth_cap == cap)
875 ci->i_auth_cap = NULL;
876
877 /* remove from session list */
878 spin_lock(&session->s_cap_lock);
879 if (session->s_cap_iterator == cap) {
880 /* not yet, we are iterating over this very cap */
881 dout("__ceph_remove_cap delaying %p removal from session %p\n",
882 cap, cap->session);
883 } else {
884 list_del_init(&cap->session_caps);
885 session->s_nr_caps--;
886 cap->session = NULL;
887 }
888 spin_unlock(&session->s_cap_lock);
889
890 if (cap->session == NULL)
891 ceph_put_cap(cap);
892
893 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
894 struct ceph_snap_realm *realm = ci->i_snap_realm;
895 spin_lock(&realm->inodes_with_caps_lock);
896 list_del_init(&ci->i_snap_realm_item);
897 ci->i_snap_realm_counter++;
898 ci->i_snap_realm = NULL;
899 spin_unlock(&realm->inodes_with_caps_lock);
900 ceph_put_snap_realm(mdsc, realm);
901 }
902 if (!__ceph_is_any_real_caps(ci))
903 __cap_delay_cancel(mdsc, ci);
904}
905
906/*
907 * Build and send a cap message to the given MDS.
908 *
909 * Caller should be holding s_mutex.
910 */
911static int send_cap_msg(struct ceph_mds_session *session,
912 u64 ino, u64 cid, int op,
913 int caps, int wanted, int dirty,
914 u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
915 u64 size, u64 max_size,
916 struct timespec *mtime, struct timespec *atime,
917 u64 time_warp_seq,
918 uid_t uid, gid_t gid, mode_t mode,
919 u64 xattr_version,
920 struct ceph_buffer *xattrs_buf,
921 u64 follows)
922{
923 struct ceph_mds_caps *fc;
924 struct ceph_msg *msg;
925
926 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
927 " seq %u/%u mseq %u follows %lld size %llu/%llu"
928 " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
929 cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
930 ceph_cap_string(dirty),
931 seq, issue_seq, mseq, follows, size, max_size,
932 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
933
934 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
935 if (IS_ERR(msg))
936 return PTR_ERR(msg);
937
938 msg->hdr.tid = cpu_to_le64(flush_tid);
939
940 fc = msg->front.iov_base;
941 memset(fc, 0, sizeof(*fc));
942
943 fc->cap_id = cpu_to_le64(cid);
944 fc->op = cpu_to_le32(op);
945 fc->seq = cpu_to_le32(seq);
946 fc->issue_seq = cpu_to_le32(issue_seq);
947 fc->migrate_seq = cpu_to_le32(mseq);
948 fc->caps = cpu_to_le32(caps);
949 fc->wanted = cpu_to_le32(wanted);
950 fc->dirty = cpu_to_le32(dirty);
951 fc->ino = cpu_to_le64(ino);
952 fc->snap_follows = cpu_to_le64(follows);
953
954 fc->size = cpu_to_le64(size);
955 fc->max_size = cpu_to_le64(max_size);
956 if (mtime)
957 ceph_encode_timespec(&fc->mtime, mtime);
958 if (atime)
959 ceph_encode_timespec(&fc->atime, atime);
960 fc->time_warp_seq = cpu_to_le32(time_warp_seq);
961
962 fc->uid = cpu_to_le32(uid);
963 fc->gid = cpu_to_le32(gid);
964 fc->mode = cpu_to_le32(mode);
965
966 fc->xattr_version = cpu_to_le64(xattr_version);
967 if (xattrs_buf) {
968 msg->middle = ceph_buffer_get(xattrs_buf);
969 fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
970 msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
971 }
972
973 ceph_con_send(&session->s_con, msg);
974 return 0;
975}
976
977/*
978 * Queue cap releases when an inode is dropped from our cache. Since
979 * the inode is about to be destroyed, there is no need for i_lock.
980 */
981void ceph_queue_caps_release(struct inode *inode)
982{
983 struct ceph_inode_info *ci = ceph_inode(inode);
984 struct rb_node *p;
985
986 p = rb_first(&ci->i_caps);
987 while (p) {
988 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
989 struct ceph_mds_session *session = cap->session;
990 struct ceph_msg *msg;
991 struct ceph_mds_cap_release *head;
992 struct ceph_mds_cap_item *item;
993
994 spin_lock(&session->s_cap_lock);
995 BUG_ON(!session->s_num_cap_releases);
996 msg = list_first_entry(&session->s_cap_releases,
997 struct ceph_msg, list_head);
998
999 dout(" adding %p release to mds%d msg %p (%d left)\n",
1000 inode, session->s_mds, msg, session->s_num_cap_releases);
1001
1002 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1003 head = msg->front.iov_base;
1004 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1005 item = msg->front.iov_base + msg->front.iov_len;
1006 item->ino = cpu_to_le64(ceph_ino(inode));
1007 item->cap_id = cpu_to_le64(cap->cap_id);
1008 item->migrate_seq = cpu_to_le32(cap->mseq);
1009 item->seq = cpu_to_le32(cap->issue_seq);
1010
1011 session->s_num_cap_releases--;
1012
1013 msg->front.iov_len += sizeof(*item);
1014 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1015 dout(" release msg %p full\n", msg);
1016 list_move_tail(&msg->list_head,
1017 &session->s_cap_releases_done);
1018 } else {
1019 dout(" release msg %p at %d/%d (%d)\n", msg,
1020 (int)le32_to_cpu(head->num),
1021 (int)CEPH_CAPS_PER_RELEASE,
1022 (int)msg->front.iov_len);
1023 }
1024 spin_unlock(&session->s_cap_lock);
1025 p = rb_next(p);
1026 __ceph_remove_cap(cap);
1027 }
1028}
1029
1030/*
1031 * Send a cap msg on the given inode. Update our caps state, then
1032 * drop i_lock and send the message.
1033 *
1034 * Make note of the max_size reported/requested from the mds, and of
1035 * revoked caps that have now been implemented.
1036 *
1037 * Make a half-hearted attempt to invalidate the page cache if we are
1038 * dropping RDCACHE. Note that this will leave behind locked pages
1039 * that we'll then need to deal with elsewhere.
1040 *
1041 * Return non-zero if the release was delayed, or if we experienced an error
1042 * such that the caller should requeue + retry later.
1043 *
1044 * called with i_lock, then drops it.
1045 * caller should hold snap_rwsem (read), s_mutex.
1046 */
1047static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1048 int op, int used, int want, int retain, int flushing,
1049 unsigned *pflush_tid)
1050 __releases(cap->ci->vfs_inode->i_lock)
1051{
1052 struct ceph_inode_info *ci = cap->ci;
1053 struct inode *inode = &ci->vfs_inode;
1054 u64 cap_id = cap->cap_id;
1055 int held, revoking, dropping, keep;
1056 u64 seq, issue_seq, mseq, time_warp_seq, follows;
1057 u64 size, max_size;
1058 struct timespec mtime, atime;
1059 int wake = 0;
1060 mode_t mode;
1061 uid_t uid;
1062 gid_t gid;
1063 struct ceph_mds_session *session;
1064 u64 xattr_version = 0;
1065 int delayed = 0;
1066 u64 flush_tid = 0;
1067 int i;
1068 int ret;
1069
1070 held = cap->issued | cap->implemented;
1071 revoking = cap->implemented & ~cap->issued;
1072 retain &= ~revoking;
1073 dropping = cap->issued & ~retain;
1074
1075 dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
1076 inode, cap, cap->session,
1077 ceph_cap_string(held), ceph_cap_string(held & retain),
1078 ceph_cap_string(revoking));
1079 BUG_ON((retain & CEPH_CAP_PIN) == 0);
1080
1081 session = cap->session;
1082
1083 /* don't release wanted unless we've waited a bit. */
1084 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1085 time_before(jiffies, ci->i_hold_caps_min)) {
1086 dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
1087 ceph_cap_string(cap->issued),
1088 ceph_cap_string(cap->issued & retain),
1089 ceph_cap_string(cap->mds_wanted),
1090 ceph_cap_string(want));
1091 want |= cap->mds_wanted;
1092 retain |= cap->issued;
1093 delayed = 1;
1094 }
1095 ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
1096
1097 cap->issued &= retain; /* drop bits we don't want */
1098 if (cap->implemented & ~cap->issued) {
1099 /*
1100 * Wake up any waiters on wanted -> needed transition.
1101 * This is due to the weird transition from buffered
1102 * to sync IO... we need to flush dirty pages _before_
1103 * allowing sync writes to avoid reordering.
1104 */
1105 wake = 1;
1106 }
1107 cap->implemented &= cap->issued | used;
1108 cap->mds_wanted = want;
1109
1110 if (flushing) {
1111 /*
1112 * assign a tid for flush operations so we can avoid
1113 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
1114 * clean type races. track latest tid for every bit
1115 * so we can handle flush AxFw, flush Fw, and have the
1116 * first ack clean Ax.
1117 */
1118 flush_tid = ++ci->i_cap_flush_last_tid;
1119 if (pflush_tid)
1120 *pflush_tid = flush_tid;
1121 dout(" cap_flush_tid %d\n", (int)flush_tid);
1122 for (i = 0; i < CEPH_CAP_BITS; i++)
1123 if (flushing & (1 << i))
1124 ci->i_cap_flush_tid[i] = flush_tid;
1125 }
1126
1127 keep = cap->implemented;
1128 seq = cap->seq;
1129 issue_seq = cap->issue_seq;
1130 mseq = cap->mseq;
1131 size = inode->i_size;
1132 ci->i_reported_size = size;
1133 max_size = ci->i_wanted_max_size;
1134 ci->i_requested_max_size = max_size;
1135 mtime = inode->i_mtime;
1136 atime = inode->i_atime;
1137 time_warp_seq = ci->i_time_warp_seq;
1138 follows = ci->i_snap_realm->cached_context->seq;
1139 uid = inode->i_uid;
1140 gid = inode->i_gid;
1141 mode = inode->i_mode;
1142
1143 if (dropping & CEPH_CAP_XATTR_EXCL) {
1144 __ceph_build_xattrs_blob(ci);
1145 xattr_version = ci->i_xattrs.version + 1;
1146 }
1147
1148 spin_unlock(&inode->i_lock);
1149
1150 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1151 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
1152 size, max_size, &mtime, &atime, time_warp_seq,
1153 uid, gid, mode,
1154 xattr_version,
1155 (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
1156 follows);
1157 if (ret < 0) {
1158 dout("error sending cap msg, must requeue %p\n", inode);
1159 delayed = 1;
1160 }
1161
1162 if (wake)
1163 wake_up(&ci->i_cap_wq);
1164
1165 return delayed;
1166}
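/*
 * (standalone sketch) The per-bit flush_tid bookkeeping described in
 * the comment above, reduced to userspace; CAP_BITS and the helper
 * names here are assumptions for the demo, not the kernel's
 * definitions. Each flush stamps the flushed bits with a fresh tid,
 * and an ack only cleans bits whose stamp matches, so an ack for
 * "flush AxFw" cannot clean an Fw that was re-flushed later:
 */
#include <stdio.h>

#define CAP_BITS 8

static unsigned long flush_tid_of[CAP_BITS];
static unsigned long last_tid;
static unsigned int flushing;		/* bitmask of caps being flushed */

static unsigned long start_flush(unsigned int bits)
{
	unsigned long tid = ++last_tid;
	int i;

	flushing |= bits;
	for (i = 0; i < CAP_BITS; i++)
		if (bits & (1u << i))
			flush_tid_of[i] = tid;
	return tid;
}

static void ack_flush(unsigned int bits, unsigned long tid)
{
	int i;

	for (i = 0; i < CAP_BITS; i++)
		if ((bits & (1u << i)) && flush_tid_of[i] == tid)
			flushing &= ~(1u << i);	/* only matching stamps */
}

int main(void)
{
	unsigned long t1 = start_flush(0x3);	/* flush "Ax|Fw" */

	start_flush(0x2);			/* re-flush "Fw" */
	ack_flush(0x3, t1);			/* first ack cleans Ax only */
	printf("still flushing: %#x (expect 0x2)\n", flushing);
	return 0;
}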
1167
1168/*
1169 * When a snapshot is taken, clients accumulate dirty metadata on
1170 * inodes with capabilities in ceph_cap_snaps to describe the file
1171 * state at the time the snapshot was taken. This must be flushed
1172 * asynchronously back to the MDS once sync writes complete and dirty
1173 * data is written out.
1174 *
1175 * Called under i_lock. Takes s_mutex as needed.
1176 */
1177void __ceph_flush_snaps(struct ceph_inode_info *ci,
1178 struct ceph_mds_session **psession)
1179{
1180 struct inode *inode = &ci->vfs_inode;
1181 int mds;
1182 struct ceph_cap_snap *capsnap;
1183 u32 mseq;
1184 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1185 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
1186 session->s_mutex */
1187 u64 next_follows = 0; /* keep track of how far we've gotten through the
1188 i_cap_snaps list, and skip these entries next time
1189 around to avoid an infinite loop */
1190
1191 if (psession)
1192 session = *psession;
1193
1194 dout("__flush_snaps %p\n", inode);
1195retry:
1196 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
1197 /* avoid an infinite loop after retry */
1198 if (capsnap->follows < next_follows)
1199 continue;
1200 /*
1201 * we need to wait for sync writes to complete and for dirty
1202 * pages to be written out.
1203 */
1204 if (capsnap->dirty_pages || capsnap->writing)
1205 continue;
1206
1207 /* pick mds, take s_mutex */
1208 mds = __ceph_get_cap_mds(ci, &mseq);
1209 if (session && session->s_mds != mds) {
1210 dout("oops, wrong session %p mutex\n", session);
1211 mutex_unlock(&session->s_mutex);
1212 ceph_put_mds_session(session);
1213 session = NULL;
1214 }
1215 if (!session) {
1216 spin_unlock(&inode->i_lock);
1217 mutex_lock(&mdsc->mutex);
1218 session = __ceph_lookup_mds_session(mdsc, mds);
1219 mutex_unlock(&mdsc->mutex);
1220 if (session) {
1221 dout("inverting session/ino locks on %p\n",
1222 session);
1223 mutex_lock(&session->s_mutex);
1224 }
1225 /*
1226 * if session == NULL, we raced against a cap
1227 * deletion. retry, and we'll get a better
1228 * @mds value next time.
1229 */
1230 spin_lock(&inode->i_lock);
1231 goto retry;
1232 }
1233
1234 capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
1235 atomic_inc(&capsnap->nref);
1236 if (!list_empty(&capsnap->flushing_item))
1237 list_del_init(&capsnap->flushing_item);
1238 list_add_tail(&capsnap->flushing_item,
1239 &session->s_cap_snaps_flushing);
1240 spin_unlock(&inode->i_lock);
1241
1242 dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
1243 inode, capsnap, next_follows, capsnap->size);
1244 send_cap_msg(session, ceph_vino(inode).ino, 0,
1245 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
1246 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
1247 capsnap->size, 0,
1248 &capsnap->mtime, &capsnap->atime,
1249 capsnap->time_warp_seq,
1250 capsnap->uid, capsnap->gid, capsnap->mode,
1251 0, NULL,
1252 capsnap->follows);
1253
1254 next_follows = capsnap->follows + 1;
1255 ceph_put_cap_snap(capsnap);
1256
1257 spin_lock(&inode->i_lock);
1258 goto retry;
1259 }
1260
1261 /* we flushed them all; remove this inode from the queue */
1262 spin_lock(&mdsc->snap_flush_lock);
1263 list_del_init(&ci->i_snap_flush_item);
1264 spin_unlock(&mdsc->snap_flush_lock);
1265
1266 if (psession)
1267 *psession = session;
1268 else if (session) {
1269 mutex_unlock(&session->s_mutex);
1270 ceph_put_mds_session(session);
1271 }
1272}
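/*
 * (sketch) Both this function and ceph_check_caps() below do the
 * "inverting session/ino locks" dance: one lock is held but the other
 * comes first in the lock order, so drop the first, take the second,
 * retake the first, and re-validate. In pthread terms (names here are
 * illustrative):
 */
#include <pthread.h>

static pthread_mutex_t ino_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t session_mutex = PTHREAD_MUTEX_INITIALIZER;

static void work_under_both(void)
{
	pthread_mutex_lock(&ino_lock);
	if (pthread_mutex_trylock(&session_mutex) != 0) {
		pthread_mutex_unlock(&ino_lock);	/* invert order */
		pthread_mutex_lock(&session_mutex);
		pthread_mutex_lock(&ino_lock);
		/* ino state may have changed: real code re-checks here */
	}
	/* ... work under both locks ... */
	pthread_mutex_unlock(&session_mutex);
	pthread_mutex_unlock(&ino_lock);
}

int main(void)
{
	work_under_both();
	return 0;
}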
1273
1274static void ceph_flush_snaps(struct ceph_inode_info *ci)
1275{
1276 struct inode *inode = &ci->vfs_inode;
1277
1278 spin_lock(&inode->i_lock);
1279 __ceph_flush_snaps(ci, NULL);
1280 spin_unlock(&inode->i_lock);
1281}
1282
1283/*
1284 * Mark caps dirty. If inode is newly dirty, add to the global dirty
1285 * list.
1286 */
1287void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1288{
1289 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
1290 struct inode *inode = &ci->vfs_inode;
1291 int was = ci->i_dirty_caps;
1292 int dirty = 0;
1293
1294 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
1295 ceph_cap_string(mask), ceph_cap_string(was),
1296 ceph_cap_string(was | mask));
1297 ci->i_dirty_caps |= mask;
1298 if (was == 0) {
1299 dout(" inode %p now dirty\n", &ci->vfs_inode);
1300 BUG_ON(!list_empty(&ci->i_dirty_item));
1301 spin_lock(&mdsc->cap_dirty_lock);
1302 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1303 spin_unlock(&mdsc->cap_dirty_lock);
1304 if (ci->i_flushing_caps == 0) {
1305 igrab(inode);
1306 dirty |= I_DIRTY_SYNC;
1307 }
1308 }
1309 BUG_ON(list_empty(&ci->i_dirty_item));
1310 if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
1311 (mask & CEPH_CAP_FILE_BUFFER))
1312 dirty |= I_DIRTY_DATASYNC;
1313 if (dirty)
1314 __mark_inode_dirty(inode, dirty);
1315 __cap_delay_requeue(mdsc, ci);
1316}
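/*
 * (sketch) The igrab() above is the pin-while-dirty pattern: take a
 * reference the moment the object first becomes dirty, and drop it
 * (iput(), in handle_cap_flush_ack below) only once fully clean, so
 * the inode cannot be evicted while writeback still needs it. In
 * miniature, with made-up names:
 */
#include <stdio.h>

struct obj { int refs; int dirty; };

static void mark_dirty(struct obj *o)
{
	if (!o->dirty)
		o->refs++;	/* like igrab() on first dirty */
	o->dirty = 1;
}

static void flush_done(struct obj *o)
{
	if (o->dirty) {
		o->dirty = 0;
		o->refs--;	/* like iput() once fully clean */
	}
}

int main(void)
{
	struct obj o = { .refs = 1, .dirty = 0 };

	mark_dirty(&o);
	mark_dirty(&o);		/* already dirty: no extra ref */
	flush_done(&o);
	printf("refs=%d dirty=%d\n", o.refs, o.dirty);
	return 0;
}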
1317
1318/*
1319 * Add dirty inode to the flushing list. Assign a seq number so we
1320 * can wait for caps to flush without starving.
1321 *
1322 * Called under i_lock.
1323 */
1324static int __mark_caps_flushing(struct inode *inode,
1325 struct ceph_mds_session *session)
1326{
1327 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1328 struct ceph_inode_info *ci = ceph_inode(inode);
1329 int flushing;
1330
1331 BUG_ON(ci->i_dirty_caps == 0);
1332 BUG_ON(list_empty(&ci->i_dirty_item));
1333
1334 flushing = ci->i_dirty_caps;
1335 dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
1336 ceph_cap_string(flushing),
1337 ceph_cap_string(ci->i_flushing_caps),
1338 ceph_cap_string(ci->i_flushing_caps | flushing));
1339 ci->i_flushing_caps |= flushing;
1340 ci->i_dirty_caps = 0;
1341 dout(" inode %p now !dirty\n", inode);
1342
1343 spin_lock(&mdsc->cap_dirty_lock);
1344 list_del_init(&ci->i_dirty_item);
1345
1346 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
1347 if (list_empty(&ci->i_flushing_item)) {
1348 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1349 mdsc->num_cap_flushing++;
1350 dout(" inode %p now flushing seq %lld\n", inode,
1351 ci->i_cap_flush_seq);
1352 } else {
1353 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1354 dout(" inode %p now flushing (more) seq %lld\n", inode,
1355 ci->i_cap_flush_seq);
1356 }
1357 spin_unlock(&mdsc->cap_dirty_lock);
1358
1359 return flushing;
1360}
1361
1362/*
1363 * try to invalidate mapping pages without blocking.
1364 */
1365static int mapping_is_empty(struct address_space *mapping)
1366{
1367 struct page *page = find_get_page(mapping, 0);
1368
1369 if (!page)
1370 return 1;
1371
1372 put_page(page);
1373 return 0;
1374}
1375
1376static int try_nonblocking_invalidate(struct inode *inode)
1377{
1378 struct ceph_inode_info *ci = ceph_inode(inode);
1379 u32 invalidating_gen = ci->i_rdcache_gen;
1380
1381 spin_unlock(&inode->i_lock);
1382 invalidate_mapping_pages(&inode->i_data, 0, -1);
1383 spin_lock(&inode->i_lock);
1384
1385 if (mapping_is_empty(&inode->i_data) &&
1386 invalidating_gen == ci->i_rdcache_gen) {
1387 /* success. */
1388 dout("try_nonblocking_invalidate %p success\n", inode);
1389 ci->i_rdcache_gen = 0;
1390 ci->i_rdcache_revoking = 0;
1391 return 0;
1392 }
1393 dout("try_nonblocking_invalidate %p failed\n", inode);
1394 return -1;
1395}
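/*
 * (sketch) The i_rdcache_gen comparison above is an optimistic
 * concurrency check: sample a generation under the lock, drop the
 * lock for the slow work, then only believe the result if nobody
 * bumped the generation meanwhile. Reduced to its skeleton:
 */
#include <stdio.h>

static unsigned int cache_gen = 1;	/* bumped when pages are re-cached */

static int try_invalidate(void)
{
	unsigned int snap = cache_gen;	/* sampled "under i_lock" */

	/* lock dropped: slow invalidation happens here */
	/* lock retaken */

	if (snap != cache_gen)
		return -1;	/* lost the race; caller retries later */
	cache_gen = 0;		/* success: reset, as the code above does */
	return 0;
}

int main(void)
{
	printf("invalidate: %d\n", try_invalidate());
	return 0;
}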
1396
1397/*
1398 * Swiss army knife function to examine currently used and wanted
1399 * versus held caps. Release, flush, ack revoked caps to mds as
1400 * appropriate.
1401 *
1402 * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
1403 * cap release further.
1404 * CHECK_CAPS_AUTHONLY - we should only check the auth cap
1405 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
1406 * further delay.
1407 */
1408void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1409 struct ceph_mds_session *session)
1410 __releases(session->s_mutex)
1411{
1412 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
1413 struct ceph_mds_client *mdsc = &client->mdsc;
1414 struct inode *inode = &ci->vfs_inode;
1415 struct ceph_cap *cap;
1416 int file_wanted, used;
1417 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
1418 int issued, implemented, want, retain, revoking, flushing = 0;
1419 int mds = -1; /* keep track of how far we've gone through i_caps list
1420 to avoid an infinite loop on retry */
1421 struct rb_node *p;
1422 int tried_invalidate = 0;
1423 int delayed = 0, sent = 0, force_requeue = 0, num;
1424 int queue_invalidate = 0;
1425 int is_delayed = flags & CHECK_CAPS_NODELAY;
1426
1427 /* if we are unmounting, flush any unused caps immediately. */
1428 if (mdsc->stopping)
1429 is_delayed = 1;
1430
1431 spin_lock(&inode->i_lock);
1432
1433 if (ci->i_ceph_flags & CEPH_I_FLUSH)
1434 flags |= CHECK_CAPS_FLUSH;
1435
1436 /* flush snaps first time around only */
1437 if (!list_empty(&ci->i_cap_snaps))
1438 __ceph_flush_snaps(ci, &session);
1439 goto retry_locked;
1440retry:
1441 spin_lock(&inode->i_lock);
1442retry_locked:
1443 file_wanted = __ceph_caps_file_wanted(ci);
1444 used = __ceph_caps_used(ci);
1445 want = file_wanted | used;
1446 issued = __ceph_caps_issued(ci, &implemented);
1447 revoking = implemented & ~issued;
1448
1449 retain = want | CEPH_CAP_PIN;
1450 if (!mdsc->stopping && inode->i_nlink > 0) {
1451 if (want) {
1452 retain |= CEPH_CAP_ANY; /* be greedy */
1453 } else {
1454 retain |= CEPH_CAP_ANY_SHARED;
1455 /*
1456 * keep RD only if we didn't have the file open RW,
1457 * because then the mds would revoke it anyway to
1458 * journal max_size=0.
1459 */
1460 if (ci->i_max_size == 0)
1461 retain |= CEPH_CAP_ANY_RD;
1462 }
1463 }
1464
1465 dout("check_caps %p file_want %s used %s dirty %s flushing %s"
1466 " issued %s revoking %s retain %s %s%s%s\n", inode,
1467 ceph_cap_string(file_wanted),
1468 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
1469 ceph_cap_string(ci->i_flushing_caps),
1470 ceph_cap_string(issued), ceph_cap_string(revoking),
1471 ceph_cap_string(retain),
1472 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
1473 (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
1474 (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
1475
1476 /*
1477 * If we no longer need to hold onto our old caps, and we may
1478 * have cached pages, but don't want them, then try to invalidate.
1479 * If we fail, it's because pages are locked... try again later.
1480 */
1481 if ((!is_delayed || mdsc->stopping) &&
1482 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1483 ci->i_rdcache_gen && /* may have cached pages */
1484 (file_wanted == 0 || /* no open files */
1485 (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */
1486 !tried_invalidate) {
1487 dout("check_caps trying to invalidate on %p\n", inode);
1488 if (try_nonblocking_invalidate(inode) < 0) {
1489 if (revoking & CEPH_CAP_FILE_CACHE) {
1490 dout("check_caps queuing invalidate\n");
1491 queue_invalidate = 1;
1492 ci->i_rdcache_revoking = ci->i_rdcache_gen;
1493 } else {
1494 dout("check_caps failed to invalidate pages\n");
1495 /* we failed to invalidate pages. check these
1496 caps again later. */
1497 force_requeue = 1;
1498 __cap_set_timeouts(mdsc, ci);
1499 }
1500 }
1501 tried_invalidate = 1;
1502 goto retry_locked;
1503 }
1504
1505 num = 0;
1506 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
1507 cap = rb_entry(p, struct ceph_cap, ci_node);
1508 num++;
1509
1510 /* avoid looping forever */
1511 if (mds >= cap->mds ||
1512 ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
1513 continue;
1514
1515 /* NOTE: no side-effects allowed, until we take s_mutex */
1516
1517 revoking = cap->implemented & ~cap->issued;
1518 if (revoking)
1519 dout(" mds%d revoking %s\n", cap->mds,
1520 ceph_cap_string(revoking));
1521
1522 if (cap == ci->i_auth_cap &&
1523 (cap->issued & CEPH_CAP_FILE_WR)) {
1524 /* request larger max_size from MDS? */
1525 if (ci->i_wanted_max_size > ci->i_max_size &&
1526 ci->i_wanted_max_size > ci->i_requested_max_size) {
1527 dout("requesting new max_size\n");
1528 goto ack;
1529 }
1530
1531 /* approaching file_max? */
1532 if ((inode->i_size << 1) >= ci->i_max_size &&
1533 (ci->i_reported_size << 1) < ci->i_max_size) {
1534 dout("i_size approaching max_size\n");
1535 goto ack;
1536 }
1537 }
1538 /* flush anything dirty? */
1539 if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
1540 ci->i_dirty_caps) {
1541 dout("flushing dirty caps\n");
1542 goto ack;
1543 }
1544
1545 /* completed revocation? going down and there are no caps? */
1546 if (revoking && (revoking & used) == 0) {
1547 dout("completed revocation of %s\n",
1548 ceph_cap_string(cap->implemented & ~cap->issued));
1549 goto ack;
1550 }
1551
1552 /* want more caps from mds? */
1553 if (want & ~(cap->mds_wanted | cap->issued))
1554 goto ack;
1555
1556 /* things we might delay */
1557 if ((cap->issued & ~retain) == 0 &&
1558 cap->mds_wanted == want)
1559 continue; /* nope, all good */
1560
1561 if (is_delayed)
1562 goto ack;
1563
1564 /* delay? */
1565 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1566 time_before(jiffies, ci->i_hold_caps_max)) {
1567 dout(" delaying issued %s -> %s, wanted %s -> %s\n",
1568 ceph_cap_string(cap->issued),
1569 ceph_cap_string(cap->issued & retain),
1570 ceph_cap_string(cap->mds_wanted),
1571 ceph_cap_string(want));
1572 delayed++;
1573 continue;
1574 }
1575
1576ack:
1577 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1578 dout(" skipping %p I_NOFLUSH set\n", inode);
1579 continue;
1580 }
1581
1582 if (session && session != cap->session) {
1583 dout("oops, wrong session %p mutex\n", session);
1584 mutex_unlock(&session->s_mutex);
1585 session = NULL;
1586 }
1587 if (!session) {
1588 session = cap->session;
1589 if (mutex_trylock(&session->s_mutex) == 0) {
1590 dout("inverting session/ino locks on %p\n",
1591 session);
1592 spin_unlock(&inode->i_lock);
1593 if (took_snap_rwsem) {
1594 up_read(&mdsc->snap_rwsem);
1595 took_snap_rwsem = 0;
1596 }
1597 mutex_lock(&session->s_mutex);
1598 goto retry;
1599 }
1600 }
1601 /* take snap_rwsem after session mutex */
1602 if (!took_snap_rwsem) {
1603 if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
1604 dout("inverting snap/in locks on %p\n",
1605 inode);
1606 spin_unlock(&inode->i_lock);
1607 down_read(&mdsc->snap_rwsem);
1608 took_snap_rwsem = 1;
1609 goto retry;
1610 }
1611 took_snap_rwsem = 1;
1612 }
1613
1614 if (cap == ci->i_auth_cap && ci->i_dirty_caps)
1615 flushing = __mark_caps_flushing(inode, session);
1616
1617 mds = cap->mds; /* remember mds, so we don't repeat */
1618 sent++;
1619
1620 /* __send_cap drops i_lock */
1621 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
1622 retain, flushing, NULL);
1623 goto retry; /* retake i_lock and restart our cap scan. */
1624 }
1625
1626 /*
1627 * Reschedule delayed caps release if we delayed anything,
1628 * otherwise cancel.
1629 */
1630 if (delayed && is_delayed)
1631 force_requeue = 1; /* __send_cap delayed release; requeue */
1632 if (!delayed && !is_delayed)
1633 __cap_delay_cancel(mdsc, ci);
1634 else if (!is_delayed || force_requeue)
1635 __cap_delay_requeue(mdsc, ci);
1636
1637 spin_unlock(&inode->i_lock);
1638
1639 if (queue_invalidate)
1640 ceph_queue_invalidate(inode);
1641
1642 if (session)
1643 mutex_unlock(&session->s_mutex);
1644 if (took_snap_rwsem)
1645 up_read(&mdsc->snap_rwsem);
1646}
1647
1648/*
1649 * Try to flush dirty caps back to the auth mds.
1650 */
1651static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1652 unsigned *flush_tid)
1653{
1654 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1655 struct ceph_inode_info *ci = ceph_inode(inode);
1656 int unlock_session = session ? 0 : 1;
1657 int flushing = 0;
1658
1659retry:
1660 spin_lock(&inode->i_lock);
1661 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1662 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
1663 goto out;
1664 }
1665 if (ci->i_dirty_caps && ci->i_auth_cap) {
1666 struct ceph_cap *cap = ci->i_auth_cap;
1667 int used = __ceph_caps_used(ci);
1668 int want = __ceph_caps_wanted(ci);
1669 int delayed;
1670
1671 if (!session) {
1672 spin_unlock(&inode->i_lock);
1673 session = cap->session;
1674 mutex_lock(&session->s_mutex);
1675 goto retry;
1676 }
1677 BUG_ON(session != cap->session);
1678 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
1679 goto out;
1680
1681 flushing = __mark_caps_flushing(inode, session);
1682
1683 /* __send_cap drops i_lock */
1684 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
1685 cap->issued | cap->implemented, flushing,
1686 flush_tid);
1687 if (!delayed)
1688 goto out_unlocked;
1689
1690 spin_lock(&inode->i_lock);
1691 __cap_delay_requeue(mdsc, ci);
1692 }
1693out:
1694 spin_unlock(&inode->i_lock);
1695out_unlocked:
1696 if (session && unlock_session)
1697 mutex_unlock(&session->s_mutex);
1698 return flushing;
1699}
1700
1701/*
1702 * Return true if we've flushed caps through the given flush_tid.
1703 */
1704static int caps_are_flushed(struct inode *inode, unsigned tid)
1705{
1706 struct ceph_inode_info *ci = ceph_inode(inode);
1707 int dirty, i, ret = 1;
1708
1709 spin_lock(&inode->i_lock);
1710 dirty = __ceph_caps_dirty(ci);
1711 for (i = 0; i < CEPH_CAP_BITS; i++)
1712 if ((ci->i_flushing_caps & (1 << i)) &&
1713 ci->i_cap_flush_tid[i] <= tid) {
1714 /* still flushing this bit */
1715 ret = 0;
1716 break;
1717 }
1718 spin_unlock(&inode->i_lock);
1719 return ret;
1720}
1721
1722/*
1723 * Wait on any unsafe replies for the given inode. First wait on the
1724 * newest request, and make that the upper bound. Then, if there are
1725 * more requests, keep waiting on the oldest as long as it is still older
1726 * than the original request.
1727 */
1728static void sync_write_wait(struct inode *inode)
1729{
1730 struct ceph_inode_info *ci = ceph_inode(inode);
1731 struct list_head *head = &ci->i_unsafe_writes;
1732 struct ceph_osd_request *req;
1733 u64 last_tid;
1734
1735 spin_lock(&ci->i_unsafe_lock);
1736 if (list_empty(head))
1737 goto out;
1738
1739 /* set upper bound as _last_ entry in chain */
1740 req = list_entry(head->prev, struct ceph_osd_request,
1741 r_unsafe_item);
1742 last_tid = req->r_tid;
1743
1744 do {
1745 ceph_osdc_get_request(req);
1746 spin_unlock(&ci->i_unsafe_lock);
1747 dout("sync_write_wait on tid %llu (until %llu)\n",
1748 req->r_tid, last_tid);
1749 wait_for_completion(&req->r_safe_completion);
1750 spin_lock(&ci->i_unsafe_lock);
1751 ceph_osdc_put_request(req);
1752
1753 /*
1754 * from here on look at first entry in chain, since we
1755 * only want to wait for anything older than last_tid
1756 */
1757 if (list_empty(head))
1758 break;
1759 req = list_entry(head->next, struct ceph_osd_request,
1760 r_unsafe_item);
1761 } while (req->r_tid < last_tid);
1762out:
1763 spin_unlock(&ci->i_unsafe_lock);
1764}
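/*
 * (sketch) The bounded wait above in miniature: record the newest
 * pending tid as the upper bound, then drain from the oldest end only
 * up to that bound, so writes submitted during the wait (with larger
 * tids) are deliberately not waited for:
 */
#include <stdio.h>

int main(void)
{
	unsigned long long tids[] = { 5, 7, 9 };	/* oldest first */
	int n = 3, i = 0;
	unsigned long long last_tid = tids[n - 1];	/* upper bound */

	while (i < n && tids[i] <= last_tid) {
		/* stand-in for wait_for_completion() on each request */
		printf("waiting on tid %llu (until %llu)\n",
		       tids[i], last_tid);
		i++;
	}
	return 0;
}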
1765
1766int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
1767{
1768 struct inode *inode = dentry->d_inode;
1769 struct ceph_inode_info *ci = ceph_inode(inode);
1770 unsigned flush_tid;
1771 int ret;
1772 int dirty;
1773
1774 dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
1775 sync_write_wait(inode);
1776
1777 ret = filemap_write_and_wait(inode->i_mapping);
1778 if (ret < 0)
1779 return ret;
1780
1781 dirty = try_flush_caps(inode, NULL, &flush_tid);
1782 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
1783
1784 /*
1785 * only wait on non-file metadata writeback (the mds
1786 * can recover size and mtime, so we don't need to
1787 * wait for that)
1788 */
1789 if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
1790 dout("fsync waiting for flush_tid %u\n", flush_tid);
1791 ret = wait_event_interruptible(ci->i_cap_wq,
1792 caps_are_flushed(inode, flush_tid));
1793 }
1794
1795 dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
1796 return ret;
1797}
1798
1799/*
1800 * Flush any dirty caps back to the mds. If we aren't asked to wait,
1801 * queue inode for flush but don't do so immediately, because we can
1802 * get by with fewer MDS messages if we wait for data writeback to
1803 * complete first.
1804 */
1805int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1806{
1807 struct ceph_inode_info *ci = ceph_inode(inode);
1808 unsigned flush_tid;
1809 int err = 0;
1810 int dirty;
1811 int wait = wbc->sync_mode == WB_SYNC_ALL;
1812
1813 dout("write_inode %p wait=%d\n", inode, wait);
1814 if (wait) {
1815 dirty = try_flush_caps(inode, NULL, &flush_tid);
1816 if (dirty)
1817 err = wait_event_interruptible(ci->i_cap_wq,
1818 caps_are_flushed(inode, flush_tid));
1819 } else {
1820 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1821
1822 spin_lock(&inode->i_lock);
1823 if (__ceph_caps_dirty(ci))
1824 __cap_delay_requeue_front(mdsc, ci);
1825 spin_unlock(&inode->i_lock);
1826 }
1827 return err;
1828}
1829
1830/*
1831 * After a recovering MDS goes active, we need to resend any caps
1832 * we were flushing.
1833 *
1834 * Caller holds session->s_mutex.
1835 */
1836static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
1837 struct ceph_mds_session *session)
1838{
1839 struct ceph_cap_snap *capsnap;
1840
1841 dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
1842 list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
1843 flushing_item) {
1844 struct ceph_inode_info *ci = capsnap->ci;
1845 struct inode *inode = &ci->vfs_inode;
1846 struct ceph_cap *cap;
1847
1848 spin_lock(&inode->i_lock);
1849 cap = ci->i_auth_cap;
1850 if (cap && cap->session == session) {
1851 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
1852 cap, capsnap);
1853 __ceph_flush_snaps(ci, &session);
1854 } else {
1855 pr_err("%p auth cap %p not mds%d ???\n", inode,
1856 cap, session->s_mds);
1857 spin_unlock(&inode->i_lock);
1858 }
1859 }
1860}
1861
1862void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1863 struct ceph_mds_session *session)
1864{
1865 struct ceph_inode_info *ci;
1866
1867 kick_flushing_capsnaps(mdsc, session);
1868
1869 dout("kick_flushing_caps mds%d\n", session->s_mds);
1870 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
1871 struct inode *inode = &ci->vfs_inode;
1872 struct ceph_cap *cap;
1873 int delayed = 0;
1874
1875 spin_lock(&inode->i_lock);
1876 cap = ci->i_auth_cap;
1877 if (cap && cap->session == session) {
1878 dout("kick_flushing_caps %p cap %p %s\n", inode,
1879 cap, ceph_cap_string(ci->i_flushing_caps));
1880 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1881 __ceph_caps_used(ci),
1882 __ceph_caps_wanted(ci),
1883 cap->issued | cap->implemented,
1884 ci->i_flushing_caps, NULL);
1885 if (delayed) {
1886 spin_lock(&inode->i_lock);
1887 __cap_delay_requeue(mdsc, ci);
1888 spin_unlock(&inode->i_lock);
1889 }
1890 } else {
1891 pr_err("%p auth cap %p not mds%d ???\n", inode,
1892 cap, session->s_mds);
1893 spin_unlock(&inode->i_lock);
1894 }
1895 }
1896}
1897
1898
1899/*
1900 * Take references to capabilities we hold, so that we don't release
1901 * them to the MDS prematurely.
1902 *
1903 * Protected by i_lock.
1904 */
1905static void __take_cap_refs(struct ceph_inode_info *ci, int got)
1906{
1907 if (got & CEPH_CAP_PIN)
1908 ci->i_pin_ref++;
1909 if (got & CEPH_CAP_FILE_RD)
1910 ci->i_rd_ref++;
1911 if (got & CEPH_CAP_FILE_CACHE)
1912 ci->i_rdcache_ref++;
1913 if (got & CEPH_CAP_FILE_WR)
1914 ci->i_wr_ref++;
1915 if (got & CEPH_CAP_FILE_BUFFER) {
1916 if (ci->i_wrbuffer_ref == 0)
1917 igrab(&ci->vfs_inode);
1918 ci->i_wrbuffer_ref++;
1919 dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
1920 &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
1921 }
1922}
1923
1924/*
1925 * Try to grab cap references. Specify those refs we @want, and the
1926 * minimal set we @need. Also include the larger offset we are writing
1927 * to (when applicable), and check against max_size here as well.
1928 * Note that caller is responsible for ensuring max_size increases are
1929 * requested from the MDS.
1930 */
1931static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
1932 int *got, loff_t endoff, int *check_max, int *err)
1933{
1934 struct inode *inode = &ci->vfs_inode;
1935 int ret = 0;
1936 int have, implemented;
1937 int file_wanted;
1938
1939 dout("get_cap_refs %p need %s want %s\n", inode,
1940 ceph_cap_string(need), ceph_cap_string(want));
1941 spin_lock(&inode->i_lock);
1942
1943 /* make sure file is actually open */
1944 file_wanted = __ceph_caps_file_wanted(ci);
1945 if ((file_wanted & need) == 0) {
1946 dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
1947 ceph_cap_string(need), ceph_cap_string(file_wanted));
1948 *err = -EBADF;
1949 ret = 1;
1950 goto out;
1951 }
1952
1953 if (need & CEPH_CAP_FILE_WR) {
1954 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
1955 dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
1956 inode, endoff, ci->i_max_size);
1957 if (endoff > ci->i_wanted_max_size) {
1958 *check_max = 1;
1959 ret = 1;
1960 }
1961 goto out;
1962 }
1963 /*
1964 * If a sync write is in progress, we must wait, so that we
1965 * can get a final snapshot value for size+mtime.
1966 */
1967 if (__ceph_have_pending_cap_snap(ci)) {
1968 dout("get_cap_refs %p cap_snap_pending\n", inode);
1969 goto out;
1970 }
1971 }
1972 have = __ceph_caps_issued(ci, &implemented);
1973
1974 /*
1975 * disallow writes while a truncate is pending
1976 */
1977 if (ci->i_truncate_pending)
1978 have &= ~CEPH_CAP_FILE_WR;
1979
1980 if ((have & need) == need) {
1981 /*
1982 * Look at (implemented & ~have & not) so that we keep waiting
1983 * on transition from wanted -> needed caps. This is needed
1984 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
1985 * going before a prior buffered writeback happens.
1986 */
1987 int not = want & ~(have & need);
1988 int revoking = implemented & ~have;
1989 dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
1990 inode, ceph_cap_string(have), ceph_cap_string(not),
1991 ceph_cap_string(revoking));
1992 if ((revoking & not) == 0) {
1993 *got = need | (have & want);
1994 __take_cap_refs(ci, *got);
1995 ret = 1;
1996 }
1997 } else {
1998 dout("get_cap_refs %p have %s needed %s\n", inode,
1999 ceph_cap_string(have), ceph_cap_string(need));
2000 }
2001out:
2002 spin_unlock(&inode->i_lock);
2003 dout("get_cap_refs %p ret %d got %s\n", inode,
2004 ret, ceph_cap_string(*got));
2005 return ret;
2006}
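/*
 * (sketch) The mask test above, isolated, with made-up cap bit values:
 * refs are refused while anything we *want* but do not strictly
 * *need* is still being revoked, which is what keeps a new sync write
 * from overtaking an in-flight buffered-write flush:
 */
#include <stdio.h>

#define WR	0x1	/* illustrative bit values, not the kernel's */
#define BUFFER	0x2

int main(void)
{
	int have = WR;			/* BUFFER already clawed back... */
	int implemented = WR | BUFFER;	/* ...but its data is in flight */
	int need = WR, want = WR | BUFFER;

	int not = want & ~(have & need);	/* wanted but not needed */
	int revoking = implemented & ~have;

	printf("grant refs? %s\n", (revoking & not) == 0 ? "yes" : "no");
	return 0;	/* "no": wait for the BUFFER revoke to finish */
}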
2007
2008/*
2009 * Check the offset we are writing up to against our current
2010 * max_size. If necessary, tell the MDS we want to write to
2011 * a larger offset.
2012 */
2013static void check_max_size(struct inode *inode, loff_t endoff)
2014{
2015 struct ceph_inode_info *ci = ceph_inode(inode);
2016 int check = 0;
2017
2018 /* do we need to explicitly request a larger max_size? */
2019 spin_lock(&inode->i_lock);
2020 if ((endoff >= ci->i_max_size ||
2021 endoff > (inode->i_size << 1)) &&
2022 endoff > ci->i_wanted_max_size) {
2023 dout("write %p at large endoff %llu, req max_size\n",
2024 inode, endoff);
2025 ci->i_wanted_max_size = endoff;
2026 check = 1;
2027 }
2028 spin_unlock(&inode->i_lock);
2029 if (check)
2030 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2031}
2032
2033/*
2034 * Wait for caps, and take cap references. If we can't get a WR cap
2035 * due to a small max_size, make sure we check_max_size (and possibly
2036 * ask the mds) so we don't get hung up indefinitely.
2037 */
2038int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
2039 loff_t endoff)
2040{
2041 int check_max, ret, err;
2042
2043retry:
2044 if (endoff > 0)
2045 check_max_size(&ci->vfs_inode, endoff);
2046 check_max = 0;
2047 err = 0;
2048 ret = wait_event_interruptible(ci->i_cap_wq,
2049 try_get_cap_refs(ci, need, want,
2050 got, endoff,
2051 &check_max, &err));
2052 if (err)
2053 ret = err;
2054 if (check_max)
2055 goto retry;
2056 return ret;
2057}
2058
2059/*
2060 * Take cap refs. Caller must already know we hold at least one ref
2061 * on the caps in question, or we can't know this is safe.
2062 */
2063void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
2064{
2065 spin_lock(&ci->vfs_inode.i_lock);
2066 __take_cap_refs(ci, caps);
2067 spin_unlock(&ci->vfs_inode.i_lock);
2068}
2069
2070/*
2071 * Release cap refs.
2072 *
2073 * If we released the last ref on any given cap, call ceph_check_caps
2074 * to release (or schedule a release).
2075 *
2076 * If we are releasing a WR cap (from a sync write), finalize any affected
2077 * cap_snap, and wake up any waiters.
2078 */
2079void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2080{
2081 struct inode *inode = &ci->vfs_inode;
2082 int last = 0, put = 0, flushsnaps = 0, wake = 0;
2083 struct ceph_cap_snap *capsnap;
2084
2085 spin_lock(&inode->i_lock);
2086 if (had & CEPH_CAP_PIN)
2087 --ci->i_pin_ref;
2088 if (had & CEPH_CAP_FILE_RD)
2089 if (--ci->i_rd_ref == 0)
2090 last++;
2091 if (had & CEPH_CAP_FILE_CACHE)
2092 if (--ci->i_rdcache_ref == 0)
2093 last++;
2094 if (had & CEPH_CAP_FILE_BUFFER) {
2095 if (--ci->i_wrbuffer_ref == 0) {
2096 last++;
2097 put++;
2098 }
2099 dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
2100 inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
2101 }
2102 if (had & CEPH_CAP_FILE_WR)
2103 if (--ci->i_wr_ref == 0) {
2104 last++;
2105 if (!list_empty(&ci->i_cap_snaps)) {
2106 capsnap = list_first_entry(&ci->i_cap_snaps,
2107 struct ceph_cap_snap,
2108 ci_item);
2109 if (capsnap->writing) {
2110 capsnap->writing = 0;
2111 flushsnaps =
2112 __ceph_finish_cap_snap(ci,
2113 capsnap);
2114 wake = 1;
2115 }
2116 }
2117 }
2118 spin_unlock(&inode->i_lock);
2119
2120 dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
2121 last ? "last" : "");
2122
2123 if (last && !flushsnaps)
2124 ceph_check_caps(ci, 0, NULL);
2125 else if (flushsnaps)
2126 ceph_flush_snaps(ci);
2127 if (wake)
2128 wake_up(&ci->i_cap_wq);
2129 if (put)
2130 iput(inode);
2131}
2132
2133/*
2134 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
2135 * context. Adjust per-snap dirty page accounting as appropriate.
2136 * Once all dirty data for a cap_snap is flushed, flush snapped file
2137 * metadata back to the MDS. If we dropped the last ref, call
2138 * ceph_check_caps.
2139 */
2140void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2141 struct ceph_snap_context *snapc)
2142{
2143 struct inode *inode = &ci->vfs_inode;
2144 int last = 0;
2145 int last_snap = 0;
2146 int found = 0;
2147 struct ceph_cap_snap *capsnap = NULL;
2148
2149 spin_lock(&inode->i_lock);
2150 ci->i_wrbuffer_ref -= nr;
2151 last = !ci->i_wrbuffer_ref;
2152
2153 if (ci->i_head_snapc == snapc) {
2154 ci->i_wrbuffer_ref_head -= nr;
2155 if (!ci->i_wrbuffer_ref_head) {
2156 ceph_put_snap_context(ci->i_head_snapc);
2157 ci->i_head_snapc = NULL;
2158 }
2159 dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
2160 inode,
2161 ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
2162 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
2163 last ? " LAST" : "");
2164 } else {
2165 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2166 if (capsnap->context == snapc) {
2167 found = 1;
2168 capsnap->dirty_pages -= nr;
2169 last_snap = !capsnap->dirty_pages;
2170 break;
2171 }
2172 }
2173 BUG_ON(!found);
2174 dout("put_wrbuffer_cap_refs on %p cap_snap %p "
2175 " snap %lld %d/%d -> %d/%d %s%s\n",
2176 inode, capsnap, capsnap->context->seq,
2177 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
2178 ci->i_wrbuffer_ref, capsnap->dirty_pages,
2179 last ? " (wrbuffer last)" : "",
2180 last_snap ? " (capsnap last)" : "");
2181 }
2182
2183 spin_unlock(&inode->i_lock);
2184
2185 if (last) {
2186 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2187 iput(inode);
2188 } else if (last_snap) {
2189 ceph_flush_snaps(ci);
2190 wake_up(&ci->i_cap_wq);
2191 }
2192}
2193
2194/*
2195 * Handle a cap GRANT message from the MDS. (Note that a GRANT may
2196 * actually be a revocation if it specifies a smaller cap set.)
2197 *
2198 * caller holds s_mutex and i_lock, we drop both.
2199 *
2200 * check_caps values (the function itself returns void):
2201 * 0 - ok
2202 * 1 - check_caps on auth cap only (writeback)
2203 * 2 - check_caps (ack revoke)
2204 */
2205static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2206 struct ceph_mds_session *session,
2207 struct ceph_cap *cap,
2208 struct ceph_buffer *xattr_buf)
2209 __releases(inode->i_lock)
2210 __releases(session->s_mutex)
2211{
2212 struct ceph_inode_info *ci = ceph_inode(inode);
2213 int mds = session->s_mds;
2214 int seq = le32_to_cpu(grant->seq);
2215 int newcaps = le32_to_cpu(grant->caps);
2216 int issued, implemented, used, wanted, dirty;
2217 u64 size = le64_to_cpu(grant->size);
2218 u64 max_size = le64_to_cpu(grant->max_size);
2219 struct timespec mtime, atime, ctime;
2220 int check_caps = 0;
2221 int wake = 0;
2222 int writeback = 0;
2223 int revoked_rdcache = 0;
2224 int queue_invalidate = 0;
2225
2226 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2227 inode, cap, mds, seq, ceph_cap_string(newcaps));
2228 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2229 inode->i_size);
2230
2231 /*
2232 * If CACHE is being revoked, and we have no dirty buffers,
2233 * try to invalidate (once). (If there are dirty buffers, we
2234 * will invalidate _after_ writeback.)
2235 */
2236 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
2237 !ci->i_wrbuffer_ref) {
2238 if (try_nonblocking_invalidate(inode) == 0) {
2239 revoked_rdcache = 1;
2240 } else {
2241 /* there were locked pages.. invalidate later
2242 in a separate thread. */
2243 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
2244 queue_invalidate = 1;
2245 ci->i_rdcache_revoking = ci->i_rdcache_gen;
2246 }
2247 }
2248 }
2249
2250 /* side effects now are allowed */
2251
2252 issued = __ceph_caps_issued(ci, &implemented);
2253 issued |= implemented | __ceph_caps_dirty(ci);
2254
2255 cap->cap_gen = session->s_cap_gen;
2256
2257 __check_cap_issue(ci, cap, newcaps);
2258
2259 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
2260 inode->i_mode = le32_to_cpu(grant->mode);
2261 inode->i_uid = le32_to_cpu(grant->uid);
2262 inode->i_gid = le32_to_cpu(grant->gid);
2263 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
2264 inode->i_uid, inode->i_gid);
2265 }
2266
2267 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
2268 inode->i_nlink = le32_to_cpu(grant->nlink);
2269
2270 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
2271 int len = le32_to_cpu(grant->xattr_len);
2272 u64 version = le64_to_cpu(grant->xattr_version);
2273
2274 if (version > ci->i_xattrs.version) {
2275 dout(" got new xattrs v%llu on %p len %d\n",
2276 version, inode, len);
2277 if (ci->i_xattrs.blob)
2278 ceph_buffer_put(ci->i_xattrs.blob);
2279 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
2280 ci->i_xattrs.version = version;
2281 }
2282 }
2283
2284 /* size/ctime/mtime/atime? */
2285 ceph_fill_file_size(inode, issued,
2286 le32_to_cpu(grant->truncate_seq),
2287 le64_to_cpu(grant->truncate_size), size);
2288 ceph_decode_timespec(&mtime, &grant->mtime);
2289 ceph_decode_timespec(&atime, &grant->atime);
2290 ceph_decode_timespec(&ctime, &grant->ctime);
2291 ceph_fill_file_time(inode, issued,
2292 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
2293 &atime);
2294
2295 /* max size increase? */
2296 if (max_size != ci->i_max_size) {
2297 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
2298 ci->i_max_size = max_size;
2299 if (max_size >= ci->i_wanted_max_size) {
2300 ci->i_wanted_max_size = 0; /* reset */
2301 ci->i_requested_max_size = 0;
2302 }
2303 wake = 1;
2304 }
2305
2306 /* check cap bits */
2307 wanted = __ceph_caps_wanted(ci);
2308 used = __ceph_caps_used(ci);
2309 dirty = __ceph_caps_dirty(ci);
2310 dout(" my wanted = %s, used = %s, dirty %s\n",
2311 ceph_cap_string(wanted),
2312 ceph_cap_string(used),
2313 ceph_cap_string(dirty));
2314 if (wanted != le32_to_cpu(grant->wanted)) {
2315 dout("mds wanted %s -> %s\n",
2316 ceph_cap_string(le32_to_cpu(grant->wanted)),
2317 ceph_cap_string(wanted));
2318 grant->wanted = cpu_to_le32(wanted);
2319 }
2320
2321 cap->seq = seq;
2322
2323 /* file layout may have changed */
2324 ci->i_layout = grant->layout;
2325
2326 /* revocation, grant, or no-op? */
2327 if (cap->issued & ~newcaps) {
2328 dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
2329 ceph_cap_string(newcaps));
2330 if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
2331 writeback = 1; /* will delay ack */
2332 else if (dirty & ~newcaps)
2333 check_caps = 1; /* initiate writeback in check_caps */
2334 else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
2335 revoked_rdcache)
2336 check_caps = 2; /* send revoke ack in check_caps */
2337 cap->issued = newcaps;
2338 cap->implemented |= newcaps;
2339 } else if (cap->issued == newcaps) {
2340 dout("caps unchanged: %s -> %s\n",
2341 ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
2342 } else {
2343 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
2344 ceph_cap_string(newcaps));
2345 cap->issued = newcaps;
2346 cap->implemented |= newcaps; /* add bits only, to
2347 * avoid stepping on a
2348 * pending revocation */
2349 wake = 1;
2350 }
2351 BUG_ON(cap->issued & ~cap->implemented);
2352
2353 spin_unlock(&inode->i_lock);
2354 if (writeback)
2355 /*
2356 * queue inode for writeback: we can't actually call
2357 * filemap_write_and_wait, etc. from message handler
2358 * context.
2359 */
2360 ceph_queue_writeback(inode);
2361 if (queue_invalidate)
2362 ceph_queue_invalidate(inode);
2363 if (wake)
2364 wake_up(&ci->i_cap_wq);
2365
2366 if (check_caps == 1)
2367 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
2368 session);
2369 else if (check_caps == 2)
2370 ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
2371 else
2372 mutex_unlock(&session->s_mutex);
2373}
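/*
 * (sketch) The revocation/grant/no-op classification above, reduced
 * to its masks (bit values are illustrative): any issued bit missing
 * from newcaps makes the message a revocation; identical masks are a
 * no-op; anything else is a grant:
 */
#include <stdio.h>

static const char *classify(int issued, int newcaps)
{
	if (issued & ~newcaps)
		return "revocation";
	if (issued == newcaps)
		return "no-op";
	return "grant";
}

int main(void)
{
	printf("%s\n", classify(0x3, 0x1));	/* losing a bit */
	printf("%s\n", classify(0x1, 0x3));	/* gaining a bit */
	printf("%s\n", classify(0x1, 0x1));
	return 0;
}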
2374
2375/*
2376 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
2377 * MDS has been safely committed.
2378 */
2379static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2380 struct ceph_mds_caps *m,
2381 struct ceph_mds_session *session,
2382 struct ceph_cap *cap)
2383 __releases(inode->i_lock)
2384{
2385 struct ceph_inode_info *ci = ceph_inode(inode);
2386 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
2387 unsigned seq = le32_to_cpu(m->seq);
2388 int dirty = le32_to_cpu(m->dirty);
2389 int cleaned = 0;
2390 int drop = 0;
2391 int i;
2392
2393 for (i = 0; i < CEPH_CAP_BITS; i++)
2394 if ((dirty & (1 << i)) &&
2395 flush_tid == ci->i_cap_flush_tid[i])
2396 cleaned |= 1 << i;
2397
2398 dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
2399 " flushing %s -> %s\n",
2400 inode, session->s_mds, seq, ceph_cap_string(dirty),
2401 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
2402 ceph_cap_string(ci->i_flushing_caps & ~cleaned));
2403
2404 if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
2405 goto out;
2406
2407 ci->i_flushing_caps &= ~cleaned;
2408
2409 spin_lock(&mdsc->cap_dirty_lock);
2410 if (ci->i_flushing_caps == 0) {
2411 list_del_init(&ci->i_flushing_item);
2412 if (!list_empty(&session->s_cap_flushing))
2413 dout(" mds%d still flushing cap on %p\n",
2414 session->s_mds,
2415 &list_entry(session->s_cap_flushing.next,
2416 struct ceph_inode_info,
2417 i_flushing_item)->vfs_inode);
2418 mdsc->num_cap_flushing--;
2419 wake_up(&mdsc->cap_flushing_wq);
2420 dout(" inode %p now !flushing\n", inode);
2421
2422 if (ci->i_dirty_caps == 0) {
2423 dout(" inode %p now clean\n", inode);
2424 BUG_ON(!list_empty(&ci->i_dirty_item));
2425 drop = 1;
2426 } else {
2427 BUG_ON(list_empty(&ci->i_dirty_item));
2428 }
2429 }
2430 spin_unlock(&mdsc->cap_dirty_lock);
2431 wake_up(&ci->i_cap_wq);
2432
2433out:
2434 spin_unlock(&inode->i_lock);
2435 if (drop)
2436 iput(inode);
2437}
2438
2439/*
2440 * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
2441 * throw away our cap_snap.
2442 *
2443 * Caller holds s_mutex.
2444 */
2445static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
2446 struct ceph_mds_caps *m,
2447 struct ceph_mds_session *session)
2448{
2449 struct ceph_inode_info *ci = ceph_inode(inode);
2450 u64 follows = le64_to_cpu(m->snap_follows);
2451 struct ceph_cap_snap *capsnap;
2452 int drop = 0;
2453
2454 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
2455 inode, ci, session->s_mds, follows);
2456
2457 spin_lock(&inode->i_lock);
2458 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2459 if (capsnap->follows == follows) {
2460 if (capsnap->flush_tid != flush_tid) {
2461 dout(" cap_snap %p follows %lld tid %lld !="
2462 " %lld\n", capsnap, follows,
2463 flush_tid, capsnap->flush_tid);
2464 break;
2465 }
2466 WARN_ON(capsnap->dirty_pages || capsnap->writing);
2467 dout(" removing cap_snap %p follows %lld\n",
2468 capsnap, follows);
2469 ceph_put_snap_context(capsnap->context);
2470 list_del(&capsnap->ci_item);
2471 list_del(&capsnap->flushing_item);
2472 ceph_put_cap_snap(capsnap);
2473 drop = 1;
2474 break;
2475 } else {
2476 dout(" skipping cap_snap %p follows %lld\n",
2477 capsnap, capsnap->follows);
2478 }
2479 }
2480 spin_unlock(&inode->i_lock);
2481 if (drop)
2482 iput(inode);
2483}
2484
2485/*
2486 * Handle TRUNC from MDS, indicating file truncation.
2487 *
2488 * caller holds s_mutex.
2489 */
2490static void handle_cap_trunc(struct inode *inode,
2491 struct ceph_mds_caps *trunc,
2492 struct ceph_mds_session *session)
2493 __releases(inode->i_lock)
2494{
2495 struct ceph_inode_info *ci = ceph_inode(inode);
2496 int mds = session->s_mds;
2497 int seq = le32_to_cpu(trunc->seq);
2498 u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
2499 u64 truncate_size = le64_to_cpu(trunc->truncate_size);
2500 u64 size = le64_to_cpu(trunc->size);
2501 int implemented = 0;
2502 int dirty = __ceph_caps_dirty(ci);
2503 int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
2504 int queue_trunc = 0;
2505
2506 issued |= implemented | dirty;
2507
2508 dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
2509 inode, mds, seq, truncate_size, truncate_seq);
2510 queue_trunc = ceph_fill_file_size(inode, issued,
2511 truncate_seq, truncate_size, size);
2512 spin_unlock(&inode->i_lock);
2513
2514 if (queue_trunc)
2515 ceph_queue_vmtruncate(inode);
2516}
2517
2518/*
2519 * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
2520 * different one. If we are the most recent migration we've seen (as
2521 * indicated by mseq), make note of the migrating cap bits for the
2522 * duration (until we see the corresponding IMPORT).
2523 *
2524 * caller holds s_mutex
2525 */
2526static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2527 struct ceph_mds_session *session)
2528{
2529 struct ceph_inode_info *ci = ceph_inode(inode);
2530 int mds = session->s_mds;
2531 unsigned mseq = le32_to_cpu(ex->migrate_seq);
2532 struct ceph_cap *cap = NULL, *t;
2533 struct rb_node *p;
2534 int remember = 1;
2535
2536 dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
2537 inode, ci, mds, mseq);
2538
2539 spin_lock(&inode->i_lock);
2540
2541 /* make sure we haven't seen a higher mseq */
2542 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
2543 t = rb_entry(p, struct ceph_cap, ci_node);
2544 if (ceph_seq_cmp(t->mseq, mseq) > 0) {
2545 dout(" higher mseq on cap from mds%d\n",
2546 t->session->s_mds);
2547 remember = 0;
2548 }
2549 if (t->session->s_mds == mds)
2550 cap = t;
2551 }
2552
2553 if (cap) {
2554 if (remember) {
2555 /* make note */
2556 ci->i_cap_exporting_mds = mds;
2557 ci->i_cap_exporting_mseq = mseq;
2558 ci->i_cap_exporting_issued = cap->issued;
2559 }
2560 __ceph_remove_cap(cap);
2561 }
2562 /* else, we already released it */
2563
2564 spin_unlock(&inode->i_lock);
2565}
2566
2567/*
2568 * Handle cap IMPORT. If there are temp bits from an older EXPORT,
2569 * clean them up.
2570 *
2571 * caller holds s_mutex.
2572 */
2573static void handle_cap_import(struct ceph_mds_client *mdsc,
2574 struct inode *inode, struct ceph_mds_caps *im,
2575 struct ceph_mds_session *session,
2576 void *snaptrace, int snaptrace_len)
2577{
2578 struct ceph_inode_info *ci = ceph_inode(inode);
2579 int mds = session->s_mds;
2580 unsigned issued = le32_to_cpu(im->caps);
2581 unsigned wanted = le32_to_cpu(im->wanted);
2582 unsigned seq = le32_to_cpu(im->seq);
2583 unsigned mseq = le32_to_cpu(im->migrate_seq);
2584 u64 realmino = le64_to_cpu(im->realm);
2585 u64 cap_id = le64_to_cpu(im->cap_id);
2586
2587 if (ci->i_cap_exporting_mds >= 0 &&
2588 ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
2589 dout("handle_cap_import inode %p ci %p mds%d mseq %d"
2590 " - cleared exporting from mds%d\n",
2591 inode, ci, mds, mseq,
2592 ci->i_cap_exporting_mds);
2593 ci->i_cap_exporting_issued = 0;
2594 ci->i_cap_exporting_mseq = 0;
2595 ci->i_cap_exporting_mds = -1;
2596 } else {
2597 dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
2598 inode, ci, mds, mseq);
2599 }
2600
2601 down_write(&mdsc->snap_rwsem);
2602 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
2603 false);
2604 downgrade_write(&mdsc->snap_rwsem);
2605 ceph_add_cap(inode, session, cap_id, -1,
2606 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2607 NULL /* no caps context */);
2608 try_flush_caps(inode, session, NULL);
2609 up_read(&mdsc->snap_rwsem);
2610}
2611
2612/*
2613 * Handle a caps message from the MDS.
2614 *
2615 * Identify the appropriate session, inode, and call the right handler
2616 * based on the cap op.
2617 */
2618void ceph_handle_caps(struct ceph_mds_session *session,
2619 struct ceph_msg *msg)
2620{
2621 struct ceph_mds_client *mdsc = session->s_mdsc;
2622 struct super_block *sb = mdsc->client->sb;
2623 struct inode *inode;
2624 struct ceph_cap *cap;
2625 struct ceph_mds_caps *h;
2626 int mds = session->s_mds;
2627 int op;
2628 u32 seq;
2629 struct ceph_vino vino;
2630 u64 cap_id;
2631 u64 size, max_size;
2632 u64 tid;
2633 void *snaptrace;
2634
2635 dout("handle_caps from mds%d\n", mds);
2636
2637 /* decode */
2638 tid = le64_to_cpu(msg->hdr.tid);
2639 if (msg->front.iov_len < sizeof(*h))
2640 goto bad;
2641 h = msg->front.iov_base;
2642 snaptrace = h + 1;
2643 op = le32_to_cpu(h->op);
2644 vino.ino = le64_to_cpu(h->ino);
2645 vino.snap = CEPH_NOSNAP;
2646 cap_id = le64_to_cpu(h->cap_id);
2647 seq = le32_to_cpu(h->seq);
2648 size = le64_to_cpu(h->size);
2649 max_size = le64_to_cpu(h->max_size);
2650
2651 mutex_lock(&session->s_mutex);
2652 session->s_seq++;
2653 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
2654 (unsigned)seq);
2655
2656 /* lookup ino */
2657 inode = ceph_find_inode(sb, vino);
2658 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
2659 vino.snap, inode);
2660 if (!inode) {
2661 dout(" i don't have ino %llx\n", vino.ino);
2662 goto done;
2663 }
2664
2665 /* these will work even if we don't have a cap yet */
2666 switch (op) {
2667 case CEPH_CAP_OP_FLUSHSNAP_ACK:
2668 handle_cap_flushsnap_ack(inode, tid, h, session);
2669 goto done;
2670
2671 case CEPH_CAP_OP_EXPORT:
2672 handle_cap_export(inode, h, session);
2673 goto done;
2674
2675 case CEPH_CAP_OP_IMPORT:
2676 handle_cap_import(mdsc, inode, h, session,
2677 snaptrace, le32_to_cpu(h->snap_trace_len));
2678 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
2679 session);
2680 goto done_unlocked;
2681 }
2682
2683 /* the rest require a cap */
2684 spin_lock(&inode->i_lock);
2685 cap = __get_cap_for_mds(ceph_inode(inode), mds);
2686 if (!cap) {
2687 dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
2688 inode, ceph_ino(inode), ceph_snap(inode), mds);
2689 spin_unlock(&inode->i_lock);
2690 goto done;
2691 }
2692
2693 /* note that each of these drops i_lock for us */
2694 switch (op) {
2695 case CEPH_CAP_OP_REVOKE:
2696 case CEPH_CAP_OP_GRANT:
2697 handle_cap_grant(inode, h, session, cap, msg->middle);
2698 goto done_unlocked;
2699
2700 case CEPH_CAP_OP_FLUSH_ACK:
2701 handle_cap_flush_ack(inode, tid, h, session, cap);
2702 break;
2703
2704 case CEPH_CAP_OP_TRUNC:
2705 handle_cap_trunc(inode, h, session);
2706 break;
2707
2708 default:
2709 spin_unlock(&inode->i_lock);
2710 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
2711 ceph_cap_op_name(op));
2712 }
2713
2714done:
2715 mutex_unlock(&session->s_mutex);
2716done_unlocked:
2717 if (inode)
2718 iput(inode);
2719 return;
2720
2721bad:
2722 pr_err("ceph_handle_caps: corrupt message\n");
2723 ceph_msg_dump(msg);
2724 return;
2725}
2726
2727/*
2728 * Delayed work handler to process end of delayed cap release LRU list.
2729 */
2730void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
2731{
2732 struct ceph_inode_info *ci;
2733 int flags = CHECK_CAPS_NODELAY;
2734
2735 dout("check_delayed_caps\n");
2736 while (1) {
2737 spin_lock(&mdsc->cap_delay_lock);
2738 if (list_empty(&mdsc->cap_delay_list))
2739 break;
2740 ci = list_first_entry(&mdsc->cap_delay_list,
2741 struct ceph_inode_info,
2742 i_cap_delay_list);
2743 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
2744 time_before(jiffies, ci->i_hold_caps_max))
2745 break;
2746 list_del_init(&ci->i_cap_delay_list);
2747 spin_unlock(&mdsc->cap_delay_lock);
2748 dout("check_delayed_caps on %p\n", &ci->vfs_inode);
2749 ceph_check_caps(ci, flags, NULL);
2750 }
2751 spin_unlock(&mdsc->cap_delay_lock);
2752}
2753
2754/*
2755 * Flush all dirty caps to the mds
2756 */
2757void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
2758{
2759 struct ceph_inode_info *ci, *nci = NULL;
2760 struct inode *inode, *ninode = NULL;
2761 struct list_head *p, *n;
2762
2763 dout("flush_dirty_caps\n");
2764 spin_lock(&mdsc->cap_dirty_lock);
2765 list_for_each_safe(p, n, &mdsc->cap_dirty) {
2766 if (nci) {
2767 ci = nci;
2768 inode = ninode;
2769 ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
2770 dout("flush_dirty_caps inode %p (was next inode)\n",
2771 inode);
2772 } else {
2773 ci = list_entry(p, struct ceph_inode_info,
2774 i_dirty_item);
2775 inode = igrab(&ci->vfs_inode);
2776 BUG_ON(!inode);
2777 dout("flush_dirty_caps inode %p\n", inode);
2778 }
2779 if (n != &mdsc->cap_dirty) {
2780 nci = list_entry(n, struct ceph_inode_info,
2781 i_dirty_item);
2782 ninode = igrab(&nci->vfs_inode);
2783 BUG_ON(!ninode);
2784 nci->i_ceph_flags |= CEPH_I_NOFLUSH;
2785 dout("flush_dirty_caps next inode %p, noflush\n",
2786 ninode);
2787 } else {
2788 nci = NULL;
2789 ninode = NULL;
2790 }
2791 spin_unlock(&mdsc->cap_dirty_lock);
2792 if (inode) {
2793 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
2794 NULL);
2795 iput(inode);
2796 }
2797 spin_lock(&mdsc->cap_dirty_lock);
2798 }
2799 spin_unlock(&mdsc->cap_dirty_lock);
2800}
2801
2802/*
2803 * Drop open file reference. If we were the last open file,
2804 * we may need to release capabilities to the MDS (or schedule
2805 * their delayed release).
2806 */
2807void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
2808{
2809 struct inode *inode = &ci->vfs_inode;
2810 int last = 0;
2811
2812 spin_lock(&inode->i_lock);
2813 dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
2814 ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
2815 BUG_ON(ci->i_nr_by_mode[fmode] == 0);
2816 if (--ci->i_nr_by_mode[fmode] == 0)
2817 last++;
2818 spin_unlock(&inode->i_lock);
2819
2820 if (last && ci->i_vino.snap == CEPH_NOSNAP)
2821 ceph_check_caps(ci, 0, NULL);
2822}
2823
2824/*
2825 * Helpers for embedding cap and dentry lease releases into mds
2826 * requests.
2827 *
2828 * @force is used by dentry_release (below) to force inclusion of a
2829 * record for the directory inode, even when there aren't any caps to
2830 * drop.
2831 */
2832int ceph_encode_inode_release(void **p, struct inode *inode,
2833 int mds, int drop, int unless, int force)
2834{
2835 struct ceph_inode_info *ci = ceph_inode(inode);
2836 struct ceph_cap *cap;
2837 struct ceph_mds_request_release *rel = *p;
2838 int ret = 0;
2839 int used = 0;
2840
2841 spin_lock(&inode->i_lock);
2842 used = __ceph_caps_used(ci);
2843
2844 dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
2845 mds, ceph_cap_string(used), ceph_cap_string(drop),
2846 ceph_cap_string(unless));
2847
2848 /* only drop unused caps */
2849 drop &= ~used;
2850
2851 cap = __get_cap_for_mds(ci, mds);
2852 if (cap && __cap_is_valid(cap)) {
2853 if (force ||
2854 ((cap->issued & drop) &&
2855 (cap->issued & unless) == 0)) {
2856 if ((cap->issued & drop) &&
2857 (cap->issued & unless) == 0) {
2858 dout("encode_inode_release %p cap %p %s -> "
2859 "%s\n", inode, cap,
2860 ceph_cap_string(cap->issued),
2861 ceph_cap_string(cap->issued & ~drop));
2862 cap->issued &= ~drop;
2863 cap->implemented &= ~drop;
2864 if (ci->i_ceph_flags & CEPH_I_NODELAY) {
2865 int wanted = __ceph_caps_wanted(ci);
2866 dout(" wanted %s -> %s (act %s)\n",
2867 ceph_cap_string(cap->mds_wanted),
2868 ceph_cap_string(cap->mds_wanted &
2869 ~wanted),
2870 ceph_cap_string(wanted));
2871 cap->mds_wanted &= wanted;
2872 }
2873 } else {
2874 dout("encode_inode_release %p cap %p %s"
2875 " (force)\n", inode, cap,
2876 ceph_cap_string(cap->issued));
2877 }
2878
2879 rel->ino = cpu_to_le64(ceph_ino(inode));
2880 rel->cap_id = cpu_to_le64(cap->cap_id);
2881 rel->seq = cpu_to_le32(cap->seq);
2882 rel->issue_seq = cpu_to_le32(cap->issue_seq);
2883 rel->mseq = cpu_to_le32(cap->mseq);
2884 rel->caps = cpu_to_le32(cap->issued);
2885 rel->wanted = cpu_to_le32(cap->mds_wanted);
2886 rel->dname_len = 0;
2887 rel->dname_seq = 0;
2888 *p += sizeof(*rel);
2889 ret = 1;
2890 } else {
2891 dout("encode_inode_release %p cap %p %s\n",
2892 inode, cap, ceph_cap_string(cap->issued));
2893 }
2894 }
2895 spin_unlock(&inode->i_lock);
2896 return ret;
2897}
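/*
 * (sketch) The release records above are serialized with a cursor:
 * the caller hands in a pointer-to-pointer, each encoder writes its
 * record at *p and bumps the cursor past it, and the final message
 * length is wherever the cursor lands. Stripped down, with made-up
 * record fields:
 */
#include <stdint.h>
#include <stdio.h>

struct rec { uint32_t ino, seq; };

static void encode_rec(void **p, uint32_t ino, uint32_t seq)
{
	struct rec *r = *p;

	r->ino = ino;
	r->seq = seq;
	*p = r + 1;		/* bump the shared cursor past the record */
}

int main(void)
{
	uint32_t buf[16];	/* aligned scratch buffer */
	void *p = buf;

	encode_rec(&p, 1, 10);
	encode_rec(&p, 2, 20);
	printf("encoded %u bytes\n",
	       (unsigned)((unsigned char *)p - (unsigned char *)buf));
	return 0;		/* 16 bytes: two 8-byte records */
}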
2898
2899int ceph_encode_dentry_release(void **p, struct dentry *dentry,
2900 int mds, int drop, int unless)
2901{
2902 struct inode *dir = dentry->d_parent->d_inode;
2903 struct ceph_mds_request_release *rel = *p;
2904 struct ceph_dentry_info *di = ceph_dentry(dentry);
2905 int force = 0;
2906 int ret;
2907
2908 /*
 2909 * force a record for the directory caps if we have a dentry lease.
2910 * this is racy (can't take i_lock and d_lock together), but it
2911 * doesn't have to be perfect; the mds will revoke anything we don't
2912 * release.
2913 */
2914 spin_lock(&dentry->d_lock);
2915 if (di->lease_session && di->lease_session->s_mds == mds)
2916 force = 1;
2917 spin_unlock(&dentry->d_lock);
2918
2919 ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
2920
2921 spin_lock(&dentry->d_lock);
2922 if (ret && di->lease_session && di->lease_session->s_mds == mds) {
2923 dout("encode_dentry_release %p mds%d seq %d\n",
2924 dentry, mds, (int)di->lease_seq);
2925 rel->dname_len = cpu_to_le32(dentry->d_name.len);
2926 memcpy(*p, dentry->d_name.name, dentry->d_name.len);
2927 *p += dentry->d_name.len;
2928 rel->dname_seq = cpu_to_le32(di->lease_seq);
2929 }
2930 spin_unlock(&dentry->d_lock);
2931 return ret;
2932}
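These helpers assume the caller has already reserved room at *p in the outgoing request for a struct ceph_mds_request_release (plus, for the dentry variant, the name bytes). A minimal sketch of the calling pattern (msg_tail, msg_head, and the drop/unless values below are illustrative, not taken from this patch):

	void *p = msg_tail;	/* next free byte in the request message */
	struct ceph_mds_request_head *head = msg_head;
	int releases = 0;

	/* drop an unused FILE_CACHE cap unless FILE_WR is also issued */
	releases += ceph_encode_dentry_release(&p, dentry, mds,
					       CEPH_CAP_FILE_CACHE,
					       CEPH_CAP_FILE_WR);
	head->num_releases = cpu_to_le16(releases);

Each successful call appends one release record, advances *p, and returns 1, so the count can simply be accumulated into num_releases.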
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
new file mode 100644
index 000000000000..1818c2305610
--- /dev/null
+++ b/fs/ceph/ceph_debug.h
@@ -0,0 +1,37 @@
1#ifndef _FS_CEPH_DEBUG_H
2#define _FS_CEPH_DEBUG_H
3
4#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5
6#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
7
8/*
9 * wrap pr_debug to include a filename:lineno prefix on each line.
10 * this incurs some overhead (kernel size and execution time) due to
11 * the extra function call at each call site.
12 */
13
14# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
15extern const char *ceph_file_part(const char *s, int len);
16# define dout(fmt, ...) \
17 pr_debug(" %12.12s:%-4d : " fmt, \
18 ceph_file_part(__FILE__, sizeof(__FILE__)), \
19 __LINE__, ##__VA_ARGS__)
20# else
 21/* faux printk call so the compiler still checks the format arguments. */
22# define dout(fmt, ...) do { \
23 if (0) \
24 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
25 } while (0)
26# endif
27
28#else
29
30/*
31 * or, just wrap pr_debug
32 */
33# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
34
35#endif
36
37#endif
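dout() is the workhorse debug macro throughout these files. A representative call site, with inode and flags standing in for whatever the caller has at hand:

	dout("open %p flags 0%o\n", inode, flags);

With CONFIG_CEPH_FS_PRETTYDEBUG plus DEBUG (or dynamic debug) enabled, the output gains a "file.c:123 :" style prefix; otherwise the if (0) printk() variant generates no code while still letting the compiler type-check the arguments.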
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
new file mode 100644
index 000000000000..ab6cf35c4091
--- /dev/null
+++ b/fs/ceph/ceph_frag.c
@@ -0,0 +1,21 @@
1/*
2 * Ceph 'frag' type
3 */
4#include "types.h"
5
6int ceph_frag_compare(__u32 a, __u32 b)
7{
8 unsigned va = ceph_frag_value(a);
9 unsigned vb = ceph_frag_value(b);
10 if (va < vb)
11 return -1;
12 if (va > vb)
13 return 1;
14 va = ceph_frag_bits(a);
15 vb = ceph_frag_bits(b);
16 if (va < vb)
17 return -1;
18 if (va > vb)
19 return 1;
20 return 0;
21}
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
new file mode 100644
index 000000000000..793f50cb7c22
--- /dev/null
+++ b/fs/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
1#ifndef _FS_CEPH_FRAG_H
2#define _FS_CEPH_FRAG_H
3
4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space,
6 * using a mask and a value to match against that mask. Any given frag
7 * (subset of the number space) can be partitioned into 2^n sub-frags.
8 *
9 * Frags are encoded into a 32-bit word:
10 * 8 upper bits = "bits"
11 * 24 lower bits = "value"
12 * (We could go to 5+27 bits, but who cares.)
13 *
 14 * We use the _most_ significant bits of the 24-bit value.  This makes
 15 * values sort logically.
16 *
17 * Unfortunately, because the "bits" field is still in the high bits, we
18 * can't sort encoded frags numerically. However, it does allow you
 19 * to feed encoded frags as values into ceph_frag_contains_value().
20 */
21static inline __u32 ceph_frag_make(__u32 b, __u32 v)
22{
23 return (b << 24) |
24 (v & (0xffffffu << (24-b)) & 0xffffffu);
25}
26static inline __u32 ceph_frag_bits(__u32 f)
27{
28 return f >> 24;
29}
30static inline __u32 ceph_frag_value(__u32 f)
31{
32 return f & 0xffffffu;
33}
34static inline __u32 ceph_frag_mask(__u32 f)
35{
36 return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
37}
38static inline __u32 ceph_frag_mask_shift(__u32 f)
39{
40 return 24 - ceph_frag_bits(f);
41}
42
43static inline int ceph_frag_contains_value(__u32 f, __u32 v)
44{
45 return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
46}
47static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
48{
49 /* is sub as specific as us, and contained by us? */
50 return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
51 (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
52}
53
54static inline __u32 ceph_frag_parent(__u32 f)
55{
56 return ceph_frag_make(ceph_frag_bits(f) - 1,
57 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
58}
59static inline int ceph_frag_is_left_child(__u32 f)
60{
61 return ceph_frag_bits(f) > 0 &&
62 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
63}
64static inline int ceph_frag_is_right_child(__u32 f)
65{
66 return ceph_frag_bits(f) > 0 &&
 67 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) != 0;
68}
69static inline __u32 ceph_frag_sibling(__u32 f)
70{
71 return ceph_frag_make(ceph_frag_bits(f),
72 ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
73}
74static inline __u32 ceph_frag_left_child(__u32 f)
75{
76 return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
77}
78static inline __u32 ceph_frag_right_child(__u32 f)
79{
80 return ceph_frag_make(ceph_frag_bits(f)+1,
81 ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
82}
83static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
84{
85 int newbits = ceph_frag_bits(f) + by;
86 return ceph_frag_make(newbits,
87 ceph_frag_value(f) | (i << (24 - newbits)));
88}
89static inline int ceph_frag_is_leftmost(__u32 f)
90{
91 return ceph_frag_value(f) == 0;
92}
93static inline int ceph_frag_is_rightmost(__u32 f)
94{
95 return ceph_frag_value(f) == ceph_frag_mask(f);
96}
97static inline __u32 ceph_frag_next(__u32 f)
98{
99 return ceph_frag_make(ceph_frag_bits(f),
100 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
101}
102
103/*
104 * comparator to sort frags logically, as when traversing the
105 * number space in ascending order...
106 */
107int ceph_frag_compare(__u32 a, __u32 b);
108
109#endif
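A short worked example may help; the values below follow directly from the definitions above:

	__u32 root = ceph_frag_make(0, 0);	/* covers the whole 24-bit space */
	__u32 l = ceph_frag_left_child(root);	/* 0x01000000: bits=1, value=0x000000 */
	__u32 r = ceph_frag_right_child(root);	/* 0x01800000: bits=1, value=0x800000 */

	ceph_frag_contains_value(r, 0xabcdef);	/* 1: top value bit is set */
	ceph_frag_contains_frag(root, r);	/* 1: the root contains everything */
	ceph_frag_parent(r) == root;		/* 1 */

Note how l and r sort in value order even though the "bits" field lives in the high byte; that is the property ceph_frag_compare() builds on.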
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
new file mode 100644
index 000000000000..79d76bc4303f
--- /dev/null
+++ b/fs/ceph/ceph_fs.c
@@ -0,0 +1,74 @@
1/*
2 * Some non-inline ceph helpers
3 */
4#include "types.h"
5
6/*
7 * return true if @layout appears to be valid
8 */
9int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
10{
11 __u32 su = le32_to_cpu(layout->fl_stripe_unit);
12 __u32 sc = le32_to_cpu(layout->fl_stripe_count);
13 __u32 os = le32_to_cpu(layout->fl_object_size);
14
15 /* stripe unit, object size must be non-zero, 64k increment */
16 if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
17 return 0;
18 if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
19 return 0;
20 /* object size must be a multiple of stripe unit */
21 if (os < su || os % su)
22 return 0;
23 /* stripe count must be non-zero */
24 if (!sc)
25 return 0;
26 return 1;
27}
28
29
30int ceph_flags_to_mode(int flags)
31{
32#ifdef O_DIRECTORY /* fixme */
33 if ((flags & O_DIRECTORY) == O_DIRECTORY)
34 return CEPH_FILE_MODE_PIN;
35#endif
36#ifdef O_LAZY
37 if (flags & O_LAZY)
38 return CEPH_FILE_MODE_LAZY;
39#endif
40 if ((flags & O_APPEND) == O_APPEND)
41 flags |= O_WRONLY;
42
43 flags &= O_ACCMODE;
44 if ((flags & O_RDWR) == O_RDWR)
45 return CEPH_FILE_MODE_RDWR;
46 if ((flags & O_WRONLY) == O_WRONLY)
47 return CEPH_FILE_MODE_WR;
48 return CEPH_FILE_MODE_RD;
49}
50
51int ceph_caps_for_mode(int mode)
52{
53 switch (mode) {
54 case CEPH_FILE_MODE_PIN:
55 return CEPH_CAP_PIN;
56 case CEPH_FILE_MODE_RD:
57 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
58 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
59 case CEPH_FILE_MODE_RDWR:
60 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
61 CEPH_CAP_FILE_EXCL |
62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
63 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
64 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
65 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
66 case CEPH_FILE_MODE_WR:
67 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
68 CEPH_CAP_FILE_EXCL |
69 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
70 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
71 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
72 }
73 return 0;
74}
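Taken together: an open(2) with O_RDWR|O_APPEND first gets O_WRONLY folded in, then maps to CEPH_FILE_MODE_RDWR, and the cap set for that mode follows:

	int mode = ceph_flags_to_mode(O_RDWR | O_APPEND);	/* CEPH_FILE_MODE_RDWR */
	int caps = ceph_caps_for_mode(mode);
	/* caps = PIN | FILE_SHARED | FILE_EXCL | FILE_RD | FILE_CACHE |
	 *        FILE_WR | FILE_BUFFER | AUTH_SHARED | AUTH_EXCL |
	 *        XATTR_SHARED | XATTR_EXCL */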
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
new file mode 100644
index 000000000000..0c2241ef3653
--- /dev/null
+++ b/fs/ceph/ceph_fs.h
@@ -0,0 +1,650 @@
1/*
2 * ceph_fs.h - Ceph constants and data types to share between kernel and
3 * user space.
4 *
5 * Most types in this file are defined as little-endian, and are
6 * primarily intended to describe data structures that pass over the
7 * wire or that are stored on disk.
8 *
9 * LGPL2
10 */
11
12#ifndef _FS_CEPH_CEPH_FS_H
13#define _FS_CEPH_CEPH_FS_H
14
15#include "msgr.h"
16#include "rados.h"
17
18/*
19 * Ceph release version
20 */
21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 19
23#define CEPH_VERSION_PATCH 0
24
25#define _CEPH_STRINGIFY(x) #x
26#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
27#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
28 "." CEPH_STRINGIFY(z)
29#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
30 CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
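 /* e.g. with 0.19.0 above, CEPH_VERSION expands to the string "0.19.0" */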
31
32/*
 33 * subprotocol versions.  when specific message types or high-level
 34 * protocols change, bump the affected components.  we rev internal
 35 * cluster protocols separately from the public, client-facing
 36 * protocol.
37 */
38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 9 /* cluster internal */
40#define CEPH_MON_PROTOCOL 5 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 24 /* server/client */
42#define CEPH_MDSC_PROTOCOL 32 /* server/client */
43#define CEPH_MONC_PROTOCOL 15 /* server/client */
44
45
46#define CEPH_INO_ROOT 1
47#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
48
49/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
50#define CEPH_MAX_MON 31
51
52
53/*
54 * feature bits
55 */
56#define CEPH_FEATURE_SUPPORTED 0
57#define CEPH_FEATURE_REQUIRED 0
58
59
60/*
61 * ceph_file_layout - describe data layout for a file/inode
62 */
63struct ceph_file_layout {
64 /* file -> object mapping */
65 __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
66 of page size. */
67 __le32 fl_stripe_count; /* over this many objects */
68 __le32 fl_object_size; /* until objects are this big, then move to
69 new objects */
70 __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
71
72 /* pg -> disk layout */
73 __le32 fl_object_stripe_unit; /* for per-object parity, if any */
74
75 /* object -> pg layout */
76 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
77 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
78} __attribute__ ((packed));
79
80#define CEPH_MIN_STRIPE_UNIT 65536
81
82int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
83
84
85/* crypto algorithms */
86#define CEPH_CRYPTO_NONE 0x0
87#define CEPH_CRYPTO_AES 0x1
88
89/* security/authentication protocols */
90#define CEPH_AUTH_UNKNOWN 0x0
91#define CEPH_AUTH_NONE 0x1
92#define CEPH_AUTH_CEPHX 0x2
93
94
95/*********************************************
96 * message layer
97 */
98
99/*
100 * message types
101 */
102
103/* misc */
104#define CEPH_MSG_SHUTDOWN 1
105#define CEPH_MSG_PING 2
106
107/* client <-> monitor */
108#define CEPH_MSG_MON_MAP 4
109#define CEPH_MSG_MON_GET_MAP 5
110#define CEPH_MSG_STATFS 13
111#define CEPH_MSG_STATFS_REPLY 14
112#define CEPH_MSG_MON_SUBSCRIBE 15
113#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
114#define CEPH_MSG_AUTH 17
115#define CEPH_MSG_AUTH_REPLY 18
116
117/* client <-> mds */
118#define CEPH_MSG_MDS_MAP 21
119
120#define CEPH_MSG_CLIENT_SESSION 22
121#define CEPH_MSG_CLIENT_RECONNECT 23
122
123#define CEPH_MSG_CLIENT_REQUEST 24
124#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
125#define CEPH_MSG_CLIENT_REPLY 26
126#define CEPH_MSG_CLIENT_CAPS 0x310
127#define CEPH_MSG_CLIENT_LEASE 0x311
128#define CEPH_MSG_CLIENT_SNAP 0x312
129#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
130
131/* osd */
132#define CEPH_MSG_OSD_MAP 41
133#define CEPH_MSG_OSD_OP 42
134#define CEPH_MSG_OSD_OPREPLY 43
135
136struct ceph_mon_request_header {
137 __le64 have_version;
138 __le16 session_mon;
139 __le64 session_mon_tid;
140} __attribute__ ((packed));
141
142struct ceph_mon_statfs {
143 struct ceph_mon_request_header monhdr;
144 struct ceph_fsid fsid;
145} __attribute__ ((packed));
146
147struct ceph_statfs {
148 __le64 kb, kb_used, kb_avail;
149 __le64 num_objects;
150} __attribute__ ((packed));
151
152struct ceph_mon_statfs_reply {
153 struct ceph_fsid fsid;
154 __le64 version;
155 struct ceph_statfs st;
156} __attribute__ ((packed));
157
158struct ceph_osd_getmap {
159 struct ceph_mon_request_header monhdr;
160 struct ceph_fsid fsid;
161 __le32 start;
162} __attribute__ ((packed));
163
164struct ceph_mds_getmap {
165 struct ceph_mon_request_header monhdr;
166 struct ceph_fsid fsid;
167} __attribute__ ((packed));
168
169struct ceph_client_mount {
170 struct ceph_mon_request_header monhdr;
171} __attribute__ ((packed));
172
173struct ceph_mon_subscribe_item {
174 __le64 have_version; __le64 have;
175 __u8 onetime;
176} __attribute__ ((packed));
177
178struct ceph_mon_subscribe_ack {
179 __le32 duration; /* seconds */
180 struct ceph_fsid fsid;
181} __attribute__ ((packed));
182
183/*
184 * mds states
185 * > 0 -> in
186 * <= 0 -> out
187 */
188#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
189#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
190 empty log. */
191#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
192#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
193#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
194#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
195#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
196
197#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
198#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
199 operations (import, rename, etc.) */
200#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
201#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
202#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
203#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
204#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
205
206extern const char *ceph_mds_state_name(int s);
207
208
209/*
210 * metadata lock types.
211 * - these are bitmasks.. we can compose them
212 * - they also define the lock ordering by the MDS
213 * - a few of these are internal to the mds
214 */
215#define CEPH_LOCK_DN 1
216#define CEPH_LOCK_ISNAP 2
217#define CEPH_LOCK_IVERSION 4 /* mds internal */
218#define CEPH_LOCK_IFILE 8 /* mds internal */
219#define CEPH_LOCK_IAUTH 32
220#define CEPH_LOCK_ILINK 64
221#define CEPH_LOCK_IDFT 128 /* dir frag tree */
222#define CEPH_LOCK_INEST 256 /* mds internal */
223#define CEPH_LOCK_IXATTR 512
224#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */
225
226/* client_session ops */
227enum {
228 CEPH_SESSION_REQUEST_OPEN,
229 CEPH_SESSION_OPEN,
230 CEPH_SESSION_REQUEST_CLOSE,
231 CEPH_SESSION_CLOSE,
232 CEPH_SESSION_REQUEST_RENEWCAPS,
233 CEPH_SESSION_RENEWCAPS,
234 CEPH_SESSION_STALE,
235 CEPH_SESSION_RECALL_STATE,
236};
237
238extern const char *ceph_session_op_name(int op);
239
240struct ceph_mds_session_head {
241 __le32 op;
242 __le64 seq;
243 struct ceph_timespec stamp;
244 __le32 max_caps, max_leases;
245} __attribute__ ((packed));
246
247/* client_request */
248/*
249 * metadata ops.
250 * & 0x001000 -> write op
251 * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
 252 * & 0x100000 -> use weird ino/path trace
253 */
254#define CEPH_MDS_OP_WRITE 0x001000
255enum {
256 CEPH_MDS_OP_LOOKUP = 0x00100,
257 CEPH_MDS_OP_GETATTR = 0x00101,
258 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
259 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
260
261 CEPH_MDS_OP_SETXATTR = 0x01105,
262 CEPH_MDS_OP_RMXATTR = 0x01106,
263 CEPH_MDS_OP_SETLAYOUT = 0x01107,
264 CEPH_MDS_OP_SETATTR = 0x01108,
265
266 CEPH_MDS_OP_MKNOD = 0x01201,
267 CEPH_MDS_OP_LINK = 0x01202,
268 CEPH_MDS_OP_UNLINK = 0x01203,
269 CEPH_MDS_OP_RENAME = 0x01204,
270 CEPH_MDS_OP_MKDIR = 0x01220,
271 CEPH_MDS_OP_RMDIR = 0x01221,
272 CEPH_MDS_OP_SYMLINK = 0x01222,
273
274 CEPH_MDS_OP_CREATE = 0x01301,
275 CEPH_MDS_OP_OPEN = 0x00302,
276 CEPH_MDS_OP_READDIR = 0x00305,
277
278 CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
279 CEPH_MDS_OP_MKSNAP = 0x01400,
280 CEPH_MDS_OP_RMSNAP = 0x01401,
281 CEPH_MDS_OP_LSSNAP = 0x00402,
282};
283
284extern const char *ceph_mds_op_name(int op);
285
286
287#define CEPH_SETATTR_MODE 1
288#define CEPH_SETATTR_UID 2
289#define CEPH_SETATTR_GID 4
290#define CEPH_SETATTR_MTIME 8
291#define CEPH_SETATTR_ATIME 16
292#define CEPH_SETATTR_SIZE 32
293#define CEPH_SETATTR_CTIME 64
294
295union ceph_mds_request_args {
296 struct {
297 __le32 mask; /* CEPH_CAP_* */
298 } __attribute__ ((packed)) getattr;
299 struct {
300 __le32 mode;
301 __le32 uid;
302 __le32 gid;
303 struct ceph_timespec mtime;
304 struct ceph_timespec atime;
305 __le64 size, old_size; /* old_size needed by truncate */
306 __le32 mask; /* CEPH_SETATTR_* */
307 } __attribute__ ((packed)) setattr;
308 struct {
309 __le32 frag; /* which dir fragment */
310 __le32 max_entries; /* how many dentries to grab */
311 } __attribute__ ((packed)) readdir;
312 struct {
313 __le32 mode;
314 __le32 rdev;
315 } __attribute__ ((packed)) mknod;
316 struct {
317 __le32 mode;
318 } __attribute__ ((packed)) mkdir;
319 struct {
320 __le32 flags;
321 __le32 mode;
322 __le32 stripe_unit; /* layout for newly created file */
323 __le32 stripe_count; /* ... */
324 __le32 object_size;
325 __le32 file_replication;
326 __le32 preferred;
327 } __attribute__ ((packed)) open;
328 struct {
329 __le32 flags;
330 } __attribute__ ((packed)) setxattr;
331 struct {
332 struct ceph_file_layout layout;
333 } __attribute__ ((packed)) setlayout;
334} __attribute__ ((packed));
335
336#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
337#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
338
339struct ceph_mds_request_head {
340 __le64 oldest_client_tid;
341 __le32 mdsmap_epoch; /* on client */
342 __le32 flags; /* CEPH_MDS_FLAG_* */
343 __u8 num_retry, num_fwd; /* count retry, fwd attempts */
344 __le16 num_releases; /* # include cap/lease release records */
345 __le32 op; /* mds op code */
346 __le32 caller_uid, caller_gid;
347 __le64 ino; /* use this ino for openc, mkdir, mknod,
348 etc. (if replaying) */
349 union ceph_mds_request_args args;
350} __attribute__ ((packed));
351
352/* cap/lease release record */
353struct ceph_mds_request_release {
354 __le64 ino, cap_id; /* ino and unique cap id */
355 __le32 caps, wanted; /* new issued, wanted */
356 __le32 seq, issue_seq, mseq;
357 __le32 dname_seq; /* if releasing a dentry lease, a */
358 __le32 dname_len; /* string follows. */
359} __attribute__ ((packed));
360
361/* client reply */
362struct ceph_mds_reply_head {
363 __le32 op;
364 __le32 result;
365 __le32 mdsmap_epoch;
366 __u8 safe; /* true if committed to disk */
367 __u8 is_dentry, is_target; /* true if dentry, target inode records
368 are included with reply */
369} __attribute__ ((packed));
370
371/* one for each node split */
372struct ceph_frag_tree_split {
373 __le32 frag; /* this frag splits... */
374 __le32 by; /* ...by this many bits */
375} __attribute__ ((packed));
376
377struct ceph_frag_tree_head {
378 __le32 nsplits; /* num ceph_frag_tree_split records */
379 struct ceph_frag_tree_split splits[];
380} __attribute__ ((packed));
381
382/* capability issue, for bundling with mds reply */
383struct ceph_mds_reply_cap {
384 __le32 caps, wanted; /* caps issued, wanted */
385 __le64 cap_id;
386 __le32 seq, mseq;
387 __le64 realm; /* snap realm */
388 __u8 flags; /* CEPH_CAP_FLAG_* */
389} __attribute__ ((packed));
390
391#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
392
393/* inode record, for bundling with mds reply */
394struct ceph_mds_reply_inode {
395 __le64 ino;
396 __le64 snapid;
397 __le32 rdev;
398 __le64 version; /* inode version */
399 __le64 xattr_version; /* version for xattr blob */
400 struct ceph_mds_reply_cap cap; /* caps issued for this inode */
401 struct ceph_file_layout layout;
402 struct ceph_timespec ctime, mtime, atime;
403 __le32 time_warp_seq;
404 __le64 size, max_size, truncate_size;
405 __le32 truncate_seq;
406 __le32 mode, uid, gid;
407 __le32 nlink;
408 __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
409 struct ceph_timespec rctime;
410 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
411} __attribute__ ((packed));
412/* followed by frag array, then symlink string, then xattr blob */
413
414/* reply_lease follows dname, and reply_inode */
415struct ceph_mds_reply_lease {
416 __le16 mask; /* lease type(s) */
417 __le32 duration_ms; /* lease duration */
418 __le32 seq;
419} __attribute__ ((packed));
420
421struct ceph_mds_reply_dirfrag {
422 __le32 frag; /* fragment */
423 __le32 auth; /* auth mds, if this is a delegation point */
424 __le32 ndist; /* number of mds' this is replicated on */
425 __le32 dist[];
426} __attribute__ ((packed));
427
428/* file access modes */
429#define CEPH_FILE_MODE_PIN 0
430#define CEPH_FILE_MODE_RD 1
431#define CEPH_FILE_MODE_WR 2
432#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
433#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
 434#define CEPH_FILE_MODE_NUM 8 /* because these are bit fields.. mostly */
435
436int ceph_flags_to_mode(int flags);
437
438
439/* capability bits */
440#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
441
442/* generic cap bits */
 443#define CEPH_CAP_GSHARED 1 /* client can read */
444#define CEPH_CAP_GEXCL 2 /* client can read and update */
445#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
446#define CEPH_CAP_GRD 8 /* (file) client can read */
447#define CEPH_CAP_GWR 16 /* (file) client can write */
448#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
449#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
450#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
451
452/* per-lock shift */
453#define CEPH_CAP_SAUTH 2
454#define CEPH_CAP_SLINK 4
455#define CEPH_CAP_SXATTR 6
456#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */
457
458#define CEPH_CAP_BITS 16
459
460/* composed values */
461#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
462#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
463#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
464#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
465#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
466#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
467#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
468#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
469#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
470#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
471#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
472#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
473#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
474#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
475#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
476
477/* cap masks (for getattr) */
478#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
479#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
480#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
481#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
482#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
483#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
484#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
485#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
486#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
487#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
488#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
489#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
490#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
491 CEPH_CAP_AUTH_SHARED | \
492 CEPH_CAP_LINK_SHARED | \
493 CEPH_CAP_FILE_SHARED | \
494 CEPH_CAP_XATTR_SHARED)
495
496#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
497 CEPH_CAP_LINK_SHARED | \
498 CEPH_CAP_XATTR_SHARED | \
499 CEPH_CAP_FILE_SHARED)
500#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
501 CEPH_CAP_FILE_CACHE)
502
503#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
504 CEPH_CAP_LINK_EXCL | \
505 CEPH_CAP_XATTR_EXCL | \
506 CEPH_CAP_FILE_EXCL)
507#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
508 CEPH_CAP_FILE_EXCL)
509#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
510#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
511 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
512
513#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
514 CEPH_LOCK_IXATTR)
515
516int ceph_caps_for_mode(int mode);
517
518enum {
519 CEPH_CAP_OP_GRANT, /* mds->client grant */
520 CEPH_CAP_OP_REVOKE, /* mds->client revoke */
521 CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
522 CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
523 CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
524 CEPH_CAP_OP_UPDATE, /* client->mds update */
525 CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
526 CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
527 CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
528 CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
529 CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
530 CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
531 CEPH_CAP_OP_RENEW, /* client->mds renewal request */
532};
533
534extern const char *ceph_cap_op_name(int op);
535
536/*
537 * caps message, used for capability callbacks, acks, requests, etc.
538 */
539struct ceph_mds_caps {
540 __le32 op; /* CEPH_CAP_OP_* */
541 __le64 ino, realm;
542 __le64 cap_id;
543 __le32 seq, issue_seq;
544 __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
545 __le32 migrate_seq;
546 __le64 snap_follows;
547 __le32 snap_trace_len;
548
549 /* authlock */
550 __le32 uid, gid, mode;
551
552 /* linklock */
553 __le32 nlink;
554
555 /* xattrlock */
556 __le32 xattr_len;
557 __le64 xattr_version;
558
559 /* filelock */
560 __le64 size, max_size, truncate_size;
561 __le32 truncate_seq;
562 struct ceph_timespec mtime, atime, ctime;
563 struct ceph_file_layout layout;
564 __le32 time_warp_seq;
565} __attribute__ ((packed));
566
567/* cap release msg head */
568struct ceph_mds_cap_release {
569 __le32 num; /* number of cap_items that follow */
570} __attribute__ ((packed));
571
572struct ceph_mds_cap_item {
573 __le64 ino;
574 __le64 cap_id;
575 __le32 migrate_seq, seq;
576} __attribute__ ((packed));
577
578#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
579#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
580#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
581#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
582
583extern const char *ceph_lease_op_name(int o);
584
585/* lease msg header */
586struct ceph_mds_lease {
587 __u8 action; /* CEPH_MDS_LEASE_* */
588 __le16 mask; /* which lease */
589 __le64 ino;
590 __le64 first, last; /* snap range */
591 __le32 seq;
592 __le32 duration_ms; /* duration of renewal */
593} __attribute__ ((packed));
594/* followed by a __le32+string for dname */
595
596/* client reconnect */
597struct ceph_mds_cap_reconnect {
598 __le64 cap_id;
599 __le32 wanted;
600 __le32 issued;
601 __le64 size;
602 struct ceph_timespec mtime, atime;
603 __le64 snaprealm;
604 __le64 pathbase; /* base ino for our path to this ino */
605} __attribute__ ((packed));
606/* followed by encoded string */
607
608struct ceph_mds_snaprealm_reconnect {
609 __le64 ino; /* snap realm base */
610 __le64 seq; /* snap seq for this snap realm */
611 __le64 parent; /* parent realm */
612} __attribute__ ((packed));
613
614/*
615 * snaps
616 */
617enum {
618 CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
619 CEPH_SNAP_OP_CREATE,
620 CEPH_SNAP_OP_DESTROY,
621 CEPH_SNAP_OP_SPLIT,
622};
623
624extern const char *ceph_snap_op_name(int o);
625
626/* snap msg header */
627struct ceph_mds_snap_head {
628 __le32 op; /* CEPH_SNAP_OP_* */
629 __le64 split; /* ino to split off, if any */
630 __le32 num_split_inos; /* # inos belonging to new child realm */
 631 __le32 num_split_realms; /* # child realms under new child realm */
632 __le32 trace_len; /* size of snap trace blob */
633} __attribute__ ((packed));
634/* followed by split ino list, then split realms, then the trace blob */
635
636/*
637 * encode info about a snaprealm, as viewed by a client
638 */
639struct ceph_mds_snap_realm {
640 __le64 ino; /* ino */
641 __le64 created; /* snap: when created */
642 __le64 parent; /* ino: parent realm */
643 __le64 parent_since; /* snap: same parent since */
644 __le64 seq; /* snap: version */
645 __le32 num_snaps;
646 __le32 num_prior_parent_snaps;
647} __attribute__ ((packed));
648/* followed by my snap list, then prior parent snap list */
649
650#endif
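To make the bit layout concrete, a few of the composed values spelled out (straightforward arithmetic from the shifts above):

	CEPH_CAP_AUTH_SHARED	/* 1 << 2  == 0x0004 */
	CEPH_CAP_FILE_SHARED	/* 1 << 8  == 0x0100 */
	CEPH_CAP_FILE_RD	/* 8 << 8  == 0x0800 */
	CEPH_CAP_FILE_WR	/* 16 << 8 == 0x1000 */

so, for example, ceph_caps_for_mode(CEPH_FILE_MODE_RD) works out to 0x1 | 0x100 | 0x800 | 0x400 = 0xd01.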
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
new file mode 100644
index 000000000000..bd570015d147
--- /dev/null
+++ b/fs/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
1
2#include "types.h"
3
4/*
 5 * Robert Jenkins' hash function.
6 * http://burtleburtle.net/bob/hash/evahash.html
7 * This is in the public domain.
8 */
9#define mix(a, b, c) \
10 do { \
11 a = a - b; a = a - c; a = a ^ (c >> 13); \
12 b = b - c; b = b - a; b = b ^ (a << 8); \
13 c = c - a; c = c - b; c = c ^ (b >> 13); \
14 a = a - b; a = a - c; a = a ^ (c >> 12); \
15 b = b - c; b = b - a; b = b ^ (a << 16); \
16 c = c - a; c = c - b; c = c ^ (b >> 5); \
17 a = a - b; a = a - c; a = a ^ (c >> 3); \
18 b = b - c; b = b - a; b = b ^ (a << 10); \
19 c = c - a; c = c - b; c = c ^ (b >> 15); \
20 } while (0)
21
22unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
23{
24 const unsigned char *k = (const unsigned char *)str;
25 __u32 a, b, c; /* the internal state */
26 __u32 len; /* how many key bytes still need mixing */
27
28 /* Set up the internal state */
29 len = length;
30 a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
31 b = a;
32 c = 0; /* variable initialization of internal state */
33
34 /* handle most of the key */
35 while (len >= 12) {
36 a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
37 ((__u32)k[3] << 24));
38 b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
39 ((__u32)k[7] << 24));
40 c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
41 ((__u32)k[11] << 24));
42 mix(a, b, c);
43 k = k + 12;
44 len = len - 12;
45 }
46
47 /* handle the last 11 bytes */
48 c = c + length;
49 switch (len) { /* all the case statements fall through */
50 case 11:
51 c = c + ((__u32)k[10] << 24);
52 case 10:
53 c = c + ((__u32)k[9] << 16);
54 case 9:
55 c = c + ((__u32)k[8] << 8);
56 /* the first byte of c is reserved for the length */
57 case 8:
58 b = b + ((__u32)k[7] << 24);
59 case 7:
60 b = b + ((__u32)k[6] << 16);
61 case 6:
62 b = b + ((__u32)k[5] << 8);
63 case 5:
64 b = b + k[4];
65 case 4:
66 a = a + ((__u32)k[3] << 24);
67 case 3:
68 a = a + ((__u32)k[2] << 16);
69 case 2:
70 a = a + ((__u32)k[1] << 8);
71 case 1:
72 a = a + k[0];
73 /* case 0: nothing left to add */
74 }
75 mix(a, b, c);
76
77 return c;
78}
79
80/*
81 * linux dcache hash
82 */
83unsigned ceph_str_hash_linux(const char *str, unsigned length)
84{
85 unsigned long hash = 0;
86 unsigned char c;
87
88 while (length--) {
89 c = *str++;
90 hash = (hash + (c << 4) + (c >> 4)) * 11;
91 }
92 return hash;
93}
94
95
96unsigned ceph_str_hash(int type, const char *s, unsigned len)
97{
98 switch (type) {
99 case CEPH_STR_HASH_LINUX:
100 return ceph_str_hash_linux(s, len);
101 case CEPH_STR_HASH_RJENKINS:
102 return ceph_str_hash_rjenkins(s, len);
103 default:
104 return -1;
105 }
106}
107
108const char *ceph_str_hash_name(int type)
109{
110 switch (type) {
111 case CEPH_STR_HASH_LINUX:
112 return "linux";
113 case CEPH_STR_HASH_RJENKINS:
114 return "rjenkins";
115 default:
116 return "unknown";
117 }
118}
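Both hashes take an explicit length, so names containing arbitrary bytes are fine. Typical use (the name here is just an illustration):

	const char *name = "foo.txt";
	unsigned h = ceph_str_hash(CEPH_STR_HASH_RJENKINS1, name, strlen(name));

Since the MDS computes the same hash on its side, the type constant has to match whatever the server uses for the directory in question.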
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
new file mode 100644
index 000000000000..5ac470c433c9
--- /dev/null
+++ b/fs/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
1#ifndef _FS_CEPH_HASH_H
2#define _FS_CEPH_HASH_H
3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
6
7extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
8extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
9
10extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
11extern const char *ceph_str_hash_name(int type);
12
13#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
new file mode 100644
index 000000000000..8e4be6a80c62
--- /dev/null
+++ b/fs/ceph/ceph_strings.c
@@ -0,0 +1,176 @@
1/*
2 * Ceph string constants
3 */
4#include "types.h"
5
6const char *ceph_entity_type_name(int type)
7{
8 switch (type) {
9 case CEPH_ENTITY_TYPE_MDS: return "mds";
10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown";
16 }
17}
18
19const char *ceph_osd_op_name(int op)
20{
21 switch (op) {
22 case CEPH_OSD_OP_READ: return "read";
23 case CEPH_OSD_OP_STAT: return "stat";
24
25 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
26
27 case CEPH_OSD_OP_WRITE: return "write";
28 case CEPH_OSD_OP_DELETE: return "delete";
29 case CEPH_OSD_OP_TRUNCATE: return "truncate";
30 case CEPH_OSD_OP_ZERO: return "zero";
31 case CEPH_OSD_OP_WRITEFULL: return "writefull";
32
33 case CEPH_OSD_OP_APPEND: return "append";
34 case CEPH_OSD_OP_STARTSYNC: return "startsync";
35 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
36 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
37
38 case CEPH_OSD_OP_TMAPUP: return "tmapup";
39 case CEPH_OSD_OP_TMAPGET: return "tmapget";
40 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
41
42 case CEPH_OSD_OP_GETXATTR: return "getxattr";
43 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
44 case CEPH_OSD_OP_SETXATTR: return "setxattr";
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
48
49 case CEPH_OSD_OP_PULL: return "pull";
50 case CEPH_OSD_OP_PUSH: return "push";
51 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
52 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
53 case CEPH_OSD_OP_SCRUB: return "scrub";
54
55 case CEPH_OSD_OP_WRLOCK: return "wrlock";
56 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
57 case CEPH_OSD_OP_RDLOCK: return "rdlock";
58 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
59 case CEPH_OSD_OP_UPLOCK: return "uplock";
60 case CEPH_OSD_OP_DNLOCK: return "dnlock";
61
62 case CEPH_OSD_OP_CALL: return "call";
63
64 case CEPH_OSD_OP_PGLS: return "pgls";
65 }
66 return "???";
67}
68
69const char *ceph_mds_state_name(int s)
70{
71 switch (s) {
72 /* down and out */
73 case CEPH_MDS_STATE_DNE: return "down:dne";
74 case CEPH_MDS_STATE_STOPPED: return "down:stopped";
75 /* up and out */
76 case CEPH_MDS_STATE_BOOT: return "up:boot";
77 case CEPH_MDS_STATE_STANDBY: return "up:standby";
78 case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
79 case CEPH_MDS_STATE_CREATING: return "up:creating";
80 case CEPH_MDS_STATE_STARTING: return "up:starting";
81 /* up and in */
82 case CEPH_MDS_STATE_REPLAY: return "up:replay";
83 case CEPH_MDS_STATE_RESOLVE: return "up:resolve";
84 case CEPH_MDS_STATE_RECONNECT: return "up:reconnect";
85 case CEPH_MDS_STATE_REJOIN: return "up:rejoin";
86 case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
87 case CEPH_MDS_STATE_ACTIVE: return "up:active";
88 case CEPH_MDS_STATE_STOPPING: return "up:stopping";
89 }
90 return "???";
91}
92
93const char *ceph_session_op_name(int op)
94{
95 switch (op) {
96 case CEPH_SESSION_REQUEST_OPEN: return "request_open";
97 case CEPH_SESSION_OPEN: return "open";
98 case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
99 case CEPH_SESSION_CLOSE: return "close";
100 case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
101 case CEPH_SESSION_RENEWCAPS: return "renewcaps";
102 case CEPH_SESSION_STALE: return "stale";
103 case CEPH_SESSION_RECALL_STATE: return "recall_state";
104 }
105 return "???";
106}
107
108const char *ceph_mds_op_name(int op)
109{
110 switch (op) {
111 case CEPH_MDS_OP_LOOKUP: return "lookup";
112 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
113 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
114 case CEPH_MDS_OP_GETATTR: return "getattr";
115 case CEPH_MDS_OP_SETXATTR: return "setxattr";
116 case CEPH_MDS_OP_SETATTR: return "setattr";
117 case CEPH_MDS_OP_RMXATTR: return "rmxattr";
118 case CEPH_MDS_OP_READDIR: return "readdir";
119 case CEPH_MDS_OP_MKNOD: return "mknod";
120 case CEPH_MDS_OP_LINK: return "link";
121 case CEPH_MDS_OP_UNLINK: return "unlink";
122 case CEPH_MDS_OP_RENAME: return "rename";
123 case CEPH_MDS_OP_MKDIR: return "mkdir";
124 case CEPH_MDS_OP_RMDIR: return "rmdir";
125 case CEPH_MDS_OP_SYMLINK: return "symlink";
126 case CEPH_MDS_OP_CREATE: return "create";
127 case CEPH_MDS_OP_OPEN: return "open";
128 case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
129 case CEPH_MDS_OP_LSSNAP: return "lssnap";
130 case CEPH_MDS_OP_MKSNAP: return "mksnap";
131 case CEPH_MDS_OP_RMSNAP: return "rmsnap";
132 }
133 return "???";
134}
135
136const char *ceph_cap_op_name(int op)
137{
138 switch (op) {
139 case CEPH_CAP_OP_GRANT: return "grant";
140 case CEPH_CAP_OP_REVOKE: return "revoke";
141 case CEPH_CAP_OP_TRUNC: return "trunc";
142 case CEPH_CAP_OP_EXPORT: return "export";
143 case CEPH_CAP_OP_IMPORT: return "import";
144 case CEPH_CAP_OP_UPDATE: return "update";
145 case CEPH_CAP_OP_DROP: return "drop";
146 case CEPH_CAP_OP_FLUSH: return "flush";
147 case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
148 case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
149 case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
150 case CEPH_CAP_OP_RELEASE: return "release";
151 case CEPH_CAP_OP_RENEW: return "renew";
152 }
153 return "???";
154}
155
156const char *ceph_lease_op_name(int o)
157{
158 switch (o) {
159 case CEPH_MDS_LEASE_REVOKE: return "revoke";
160 case CEPH_MDS_LEASE_RELEASE: return "release";
161 case CEPH_MDS_LEASE_RENEW: return "renew";
162 case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
163 }
164 return "???";
165}
166
167const char *ceph_snap_op_name(int o)
168{
169 switch (o) {
170 case CEPH_SNAP_OP_UPDATE: return "update";
171 case CEPH_SNAP_OP_CREATE: return "create";
172 case CEPH_SNAP_OP_DESTROY: return "destroy";
173 case CEPH_SNAP_OP_SPLIT: return "split";
174 }
175 return "???";
176}
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
new file mode 100644
index 000000000000..fabd302e5779
--- /dev/null
+++ b/fs/ceph/crush/crush.c
@@ -0,0 +1,151 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include "crush.h"
12
13const char *crush_bucket_alg_name(int alg)
14{
15 switch (alg) {
16 case CRUSH_BUCKET_UNIFORM: return "uniform";
17 case CRUSH_BUCKET_LIST: return "list";
18 case CRUSH_BUCKET_TREE: return "tree";
19 case CRUSH_BUCKET_STRAW: return "straw";
20 default: return "unknown";
21 }
22}
23
24/**
25 * crush_get_bucket_item_weight - Get weight of an item in given bucket
26 * @b: bucket pointer
27 * @p: item index in bucket
28 */
29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{
31 if (p >= b->size)
32 return 0;
33
34 switch (b->alg) {
35 case CRUSH_BUCKET_UNIFORM:
36 return ((struct crush_bucket_uniform *)b)->item_weight;
37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE:
40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
43 case CRUSH_BUCKET_STRAW:
44 return ((struct crush_bucket_straw *)b)->item_weights[p];
45 }
46 return 0;
47}
48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
71
72void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
73{
74 kfree(b->h.perm);
75 kfree(b->h.items);
76 kfree(b);
77}
78
79void crush_destroy_bucket_list(struct crush_bucket_list *b)
80{
81 kfree(b->item_weights);
82 kfree(b->sum_weights);
83 kfree(b->h.perm);
84 kfree(b->h.items);
85 kfree(b);
86}
87
88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
89{
90 kfree(b->node_weights);
91 kfree(b);
92}
93
94void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
95{
96 kfree(b->straws);
97 kfree(b->item_weights);
98 kfree(b->h.perm);
99 kfree(b->h.items);
100 kfree(b);
101}
102
103void crush_destroy_bucket(struct crush_bucket *b)
104{
105 switch (b->alg) {
106 case CRUSH_BUCKET_UNIFORM:
107 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
108 break;
109 case CRUSH_BUCKET_LIST:
110 crush_destroy_bucket_list((struct crush_bucket_list *)b);
111 break;
112 case CRUSH_BUCKET_TREE:
113 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
114 break;
115 case CRUSH_BUCKET_STRAW:
116 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
117 break;
118 }
119}
120
121/**
122 * crush_destroy - Destroy a crush_map
123 * @map: crush_map pointer
124 */
125void crush_destroy(struct crush_map *map)
126{
127 int b;
128
129 /* buckets */
130 if (map->buckets) {
131 for (b = 0; b < map->max_buckets; b++) {
132 if (map->buckets[b] == NULL)
133 continue;
134 crush_destroy_bucket(map->buckets[b]);
135 }
136 kfree(map->buckets);
137 }
138
139 /* rules */
140 if (map->rules) {
141 for (b = 0; b < map->max_rules; b++)
142 kfree(map->rules[b]);
143 kfree(map->rules);
144 }
145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
148 kfree(map);
149}
150
151
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
new file mode 100644
index 000000000000..dcd7e7523700
--- /dev/null
+++ b/fs/ceph/crush/crush.h
@@ -0,0 +1,180 @@
1#ifndef _CRUSH_CRUSH_H
2#define _CRUSH_CRUSH_H
3
4#include <linux/types.h>
5
6/*
7 * CRUSH is a pseudo-random data distribution algorithm that
8 * efficiently distributes input values (typically, data objects)
9 * across a heterogeneous, structured storage cluster.
10 *
11 * The algorithm was originally described in detail in this paper
12 * (although the algorithm has evolved somewhat since then):
13 *
14 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
15 *
16 * LGPL2
17 */
18
19
20#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
21
22
23#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
24#define CRUSH_MAX_SET 10 /* max size of a mapping result */
25
26
27/*
28 * CRUSH uses user-defined "rules" to describe how inputs should be
29 * mapped to devices. A rule consists of sequence of steps to perform
30 * to generate the set of output devices.
31 */
32struct crush_rule_step {
33 __u32 op;
34 __s32 arg1;
35 __s32 arg2;
36};
37
38/* step op codes */
39enum {
40 CRUSH_RULE_NOOP = 0,
41 CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
42 CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
43 /* arg2 = type */
44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
45 CRUSH_RULE_EMIT = 4, /* no args */
46 CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
47 CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
48};
49
50/*
51 * for specifying choose num (arg1) relative to the max parameter
52 * passed to do_rule
53 */
54#define CRUSH_CHOOSE_N 0
55#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
56
57/*
58 * The rule mask is used to describe what the rule is intended for.
59 * Given a ruleset and size of output set, we search through the
60 * rule list for a matching rule_mask.
61 */
62struct crush_rule_mask {
63 __u8 ruleset;
64 __u8 type;
65 __u8 min_size;
66 __u8 max_size;
67};
68
69struct crush_rule {
70 __u32 len;
71 struct crush_rule_mask mask;
72 struct crush_rule_step steps[0];
73};
74
75#define crush_rule_size(len) (sizeof(struct crush_rule) + \
76 (len)*sizeof(struct crush_rule_step))
77
78
79
80/*
81 * A bucket is a named container of other items (either devices or
82 * other buckets). Items within a bucket are chosen using one of a
83 * few different algorithms. The table summarizes how the speed of
84 * each option measures up against mapping stability when items are
85 * added or removed.
86 *
87 * Bucket Alg Speed Additions Removals
88 * ------------------------------------------------
89 * uniform O(1) poor poor
90 * list O(n) optimal poor
91 * tree O(log n) good good
92 * straw O(n) optimal optimal
93 */
94enum {
95 CRUSH_BUCKET_UNIFORM = 1,
96 CRUSH_BUCKET_LIST = 2,
97 CRUSH_BUCKET_TREE = 3,
98 CRUSH_BUCKET_STRAW = 4
99};
100extern const char *crush_bucket_alg_name(int alg);
101
102struct crush_bucket {
103 __s32 id; /* this'll be negative */
104 __u16 type; /* non-zero; type=0 is reserved for devices */
105 __u8 alg; /* one of CRUSH_BUCKET_* */
106 __u8 hash; /* which hash function to use, CRUSH_HASH_* */
107 __u32 weight; /* 16-bit fixed point */
108 __u32 size; /* num items */
109 __s32 *items;
110
111 /*
112 * cached random permutation: used for uniform bucket and for
113 * the linear search fallback for the other bucket types.
114 */
115 __u32 perm_x; /* @x for which *perm is defined */
116 __u32 perm_n; /* num elements of *perm that are permuted/defined */
117 __u32 *perm;
118};
119
120struct crush_bucket_uniform {
121 struct crush_bucket h;
122 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
123};
124
125struct crush_bucket_list {
126 struct crush_bucket h;
127 __u32 *item_weights; /* 16-bit fixed point */
128 __u32 *sum_weights; /* 16-bit fixed point. element i is sum
129 of weights 0..i, inclusive */
130};
131
132struct crush_bucket_tree {
133 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
134 actual items */
135 __u8 num_nodes;
136 __u32 *node_weights;
137};
138
139struct crush_bucket_straw {
140 struct crush_bucket h;
141 __u32 *item_weights; /* 16-bit fixed point */
142 __u32 *straws; /* 16-bit fixed point */
143};
144
145
146
147/*
148 * CRUSH map includes all buckets, rules, etc.
149 */
150struct crush_map {
151 struct crush_bucket **buckets;
152 struct crush_rule **rules;
153
154 /*
 155 * Parent pointers identify the parent bucket of each device or
 156 * bucket in the hierarchy.  If an item appears more than
 157 * once, this is the _last_ time it appeared (where buckets
 158 * are processed in bucket id order, from -1 on down to
 159 * -max_buckets).
160 */
161 __u32 *bucket_parents;
162 __u32 *device_parents;
163
164 __s32 max_buckets;
165 __u32 max_rules;
166 __s32 max_devices;
167};
168
169
170/* crush.c */
171extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
172extern void crush_calc_parents(struct crush_map *map);
173extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
174extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
175extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
176extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
177extern void crush_destroy_bucket(struct crush_bucket *b);
178extern void crush_destroy(struct crush_map *map);
179
180#endif
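The rule encoding is simple enough to build by hand. A minimal sketch of "take a root bucket; choose 2 items of some type; emit" (bucket id -1, type 1, and the mask bounds are placeholders, and error handling is omitted):

	struct crush_rule *rule = kmalloc(crush_rule_size(3), GFP_KERNEL);

	rule->len = 3;
	rule->mask.ruleset = 0;
	rule->mask.type = 1;
	rule->mask.min_size = 1;
	rule->mask.max_size = 10;
	rule->steps[0] = (struct crush_rule_step){ CRUSH_RULE_TAKE, -1, 0 };
	rule->steps[1] = (struct crush_rule_step){ CRUSH_RULE_CHOOSE_FIRSTN, 2, 1 };
	rule->steps[2] = (struct crush_rule_step){ CRUSH_RULE_EMIT, 0, 0 };

crush_find_rule() (in mapper.c) will then locate this rule for any request with ruleset 0, type 1, and a result size between 1 and 10.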
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
new file mode 100644
index 000000000000..5873aed694bf
--- /dev/null
+++ b/fs/ceph/crush/hash.c
@@ -0,0 +1,149 @@
1
2#include <linux/types.h>
3#include "hash.h"
4
5/*
6 * Robert Jenkins' function for mixing 32-bit values
7 * http://burtleburtle.net/bob/hash/evahash.html
8 * a, b = random bits, c = input and output
9 */
10#define crush_hashmix(a, b, c) do { \
11 a = a-b; a = a-c; a = a^(c>>13); \
12 b = b-c; b = b-a; b = b^(a<<8); \
13 c = c-a; c = c-b; c = c^(b>>13); \
14 a = a-b; a = a-c; a = a^(c>>12); \
15 b = b-c; b = b-a; b = b^(a<<16); \
16 c = c-a; c = c-b; c = c^(b>>5); \
17 a = a-b; a = a-c; a = a^(c>>3); \
18 b = b-c; b = b-a; b = b^(a<<10); \
19 c = c-a; c = c-b; c = c^(b>>15); \
20 } while (0)
21
22#define crush_hash_seed 1315423911
23
24static __u32 crush_hash32_rjenkins1(__u32 a)
25{
26 __u32 hash = crush_hash_seed ^ a;
27 __u32 b = a;
28 __u32 x = 231232;
29 __u32 y = 1232;
30 crush_hashmix(b, x, hash);
31 crush_hashmix(y, a, hash);
32 return hash;
33}
34
35static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
36{
37 __u32 hash = crush_hash_seed ^ a ^ b;
38 __u32 x = 231232;
39 __u32 y = 1232;
40 crush_hashmix(a, b, hash);
41 crush_hashmix(x, a, hash);
42 crush_hashmix(b, y, hash);
43 return hash;
44}
45
46static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
47{
48 __u32 hash = crush_hash_seed ^ a ^ b ^ c;
49 __u32 x = 231232;
50 __u32 y = 1232;
51 crush_hashmix(a, b, hash);
52 crush_hashmix(c, x, hash);
53 crush_hashmix(y, a, hash);
54 crush_hashmix(b, x, hash);
55 crush_hashmix(y, c, hash);
56 return hash;
57}
58
59static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
60{
61 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
62 __u32 x = 231232;
63 __u32 y = 1232;
64 crush_hashmix(a, b, hash);
65 crush_hashmix(c, d, hash);
66 crush_hashmix(a, x, hash);
67 crush_hashmix(y, b, hash);
68 crush_hashmix(c, x, hash);
69 crush_hashmix(y, d, hash);
70 return hash;
71}
72
73static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
74 __u32 e)
75{
76 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
77 __u32 x = 231232;
78 __u32 y = 1232;
79 crush_hashmix(a, b, hash);
80 crush_hashmix(c, d, hash);
81 crush_hashmix(e, x, hash);
82 crush_hashmix(y, a, hash);
83 crush_hashmix(b, x, hash);
84 crush_hashmix(y, c, hash);
85 crush_hashmix(d, x, hash);
86 crush_hashmix(y, e, hash);
87 return hash;
88}
89
90
91__u32 crush_hash32(int type, __u32 a)
92{
93 switch (type) {
94 case CRUSH_HASH_RJENKINS1:
95 return crush_hash32_rjenkins1(a);
96 default:
97 return 0;
98 }
99}
100
101__u32 crush_hash32_2(int type, __u32 a, __u32 b)
102{
103 switch (type) {
104 case CRUSH_HASH_RJENKINS1:
105 return crush_hash32_rjenkins1_2(a, b);
106 default:
107 return 0;
108 }
109}
110
111__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
112{
113 switch (type) {
114 case CRUSH_HASH_RJENKINS1:
115 return crush_hash32_rjenkins1_3(a, b, c);
116 default:
117 return 0;
118 }
119}
120
121__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
122{
123 switch (type) {
124 case CRUSH_HASH_RJENKINS1:
125 return crush_hash32_rjenkins1_4(a, b, c, d);
126 default:
127 return 0;
128 }
129}
130
131__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
132{
133 switch (type) {
134 case CRUSH_HASH_RJENKINS1:
135 return crush_hash32_rjenkins1_5(a, b, c, d, e);
136 default:
137 return 0;
138 }
139}
140
141const char *crush_hash_name(int type)
142{
143 switch (type) {
144 case CRUSH_HASH_RJENKINS1:
145 return "rjenkins1";
146 default:
147 return "unknown";
148 }
149}
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
new file mode 100644
index 000000000000..ff48e110e4bb
--- /dev/null
+++ b/fs/ceph/crush/hash.h
@@ -0,0 +1,17 @@
1#ifndef _CRUSH_HASH_H
2#define _CRUSH_HASH_H
3
4#define CRUSH_HASH_RJENKINS1 0
5
6#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
7
8extern const char *crush_hash_name(int type);
9
10extern __u32 crush_hash32(int type, __u32 a);
11extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
12extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
13extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
14extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
15 __u32 e);
16
17#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
new file mode 100644
index 000000000000..9ba54efb6543
--- /dev/null
+++ b/fs/ceph/crush/mapper.c
@@ -0,0 +1,596 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include "crush.h"
22#include "hash.h"
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
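/*
 * e.g. (illustrative values) a caller placing two replicas with
 * ruleset 0 of type 1 would do:
 *
 *	int ruleno = crush_find_rule(map, 0, 1, 2);
 *
 * and must handle a -1 return if no rule mask matches.
 */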
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
126
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
 141 __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
142 r, bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG();
156 return 0;
157}
158
159
160/* (binary) tree */
161static int height(int n)
162{
163 int h = 0;
164 while ((n & 1) == 0) {
165 h++;
166 n = n >> 1;
167 }
168 return h;
169}
170
171static int left(int x)
172{
173 int h = height(x);
174 return x - (1 << (h-1));
175}
176
177static int right(int x)
178{
179 int h = height(x);
180 return x + (1 << (h-1));
181}
182
183static int terminal(int x)
184{
185 return x & 1;
186}
187
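/*
 * Node numbering sketch (editor's note): the tree is stored in an
 * implicit array where odd indices are leaves and the root is
 * num_nodes >> 1.  For 4 items (num_nodes = 8):
 *
 *              4
 *            /   \
 *           2     6
 *          / \   / \
 *         1   3 5   7      -> leaf n maps to items[n >> 1]
 *
 * height() counts trailing zero bits, so left(4) = 2 and right(4) = 6.
 */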
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
202 bucket->h.id) * (__u64)w;
203 t = t >> 32;
204
205 /* descend to the left or right? */
206 l = left(n);
207 if (t < bucket->node_weights[l])
208 n = l;
209 else
210 n = right(n);
211 }
212
213 return bucket->h.items[n >> 1];
214}
215
216
217/* straw */
218
219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
220 int x, int r)
221{
222 int i;
223 int high = 0;
224 __u64 high_draw = 0;
225 __u64 draw;
226
227 for (i = 0; i < bucket->h.size; i++) {
228 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
229 draw &= 0xffff;
230 draw *= bucket->straws[i];
231 if (i == 0 || draw > high_draw) {
232 high = i;
233 high_draw = draw;
234 }
235 }
236 return bucket->h.items[high];
237}
238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{
241 dprintk("choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
245 x, r);
246 case CRUSH_BUCKET_LIST:
247 return bucket_list_choose((struct crush_bucket_list *)in,
248 x, r);
249 case CRUSH_BUCKET_TREE:
250 return bucket_tree_choose((struct crush_bucket_tree *)in,
251 x, r);
252 case CRUSH_BUCKET_STRAW:
253 return bucket_straw_choose((struct crush_bucket_straw *)in,
254 x, r);
255 default:
256 BUG();
257 return in->items[0];
258 }
259}
260
261/*
262 * true if device is marked "out" of the cluster
263 * (i.e., failed or fully offloaded)
264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{
267 if (weight[item] >= 0x10000)
268 return 0;
269 if (weight[item] == 0)
270 return 1;
271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
272 < weight[item])
273 return 0;
274 return 1;
275}
276
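/*
 * Editor's note: weights here are 16.16 fixed point, so 0x10000 means
 * fully "in" and 0 fully "out".  In between, the 16-bit hash draw is
 * compared against the low 16 bits of the weight: a device with
 * weight 0x8000 is treated as out for roughly half of all inputs x.
 */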
277/**
278 * crush_choose - choose numrep distinct items of given type
279 * @map: the crush_map
280 * @bucket: the bucket we are choosing an item from
 * @weight: weight vector (per-device weights, 16.16 fixed point)
281 * @x: crush input value
282 * @numrep: the number of items to choose
283 * @type: the type of item to choose
284 * @out: pointer to output vector
285 * @outpos: our position in that vector
286 * @firstn: true if choosing "first n" items, false if choosing "indep"
287 * @recurse_to_leaf: true if we want one device under each item of given type
288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
289 */
290static int crush_choose(struct crush_map *map,
291 struct crush_bucket *bucket,
292 __u32 *weight,
293 int x, int numrep, int type,
294 int *out, int outpos,
295 int firstn, int recurse_to_leaf,
296 int *out2)
297{
298 int rep;
299 int ftotal, flocal;
300 int retry_descent, retry_bucket, skip_rep;
301 struct crush_bucket *in = bucket;
302 int r;
303 int i;
304 int item = 0;
305 int itemtype;
306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */
308 dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
309
310 for (rep = outpos; rep < numrep; rep++) {
311 /* keep trying until we get a non-out, non-colliding item */
312 ftotal = 0;
313 skip_rep = 0;
314 do {
315 retry_descent = 0;
316 in = bucket; /* initial bucket */
317
318 /* choose through intervening buckets */
319 flocal = 0;
320 do {
321 collide = 0;
322 retry_bucket = 0;
323 r = rep;
324 if (in->alg == CRUSH_BUCKET_UNIFORM) {
325 /* uniform buckets: pick r' so a retry doesn't repeat the same item */
326 if (firstn || numrep >= in->size)
327 /* r' = r + f_total */
328 r += ftotal;
329 else if (in->size % numrep == 0)
330 /* r'=r+(n+1)*f_local */
331 r += (numrep+1) *
332 (flocal+ftotal);
333 else
334 /* r' = r + n*f_local */
335 r += numrep * (flocal+ftotal);
336 } else {
337 if (firstn)
338 /* r' = r + f_total */
339 r += ftotal;
340 else
341 /* r' = r + n*f_local */
342 r += numrep * (flocal+ftotal);
343 }
344
345 /* bucket choose */
346 if (in->size == 0) {
347 reject = 1;
348 goto reject;
349 }
350 if (flocal >= (in->size>>1) &&
351 flocal > orig_tries)
352 item = bucket_perm_choose(in, x, r);
353 else
354 item = crush_bucket_choose(in, x, r);
355 BUG_ON(item >= map->max_devices);
356
357 /* desired type? */
358 if (item < 0)
359 itemtype = map->buckets[-1-item]->type;
360 else
361 itemtype = 0;
362 dprintk(" item %d type %d\n", item, itemtype);
363
364 /* keep going? */
365 if (itemtype != type) {
366 BUG_ON(item >= 0 ||
367 (-1-item) >= map->max_buckets);
368 in = map->buckets[-1-item];
369 continue;
370 }
371
372 /* collision? */
373 for (i = 0; i < outpos; i++) {
374 if (out[i] == item) {
375 collide = 1;
376 break;
377 }
378 }
379
380 if (recurse_to_leaf &&
381 item < 0 &&
382 crush_choose(map, map->buckets[-1-item],
383 weight,
384 x, outpos+1, 0,
385 out2, outpos,
386 firstn, 0, NULL) <= outpos) {
387 reject = 1;
388 } else {
389 /* out? */
390 if (itemtype == 0)
391 reject = is_out(map, weight,
392 item, x);
393 else
394 reject = 0;
395 }
396
397reject:
398 if (reject || collide) {
399 ftotal++;
400 flocal++;
401
402 if (collide && flocal < 3)
403 /* retry locally a few times */
404 retry_bucket = 1;
405 else if (flocal < in->size + orig_tries)
406 /* exhaustive bucket search */
407 retry_bucket = 1;
408 else if (ftotal < 20)
409 /* then retry descent */
410 retry_descent = 1;
411 else
412 /* else give up */
413 skip_rep = 1;
414 dprintk(" reject %d collide %d "
415 "ftotal %d flocal %d\n",
416 reject, collide, ftotal,
417 flocal);
418 }
419 } while (retry_bucket);
420 } while (retry_descent);
421
422 if (skip_rep) {
423 dprintk("skip rep\n");
424 continue;
425 }
426
427 dprintk("choose got %d\n", item);
428 out[outpos] = item;
429 outpos++;
430 }
431
432 dprintk("choose returns %d\n", outpos);
433 return outpos;
434}
435
436
437/**
438 * crush_do_rule - calculate a mapping with the given input and rule
439 * @map: the crush_map
440 * @ruleno: the rule id
441 * @x: hash input
442 * @result: pointer to result vector
443 * @result_max: maximum result size
444 * @force: force initial replica choice; -1 for none
445 */
446int crush_do_rule(struct crush_map *map,
447 int ruleno, int x, int *result, int result_max,
448 int force, __u32 *weight)
449{
450 int result_len;
451 int force_context[CRUSH_MAX_DEPTH];
452 int force_pos = -1;
453 int a[CRUSH_MAX_SET];
454 int b[CRUSH_MAX_SET];
455 int c[CRUSH_MAX_SET];
456 int recurse_to_leaf;
457 int *w;
458 int wsize = 0;
459 int *o;
460 int osize;
461 int *tmp;
462 struct crush_rule *rule;
463 int step;
464 int i, j;
465 int numrep;
466 int firstn;
467 int rc = -1;
468
469 BUG_ON(ruleno >= map->max_rules);
470
471 rule = map->rules[ruleno];
472 result_len = 0;
473 w = a;
474 o = b;
475
476 /*
477 * determine hierarchical context of force, if any. note
478 * that this may or may not correspond to the specific types
479 * referenced by the crush rule.
480 */
481 if (force >= 0) {
482 if (force >= map->max_devices ||
483 map->device_parents[force] == 0) {
484 /*dprintk("CRUSH: forcefed device dne\n");*/
485 rc = -1; /* forcefed device doesn't exist */
486 goto out;
487 }
488 if (!is_out(map, weight, force, x)) {
489 while (1) {
490 force_context[++force_pos] = force;
491 if (force >= 0)
492 force = map->device_parents[force];
493 else
494 force = map->bucket_parents[-1-force];
495 if (force == 0)
496 break;
497 }
498 }
499 }
500
501 for (step = 0; step < rule->len; step++) {
502 firstn = 0;
503 switch (rule->steps[step].op) {
504 case CRUSH_RULE_TAKE:
505 w[0] = rule->steps[step].arg1;
506 if (force_pos >= 0) {
507 BUG_ON(force_context[force_pos] != w[0]);
508 force_pos--;
509 }
510 wsize = 1;
511 break;
512
513 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
514 case CRUSH_RULE_CHOOSE_FIRSTN:
515 firstn = 1;
516 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
517 case CRUSH_RULE_CHOOSE_INDEP:
518 BUG_ON(wsize == 0);
519
520 recurse_to_leaf =
521 rule->steps[step].op ==
522 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
523 rule->steps[step].op ==
524 CRUSH_RULE_CHOOSE_LEAF_INDEP;
525
526 /* reset output */
527 osize = 0;
528
529 for (i = 0; i < wsize; i++) {
530 /*
531 * see CRUSH_N, CRUSH_N_MINUS macros.
532 * basically, numrep <= 0 means relative to
533 * the provided result_max
534 */
535 numrep = rule->steps[step].arg1;
536 if (numrep <= 0) {
537 numrep += result_max;
538 if (numrep <= 0)
539 continue;
540 }
541 j = 0;
542 if (osize == 0 && force_pos >= 0) {
543 /* skip any intermediate types */
544 while (force_pos &&
545 force_context[force_pos] < 0 &&
546 rule->steps[step].arg2 !=
547 map->buckets[-1 -
548 force_context[force_pos]]->type)
549 force_pos--;
550 o[osize] = force_context[force_pos];
551 if (recurse_to_leaf)
552 c[osize] = force_context[0];
553 j++;
554 force_pos--;
555 }
556 osize += crush_choose(map,
557 map->buckets[-1-w[i]],
558 weight,
559 x, numrep,
560 rule->steps[step].arg2,
561 o+osize, j,
562 firstn,
563 recurse_to_leaf, c+osize);
564 }
565
566 if (recurse_to_leaf)
567 /* copy final _leaf_ values to output set */
568 memcpy(o, c, osize*sizeof(*o));
569
570 /* swap o and w arrays */
571 tmp = o;
572 o = w;
573 w = tmp;
574 wsize = osize;
575 break;
576
577
578 case CRUSH_RULE_EMIT:
579 for (i = 0; i < wsize && result_len < result_max; i++) {
580 result[result_len] = w[i];
581 result_len++;
582 }
583 wsize = 0;
584 break;
585
586 default:
587 BUG();
588 }
589 }
590 rc = result_len;
591
592out:
593 return rc;
594}
595
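/*
 * Editor's sketch of a typical rule walk (hypothetical rule):
 *
 *	step take root            -> w = { root bucket }
 *	step chooseleaf firstn 3 type host
 *	                          -> picks 3 distinct hosts into o, one
 *	                             device under each into c; c then
 *	                             replaces o before the w/o swap
 *	step emit                 -> copies w into result[]
 */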
596
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
new file mode 100644
index 000000000000..98e90046fd9f
--- /dev/null
+++ b/fs/ceph/crush/mapper.h
@@ -0,0 +1,20 @@
1#ifndef _CRUSH_MAPPER_H
2#define _CRUSH_MAPPER_H
3
4/*
5 * CRUSH functions for finding rules and then mapping an input to an
6 * output set.
7 *
8 * LGPL2
9 */
10
11#include "crush.h"
12
13extern int crush_find_rule(struct crush_map *map, int ruleset, int type, int size);
14extern int crush_do_rule(struct crush_map *map,
15 int ruleno,
16 int x, int *result, int result_max,
17 int forcefeed, /* -1 for none */
18 __u32 *weights);
19
20#endif
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
new file mode 100644
index 000000000000..291ac288e791
--- /dev/null
+++ b/fs/ceph/crypto.c
@@ -0,0 +1,408 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/scatterlist.h>
6#include <crypto/hash.h>
7
8#include "crypto.h"
9#include "decode.h"
10
11int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
12{
13 if (*p + sizeof(u16) + sizeof(key->created) +
14 sizeof(u16) + key->len > end)
15 return -ERANGE;
16 ceph_encode_16(p, key->type);
17 ceph_encode_copy(p, &key->created, sizeof(key->created));
18 ceph_encode_16(p, key->len);
19 ceph_encode_copy(p, key->key, key->len);
20 return 0;
21}
22
23int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
24{
25 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
26 key->type = ceph_decode_16(p);
27 ceph_decode_copy(p, &key->created, sizeof(key->created));
28 key->len = ceph_decode_16(p);
29 ceph_decode_need(p, end, key->len, bad);
30 key->key = kmalloc(key->len, GFP_NOFS);
31 if (!key->key)
32 return -ENOMEM;
33 ceph_decode_copy(p, key->key, key->len);
34 return 0;
35
36bad:
37 dout("failed to decode crypto key\n");
38 return -EINVAL;
39}
40
41int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
42{
43 int inlen = strlen(inkey);
44 int blen = inlen * 3 / 4;
45 void *buf, *p;
46 int ret;
47
48 dout("crypto_key_unarmor %s\n", inkey);
49 buf = kmalloc(blen, GFP_NOFS);
50 if (!buf)
51 return -ENOMEM;
52 blen = ceph_unarmor(buf, inkey, inkey+inlen);
53 if (blen < 0) {
54 kfree(buf);
55 return blen;
56 }
57
58 p = buf;
59 ret = ceph_crypto_key_decode(key, &p, p + blen);
60 kfree(buf);
61 if (ret)
62 return ret;
63 dout("crypto_key_unarmor key %p type %d len %d\n", key,
64 key->type, key->len);
65 return 0;
66}
67
68
69
70#define AES_KEY_SIZE 16
71
72static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
73{
74 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
75}
76
77static const u8 *aes_iv = "cephsageyudagreg";
78
79int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
80 const void *src, size_t src_len)
81{
82 struct scatterlist sg_in[2], sg_out[1];
83 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
84 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
85 int ret;
86 void *iv;
87 int ivsize;
88 size_t zero_padding = (0x10 - (src_len & 0x0f));
89 char pad[16];
90
91 if (IS_ERR(tfm))
92 return PTR_ERR(tfm);
93
94 memset(pad, zero_padding, zero_padding);
95
96 *dst_len = src_len + zero_padding;
97
98 crypto_blkcipher_setkey((void *)tfm, key, key_len);
99 sg_init_table(sg_in, 2);
100 sg_set_buf(&sg_in[0], src, src_len);
101 sg_set_buf(&sg_in[1], pad, zero_padding);
102 sg_init_table(sg_out, 1);
103 sg_set_buf(sg_out, dst, *dst_len);
104 iv = crypto_blkcipher_crt(tfm)->iv;
105 ivsize = crypto_blkcipher_ivsize(tfm);
106
107 memcpy(iv, aes_iv, ivsize);
108 /*
109 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
110 key, key_len, 1);
111 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
112 src, src_len, 1);
113 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
114 pad, zero_padding, 1);
115 */
116 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
117 src_len + zero_padding);
118 crypto_free_blkcipher(tfm);
119 if (ret < 0)
120 pr_err("ceph_aes_crypt failed %d\n", ret);
121 /*
122 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
123 dst, *dst_len, 1);
124 */
125 return ret;
126}
127
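/*
 * Padding example (editor's note): the block cipher needs 16-byte
 * blocks, so input is padded PKCS#7-style with 1..16 bytes, each
 * holding the pad length.  E.g. src_len = 13 -> 3 pad bytes of value
 * 3; src_len = 16 -> a full extra block of 16 bytes of value 16.
 * ceph_aes_decrypt() strips this again by reading the last plaintext
 * byte.
 */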
128int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
129 const void *src1, size_t src1_len,
130 const void *src2, size_t src2_len)
131{
132 struct scatterlist sg_in[3], sg_out[1];
133 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
134 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
135 int ret;
136 void *iv;
137 int ivsize;
138 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
139 char pad[16];
140
141 if (IS_ERR(tfm))
142 return PTR_ERR(tfm);
143
144 memset(pad, zero_padding, zero_padding);
145
146 *dst_len = src1_len + src2_len + zero_padding;
147
148 crypto_blkcipher_setkey((void *)tfm, key, key_len);
149 sg_init_table(sg_in, 3);
150 sg_set_buf(&sg_in[0], src1, src1_len);
151 sg_set_buf(&sg_in[1], src2, src2_len);
152 sg_set_buf(&sg_in[2], pad, zero_padding);
153 sg_init_table(sg_out, 1);
154 sg_set_buf(sg_out, dst, *dst_len);
155 iv = crypto_blkcipher_crt(tfm)->iv;
156 ivsize = crypto_blkcipher_ivsize(tfm);
157
158 memcpy(iv, aes_iv, ivsize);
159 /*
160 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
161 key, key_len, 1);
162 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
163 src1, src1_len, 1);
164 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
165 src2, src2_len, 1);
166 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
167 pad, zero_padding, 1);
168 */
169 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
170 src1_len + src2_len + zero_padding);
171 crypto_free_blkcipher(tfm);
172 if (ret < 0)
173 pr_err("ceph_aes_encrypt2 failed %d\n", ret);
174 /*
175 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
176 dst, *dst_len, 1);
177 */
178 return ret;
179}
180
181int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
182 const void *src, size_t src_len)
183{
184 struct scatterlist sg_in[1], sg_out[2];
185 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
186 struct blkcipher_desc desc = { .tfm = tfm };
187 char pad[16];
188 void *iv;
189 int ivsize;
190 int ret;
191 int last_byte;
192
193 if (IS_ERR(tfm))
194 return PTR_ERR(tfm);
195
196 crypto_blkcipher_setkey((void *)tfm, key, key_len);
197 sg_init_table(sg_in, 1);
198 sg_init_table(sg_out, 2);
199 sg_set_buf(sg_in, src, src_len);
200 sg_set_buf(&sg_out[0], dst, *dst_len);
201 sg_set_buf(&sg_out[1], pad, sizeof(pad));
202
203 iv = crypto_blkcipher_crt(tfm)->iv;
204 ivsize = crypto_blkcipher_ivsize(tfm);
205
206 memcpy(iv, aes_iv, ivsize);
207
208 /*
209 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
210 key, key_len, 1);
211 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
212 src, src_len, 1);
213 */
214
215 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
216 crypto_free_blkcipher(tfm);
217 if (ret < 0) {
218 pr_err("ceph_aes_decrypt failed %d\n", ret);
219 return ret;
220 }
221
222 if (src_len <= *dst_len)
223 last_byte = ((char *)dst)[src_len - 1];
224 else
225 last_byte = pad[src_len - *dst_len - 1];
226 if (last_byte <= 16 && src_len >= last_byte) {
227 *dst_len = src_len - last_byte;
228 } else {
229 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
230 last_byte, (int)src_len);
231 return -EPERM; /* bad padding */
232 }
233 /*
234 print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
235 dst, *dst_len, 1);
236 */
237 return 0;
238}
239
240int ceph_aes_decrypt2(const void *key, int key_len,
241 void *dst1, size_t *dst1_len,
242 void *dst2, size_t *dst2_len,
243 const void *src, size_t src_len)
244{
245 struct scatterlist sg_in[1], sg_out[3];
246 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
247 struct blkcipher_desc desc = { .tfm = tfm };
248 char pad[16];
249 void *iv;
250 int ivsize;
251 int ret;
252 int last_byte;
253
254 if (IS_ERR(tfm))
255 return PTR_ERR(tfm);
256
257 sg_init_table(sg_in, 1);
258 sg_set_buf(sg_in, src, src_len);
259 sg_init_table(sg_out, 3);
260 sg_set_buf(&sg_out[0], dst1, *dst1_len);
261 sg_set_buf(&sg_out[1], dst2, *dst2_len);
262 sg_set_buf(&sg_out[2], pad, sizeof(pad));
263
264 crypto_blkcipher_setkey((void *)tfm, key, key_len);
265 iv = crypto_blkcipher_crt(tfm)->iv;
266 ivsize = crypto_blkcipher_ivsize(tfm);
267
268 memcpy(iv, aes_iv, ivsize);
269
270 /*
271 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
272 key, key_len, 1);
273 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
274 src, src_len, 1);
275 */
276
277 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
278 crypto_free_blkcipher(tfm);
279 if (ret < 0) {
280 pr_err("ceph_aes_decrypt2 failed %d\n", ret);
281 return ret;
282 }
283
284 if (src_len <= *dst1_len)
285 last_byte = ((char *)dst1)[src_len - 1];
286 else if (src_len <= *dst1_len + *dst2_len)
287 last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
288 else
289 last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
290 if (last_byte <= 16 && src_len >= last_byte) {
291 src_len -= last_byte;
292 } else {
293 pr_err("ceph_aes_decrypt2 got bad padding %d on src len %d\n",
294 last_byte, (int)src_len);
295 return -EPERM; /* bad padding */
296 }
297
298 if (src_len < *dst1_len) {
299 *dst1_len = src_len;
300 *dst2_len = 0;
301 } else {
302 *dst2_len = src_len - *dst1_len;
303 }
304 /*
305 print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
306 dst1, *dst1_len, 1);
307 print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
308 dst2, *dst2_len, 1);
309 */
310
311 return 0;
312}
313
314
315int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
316 const void *src, size_t src_len)
317{
318 switch (secret->type) {
319 case CEPH_CRYPTO_NONE:
320 if (*dst_len < src_len)
321 return -ERANGE;
322 memcpy(dst, src, src_len);
323 *dst_len = src_len;
324 return 0;
325
326 case CEPH_CRYPTO_AES:
327 return ceph_aes_decrypt(secret->key, secret->len, dst,
328 dst_len, src, src_len);
329
330 default:
331 return -EINVAL;
332 }
333}
334
335int ceph_decrypt2(struct ceph_crypto_key *secret,
336 void *dst1, size_t *dst1_len,
337 void *dst2, size_t *dst2_len,
338 const void *src, size_t src_len)
339{
340 size_t t;
341
342 switch (secret->type) {
343 case CEPH_CRYPTO_NONE:
344 if (*dst1_len + *dst2_len < src_len)
345 return -ERANGE;
346 t = min(*dst1_len, src_len);
347 memcpy(dst1, src, t);
348 *dst1_len = t;
349 src += t;
350 src_len -= t;
351 if (src_len) {
352 t = min(*dst2_len, src_len);
353 memcpy(dst2, src, t);
354 *dst2_len = t;
355 }
356 return 0;
357
358 case CEPH_CRYPTO_AES:
359 return ceph_aes_decrypt2(secret->key, secret->len,
360 dst1, dst1_len, dst2, dst2_len,
361 src, src_len);
362
363 default:
364 return -EINVAL;
365 }
366}
367
368int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
369 const void *src, size_t src_len)
370{
371 switch (secret->type) {
372 case CEPH_CRYPTO_NONE:
373 if (*dst_len < src_len)
374 return -ERANGE;
375 memcpy(dst, src, src_len);
376 *dst_len = src_len;
377 return 0;
378
379 case CEPH_CRYPTO_AES:
380 return ceph_aes_encrypt(secret->key, secret->len, dst,
381 dst_len, src, src_len);
382
383 default:
384 return -EINVAL;
385 }
386}
387
388int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
389 const void *src1, size_t src1_len,
390 const void *src2, size_t src2_len)
391{
392 switch (secret->type) {
393 case CEPH_CRYPTO_NONE:
394 if (*dst_len < src1_len + src2_len)
395 return -ERANGE;
396 memcpy(dst, src1, src1_len);
397 memcpy(dst + src1_len, src2, src2_len);
398 *dst_len = src1_len + src2_len;
399 return 0;
400
401 case CEPH_CRYPTO_AES:
402 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
403 src1, src1_len, src2, src2_len);
404
405 default:
406 return -EINVAL;
407 }
408}
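
/*
 * Usage sketch (editor's note): round-trip with a decoded key.  Buffer
 * sizing is the caller's responsibility; AES output can be up to one
 * block (16 bytes) longer than the input due to padding.
 *
 *	char enc[64], dec[64];
 *	size_t enc_len = sizeof(enc), dec_len = sizeof(dec);
 *	if (ceph_encrypt(&key, enc, &enc_len, "secret", 6) == 0)
 *		ceph_decrypt(&key, dec, &dec_len, enc, enc_len);
 */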
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
new file mode 100644
index 000000000000..40b502e6bd89
--- /dev/null
+++ b/fs/ceph/crypto.h
@@ -0,0 +1,48 @@
1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * cryptographic secret
9 */
10struct ceph_crypto_key {
11 int type;
12 struct ceph_timespec created;
13 int len;
14 void *key;
15};
16
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{
19 kfree(key->key);
20}
21
22extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
23 void **p, void *end);
24extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
25 void **p, void *end);
26extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
27
28/* crypto.c */
29extern int ceph_decrypt(struct ceph_crypto_key *secret,
30 void *dst, size_t *dst_len,
31 const void *src, size_t src_len);
32extern int ceph_encrypt(struct ceph_crypto_key *secret,
33 void *dst, size_t *dst_len,
34 const void *src, size_t src_len);
35extern int ceph_decrypt2(struct ceph_crypto_key *secret,
36 void *dst1, size_t *dst1_len,
37 void *dst2, size_t *dst2_len,
38 const void *src, size_t src_len);
39extern int ceph_encrypt2(struct ceph_crypto_key *secret,
40 void *dst, size_t *dst_len,
41 const void *src1, size_t src1_len,
42 const void *src2, size_t src2_len);
43
44/* armor.c */
45extern int ceph_armor(char *dst, const void *src, const void *end);
46extern int ceph_unarmor(void *dst, const char *src, const char *end);
47
48#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
new file mode 100644
index 000000000000..e159f1415110
--- /dev/null
+++ b/fs/ceph/debugfs.c
@@ -0,0 +1,483 @@
1#include "ceph_debug.h"
2
3#include <linux/device.h>
4#include <linux/module.h>
5#include <linux/ctype.h>
6#include <linux/debugfs.h>
7#include <linux/seq_file.h>
8
9#include "super.h"
10#include "mds_client.h"
11#include "mon_client.h"
12#include "auth.h"
13
14#ifdef CONFIG_DEBUG_FS
15
16/*
17 * Implement /sys/kernel/debug/ceph fun
18 *
19 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
20 * .../osdmap - current osdmap
21 * .../mdsmap - current mdsmap
22 * .../monmap - current monmap
23 * .../osdc - active osd requests
24 * .../mdsc - active mds requests
25 * .../monc - mon client state
26 * .../dentry_lru - dump contents of dentry lru
27 * .../caps - expose cap (reservation) stats
28 * .../bdi - symlink to ../../bdi/something
29 */
30
31static struct dentry *ceph_debugfs_dir;
32
33static int monmap_show(struct seq_file *s, void *p)
34{
35 int i;
36 struct ceph_client *client = s->private;
37
38 if (client->monc.monmap == NULL)
39 return 0;
40
41 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
42 for (i = 0; i < client->monc.monmap->num_mon; i++) {
43 struct ceph_entity_inst *inst =
44 &client->monc.monmap->mon_inst[i];
45
46 seq_printf(s, "\t%s%lld\t%s\n",
47 ENTITY_NAME(inst->name),
48 pr_addr(&inst->addr.in_addr));
49 }
50 return 0;
51}
52
53static int mdsmap_show(struct seq_file *s, void *p)
54{
55 int i;
56 struct ceph_client *client = s->private;
57
58 if (client->mdsc.mdsmap == NULL)
59 return 0;
60 seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
61 seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
62 seq_printf(s, "session_timeout %d\n",
63 client->mdsc.mdsmap->m_session_timeout);
64 seq_printf(s, "session_autoclose %d\n",
65 client->mdsc.mdsmap->m_session_autoclose);
66 for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
67 struct ceph_entity_addr *addr =
68 &client->mdsc.mdsmap->m_info[i].addr;
69 int state = client->mdsc.mdsmap->m_info[i].state;
70
71 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
72 ceph_mds_state_name(state));
73 }
74 return 0;
75}
76
77static int osdmap_show(struct seq_file *s, void *p)
78{
79 int i;
80 struct ceph_client *client = s->private;
81 struct rb_node *n;
82
83 if (client->osdc.osdmap == NULL)
84 return 0;
85 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
86 seq_printf(s, "flags%s%s\n",
87 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
88 " NEARFULL" : "",
89 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
90 " FULL" : "");
91 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
92 struct ceph_pg_pool_info *pool =
93 rb_entry(n, struct ceph_pg_pool_info, node);
94 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
95 pool->id, pool->v.pg_num, pool->pg_num_mask,
96 pool->v.lpg_num, pool->lpg_num_mask);
97 }
98 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
99 struct ceph_entity_addr *addr =
100 &client->osdc.osdmap->osd_addr[i];
101 int state = client->osdc.osdmap->osd_state[i];
102 char sb[64];
103
104 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
105 i, pr_addr(&addr->in_addr),
106 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
107 ceph_osdmap_state_str(sb, sizeof(sb), state));
108 }
109 return 0;
110}
111
112static int monc_show(struct seq_file *s, void *p)
113{
114 struct ceph_client *client = s->private;
115 struct ceph_mon_statfs_request *req;
116 struct ceph_mon_client *monc = &client->monc;
117 struct rb_node *rp;
118
119 mutex_lock(&monc->mutex);
120
121 if (monc->have_mdsmap)
122 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
123 if (monc->have_osdmap)
124 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
125 if (monc->want_next_osdmap)
126 seq_printf(s, "want next osdmap\n");
127
128 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) {
129 req = rb_entry(rp, struct ceph_mon_statfs_request, node);
130 seq_printf(s, "%lld statfs\n", req->tid);
131 }
132
133 mutex_unlock(&monc->mutex);
134 return 0;
135}
136
137static int mdsc_show(struct seq_file *s, void *p)
138{
139 struct ceph_client *client = s->private;
140 struct ceph_mds_client *mdsc = &client->mdsc;
141 struct ceph_mds_request *req;
142 struct rb_node *rp;
143 int pathlen;
144 u64 pathbase;
145 char *path;
146
147 mutex_lock(&mdsc->mutex);
148 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
149 req = rb_entry(rp, struct ceph_mds_request, r_node);
150
151 if (req->r_request)
152 seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
153 else
154 seq_printf(s, "%lld\t(no request)\t", req->r_tid);
155
156 seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
157
158 if (req->r_got_unsafe)
159 seq_printf(s, "\t(unsafe)");
160 else
161 seq_printf(s, "\t");
162
163 if (req->r_inode) {
164 seq_printf(s, " #%llx", ceph_ino(req->r_inode));
165 } else if (req->r_dentry) {
166 path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
167 &pathbase, 0);
168 spin_lock(&req->r_dentry->d_lock);
169 seq_printf(s, " #%llx/%.*s (%s)",
170 ceph_ino(req->r_dentry->d_parent->d_inode),
171 req->r_dentry->d_name.len,
172 req->r_dentry->d_name.name,
173 path ? path : "");
174 spin_unlock(&req->r_dentry->d_lock);
175 kfree(path);
176 } else if (req->r_path1) {
177 seq_printf(s, " #%llx/%s", req->r_ino1.ino,
178 req->r_path1);
179 }
180
181 if (req->r_old_dentry) {
182 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
183 &pathbase, 0);
184 spin_lock(&req->r_old_dentry->d_lock);
185 seq_printf(s, " #%llx/%.*s (%s)",
186 ceph_ino(req->r_old_dentry->d_parent->d_inode),
187 req->r_old_dentry->d_name.len,
188 req->r_old_dentry->d_name.name,
189 path ? path : "");
190 spin_unlock(&req->r_old_dentry->d_lock);
191 kfree(path);
192 } else if (req->r_path2) {
193 if (req->r_ino2.ino)
194 seq_printf(s, " #%llx/%s", req->r_ino2.ino,
195 req->r_path2);
196 else
197 seq_printf(s, " %s", req->r_path2);
198 }
199
200 seq_printf(s, "\n");
201 }
202 mutex_unlock(&mdsc->mutex);
203
204 return 0;
205}
206
207static int osdc_show(struct seq_file *s, void *pp)
208{
209 struct ceph_client *client = s->private;
210 struct ceph_osd_client *osdc = &client->osdc;
211 struct rb_node *p;
212
213 mutex_lock(&osdc->request_mutex);
214 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
215 struct ceph_osd_request *req;
216 struct ceph_osd_request_head *head;
217 struct ceph_osd_op *op;
218 int num_ops;
219 int opcode, olen;
220 int i;
221
222 req = rb_entry(p, struct ceph_osd_request, r_node);
223
224 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
225 req->r_osd ? req->r_osd->o_osd : -1,
226 le32_to_cpu(req->r_pgid.pool),
227 le16_to_cpu(req->r_pgid.ps));
228
229 head = req->r_request->front.iov_base;
230 op = (void *)(head + 1);
231
232 num_ops = le16_to_cpu(head->num_ops);
233 olen = le32_to_cpu(head->object_len);
234 seq_printf(s, "%.*s", olen,
235 (const char *)(head->ops + num_ops));
236
237 if (req->r_reassert_version.epoch)
238 seq_printf(s, "\t%u'%llu",
239 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
240 le64_to_cpu(req->r_reassert_version.version));
241 else
242 seq_printf(s, "\t");
243
244 for (i = 0; i < num_ops; i++) {
245 opcode = le16_to_cpu(op->op);
246 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
247 op++;
248 }
249
250 seq_printf(s, "\n");
251 }
252 mutex_unlock(&osdc->request_mutex);
253 return 0;
254}
255
256static int caps_show(struct seq_file *s, void *p)
257{
258 struct ceph_client *client = s->private;
259 int total, avail, used, reserved, min;
260
261 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
262 seq_printf(s, "total\t\t%d\n"
263 "avail\t\t%d\n"
264 "used\t\t%d\n"
265 "reserved\t%d\n"
266 "min\t%d\n",
267 total, avail, used, reserved, min);
268 return 0;
269}
270
271static int dentry_lru_show(struct seq_file *s, void *ptr)
272{
273 struct ceph_client *client = s->private;
274 struct ceph_mds_client *mdsc = &client->mdsc;
275 struct ceph_dentry_info *di;
276
277 spin_lock(&mdsc->dentry_lru_lock);
278 list_for_each_entry(di, &mdsc->dentry_lru, lru) {
279 struct dentry *dentry = di->dentry;
280 seq_printf(s, "%p %p\t%.*s\n",
281 di, dentry, dentry->d_name.len, dentry->d_name.name);
282 }
283 spin_unlock(&mdsc->dentry_lru_lock);
284
285 return 0;
286}
287
288#define DEFINE_SHOW_FUNC(name) \
289static int name##_open(struct inode *inode, struct file *file) \
290{ \
291 struct seq_file *sf; \
292 int ret; \
293 \
294 ret = single_open(file, name, NULL); \
295 sf = file->private_data; \
296 sf->private = inode->i_private; \
297 return ret; \
298} \
299 \
300static const struct file_operations name##_fops = { \
301 .open = name##_open, \
302 .read = seq_read, \
303 .llseek = seq_lseek, \
304 .release = single_release, \
305};
306
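/*
 * Editor's note: each DEFINE_SHOW_FUNC(name) invocation below expands
 * to a name##_open() wrapper (single_open() plus stashing
 * inode->i_private in seq_file->private) and a name##_fops
 * file_operations, which is what debugfs_create_file() then takes,
 * e.g. monc_show -> monc_show_open() and monc_show_fops.
 */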
307DEFINE_SHOW_FUNC(monmap_show)
308DEFINE_SHOW_FUNC(mdsmap_show)
309DEFINE_SHOW_FUNC(osdmap_show)
310DEFINE_SHOW_FUNC(monc_show)
311DEFINE_SHOW_FUNC(mdsc_show)
312DEFINE_SHOW_FUNC(osdc_show)
313DEFINE_SHOW_FUNC(dentry_lru_show)
314DEFINE_SHOW_FUNC(caps_show)
315
316static int congestion_kb_set(void *data, u64 val)
317{
318 struct ceph_client *client = (struct ceph_client *)data;
319
320 if (client)
321 client->mount_args->congestion_kb = (int)val;
322
323 return 0;
324}
325
326static int congestion_kb_get(void *data, u64 *val)
327{
328 struct ceph_client *client = (struct ceph_client *)data;
329
330 if (client)
331 *val = (u64)client->mount_args->congestion_kb;
332
333 return 0;
334}
335
336
337DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
338 congestion_kb_set, "%llu\n");
339
340int __init ceph_debugfs_init(void)
341{
342 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
343 if (!ceph_debugfs_dir)
344 return -ENOMEM;
345 return 0;
346}
347
348void ceph_debugfs_cleanup(void)
349{
350 debugfs_remove(ceph_debugfs_dir);
351}
352
353int ceph_debugfs_client_init(struct ceph_client *client)
354{
355 int ret = 0;
356 char name[80];
357
358 snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
359 PR_FSID(&client->fsid), client->monc.auth->global_id);
360
361 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
362 if (!client->debugfs_dir)
363 goto out;
364
365 client->monc.debugfs_file = debugfs_create_file("monc",
366 0600,
367 client->debugfs_dir,
368 client,
369 &monc_show_fops);
370 if (!client->monc.debugfs_file)
371 goto out;
372
373 client->mdsc.debugfs_file = debugfs_create_file("mdsc",
374 0600,
375 client->debugfs_dir,
376 client,
377 &mdsc_show_fops);
378 if (!client->mdsc.debugfs_file)
379 goto out;
380
381 client->osdc.debugfs_file = debugfs_create_file("osdc",
382 0600,
383 client->debugfs_dir,
384 client,
385 &osdc_show_fops);
386 if (!client->osdc.debugfs_file)
387 goto out;
388
389 client->debugfs_monmap = debugfs_create_file("monmap",
390 0600,
391 client->debugfs_dir,
392 client,
393 &monmap_show_fops);
394 if (!client->debugfs_monmap)
395 goto out;
396
397 client->debugfs_mdsmap = debugfs_create_file("mdsmap",
398 0600,
399 client->debugfs_dir,
400 client,
401 &mdsmap_show_fops);
402 if (!client->debugfs_mdsmap)
403 goto out;
404
405 client->debugfs_osdmap = debugfs_create_file("osdmap",
406 0600,
407 client->debugfs_dir,
408 client,
409 &osdmap_show_fops);
410 if (!client->debugfs_osdmap)
411 goto out;
412
413 client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
414 0600,
415 client->debugfs_dir,
416 client,
417 &dentry_lru_show_fops);
418 if (!client->debugfs_dentry_lru)
419 goto out;
420
421 client->debugfs_caps = debugfs_create_file("caps",
422 0400,
423 client->debugfs_dir,
424 client,
425 &caps_show_fops);
426 if (!client->debugfs_caps)
427 goto out;
428
429 client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb",
430 0600,
431 client->debugfs_dir,
432 client,
433 &congestion_kb_fops);
434 if (!client->debugfs_congestion_kb)
435 goto out;
436
437 snprintf(name, sizeof(name), "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
438 client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
439 name);
440
441 return 0;
442
443out:
444 ceph_debugfs_client_cleanup(client);
445 return ret;
446}
447
448void ceph_debugfs_client_cleanup(struct ceph_client *client)
449{
450 debugfs_remove(client->debugfs_bdi);
451 debugfs_remove(client->debugfs_caps);
452 debugfs_remove(client->debugfs_dentry_lru);
453 debugfs_remove(client->debugfs_osdmap);
454 debugfs_remove(client->debugfs_mdsmap);
455 debugfs_remove(client->debugfs_monmap);
456 debugfs_remove(client->osdc.debugfs_file);
457 debugfs_remove(client->mdsc.debugfs_file);
458 debugfs_remove(client->monc.debugfs_file);
459 debugfs_remove(client->debugfs_congestion_kb);
460 debugfs_remove(client->debugfs_dir);
461}
462
463#else /* CONFIG_DEBUG_FS */
464
465int __init ceph_debugfs_init(void)
466{
467 return 0;
468}
469
470void ceph_debugfs_cleanup(void)
471{
472}
473
474int ceph_debugfs_client_init(struct ceph_client *client)
475{
476 return 0;
477}
478
479void ceph_debugfs_client_cleanup(struct ceph_client *client)
480{
481}
482
483#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
new file mode 100644
index 000000000000..65b3e022eaf5
--- /dev/null
+++ b/fs/ceph/decode.h
@@ -0,0 +1,194 @@
1#ifndef __CEPH_DECODE_H
2#define __CEPH_DECODE_H
3
4#include <asm/unaligned.h>
5#include <linux/time.h>
6
7#include "types.h"
8
9/*
10 * in all cases,
11 * void **p pointer to position pointer
12 * void *end pointer to end of buffer (last byte + 1)
13 */
14
15static inline u64 ceph_decode_64(void **p)
16{
17 u64 v = get_unaligned_le64(*p);
18 *p += sizeof(u64);
19 return v;
20}
21static inline u32 ceph_decode_32(void **p)
22{
23 u32 v = get_unaligned_le32(*p);
24 *p += sizeof(u32);
25 return v;
26}
27static inline u16 ceph_decode_16(void **p)
28{
29 u16 v = get_unaligned_le16(*p);
30 *p += sizeof(u16);
31 return v;
32}
33static inline u8 ceph_decode_8(void **p)
34{
35 u8 v = *(u8 *)*p;
36 (*p)++;
37 return v;
38}
39static inline void ceph_decode_copy(void **p, void *pv, size_t n)
40{
41 memcpy(pv, *p, n);
42 *p += n;
43}
44
45/*
46 * bounds check input.
47 */
48#define ceph_decode_need(p, end, n, bad) \
49 do { \
50 if (unlikely(*(p) + (n) > (end))) \
51 goto bad; \
52 } while (0)
53
54#define ceph_decode_64_safe(p, end, v, bad) \
55 do { \
56 ceph_decode_need(p, end, sizeof(u64), bad); \
57 v = ceph_decode_64(p); \
58 } while (0)
59#define ceph_decode_32_safe(p, end, v, bad) \
60 do { \
61 ceph_decode_need(p, end, sizeof(u32), bad); \
62 v = ceph_decode_32(p); \
63 } while (0)
64#define ceph_decode_16_safe(p, end, v, bad) \
65 do { \
66 ceph_decode_need(p, end, sizeof(u16), bad); \
67 v = ceph_decode_16(p); \
68 } while (0)
69#define ceph_decode_8_safe(p, end, v, bad) \
70 do { \
71 ceph_decode_need(p, end, sizeof(u8), bad); \
72 v = ceph_decode_8(p); \
73 } while (0)
74
75#define ceph_decode_copy_safe(p, end, pv, n, bad) \
76 do { \
77 ceph_decode_need(p, end, n, bad); \
78 ceph_decode_copy(p, pv, n); \
79 } while (0)
80
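/*
 * Usage sketch (editor's note): decode a length-prefixed blob with
 * bounds checking, where p/end follow the convention above, "buf" is
 * a hypothetical destination, and "bad" is a local error label:
 *
 *	u32 len;
 *	ceph_decode_32_safe(&p, end, len, bad);
 *	ceph_decode_need(&p, end, len, bad);
 *	ceph_decode_copy(&p, buf, len);
 */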
81/*
82 * struct ceph_timespec <-> struct timespec
83 */
84static inline void ceph_decode_timespec(struct timespec *ts,
85 const struct ceph_timespec *tv)
86{
87 ts->tv_sec = le32_to_cpu(tv->tv_sec);
88 ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
89}
90static inline void ceph_encode_timespec(struct ceph_timespec *tv,
91 const struct timespec *ts)
92{
93 tv->tv_sec = cpu_to_le32(ts->tv_sec);
94 tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
95}
96
97/*
98 * sockaddr_storage <-> ceph_sockaddr
99 */
100static inline void ceph_encode_addr(struct ceph_entity_addr *a)
101{
102 a->in_addr.ss_family = htons(a->in_addr.ss_family);
103}
104static inline void ceph_decode_addr(struct ceph_entity_addr *a)
105{
106 a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
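 /* editor's note: 512 is AF_INET byte-swapped, i.e. the family was not properly converted to network order by the sender */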
107 WARN_ON(a->in_addr.ss_family == 512);
108}
109
110/*
111 * encoders
112 */
113static inline void ceph_encode_64(void **p, u64 v)
114{
115 put_unaligned_le64(v, (__le64 *)*p);
116 *p += sizeof(u64);
117}
118static inline void ceph_encode_32(void **p, u32 v)
119{
120 put_unaligned_le32(v, (__le32 *)*p);
121 *p += sizeof(u32);
122}
123static inline void ceph_encode_16(void **p, u16 v)
124{
125 put_unaligned_le16(v, (__le16 *)*p);
126 *p += sizeof(u16);
127}
128static inline void ceph_encode_8(void **p, u8 v)
129{
130 *(u8 *)*p = v;
131 (*p)++;
132}
133static inline void ceph_encode_copy(void **p, const void *s, int len)
134{
135 memcpy(*p, s, len);
136 *p += len;
137}
138
139/*
140 * filepath, string encoders
141 */
142static inline void ceph_encode_filepath(void **p, void *end,
143 u64 ino, const char *path)
144{
145 u32 len = path ? strlen(path) : 0;
146 BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
147 ceph_encode_8(p, 1);
148 ceph_encode_64(p, ino);
149 ceph_encode_32(p, len);
150 if (len)
151 memcpy(*p, path, len);
152 *p += len;
153}
154
155static inline void ceph_encode_string(void **p, void *end,
156 const char *s, u32 len)
157{
158 BUG_ON(*p + sizeof(len) + len > end);
159 ceph_encode_32(p, len);
160 if (len)
161 memcpy(*p, s, len);
162 *p += len;
163}
164
165#define ceph_encode_need(p, end, n, bad) \
166 do { \
167 if (unlikely(*(p) + (n) > (end))) \
168 goto bad; \
169 } while (0)
170
171#define ceph_encode_64_safe(p, end, v, bad) \
172 do { \
173 ceph_encode_need(p, end, sizeof(u64), bad); \
174 ceph_encode_64(p, v); \
175 } while (0)
176#define ceph_encode_32_safe(p, end, v, bad) \
177 do { \
178 ceph_encode_need(p, end, sizeof(u32), bad); \
179 ceph_encode_32(p, v); \
180 } while (0)
181#define ceph_encode_16_safe(p, end, v, bad) \
182 do { \
183 ceph_encode_need(p, end, sizeof(u16), bad); \
184 ceph_encode_16(p, v); \
185 } while (0)
186
187#define ceph_encode_copy_safe(p, end, pv, n, bad) \
188 do { \
189 ceph_encode_need(p, end, n, bad); \
190 ceph_encode_copy(p, pv, n); \
191 } while (0)
192
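/*
 * Usage sketch (editor's note): the encoders mirror the decoders; the
 * caller walks p through a preallocated buffer:
 *
 *	void *p = buf, *end = buf + buf_len;
 *	ceph_encode_32_safe(&p, end, 42, bad);
 *	ceph_encode_string(&p, end, "foo", 3);
 */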
193
194#endif
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
new file mode 100644
index 000000000000..8a9116e15b70
--- /dev/null
+++ b/fs/ceph/dir.c
@@ -0,0 +1,1222 @@
1#include "ceph_debug.h"
2
3#include <linux/spinlock.h>
4#include <linux/fs_struct.h>
5#include <linux/namei.h>
6#include <linux/sched.h>
7
8#include "super.h"
9
10/*
11 * Directory operations: readdir, lookup, create, link, unlink,
12 * rename, etc.
13 */
14
15/*
16 * Ceph MDS operations are specified in terms of a base ino and
17 * relative path. Thus, the client can specify an operation on a
18 * specific inode (e.g., a getattr due to fstat(2)), or as a path
19 * relative to, say, the root directory.
20 *
21 * Normally, we limit ourselves to strict inode ops (no path component)
22 * or dentry operations (a single path component relative to an ino). The
23 * exception to this is open_root_dentry(), which will open the mount
24 * point by name.
25 */
26
27const struct inode_operations ceph_dir_iops;
28const struct file_operations ceph_dir_fops;
29struct dentry_operations ceph_dentry_ops;
30
31/*
32 * Initialize ceph dentry state.
33 */
34int ceph_init_dentry(struct dentry *dentry)
35{
36 struct ceph_dentry_info *di;
37
38 if (dentry->d_fsdata)
39 return 0;
40
41 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
42 dentry->d_op = &ceph_dentry_ops;
43 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
44 dentry->d_op = &ceph_snapdir_dentry_ops;
45 else
46 dentry->d_op = &ceph_snap_dentry_ops;
47
48 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
49 if (!di)
50 return -ENOMEM; /* oh well */
51
52 spin_lock(&dentry->d_lock);
53 if (dentry->d_fsdata) /* lost a race */
54 goto out_unlock;
55 di->dentry = dentry;
56 di->lease_session = NULL;
57 dentry->d_fsdata = di;
58 dentry->d_time = jiffies;
59 ceph_dentry_lru_add(dentry);
60out_unlock:
61 spin_unlock(&dentry->d_lock);
62 return 0;
63}
64
65
66
67/*
68 * for readdir, we encode the directory frag and offset within that
69 * frag into f_pos.
70 */
71static unsigned fpos_frag(loff_t p)
72{
73 return p >> 32;
74}
75static unsigned fpos_off(loff_t p)
76{
77 return p & 0xffffffff;
78}
79
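/*
 * Editor's note: ceph_make_fpos() (presumably the inverse helper in
 * super.h) packs these, e.g. frag 0x2, off 5 -> f_pos
 * 0x0000000200000005, so fpos_frag() yields 0x2 and fpos_off() 5.
 * Offsets 0 and 1 of frag 0 are reserved for "." and "..".
 */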
80/*
81 * When possible, we try to satisfy a readdir by peeking at the
82 * dcache. We make this work by carefully ordering dentries on
83 * d_u.d_child when we initially get results back from the MDS, and
84 * falling back to a "normal" sync readdir if any dentries in the dir
85 * are dropped.
86 *
87 * I_COMPLETE indicates we have all dentries in the dir. It is
88 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
89 * the MDS if/when the directory is modified).
90 */
91static int __dcache_readdir(struct file *filp,
92 void *dirent, filldir_t filldir)
93{
94 struct inode *inode = filp->f_dentry->d_inode;
95 struct ceph_file_info *fi = filp->private_data;
96 struct dentry *parent = filp->f_dentry;
97 struct inode *dir = parent->d_inode;
98 struct list_head *p;
99 struct dentry *dentry, *last;
100 struct ceph_dentry_info *di;
101 int err = 0;
102
103 /* claim ref on last dentry we returned */
104 last = fi->dentry;
105 fi->dentry = NULL;
106
107 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
108 last);
109
110 spin_lock(&dcache_lock);
111
112 /* start at beginning? */
113 if (filp->f_pos == 2 || (last &&
114 filp->f_pos < ceph_dentry(last)->offset)) {
115 if (list_empty(&parent->d_subdirs))
116 goto out_unlock;
117 p = parent->d_subdirs.prev;
118 dout(" initial p %p/%p\n", p->prev, p->next);
119 } else {
120 p = last->d_u.d_child.prev;
121 }
122
123more:
124 dentry = list_entry(p, struct dentry, d_u.d_child);
125 di = ceph_dentry(dentry);
126 while (1) {
127 dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
128 parent->d_subdirs.prev, parent->d_subdirs.next);
129 if (p == &parent->d_subdirs) {
130 fi->at_end = 1;
131 goto out_unlock;
132 }
133 if (!d_unhashed(dentry) && dentry->d_inode &&
134 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
135 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
136 filp->f_pos <= di->offset)
137 break;
138 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
139 dentry->d_name.len, dentry->d_name.name, di->offset,
140 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
141 !dentry->d_inode ? " null" : "");
142 p = p->prev;
143 dentry = list_entry(p, struct dentry, d_u.d_child);
144 di = ceph_dentry(dentry);
145 }
146
147 atomic_inc(&dentry->d_count);
148 spin_unlock(&dcache_lock);
149 spin_unlock(&inode->i_lock);
150
151 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
152 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
153 filp->f_pos = di->offset;
154 err = filldir(dirent, dentry->d_name.name,
155 dentry->d_name.len, di->offset,
156 dentry->d_inode->i_ino,
157 dentry->d_inode->i_mode >> 12);
158
159 if (last) {
160 if (err < 0) {
161 /* remember our position */
162 fi->dentry = last;
163 fi->next_offset = di->offset;
164 } else {
165 dput(last);
166 }
167 last = NULL;
168 }
169
170 spin_lock(&inode->i_lock);
171 spin_lock(&dcache_lock);
172
173 if (err < 0)
174 goto out_unlock;
175
176 last = dentry;
177
178 p = p->prev;
179 filp->f_pos++;
180
181 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
182 if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
183 goto more;
184 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
185 err = -EAGAIN;
186
187out_unlock:
188 spin_unlock(&dcache_lock);
189
190 if (last) {
191 spin_unlock(&inode->i_lock);
192 dput(last);
193 spin_lock(&inode->i_lock);
194 }
195
196 return err;
197}
198
199/*
200 * make note of the last dentry we read, so we can
201 * continue at the same lexicographical point,
202 * regardless of what dir changes take place on the
203 * server.
204 */
205static int note_last_dentry(struct ceph_file_info *fi, const char *name,
206 int len)
207{
208 kfree(fi->last_name);
209 fi->last_name = kmalloc(len+1, GFP_NOFS);
210 if (!fi->last_name)
211 return -ENOMEM;
212 memcpy(fi->last_name, name, len);
213 fi->last_name[len] = 0;
214 dout("note_last_dentry '%s'\n", fi->last_name);
215 return 0;
216}
217
218static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
219{
220 struct ceph_file_info *fi = filp->private_data;
221 struct inode *inode = filp->f_dentry->d_inode;
222 struct ceph_inode_info *ci = ceph_inode(inode);
223 struct ceph_client *client = ceph_inode_to_client(inode);
224 struct ceph_mds_client *mdsc = &client->mdsc;
225 unsigned frag = fpos_frag(filp->f_pos);
226 int off = fpos_off(filp->f_pos);
227 int err;
228 u32 ftype;
229 struct ceph_mds_reply_info_parsed *rinfo;
230 const int max_entries = client->mount_args->max_readdir;
231
232 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
233 if (fi->at_end)
234 return 0;
235
236 /* always start with . and .. */
237 if (filp->f_pos == 0) {
238 /* note dir version at start of readdir so we can tell
239 * if any dentries get dropped */
240 fi->dir_release_count = ci->i_release_count;
241
242 dout("readdir off 0 -> '.'\n");
243 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
244 inode->i_ino, inode->i_mode >> 12) < 0)
245 return 0;
246 filp->f_pos = 1;
247 off = 1;
248 }
249 if (filp->f_pos == 1) {
250 dout("readdir off 1 -> '..'\n");
251 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
252 filp->f_dentry->d_parent->d_inode->i_ino,
253 inode->i_mode >> 12) < 0)
254 return 0;
255 filp->f_pos = 2;
256 off = 2;
257 }
258
259 /* can we use the dcache? */
260 spin_lock(&inode->i_lock);
261 if ((filp->f_pos == 2 || fi->dentry) &&
262 !ceph_test_opt(client, NOASYNCREADDIR) &&
263 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
264 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
265 err = __dcache_readdir(filp, dirent, filldir);
266 if (err != -EAGAIN) {
267 spin_unlock(&inode->i_lock);
268 return err;
269 }
270 }
271 spin_unlock(&inode->i_lock);
272 if (fi->dentry) {
273 err = note_last_dentry(fi, fi->dentry->d_name.name,
274 fi->dentry->d_name.len);
275 if (err)
276 return err;
277 dput(fi->dentry);
278 fi->dentry = NULL;
279 }
280
281 /* proceed with a normal readdir */
282
283more:
284 /* do we have the correct frag content buffered? */
285 if (fi->frag != frag || fi->last_readdir == NULL) {
286 struct ceph_mds_request *req;
287 int op = ceph_snap(inode) == CEPH_SNAPDIR ?
288 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
289
290 /* discard old result, if any */
291 if (fi->last_readdir) {
292 ceph_mdsc_put_request(fi->last_readdir);
293 fi->last_readdir = NULL;
294 }
295
296 /* requery frag tree, as the frag topology may have changed */
297 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
298
299 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
300 ceph_vinop(inode), frag, fi->last_name);
301 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
302 if (IS_ERR(req))
303 return PTR_ERR(req);
304 req->r_inode = igrab(inode);
305 req->r_dentry = dget(filp->f_dentry);
306 /* hints to request -> mds selection code */
307 req->r_direct_mode = USE_AUTH_MDS;
308 req->r_direct_hash = ceph_frag_value(frag);
309 req->r_direct_is_hash = true;
310 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
311 req->r_readdir_offset = fi->next_offset;
312 req->r_args.readdir.frag = cpu_to_le32(frag);
313 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
314 req->r_num_caps = max_entries;
315 err = ceph_mdsc_do_request(mdsc, NULL, req);
316 if (err < 0) {
317 ceph_mdsc_put_request(req);
318 return err;
319 }
320 dout("readdir got and parsed readdir result=%d"
321 " on frag %x, end=%d, complete=%d\n", err, frag,
322 (int)req->r_reply_info.dir_end,
323 (int)req->r_reply_info.dir_complete);
324
325 if (!req->r_did_prepopulate) {
326 dout("readdir !did_prepopulate");
327 fi->dir_release_count--; /* preclude I_COMPLETE */
328 }
329
330 /* note next offset and last dentry name */
331 fi->offset = fi->next_offset;
332 fi->last_readdir = req;
333
334 if (req->r_reply_info.dir_end) {
335 kfree(fi->last_name);
336 fi->last_name = NULL;
337 fi->next_offset = 0;
338 } else {
339 rinfo = &req->r_reply_info;
340 err = note_last_dentry(fi,
341 rinfo->dir_dname[rinfo->dir_nr-1],
342 rinfo->dir_dname_len[rinfo->dir_nr-1]);
343 if (err)
344 return err;
345 fi->next_offset += rinfo->dir_nr;
346 }
347 }
348
349 rinfo = &fi->last_readdir->r_reply_info;
350 dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
351 rinfo->dir_nr, off, fi->offset);
352 while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
353 u64 pos = ceph_make_fpos(frag, off);
354 struct ceph_mds_reply_inode *in =
355 rinfo->dir_in[off - fi->offset].in;
356 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
357 off, off - fi->offset, rinfo->dir_nr, pos,
358 rinfo->dir_dname_len[off - fi->offset],
359 rinfo->dir_dname[off - fi->offset], in);
360 BUG_ON(!in);
361 ftype = le32_to_cpu(in->mode) >> 12;
362 if (filldir(dirent,
363 rinfo->dir_dname[off - fi->offset],
364 rinfo->dir_dname_len[off - fi->offset],
365 pos,
366 le64_to_cpu(in->ino),
367 ftype) < 0) {
368 dout("filldir stopping us...\n");
369 return 0;
370 }
371 off++;
372 filp->f_pos = pos + 1;
373 }
374
375 if (fi->last_name) {
376 ceph_mdsc_put_request(fi->last_readdir);
377 fi->last_readdir = NULL;
378 goto more;
379 }
380
381 /* more frags? */
382 if (!ceph_frag_is_rightmost(frag)) {
383 frag = ceph_frag_next(frag);
384 off = 0;
385 filp->f_pos = ceph_make_fpos(frag, off);
386 dout("readdir next frag is %x\n", frag);
387 goto more;
388 }
389 fi->at_end = 1;
390
391 /*
392 * if dir_release_count still matches the dir, no dentries
393 * were released during the whole readdir, and we should have
394 * the complete dir contents in our cache.
395 */
396 spin_lock(&inode->i_lock);
397 if (ci->i_release_count == fi->dir_release_count) {
398 dout(" marking %p complete\n", inode);
399 ci->i_ceph_flags |= CEPH_I_COMPLETE;
400 ci->i_max_offset = filp->f_pos;
401 }
402 spin_unlock(&inode->i_lock);
403
404 dout("readdir %p filp %p done.\n", inode, filp);
405 return 0;
406}
407
408static void reset_readdir(struct ceph_file_info *fi)
409{
410 if (fi->last_readdir) {
411 ceph_mdsc_put_request(fi->last_readdir);
412 fi->last_readdir = NULL;
413 }
414 kfree(fi->last_name);
415 fi->next_offset = 2; /* compensate for . and .. */
416 if (fi->dentry) {
417 dput(fi->dentry);
418 fi->dentry = NULL;
419 }
420 fi->at_end = 0;
421}
422
423static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
424{
425 struct ceph_file_info *fi = file->private_data;
426 struct inode *inode = file->f_mapping->host;
427 loff_t old_offset = file->f_pos;
428 loff_t retval;
429
430 mutex_lock(&inode->i_mutex);
431 switch (origin) {
432 case SEEK_END:
433 offset += inode->i_size + 2; /* FIXME */
434 break;
435 case SEEK_CUR:
436 offset += file->f_pos;
437 }
438 retval = -EINVAL;
439 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
440 if (offset != file->f_pos) {
441 file->f_pos = offset;
442 file->f_version = 0;
443 fi->at_end = 0;
444 }
445 retval = offset;
446
447 /*
448 * discard buffered readdir content on seekdir(0), or
449 * seek to new frag, or seek prior to current chunk.
450 */
451 if (offset == 0 ||
452 fpos_frag(offset) != fpos_frag(old_offset) ||
453 fpos_off(offset) < fi->offset) {
454 dout("dir_llseek dropping %p content\n", file);
455 reset_readdir(fi);
456 }
457
458 /* a forward seek may skip entries, so preclude I_COMPLETE */
459 if (offset > old_offset)
460 fi->dir_release_count--;
461 }
462 mutex_unlock(&inode->i_mutex);
463 return retval;
464}
465
466/*
467 * Process result of a lookup/open request.
468 *
469 * Mainly, make sure we return the final req->r_dentry (if it already
470 * existed) in place of the original VFS-provided dentry when they
471 * differ.
472 *
473 * Gracefully handle the case where the MDS replies with -ENOENT and
474 * no trace (which it may do, at its discretion, e.g., if it doesn't
475 * care to issue a lease on the negative dentry).
476 */
477struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
478 struct dentry *dentry, int err)
479{
480 struct ceph_client *client = ceph_client(dentry->d_sb);
481 struct inode *parent = dentry->d_parent->d_inode;
482
483 /* .snap dir? */
484 if (err == -ENOENT &&
485 ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
486 strcmp(dentry->d_name.name,
487 client->mount_args->snapdir_name) == 0) {
488 struct inode *inode = ceph_get_snapdir(parent);
489 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
490 dentry, dentry->d_name.len, dentry->d_name.name, inode);
491 d_add(dentry, inode);
492 err = 0;
493 }
494
495 if (err == -ENOENT) {
496 /* no trace? */
497 err = 0;
498 if (!req->r_reply_info.head->is_dentry) {
499 dout("ENOENT and no trace, dentry %p inode %p\n",
500 dentry, dentry->d_inode);
501 if (dentry->d_inode) {
502 d_drop(dentry);
503 err = -ENOENT;
504 } else {
505 d_add(dentry, NULL);
506 }
507 }
508 }
509 if (err)
510 dentry = ERR_PTR(err);
511 else if (dentry != req->r_dentry)
512 dentry = dget(req->r_dentry); /* we got spliced */
513 else
514 dentry = NULL;
515 return dentry;
516}
517
518static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
519{
520 return ceph_ino(inode) == CEPH_INO_ROOT &&
521 strncmp(dentry->d_name.name, ".ceph", 5) == 0;
522}
523
524/*
525 * Look up a single dir entry. If there is a lookup intent, inform
526 * the MDS so that it gets our 'caps wanted' value in a single op.
527 */
528static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
529 struct nameidata *nd)
530{
531 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
532 struct ceph_mds_client *mdsc = &client->mdsc;
533 struct ceph_mds_request *req;
534 int op;
535 int err;
536
537 dout("lookup %p dentry %p '%.*s'\n",
538 dir, dentry, dentry->d_name.len, dentry->d_name.name);
539
540 if (dentry->d_name.len > NAME_MAX)
541 return ERR_PTR(-ENAMETOOLONG);
542
543 err = ceph_init_dentry(dentry);
544 if (err < 0)
545 return ERR_PTR(err);
546
547 /* open (but not create!) intent? */
548 if (nd &&
549 (nd->flags & LOOKUP_OPEN) &&
550 (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */
551 !(nd->intent.open.flags & O_CREAT)) {
552 int mode = nd->intent.open.create_mode & ~current->fs->umask;
553 return ceph_lookup_open(dir, dentry, nd, mode, 1);
554 }
555
556 /* can we conclude ENOENT locally? */
557 if (dentry->d_inode == NULL) {
558 struct ceph_inode_info *ci = ceph_inode(dir);
559 struct ceph_dentry_info *di = ceph_dentry(dentry);
560
561 spin_lock(&dir->i_lock);
562 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
563 if (strncmp(dentry->d_name.name,
564 client->mount_args->snapdir_name,
565 dentry->d_name.len) &&
566 !is_root_ceph_dentry(dir, dentry) &&
567 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
568 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
569 di->offset = ci->i_max_offset++;
570 spin_unlock(&dir->i_lock);
571 dout(" dir %p complete, -ENOENT\n", dir);
572 d_add(dentry, NULL);
573 di->lease_shared_gen = ci->i_shared_gen;
574 return NULL;
575 }
576 spin_unlock(&dir->i_lock);
577 }
578
579 op = ceph_snap(dir) == CEPH_SNAPDIR ?
580 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
581 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
582 if (IS_ERR(req))
583 return ERR_PTR(PTR_ERR(req));
584 req->r_dentry = dget(dentry);
585 req->r_num_caps = 2;
586 /* we only need inode linkage */
587 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
588 req->r_locked_dir = dir;
589 err = ceph_mdsc_do_request(mdsc, NULL, req);
590 dentry = ceph_finish_lookup(req, dentry, err);
591 ceph_mdsc_put_request(req); /* will dput(dentry) */
592 dout("lookup result=%p\n", dentry);
593 return dentry;
594}
595
596/*
597 * If we do a create but get no trace back from the MDS, follow up with
598 * a lookup (the VFS expects us to link up the provided dentry).
599 */
600int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
601{
602 struct dentry *result = ceph_lookup(dir, dentry, NULL);
603
604 if (result && !IS_ERR(result)) {
605 /*
606 * We created the item, then did a lookup, and found
607 * it was already linked to another inode we already
608 * had in our cache (and thus got spliced). Link our
609 * dentry to that inode, but don't hash it, just in
610 * case the VFS wants to dereference it.
611 */
612 BUG_ON(!result->d_inode);
613 d_instantiate(dentry, result->d_inode);
614 return 0;
615 }
616 return PTR_ERR(result);
617}
618
619static int ceph_mknod(struct inode *dir, struct dentry *dentry,
620 int mode, dev_t rdev)
621{
622 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
623 struct ceph_mds_client *mdsc = &client->mdsc;
624 struct ceph_mds_request *req;
625 int err;
626
627 if (ceph_snap(dir) != CEPH_NOSNAP)
628 return -EROFS;
629
630 dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
631 dir, dentry, mode, rdev);
632 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
633 if (IS_ERR(req)) {
634 d_drop(dentry);
635 return PTR_ERR(req);
636 }
637 req->r_dentry = dget(dentry);
638 req->r_num_caps = 2;
639 req->r_locked_dir = dir;
640 req->r_args.mknod.mode = cpu_to_le32(mode);
641 req->r_args.mknod.rdev = cpu_to_le32(rdev);
642 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
643 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
644 err = ceph_mdsc_do_request(mdsc, dir, req);
645 if (!err && !req->r_reply_info.head->is_dentry)
646 err = ceph_handle_notrace_create(dir, dentry);
647 ceph_mdsc_put_request(req);
648 if (err)
649 d_drop(dentry);
650 return err;
651}
652
653static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
654 struct nameidata *nd)
655{
656 dout("create in dir %p dentry %p name '%.*s'\n",
657 dir, dentry, dentry->d_name.len, dentry->d_name.name);
658
659 if (ceph_snap(dir) != CEPH_NOSNAP)
660 return -EROFS;
661
662 if (nd) {
663 BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
664 dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
665 /* hrm, what should i do here if we get aliased? */
666 if (IS_ERR(dentry))
667 return PTR_ERR(dentry);
668 return 0;
669 }
670
671 /* fall back to mknod */
672 return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
673}
674
675static int ceph_symlink(struct inode *dir, struct dentry *dentry,
676 const char *dest)
677{
678 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
679 struct ceph_mds_client *mdsc = &client->mdsc;
680 struct ceph_mds_request *req;
681 int err;
682
683 if (ceph_snap(dir) != CEPH_NOSNAP)
684 return -EROFS;
685
686 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
687 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
688 if (IS_ERR(req)) {
689 d_drop(dentry);
690 return PTR_ERR(req);
691 }
692 req->r_dentry = dget(dentry);
693 req->r_num_caps = 2;
694 req->r_path2 = kstrdup(dest, GFP_NOFS);
695 req->r_locked_dir = dir;
696 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
697 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
698 err = ceph_mdsc_do_request(mdsc, dir, req);
699 if (!err && !req->r_reply_info.head->is_dentry)
700 err = ceph_handle_notrace_create(dir, dentry);
701 ceph_mdsc_put_request(req);
702 if (err)
703 d_drop(dentry);
704 return err;
705}
706
707static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
708{
709 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
710 struct ceph_mds_client *mdsc = &client->mdsc;
711 struct ceph_mds_request *req;
712 int err = -EROFS;
713 int op;
714
715 if (ceph_snap(dir) == CEPH_SNAPDIR) {
716 /* mkdir .snap/foo is a MKSNAP */
717 op = CEPH_MDS_OP_MKSNAP;
718 dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
719 dentry->d_name.len, dentry->d_name.name, dentry);
720 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
721 dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
722 op = CEPH_MDS_OP_MKDIR;
723 } else {
724 goto out;
725 }
726 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
727 if (IS_ERR(req)) {
728 err = PTR_ERR(req);
729 goto out;
730 }
731
732 req->r_dentry = dget(dentry);
733 req->r_num_caps = 2;
734 req->r_locked_dir = dir;
735 req->r_args.mkdir.mode = cpu_to_le32(mode);
736 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
737 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
738 err = ceph_mdsc_do_request(mdsc, dir, req);
739 if (!err && !req->r_reply_info.head->is_dentry)
740 err = ceph_handle_notrace_create(dir, dentry);
741 ceph_mdsc_put_request(req);
742out:
743 if (err < 0)
744 d_drop(dentry);
745 return err;
746}
747
748static int ceph_link(struct dentry *old_dentry, struct inode *dir,
749 struct dentry *dentry)
750{
751 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
752 struct ceph_mds_client *mdsc = &client->mdsc;
753 struct ceph_mds_request *req;
754 int err;
755
756 if (ceph_snap(dir) != CEPH_NOSNAP)
757 return -EROFS;
758
759 dout("link in dir %p old_dentry %p dentry %p\n", dir,
760 old_dentry, dentry);
761 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
762 if (IS_ERR(req)) {
763 d_drop(dentry);
764 return PTR_ERR(req);
765 }
766 req->r_dentry = dget(dentry);
767 req->r_num_caps = 2;
768 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
769 req->r_locked_dir = dir;
770 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
771 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
772 err = ceph_mdsc_do_request(mdsc, dir, req);
773 if (err)
774 d_drop(dentry);
775 else if (!req->r_reply_info.head->is_dentry)
776 d_instantiate(dentry, igrab(old_dentry->d_inode));
777 ceph_mdsc_put_request(req);
778 return err;
779}
780
781/*
782 * For a soon-to-be unlinked file, drop the LINK caps. If it
783 * looks like the link count will hit 0, drop any other caps (other
784 * than PIN) we don't specifically want (due to the file still being
785 * open).
786 */
787static int drop_caps_for_unlink(struct inode *inode)
788{
789 struct ceph_inode_info *ci = ceph_inode(inode);
790 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
791
792 spin_lock(&inode->i_lock);
793 if (inode->i_nlink == 1) {
794 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
795 ci->i_ceph_flags |= CEPH_I_NODELAY;
796 }
797 spin_unlock(&inode->i_lock);
798 return drop;
799}
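
A worked example of the mask arithmetic above, using hypothetical single-bit stand-ins for the CEPH_CAP_* constants (the real values are defined elsewhere in this series): start from the LINK caps and, when the link count will hit zero, widen the drop set to everything not wanted and not PIN.

#include <stdio.h>

/* hypothetical single-bit stand-ins for CEPH_CAP_* */
#define CAP_PIN		0x01
#define CAP_LINK_SHARED	0x02
#define CAP_LINK_EXCL	0x04
#define CAP_FILE_RD	0x08
#define CAP_FILE_WR	0x10

int main(void)
{
	int wanted = CAP_FILE_RD;	/* file is still open for read */
	int drop = CAP_LINK_SHARED | CAP_LINK_EXCL;

	/* link count will hit 0: drop all caps but the wanted set and PIN */
	drop |= ~(wanted | CAP_PIN);

	printf("drop mask 0x%x, keeping 0x%x\n",
	       (unsigned)drop, (unsigned)(~drop & (wanted | CAP_PIN)));
	return 0;
}
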
800
801/*
802 * rmdir and unlink differ only in the metadata op code
803 */
804static int ceph_unlink(struct inode *dir, struct dentry *dentry)
805{
806 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
807 struct ceph_mds_client *mdsc = &client->mdsc;
808 struct inode *inode = dentry->d_inode;
809 struct ceph_mds_request *req;
810 int err = -EROFS;
811 int op;
812
813 if (ceph_snap(dir) == CEPH_SNAPDIR) {
814 /* rmdir .snap/foo is RMSNAP */
815 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
816 dentry->d_name.name, dentry);
817 op = CEPH_MDS_OP_RMSNAP;
818 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
819 dout("unlink/rmdir dir %p dn %p inode %p\n",
820 dir, dentry, inode);
821 op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
822 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
823 } else
824 goto out;
825 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
826 if (IS_ERR(req)) {
827 err = PTR_ERR(req);
828 goto out;
829 }
830 req->r_dentry = dget(dentry);
831 req->r_num_caps = 2;
832 req->r_locked_dir = dir;
833 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
834 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
835 req->r_inode_drop = drop_caps_for_unlink(inode);
836 err = ceph_mdsc_do_request(mdsc, dir, req);
837 if (!err && !req->r_reply_info.head->is_dentry)
838 d_delete(dentry);
839 ceph_mdsc_put_request(req);
840out:
841 return err;
842}
843
844static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
845 struct inode *new_dir, struct dentry *new_dentry)
846{
847 struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
848 struct ceph_mds_client *mdsc = &client->mdsc;
849 struct ceph_mds_request *req;
850 int err;
851
852 if (ceph_snap(old_dir) != ceph_snap(new_dir))
853 return -EXDEV;
854 if (ceph_snap(old_dir) != CEPH_NOSNAP ||
855 ceph_snap(new_dir) != CEPH_NOSNAP)
856 return -EROFS;
857 dout("rename dir %p dentry %p to dir %p dentry %p\n",
858 old_dir, old_dentry, new_dir, new_dentry);
859 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
860 if (IS_ERR(req))
861 return PTR_ERR(req);
862 req->r_dentry = dget(new_dentry);
863 req->r_num_caps = 2;
864 req->r_old_dentry = dget(old_dentry);
865 req->r_locked_dir = new_dir;
866 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
867 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
868 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
869 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
870 /* release LINK_RDCACHE on source inode (mds will lock it) */
871 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
872 if (new_dentry->d_inode)
873 req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
874 err = ceph_mdsc_do_request(mdsc, old_dir, req);
875 if (!err && !req->r_reply_info.head->is_dentry) {
876 /*
877 * Normally d_move() is done by fill_trace (called by
878 * do_request, above). If there is no trace, we need
879 * to do it here.
880 */
881 d_move(old_dentry, new_dentry);
882 }
883 ceph_mdsc_put_request(req);
884 return err;
885}
886
887
888/*
889 * Check if dentry lease is valid. If not, delete the lease. Try to
890 * renew if the lease is more than half up.
891 */
892static int dentry_lease_is_valid(struct dentry *dentry)
893{
894 struct ceph_dentry_info *di;
895 struct ceph_mds_session *s;
896 int valid = 0;
897 u32 gen;
898 unsigned long ttl;
899 struct ceph_mds_session *session = NULL;
900 struct inode *dir = NULL;
901 u32 seq = 0;
902
903 spin_lock(&dentry->d_lock);
904 di = ceph_dentry(dentry);
905 if (di && di->lease_session) {
906 s = di->lease_session;
907 spin_lock(&s->s_cap_lock);
908 gen = s->s_cap_gen;
909 ttl = s->s_cap_ttl;
910 spin_unlock(&s->s_cap_lock);
911
912 if (di->lease_gen == gen &&
913 time_before(jiffies, dentry->d_time) &&
914 time_before(jiffies, ttl)) {
915 valid = 1;
916 if (di->lease_renew_after &&
917 time_after(jiffies, di->lease_renew_after)) {
918 /* we should renew */
919 dir = dentry->d_parent->d_inode;
920 session = ceph_get_mds_session(s);
921 seq = di->lease_seq;
922 di->lease_renew_after = 0;
923 di->lease_renew_from = jiffies;
924 }
925 }
926 }
927 spin_unlock(&dentry->d_lock);
928
929 if (session) {
930 ceph_mdsc_lease_send_msg(session, dir, dentry,
931 CEPH_MDS_LEASE_RENEW, seq);
932 ceph_put_mds_session(session);
933 }
934 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
935 return valid;
936}
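
The renewal trigger above fires once a lease is more than half way through its lifetime (past lease_renew_after), while expiry is still checked against the session TTL. A hedged userspace sketch of that schedule; the field names are hypothetical and time is a plain millisecond counter rather than jiffies:

#include <stdio.h>

struct lease {
	long issued_ms;		/* when the lease was granted */
	long ttl_ms;		/* lease duration */
	long renew_after_ms;	/* renew once past this point; 0 = in flight */
};

static void lease_init(struct lease *l, long now_ms, long ttl_ms)
{
	l->issued_ms = now_ms;
	l->ttl_ms = ttl_ms;
	l->renew_after_ms = now_ms + ttl_ms / 2;	/* half up */
}

static int lease_check(struct lease *l, long now_ms)
{
	if (now_ms >= l->issued_ms + l->ttl_ms)
		return 0;				/* expired */
	if (l->renew_after_ms && now_ms > l->renew_after_ms) {
		printf("  still valid, sending renewal\n");
		l->renew_after_ms = 0;	/* only one renewal in flight */
	}
	return 1;
}

int main(void)
{
	struct lease l;

	lease_init(&l, 0, 30000);	/* 30 second lease */
	printf("t=10s -> valid=%d\n", lease_check(&l, 10000));
	printf("t=20s -> valid=%d\n", lease_check(&l, 20000));
	printf("t=40s -> valid=%d\n", lease_check(&l, 40000));
	return 0;
}
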
937
938/*
939 * Check if directory-wide content lease/cap is valid.
940 */
941static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
942{
943 struct ceph_inode_info *ci = ceph_inode(dir);
944 struct ceph_dentry_info *di = ceph_dentry(dentry);
945 int valid = 0;
946
947 spin_lock(&dir->i_lock);
948 if (ci->i_shared_gen == di->lease_shared_gen)
949 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
950 spin_unlock(&dir->i_lock);
951 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
952 dir, (unsigned)ci->i_shared_gen, dentry,
953 (unsigned)di->lease_shared_gen, valid);
954 return valid;
955}
956
957/*
958 * Check if cached dentry can be trusted.
959 */
960static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
961{
962 struct inode *dir = dentry->d_parent->d_inode;
963
964 dout("d_revalidate %p '%.*s' inode %p\n", dentry,
965 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
966
967 /* always trust cached snapped dentries, snapdir dentry */
968 if (ceph_snap(dir) != CEPH_NOSNAP) {
969 dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
970 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
971 goto out_touch;
972 }
973 if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
974 goto out_touch;
975
976 if (dentry_lease_is_valid(dentry) ||
977 dir_lease_is_valid(dir, dentry))
978 goto out_touch;
979
980 dout("d_revalidate %p invalid\n", dentry);
981 d_drop(dentry);
982 return 0;
983out_touch:
984 ceph_dentry_lru_touch(dentry);
985 return 1;
986}
987
988/*
989 * When a dentry is released, clear the dir I_COMPLETE if it was part
990 * of the current dir gen.
991 */
992static void ceph_dentry_release(struct dentry *dentry)
993{
994 struct ceph_dentry_info *di = ceph_dentry(dentry);
995 struct inode *parent_inode = dentry->d_parent->d_inode;
996
997 if (parent_inode) {
998 struct ceph_inode_info *ci = ceph_inode(parent_inode);
999
1000 spin_lock(&parent_inode->i_lock);
1001 if (ci->i_shared_gen == di->lease_shared_gen) {
1002 dout(" clearing %p complete (d_release)\n",
1003 parent_inode);
1004 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1005 ci->i_release_count++;
1006 }
1007 spin_unlock(&parent_inode->i_lock);
1008 }
1009 if (di) {
1010 ceph_dentry_lru_del(dentry);
1011 if (di->lease_session)
1012 ceph_put_mds_session(di->lease_session);
1013 kmem_cache_free(ceph_dentry_cachep, di);
1014 dentry->d_fsdata = NULL;
1015 }
1016}
1017
1018static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1019 struct nameidata *nd)
1020{
1021 /*
1022 * Eventually, we'll want to revalidate snapped metadata
1023 * too... probably...
1024 */
1025 return 1;
1026}
1027
1028
1029
1030/*
1031 * read() on a dir. This weird interface hack only works if mounted
1032 * with '-o dirstat'.
1033 */
1034static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1035 loff_t *ppos)
1036{
1037 struct ceph_file_info *cf = file->private_data;
1038 struct inode *inode = file->f_dentry->d_inode;
1039 struct ceph_inode_info *ci = ceph_inode(inode);
1040 int left;
1041
1042 if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
1043 return -EISDIR;
1044
1045 if (!cf->dir_info) {
1046 cf->dir_info = kmalloc(1024, GFP_NOFS);
1047 if (!cf->dir_info)
1048 return -ENOMEM;
1049 cf->dir_info_len =
1050 sprintf(cf->dir_info,
1051 "entries: %20lld\n"
1052 " files: %20lld\n"
1053 " subdirs: %20lld\n"
1054 "rentries: %20lld\n"
1055 " rfiles: %20lld\n"
1056 " rsubdirs: %20lld\n"
1057 "rbytes: %20lld\n"
1058 "rctime: %10ld.%09ld\n",
1059 ci->i_files + ci->i_subdirs,
1060 ci->i_files,
1061 ci->i_subdirs,
1062 ci->i_rfiles + ci->i_rsubdirs,
1063 ci->i_rfiles,
1064 ci->i_rsubdirs,
1065 ci->i_rbytes,
1066 (long)ci->i_rctime.tv_sec,
1067 (long)ci->i_rctime.tv_nsec);
1068 }
1069
1070 if (*ppos >= cf->dir_info_len)
1071 return 0;
1072 size = min_t(unsigned, size, cf->dir_info_len-*ppos);
1073 left = copy_to_user(buf, cf->dir_info + *ppos, size);
1074 if (left == size)
1075 return -EFAULT;
1076 *ppos += (size - left);
1077 return size - left;
1078}
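
ceph_read_dir() is a standard partial-read loop over a preformatted in-memory buffer: clamp the request to what remains past *ppos, copy, and advance the position by what was actually copied. A userspace analogue, with memcpy standing in for copy_to_user:

#include <stdio.h>
#include <string.h>

/* read up to size bytes of info[0..info_len) starting at *ppos */
static long read_buf(const char *info, size_t info_len,
		     char *buf, size_t size, long *ppos)
{
	if ((size_t)*ppos >= info_len)
		return 0;			/* EOF */
	if (size > info_len - (size_t)*ppos)
		size = info_len - (size_t)*ppos;
	memcpy(buf, info + *ppos, size);	/* copy_to_user() in-kernel */
	*ppos += size;
	return (long)size;
}

int main(void)
{
	const char info[] = "entries: 42\nfiles: 40\nsubdirs: 2\n";
	char buf[16];
	long pos = 0, n;

	while ((n = read_buf(info, sizeof(info) - 1, buf, sizeof(buf), &pos)) > 0)
		printf("read %ld bytes (pos now %ld)\n", n, pos);
	return 0;
}
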
1079
1080/*
1081 * an fsync() on a dir will wait for any uncommitted directory
1082 * operations to commit.
1083 */
1084static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
1085 int datasync)
1086{
1087 struct inode *inode = dentry->d_inode;
1088 struct ceph_inode_info *ci = ceph_inode(inode);
1089 struct list_head *head = &ci->i_unsafe_dirops;
1090 struct ceph_mds_request *req;
1091 u64 last_tid;
1092 int ret = 0;
1093
1094 dout("dir_fsync %p\n", inode);
1095 spin_lock(&ci->i_unsafe_lock);
1096 if (list_empty(head))
1097 goto out;
1098
1099 req = list_entry(head->prev,
1100 struct ceph_mds_request, r_unsafe_dir_item);
1101 last_tid = req->r_tid;
1102
1103 do {
1104 ceph_mdsc_get_request(req);
1105 spin_unlock(&ci->i_unsafe_lock);
1106 dout("dir_fsync %p wait on tid %llu (until %llu)\n",
1107 inode, req->r_tid, last_tid);
1108 if (req->r_timeout) {
1109 ret = wait_for_completion_timeout(
1110 &req->r_safe_completion, req->r_timeout);
1111 if (ret > 0)
1112 ret = 0;
1113 else if (ret == 0)
1114 ret = -EIO; /* timed out */
1115 } else {
1116 wait_for_completion(&req->r_safe_completion);
1117 }
1118 spin_lock(&ci->i_unsafe_lock);
1119 ceph_mdsc_put_request(req);
1120
1121 if (ret || list_empty(head))
1122 break;
1123 req = list_entry(head->next,
1124 struct ceph_mds_request, r_unsafe_dir_item);
1125 } while (req->r_tid < last_tid);
1126out:
1127 spin_unlock(&ci->i_unsafe_lock);
1128 return ret;
1129}
1130
1131/*
1132 * We maintain a private dentry LRU.
1133 *
1134 * FIXME: this needs to be changed to a per-mds lru to be useful.
1135 */
1136void ceph_dentry_lru_add(struct dentry *dn)
1137{
1138 struct ceph_dentry_info *di = ceph_dentry(dn);
1139 struct ceph_mds_client *mdsc;
1140
1141 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1142 dn->d_name.len, dn->d_name.name);
1143 if (di) {
1144 mdsc = &ceph_client(dn->d_sb)->mdsc;
1145 spin_lock(&mdsc->dentry_lru_lock);
1146 list_add_tail(&di->lru, &mdsc->dentry_lru);
1147 mdsc->num_dentry++;
1148 spin_unlock(&mdsc->dentry_lru_lock);
1149 }
1150}
1151
1152void ceph_dentry_lru_touch(struct dentry *dn)
1153{
1154 struct ceph_dentry_info *di = ceph_dentry(dn);
1155 struct ceph_mds_client *mdsc;
1156
1157 dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
1158 dn->d_name.len, dn->d_name.name);
1159 if (di) {
1160 mdsc = &ceph_client(dn->d_sb)->mdsc;
1161 spin_lock(&mdsc->dentry_lru_lock);
1162 list_move_tail(&di->lru, &mdsc->dentry_lru);
1163 spin_unlock(&mdsc->dentry_lru_lock);
1164 }
1165}
1166
1167void ceph_dentry_lru_del(struct dentry *dn)
1168{
1169 struct ceph_dentry_info *di = ceph_dentry(dn);
1170 struct ceph_mds_client *mdsc;
1171
1172 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1173 dn->d_name.len, dn->d_name.name);
1174 if (di) {
1175 mdsc = &ceph_client(dn->d_sb)->mdsc;
1176 spin_lock(&mdsc->dentry_lru_lock);
1177 list_del_init(&di->lru);
1178 mdsc->num_dentry--;
1179 spin_unlock(&mdsc->dentry_lru_lock);
1180 }
1181}
1182
1183const struct file_operations ceph_dir_fops = {
1184 .read = ceph_read_dir,
1185 .readdir = ceph_readdir,
1186 .llseek = ceph_dir_llseek,
1187 .open = ceph_open,
1188 .release = ceph_release,
1189 .unlocked_ioctl = ceph_ioctl,
1190 .fsync = ceph_dir_fsync,
1191};
1192
1193const struct inode_operations ceph_dir_iops = {
1194 .lookup = ceph_lookup,
1195 .permission = ceph_permission,
1196 .getattr = ceph_getattr,
1197 .setattr = ceph_setattr,
1198 .setxattr = ceph_setxattr,
1199 .getxattr = ceph_getxattr,
1200 .listxattr = ceph_listxattr,
1201 .removexattr = ceph_removexattr,
1202 .mknod = ceph_mknod,
1203 .symlink = ceph_symlink,
1204 .mkdir = ceph_mkdir,
1205 .link = ceph_link,
1206 .unlink = ceph_unlink,
1207 .rmdir = ceph_unlink,
1208 .rename = ceph_rename,
1209 .create = ceph_create,
1210};
1211
1212struct dentry_operations ceph_dentry_ops = {
1213 .d_revalidate = ceph_d_revalidate,
1214 .d_release = ceph_dentry_release,
1215};
1216
1217struct dentry_operations ceph_snapdir_dentry_ops = {
1218 .d_revalidate = ceph_snapdir_d_revalidate,
1219};
1220
1221struct dentry_operations ceph_snap_dentry_ops = {
1222};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
new file mode 100644
index 000000000000..fc68e39cbad6
--- /dev/null
+++ b/fs/ceph/export.c
@@ -0,0 +1,223 @@
1#include "ceph_debug.h"
2
3#include <linux/exportfs.h>
4#include <asm/unaligned.h>
5
6#include "super.h"
7
8/*
9 * NFS export support
10 *
11 * NFS re-export of a ceph mount is, at present, only semireliable.
12 * The basic issue is that the Ceph architecture doesn't lend itself
13 * well to generating filehandles that will remain valid forever.
14 *
15 * So, we do our best. If you're lucky, your inode will be in the
16 * client's cache. If it's not, and you have a connectable fh, then
17 * the MDS server may be able to find it for you. Otherwise, you get
18 * ESTALE.
19 *
20 * There are ways to make this more reliable, but in the non-connectable
21 * fh case, we won't ever work perfectly, and in the connectable case,
22 * some changes are needed on the MDS side to work better.
23 */
24
25/*
26 * Basic fh
27 */
28struct ceph_nfs_fh {
29 u64 ino;
30} __attribute__ ((packed));
31
32/*
33 * Larger 'connectable' fh that includes parent ino and name hash.
34 * Use this whenever possible, as it works more reliably.
35 */
36struct ceph_nfs_confh {
37 u64 ino, parent_ino;
38 u32 parent_name_hash;
39} __attribute__ ((packed));
40
41static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
42 int connectable)
43{
44 struct ceph_nfs_fh *fh = (void *)rawfh;
45 struct ceph_nfs_confh *cfh = (void *)rawfh;
46 struct dentry *parent = dentry->d_parent;
47 struct inode *inode = dentry->d_inode;
48 int type;
49
50 /* don't re-export snaps */
51 if (ceph_snap(inode) != CEPH_NOSNAP)
52 return -EINVAL;
53
54 if (*max_len >= sizeof(*cfh)) {
55 dout("encode_fh %p connectable\n", dentry);
56 cfh->ino = ceph_ino(dentry->d_inode);
57 cfh->parent_ino = ceph_ino(parent->d_inode);
58 cfh->parent_name_hash = parent->d_name.hash;
59 *max_len = sizeof(*cfh);
60 type = 2;
61 } else if (*max_len > sizeof(*fh)) {
62 if (connectable)
63 return -ENOSPC;
64 dout("encode_fh %p\n", dentry);
65 fh->ino = ceph_ino(dentry->d_inode);
66 *max_len = sizeof(*fh);
67 type = 1;
68 } else {
69 return -ENOSPC;
70 }
71 return type;
72}
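
The encode path above picks between the two filehandle layouts based on the space the caller offers via *max_len. A simplified userspace sketch of that size negotiation, with dummy inode numbers; the connectable/plain/ENOSPC structure mirrors the function above:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct nfs_fh { uint64_t ino; } __attribute__ ((packed));
struct nfs_confh {
	uint64_t ino, parent_ino;
	uint32_t parent_name_hash;
} __attribute__ ((packed));

/* returns the fh type (1 = plain, 2 = connectable), or -1 for "ENOSPC" */
static int encode_fh(void *rawfh, size_t *max_len, int connectable)
{
	if (*max_len >= sizeof(struct nfs_confh)) {
		struct nfs_confh cfh = { 1, 2, 0xabcd };	/* dummy values */

		memcpy(rawfh, &cfh, sizeof(cfh));
		*max_len = sizeof(cfh);
		return 2;
	}
	if (*max_len >= sizeof(struct nfs_fh) && !connectable) {
		struct nfs_fh fh = { 1 };			/* dummy ino */

		memcpy(rawfh, &fh, sizeof(fh));
		*max_len = sizeof(fh);
		return 1;
	}
	return -1;	/* too small, or a connectable fh wouldn't fit */
}

int main(void)
{
	char raw[64];
	size_t len = sizeof(raw);

	printf("type %d, %zu bytes\n", encode_fh(raw, &len, 0), len);
	len = sizeof(struct nfs_fh);
	printf("type %d, %zu bytes\n", encode_fh(raw, &len, 0), len);
	return 0;
}
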
73
74/*
75 * convert regular fh to dentry
76 *
77 * FIXME: we should try harder by querying the mds for the ino.
78 */
79static struct dentry *__fh_to_dentry(struct super_block *sb,
80 struct ceph_nfs_fh *fh)
81{
82 struct inode *inode;
83 struct dentry *dentry;
84 struct ceph_vino vino;
85 int err;
86
87 dout("__fh_to_dentry %llx\n", fh->ino);
88 vino.ino = fh->ino;
89 vino.snap = CEPH_NOSNAP;
90 inode = ceph_find_inode(sb, vino);
91 if (!inode)
92 return ERR_PTR(-ESTALE);
93
94 dentry = d_obtain_alias(inode);
95 if (!dentry) {
96 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
97 fh->ino, inode);
98 iput(inode);
99 return ERR_PTR(-ENOMEM);
100 }
101 err = ceph_init_dentry(dentry);
102
103 if (err < 0) {
104 iput(inode);
105 return ERR_PTR(err);
106 }
107 dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
108 return dentry;
109}
110
111/*
112 * convert connectable fh to dentry
113 */
114static struct dentry *__cfh_to_dentry(struct super_block *sb,
115 struct ceph_nfs_confh *cfh)
116{
117 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
118 struct inode *inode;
119 struct dentry *dentry;
120 struct ceph_vino vino;
121 int err;
122
123 dout("__cfh_to_dentry %llx (%llx/%x)\n",
124 cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
125
126 vino.ino = cfh->ino;
127 vino.snap = CEPH_NOSNAP;
128 inode = ceph_find_inode(sb, vino);
129 if (!inode) {
130 struct ceph_mds_request *req;
131
132 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
133 USE_ANY_MDS);
134 if (IS_ERR(req))
135 return ERR_PTR(PTR_ERR(req));
136
137 req->r_ino1 = vino;
138 req->r_ino2.ino = cfh->parent_ino;
139 req->r_ino2.snap = CEPH_NOSNAP;
140 req->r_path2 = kmalloc(16, GFP_NOFS);
141 snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
142 req->r_num_caps = 1;
143 err = ceph_mdsc_do_request(mdsc, NULL, req);
144 ceph_mdsc_put_request(req);
145 inode = ceph_find_inode(sb, vino);
146 if (!inode)
147 return ERR_PTR(err ? err : -ESTALE);
148 }
149
150 dentry = d_obtain_alias(inode);
151 if (!dentry) {
152 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
153 cfh->ino, inode);
154 iput(inode);
155 return ERR_PTR(-ENOMEM);
156 }
157 err = ceph_init_dentry(dentry);
158 if (err < 0) {
159 iput(inode);
160 return ERR_PTR(err);
161 }
162 dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
163 return dentry;
164}
165
166static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
167 int fh_len, int fh_type)
168{
169 if (fh_type == 1)
170 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw);
171 else
172 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw);
173}
174
175/*
176 * get parent, if possible.
177 *
178 * FIXME: we could do better by querying the mds to discover the
179 * parent.
180 */
181static struct dentry *ceph_fh_to_parent(struct super_block *sb,
182 struct fid *fid,
183 int fh_len, int fh_type)
184{
185 struct ceph_nfs_confh *cfh = (void *)fid->raw;
186 struct ceph_vino vino;
187 struct inode *inode;
188 struct dentry *dentry;
189 int err;
190
191 if (fh_type == 1)
192 return ERR_PTR(-ESTALE);
193
194 pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
195 cfh->parent_name_hash);
196
197 vino.ino = cfh->ino;
198 vino.snap = CEPH_NOSNAP;
199 inode = ceph_find_inode(sb, vino);
200 if (!inode)
201 return ERR_PTR(-ESTALE);
202
203 dentry = d_obtain_alias(inode);
204 if (!dentry) {
205 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
206 cfh->ino, inode);
207 iput(inode);
208 return ERR_PTR(-ENOMEM);
209 }
210 err = ceph_init_dentry(dentry);
211 if (err < 0) {
212 iput(inode);
213 return ERR_PTR(err);
214 }
215 dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry);
216 return dentry;
217}
218
219const struct export_operations ceph_export_ops = {
220 .encode_fh = ceph_encode_fh,
221 .fh_to_dentry = ceph_fh_to_dentry,
222 .fh_to_parent = ceph_fh_to_parent,
223};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
new file mode 100644
index 000000000000..5d2af8464f6a
--- /dev/null
+++ b/fs/ceph/file.c
@@ -0,0 +1,937 @@
1#include "ceph_debug.h"
2
3#include <linux/sched.h>
4#include <linux/file.h>
5#include <linux/namei.h>
6#include <linux/writeback.h>
7
8#include "super.h"
9#include "mds_client.h"
10
11/*
12 * Ceph file operations
13 *
14 * Implement basic open/close functionality, and implement
15 * read/write.
16 *
17 * We implement three modes of file I/O:
18 * - buffered uses the generic_file_aio_{read,write} helpers
19 *
20 * - synchronous is used when there is multi-client read/write
21 * sharing, avoids the page cache, and synchronously waits for an
22 * ack from the OSD.
23 *
24 * - direct io takes the variant of the sync path that references
25 * user pages directly.
26 *
27 * fsync() flushes and waits on dirty pages, but just queues metadata
28 * for writeback: since the MDS can recover size and mtime there is no
29 * need to wait for MDS acknowledgement.
30 */
31
32
33/*
34 * Prepare an open request. Preallocate ceph_cap to avoid an
35 * inopportune ENOMEM later.
36 */
37static struct ceph_mds_request *
38prepare_open_request(struct super_block *sb, int flags, int create_mode)
39{
40 struct ceph_client *client = ceph_sb_to_client(sb);
41 struct ceph_mds_client *mdsc = &client->mdsc;
42 struct ceph_mds_request *req;
43 int want_auth = USE_ANY_MDS;
44 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
45
46 if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
47 want_auth = USE_AUTH_MDS;
48
49 req = ceph_mdsc_create_request(mdsc, op, want_auth);
50 if (IS_ERR(req))
51 goto out;
52 req->r_fmode = ceph_flags_to_mode(flags);
53 req->r_args.open.flags = cpu_to_le32(flags);
54 req->r_args.open.mode = cpu_to_le32(create_mode);
55 req->r_args.open.preferred = cpu_to_le32(-1);
56out:
57 return req;
58}
59
60/*
61 * initialize private struct file data.
62 * if we fail, clean up by dropping fmode reference on the ceph_inode
63 */
64static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
65{
66 struct ceph_file_info *cf;
67 int ret = 0;
68
69 switch (inode->i_mode & S_IFMT) {
70 case S_IFREG:
71 case S_IFDIR:
72 dout("init_file %p %p 0%o (regular)\n", inode, file,
73 inode->i_mode);
74 cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
75 if (cf == NULL) {
76 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
77 return -ENOMEM;
78 }
79 cf->fmode = fmode;
80 cf->next_offset = 2;
81 file->private_data = cf;
82 BUG_ON(inode->i_fop->release != ceph_release);
83 break;
84
85 case S_IFLNK:
86 dout("init_file %p %p 0%o (symlink)\n", inode, file,
87 inode->i_mode);
88 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
89 break;
90
91 default:
92 dout("init_file %p %p 0%o (special)\n", inode, file,
93 inode->i_mode);
94 /*
95 * we need to drop the open ref now, since we don't
96 * have .release set to ceph_release.
97 */
98 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
99 BUG_ON(inode->i_fop->release == ceph_release);
100
101 /* call the proper open fop */
102 ret = inode->i_fop->open(inode, file);
103 }
104 return ret;
105}
106
107/*
108 * If the filp already has private_data, that means the file was
109 * already opened by intent during lookup, and we do nothing.
110 *
111 * If we already have the requisite capabilities, we can satisfy
112 * the open request locally (no need to request new caps from the
113 * MDS). We do, however, need to inform the MDS (asynchronously)
114 * if our wanted caps set expands.
115 */
116int ceph_open(struct inode *inode, struct file *file)
117{
118 struct ceph_inode_info *ci = ceph_inode(inode);
119 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
120 struct ceph_mds_client *mdsc = &client->mdsc;
121 struct ceph_mds_request *req;
122 struct ceph_file_info *cf = file->private_data;
123 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
124 int err;
125 int flags, fmode, wanted;
126
127 if (cf) {
128 dout("open file %p is already opened\n", file);
129 return 0;
130 }
131
132 /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */
133 flags = file->f_flags & ~(O_CREAT|O_EXCL);
134 if (S_ISDIR(inode->i_mode))
135 flags = O_DIRECTORY; /* mds likes to know */
136
137 dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
138 ceph_vinop(inode), file, flags, file->f_flags);
139 fmode = ceph_flags_to_mode(flags);
140 wanted = ceph_caps_for_mode(fmode);
141
142 /* snapped files are read-only */
143 if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
144 return -EROFS;
145
146 /* trivially open snapdir */
147 if (ceph_snap(inode) == CEPH_SNAPDIR) {
148 spin_lock(&inode->i_lock);
149 __ceph_get_fmode(ci, fmode);
150 spin_unlock(&inode->i_lock);
151 return ceph_init_file(inode, file, fmode);
152 }
153
154 /*
155 * No need to block if we have any caps. Update wanted set
156 * asynchronously.
157 */
158 spin_lock(&inode->i_lock);
159 if (__ceph_is_any_real_caps(ci)) {
160 int mds_wanted = __ceph_caps_mds_wanted(ci);
161 int issued = __ceph_caps_issued(ci, NULL);
162
163 dout("open %p fmode %d want %s issued %s using existing\n",
164 inode, fmode, ceph_cap_string(wanted),
165 ceph_cap_string(issued));
166 __ceph_get_fmode(ci, fmode);
167 spin_unlock(&inode->i_lock);
168
169 /* adjust wanted? */
170 if ((issued & wanted) != wanted &&
171 (mds_wanted & wanted) != wanted &&
172 ceph_snap(inode) != CEPH_SNAPDIR)
173 ceph_check_caps(ci, 0, NULL);
174
175 return ceph_init_file(inode, file, fmode);
176 } else if (ceph_snap(inode) != CEPH_NOSNAP &&
177 (ci->i_snap_caps & wanted) == wanted) {
178 __ceph_get_fmode(ci, fmode);
179 spin_unlock(&inode->i_lock);
180 return ceph_init_file(inode, file, fmode);
181 }
182 spin_unlock(&inode->i_lock);
183
184 dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
185 req = prepare_open_request(inode->i_sb, flags, 0);
186 if (IS_ERR(req)) {
187 err = PTR_ERR(req);
188 goto out;
189 }
190 req->r_inode = igrab(inode);
191 req->r_num_caps = 1;
192 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
193 if (!err)
194 err = ceph_init_file(inode, file, req->r_fmode);
195 ceph_mdsc_put_request(req);
196 dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
197out:
198 return err;
199}
200
201
202/*
203 * Do a lookup + open with a single request.
204 *
205 * If this succeeds, but some subsequent check in the vfs
206 * may_open() fails, the struct *file gets cleaned up (i.e.
207 * ceph_release gets called). So fear not!
208 */
209/*
210 * flags
211 * path_lookup_open -> LOOKUP_OPEN
212 * path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE
213 */
214struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
215 struct nameidata *nd, int mode,
216 int locked_dir)
217{
218 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
219 struct ceph_mds_client *mdsc = &client->mdsc;
220 struct file *file = nd->intent.open.file;
221 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
222 struct ceph_mds_request *req;
223 int err;
224 int flags = nd->intent.open.flags - 1; /* silly vfs! */
225
226 dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
227 dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);
228
229 /* do the open */
230 req = prepare_open_request(dir->i_sb, flags, mode);
231 if (IS_ERR(req))
232 return ERR_PTR(PTR_ERR(req));
233 req->r_dentry = dget(dentry);
234 req->r_num_caps = 2;
235 if (flags & O_CREAT) {
236 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
237 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
238 }
239 req->r_locked_dir = dir; /* caller holds dir->i_mutex */
240 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
241 dentry = ceph_finish_lookup(req, dentry, err);
242 if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
243 err = ceph_handle_notrace_create(dir, dentry);
244 if (!err)
245 err = ceph_init_file(req->r_dentry->d_inode, file,
246 req->r_fmode);
247 ceph_mdsc_put_request(req);
248 dout("ceph_lookup_open result=%p\n", dentry);
249 return dentry;
250}
251
252int ceph_release(struct inode *inode, struct file *file)
253{
254 struct ceph_inode_info *ci = ceph_inode(inode);
255 struct ceph_file_info *cf = file->private_data;
256
257 dout("release inode %p file %p\n", inode, file);
258 ceph_put_fmode(ci, cf->fmode);
259 if (cf->last_readdir)
260 ceph_mdsc_put_request(cf->last_readdir);
261 kfree(cf->last_name);
262 kfree(cf->dir_info);
263 dput(cf->dentry);
264 kmem_cache_free(ceph_file_cachep, cf);
265
266 /* wake up anyone waiting for caps on this inode */
267 wake_up(&ci->i_cap_wq);
268 return 0;
269}
270
271/*
272 * build a vector of user pages
273 */
274static struct page **get_direct_page_vector(const char __user *data,
275 int num_pages,
276 loff_t off, size_t len)
277{
278 struct page **pages;
279 int rc;
280
281 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
282 if (!pages)
283 return ERR_PTR(-ENOMEM);
284
285 down_read(&current->mm->mmap_sem);
286 rc = get_user_pages(current, current->mm, (unsigned long)data,
287 num_pages, 0, 0, pages, NULL);
288 up_read(&current->mm->mmap_sem);
289 if (rc < 0)
290 goto fail;
291 return pages;
292
293fail:
294 kfree(pages);
295 return ERR_PTR(rc);
296}
297
298static void put_page_vector(struct page **pages, int num_pages)
299{
300 int i;
301
302 for (i = 0; i < num_pages; i++)
303 put_page(pages[i]);
304 kfree(pages);
305}
306
307void ceph_release_page_vector(struct page **pages, int num_pages)
308{
309 int i;
310
311 for (i = 0; i < num_pages; i++)
312 __free_pages(pages[i], 0);
313 kfree(pages);
314}
315
316/*
317 * allocate a vector of new pages
318 */
319static struct page **alloc_page_vector(int num_pages)
320{
321 struct page **pages;
322 int i;
323
324 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
325 if (!pages)
326 return ERR_PTR(-ENOMEM);
327 for (i = 0; i < num_pages; i++) {
328 pages[i] = alloc_page(GFP_NOFS);
329 if (pages[i] == NULL) {
330 ceph_release_page_vector(pages, i);
331 return ERR_PTR(-ENOMEM);
332 }
333 }
334 return pages;
335}
336
337/*
338 * copy user data into a page vector
339 */
340static int copy_user_to_page_vector(struct page **pages,
341 const char __user *data,
342 loff_t off, size_t len)
343{
344 int i = 0;
345 int po = off & ~PAGE_CACHE_MASK;
346 int left = len;
347 int l, bad;
348
349 while (left > 0) {
350 l = min_t(int, PAGE_CACHE_SIZE-po, left);
351 bad = copy_from_user(page_address(pages[i]) + po, data, l);
352 if (bad == l)
353 return -EFAULT;
354 data += l - bad;
355 left -= l - bad;
356 po += l - bad;
357 if (po == PAGE_CACHE_SIZE) {
358 po = 0;
359 i++;
360 }
361 }
362 return len;
363}
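
The copy loop above walks a page vector with a page-relative cursor po, copying at most the remainder of the current page per iteration (and backing up when copy_from_user is partial). A userspace model of the basic walk, with memcpy in place of copy_from_user and 4 KiB pages assumed:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

/* copy len bytes of data into page-sized buffers, starting at byte
 * offset off relative to the start of the first page */
static void copy_to_pages(char **pages, const char *data,
			  long off, size_t len)
{
	int i = 0;
	int po = off & (PAGE_SIZE - 1);		/* offset within current page */
	size_t left = len;

	while (left > 0) {
		size_t l = PAGE_SIZE - po;

		if (l > left)
			l = left;
		memcpy(pages[i] + po, data, l);
		data += l;
		left -= l;
		po += l;
		if (po == PAGE_SIZE) {		/* page filled: move along */
			po = 0;
			i++;
		}
	}
}

int main(void)
{
	static char p0[PAGE_SIZE], p1[PAGE_SIZE];
	char *pages[] = { p0, p1 };
	const char msg[] = "spans a page boundary";

	copy_to_pages(pages, msg, PAGE_SIZE - 8, sizeof(msg));
	printf("tail of page 0: '%.8s'\n", p0 + PAGE_SIZE - 8);
	printf("head of page 1: '%s'\n", p1);
	return 0;
}
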
364
365/*
366 * copy data from a page vector into a user pointer
367 */
368static int copy_page_vector_to_user(struct page **pages, char __user *data,
369 loff_t off, size_t len)
370{
371 int i = 0;
372 int po = off & ~PAGE_CACHE_MASK;
373 int left = len;
374 int l, bad;
375
376 while (left > 0) {
377 l = min_t(int, left, PAGE_CACHE_SIZE-po);
378 bad = copy_to_user(data, page_address(pages[i]) + po, l);
379 if (bad == l)
380 return -EFAULT;
381 data += l - bad;
382 left -= l - bad;
383 if (po) {
384 po += l - bad;
385 if (po == PAGE_CACHE_SIZE)
386 po = 0;
387 }
388 i++;
389 }
390 return len;
391}
392
393/*
394 * Zero an extent within a page vector. Offset is relative to the
395 * start of the first page.
396 */
397static void zero_page_vector_range(int off, int len, struct page **pages)
398{
399 int i = off >> PAGE_CACHE_SHIFT;
400
401 off &= ~PAGE_CACHE_MASK;
402
403 dout("zero_page_vector_page %u~%u\n", off, len);
404
405 /* leading partial page? */
406 if (off) {
407 int end = min((int)PAGE_CACHE_SIZE, off + len);
408 dout("zeroing %d %p head from %d\n", i, pages[i],
409 (int)off);
410 zero_user_segment(pages[i], off, end);
411 len -= (end - off);
412 i++;
413 }
414 while (len >= PAGE_CACHE_SIZE) {
415 dout("zeroing %d %p len=%d\n", i, pages[i], len);
416 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
417 len -= PAGE_CACHE_SIZE;
418 i++;
419 }
420 /* trailing partial page? */
421 if (len) {
422 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
423 zero_user_segment(pages[i], 0, len);
424 }
425}
426
427
428/*
429 * Read a range of bytes striped over one or more objects. Iterate over
430 * the objects the range spans. (That's not atomic, but good enough for now.)
431 *
432 * If we get a short result from the OSD, check against i_size; we need to
433 * only return a short read to the caller if we hit EOF.
434 */
435static int striped_read(struct inode *inode,
436 u64 off, u64 len,
437 struct page **pages, int num_pages,
438 int *checkeof)
439{
440 struct ceph_client *client = ceph_inode_to_client(inode);
441 struct ceph_inode_info *ci = ceph_inode(inode);
442 u64 pos, this_len;
443 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
444 int left, pages_left;
445 int read;
446 struct page **page_pos;
447 int ret;
448 bool hit_stripe, was_short;
449
450 /*
451 * we may need to do multiple reads. not atomic, unfortunately.
452 */
453 pos = off;
454 left = len;
455 page_pos = pages;
456 pages_left = num_pages;
457 read = 0;
458
459more:
460 this_len = left;
461 ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
462 &ci->i_layout, pos, &this_len,
463 ci->i_truncate_seq,
464 ci->i_truncate_size,
465 page_pos, pages_left);
466 hit_stripe = this_len < left;
467 was_short = ret >= 0 && ret < this_len;
468 if (ret == -ENOENT)
469 ret = 0;
470 dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
471 ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
472
473 if (ret > 0) {
474 int didpages =
475 ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT;
476
477 if (read < pos - off) {
478 dout(" zero gap %llu to %llu\n", off + read, pos);
479 zero_page_vector_range(page_off + read,
480 pos - off - read, pages);
481 }
482 pos += ret;
483 read = pos - off;
484 left -= ret;
485 page_pos += didpages;
486 pages_left -= didpages;
487
488 /* hit stripe? */
489 if (left && hit_stripe)
490 goto more;
491 }
492
493 if (was_short) {
494 /* was original extent fully inside i_size? */
495 if (pos + left <= inode->i_size) {
496 dout("zero tail\n");
497 zero_page_vector_range(page_off + read, len - read,
498 pages);
499 read = len;
500 goto out;
501 }
502
503 /* check i_size */
504 *checkeof = 1;
505 }
506
507out:
508 if (ret >= 0)
509 ret = read;
510 dout("striped_read returns %d\n", ret);
511 return ret;
512}
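
The didpages expression above advances the page cursor past the pages this read finished; a trailing, partially filled page stays under the cursor so the next pass of the loop can continue into it. A small numeric check of that expression, 4 KiB pages assumed:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1 << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long long pos = 4096 + 100;	/* 100 bytes into page 1 */
	int ret = 8000;				/* bytes the OSD returned */

	/* pages fully consumed; the final partial page is reused */
	int didpages = (int)(((pos & ~PAGE_MASK) + ret) >> PAGE_SHIFT);

	printf("%d bytes at pos %llu finish %d page(s) of the vector\n",
	       ret, pos, didpages);
	return 0;
}
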
513
514/*
515 * Completely synchronous read and write methods. Direct from __user
516 * buffer to osd, or directly to user pages (if O_DIRECT).
517 *
518 * If the read spans object boundary, just do multiple reads.
519 */
520static ssize_t ceph_sync_read(struct file *file, char __user *data,
521 unsigned len, loff_t *poff, int *checkeof)
522{
523 struct inode *inode = file->f_dentry->d_inode;
524 struct page **pages;
525 u64 off = *poff;
526 int num_pages = calc_pages_for(off, len);
527 int ret;
528
529 dout("sync_read on file %p %llu~%u %s\n", file, off, len,
530 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
531
532 if (file->f_flags & O_DIRECT) {
533 pages = get_direct_page_vector(data, num_pages, off, len);
534
535 /*
536 * flush any page cache pages in this range. this
537 * will make concurrent normal and O_DIRECT io slow,
538 * but it will at least behave sensibly when they are
539 * in sequence.
540 */
541 } else {
542 pages = alloc_page_vector(num_pages);
543 }
544 if (IS_ERR(pages))
545 return PTR_ERR(pages);
546
547 ret = filemap_write_and_wait(inode->i_mapping);
548 if (ret < 0)
549 goto done;
550
551 ret = striped_read(inode, off, len, pages, num_pages, checkeof);
552
553 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
554 ret = copy_page_vector_to_user(pages, data, off, ret);
555 if (ret >= 0)
556 *poff = off + ret;
557
558done:
559 if (file->f_flags & O_DIRECT)
560 put_page_vector(pages, num_pages);
561 else
562 ceph_release_page_vector(pages, num_pages);
563 dout("sync_read result %d\n", ret);
564 return ret;
565}
566
567/*
568 * Write commit callback, called if we requested both an ACK and
569 * ONDISK commit reply from the OSD.
570 */
571static void sync_write_commit(struct ceph_osd_request *req,
572 struct ceph_msg *msg)
573{
574 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
575
576 dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
577 spin_lock(&ci->i_unsafe_lock);
578 list_del_init(&req->r_unsafe_item);
579 spin_unlock(&ci->i_unsafe_lock);
580 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
581}
582
583/*
584 * Synchronous write, straight from __user pointer or user pages (if
585 * O_DIRECT).
586 *
587 * If write spans object boundary, just do multiple writes. (For a
588 * correct atomic write, we should e.g. take write locks on all
589 * objects, rollback on failure, etc.)
590 */
591static ssize_t ceph_sync_write(struct file *file, const char __user *data,
592 size_t left, loff_t *offset)
593{
594 struct inode *inode = file->f_dentry->d_inode;
595 struct ceph_inode_info *ci = ceph_inode(inode);
596 struct ceph_client *client = ceph_inode_to_client(inode);
597 struct ceph_osd_request *req;
598 struct page **pages;
599 int num_pages;
600 long long unsigned pos;
601 u64 len;
602 int written = 0;
603 int flags;
604 int do_sync = 0;
605 int check_caps = 0;
606 int ret;
607 struct timespec mtime = CURRENT_TIME;
608
609 if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
610 return -EROFS;
611
612 dout("sync_write on file %p %lld~%u %s\n", file, *offset,
613 (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
614
615 if (file->f_flags & O_APPEND)
616 pos = i_size_read(inode);
617 else
618 pos = *offset;
619
620 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
621 if (ret < 0)
622 return ret;
623
624 ret = invalidate_inode_pages2_range(inode->i_mapping,
625 pos >> PAGE_CACHE_SHIFT,
626 (pos + left) >> PAGE_CACHE_SHIFT);
627 if (ret < 0)
628 dout("invalidate_inode_pages2_range returned %d\n", ret);
629
630 flags = CEPH_OSD_FLAG_ORDERSNAP |
631 CEPH_OSD_FLAG_ONDISK |
632 CEPH_OSD_FLAG_WRITE;
633 if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
634 flags |= CEPH_OSD_FLAG_ACK;
635 else
636 do_sync = 1;
637
638 /*
639 * we may need to do multiple writes here if we span an object
640 * boundary. this isn't atomic, unfortunately. :(
641 */
642more:
643 len = left;
644 req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
645 ceph_vino(inode), pos, &len,
646 CEPH_OSD_OP_WRITE, flags,
647 ci->i_snap_realm->cached_context,
648 do_sync,
649 ci->i_truncate_seq, ci->i_truncate_size,
650 &mtime, false, 2);
651 if (IS_ERR(req))
652 return PTR_ERR(req);
653
654 num_pages = calc_pages_for(pos, len);
655
656 if (file->f_flags & O_DIRECT) {
657 pages = get_direct_page_vector(data, num_pages, pos, len);
658 if (IS_ERR(pages)) {
659 ret = PTR_ERR(pages);
660 goto out;
661 }
662
663 /*
664 * throw out any page cache pages in this range. this
665 * may block.
666 */
667 truncate_inode_pages_range(inode->i_mapping, pos, pos+len);
668 } else {
669 pages = alloc_page_vector(num_pages);
670 if (IS_ERR(pages)) {
671 ret = PTR_ERR(pages);
672 goto out;
673 }
674 ret = copy_user_to_page_vector(pages, data, pos, len);
675 if (ret < 0) {
676 ceph_release_page_vector(pages, num_pages);
677 goto out;
678 }
679
680 if ((file->f_flags & O_SYNC) == 0) {
681 /* get a second commit callback */
682 req->r_safe_callback = sync_write_commit;
683 req->r_own_pages = 1;
684 }
685 }
686 req->r_pages = pages;
687 req->r_num_pages = num_pages;
688 req->r_inode = inode;
689
690 ret = ceph_osdc_start_request(&client->osdc, req, false);
691 if (!ret) {
692 if (req->r_safe_callback) {
693 /*
694 * Add to inode unsafe list only after we
695 * start_request so that a tid has been assigned.
696 */
697 spin_lock(&ci->i_unsafe_lock);
698 list_add(&ci->i_unsafe_writes, &req->r_unsafe_item);
699 spin_unlock(&ci->i_unsafe_lock);
700 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
701 }
702 ret = ceph_osdc_wait_request(&client->osdc, req);
703 }
704
705 if (file->f_flags & O_DIRECT)
706 put_page_vector(pages, num_pages);
707 else if (file->f_flags & O_SYNC)
708 ceph_release_page_vector(pages, num_pages);
709
710out:
711 ceph_osdc_put_request(req);
712 if (ret == 0) {
713 pos += len;
714 written += len;
715 left -= len;
716 if (left)
717 goto more;
718
719 ret = written;
720 *offset = pos;
721 if (pos > i_size_read(inode))
722 check_caps = ceph_inode_set_size(inode, pos);
723 if (check_caps)
724 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
725 NULL);
726 }
727 return ret;
728}
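
The more: loop above relies on ceph_osdc_new_request() trimming len down to the end of the object containing pos, then iterating on the remainder. A sketch of that chunking with a fixed object size; real object sizes come from ci->i_layout, the 4 MiB here is an assumption:

#include <stdio.h>
#include <stdint.h>

#define OBJECT_SIZE (4ULL << 20)	/* assumed 4 MiB objects */

/* clamp a write at pos to the end of the containing object */
static uint64_t clamp_to_object(uint64_t pos, uint64_t len)
{
	uint64_t obj_end = (pos / OBJECT_SIZE + 1) * OBJECT_SIZE;

	return len < obj_end - pos ? len : obj_end - pos;
}

int main(void)
{
	uint64_t pos = OBJECT_SIZE - 1000;	/* 1000 bytes from a boundary */
	uint64_t left = 5000;

	while (left) {				/* the "more:" loop */
		uint64_t len = clamp_to_object(pos, left);

		printf("write %llu bytes at %llu\n",
		       (unsigned long long)len, (unsigned long long)pos);
		pos += len;
		left -= len;
	}
	return 0;
}
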
729
730/*
731 * Wrap generic_file_aio_read with checks for cap bits on the inode.
732 * Atomically grab references, so that those bits are not released
733 * back to the MDS mid-read.
734 *
735 * Hmm, the sync read case isn't actually async... should it be?
736 */
737static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
738 unsigned long nr_segs, loff_t pos)
739{
740 struct file *filp = iocb->ki_filp;
741 loff_t *ppos = &iocb->ki_pos;
742 size_t len = iov->iov_len;
743 struct inode *inode = filp->f_dentry->d_inode;
744 struct ceph_inode_info *ci = ceph_inode(inode);
745 void *base = iov->iov_base;
746 ssize_t ret;
747 int got = 0;
748 int checkeof = 0, read = 0;
749
750 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
751 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
752again:
753 __ceph_do_pending_vmtruncate(inode);
754 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
755 &got, -1);
756 if (ret < 0)
757 goto out;
758 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
759 inode, ceph_vinop(inode), pos, (unsigned)len,
760 ceph_cap_string(got));
761
762 if ((got & CEPH_CAP_FILE_CACHE) == 0 ||
763 (iocb->ki_filp->f_flags & O_DIRECT) ||
764 (inode->i_sb->s_flags & MS_SYNCHRONOUS))
765 /* hmm, this isn't really async... */
766 ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
767 else
768 ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
769
770out:
771 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
772 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
773 ceph_put_cap_refs(ci, got);
774
775 if (checkeof && ret >= 0) {
776 int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
777
778 /* hit EOF or hole? */
779 if (statret == 0 && *ppos < inode->i_size) {
780 dout("aio_read sync_read hit hole, reading more\n");
781 read += ret;
782 base += ret;
783 len -= ret;
784 checkeof = 0;
785 goto again;
786 }
787 }
788 if (ret >= 0)
789 ret += read;
790
791 return ret;
792}
793
794/*
795 * Take cap references to avoid releasing caps to MDS mid-write.
796 *
797 * If we are synchronous, and write with an old snap context, the OSD
798 * may return EOLDSNAPC. In that case, retry the write _after_
799 * dropping our cap refs and allowing the pending snap to logically
800 * complete _before_ this write occurs.
801 *
802 * If we are near ENOSPC, write synchronously.
803 */
804static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
805 unsigned long nr_segs, loff_t pos)
806{
807 struct file *file = iocb->ki_filp;
808 struct inode *inode = file->f_dentry->d_inode;
809 struct ceph_inode_info *ci = ceph_inode(inode);
810 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
811 loff_t endoff = pos + iov->iov_len;
812 int got = 0;
813 int ret, err;
814
815 if (ceph_snap(inode) != CEPH_NOSNAP)
816 return -EROFS;
817
818retry_snap:
819 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
820 return -ENOSPC;
821 __ceph_do_pending_vmtruncate(inode);
822 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
823 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
824 inode->i_size);
825 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
826 &got, endoff);
827 if (ret < 0)
828 goto out;
829
830 dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
831 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
832 ceph_cap_string(got));
833
834 if ((got & CEPH_CAP_FILE_BUFFER) == 0 ||
835 (iocb->ki_filp->f_flags & O_DIRECT) ||
836 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
837 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
838 &iocb->ki_pos);
839 } else {
840 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
841
842 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
843 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
844 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
845 err = vfs_fsync_range(file, file->f_path.dentry,
846 pos, pos + ret - 1, 1);
847 if (err < 0)
848 ret = err;
849 }
850 }
851 if (ret >= 0) {
852 spin_lock(&inode->i_lock);
853 __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
854 spin_unlock(&inode->i_lock);
855 }
856
857out:
858 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
859 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
860 ceph_cap_string(got));
861 ceph_put_cap_refs(ci, got);
862
863 if (ret == -EOLDSNAPC) {
864 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
865 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
866 goto retry_snap;
867 }
868
869 return ret;
870}
871
872/*
873 * llseek. be sure to verify file size on SEEK_END.
874 */
875static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
876{
877 struct inode *inode = file->f_mapping->host;
878 int ret;
879
880 mutex_lock(&inode->i_mutex);
881 __ceph_do_pending_vmtruncate(inode);
882 switch (origin) {
883 case SEEK_END:
884 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
885 if (ret < 0) {
886 offset = ret;
887 goto out;
888 }
889 offset += inode->i_size;
890 break;
891 case SEEK_CUR:
892 /*
893 * Here we special-case the lseek(fd, 0, SEEK_CUR)
894 * position-querying operation. Avoid rewriting the "same"
895 * f_pos value back to the file because a concurrent read(),
896 * write() or lseek() might have altered it
897 */
898 if (offset == 0) {
899 offset = file->f_pos;
900 goto out;
901 }
902 offset += file->f_pos;
903 break;
904 }
905
906 if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
907 offset = -EINVAL;
908 goto out;
909 }
910
911 /* Special lock needed here? */
912 if (offset != file->f_pos) {
913 file->f_pos = offset;
914 file->f_version = 0;
915 }
916
917out:
918 mutex_unlock(&inode->i_mutex);
919 return offset;
920}
921
922const struct file_operations ceph_file_fops = {
923 .open = ceph_open,
924 .release = ceph_release,
925 .llseek = ceph_llseek,
926 .read = do_sync_read,
927 .write = do_sync_write,
928 .aio_read = ceph_aio_read,
929 .aio_write = ceph_aio_write,
930 .mmap = ceph_mmap,
931 .fsync = ceph_fsync,
932 .splice_read = generic_file_splice_read,
933 .splice_write = generic_file_splice_write,
934 .unlocked_ioctl = ceph_ioctl,
935 .compat_ioctl = ceph_ioctl,
936};
937
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
new file mode 100644
index 000000000000..aca82d55cc53
--- /dev/null
+++ b/fs/ceph/inode.c
@@ -0,0 +1,1766 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/fs.h>
5#include <linux/smp_lock.h>
6#include <linux/slab.h>
7#include <linux/string.h>
8#include <linux/uaccess.h>
9#include <linux/kernel.h>
10#include <linux/namei.h>
11#include <linux/writeback.h>
12#include <linux/vmalloc.h>
13#include <linux/pagevec.h>
14
15#include "super.h"
16#include "decode.h"
17
18/*
19 * Ceph inode operations
20 *
21 * Implement basic inode helpers (get, alloc) and inode ops (getattr,
22 * setattr, etc.), xattr helpers, and helpers for assimilating
23 * metadata returned by the MDS into our cache.
24 *
25 * Also define helpers for doing asynchronous writeback, invalidation,
26 * and truncation for the benefit of those who can't afford to block
27 * (typically because they are in the message handler path).
28 */
29
30static const struct inode_operations ceph_symlink_iops;
31
32static void ceph_invalidate_work(struct work_struct *work);
33static void ceph_writeback_work(struct work_struct *work);
34static void ceph_vmtruncate_work(struct work_struct *work);
35
36/*
37 * find or create an inode, given the ceph ino number
38 */
39struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
40{
41 struct inode *inode;
42 ino_t t = ceph_vino_to_ino(vino);
43
44 inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
45 if (inode == NULL)
46 return ERR_PTR(-ENOMEM);
47 if (inode->i_state & I_NEW) {
48 dout("get_inode created new inode %p %llx.%llx ino %llx\n",
49 inode, ceph_vinop(inode), (u64)inode->i_ino);
50 unlock_new_inode(inode);
51 }
52
53 dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
54 vino.snap, inode);
55 return inode;
56}
57
58/*
59 * get/construct snapdir inode for a given directory
60 */
61struct inode *ceph_get_snapdir(struct inode *parent)
62{
63 struct ceph_vino vino = {
64 .ino = ceph_ino(parent),
65 .snap = CEPH_SNAPDIR,
66 };
67 struct inode *inode = ceph_get_inode(parent->i_sb, vino);
68 struct ceph_inode_info *ci = ceph_inode(inode);
69
70 BUG_ON(!S_ISDIR(parent->i_mode));
71 if (IS_ERR(inode))
72 return ERR_PTR(PTR_ERR(inode));
73 inode->i_mode = parent->i_mode;
74 inode->i_uid = parent->i_uid;
75 inode->i_gid = parent->i_gid;
76 inode->i_op = &ceph_dir_iops;
77 inode->i_fop = &ceph_dir_fops;
78 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
79 ci->i_rbytes = 0;
80 return inode;
81}
82
83const struct inode_operations ceph_file_iops = {
84 .permission = ceph_permission,
85 .setattr = ceph_setattr,
86 .getattr = ceph_getattr,
87 .setxattr = ceph_setxattr,
88 .getxattr = ceph_getxattr,
89 .listxattr = ceph_listxattr,
90 .removexattr = ceph_removexattr,
91};
92
93
94/*
95 * We use a 'frag tree' to keep track of the MDS's directory fragments
96 * for a given inode (usually there is just a single fragment). We
97 * need to know when a child frag is delegated to a new MDS, or when
98 * it is flagged as replicated, so we can direct our requests
99 * accordingly.
100 */
101
102/*
103 * find/create a frag in the tree
104 */
105static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
106 u32 f)
107{
108 struct rb_node **p;
109 struct rb_node *parent = NULL;
110 struct ceph_inode_frag *frag;
111 int c;
112
113 p = &ci->i_fragtree.rb_node;
114 while (*p) {
115 parent = *p;
116 frag = rb_entry(parent, struct ceph_inode_frag, node);
117 c = ceph_frag_compare(f, frag->frag);
118 if (c < 0)
119 p = &(*p)->rb_left;
120 else if (c > 0)
121 p = &(*p)->rb_right;
122 else
123 return frag;
124 }
125
126 frag = kmalloc(sizeof(*frag), GFP_NOFS);
127 if (!frag) {
128 pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
129 "frag %x\n", &ci->vfs_inode,
130 ceph_vinop(&ci->vfs_inode), f);
131 return ERR_PTR(-ENOMEM);
132 }
133 frag->frag = f;
134 frag->split_by = 0;
135 frag->mds = -1;
136 frag->ndist = 0;
137
138 rb_link_node(&frag->node, parent, p);
139 rb_insert_color(&frag->node, &ci->i_fragtree);
140
141 dout("get_or_create_frag added %llx.%llx frag %x\n",
142 ceph_vinop(&ci->vfs_inode), f);
143 return frag;
144}
145
146/*
147 * find a specific frag @f
148 */
149struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
150{
151 struct rb_node *n = ci->i_fragtree.rb_node;
152
153 while (n) {
154 struct ceph_inode_frag *frag =
155 rb_entry(n, struct ceph_inode_frag, node);
156 int c = ceph_frag_compare(f, frag->frag);
157 if (c < 0)
158 n = n->rb_left;
159 else if (c > 0)
160 n = n->rb_right;
161 else
162 return frag;
163 }
164 return NULL;
165}
166
167/*
168 * Choose frag containing the given value @v. If @pfrag is
169 * specified, copy the frag delegation info to the caller if
170 * it is present.
171 */
172u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
173 struct ceph_inode_frag *pfrag,
174 int *found)
175{
176 u32 t = ceph_frag_make(0, 0);
177 struct ceph_inode_frag *frag;
178 unsigned nway, i;
179 u32 n;
180
181 if (found)
182 *found = 0;
183
184 mutex_lock(&ci->i_fragtree_mutex);
185 while (1) {
186 WARN_ON(!ceph_frag_contains_value(t, v));
187 frag = __ceph_find_frag(ci, t);
188 if (!frag)
189 break; /* t is a leaf */
190 if (frag->split_by == 0) {
191 if (pfrag)
192 memcpy(pfrag, frag, sizeof(*pfrag));
193 if (found)
194 *found = 1;
195 break;
196 }
197
198 /* choose child */
199 nway = 1 << frag->split_by;
200 dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
201 frag->split_by, nway);
202 for (i = 0; i < nway; i++) {
203 n = ceph_frag_make_child(t, frag->split_by, i);
204 if (ceph_frag_contains_value(n, v)) {
205 t = n;
206 break;
207 }
208 }
209 BUG_ON(i == nway);
210 }
211 dout("choose_frag(%x) = %x\n", v, t);
212
213 mutex_unlock(&ci->i_fragtree_mutex);
214 return t;
215}
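
/*
 * [editor's note] Illustrative sketch, not part of the original
 * commit: callers direct a request by hashing the dentry name and
 * walking the frag tree down to the covering leaf.  full_name_hash()
 * here is an assumed stand-in for whatever hash the MDS applies to
 * names.
 */
static u32 frag_for_name_sketch(struct inode *dir,
				const unsigned char *name, unsigned int len)
{
	u32 hash = full_name_hash(name, len);	/* assumed stand-in hash */

	/* NULL, NULL: we don't need the delegation info here */
	return ceph_choose_frag(ceph_inode(dir), hash, NULL, NULL);
}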
216
217/*
218 * Process dirfrag (delegation) info from the mds. Include leaf
219 * fragment in tree ONLY if ndist > 0. Otherwise, only
220 * branches/splits are included in i_fragtree.
221 */
222static int ceph_fill_dirfrag(struct inode *inode,
223 struct ceph_mds_reply_dirfrag *dirinfo)
224{
225 struct ceph_inode_info *ci = ceph_inode(inode);
226 struct ceph_inode_frag *frag;
227 u32 id = le32_to_cpu(dirinfo->frag);
228 int mds = le32_to_cpu(dirinfo->auth);
229 int ndist = le32_to_cpu(dirinfo->ndist);
230 int i;
231 int err = 0;
232
233 mutex_lock(&ci->i_fragtree_mutex);
234 if (ndist == 0) {
235 /* no delegation info needed. */
236 frag = __ceph_find_frag(ci, id);
237 if (!frag)
238 goto out;
239 if (frag->split_by == 0) {
240 /* tree leaf, remove */
241 dout("fill_dirfrag removed %llx.%llx frag %x"
242 " (no ref)\n", ceph_vinop(inode), id);
243 rb_erase(&frag->node, &ci->i_fragtree);
244 kfree(frag);
245 } else {
246 /* tree branch, keep and clear */
247 dout("fill_dirfrag cleared %llx.%llx frag %x"
248 " referral\n", ceph_vinop(inode), id);
249 frag->mds = -1;
250 frag->ndist = 0;
251 }
252 goto out;
253 }
254
255
256 /* find/add this frag to store mds delegation info */
257 frag = __get_or_create_frag(ci, id);
258 if (IS_ERR(frag)) {
259 /* this is not the end of the world; we can continue
260 with bad/inaccurate delegation info */
261 pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
262 ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
263 err = -ENOMEM;
264 goto out;
265 }
266
267 frag->mds = mds;
268 frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
269 for (i = 0; i < frag->ndist; i++)
270 frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
271 dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
272 ceph_vinop(inode), frag->frag, frag->ndist);
273
274out:
275 mutex_unlock(&ci->i_fragtree_mutex);
276 return err;
277}
278
279
280/*
281 * initialize a newly allocated inode.
282 */
283struct inode *ceph_alloc_inode(struct super_block *sb)
284{
285 struct ceph_inode_info *ci;
286 int i;
287
288 ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
289 if (!ci)
290 return NULL;
291
292 dout("alloc_inode %p\n", &ci->vfs_inode);
293
294 ci->i_version = 0;
295 ci->i_time_warp_seq = 0;
296 ci->i_ceph_flags = 0;
297 ci->i_release_count = 0;
298 ci->i_symlink = NULL;
299
300 ci->i_fragtree = RB_ROOT;
301 mutex_init(&ci->i_fragtree_mutex);
302
303 ci->i_xattrs.blob = NULL;
304 ci->i_xattrs.prealloc_blob = NULL;
305 ci->i_xattrs.dirty = false;
306 ci->i_xattrs.index = RB_ROOT;
307 ci->i_xattrs.count = 0;
308 ci->i_xattrs.names_size = 0;
309 ci->i_xattrs.vals_size = 0;
310 ci->i_xattrs.version = 0;
311 ci->i_xattrs.index_version = 0;
312
313 ci->i_caps = RB_ROOT;
314 ci->i_auth_cap = NULL;
315 ci->i_dirty_caps = 0;
316 ci->i_flushing_caps = 0;
317 INIT_LIST_HEAD(&ci->i_dirty_item);
318 INIT_LIST_HEAD(&ci->i_flushing_item);
319 ci->i_cap_flush_seq = 0;
320 ci->i_cap_flush_last_tid = 0;
321 memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
322 init_waitqueue_head(&ci->i_cap_wq);
323 ci->i_hold_caps_min = 0;
324 ci->i_hold_caps_max = 0;
325 INIT_LIST_HEAD(&ci->i_cap_delay_list);
326 ci->i_cap_exporting_mds = 0;
327 ci->i_cap_exporting_mseq = 0;
328 ci->i_cap_exporting_issued = 0;
329 INIT_LIST_HEAD(&ci->i_cap_snaps);
330 ci->i_head_snapc = NULL;
331 ci->i_snap_caps = 0;
332
333 for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
334 ci->i_nr_by_mode[i] = 0;
335
336 ci->i_truncate_seq = 0;
337 ci->i_truncate_size = 0;
338 ci->i_truncate_pending = 0;
339
340 ci->i_max_size = 0;
341 ci->i_reported_size = 0;
342 ci->i_wanted_max_size = 0;
343 ci->i_requested_max_size = 0;
344
345 ci->i_pin_ref = 0;
346 ci->i_rd_ref = 0;
347 ci->i_rdcache_ref = 0;
348 ci->i_wr_ref = 0;
349 ci->i_wrbuffer_ref = 0;
350 ci->i_wrbuffer_ref_head = 0;
351 ci->i_shared_gen = 0;
352 ci->i_rdcache_gen = 0;
353 ci->i_rdcache_revoking = 0;
354
355 INIT_LIST_HEAD(&ci->i_unsafe_writes);
356 INIT_LIST_HEAD(&ci->i_unsafe_dirops);
357 spin_lock_init(&ci->i_unsafe_lock);
358
359 ci->i_snap_realm = NULL;
360 INIT_LIST_HEAD(&ci->i_snap_realm_item);
361 INIT_LIST_HEAD(&ci->i_snap_flush_item);
362
363 INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
364 INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
365
366 INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
367
368 return &ci->vfs_inode;
369}
370
371void ceph_destroy_inode(struct inode *inode)
372{
373 struct ceph_inode_info *ci = ceph_inode(inode);
374 struct ceph_inode_frag *frag;
375 struct rb_node *n;
376
377 dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
378
379 ceph_queue_caps_release(inode);
380
381 /*
382 * we may still have a snap_realm reference if there are stray
383 * caps in i_cap_exporting_issued or i_snap_caps.
384 */
385 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc =
387 &ceph_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm;
389
390 dout(" dropping residual ref to snap realm %p\n", realm);
391 spin_lock(&realm->inodes_with_caps_lock);
392 list_del_init(&ci->i_snap_realm_item);
393 spin_unlock(&realm->inodes_with_caps_lock);
394 ceph_put_snap_realm(mdsc, realm);
395 }
396
397 kfree(ci->i_symlink);
398 while ((n = rb_first(&ci->i_fragtree)) != NULL) {
399 frag = rb_entry(n, struct ceph_inode_frag, node);
400 rb_erase(n, &ci->i_fragtree);
401 kfree(frag);
402 }
403
404 __ceph_destroy_xattrs(ci);
405 if (ci->i_xattrs.blob)
406 ceph_buffer_put(ci->i_xattrs.blob);
407 if (ci->i_xattrs.prealloc_blob)
408 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
409
410 kmem_cache_free(ceph_inode_cachep, ci);
411}
412
413
414/*
415 * Helpers to fill in size, ctime, mtime, and atime. We have to be
416 * careful because either the client or MDS may have more up-to-date
417 * info, depending on which capabilities are held, and whether
418 * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
419 * and size are monotonically increasing, except when utimes() or
420 * truncate() increments the corresponding _seq values.)
421 */
422int ceph_fill_file_size(struct inode *inode, int issued,
423 u32 truncate_seq, u64 truncate_size, u64 size)
424{
425 struct ceph_inode_info *ci = ceph_inode(inode);
426 int queue_trunc = 0;
427
428 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
429 (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
430 dout("size %lld -> %llu\n", inode->i_size, size);
431 inode->i_size = size;
432 inode->i_blocks = (size + (1<<9) - 1) >> 9;
433 ci->i_reported_size = size;
434 if (truncate_seq != ci->i_truncate_seq) {
435 dout("truncate_seq %u -> %u\n",
436 ci->i_truncate_seq, truncate_seq);
437 ci->i_truncate_seq = truncate_seq;
438 /*
439 * If we hold relevant caps, or in the case where we're
440 * not the only client referencing this file and we
441 * don't hold those caps, then we need to check whether
442 * the file is either opened or mmapped.
443 */
444 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
445 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
446 CEPH_CAP_FILE_EXCL)) ||
447 mapping_mapped(inode->i_mapping) ||
448 __ceph_caps_file_wanted(ci)) {
449 ci->i_truncate_pending++;
450 queue_trunc = 1;
451 }
452 }
453 }
454 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
455 ci->i_truncate_size != truncate_size) {
456 dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
457 truncate_size);
458 ci->i_truncate_size = truncate_size;
459 }
460 return queue_trunc;
461}
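
/*
 * [editor's note] ceph_seq_cmp() above is a wraparound-safe sequence
 * comparison.  A minimal sketch of the usual idiom it is assumed to
 * follow (the real definition lives in super.h):
 */
static inline int seq_cmp_sketch(u32 a, u32 b)
{
	return (s32)a - (s32)b;	/* >0: a newer, <0: a older, 0: equal */
}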
462
463void ceph_fill_file_time(struct inode *inode, int issued,
464 u64 time_warp_seq, struct timespec *ctime,
465 struct timespec *mtime, struct timespec *atime)
466{
467 struct ceph_inode_info *ci = ceph_inode(inode);
468 int warn = 0;
469
470 if (issued & (CEPH_CAP_FILE_EXCL|
471 CEPH_CAP_FILE_WR|
472 CEPH_CAP_FILE_BUFFER)) {
473 if (timespec_compare(ctime, &inode->i_ctime) > 0) {
474 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
475 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
476 ctime->tv_sec, ctime->tv_nsec);
477 inode->i_ctime = *ctime;
478 }
479 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
480 /* the MDS did a utimes() */
481 dout("mtime %ld.%09ld -> %ld.%09ld "
482 "tw %d -> %d\n",
483 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
484 mtime->tv_sec, mtime->tv_nsec,
485 ci->i_time_warp_seq, (int)time_warp_seq);
486
487 inode->i_mtime = *mtime;
488 inode->i_atime = *atime;
489 ci->i_time_warp_seq = time_warp_seq;
490 } else if (time_warp_seq == ci->i_time_warp_seq) {
491 /* nobody did utimes(); take the max */
492 if (timespec_compare(mtime, &inode->i_mtime) > 0) {
493 dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
494 inode->i_mtime.tv_sec,
495 inode->i_mtime.tv_nsec,
496 mtime->tv_sec, mtime->tv_nsec);
497 inode->i_mtime = *mtime;
498 }
499 if (timespec_compare(atime, &inode->i_atime) > 0) {
500 dout("atime %ld.%09ld -> %ld.%09ld inc\n",
501 inode->i_atime.tv_sec,
502 inode->i_atime.tv_nsec,
503 atime->tv_sec, atime->tv_nsec);
504 inode->i_atime = *atime;
505 }
506 } else if (issued & CEPH_CAP_FILE_EXCL) {
507 /* we did a utimes(); ignore mds values */
508 } else {
509 warn = 1;
510 }
511 } else {
512 /* we have no write caps; whatever the MDS says is true */
513 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
514 inode->i_ctime = *ctime;
515 inode->i_mtime = *mtime;
516 inode->i_atime = *atime;
517 ci->i_time_warp_seq = time_warp_seq;
518 } else {
519 warn = 1;
520 }
521 }
522 if (warn) /* time_warp_seq shouldn't go backwards */
523 dout("%p mds time_warp_seq %llu < %u\n",
524 inode, time_warp_seq, ci->i_time_warp_seq);
525}
526
527/*
528 * Populate an inode based on info from mds. May be called on new or
529 * existing inodes.
530 */
531static int fill_inode(struct inode *inode,
532 struct ceph_mds_reply_info_in *iinfo,
533 struct ceph_mds_reply_dirfrag *dirinfo,
534 struct ceph_mds_session *session,
535 unsigned long ttl_from, int cap_fmode,
536 struct ceph_cap_reservation *caps_reservation)
537{
538 struct ceph_mds_reply_inode *info = iinfo->in;
539 struct ceph_inode_info *ci = ceph_inode(inode);
540 int i;
541 int issued, implemented;
542 struct timespec mtime, atime, ctime;
543 u32 nsplits;
544 struct ceph_buffer *xattr_blob = NULL;
545 int err = 0;
546 int queue_trunc = 0;
547
548 dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
549 inode, ceph_vinop(inode), le64_to_cpu(info->version),
550 ci->i_version);
551
552 /*
553 * prealloc xattr data, if it looks like we'll need it. only
554 * if len > 4 (meaning there are actually xattrs; the first 4
555 * bytes are the xattr count).
556 */
557 if (iinfo->xattr_len > 4) {
558 xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
559 if (!xattr_blob)
560 pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
561 iinfo->xattr_len);
562 }
563
564 spin_lock(&inode->i_lock);
565
566 /*
567 * provided version will be odd if inode value is projected,
568 * even if stable. skip the update if we have newer info
569 * (e.g., due to inode info racing from multiple MDSs), or if
570 * we are getting projected (unstable) inode info.
571 */
572 if (le64_to_cpu(info->version) > 0 &&
573 (ci->i_version & ~1) > le64_to_cpu(info->version))
574 goto no_change;
575
576 issued = __ceph_caps_issued(ci, &implemented);
577 issued |= implemented | __ceph_caps_dirty(ci);
578
579 /* update inode */
580 ci->i_version = le64_to_cpu(info->version);
581 inode->i_version++;
582 inode->i_rdev = le32_to_cpu(info->rdev);
583
584 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
585 inode->i_mode = le32_to_cpu(info->mode);
586 inode->i_uid = le32_to_cpu(info->uid);
587 inode->i_gid = le32_to_cpu(info->gid);
588 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
589 inode->i_uid, inode->i_gid);
590 }
591
592 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
593 inode->i_nlink = le32_to_cpu(info->nlink);
594
595 /* be careful with mtime, atime, size */
596 ceph_decode_timespec(&atime, &info->atime);
597 ceph_decode_timespec(&mtime, &info->mtime);
598 ceph_decode_timespec(&ctime, &info->ctime);
599 queue_trunc = ceph_fill_file_size(inode, issued,
600 le32_to_cpu(info->truncate_seq),
601 le64_to_cpu(info->truncate_size),
602 le64_to_cpu(info->size));
603 ceph_fill_file_time(inode, issued,
604 le32_to_cpu(info->time_warp_seq),
605 &ctime, &mtime, &atime);
606
607 ci->i_max_size = le64_to_cpu(info->max_size);
608 ci->i_layout = info->layout;
609 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
610
611 /* xattrs */
612 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
613 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
614 le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
615 if (ci->i_xattrs.blob)
616 ceph_buffer_put(ci->i_xattrs.blob);
617 ci->i_xattrs.blob = xattr_blob;
618 if (xattr_blob)
619 memcpy(ci->i_xattrs.blob->vec.iov_base,
620 iinfo->xattr_data, iinfo->xattr_len);
621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
622 }
623
624 inode->i_mapping->a_ops = &ceph_aops;
625 inode->i_mapping->backing_dev_info =
626 &ceph_client(inode->i_sb)->backing_dev_info;
627
628 switch (inode->i_mode & S_IFMT) {
629 case S_IFIFO:
630 case S_IFBLK:
631 case S_IFCHR:
632 case S_IFSOCK:
633 init_special_inode(inode, inode->i_mode, inode->i_rdev);
634 inode->i_op = &ceph_file_iops;
635 break;
636 case S_IFREG:
637 inode->i_op = &ceph_file_iops;
638 inode->i_fop = &ceph_file_fops;
639 break;
640 case S_IFLNK:
641 inode->i_op = &ceph_symlink_iops;
642 if (!ci->i_symlink) {
643 int symlen = iinfo->symlink_len;
644 char *sym;
645
646 BUG_ON(symlen != inode->i_size);
647 spin_unlock(&inode->i_lock);
648
649 err = -ENOMEM;
650 sym = kmalloc(symlen+1, GFP_NOFS);
651 if (!sym)
652 goto out;
653 memcpy(sym, iinfo->symlink, symlen);
654 sym[symlen] = 0;
655
656 spin_lock(&inode->i_lock);
657 if (!ci->i_symlink)
658 ci->i_symlink = sym;
659 else
660 kfree(sym); /* lost a race */
661 }
662 break;
663 case S_IFDIR:
664 inode->i_op = &ceph_dir_iops;
665 inode->i_fop = &ceph_dir_fops;
666
667 ci->i_files = le64_to_cpu(info->files);
668 ci->i_subdirs = le64_to_cpu(info->subdirs);
669 ci->i_rbytes = le64_to_cpu(info->rbytes);
670 ci->i_rfiles = le64_to_cpu(info->rfiles);
671 ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
672 ceph_decode_timespec(&ci->i_rctime, &info->rctime);
673
674 /* set dir completion flag? */
675 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
676 ceph_snap(inode) == CEPH_NOSNAP &&
677 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
678 dout(" marking %p complete (empty)\n", inode);
679 ci->i_ceph_flags |= CEPH_I_COMPLETE;
680 ci->i_max_offset = 2;
681 }
682
683 /* it may be better to set st_size in getattr instead? */
684 if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
685 inode->i_size = ci->i_rbytes;
686 break;
687 default:
688 pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
689 ceph_vinop(inode), inode->i_mode);
690 }
691
692no_change:
693 spin_unlock(&inode->i_lock);
694
695 /* queue truncate if we saw i_size decrease */
696 if (queue_trunc)
697 ceph_queue_vmtruncate(inode);
698
699 /* populate frag tree */
700 /* FIXME: move me up, if/when version reflects fragtree changes */
701 nsplits = le32_to_cpu(info->fragtree.nsplits);
702 mutex_lock(&ci->i_fragtree_mutex);
703 for (i = 0; i < nsplits; i++) {
704 u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
705 struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
706
707 if (IS_ERR(frag))
708 continue;
709 frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
710 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
711 }
712 mutex_unlock(&ci->i_fragtree_mutex);
713
714 /* were we issued a capability? */
715 if (info->cap.caps) {
716 if (ceph_snap(inode) == CEPH_NOSNAP) {
717 ceph_add_cap(inode, session,
718 le64_to_cpu(info->cap.cap_id),
719 cap_fmode,
720 le32_to_cpu(info->cap.caps),
721 le32_to_cpu(info->cap.wanted),
722 le32_to_cpu(info->cap.seq),
723 le32_to_cpu(info->cap.mseq),
724 le64_to_cpu(info->cap.realm),
725 info->cap.flags,
726 caps_reservation);
727 } else {
728 spin_lock(&inode->i_lock);
729 dout(" %p got snap_caps %s\n", inode,
730 ceph_cap_string(le32_to_cpu(info->cap.caps)));
731 ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
732 if (cap_fmode >= 0)
733 __ceph_get_fmode(ci, cap_fmode);
734 spin_unlock(&inode->i_lock);
735 }
736 }
737
738 /* update delegation info? */
739 if (dirinfo)
740 ceph_fill_dirfrag(inode, dirinfo);
741
742 err = 0;
743
744out:
745 if (xattr_blob)
746 ceph_buffer_put(xattr_blob);
747 return err;
748}
749
750/*
751 * caller should hold session s_mutex.
752 */
753static void update_dentry_lease(struct dentry *dentry,
754 struct ceph_mds_reply_lease *lease,
755 struct ceph_mds_session *session,
756 unsigned long from_time)
757{
758 struct ceph_dentry_info *di = ceph_dentry(dentry);
759 long unsigned duration = le32_to_cpu(lease->duration_ms);
760 long unsigned ttl = from_time + (duration * HZ) / 1000;
761 long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
762 struct inode *dir;
763
764 /* only track leases on regular dentries */
765 if (dentry->d_op != &ceph_dentry_ops)
766 return;
767
768 spin_lock(&dentry->d_lock);
769 dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
770 dentry, le16_to_cpu(lease->mask), duration, ttl);
771
772 /* make lease_rdcache_gen match directory */
773 dir = dentry->d_parent->d_inode;
774 di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
775
776 if (lease->mask == 0)
777 goto out_unlock;
778
779 if (di->lease_gen == session->s_cap_gen &&
780 time_before(ttl, dentry->d_time))
781 goto out_unlock; /* we already have a newer lease. */
782
783 if (di->lease_session && di->lease_session != session)
784 goto out_unlock;
785
786 ceph_dentry_lru_touch(dentry);
787
788 if (!di->lease_session)
789 di->lease_session = ceph_get_mds_session(session);
790 di->lease_gen = session->s_cap_gen;
791 di->lease_seq = le32_to_cpu(lease->seq);
792 di->lease_renew_after = half_ttl;
793 di->lease_renew_from = 0;
794 dentry->d_time = ttl;
795out_unlock:
796 spin_unlock(&dentry->d_lock);
797 return;
798}
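
/*
 * [editor's note] Worked example of the lease ttl math above, assuming
 * HZ=250: with duration_ms=30000, ttl = from_time + 30000*250/1000 =
 * from_time + 7500 jiffies (30 seconds), and renewal is attempted
 * after roughly half that via lease_renew_after.
 */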
799
800/*
801 * splice a dentry to an inode.
802 * caller must hold directory i_mutex for this to be safe.
803 *
804 * we will only rehash the resulting dentry if @prehash is
805 * true; @prehash will be set to false (for the benefit of
806 * the caller) if we fail.
807 */
808static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
809 bool *prehash)
810{
811 struct dentry *realdn;
812
813 /* dn must be unhashed */
814 if (!d_unhashed(dn))
815 d_drop(dn);
816 realdn = d_materialise_unique(dn, in);
817 if (IS_ERR(realdn)) {
818 pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
819 dn, in, ceph_vinop(in));
820 if (prehash)
821 *prehash = false; /* don't rehash on error */
822 dn = realdn; /* note realdn contains the error */
823 goto out;
824 } else if (realdn) {
825 dout("dn %p (%d) spliced with %p (%d) "
826 "inode %p ino %llx.%llx\n",
827 dn, atomic_read(&dn->d_count),
828 realdn, atomic_read(&realdn->d_count),
829 realdn->d_inode, ceph_vinop(realdn->d_inode));
830 dput(dn);
831 dn = realdn;
832 } else {
833 BUG_ON(!ceph_dentry(dn));
834
835 dout("dn %p attached to %p ino %llx.%llx\n",
836 dn, dn->d_inode, ceph_vinop(dn->d_inode));
837 }
838 if ((!prehash || *prehash) && d_unhashed(dn))
839 d_rehash(dn);
840out:
841 return dn;
842}
843
844/*
845 * Set dentry's directory position based on the current dir's max, and
846 * order it in d_subdirs, so that dcache_readdir behaves.
847 */
848static void ceph_set_dentry_offset(struct dentry *dn)
849{
850 struct dentry *dir = dn->d_parent;
851 struct inode *inode = dn->d_parent->d_inode;
852 struct ceph_dentry_info *di;
853
854 BUG_ON(!inode);
855
856 di = ceph_dentry(dn);
857
858 spin_lock(&inode->i_lock);
859 di->offset = ceph_inode(inode)->i_max_offset++;
860 spin_unlock(&inode->i_lock);
861
862 spin_lock(&dcache_lock);
863 spin_lock(&dn->d_lock);
864 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
865 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
866 dn->d_u.d_child.prev, dn->d_u.d_child.next);
867 spin_unlock(&dn->d_lock);
868 spin_unlock(&dcache_lock);
869}
870
871/*
872 * Incorporate results into the local cache. This is either just
873 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
874 * after a lookup).
875 *
876 * A reply may contain:
877 * a directory inode along with a dentry,
878 * and/or a target inode.
879 *
880 * Called with snap_rwsem (read).
881 */
882int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
883 struct ceph_mds_session *session)
884{
885 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
886 struct inode *in = NULL;
887 struct ceph_mds_reply_inode *ininfo;
888 struct ceph_vino vino;
889 int i = 0;
890 int err = 0;
891
892 dout("fill_trace %p is_dentry %d is_target %d\n", req,
893 rinfo->head->is_dentry, rinfo->head->is_target);
894
895#if 0
896 /*
897 * Debugging hook:
898 *
899 * If we resend completed ops to a recovering mds, we get no
900 * trace. Since that is very rare, pretend this is the case
901 * to ensure the 'no trace' handlers in the callers behave.
902 *
903 * Fill in inodes unconditionally to avoid breaking cap
904 * invariants.
905 */
906 if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
907 pr_info("fill_trace faking empty trace on %lld %s\n",
908 req->r_tid, ceph_mds_op_name(rinfo->head->op));
909 if (rinfo->head->is_dentry) {
910 rinfo->head->is_dentry = 0;
911 err = fill_inode(req->r_locked_dir,
912 &rinfo->diri, rinfo->dirfrag,
913 session, req->r_request_started, -1,
&req->r_caps_reservation);
914 }
915 if (rinfo->head->is_target) {
916 rinfo->head->is_target = 0;
917 ininfo = rinfo->targeti.in;
918 vino.ino = le64_to_cpu(ininfo->ino);
919 vino.snap = le64_to_cpu(ininfo->snapid);
920 in = ceph_get_inode(sb, vino);
921 err = fill_inode(in, &rinfo->targeti, NULL,
922 session, req->r_request_started,
923 req->r_fmode, &req->r_caps_reservation);
924 iput(in);
925 }
926 }
927#endif
928
929 if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
930 dout("fill_trace reply is empty!\n");
931 if (rinfo->head->result == 0 && req->r_locked_dir) {
932 struct ceph_inode_info *ci =
933 ceph_inode(req->r_locked_dir);
934 dout(" clearing %p complete (empty trace)\n",
935 req->r_locked_dir);
936 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
937 ci->i_release_count++;
938 }
939 return 0;
940 }
941
942 if (rinfo->head->is_dentry) {
943 struct inode *dir = req->r_locked_dir;
944
945 err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
946 session, req->r_request_started, -1,
947 &req->r_caps_reservation);
948 if (err < 0)
949 return err;
950 }
951
952 if (rinfo->head->is_dentry && !req->r_aborted) {
953 /*
954 * lookup link rename : null -> possibly existing inode
955 * mknod symlink mkdir : null -> new inode
956 * unlink : linked -> null
957 */
958 struct inode *dir = req->r_locked_dir;
959 struct dentry *dn = req->r_dentry;
960 bool have_dir_cap, have_lease;
961
962 BUG_ON(!dn);
963 BUG_ON(!dir);
964 BUG_ON(dn->d_parent->d_inode != dir);
965 BUG_ON(ceph_ino(dir) !=
966 le64_to_cpu(rinfo->diri.in->ino));
967 BUG_ON(ceph_snap(dir) !=
968 le64_to_cpu(rinfo->diri.in->snapid));
969
970 /* do we have a lease on the whole dir? */
971 have_dir_cap =
972 (le32_to_cpu(rinfo->diri.in->cap.caps) &
973 CEPH_CAP_FILE_SHARED);
974
975 /* do we have a dn lease? */
976 have_lease = have_dir_cap ||
977 (le16_to_cpu(rinfo->dlease->mask) &
978 CEPH_LOCK_DN);
979
980 if (!have_lease)
981 dout("fill_trace no dentry lease or dir cap\n");
982
983 /* rename? */
984 if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
985 dout(" src %p '%.*s' dst %p '%.*s'\n",
986 req->r_old_dentry,
987 req->r_old_dentry->d_name.len,
988 req->r_old_dentry->d_name.name,
989 dn, dn->d_name.len, dn->d_name.name);
990 dout("fill_trace doing d_move %p -> %p\n",
991 req->r_old_dentry, dn);
992 d_move(req->r_old_dentry, dn);
993 dout(" src %p '%.*s' dst %p '%.*s'\n",
994 req->r_old_dentry,
995 req->r_old_dentry->d_name.len,
996 req->r_old_dentry->d_name.name,
997 dn, dn->d_name.len, dn->d_name.name);
998 /* ensure target dentry is invalidated, despite
999 rehashing bug in vfs_rename_dir */
1000 dn->d_time = jiffies;
1001 ceph_dentry(dn)->lease_shared_gen = 0;
1002 /* take overwritten dentry's readdir offset */
1003 ceph_dentry(req->r_old_dentry)->offset =
1004 ceph_dentry(dn)->offset;
1005 dn = req->r_old_dentry; /* use old_dentry */
1006 in = dn->d_inode;
1007 }
1008
1009 /* null dentry? */
1010 if (!rinfo->head->is_target) {
1011 dout("fill_trace null dentry\n");
1012 if (dn->d_inode) {
1013 dout("d_delete %p\n", dn);
1014 d_delete(dn);
1015 } else {
1016 dout("d_instantiate %p NULL\n", dn);
1017 d_instantiate(dn, NULL);
1018 if (have_lease && d_unhashed(dn))
1019 d_rehash(dn);
1020 update_dentry_lease(dn, rinfo->dlease,
1021 session,
1022 req->r_request_started);
1023 }
1024 goto done;
1025 }
1026
1027 /* attach proper inode */
1028 ininfo = rinfo->targeti.in;
1029 vino.ino = le64_to_cpu(ininfo->ino);
1030 vino.snap = le64_to_cpu(ininfo->snapid);
1031 if (!dn->d_inode) {
1032 in = ceph_get_inode(sb, vino);
1033 if (IS_ERR(in)) {
1034 pr_err("fill_trace bad get_inode "
1035 "%llx.%llx\n", vino.ino, vino.snap);
1036 err = PTR_ERR(in);
1037 d_delete(dn);
1038 goto done;
1039 }
1040 dn = splice_dentry(dn, in, &have_lease);
1041 if (IS_ERR(dn)) {
1042 err = PTR_ERR(dn);
1043 goto done;
1044 }
1045 req->r_dentry = dn; /* may have spliced */
1046 ceph_set_dentry_offset(dn);
1047 igrab(in);
1048 } else if (ceph_ino(in) == vino.ino &&
1049 ceph_snap(in) == vino.snap) {
1050 igrab(in);
1051 } else {
1052 dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
1053 dn, in, ceph_ino(in), ceph_snap(in),
1054 vino.ino, vino.snap);
1055 have_lease = false;
1056 in = NULL;
1057 }
1058
1059 if (have_lease)
1060 update_dentry_lease(dn, rinfo->dlease, session,
1061 req->r_request_started);
1062 dout(" final dn %p\n", dn);
1063 i++;
1064 } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
1065 req->r_op == CEPH_MDS_OP_MKSNAP) {
1066 struct dentry *dn = req->r_dentry;
1067
1068 /* fill out a snapdir LOOKUPSNAP dentry */
1069 BUG_ON(!dn);
1070 BUG_ON(!req->r_locked_dir);
1071 BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
1072 ininfo = rinfo->targeti.in;
1073 vino.ino = le64_to_cpu(ininfo->ino);
1074 vino.snap = le64_to_cpu(ininfo->snapid);
1075 in = ceph_get_inode(sb, vino);
1076 if (IS_ERR(in)) {
1077 pr_err("fill_inode get_inode badness %llx.%llx\n",
1078 vino.ino, vino.snap);
1079 err = PTR_ERR(in);
1080 d_delete(dn);
1081 goto done;
1082 }
1083 dout(" linking snapped dir %p to dn %p\n", in, dn);
1084 dn = splice_dentry(dn, in, NULL);
1085 if (IS_ERR(dn)) {
1086 err = PTR_ERR(dn);
1087 goto done;
1088 }
1089 ceph_set_dentry_offset(dn);
1090 req->r_dentry = dn; /* may have spliced */
1091 igrab(in);
1092 rinfo->head->is_dentry = 1; /* fool notrace handlers */
1093 }
1094
1095 if (rinfo->head->is_target) {
1096 vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1097 vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1098
1099 if (in == NULL || ceph_ino(in) != vino.ino ||
1100 ceph_snap(in) != vino.snap) {
1101 in = ceph_get_inode(sb, vino);
1102 if (IS_ERR(in)) {
1103 err = PTR_ERR(in);
1104 goto done;
1105 }
1106 }
1107 req->r_target_inode = in;
1108
1109 err = fill_inode(in,
1110 &rinfo->targeti, NULL,
1111 session, req->r_request_started,
1112 (le32_to_cpu(rinfo->head->result) == 0) ?
1113 req->r_fmode : -1,
1114 &req->r_caps_reservation);
1115 if (err < 0) {
1116 pr_err("fill_inode badness %p %llx.%llx\n",
1117 in, ceph_vinop(in));
1118 goto done;
1119 }
1120 }
1121
1122done:
1123 dout("fill_trace done err=%d\n", err);
1124 return err;
1125}
1126
1127/*
1128 * Prepopulate our cache with readdir results, leases, etc.
1129 */
1130int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1131 struct ceph_mds_session *session)
1132{
1133 struct dentry *parent = req->r_dentry;
1134 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1135 struct qstr dname;
1136 struct dentry *dn;
1137 struct inode *in;
1138 int err = 0, i;
1139 struct inode *snapdir = NULL;
1140 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1141 u64 frag = le32_to_cpu(rhead->args.readdir.frag);
1142 struct ceph_dentry_info *di;
1143
1144 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
1145 snapdir = ceph_get_snapdir(parent->d_inode);
1146 parent = d_find_alias(snapdir);
1147 dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
1148 rinfo->dir_nr, parent);
1149 } else {
1150 dout("readdir_prepopulate %d items under dn %p\n",
1151 rinfo->dir_nr, parent);
1152 if (rinfo->dir_dir)
1153 ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
1154 }
1155
1156 for (i = 0; i < rinfo->dir_nr; i++) {
1157 struct ceph_vino vino;
1158
1159 dname.name = rinfo->dir_dname[i];
1160 dname.len = rinfo->dir_dname_len[i];
1161 dname.hash = full_name_hash(dname.name, dname.len);
1162
1163 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
1164 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
1165
1166retry_lookup:
1167 dn = d_lookup(parent, &dname);
1168 dout("d_lookup on parent=%p name=%.*s got %p\n",
1169 parent, dname.len, dname.name, dn);
1170
1171 if (!dn) {
1172 dn = d_alloc(parent, &dname);
1173 dout("d_alloc %p '%.*s' = %p\n", parent,
1174 dname.len, dname.name, dn);
1175 if (dn == NULL) {
1176 dout("d_alloc badness\n");
1177 err = -ENOMEM;
1178 goto out;
1179 }
1180 err = ceph_init_dentry(dn);
1181 if (err < 0)
1182 goto out;
1183 } else if (dn->d_inode &&
1184 (ceph_ino(dn->d_inode) != vino.ino ||
1185 ceph_snap(dn->d_inode) != vino.snap)) {
1186 dout(" dn %p points to wrong inode %p\n",
1187 dn, dn->d_inode);
1188 d_delete(dn);
1189 dput(dn);
1190 goto retry_lookup;
1191 } else {
1192 /* reorder parent's d_subdirs */
1193 spin_lock(&dcache_lock);
1194 spin_lock(&dn->d_lock);
1195 list_move(&dn->d_u.d_child, &parent->d_subdirs);
1196 spin_unlock(&dn->d_lock);
1197 spin_unlock(&dcache_lock);
1198 }
1199
1200 di = dn->d_fsdata;
1201 di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
1202
1203 /* inode */
1204 if (dn->d_inode) {
1205 in = dn->d_inode;
1206 } else {
1207 in = ceph_get_inode(parent->d_sb, vino);
1208 if (IS_ERR(in)) {
1209 dout("new_inode badness\n");
1210 d_delete(dn);
1211 dput(dn);
1212 err = PTR_ERR(in);
1213 goto out;
1214 }
1215 dn = splice_dentry(dn, in, NULL);
1216 }
1217
1218 if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
1219 req->r_request_started, -1,
1220 &req->r_caps_reservation) < 0) {
1221 pr_err("fill_inode badness on %p\n", in);
1222 dput(dn);
1223 continue;
1224 }
1225 update_dentry_lease(dn, rinfo->dir_dlease[i],
1226 req->r_session, req->r_request_started);
1227 dput(dn);
1228 }
1229 req->r_did_prepopulate = true;
1230
1231out:
1232 if (snapdir) {
1233 iput(snapdir);
1234 dput(parent);
1235 }
1236 dout("readdir_prepopulate done\n");
1237 return err;
1238}
1239
1240int ceph_inode_set_size(struct inode *inode, loff_t size)
1241{
1242 struct ceph_inode_info *ci = ceph_inode(inode);
1243 int ret = 0;
1244
1245 spin_lock(&inode->i_lock);
1246 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
1247 inode->i_size = size;
1248 inode->i_blocks = (size + (1 << 9) - 1) >> 9;
1249
1250 /* tell the MDS if we are approaching max_size */
1251 if ((size << 1) >= ci->i_max_size &&
1252 (ci->i_reported_size << 1) < ci->i_max_size)
1253 ret = 1;
1254
1255 spin_unlock(&inode->i_lock);
1256 return ret;
1257}
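
/*
 * [editor's note] Worked example of the threshold above: with
 * i_max_size = 4 MB and i_reported_size = 0, growing the file past
 * 2 MB makes (size << 1) >= i_max_size while (0 << 1) < i_max_size
 * still holds, so the function returns 1 and the caller reports the
 * new size to the MDS to get more room.
 */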
1258
1259/*
1260 * Write back inode data in a worker thread. (This can't be done
1261 * in the message handler context.)
1262 */
1263void ceph_queue_writeback(struct inode *inode)
1264{
1265 if (queue_work(ceph_inode_to_client(inode)->wb_wq,
1266 &ceph_inode(inode)->i_wb_work)) {
1267 dout("ceph_queue_writeback %p\n", inode);
1268 igrab(inode);
1269 } else {
1270 dout("ceph_queue_writeback %p failed\n", inode);
1271 }
1272}
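
/*
 * [editor's note] Note the refcount pairing here: the igrab() above
 * pins the inode while the work item is in flight, and the matching
 * iput() sits at the end of ceph_writeback_work() below.  The
 * invalidate and vmtruncate queues follow the same pattern.
 */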
1273
1274static void ceph_writeback_work(struct work_struct *work)
1275{
1276 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1277 i_wb_work);
1278 struct inode *inode = &ci->vfs_inode;
1279
1280 dout("writeback %p\n", inode);
1281 filemap_fdatawrite(&inode->i_data);
1282 iput(inode);
1283}
1284
1285/*
1286 * queue an async invalidation
1287 */
1288void ceph_queue_invalidate(struct inode *inode)
1289{
1290 if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
1291 &ceph_inode(inode)->i_pg_inv_work)) {
1292 dout("ceph_queue_invalidate %p\n", inode);
1293 igrab(inode);
1294 } else {
1295 dout("ceph_queue_invalidate %p failed\n", inode);
1296 }
1297}
1298
1299/*
1300 * invalidate any pages that are not dirty or under writeback. this
1301 * includes pages that are clean and mapped.
1302 */
1303static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
1304{
1305 struct pagevec pvec;
1306 pgoff_t next = 0;
1307 int i;
1308
1309 pagevec_init(&pvec, 0);
1310 while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
1311 for (i = 0; i < pagevec_count(&pvec); i++) {
1312 struct page *page = pvec.pages[i];
1313 pgoff_t index;
1314 int skip_page =
1315 (PageDirty(page) || PageWriteback(page));
1316
1317 if (!skip_page)
1318 skip_page = !trylock_page(page);
1319
1320 /*
1321 * We really shouldn't be looking at the ->index of an
1322 * unlocked page. But we're not allowed to lock these
1323 * pages. So we rely upon nobody altering the ->index
1324 * of this (pinned-by-us) page.
1325 */
1326 index = page->index;
1327 if (index > next)
1328 next = index;
1329 next++;
1330
1331 if (skip_page)
1332 continue;
1333
1334 generic_error_remove_page(mapping, page);
1335 unlock_page(page);
1336 }
1337 pagevec_release(&pvec);
1338 cond_resched();
1339 }
1340}
1341
1342/*
1343 * Invalidate inode pages in a worker thread. (This can't be done
1344 * in the message handler context.)
1345 */
1346static void ceph_invalidate_work(struct work_struct *work)
1347{
1348 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1349 i_pg_inv_work);
1350 struct inode *inode = &ci->vfs_inode;
1351 u32 orig_gen;
1352 int check = 0;
1353
1354 spin_lock(&inode->i_lock);
1355 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1356 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1357 if (ci->i_rdcache_gen == 0 ||
1358 ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1359 BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
1360 /* nevermind! */
1361 ci->i_rdcache_revoking = 0;
1362 spin_unlock(&inode->i_lock);
1363 goto out;
1364 }
1365 orig_gen = ci->i_rdcache_gen;
1366 spin_unlock(&inode->i_lock);
1367
1368 ceph_invalidate_nondirty_pages(inode->i_mapping);
1369
1370 spin_lock(&inode->i_lock);
1371 if (orig_gen == ci->i_rdcache_gen) {
1372 dout("invalidate_pages %p gen %d successful\n", inode,
1373 ci->i_rdcache_gen);
1374 ci->i_rdcache_gen = 0;
1375 ci->i_rdcache_revoking = 0;
1376 check = 1;
1377 } else {
1378 dout("invalidate_pages %p gen %d raced, gen now %d\n",
1379 inode, orig_gen, ci->i_rdcache_gen);
1380 }
1381 spin_unlock(&inode->i_lock);
1382
1383 if (check)
1384 ceph_check_caps(ci, 0, NULL);
1385out:
1386 iput(inode);
1387}
1388
1389
1390/*
1391 * called by trunc_wq; take i_mutex ourselves
1392 *
1393 * We also truncate in a separate thread.
1394 */
1395static void ceph_vmtruncate_work(struct work_struct *work)
1396{
1397 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1398 i_vmtruncate_work);
1399 struct inode *inode = &ci->vfs_inode;
1400
1401 dout("vmtruncate_work %p\n", inode);
1402 mutex_lock(&inode->i_mutex);
1403 __ceph_do_pending_vmtruncate(inode);
1404 mutex_unlock(&inode->i_mutex);
1405 iput(inode);
1406}
1407
1408/*
1409 * Queue an async vmtruncate. If we fail to queue work, we will handle
1410 * the truncation the next time we call __ceph_do_pending_vmtruncate.
1411 */
1412void ceph_queue_vmtruncate(struct inode *inode)
1413{
1414 struct ceph_inode_info *ci = ceph_inode(inode);
1415
1416 if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
1417 &ci->i_vmtruncate_work)) {
1418 dout("ceph_queue_vmtruncate %p\n", inode);
1419 igrab(inode);
1420 } else {
1421 dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
1422 inode, ci->i_truncate_pending);
1423 }
1424}
1425
1426/*
1427 * called with i_mutex held.
1428 *
1429 * Make sure any pending truncation is applied before doing anything
1430 * that may depend on it.
1431 */
1432void __ceph_do_pending_vmtruncate(struct inode *inode)
1433{
1434 struct ceph_inode_info *ci = ceph_inode(inode);
1435 u64 to;
1436 int wrbuffer_refs, wake = 0;
1437
1438retry:
1439 spin_lock(&inode->i_lock);
1440 if (ci->i_truncate_pending == 0) {
1441 dout("__do_pending_vmtruncate %p none pending\n", inode);
1442 spin_unlock(&inode->i_lock);
1443 return;
1444 }
1445
1446 /*
1447 * make sure any dirty snapped pages are flushed before we
1448 * possibly truncate them.. so write AND block!
1449 */
1450 if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
1451 dout("__do_pending_vmtruncate %p flushing snaps first\n",
1452 inode);
1453 spin_unlock(&inode->i_lock);
1454 filemap_write_and_wait_range(&inode->i_data, 0,
1455 inode->i_sb->s_maxbytes);
1456 goto retry;
1457 }
1458
1459 to = ci->i_truncate_size;
1460 wrbuffer_refs = ci->i_wrbuffer_ref;
1461 dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
1462 ci->i_truncate_pending, to);
1463 spin_unlock(&inode->i_lock);
1464
1465 truncate_inode_pages(inode->i_mapping, to);
1466
1467 spin_lock(&inode->i_lock);
1468 ci->i_truncate_pending--;
1469 if (ci->i_truncate_pending == 0)
1470 wake = 1;
1471 spin_unlock(&inode->i_lock);
1472
1473 if (wrbuffer_refs == 0)
1474 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
1475 if (wake)
1476 wake_up(&ci->i_cap_wq);
1477}
1478
1479
1480/*
1481 * symlinks
1482 */
1483static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
1484{
1485 struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
1486 nd_set_link(nd, ci->i_symlink);
1487 return NULL;
1488}
1489
1490static const struct inode_operations ceph_symlink_iops = {
1491 .readlink = generic_readlink,
1492 .follow_link = ceph_sym_follow_link,
1493};
1494
1495/*
1496 * setattr
1497 */
1498int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1499{
1500 struct inode *inode = dentry->d_inode;
1501 struct ceph_inode_info *ci = ceph_inode(inode);
1502 struct inode *parent_inode = dentry->d_parent->d_inode;
1503 const unsigned int ia_valid = attr->ia_valid;
1504 struct ceph_mds_request *req;
1505 struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
1506 int issued;
1507 int release = 0, dirtied = 0;
1508 int mask = 0;
1509 int err = 0;
1510
1511 if (ceph_snap(inode) != CEPH_NOSNAP)
1512 return -EROFS;
1513
1514 __ceph_do_pending_vmtruncate(inode);
1515
1516 err = inode_change_ok(inode, attr);
1517 if (err != 0)
1518 return err;
1519
1520 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
1521 USE_AUTH_MDS);
1522 if (IS_ERR(req))
1523 return PTR_ERR(req);
1524
1525 spin_lock(&inode->i_lock);
1526 issued = __ceph_caps_issued(ci, NULL);
1527 dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
1528
1529 if (ia_valid & ATTR_UID) {
1530 dout("setattr %p uid %d -> %d\n", inode,
1531 inode->i_uid, attr->ia_uid);
1532 if (issued & CEPH_CAP_AUTH_EXCL) {
1533 inode->i_uid = attr->ia_uid;
1534 dirtied |= CEPH_CAP_AUTH_EXCL;
1535 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1536 attr->ia_uid != inode->i_uid) {
1537 req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
1538 mask |= CEPH_SETATTR_UID;
1539 release |= CEPH_CAP_AUTH_SHARED;
1540 }
1541 }
1542 if (ia_valid & ATTR_GID) {
1543 dout("setattr %p gid %d -> %d\n", inode,
1544 inode->i_gid, attr->ia_gid);
1545 if (issued & CEPH_CAP_AUTH_EXCL) {
1546 inode->i_gid = attr->ia_gid;
1547 dirtied |= CEPH_CAP_AUTH_EXCL;
1548 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1549 attr->ia_gid != inode->i_gid) {
1550 req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
1551 mask |= CEPH_SETATTR_GID;
1552 release |= CEPH_CAP_AUTH_SHARED;
1553 }
1554 }
1555 if (ia_valid & ATTR_MODE) {
1556 dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
1557 attr->ia_mode);
1558 if (issued & CEPH_CAP_AUTH_EXCL) {
1559 inode->i_mode = attr->ia_mode;
1560 dirtied |= CEPH_CAP_AUTH_EXCL;
1561 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1562 attr->ia_mode != inode->i_mode) {
1563 req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
1564 mask |= CEPH_SETATTR_MODE;
1565 release |= CEPH_CAP_AUTH_SHARED;
1566 }
1567 }
1568
1569 if (ia_valid & ATTR_ATIME) {
1570 dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
1571 inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
1572 attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
1573 if (issued & CEPH_CAP_FILE_EXCL) {
1574 ci->i_time_warp_seq++;
1575 inode->i_atime = attr->ia_atime;
1576 dirtied |= CEPH_CAP_FILE_EXCL;
1577 } else if ((issued & CEPH_CAP_FILE_WR) &&
1578 timespec_compare(&inode->i_atime,
1579 &attr->ia_atime) < 0) {
1580 inode->i_atime = attr->ia_atime;
1581 dirtied |= CEPH_CAP_FILE_WR;
1582 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1583 !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
1584 ceph_encode_timespec(&req->r_args.setattr.atime,
1585 &attr->ia_atime);
1586 mask |= CEPH_SETATTR_ATIME;
1587 release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
1588 CEPH_CAP_FILE_WR;
1589 }
1590 }
1591 if (ia_valid & ATTR_MTIME) {
1592 dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
1593 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
1594 attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
1595 if (issued & CEPH_CAP_FILE_EXCL) {
1596 ci->i_time_warp_seq++;
1597 inode->i_mtime = attr->ia_mtime;
1598 dirtied |= CEPH_CAP_FILE_EXCL;
1599 } else if ((issued & CEPH_CAP_FILE_WR) &&
1600 timespec_compare(&inode->i_mtime,
1601 &attr->ia_mtime) < 0) {
1602 inode->i_mtime = attr->ia_mtime;
1603 dirtied |= CEPH_CAP_FILE_WR;
1604 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1605 !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
1606 ceph_encode_timespec(&req->r_args.setattr.mtime,
1607 &attr->ia_mtime);
1608 mask |= CEPH_SETATTR_MTIME;
1609 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1610 CEPH_CAP_FILE_WR;
1611 }
1612 }
1613 if (ia_valid & ATTR_SIZE) {
1614 dout("setattr %p size %lld -> %lld\n", inode,
1615 inode->i_size, attr->ia_size);
1616 if (attr->ia_size > inode->i_sb->s_maxbytes) {
1617 err = -EINVAL;
1618 goto out;
1619 }
1620 if ((issued & CEPH_CAP_FILE_EXCL) &&
1621 attr->ia_size > inode->i_size) {
1622 inode->i_size = attr->ia_size;
1623 inode->i_blocks =
1624 (attr->ia_size + (1 << 9) - 1) >> 9;
1625 inode->i_ctime = attr->ia_ctime;
1626 ci->i_reported_size = attr->ia_size;
1627 dirtied |= CEPH_CAP_FILE_EXCL;
1628 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1629 attr->ia_size != inode->i_size) {
1630 req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
1631 req->r_args.setattr.old_size =
1632 cpu_to_le64(inode->i_size);
1633 mask |= CEPH_SETATTR_SIZE;
1634 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1635 CEPH_CAP_FILE_WR;
1636 }
1637 }
1638
1639 /* these do nothing */
1640 if (ia_valid & ATTR_CTIME) {
1641 bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
1642 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
1643 dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
1644 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
1645 attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
1646 only ? "ctime only" : "ignored");
1647 inode->i_ctime = attr->ia_ctime;
1648 if (only) {
1649 /*
1650 * if kernel wants to dirty ctime but nothing else,
1651 * we need to choose a cap to dirty under, or do
1652 * an almost-no-op setattr
1653 */
1654 if (issued & CEPH_CAP_AUTH_EXCL)
1655 dirtied |= CEPH_CAP_AUTH_EXCL;
1656 else if (issued & CEPH_CAP_FILE_EXCL)
1657 dirtied |= CEPH_CAP_FILE_EXCL;
1658 else if (issued & CEPH_CAP_XATTR_EXCL)
1659 dirtied |= CEPH_CAP_XATTR_EXCL;
1660 else
1661 mask |= CEPH_SETATTR_CTIME;
1662 }
1663 }
1664 if (ia_valid & ATTR_FILE)
1665 dout("setattr %p ATTR_FILE ... hrm!\n", inode);
1666
1667 if (dirtied) {
1668 __ceph_mark_dirty_caps(ci, dirtied);
1669 inode->i_ctime = CURRENT_TIME;
1670 }
1671
1672 release &= issued;
1673 spin_unlock(&inode->i_lock);
1674
1675 if (mask) {
1676 req->r_inode = igrab(inode);
1677 req->r_inode_drop = release;
1678 req->r_args.setattr.mask = cpu_to_le32(mask);
1679 req->r_num_caps = 1;
1680 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
1681 }
1682 dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
1683 ceph_cap_string(dirtied), mask);
1684
1685 ceph_mdsc_put_request(req);
1686 __ceph_do_pending_vmtruncate(inode);
1687 return err;
1688out:
1689 spin_unlock(&inode->i_lock);
1690 ceph_mdsc_put_request(req);
1691 return err;
1692}
1693
1694/*
1695 * Verify that we have a lease on the given mask. If not,
1696 * do a getattr against an mds.
1697 */
1698int ceph_do_getattr(struct inode *inode, int mask)
1699{
1700 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
1701 struct ceph_mds_client *mdsc = &client->mdsc;
1702 struct ceph_mds_request *req;
1703 int err;
1704
1705 if (ceph_snap(inode) == CEPH_SNAPDIR) {
1706 dout("do_getattr inode %p SNAPDIR\n", inode);
1707 return 0;
1708 }
1709
1710 dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
1711 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
1712 return 0;
1713
1714 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
1715 if (IS_ERR(req))
1716 return PTR_ERR(req);
1717 req->r_inode = igrab(inode);
1718 req->r_num_caps = 1;
1719 req->r_args.getattr.mask = cpu_to_le32(mask);
1720 err = ceph_mdsc_do_request(mdsc, NULL, req);
1721 ceph_mdsc_put_request(req);
1722 dout("do_getattr result=%d\n", err);
1723 return err;
1724}
1725
1726
1727/*
1728 * Check inode permissions. We verify we have a valid value for
1729 * the AUTH cap, then call the generic handler.
1730 */
1731int ceph_permission(struct inode *inode, int mask)
1732{
1733 int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
1734
1735 if (!err)
1736 err = generic_permission(inode, mask, NULL);
1737 return err;
1738}
1739
1740/*
1741 * Get all attributes. Hopefully someday we'll have a statlite()
1742 * and can limit the fields we require to be accurate.
1743 */
1744int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1745 struct kstat *stat)
1746{
1747 struct inode *inode = dentry->d_inode;
1748 struct ceph_inode_info *ci = ceph_inode(inode);
1749 int err;
1750
1751 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
1752 if (!err) {
1753 generic_fillattr(inode, stat);
1754 stat->ino = inode->i_ino;
1755 if (ceph_snap(inode) != CEPH_NOSNAP)
1756 stat->dev = ceph_snap(inode);
1757 else
1758 stat->dev = 0;
1759 if (S_ISDIR(inode->i_mode)) {
1760 stat->size = ci->i_rbytes;
1761 stat->blocks = 0;
1762 stat->blksize = 65536;
1763 }
1764 }
1765 return err;
1766}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
new file mode 100644
index 000000000000..8a5bcae62846
--- /dev/null
+++ b/fs/ceph/ioctl.c
@@ -0,0 +1,160 @@
1#include <linux/in.h>
2
3#include "ioctl.h"
4#include "super.h"
5#include "ceph_debug.h"
6
7
8/*
9 * ioctls
10 */
11
12/*
13 * get and set the file layout
14 */
15static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
16{
17 struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
18 struct ceph_ioctl_layout l;
19 int err;
20
21 err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
22 if (!err) {
23 l.stripe_unit = ceph_file_layout_su(ci->i_layout);
24 l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
25 l.object_size = ceph_file_layout_object_size(ci->i_layout);
26 l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
27 l.preferred_osd =
28 (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
29 if (copy_to_user(arg, &l, sizeof(l)))
30 return -EFAULT;
31 }
32
33 return err;
34}
35
36static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
37{
38 struct inode *inode = file->f_dentry->d_inode;
39 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
40 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
41 struct ceph_mds_request *req;
42 struct ceph_ioctl_layout l;
43 int err, i;
44
45 /* copy and validate */
46 if (copy_from_user(&l, arg, sizeof(l)))
47 return -EFAULT;
48
49 if ((l.object_size & ~PAGE_MASK) ||
50 (l.stripe_unit & ~PAGE_MASK) ||
51 !l.stripe_unit ||
52 (l.object_size &&
53 (unsigned)l.object_size % (unsigned)l.stripe_unit))
54 return -EINVAL;
55
56 /* make sure it's a valid data pool */
57 if (l.data_pool > 0) {
58 mutex_lock(&mdsc->mutex);
59 err = -EINVAL;
60 for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
61 if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
62 err = 0;
63 break;
64 }
65 mutex_unlock(&mdsc->mutex);
66 if (err)
67 return err;
68 }
69
70 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
71 USE_AUTH_MDS);
72 if (IS_ERR(req))
73 return PTR_ERR(req);
74 req->r_inode = igrab(inode);
75 req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
76
77 req->r_args.setlayout.layout.fl_stripe_unit =
78 cpu_to_le32(l.stripe_unit);
79 req->r_args.setlayout.layout.fl_stripe_count =
80 cpu_to_le32(l.stripe_count);
81 req->r_args.setlayout.layout.fl_object_size =
82 cpu_to_le32(l.object_size);
83 req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
84 req->r_args.setlayout.layout.fl_pg_preferred =
85 cpu_to_le32(l.preferred_osd);
86
87 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
88 ceph_mdsc_put_request(req);
89 return err;
90}
91
92/*
93 * Return object name, size/offset information, and location (OSD
94 * number, network address) for a given file offset.
95 */
96static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
97{
98 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
102 u64 len = 1, olen;
103 u64 tmp;
104 struct ceph_object_layout ol;
105 struct ceph_pg pgid;
106
107 /* copy and validate */
108 if (copy_from_user(&dl, arg, sizeof(dl)))
109 return -EFAULT;
110
111 down_read(&osdc->map_sem);
112 ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
113 &dl.object_no, &dl.object_offset, &olen);
114 dl.file_offset -= dl.object_offset;
115 dl.object_size = ceph_file_layout_object_size(ci->i_layout);
116 dl.block_size = ceph_file_layout_su(ci->i_layout);
117
118 /* block_offset = object_offset % block_size */
119 tmp = dl.object_offset;
120 dl.block_offset = do_div(tmp, dl.block_size);
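	/*
	 * [editor's note] do_div(n, base) divides the 64-bit lvalue n in
	 * place, leaving the quotient in n and returning the remainder,
	 * which is why object_offset was first copied into tmp above.
	 */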
121
122 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
123 ceph_ino(inode), dl.object_no);
124 ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
125 osdc->osdmap);
126
127 pgid = ol.ol_pgid;
128 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
129 if (dl.osd >= 0) {
130 struct ceph_entity_addr *a =
131 ceph_osd_addr(osdc->osdmap, dl.osd);
132 if (a)
133 memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
134 } else {
135 memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
136 }
137 up_read(&osdc->map_sem);
138
139 /* send result back to user */
140 if (copy_to_user(arg, &dl, sizeof(dl)))
141 return -EFAULT;
142
143 return 0;
144}
145
146long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
147{
148 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
149 switch (cmd) {
150 case CEPH_IOC_GET_LAYOUT:
151 return ceph_ioctl_get_layout(file, (void __user *)arg);
152
153 case CEPH_IOC_SET_LAYOUT:
154 return ceph_ioctl_set_layout(file, (void __user *)arg);
155
156 case CEPH_IOC_GET_DATALOC:
157 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
158 }
159 return -ENOTTY;
160}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
new file mode 100644
index 000000000000..25e4f1a9d059
--- /dev/null
+++ b/fs/ceph/ioctl.h
@@ -0,0 +1,40 @@
1#ifndef FS_CEPH_IOCTL_H
2#define FS_CEPH_IOCTL_H
3
4#include <linux/ioctl.h>
5#include <linux/types.h>
6
7#define CEPH_IOCTL_MAGIC 0x97
8
9/* just use u64 to align sanely on all archs */
10struct ceph_ioctl_layout {
11 __u64 stripe_unit, stripe_count, object_size;
12 __u64 data_pool;
13 __s64 preferred_osd;
14};
15
16#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \
17 struct ceph_ioctl_layout)
18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
19 struct ceph_ioctl_layout)
20
21/*
22 * Extract identity, address of the OSD and object storing a given
23 * file offset.
24 */
25struct ceph_ioctl_dataloc {
26 __u64 file_offset; /* in+out: file offset */
27 __u64 object_offset; /* out: offset in object */
28 __u64 object_no; /* out: object # */
29 __u64 object_size; /* out: object size */
30 char object_name[64]; /* out: object name */
31 __u64 block_offset; /* out: offset in block */
32 __u64 block_size; /* out: block length */
33 __s64 osd; /* out: osd # */
34 struct sockaddr_storage osd_addr; /* out: osd address */
35};
36
37#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
38 struct ceph_ioctl_dataloc)
39
40#endif
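
[editor's note] A minimal userspace sketch (not part of this commit) showing how the GET_LAYOUT ioctl defined above could be driven; the mount path is hypothetical and error handling is minimal:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "ioctl.h"	/* CEPH_IOC_GET_LAYOUT, struct ceph_ioctl_layout */

int main(int argc, char **argv)
{
	struct ceph_ioctl_layout l;
	int fd = open(argc > 1 ? argv[1] : "/mnt/ceph/file", O_RDONLY);

	if (fd < 0 || ioctl(fd, CEPH_IOC_GET_LAYOUT, &l) < 0) {
		perror("CEPH_IOC_GET_LAYOUT");
		return 1;
	}
	printf("stripe_unit=%llu stripe_count=%llu object_size=%llu pool=%llu\n",
	       (unsigned long long)l.stripe_unit,
	       (unsigned long long)l.stripe_count,
	       (unsigned long long)l.object_size,
	       (unsigned long long)l.data_pool);
	close(fd);
	return 0;
}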
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
new file mode 100644
index 000000000000..5c7920be6420
--- /dev/null
+++ b/fs/ceph/mds_client.c
@@ -0,0 +1,3042 @@
1#include "ceph_debug.h"
2
3#include <linux/wait.h>
4#include <linux/sched.h>
5
6#include "mds_client.h"
7#include "mon_client.h"
8#include "super.h"
9#include "messenger.h"
10#include "decode.h"
11#include "auth.h"
12#include "pagelist.h"
13
14/*
15 * A cluster of MDS (metadata server) daemons is responsible for
16 * managing the file system namespace (the directory hierarchy and
17 * inodes) and for coordinating shared access to storage. Metadata is
18 * partitioned hierarchically across a number of servers, and that
19 * partitioning varies over time as the cluster adjusts the distribution
20 * in order to balance load.
21 *
22 * The MDS client is primarily responsible for managing synchronous
23 * metadata requests for operations like open, unlink, and so forth.
24 * If there is an MDS failure, we find out about it when we (possibly
25 * request and) receive a new MDS map, and can resubmit affected
26 * requests.
27 *
28 * For the most part, though, we take advantage of a lossless
29 * communications channel to the MDS, and do not need to worry about
30 * timing out or resubmitting requests.
31 *
32 * We maintain a stateful "session" with each MDS we interact with.
 33 * Within each session, we send periodic heartbeat messages to ensure
 34 * any capabilities or leases we have been issued remain valid.  If
35 * the session times out and goes stale, our leases and capabilities
36 * are no longer valid.
37 */
38
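/*
 * A rough (not exhaustive) sketch of the session lifecycle, using the
 * states from session_state_name() below:
 *
 *	NEW -> OPENING -> OPEN <-> HUNG
 *	OPEN -> CLOSING                 (umount / explicit close)
 *	OPEN -> RECONNECTING -> OPEN    (mds failure and recovery)
 */
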
39static void __wake_requests(struct ceph_mds_client *mdsc,
40 struct list_head *head);
41
 42static const struct ceph_connection_operations mds_con_ops;
43
44
45/*
46 * mds reply parsing
47 */
48
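/*
 * Overall reply layout (a summary of what parse_reply_info() below
 * expects after the fixed ceph_mds_reply_head): three length-prefixed
 * sections -- a dentry/inode "trace", optional readdir contents, and a
 * snap trace blob -- each introduced by a __le32 byte count.
 */
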
49/*
50 * parse individual inode info
51 */
52static int parse_reply_info_in(void **p, void *end,
53 struct ceph_mds_reply_info_in *info)
54{
55 int err = -EIO;
56
57 info->in = *p;
58 *p += sizeof(struct ceph_mds_reply_inode) +
59 sizeof(*info->in->fragtree.splits) *
60 le32_to_cpu(info->in->fragtree.nsplits);
61
62 ceph_decode_32_safe(p, end, info->symlink_len, bad);
63 ceph_decode_need(p, end, info->symlink_len, bad);
64 info->symlink = *p;
65 *p += info->symlink_len;
66
67 ceph_decode_32_safe(p, end, info->xattr_len, bad);
68 ceph_decode_need(p, end, info->xattr_len, bad);
69 info->xattr_data = *p;
70 *p += info->xattr_len;
71 return 0;
72bad:
73 return err;
74}
75
76/*
77 * parse a normal reply, which may contain a (dir+)dentry and/or a
78 * target inode.
79 */
80static int parse_reply_info_trace(void **p, void *end,
81 struct ceph_mds_reply_info_parsed *info)
82{
83 int err;
84
85 if (info->head->is_dentry) {
86 err = parse_reply_info_in(p, end, &info->diri);
87 if (err < 0)
88 goto out_bad;
89
90 if (unlikely(*p + sizeof(*info->dirfrag) > end))
91 goto bad;
92 info->dirfrag = *p;
93 *p += sizeof(*info->dirfrag) +
94 sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
95 if (unlikely(*p > end))
96 goto bad;
97
98 ceph_decode_32_safe(p, end, info->dname_len, bad);
99 ceph_decode_need(p, end, info->dname_len, bad);
100 info->dname = *p;
101 *p += info->dname_len;
102 info->dlease = *p;
103 *p += sizeof(*info->dlease);
104 }
105
106 if (info->head->is_target) {
107 err = parse_reply_info_in(p, end, &info->targeti);
108 if (err < 0)
109 goto out_bad;
110 }
111
112 if (unlikely(*p != end))
113 goto bad;
114 return 0;
115
116bad:
117 err = -EIO;
118out_bad:
119 pr_err("problem parsing mds trace %d\n", err);
120 return err;
121}
122
123/*
124 * parse readdir results
125 */
126static int parse_reply_info_dir(void **p, void *end,
127 struct ceph_mds_reply_info_parsed *info)
128{
129 u32 num, i = 0;
130 int err;
131
132 info->dir_dir = *p;
133 if (*p + sizeof(*info->dir_dir) > end)
134 goto bad;
135 *p += sizeof(*info->dir_dir) +
136 sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
137 if (*p > end)
138 goto bad;
139
140 ceph_decode_need(p, end, sizeof(num) + 2, bad);
141 num = ceph_decode_32(p);
142 info->dir_end = ceph_decode_8(p);
143 info->dir_complete = ceph_decode_8(p);
144 if (num == 0)
145 goto done;
146
147 /* alloc large array */
148 info->dir_nr = num;
149 info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
150 sizeof(*info->dir_dname) +
151 sizeof(*info->dir_dname_len) +
152 sizeof(*info->dir_dlease),
153 GFP_NOFS);
154 if (info->dir_in == NULL) {
155 err = -ENOMEM;
156 goto out_bad;
157 }
158 info->dir_dname = (void *)(info->dir_in + num);
159 info->dir_dname_len = (void *)(info->dir_dname + num);
160 info->dir_dlease = (void *)(info->dir_dname_len + num);
161
162 while (num) {
163 /* dentry */
164 ceph_decode_need(p, end, sizeof(u32)*2, bad);
165 info->dir_dname_len[i] = ceph_decode_32(p);
166 ceph_decode_need(p, end, info->dir_dname_len[i], bad);
167 info->dir_dname[i] = *p;
168 *p += info->dir_dname_len[i];
169 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
170 info->dir_dname[i]);
171 info->dir_dlease[i] = *p;
172 *p += sizeof(struct ceph_mds_reply_lease);
173
174 /* inode */
175 err = parse_reply_info_in(p, end, &info->dir_in[i]);
176 if (err < 0)
177 goto out_bad;
178 i++;
179 num--;
180 }
181
182done:
183 if (*p != end)
184 goto bad;
185 return 0;
186
187bad:
188 err = -EIO;
189out_bad:
190 pr_err("problem parsing dir contents %d\n", err);
191 return err;
192}
193
194/*
195 * parse entire mds reply
196 */
197static int parse_reply_info(struct ceph_msg *msg,
198 struct ceph_mds_reply_info_parsed *info)
199{
200 void *p, *end;
201 u32 len;
202 int err;
203
204 info->head = msg->front.iov_base;
205 p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
206 end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
207
208 /* trace */
209 ceph_decode_32_safe(&p, end, len, bad);
210 if (len > 0) {
211 err = parse_reply_info_trace(&p, p+len, info);
212 if (err < 0)
213 goto out_bad;
214 }
215
216 /* dir content */
217 ceph_decode_32_safe(&p, end, len, bad);
218 if (len > 0) {
219 err = parse_reply_info_dir(&p, p+len, info);
220 if (err < 0)
221 goto out_bad;
222 }
223
224 /* snap blob */
225 ceph_decode_32_safe(&p, end, len, bad);
226 info->snapblob_len = len;
227 info->snapblob = p;
228 p += len;
229
230 if (p != end)
231 goto bad;
232 return 0;
233
234bad:
235 err = -EIO;
236out_bad:
237 pr_err("mds parse_reply err %d\n", err);
238 return err;
239}
240
241static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
242{
243 kfree(info->dir_in);
244}
245
246
247/*
248 * sessions
249 */
250static const char *session_state_name(int s)
251{
252 switch (s) {
253 case CEPH_MDS_SESSION_NEW: return "new";
254 case CEPH_MDS_SESSION_OPENING: return "opening";
255 case CEPH_MDS_SESSION_OPEN: return "open";
256 case CEPH_MDS_SESSION_HUNG: return "hung";
257 case CEPH_MDS_SESSION_CLOSING: return "closing";
258 case CEPH_MDS_SESSION_RESTARTING: return "restarting";
259 case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
260 default: return "???";
261 }
262}
263
264static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
265{
266 if (atomic_inc_not_zero(&s->s_ref)) {
267 dout("mdsc get_session %p %d -> %d\n", s,
268 atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
269 return s;
270 } else {
271		dout("mdsc get_session %p 0 -- FAIL\n", s);
272 return NULL;
273 }
274}
275
276void ceph_put_mds_session(struct ceph_mds_session *s)
277{
278 dout("mdsc put_session %p %d -> %d\n", s,
279 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
280 if (atomic_dec_and_test(&s->s_ref)) {
281 if (s->s_authorizer)
282 s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
283 s->s_mdsc->client->monc.auth, s->s_authorizer);
284 kfree(s);
285 }
286}
287
288/*
289 * called under mdsc->mutex
290 */
291struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
292 int mds)
293{
294 struct ceph_mds_session *session;
295
296 if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
297 return NULL;
298 session = mdsc->sessions[mds];
299 dout("lookup_mds_session %p %d\n", session,
300 atomic_read(&session->s_ref));
301 get_session(session);
302 return session;
303}
304
305static bool __have_session(struct ceph_mds_client *mdsc, int mds)
306{
307 if (mds >= mdsc->max_sessions)
308 return false;
309 return mdsc->sessions[mds];
310}
311
312static int __verify_registered_session(struct ceph_mds_client *mdsc,
313 struct ceph_mds_session *s)
314{
315 if (s->s_mds >= mdsc->max_sessions ||
316 mdsc->sessions[s->s_mds] != s)
317 return -ENOENT;
318 return 0;
319}
320
321/*
322 * create+register a new session for given mds.
323 * called under mdsc->mutex.
324 */
325static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
326 int mds)
327{
328 struct ceph_mds_session *s;
329
330 s = kzalloc(sizeof(*s), GFP_NOFS);
331 if (!s)
332 return ERR_PTR(-ENOMEM);
333 s->s_mdsc = mdsc;
334 s->s_mds = mds;
335 s->s_state = CEPH_MDS_SESSION_NEW;
336 s->s_ttl = 0;
337 s->s_seq = 0;
338 mutex_init(&s->s_mutex);
339
340 ceph_con_init(mdsc->client->msgr, &s->s_con);
341 s->s_con.private = s;
342 s->s_con.ops = &mds_con_ops;
343 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
344 s->s_con.peer_name.num = cpu_to_le64(mds);
345
346 spin_lock_init(&s->s_cap_lock);
347 s->s_cap_gen = 0;
348 s->s_cap_ttl = 0;
349 s->s_renew_requested = 0;
350 s->s_renew_seq = 0;
351 INIT_LIST_HEAD(&s->s_caps);
352 s->s_nr_caps = 0;
353 s->s_trim_caps = 0;
354 atomic_set(&s->s_ref, 1);
355 INIT_LIST_HEAD(&s->s_waiting);
356 INIT_LIST_HEAD(&s->s_unsafe);
357 s->s_num_cap_releases = 0;
358 s->s_cap_iterator = NULL;
359 INIT_LIST_HEAD(&s->s_cap_releases);
360 INIT_LIST_HEAD(&s->s_cap_releases_done);
361 INIT_LIST_HEAD(&s->s_cap_flushing);
362 INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
363
364 dout("register_session mds%d\n", mds);
365 if (mds >= mdsc->max_sessions) {
366 int newmax = 1 << get_count_order(mds+1);
367 struct ceph_mds_session **sa;
368
369 dout("register_session realloc to %d\n", newmax);
370 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
371 if (sa == NULL)
372 goto fail_realloc;
373 if (mdsc->sessions) {
374 memcpy(sa, mdsc->sessions,
375 mdsc->max_sessions * sizeof(void *));
376 kfree(mdsc->sessions);
377 }
378 mdsc->sessions = sa;
379 mdsc->max_sessions = newmax;
380 }
381 mdsc->sessions[mds] = s;
382 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
383
384 ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
385
386 return s;
387
388fail_realloc:
389 kfree(s);
390 return ERR_PTR(-ENOMEM);
391}
392
393/*
394 * called under mdsc->mutex
395 */
396static void __unregister_session(struct ceph_mds_client *mdsc,
397 struct ceph_mds_session *s)
398{
399 dout("__unregister_session mds%d %p\n", s->s_mds, s);
400 BUG_ON(mdsc->sessions[s->s_mds] != s);
401 mdsc->sessions[s->s_mds] = NULL;
402 ceph_con_close(&s->s_con);
403 ceph_put_mds_session(s);
404}
405
406/*
407 * drop session refs in request.
408 *
409 * should be last request ref, or hold mdsc->mutex
410 */
411static void put_request_session(struct ceph_mds_request *req)
412{
413 if (req->r_session) {
414 ceph_put_mds_session(req->r_session);
415 req->r_session = NULL;
416 }
417}
418
419void ceph_mdsc_release_request(struct kref *kref)
420{
421 struct ceph_mds_request *req = container_of(kref,
422 struct ceph_mds_request,
423 r_kref);
424 if (req->r_request)
425 ceph_msg_put(req->r_request);
426 if (req->r_reply) {
427 ceph_msg_put(req->r_reply);
428 destroy_reply_info(&req->r_reply_info);
429 }
430 if (req->r_inode) {
431 ceph_put_cap_refs(ceph_inode(req->r_inode),
432 CEPH_CAP_PIN);
433 iput(req->r_inode);
434 }
435 if (req->r_locked_dir)
436 ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
437 CEPH_CAP_PIN);
438 if (req->r_target_inode)
439 iput(req->r_target_inode);
440 if (req->r_dentry)
441 dput(req->r_dentry);
442 if (req->r_old_dentry) {
443 ceph_put_cap_refs(
444 ceph_inode(req->r_old_dentry->d_parent->d_inode),
445 CEPH_CAP_PIN);
446 dput(req->r_old_dentry);
447 }
448 kfree(req->r_path1);
449 kfree(req->r_path2);
450 put_request_session(req);
451 ceph_unreserve_caps(&req->r_caps_reservation);
452 kfree(req);
453}
454
455/*
456 * lookup request, bump ref if found.
457 *
458 * called under mdsc->mutex.
459 */
460static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
461 u64 tid)
462{
463 struct ceph_mds_request *req;
464 struct rb_node *n = mdsc->request_tree.rb_node;
465
466 while (n) {
467 req = rb_entry(n, struct ceph_mds_request, r_node);
468 if (tid < req->r_tid)
469 n = n->rb_left;
470 else if (tid > req->r_tid)
471 n = n->rb_right;
472 else {
473 ceph_mdsc_get_request(req);
474 return req;
475 }
476 }
477 return NULL;
478}
479
480static void __insert_request(struct ceph_mds_client *mdsc,
481 struct ceph_mds_request *new)
482{
483 struct rb_node **p = &mdsc->request_tree.rb_node;
484 struct rb_node *parent = NULL;
485 struct ceph_mds_request *req = NULL;
486
487 while (*p) {
488 parent = *p;
489 req = rb_entry(parent, struct ceph_mds_request, r_node);
490 if (new->r_tid < req->r_tid)
491 p = &(*p)->rb_left;
492 else if (new->r_tid > req->r_tid)
493 p = &(*p)->rb_right;
494 else
495 BUG();
496 }
497
498 rb_link_node(&new->r_node, parent, p);
499 rb_insert_color(&new->r_node, &mdsc->request_tree);
500}
501
502/*
503 * Register an in-flight request, and assign a tid. Link it to the
504 * directory we are modifying (if any).
505 *
506 * Called under mdsc->mutex.
507 */
508static void __register_request(struct ceph_mds_client *mdsc,
509 struct ceph_mds_request *req,
510 struct inode *dir)
511{
512 req->r_tid = ++mdsc->last_tid;
513 if (req->r_num_caps)
514 ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
515 dout("__register_request %p tid %lld\n", req, req->r_tid);
516 ceph_mdsc_get_request(req);
517 __insert_request(mdsc, req);
518
519 if (dir) {
520 struct ceph_inode_info *ci = ceph_inode(dir);
521
522 spin_lock(&ci->i_unsafe_lock);
523 req->r_unsafe_dir = dir;
524 list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
525 spin_unlock(&ci->i_unsafe_lock);
526 }
527}
528
529static void __unregister_request(struct ceph_mds_client *mdsc,
530 struct ceph_mds_request *req)
531{
532 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
533 rb_erase(&req->r_node, &mdsc->request_tree);
534 RB_CLEAR_NODE(&req->r_node);
535
536 if (req->r_unsafe_dir) {
537 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
538
539 spin_lock(&ci->i_unsafe_lock);
540 list_del_init(&req->r_unsafe_dir_item);
541 spin_unlock(&ci->i_unsafe_lock);
542 }
543
544 ceph_mdsc_put_request(req);
545}
546
547/*
548 * Choose mds to send request to next. If there is a hint set in the
549 * request (e.g., due to a prior forward hint from the mds), use that.
550 * Otherwise, consult frag tree and/or caps to identify the
551 * appropriate mds. If all else fails, choose randomly.
552 *
553 * Called under mdsc->mutex.
554 */
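/*
 * Selection order, as a sketch of the logic below:
 *	1. the r_resend_mds hint, if usable
 *	2. for dir fragments: a random replica (USE_ANY_MDS), or the
 *	   fragment's auth mds
 *	3. the auth cap's session (USE_AUTH_MDS), else any cap's session
 *	4. a random mds from the mdsmap
 */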
555static int __choose_mds(struct ceph_mds_client *mdsc,
556 struct ceph_mds_request *req)
557{
558 struct inode *inode;
559 struct ceph_inode_info *ci;
560 struct ceph_cap *cap;
561 int mode = req->r_direct_mode;
562 int mds = -1;
563 u32 hash = req->r_direct_hash;
564 bool is_hash = req->r_direct_is_hash;
565
566 /*
567 * is there a specific mds we should try? ignore hint if we have
568 * no session and the mds is not up (active or recovering).
569 */
570 if (req->r_resend_mds >= 0 &&
571 (__have_session(mdsc, req->r_resend_mds) ||
572 ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
573 dout("choose_mds using resend_mds mds%d\n",
574 req->r_resend_mds);
575 return req->r_resend_mds;
576 }
577
578 if (mode == USE_RANDOM_MDS)
579 goto random;
580
581 inode = NULL;
582 if (req->r_inode) {
583 inode = req->r_inode;
584 } else if (req->r_dentry) {
585 if (req->r_dentry->d_inode) {
586 inode = req->r_dentry->d_inode;
587 } else {
588 inode = req->r_dentry->d_parent->d_inode;
589 hash = req->r_dentry->d_name.hash;
590 is_hash = true;
591 }
592 }
593 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
594 (int)hash, mode);
595 if (!inode)
596 goto random;
597 ci = ceph_inode(inode);
598
599 if (is_hash && S_ISDIR(inode->i_mode)) {
600 struct ceph_inode_frag frag;
601 int found;
602
603 ceph_choose_frag(ci, hash, &frag, &found);
604 if (found) {
605 if (mode == USE_ANY_MDS && frag.ndist > 0) {
606 u8 r;
607
608 /* choose a random replica */
609 get_random_bytes(&r, 1);
610 r %= frag.ndist;
611 mds = frag.dist[r];
612 dout("choose_mds %p %llx.%llx "
613 "frag %u mds%d (%d/%d)\n",
614 inode, ceph_vinop(inode),
615 frag.frag, frag.mds,
616 (int)r, frag.ndist);
617 return mds;
618 }
619
620 /* since this file/dir wasn't known to be
621 * replicated, then we want to look for the
622 * authoritative mds. */
623 mode = USE_AUTH_MDS;
624 if (frag.mds >= 0) {
625 /* choose auth mds */
626 mds = frag.mds;
627 dout("choose_mds %p %llx.%llx "
628 "frag %u mds%d (auth)\n",
629 inode, ceph_vinop(inode), frag.frag, mds);
630 return mds;
631 }
632 }
633 }
634
635 spin_lock(&inode->i_lock);
636 cap = NULL;
637 if (mode == USE_AUTH_MDS)
638 cap = ci->i_auth_cap;
639 if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
640 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
641 if (!cap) {
642 spin_unlock(&inode->i_lock);
643 goto random;
644 }
645 mds = cap->session->s_mds;
646 dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
647 inode, ceph_vinop(inode), mds,
648 cap == ci->i_auth_cap ? "auth " : "", cap);
649 spin_unlock(&inode->i_lock);
650 return mds;
651
652random:
653 mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
654 dout("choose_mds chose random mds%d\n", mds);
655 return mds;
656}
657
658
659/*
660 * session messages
661 */
662static struct ceph_msg *create_session_msg(u32 op, u64 seq)
663{
664 struct ceph_msg *msg;
665 struct ceph_mds_session_head *h;
666
667 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
668 if (IS_ERR(msg)) {
669 pr_err("create_session_msg ENOMEM creating msg\n");
670		return msg;
671 }
672 h = msg->front.iov_base;
673 h->op = cpu_to_le32(op);
674 h->seq = cpu_to_le64(seq);
675 return msg;
676}
677
678/*
679 * send session open request.
680 *
681 * called under mdsc->mutex
682 */
683static int __open_session(struct ceph_mds_client *mdsc,
684 struct ceph_mds_session *session)
685{
686 struct ceph_msg *msg;
687 int mstate;
688 int mds = session->s_mds;
689 int err = 0;
690
691 /* wait for mds to go active? */
692 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
693 dout("open_session to mds%d (%s)\n", mds,
694 ceph_mds_state_name(mstate));
695 session->s_state = CEPH_MDS_SESSION_OPENING;
696 session->s_renew_requested = jiffies;
697
698 /* send connect message */
699 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
700 if (IS_ERR(msg)) {
701 err = PTR_ERR(msg);
702 goto out;
703 }
704 ceph_con_send(&session->s_con, msg);
705
706out:
707	return err;
708}
709
710/*
711 * session caps
712 */
713
714/*
715 * Free preallocated cap messages assigned to this session
716 */
717static void cleanup_cap_releases(struct ceph_mds_session *session)
718{
719 struct ceph_msg *msg;
720
721 spin_lock(&session->s_cap_lock);
722 while (!list_empty(&session->s_cap_releases)) {
723 msg = list_first_entry(&session->s_cap_releases,
724 struct ceph_msg, list_head);
725 list_del_init(&msg->list_head);
726 ceph_msg_put(msg);
727 }
728 while (!list_empty(&session->s_cap_releases_done)) {
729 msg = list_first_entry(&session->s_cap_releases_done,
730 struct ceph_msg, list_head);
731 list_del_init(&msg->list_head);
732 ceph_msg_put(msg);
733 }
734 spin_unlock(&session->s_cap_lock);
735}
736
737/*
738 * Helper to safely iterate over all caps associated with a session.
739 *
740 * caller must hold session s_mutex
741 */
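/*
 * Locking sketch: s_cap_lock is dropped around each callback; the
 * current position is pinned via s_cap_iterator and its inode via
 * igrab(), while the previous iteration's inode reference (and any
 * cap whose removal we completed) is put outside the spinlock.
 */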
742static int iterate_session_caps(struct ceph_mds_session *session,
743 int (*cb)(struct inode *, struct ceph_cap *,
744 void *), void *arg)
745{
746 struct list_head *p;
747 struct ceph_cap *cap;
748 struct inode *inode, *last_inode = NULL;
749 struct ceph_cap *old_cap = NULL;
750 int ret;
751
752 dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
753 spin_lock(&session->s_cap_lock);
754 p = session->s_caps.next;
755 while (p != &session->s_caps) {
756 cap = list_entry(p, struct ceph_cap, session_caps);
757 inode = igrab(&cap->ci->vfs_inode);
758 if (!inode) {
759 p = p->next;
760 continue;
761 }
762 session->s_cap_iterator = cap;
763 spin_unlock(&session->s_cap_lock);
764
765 if (last_inode) {
766 iput(last_inode);
767 last_inode = NULL;
768 }
769 if (old_cap) {
770 ceph_put_cap(old_cap);
771 old_cap = NULL;
772 }
773
774 ret = cb(inode, cap, arg);
775 last_inode = inode;
776
777 spin_lock(&session->s_cap_lock);
778 p = p->next;
779 if (cap->ci == NULL) {
780 dout("iterate_session_caps finishing cap %p removal\n",
781 cap);
782 BUG_ON(cap->session != session);
783 list_del_init(&cap->session_caps);
784 session->s_nr_caps--;
785 cap->session = NULL;
786 old_cap = cap; /* put_cap it w/o locks held */
787 }
788 if (ret < 0)
789 goto out;
790 }
791 ret = 0;
792out:
793 session->s_cap_iterator = NULL;
794 spin_unlock(&session->s_cap_lock);
795
796 if (last_inode)
797 iput(last_inode);
798 if (old_cap)
799 ceph_put_cap(old_cap);
800
801 return ret;
802}
803
804static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
805 void *arg)
806{
807 struct ceph_inode_info *ci = ceph_inode(inode);
808 dout("removing cap %p, ci is %p, inode is %p\n",
809 cap, ci, &ci->vfs_inode);
810 ceph_remove_cap(cap);
811 return 0;
812}
813
814/*
815 * caller must hold session s_mutex
816 */
817static void remove_session_caps(struct ceph_mds_session *session)
818{
819 dout("remove_session_caps on %p\n", session);
820 iterate_session_caps(session, remove_session_caps_cb, NULL);
821 BUG_ON(session->s_nr_caps > 0);
822 cleanup_cap_releases(session);
823}
824
825/*
826 * wake up any threads waiting on this session's caps. if the cap is
827 * old (didn't get renewed on the client reconnect), remove it now.
828 *
829 * caller must hold s_mutex.
830 */
831static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
832 void *arg)
833{
834 struct ceph_inode_info *ci = ceph_inode(inode);
835
836 wake_up(&ci->i_cap_wq);
837 if (arg) {
838 spin_lock(&inode->i_lock);
839 ci->i_wanted_max_size = 0;
840 ci->i_requested_max_size = 0;
841 spin_unlock(&inode->i_lock);
842 }
843 return 0;
844}
845
846static void wake_up_session_caps(struct ceph_mds_session *session,
847 int reconnect)
848{
849 dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
850 iterate_session_caps(session, wake_up_session_cb,
851 (void *)(unsigned long)reconnect);
852}
853
854/*
855 * Send periodic message to MDS renewing all currently held caps. The
856 * ack will reset the expiration for all caps from this session.
857 *
858 * caller holds s_mutex
859 */
860static int send_renew_caps(struct ceph_mds_client *mdsc,
861 struct ceph_mds_session *session)
862{
863 struct ceph_msg *msg;
864 int state;
865
866 if (time_after_eq(jiffies, session->s_cap_ttl) &&
867 time_after_eq(session->s_cap_ttl, session->s_renew_requested))
868 pr_info("mds%d caps stale\n", session->s_mds);
869 session->s_renew_requested = jiffies;
870
871 /* do not try to renew caps until a recovering mds has reconnected
872 * with its clients. */
873 state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
874 if (state < CEPH_MDS_STATE_RECONNECT) {
875 dout("send_renew_caps ignoring mds%d (%s)\n",
876 session->s_mds, ceph_mds_state_name(state));
877 return 0;
878 }
879
880 dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
881 ceph_mds_state_name(state));
882 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
883 ++session->s_renew_seq);
884 if (IS_ERR(msg))
885 return PTR_ERR(msg);
886 ceph_con_send(&session->s_con, msg);
887 return 0;
888}
889
890/*
891 * Note new cap ttl, and any transition from stale -> not stale (fresh?).
892 *
893 * Called under session->s_mutex
894 */
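/*
 * Note the ttl arithmetic below: expiry is measured from when the
 * renewal was *requested*, not when the ack arrived, so a slow round
 * trip cannot stretch the lease beyond what the mds actually granted.
 */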
895static void renewed_caps(struct ceph_mds_client *mdsc,
896 struct ceph_mds_session *session, int is_renew)
897{
898 int was_stale;
899 int wake = 0;
900
901 spin_lock(&session->s_cap_lock);
902 was_stale = is_renew && (session->s_cap_ttl == 0 ||
903 time_after_eq(jiffies, session->s_cap_ttl));
904
905 session->s_cap_ttl = session->s_renew_requested +
906 mdsc->mdsmap->m_session_timeout*HZ;
907
908 if (was_stale) {
909 if (time_before(jiffies, session->s_cap_ttl)) {
910 pr_info("mds%d caps renewed\n", session->s_mds);
911 wake = 1;
912 } else {
913 pr_info("mds%d caps still stale\n", session->s_mds);
914 }
915 }
916 dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
917 session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
918	     time_before(jiffies, session->s_cap_ttl) ? "fresh" : "stale");
919 spin_unlock(&session->s_cap_lock);
920
921 if (wake)
922 wake_up_session_caps(session, 0);
923}
924
925/*
926 * send a session close request
927 */
928static int request_close_session(struct ceph_mds_client *mdsc,
929 struct ceph_mds_session *session)
930{
931 struct ceph_msg *msg;
932 int err = 0;
933
934 dout("request_close_session mds%d state %s seq %lld\n",
935 session->s_mds, session_state_name(session->s_state),
936 session->s_seq);
937 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
938 if (IS_ERR(msg))
939 err = PTR_ERR(msg);
940 else
941 ceph_con_send(&session->s_con, msg);
942 return err;
943}
944
945/*
946 * Called with s_mutex held.
947 */
948static int __close_session(struct ceph_mds_client *mdsc,
949 struct ceph_mds_session *session)
950{
951 if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
952 return 0;
953 session->s_state = CEPH_MDS_SESSION_CLOSING;
954 return request_close_session(mdsc, session);
955}
956
957/*
958 * Trim old(er) caps.
959 *
960 * Because we can't cache an inode without one or more caps, we do
961 * this indirectly: if a cap is unused, we prune its aliases, at which
962 * point the inode will hopefully get dropped, too.
963 *
964 * Yes, this is a bit sloppy. Our only real goal here is to respond to
965 * memory pressure from the MDS, though, so it needn't be perfect.
966 */
967static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
968{
969 struct ceph_mds_session *session = arg;
970 struct ceph_inode_info *ci = ceph_inode(inode);
971 int used, oissued, mine;
972
973 if (session->s_trim_caps <= 0)
974 return -1;
975
976 spin_lock(&inode->i_lock);
977 mine = cap->issued | cap->implemented;
978 used = __ceph_caps_used(ci);
979 oissued = __ceph_caps_issued_other(ci, cap);
980
981 dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
982 inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
983 ceph_cap_string(used));
984 if (ci->i_dirty_caps)
985 goto out; /* dirty caps */
986 if ((used & ~oissued) & mine)
987 goto out; /* we need these caps */
988
989 session->s_trim_caps--;
990 if (oissued) {
991 /* we aren't the only cap.. just remove us */
992 __ceph_remove_cap(cap);
993 } else {
994 /* try to drop referring dentries */
995 spin_unlock(&inode->i_lock);
996 d_prune_aliases(inode);
997 dout("trim_caps_cb %p cap %p pruned, count now %d\n",
998 inode, cap, atomic_read(&inode->i_count));
999 return 0;
1000 }
1001
1002out:
1003 spin_unlock(&inode->i_lock);
1004 return 0;
1005}
1006
1007/*
1008 * Trim session cap count down to some max number.
1009 */
1010static int trim_caps(struct ceph_mds_client *mdsc,
1011 struct ceph_mds_session *session,
1012 int max_caps)
1013{
1014 int trim_caps = session->s_nr_caps - max_caps;
1015
1016 dout("trim_caps mds%d start: %d / %d, trim %d\n",
1017 session->s_mds, session->s_nr_caps, max_caps, trim_caps);
1018 if (trim_caps > 0) {
1019 session->s_trim_caps = trim_caps;
1020 iterate_session_caps(session, trim_caps_cb, session);
1021 dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
1022 session->s_mds, session->s_nr_caps, max_caps,
1023 trim_caps - session->s_trim_caps);
1024 session->s_trim_caps = 0;
1025 }
1026 return 0;
1027}
1028
1029/*
1030 * Allocate cap_release messages. If there is a partially full message
1031 * in the queue, try to allocate enough to cover its remainder, so that
1032 * we can send it immediately.
1033 *
1034 * Called under s_mutex.
1035 */
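/*
 * Accounting sketch: s_num_cap_releases counts the release slots still
 * free across all queued messages; we keep allocating page-sized
 * messages until it covers s_nr_caps plus the requested slack ("extra").
 */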
1036static int add_cap_releases(struct ceph_mds_client *mdsc,
1037 struct ceph_mds_session *session,
1038 int extra)
1039{
1040 struct ceph_msg *msg;
1041 struct ceph_mds_cap_release *head;
1042 int err = -ENOMEM;
1043
1044 if (extra < 0)
1045 extra = mdsc->client->mount_args->cap_release_safety;
1046
1047 spin_lock(&session->s_cap_lock);
1048
1049 if (!list_empty(&session->s_cap_releases)) {
1050 msg = list_first_entry(&session->s_cap_releases,
1051 struct ceph_msg,
1052 list_head);
1053 head = msg->front.iov_base;
1054 extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1055 }
1056
1057 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1058 spin_unlock(&session->s_cap_lock);
1059 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1060 0, 0, NULL);
1061 if (!msg)
1062 goto out_unlocked;
1063 dout("add_cap_releases %p msg %p now %d\n", session, msg,
1064 (int)msg->front.iov_len);
1065 head = msg->front.iov_base;
1066 head->num = cpu_to_le32(0);
1067 msg->front.iov_len = sizeof(*head);
1068 spin_lock(&session->s_cap_lock);
1069 list_add(&msg->list_head, &session->s_cap_releases);
1070 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
1071 }
1072
1073 if (!list_empty(&session->s_cap_releases)) {
1074 msg = list_first_entry(&session->s_cap_releases,
1075 struct ceph_msg,
1076 list_head);
1077 head = msg->front.iov_base;
1078 if (head->num) {
1079 dout(" queueing non-full %p (%d)\n", msg,
1080 le32_to_cpu(head->num));
1081 list_move_tail(&msg->list_head,
1082 &session->s_cap_releases_done);
1083 session->s_num_cap_releases -=
1084 CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1085 }
1086 }
1087 err = 0;
1088 spin_unlock(&session->s_cap_lock);
1089out_unlocked:
1090 return err;
1091}
1092
1093/*
1094 * check whether all dirty inode data has been flushed to the mds.
1095 *
1096 * returns true if we've flushed through want_flush_seq
1097 */
1098static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1099{
1100 int mds, ret = 1;
1101
1102 dout("check_cap_flush want %lld\n", want_flush_seq);
1103 mutex_lock(&mdsc->mutex);
1104 for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
1105 struct ceph_mds_session *session = mdsc->sessions[mds];
1106
1107 if (!session)
1108 continue;
1109 get_session(session);
1110 mutex_unlock(&mdsc->mutex);
1111
1112 mutex_lock(&session->s_mutex);
1113 if (!list_empty(&session->s_cap_flushing)) {
1114 struct ceph_inode_info *ci =
1115 list_entry(session->s_cap_flushing.next,
1116 struct ceph_inode_info,
1117 i_flushing_item);
1118 struct inode *inode = &ci->vfs_inode;
1119
1120 spin_lock(&inode->i_lock);
1121 if (ci->i_cap_flush_seq <= want_flush_seq) {
1122 dout("check_cap_flush still flushing %p "
1123 "seq %lld <= %lld to mds%d\n", inode,
1124 ci->i_cap_flush_seq, want_flush_seq,
1125 session->s_mds);
1126 ret = 0;
1127 }
1128 spin_unlock(&inode->i_lock);
1129 }
1130 mutex_unlock(&session->s_mutex);
1131 ceph_put_mds_session(session);
1132
1133 if (!ret)
1134 return ret;
1135 mutex_lock(&mdsc->mutex);
1136 }
1137
1138 mutex_unlock(&mdsc->mutex);
1139 dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
1140 return ret;
1141}
1142
1143/*
1144 * called under s_mutex
1145 */
1146static void send_cap_releases(struct ceph_mds_client *mdsc,
1147 struct ceph_mds_session *session)
1148{
1149 struct ceph_msg *msg;
1150
1151 dout("send_cap_releases mds%d\n", session->s_mds);
1152 while (1) {
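		/* note: the list-empty path below breaks out with s_cap_lock
		 * still held; the unlock after the loop pairs with that */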
1153 spin_lock(&session->s_cap_lock);
1154 if (list_empty(&session->s_cap_releases_done))
1155 break;
1156 msg = list_first_entry(&session->s_cap_releases_done,
1157 struct ceph_msg, list_head);
1158 list_del_init(&msg->list_head);
1159 spin_unlock(&session->s_cap_lock);
1160 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1161 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1162 ceph_con_send(&session->s_con, msg);
1163 }
1164 spin_unlock(&session->s_cap_lock);
1165}
1166
1167/*
1168 * requests
1169 */
1170
1171/*
1172 * Create an mds request.
1173 */
1174struct ceph_mds_request *
1175ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1176{
1177 struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
1178
1179 if (!req)
1180 return ERR_PTR(-ENOMEM);
1181
1182 req->r_started = jiffies;
1183 req->r_resend_mds = -1;
1184 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
1185 req->r_fmode = -1;
1186 kref_init(&req->r_kref);
1187 INIT_LIST_HEAD(&req->r_wait);
1188 init_completion(&req->r_completion);
1189 init_completion(&req->r_safe_completion);
1190 INIT_LIST_HEAD(&req->r_unsafe_item);
1191
1192 req->r_op = op;
1193 req->r_direct_mode = mode;
1194 return req;
1195}
1196
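/*
 * Typical caller pattern (a sketch only; the op, mode, and dentry
 * setup vary by caller):
 *
 *	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUP,
 *				       USE_ANY_MDS);
 *	if (!IS_ERR(req)) {
 *		req->r_dentry = dget(dentry);
 *		req->r_num_caps = 2;
 *		err = ceph_mdsc_do_request(mdsc, dir, req);
 *		ceph_mdsc_put_request(req);
 *	}
 */
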
1197/*
1198 * return the oldest (lowest tid) request in the request tree, or NULL.
1199 *
1200 * called under mdsc->mutex.
1201 */
1202static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
1203{
1204 if (RB_EMPTY_ROOT(&mdsc->request_tree))
1205 return NULL;
1206 return rb_entry(rb_first(&mdsc->request_tree),
1207 struct ceph_mds_request, r_node);
1208}
1209
1210static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
1211{
1212 struct ceph_mds_request *req = __get_oldest_req(mdsc);
1213
1214 if (req)
1215 return req->r_tid;
1216 return 0;
1217}
1218
1219/*
1220 * Build a dentry's path. Allocate on heap; caller must kfree. Based
1221 * on build_path_from_dentry in fs/cifs/dir.c.
1222 *
1223 * If @stop_on_nosnap, generate path relative to the first non-snapped
1224 * inode.
1225 *
1226 * Encode hidden .snap dirs as a double /, i.e.
1227 * foo/.snap/bar -> foo//bar
1228 */
1229char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
1230 int stop_on_nosnap)
1231{
1232 struct dentry *temp;
1233 char *path;
1234 int len, pos;
1235
1236 if (dentry == NULL)
1237 return ERR_PTR(-EINVAL);
1238
1239retry:
1240 len = 0;
1241 for (temp = dentry; !IS_ROOT(temp);) {
1242 struct inode *inode = temp->d_inode;
1243 if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
1244 len++; /* slash only */
1245 else if (stop_on_nosnap && inode &&
1246 ceph_snap(inode) == CEPH_NOSNAP)
1247 break;
1248 else
1249 len += 1 + temp->d_name.len;
1250 temp = temp->d_parent;
1251 if (temp == NULL) {
1252 pr_err("build_path_dentry corrupt dentry %p\n", dentry);
1253 return ERR_PTR(-EINVAL);
1254 }
1255 }
1256 if (len)
1257 len--; /* no leading '/' */
1258
1259 path = kmalloc(len+1, GFP_NOFS);
1260 if (path == NULL)
1261 return ERR_PTR(-ENOMEM);
1262 pos = len;
1263 path[pos] = 0; /* trailing null */
1264 for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
1265 struct inode *inode = temp->d_inode;
1266
1267 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1268 dout("build_path_dentry path+%d: %p SNAPDIR\n",
1269 pos, temp);
1270 } else if (stop_on_nosnap && inode &&
1271 ceph_snap(inode) == CEPH_NOSNAP) {
1272 break;
1273 } else {
1274 pos -= temp->d_name.len;
1275 if (pos < 0)
1276 break;
1277 strncpy(path + pos, temp->d_name.name,
1278 temp->d_name.len);
1279 dout("build_path_dentry path+%d: %p '%.*s'\n",
1280 pos, temp, temp->d_name.len, path + pos);
1281 }
1282 if (pos)
1283 path[--pos] = '/';
1284 temp = temp->d_parent;
1285 if (temp == NULL) {
1286 pr_err("build_path_dentry corrupt dentry\n");
1287 kfree(path);
1288 return ERR_PTR(-EINVAL);
1289 }
1290 }
1291 if (pos != 0) {
1292 pr_err("build_path_dentry did not end path lookup where "
1293 "expected, namelen is %d, pos is %d\n", len, pos);
1294 /* presumably this is only possible if racing with a
1295 rename of one of the parent directories (we can not
1296 lock the dentries above us to prevent this, but
1297 retrying should be harmless) */
1298 kfree(path);
1299 goto retry;
1300 }
1301
1302 *base = ceph_ino(temp->d_inode);
1303 *plen = len;
1304 dout("build_path_dentry on %p %d built %llx '%.*s'\n",
1305 dentry, atomic_read(&dentry->d_count), *base, len, path);
1306 return path;
1307}
1308
1309static int build_dentry_path(struct dentry *dentry,
1310 const char **ppath, int *ppathlen, u64 *pino,
1311 int *pfreepath)
1312{
1313 char *path;
1314
1315 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
1316 *pino = ceph_ino(dentry->d_parent->d_inode);
1317 *ppath = dentry->d_name.name;
1318 *ppathlen = dentry->d_name.len;
1319 return 0;
1320 }
1321 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1322 if (IS_ERR(path))
1323 return PTR_ERR(path);
1324 *ppath = path;
1325 *pfreepath = 1;
1326 return 0;
1327}
1328
1329static int build_inode_path(struct inode *inode,
1330 const char **ppath, int *ppathlen, u64 *pino,
1331 int *pfreepath)
1332{
1333 struct dentry *dentry;
1334 char *path;
1335
1336 if (ceph_snap(inode) == CEPH_NOSNAP) {
1337 *pino = ceph_ino(inode);
1338 *ppathlen = 0;
1339 return 0;
1340 }
1341 dentry = d_find_alias(inode);
1342 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1343 dput(dentry);
1344 if (IS_ERR(path))
1345 return PTR_ERR(path);
1346 *ppath = path;
1347 *pfreepath = 1;
1348 return 0;
1349}
1350
1351/*
1352 * request arguments may be specified via an inode *, a dentry *, or
1353 * an explicit ino+path.
1354 */
1355static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1356 const char *rpath, u64 rino,
1357 const char **ppath, int *pathlen,
1358 u64 *ino, int *freepath)
1359{
1360 int r = 0;
1361
1362 if (rinode) {
1363 r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
1364 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
1365 ceph_snap(rinode));
1366 } else if (rdentry) {
1367 r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
1368 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
1369 *ppath);
1370 } else if (rpath) {
1371 *ino = rino;
1372 *ppath = rpath;
1373 *pathlen = strlen(rpath);
1374 dout(" path %.*s\n", *pathlen, rpath);
1375 }
1376
1377 return r;
1378}
1379
1380/*
1381 * called under mdsc->mutex
1382 */
1383static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1384 struct ceph_mds_request *req,
1385 int mds)
1386{
1387 struct ceph_msg *msg;
1388 struct ceph_mds_request_head *head;
1389 const char *path1 = NULL;
1390 const char *path2 = NULL;
1391 u64 ino1 = 0, ino2 = 0;
1392 int pathlen1 = 0, pathlen2 = 0;
1393 int freepath1 = 0, freepath2 = 0;
1394 int len;
1395 u16 releases;
1396 void *p, *end;
1397 int ret;
1398
1399 ret = set_request_path_attr(req->r_inode, req->r_dentry,
1400 req->r_path1, req->r_ino1.ino,
1401 &path1, &pathlen1, &ino1, &freepath1);
1402 if (ret < 0) {
1403 msg = ERR_PTR(ret);
1404 goto out;
1405 }
1406
1407 ret = set_request_path_attr(NULL, req->r_old_dentry,
1408 req->r_path2, req->r_ino2.ino,
1409 &path2, &pathlen2, &ino2, &freepath2);
1410 if (ret < 0) {
1411 msg = ERR_PTR(ret);
1412 goto out_free1;
1413 }
1414
1415 len = sizeof(*head) +
1416 pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
1417
1418 /* calculate (max) length for cap releases */
1419 len += sizeof(struct ceph_mds_request_release) *
1420 (!!req->r_inode_drop + !!req->r_dentry_drop +
1421 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
1422 if (req->r_dentry_drop)
1423 len += req->r_dentry->d_name.len;
1424 if (req->r_old_dentry_drop)
1425 len += req->r_old_dentry->d_name.len;
1426
1427 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
1428 if (IS_ERR(msg))
1429 goto out_free2;
1430
1431 msg->hdr.tid = cpu_to_le64(req->r_tid);
1432
1433 head = msg->front.iov_base;
1434 p = msg->front.iov_base + sizeof(*head);
1435 end = msg->front.iov_base + msg->front.iov_len;
1436
1437 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
1438 head->op = cpu_to_le32(req->r_op);
1439 head->caller_uid = cpu_to_le32(current_fsuid());
1440 head->caller_gid = cpu_to_le32(current_fsgid());
1441 head->args = req->r_args;
1442
1443 ceph_encode_filepath(&p, end, ino1, path1);
1444 ceph_encode_filepath(&p, end, ino2, path2);
1445
1446 /* cap releases */
1447 releases = 0;
1448 if (req->r_inode_drop)
1449 releases += ceph_encode_inode_release(&p,
1450 req->r_inode ? req->r_inode : req->r_dentry->d_inode,
1451 mds, req->r_inode_drop, req->r_inode_unless, 0);
1452 if (req->r_dentry_drop)
1453 releases += ceph_encode_dentry_release(&p, req->r_dentry,
1454 mds, req->r_dentry_drop, req->r_dentry_unless);
1455 if (req->r_old_dentry_drop)
1456 releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
1457 mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
1458 if (req->r_old_inode_drop)
1459 releases += ceph_encode_inode_release(&p,
1460 req->r_old_dentry->d_inode,
1461 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
1462 head->num_releases = cpu_to_le16(releases);
1463
1464 BUG_ON(p > end);
1465 msg->front.iov_len = p - msg->front.iov_base;
1466 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1467
1468 msg->pages = req->r_pages;
1469 msg->nr_pages = req->r_num_pages;
1470 msg->hdr.data_len = cpu_to_le32(req->r_data_len);
1471 msg->hdr.data_off = cpu_to_le16(0);
1472
1473out_free2:
1474 if (freepath2)
1475 kfree((char *)path2);
1476out_free1:
1477 if (freepath1)
1478 kfree((char *)path1);
1479out:
1480 return msg;
1481}
1482
1483/*
1484 * called under mdsc->mutex on error; called with no mutex held
1485 * on success.
1486 */
1487static void complete_request(struct ceph_mds_client *mdsc,
1488 struct ceph_mds_request *req)
1489{
1490 if (req->r_callback)
1491 req->r_callback(mdsc, req);
1492 else
1493 complete(&req->r_completion);
1494}
1495
1496/*
1497 * called under mdsc->mutex
1498 */
1499static int __prepare_send_request(struct ceph_mds_client *mdsc,
1500 struct ceph_mds_request *req,
1501 int mds)
1502{
1503 struct ceph_mds_request_head *rhead;
1504 struct ceph_msg *msg;
1505 int flags = 0;
1506
1507 req->r_mds = mds;
1508 req->r_attempts++;
1509 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
1510 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1511
1512 if (req->r_request) {
1513 ceph_msg_put(req->r_request);
1514 req->r_request = NULL;
1515 }
1516 msg = create_request_message(mdsc, req, mds);
1517 if (IS_ERR(msg)) {
1518 req->r_reply = ERR_PTR(PTR_ERR(msg));
1519 complete_request(mdsc, req);
1520		return PTR_ERR(msg);
1521 }
1522 req->r_request = msg;
1523
1524 rhead = msg->front.iov_base;
1525 rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
1526 if (req->r_got_unsafe)
1527 flags |= CEPH_MDS_FLAG_REPLAY;
1528 if (req->r_locked_dir)
1529 flags |= CEPH_MDS_FLAG_WANT_DENTRY;
1530 rhead->flags = cpu_to_le32(flags);
1531 rhead->num_fwd = req->r_num_fwd;
1532 rhead->num_retry = req->r_attempts - 1;
1533
1534 dout(" r_locked_dir = %p\n", req->r_locked_dir);
1535
1536 if (req->r_target_inode && req->r_got_unsafe)
1537 rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
1538 else
1539 rhead->ino = 0;
1540 return 0;
1541}
1542
1543/*
1544 * send request, or put it on the appropriate wait list.
1545 */
1546static int __do_request(struct ceph_mds_client *mdsc,
1547 struct ceph_mds_request *req)
1548{
1549 struct ceph_mds_session *session = NULL;
1550 int mds = -1;
1551 int err = -EAGAIN;
1552
1553 if (req->r_reply)
1554 goto out;
1555
1556 if (req->r_timeout &&
1557 time_after_eq(jiffies, req->r_started + req->r_timeout)) {
1558 dout("do_request timed out\n");
1559 err = -EIO;
1560 goto finish;
1561 }
1562
1563 mds = __choose_mds(mdsc, req);
1564 if (mds < 0 ||
1565 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
1566 dout("do_request no mds or not active, waiting for map\n");
1567 list_add(&req->r_wait, &mdsc->waiting_for_map);
1568 goto out;
1569 }
1570
1571 /* get, open session */
1572 session = __ceph_lookup_mds_session(mdsc, mds);
1573 if (!session) {
1574 session = register_session(mdsc, mds);
1575 if (IS_ERR(session)) {
1576 err = PTR_ERR(session);
1577 goto finish;
1578 }
1579 }
1580 dout("do_request mds%d session %p state %s\n", mds, session,
1581 session_state_name(session->s_state));
1582 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
1583 session->s_state != CEPH_MDS_SESSION_HUNG) {
1584 if (session->s_state == CEPH_MDS_SESSION_NEW ||
1585 session->s_state == CEPH_MDS_SESSION_CLOSING)
1586 __open_session(mdsc, session);
1587 list_add(&req->r_wait, &session->s_waiting);
1588 goto out_session;
1589 }
1590
1591 /* send request */
1592 req->r_session = get_session(session);
1593 req->r_resend_mds = -1; /* forget any previous mds hint */
1594
1595 if (req->r_request_started == 0) /* note request start time */
1596 req->r_request_started = jiffies;
1597
1598 err = __prepare_send_request(mdsc, req, mds);
1599 if (!err) {
1600 ceph_msg_get(req->r_request);
1601 ceph_con_send(&session->s_con, req->r_request);
1602 }
1603
1604out_session:
1605 ceph_put_mds_session(session);
1606out:
1607 return err;
1608
1609finish:
1610 req->r_reply = ERR_PTR(err);
1611 complete_request(mdsc, req);
1612 goto out;
1613}
1614
1615/*
1616 * called under mdsc->mutex
1617 */
1618static void __wake_requests(struct ceph_mds_client *mdsc,
1619 struct list_head *head)
1620{
1621 struct ceph_mds_request *req, *nreq;
1622
1623 list_for_each_entry_safe(req, nreq, head, r_wait) {
1624 list_del_init(&req->r_wait);
1625 __do_request(mdsc, req);
1626 }
1627}
1628
1629/*
1630 * Wake up threads with requests pending for @mds, so that they can
1631 * resubmit their requests to a possibly different mds. If @all is set,
1632 * wake up if their requests have been forwarded to @mds, too.
1633 */
1634static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
1635{
1636 struct ceph_mds_request *req;
1637 struct rb_node *p;
1638
1639 dout("kick_requests mds%d\n", mds);
1640 for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
1641 req = rb_entry(p, struct ceph_mds_request, r_node);
1642 if (req->r_got_unsafe)
1643 continue;
1644 if (req->r_session &&
1645 req->r_session->s_mds == mds) {
1646 dout(" kicking tid %llu\n", req->r_tid);
1647 put_request_session(req);
1648 __do_request(mdsc, req);
1649 }
1650 }
1651}
1652
1653void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
1654 struct ceph_mds_request *req)
1655{
1656 dout("submit_request on %p\n", req);
1657 mutex_lock(&mdsc->mutex);
1658 __register_request(mdsc, req, NULL);
1659 __do_request(mdsc, req);
1660 mutex_unlock(&mdsc->mutex);
1661}
1662
1663/*
1664 * Synchronously perform an mds request. Take care of all of the
1665 * session setup, forwarding, and retry details.
1666 */
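/*
 * Wait-side sketch: the caller sleeps on r_completion.  With r_timeout
 * set, expiry is mapped to -EIO; a signal aborts the request with
 * -ERESTARTSYS, and a write op's locked parent dir has I_COMPLETE
 * cleared, since the mds may still apply the operation.
 */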
1667int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1668 struct inode *dir,
1669 struct ceph_mds_request *req)
1670{
1671 int err;
1672
1673 dout("do_request on %p\n", req);
1674
1675 /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
1676 if (req->r_inode)
1677 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
1678 if (req->r_locked_dir)
1679 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
1680 if (req->r_old_dentry)
1681 ceph_get_cap_refs(
1682 ceph_inode(req->r_old_dentry->d_parent->d_inode),
1683 CEPH_CAP_PIN);
1684
1685 /* issue */
1686 mutex_lock(&mdsc->mutex);
1687 __register_request(mdsc, req, dir);
1688 __do_request(mdsc, req);
1689
1690 /* wait */
1691 if (!req->r_reply) {
1692 mutex_unlock(&mdsc->mutex);
1693 if (req->r_timeout) {
1694 err = (long)wait_for_completion_interruptible_timeout(
1695 &req->r_completion, req->r_timeout);
1696 if (err == 0)
1697 req->r_reply = ERR_PTR(-EIO);
1698 else if (err < 0)
1699 req->r_reply = ERR_PTR(err);
1700 } else {
1701 err = wait_for_completion_interruptible(
1702 &req->r_completion);
1703 if (err)
1704 req->r_reply = ERR_PTR(err);
1705 }
1706 mutex_lock(&mdsc->mutex);
1707 }
1708
1709 if (IS_ERR(req->r_reply)) {
1710 err = PTR_ERR(req->r_reply);
1711 req->r_reply = NULL;
1712
1713 if (err == -ERESTARTSYS) {
1714 /* aborted */
1715 req->r_aborted = true;
1716
1717 if (req->r_locked_dir &&
1718 (req->r_op & CEPH_MDS_OP_WRITE)) {
1719 struct ceph_inode_info *ci =
1720 ceph_inode(req->r_locked_dir);
1721
1722 dout("aborted, clearing I_COMPLETE on %p\n",
1723 req->r_locked_dir);
1724 spin_lock(&req->r_locked_dir->i_lock);
1725 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1726 ci->i_release_count++;
1727 spin_unlock(&req->r_locked_dir->i_lock);
1728 }
1729 } else {
1730 /* clean up this request */
1731 __unregister_request(mdsc, req);
1732 if (!list_empty(&req->r_unsafe_item))
1733 list_del_init(&req->r_unsafe_item);
1734 complete(&req->r_safe_completion);
1735 }
1736 } else if (req->r_err) {
1737 err = req->r_err;
1738 } else {
1739 err = le32_to_cpu(req->r_reply_info.head->result);
1740 }
1741 mutex_unlock(&mdsc->mutex);
1742
1743 dout("do_request %p done, result %d\n", req, err);
1744 return err;
1745}
1746
1747/*
1748 * Handle mds reply.
1749 *
1750 * We take the session mutex and parse and process the reply immediately.
1751 * This preserves the logical ordering of replies, capabilities, etc., sent
1752 * by the MDS as they are applied to our local cache.
1753 */
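/*
 * Flow sketch: find the request by tid under mdsc->mutex, screen out
 * duplicate and stale (-ESTALE) replies, then parse the trace and
 * apply it to the cache under the session mutex before waking the
 * original caller.
 */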
1754static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1755{
1756 struct ceph_mds_client *mdsc = session->s_mdsc;
1757 struct ceph_mds_request *req;
1758 struct ceph_mds_reply_head *head = msg->front.iov_base;
1759 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
1760 u64 tid;
1761 int err, result;
1762 int mds = session->s_mds;
1763
1764 if (msg->front.iov_len < sizeof(*head)) {
1765 pr_err("mdsc_handle_reply got corrupt (short) reply\n");
1766 ceph_msg_dump(msg);
1767 return;
1768 }
1769
1770 /* get request, session */
1771 tid = le64_to_cpu(msg->hdr.tid);
1772 mutex_lock(&mdsc->mutex);
1773 req = __lookup_request(mdsc, tid);
1774 if (!req) {
1775 dout("handle_reply on unknown tid %llu\n", tid);
1776 mutex_unlock(&mdsc->mutex);
1777 return;
1778 }
1779 dout("handle_reply %p\n", req);
1780
1781 /* correct session? */
1782 if (req->r_session != session) {
1783 pr_err("mdsc_handle_reply got %llu on session mds%d"
1784 " not mds%d\n", tid, session->s_mds,
1785 req->r_session ? req->r_session->s_mds : -1);
1786 mutex_unlock(&mdsc->mutex);
1787 goto out;
1788 }
1789
1790 /* dup? */
1791 if ((req->r_got_unsafe && !head->safe) ||
1792 (req->r_got_safe && head->safe)) {
1793 pr_warning("got a dup %s reply on %llu from mds%d\n",
1794 head->safe ? "safe" : "unsafe", tid, mds);
1795 mutex_unlock(&mdsc->mutex);
1796 goto out;
1797 }
1798
1799 result = le32_to_cpu(head->result);
1800
1801 /*
1802 * Tolerate 2 consecutive ESTALEs from the same mds.
1803 * FIXME: we should be looking at the cap migrate_seq.
1804 */
1805 if (result == -ESTALE) {
1806 req->r_direct_mode = USE_AUTH_MDS;
1807 req->r_num_stale++;
1808 if (req->r_num_stale <= 2) {
1809 __do_request(mdsc, req);
1810 mutex_unlock(&mdsc->mutex);
1811 goto out;
1812 }
1813 } else {
1814 req->r_num_stale = 0;
1815 }
1816
1817 if (head->safe) {
1818 req->r_got_safe = true;
1819 __unregister_request(mdsc, req);
1820 complete(&req->r_safe_completion);
1821
1822 if (req->r_got_unsafe) {
1823 /*
1824 * We already handled the unsafe response, now do the
1825 * cleanup. No need to examine the response; the MDS
1826 * doesn't include any result info in the safe
1827 * response. And even if it did, there is nothing
1828 * useful we could do with a revised return value.
1829 */
1830 dout("got safe reply %llu, mds%d\n", tid, mds);
1831 list_del_init(&req->r_unsafe_item);
1832
1833 /* last unsafe request during umount? */
1834 if (mdsc->stopping && !__get_oldest_req(mdsc))
1835 complete(&mdsc->safe_umount_waiters);
1836 mutex_unlock(&mdsc->mutex);
1837 goto out;
1838 }
1839 }
1840
1841 BUG_ON(req->r_reply);
1842
1843 if (!head->safe) {
1844 req->r_got_unsafe = true;
1845 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1846 }
1847
1848 dout("handle_reply tid %lld result %d\n", tid, result);
1849 rinfo = &req->r_reply_info;
1850 err = parse_reply_info(msg, rinfo);
1851 mutex_unlock(&mdsc->mutex);
1852
1853 mutex_lock(&session->s_mutex);
1854 if (err < 0) {
1855 pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
1856 ceph_msg_dump(msg);
1857 goto out_err;
1858 }
1859
1860 /* snap trace */
1861 if (rinfo->snapblob_len) {
1862 down_write(&mdsc->snap_rwsem);
1863 ceph_update_snap_trace(mdsc, rinfo->snapblob,
1864 rinfo->snapblob + rinfo->snapblob_len,
1865 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
1866 downgrade_write(&mdsc->snap_rwsem);
1867 } else {
1868 down_read(&mdsc->snap_rwsem);
1869 }
1870
1871 /* insert trace into our cache */
1872 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1873 if (err == 0) {
1874 if (result == 0 && rinfo->dir_nr)
1875 ceph_readdir_prepopulate(req, req->r_session);
1876 ceph_unreserve_caps(&req->r_caps_reservation);
1877 }
1878
1879 up_read(&mdsc->snap_rwsem);
1880out_err:
1881 if (err) {
1882 req->r_err = err;
1883 } else {
1884 req->r_reply = msg;
1885 ceph_msg_get(msg);
1886 }
1887
1888 add_cap_releases(mdsc, req->r_session, -1);
1889 mutex_unlock(&session->s_mutex);
1890
1891 /* kick calling process */
1892 complete_request(mdsc, req);
1893out:
1894 ceph_mdsc_put_request(req);
1895 return;
1896}
1897
1898
1899
1900/*
1901 * handle mds notification that our request has been forwarded.
1902 */
1903static void handle_forward(struct ceph_mds_client *mdsc,
1904 struct ceph_mds_session *session,
1905 struct ceph_msg *msg)
1906{
1907 struct ceph_mds_request *req;
1908 u64 tid = le64_to_cpu(msg->hdr.tid);
1909 u32 next_mds;
1910 u32 fwd_seq;
1911 int err = -EINVAL;
1912 void *p = msg->front.iov_base;
1913 void *end = p + msg->front.iov_len;
1914
1915 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1916 next_mds = ceph_decode_32(&p);
1917 fwd_seq = ceph_decode_32(&p);
1918
1919 mutex_lock(&mdsc->mutex);
1920 req = __lookup_request(mdsc, tid);
1921 if (!req) {
1922 dout("forward %llu to mds%d - req dne\n", tid, next_mds);
1923 goto out; /* dup reply? */
1924 }
1925
1926 if (fwd_seq <= req->r_num_fwd) {
1927 dout("forward %llu to mds%d - old seq %d <= %d\n",
1928 tid, next_mds, req->r_num_fwd, fwd_seq);
1929 } else {
1930 /* resend. forward race not possible; mds would drop */
1931 dout("forward %llu to mds%d (we resend)\n", tid, next_mds);
1932 req->r_num_fwd = fwd_seq;
1933 req->r_resend_mds = next_mds;
1934 put_request_session(req);
1935 __do_request(mdsc, req);
1936 }
1937 ceph_mdsc_put_request(req);
1938out:
1939 mutex_unlock(&mdsc->mutex);
1940 return;
1941
1942bad:
1943 pr_err("mdsc_handle_forward decode error err=%d\n", err);
1944}
1945
1946/*
1947 * handle an mds session control message
1948 */
1949static void handle_session(struct ceph_mds_session *session,
1950 struct ceph_msg *msg)
1951{
1952 struct ceph_mds_client *mdsc = session->s_mdsc;
1953 u32 op;
1954 u64 seq;
1955 int mds = session->s_mds;
1956 struct ceph_mds_session_head *h = msg->front.iov_base;
1957 int wake = 0;
1958
1959 /* decode */
1960 if (msg->front.iov_len != sizeof(*h))
1961 goto bad;
1962 op = le32_to_cpu(h->op);
1963 seq = le64_to_cpu(h->seq);
1964
1965 mutex_lock(&mdsc->mutex);
1966 if (op == CEPH_SESSION_CLOSE)
1967 __unregister_session(mdsc, session);
1968 /* FIXME: this ttl calculation is generous */
1969 session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
1970 mutex_unlock(&mdsc->mutex);
1971
1972 mutex_lock(&session->s_mutex);
1973
1974 dout("handle_session mds%d %s %p state %s seq %llu\n",
1975 mds, ceph_session_op_name(op), session,
1976 session_state_name(session->s_state), seq);
1977
1978 if (session->s_state == CEPH_MDS_SESSION_HUNG) {
1979 session->s_state = CEPH_MDS_SESSION_OPEN;
1980 pr_info("mds%d came back\n", session->s_mds);
1981 }
1982
1983 switch (op) {
1984 case CEPH_SESSION_OPEN:
1985 session->s_state = CEPH_MDS_SESSION_OPEN;
1986 renewed_caps(mdsc, session, 0);
1987 wake = 1;
1988 if (mdsc->stopping)
1989 __close_session(mdsc, session);
1990 break;
1991
1992 case CEPH_SESSION_RENEWCAPS:
1993 if (session->s_renew_seq == seq)
1994 renewed_caps(mdsc, session, 1);
1995 break;
1996
1997 case CEPH_SESSION_CLOSE:
1998 remove_session_caps(session);
1999 wake = 1; /* for good measure */
2000 complete(&mdsc->session_close_waiters);
2001 kick_requests(mdsc, mds, 0); /* cur only */
2002 break;
2003
2004 case CEPH_SESSION_STALE:
2005 pr_info("mds%d caps went stale, renewing\n",
2006 session->s_mds);
2007 spin_lock(&session->s_cap_lock);
2008 session->s_cap_gen++;
2009 session->s_cap_ttl = 0;
2010 spin_unlock(&session->s_cap_lock);
2011 send_renew_caps(mdsc, session);
2012 break;
2013
2014 case CEPH_SESSION_RECALL_STATE:
2015 trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
2016 break;
2017
2018 default:
2019 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
2020 WARN_ON(1);
2021 }
2022
2023 mutex_unlock(&session->s_mutex);
2024 if (wake) {
2025 mutex_lock(&mdsc->mutex);
2026 __wake_requests(mdsc, &session->s_waiting);
2027 mutex_unlock(&mdsc->mutex);
2028 }
2029 return;
2030
2031bad:
2032 pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
2033 (int)msg->front.iov_len);
2034 ceph_msg_dump(msg);
2035 return;
2036}
2037
2038
2039/*
2040 * called under session->mutex.
2041 */
2042static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2043 struct ceph_mds_session *session)
2044{
2045 struct ceph_mds_request *req, *nreq;
2046 int err;
2047
2048 dout("replay_unsafe_requests mds%d\n", session->s_mds);
2049
2050 mutex_lock(&mdsc->mutex);
2051 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
2052 err = __prepare_send_request(mdsc, req, session->s_mds);
2053 if (!err) {
2054 ceph_msg_get(req->r_request);
2055 ceph_con_send(&session->s_con, req->r_request);
2056 }
2057 }
2058 mutex_unlock(&mdsc->mutex);
2059}
2060
2061/*
2062 * Encode information about a cap for a reconnect with the MDS.
2063 */
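/*
 * Per-cap encoding sketch, matching the appends below: ino (u64), a
 * length-prefixed path string, then a ceph_mds_cap_reconnect record;
 * the cap's seq and issue_seq are reset beforehand.
 */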
2064static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2065 void *arg)
2066{
2067 struct ceph_mds_cap_reconnect rec;
2068 struct ceph_inode_info *ci;
2069 struct ceph_pagelist *pagelist = arg;
2070 char *path;
2071 int pathlen, err;
2072 u64 pathbase;
2073 struct dentry *dentry;
2074
2075 ci = cap->ci;
2076
2077 dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
2078 inode, ceph_vinop(inode), cap, cap->cap_id,
2079 ceph_cap_string(cap->issued));
2080 err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
2081 if (err)
2082 return err;
2083
2084 dentry = d_find_alias(inode);
2085 if (dentry) {
2086 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2087 if (IS_ERR(path)) {
2088 err = PTR_ERR(path);
2089 BUG_ON(err);
2090 }
2091 } else {
2092 path = NULL;
2093 pathlen = 0;
	pathbase = 0;	/* rec.pathbase below must not read uninitialized stack */
2094 }
2095 err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2096 if (err)
2097 goto out;
2098
2099 spin_lock(&inode->i_lock);
2100 cap->seq = 0; /* reset cap seq */
2101 cap->issue_seq = 0; /* and issue_seq */
2102 rec.cap_id = cpu_to_le64(cap->cap_id);
2103 rec.pathbase = cpu_to_le64(pathbase);
2104 rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2105 rec.issued = cpu_to_le32(cap->issued);
2106 rec.size = cpu_to_le64(inode->i_size);
2107 ceph_encode_timespec(&rec.mtime, &inode->i_mtime);
2108 ceph_encode_timespec(&rec.atime, &inode->i_atime);
2109 rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2110 spin_unlock(&inode->i_lock);
2111
2112 err = ceph_pagelist_append(pagelist, &rec, sizeof(rec));
2113
2114out:
2115 kfree(path);
2116 dput(dentry);
2117 return err;
2118}
2119
2120
2121/*
2122 * If an MDS fails and recovers, clients need to reconnect in order to
2123 * reestablish shared state. This includes all caps issued through
2124 * this session _and_ the snap_realm hierarchy. Because it's not
2125 * clear which snap realms the mds cares about, we send everything we
2126 * know about.. that ensures we'll then get any new info the
2127 * recovering MDS might have.
2128 *
2129 * This is a relatively heavyweight operation, but it's rare.
2130 *
2131 * called with mdsc->mutex held.
2132 */
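/*
 * Reader's sketch of the payload built below (derived from the code,
 * not a normative wire spec): a u32 cap count, then for each cap an
 * ino (u64), a path string, and a struct ceph_mds_cap_reconnect
 * record (see encode_caps_cb()), followed by one
 * struct ceph_mds_snaprealm_reconnect per realm we know about.
 */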
2133static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2134{
2135 struct ceph_mds_session *session = NULL;
2136 struct ceph_msg *reply;
2137 struct rb_node *p;
2138 int err;
2139 struct ceph_pagelist *pagelist;
2140
2141 pr_info("reconnect to recovering mds%d\n", mds);
2142
2143 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2144 if (!pagelist)
2145 goto fail_nopagelist;
2146 ceph_pagelist_init(pagelist);
2147
2148 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL);
2149 if (IS_ERR(reply)) {
2150 err = PTR_ERR(reply);
2151 goto fail_nomsg;
2152 }
2153
2154 /* find session */
2155 session = __ceph_lookup_mds_session(mdsc, mds);
2156 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2157
2158 if (session) {
2159 mutex_lock(&session->s_mutex);
2160
2161 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2162 session->s_seq = 0;
2163
2164 ceph_con_open(&session->s_con,
2165 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2166
2167 /* replay unsafe requests */
2168 replay_unsafe_requests(mdsc, session);
2169 } else {
2170 dout("no session for mds%d, will send short reconnect\n",
2171 mds);
2172 }
2173
2174 down_read(&mdsc->snap_rwsem);
2175
2176 if (!session)
2177 goto send;
2178 dout("session %p state %s\n", session,
2179 session_state_name(session->s_state));
2180
2181 /* traverse this session's caps */
2182 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2183 if (err)
2184 goto fail;
2185 err = iterate_session_caps(session, encode_caps_cb, pagelist);
2186 if (err < 0)
2187 goto out;
2188
2189 /*
2190 * snaprealms. we provide mds with the ino, seq (version), and
2191 * parent for all of our realms. If the mds has any newer info,
2192 * it will tell us.
2193 */
2194 for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
2195 struct ceph_snap_realm *realm =
2196 rb_entry(p, struct ceph_snap_realm, node);
2197 struct ceph_mds_snaprealm_reconnect sr_rec;
2198
2199 dout(" adding snap realm %llx seq %lld parent %llx\n",
2200 realm->ino, realm->seq, realm->parent_ino);
2201 sr_rec.ino = cpu_to_le64(realm->ino);
2202 sr_rec.seq = cpu_to_le64(realm->seq);
2203 sr_rec.parent = cpu_to_le64(realm->parent_ino);
2204 err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
2205 if (err)
2206 goto fail;
2207 }
2208
2209send:
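	/*
	 * Note: if we got here via the short-reconnect path, session is
	 * NULL and the ceph_con_send() below would oops.  In practice we
	 * are only called from check_new_map() for a session that exists,
	 * so the lookup above should always have succeeded.
	 */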
2210 reply->pagelist = pagelist;
2211 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2212 reply->nr_pages = calc_pages_for(0, pagelist->length);
2213 ceph_con_send(&session->s_con, reply);
2214
2215 if (session) {
2216 session->s_state = CEPH_MDS_SESSION_OPEN;
2217 __wake_requests(mdsc, &session->s_waiting);
2218 }
2219
2220out:
2221 up_read(&mdsc->snap_rwsem);
2222 if (session) {
2223 mutex_unlock(&session->s_mutex);
2224 ceph_put_mds_session(session);
2225 }
2226 mutex_lock(&mdsc->mutex);
2227 return;
2228
2229fail:
2230 ceph_msg_put(reply);
2231fail_nomsg:
2232 ceph_pagelist_release(pagelist);
2233 kfree(pagelist);
2234fail_nopagelist:
2235 pr_err("ENOMEM preparing reconnect for mds%d\n", mds);
2236 goto out;
2237}
2238
2239
2240/*
2241 * compare old and new mdsmaps, kicking requests
2242 * and closing out old connections as necessary
2243 *
2244 * called under mdsc->mutex.
2245 */
2246static void check_new_map(struct ceph_mds_client *mdsc,
2247 struct ceph_mdsmap *newmap,
2248 struct ceph_mdsmap *oldmap)
2249{
2250 int i;
2251 int oldstate, newstate;
2252 struct ceph_mds_session *s;
2253
2254 dout("check_new_map new %u old %u\n",
2255 newmap->m_epoch, oldmap->m_epoch);
2256
2257 for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
2258 if (mdsc->sessions[i] == NULL)
2259 continue;
2260 s = mdsc->sessions[i];
2261 oldstate = ceph_mdsmap_get_state(oldmap, i);
2262 newstate = ceph_mdsmap_get_state(newmap, i);
2263
2264 dout("check_new_map mds%d state %s -> %s (session %s)\n",
2265 i, ceph_mds_state_name(oldstate),
2266 ceph_mds_state_name(newstate),
2267 session_state_name(s->s_state));
2268
2269 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
2270 ceph_mdsmap_get_addr(newmap, i),
2271 sizeof(struct ceph_entity_addr))) {
2272 if (s->s_state == CEPH_MDS_SESSION_OPENING) {
2273 /* the session never opened, just close it
2274 * out now */
2275 __wake_requests(mdsc, &s->s_waiting);
2276 __unregister_session(mdsc, s);
2277 } else {
2278 /* just close it */
2279 mutex_unlock(&mdsc->mutex);
2280 mutex_lock(&s->s_mutex);
2281 mutex_lock(&mdsc->mutex);
2282 ceph_con_close(&s->s_con);
2283 mutex_unlock(&s->s_mutex);
2284 s->s_state = CEPH_MDS_SESSION_RESTARTING;
2285 }
2286
2287 /* kick any requests waiting on the recovering mds */
2288 kick_requests(mdsc, i, 1);
2289 } else if (oldstate == newstate) {
2290 continue; /* nothing new with this mds */
2291 }
2292
2293 /*
2294 * send reconnect?
2295 */
2296 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2297 newstate >= CEPH_MDS_STATE_RECONNECT)
2298 send_mds_reconnect(mdsc, i);
2299
2300 /*
2301 * kick requests on any mds that has gone active.
2302 *
2303 * kick requests on cur or forwarder: we may have sent
2304 * the request to mds1, mds1 told us it forwarded it
2305 * to mds2, but then we learn mds1 failed and can't be
2306 * sure it successfully forwarded our request before
2307 * it died.
2308 */
2309 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2310 newstate >= CEPH_MDS_STATE_ACTIVE) {
2311 pr_info("mds%d reconnect completed\n", s->s_mds);
2312 kick_requests(mdsc, i, 1);
2313 ceph_kick_flushing_caps(mdsc, s);
2314 wake_up_session_caps(s, 1);
2315 }
2316 }
2317}
2318
2319
2320
2321/*
2322 * leases
2323 */
2324
2325/*
2326 * caller must hold session s_mutex, dentry->d_lock
2327 */
2328void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
2329{
2330 struct ceph_dentry_info *di = ceph_dentry(dentry);
2331
2332 ceph_put_mds_session(di->lease_session);
2333 di->lease_session = NULL;
2334}
2335
2336static void handle_lease(struct ceph_mds_client *mdsc,
2337 struct ceph_mds_session *session,
2338 struct ceph_msg *msg)
2339{
2340 struct super_block *sb = mdsc->client->sb;
2341 struct inode *inode;
2342 struct ceph_inode_info *ci;
2343 struct dentry *parent, *dentry;
2344 struct ceph_dentry_info *di;
2345 int mds = session->s_mds;
2346 struct ceph_mds_lease *h = msg->front.iov_base;
2347 struct ceph_vino vino;
2348 int mask;
2349 struct qstr dname;
2350 int release = 0;
2351
2352 dout("handle_lease from mds%d\n", mds);
2353
2354 /* decode */
2355 if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
2356 goto bad;
2357 vino.ino = le64_to_cpu(h->ino);
2358 vino.snap = CEPH_NOSNAP;
2359 mask = le16_to_cpu(h->mask);
2360 dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2361 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
2362 if (dname.len != get_unaligned_le32(h+1))
2363 goto bad;
2364
2365 mutex_lock(&session->s_mutex);
2366 session->s_seq++;
2367
2368 /* lookup inode */
2369 inode = ceph_find_inode(sb, vino);
2370 dout("handle_lease '%s', mask %d, ino %llx %p\n",
2371 ceph_lease_op_name(h->action), mask, vino.ino, inode);
2372 if (inode == NULL) {
2373 dout("handle_lease no inode %llx\n", vino.ino);
2374 goto release;
2375 }
2376 ci = ceph_inode(inode);
2377
2378 /* dentry */
2379 parent = d_find_alias(inode);
2380 if (!parent) {
2381 dout("no parent dentry on inode %p\n", inode);
2382 WARN_ON(1);
2383 goto release; /* hrm... */
2384 }
2385 dname.hash = full_name_hash(dname.name, dname.len);
2386 dentry = d_lookup(parent, &dname);
2387 dput(parent);
2388 if (!dentry)
2389 goto release;
2390
2391 spin_lock(&dentry->d_lock);
2392 di = ceph_dentry(dentry);
2393 switch (h->action) {
2394 case CEPH_MDS_LEASE_REVOKE:
2395 if (di && di->lease_session == session) {
2396 h->seq = cpu_to_le32(di->lease_seq);
2397 __ceph_mdsc_drop_dentry_lease(dentry);
2398 }
2399 release = 1;
2400 break;
2401
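	/*
	 * A renewed lease runs for duration_ms measured from the time we
	 * sent the renew request (lease_renew_from); the next renew
	 * attempt is scheduled at the halfway point of that window.
	 */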
2402 case CEPH_MDS_LEASE_RENEW:
2403 if (di && di->lease_session == session &&
2404 di->lease_gen == session->s_cap_gen &&
2405 di->lease_renew_from &&
2406 di->lease_renew_after == 0) {
2407 unsigned long duration =
2408 le32_to_cpu(h->duration_ms) * HZ / 1000;
2409
2410 di->lease_seq = le32_to_cpu(h->seq);
2411 dentry->d_time = di->lease_renew_from + duration;
2412 di->lease_renew_after = di->lease_renew_from +
2413 (duration >> 1);
2414 di->lease_renew_from = 0;
2415 }
2416 break;
2417 }
2418 spin_unlock(&dentry->d_lock);
2419 dput(dentry);
2420
2421 if (!release)
2422 goto out;
2423
2424release:
2425 /* let's just reuse the same message */
2426 h->action = CEPH_MDS_LEASE_REVOKE_ACK;
2427 ceph_msg_get(msg);
2428 ceph_con_send(&session->s_con, msg);
2429
2430out:
2431 iput(inode);
2432 mutex_unlock(&session->s_mutex);
2433 return;
2434
2435bad:
2436 pr_err("corrupt lease message\n");
2437 ceph_msg_dump(msg);
2438}
2439
2440void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2441 struct inode *inode,
2442 struct dentry *dentry, char action,
2443 u32 seq)
2444{
2445 struct ceph_msg *msg;
2446 struct ceph_mds_lease *lease;
2447 int len = sizeof(*lease) + sizeof(u32);
2448 int dnamelen = 0;
2449
2450 dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
2451 inode, dentry, ceph_lease_op_name(action), session->s_mds);
2452 dnamelen = dentry->d_name.len;
2453 len += dnamelen;
2454
2455 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
2456 if (IS_ERR(msg))
2457 return;
2458 lease = msg->front.iov_base;
2459 lease->action = action;
2460 lease->mask = cpu_to_le16(CEPH_LOCK_DN);
2461 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2462 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2463 lease->seq = cpu_to_le32(seq);
2464 put_unaligned_le32(dnamelen, lease + 1);
2465 memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
2466
2467 /*
2468 * if this is a preemptive lease RELEASE, no need to
2469 * flush request stream, since the actual request will
2470 * soon follow.
2471 */
2472 msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
2473
2474 ceph_con_send(&session->s_con, msg);
2475}
2476
2477/*
2478 * Preemptively release a lease we expect to invalidate anyway.
2479 * Pass @inode always, @dentry is optional.
2480 */
2481void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2482 struct dentry *dentry, int mask)
2483{
2484 struct ceph_dentry_info *di;
2485 struct ceph_mds_session *session;
2486 u32 seq;
2487
2488 BUG_ON(inode == NULL);
2489 BUG_ON(dentry == NULL);
2490 BUG_ON(mask != CEPH_LOCK_DN);
2491
2492 /* is dentry lease valid? */
2493 spin_lock(&dentry->d_lock);
2494 di = ceph_dentry(dentry);
2495 if (!di || !di->lease_session ||
2496 di->lease_session->s_mds < 0 ||
2497 di->lease_gen != di->lease_session->s_cap_gen ||
2498 !time_before(jiffies, dentry->d_time)) {
2499 dout("lease_release inode %p dentry %p -- "
2500 "no lease on %d\n",
2501 inode, dentry, mask);
2502 spin_unlock(&dentry->d_lock);
2503 return;
2504 }
2505
2506 /* we do have a lease on this dentry; note mds and seq */
2507 session = ceph_get_mds_session(di->lease_session);
2508 seq = di->lease_seq;
2509 __ceph_mdsc_drop_dentry_lease(dentry);
2510 spin_unlock(&dentry->d_lock);
2511
2512 dout("lease_release inode %p dentry %p mask %d to mds%d\n",
2513 inode, dentry, mask, session->s_mds);
2514 ceph_mdsc_lease_send_msg(session, inode, dentry,
2515 CEPH_MDS_LEASE_RELEASE, seq);
2516 ceph_put_mds_session(session);
2517}
2518
2519/*
2520 * drop all leases (and dentry refs) in preparation for umount
2521 */
2522static void drop_leases(struct ceph_mds_client *mdsc)
2523{
2524 int i;
2525
2526 dout("drop_leases\n");
2527 mutex_lock(&mdsc->mutex);
2528 for (i = 0; i < mdsc->max_sessions; i++) {
2529 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2530 if (!s)
2531 continue;
2532 mutex_unlock(&mdsc->mutex);
2533 mutex_lock(&s->s_mutex);
2534 mutex_unlock(&s->s_mutex);
2535 ceph_put_mds_session(s);
2536 mutex_lock(&mdsc->mutex);
2537 }
2538 mutex_unlock(&mdsc->mutex);
2539}
2540
2541
2542
2543/*
2544 * delayed work -- periodically trim expired leases, renew caps with mds
2545 */
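/*
 * The work rearms itself roughly every 5 seconds (schedule_delayed());
 * actual cap renewal happens at most once per quarter of the mdsmap
 * session timeout (renew_interval below).
 */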
2546static void schedule_delayed(struct ceph_mds_client *mdsc)
2547{
2548 int delay = 5;
2549 unsigned hz = round_jiffies_relative(HZ * delay);
2550 schedule_delayed_work(&mdsc->delayed_work, hz);
2551}
2552
2553static void delayed_work(struct work_struct *work)
2554{
2555 int i;
2556 struct ceph_mds_client *mdsc =
2557 container_of(work, struct ceph_mds_client, delayed_work.work);
2558 int renew_interval;
2559 int renew_caps;
2560
2561 dout("mdsc delayed_work\n");
2562 ceph_check_delayed_caps(mdsc);
2563
2564 mutex_lock(&mdsc->mutex);
2565 renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
2566 renew_caps = time_after_eq(jiffies, HZ*renew_interval +
2567 mdsc->last_renew_caps);
2568 if (renew_caps)
2569 mdsc->last_renew_caps = jiffies;
2570
2571 for (i = 0; i < mdsc->max_sessions; i++) {
2572 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2573 if (s == NULL)
2574 continue;
2575 if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
2576 dout("resending session close request for mds%d\n",
2577 s->s_mds);
2578 request_close_session(mdsc, s);
2579 ceph_put_mds_session(s);
2580 continue;
2581 }
2582 if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
2583 if (s->s_state == CEPH_MDS_SESSION_OPEN) {
2584 s->s_state = CEPH_MDS_SESSION_HUNG;
2585 pr_info("mds%d hung\n", s->s_mds);
2586 }
2587 }
2588 if (s->s_state < CEPH_MDS_SESSION_OPEN) {
2589 /* this mds is failed or recovering, just wait */
2590 ceph_put_mds_session(s);
2591 continue;
2592 }
2593 mutex_unlock(&mdsc->mutex);
2594
2595 mutex_lock(&s->s_mutex);
2596 if (renew_caps)
2597 send_renew_caps(mdsc, s);
2598 else
2599 ceph_con_keepalive(&s->s_con);
2600 add_cap_releases(mdsc, s, -1);
2601 send_cap_releases(mdsc, s);
2602 mutex_unlock(&s->s_mutex);
2603 ceph_put_mds_session(s);
2604
2605 mutex_lock(&mdsc->mutex);
2606 }
2607 mutex_unlock(&mdsc->mutex);
2608
2609 schedule_delayed(mdsc);
2610}
2611
2612
2613int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2614{
2615 mdsc->client = client;
2616 mutex_init(&mdsc->mutex);
2617 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
	if (mdsc->mdsmap == NULL)
		return -ENOMEM;
2618 init_completion(&mdsc->safe_umount_waiters);
2619 init_completion(&mdsc->session_close_waiters);
2620 INIT_LIST_HEAD(&mdsc->waiting_for_map);
2621 mdsc->sessions = NULL;
2622 mdsc->max_sessions = 0;
2623 mdsc->stopping = 0;
2624 init_rwsem(&mdsc->snap_rwsem);
2625 mdsc->snap_realms = RB_ROOT;
2626 INIT_LIST_HEAD(&mdsc->snap_empty);
2627 spin_lock_init(&mdsc->snap_empty_lock);
2628 mdsc->last_tid = 0;
2629 mdsc->request_tree = RB_ROOT;
2630 INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
2631 mdsc->last_renew_caps = jiffies;
2632 INIT_LIST_HEAD(&mdsc->cap_delay_list);
2633 spin_lock_init(&mdsc->cap_delay_lock);
2634 INIT_LIST_HEAD(&mdsc->snap_flush_list);
2635 spin_lock_init(&mdsc->snap_flush_lock);
2636 mdsc->cap_flush_seq = 0;
2637 INIT_LIST_HEAD(&mdsc->cap_dirty);
2638 mdsc->num_cap_flushing = 0;
2639 spin_lock_init(&mdsc->cap_dirty_lock);
2640 init_waitqueue_head(&mdsc->cap_flushing_wq);
2641 spin_lock_init(&mdsc->dentry_lru_lock);
2642 INIT_LIST_HEAD(&mdsc->dentry_lru);
2643 return 0;
2644}
2645
2646/*
2647 * Wait for safe replies on open mds requests. If we time out, drop
2648 * all requests from the tree to avoid dangling dentry refs.
2649 */
2650static void wait_requests(struct ceph_mds_client *mdsc)
2651{
2652 struct ceph_mds_request *req;
2653 struct ceph_client *client = mdsc->client;
2654
2655 mutex_lock(&mdsc->mutex);
2656 if (__get_oldest_req(mdsc)) {
2657 mutex_unlock(&mdsc->mutex);
2658
2659 dout("wait_requests waiting for requests\n");
2660 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
2661 client->mount_args->mount_timeout * HZ);
2662
2663 /* tear down remaining requests */
2664 mutex_lock(&mdsc->mutex);
2665 while ((req = __get_oldest_req(mdsc))) {
2666 dout("wait_requests timed out on tid %llu\n",
2667 req->r_tid);
2668 __unregister_request(mdsc, req);
2669 }
2670 }
2671 mutex_unlock(&mdsc->mutex);
2672 dout("wait_requests done\n");
2673}
2674
2675/*
2676 * called before mount is ro, and before dentries are torn down.
2677 * (hmm, does this still race with new lookups?)
2678 */
2679void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
2680{
2681 dout("pre_umount\n");
2682 mdsc->stopping = 1;
2683
2684 drop_leases(mdsc);
2685 ceph_flush_dirty_caps(mdsc);
2686 wait_requests(mdsc);
2687}
2688
2689/*
2690 * wait for all write mds requests to flush.
2691 */
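/*
 * Implementation note: we grab a ref on the *next* request before
 * dropping mdsc->mutex to wait, so its rb_node stays valid while we
 * sleep; if that request was unregistered in the meantime,
 * RB_EMPTY_NODE() catches it and we restart from the oldest request.
 */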
2692static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
2693{
2694 struct ceph_mds_request *req = NULL, *nextreq;
2695 struct rb_node *n;
2696
2697 mutex_lock(&mdsc->mutex);
2698 dout("wait_unsafe_requests want %lld\n", want_tid);
2699restart:
2700 req = __get_oldest_req(mdsc);
2701 while (req && req->r_tid <= want_tid) {
2702 /* find next request */
2703 n = rb_next(&req->r_node);
2704 if (n)
2705 nextreq = rb_entry(n, struct ceph_mds_request, r_node);
2706 else
2707 nextreq = NULL;
2708 if ((req->r_op & CEPH_MDS_OP_WRITE)) {
2709 /* write op */
2710 ceph_mdsc_get_request(req);
2711 if (nextreq)
2712 ceph_mdsc_get_request(nextreq);
2713 mutex_unlock(&mdsc->mutex);
2714 dout("wait_unsafe_requests wait on %llu (want %llu)\n",
2715 req->r_tid, want_tid);
2716 wait_for_completion(&req->r_safe_completion);
2717 mutex_lock(&mdsc->mutex);
2718 ceph_mdsc_put_request(req);
2719 if (!nextreq)
2720 break; /* 'next' did not exist before we slept, so we're done! */
2721 if (RB_EMPTY_NODE(&nextreq->r_node)) {
2722 /* next request was removed from tree */
2723 ceph_mdsc_put_request(nextreq);
2724 goto restart;
2725 }
2726 ceph_mdsc_put_request(nextreq); /* won't go away */
2727 }
2728 req = nextreq;
2729 }
2730 mutex_unlock(&mdsc->mutex);
2731 dout("wait_unsafe_requests done\n");
2732}
2733
2734void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2735{
2736 u64 want_tid, want_flush;
2737
2738 dout("sync\n");
2739 mutex_lock(&mdsc->mutex);
2740 want_tid = mdsc->last_tid;
2741 want_flush = mdsc->cap_flush_seq;
2742 mutex_unlock(&mdsc->mutex);
2743 dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
2744
2745 ceph_flush_dirty_caps(mdsc);
2746
2747 wait_unsafe_requests(mdsc, want_tid);
2748 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
2749}
2750
2751
2752/*
2753 * called after sb is ro.
2754 */
2755void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
2756{
2757 struct ceph_mds_session *session;
2758 int i;
2759 int n;
2760 struct ceph_client *client = mdsc->client;
2761 unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
2762
2763 dout("close_sessions\n");
2764
2765 mutex_lock(&mdsc->mutex);
2766
2767 /* close sessions */
2768 started = jiffies;
2769 while (time_before(jiffies, started + timeout)) {
2770 dout("closing sessions\n");
2771 n = 0;
2772 for (i = 0; i < mdsc->max_sessions; i++) {
2773 session = __ceph_lookup_mds_session(mdsc, i);
2774 if (!session)
2775 continue;
2776 mutex_unlock(&mdsc->mutex);
2777 mutex_lock(&session->s_mutex);
2778 __close_session(mdsc, session);
2779 mutex_unlock(&session->s_mutex);
2780 ceph_put_mds_session(session);
2781 mutex_lock(&mdsc->mutex);
2782 n++;
2783 }
2784 if (n == 0)
2785 break;
2786
2787 if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
2788 break;
2789
2790 dout("waiting for sessions to close\n");
2791 mutex_unlock(&mdsc->mutex);
2792 wait_for_completion_timeout(&mdsc->session_close_waiters,
2793 timeout);
2794 mutex_lock(&mdsc->mutex);
2795 }
2796
2797 /* tear down remaining sessions */
2798 for (i = 0; i < mdsc->max_sessions; i++) {
2799 if (mdsc->sessions[i]) {
2800 session = get_session(mdsc->sessions[i]);
2801 __unregister_session(mdsc, session);
2802 mutex_unlock(&mdsc->mutex);
2803 mutex_lock(&session->s_mutex);
2804 remove_session_caps(session);
2805 mutex_unlock(&session->s_mutex);
2806 ceph_put_mds_session(session);
2807 mutex_lock(&mdsc->mutex);
2808 }
2809 }
2810
2811 WARN_ON(!list_empty(&mdsc->cap_delay_list));
2812
2813 mutex_unlock(&mdsc->mutex);
2814
2815 ceph_cleanup_empty_realms(mdsc);
2816
2817 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2818
2819 dout("stopped\n");
2820}
2821
2822void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
2823{
2824 dout("stop\n");
2825 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2826 if (mdsc->mdsmap)
2827 ceph_mdsmap_destroy(mdsc->mdsmap);
2828 kfree(mdsc->sessions);
2829}
2830
2831
2832/*
2833 * handle mds map update.
2834 */
2835void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
2836{
2837 u32 epoch;
2838 u32 maplen;
2839 void *p = msg->front.iov_base;
2840 void *end = p + msg->front.iov_len;
2841 struct ceph_mdsmap *newmap, *oldmap;
2842 struct ceph_fsid fsid;
2843 int err = -EINVAL;
2844
2845 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
2846 ceph_decode_copy(&p, &fsid, sizeof(fsid));
2847 if (ceph_check_fsid(mdsc->client, &fsid) < 0)
2848 return;
2849 epoch = ceph_decode_32(&p);
2850 maplen = ceph_decode_32(&p);
2851 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
2852
2853 /* do we need it? */
2854 ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
2855 mutex_lock(&mdsc->mutex);
2856 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
2857 dout("handle_map epoch %u <= our %u\n",
2858 epoch, mdsc->mdsmap->m_epoch);
2859 mutex_unlock(&mdsc->mutex);
2860 return;
2861 }
2862
2863 newmap = ceph_mdsmap_decode(&p, end);
2864 if (IS_ERR(newmap)) {
2865 err = PTR_ERR(newmap);
2866 goto bad_unlock;
2867 }
2868
2869 /* swap into place */
2870 if (mdsc->mdsmap) {
2871 oldmap = mdsc->mdsmap;
2872 mdsc->mdsmap = newmap;
2873 check_new_map(mdsc, newmap, oldmap);
2874 ceph_mdsmap_destroy(oldmap);
2875 } else {
2876 mdsc->mdsmap = newmap; /* first mds map */
2877 }
2878 mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
2879
2880 __wake_requests(mdsc, &mdsc->waiting_for_map);
2881
2882 mutex_unlock(&mdsc->mutex);
2883 schedule_delayed(mdsc);
2884 return;
2885
2886bad_unlock:
2887 mutex_unlock(&mdsc->mutex);
2888bad:
2889 pr_err("error decoding mdsmap %d\n", err);
2890 return;
2891}
2892
2893static struct ceph_connection *con_get(struct ceph_connection *con)
2894{
2895 struct ceph_mds_session *s = con->private;
2896
2897 if (get_session(s)) {
2898 dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
2899 return con;
2900 }
2901 dout("mdsc con_get %p FAIL\n", s);
2902 return NULL;
2903}
2904
2905static void con_put(struct ceph_connection *con)
2906{
2907 struct ceph_mds_session *s = con->private;
2908
2909 ceph_put_mds_session(s);
2910 dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref));
2911}
2912
2913/*
2914 * if the client is unresponsive for long enough, the mds will kill
2915 * the session entirely.
2916 */
2917static void peer_reset(struct ceph_connection *con)
2918{
2919 struct ceph_mds_session *s = con->private;
2920
2921 pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n",
2922 s->s_mds);
2923}
2924
2925static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
2926{
2927 struct ceph_mds_session *s = con->private;
2928 struct ceph_mds_client *mdsc = s->s_mdsc;
2929 int type = le16_to_cpu(msg->hdr.type);
2930
2931 mutex_lock(&mdsc->mutex);
2932 if (__verify_registered_session(mdsc, s) < 0) {
2933 mutex_unlock(&mdsc->mutex);
2934 goto out;
2935 }
2936 mutex_unlock(&mdsc->mutex);
2937
2938 switch (type) {
2939 case CEPH_MSG_MDS_MAP:
2940 ceph_mdsc_handle_map(mdsc, msg);
2941 break;
2942 case CEPH_MSG_CLIENT_SESSION:
2943 handle_session(s, msg);
2944 break;
2945 case CEPH_MSG_CLIENT_REPLY:
2946 handle_reply(s, msg);
2947 break;
2948 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2949 handle_forward(mdsc, s, msg);
2950 break;
2951 case CEPH_MSG_CLIENT_CAPS:
2952 ceph_handle_caps(s, msg);
2953 break;
2954 case CEPH_MSG_CLIENT_SNAP:
2955 ceph_handle_snap(mdsc, s, msg);
2956 break;
2957 case CEPH_MSG_CLIENT_LEASE:
2958 handle_lease(mdsc, s, msg);
2959 break;
2960
2961 default:
2962 pr_err("received unknown message type %d %s\n", type,
2963 ceph_msg_type_name(type));
2964 }
2965out:
2966 ceph_msg_put(msg);
2967}
2968
2969/*
2970 * authentication
2971 */
2972static int get_authorizer(struct ceph_connection *con,
2973 void **buf, int *len, int *proto,
2974 void **reply_buf, int *reply_len, int force_new)
2975{
2976 struct ceph_mds_session *s = con->private;
2977 struct ceph_mds_client *mdsc = s->s_mdsc;
2978 struct ceph_auth_client *ac = mdsc->client->monc.auth;
2979 int ret = 0;
2980
2981 if (force_new && s->s_authorizer) {
2982 ac->ops->destroy_authorizer(ac, s->s_authorizer);
2983 s->s_authorizer = NULL;
2984 }
2985 if (s->s_authorizer == NULL) {
2986 if (ac->ops->create_authorizer) {
2987 ret = ac->ops->create_authorizer(
2988 ac, CEPH_ENTITY_TYPE_MDS,
2989 &s->s_authorizer,
2990 &s->s_authorizer_buf,
2991 &s->s_authorizer_buf_len,
2992 &s->s_authorizer_reply_buf,
2993 &s->s_authorizer_reply_buf_len);
2994 if (ret)
2995 return ret;
2996 }
2997 }
2998
2999 *proto = ac->protocol;
3000 *buf = s->s_authorizer_buf;
3001 *len = s->s_authorizer_buf_len;
3002 *reply_buf = s->s_authorizer_reply_buf;
3003 *reply_len = s->s_authorizer_reply_buf_len;
3004 return 0;
3005}
3006
3007
3008static int verify_authorizer_reply(struct ceph_connection *con, int len)
3009{
3010 struct ceph_mds_session *s = con->private;
3011 struct ceph_mds_client *mdsc = s->s_mdsc;
3012 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3013
3014 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
3015}
3016
3017static int invalidate_authorizer(struct ceph_connection *con)
3018{
3019 struct ceph_mds_session *s = con->private;
3020 struct ceph_mds_client *mdsc = s->s_mdsc;
3021 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3022
3023 if (ac->ops->invalidate_authorizer)
3024 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3025
3026 return ceph_monc_validate_auth(&mdsc->client->monc);
3027}
3028
3029static const struct ceph_connection_operations mds_con_ops = {
3030 .get = con_get,
3031 .put = con_put,
3032 .dispatch = dispatch,
3033 .get_authorizer = get_authorizer,
3034 .verify_authorizer_reply = verify_authorizer_reply,
3035 .invalidate_authorizer = invalidate_authorizer,
3036 .peer_reset = peer_reset,
3037};
3038
3039
3040
3041
3042/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
new file mode 100644
index 000000000000..961cc6f65878
--- /dev/null
+++ b/fs/ceph/mds_client.h
@@ -0,0 +1,335 @@
1#ifndef _FS_CEPH_MDS_CLIENT_H
2#define _FS_CEPH_MDS_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/list.h>
7#include <linux/mutex.h>
8#include <linux/rbtree.h>
9#include <linux/spinlock.h>
10
11#include "types.h"
12#include "messenger.h"
13#include "mdsmap.h"
14
15/*
16 * Some lock dependencies:
17 *
18 * session->s_mutex
19 * mdsc->mutex
20 *
21 * mdsc->snap_rwsem
22 *
23 * inode->i_lock
24 * mdsc->snap_flush_lock
25 * mdsc->cap_delay_lock
26 *
27 */
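/*
 * i.e., code needing both takes session->s_mutex before mdsc->mutex;
 * paths already holding mdsc->mutex drop it first (see check_new_map()
 * and delayed_work() in mds_client.c).
 */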
28
29struct ceph_client;
30struct ceph_cap;
31
32/*
33 * parsed info about a single inode. pointers are into the encoded
34 * on-wire structures within the mds reply message payload.
35 */
36struct ceph_mds_reply_info_in {
37 struct ceph_mds_reply_inode *in;
38 u32 symlink_len;
39 char *symlink;
40 u32 xattr_len;
41 char *xattr_data;
42};
43
44/*
45 * parsed info about an mds reply, including information about the
46 * target inode and/or its parent directory and dentry, and directory
47 * contents (for readdir results).
48 */
49struct ceph_mds_reply_info_parsed {
50 struct ceph_mds_reply_head *head;
51
52 struct ceph_mds_reply_info_in diri, targeti;
53 struct ceph_mds_reply_dirfrag *dirfrag;
54 char *dname;
55 u32 dname_len;
56 struct ceph_mds_reply_lease *dlease;
57
58 struct ceph_mds_reply_dirfrag *dir_dir;
59 int dir_nr;
60 char **dir_dname;
61 u32 *dir_dname_len;
62 struct ceph_mds_reply_lease **dir_dlease;
63 struct ceph_mds_reply_info_in *dir_in;
64 u8 dir_complete, dir_end;
65
66 /* encoded blob describing snapshot contexts for certain
67 operations (e.g., open) */
68 void *snapblob;
69 int snapblob_len;
70};
71
72
73/*
74 * cap releases are batched and sent to the MDS en masse.
75 */
76#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
77 sizeof(struct ceph_mds_cap_release)) / \
78 sizeof(struct ceph_mds_cap_item))
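/* e.g., with 4 KB pages, and assuming a 4-byte release header and
 * 24-byte cap items, that is (4096 - 4) / 24 = 170 releases per
 * message. */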
79
80
81/*
82 * state associated with each MDS<->client session
83 */
84enum {
85 CEPH_MDS_SESSION_NEW = 1,
86 CEPH_MDS_SESSION_OPENING = 2,
87 CEPH_MDS_SESSION_OPEN = 3,
88 CEPH_MDS_SESSION_HUNG = 4,
89 CEPH_MDS_SESSION_CLOSING = 5,
90 CEPH_MDS_SESSION_RESTARTING = 6,
91 CEPH_MDS_SESSION_RECONNECTING = 7,
92};
93
94struct ceph_mds_session {
95 struct ceph_mds_client *s_mdsc;
96 int s_mds;
97 int s_state;
98 unsigned long s_ttl; /* time until mds kills us */
99 u64 s_seq; /* incoming msg seq # */
100 struct mutex s_mutex; /* serialize session messages */
101
102 struct ceph_connection s_con;
103
104 struct ceph_authorizer *s_authorizer;
105 void *s_authorizer_buf, *s_authorizer_reply_buf;
106 size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
107
108 /* protected by s_cap_lock */
109 spinlock_t s_cap_lock;
110 u32 s_cap_gen; /* inc each time we get mds stale msg */
111 unsigned long s_cap_ttl; /* when session caps expire */
112 struct list_head s_caps; /* all caps issued by this session */
113 int s_nr_caps, s_trim_caps;
114 int s_num_cap_releases;
115 struct list_head s_cap_releases; /* waiting cap_release messages */
116 struct list_head s_cap_releases_done; /* ready to send */
117 struct ceph_cap *s_cap_iterator;
118
119 /* protected by mutex */
120 struct list_head s_cap_flushing; /* inodes w/ flushing caps */
121 struct list_head s_cap_snaps_flushing;
122 unsigned long s_renew_requested; /* last time we sent a renew req */
123 u64 s_renew_seq;
124
125 atomic_t s_ref;
126 struct list_head s_waiting; /* waiting requests */
127 struct list_head s_unsafe; /* unsafe requests */
128};
129
130/*
131 * modes of choosing which MDS to send a request to
132 */
133enum {
134 USE_ANY_MDS,
135 USE_RANDOM_MDS,
136 USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */
137};
138
139struct ceph_mds_request;
140struct ceph_mds_client;
141
142/*
143 * request completion callback
144 */
145typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
146 struct ceph_mds_request *req);
147
148/*
149 * an in-flight mds request
150 */
151struct ceph_mds_request {
152 u64 r_tid; /* transaction id */
153 struct rb_node r_node;
154
155 int r_op; /* mds op code */
156 int r_mds;
157
158 /* operation on what? */
159 struct inode *r_inode; /* arg1 */
160 struct dentry *r_dentry; /* arg1 */
161 struct dentry *r_old_dentry; /* arg2: rename from or link from */
162 char *r_path1, *r_path2;
163 struct ceph_vino r_ino1, r_ino2;
164
165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
166 struct inode *r_target_inode; /* resulting inode */
167
168 union ceph_mds_request_args r_args;
169 int r_fmode; /* file mode, if expecting cap */
170
171 /* for choosing which mds to send this request to */
172 int r_direct_mode;
173 u32 r_direct_hash; /* choose dir frag based on this dentry hash */
174 bool r_direct_is_hash; /* true if r_direct_hash is valid */
175
176 /* data payload is used for xattr ops */
177 struct page **r_pages;
178 int r_num_pages;
179 int r_data_len;
180
181 /* what caps shall we drop? */
182 int r_inode_drop, r_inode_unless;
183 int r_dentry_drop, r_dentry_unless;
184 int r_old_dentry_drop, r_old_dentry_unless;
185 struct inode *r_old_inode;
186 int r_old_inode_drop, r_old_inode_unless;
187
188 struct ceph_msg *r_request; /* original request */
189 struct ceph_msg *r_reply;
190 struct ceph_mds_reply_info_parsed r_reply_info;
191 int r_err;
192 bool r_aborted;
193
194 unsigned long r_timeout; /* optional. jiffies */
195 unsigned long r_started; /* start time to measure timeout against */
196 unsigned long r_request_started; /* start time for mds request only,
197 used to measure lease durations */
198
199 /* link unsafe requests to parent directory, for fsync */
200 struct inode *r_unsafe_dir;
201 struct list_head r_unsafe_dir_item;
202
203 struct ceph_mds_session *r_session;
204
205 int r_attempts; /* resend attempts */
206 int r_num_fwd; /* number of forward attempts */
207 int r_num_stale;
208 int r_resend_mds; /* mds to resend to next, if any*/
209
210 struct kref r_kref;
211 struct list_head r_wait;
212 struct completion r_completion;
213 struct completion r_safe_completion;
214 ceph_mds_request_callback_t r_callback;
215 struct list_head r_unsafe_item; /* per-session unsafe list item */
216 bool r_got_unsafe, r_got_safe;
217
218 bool r_did_prepopulate;
219 u32 r_readdir_offset;
220
221 struct ceph_cap_reservation r_caps_reservation;
222 int r_num_caps;
223};
224
225/*
226 * mds client state
227 */
228struct ceph_mds_client {
229 struct ceph_client *client;
230 struct mutex mutex; /* all nested structures */
231
232 struct ceph_mdsmap *mdsmap;
233 struct completion safe_umount_waiters, session_close_waiters;
234 struct list_head waiting_for_map;
235
236 struct ceph_mds_session **sessions; /* NULL for mds if no session */
237 int max_sessions; /* len of the sessions array */
238 int stopping; /* true if shutting down */
239
240 /*
241 * snap_rwsem will cover cap linkage into snaprealms, and
242 * realm snap contexts. (later, we can do per-realm snap
243 * contexts locks..) the empty list contains realms with no
244 * references (implying they contain no inodes with caps) that
245 * should be destroyed.
246 */
247 struct rw_semaphore snap_rwsem;
248 struct rb_root snap_realms;
249 struct list_head snap_empty;
250 spinlock_t snap_empty_lock; /* protect snap_empty */
251
252 u64 last_tid; /* most recent mds request */
253 struct rb_root request_tree; /* pending mds requests */
254 struct delayed_work delayed_work; /* delayed work */
255 unsigned long last_renew_caps; /* last time we renewed our caps */
256 struct list_head cap_delay_list; /* caps with delayed release */
257 spinlock_t cap_delay_lock; /* protects cap_delay_list */
258 struct list_head snap_flush_list; /* cap_snaps ready to flush */
259 spinlock_t snap_flush_lock;
260
261 u64 cap_flush_seq;
262 struct list_head cap_dirty; /* inodes with dirty caps */
263 int num_cap_flushing; /* # caps we are flushing */
264 spinlock_t cap_dirty_lock; /* protects above items */
265 wait_queue_head_t cap_flushing_wq;
266
267#ifdef CONFIG_DEBUG_FS
268 struct dentry *debugfs_file;
269#endif
270
271 spinlock_t dentry_lru_lock;
272 struct list_head dentry_lru;
273 int num_dentry;
274};
275
276extern const char *ceph_mds_op_name(int op);
277
278extern struct ceph_mds_session *
279__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
280
281static inline struct ceph_mds_session *
282ceph_get_mds_session(struct ceph_mds_session *s)
283{
284 atomic_inc(&s->s_ref);
285 return s;
286}
287
288extern void ceph_put_mds_session(struct ceph_mds_session *s);
289
290extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
291 struct ceph_msg *msg, int mds);
292
293extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
294 struct ceph_client *client);
295extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
296extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
297
298extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
299
300extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode,
302 struct dentry *dn, int mask);
303
304extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
307 struct ceph_mds_request *req);
308extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
309 struct inode *dir,
310 struct ceph_mds_request *req);
311static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
312{
313 kref_get(&req->r_kref);
314}
315extern void ceph_mdsc_release_request(struct kref *kref);
316static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
317{
318 kref_put(&req->r_kref, ceph_mdsc_release_request);
319}
320
321extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
322
323extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
324 int stop_on_nosnap);
325
326extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
327extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
328 struct inode *inode,
329 struct dentry *dentry, char action,
330 u32 seq);
331
332extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
333 struct ceph_msg *msg);
334
335#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
new file mode 100644
index 000000000000..c4c498e6dfef
--- /dev/null
+++ b/fs/ceph/mdsmap.c
@@ -0,0 +1,174 @@
1#include "ceph_debug.h"
2
3#include <linux/bug.h>
4#include <linux/err.h>
5#include <linux/random.h>
6#include <linux/slab.h>
7#include <linux/types.h>
8
9#include "mdsmap.h"
10#include "messenger.h"
11#include "decode.h"
12
13#include "super.h"
14
15
16/*
17 * choose a random mds that is "up" (i.e. has a state > 0), or -1.
18 */
19int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
20{
21 int n = 0;
22 int i;
23 u8 r;	/* unsigned, so r % n below is non-negative */
24
25 /* count */
26 for (i = 0; i < m->m_max_mds; i++)
27 if (m->m_info[i].state > 0)
28 n++;
29 if (n == 0)
30 return -1;
31
32 /* pick: advance to the nth "up" mds, skipping down slots */
33 get_random_bytes(&r, 1);
34 n = r % n;
35 for (i = 0; ; i++) {
36 	while (m->m_info[i].state <= 0)
37 		i++;
38 	if (n-- == 0) break;
39 }
40 return i;
41}
42
43/*
44 * Decode an MDS map
45 *
46 * Ignore any fields we don't care about (there are quite a few of
47 * them).
48 */
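/*
 * Layout consumed below (as read off the decode calls, not a spec):
 * u16 version; u32 epoch, client_epoch, last_failure, root,
 * session_timeout, session_autoclose; u64 max_file_size; u32 max_mds;
 * a counted list of per-mds info records; then the data pg_pool list
 * and the cas pg_pool.
 */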
49struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
50{
51 struct ceph_mdsmap *m;
52 const void *start = *p;
53 int i, j, n;
54 int err = -EINVAL;
55 u16 version;
56
57 m = kzalloc(sizeof(*m), GFP_NOFS);
58 if (m == NULL)
59 return ERR_PTR(-ENOMEM);
60
61 ceph_decode_16_safe(p, end, version, bad);
62
63 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
64 m->m_epoch = ceph_decode_32(p);
65 m->m_client_epoch = ceph_decode_32(p);
66 m->m_last_failure = ceph_decode_32(p);
67 m->m_root = ceph_decode_32(p);
68 m->m_session_timeout = ceph_decode_32(p);
69 m->m_session_autoclose = ceph_decode_32(p);
70 m->m_max_file_size = ceph_decode_64(p);
71 m->m_max_mds = ceph_decode_32(p);
72
73 m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
74 if (m->m_info == NULL)
75 goto badmem;
76
77 /* pick out active nodes from mds_info (state > 0) */
78 n = ceph_decode_32(p);
79 for (i = 0; i < n; i++) {
80 u64 global_id;
81 u32 namelen;
82 s32 mds, inc, state;
83 u64 state_seq;
84 u8 infoversion;
85 struct ceph_entity_addr addr;
86 u32 num_export_targets;
87 void *pexport_targets = NULL;
88
89 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
90 global_id = ceph_decode_64(p);
91 infoversion = ceph_decode_8(p);
92 *p += sizeof(u64);
93 namelen = ceph_decode_32(p); /* skip mds name */
94 *p += namelen;
95
96 ceph_decode_need(p, end,
97 4*sizeof(u32) + sizeof(u64) +
98 sizeof(addr) + sizeof(struct ceph_timespec),
99 bad);
100 mds = ceph_decode_32(p);
101 inc = ceph_decode_32(p);
102 state = ceph_decode_32(p);
103 state_seq = ceph_decode_64(p);
104 ceph_decode_copy(p, &addr, sizeof(addr));
105 ceph_decode_addr(&addr);
106 *p += sizeof(struct ceph_timespec);
107 *p += sizeof(u32);
108 ceph_decode_32_safe(p, end, namelen, bad);
109 *p += namelen;
110 if (infoversion >= 2) {
111 ceph_decode_32_safe(p, end, num_export_targets, bad);
112 pexport_targets = *p;
113 *p += num_export_targets * sizeof(u32);
114 } else {
115 num_export_targets = 0;
116 }
117
118 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
119 i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
120 ceph_mds_state_name(state));
121 if (mds >= 0 && mds < m->m_max_mds && state > 0) {
122 m->m_info[mds].global_id = global_id;
123 m->m_info[mds].state = state;
124 m->m_info[mds].addr = addr;
125 m->m_info[mds].num_export_targets = num_export_targets;
126 if (num_export_targets) {
127 m->m_info[mds].export_targets =
128 kcalloc(num_export_targets, sizeof(u32),
129 GFP_NOFS);
130 for (j = 0; j < num_export_targets; j++)
131 m->m_info[mds].export_targets[j] =
132 ceph_decode_32(&pexport_targets);
133 } else {
134 m->m_info[mds].export_targets = NULL;
135 }
136 }
137 }
138
139 /* pg_pools */
140 ceph_decode_32_safe(p, end, n, bad);
141 m->m_num_data_pg_pools = n;
142 m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
143 if (!m->m_data_pg_pools)
144 goto badmem;
145 ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
146 for (i = 0; i < n; i++)
147 m->m_data_pg_pools[i] = ceph_decode_32(p);
148 m->m_cas_pg_pool = ceph_decode_32(p);
149
150 /* ok, we don't care about the rest. */
151 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
152 return m;
153
154badmem:
155 err = -ENOMEM;
156bad:
157 pr_err("corrupt mdsmap\n");
158 print_hex_dump(KERN_DEBUG, "mdsmap: ",
159 DUMP_PREFIX_OFFSET, 16, 1,
160 start, end - start, true);
161 ceph_mdsmap_destroy(m);
162 return ERR_PTR(-EINVAL);
163}
164
165void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
166{
167 int i;
168
169 for (i = 0; i < m->m_max_mds; i++)
170 kfree(m->m_info[i].export_targets);
171 kfree(m->m_info);
172 kfree(m->m_data_pg_pools);
173 kfree(m);
174}
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..eacc131aa5cb
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,54 @@
1#ifndef _FS_CEPH_MDSMAP_H
2#define _FS_CEPH_MDSMAP_H
3
4#include "types.h"
5
6/*
7 * mds map - describe servers in the mds cluster.
8 *
9 * we limit fields to those the client actually cares about
10 */
11struct ceph_mds_info {
12 u64 global_id;
13 struct ceph_entity_addr addr;
14 s32 state;
15 int num_export_targets;
16 u32 *export_targets;
17};
18
19struct ceph_mdsmap {
20 u32 m_epoch, m_client_epoch, m_last_failure;
21 u32 m_root;
22 u32 m_session_timeout; /* seconds */
23 u32 m_session_autoclose; /* seconds */
24 u64 m_max_file_size;
25 u32 m_max_mds; /* size of m_addr, m_state arrays */
26 struct ceph_mds_info *m_info;
27
28 /* which object pools file data can be stored in */
29 int m_num_data_pg_pools;
30 u32 *m_data_pg_pools;
31 u32 m_cas_pg_pool;
32};
33
34static inline struct ceph_entity_addr *
35ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
36{
37 if (w >= m->m_max_mds)
38 return NULL;
39 return &m->m_info[w].addr;
40}
41
42static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
43{
44 BUG_ON(w < 0);
45 if (w >= m->m_max_mds)
46 return CEPH_MDS_STATE_DNE;
47 return m->m_info[w].state;
48}
49
50extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
51extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
52extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
53
54#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
new file mode 100644
index 000000000000..a32f0f896d9f
--- /dev/null
+++ b/fs/ceph/messenger.c
@@ -0,0 +1,2239 @@
1#include "ceph_debug.h"
2
3#include <linux/crc32c.h>
4#include <linux/ctype.h>
5#include <linux/highmem.h>
6#include <linux/inet.h>
7#include <linux/kthread.h>
8#include <linux/net.h>
9#include <linux/socket.h>
10#include <linux/string.h>
11#include <net/tcp.h>
12
13#include "super.h"
14#include "messenger.h"
15#include "decode.h"
16#include "pagelist.h"
17
18/*
19 * Ceph uses the messenger to exchange ceph_msg messages with other
20 * hosts in the system. The messenger provides ordered and reliable
21 * delivery. We tolerate TCP disconnects by reconnecting (with
22 * exponential backoff) in the case of a fault (disconnection, bad
23 * crc, protocol error). Acks allow sent messages to be discarded by
24 * the sender.
25 */
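/*
 * On the wire each outgoing message is framed as a one-byte tag
 * (tag_msg below), the ceph_msg_header, the front (and optional
 * middle) section, any data pages, and a footer carrying per-section
 * crc32c checksums -- see prepare_write_message().
 */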
26
27/* static tag bytes (protocol control messages) */
28static char tag_msg = CEPH_MSGR_TAG_MSG;
29static char tag_ack = CEPH_MSGR_TAG_ACK;
30static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
31
32
33static void queue_con(struct ceph_connection *con);
34static void con_work(struct work_struct *);
35static void ceph_fault(struct ceph_connection *con);
36
37const char *ceph_name_type_str(int t)
38{
39 switch (t) {
40 case CEPH_ENTITY_TYPE_MON: return "mon";
41 case CEPH_ENTITY_TYPE_MDS: return "mds";
42 case CEPH_ENTITY_TYPE_OSD: return "osd";
43 case CEPH_ENTITY_TYPE_CLIENT: return "client";
44 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
45 default: return "???";
46 }
47}
48
49/*
50 * nicely render a sockaddr as a string.
51 */
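/*
 * Note: the returned pointer is one of MAX_ADDR_STR rotating static
 * buffers, so it is only valid until ~20 subsequent pr_addr() calls:
 * print it promptly, don't stash it.
 */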
52#define MAX_ADDR_STR 20
53static char addr_str[MAX_ADDR_STR][40];
54static DEFINE_SPINLOCK(addr_str_lock);
55static int last_addr_str;
56
57const char *pr_addr(const struct sockaddr_storage *ss)
58{
59 int i;
60 char *s;
61 struct sockaddr_in *in4 = (void *)ss;
62 unsigned char *quad = (void *)&in4->sin_addr.s_addr;
63 struct sockaddr_in6 *in6 = (void *)ss;
64
65 spin_lock(&addr_str_lock);
66 i = last_addr_str++;
67 if (last_addr_str == MAX_ADDR_STR)
68 last_addr_str = 0;
69 spin_unlock(&addr_str_lock);
70 s = addr_str[i];
71
72 switch (ss->ss_family) {
73 case AF_INET:
74 sprintf(s, "%u.%u.%u.%u:%u",
75 (unsigned int)quad[0],
76 (unsigned int)quad[1],
77 (unsigned int)quad[2],
78 (unsigned int)quad[3],
79 (unsigned int)ntohs(in4->sin_port));
80 break;
81
82 case AF_INET6:
83 sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u",
84 in6->sin6_addr.s6_addr16[0],
85 in6->sin6_addr.s6_addr16[1],
86 in6->sin6_addr.s6_addr16[2],
87 in6->sin6_addr.s6_addr16[3],
88 in6->sin6_addr.s6_addr16[4],
89 in6->sin6_addr.s6_addr16[5],
90 in6->sin6_addr.s6_addr16[6],
91 in6->sin6_addr.s6_addr16[7],
92 (unsigned int)ntohs(in6->sin6_port));
93 break;
94
95 default:
96 sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
97 }
98
99 return s;
100}
101
102static void encode_my_addr(struct ceph_messenger *msgr)
103{
104 memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
105 ceph_encode_addr(&msgr->my_enc_addr);
106}
107
108/*
109 * work queue for all reading and writing to/from the socket.
110 */
111struct workqueue_struct *ceph_msgr_wq;
112
113int __init ceph_msgr_init(void)
114{
115 ceph_msgr_wq = create_workqueue("ceph-msgr");
116 if (IS_ERR(ceph_msgr_wq)) {
117 int ret = PTR_ERR(ceph_msgr_wq);
118 pr_err("msgr_init failed to create workqueue: %d\n", ret);
119 ceph_msgr_wq = NULL;
120 return ret;
121 }
122 return 0;
123}
124
125void ceph_msgr_exit(void)
126{
127 destroy_workqueue(ceph_msgr_wq);
128}
129
130/*
131 * socket callback functions
132 */
133
134/* data available on socket, or listen socket received a connect */
135static void ceph_data_ready(struct sock *sk, int count_unused)
136{
137 struct ceph_connection *con =
138 (struct ceph_connection *)sk->sk_user_data;
139 if (sk->sk_state != TCP_CLOSE_WAIT) {
140 dout("ceph_data_ready on %p state = %lu, queueing work\n",
141 con, con->state);
142 queue_con(con);
143 }
144}
145
146/* socket has buffer space for writing */
147static void ceph_write_space(struct sock *sk)
148{
149 struct ceph_connection *con =
150 (struct ceph_connection *)sk->sk_user_data;
151
152 /* only queue to workqueue if there is data we want to write. */
153 if (test_bit(WRITE_PENDING, &con->state)) {
154 dout("ceph_write_space %p queueing write work\n", con);
155 queue_con(con);
156 } else {
157 dout("ceph_write_space %p nothing to write\n", con);
158 }
159
160 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
161 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
162}
163
164/* socket's state has changed */
165static void ceph_state_change(struct sock *sk)
166{
167 struct ceph_connection *con =
168 (struct ceph_connection *)sk->sk_user_data;
169
170 dout("ceph_state_change %p state = %lu sk_state = %u\n",
171 con, con->state, sk->sk_state);
172
173 if (test_bit(CLOSED, &con->state))
174 return;
175
176 switch (sk->sk_state) {
177 case TCP_CLOSE:
178 dout("ceph_state_change TCP_CLOSE\n");
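		/* fall through to the CLOSE_WAIT handling */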
179 case TCP_CLOSE_WAIT:
180 dout("ceph_state_change TCP_CLOSE_WAIT\n");
181 if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
182 if (test_bit(CONNECTING, &con->state))
183 con->error_msg = "connection failed";
184 else
185 con->error_msg = "socket closed";
186 queue_con(con);
187 }
188 break;
189 case TCP_ESTABLISHED:
190 dout("ceph_state_change TCP_ESTABLISHED\n");
191 queue_con(con);
192 break;
193 }
194}
195
196/*
197 * set up socket callbacks
198 */
199static void set_sock_callbacks(struct socket *sock,
200 struct ceph_connection *con)
201{
202 struct sock *sk = sock->sk;
203 sk->sk_user_data = (void *)con;
204 sk->sk_data_ready = ceph_data_ready;
205 sk->sk_write_space = ceph_write_space;
206 sk->sk_state_change = ceph_state_change;
207}
208
209
210/*
211 * socket helpers
212 */
213
214/*
215 * initiate connection to a remote socket.
216 */
217static struct socket *ceph_tcp_connect(struct ceph_connection *con)
218{
219 struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
220 struct socket *sock;
221 int ret;
222
223 BUG_ON(con->sock);
224 ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
225 if (ret)
226 return ERR_PTR(ret);
227 con->sock = sock;
228 sock->sk->sk_allocation = GFP_NOFS;
229
230 set_sock_callbacks(sock, con);
231
232 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
233
234 ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK);
235 if (ret == -EINPROGRESS) {
236 dout("connect %s EINPROGRESS sk_state = %u\n",
237 pr_addr(&con->peer_addr.in_addr),
238 sock->sk->sk_state);
239 ret = 0;
240 }
241 if (ret < 0) {
242 pr_err("connect %s error %d\n",
243 pr_addr(&con->peer_addr.in_addr), ret);
244 sock_release(sock);
245 con->sock = NULL;
246 con->error_msg = "connect error";
247 }
248
249 if (ret < 0)
250 return ERR_PTR(ret);
251 return sock;
252}
253
254static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
255{
256 struct kvec iov = {buf, len};
257 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
258
259 return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
260}
261
262/*
263 * write something. @more is true if caller will be sending more data
264 * shortly.
265 */
266static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
267 size_t kvlen, size_t len, int more)
268{
269 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
270
271 if (more)
272 msg.msg_flags |= MSG_MORE;
273 else
274 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
275
276 return kernel_sendmsg(sock, &msg, iov, kvlen, len);
277}
278
279
280/*
281 * Shutdown/close the socket for the given connection.
282 */
283static int con_close_socket(struct ceph_connection *con)
284{
285 int rc;
286
287 dout("con_close_socket on %p sock %p\n", con, con->sock);
288 if (!con->sock)
289 return 0;
290 set_bit(SOCK_CLOSED, &con->state);
291 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
292 sock_release(con->sock);
293 con->sock = NULL;
294 clear_bit(SOCK_CLOSED, &con->state);
295 return rc;
296}
297
298/*
299 * Reset a connection. Discard all incoming and outgoing messages
300 * and clear *_seq state.
301 */
302static void ceph_msg_remove(struct ceph_msg *msg)
303{
304 list_del_init(&msg->list_head);
305 ceph_msg_put(msg);
306}
307static void ceph_msg_remove_list(struct list_head *head)
308{
309 while (!list_empty(head)) {
310 struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
311 list_head);
312 ceph_msg_remove(msg);
313 }
314}
315
316static void reset_connection(struct ceph_connection *con)
317{
318 /* reset connection, out_queue, msg_ and connect_seq */
319 /* discard existing out_queue and msg_seq */
320 ceph_msg_remove_list(&con->out_queue);
321 ceph_msg_remove_list(&con->out_sent);
322
323 if (con->in_msg) {
324 ceph_msg_put(con->in_msg);
325 con->in_msg = NULL;
326 }
327
328 con->connect_seq = 0;
329 con->out_seq = 0;
330 if (con->out_msg) {
331 ceph_msg_put(con->out_msg);
332 con->out_msg = NULL;
333 }
334 con->in_seq = 0;
335}
336
337/*
338 * mark a peer down. drop any open connections.
339 */
340void ceph_con_close(struct ceph_connection *con)
341{
342 dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
343 set_bit(CLOSED, &con->state); /* in case there's queued work */
344 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
345 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
346 clear_bit(KEEPALIVE_PENDING, &con->state);
347 clear_bit(WRITE_PENDING, &con->state);
348 mutex_lock(&con->mutex);
349 reset_connection(con);
350 cancel_delayed_work(&con->work);
351 mutex_unlock(&con->mutex);
352 queue_con(con);
353}
354
355/*
356 * Reopen a closed connection, with a new peer address.
357 */
358void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
359{
360 dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
361 set_bit(OPENING, &con->state);
362 clear_bit(CLOSED, &con->state);
363 memcpy(&con->peer_addr, addr, sizeof(*addr));
364 con->delay = 0; /* reset backoff memory */
365 queue_con(con);
366}
367
368/*
369 * return true if this connection ever successfully opened
370 */
371bool ceph_con_opened(struct ceph_connection *con)
372{
373 return con->connect_seq > 0;
374}
375
376/*
377 * generic get/put
378 */
379struct ceph_connection *ceph_con_get(struct ceph_connection *con)
380{
381 dout("con_get %p nref = %d -> %d\n", con,
382 atomic_read(&con->nref), atomic_read(&con->nref) + 1);
383 if (atomic_inc_not_zero(&con->nref))
384 return con;
385 return NULL;
386}
387
388void ceph_con_put(struct ceph_connection *con)
389{
390 dout("con_put %p nref = %d -> %d\n", con,
391 atomic_read(&con->nref), atomic_read(&con->nref) - 1);
392 BUG_ON(atomic_read(&con->nref) == 0);
393 if (atomic_dec_and_test(&con->nref)) {
394 BUG_ON(con->sock);
395 kfree(con);
396 }
397}
398
399/*
400 * initialize a new connection.
401 */
402void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
403{
404 dout("con_init %p\n", con);
405 memset(con, 0, sizeof(*con));
406 atomic_set(&con->nref, 1);
407 con->msgr = msgr;
408 mutex_init(&con->mutex);
409 INIT_LIST_HEAD(&con->out_queue);
410 INIT_LIST_HEAD(&con->out_sent);
411 INIT_DELAYED_WORK(&con->work, con_work);
412}
413
414
415/*
416 * We maintain a global counter to order connection attempts. Get
417 * a unique seq greater than @gt.
418 */
419static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
420{
421 u32 ret;
422
423 spin_lock(&msgr->global_seq_lock);
424 if (msgr->global_seq < gt)
425 msgr->global_seq = gt;
426 ret = ++msgr->global_seq;
427 spin_unlock(&msgr->global_seq_lock);
428 return ret;
429}
430
431
432/*
433 * Prepare footer for currently outgoing message, and finish things
434 * off. Assumes out_kvec* are already valid.. we just add on to the end.
435 */
436static void prepare_write_message_footer(struct ceph_connection *con, int v)
437{
438 struct ceph_msg *m = con->out_msg;
439
440 dout("prepare_write_message_footer %p\n", con);
441 con->out_kvec_is_msg = true;
442 con->out_kvec[v].iov_base = &m->footer;
443 con->out_kvec[v].iov_len = sizeof(m->footer);
444 con->out_kvec_bytes += sizeof(m->footer);
445 con->out_kvec_left++;
446 con->out_more = m->more_to_follow;
447 con->out_msg_done = true;
448}
449
450/*
451 * Prepare headers for the next outgoing message.
452 */
453static void prepare_write_message(struct ceph_connection *con)
454{
455 struct ceph_msg *m;
456 int v = 0;
457
458 con->out_kvec_bytes = 0;
459 con->out_kvec_is_msg = true;
460 con->out_msg_done = false;
461
462 /* Sneak an ack in there first? If we can get it into the same
463 * TCP packet that's a good thing. */
464 if (con->in_seq > con->in_seq_acked) {
465 con->in_seq_acked = con->in_seq;
466 con->out_kvec[v].iov_base = &tag_ack;
467 con->out_kvec[v++].iov_len = 1;
468 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
469 con->out_kvec[v].iov_base = &con->out_temp_ack;
470 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
471 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
472 }
473
474 m = list_first_entry(&con->out_queue,
475 struct ceph_msg, list_head);
476 con->out_msg = m;
477 if (test_bit(LOSSYTX, &con->state)) {
478 list_del_init(&m->list_head);
479 } else {
480 /* put message on sent list */
481 ceph_msg_get(m);
482 list_move_tail(&m->list_head, &con->out_sent);
483 }
484
485 m->hdr.seq = cpu_to_le64(++con->out_seq);
486
487 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
488 m, con->out_seq, le16_to_cpu(m->hdr.type),
489 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
490 le32_to_cpu(m->hdr.data_len),
491 m->nr_pages);
492 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
493
494 /* tag + hdr + front + middle */
495 con->out_kvec[v].iov_base = &tag_msg;
496 con->out_kvec[v++].iov_len = 1;
497 con->out_kvec[v].iov_base = &m->hdr;
498 con->out_kvec[v++].iov_len = sizeof(m->hdr);
499 con->out_kvec[v++] = m->front;
500 if (m->middle)
501 con->out_kvec[v++] = m->middle->vec;
502 con->out_kvec_left = v;
503 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
504 (m->middle ? m->middle->vec.iov_len : 0);
505 con->out_kvec_cur = con->out_kvec;
506
507 /* fill in crc (except data pages), footer */
508 con->out_msg->hdr.crc =
509 cpu_to_le32(crc32c(0, (void *)&m->hdr,
510 sizeof(m->hdr) - sizeof(m->hdr.crc)));
511 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
512 con->out_msg->footer.front_crc =
513 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
514 if (m->middle)
515 con->out_msg->footer.middle_crc =
516 cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
517 m->middle->vec.iov_len));
518 else
519 con->out_msg->footer.middle_crc = 0;
520 con->out_msg->footer.data_crc = 0;
521 dout("prepare_write_message front_crc %u data_crc %u\n",
522 le32_to_cpu(con->out_msg->footer.front_crc),
523 le32_to_cpu(con->out_msg->footer.middle_crc));
524
525 /* is there a data payload? */
526 if (le32_to_cpu(m->hdr.data_len) > 0) {
527 /* initialize page iterator */
528 con->out_msg_pos.page = 0;
529 con->out_msg_pos.page_pos =
530 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
531 con->out_msg_pos.data_pos = 0;
532 con->out_msg_pos.did_page_crc = 0;
533 con->out_more = 1; /* data + footer will follow */
534 } else {
535 /* no, queue up footer too and be done */
536 prepare_write_message_footer(con, v);
537 }
538
539 set_bit(WRITE_PENDING, &con->state);
540}
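/*
 * Editorial summary of the kvec layout assembled above.  One message
 * write, with a piggybacked ack, puts the following on the wire:
 *
 *	[tag_ack][le64 ack seq]                only if in_seq > in_seq_acked
 *	[tag_msg][ceph_msg_header][front][middle]    middle is optional
 *	[data pages][ceph_msg_footer]
 *
 * The footer is queued immediately only when there is no data payload;
 * otherwise write_partial_msg_pages() appends it after the pages.
 */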
541
542/*
543 * Prepare an ack.
544 */
545static void prepare_write_ack(struct ceph_connection *con)
546{
547 dout("prepare_write_ack %p %llu -> %llu\n", con,
548 con->in_seq_acked, con->in_seq);
549 con->in_seq_acked = con->in_seq;
550
551 con->out_kvec[0].iov_base = &tag_ack;
552 con->out_kvec[0].iov_len = 1;
553 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
554 con->out_kvec[1].iov_base = &con->out_temp_ack;
555 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
556 con->out_kvec_left = 2;
557 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
558 con->out_kvec_cur = con->out_kvec;
559 con->out_more = 1; /* more will follow.. eventually.. */
560 set_bit(WRITE_PENDING, &con->state);
561}
562
563/*
564 * Prepare to write keepalive byte.
565 */
566static void prepare_write_keepalive(struct ceph_connection *con)
567{
568 dout("prepare_write_keepalive %p\n", con);
569 con->out_kvec[0].iov_base = &tag_keepalive;
570 con->out_kvec[0].iov_len = 1;
571 con->out_kvec_left = 1;
572 con->out_kvec_bytes = 1;
573 con->out_kvec_cur = con->out_kvec;
574 set_bit(WRITE_PENDING, &con->state);
575}
576
577/*
578 * Connection negotiation.
579 */
580
581static void prepare_connect_authorizer(struct ceph_connection *con)
582{
583 void *auth_buf;
584 int auth_len = 0;
585 int auth_protocol = 0;
586
587 mutex_unlock(&con->mutex);
588 if (con->ops->get_authorizer)
589 con->ops->get_authorizer(con, &auth_buf, &auth_len,
590 &auth_protocol, &con->auth_reply_buf,
591 &con->auth_reply_buf_len,
592 con->auth_retry);
593 mutex_lock(&con->mutex);
594
595 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
596 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
597
598 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
599 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
600 con->out_kvec_left++;
601 con->out_kvec_bytes += auth_len;
602}
603
604/*
605 * We connected to a peer and are saying hello.
606 */
607static void prepare_write_banner(struct ceph_messenger *msgr,
608 struct ceph_connection *con)
609{
610 int len = strlen(CEPH_BANNER);
611
612 con->out_kvec[0].iov_base = CEPH_BANNER;
613 con->out_kvec[0].iov_len = len;
614 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
615 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
616 con->out_kvec_left = 2;
617 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
618 con->out_kvec_cur = con->out_kvec;
619 con->out_more = 0;
620 set_bit(WRITE_PENDING, &con->state);
621}
622
623static void prepare_write_connect(struct ceph_messenger *msgr,
624 struct ceph_connection *con,
625 int after_banner)
626{
627 unsigned global_seq = get_global_seq(con->msgr, 0);
628 int proto;
629
630 switch (con->peer_name.type) {
631 case CEPH_ENTITY_TYPE_MON:
632 proto = CEPH_MONC_PROTOCOL;
633 break;
634 case CEPH_ENTITY_TYPE_OSD:
635 proto = CEPH_OSDC_PROTOCOL;
636 break;
637 case CEPH_ENTITY_TYPE_MDS:
638 proto = CEPH_MDSC_PROTOCOL;
639 break;
640 default:
641 BUG();
642 }
643
644 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
645 con->connect_seq, global_seq, proto);
646
647 con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
648 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
649 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
650 con->out_connect.global_seq = cpu_to_le32(global_seq);
651 con->out_connect.protocol_version = cpu_to_le32(proto);
652 con->out_connect.flags = 0;
653
654 if (!after_banner) {
655 con->out_kvec_left = 0;
656 con->out_kvec_bytes = 0;
657 }
658 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
659 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
660 con->out_kvec_left++;
661 con->out_kvec_bytes += sizeof(con->out_connect);
662 con->out_kvec_cur = con->out_kvec;
663 con->out_more = 0;
664 set_bit(WRITE_PENDING, &con->state);
665
666 prepare_connect_authorizer(con);
667}
668
669
670/*
671 * write as much of pending kvecs to the socket as we can.
672 * 1 -> done
673 * 0 -> socket full, but more to do
674 * <0 -> error
675 */
676static int write_partial_kvec(struct ceph_connection *con)
677{
678 int ret;
679
680 dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
681 while (con->out_kvec_bytes > 0) {
682 ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
683 con->out_kvec_left, con->out_kvec_bytes,
684 con->out_more);
685 if (ret <= 0)
686 goto out;
687 con->out_kvec_bytes -= ret;
688 if (con->out_kvec_bytes == 0)
689 break; /* done */
690 while (ret > 0) {
691 if (ret >= con->out_kvec_cur->iov_len) {
692 ret -= con->out_kvec_cur->iov_len;
693 con->out_kvec_cur++;
694 con->out_kvec_left--;
695 } else {
696 con->out_kvec_cur->iov_len -= ret;
697 con->out_kvec_cur->iov_base += ret;
698 ret = 0;
699 break;
700 }
701 }
702 }
703 con->out_kvec_left = 0;
704 con->out_kvec_is_msg = false;
705 ret = 1;
706out:
707 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
708 con->out_kvec_bytes, con->out_kvec_left, ret);
709 return ret; /* done! */
710}
711
712/*
713 * Write as much message data payload as we can. If we finish, queue
714 * up the footer.
715 * 1 -> done, footer is now queued in out_kvec[].
716 * 0 -> socket full, but more to do
717 * <0 -> error
718 */
719static int write_partial_msg_pages(struct ceph_connection *con)
720{
721 struct ceph_msg *msg = con->out_msg;
722 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
723 size_t len;
724 int crc = !con->msgr->nocrc;
725 int ret;
726
727 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
728 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
729 con->out_msg_pos.page_pos);
730
731 while (con->out_msg_pos.page < con->out_msg->nr_pages) {
732 struct page *page = NULL;
733 void *kaddr = NULL;
734
735 /*
736 * if we are calculating the data crc (the default), we need
737 * to map the page. if our pages[] has been revoked, use the
738 * zero page.
739 */
740 if (msg->pages) {
741 page = msg->pages[con->out_msg_pos.page];
742 if (crc)
743 kaddr = kmap(page);
744 } else if (msg->pagelist) {
745 page = list_first_entry(&msg->pagelist->head,
746 struct page, lru);
747 if (crc)
748 kaddr = kmap(page);
749 } else {
750 page = con->msgr->zero_page;
751 if (crc)
752 kaddr = page_address(con->msgr->zero_page);
753 }
754 len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
755 (int)(data_len - con->out_msg_pos.data_pos));
756 if (crc && !con->out_msg_pos.did_page_crc) {
757 void *base = kaddr + con->out_msg_pos.page_pos;
758 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
759
760 BUG_ON(kaddr == NULL);
761 con->out_msg->footer.data_crc =
762 cpu_to_le32(crc32c(tmpcrc, base, len));
763 con->out_msg_pos.did_page_crc = 1;
764 }
765
766 ret = kernel_sendpage(con->sock, page,
767 con->out_msg_pos.page_pos, len,
768 MSG_DONTWAIT | MSG_NOSIGNAL |
769 MSG_MORE);
770
771 if (crc && (msg->pages || msg->pagelist))
772 kunmap(page);
773
774 if (ret <= 0)
775 goto out;
776
777 con->out_msg_pos.data_pos += ret;
778 con->out_msg_pos.page_pos += ret;
779 if (ret == len) {
780 con->out_msg_pos.page_pos = 0;
781 con->out_msg_pos.page++;
782 con->out_msg_pos.did_page_crc = 0;
783 if (msg->pagelist)
784 list_move_tail(&page->lru,
785 &msg->pagelist->head);
786 }
787 }
788
789 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
790
791 /* prepare and queue up footer, too */
792 if (!crc)
793 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
794 con->out_kvec_bytes = 0;
795 con->out_kvec_left = 0;
796 con->out_kvec_cur = con->out_kvec;
797 prepare_write_message_footer(con, 0);
798 ret = 1;
799out:
800 return ret;
801}
802
803/*
804 * write some zeros
805 */
806static int write_partial_skip(struct ceph_connection *con)
807{
808 int ret;
809
810 while (con->out_skip > 0) {
811 struct kvec iov = {
812 .iov_base = page_address(con->msgr->zero_page),
813 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
814 };
815
816 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
817 if (ret <= 0)
818 goto out;
819 con->out_skip -= ret;
820 }
821 ret = 1;
822out:
823 return ret;
824}
825
826/*
827 * Prepare to read connection handshake, or an ack.
828 */
829static void prepare_read_banner(struct ceph_connection *con)
830{
831 dout("prepare_read_banner %p\n", con);
832 con->in_base_pos = 0;
833}
834
835static void prepare_read_connect(struct ceph_connection *con)
836{
837 dout("prepare_read_connect %p\n", con);
838 con->in_base_pos = 0;
839}
840
841static void prepare_read_ack(struct ceph_connection *con)
842{
843 dout("prepare_read_ack %p\n", con);
844 con->in_base_pos = 0;
845}
846
847static void prepare_read_tag(struct ceph_connection *con)
848{
849 dout("prepare_read_tag %p\n", con);
850 con->in_base_pos = 0;
851 con->in_tag = CEPH_MSGR_TAG_READY;
852}
853
854/*
855 * Prepare to read a message.
856 */
857static int prepare_read_message(struct ceph_connection *con)
858{
859 dout("prepare_read_message %p\n", con);
860 BUG_ON(con->in_msg != NULL);
861 con->in_base_pos = 0;
862 con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
863 return 0;
864}
865
866
867static int read_partial(struct ceph_connection *con,
868 int *to, int size, void *object)
869{
870 *to += size;
871 while (con->in_base_pos < *to) {
872 int left = *to - con->in_base_pos;
873 int have = size - left;
874 int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
875 if (ret <= 0)
876 return ret;
877 con->in_base_pos += ret;
878 }
879 return 1;
880}
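/*
 * Editorial example (hypothetical fields a and b): @to is a running
 * total matched against con->in_base_pos, so consecutive calls form a
 * resumable sequence.  After a short read the caller just returns; on
 * re-entry, steps that already completed fall through without touching
 * the socket:
 *
 *	int to = 0;
 *	ret = read_partial(con, &to, sizeof(a), &a);
 *	if (ret <= 0)
 *		return ret;
 *	ret = read_partial(con, &to, sizeof(b), &b);
 *	if (ret <= 0)
 *		return ret;
 */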
881
882
883/*
884 * Read all or part of the connect-side handshake on a new connection
885 */
886static int read_partial_banner(struct ceph_connection *con)
887{
888 int ret, to = 0;
889
890 dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
891
892 /* peer's banner */
893 ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
894 if (ret <= 0)
895 goto out;
896 ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
897 &con->actual_peer_addr);
898 if (ret <= 0)
899 goto out;
900 ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
901 &con->peer_addr_for_me);
902 if (ret <= 0)
903 goto out;
904out:
905 return ret;
906}
907
908static int read_partial_connect(struct ceph_connection *con)
909{
910 int ret, to = 0;
911
912 dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
913
914 ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
915 if (ret <= 0)
916 goto out;
917 ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
918 con->auth_reply_buf);
919 if (ret <= 0)
920 goto out;
921
922 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
923 con, (int)con->in_reply.tag,
924 le32_to_cpu(con->in_reply.connect_seq),
925 le32_to_cpu(con->in_reply.global_seq));
926out:
927 return ret;
928
929}
930
931/*
932 * Verify the hello banner looks okay.
933 */
934static int verify_hello(struct ceph_connection *con)
935{
936 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
937 pr_err("connect to %s got bad banner\n",
938 pr_addr(&con->peer_addr.in_addr));
939 con->error_msg = "protocol error, bad banner";
940 return -1;
941 }
942 return 0;
943}
944
945static bool addr_is_blank(struct sockaddr_storage *ss)
946{
947 switch (ss->ss_family) {
948 case AF_INET:
949 return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
950 case AF_INET6:
951 return
952 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
953 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
954 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
955 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
956 }
957 return false;
958}
959
960static int addr_port(struct sockaddr_storage *ss)
961{
962 switch (ss->ss_family) {
963 case AF_INET:
964 return ntohs(((struct sockaddr_in *)ss)->sin_port);
965 case AF_INET6:
966 return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
967 }
968 return 0;
969}
970
971static void addr_set_port(struct sockaddr_storage *ss, int p)
972{
973 switch (ss->ss_family) {
974 case AF_INET:
975 ((struct sockaddr_in *)ss)->sin_port = htons(p); break;
976 case AF_INET6:
977 ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
978 }
979}
980
981/*
982 * Parse an ip[:port] list into an addr array. Use the default
983 * monitor port if a port isn't specified.
984 */
985int ceph_parse_ips(const char *c, const char *end,
986 struct ceph_entity_addr *addr,
987 int max_count, int *count)
988{
989 int i;
990 const char *p = c;
991
992 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
993 for (i = 0; i < max_count; i++) {
994 const char *ipend;
995 struct sockaddr_storage *ss = &addr[i].in_addr;
996 struct sockaddr_in *in4 = (void *)ss;
997 struct sockaddr_in6 *in6 = (void *)ss;
998 int port;
999
1000 memset(ss, 0, sizeof(*ss));
1001 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1002 ',', &ipend)) {
1003 ss->ss_family = AF_INET;
1004 } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1005 ',', &ipend)) {
1006 ss->ss_family = AF_INET6;
1007 } else {
1008 goto bad;
1009 }
1010 p = ipend;
1011
1012 /* port? */
1013 if (p < end && *p == ':') {
1014 port = 0;
1015 p++;
1016 while (p < end && *p >= '0' && *p <= '9') {
1017 port = (port * 10) + (*p - '0');
1018 p++;
1019 }
1020 if (port > 65535 || port == 0)
1021 goto bad;
1022 } else {
1023 port = CEPH_MON_PORT;
1024 }
1025
1026 addr_set_port(ss, port);
1027
1028 dout("parse_ips got %s\n", pr_addr(ss));
1029
1030 if (p == end)
1031 break;
1032 if (*p != ',')
1033 goto bad;
1034 p++;
1035 }
1036
1037 if (p != end)
1038 goto bad;
1039
1040 if (count)
1041 *count = i + 1;
1042 return 0;
1043
1044bad:
1045 pr_err("parse_ips bad ip '%s'\n", c);
1046 return -EINVAL;
1047}
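/*
 * Editorial usage sketch (hypothetical mount string): parse up to two
 * monitor addresses, letting the second fall back to CEPH_MON_PORT:
 *
 *	const char *s = "192.168.0.1:6789,192.168.0.2";
 *	struct ceph_entity_addr mon_addr[2];
 *	int num_mon;
 *
 *	if (ceph_parse_ips(s, s + strlen(s), mon_addr, 2, &num_mon) == 0)
 *		num_mon is 2, and mon_addr[1] carries CEPH_MON_PORT
 */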
1048
1049static int process_banner(struct ceph_connection *con)
1050{
1051 dout("process_banner on %p\n", con);
1052
1053 if (verify_hello(con) < 0)
1054 return -1;
1055
1056 ceph_decode_addr(&con->actual_peer_addr);
1057 ceph_decode_addr(&con->peer_addr_for_me);
1058
1059 /*
1060 * Make sure the other end is who we wanted. note that the other
1061 * end may not yet know their ip address, so if it's 0.0.0.0, give
1062 * them the benefit of the doubt.
1063 */
1064 if (memcmp(&con->peer_addr, &con->actual_peer_addr,
1065 sizeof(con->peer_addr)) != 0 &&
1066 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1067 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1068 pr_warning("wrong peer, want %s/%lld, got %s/%lld\n",
1069 pr_addr(&con->peer_addr.in_addr),
1070 le64_to_cpu(con->peer_addr.nonce),
1071 pr_addr(&con->actual_peer_addr.in_addr),
1072 le64_to_cpu(con->actual_peer_addr.nonce));
1073 con->error_msg = "wrong peer at address";
1074 return -1;
1075 }
1076
1077 /*
1078 * did we learn our address?
1079 */
1080 if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
1081 int port = addr_port(&con->msgr->inst.addr.in_addr);
1082
1083 memcpy(&con->msgr->inst.addr.in_addr,
1084 &con->peer_addr_for_me.in_addr,
1085 sizeof(con->peer_addr_for_me.in_addr));
1086 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1087 encode_my_addr(con->msgr);
1088 dout("process_banner learned my addr is %s\n",
1089 pr_addr(&con->msgr->inst.addr.in_addr));
1090 }
1091
1092 set_bit(NEGOTIATING, &con->state);
1093 prepare_read_connect(con);
1094 return 0;
1095}
1096
1097static void fail_protocol(struct ceph_connection *con)
1098{
1099 reset_connection(con);
1100 set_bit(CLOSED, &con->state); /* in case there's queued work */
1101
1102 mutex_unlock(&con->mutex);
1103 if (con->ops->bad_proto)
1104 con->ops->bad_proto(con);
1105 mutex_lock(&con->mutex);
1106}
1107
1108static int process_connect(struct ceph_connection *con)
1109{
1110 u64 sup_feat = CEPH_FEATURE_SUPPORTED;
1111 u64 req_feat = CEPH_FEATURE_REQUIRED;
1112 u64 server_feat = le64_to_cpu(con->in_reply.features);
1113
1114 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
1115
1116 switch (con->in_reply.tag) {
1117 case CEPH_MSGR_TAG_FEATURES:
1118 pr_err("%s%lld %s feature set mismatch,"
1119 " my %llx < server's %llx, missing %llx\n",
1120 ENTITY_NAME(con->peer_name),
1121 pr_addr(&con->peer_addr.in_addr),
1122 sup_feat, server_feat, server_feat & ~sup_feat);
1123 con->error_msg = "missing required protocol features";
1124 fail_protocol(con);
1125 return -1;
1126
1127 case CEPH_MSGR_TAG_BADPROTOVER:
1128 pr_err("%s%lld %s protocol version mismatch,"
1129 " my %d != server's %d\n",
1130 ENTITY_NAME(con->peer_name),
1131 pr_addr(&con->peer_addr.in_addr),
1132 le32_to_cpu(con->out_connect.protocol_version),
1133 le32_to_cpu(con->in_reply.protocol_version));
1134 con->error_msg = "protocol version mismatch";
1135 fail_protocol(con);
1136 return -1;
1137
1138 case CEPH_MSGR_TAG_BADAUTHORIZER:
1139 con->auth_retry++;
1140 dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
1141 con->auth_retry);
1142 if (con->auth_retry == 2) {
1143 con->error_msg = "connect authorization failure";
1144 reset_connection(con);
1145 set_bit(CLOSED, &con->state);
1146 return -1;
1147 }
1148 con->auth_retry = 1;
1149 prepare_write_connect(con->msgr, con, 0);
1150 prepare_read_connect(con);
1151 break;
1152
1153 case CEPH_MSGR_TAG_RESETSESSION:
1154 /*
1155 * If we connected with a large connect_seq but the peer
1156 * has no record of a session with us (no connection, or
1157 * connect_seq == 0), they will send RESETSESSION to indicate
1158 * that they must have reset their session, and may have
1159 * dropped messages.
1160 */
1161 dout("process_connect got RESET peer seq %u\n",
1162 le32_to_cpu(con->in_connect.connect_seq));
1163 pr_err("%s%lld %s connection reset\n",
1164 ENTITY_NAME(con->peer_name),
1165 pr_addr(&con->peer_addr.in_addr));
1166 reset_connection(con);
1167 prepare_write_connect(con->msgr, con, 0);
1168 prepare_read_connect(con);
1169
1170 /* Tell ceph about it. */
1171 mutex_unlock(&con->mutex);
1172 pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
1173 if (con->ops->peer_reset)
1174 con->ops->peer_reset(con);
1175 mutex_lock(&con->mutex);
1176 break;
1177
1178 case CEPH_MSGR_TAG_RETRY_SESSION:
1179 /*
1180 * If we sent a smaller connect_seq than the peer has, try
1181 * again with a larger value.
1182 */
1183 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
1184 le32_to_cpu(con->out_connect.connect_seq),
1185 le32_to_cpu(con->in_connect.connect_seq));
1186 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
1187 prepare_write_connect(con->msgr, con, 0);
1188 prepare_read_connect(con);
1189 break;
1190
1191 case CEPH_MSGR_TAG_RETRY_GLOBAL:
1192 /*
1193 * If we sent a smaller global_seq than the peer has, try
1194 * again with a larger value.
1195 */
1196 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1197 con->peer_global_seq,
1198 le32_to_cpu(con->in_connect.global_seq));
1199 get_global_seq(con->msgr,
1200 le32_to_cpu(con->in_connect.global_seq));
1201 prepare_write_connect(con->msgr, con, 0);
1202 prepare_read_connect(con);
1203 break;
1204
1205 case CEPH_MSGR_TAG_READY:
1206 if (req_feat & ~server_feat) {
1207 pr_err("%s%lld %s protocol feature mismatch,"
1208 " my required %llx > server's %llx, need %llx\n",
1209 ENTITY_NAME(con->peer_name),
1210 pr_addr(&con->peer_addr.in_addr),
1211 req_feat, server_feat, req_feat & ~server_feat);
1212 con->error_msg = "missing required protocol features";
1213 fail_protocol(con);
1214 return -1;
1215 }
1216 clear_bit(CONNECTING, &con->state);
1217 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1218 con->connect_seq++;
1219 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1220 con->peer_global_seq,
1221 le32_to_cpu(con->in_reply.connect_seq),
1222 con->connect_seq);
1223 WARN_ON(con->connect_seq !=
1224 le32_to_cpu(con->in_reply.connect_seq));
1225
1226 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1227 set_bit(LOSSYTX, &con->state);
1228
1229 prepare_read_tag(con);
1230 break;
1231
1232 case CEPH_MSGR_TAG_WAIT:
1233 /*
1234 * If there is a connection race (we are opening
1235 * connections to each other), one of us may just have
1236 * to WAIT. This shouldn't happen if we are the
1237 * client.
1238 */
1239 pr_err("process_connect peer connecting WAIT\n");
1240 /* fall through */
1241 default:
1242 pr_err("connect protocol error, will retry\n");
1243 con->error_msg = "protocol error, garbage tag during connect";
1244 return -1;
1245 }
1246 return 0;
1247}
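/*
 * Editorial summary of the reply tags handled above:
 *
 *	FEATURES, BADPROTOVER	 incompatible peer; fail the protocol
 *	BADAUTHORIZER		 retry once with a fresh authorizer
 *	RESETSESSION		 peer lost our session; reset, reconnect
 *	RETRY_SESSION, _GLOBAL	 bump connect_seq/global_seq, reconnect
 *	READY			 handshake done; start exchanging messages
 *	WAIT, anything else	 error; the fault path will retry
 */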
1248
1249
1250/*
1251 * read (part of) an ack
1252 */
1253static int read_partial_ack(struct ceph_connection *con)
1254{
1255 int to = 0;
1256
1257 return read_partial(con, &to, sizeof(con->in_temp_ack),
1258 &con->in_temp_ack);
1259}
1260
1261
1262/*
1263 * We can finally discard anything that's been acked.
1264 */
1265static void process_ack(struct ceph_connection *con)
1266{
1267 struct ceph_msg *m;
1268 u64 ack = le64_to_cpu(con->in_temp_ack);
1269 u64 seq;
1270
1271 while (!list_empty(&con->out_sent)) {
1272 m = list_first_entry(&con->out_sent, struct ceph_msg,
1273 list_head);
1274 seq = le64_to_cpu(m->hdr.seq);
1275 if (seq > ack)
1276 break;
1277 dout("got ack for seq %llu type %d at %p\n", seq,
1278 le16_to_cpu(m->hdr.type), m);
1279 ceph_msg_remove(m);
1280 }
1281 prepare_read_tag(con);
1282}
1283
1284
1285
1286
1287static int read_partial_message_section(struct ceph_connection *con,
1288 struct kvec *section, unsigned int sec_len,
1289 u32 *crc)
1290{
1291 int left;
1292 int ret;
1293
1294 BUG_ON(!section);
1295
1296 while (section->iov_len < sec_len) {
1297 BUG_ON(section->iov_base == NULL);
1298 left = sec_len - section->iov_len;
1299 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
1300 section->iov_len, left);
1301 if (ret <= 0)
1302 return ret;
1303 section->iov_len += ret;
1304 if (section->iov_len == sec_len)
1305 *crc = crc32c(0, section->iov_base,
1306 section->iov_len);
1307 }
1308
1309 return 1;
1310}
1311
1312static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1313 struct ceph_msg_header *hdr,
1314 int *skip);
1315/*
1316 * read (part of) a message.
1317 */
1318static int read_partial_message(struct ceph_connection *con)
1319{
1320 struct ceph_msg *m = con->in_msg;
1321 void *p;
1322 int ret;
1323 int to, left;
1324 unsigned front_len, middle_len, data_len, data_off;
1325 int datacrc = !con->msgr->nocrc;
1326 int skip = 0;
1327
1328 dout("read_partial_message con %p msg %p\n", con, m);
1329
1330 /* header */
1331 while (con->in_base_pos < sizeof(con->in_hdr)) {
1332 left = sizeof(con->in_hdr) - con->in_base_pos;
1333 ret = ceph_tcp_recvmsg(con->sock,
1334 (char *)&con->in_hdr + con->in_base_pos,
1335 left);
1336 if (ret <= 0)
1337 return ret;
1338 con->in_base_pos += ret;
1339 if (con->in_base_pos == sizeof(con->in_hdr)) {
1340 u32 crc = crc32c(0, (void *)&con->in_hdr,
1341 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1342 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1343 pr_err("read_partial_message bad hdr "
1344 " crc %u != expected %u\n",
1345 crc, con->in_hdr.crc);
1346 return -EBADMSG;
1347 }
1348 }
1349 }
1350 front_len = le32_to_cpu(con->in_hdr.front_len);
1351 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1352 return -EIO;
1353 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1354 if (middle_len > CEPH_MSG_MAX_DATA_LEN)
1355 return -EIO;
1356 data_len = le32_to_cpu(con->in_hdr.data_len);
1357 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1358 return -EIO;
1359 data_off = le16_to_cpu(con->in_hdr.data_off);
1360
1361 /* allocate message? */
1362 if (!con->in_msg) {
1363 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1364 con->in_hdr.front_len, con->in_hdr.data_len);
1365 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1366 if (skip) {
1367 /* skip this message */
1368 dout("alloc_msg returned NULL, skipping message\n");
1369 con->in_base_pos = -front_len - middle_len - data_len -
1370 sizeof(m->footer);
1371 con->in_tag = CEPH_MSGR_TAG_READY;
1372 return 0;
1373 }
1374 if (IS_ERR(con->in_msg)) {
1375 ret = PTR_ERR(con->in_msg);
1376 con->in_msg = NULL;
1377 con->error_msg =
1378 "error allocating memory for incoming message";
1379 return ret;
1380 }
1381 m = con->in_msg;
1382 m->front.iov_len = 0; /* haven't read it yet */
1383 if (m->middle)
1384 m->middle->vec.iov_len = 0;
1385
1386 con->in_msg_pos.page = 0;
1387 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1388 con->in_msg_pos.data_pos = 0;
1389 }
1390
1391 /* front */
1392 ret = read_partial_message_section(con, &m->front, front_len,
1393 &con->in_front_crc);
1394 if (ret <= 0)
1395 return ret;
1396
1397 /* middle */
1398 if (m->middle) {
1399 ret = read_partial_message_section(con, &m->middle->vec, middle_len,
1400 &con->in_middle_crc);
1401 if (ret <= 0)
1402 return ret;
1403 }
1404
1405 /* (page) data */
1406 while (con->in_msg_pos.data_pos < data_len) {
1407 left = min((int)(data_len - con->in_msg_pos.data_pos),
1408 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1409 BUG_ON(m->pages == NULL);
1410 p = kmap(m->pages[con->in_msg_pos.page]);
1411 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1412 left);
1413 if (ret > 0 && datacrc)
1414 con->in_data_crc =
1415 crc32c(con->in_data_crc,
1416 p + con->in_msg_pos.page_pos, ret);
1417 kunmap(m->pages[con->in_msg_pos.page]);
1418 if (ret <= 0)
1419 return ret;
1420 con->in_msg_pos.data_pos += ret;
1421 con->in_msg_pos.page_pos += ret;
1422 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1423 con->in_msg_pos.page_pos = 0;
1424 con->in_msg_pos.page++;
1425 }
1426 }
1427
1428 /* footer */
1429 to = sizeof(m->hdr) + sizeof(m->footer);
1430 while (con->in_base_pos < to) {
1431 left = to - con->in_base_pos;
1432 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1433 (con->in_base_pos - sizeof(m->hdr)),
1434 left);
1435 if (ret <= 0)
1436 return ret;
1437 con->in_base_pos += ret;
1438 }
1439 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1440 m, front_len, m->footer.front_crc, middle_len,
1441 m->footer.middle_crc, data_len, m->footer.data_crc);
1442
1443 /* crc ok? */
1444 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
1445 pr_err("read_partial_message %p front crc %u != exp. %u\n",
1446 m, con->in_front_crc, m->footer.front_crc);
1447 return -EBADMSG;
1448 }
1449 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
1450 pr_err("read_partial_message %p middle crc %u != exp %u\n",
1451 m, con->in_middle_crc, m->footer.middle_crc);
1452 return -EBADMSG;
1453 }
1454 if (datacrc &&
1455 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1456 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1457 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1458 con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1459 return -EBADMSG;
1460 }
1461
1462 return 1; /* done! */
1463}
1464
1465/*
1466 * Process message. This happens in the worker thread. The callback should
1467 * be careful not to do anything that waits on other incoming messages or it
1468 * may deadlock.
1469 */
1470static void process_message(struct ceph_connection *con)
1471{
1472 struct ceph_msg *msg;
1473
1474 msg = con->in_msg;
1475 con->in_msg = NULL;
1476
1477 /* if first message, set peer_name */
1478 if (con->peer_name.type == 0)
1479 con->peer_name = msg->hdr.src.name;
1480
1481 con->in_seq++;
1482 mutex_unlock(&con->mutex);
1483
1484 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1485 msg, le64_to_cpu(msg->hdr.seq),
1486 ENTITY_NAME(msg->hdr.src.name),
1487 le16_to_cpu(msg->hdr.type),
1488 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1489 le32_to_cpu(msg->hdr.front_len),
1490 le32_to_cpu(msg->hdr.data_len),
1491 con->in_front_crc, con->in_middle_crc, con->in_data_crc);
1492 con->ops->dispatch(con, msg);
1493
1494 mutex_lock(&con->mutex);
1495 prepare_read_tag(con);
1496}
1497
1498
1499/*
1500 * Write something to the socket. Called in a worker thread when the
1501 * socket appears to be writeable and we have something ready to send.
1502 */
1503static int try_write(struct ceph_connection *con)
1504{
1505 struct ceph_messenger *msgr = con->msgr;
1506 int ret = 1;
1507
1508 dout("try_write start %p state %lu nref %d\n", con, con->state,
1509 atomic_read(&con->nref));
1510
1511 mutex_lock(&con->mutex);
1512more:
1513 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1514
1515 /* open the socket first? */
1516 if (con->sock == NULL) {
1517 /*
1518 * if we were STANDBY and are reconnecting _this_
1519 * connection, bump connect_seq now. Always bump
1520 * global_seq.
1521 */
1522 if (test_and_clear_bit(STANDBY, &con->state))
1523 con->connect_seq++;
1524
1525 prepare_write_banner(msgr, con);
1526 prepare_write_connect(msgr, con, 1);
1527 prepare_read_banner(con);
1528 set_bit(CONNECTING, &con->state);
1529 clear_bit(NEGOTIATING, &con->state);
1530
1531 BUG_ON(con->in_msg);
1532 con->in_tag = CEPH_MSGR_TAG_READY;
1533 dout("try_write initiating connect on %p new state %lu\n",
1534 con, con->state);
1535 con->sock = ceph_tcp_connect(con);
1536 if (IS_ERR(con->sock)) {
1537 con->sock = NULL;
1538 con->error_msg = "connect error";
1539 ret = -1;
1540 goto out;
1541 }
1542 }
1543
1544more_kvec:
1545 /* kvec data queued? */
1546 if (con->out_skip) {
1547 ret = write_partial_skip(con);
1548 if (ret <= 0)
1549 goto done;
1550 }
1555 if (con->out_kvec_left) {
1556 ret = write_partial_kvec(con);
1557 if (ret <= 0)
1558 goto done;
1559 }
1560
1561 /* msg pages? */
1562 if (con->out_msg) {
1563 if (con->out_msg_done) {
1564 ceph_msg_put(con->out_msg);
1565 con->out_msg = NULL; /* we're done with this one */
1566 goto do_next;
1567 }
1568
1569 ret = write_partial_msg_pages(con);
1570 if (ret == 1)
1571 goto more_kvec; /* we need to send the footer, too! */
1572 if (ret == 0)
1573 goto done;
1574 if (ret < 0) {
1575 dout("try_write write_partial_msg_pages err %d\n",
1576 ret);
1577 goto done;
1578 }
1579 }
1580
1581do_next:
1582 if (!test_bit(CONNECTING, &con->state)) {
1583 /* is anything else pending? */
1584 if (!list_empty(&con->out_queue)) {
1585 prepare_write_message(con);
1586 goto more;
1587 }
1588 if (con->in_seq > con->in_seq_acked) {
1589 prepare_write_ack(con);
1590 goto more;
1591 }
1592 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
1593 prepare_write_keepalive(con);
1594 goto more;
1595 }
1596 }
1597
1598 /* Nothing to do! */
1599 clear_bit(WRITE_PENDING, &con->state);
1600 dout("try_write nothing else to write.\n");
1601done:
1602 ret = 0;
1603out:
1604 mutex_unlock(&con->mutex);
1605 dout("try_write done on %p\n", con);
1606 return ret;
1607}
1608
1609
1610
1611/*
1612 * Read what we can from the socket.
1613 */
1614static int try_read(struct ceph_connection *con)
1615{
1617 int ret = -1;
1618
1619 if (!con->sock)
1620 return 0;
1621
1622 if (test_bit(STANDBY, &con->state))
1623 return 0;
1624
1625 dout("try_read start on %p\n", con);
1627
1628 mutex_lock(&con->mutex);
1629
1630more:
1631 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
1632 con->in_base_pos);
1633 if (test_bit(CONNECTING, &con->state)) {
1634 if (!test_bit(NEGOTIATING, &con->state)) {
1635 dout("try_read connecting\n");
1636 ret = read_partial_banner(con);
1637 if (ret <= 0)
1638 goto done;
1639 if (process_banner(con) < 0) {
1640 ret = -1;
1641 goto out;
1642 }
1643 }
1644 ret = read_partial_connect(con);
1645 if (ret <= 0)
1646 goto done;
1647 if (process_connect(con) < 0) {
1648 ret = -1;
1649 goto out;
1650 }
1651 goto more;
1652 }
1653
1654 if (con->in_base_pos < 0) {
1655 /*
1656 * skipping + discarding content.
1657 *
1658 * FIXME: there must be a better way to do this!
1659 */
1660 static char buf[1024];
1661 int skip = min(1024, -con->in_base_pos);
1662 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
1663 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
1664 if (ret <= 0)
1665 goto done;
1666 con->in_base_pos += ret;
1667 if (con->in_base_pos)
1668 goto more;
1669 }
1670 if (con->in_tag == CEPH_MSGR_TAG_READY) {
1671 /*
1672 * what's next?
1673 */
1674 ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
1675 if (ret <= 0)
1676 goto done;
1677 dout("try_read got tag %d\n", (int)con->in_tag);
1678 switch (con->in_tag) {
1679 case CEPH_MSGR_TAG_MSG:
1680 prepare_read_message(con);
1681 break;
1682 case CEPH_MSGR_TAG_ACK:
1683 prepare_read_ack(con);
1684 break;
1685 case CEPH_MSGR_TAG_CLOSE:
1686 set_bit(CLOSED, &con->state); /* fixme */
1687 goto done;
1688 default:
1689 goto bad_tag;
1690 }
1691 }
1692 if (con->in_tag == CEPH_MSGR_TAG_MSG) {
1693 ret = read_partial_message(con);
1694 if (ret <= 0) {
1695 switch (ret) {
1696 case -EBADMSG:
1697 con->error_msg = "bad crc";
1698 ret = -EIO;
1699 goto out;
1700 case -EIO:
1701 con->error_msg = "io error";
1702 goto out;
1703 default:
1704 goto done;
1705 }
1706 }
1707 if (con->in_tag == CEPH_MSGR_TAG_READY)
1708 goto more;
1709 process_message(con);
1710 goto more;
1711 }
1712 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
1713 ret = read_partial_ack(con);
1714 if (ret <= 0)
1715 goto done;
1716 process_ack(con);
1717 goto more;
1718 }
1719
1720done:
1721 ret = 0;
1722out:
1723 mutex_unlock(&con->mutex);
1724 dout("try_read done on %p\n", con);
1725 return ret;
1726
1727bad_tag:
1728 pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
1729 con->error_msg = "protocol error, garbage tag";
1730 ret = -1;
1731 goto out;
1732}
1733
1734
1735/*
1736 * Atomically queue work on a connection. Bump @con reference to
1737 * avoid races with connection teardown.
1738 *
1739 * There is some trickery going on with QUEUED and BUSY because we
1740 * only want a _single_ thread operating on each connection at any
1741 * point in time, but we want to use all available CPUs.
1742 *
1743 * The worker thread only proceeds if it can atomically set BUSY. It
1744 * clears QUEUED and does its thing. When it thinks it's done, it
1745 * clears BUSY, then rechecks QUEUED; if it's set again, it loops
1746 * (tries again to set BUSY).
1747 *
1748 * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
1749 * try to queue work. If that fails (work is already queued, or BUSY
1750 * is set), we give up, knowing the running worker will see QUEUED;
1751 * we leave QUEUED set so that it loops again if necessary.
1752 */
1753static void queue_con(struct ceph_connection *con)
1754{
1755 if (test_bit(DEAD, &con->state)) {
1756 dout("queue_con %p ignoring: DEAD\n",
1757 con);
1758 return;
1759 }
1760
1761 if (!con->ops->get(con)) {
1762 dout("queue_con %p ref count 0\n", con);
1763 return;
1764 }
1765
1766 set_bit(QUEUED, &con->state);
1767 if (test_bit(BUSY, &con->state)) {
1768 dout("queue_con %p - already BUSY\n", con);
1769 con->ops->put(con);
1770 } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
1771 dout("queue_con %p - already queued\n", con);
1772 con->ops->put(con);
1773 } else {
1774 dout("queue_con %p\n", con);
1775 }
1776}
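/*
 * Editorial sketch of the QUEUED/BUSY interplay described above, with
 * the two sides lined up (flag names from messenger.h):
 *
 *	queue_con():			con_work():
 *	  set_bit(QUEUED)		  test_and_set_bit(BUSY) or bail
 *	  if BUSY already set,		  clear_bit(QUEUED), do the work
 *	    or queue_work() fails:	  clear_bit(BUSY)
 *	    drop the ref and rely	  if QUEUED was set again, loop
 *	    on the running worker
 */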
1777
1778/*
1779 * Do some work on a connection. Drop a connection ref when we're done.
1780 */
1781static void con_work(struct work_struct *work)
1782{
1783 struct ceph_connection *con = container_of(work, struct ceph_connection,
1784 work.work);
1785 int backoff = 0;
1786
1787more:
1788 if (test_and_set_bit(BUSY, &con->state) != 0) {
1789 dout("con_work %p BUSY already set\n", con);
1790 goto out;
1791 }
1792 dout("con_work %p start, clearing QUEUED\n", con);
1793 clear_bit(QUEUED, &con->state);
1794
1795 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1796 dout("con_work CLOSED\n");
1797 con_close_socket(con);
1798 goto done;
1799 }
1800 if (test_and_clear_bit(OPENING, &con->state)) {
1801 /* reopen w/ new peer */
1802 dout("con_work OPENING\n");
1803 con_close_socket(con);
1804 }
1805
1806 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1807 try_read(con) < 0 ||
1808 try_write(con) < 0) {
1809 backoff = 1;
1810 ceph_fault(con); /* error/fault path */
1811 }
1812
1813done:
1814 clear_bit(BUSY, &con->state);
1815 dout("con->state=%lu\n", con->state);
1816 if (test_bit(QUEUED, &con->state)) {
1817 if (!backoff || test_bit(OPENING, &con->state)) {
1818 dout("con_work %p QUEUED reset, looping\n", con);
1819 goto more;
1820 }
1821 dout("con_work %p QUEUED reset, but just faulted\n", con);
1822 clear_bit(QUEUED, &con->state);
1823 }
1824 dout("con_work %p done\n", con);
1825
1826out:
1827 con->ops->put(con);
1828}
1829
1830
1831/*
1832 * Generic error/fault handler. A retry mechanism is used with
1833 * exponential backoff.
1834 */
1835static void ceph_fault(struct ceph_connection *con)
1836{
1837 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
1838 pr_addr(&con->peer_addr.in_addr), con->error_msg);
1839 dout("fault %p state %lu to peer %s\n",
1840 con, con->state, pr_addr(&con->peer_addr.in_addr));
1841
1842 if (test_bit(LOSSYTX, &con->state)) {
1843 dout("fault on LOSSYTX channel\n");
1844 goto out;
1845 }
1846
1847 mutex_lock(&con->mutex);
1848 if (test_bit(CLOSED, &con->state))
1849 goto out_unlock;
1850
1851 con_close_socket(con);
1852
1853 if (con->in_msg) {
1854 ceph_msg_put(con->in_msg);
1855 con->in_msg = NULL;
1856 }
1857
1858 /* Requeue anything that hasn't been acked */
1859 list_splice_init(&con->out_sent, &con->out_queue);
1860
1861 /* If there are no messages in the queue, place the connection
1862 * in a STANDBY state (i.e., don't try to reconnect just yet). */
1863 if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
1864 dout("fault setting STANDBY\n");
1865 set_bit(STANDBY, &con->state);
1866 } else {
1867 /* retry after a delay. */
1868 if (con->delay == 0)
1869 con->delay = BASE_DELAY_INTERVAL;
1870 else if (con->delay < MAX_DELAY_INTERVAL)
1871 con->delay *= 2;
1872 dout("fault queueing %p delay %lu\n", con, con->delay);
1873 con->ops->get(con);
1874 if (queue_delayed_work(ceph_msgr_wq, &con->work,
1875 round_jiffies_relative(con->delay)) == 0)
1876 con->ops->put(con);
1877 }
1878
1879out_unlock:
1880 mutex_unlock(&con->mutex);
1881out:
1882 /*
1883 * in case we faulted due to authentication, invalidate our
1884 * current tickets so that we can get new ones.
1885 */
1886 if (con->auth_retry && con->ops->invalidate_authorizer) {
1887 dout("calling invalidate_authorizer()\n");
1888 con->ops->invalidate_authorizer(con);
1889 }
1890
1891 if (con->ops->fault)
1892 con->ops->fault(con);
1893}
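/*
 * Editorial note on the schedule above, assuming the defaults from
 * messenger.h (BASE_DELAY_INTERVAL = HZ/2, MAX_DELAY_INTERVAL =
 * 5*60*HZ): retries fire after roughly 0.5s, 1s, 2s, 4s, ... with the
 * doubling stopping once the delay reaches the cap, so the final value
 * may overshoot the five minute mark by at most one doubling.
 */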
1894
1895
1896
1897/*
1898 * create a new messenger instance
1899 */
1900struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1901{
1902 struct ceph_messenger *msgr;
1903
1904 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
1905 if (msgr == NULL)
1906 return ERR_PTR(-ENOMEM);
1907
1908 spin_lock_init(&msgr->global_seq_lock);
1909
1910 /* the zero page is needed if a request is "canceled" while the message
1911 * is being written over the socket */
1912 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1913 if (!msgr->zero_page) {
1914 kfree(msgr);
1915 return ERR_PTR(-ENOMEM);
1916 }
1917 kmap(msgr->zero_page);
1918
1919 if (myaddr)
1920 msgr->inst.addr = *myaddr;
1921
1922 /* select a random nonce */
1923 msgr->inst.addr.type = 0;
1924 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
1925 encode_my_addr(msgr);
1926
1927 dout("messenger_create %p\n", msgr);
1928 return msgr;
1929}
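/*
 * Editorial usage sketch: a client typically creates one messenger at
 * mount time, optionally pinning the local address, and destroys it on
 * unmount:
 *
 *	msgr = ceph_messenger_create(NULL);
 *	if (IS_ERR(msgr))
 *		return PTR_ERR(msgr);
 *	...
 *	ceph_messenger_destroy(msgr);
 */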
1930
1931void ceph_messenger_destroy(struct ceph_messenger *msgr)
1932{
1933 dout("destroy %p\n", msgr);
1934 kunmap(msgr->zero_page);
1935 __free_page(msgr->zero_page);
1936 kfree(msgr);
1937 dout("destroyed messenger %p\n", msgr);
1938}
1939
1940/*
1941 * Queue up an outgoing message on the given connection.
1942 */
1943void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1944{
1945 if (test_bit(CLOSED, &con->state)) {
1946 dout("con_send %p closed, dropping %p\n", con, msg);
1947 ceph_msg_put(msg);
1948 return;
1949 }
1950
1951 /* set src+dst */
1952 msg->hdr.src.name = con->msgr->inst.name;
1953 msg->hdr.src.addr = con->msgr->my_enc_addr;
1954 msg->hdr.orig_src = msg->hdr.src;
1955
1956 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1957
1958 /* queue */
1959 mutex_lock(&con->mutex);
1960 BUG_ON(!list_empty(&msg->list_head));
1961 list_add_tail(&msg->list_head, &con->out_queue);
1962 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
1963 ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
1964 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1965 le32_to_cpu(msg->hdr.front_len),
1966 le32_to_cpu(msg->hdr.middle_len),
1967 le32_to_cpu(msg->hdr.data_len));
1968 mutex_unlock(&con->mutex);
1969
1970 /* if there wasn't anything waiting to send before, queue
1971 * new work */
1972 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
1973 queue_con(con);
1974}
1975
1976/*
1977 * Revoke a message that was previously queued for send
1978 */
1979void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
1980{
1981 mutex_lock(&con->mutex);
1982 if (!list_empty(&msg->list_head)) {
1983 dout("con_revoke %p msg %p\n", con, msg);
1984 list_del_init(&msg->list_head);
1985 ceph_msg_put(msg);
1986 msg->hdr.seq = 0;
1987 if (con->out_msg == msg) {
1988 ceph_msg_put(con->out_msg);
1989 con->out_msg = NULL;
1990 }
1991 if (con->out_kvec_is_msg) {
1992 con->out_skip = con->out_kvec_bytes;
1993 con->out_kvec_is_msg = false;
1994 }
1995 } else {
1996 dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
1997 }
1998 mutex_unlock(&con->mutex);
1999}
2000
2001/*
2002 * Revoke a message that we may be reading data into
2003 */
2004void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2005{
2006 mutex_lock(&con->mutex);
2007 if (con->in_msg && con->in_msg == msg) {
2008 unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
2009 unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
2010 unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
2011
2012 /* skip rest of message */
2013 dout("con_revoke_pages %p msg %p revoked\n", con, msg);
2014 con->in_base_pos = con->in_base_pos -
2015 sizeof(struct ceph_msg_header) -
2016 front_len -
2017 middle_len -
2018 data_len -
2019 sizeof(struct ceph_msg_footer);
2020 ceph_msg_put(con->in_msg);
2021 con->in_msg = NULL;
2022 con->in_tag = CEPH_MSGR_TAG_READY;
2023 } else {
2024 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2025 con, con->in_msg, msg);
2026 }
2027 mutex_unlock(&con->mutex);
2028}
2029
2030/*
2031 * Queue a keepalive byte to ensure the tcp connection is alive.
2032 */
2033void ceph_con_keepalive(struct ceph_connection *con)
2034{
2035 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2036 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2037 queue_con(con);
2038}
2039
2040
2041/*
2042 * Construct a new message with the given type and sizes.
2043 * The new msg has a ref count of 1.
2044 */
2045struct ceph_msg *ceph_msg_new(int type, int front_len,
2046 int page_len, int page_off, struct page **pages)
2047{
2048 struct ceph_msg *m;
2049
2050 m = kmalloc(sizeof(*m), GFP_NOFS);
2051 if (m == NULL)
2052 goto out;
2053 kref_init(&m->kref);
2054 INIT_LIST_HEAD(&m->list_head);
2055
2056 m->hdr.type = cpu_to_le16(type);
2057 m->hdr.front_len = cpu_to_le32(front_len);
2058 m->hdr.middle_len = 0;
2059 m->hdr.data_len = cpu_to_le32(page_len);
2060 m->hdr.data_off = cpu_to_le16(page_off);
2061 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2062 m->footer.front_crc = 0;
2063 m->footer.middle_crc = 0;
2064 m->footer.data_crc = 0;
2065 m->front_max = front_len;
2066 m->front_is_vmalloc = false;
2067 m->more_to_follow = false;
2068 m->pool = NULL;
2069
2070 /* front */
2071 if (front_len) {
2072 if (front_len > PAGE_CACHE_SIZE) {
2073 m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
2074 PAGE_KERNEL);
2075 m->front_is_vmalloc = true;
2076 } else {
2077 m->front.iov_base = kmalloc(front_len, GFP_NOFS);
2078 }
2079 if (m->front.iov_base == NULL) {
2080 pr_err("msg_new can't allocate %d bytes\n",
2081 front_len);
2082 goto out2;
2083 }
2084 } else {
2085 m->front.iov_base = NULL;
2086 }
2087 m->front.iov_len = front_len;
2088
2089 /* middle */
2090 m->middle = NULL;
2091
2092 /* data */
2093 m->nr_pages = calc_pages_for(page_off, page_len);
2094 m->pages = pages;
2095 m->pagelist = NULL;
2096
2097 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
2098 m->nr_pages);
2099 return m;
2100
2101out2:
2102 ceph_msg_put(m);
2103out:
2104 pr_err("msg_new can't create type %d len %d\n", type, front_len);
2105 return ERR_PTR(-ENOMEM);
2106}
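/*
 * Editorial usage sketch (message type chosen for illustration):
 * allocate a pageless message and queue it; once it is sent and acked
 * the messenger drops its reference:
 *
 *	msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(struct ceph_mon_statfs),
 *			   0, 0, NULL);
 *	if (IS_ERR(msg))
 *		return PTR_ERR(msg);
 *	ceph_con_send(con, msg);
 */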
2107
2108/*
2109 * Allocate "middle" portion of a message, if it is needed and wasn't
2110 * allocated by alloc_msg. This allows us to read a small fixed-size
2111 * per-type header in the front and then gracefully fail (i.e.,
2112 * propagate the error to the caller based on info in the front) when
2113 * the middle is too large.
2114 */
2115static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2116{
2117 int type = le16_to_cpu(msg->hdr.type);
2118 int middle_len = le32_to_cpu(msg->hdr.middle_len);
2119
2120 dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
2121 ceph_msg_type_name(type), middle_len);
2122 BUG_ON(!middle_len);
2123 BUG_ON(msg->middle);
2124
2125 msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
2126 if (!msg->middle)
2127 return -ENOMEM;
2128 return 0;
2129}
2130
2131/*
2132 * Generic message allocator, for incoming messages.
2133 */
2134static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2135 struct ceph_msg_header *hdr,
2136 int *skip)
2137{
2138 int type = le16_to_cpu(hdr->type);
2139 int front_len = le32_to_cpu(hdr->front_len);
2140 int middle_len = le32_to_cpu(hdr->middle_len);
2141 struct ceph_msg *msg = NULL;
2142 int ret;
2143
2144 if (con->ops->alloc_msg) {
2145 mutex_unlock(&con->mutex);
2146 msg = con->ops->alloc_msg(con, hdr, skip);
2147 mutex_lock(&con->mutex);
2148 if (IS_ERR(msg))
2149 return msg;
2150
2151 if (*skip)
2152 return NULL;
2153 }
2154 if (!msg) {
2155 *skip = 0;
2156 msg = ceph_msg_new(type, front_len, 0, 0, NULL);
2157 if (IS_ERR(msg)) {
2158 pr_err("unable to allocate msg type %d len %d\n",
2159 type, front_len);
2160 return ERR_PTR(-ENOMEM);
2161 }
2162 }
2163 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2164
2165 if (middle_len) {
2166 ret = ceph_alloc_middle(con, msg);
2167
2168 if (ret < 0) {
2169 ceph_msg_put(msg);
2170 return ERR_PTR(ret);
2171 }
2172 }
2173
2174 return msg;
2175}
2176
2177
2178/*
2179 * Free a generically kmalloc'd message.
2180 */
2181void ceph_msg_kfree(struct ceph_msg *m)
2182{
2183 dout("msg_kfree %p\n", m);
2184 if (m->front_is_vmalloc)
2185 vfree(m->front.iov_base);
2186 else
2187 kfree(m->front.iov_base);
2188 kfree(m);
2189}
2190
2191/*
2192 * Drop a msg ref. Destroy as needed.
2193 */
2194void ceph_msg_last_put(struct kref *kref)
2195{
2196 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
2197
2198 dout("ceph_msg_put last one on %p\n", m);
2199 WARN_ON(!list_empty(&m->list_head));
2200
2201 /* drop middle, data, if any */
2202 if (m->middle) {
2203 ceph_buffer_put(m->middle);
2204 m->middle = NULL;
2205 }
2206 m->nr_pages = 0;
2207 m->pages = NULL;
2208
2209 if (m->pagelist) {
2210 ceph_pagelist_release(m->pagelist);
2211 kfree(m->pagelist);
2212 m->pagelist = NULL;
2213 }
2214
2215 if (m->pool)
2216 ceph_msgpool_put(m->pool, m);
2217 else
2218 ceph_msg_kfree(m);
2219}
2220
2221void ceph_msg_dump(struct ceph_msg *msg)
2222{
2223 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
2224 msg->front_max, msg->nr_pages);
2225 print_hex_dump(KERN_DEBUG, "header: ",
2226 DUMP_PREFIX_OFFSET, 16, 1,
2227 &msg->hdr, sizeof(msg->hdr), true);
2228 print_hex_dump(KERN_DEBUG, " front: ",
2229 DUMP_PREFIX_OFFSET, 16, 1,
2230 msg->front.iov_base, msg->front.iov_len, true);
2231 if (msg->middle)
2232 print_hex_dump(KERN_DEBUG, "middle: ",
2233 DUMP_PREFIX_OFFSET, 16, 1,
2234 msg->middle->vec.iov_base,
2235 msg->middle->vec.iov_len, true);
2236 print_hex_dump(KERN_DEBUG, "footer: ",
2237 DUMP_PREFIX_OFFSET, 16, 1,
2238 &msg->footer, sizeof(msg->footer), true);
2239}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
new file mode 100644
index 000000000000..a343dae73cdc
--- /dev/null
+++ b/fs/ceph/messenger.h
@@ -0,0 +1,255 @@
1#ifndef __FS_CEPH_MESSENGER_H
2#define __FS_CEPH_MESSENGER_H
3
4#include <linux/kref.h>
5#include <linux/mutex.h>
6#include <linux/net.h>
7#include <linux/radix-tree.h>
8#include <linux/uio.h>
9#include <linux/version.h>
10#include <linux/workqueue.h>
11
12#include "types.h"
13#include "buffer.h"
14
15struct ceph_msg;
16struct ceph_connection;
17
18extern struct workqueue_struct *ceph_msgr_wq; /* messenger work queue */
19
20/*
21 * Ceph defines these callbacks for handling connection events.
22 */
23struct ceph_connection_operations {
24 struct ceph_connection *(*get)(struct ceph_connection *);
25 void (*put)(struct ceph_connection *);
26
27 /* handle an incoming message. */
28 void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
29
30 /* authorize an outgoing connection */
31 int (*get_authorizer) (struct ceph_connection *con,
32 void **buf, int *len, int *proto,
33 void **reply_buf, int *reply_len, int force_new);
34 int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
35 int (*invalidate_authorizer)(struct ceph_connection *con);
36
37 /* protocol version mismatch */
38 void (*bad_proto) (struct ceph_connection *con);
39
40 /* there was some error on the socket (disconnect, whatever) */
41 void (*fault) (struct ceph_connection *con);
42
43 /* a remote host has terminated a message exchange session, and messages
44 * we sent (or they tried to send us) may be lost. */
45 void (*peer_reset) (struct ceph_connection *con);
46
47 struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
48 struct ceph_msg_header *hdr,
49 int *skip);
50};
51
52extern const char *ceph_name_type_str(int t);
53
54/* use format string %s%lld */
55#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
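/*
 * Editorial example: ENTITY_NAME expands to two printf arguments, so
 * callers pair it with a "%s%lld" format, as messenger.c does:
 *
 *	pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
 */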
56
57struct ceph_messenger {
58 struct ceph_entity_inst inst; /* my name+address */
59 struct ceph_entity_addr my_enc_addr;
60 struct page *zero_page; /* used in certain error cases */
61
62 bool nocrc;
63
64 /*
65 * the global_seq counts connections we (attempt to) initiate
66 * in order to disambiguate certain connect race conditions.
67 */
68 u32 global_seq;
69 spinlock_t global_seq_lock;
70};
71
72/*
73 * a single message. it contains a header (src, dest, message type, etc.),
74 * footer (crc values, mainly), a "front" message body, and possibly a
75 * data payload (stored in some number of pages).
76 */
77struct ceph_msg {
78 struct ceph_msg_header hdr; /* header */
79 struct ceph_msg_footer footer; /* footer */
80 struct kvec front; /* unaligned blobs of message */
81 struct ceph_buffer *middle;
82 struct page **pages; /* data payload. NOT OWNER. */
83 unsigned nr_pages; /* size of page array */
84 struct ceph_pagelist *pagelist; /* instead of pages */
85 struct list_head list_head;
86 struct kref kref;
87 bool front_is_vmalloc;
88 bool more_to_follow;
89 int front_max;
90
91 struct ceph_msgpool *pool;
92};
93
94struct ceph_msg_pos {
95 int page, page_pos; /* which page; offset in page */
96 int data_pos; /* offset in data payload */
97 int did_page_crc; /* true if we've calculated crc for current page */
98};
99
100/* ceph connection fault delay defaults, for exponential backoff */
101#define BASE_DELAY_INTERVAL (HZ/2)
102#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
103
104/*
105 * ceph_connection state bit flags
106 *
107 * QUEUED and BUSY are used together to ensure that only a single
108 * thread is currently opening, reading or writing data to the socket.
109 */
110#define LOSSYTX 0 /* we can close channel or drop messages on errors */
111#define CONNECTING 1
112#define NEGOTIATING 2
113#define KEEPALIVE_PENDING 3
114#define WRITE_PENDING 4 /* we have data ready to send */
115#define QUEUED 5 /* there is work queued on this connection */
116#define BUSY 6 /* work is being done */
117#define STANDBY 8 /* no outgoing messages, socket closed. we keep
118 * the ceph_connection around to maintain shared
119 * state with the peer. */
120#define CLOSED 10 /* we've closed the connection */
121#define SOCK_CLOSED 11 /* socket state changed to closed */
122#define OPENING 13 /* open connection w/ (possibly new) peer */
123#define DEAD 14 /* dead, about to kfree */
124
125/*
126 * A single connection with another host.
127 *
128 * We maintain a queue of outgoing messages, and some session state to
129 * ensure that we can preserve the lossless, ordered delivery of
130 * messages in the case of a TCP disconnect.
131 */
132struct ceph_connection {
133 void *private;
134 atomic_t nref;
135
136 const struct ceph_connection_operations *ops;
137
138 struct ceph_messenger *msgr;
139 struct socket *sock;
140 unsigned long state; /* connection state (see flags above) */
141 const char *error_msg; /* error message, if any */
142
143 struct ceph_entity_addr peer_addr; /* peer address */
144 struct ceph_entity_name peer_name; /* peer name */
145 struct ceph_entity_addr peer_addr_for_me;
146 u32 connect_seq; /* identify the most recent connection
147 attempt for this connection, client */
148 u32 peer_global_seq; /* peer's global seq for this connection */
149
150 int auth_retry; /* true if we need a newer authorizer */
151 void *auth_reply_buf; /* where to put the authorizer reply */
152 int auth_reply_buf_len;
153
154 struct mutex mutex;
155
156 /* out queue */
157 struct list_head out_queue;
158 struct list_head out_sent; /* sending or sent but unacked */
159 u64 out_seq; /* last message queued for send */
160 u64 out_seq_sent; /* last message sent */
161 bool out_keepalive_pending;
162
163 u64 in_seq, in_seq_acked; /* last message received, acked */
164
165 /* connection negotiation temps */
166 char in_banner[CEPH_BANNER_MAX_LEN];
167 union {
168 struct { /* outgoing connection */
169 struct ceph_msg_connect out_connect;
170 struct ceph_msg_connect_reply in_reply;
171 };
172 struct { /* incoming */
173 struct ceph_msg_connect in_connect;
174 struct ceph_msg_connect_reply out_reply;
175 };
176 };
177 struct ceph_entity_addr actual_peer_addr;
178
179 /* message out temps */
180 struct ceph_msg *out_msg; /* sending message (== tail of
181 out_sent) */
182 bool out_msg_done;
183 struct ceph_msg_pos out_msg_pos;
184
185 struct kvec out_kvec[8], /* sending header/footer data */
186 *out_kvec_cur;
187 int out_kvec_left; /* kvec's left in out_kvec */
188 int out_skip; /* skip this many bytes */
189 int out_kvec_bytes; /* total bytes left */
190 bool out_kvec_is_msg; /* kvec refers to out_msg */
191 int out_more; /* there is more data after the kvecs */
192 __le64 out_temp_ack; /* for writing an ack */
193
194 /* message in temps */
195 struct ceph_msg_header in_hdr;
196 struct ceph_msg *in_msg;
197 struct ceph_msg_pos in_msg_pos;
198 u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
199
200 char in_tag; /* protocol control byte */
201 int in_base_pos; /* bytes read */
202 __le64 in_temp_ack; /* for reading an ack */
203
204 struct delayed_work work; /* send|recv work */
205 unsigned long delay; /* current delay interval */
206};
207
208
209extern const char *pr_addr(const struct sockaddr_storage *ss);
210extern int ceph_parse_ips(const char *c, const char *end,
211 struct ceph_entity_addr *addr,
212 int max_count, int *count);
213
214
215extern int ceph_msgr_init(void);
216extern void ceph_msgr_exit(void);
217
218extern struct ceph_messenger *ceph_messenger_create(
219 struct ceph_entity_addr *myaddr);
220extern void ceph_messenger_destroy(struct ceph_messenger *);
221
222extern void ceph_con_init(struct ceph_messenger *msgr,
223 struct ceph_connection *con);
224extern void ceph_con_open(struct ceph_connection *con,
225 struct ceph_entity_addr *addr);
226extern bool ceph_con_opened(struct ceph_connection *con);
227extern void ceph_con_close(struct ceph_connection *con);
228extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
229extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
230extern void ceph_con_revoke_message(struct ceph_connection *con,
231 struct ceph_msg *msg);
232extern void ceph_con_keepalive(struct ceph_connection *con);
233extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
234extern void ceph_con_put(struct ceph_connection *con);
235
236extern struct ceph_msg *ceph_msg_new(int type, int front_len,
237 int page_len, int page_off,
238 struct page **pages);
239extern void ceph_msg_kfree(struct ceph_msg *m);
240
241
242static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
243{
244 kref_get(&msg->kref);
245 return msg;
246}
247extern void ceph_msg_last_put(struct kref *kref);
248static inline void ceph_msg_put(struct ceph_msg *msg)
249{
250 kref_put(&msg->kref, ceph_msg_last_put);
251}
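
/*
 * Typical refcounting pattern (illustrative): ceph_con_send() consumes
 * a message reference, so a caller that wants to keep the message
 * afterwards takes an extra ref first:
 *
 *	msg = ceph_msg_new(type, front_len, 0, 0, NULL);
 *	ceph_con_send(con, ceph_msg_get(msg));	(messenger owns one ref)
 *	...
 *	ceph_msg_put(msg);			(drop our ref when done)
 */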
252
253extern void ceph_msg_dump(struct ceph_msg *msg);
254
255#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
new file mode 100644
index 000000000000..890597c09d43
--- /dev/null
+++ b/fs/ceph/mon_client.c
@@ -0,0 +1,834 @@
1#include "ceph_debug.h"
2
3#include <linux/types.h>
4#include <linux/random.h>
5#include <linux/sched.h>
6
7#include "mon_client.h"
8#include "super.h"
9#include "auth.h"
10#include "decode.h"
11
12/*
13 * Interact with Ceph monitor cluster. Handle requests for new map
14 * versions, and periodically resend as needed. Also implement
15 * statfs() and umount().
16 *
17 * A small cluster of Ceph "monitors" is responsible for managing critical
18 * cluster configuration and state information. An odd number (e.g., 3, 5)
19 * of cmon daemons use a modified version of the Paxos part-time parliament
20 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
21 * list of clients who have mounted the file system.
22 *
23 * We maintain an open, active session with a monitor at all times in order to
24 * receive timely MDSMap updates. We periodically send a keepalive byte on the
25 * TCP socket to ensure we detect a failure. If the connection does break, we
26 * randomly hunt for a new monitor. Once the connection is reestablished, we
27 * resend any outstanding requests.
28 */
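
/*
 * A minimal caller sketch of the lifecycle implemented in this file
 * (illustrative only; error handling elided):
 *
 *	struct ceph_mon_client *monc = &client->monc;
 *	struct ceph_statfs st;
 *
 *	ceph_monc_init(monc, client);		(build initial monmap)
 *	ceph_monc_open_session(monc);		(pick a mon, authenticate)
 *	ceph_monc_do_statfs(monc, &st);		(synchronous round trip)
 *	ceph_monc_stop(monc);			(tear it all down)
 */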
29
30static const struct ceph_connection_operations mon_con_ops;
31
32static int __validate_auth(struct ceph_mon_client *monc);
33
34/*
35 * Decode a monmap blob (e.g., during mount).
36 */
37struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
38{
39 struct ceph_monmap *m = NULL;
40 int i, err = -EINVAL;
41 struct ceph_fsid fsid;
42 u32 epoch, num_mon;
43 u16 version;
44 u32 len;
45
46 ceph_decode_32_safe(&p, end, len, bad);
47 ceph_decode_need(&p, end, len, bad);
48
49 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
50
51 ceph_decode_16_safe(&p, end, version, bad);
52
53 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
54 ceph_decode_copy(&p, &fsid, sizeof(fsid));
55 epoch = ceph_decode_32(&p);
56
57 num_mon = ceph_decode_32(&p);
58
59 if (num_mon >= CEPH_MAX_MON)
60 goto bad;
61 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
62 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
63 if (m == NULL)
64 return ERR_PTR(-ENOMEM);
65 m->fsid = fsid;
66 m->epoch = epoch;
67 m->num_mon = num_mon;
68 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
69 for (i = 0; i < num_mon; i++)
70 ceph_decode_addr(&m->mon_inst[i].addr);
71
72 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
73 m->num_mon);
74 for (i = 0; i < m->num_mon; i++)
75 dout("monmap_decode mon%d is %s\n", i,
76 pr_addr(&m->mon_inst[i].addr.in_addr));
77 return m;
78
79bad:
80 dout("monmap_decode failed with %d\n", err);
81 kfree(m);
82 return ERR_PTR(err);
83}
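
/*
 * For reference, the blob layout decoded above (read straight off the
 * decode calls):
 *
 *	__le32 len;				(bytes that follow)
 *	__le16 version;
 *	struct ceph_fsid fsid;
 *	__le32 epoch;
 *	__le32 num_mon;
 *	struct ceph_entity_inst mon_inst[num_mon];
 */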
84
85/*
86 * return true if *addr is included in the monmap.
87 */
88int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
89{
90 int i;
91
92 for (i = 0; i < m->num_mon; i++)
93 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
94 return 1;
95 return 0;
96}
97
98/*
99 * Send an auth request.
100 */
101static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
102{
103 monc->pending_auth = 1;
104 monc->m_auth->front.iov_len = len;
105 monc->m_auth->hdr.front_len = cpu_to_le32(len);
106 ceph_msg_get(monc->m_auth); /* keep our ref */
107 ceph_con_send(monc->con, monc->m_auth);
108}
109
110/*
111 * Close monitor session, if any.
112 */
113static void __close_session(struct ceph_mon_client *monc)
114{
115 if (monc->con) {
116 dout("__close_session closing mon%d\n", monc->cur_mon);
117 ceph_con_revoke(monc->con, monc->m_auth);
118 ceph_con_close(monc->con);
119 monc->cur_mon = -1;
120 monc->pending_auth = 0;
121 ceph_auth_reset(monc->auth);
122 }
123}
124
125/*
126 * Open a session with a (new) monitor.
127 */
128static int __open_session(struct ceph_mon_client *monc)
129{
130 unsigned char r;
131 int ret;
132
133 if (monc->cur_mon < 0) {
134 get_random_bytes(&r, 1);
135 monc->cur_mon = r % monc->monmap->num_mon;
136 dout("open_session num=%d r=%d -> mon%d\n",
137 monc->monmap->num_mon, r, monc->cur_mon);
138 monc->sub_sent = 0;
139 monc->sub_renew_after = jiffies; /* i.e., expired */
140 monc->want_next_osdmap = !!monc->want_next_osdmap;
141
142 dout("open_session mon%d opening\n", monc->cur_mon);
143 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
144 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
145 ceph_con_open(monc->con,
146 &monc->monmap->mon_inst[monc->cur_mon].addr);
147
148 /* initiate authentication handshake */
149 ret = ceph_auth_build_hello(monc->auth,
150 monc->m_auth->front.iov_base,
151 monc->m_auth->front_max);
152 __send_prepared_auth_request(monc, ret);
153 } else {
154 dout("open_session mon%d already open\n", monc->cur_mon);
155 }
156 return 0;
157}
158
159static bool __sub_expired(struct ceph_mon_client *monc)
160{
161 return time_after_eq(jiffies, monc->sub_renew_after);
162}
163
164/*
165 * Reschedule delayed work timer.
166 */
167static void __schedule_delayed(struct ceph_mon_client *monc)
168{
169 unsigned delay;
170
171 if (monc->cur_mon < 0 || __sub_expired(monc))
172 delay = 10 * HZ;
173 else
174 delay = 20 * HZ;
175 dout("__schedule_delayed after %u\n", delay);
176 schedule_delayed_work(&monc->delayed_work, delay);
177}
178
179/*
180 * Send subscribe request for mdsmap and/or osdmap.
181 */
182static void __send_subscribe(struct ceph_mon_client *monc)
183{
184 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
185 (unsigned)monc->sub_sent, __sub_expired(monc),
186 monc->want_next_osdmap);
187 if ((__sub_expired(monc) && !monc->sub_sent) ||
188 monc->want_next_osdmap == 1) {
189 struct ceph_msg *msg;
190 struct ceph_mon_subscribe_item *i;
191 void *p, *end;
192
193 msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
194 if (!msg)
195 return;
196
197 p = msg->front.iov_base;
198 end = p + msg->front.iov_len;
199
200 dout("__send_subscribe to 'mdsmap' %u+\n",
201 (unsigned)monc->have_mdsmap);
202 if (monc->want_next_osdmap) {
203 dout("__send_subscribe to 'osdmap' %u\n",
204 (unsigned)monc->have_osdmap);
205 ceph_encode_32(&p, 3);
206 ceph_encode_string(&p, end, "osdmap", 6);
207 i = p;
208 i->have = cpu_to_le64(monc->have_osdmap);
209 i->onetime = 1;
210 p += sizeof(*i);
211 monc->want_next_osdmap = 2; /* requested */
212 } else {
213 ceph_encode_32(&p, 2);
214 }
215 ceph_encode_string(&p, end, "mdsmap", 6);
216 i = p;
217 i->have = cpu_to_le64(monc->have_mdsmap);
218 i->onetime = 0;
219 p += sizeof(*i);
220 ceph_encode_string(&p, end, "monmap", 6);
221 i = p;
222 i->have = 0;
223 i->onetime = 0;
224 p += sizeof(*i);
225
226 msg->front.iov_len = p - msg->front.iov_base;
227 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
228 ceph_con_send(monc->con, msg);
229
230 monc->sub_sent = jiffies | 1; /* never 0 */
231 }
232}
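
/*
 * Wire sketch of the subscribe payload built above (derived from the
 * encode calls; for illustration):
 *
 *	__le32 num_items;				(2 or 3)
 *	string "osdmap" + item { have, onetime=1 }	(optional)
 *	string "mdsmap" + item { have, onetime=0 }
 *	string "monmap" + item { have=0, onetime=0 }
 */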
233
234static void handle_subscribe_ack(struct ceph_mon_client *monc,
235 struct ceph_msg *msg)
236{
237 unsigned seconds;
238 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
239
240 if (msg->front.iov_len < sizeof(*h))
241 goto bad;
242 seconds = le32_to_cpu(h->duration);
243
244 mutex_lock(&monc->mutex);
245 if (monc->hunting) {
246 pr_info("mon%d %s session established\n",
247 monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
248 monc->hunting = false;
249 }
250 dout("handle_subscribe_ack after %d seconds\n", seconds);
251 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
252 monc->sub_sent = 0;
253 mutex_unlock(&monc->mutex);
254 return;
255bad:
256 pr_err("got corrupt subscribe-ack msg\n");
257 ceph_msg_dump(msg);
258}
259
260/*
261 * Keep track of which maps we have
262 */
263int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
264{
265 mutex_lock(&monc->mutex);
266 monc->have_mdsmap = got;
267 mutex_unlock(&monc->mutex);
268 return 0;
269}
270
271int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
272{
273 mutex_lock(&monc->mutex);
274 monc->have_osdmap = got;
275 monc->want_next_osdmap = 0;
276 mutex_unlock(&monc->mutex);
277 return 0;
278}
279
280/*
281 * Register interest in the next osdmap
282 */
283void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
284{
285 dout("request_next_osdmap have %u\n", monc->have_osdmap);
286 mutex_lock(&monc->mutex);
287 if (!monc->want_next_osdmap)
288 monc->want_next_osdmap = 1;
289 if (monc->want_next_osdmap < 2)
290 __send_subscribe(monc);
291 mutex_unlock(&monc->mutex);
292}
293
294/*
295 * Open a session with the monitor cluster (creating the connection
 * first if needed).
296 */
297int ceph_monc_open_session(struct ceph_mon_client *monc)
298{
299 if (!monc->con) {
300 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
301 if (!monc->con)
302 return -ENOMEM;
303 ceph_con_init(monc->client->msgr, monc->con);
304 monc->con->private = monc;
305 monc->con->ops = &mon_con_ops;
306 }
307
308 mutex_lock(&monc->mutex);
309 __open_session(monc);
310 __schedule_delayed(monc);
311 mutex_unlock(&monc->mutex);
312 return 0;
313}
314
315/*
316 * The monitor responds with a mount ack to indicate mount success. The
317 * included client ticket allows the client to talk to MDSs and OSDs.
318 */
319static void ceph_monc_handle_map(struct ceph_mon_client *monc,
320 struct ceph_msg *msg)
321{
322 struct ceph_client *client = monc->client;
323 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
324 void *p, *end;
325
326 mutex_lock(&monc->mutex);
327
328 dout("handle_monmap\n");
329 p = msg->front.iov_base;
330 end = p + msg->front.iov_len;
331
332 monmap = ceph_monmap_decode(p, end);
333 if (IS_ERR(monmap)) {
334 pr_err("problem decoding monmap, %d\n",
335 (int)PTR_ERR(monmap));
336 goto out;
337 }
338
339 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
340 kfree(monmap);
341 goto out;
342 }
343
344 client->monc.monmap = monmap;
345 kfree(old);
346
347out:
348 mutex_unlock(&monc->mutex);
349 wake_up(&client->auth_wq);
350}
351
352/*
353 * statfs
354 */
355static struct ceph_mon_statfs_request *__lookup_statfs(
356 struct ceph_mon_client *monc, u64 tid)
357{
358 struct ceph_mon_statfs_request *req;
359 struct rb_node *n = monc->statfs_request_tree.rb_node;
360
361 while (n) {
362 req = rb_entry(n, struct ceph_mon_statfs_request, node);
363 if (tid < req->tid)
364 n = n->rb_left;
365 else if (tid > req->tid)
366 n = n->rb_right;
367 else
368 return req;
369 }
370 return NULL;
371}
372
373static void __insert_statfs(struct ceph_mon_client *monc,
374 struct ceph_mon_statfs_request *new)
375{
376 struct rb_node **p = &monc->statfs_request_tree.rb_node;
377 struct rb_node *parent = NULL;
378 struct ceph_mon_statfs_request *req = NULL;
379
380 while (*p) {
381 parent = *p;
382 req = rb_entry(parent, struct ceph_mon_statfs_request, node);
383 if (new->tid < req->tid)
384 p = &(*p)->rb_left;
385 else if (new->tid > req->tid)
386 p = &(*p)->rb_right;
387 else
388 BUG();
389 }
390
391 rb_link_node(&new->node, parent, p);
392 rb_insert_color(&new->node, &monc->statfs_request_tree);
393}
394
395static void handle_statfs_reply(struct ceph_mon_client *monc,
396 struct ceph_msg *msg)
397{
398 struct ceph_mon_statfs_request *req;
399 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
400 u64 tid;
401
402 if (msg->front.iov_len != sizeof(*reply))
403 goto bad;
404 tid = le64_to_cpu(msg->hdr.tid);
405 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
406
407 mutex_lock(&monc->mutex);
408 req = __lookup_statfs(monc, tid);
409 if (req) {
410 *req->buf = reply->st;
411 req->result = 0;
412 }
413 mutex_unlock(&monc->mutex);
414 if (req)
415 complete(&req->completion);
416 return;
417
418bad:
419 pr_err("corrupt statfs reply, no tid\n");
420 ceph_msg_dump(msg);
421}
422
423/*
424 * (re)send a statfs request
425 */
426static int send_statfs(struct ceph_mon_client *monc,
427 struct ceph_mon_statfs_request *req)
428{
429 struct ceph_msg *msg;
430 struct ceph_mon_statfs *h;
431
432 dout("send_statfs tid %llu\n", req->tid);
433 msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
434 if (IS_ERR(msg))
435 return PTR_ERR(msg);
436 req->request = msg;
437 msg->hdr.tid = cpu_to_le64(req->tid);
438 h = msg->front.iov_base;
439 h->monhdr.have_version = 0;
440 h->monhdr.session_mon = cpu_to_le16(-1);
441 h->monhdr.session_mon_tid = 0;
442 h->fsid = monc->monmap->fsid;
443 ceph_con_send(monc->con, msg);
444 return 0;
445}
446
447/*
448 * Do a synchronous statfs().
449 */
450int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
451{
452 struct ceph_mon_statfs_request req;
453 int err;
454
455 req.buf = buf;
456 init_completion(&req.completion);
457
458 /* allocate memory for reply */
459 err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
460 if (err)
461 return err;
462
463 /* register request */
464 mutex_lock(&monc->mutex);
465 req.tid = ++monc->last_tid;
466 req.last_attempt = jiffies;
467 req.delay = BASE_DELAY_INTERVAL;
468 __insert_statfs(monc, &req);
469 monc->num_statfs_requests++;
470 mutex_unlock(&monc->mutex);
471
472 /* send request and wait */
473 err = send_statfs(monc, &req);
474 if (!err)
475 err = wait_for_completion_interruptible(&req.completion);
476
477 mutex_lock(&monc->mutex);
478 rb_erase(&req.node, &monc->statfs_request_tree);
479 monc->num_statfs_requests--;
480 ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
481 mutex_unlock(&monc->mutex);
482
483 if (!err)
484 err = req.result;
485 return err;
486}
487
488/*
489 * Resend pending statfs requests.
490 */
491static void __resend_statfs(struct ceph_mon_client *monc)
492{
493 struct ceph_mon_statfs_request *req;
494 struct rb_node *p;
495
496 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
497 req = rb_entry(p, struct ceph_mon_statfs_request, node);
498 send_statfs(monc, req);
499 }
500}
501
502/*
503 * Delayed work. If we haven't mounted yet, retry. Otherwise,
504 * renew/retry subscription as needed (in case it is timing out, or we
505 * got an ENOMEM). And keep the monitor connection alive.
506 */
507static void delayed_work(struct work_struct *work)
508{
509 struct ceph_mon_client *monc =
510 container_of(work, struct ceph_mon_client, delayed_work.work);
511
512 dout("monc delayed_work\n");
513 mutex_lock(&monc->mutex);
514 if (monc->hunting) {
515 __close_session(monc);
516 __open_session(monc); /* continue hunting */
517 } else {
518 ceph_con_keepalive(monc->con);
519
520 __validate_auth(monc);
521
522 if (monc->auth->ops->is_authenticated(monc->auth))
523 __send_subscribe(monc);
524 }
525 __schedule_delayed(monc);
526 mutex_unlock(&monc->mutex);
527}
528
529/*
530 * On startup, we build a temporary monmap populated with the IPs
531 * provided by mount(2).
532 */
533static int build_initial_monmap(struct ceph_mon_client *monc)
534{
535 struct ceph_mount_args *args = monc->client->mount_args;
536 struct ceph_entity_addr *mon_addr = args->mon_addr;
537 int num_mon = args->num_mon;
538 int i;
539
540 /* build initial monmap */
541 monc->monmap = kzalloc(sizeof(*monc->monmap) +
542 num_mon*sizeof(monc->monmap->mon_inst[0]),
543 GFP_KERNEL);
544 if (!monc->monmap)
545 return -ENOMEM;
546 for (i = 0; i < num_mon; i++) {
547 monc->monmap->mon_inst[i].addr = mon_addr[i];
548 monc->monmap->mon_inst[i].addr.nonce = 0;
549 monc->monmap->mon_inst[i].name.type =
550 CEPH_ENTITY_TYPE_MON;
551 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
552 }
553 monc->monmap->num_mon = num_mon;
554 monc->have_fsid = false;
555
556 /* release addr memory */
557 kfree(args->mon_addr);
558 args->mon_addr = NULL;
559 args->num_mon = 0;
560 return 0;
561}
562
563int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
564{
565 int err = 0;
566
567 dout("init\n");
568 memset(monc, 0, sizeof(*monc));
569 monc->client = cl;
570 monc->monmap = NULL;
571 mutex_init(&monc->mutex);
572
573 err = build_initial_monmap(monc);
574 if (err)
575 goto out;
576
577 monc->con = NULL;
578
579 /* authentication */
580 monc->auth = ceph_auth_init(cl->mount_args->name,
581 cl->mount_args->secret);
582 if (IS_ERR(monc->auth)) {
583 err = PTR_ERR(monc->auth);
 goto out_monmap;
 }
584 monc->auth->want_keys =
585 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
586 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
587
588 /* msg pools */
589 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
590 sizeof(struct ceph_mon_subscribe_ack), 1, false);
591 if (err < 0)
592 goto out_monmap;
593 err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
594 sizeof(struct ceph_mon_statfs_reply), 0, false);
595 if (err < 0)
596 goto out_pool1;
597 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
598 if (err < 0)
599 goto out_pool2;
600
601 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
602 monc->pending_auth = 0;
603 if (IS_ERR(monc->m_auth)) {
604 err = PTR_ERR(monc->m_auth);
605 monc->m_auth = NULL;
606 goto out_pool3;
607 }
608
609 monc->cur_mon = -1;
610 monc->hunting = true;
611 monc->sub_renew_after = jiffies;
612 monc->sub_sent = 0;
613
614 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
615 monc->statfs_request_tree = RB_ROOT;
616 monc->num_statfs_requests = 0;
617 monc->last_tid = 0;
618
619 monc->have_mdsmap = 0;
620 monc->have_osdmap = 0;
621 monc->want_next_osdmap = 1;
622 return 0;
623
624out_pool3:
625 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
626out_pool2:
627 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
628out_pool1:
629 ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
630out_monmap:
631 kfree(monc->monmap);
632out:
633 return err;
634}
635
636void ceph_monc_stop(struct ceph_mon_client *monc)
637{
638 dout("stop\n");
639 cancel_delayed_work_sync(&monc->delayed_work);
640
641 mutex_lock(&monc->mutex);
642 __close_session(monc);
643 if (monc->con) {
644 monc->con->private = NULL;
645 monc->con->ops->put(monc->con);
646 monc->con = NULL;
647 }
648 mutex_unlock(&monc->mutex);
649
650 ceph_auth_destroy(monc->auth);
651
652 ceph_msg_put(monc->m_auth);
653 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
654 ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
655 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
656
657 kfree(monc->monmap);
658}
659
660static void handle_auth_reply(struct ceph_mon_client *monc,
661 struct ceph_msg *msg)
662{
663 int ret;
664
665 mutex_lock(&monc->mutex);
666 monc->pending_auth = 0;
667 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
668 msg->front.iov_len,
669 monc->m_auth->front.iov_base,
670 monc->m_auth->front_max);
671 if (ret < 0) {
672 monc->client->auth_err = ret;
673 wake_up(&monc->client->auth_wq);
674 } else if (ret > 0) {
675 __send_prepared_auth_request(monc, ret);
676 } else if (monc->auth->ops->is_authenticated(monc->auth)) {
677 dout("authenticated, starting session\n");
678
679 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
680 monc->client->msgr->inst.name.num = monc->auth->global_id;
681
682 __send_subscribe(monc);
683 __resend_statfs(monc);
684 }
685 mutex_unlock(&monc->mutex);
686}
687
688static int __validate_auth(struct ceph_mon_client *monc)
689{
690 int ret;
691
692 if (monc->pending_auth)
693 return 0;
694
695 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
696 monc->m_auth->front_max);
697 if (ret <= 0)
698 return ret; /* either an error, or no need to authenticate */
699 __send_prepared_auth_request(monc, ret);
700 return 0;
701}
702
703int ceph_monc_validate_auth(struct ceph_mon_client *monc)
704{
705 int ret;
706
707 mutex_lock(&monc->mutex);
708 ret = __validate_auth(monc);
709 mutex_unlock(&monc->mutex);
710 return ret;
711}
712
713/*
714 * handle incoming message
715 */
716static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
717{
718 struct ceph_mon_client *monc = con->private;
719 int type = le16_to_cpu(msg->hdr.type);
720
721 if (!monc)
722 return;
723
724 switch (type) {
725 case CEPH_MSG_AUTH_REPLY:
726 handle_auth_reply(monc, msg);
727 break;
728
729 case CEPH_MSG_MON_SUBSCRIBE_ACK:
730 handle_subscribe_ack(monc, msg);
731 break;
732
733 case CEPH_MSG_STATFS_REPLY:
734 handle_statfs_reply(monc, msg);
735 break;
736
737 case CEPH_MSG_MON_MAP:
738 ceph_monc_handle_map(monc, msg);
739 break;
740
741 case CEPH_MSG_MDS_MAP:
742 ceph_mdsc_handle_map(&monc->client->mdsc, msg);
743 break;
744
745 case CEPH_MSG_OSD_MAP:
746 ceph_osdc_handle_map(&monc->client->osdc, msg);
747 break;
748
749 default:
750 pr_err("received unknown message type %d %s\n", type,
751 ceph_msg_type_name(type));
752 }
753 ceph_msg_put(msg);
754}
755
756/*
757 * Allocate memory for incoming message
758 */
759static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
760 struct ceph_msg_header *hdr,
761 int *skip)
762{
763 struct ceph_mon_client *monc = con->private;
764 int type = le16_to_cpu(hdr->type);
765 int front_len = le32_to_cpu(hdr->front_len);
766 struct ceph_msg *m = NULL;
767
768 *skip = 0;
769
770 switch (type) {
771 case CEPH_MSG_MON_SUBSCRIBE_ACK:
772 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
773 break;
774 case CEPH_MSG_STATFS_REPLY:
775 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len);
776 break;
777 case CEPH_MSG_AUTH_REPLY:
778 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
779 break;
780 case CEPH_MSG_MON_MAP:
781 case CEPH_MSG_MDS_MAP:
782 case CEPH_MSG_OSD_MAP:
783 m = ceph_msg_new(type, front_len, 0, 0, NULL);
784 break;
785 }
786
787 if (!m) {
788 pr_info("alloc_msg unknown type %d\n", type);
789 *skip = 1;
790 }
791 return m;
792}
793
794/*
795 * If the monitor connection resets, pick a new monitor and resubmit
796 * any pending requests.
797 */
798static void mon_fault(struct ceph_connection *con)
799{
800 struct ceph_mon_client *monc = con->private;
801
802 if (!monc)
803 return;
804
805 dout("mon_fault\n");
806 mutex_lock(&monc->mutex);
807 if (!con->private)
808 goto out;
809
810 if (monc->con && !monc->hunting)
811 pr_info("mon%d %s session lost, "
812 "hunting for new mon\n", monc->cur_mon,
813 pr_addr(&monc->con->peer_addr.in_addr));
814
815 __close_session(monc);
816 if (!monc->hunting) {
817 /* start hunting */
818 monc->hunting = true;
819 __open_session(monc);
820 } else {
821 /* already hunting, let's wait a bit */
822 __schedule_delayed(monc);
823 }
824out:
825 mutex_unlock(&monc->mutex);
826}
827
828static const struct ceph_connection_operations mon_con_ops = {
829 .get = ceph_con_get,
830 .put = ceph_con_put,
831 .dispatch = dispatch,
832 .fault = mon_fault,
833 .alloc_msg = mon_alloc_msg,
834};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
new file mode 100644
index 000000000000..b958ad5afa06
--- /dev/null
+++ b/fs/ceph/mon_client.h
@@ -0,0 +1,119 @@
1#ifndef _FS_CEPH_MON_CLIENT_H
2#define _FS_CEPH_MON_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/rbtree.h>
6
7#include "messenger.h"
8#include "msgpool.h"
9
10struct ceph_client;
11struct ceph_mount_args;
12struct ceph_auth_client;
13
14/*
15 * The monitor map enumerates the set of all monitors.
16 */
17struct ceph_monmap {
18 struct ceph_fsid fsid;
19 u32 epoch;
20 u32 num_mon;
21 struct ceph_entity_inst mon_inst[0];
22};
23
24struct ceph_mon_client;
25struct ceph_mon_statfs_request;
26
27
28/*
29 * Generic mechanism for resending monitor requests.
30 */
31typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
32 int newmon);
33
34/* a pending monitor request */
35struct ceph_mon_request {
36 struct ceph_mon_client *monc;
37 struct delayed_work delayed_work;
38 unsigned long delay;
39 ceph_monc_request_func_t do_request;
40};
41
42/*
43 * statfs() is done a bit differently because we need to get data back
44 * to the caller.
45 */
46struct ceph_mon_statfs_request {
47 u64 tid;
48 struct rb_node node;
49 int result;
50 struct ceph_statfs *buf;
51 struct completion completion;
52 unsigned long last_attempt, delay; /* jiffies */
53 struct ceph_msg *request; /* original request */
54};
55
56struct ceph_mon_client {
57 struct ceph_client *client;
58 struct ceph_monmap *monmap;
59
60 struct mutex mutex;
61 struct delayed_work delayed_work;
62
63 struct ceph_auth_client *auth;
64 struct ceph_msg *m_auth;
65 int pending_auth;
66
67 bool hunting;
68 int cur_mon; /* last monitor I contacted */
69 unsigned long sub_sent, sub_renew_after;
70 struct ceph_connection *con;
71 bool have_fsid;
72
73 /* msg pools */
74 struct ceph_msgpool msgpool_subscribe_ack;
75 struct ceph_msgpool msgpool_statfs_reply;
76 struct ceph_msgpool msgpool_auth_reply;
77
78 /* pending statfs requests */
79 struct rb_root statfs_request_tree;
80 int num_statfs_requests;
81 u64 last_tid;
82
83 /* mds/osd map */
84 int want_next_osdmap; /* 1 = want, 2 = want+asked */
85 u32 have_osdmap, have_mdsmap;
86
87#ifdef CONFIG_DEBUG_FS
88 struct dentry *debugfs_file;
89#endif
90};
91
92extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
93extern int ceph_monmap_contains(struct ceph_monmap *m,
94 struct ceph_entity_addr *addr);
95
96extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
97extern void ceph_monc_stop(struct ceph_mon_client *monc);
98
99/*
100 * The model here is to indicate that we need a new map of at least
101 * epoch @want, and also call in when we receive a map. We will
102 * periodically rerequest the map from the monitor cluster until we
103 * get what we want.
104 */
105extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
106extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
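
/*
 * E.g. (illustrative; "newmap" is the freshly decoded map): an mdsmap
 * handler records the new epoch so the subscribe logic stops
 * re-requesting it:
 *
 *	ceph_monc_got_mdsmap(&client->monc, newmap->m_epoch);
 */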
107
108extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
109
110extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
111 struct ceph_statfs *buf);
112
113extern int ceph_monc_open_session(struct ceph_mon_client *monc);
114
115extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
116
117
118
119#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
new file mode 100644
index 000000000000..ca3b44a89f2d
--- /dev/null
+++ b/fs/ceph/msgpool.c
@@ -0,0 +1,186 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/sched.h>
5#include <linux/types.h>
6#include <linux/vmalloc.h>
7
8#include "msgpool.h"
9
10/*
11 * We use msg pools to preallocate memory for messages we expect to
12 * receive over the wire, to avoid getting ourselves into OOM
13 * conditions at unexpected times. We use a few different
14 * strategies:
15 *
16 * - for request/response type interactions, we preallocate the
17 * memory needed for the response when we generate the request.
18 *
19 * - for messages we can receive at any time from the MDS, we preallocate
20 * a pool of messages we can re-use.
21 *
22 * - for writeback, we preallocate some number of messages to use for
23 * requests and their replies, so that we always make forward
24 * progress.
25 *
26 * The msgpool behaves like a mempool_t, but keeps preallocated
27 * ceph_msgs strung together on a list_head instead of using a pointer
28 * vector. This avoids vector reallocation when we adjust the number
29 * of preallocated items (which happens frequently).
30 */
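
/*
 * A minimal usage sketch (illustrative; error handling elided):
 *
 *	struct ceph_msgpool pool;
 *	struct ceph_msg *msg;
 *
 *	ceph_msgpool_init(&pool, 512, 4, true);	(keep >= 4 msgs on hand)
 *	msg = ceph_msgpool_get(&pool, 0);	(may block until one frees)
 *	...
 *	ceph_msgpool_put(&pool, msg);		(back to pool, or freed)
 *	ceph_msgpool_destroy(&pool);
 */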
31
32
33/*
34 * Allocate or release as necessary to meet our target pool size.
35 */
36static int __fill_msgpool(struct ceph_msgpool *pool)
37{
38 struct ceph_msg *msg;
39
40 while (pool->num < pool->min) {
41 dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
42 pool->min);
43 spin_unlock(&pool->lock);
44 msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
45 spin_lock(&pool->lock);
46 if (IS_ERR(msg))
47 return PTR_ERR(msg);
48 msg->pool = pool;
49 list_add(&msg->list_head, &pool->msgs);
50 pool->num++;
51 }
52 while (pool->num > pool->min) {
53 msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
54 dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
55 pool->min, msg);
56 list_del_init(&msg->list_head);
57 pool->num--;
58 ceph_msg_kfree(msg);
59 }
60 return 0;
61}
62
63int ceph_msgpool_init(struct ceph_msgpool *pool,
64 int front_len, int min, bool blocking)
65{
66 int ret;
67
68 dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
69 spin_lock_init(&pool->lock);
70 pool->front_len = front_len;
71 INIT_LIST_HEAD(&pool->msgs);
72 pool->num = 0;
73 pool->min = min;
74 pool->blocking = blocking;
75 init_waitqueue_head(&pool->wait);
76
77 spin_lock(&pool->lock);
78 ret = __fill_msgpool(pool);
79 spin_unlock(&pool->lock);
80 return ret;
81}
82
83void ceph_msgpool_destroy(struct ceph_msgpool *pool)
84{
85 dout("msgpool_destroy %p\n", pool);
86 spin_lock(&pool->lock);
87 pool->min = 0;
88 __fill_msgpool(pool);
89 spin_unlock(&pool->lock);
90}
91
92int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
93{
94 int ret;
95
96 spin_lock(&pool->lock);
97 dout("msgpool_resv %p delta %d\n", pool, delta);
98 pool->min += delta;
99 ret = __fill_msgpool(pool);
100 spin_unlock(&pool->lock);
101 return ret;
102}
103
104struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
105{
106 wait_queue_t wait;
107 struct ceph_msg *msg;
108
109 if (front_len && front_len > pool->front_len) {
110 pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
111 pool, front_len, pool->front_len);
112 WARN_ON(1);
113
114 /* try to alloc a fresh message */
115 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
116 if (!IS_ERR(msg))
117 return msg;
118 }
119
120 if (!front_len)
121 front_len = pool->front_len;
122
123 if (pool->blocking) {
124 /* mempool_t behavior; first try to alloc */
125 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
126 if (!IS_ERR(msg))
127 return msg;
128 }
129
130 while (1) {
131 spin_lock(&pool->lock);
132 if (likely(pool->num)) {
133 msg = list_entry(pool->msgs.next, struct ceph_msg,
134 list_head);
135 list_del_init(&msg->list_head);
136 pool->num--;
137 dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
138 pool->num, pool->min);
139 spin_unlock(&pool->lock);
140 return msg;
141 }
142 pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
143 pool->min, pool->blocking ? "waiting" : "may fail");
144 spin_unlock(&pool->lock);
145
146 if (!pool->blocking) {
147 WARN_ON(1);
148
149 /* maybe we can allocate it now? */
150 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
151 if (!IS_ERR(msg))
152 return msg;
153
154 pr_err("msgpool_get %p empty + alloc failed\n", pool);
155 return ERR_PTR(-ENOMEM);
156 }
157
158 init_wait(&wait);
159 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
160 schedule();
161 finish_wait(&pool->wait, &wait);
162 }
163}
164
165void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
166{
167 spin_lock(&pool->lock);
168 if (pool->num < pool->min) {
169 /* reset msg front_len; user may have changed it */
170 msg->front.iov_len = pool->front_len;
171 msg->hdr.front_len = cpu_to_le32(pool->front_len);
172
173 kref_set(&msg->kref, 1); /* retake a single ref */
174 list_add(&msg->list_head, &pool->msgs);
175 pool->num++;
176 dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
177 pool->num, pool->min);
178 spin_unlock(&pool->lock);
179 wake_up(&pool->wait);
180 } else {
181 dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
182 pool->num, pool->min);
183 spin_unlock(&pool->lock);
184 ceph_msg_kfree(msg);
185 }
186}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
new file mode 100644
index 000000000000..bc834bfcd720
--- /dev/null
+++ b/fs/ceph/msgpool.h
@@ -0,0 +1,27 @@
1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL
3
4#include "messenger.h"
5
6/*
7 * we use memory pools for preallocating messages we may receive, to
8 * avoid unexpected OOM conditions.
9 */
10struct ceph_msgpool {
11 spinlock_t lock;
12 int front_len; /* preallocated payload size */
13 struct list_head msgs; /* msgs in the pool; each has 1 ref */
14 int num, min; /* cur, min # msgs in the pool */
15 bool blocking;
16 wait_queue_head_t wait;
17};
18
19extern int ceph_msgpool_init(struct ceph_msgpool *pool,
20 int front_len, int size, bool blocking);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
26
27#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
new file mode 100644
index 000000000000..8aaab414f3f8
--- /dev/null
+++ b/fs/ceph/msgr.h
@@ -0,0 +1,158 @@
1#ifndef __MSGR_H
2#define __MSGR_H
3
4/*
5 * Data types for message passing layer used by Ceph.
6 */
7
8#define CEPH_MON_PORT 6789 /* default monitor port */
9
10/*
11 * client-side processes will try to bind to ports in this
12 * range, simply for the benefit of tools like nmap or wireshark
13 * that would like to identify the protocol.
14 */
15#define CEPH_PORT_FIRST 6789
16#define CEPH_PORT_START 6800 /* non-monitors start here */
17#define CEPH_PORT_LAST 6900
18
19/*
20 * tcp connection banner. include a protocol version, and adjust it
21 * whenever the wire protocol changes. try to keep this string length
22 * constant.
23 */
24#define CEPH_BANNER "ceph v027"
25#define CEPH_BANNER_MAX_LEN 30
26
27
28/*
29 * Rollover-safe type and comparator for 32-bit sequence numbers.
30 * Comparator returns -1, 0, or 1.
31 */
32typedef __u32 ceph_seq_t;
33
34static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
35{
36 return (__s32)a - (__s32)b;
37}
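
/*
 * Rollover example (plain 32-bit arithmetic): ceph_seq_cmp(1, 0xffffffff)
 * evaluates (__s32)(1 - 0xffffffff) == (__s32)2 > 0, so seq 1 correctly
 * sorts after 0xffffffff across the wraparound.
 */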
38
39
40/*
41 * entity_name -- logical name for a process participating in the
42 * network, e.g. 'mds0' or 'osd3'.
43 */
44struct ceph_entity_name {
45 __u8 type; /* CEPH_ENTITY_TYPE_* */
46 __le64 num;
47} __attribute__ ((packed));
48
49#define CEPH_ENTITY_TYPE_MON 0x01
50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_ADMIN 0x10
54#define CEPH_ENTITY_TYPE_AUTH 0x20
55
56#define CEPH_ENTITY_TYPE_ANY 0xFF
57
58extern const char *ceph_entity_type_name(int type);
59
60/*
61 * entity_addr -- network address
62 */
63struct ceph_entity_addr {
64 __le32 type;
65 __le32 nonce; /* unique id for process (e.g. pid) */
66 struct sockaddr_storage in_addr;
67} __attribute__ ((packed));
68
69struct ceph_entity_inst {
70 struct ceph_entity_name name;
71 struct ceph_entity_addr addr;
72} __attribute__ ((packed));
73
74
75/* used by message exchange protocol */
76#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
77#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
78#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
79 incoming connection */
80#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
81 with higher cseq */
82#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
83 with higher gseq */
84#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
85#define CEPH_MSGR_TAG_MSG 7 /* message */
86#define CEPH_MSGR_TAG_ACK 8 /* message ack */
87#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
88#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
89#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
90#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
91
92
93/*
94 * connection negotiation
95 */
96struct ceph_msg_connect {
97 __le64 features; /* supported feature bits */
98 __le32 host_type; /* CEPH_ENTITY_TYPE_* */
99 __le32 global_seq; /* count connections initiated by this host */
100 __le32 connect_seq; /* count connections initiated in this session */
101 __le32 protocol_version;
102 __le32 authorizer_protocol;
103 __le32 authorizer_len;
104 __u8 flags; /* CEPH_MSG_CONNECT_* */
105} __attribute__ ((packed));
106
107struct ceph_msg_connect_reply {
108 __u8 tag;
109 __le64 features; /* feature bits for this session */
110 __le32 global_seq;
111 __le32 connect_seq;
112 __le32 protocol_version;
113 __le32 authorizer_len;
114 __u8 flags;
115} __attribute__ ((packed));
116
117#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
118
119
120/*
121 * message header
122 */
123struct ceph_msg_header {
124 __le64 seq; /* message seq# for this session */
125 __le64 tid; /* transaction id */
126 __le16 type; /* message type */
127 __le16 priority; /* priority. higher value == higher priority */
128 __le16 version; /* version of message encoding */
129
130 __le32 front_len; /* bytes in main payload */
131 __le32 middle_len;/* bytes in middle payload */
132 __le32 data_len; /* bytes of data payload */
133 __le16 data_off; /* sender: include full offset;
134 receiver: mask against ~PAGE_MASK */
135
136 struct ceph_entity_inst src, orig_src;
137 __le32 reserved;
138 __le32 crc; /* header crc32c */
139} __attribute__ ((packed));
140
141#define CEPH_MSG_PRIO_LOW 64
142#define CEPH_MSG_PRIO_DEFAULT 127
143#define CEPH_MSG_PRIO_HIGH 196
144#define CEPH_MSG_PRIO_HIGHEST 255
145
146/*
147 * follows data payload
148 */
149struct ceph_msg_footer {
150 __le32 front_crc, middle_crc, data_crc;
151 __u8 flags;
152} __attribute__ ((packed));
153
154#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
155#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
156
157
158#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
new file mode 100644
index 000000000000..c7b4dedaace6
--- /dev/null
+++ b/fs/ceph/osd_client.c
@@ -0,0 +1,1550 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/highmem.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/slab.h>
8#include <linux/uaccess.h>
9
10#include "super.h"
11#include "osd_client.h"
12#include "messenger.h"
13#include "decode.h"
14#include "auth.h"
15
16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512
18
19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd);
22
23static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
24
25/*
26 * Implement client access to distributed object storage cluster.
27 *
28 * All data objects are stored within a cluster/cloud of OSDs, or
29 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
30 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
31 * remote daemons serving up and coordinating consistent and safe
32 * access to storage.
33 *
34 * Cluster membership and the mapping of data objects onto storage devices
35 * are described by the osd map.
36 *
37 * We keep track of pending OSD requests (read, write), resubmit
38 * requests to different OSDs when the cluster topology/data layout
39 * change, or retry the affected requests when the communications
40 * channel with an OSD is reset.
41 */
42
43/*
44 * calculate the mapping of a file extent onto an object, and fill out the
45 * request accordingly. shorten extent as necessary if it crosses an
46 * object boundary.
47 *
48 * fill osd op in request message.
49 */
50static void calc_layout(struct ceph_osd_client *osdc,
51 struct ceph_vino vino, struct ceph_file_layout *layout,
52 u64 off, u64 *plen,
53 struct ceph_osd_request *req)
54{
55 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
56 struct ceph_osd_op *op = (void *)(reqhead + 1);
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59 u64 bno;
60
61 reqhead->snapid = cpu_to_le64(vino.snap);
62
63 /* object extent? */
64 ceph_calc_file_object_mapping(layout, off, plen, &bno,
65 &objoff, &objlen);
66 if (*plen < orig_len)
67 dout(" skipping last %llu, final file extent %llu~%llu\n",
68 orig_len - *plen, off, *plen);
69
70 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
71 req->r_oid_len = strlen(req->r_oid);
72
73 op->extent.offset = cpu_to_le64(objoff);
74 op->extent.length = cpu_to_le64(objlen);
75 req->r_num_pages = calc_pages_for(off, *plen);
76
77 dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
78 req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
79}
80
81/*
82 * requests
83 */
84void ceph_osdc_release_request(struct kref *kref)
85{
86 struct ceph_osd_request *req = container_of(kref,
87 struct ceph_osd_request,
88 r_kref);
89
90 if (req->r_request)
91 ceph_msg_put(req->r_request);
92 if (req->r_reply)
93 ceph_msg_put(req->r_reply);
94 if (req->r_con_filling_msg) {
95 dout("release_request revoking pages %p from con %p\n",
96 req->r_pages, req->r_con_filling_msg);
97 ceph_con_revoke_message(req->r_con_filling_msg,
98 req->r_reply);
99 ceph_con_put(req->r_con_filling_msg);
100 }
101 if (req->r_own_pages)
102 ceph_release_page_vector(req->r_pages,
103 req->r_num_pages);
104 ceph_put_snap_context(req->r_snapc);
105 if (req->r_mempool)
106 mempool_free(req, req->r_osdc->req_mempool);
107 else
108 kfree(req);
109}
110
111/*
112 * build new request AND message, calculate layout, and adjust file
113 * extent as needed.
114 *
115 * if the file was recently truncated, we include information about its
116 * old and new size so that the object can be updated appropriately. (we
117 * avoid synchronously deleting truncated objects because it's slow.)
118 *
119 * if @do_sync, include a 'startsync' command so that the osd will flush
120 * data quickly.
121 */
122struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 off, u64 *plen,
126 int opcode, int flags,
127 struct ceph_snap_context *snapc,
128 int do_sync,
129 u32 truncate_seq,
130 u64 truncate_size,
131 struct timespec *mtime,
132 bool use_mempool, int num_reply)
133{
134 struct ceph_osd_request *req;
135 struct ceph_msg *msg;
136 struct ceph_osd_request_head *head;
137 struct ceph_osd_op *op;
138 void *p;
139 int num_op = 1 + do_sync;
140 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
141 int i;
142
143 if (use_mempool) {
144 req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
145 memset(req, 0, sizeof(*req));
146 } else {
147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 }
149 if (req == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool;
154 kref_init(&req->r_kref);
155 init_completion(&req->r_completion);
156 init_completion(&req->r_safe_completion);
157 INIT_LIST_HEAD(&req->r_unsafe_item);
158 req->r_flags = flags;
159
160 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
161
162 /* create reply message */
163 if (use_mempool)
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
168 if (IS_ERR(msg)) {
169 ceph_osdc_put_request(req);
170 return ERR_CAST(msg);
171 }
172 req->r_reply = msg;
173
174 /* create request message; allow space for oid */
175 msg_size += 40;
176 if (snapc)
177 msg_size += sizeof(u64) * snapc->num_snaps;
178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
182 if (IS_ERR(msg)) {
183 ceph_osdc_put_request(req);
184 return ERR_CAST(msg);
185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len);
188 head = msg->front.iov_base;
189 op = (void *)(head + 1);
190 p = (void *)(op + num_op);
191
192 req->r_request = msg;
193 req->r_snapc = ceph_get_snap_context(snapc);
194
195 head->client_inc = cpu_to_le32(1); /* always, for now. */
196 head->flags = cpu_to_le32(flags);
197 if (flags & CEPH_OSD_FLAG_WRITE)
198 ceph_encode_timespec(&head->mtime, mtime);
199 head->num_ops = cpu_to_le16(num_op);
200 op->op = cpu_to_le16(opcode);
201
202 /* calculate max write size */
203 calc_layout(osdc, vino, layout, off, plen, req);
204 req->r_file_layout = *layout; /* keep a copy */
205
206 if (flags & CEPH_OSD_FLAG_WRITE) {
207 req->r_request->hdr.data_off = cpu_to_le16(off);
208 req->r_request->hdr.data_len = cpu_to_le32(*plen);
209 op->payload_len = cpu_to_le32(*plen);
210 }
211 op->extent.truncate_size = cpu_to_le64(truncate_size);
212 op->extent.truncate_seq = cpu_to_le32(truncate_seq);
213
214 /* fill in oid */
215 head->object_len = cpu_to_le32(req->r_oid_len);
216 memcpy(p, req->r_oid, req->r_oid_len);
217 p += req->r_oid_len;
218
219 if (do_sync) {
220 op++;
221 op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
222 }
223 if (snapc) {
224 head->snap_seq = cpu_to_le64(snapc->seq);
225 head->num_snaps = cpu_to_le32(snapc->num_snaps);
226 for (i = 0; i < snapc->num_snaps; i++) {
227 put_unaligned_le64(snapc->snaps[i], p);
228 p += sizeof(u64);
229 }
230 }
231
232 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
233 msg_size = p - msg->front.iov_base;
234 msg->front.iov_len = msg_size;
235 msg->hdr.front_len = cpu_to_le32(msg_size);
236 return req;
237}
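
/*
 * An illustrative call (hypothetical values; the real callers live in
 * the file and writeback paths):
 *
 *	req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
 *				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
 *				    NULL, 0,		(no snapc, no sync)
 *				    truncate_seq, truncate_size,
 *				    NULL, false, 1);	(no mtime/mempool)
 */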
238
239/*
240 * We keep osd requests in an rbtree, sorted by ->r_tid.
241 */
242static void __insert_request(struct ceph_osd_client *osdc,
243 struct ceph_osd_request *new)
244{
245 struct rb_node **p = &osdc->requests.rb_node;
246 struct rb_node *parent = NULL;
247 struct ceph_osd_request *req = NULL;
248
249 while (*p) {
250 parent = *p;
251 req = rb_entry(parent, struct ceph_osd_request, r_node);
252 if (new->r_tid < req->r_tid)
253 p = &(*p)->rb_left;
254 else if (new->r_tid > req->r_tid)
255 p = &(*p)->rb_right;
256 else
257 BUG();
258 }
259
260 rb_link_node(&new->r_node, parent, p);
261 rb_insert_color(&new->r_node, &osdc->requests);
262}
263
264static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
265 u64 tid)
266{
267 struct ceph_osd_request *req;
268 struct rb_node *n = osdc->requests.rb_node;
269
270 while (n) {
271 req = rb_entry(n, struct ceph_osd_request, r_node);
272 if (tid < req->r_tid)
273 n = n->rb_left;
274 else if (tid > req->r_tid)
275 n = n->rb_right;
276 else
277 return req;
278 }
279 return NULL;
280}
281
282static struct ceph_osd_request *
283__lookup_request_ge(struct ceph_osd_client *osdc,
284 u64 tid)
285{
286 struct ceph_osd_request *req;
287 struct rb_node *n = osdc->requests.rb_node;
288
289 while (n) {
290 req = rb_entry(n, struct ceph_osd_request, r_node);
291 if (tid < req->r_tid) {
292 if (!n->rb_left)
293 return req;
294 n = n->rb_left;
295 } else if (tid > req->r_tid) {
296 n = n->rb_right;
297 } else {
298 return req;
299 }
300 }
301 return NULL;
302}
303
304
305/*
306 * If the osd connection drops, we need to resubmit all requests.
307 */
308static void osd_reset(struct ceph_connection *con)
309{
310 struct ceph_osd *osd = con->private;
311 struct ceph_osd_client *osdc;
312
313 if (!osd)
314 return;
315 dout("osd_reset osd%d\n", osd->o_osd);
316 osdc = osd->o_osdc;
317 down_read(&osdc->map_sem);
318 kick_requests(osdc, osd);
319 up_read(&osdc->map_sem);
320}
321
322/*
323 * Track open sessions with osds.
324 */
325static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
326{
327 struct ceph_osd *osd;
328
329 osd = kzalloc(sizeof(*osd), GFP_NOFS);
330 if (!osd)
331 return NULL;
332
333 atomic_set(&osd->o_ref, 1);
334 osd->o_osdc = osdc;
335 INIT_LIST_HEAD(&osd->o_requests);
336 INIT_LIST_HEAD(&osd->o_osd_lru);
337 osd->o_incarnation = 1;
338
339 ceph_con_init(osdc->client->msgr, &osd->o_con);
340 osd->o_con.private = osd;
341 osd->o_con.ops = &osd_con_ops;
342 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
343
344 INIT_LIST_HEAD(&osd->o_keepalive_item);
345 return osd;
346}
347
348static struct ceph_osd *get_osd(struct ceph_osd *osd)
349{
350 if (atomic_inc_not_zero(&osd->o_ref)) {
351 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
352 atomic_read(&osd->o_ref));
353 return osd;
354 } else {
355 dout("get_osd %p FAIL\n", osd);
356 return NULL;
357 }
358}
359
360static void put_osd(struct ceph_osd *osd)
361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref))
365 kfree(osd);
366}
367
368/*
369 * remove an osd from our map
370 */
371static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
372{
373 dout("__remove_osd %p\n", osd);
374 BUG_ON(!list_empty(&osd->o_requests));
375 rb_erase(&osd->o_node, &osdc->osds);
376 list_del_init(&osd->o_osd_lru);
377 ceph_con_close(&osd->o_con);
378 put_osd(osd);
379}
380
381static void __move_osd_to_lru(struct ceph_osd_client *osdc,
382 struct ceph_osd *osd)
383{
384 dout("__move_osd_to_lru %p\n", osd);
385 BUG_ON(!list_empty(&osd->o_osd_lru));
386 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
387 osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
388}
389
390static void __remove_osd_from_lru(struct ceph_osd *osd)
391{
392 dout("__remove_osd_from_lru %p\n", osd);
393 if (!list_empty(&osd->o_osd_lru))
394 list_del_init(&osd->o_osd_lru);
395}
396
397static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
398{
399 struct ceph_osd *osd, *nosd;
400
401 dout("remove_old_osds %p\n", osdc);
402 mutex_lock(&osdc->request_mutex);
403 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
404 if (!remove_all && time_before(jiffies, osd->lru_ttl))
405 break;
406 __remove_osd(osdc, osd);
407 }
408 mutex_unlock(&osdc->request_mutex);
409}
410
411/*
412 * reset osd connect
413 */
414static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
415{
416 struct ceph_osd_request *req;
417 int ret = 0;
418
419 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
420 if (list_empty(&osd->o_requests)) {
421 __remove_osd(osdc, osd);
422 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
423 &osd->o_con.peer_addr,
424 sizeof(osd->o_con.peer_addr)) == 0 &&
425 !ceph_con_opened(&osd->o_con)) {
426 dout(" osd addr hasn't changed and connection never opened,"
427 " letting msgr retry");
428 /* touch each r_stamp for handle_timeout()'s benefit */
429 list_for_each_entry(req, &osd->o_requests, r_osd_item)
430 req->r_stamp = jiffies;
431 ret = -EAGAIN;
432 } else {
433 ceph_con_close(&osd->o_con);
434 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
435 osd->o_incarnation++;
436 }
437 return ret;
438}
439
440static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
441{
442 struct rb_node **p = &osdc->osds.rb_node;
443 struct rb_node *parent = NULL;
444 struct ceph_osd *osd = NULL;
445
446 while (*p) {
447 parent = *p;
448 osd = rb_entry(parent, struct ceph_osd, o_node);
449 if (new->o_osd < osd->o_osd)
450 p = &(*p)->rb_left;
451 else if (new->o_osd > osd->o_osd)
452 p = &(*p)->rb_right;
453 else
454 BUG();
455 }
456
457 rb_link_node(&new->o_node, parent, p);
458 rb_insert_color(&new->o_node, &osdc->osds);
459}
460
461static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
462{
463 struct ceph_osd *osd;
464 struct rb_node *n = osdc->osds.rb_node;
465
466 while (n) {
467 osd = rb_entry(n, struct ceph_osd, o_node);
468 if (o < osd->o_osd)
469 n = n->rb_left;
470 else if (o > osd->o_osd)
471 n = n->rb_right;
472 else
473 return osd;
474 }
475 return NULL;
476}
477
478static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
479{
480 schedule_delayed_work(&osdc->timeout_work,
481 osdc->client->mount_args->osd_keepalive_timeout * HZ);
482}
483
484static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
485{
486 cancel_delayed_work(&osdc->timeout_work);
487}
488
489/*
490 * Register request, assign tid. If this is the first request, set up
491 * the timeout event.
492 */
493static void register_request(struct ceph_osd_client *osdc,
494 struct ceph_osd_request *req)
495{
496 mutex_lock(&osdc->request_mutex);
497 req->r_tid = ++osdc->last_tid;
498 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
499 INIT_LIST_HEAD(&req->r_req_lru_item);
500
501 dout("register_request %p tid %lld\n", req, req->r_tid);
502 __insert_request(osdc, req);
503 ceph_osdc_get_request(req);
504 osdc->num_requests++;
505
506 if (osdc->num_requests == 1) {
507 dout(" first request, scheduling timeout\n");
508 __schedule_osd_timeout(osdc);
509 }
510 mutex_unlock(&osdc->request_mutex);
511}
512
513/*
514 * called under osdc->request_mutex
515 */
516static void __unregister_request(struct ceph_osd_client *osdc,
517 struct ceph_osd_request *req)
518{
519 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
520 rb_erase(&req->r_node, &osdc->requests);
521 osdc->num_requests--;
522
523 if (req->r_osd) {
524 /* make sure the original request isn't in flight. */
525 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
526
527 list_del_init(&req->r_osd_item);
528 if (list_empty(&req->r_osd->o_requests))
529 __move_osd_to_lru(osdc, req->r_osd);
530 req->r_osd = NULL;
531 }
532
533 ceph_osdc_put_request(req);
534
535 list_del_init(&req->r_req_lru_item);
536 if (osdc->num_requests == 0) {
537 dout(" no requests, canceling timeout\n");
538 __cancel_osd_timeout(osdc);
539 }
540}
541
542/*
543 * Cancel a previously queued request message
544 */
545static void __cancel_request(struct ceph_osd_request *req)
546{
547 if (req->r_sent) {
548 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
549 req->r_sent = 0;
550 }
551 list_del_init(&req->r_req_lru_item);
552}
553
554/*
555 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
556 * (as needed), and set the request r_osd appropriately. If there is
557 * no up osd, set r_osd to NULL.
558 *
559 * Return 0 if unchanged, 1 if changed, or negative on error.
560 *
561 * Caller should hold map_sem for read and request_mutex.
562 */
563static int __map_osds(struct ceph_osd_client *osdc,
564 struct ceph_osd_request *req)
565{
566 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
567 struct ceph_pg pgid;
568 int o = -1;
569 int err;
570
571 dout("map_osds %p tid %lld\n", req, req->r_tid);
572 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
573 &req->r_file_layout, osdc->osdmap);
574 if (err)
575 return err;
576 pgid = reqhead->layout.ol_pgid;
577 req->r_pgid = pgid;
578
579 o = ceph_calc_pg_primary(osdc->osdmap, pgid);
580
581 if ((req->r_osd && req->r_osd->o_osd == o &&
582 req->r_sent >= req->r_osd->o_incarnation) ||
583 (req->r_osd == NULL && o == -1))
584 return 0; /* no change */
585
586 dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
587 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
588 req->r_osd ? req->r_osd->o_osd : -1);
589
590 if (req->r_osd) {
591 __cancel_request(req);
592 list_del_init(&req->r_osd_item);
593 req->r_osd = NULL;
594 }
595
596 req->r_osd = __lookup_osd(osdc, o);
597 if (!req->r_osd && o >= 0) {
598 err = -ENOMEM;
599 req->r_osd = create_osd(osdc);
600 if (!req->r_osd)
601 goto out;
602
603 dout("map_osds osd %p is osd%d\n", req->r_osd, o);
604 req->r_osd->o_osd = o;
605 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
606 __insert_osd(osdc, req->r_osd);
607
608 ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
609 }
610
611 if (req->r_osd) {
612 __remove_osd_from_lru(req->r_osd);
613 list_add(&req->r_osd_item, &req->r_osd->o_requests);
614 }
615 err = 1; /* osd changed */
616
617out:
618 return err;
619}
620
621/*
622 * caller should hold map_sem (for read) and request_mutex
623 */
624static int __send_request(struct ceph_osd_client *osdc,
625 struct ceph_osd_request *req)
626{
627 struct ceph_osd_request_head *reqhead;
628 int err;
629
630 err = __map_osds(osdc, req);
631 if (err < 0)
632 return err;
633 if (req->r_osd == NULL) {
634 dout("send_request %p no up osds in pg\n", req);
635 ceph_monc_request_next_osdmap(&osdc->client->monc);
636 return 0;
637 }
638
639 dout("send_request %p tid %llu to osd%d flags %d\n",
640 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
641
642 reqhead = req->r_request->front.iov_base;
643 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
644 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
645 reqhead->reassert_version = req->r_reassert_version;
646
647 req->r_stamp = jiffies;
648 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
649
650 ceph_msg_get(req->r_request); /* send consumes a ref */
651 ceph_con_send(&req->r_osd->o_con, req->r_request);
652 req->r_sent = req->r_osd->o_incarnation;
653 return 0;
654}
655
656/*
657 * Timeout callback, called every N seconds when 1 or more osd
658 * requests have been active for more than N seconds. When this
659 * happens, we ping all OSDs with requests that have timed out to
660 * ensure any communications channel reset is detected. Reset the
661 * request timeouts another N seconds in the future as we go.
662 * Reschedule the timeout event another N seconds in the future
663 * (unless there are no open requests).
664 */
665static void handle_timeout(struct work_struct *work)
666{
667 struct ceph_osd_client *osdc =
668 container_of(work, struct ceph_osd_client, timeout_work.work);
669 struct ceph_osd_request *req, *last_req = NULL;
670 struct ceph_osd *osd;
671 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
672 unsigned long keepalive =
673 osdc->client->mount_args->osd_keepalive_timeout * HZ;
674 unsigned long last_stamp = 0;
675 struct rb_node *p;
676 struct list_head slow_osds;
677
678 dout("timeout\n");
679 down_read(&osdc->map_sem);
680
681 ceph_monc_request_next_osdmap(&osdc->client->monc);
682
683 mutex_lock(&osdc->request_mutex);
684 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
685 req = rb_entry(p, struct ceph_osd_request, r_node);
686
687 if (req->r_resend) {
688 int err;
689
690 dout("osdc resending prev failed %lld\n", req->r_tid);
691 err = __send_request(osdc, req);
692 if (err)
693 dout("osdc failed again on %lld\n", req->r_tid);
694 else
695 req->r_resend = false;
696 continue;
697 }
698 }
699
700 /*
701 * reset osds that appear to be _really_ unresponsive. this
702 * is a failsafe measure; we really shouldn't be getting to
703 * this point if the system is working properly. the monitors
704 * should mark the osd as failed and we should find out about
705 * it from an updated osd map.
706 */
707 while (!list_empty(&osdc->req_lru)) {
708 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
709 r_req_lru_item);
710
711 if (time_before(jiffies, req->r_stamp + timeout))
712 break;
713
714 BUG_ON(req == last_req && req->r_stamp == last_stamp);
715 last_req = req;
716 last_stamp = req->r_stamp;
717
718 osd = req->r_osd;
719 BUG_ON(!osd);
720 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
721 req->r_tid, osd->o_osd);
722 __kick_requests(osdc, osd);
723 }
724
725 /*
726 * ping osds that are a bit slow. this ensures that if there
727 * is a break in the TCP connection we will notice, and reopen
728 * a connection with that osd (from the fault callback).
729 */
730 INIT_LIST_HEAD(&slow_osds);
731 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
732 if (time_before(jiffies, req->r_stamp + keepalive))
733 break;
734
735 osd = req->r_osd;
736 BUG_ON(!osd);
737 dout(" tid %llu is slow, will send keepalive on osd%d\n",
738 req->r_tid, osd->o_osd);
739 list_move_tail(&osd->o_keepalive_item, &slow_osds);
740 }
741 while (!list_empty(&slow_osds)) {
742 osd = list_entry(slow_osds.next, struct ceph_osd,
743 o_keepalive_item);
744 list_del_init(&osd->o_keepalive_item);
745 ceph_con_keepalive(&osd->o_con);
746 }
747
748 __schedule_osd_timeout(osdc);
749 mutex_unlock(&osdc->request_mutex);
750
751 up_read(&osdc->map_sem);
752}
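/*
 * Editorial sketch of the two thresholds used above (the concrete
 * values are mount options; the numbers here are only illustrative):
 * with osd_timeout=60 and osd_keepalive_timeout=5, a request whose
 * r_stamp is more than 5s old earns its osd a keepalive ping, while
 * one more than 60s old forces a reset of its osd via
 * __kick_requests().  r_stamp is refreshed by __send_request(), so a
 * successfully resent request moves back to the tail of req_lru.
 */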
753
754static void handle_osds_timeout(struct work_struct *work)
755{
756 struct ceph_osd_client *osdc =
757 container_of(work, struct ceph_osd_client,
758 osds_timeout_work.work);
759 unsigned long delay =
760 osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
761
762 dout("osds timeout\n");
763 down_read(&osdc->map_sem);
764 remove_old_osds(osdc, 0);
765 up_read(&osdc->map_sem);
766
767 schedule_delayed_work(&osdc->osds_timeout_work,
768 round_jiffies_relative(delay));
769}
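/*
 * Note (editorial): '>>' binds more loosely than '*', so the delay
 * computed above is (osd_idle_ttl * HZ) / 4.  The idle-osd scan thus
 * runs four times per ttl, and remove_old_osds() closes out any osd
 * that has sat unused on the lru for longer than its lru_ttl.
 */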
770
771/*
772 * handle osd op reply. either call the callback if it is specified,
773 * or do the completion to wake up the waiting thread.
774 */
775static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
776 struct ceph_connection *con)
777{
778 struct ceph_osd_reply_head *rhead = msg->front.iov_base;
779 struct ceph_osd_request *req;
780 u64 tid;
781 int numops, object_len, flags;
782
783 tid = le64_to_cpu(msg->hdr.tid);
784 if (msg->front.iov_len < sizeof(*rhead))
785 goto bad;
786 numops = le32_to_cpu(rhead->num_ops);
787 object_len = le32_to_cpu(rhead->object_len);
788 if (msg->front.iov_len != sizeof(*rhead) + object_len +
789 numops * sizeof(struct ceph_osd_op))
790 goto bad;
791 dout("handle_reply %p tid %llu\n", msg, tid);
792
793 /* lookup */
794 mutex_lock(&osdc->request_mutex);
795 req = __lookup_request(osdc, tid);
796 if (req == NULL) {
797 dout("handle_reply tid %llu dne\n", tid);
798 mutex_unlock(&osdc->request_mutex);
799 return;
800 }
801 ceph_osdc_get_request(req);
802 flags = le32_to_cpu(rhead->flags);
803
804 /*
805 * if this connection filled our message, drop our reference now, to
806 * avoid a (safe but slower) revoke later.
807 */
808 if (req->r_con_filling_msg == con && req->r_reply == msg) {
809 dout(" dropping con_filling_msg ref %p\n", con);
810 req->r_con_filling_msg = NULL;
811 ceph_con_put(con);
812 }
813
814 if (!req->r_got_reply) {
815 unsigned bytes;
816
817 req->r_result = le32_to_cpu(rhead->result);
818 bytes = le32_to_cpu(msg->hdr.data_len);
819 dout("handle_reply result %d bytes %d\n", req->r_result,
820 bytes);
821 if (req->r_result == 0)
822 req->r_result = bytes;
823
824 /* in case this is a write and we need to replay it */
825 req->r_reassert_version = rhead->reassert_version;
826
827 req->r_got_reply = 1;
828 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
829 dout("handle_reply tid %llu dup ack\n", tid);
830 mutex_unlock(&osdc->request_mutex);
831 goto done;
832 }
833
834 dout("handle_reply tid %llu flags %d\n", tid, flags);
835
836 /* either this is a read, or we got the safe response */
837 if ((flags & CEPH_OSD_FLAG_ONDISK) ||
838 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
839 __unregister_request(osdc, req);
840
841 mutex_unlock(&osdc->request_mutex);
842
843 if (req->r_callback)
844 req->r_callback(req, msg);
845 else
846 complete(&req->r_completion);
847
848 if (flags & CEPH_OSD_FLAG_ONDISK) {
849 if (req->r_safe_callback)
850 req->r_safe_callback(req, msg);
851 complete(&req->r_safe_completion); /* fsync waiter */
852 }
853
854done:
855 ceph_osdc_put_request(req);
856 return;
857
858bad:
859 pr_err("corrupt osd_op_reply got %d %d expected %d\n",
860 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
861 (int)sizeof(*rhead));
862 ceph_msg_dump(msg);
863}
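/*
 * Editorial sketch of the two-phase reply handling above: a write is
 * first acked when applied (r_completion) and later committed
 * "ondisk" (r_safe_completion).  A caller that must survive an osd
 * restart therefore waits on the second completion, e.g. (error
 * handling elided):
 *
 *	rc = ceph_osdc_start_request(osdc, req, false);
 *	if (!rc)
 *		wait_for_completion(&req->r_safe_completion);
 *	ceph_osdc_put_request(req);
 */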
864
865
866static int __kick_requests(struct ceph_osd_client *osdc,
867 struct ceph_osd *kickosd)
868{
869 struct ceph_osd_request *req;
870 struct rb_node *p, *n;
871 int needmap = 0;
872 int err;
873
874 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
875 if (kickosd) {
876 err = __reset_osd(osdc, kickosd);
877 if (err == -EAGAIN)
878 return 1;
879 } else {
880 for (p = rb_first(&osdc->osds); p; p = n) {
881 struct ceph_osd *osd =
882 rb_entry(p, struct ceph_osd, o_node);
883
884 n = rb_next(p);
885 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
886 memcmp(&osd->o_con.peer_addr,
887 ceph_osd_addr(osdc->osdmap,
888 osd->o_osd),
889 sizeof(struct ceph_entity_addr)) != 0)
890 __reset_osd(osdc, osd);
891 }
892 }
893
894 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
895 req = rb_entry(p, struct ceph_osd_request, r_node);
896
897 if (req->r_resend) {
898 dout(" r_resend set on tid %llu\n", req->r_tid);
899 __cancel_request(req);
900 goto kick;
901 }
902 if (req->r_osd && kickosd == req->r_osd) {
903 __cancel_request(req);
904 goto kick;
905 }
906
907 err = __map_osds(osdc, req);
908 if (err == 0)
909 continue; /* no change */
910 if (err < 0) {
911 /*
912 * FIXME: really, we should set the request
913 * error and fail if this isn't a 'nofail'
914 * request, but that's a fair bit more
915 * complicated to do. So retry!
916 */
917 dout(" setting r_resend on %llu\n", req->r_tid);
918 req->r_resend = true;
919 continue;
920 }
921 if (req->r_osd == NULL) {
922 dout("tid %llu maps to no valid osd\n", req->r_tid);
923 needmap++; /* request a newer map */
924 continue;
925 }
926
927kick:
928 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
929 req->r_osd ? req->r_osd->o_osd : -1);
930 req->r_flags |= CEPH_OSD_FLAG_RETRY;
931 err = __send_request(osdc, req);
932 if (err) {
933 dout(" setting r_resend on %llu\n", req->r_tid);
934 req->r_resend = true;
935 }
936 }
937
938 return needmap;
939}
940
941/*
942 * Resubmit osd requests whose osd or osd address has changed. Request
943 * a new osd map if osds are down, or we are otherwise unable to determine
944 * how to direct a request.
945 *
946 * Close connections to down osds.
947 *
948 * If @kickosd is specified, resubmit requests for that specific osd.
949 *
950 * Caller should hold map_sem for read and request_mutex.
951 */
952static void kick_requests(struct ceph_osd_client *osdc,
953 struct ceph_osd *kickosd)
954{
955 int needmap;
956
957 mutex_lock(&osdc->request_mutex);
958 needmap = __kick_requests(osdc, kickosd);
959 mutex_unlock(&osdc->request_mutex);
960
961 if (needmap) {
962 dout("%d requests for down osds, need new map\n", needmap);
963 ceph_monc_request_next_osdmap(&osdc->client->monc);
964 }
965}
966
967/*
968 * Process updated osd map.
969 *
970 * The message contains any number of incremental and full maps, normally
971 * indicating some sort of topology change in the cluster. Kick requests
972 * off to different OSDs as needed.
973 */
974void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
975{
976 void *p, *end, *next;
977 u32 nr_maps, maplen;
978 u32 epoch;
979 struct ceph_osdmap *newmap = NULL, *oldmap;
980 int err;
981 struct ceph_fsid fsid;
982
983 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
984 p = msg->front.iov_base;
985 end = p + msg->front.iov_len;
986
987 /* verify fsid */
988 ceph_decode_need(&p, end, sizeof(fsid), bad);
989 ceph_decode_copy(&p, &fsid, sizeof(fsid));
990 if (ceph_check_fsid(osdc->client, &fsid) < 0)
991 return;
992
993 down_write(&osdc->map_sem);
994
995 /* incremental maps */
996 ceph_decode_32_safe(&p, end, nr_maps, bad);
997 dout(" %d inc maps\n", nr_maps);
998 while (nr_maps > 0) {
999 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1000 epoch = ceph_decode_32(&p);
1001 maplen = ceph_decode_32(&p);
1002 ceph_decode_need(&p, end, maplen, bad);
1003 next = p + maplen;
1004 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
1005 dout("applying incremental map %u len %d\n",
1006 epoch, maplen);
1007 newmap = osdmap_apply_incremental(&p, next,
1008 osdc->osdmap,
1009 osdc->client->msgr);
1010 if (IS_ERR(newmap)) {
1011 err = PTR_ERR(newmap);
1012 goto bad;
1013 }
1014 BUG_ON(!newmap);
1015 if (newmap != osdc->osdmap) {
1016 ceph_osdmap_destroy(osdc->osdmap);
1017 osdc->osdmap = newmap;
1018 }
1019 } else {
1020 dout("ignoring incremental map %u len %d\n",
1021 epoch, maplen);
1022 }
1023 p = next;
1024 nr_maps--;
1025 }
1026 if (newmap)
1027 goto done;
1028
1029 /* full maps */
1030 ceph_decode_32_safe(&p, end, nr_maps, bad);
1031 dout(" %d full maps\n", nr_maps);
1032 while (nr_maps) {
1033 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1034 epoch = ceph_decode_32(&p);
1035 maplen = ceph_decode_32(&p);
1036 ceph_decode_need(&p, end, maplen, bad);
1037 if (nr_maps > 1) {
1038 dout("skipping non-latest full map %u len %d\n",
1039 epoch, maplen);
1040 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
1041 dout("skipping full map %u len %d, "
1042 "older than our %u\n", epoch, maplen,
1043 osdc->osdmap->epoch);
1044 } else {
1045 dout("taking full map %u len %d\n", epoch, maplen);
1046 newmap = osdmap_decode(&p, p+maplen);
1047 if (IS_ERR(newmap)) {
1048 err = PTR_ERR(newmap);
1049 goto bad;
1050 }
1051 BUG_ON(!newmap);
1052 oldmap = osdc->osdmap;
1053 osdc->osdmap = newmap;
1054 if (oldmap)
1055 ceph_osdmap_destroy(oldmap);
1056 }
1057 p += maplen;
1058 nr_maps--;
1059 }
1060
1061done:
1062 downgrade_write(&osdc->map_sem);
1063 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1064 if (newmap)
1065 kick_requests(osdc, NULL);
1066 up_read(&osdc->map_sem);
1067 return;
1068
1069bad:
1070 pr_err("osdc handle_map corrupt msg\n");
1071 ceph_msg_dump(msg);
1072 up_write(&osdc->map_sem);
1073 return;
1074}
1075
1076
1077/*
1078 * A read request prepares specific pages that data is to be read into.
1079 * When a message is being read off the wire, we call __prepare_pages to
1080 * find those pages.
1081 * Returns 0 on success, -1 on failure.
1082 */
1083static int __prepare_pages(struct ceph_connection *con,
1084 struct ceph_msg_header *hdr,
1085 struct ceph_osd_request *req,
1086 u64 tid,
1087 struct ceph_msg *m)
1088{
1089 struct ceph_osd *osd = con->private;
1090 struct ceph_osd_client *osdc;
1091 int ret = -1;
1092 int data_len = le32_to_cpu(hdr->data_len);
1093 unsigned data_off = le16_to_cpu(hdr->data_off);
1094
1095 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1096
1097 if (!osd)
1098 return -1;
1099
1100 osdc = osd->o_osdc;
1101
1102 dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
1103 tid, req->r_num_pages, want);
1104 if (unlikely(req->r_num_pages < want))
1105 goto out;
1106 m->pages = req->r_pages;
1107 m->nr_pages = req->r_num_pages;
1108 ret = 0; /* success */
1109out:
1110 BUG_ON(ret < 0 || m->nr_pages < want);
1111
1112 return ret;
1113}
1114
1115/*
1116 * Register request, send initial attempt.
1117 */
1118int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1119 struct ceph_osd_request *req,
1120 bool nofail)
1121{
1122 int rc = 0;
1123
1124 req->r_request->pages = req->r_pages;
1125 req->r_request->nr_pages = req->r_num_pages;
1126
1127 register_request(osdc, req);
1128
1129 down_read(&osdc->map_sem);
1130 mutex_lock(&osdc->request_mutex);
1131 /*
1132 * a racing kick_requests() may have sent the message for us
1133 * while we dropped request_mutex above, so only send now if
1134 * the request still hasn't been touched yet.
1135 */
1136 if (req->r_sent == 0) {
1137 rc = __send_request(osdc, req);
1138 if (rc) {
1139 if (nofail) {
1140 dout("osdc_start_request failed send, "
1141 " marking %lld\n", req->r_tid);
1142 req->r_resend = true;
1143 rc = 0;
1144 } else {
1145 __unregister_request(osdc, req);
1146 }
1147 }
1148 }
1149 mutex_unlock(&osdc->request_mutex);
1150 up_read(&osdc->map_sem);
1151 return rc;
1152}
1153
1154/*
1155 * wait for a request to complete
1156 */
1157int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1158 struct ceph_osd_request *req)
1159{
1160 int rc;
1161
1162 rc = wait_for_completion_interruptible(&req->r_completion);
1163 if (rc < 0) {
1164 mutex_lock(&osdc->request_mutex);
1165 __cancel_request(req);
1166 __unregister_request(osdc, req);
1167 mutex_unlock(&osdc->request_mutex);
1168 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1169 return rc;
1170 }
1171
1172 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1173 return req->r_result;
1174}
1175
1176/*
1177 * sync - wait for all writes in flight at entry to commit; snapshotting last_tid keeps new writers from starving us.
1178 */
1179void ceph_osdc_sync(struct ceph_osd_client *osdc)
1180{
1181 struct ceph_osd_request *req;
1182 u64 last_tid, next_tid = 0;
1183
1184 mutex_lock(&osdc->request_mutex);
1185 last_tid = osdc->last_tid;
1186 while (1) {
1187 req = __lookup_request_ge(osdc, next_tid);
1188 if (!req)
1189 break;
1190 if (req->r_tid > last_tid)
1191 break;
1192
1193 next_tid = req->r_tid + 1;
1194 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
1195 continue;
1196
1197 ceph_osdc_get_request(req);
1198 mutex_unlock(&osdc->request_mutex);
1199 dout("sync waiting on tid %llu (last is %llu)\n",
1200 req->r_tid, last_tid);
1201 wait_for_completion(&req->r_safe_completion);
1202 mutex_lock(&osdc->request_mutex);
1203 ceph_osdc_put_request(req);
1204 }
1205 mutex_unlock(&osdc->request_mutex);
1206 dout("sync done (thru tid %llu)\n", last_tid);
1207}
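/*
 * Usage note (editorial): since the loop above waits on
 * r_safe_completion for every registered write, an fsync-style caller
 * can queue its writes and then simply call ceph_osdc_sync(osdc) to
 * ensure everything issued so far has been committed to disk.
 */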
1208
1209/*
1210 * init, shutdown
1211 */
1212int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1213{
1214 int err;
1215
1216 dout("init\n");
1217 osdc->client = client;
1218 osdc->osdmap = NULL;
1219 init_rwsem(&osdc->map_sem);
1220 init_completion(&osdc->map_waiters);
1221 osdc->last_requested_map = 0;
1222 mutex_init(&osdc->request_mutex);
1223 osdc->last_tid = 0;
1224 osdc->osds = RB_ROOT;
1225 INIT_LIST_HEAD(&osdc->osd_lru);
1226 osdc->requests = RB_ROOT;
1227 INIT_LIST_HEAD(&osdc->req_lru);
1228 osdc->num_requests = 0;
1229 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1230 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1231
1232 schedule_delayed_work(&osdc->osds_timeout_work,
1233 round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
1234
1235 err = -ENOMEM;
1236 osdc->req_mempool = mempool_create_kmalloc_pool(10,
1237 sizeof(struct ceph_osd_request));
1238 if (!osdc->req_mempool)
1239 goto out;
1240
1241 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true);
1242 if (err < 0)
1243 goto out_mempool;
1244 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1245 OSD_OPREPLY_FRONT_LEN, 10, true);
1246 if (err < 0)
1247 goto out_msgpool;
1248 return 0;
1249
1250out_msgpool:
1251 ceph_msgpool_destroy(&osdc->msgpool_op);
1252out_mempool:
1253 mempool_destroy(osdc->req_mempool);
1254out:
1255 return err;
1256}
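/*
 * Editorial sketch of the expected pairing with ceph_osdc_stop()
 * below (error handling elided; 'client' stands for the owning
 * ceph_client and is assumed to embed the osdc):
 *
 *	err = ceph_osdc_init(&client->osdc, client);
 *	if (err)
 *		return err;
 *	...issue requests...
 *	ceph_osdc_stop(&client->osdc);
 */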
1257
1258void ceph_osdc_stop(struct ceph_osd_client *osdc)
1259{
1260 cancel_delayed_work_sync(&osdc->timeout_work);
1261 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1262 if (osdc->osdmap) {
1263 ceph_osdmap_destroy(osdc->osdmap);
1264 osdc->osdmap = NULL;
1265 }
1266 remove_old_osds(osdc, 1);
1267 mempool_destroy(osdc->req_mempool);
1268 ceph_msgpool_destroy(&osdc->msgpool_op);
1269 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1270}
1271
1272/*
1273 * Read some contiguous pages. If we cross a stripe boundary, shorten
1274 * *plen. Return number of bytes read, or error.
1275 */
1276int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1277 struct ceph_vino vino, struct ceph_file_layout *layout,
1278 u64 off, u64 *plen,
1279 u32 truncate_seq, u64 truncate_size,
1280 struct page **pages, int num_pages)
1281{
1282 struct ceph_osd_request *req;
1283 int rc = 0;
1284
1285 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1286 vino.snap, off, *plen);
1287 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1288 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1289 NULL, 0, truncate_seq, truncate_size, NULL,
1290 false, 1);
1291 if (IS_ERR(req))
1292 return PTR_ERR(req);
1293
1294 /* it may be a short read due to an object boundary */
1295 req->r_pages = pages;
1296 num_pages = calc_pages_for(off, *plen);
1297 req->r_num_pages = num_pages;
1298
1299 dout("readpages final extent is %llu~%llu (%d pages)\n",
1300 off, *plen, req->r_num_pages);
1301
1302 rc = ceph_osdc_start_request(osdc, req, false);
1303 if (!rc)
1304 rc = ceph_osdc_wait_request(osdc, req);
1305
1306 ceph_osdc_put_request(req);
1307 dout("readpages result %d\n", rc);
1308 return rc;
1309}
1310
1311/*
1312 * do a synchronous write on N pages
1313 */
1314int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1315 struct ceph_file_layout *layout,
1316 struct ceph_snap_context *snapc,
1317 u64 off, u64 len,
1318 u32 truncate_seq, u64 truncate_size,
1319 struct timespec *mtime,
1320 struct page **pages, int num_pages,
1321 int flags, int do_sync, bool nofail)
1322{
1323 struct ceph_osd_request *req;
1324 int rc = 0;
1325
1326 BUG_ON(vino.snap != CEPH_NOSNAP);
1327 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1328 CEPH_OSD_OP_WRITE,
1329 flags | CEPH_OSD_FLAG_ONDISK |
1330 CEPH_OSD_FLAG_WRITE,
1331 snapc, do_sync,
1332 truncate_seq, truncate_size, mtime,
1333 nofail, 1);
1334 if (IS_ERR(req))
1335 return PTR_ERR(req);
1336
1337 /* it may be a short write due to an object boundary */
1338 req->r_pages = pages;
1339 req->r_num_pages = calc_pages_for(off, len);
1340 dout("writepages %llu~%llu (%d pages)\n", off, len,
1341 req->r_num_pages);
1342
1343 rc = ceph_osdc_start_request(osdc, req, nofail);
1344 if (!rc)
1345 rc = ceph_osdc_wait_request(osdc, req);
1346
1347 ceph_osdc_put_request(req);
1348 if (rc == 0)
1349 rc = len;
1350 dout("writepages result %d\n", rc);
1351 return rc;
1352}
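/*
 * Editorial sketch: a synchronous single-page write as a writeback
 * path might issue it (all identifiers besides the parameters above
 * are hypothetical; error handling elided):
 *
 *	rc = ceph_osdc_writepages(osdc, vino, layout, snapc,
 *				  off, PAGE_CACHE_SIZE,
 *				  truncate_seq, truncate_size, &mtime,
 *				  &page, 1, 0, 0, true);
 *
 * On success rc is the number of bytes written, i.e. len.
 */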
1353
1354/*
1355 * handle incoming message
1356 */
1357static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1358{
1359 struct ceph_osd *osd = con->private;
1360 struct ceph_osd_client *osdc;
1361 int type = le16_to_cpu(msg->hdr.type);
1362
1363 if (!osd)
1364 return;
1365 osdc = osd->o_osdc;
1366
1367 switch (type) {
1368 case CEPH_MSG_OSD_MAP:
1369 ceph_osdc_handle_map(osdc, msg);
1370 break;
1371 case CEPH_MSG_OSD_OPREPLY:
1372 handle_reply(osdc, msg, con);
1373 break;
1374
1375 default:
1376 pr_err("received unknown message type %d %s\n", type,
1377 ceph_msg_type_name(type));
1378 }
1379 ceph_msg_put(msg);
1380}
1381
1382/*
1383 * lookup and return message for incoming reply
1384 */
1385static struct ceph_msg *get_reply(struct ceph_connection *con,
1386 struct ceph_msg_header *hdr,
1387 int *skip)
1388{
1389 struct ceph_osd *osd = con->private;
1390 struct ceph_osd_client *osdc = osd->o_osdc;
1391 struct ceph_msg *m;
1392 struct ceph_osd_request *req;
1393 int front = le32_to_cpu(hdr->front_len);
1394 int data_len = le32_to_cpu(hdr->data_len);
1395 u64 tid;
1396 int err;
1397
1398 tid = le64_to_cpu(hdr->tid);
1399 mutex_lock(&osdc->request_mutex);
1400 req = __lookup_request(osdc, tid);
1401 if (!req) {
1402 *skip = 1;
1403 m = NULL;
1404 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
1405 osd->o_osd);
1406 goto out;
1407 }
1408
1409 if (req->r_con_filling_msg) {
1410 dout("get_reply revoking msg %p from old con %p\n",
1411 req->r_reply, req->r_con_filling_msg);
1412 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1413 ceph_con_put(req->r_con_filling_msg);
1414 }
1415
1416 if (front > req->r_reply->front.iov_len) {
1417 pr_warning("get_reply front %d > preallocated %d\n",
1418 front, (int)req->r_reply->front.iov_len);
1419 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
1420 if (IS_ERR(m))
1421 goto out;
1422 ceph_msg_put(req->r_reply);
1423 req->r_reply = m;
1424 }
1425 m = ceph_msg_get(req->r_reply);
1426
1427 if (data_len > 0) {
1428 err = __prepare_pages(con, hdr, req, tid, m);
1429 if (err < 0) {
1430 *skip = 1;
1431 ceph_msg_put(m);
1432 m = ERR_PTR(err);
1433 goto out;
1434 }
1435 }
1436 *skip = 0;
1437 req->r_con_filling_msg = ceph_con_get(con);
1438 dout("get_reply tid %lld %p\n", tid, m);
1439
1440out:
1441 mutex_unlock(&osdc->request_mutex);
1442 return m;
1443}
1444
1445static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1446 struct ceph_msg_header *hdr,
1447 int *skip)
1448{
1449 struct ceph_osd *osd = con->private;
1450 int type = le16_to_cpu(hdr->type);
1451 int front = le32_to_cpu(hdr->front_len);
1452
1453 switch (type) {
1454 case CEPH_MSG_OSD_MAP:
1455 return ceph_msg_new(type, front, 0, 0, NULL);
1456 case CEPH_MSG_OSD_OPREPLY:
1457 return get_reply(con, hdr, skip);
1458 default:
1459 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
1460 osd->o_osd);
1461 *skip = 1;
1462 return NULL;
1463 }
1464}
1465
1466/*
1467 * Wrappers to refcount containing ceph_osd struct
1468 */
1469static struct ceph_connection *get_osd_con(struct ceph_connection *con)
1470{
1471 struct ceph_osd *osd = con->private;
1472 if (get_osd(osd))
1473 return con;
1474 return NULL;
1475}
1476
1477static void put_osd_con(struct ceph_connection *con)
1478{
1479 struct ceph_osd *osd = con->private;
1480 put_osd(osd);
1481}
1482
1483/*
1484 * authentication
1485 */
1486static int get_authorizer(struct ceph_connection *con,
1487 void **buf, int *len, int *proto,
1488 void **reply_buf, int *reply_len, int force_new)
1489{
1490 struct ceph_osd *o = con->private;
1491 struct ceph_osd_client *osdc = o->o_osdc;
1492 struct ceph_auth_client *ac = osdc->client->monc.auth;
1493 int ret = 0;
1494
1495 if (force_new && o->o_authorizer) {
1496 ac->ops->destroy_authorizer(ac, o->o_authorizer);
1497 o->o_authorizer = NULL;
1498 }
1499 if (o->o_authorizer == NULL) {
1500 ret = ac->ops->create_authorizer(
1501 ac, CEPH_ENTITY_TYPE_OSD,
1502 &o->o_authorizer,
1503 &o->o_authorizer_buf,
1504 &o->o_authorizer_buf_len,
1505 &o->o_authorizer_reply_buf,
1506 &o->o_authorizer_reply_buf_len);
1507 if (ret)
1508 return ret;
1509 }
1510
1511 *proto = ac->protocol;
1512 *buf = o->o_authorizer_buf;
1513 *len = o->o_authorizer_buf_len;
1514 *reply_buf = o->o_authorizer_reply_buf;
1515 *reply_len = o->o_authorizer_reply_buf_len;
1516 return 0;
1517}
1518
1519
1520static int verify_authorizer_reply(struct ceph_connection *con, int len)
1521{
1522 struct ceph_osd *o = con->private;
1523 struct ceph_osd_client *osdc = o->o_osdc;
1524 struct ceph_auth_client *ac = osdc->client->monc.auth;
1525
1526 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
1527}
1528
1529static int invalidate_authorizer(struct ceph_connection *con)
1530{
1531 struct ceph_osd *o = con->private;
1532 struct ceph_osd_client *osdc = o->o_osdc;
1533 struct ceph_auth_client *ac = osdc->client->monc.auth;
1534
1535 if (ac->ops->invalidate_authorizer)
1536 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
1537
1538 return ceph_monc_validate_auth(&osdc->client->monc);
1539}
1540
1541static const struct ceph_connection_operations osd_con_ops = {
1542 .get = get_osd_con,
1543 .put = put_osd_con,
1544 .dispatch = dispatch,
1545 .get_authorizer = get_authorizer,
1546 .verify_authorizer_reply = verify_authorizer_reply,
1547 .invalidate_authorizer = invalidate_authorizer,
1548 .alloc_msg = alloc_msg,
1549 .fault = osd_reset,
1550};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
new file mode 100644
index 000000000000..b0759911e7c3
--- /dev/null
+++ b/fs/ceph/osd_client.h
@@ -0,0 +1,166 @@
1#ifndef _FS_CEPH_OSD_CLIENT_H
2#define _FS_CEPH_OSD_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/mempool.h>
7#include <linux/rbtree.h>
8
9#include "types.h"
10#include "osdmap.h"
11#include "messenger.h"
12
13struct ceph_msg;
14struct ceph_snap_context;
15struct ceph_osd_request;
16struct ceph_osd_client;
17struct ceph_authorizer;
18
19/*
20 * completion callback for async writepages
21 */
22typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
23 struct ceph_msg *);
24
25/* a given osd we're communicating with */
26struct ceph_osd {
27 atomic_t o_ref;
28 struct ceph_osd_client *o_osdc;
29 int o_osd;
30 int o_incarnation;
31 struct rb_node o_node;
32 struct ceph_connection o_con;
33 struct list_head o_requests;
34 struct list_head o_osd_lru;
35 struct ceph_authorizer *o_authorizer;
36 void *o_authorizer_buf, *o_authorizer_reply_buf;
37 size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
38 unsigned long lru_ttl;
39 int o_marked_for_keepalive;
40 struct list_head o_keepalive_item;
41};
42
43/* an in-flight request */
44struct ceph_osd_request {
45 u64 r_tid; /* unique for this client */
46 struct rb_node r_node;
47 struct list_head r_req_lru_item;
48 struct list_head r_osd_item;
49 struct ceph_osd *r_osd;
50 struct ceph_pg r_pgid;
51
52 struct ceph_connection *r_con_filling_msg;
53
54 struct ceph_msg *r_request, *r_reply;
55 int r_result;
56 int r_flags; /* any additional flags for the osd */
57 u32 r_sent; /* >0 if r_request is sending/sent */
58 int r_got_reply;
59
60 struct ceph_osd_client *r_osdc;
61 struct kref r_kref;
62 bool r_mempool;
63 struct completion r_completion, r_safe_completion;
64 ceph_osdc_callback_t r_callback, r_safe_callback;
65 struct ceph_eversion r_reassert_version;
66 struct list_head r_unsafe_item;
67
68 struct inode *r_inode; /* for use by callbacks */
69 struct writeback_control *r_wbc; /* ditto */
70
71 char r_oid[40]; /* object name */
72 int r_oid_len;
73 unsigned long r_stamp; /* send OR check time */
74 bool r_resend; /* msg send failed, needs retry */
75
76 struct ceph_file_layout r_file_layout;
77 struct ceph_snap_context *r_snapc; /* snap context for writes */
78 unsigned r_num_pages; /* size of page array (follows) */
79 struct page **r_pages; /* pages for data payload */
80 int r_pages_from_pool;
81 int r_own_pages; /* if true, i own page list */
82};
83
84struct ceph_osd_client {
85 struct ceph_client *client;
86
87 struct ceph_osdmap *osdmap; /* current map */
88 struct rw_semaphore map_sem;
89 struct completion map_waiters;
90 u64 last_requested_map;
91
92 struct mutex request_mutex;
93 struct rb_root osds; /* osds */
94 struct list_head osd_lru; /* idle osds */
95 u64 timeout_tid; /* tid of timeout triggering rq */
96 u64 last_tid; /* tid of last request */
97 struct rb_root requests; /* pending requests */
98 struct list_head req_lru; /* pending requests lru */
99 int num_requests;
100 struct delayed_work timeout_work;
101 struct delayed_work osds_timeout_work;
102#ifdef CONFIG_DEBUG_FS
103 struct dentry *debugfs_file;
104#endif
105
106 mempool_t *req_mempool;
107
108 struct ceph_msgpool msgpool_op;
109 struct ceph_msgpool msgpool_op_reply;
110};
111
112extern int ceph_osdc_init(struct ceph_osd_client *osdc,
113 struct ceph_client *client);
114extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
115
116extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
117 struct ceph_msg *msg);
118extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
119 struct ceph_msg *msg);
120
121extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
122 struct ceph_file_layout *layout,
123 struct ceph_vino vino,
124 u64 offset, u64 *len, int op, int flags,
125 struct ceph_snap_context *snapc,
126 int do_sync, u32 truncate_seq,
127 u64 truncate_size,
128 struct timespec *mtime,
129 bool use_mempool, int num_reply);
130
131static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
132{
133 kref_get(&req->r_kref);
134}
135extern void ceph_osdc_release_request(struct kref *kref);
136static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
137{
138 kref_put(&req->r_kref, ceph_osdc_release_request);
139}
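/*
 * Editorial note on lifetime: requests are kref-counted.  Creation
 * returns one reference to the caller, register_request() takes a
 * second for the osdc rbtree, and __unregister_request() drops that
 * one again, so the usual pattern is:
 *
 *	req = ceph_osdc_new_request(...);
 *	rc = ceph_osdc_start_request(osdc, req, false);
 *	if (!rc)
 *		rc = ceph_osdc_wait_request(osdc, req);
 *	ceph_osdc_put_request(req);
 */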
140
141extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
142 struct ceph_osd_request *req,
143 bool nofail);
144extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
145 struct ceph_osd_request *req);
146extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
147
148extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
149 struct ceph_vino vino,
150 struct ceph_file_layout *layout,
151 u64 off, u64 *plen,
152 u32 truncate_seq, u64 truncate_size,
153 struct page **pages, int nr_pages);
154
155extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
156 struct ceph_vino vino,
157 struct ceph_file_layout *layout,
158 struct ceph_snap_context *sc,
159 u64 off, u64 len,
160 u32 truncate_seq, u64 truncate_size,
161 struct timespec *mtime,
162 struct page **pages, int nr_pages,
163 int flags, int do_sync, bool nofail);
164
165#endif
166
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
new file mode 100644
index 000000000000..d82fe87c2a6e
--- /dev/null
+++ b/fs/ceph/osdmap.c
@@ -0,0 +1,1022 @@
1
2#include <asm/div64.h>
3
4#include "super.h"
5#include "osdmap.h"
6#include "crush/hash.h"
7#include "crush/mapper.h"
8#include "decode.h"
9#include "ceph_debug.h"
10
11/*
12 * Build a human-readable osd state string.  Each case is formatted
13 * with a single snprintf() call: snprintf() must not be handed its
14 * own destination buffer as a source argument.
15 */
16char *ceph_osdmap_state_str(char *str, int len, int state)
17{
18 if (!len)
19 return str;
20
21 if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP))
22 snprintf(str, len, "exists, up");
23 else if (state & CEPH_OSD_EXISTS)
24 snprintf(str, len, "exists");
25 else if (state & CEPH_OSD_UP)
26 snprintf(str, len, "up");
27 else
28 snprintf(str, len, "doesn't exist");
29
30 return str;
31}
35
36/* maps */
37
38static int calc_bits_of(unsigned t)
39{
40 int b = 0;
41 while (t) {
42 t = t >> 1;
43 b++;
44 }
45 return b;
46}
47
48/*
49 * the foo_mask is the smallest value 2^n-1 that is >= foo-1.
50 */
51static void calc_pg_masks(struct ceph_pg_pool_info *pi)
52{
53 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
54 pi->pgp_num_mask =
55 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
56 pi->lpg_num_mask =
57 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
58 pi->lpgp_num_mask =
59 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
60}
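/*
 * Worked example (editorial): for pg_num = 6, calc_bits_of(6 - 1) = 3
 * and pg_num_mask = (1 << 3) - 1 = 7.  ceph_stable_mod() then folds a
 * placement hash into 0..5: values whose low three bits are < 6 keep
 * them, the rest fall back to the low two bits (mask >> 1), so no pg
 * id >= pg_num can be produced.
 */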
61
62/*
63 * decode crush map
64 */
65static int crush_decode_uniform_bucket(void **p, void *end,
66 struct crush_bucket_uniform *b)
67{
68 dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
69 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
70 b->item_weight = ceph_decode_32(p);
71 return 0;
72bad:
73 return -EINVAL;
74}
75
76static int crush_decode_list_bucket(void **p, void *end,
77 struct crush_bucket_list *b)
78{
79 int j;
80 dout("crush_decode_list_bucket %p to %p\n", *p, end);
81 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
82 if (b->item_weights == NULL)
83 return -ENOMEM;
84 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
85 if (b->sum_weights == NULL)
86 return -ENOMEM;
87 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
88 for (j = 0; j < b->h.size; j++) {
89 b->item_weights[j] = ceph_decode_32(p);
90 b->sum_weights[j] = ceph_decode_32(p);
91 }
92 return 0;
93bad:
94 return -EINVAL;
95}
96
97static int crush_decode_tree_bucket(void **p, void *end,
98 struct crush_bucket_tree *b)
99{
100 int j;
101 dout("crush_decode_tree_bucket %p to %p\n", *p, end);
102 ceph_decode_32_safe(p, end, b->num_nodes, bad);
103 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
104 if (b->node_weights == NULL)
105 return -ENOMEM;
106 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
107 for (j = 0; j < b->num_nodes; j++)
108 b->node_weights[j] = ceph_decode_32(p);
109 return 0;
110bad:
111 return -EINVAL;
112}
113
114static int crush_decode_straw_bucket(void **p, void *end,
115 struct crush_bucket_straw *b)
116{
117 int j;
118 dout("crush_decode_straw_bucket %p to %p\n", *p, end);
119 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
120 if (b->item_weights == NULL)
121 return -ENOMEM;
122 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
123 if (b->straws == NULL)
124 return -ENOMEM;
125 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
126 for (j = 0; j < b->h.size; j++) {
127 b->item_weights[j] = ceph_decode_32(p);
128 b->straws[j] = ceph_decode_32(p);
129 }
130 return 0;
131bad:
132 return -EINVAL;
133}
134
135static struct crush_map *crush_decode(void *pbyval, void *end)
136{
137 struct crush_map *c;
138 int err = -EINVAL;
139 int i, j;
140 void **p = &pbyval;
141 void *start = pbyval;
142 u32 magic;
143
144 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
145
146 c = kzalloc(sizeof(*c), GFP_NOFS);
147 if (c == NULL)
148 return ERR_PTR(-ENOMEM);
149
150 ceph_decode_need(p, end, 4*sizeof(u32), bad);
151 magic = ceph_decode_32(p);
152 if (magic != CRUSH_MAGIC) {
153 pr_err("crush_decode magic %x != current %x\n",
154 (unsigned)magic, (unsigned)CRUSH_MAGIC);
155 goto bad;
156 }
157 c->max_buckets = ceph_decode_32(p);
158 c->max_rules = ceph_decode_32(p);
159 c->max_devices = ceph_decode_32(p);
160
161 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
162 if (c->device_parents == NULL)
163 goto badmem;
164 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
165 if (c->bucket_parents == NULL)
166 goto badmem;
167
168 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
169 if (c->buckets == NULL)
170 goto badmem;
171 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
172 if (c->rules == NULL)
173 goto badmem;
174
175 /* buckets */
176 for (i = 0; i < c->max_buckets; i++) {
177 int size = 0;
178 u32 alg;
179 struct crush_bucket *b;
180
181 ceph_decode_32_safe(p, end, alg, bad);
182 if (alg == 0) {
183 c->buckets[i] = NULL;
184 continue;
185 }
186 dout("crush_decode bucket %d off %x %p to %p\n",
187 i, (int)(*p-start), *p, end);
188
189 switch (alg) {
190 case CRUSH_BUCKET_UNIFORM:
191 size = sizeof(struct crush_bucket_uniform);
192 break;
193 case CRUSH_BUCKET_LIST:
194 size = sizeof(struct crush_bucket_list);
195 break;
196 case CRUSH_BUCKET_TREE:
197 size = sizeof(struct crush_bucket_tree);
198 break;
199 case CRUSH_BUCKET_STRAW:
200 size = sizeof(struct crush_bucket_straw);
201 break;
202 default:
203 err = -EINVAL;
204 goto bad;
205 }
206 BUG_ON(size == 0);
207 b = c->buckets[i] = kzalloc(size, GFP_NOFS);
208 if (b == NULL)
209 goto badmem;
210
211 ceph_decode_need(p, end, 4*sizeof(u32), bad);
212 b->id = ceph_decode_32(p);
213 b->type = ceph_decode_16(p);
214 b->alg = ceph_decode_8(p);
215 b->hash = ceph_decode_8(p);
216 b->weight = ceph_decode_32(p);
217 b->size = ceph_decode_32(p);
218
219 dout("crush_decode bucket size %d off %x %p to %p\n",
220 b->size, (int)(*p-start), *p, end);
221
222 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
223 if (b->items == NULL)
224 goto badmem;
225 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
226 if (b->perm == NULL)
227 goto badmem;
228 b->perm_n = 0;
229
230 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
231 for (j = 0; j < b->size; j++)
232 b->items[j] = ceph_decode_32(p);
233
234 switch (b->alg) {
235 case CRUSH_BUCKET_UNIFORM:
236 err = crush_decode_uniform_bucket(p, end,
237 (struct crush_bucket_uniform *)b);
238 if (err < 0)
239 goto bad;
240 break;
241 case CRUSH_BUCKET_LIST:
242 err = crush_decode_list_bucket(p, end,
243 (struct crush_bucket_list *)b);
244 if (err < 0)
245 goto bad;
246 break;
247 case CRUSH_BUCKET_TREE:
248 err = crush_decode_tree_bucket(p, end,
249 (struct crush_bucket_tree *)b);
250 if (err < 0)
251 goto bad;
252 break;
253 case CRUSH_BUCKET_STRAW:
254 err = crush_decode_straw_bucket(p, end,
255 (struct crush_bucket_straw *)b);
256 if (err < 0)
257 goto bad;
258 break;
259 }
260 }
261
262 /* rules */
263 dout("rule vec is %p\n", c->rules);
264 for (i = 0; i < c->max_rules; i++) {
265 u32 yes;
266 struct crush_rule *r;
267
268 ceph_decode_32_safe(p, end, yes, bad);
269 if (!yes) {
270 dout("crush_decode NO rule %d off %x %p to %p\n",
271 i, (int)(*p-start), *p, end);
272 c->rules[i] = NULL;
273 continue;
274 }
275
276 dout("crush_decode rule %d off %x %p to %p\n",
277 i, (int)(*p-start), *p, end);
278
279 /* len */
280 ceph_decode_32_safe(p, end, yes, bad);
281#if BITS_PER_LONG == 32
282 err = -EINVAL;
283 if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
284 goto bad;
285#endif
286 r = c->rules[i] = kmalloc(sizeof(*r) +
287 yes*sizeof(struct crush_rule_step),
288 GFP_NOFS);
289 if (r == NULL)
290 goto badmem;
291 dout(" rule %d is at %p\n", i, r);
292 r->len = yes;
293 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
294 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
295 for (j = 0; j < r->len; j++) {
296 r->steps[j].op = ceph_decode_32(p);
297 r->steps[j].arg1 = ceph_decode_32(p);
298 r->steps[j].arg2 = ceph_decode_32(p);
299 }
300 }
301
302 /* ignore trailing name maps. */
303
304 dout("crush_decode success\n");
305 return c;
306
307badmem:
308 err = -ENOMEM;
309bad:
310 dout("crush_decode fail %d\n", err);
311 crush_destroy(c);
312 return ERR_PTR(err);
313}
314
315
316/*
317 * osd map
318 */
319void ceph_osdmap_destroy(struct ceph_osdmap *map)
320{
321 dout("osdmap_destroy %p\n", map);
322 if (map->crush)
323 crush_destroy(map->crush);
324 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
325 struct ceph_pg_mapping *pg =
326 rb_entry(rb_first(&map->pg_temp),
327 struct ceph_pg_mapping, node);
328 rb_erase(&pg->node, &map->pg_temp);
329 kfree(pg);
330 }
331 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
332 struct ceph_pg_pool_info *pi =
333 rb_entry(rb_first(&map->pg_pools),
334 struct ceph_pg_pool_info, node);
335 rb_erase(&pi->node, &map->pg_pools);
336 kfree(pi);
337 }
338 kfree(map->osd_state);
339 kfree(map->osd_weight);
340 kfree(map->osd_addr);
341 kfree(map);
342}
343
344/*
345 * adjust max osd value. reallocate arrays.
346 */
347static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
348{
349 u8 *state;
350 struct ceph_entity_addr *addr;
351 u32 *weight;
352
353 state = kcalloc(max, sizeof(*state), GFP_NOFS);
354 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
355 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
356 if (state == NULL || addr == NULL || weight == NULL) {
357 kfree(state);
358 kfree(addr);
359 kfree(weight);
360 return -ENOMEM;
361 }
362
363 /* copy old? */
364 if (map->osd_state) {
365 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
366 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
367 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
368 kfree(map->osd_state);
369 kfree(map->osd_addr);
370 kfree(map->osd_weight);
371 }
372
373 map->osd_state = state;
374 map->osd_weight = weight;
375 map->osd_addr = addr;
376 map->max_osd = max;
377 return 0;
378}
379
380/*
381 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
382 * to a set of osds)
383 */
384static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
385{
386 u64 a = *(u64 *)&l;
387 u64 b = *(u64 *)&r;
388
389 if (a < b)
390 return -1;
391 if (a > b)
392 return 1;
393 return 0;
394}
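/*
 * Editorial note: the casts above compare struct ceph_pg as one raw
 * 64-bit value, which relies on the struct being exactly 8 packed
 * bytes.  A defensive sketch, were one to assert that at build time:
 *
 *	BUILD_BUG_ON(sizeof(struct ceph_pg) != sizeof(u64));
 */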
395
396static int __insert_pg_mapping(struct ceph_pg_mapping *new,
397 struct rb_root *root)
398{
399 struct rb_node **p = &root->rb_node;
400 struct rb_node *parent = NULL;
401 struct ceph_pg_mapping *pg = NULL;
402 int c;
403
404 while (*p) {
405 parent = *p;
406 pg = rb_entry(parent, struct ceph_pg_mapping, node);
407 c = pgid_cmp(new->pgid, pg->pgid);
408 if (c < 0)
409 p = &(*p)->rb_left;
410 else if (c > 0)
411 p = &(*p)->rb_right;
412 else
413 return -EEXIST;
414 }
415
416 rb_link_node(&new->node, parent, p);
417 rb_insert_color(&new->node, root);
418 return 0;
419}
420
421static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
422 struct ceph_pg pgid)
423{
424 struct rb_node *n = root->rb_node;
425 struct ceph_pg_mapping *pg;
426 int c;
427
428 while (n) {
429 pg = rb_entry(n, struct ceph_pg_mapping, node);
430 c = pgid_cmp(pgid, pg->pgid);
431 if (c < 0)
432 n = n->rb_left;
433 else if (c > 0)
434 n = n->rb_right;
435 else
436 return pg;
437 }
438 return NULL;
439}
440
441/*
442 * rbtree of pg pool info
443 */
444static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
445{
446 struct rb_node **p = &root->rb_node;
447 struct rb_node *parent = NULL;
448 struct ceph_pg_pool_info *pi = NULL;
449
450 while (*p) {
451 parent = *p;
452 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
453 if (new->id < pi->id)
454 p = &(*p)->rb_left;
455 else if (new->id > pi->id)
456 p = &(*p)->rb_right;
457 else
458 return -EEXIST;
459 }
460
461 rb_link_node(&new->node, parent, p);
462 rb_insert_color(&new->node, root);
463 return 0;
464}
465
466static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
467{
468 struct ceph_pg_pool_info *pi;
469 struct rb_node *n = root->rb_node;
470
471 while (n) {
472 pi = rb_entry(n, struct ceph_pg_pool_info, node);
473 if (id < pi->id)
474 n = n->rb_left;
475 else if (id > pi->id)
476 n = n->rb_right;
477 else
478 return pi;
479 }
480 return NULL;
481}
482
483static void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
484{
485 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
486 calc_pg_masks(pi);
487 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
488 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
489}
490
491/*
492 * decode a full map.
493 */
494struct ceph_osdmap *osdmap_decode(void **p, void *end)
495{
496 struct ceph_osdmap *map;
497 u16 version;
498 u32 len, max, i;
499 u8 ev;
500 int err = -EINVAL;
501 void *start = *p;
502 struct ceph_pg_pool_info *pi;
503
504 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
505
506 map = kzalloc(sizeof(*map), GFP_NOFS);
507 if (map == NULL)
508 return ERR_PTR(-ENOMEM);
509 map->pg_temp = RB_ROOT;
510
511 ceph_decode_16_safe(p, end, version, bad);
512 if (version > CEPH_OSDMAP_VERSION) {
513 pr_warning("got unknown v %d > %d of osdmap\n", version,
514 CEPH_OSDMAP_VERSION);
515 goto bad;
516 }
517
518 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
519 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
520 map->epoch = ceph_decode_32(p);
521 ceph_decode_copy(p, &map->created, sizeof(map->created));
522 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
523
524 ceph_decode_32_safe(p, end, max, bad);
525 while (max--) {
526 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
527 pi = kmalloc(sizeof(*pi), GFP_NOFS);
528 if (!pi)
529 goto bad;
530 pi->id = ceph_decode_32(p);
531 ev = ceph_decode_8(p); /* encoding version */
532 if (ev > CEPH_PG_POOL_VERSION) {
533 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
534 ev, CEPH_PG_POOL_VERSION);
535 goto bad;
536 }
537 __decode_pool(p, pi);
538 __insert_pg_pool(&map->pg_pools, pi);
539 }
540 ceph_decode_32_safe(p, end, map->pool_max, bad);
541
542 ceph_decode_32_safe(p, end, map->flags, bad);
543
544 max = ceph_decode_32(p);
545
546 /* (re)alloc osd arrays */
547 err = osdmap_set_max_osd(map, max);
548 if (err < 0)
549 goto bad;
550 dout("osdmap_decode max_osd = %d\n", map->max_osd);
551
552 /* osds */
553 err = -EINVAL;
554 ceph_decode_need(p, end, 3*sizeof(u32) +
555 map->max_osd*(1 + sizeof(*map->osd_weight) +
556 sizeof(*map->osd_addr)), bad);
557 *p += 4; /* skip length field (should match max) */
558 ceph_decode_copy(p, map->osd_state, map->max_osd);
559
560 *p += 4; /* skip length field (should match max) */
561 for (i = 0; i < map->max_osd; i++)
562 map->osd_weight[i] = ceph_decode_32(p);
563
564 *p += 4; /* skip length field (should match max) */
565 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
566 for (i = 0; i < map->max_osd; i++)
567 ceph_decode_addr(&map->osd_addr[i]);
568
569 /* pg_temp */
570 ceph_decode_32_safe(p, end, len, bad);
571 for (i = 0; i < len; i++) {
572 int n, j;
573 struct ceph_pg pgid;
574 struct ceph_pg_mapping *pg;
575
576 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
577 ceph_decode_copy(p, &pgid, sizeof(pgid));
578 n = ceph_decode_32(p);
579 ceph_decode_need(p, end, n * sizeof(u32), bad);
580 err = -ENOMEM;
581 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
582 if (!pg)
583 goto bad;
584 pg->pgid = pgid;
585 pg->len = n;
586 for (j = 0; j < n; j++)
587 pg->osds[j] = ceph_decode_32(p);
588
589 err = __insert_pg_mapping(pg, &map->pg_temp);
590 if (err)
591 goto bad;
592 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, n);
593 }
594
595 /* crush */
596 ceph_decode_32_safe(p, end, len, bad);
597 dout("osdmap_decode crush len %d from off 0x%x\n", len,
598 (int)(*p - start));
599 ceph_decode_need(p, end, len, bad);
600 map->crush = crush_decode(*p, end);
601 *p += len;
602 if (IS_ERR(map->crush)) {
603 err = PTR_ERR(map->crush);
604 map->crush = NULL;
605 goto bad;
606 }
607
608 /* ignore the rest of the map */
609 *p = end;
610
611 dout("osdmap_decode done %p %p\n", *p, end);
612 return map;
613
614bad:
615 dout("osdmap_decode fail\n");
616 ceph_osdmap_destroy(map);
617 return ERR_PTR(err);
618}
619
620/*
621 * decode and apply an incremental map update.
622 */
623struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
624 struct ceph_osdmap *map,
625 struct ceph_messenger *msgr)
626{
627 struct crush_map *newcrush = NULL;
628 struct ceph_fsid fsid;
629 u32 epoch = 0;
630 struct ceph_timespec modified;
631 u32 len, pool;
632 __s32 new_pool_max, new_flags, max;
633 void *start = *p;
634 int err = -EINVAL;
635 u16 version;
636 struct rb_node *rbp;
637
638 ceph_decode_16_safe(p, end, version, bad);
639 if (version > CEPH_OSDMAP_INC_VERSION) {
640 pr_warning("got unknown v %d > %d of inc osdmap\n", version,
641 CEPH_OSDMAP_INC_VERSION);
642 goto bad;
643 }
644
645 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
646 bad);
647 ceph_decode_copy(p, &fsid, sizeof(fsid));
648 epoch = ceph_decode_32(p);
649 BUG_ON(epoch != map->epoch+1);
650 ceph_decode_copy(p, &modified, sizeof(modified));
651 new_pool_max = ceph_decode_32(p);
652 new_flags = ceph_decode_32(p);
653
654 /* full map? */
655 ceph_decode_32_safe(p, end, len, bad);
656 if (len > 0) {
657 dout("apply_incremental full map len %d, %p to %p\n",
658 len, *p, end);
659 return osdmap_decode(p, min(*p+len, end));
660 }
661
662 /* new crush? */
663 ceph_decode_32_safe(p, end, len, bad);
664 if (len > 0) {
665 dout("apply_incremental new crush map len %d, %p to %p\n",
666 len, *p, end);
667 newcrush = crush_decode(*p, min(*p+len, end));
668 if (IS_ERR(newcrush))
669 return ERR_CAST(newcrush);
670 }
671
672 /* new flags? */
673 if (new_flags >= 0)
674 map->flags = new_flags;
675 if (new_pool_max >= 0)
676 map->pool_max = new_pool_max;
677
678 ceph_decode_need(p, end, 5*sizeof(u32), bad);
679
680 /* new max? */
681 max = ceph_decode_32(p);
682 if (max >= 0) {
683 err = osdmap_set_max_osd(map, max);
684 if (err < 0)
685 goto bad;
686 }
687
688 map->epoch++;
689 map->modified = modified;
690 if (newcrush) {
691 if (map->crush)
692 crush_destroy(map->crush);
693 map->crush = newcrush;
694 newcrush = NULL;
695 }
696
697 /* new_pool */
698 ceph_decode_32_safe(p, end, len, bad);
699 while (len--) {
700 __u8 ev;
701 struct ceph_pg_pool_info *pi;
702
703 ceph_decode_32_safe(p, end, pool, bad);
704 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
705 ev = ceph_decode_8(p); /* encoding version */
706 if (ev > CEPH_PG_POOL_VERSION) {
707 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
708 ev, CEPH_PG_POOL_VERSION);
709 goto bad;
710 }
711 pi = __lookup_pg_pool(&map->pg_pools, pool);
712 if (!pi) {
713 pi = kmalloc(sizeof(*pi), GFP_NOFS);
714 if (!pi) {
715 err = -ENOMEM;
716 goto bad;
717 }
718 pi->id = pool;
719 __insert_pg_pool(&map->pg_pools, pi);
720 }
721 __decode_pool(p, pi);
722 }
723
724 /* old_pool */
725 ceph_decode_32_safe(p, end, len, bad);
726 while (len--) {
727 struct ceph_pg_pool_info *pi;
728
729 ceph_decode_32_safe(p, end, pool, bad);
730 pi = __lookup_pg_pool(&map->pg_pools, pool);
731 if (pi) {
732 rb_erase(&pi->node, &map->pg_pools);
733 kfree(pi);
734 }
735 }
736
737 /* new_up */
738 err = -EINVAL;
739 ceph_decode_32_safe(p, end, len, bad);
740 while (len--) {
741 u32 osd;
742 struct ceph_entity_addr addr;
743 ceph_decode_32_safe(p, end, osd, bad);
744 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
745 ceph_decode_addr(&addr);
746 pr_info("osd%d up\n", osd);
747 BUG_ON(osd >= map->max_osd);
748 map->osd_state[osd] |= CEPH_OSD_UP;
749 map->osd_addr[osd] = addr;
750 }
751
752 /* new_down */
753 ceph_decode_32_safe(p, end, len, bad);
754 while (len--) {
755 u32 osd;
756 ceph_decode_32_safe(p, end, osd, bad);
757 (*p)++; /* clean flag */
758 pr_info("osd%d down\n", osd);
759 if (osd < map->max_osd)
760 map->osd_state[osd] &= ~CEPH_OSD_UP;
761 }
762
763 /* new_weight */
764 ceph_decode_32_safe(p, end, len, bad);
765 while (len--) {
766 u32 osd, off;
767 ceph_decode_need(p, end, sizeof(u32)*2, bad);
768 osd = ceph_decode_32(p);
769 off = ceph_decode_32(p);
770 pr_info("osd%d weight 0x%x %s\n", osd, off,
771 off == CEPH_OSD_IN ? "(in)" :
772 (off == CEPH_OSD_OUT ? "(out)" : ""));
773 if (osd < map->max_osd)
774 map->osd_weight[osd] = off;
775 }
776
777 /* new_pg_temp */
778 rbp = rb_first(&map->pg_temp);
779 ceph_decode_32_safe(p, end, len, bad);
780 while (len--) {
781 struct ceph_pg_mapping *pg;
782 int j;
783 struct ceph_pg pgid;
784 u32 pglen;
785 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
786 ceph_decode_copy(p, &pgid, sizeof(pgid));
787 pglen = ceph_decode_32(p);
788
789 /* remove any? */
790 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
791 node)->pgid, pgid) <= 0) {
792 struct rb_node *cur = rbp;
793 rbp = rb_next(rbp);
794 dout(" removed pg_temp %llx\n",
795 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
796 node)->pgid);
797 rb_erase(cur, &map->pg_temp);
798 }
799
800 if (pglen) {
801 /* insert */
802 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
803 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
804 if (!pg) {
805 err = -ENOMEM;
806 goto bad;
807 }
808 pg->pgid = pgid;
809 pg->len = pglen;
810 for (j = 0; j < pglen; j++)
811 pg->osds[j] = ceph_decode_32(p);
812 err = __insert_pg_mapping(pg, &map->pg_temp);
813 if (err)
814 goto bad;
815 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
816 pglen);
817 }
818 }
819 while (rbp) {
820 struct rb_node *cur = rbp;
821 rbp = rb_next(rbp);
822 dout(" removed pg_temp %llx\n",
823 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
824 node)->pgid);
825 rb_erase(cur, &map->pg_temp);
826 }
827
828 /* ignore the rest */
829 *p = end;
830 return map;
831
832bad:
833 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
834 epoch, (int)(*p - start), *p, start, end);
835 print_hex_dump(KERN_DEBUG, "osdmap: ",
836 DUMP_PREFIX_OFFSET, 16, 1,
837 start, end - start, true);
838 if (newcrush)
839 crush_destroy(newcrush);
840 return ERR_PTR(err);
841}
842
843
844
845
846/*
847 * calculate file layout from given offset, length.
848 * fill in the object number, the logical length, and the extent
849 * offset and length within that object.
850 *
851 * for now, we write only a single su, until we can
852 * pass a stride back to the caller.
853 */
854void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
855 u64 off, u64 *plen,
856 u64 *ono,
857 u64 *oxoff, u64 *oxlen)
858{
859 u32 osize = le32_to_cpu(layout->fl_object_size);
860 u32 su = le32_to_cpu(layout->fl_stripe_unit);
861 u32 sc = le32_to_cpu(layout->fl_stripe_count);
862 u32 bl, stripeno, stripepos, objsetno;
863 u32 su_per_object;
864 u64 t, su_offset;
865
866 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
867 osize, su);
868 su_per_object = osize / su;
869 dout("osize %u / su %u = su_per_object %u\n", osize, su,
870 su_per_object);
871
872 BUG_ON((su & ~PAGE_MASK) != 0);
873 /* bl = *off / su; */
874 t = off;
875 do_div(t, su);
876 bl = t;
877 dout("off %llu / su %u = bl %u\n", off, su, bl);
878
879 stripeno = bl / sc;
880 stripepos = bl % sc;
881 objsetno = stripeno / su_per_object;
882
883 *ono = objsetno * sc + stripepos;
884 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
885
886 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
887 t = off;
888 su_offset = do_div(t, su);
889 *oxoff = su_offset + (stripeno % su_per_object) * su;
890
891 /*
892 * Calculate the length of the extent being written to the selected
893 * object. This is the minimum of the full length requested (plen) or
894 * the remainder of the current stripe being written to.
895 */
896 *oxlen = min_t(u64, *plen, su - su_offset);
897 *plen = *oxlen;
898
899 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
900}
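/*
 * Worked example (editorial, hypothetical layout): su = 64K, stripe
 * count sc = 2, object size 256K, so su_per_object = 4.  For
 * off = 448K and *plen = 100000:
 *
 *	bl = 448K / 64K = 7; stripeno = 3, stripepos = 1, objsetno = 0
 *	*ono   = 0 * 2 + 1 = 1
 *	*oxoff = 0 + (3 % 4) * 64K = 192K
 *	*oxlen = min(100000, 64K - 0) = 64K, and *plen is clamped to 64K
 */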
901
902/*
903 * calculate an object layout (i.e. pgid) from an oid,
904 * file_layout, and osdmap
905 */
906int ceph_calc_object_layout(struct ceph_object_layout *ol,
907 const char *oid,
908 struct ceph_file_layout *fl,
909 struct ceph_osdmap *osdmap)
910{
911 unsigned num, num_mask;
912 struct ceph_pg pgid;
913 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
914 int poolid = le32_to_cpu(fl->fl_pg_pool);
915 struct ceph_pg_pool_info *pool;
916 unsigned ps;
917
918 BUG_ON(!osdmap);
919
920 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
921 if (!pool)
922 return -EIO;
923 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
924 if (preferred >= 0) {
925 ps += preferred;
926 num = le32_to_cpu(pool->v.lpg_num);
927 num_mask = pool->lpg_num_mask;
928 } else {
929 num = le32_to_cpu(pool->v.pg_num);
930 num_mask = pool->pg_num_mask;
931 }
932
933 pgid.ps = cpu_to_le16(ps);
934 pgid.preferred = cpu_to_le16(preferred);
935 pgid.pool = fl->fl_pg_pool;
936 if (preferred >= 0)
937 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
938 (int)preferred);
939 else
940 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
941
942 ol->ol_pgid = pgid;
943 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
944 return 0;
945}
946
947/*
948 * Calculate raw osd vector for the given pgid. Return pointer to osd
949 * array, or NULL on failure.
950 */
951static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
952 int *osds, int *num)
953{
954 struct ceph_pg_mapping *pg;
955 struct ceph_pg_pool_info *pool;
956 int ruleno;
957 unsigned poolid, ps, pps;
958 int preferred;
959
960 /* pg_temp? */
961 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
962 if (pg) {
963 *num = pg->len;
964 return pg->osds;
965 }
966
967 /* crush */
968 poolid = le32_to_cpu(pgid.pool);
969 ps = le16_to_cpu(pgid.ps);
970 preferred = (s16)le16_to_cpu(pgid.preferred);
971
972 /* don't forcefeed bad device ids to crush */
973 if (preferred >= osdmap->max_osd ||
974 preferred >= osdmap->crush->max_devices)
975 preferred = -1;
976
977 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
978 if (!pool)
979 return NULL;
980 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
981 pool->v.type, pool->v.size);
982 if (ruleno < 0) {
983 pr_err("no crush rule pool %d type %d size %d\n",
984 poolid, pool->v.type, pool->v.size);
985 return NULL;
986 }
987
988 if (preferred >= 0)
989 pps = ceph_stable_mod(ps,
990 le32_to_cpu(pool->v.lpgp_num),
991 pool->lpgp_num_mask);
992 else
993 pps = ceph_stable_mod(ps,
994 le32_to_cpu(pool->v.pgp_num),
995 pool->pgp_num_mask);
996 pps += poolid;
997 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
998 min_t(int, pool->v.size, *num),
999 preferred, osdmap->osd_weight);
1000 return osds;
1001}
1002
1003/*
1004 * Return primary osd for given pgid, or -1 if none.
1005 */
1006int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1007{
1008 int rawosds[10], *osds;
1009 int i, num = ARRAY_SIZE(rawosds);
1010
1011 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1012 if (!osds)
1013 return -1;
1014
1015 /* primary is first up osd */
1016	for (i = 0; i < num; i++)
1017		if (ceph_osd_is_up(osdmap, osds[i]))
1018			return osds[i];
1021 return -1;
1022}
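Tying these helpers together, a hypothetical caller maps a file extent all the way to an OSD roughly as follows (the object-name format and variable names are illustrative; error handling omitted):

	ceph_calc_file_object_mapping(layout, off, &len, &ono, &oxoff, &oxlen);
	snprintf(oid, sizeof(oid), "%llx.%08llx", ino, ono);	/* object name */
	ceph_calc_object_layout(&ol, oid, layout, osdmap);	/* -> pgid */
	osd = ceph_calc_pg_primary(osdmap, ol.ol_pgid);		/* -> primary osd */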
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
new file mode 100644
index 000000000000..1fb55afb2642
--- /dev/null
+++ b/fs/ceph/osdmap.h
@@ -0,0 +1,125 @@
1#ifndef _FS_CEPH_OSDMAP_H
2#define _FS_CEPH_OSDMAP_H
3
4#include <linux/rbtree.h>
5#include "types.h"
6#include "ceph_fs.h"
7#include "crush/crush.h"
8
9/*
10 * The osd map describes the current membership of the osd cluster and
11 * specifies the mapping of objects to placement groups and placement
12 * groups to (sets of) osds. That is, it completely specifies the
13 * (desired) distribution of all data objects in the system at some
14 * point in time.
15 *
16 * Each map version is identified by an epoch, which increases monotonically.
17 *
18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map.
20 */
21struct ceph_pg_pool_info {
22 struct rb_node node;
23 int id;
24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26};
27
28struct ceph_pg_mapping {
29 struct rb_node node;
30 struct ceph_pg pgid;
31 int len;
32 int osds[];
33};
34
35struct ceph_osdmap {
36 struct ceph_fsid fsid;
37 u32 epoch;
38 u32 mkfs_epoch;
39 struct ceph_timespec created, modified;
40
41 u32 flags; /* CEPH_OSDMAP_* */
42
43 u32 max_osd; /* size of osd_state, _offload, _addr arrays */
44 u8 *osd_state; /* CEPH_OSD_* */
45 u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
46 struct ceph_entity_addr *osd_addr;
47
48 struct rb_root pg_temp;
49 struct rb_root pg_pools;
50 u32 pool_max;
51
52 /* the CRUSH map specifies the mapping of placement groups to
53 * the list of osds that store+replicate them. */
54 struct crush_map *crush;
55};
56
57/*
58 * file layout helpers
59 */
60#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
61#define ceph_file_layout_stripe_count(l) \
62 ((__s32)le32_to_cpu((l).fl_stripe_count))
63#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
64#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
65#define ceph_file_layout_object_su(l) \
66 ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
67#define ceph_file_layout_pg_preferred(l) \
68 ((__s32)le32_to_cpu((l).fl_pg_preferred))
69#define ceph_file_layout_pg_pool(l) \
70 ((__s32)le32_to_cpu((l).fl_pg_pool))
71
72static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
73{
74 return le32_to_cpu(l->fl_stripe_unit) *
75 le32_to_cpu(l->fl_stripe_count);
76}
77
78/* "period" == bytes before we start on a new set of objects */
79static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
80{
81 return le32_to_cpu(l->fl_object_size) *
82 le32_to_cpu(l->fl_stripe_count);
83}
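With hypothetical layout values these two helpers reduce to simple arithmetic:

	/*
	 * fl_stripe_unit = 64 KB, fl_stripe_count = 4, fl_object_size = 256 KB:
	 *
	 *   stripe_width = 64 KB * 4  = 256 KB   (bytes per full stripe)
	 *   period       = 256 KB * 4 = 1 MB     (bytes per object set)
	 */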
84
85
86static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
87{
88 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
89}
90
91static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
92{
93 return map && (map->flags & flag);
94}
95
96extern char *ceph_osdmap_state_str(char *str, int len, int state);
97
98static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
99 int osd)
100{
101 if (osd >= map->max_osd)
102 return NULL;
103 return &map->osd_addr[osd];
104}
105
106extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
107extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
108 struct ceph_osdmap *map,
109 struct ceph_messenger *msgr);
110extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
111
112/* calculate mapping of a file extent to an object */
113extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
114 u64 off, u64 *plen,
115 u64 *bno, u64 *oxoff, u64 *oxlen);
116
117/* calculate mapping of object to a placement group */
118extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
119 const char *oid,
120 struct ceph_file_layout *fl,
121 struct ceph_osdmap *osdmap);
122extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
123 struct ceph_pg pgid);
124
125#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
new file mode 100644
index 000000000000..370e93695474
--- /dev/null
+++ b/fs/ceph/pagelist.c
@@ -0,0 +1,54 @@
1
2#include <linux/pagemap.h>
3#include <linux/highmem.h>
4
5#include "pagelist.h"
6
7int ceph_pagelist_release(struct ceph_pagelist *pl)
8{
9 if (pl->mapped_tail)
10 kunmap(pl->mapped_tail);
11 while (!list_empty(&pl->head)) {
12 struct page *page = list_first_entry(&pl->head, struct page,
13 lru);
14 list_del(&page->lru);
15 __free_page(page);
16 }
17 return 0;
18}
19
20static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
21{
22 struct page *page = alloc_page(GFP_NOFS);
23 if (!page)
24 return -ENOMEM;
25 pl->room += PAGE_SIZE;
26 list_add_tail(&page->lru, &pl->head);
27	if (pl->mapped_tail)
28		kunmap(pl->mapped_tail);	/* unmap the old tail page */
29	pl->mapped_tail = kmap(page);		/* keep the new tail mapped */
30 return 0;
31}
32
33int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
34{
35	while (pl->room < len) {
36		size_t bit = pl->room;	/* space left in the current page */
37		int ret;
38
39		memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
40		       buf, bit);	/* fill the rest of the current page */
41 pl->length += bit;
42 pl->room -= bit;
43 buf += bit;
44 len -= bit;
45 ret = ceph_pagelist_addpage(pl);
46 if (ret)
47 return ret;
48 }
49
50 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
51 pl->length += len;
52 pl->room -= len;
53 return 0;
54}
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
new file mode 100644
index 000000000000..e8a4187e1087
--- /dev/null
+++ b/fs/ceph/pagelist.h
@@ -0,0 +1,54 @@
1#ifndef __FS_CEPH_PAGELIST_H
2#define __FS_CEPH_PAGELIST_H
3
4#include <linux/list.h>
5
6struct ceph_pagelist {
7 struct list_head head;
8 void *mapped_tail;
9 size_t length;
10 size_t room;
11};
12
13static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
14{
15 INIT_LIST_HEAD(&pl->head);
16 pl->mapped_tail = NULL;
17 pl->length = 0;
18 pl->room = 0;
19}
20extern int ceph_pagelist_release(struct ceph_pagelist *pl);
21
22extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
23
24static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
25{
26 __le64 ev = cpu_to_le64(v);
27 return ceph_pagelist_append(pl, &ev, sizeof(ev));
28}
29static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
30{
31 __le32 ev = cpu_to_le32(v);
32 return ceph_pagelist_append(pl, &ev, sizeof(ev));
33}
34static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
35{
36 __le16 ev = cpu_to_le16(v);
37 return ceph_pagelist_append(pl, &ev, sizeof(ev));
38}
39static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
40{
41 return ceph_pagelist_append(pl, &v, 1);
42}
43static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
44 char *s, size_t len)
45{
46 int ret = ceph_pagelist_encode_32(pl, len);
47 if (ret)
48 return ret;
49 if (len)
50 return ceph_pagelist_append(pl, s, len);
51 return 0;
52}
53
54#endif
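A minimal usage sketch of this API (hypothetical caller; the values encoded are arbitrary): initialize on the stack, append little-endian fields in wire order, then release the pages when done.

	struct ceph_pagelist pl;
	int err;

	ceph_pagelist_init(&pl);
	err = ceph_pagelist_encode_32(&pl, 1);			/* e.g. a version field */
	if (!err)
		err = ceph_pagelist_encode_string(&pl, "foo", 3);
	/* ... on success, hand pl.head's pages to an outgoing message ... */
	ceph_pagelist_release(&pl);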
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
new file mode 100644
index 000000000000..26ac8b89a676
--- /dev/null
+++ b/fs/ceph/rados.h
@@ -0,0 +1,374 @@
1#ifndef __RADOS_H
2#define __RADOS_H
3
4/*
5 * Data types for the Ceph distributed object storage layer RADOS
6 * (Reliable Autonomic Distributed Object Store).
7 */
8
9#include "msgr.h"
10
11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 4
15#define CEPH_OSDMAP_VERSION 4
16
17/*
18 * fs id
19 */
20struct ceph_fsid {
21 unsigned char fsid[16];
22};
23
24static inline int ceph_fsid_compare(const struct ceph_fsid *a,
25 const struct ceph_fsid *b)
26{
27 return memcmp(a, b, sizeof(*a));
28}
29
30/*
31 * ino, object, etc.
32 */
33typedef __le64 ceph_snapid_t;
34#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
35#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
36#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
37
38struct ceph_timespec {
39 __le32 tv_sec;
40 __le32 tv_nsec;
41} __attribute__ ((packed));
42
43
44/*
45 * object layout - how objects are mapped into PGs
46 */
47#define CEPH_OBJECT_LAYOUT_HASH 1
48#define CEPH_OBJECT_LAYOUT_LINEAR 2
49#define CEPH_OBJECT_LAYOUT_HASHINO 3
50
51/*
52 * pg layout -- how PGs are mapped onto (sets of) OSDs
53 */
54#define CEPH_PG_LAYOUT_CRUSH 0
55#define CEPH_PG_LAYOUT_HASH 1
56#define CEPH_PG_LAYOUT_LINEAR 2
57#define CEPH_PG_LAYOUT_HYBRID 3
58
59
60/*
61 * placement group.
62 * we encode this into one __le64.
63 */
64struct ceph_pg {
65 __le16 preferred; /* preferred primary osd */
66 __le16 ps; /* placement seed */
67 __le32 pool; /* object pool */
68} __attribute__ ((packed));
69
70/*
71 * pg_pool is a set of pgs storing a pool of objects
72 *
73 * pg_num -- base number of pseudorandomly placed pgs
74 *
75 * pgp_num -- effective number when calculating pg placement. this
76 * is used for pg_num increases. new pgs result in data being "split"
77 * into new pgs. for this to proceed smoothly, new pgs are initially
78 * colocated with their parents; that is, pgp_num doesn't increase
79 * until the new pgs have successfully split. only _then_ are the new
80 * pgs placed independently.
81 *
82 * lpg_num -- localized pg count (per device). replicas are randomly
83 * selected.
84 *
85 * lpgp_num -- as above.
86 */
87#define CEPH_PG_TYPE_REP 1
88#define CEPH_PG_TYPE_RAID4 2
89#define CEPH_PG_POOL_VERSION 2
90struct ceph_pg_pool {
91 __u8 type; /* CEPH_PG_TYPE_* */
92 __u8 size; /* number of osds in each pg */
93 __u8 crush_ruleset; /* crush placement rule */
94 __u8 object_hash; /* hash mapping object name to ps */
95 __le32 pg_num, pgp_num; /* number of pg's */
96 __le32 lpg_num, lpgp_num; /* number of localized pg's */
97 __le32 last_change; /* most recent epoch changed */
98 __le64 snap_seq; /* seq for per-pool snapshot */
99 __le32 snap_epoch; /* epoch of last snap */
100 __le32 num_snaps;
101 __le32 num_removed_snap_intervals;
102 __le64 uid;
103} __attribute__ ((packed));
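To make the pgp_num mechanism concrete (hypothetical numbers, using ceph_stable_mod() defined below):

	/*
	 * pg_num = 12 (mask 15), pgp_num still 8 (mask 7) mid-split:
	 * a pg with ps = 9 places via stable_mod(9, 8, 7) = 1, i.e. it
	 * stays colocated with the pg whose ps = 1 until pgp_num is
	 * raised to 12 and the new pgs are placed independently.
	 */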
104
105/*
106 * stable_mod func is used to control number of placement groups.
107 * similar to straight-up modulo, but produces a stable mapping as b
108 * increases over time. b is the number of bins, and bmask is the
109 * containing power of 2 minus 1.
110 *
111 * b <= bmask and bmask=(2**n)-1
112 * e.g., b=12 -> bmask=15, b=123 -> bmask=127
113 */
114static inline int ceph_stable_mod(int x, int b, int bmask)
115{
116 if ((x & bmask) < b)
117 return x & bmask;
118 else
119 return x & (bmask >> 1);
120}
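A few concrete values (a sketch, not from the source) show the folding: with b = 12 bins and bmask = 15, inputs whose low four bits land below 12 keep them, and the rest fold into the low eight bins, so placements stay stable as b grows toward bmask.

	/* ceph_stable_mod(27, 12, 15) == 11    27 & 15 = 11, already < 12 */
	/* ceph_stable_mod(29, 12, 15) ==  5    29 & 15 = 13 >= 12 -> 29 & 7 */
	/* ceph_stable_mod(29, 16, 31) == 13    29 & 31 = 29 >= 16 -> 29 & 15 */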
121
122/*
123 * object layout - how a given object should be stored.
124 */
125struct ceph_object_layout {
126 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
127 __le32 ol_stripe_unit; /* for per-object parity, if any */
128} __attribute__ ((packed));
129
130/*
131 * compound epoch+version, used by storage layer to serialize mutations
132 */
133struct ceph_eversion {
134 __le32 epoch;
135 __le64 version;
136} __attribute__ ((packed));
137
138/*
139 * osd map bits
140 */
141
142/* status bits */
143#define CEPH_OSD_EXISTS 1
144#define CEPH_OSD_UP 2
145
146/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
147#define CEPH_OSD_IN 0x10000
148#define CEPH_OSD_OUT 0
149
150
151/*
152 * osd map flag bits
153 */
154#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
155#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
156#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
157#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
158#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
159
160/*
161 * osd ops
162 */
163#define CEPH_OSD_OP_MODE 0xf000
164#define CEPH_OSD_OP_MODE_RD 0x1000
165#define CEPH_OSD_OP_MODE_WR 0x2000
166#define CEPH_OSD_OP_MODE_RMW 0x3000
167#define CEPH_OSD_OP_MODE_SUB 0x4000
168
169#define CEPH_OSD_OP_TYPE 0x0f00
170#define CEPH_OSD_OP_TYPE_LOCK 0x0100
171#define CEPH_OSD_OP_TYPE_DATA 0x0200
172#define CEPH_OSD_OP_TYPE_ATTR 0x0300
173#define CEPH_OSD_OP_TYPE_EXEC 0x0400
174#define CEPH_OSD_OP_TYPE_PG 0x0500
175
176enum {
177 /** data **/
178 /* read */
179 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
180 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
181
182 /* fancy read */
183 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
184
185 /* write */
186 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
187 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
188 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
189 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
190 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
191
192 /* fancy write */
193 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
194 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
195 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
196 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
197
198 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
199 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
200 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
201
202 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
203
204 /** attrs **/
205 /* read */
206 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
207 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
208
209 /* write */
210 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
211 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
212 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
213 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
214
215 /** subop **/
216 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
217 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
218 CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
219 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
220 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
221
222 /** lock **/
223 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
224 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
225 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
226 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
227 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
228 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
229
230 /** exec **/
231 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
232
233 /** pg **/
234 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
235};
236
237static inline int ceph_osd_op_type_lock(int op)
238{
239 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
240}
241static inline int ceph_osd_op_type_data(int op)
242{
243 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
244}
245static inline int ceph_osd_op_type_attr(int op)
246{
247 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
248}
249static inline int ceph_osd_op_type_exec(int op)
250{
251 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
252}
253static inline int ceph_osd_op_type_pg(int op)
254{
255 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
256}
257
258static inline int ceph_osd_op_mode_subop(int op)
259{
260 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
261}
262static inline int ceph_osd_op_mode_read(int op)
263{
264 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
265}
266static inline int ceph_osd_op_mode_modify(int op)
267{
268 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
269}
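For example, expanding one value from the enum above against these predicates:

	/*
	 * CEPH_OSD_OP_WRITE = 0x2000 | 0x0200 | 1 = 0x2201, so
	 *   ceph_osd_op_type_data(CEPH_OSD_OP_WRITE)   is true (0x2201 & 0x0f00 == 0x0200)
	 *   ceph_osd_op_mode_modify(CEPH_OSD_OP_WRITE) is true (0x2201 & 0xf000 == 0x2000)
	 */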
270
271#define CEPH_OSD_TMAP_HDR 'h'
272#define CEPH_OSD_TMAP_SET 's'
273#define CEPH_OSD_TMAP_RM 'r'
274
275extern const char *ceph_osd_op_name(int op);
276
277
278/*
279 * osd op flags
280 *
281 * An op may be READ, WRITE, or READ|WRITE.
282 */
283enum {
284 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
285 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
286 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
287 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
288 CEPH_OSD_FLAG_READ = 16, /* op may read */
289 CEPH_OSD_FLAG_WRITE = 32, /* op may write */
290 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
291 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
292 CEPH_OSD_FLAG_BALANCE_READS = 256,
293 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
294 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
295 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
296};
297
298enum {
299 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
300};
301
302#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
303#define EBLACKLISTED ESHUTDOWN /* blacklisted */
304
305/*
306 * an individual object operation. each may be accompanied by some data
307 * payload
308 */
309struct ceph_osd_op {
310 __le16 op; /* CEPH_OSD_OP_* */
311 __le32 flags; /* CEPH_OSD_FLAG_* */
312 union {
313 struct {
314 __le64 offset, length;
315 __le64 truncate_size;
316 __le32 truncate_seq;
317 } __attribute__ ((packed)) extent;
318 struct {
319 __le32 name_len;
320 __le32 value_len;
321 } __attribute__ ((packed)) xattr;
322 struct {
323 __u8 class_len;
324 __u8 method_len;
325 __u8 argc;
326 __le32 indata_len;
327 } __attribute__ ((packed)) cls;
328 struct {
329 __le64 cookie, count;
330 } __attribute__ ((packed)) pgls;
331 };
332 __le32 payload_len;
333} __attribute__ ((packed));
334
335/*
336 * osd request message header. each request may include multiple
337 * ceph_osd_op object operations.
338 */
339struct ceph_osd_request_head {
340 __le32 client_inc; /* client incarnation */
341 struct ceph_object_layout layout; /* pgid */
342 __le32 osdmap_epoch; /* client's osdmap epoch */
343
344 __le32 flags;
345
346 struct ceph_timespec mtime; /* for mutations only */
347 struct ceph_eversion reassert_version; /* if we are replaying op */
348
349 __le32 object_len; /* length of object name */
350
351 __le64 snapid; /* snapid to read */
352 __le64 snap_seq; /* writer's snap context */
353 __le32 num_snaps;
354
355 __le16 num_ops;
356 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
357} __attribute__ ((packed));
358
359struct ceph_osd_reply_head {
360 __le32 client_inc; /* client incarnation */
361 __le32 flags;
362 struct ceph_object_layout layout;
363 __le32 osdmap_epoch;
364 struct ceph_eversion reassert_version; /* for replaying uncommitted */
365
366 __le32 result; /* result code */
367
368 __le32 object_len; /* length of object name */
369 __le32 num_ops;
370	struct ceph_osd_op ops[]; /* ops[], object */
371} __attribute__ ((packed));
372
373
374#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
new file mode 100644
index 000000000000..df04e210a055
--- /dev/null
+++ b/fs/ceph/snap.c
@@ -0,0 +1,906 @@
1#include "ceph_debug.h"
2
3#include <linux/sort.h>
4
5#include "super.h"
6#include "decode.h"
7
8/*
9 * Snapshots in ceph are driven in large part by cooperation from the
10 * client. In contrast to local file systems or file servers that
11 * implement snapshots at a single point in the system, ceph's
12 * distributed access to storage requires clients to help decide
13 * whether a write logically occurs before or after a recently created
14 * snapshot.
15 *
16 * This provides a perfect instantaneous client-wide snapshot. Between
17 * clients, however, snapshots may appear to be applied at slightly
18 * different points in time, depending on delays in delivering the
19 * snapshot notification.
20 *
21 * Snapshots are _not_ file system-wide. Instead, each snapshot
22 * applies to the subdirectory nested beneath some directory. This
23 * effectively divides the hierarchy into multiple "realms," where all
24 * of the files contained by each realm share the same set of
25 * snapshots. An individual realm's snap set contains snapshots
26 * explicitly created on that realm, as well as any snaps in its
27 * parent's snap set _after_ the point at which the parent became its
28 * parent (due to, say, a rename). Similarly, snaps from prior parents
29 * are included for the intervals during which they were the parent.
30 *
31 * The client is spared most of this detail, fortunately... it need only
32 * maintain a hierarchy of realms reflecting the current parent/child
33 * realm relationship, and for each realm has an explicit list of snaps
34 * inherited from prior parents.
35 *
36 * A snap_realm struct is maintained for realms containing every inode
37 * with an open cap in the system. (The needed snap realm information is
38 * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
39 * version number is used to ensure that as realm parameters change (new
40 * snapshot, new parent, etc.) the client's realm hierarchy is updated.
41 *
42 * The realm hierarchy drives the generation of a 'snap context' for each
43 * realm, which simply lists the resulting set of snaps for the realm. This
44 * is attached to any writes sent to OSDs.
45 */
46/*
47 * Unfortunately error handling is a bit mixed here. If we get a snap
48 * update, but don't have enough memory to update our realm hierarchy,
49 * it's not clear what we can do about it (besides complaining to the
50 * console).
51 */
52
53
54/*
55 * increase ref count for the realm
56 *
57 * caller must hold snap_rwsem for write.
58 */
59void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
60 struct ceph_snap_realm *realm)
61{
62 dout("get_realm %p %d -> %d\n", realm,
63 atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
64 /*
65 * since we _only_ increment realm refs or empty the empty
66 * list with snap_rwsem held, adjusting the empty list here is
67 * safe. we do need to protect against concurrent empty list
68 * additions, however.
69 */
70 if (atomic_read(&realm->nref) == 0) {
71 spin_lock(&mdsc->snap_empty_lock);
72 list_del_init(&realm->empty_item);
73 spin_unlock(&mdsc->snap_empty_lock);
74 }
75
76 atomic_inc(&realm->nref);
77}
78
79static void __insert_snap_realm(struct rb_root *root,
80 struct ceph_snap_realm *new)
81{
82 struct rb_node **p = &root->rb_node;
83 struct rb_node *parent = NULL;
84 struct ceph_snap_realm *r = NULL;
85
86 while (*p) {
87 parent = *p;
88 r = rb_entry(parent, struct ceph_snap_realm, node);
89 if (new->ino < r->ino)
90 p = &(*p)->rb_left;
91 else if (new->ino > r->ino)
92 p = &(*p)->rb_right;
93 else
94 BUG();
95 }
96
97 rb_link_node(&new->node, parent, p);
98 rb_insert_color(&new->node, root);
99}
100
101/*
102 * create and get the realm rooted at @ino and bump its ref count.
103 *
104 * caller must hold snap_rwsem for write.
105 */
106static struct ceph_snap_realm *ceph_create_snap_realm(
107 struct ceph_mds_client *mdsc,
108 u64 ino)
109{
110 struct ceph_snap_realm *realm;
111
112 realm = kzalloc(sizeof(*realm), GFP_NOFS);
113 if (!realm)
114 return ERR_PTR(-ENOMEM);
115
116 atomic_set(&realm->nref, 0); /* tree does not take a ref */
117 realm->ino = ino;
118 INIT_LIST_HEAD(&realm->children);
119 INIT_LIST_HEAD(&realm->child_item);
120 INIT_LIST_HEAD(&realm->empty_item);
121 INIT_LIST_HEAD(&realm->inodes_with_caps);
122 spin_lock_init(&realm->inodes_with_caps_lock);
123 __insert_snap_realm(&mdsc->snap_realms, realm);
124 dout("create_snap_realm %llx %p\n", realm->ino, realm);
125 return realm;
126}
127
128/*
129 * lookup the realm rooted at @ino.
130 *
131 * caller must hold snap_rwsem for write.
132 */
133struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
134 u64 ino)
135{
136 struct rb_node *n = mdsc->snap_realms.rb_node;
137 struct ceph_snap_realm *r;
138
139 while (n) {
140 r = rb_entry(n, struct ceph_snap_realm, node);
141 if (ino < r->ino)
142 n = n->rb_left;
143 else if (ino > r->ino)
144 n = n->rb_right;
145 else {
146 dout("lookup_snap_realm %llx %p\n", r->ino, r);
147 return r;
148 }
149 }
150 return NULL;
151}
152
153static void __put_snap_realm(struct ceph_mds_client *mdsc,
154 struct ceph_snap_realm *realm);
155
156/*
157 * called with snap_rwsem (write)
158 */
159static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
160 struct ceph_snap_realm *realm)
161{
162 dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
163
164 rb_erase(&realm->node, &mdsc->snap_realms);
165
166 if (realm->parent) {
167 list_del_init(&realm->child_item);
168 __put_snap_realm(mdsc, realm->parent);
169 }
170
171 kfree(realm->prior_parent_snaps);
172 kfree(realm->snaps);
173 ceph_put_snap_context(realm->cached_context);
174 kfree(realm);
175}
176
177/*
178 * caller holds snap_rwsem (write)
179 */
180static void __put_snap_realm(struct ceph_mds_client *mdsc,
181 struct ceph_snap_realm *realm)
182{
183 dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
184 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
185 if (atomic_dec_and_test(&realm->nref))
186 __destroy_snap_realm(mdsc, realm);
187}
188
189/*
190 * caller needn't hold any locks
191 */
192void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
193 struct ceph_snap_realm *realm)
194{
195 dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
196 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
197 if (!atomic_dec_and_test(&realm->nref))
198 return;
199
200 if (down_write_trylock(&mdsc->snap_rwsem)) {
201 __destroy_snap_realm(mdsc, realm);
202 up_write(&mdsc->snap_rwsem);
203 } else {
204 spin_lock(&mdsc->snap_empty_lock);
205 list_add(&mdsc->snap_empty, &realm->empty_item);
206 spin_unlock(&mdsc->snap_empty_lock);
207 }
208}
209
210/*
211 * Clean up any realms whose ref counts have dropped to zero. Note
212 * that this does not include realms that were created but not yet
213 * used.
214 *
215 * Called under snap_rwsem (write)
216 */
217static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
218{
219 struct ceph_snap_realm *realm;
220
221 spin_lock(&mdsc->snap_empty_lock);
222 while (!list_empty(&mdsc->snap_empty)) {
223 realm = list_first_entry(&mdsc->snap_empty,
224 struct ceph_snap_realm, empty_item);
225 list_del(&realm->empty_item);
226 spin_unlock(&mdsc->snap_empty_lock);
227 __destroy_snap_realm(mdsc, realm);
228 spin_lock(&mdsc->snap_empty_lock);
229 }
230 spin_unlock(&mdsc->snap_empty_lock);
231}
232
233void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
234{
235 down_write(&mdsc->snap_rwsem);
236 __cleanup_empty_realms(mdsc);
237 up_write(&mdsc->snap_rwsem);
238}
239
240/*
241 * adjust the parent realm of a given @realm. adjust the child list,
242 * parent pointers, and ref counts appropriately.
243 *
244 * return 1 if the parent was changed, 0 if unchanged, <0 on error.
245 *
246 * caller must hold snap_rwsem for write.
247 */
248static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
249 struct ceph_snap_realm *realm,
250 u64 parentino)
251{
252 struct ceph_snap_realm *parent;
253
254 if (realm->parent_ino == parentino)
255 return 0;
256
257 parent = ceph_lookup_snap_realm(mdsc, parentino);
258 if (!parent) {
259 parent = ceph_create_snap_realm(mdsc, parentino);
260 if (IS_ERR(parent))
261 return PTR_ERR(parent);
262 }
263 dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
264 realm->ino, realm, realm->parent_ino, realm->parent,
265 parentino, parent);
266 if (realm->parent) {
267 list_del_init(&realm->child_item);
268 ceph_put_snap_realm(mdsc, realm->parent);
269 }
270 realm->parent_ino = parentino;
271 realm->parent = parent;
272 ceph_get_snap_realm(mdsc, parent);
273 list_add(&realm->child_item, &parent->children);
274 return 1;
275}
276
277
278static int cmpu64_rev(const void *a, const void *b)
279{
280 if (*(u64 *)a < *(u64 *)b)
281 return 1;
282 if (*(u64 *)a > *(u64 *)b)
283 return -1;
284 return 0;
285}
286
287/*
288 * build the snap context for a given realm.
289 */
290static int build_snap_context(struct ceph_snap_realm *realm)
291{
292 struct ceph_snap_realm *parent = realm->parent;
293 struct ceph_snap_context *snapc;
294 int err = 0;
295 int i;
296 int num = realm->num_prior_parent_snaps + realm->num_snaps;
297
298 /*
299 * build parent context, if it hasn't been built.
300 * conservatively estimate that all parent snaps might be
301 * included by us.
302 */
303 if (parent) {
304 if (!parent->cached_context) {
305 err = build_snap_context(parent);
306 if (err)
307 goto fail;
308 }
309 num += parent->cached_context->num_snaps;
310 }
311
312	/* do i actually need to update? not if my context seq
313	   matches the realm seq, and my parent's does too. (this works
314	   because rebuild_snap_realms() works _downward_ in the
315	   hierarchy after each update.) */
316 if (realm->cached_context &&
317 realm->cached_context->seq == realm->seq &&
318 (!parent ||
319 realm->cached_context->seq >= parent->cached_context->seq)) {
320 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
321 " (unchanged)\n",
322 realm->ino, realm, realm->cached_context,
323 realm->cached_context->seq,
324 realm->cached_context->num_snaps);
325 return 0;
326 }
327
328 /* alloc new snap context */
329 err = -ENOMEM;
330 if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
331 goto fail;
332 snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
333 if (!snapc)
334 goto fail;
335 atomic_set(&snapc->nref, 1);
336
337 /* build (reverse sorted) snap vector */
338 num = 0;
339 snapc->seq = realm->seq;
340 if (parent) {
341		/* include any of parent's snaps occurring _after_ my
342		   parent became my parent */
343 for (i = 0; i < parent->cached_context->num_snaps; i++)
344 if (parent->cached_context->snaps[i] >=
345 realm->parent_since)
346 snapc->snaps[num++] =
347 parent->cached_context->snaps[i];
348 if (parent->cached_context->seq > snapc->seq)
349 snapc->seq = parent->cached_context->seq;
350 }
351 memcpy(snapc->snaps + num, realm->snaps,
352 sizeof(u64)*realm->num_snaps);
353 num += realm->num_snaps;
354 memcpy(snapc->snaps + num, realm->prior_parent_snaps,
355 sizeof(u64)*realm->num_prior_parent_snaps);
356 num += realm->num_prior_parent_snaps;
357
358 sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
359 snapc->num_snaps = num;
360 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
361 realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
362
363 if (realm->cached_context)
364 ceph_put_snap_context(realm->cached_context);
365 realm->cached_context = snapc;
366 return 0;
367
368fail:
369 /*
370 * if we fail, clear old (incorrect) cached_context... hopefully
371 * we'll have better luck building it later
372 */
373 if (realm->cached_context) {
374 ceph_put_snap_context(realm->cached_context);
375 realm->cached_context = NULL;
376 }
377 pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
378 realm, err);
379 return err;
380}
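A hypothetical merge illustrates the result of the routine above: parent snaps {8, 5, 2} with parent_since = 4, own snaps {7, 3}, prior-parent snaps {1}.

	/*
	 * the parent contributes only {8, 5} (snaps >= parent_since = 4);
	 * merged with own {7, 3} and prior-parent {1}, the reverse sort
	 * yields the realm's snap context: {8, 7, 5, 3, 1}.
	 */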
381
382/*
383 * rebuild snap context for the given realm and all of its children.
384 */
385static void rebuild_snap_realms(struct ceph_snap_realm *realm)
386{
387 struct ceph_snap_realm *child;
388
389 dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
390 build_snap_context(realm);
391
392 list_for_each_entry(child, &realm->children, child_item)
393 rebuild_snap_realms(child);
394}
395
396
397/*
398 * helper to allocate and decode an array of snapids. free prior
399 * instance, if any.
400 */
401static int dup_array(u64 **dst, __le64 *src, int num)
402{
403 int i;
404
405 kfree(*dst);
406 if (num) {
407 *dst = kcalloc(num, sizeof(u64), GFP_NOFS);
408 if (!*dst)
409 return -ENOMEM;
410 for (i = 0; i < num; i++)
411 (*dst)[i] = get_unaligned_le64(src + i);
412 } else {
413 *dst = NULL;
414 }
415 return 0;
416}
417
418
419/*
420 * When a snapshot is applied, the size/mtime inode metadata is queued
421 * in a ceph_cap_snap (one for each snapshot) until writeback
422 * completes and the metadata can be flushed back to the MDS.
423 *
424 * However, if a (sync) write is currently in-progress when we apply
425 * the snapshot, we have to wait until the write succeeds or fails
426 * (and a final size/mtime is known). In this case we set
427 * cap_snap->writing = 1, and the cap_snap is said to be "pending."
428 * When the write finishes, we call __ceph_finish_cap_snap().
429 *
430 * Caller must hold snap_rwsem for read (i.e., the realm topology won't
431 * change).
432 */
433void ceph_queue_cap_snap(struct ceph_inode_info *ci,
434 struct ceph_snap_context *snapc)
435{
436 struct inode *inode = &ci->vfs_inode;
437 struct ceph_cap_snap *capsnap;
438 int used;
439
440 capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
441 if (!capsnap) {
442 pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
443 return;
444 }
445
446 spin_lock(&inode->i_lock);
447 used = __ceph_caps_used(ci);
448 if (__ceph_have_pending_cap_snap(ci)) {
449 /* there is no point in queuing multiple "pending" cap_snaps,
450 as no new writes are allowed to start when pending, so any
451 writes in progress now were started before the previous
452 cap_snap. lucky us. */
453 dout("queue_cap_snap %p snapc %p seq %llu used %d"
454 " already pending\n", inode, snapc, snapc->seq, used);
455 kfree(capsnap);
456 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
457 igrab(inode);
458
459 atomic_set(&capsnap->nref, 1);
460 capsnap->ci = ci;
461 INIT_LIST_HEAD(&capsnap->ci_item);
462 INIT_LIST_HEAD(&capsnap->flushing_item);
463
464 capsnap->follows = snapc->seq - 1;
465 capsnap->context = ceph_get_snap_context(snapc);
466 capsnap->issued = __ceph_caps_issued(ci, NULL);
467 capsnap->dirty = __ceph_caps_dirty(ci);
468
469 capsnap->mode = inode->i_mode;
470 capsnap->uid = inode->i_uid;
471 capsnap->gid = inode->i_gid;
472
473 /* fixme? */
474 capsnap->xattr_blob = NULL;
475 capsnap->xattr_len = 0;
476
477		/* dirty page count moved from _head to this cap_snap;
478		   all subsequent page dirties occur _after_ this
479		   snapshot. */
480 capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
481 ci->i_wrbuffer_ref_head = 0;
482 ceph_put_snap_context(ci->i_head_snapc);
483 ci->i_head_snapc = NULL;
484 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
485
486 if (used & CEPH_CAP_FILE_WR) {
487 dout("queue_cap_snap %p cap_snap %p snapc %p"
488 " seq %llu used WR, now pending\n", inode,
489 capsnap, snapc, snapc->seq);
490 capsnap->writing = 1;
491 } else {
492 /* note mtime, size NOW. */
493 __ceph_finish_cap_snap(ci, capsnap);
494 }
495 } else {
496 dout("queue_cap_snap %p nothing dirty|writing\n", inode);
497 kfree(capsnap);
498 }
499
500 spin_unlock(&inode->i_lock);
501}
502
503/*
504 * Finalize the size, mtime for a cap_snap; that is, settle on final values
505 * to be used for the snapshot, to be flushed back to the mds.
506 *
507 * If capsnap can now be flushed, add to snap_flush list, and return 1.
508 *
509 * Caller must hold i_lock.
510 */
511int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
512 struct ceph_cap_snap *capsnap)
513{
514 struct inode *inode = &ci->vfs_inode;
515 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
516
517 BUG_ON(capsnap->writing);
518 capsnap->size = inode->i_size;
519 capsnap->mtime = inode->i_mtime;
520 capsnap->atime = inode->i_atime;
521 capsnap->ctime = inode->i_ctime;
522 capsnap->time_warp_seq = ci->i_time_warp_seq;
523 if (capsnap->dirty_pages) {
524 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu "
525 "still has %d dirty pages\n", inode, capsnap,
526 capsnap->context, capsnap->context->seq,
527 capsnap->size, capsnap->dirty_pages);
528 return 0;
529 }
530 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n",
531 inode, capsnap, capsnap->context,
532 capsnap->context->seq, capsnap->size);
533
534 spin_lock(&mdsc->snap_flush_lock);
535 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
536 spin_unlock(&mdsc->snap_flush_lock);
537 return 1; /* caller may want to ceph_flush_snaps */
538}
539
540
541/*
542 * Parse and apply a snapblob "snap trace" from the MDS. This specifies
543 * the snap realm parameters from a given realm and all of its ancestors,
544 * up to the root.
545 *
546 * Caller must hold snap_rwsem for write.
547 */
548int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
549 void *p, void *e, bool deletion)
550{
551 struct ceph_mds_snap_realm *ri; /* encoded */
552 __le64 *snaps; /* encoded */
553 __le64 *prior_parent_snaps; /* encoded */
554 struct ceph_snap_realm *realm;
555 int invalidate = 0;
556 int err = -ENOMEM;
557
558 dout("update_snap_trace deletion=%d\n", deletion);
559more:
560 ceph_decode_need(&p, e, sizeof(*ri), bad);
561 ri = p;
562 p += sizeof(*ri);
563 ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
564 le32_to_cpu(ri->num_prior_parent_snaps)), bad);
565 snaps = p;
566 p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
567 prior_parent_snaps = p;
568 p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
569
570 realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
571 if (!realm) {
572 realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
573 if (IS_ERR(realm)) {
574 err = PTR_ERR(realm);
575 goto fail;
576 }
577 }
578
579 if (le64_to_cpu(ri->seq) > realm->seq) {
580 dout("update_snap_trace updating %llx %p %lld -> %lld\n",
581 realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
582 /*
583 * if the realm seq has changed, queue a cap_snap for every
584 * inode with open caps. we do this _before_ we update
585 * the realm info so that we prepare for writeback under the
586 * _previous_ snap context.
587 *
588 * ...unless it's a snap deletion!
589 */
590 if (!deletion) {
591 struct ceph_inode_info *ci;
592 struct inode *lastinode = NULL;
593
594 spin_lock(&realm->inodes_with_caps_lock);
595 list_for_each_entry(ci, &realm->inodes_with_caps,
596 i_snap_realm_item) {
597 struct inode *inode = igrab(&ci->vfs_inode);
598 if (!inode)
599 continue;
600 spin_unlock(&realm->inodes_with_caps_lock);
601 if (lastinode)
602 iput(lastinode);
603 lastinode = inode;
604 ceph_queue_cap_snap(ci, realm->cached_context);
605 spin_lock(&realm->inodes_with_caps_lock);
606 }
607 spin_unlock(&realm->inodes_with_caps_lock);
608 if (lastinode)
609 iput(lastinode);
610 dout("update_snap_trace cap_snaps queued\n");
611 }
612
613 } else {
614 dout("update_snap_trace %llx %p seq %lld unchanged\n",
615 realm->ino, realm, realm->seq);
616 }
617
618 /* ensure the parent is correct */
619 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
620 if (err < 0)
621 goto fail;
622 invalidate += err;
623
624 if (le64_to_cpu(ri->seq) > realm->seq) {
625 /* update realm parameters, snap lists */
626 realm->seq = le64_to_cpu(ri->seq);
627 realm->created = le64_to_cpu(ri->created);
628 realm->parent_since = le64_to_cpu(ri->parent_since);
629
630 realm->num_snaps = le32_to_cpu(ri->num_snaps);
631 err = dup_array(&realm->snaps, snaps, realm->num_snaps);
632 if (err < 0)
633 goto fail;
634
635 realm->num_prior_parent_snaps =
636 le32_to_cpu(ri->num_prior_parent_snaps);
637 err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
638 realm->num_prior_parent_snaps);
639 if (err < 0)
640 goto fail;
641
642 invalidate = 1;
643 } else if (!realm->cached_context) {
644 invalidate = 1;
645 }
646
647 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
648 realm, invalidate, p, e);
649
650 if (p < e)
651 goto more;
652
653 /* invalidate when we reach the _end_ (root) of the trace */
654 if (invalidate)
655 rebuild_snap_realms(realm);
656
657 __cleanup_empty_realms(mdsc);
658 return 0;
659
660bad:
661 err = -EINVAL;
662fail:
663 pr_err("update_snap_trace error %d\n", err);
664 return err;
665}
666
667
668/*
669 * Send any cap_snaps that are queued for flush. Try to carry
670 * s_mutex across multiple snap flushes to avoid locking overhead.
671 *
672 * Caller holds no locks.
673 */
674static void flush_snaps(struct ceph_mds_client *mdsc)
675{
676 struct ceph_inode_info *ci;
677 struct inode *inode;
678 struct ceph_mds_session *session = NULL;
679
680 dout("flush_snaps\n");
681 spin_lock(&mdsc->snap_flush_lock);
682 while (!list_empty(&mdsc->snap_flush_list)) {
683 ci = list_first_entry(&mdsc->snap_flush_list,
684 struct ceph_inode_info, i_snap_flush_item);
685 inode = &ci->vfs_inode;
686 igrab(inode);
687 spin_unlock(&mdsc->snap_flush_lock);
688 spin_lock(&inode->i_lock);
689 __ceph_flush_snaps(ci, &session);
690 spin_unlock(&inode->i_lock);
691 iput(inode);
692 spin_lock(&mdsc->snap_flush_lock);
693 }
694 spin_unlock(&mdsc->snap_flush_lock);
695
696 if (session) {
697 mutex_unlock(&session->s_mutex);
698 ceph_put_mds_session(session);
699 }
700 dout("flush_snaps done\n");
701}
702
703
704/*
705 * Handle a snap notification from the MDS.
706 *
707 * This can take two basic forms: the simplest is just a snap creation
708 * or deletion notification on an existing realm. This should update the
709 * realm and its children.
710 *
711 * The more difficult case is realm creation, due to snap creation at a
712 * new point in the file hierarchy, or due to a rename that moves a file or
713 * directory into another realm.
714 */
715void ceph_handle_snap(struct ceph_mds_client *mdsc,
716 struct ceph_mds_session *session,
717 struct ceph_msg *msg)
718{
719 struct super_block *sb = mdsc->client->sb;
720 int mds = session->s_mds;
721 u64 split;
722 int op;
723 int trace_len;
724 struct ceph_snap_realm *realm = NULL;
725 void *p = msg->front.iov_base;
726 void *e = p + msg->front.iov_len;
727 struct ceph_mds_snap_head *h;
728 int num_split_inos, num_split_realms;
729 __le64 *split_inos = NULL, *split_realms = NULL;
730 int i;
731 int locked_rwsem = 0;
732
733 /* decode */
734 if (msg->front.iov_len < sizeof(*h))
735 goto bad;
736 h = p;
737 op = le32_to_cpu(h->op);
738 split = le64_to_cpu(h->split); /* non-zero if we are splitting an
739 * existing realm */
740 num_split_inos = le32_to_cpu(h->num_split_inos);
741 num_split_realms = le32_to_cpu(h->num_split_realms);
742 trace_len = le32_to_cpu(h->trace_len);
743 p += sizeof(*h);
744
745 dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
746 ceph_snap_op_name(op), split, trace_len);
747
748 mutex_lock(&session->s_mutex);
749 session->s_seq++;
750 mutex_unlock(&session->s_mutex);
751
752 down_write(&mdsc->snap_rwsem);
753 locked_rwsem = 1;
754
755 if (op == CEPH_SNAP_OP_SPLIT) {
756 struct ceph_mds_snap_realm *ri;
757
758 /*
759 * A "split" breaks part of an existing realm off into
760 * a new realm. The MDS provides a list of inodes
761 * (with caps) and child realms that belong to the new
762 * child.
763 */
764 split_inos = p;
765 p += sizeof(u64) * num_split_inos;
766 split_realms = p;
767 p += sizeof(u64) * num_split_realms;
768 ceph_decode_need(&p, e, sizeof(*ri), bad);
769 /* we will peek at realm info here, but will _not_
770 * advance p, as the realm update will occur below in
771 * ceph_update_snap_trace. */
772 ri = p;
773
774 realm = ceph_lookup_snap_realm(mdsc, split);
775 if (!realm) {
776 realm = ceph_create_snap_realm(mdsc, split);
777 if (IS_ERR(realm))
778 goto out;
779 }
780 ceph_get_snap_realm(mdsc, realm);
781
782 dout("splitting snap_realm %llx %p\n", realm->ino, realm);
783 for (i = 0; i < num_split_inos; i++) {
784 struct ceph_vino vino = {
785 .ino = le64_to_cpu(split_inos[i]),
786 .snap = CEPH_NOSNAP,
787 };
788 struct inode *inode = ceph_find_inode(sb, vino);
789 struct ceph_inode_info *ci;
790
791 if (!inode)
792 continue;
793 ci = ceph_inode(inode);
794
795 spin_lock(&inode->i_lock);
796 if (!ci->i_snap_realm)
797 goto skip_inode;
798 /*
799 * If this inode belongs to a realm that was
800 * created after our new realm, we experienced
801			 * a race (due to another split notification
802			 * arriving from a different MDS). So skip
803 * this inode.
804 */
805 if (ci->i_snap_realm->created >
806 le64_to_cpu(ri->created)) {
807 dout(" leaving %p in newer realm %llx %p\n",
808 inode, ci->i_snap_realm->ino,
809 ci->i_snap_realm);
810 goto skip_inode;
811 }
812 dout(" will move %p to split realm %llx %p\n",
813 inode, realm->ino, realm);
814 /*
815 * Remove the inode from the realm's inode
816 * list, but don't add it to the new realm
817 * yet. We don't want the cap_snap to be
818 * queued (again) by ceph_update_snap_trace()
819 * below. Queue it _now_, under the old context.
820 */
821 spin_lock(&realm->inodes_with_caps_lock);
822 list_del_init(&ci->i_snap_realm_item);
823 spin_unlock(&realm->inodes_with_caps_lock);
824 spin_unlock(&inode->i_lock);
825
826 ceph_queue_cap_snap(ci,
827 ci->i_snap_realm->cached_context);
828
829 iput(inode);
830 continue;
831
832skip_inode:
833 spin_unlock(&inode->i_lock);
834 iput(inode);
835 }
836
837 /* we may have taken some of the old realm's children. */
838 for (i = 0; i < num_split_realms; i++) {
839 struct ceph_snap_realm *child =
840 ceph_lookup_snap_realm(mdsc,
841 le64_to_cpu(split_realms[i]));
842 if (!child)
843 continue;
844 adjust_snap_realm_parent(mdsc, child, realm->ino);
845 }
846 }
847
848 /*
849 * update using the provided snap trace. if we are deleting a
850 * snap, we can avoid queueing cap_snaps.
851 */
852 ceph_update_snap_trace(mdsc, p, e,
853 op == CEPH_SNAP_OP_DESTROY);
854
855 if (op == CEPH_SNAP_OP_SPLIT) {
856 /*
857 * ok, _now_ add the inodes into the new realm.
858 */
859 for (i = 0; i < num_split_inos; i++) {
860 struct ceph_vino vino = {
861 .ino = le64_to_cpu(split_inos[i]),
862 .snap = CEPH_NOSNAP,
863 };
864 struct inode *inode = ceph_find_inode(sb, vino);
865 struct ceph_inode_info *ci;
866
867 if (!inode)
868 continue;
869 ci = ceph_inode(inode);
870 spin_lock(&inode->i_lock);
871 if (!ci->i_snap_realm)
872 goto split_skip_inode;
873 ceph_put_snap_realm(mdsc, ci->i_snap_realm);
874 spin_lock(&realm->inodes_with_caps_lock);
875 list_add(&ci->i_snap_realm_item,
876 &realm->inodes_with_caps);
877 ci->i_snap_realm = realm;
878 spin_unlock(&realm->inodes_with_caps_lock);
879 ceph_get_snap_realm(mdsc, realm);
880split_skip_inode:
881 spin_unlock(&inode->i_lock);
882 iput(inode);
883 }
884
885 /* we took a reference when we created the realm, above */
886 ceph_put_snap_realm(mdsc, realm);
887 }
888
889 __cleanup_empty_realms(mdsc);
890
891 up_write(&mdsc->snap_rwsem);
892
893 flush_snaps(mdsc);
894 return;
895
896bad:
897 pr_err("corrupt snap message from mds%d\n", mds);
898 ceph_msg_dump(msg);
899out:
900 if (locked_rwsem)
901 up_write(&mdsc->snap_rwsem);
902 return;
903}
904
905
906
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
new file mode 100644
index 000000000000..4290a6e860b0
--- /dev/null
+++ b/fs/ceph/super.c
@@ -0,0 +1,1030 @@
1
2#include "ceph_debug.h"
3
4#include <linux/backing-dev.h>
5#include <linux/fs.h>
6#include <linux/inet.h>
7#include <linux/in6.h>
8#include <linux/module.h>
9#include <linux/mount.h>
10#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h>
13#include <linux/seq_file.h>
14#include <linux/statfs.h>
15#include <linux/string.h>
16#include <linux/version.h>
17#include <linux/vmalloc.h>
18
19#include "decode.h"
20#include "super.h"
21#include "mon_client.h"
22#include "auth.h"
23
24/*
25 * Ceph superblock operations
26 *
27 * Handle the basics of mounting, unmounting.
28 */
29
30
31/*
32 * find filename portion of a path (/foo/bar/baz -> baz)
33 */
34const char *ceph_file_part(const char *s, int len)
35{
36 const char *e = s + len;
37
38 while (e != s && *(e-1) != '/')
39 e--;
40 return e;
41}
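A quick illustration (hypothetical call):

	/* ceph_file_part("/foo/bar/baz", 12) returns a pointer to "baz" */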
42
43
44/*
45 * super ops
46 */
47static void ceph_put_super(struct super_block *s)
48{
49 struct ceph_client *cl = ceph_client(s);
50
51 dout("put_super\n");
52 ceph_mdsc_close_sessions(&cl->mdsc);
53 return;
54}
55
56static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
57{
58 struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
59 struct ceph_monmap *monmap = client->monc.monmap;
60 struct ceph_statfs st;
61 u64 fsid;
62 int err;
63
64 dout("statfs\n");
65 err = ceph_monc_do_statfs(&client->monc, &st);
66 if (err < 0)
67 return err;
68
69 /* fill in kstatfs */
70 buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
71
72 /*
73 * express utilization in terms of large blocks to avoid
74 * overflow on 32-bit machines.
75 */
76 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
77 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
78 buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
79 (CEPH_BLOCK_SHIFT-10);
80 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
81
82 buf->f_files = le64_to_cpu(st.num_objects);
83 buf->f_ffree = -1;
84 buf->f_namelen = PATH_MAX;
85 buf->f_frsize = PAGE_CACHE_SIZE;
86
87 /* leave fsid little-endian, regardless of host endianness */
88 fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
89 buf->f_fsid.val[0] = fsid & 0xffffffff;
90 buf->f_fsid.val[1] = fsid >> 32;
91
92 return 0;
93}
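To see why the large-block scaling matters, consider a hypothetical 16 TB file system on a 32-bit host (this assumes CEPH_BLOCK_SHIFT is 22, i.e. 4 MB blocks):

	/*
	 * st.kb = 2^34 (16 TB in KB) would overflow a 32-bit f_blocks;
	 * 2^34 >> (22 - 10) = 2^22 four-megabyte blocks fits easily.
	 */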
94
95
96static int ceph_syncfs(struct super_block *sb, int wait)
97{
98 dout("sync_fs %d\n", wait);
99 ceph_osdc_sync(&ceph_client(sb)->osdc);
100 ceph_mdsc_sync(&ceph_client(sb)->mdsc);
101 dout("sync_fs %d done\n", wait);
102 return 0;
103}
104
105
106/**
107 * ceph_show_options - Show mount options in /proc/mounts
108 * @m: seq_file to write to
109 * @mnt: mount descriptor
110 */
111static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
112{
113 struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
114 struct ceph_mount_args *args = client->mount_args;
115
116 if (args->flags & CEPH_OPT_FSID)
117		seq_printf(m, ",fsidmajor=%llu,fsidminor=%llu",
118 le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
119 le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
120 if (args->flags & CEPH_OPT_NOSHARE)
121 seq_puts(m, ",noshare");
122 if (args->flags & CEPH_OPT_DIRSTAT)
123 seq_puts(m, ",dirstat");
124 if ((args->flags & CEPH_OPT_RBYTES) == 0)
125 seq_puts(m, ",norbytes");
126 if (args->flags & CEPH_OPT_NOCRC)
127 seq_puts(m, ",nocrc");
128 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
129 seq_puts(m, ",noasyncreaddir");
130 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
131 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
132 if (args->name)
133 seq_printf(m, ",name=%s", args->name);
134 if (args->secret)
135 seq_puts(m, ",secret=<hidden>");
136 return 0;
137}
138
139/*
140 * caches
141 */
142struct kmem_cache *ceph_inode_cachep;
143struct kmem_cache *ceph_cap_cachep;
144struct kmem_cache *ceph_dentry_cachep;
145struct kmem_cache *ceph_file_cachep;
146
147static void ceph_inode_init_once(void *foo)
148{
149 struct ceph_inode_info *ci = foo;
150 inode_init_once(&ci->vfs_inode);
151}
152
153static int default_congestion_kb(void)
154{
155 int congestion_kb;
156
157 /*
158 * Copied from NFS
159 *
160 * congestion size, scale with available memory.
161 *
162 * 64MB: 8192k
163 * 128MB: 11585k
164 * 256MB: 16384k
165 * 512MB: 23170k
166 * 1GB: 32768k
167 * 2GB: 46340k
168 * 4GB: 65536k
169 * 8GB: 92681k
170 * 16GB: 131072k
171 *
172 * This allows larger machines to have larger/more transfers.
173 * Limit the default to 256M
174 */
175 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
176 if (congestion_kb > 256*1024)
177 congestion_kb = 256*1024;
178
179 return congestion_kb;
180}
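A quick sanity check of the scaling against the table (assuming 4 KB pages, so PAGE_SHIFT = 12):

	/*
	 * 1 GB of RAM -> totalram_pages = 262144;
	 * int_sqrt(262144) = 512, and (16 * 512) << (12 - 10) = 32768k,
	 * matching the 1GB row above.
	 */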
181
182static int __init init_caches(void)
183{
184 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
185 sizeof(struct ceph_inode_info),
186 __alignof__(struct ceph_inode_info),
187 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
188 ceph_inode_init_once);
189 if (ceph_inode_cachep == NULL)
190 return -ENOMEM;
191
192 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
193 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
194 if (ceph_cap_cachep == NULL)
195 goto bad_cap;
196
197 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
198 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
199 if (ceph_dentry_cachep == NULL)
200 goto bad_dentry;
201
202 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
203 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
204 if (ceph_file_cachep == NULL)
205 goto bad_file;
206
207 return 0;
208
209bad_file:
210 kmem_cache_destroy(ceph_dentry_cachep);
211bad_dentry:
212 kmem_cache_destroy(ceph_cap_cachep);
213bad_cap:
214 kmem_cache_destroy(ceph_inode_cachep);
215 return -ENOMEM;
216}
217
218static void destroy_caches(void)
219{
220 kmem_cache_destroy(ceph_inode_cachep);
221 kmem_cache_destroy(ceph_cap_cachep);
222 kmem_cache_destroy(ceph_dentry_cachep);
223 kmem_cache_destroy(ceph_file_cachep);
224}
225
226
227/*
228 * ceph_umount_begin - initiate forced umount. Tear down down the
229 * mount, skipping steps that may hang while waiting for server(s).
230 */
231static void ceph_umount_begin(struct super_block *sb)
232{
233 struct ceph_client *client = ceph_sb_to_client(sb);
234
235 dout("ceph_umount_begin - starting forced umount\n");
236 if (!client)
237 return;
238 client->mount_state = CEPH_MOUNT_SHUTDOWN;
239 return;
240}
241
242static const struct super_operations ceph_super_ops = {
243 .alloc_inode = ceph_alloc_inode,
244 .destroy_inode = ceph_destroy_inode,
245 .write_inode = ceph_write_inode,
246 .sync_fs = ceph_syncfs,
247 .put_super = ceph_put_super,
248 .show_options = ceph_show_options,
249 .statfs = ceph_statfs,
250 .umount_begin = ceph_umount_begin,
251};
252
253
254const char *ceph_msg_type_name(int type)
255{
256 switch (type) {
257 case CEPH_MSG_SHUTDOWN: return "shutdown";
258 case CEPH_MSG_PING: return "ping";
259 case CEPH_MSG_AUTH: return "auth";
260 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
261 case CEPH_MSG_MON_MAP: return "mon_map";
262 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
263 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
264 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
265 case CEPH_MSG_STATFS: return "statfs";
266 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
267 case CEPH_MSG_MDS_MAP: return "mds_map";
268 case CEPH_MSG_CLIENT_SESSION: return "client_session";
269 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
270 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
271 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
272 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
273 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
274 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
275 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
276 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
277 case CEPH_MSG_OSD_MAP: return "osd_map";
278 case CEPH_MSG_OSD_OP: return "osd_op";
279 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
280 default: return "unknown";
281 }
282}
283
284
285/*
286 * mount options
287 */
288enum {
289 Opt_fsidmajor,
290 Opt_fsidminor,
291 Opt_monport,
292 Opt_wsize,
293 Opt_rsize,
294 Opt_osdtimeout,
295 Opt_osdkeepalivetimeout,
296 Opt_mount_timeout,
297 Opt_osd_idle_ttl,
298 Opt_caps_wanted_delay_min,
299 Opt_caps_wanted_delay_max,
300 Opt_readdir_max_entries,
301 Opt_congestion_kb,
302 Opt_last_int,
303 /* int args above */
304 Opt_snapdirname,
305 Opt_name,
306 Opt_secret,
307 Opt_last_string,
308 /* string args above */
309 Opt_ip,
310 Opt_noshare,
311 Opt_dirstat,
312 Opt_nodirstat,
313 Opt_rbytes,
314 Opt_norbytes,
315 Opt_nocrc,
316 Opt_noasyncreaddir,
317};
318
319static match_table_t arg_tokens = {
320 {Opt_fsidmajor, "fsidmajor=%ld"},
321 {Opt_fsidminor, "fsidminor=%ld"},
322 {Opt_monport, "monport=%d"},
323 {Opt_wsize, "wsize=%d"},
324 {Opt_rsize, "rsize=%d"},
325 {Opt_osdtimeout, "osdtimeout=%d"},
326 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
327 {Opt_mount_timeout, "mount_timeout=%d"},
328 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
329 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
330 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
331 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
332 {Opt_congestion_kb, "write_congestion_kb=%d"},
333 /* int args above */
334 {Opt_snapdirname, "snapdirname=%s"},
335 {Opt_name, "name=%s"},
336 {Opt_secret, "secret=%s"},
337 /* string args above */
338 {Opt_ip, "ip=%s"},
339 {Opt_noshare, "noshare"},
340 {Opt_dirstat, "dirstat"},
341 {Opt_nodirstat, "nodirstat"},
342 {Opt_rbytes, "rbytes"},
343 {Opt_norbytes, "norbytes"},
344 {Opt_nocrc, "nocrc"},
345 {Opt_noasyncreaddir, "noasyncreaddir"},
346 {-1, NULL}
347};
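/*
 * Illustrative invocation (hypothetical addresses and key) combining the
 * token kinds above: integer tokens take "=%d" arguments, string tokens
 * take "=%s", and the remaining tokens are bare flags:
 *
 *   mount -t ceph 10.0.0.1:6789,10.0.0.2:/some/dir /mnt/ceph \
 *         -o name=admin,secret=AQATSKdN...,rsize=524288,nocrc
 */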
348
349
350static struct ceph_mount_args *parse_mount_args(int flags, char *options,
351 const char *dev_name,
352 const char **path)
353{
354 struct ceph_mount_args *args;
355 const char *c;
356 int err = -ENOMEM;
357 substring_t argstr[MAX_OPT_ARGS];
358
359 args = kzalloc(sizeof(*args), GFP_KERNEL);
360 if (!args)
361 return ERR_PTR(-ENOMEM);
362 args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
363 GFP_KERNEL);
364 if (!args->mon_addr)
365 goto out;
366
367 dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
368
369 /* start with defaults */
370 args->sb_flags = flags;
371 args->flags = CEPH_OPT_DEFAULT;
372 args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
373 args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
374 args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
375 args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
376 args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
377 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
378 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
379 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
380 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
381 args->max_readdir = 1024;
382 args->congestion_kb = default_congestion_kb();
383
384 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
385 err = -EINVAL;
386 if (!dev_name)
387 goto out;
388 *path = strstr(dev_name, ":/");
389 if (*path == NULL) {
390 pr_err("device name is missing path (no :/ in %s)\n",
391 dev_name);
392 goto out;
393 }
394
395 /* get mon ip(s) */
396 err = ceph_parse_ips(dev_name, *path, args->mon_addr,
397 CEPH_MAX_MON, &args->num_mon);
398 if (err < 0)
399 goto out;
400
401 /* path on server */
402 *path += 2;
403 dout("server path '%s'\n", *path);
404
405 /* parse mount options */
406 while ((c = strsep(&options, ",")) != NULL) {
407 int token, intval, ret;
408 if (!*c)
409 continue;
410 err = -EINVAL;
411 token = match_token((char *)c, arg_tokens, argstr);
412 if (token < 0) {
413 pr_err("bad mount option at '%s'\n", c);
414 goto out;
415 }
416 if (token < Opt_last_int) {
417 ret = match_int(&argstr[0], &intval);
418 if (ret < 0) {
419 pr_err("bad mount option arg (not int) "
420 "at '%s'\n", c);
421 continue;
422 }
423 dout("got int token %d val %d\n", token, intval);
424 } else if (token > Opt_last_int && token < Opt_last_string) {
425 dout("got string token %d val %s\n", token,
426 argstr[0].from);
427 } else {
428 dout("got token %d\n", token);
429 }
430 switch (token) {
431 case Opt_fsidmajor:
432 *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
433 break;
434 case Opt_fsidminor:
435 *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
436 break;
437 case Opt_ip:
438 err = ceph_parse_ips(argstr[0].from,
439 argstr[0].to,
440 &args->my_addr,
441 1, NULL);
442 if (err < 0)
443 goto out;
444 args->flags |= CEPH_OPT_MYIP;
445 break;
446
447 case Opt_snapdirname:
448 kfree(args->snapdir_name);
449 args->snapdir_name = kstrndup(argstr[0].from,
450 argstr[0].to-argstr[0].from,
451 GFP_KERNEL);
452 break;
453 case Opt_name:
454 args->name = kstrndup(argstr[0].from,
455 argstr[0].to-argstr[0].from,
456 GFP_KERNEL);
457 break;
458 case Opt_secret:
459 args->secret = kstrndup(argstr[0].from,
460 argstr[0].to-argstr[0].from,
461 GFP_KERNEL);
462 break;
463
464 /* misc */
465 case Opt_wsize:
466 args->wsize = intval;
467 break;
468 case Opt_rsize:
469 args->rsize = intval;
470 break;
471 case Opt_osdtimeout:
472 args->osd_timeout = intval;
473 break;
474 case Opt_osdkeepalivetimeout:
475 args->osd_keepalive_timeout = intval;
476 break;
477 case Opt_mount_timeout:
478 args->mount_timeout = intval;
479 break;
case Opt_osd_idle_ttl:
args->osd_idle_ttl = intval;
break;
480 case Opt_caps_wanted_delay_min:
481 args->caps_wanted_delay_min = intval;
482 break;
483 case Opt_caps_wanted_delay_max:
484 args->caps_wanted_delay_max = intval;
485 break;
486 case Opt_readdir_max_entries:
487 args->max_readdir = intval;
488 break;
489 case Opt_congestion_kb:
490 args->congestion_kb = intval;
491 break;
492
493 case Opt_noshare:
494 args->flags |= CEPH_OPT_NOSHARE;
495 break;
496
497 case Opt_dirstat:
498 args->flags |= CEPH_OPT_DIRSTAT;
499 break;
500 case Opt_nodirstat:
501 args->flags &= ~CEPH_OPT_DIRSTAT;
502 break;
503 case Opt_rbytes:
504 args->flags |= CEPH_OPT_RBYTES;
505 break;
506 case Opt_norbytes:
507 args->flags &= ~CEPH_OPT_RBYTES;
508 break;
509 case Opt_nocrc:
510 args->flags |= CEPH_OPT_NOCRC;
511 break;
512 case Opt_noasyncreaddir:
513 args->flags |= CEPH_OPT_NOASYNCREADDIR;
514 break;
515
516 default:
517 BUG_ON(token);
518 }
519 }
520 return args;
521
522out:
523 kfree(args->mon_addr);
524 kfree(args);
525 return ERR_PTR(err);
526}
527
528static void destroy_mount_args(struct ceph_mount_args *args)
529{
530 dout("destroy_mount_args %p\n", args);
531 kfree(args->snapdir_name);
532 args->snapdir_name = NULL;
533 kfree(args->name);
534 args->name = NULL;
535 kfree(args->secret);
536 args->secret = NULL;
537 kfree(args);
538}
539
540/*
541 * create a fresh client instance
542 */
543static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
544{
545 struct ceph_client *client;
546 int err = -ENOMEM;
547
548 client = kzalloc(sizeof(*client), GFP_KERNEL);
549 if (client == NULL)
550 return ERR_PTR(-ENOMEM);
551
552 mutex_init(&client->mount_mutex);
553
554 init_waitqueue_head(&client->auth_wq);
555
556 client->sb = NULL;
557 client->mount_state = CEPH_MOUNT_MOUNTING;
558 client->mount_args = args;
559
560 client->msgr = NULL;
561
562 client->auth_err = 0;
563 atomic_long_set(&client->writeback_count, 0);
564
565 err = bdi_init(&client->backing_dev_info);
566 if (err < 0)
567 goto fail;
568
569 err = -ENOMEM;
570 client->wb_wq = create_workqueue("ceph-writeback");
571 if (client->wb_wq == NULL)
572 goto fail_bdi;
573 client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
574 if (client->pg_inv_wq == NULL)
575 goto fail_wb_wq;
576 client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
577 if (client->trunc_wq == NULL)
578 goto fail_pg_inv_wq;
579
580 /* set up mempools */
581 err = -ENOMEM;
582 client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
583 client->mount_args->wsize >> PAGE_CACHE_SHIFT);
584 if (!client->wb_pagevec_pool)
585 goto fail_trunc_wq;
586
587 /* caps */
588 client->min_caps = args->max_readdir;
589 ceph_adjust_min_caps(client->min_caps);
590
591 /* subsystems */
592 err = ceph_monc_init(&client->monc, client);
593 if (err < 0)
594 goto fail_mempool;
595 err = ceph_osdc_init(&client->osdc, client);
596 if (err < 0)
597 goto fail_monc;
598 err = ceph_mdsc_init(&client->mdsc, client);
599 if (err < 0)
600 goto fail_osdc;
601 return client;
602
603fail_osdc:
604 ceph_osdc_stop(&client->osdc);
605fail_monc:
606 ceph_monc_stop(&client->monc);
607fail_mempool:
608 mempool_destroy(client->wb_pagevec_pool);
609fail_trunc_wq:
610 destroy_workqueue(client->trunc_wq);
611fail_pg_inv_wq:
612 destroy_workqueue(client->pg_inv_wq);
613fail_wb_wq:
614 destroy_workqueue(client->wb_wq);
615fail_bdi:
616 bdi_destroy(&client->backing_dev_info);
617fail:
618 kfree(client);
619 return ERR_PTR(err);
620}
621
622static void ceph_destroy_client(struct ceph_client *client)
623{
624 dout("destroy_client %p\n", client);
625
626 /* unmount */
627 ceph_mdsc_stop(&client->mdsc);
628 ceph_monc_stop(&client->monc);
629 ceph_osdc_stop(&client->osdc);
630
631 ceph_adjust_min_caps(-client->min_caps);
632
633 ceph_debugfs_client_cleanup(client);
634 destroy_workqueue(client->wb_wq);
635 destroy_workqueue(client->pg_inv_wq);
636 destroy_workqueue(client->trunc_wq);
637
638 if (client->msgr)
639 ceph_messenger_destroy(client->msgr);
640 mempool_destroy(client->wb_pagevec_pool);
641
642 destroy_mount_args(client->mount_args);
643
644 kfree(client);
645 dout("destroy_client %p done\n", client);
646}
647
648/*
649 * Initially learn our fsid, or verify an fsid matches.
650 */
651int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
652{
653 if (client->have_fsid) {
654 if (ceph_fsid_compare(&client->fsid, fsid)) {
655 pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT,
656 PR_FSID(&client->fsid), PR_FSID(fsid));
657 return -1;
658 }
659 } else {
660 pr_info("client%lld fsid " FSID_FORMAT "\n",
661 client->monc.auth->global_id, PR_FSID(fsid));
662 memcpy(&client->fsid, fsid, sizeof(*fsid));
663 ceph_debugfs_client_init(client);
664 client->have_fsid = true;
665 }
666 return 0;
667}
668
669/*
670 * true if we have the mon map (and have thus joined the cluster)
671 */
672static int have_mon_map(struct ceph_client *client)
673{
674 return client->monc.monmap && client->monc.monmap->epoch;
675}
676
677/*
678 * Bootstrap mount by opening the root directory. Note the mount
679 * @started time from caller, and time out if this takes too long.
680 */
681static struct dentry *open_root_dentry(struct ceph_client *client,
682 const char *path,
683 unsigned long started)
684{
685 struct ceph_mds_client *mdsc = &client->mdsc;
686 struct ceph_mds_request *req = NULL;
687 int err;
688 struct dentry *root;
689
690 /* open dir */
691 dout("open_root_inode opening '%s'\n", path);
692 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
693 if (IS_ERR(req))
694 return ERR_PTR(PTR_ERR(req));
695 req->r_path1 = kstrdup(path, GFP_NOFS);
696 req->r_ino1.ino = CEPH_INO_ROOT;
697 req->r_ino1.snap = CEPH_NOSNAP;
698 req->r_started = started;
699 req->r_timeout = client->mount_args->mount_timeout * HZ;
700 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
701 req->r_num_caps = 2;
702 err = ceph_mdsc_do_request(mdsc, NULL, req);
703 if (err == 0) {
704 dout("open_root_inode success\n");
705 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
706 client->sb->s_root == NULL)
707 root = d_alloc_root(req->r_target_inode);
708 else
709 root = d_obtain_alias(req->r_target_inode);
710 req->r_target_inode = NULL;
711 dout("open_root_inode success, root dentry is %p\n", root);
712 } else {
713 root = ERR_PTR(err);
714 }
715 ceph_mdsc_put_request(req);
716 return root;
717}
718
719/*
720 * mount: join the ceph cluster, and open root directory.
721 */
722static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
723 const char *path)
724{
725 struct ceph_entity_addr *myaddr = NULL;
726 int err;
727 unsigned long timeout = client->mount_args->mount_timeout * HZ;
728 unsigned long started = jiffies; /* note the start time */
729 struct dentry *root;
730
731 dout("mount start\n");
732 mutex_lock(&client->mount_mutex);
733
734 /* initialize the messenger */
735 if (client->msgr == NULL) {
736 if (ceph_test_opt(client, MYIP))
737 myaddr = &client->mount_args->my_addr;
738 client->msgr = ceph_messenger_create(myaddr);
739 if (IS_ERR(client->msgr)) {
740 err = PTR_ERR(client->msgr);
741 client->msgr = NULL;
742 goto out;
743 }
744 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
745 }
746
747 /* open session, and wait for mon, mds, and osd maps */
748 err = ceph_monc_open_session(&client->monc);
749 if (err < 0)
750 goto out;
751
752 while (!have_mon_map(client)) {
753 err = -EIO;
754 if (timeout && time_after_eq(jiffies, started + timeout))
755 goto out;
756
757 /* wait */
758 dout("mount waiting for mon_map\n");
759 err = wait_event_interruptible_timeout(client->auth_wq,
760 have_mon_map(client) || (client->auth_err < 0),
761 timeout);
762 if (err == -EINTR || err == -ERESTARTSYS)
763 goto out;
764 if (client->auth_err < 0) {
765 err = client->auth_err;
766 goto out;
767 }
768 }
769
770 dout("mount opening root\n");
771 root = open_root_dentry(client, "", started);
772 if (IS_ERR(root)) {
773 err = PTR_ERR(root);
774 goto out;
775 }
776 if (client->sb->s_root)
777 dput(root);
778 else
779 client->sb->s_root = root;
780
781 if (path[0] == 0) {
782 dget(root);
783 } else {
784 dout("mount opening base mountpoint\n");
785 root = open_root_dentry(client, path, started);
786 if (IS_ERR(root)) {
787 err = PTR_ERR(root);
788 dput(client->sb->s_root);
789 client->sb->s_root = NULL;
790 goto out;
791 }
792 }
793
794 mnt->mnt_root = root;
795 mnt->mnt_sb = client->sb;
796
797 client->mount_state = CEPH_MOUNT_MOUNTED;
798 dout("mount success\n");
799 err = 0;
800
801out:
802 mutex_unlock(&client->mount_mutex);
803 return err;
804}
805
806static int ceph_set_super(struct super_block *s, void *data)
807{
808 struct ceph_client *client = data;
809 int ret;
810
811 dout("set_super %p data %p\n", s, data);
812
813 s->s_flags = client->mount_args->sb_flags;
814 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
815
816 s->s_fs_info = client;
817 client->sb = s;
818
819 s->s_op = &ceph_super_ops;
820 s->s_export_op = &ceph_export_ops;
821
822 s->s_time_gran = 1000; /* 1000 ns == 1 us */
823
824 ret = set_anon_super(s, NULL); /* 2nd arg is unused by set_anon_super */
825 if (ret != 0)
826 goto fail;
827
828 return ret;
829
830fail:
831 s->s_fs_info = NULL;
832 client->sb = NULL;
833 return ret;
834}
835
836/*
837 * share superblock if same fs AND options
838 */
839static int ceph_compare_super(struct super_block *sb, void *data)
840{
841 struct ceph_client *new = data;
842 struct ceph_mount_args *args = new->mount_args;
843 struct ceph_client *other = ceph_sb_to_client(sb);
844 int i;
845
846 dout("ceph_compare_super %p\n", sb);
847 if (args->flags & CEPH_OPT_FSID) {
848 if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
849 dout("fsid doesn't match\n");
850 return 0;
851 }
852 } else {
853 /* do we share (a) monitor? */
854 for (i = 0; i < new->monc.monmap->num_mon; i++)
855 if (ceph_monmap_contains(other->monc.monmap,
856 &new->monc.monmap->mon_inst[i].addr))
857 break;
858 if (i == new->monc.monmap->num_mon) {
859 dout("mon ip not part of monmap\n");
860 return 0;
861 }
862 dout("mon ip matches existing sb %p\n", sb);
863 }
864 if (args->sb_flags != other->mount_args->sb_flags) {
865 dout("flags differ\n");
866 return 0;
867 }
868 return 1;
869}
870
871/*
872 * construct our own bdi so we can control readahead, etc.
873 */
874static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
875{
876 int err;
877
878 sb->s_bdi = &client->backing_dev_info;
879
880 /* set ra_pages based on rsize mount option? */
881 if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
882 client->backing_dev_info.ra_pages =
883 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
884 >> PAGE_CACHE_SHIFT;
885 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
886 return err;
887}
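/*
 * Worked example (assuming 4 KB pages): with the default 512 KB rsize,
 * ra_pages == (524288 + 4095) >> 12 == 128 pages of readahead.
 */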
888
889static int ceph_get_sb(struct file_system_type *fs_type,
890 int flags, const char *dev_name, void *data,
891 struct vfsmount *mnt)
892{
893 struct super_block *sb;
894 struct ceph_client *client;
895 int err;
896 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
897 const char *path = NULL;
898 struct ceph_mount_args *args;
899
900 dout("ceph_get_sb\n");
901 args = parse_mount_args(flags, data, dev_name, &path);
902 if (IS_ERR(args)) {
903 err = PTR_ERR(args);
904 goto out_final;
905 }
906
907 /* create client (which we may/may not use) */
908 client = ceph_create_client(args);
909 if (IS_ERR(client)) {
910 err = PTR_ERR(client);
911 goto out_final;
912 }
913
914 if (client->mount_args->flags & CEPH_OPT_NOSHARE)
915 compare_super = NULL;
916 sb = sget(fs_type, compare_super, ceph_set_super, client);
917 if (IS_ERR(sb)) {
918 err = PTR_ERR(sb);
919 goto out;
920 }
921
922 if (ceph_client(sb) != client) {
923 ceph_destroy_client(client);
924 client = ceph_client(sb);
925 dout("get_sb got existing client %p\n", client);
926 } else {
927 dout("get_sb using new client %p\n", client);
928 err = ceph_register_bdi(sb, client);
929 if (err < 0)
930 goto out_splat;
931 }
932
933 err = ceph_mount(client, mnt, path);
934 if (err < 0)
935 goto out_splat;
936 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
937 mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
938 return 0;
939
940out_splat:
941 ceph_mdsc_close_sessions(&client->mdsc);
942 up_write(&sb->s_umount);
943 deactivate_super(sb);
944 goto out_final;
945
946out:
947 ceph_destroy_client(client);
948out_final:
949 dout("ceph_get_sb fail %d\n", err);
950 return err;
951}
952
953static void ceph_kill_sb(struct super_block *s)
954{
955 struct ceph_client *client = ceph_sb_to_client(s);
956 dout("kill_sb %p\n", s);
957 ceph_mdsc_pre_umount(&client->mdsc);
958 kill_anon_super(s); /* will call put_super after sb is r/o */
959 if (s->s_bdi == &client->backing_dev_info)
960 bdi_unregister(&client->backing_dev_info);
961 bdi_destroy(&client->backing_dev_info);
962 ceph_destroy_client(client);
963}
964
965static struct file_system_type ceph_fs_type = {
966 .owner = THIS_MODULE,
967 .name = "ceph",
968 .get_sb = ceph_get_sb,
969 .kill_sb = ceph_kill_sb,
970 .fs_flags = FS_RENAME_DOES_D_MOVE,
971};
972
973#define _STRINGIFY(x) #x
974#define STRINGIFY(x) _STRINGIFY(x)
975
976static int __init init_ceph(void)
977{
978 int ret = 0;
979
980 ret = ceph_debugfs_init();
981 if (ret < 0)
982 goto out;
983
984 ret = ceph_msgr_init();
985 if (ret < 0)
986 goto out_debugfs;
987
988 ret = init_caches();
989 if (ret)
990 goto out_msgr;
991
992 ceph_caps_init();
993
994 ret = register_filesystem(&ceph_fs_type);
995 if (ret)
996 goto out_icache;
997
998 pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n",
999 CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH,
1000 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL);
1001 return 0;
1002
1003out_icache:
1004 destroy_caches();
1005out_msgr:
1006 ceph_msgr_exit();
1007out_debugfs:
1008 ceph_debugfs_cleanup();
1009out:
1010 return ret;
1011}
1012
1013static void __exit exit_ceph(void)
1014{
1015 dout("exit_ceph\n");
1016 unregister_filesystem(&ceph_fs_type);
1017 ceph_caps_finalize();
1018 destroy_caches();
1019 ceph_msgr_exit();
1020 ceph_debugfs_cleanup();
1021}
1022
1023module_init(init_ceph);
1024module_exit(exit_ceph);
1025
1026MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
1027MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
1028MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
1029MODULE_DESCRIPTION("Ceph filesystem for Linux");
1030MODULE_LICENSE("GPL");
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
new file mode 100644
index 000000000000..65d12036b670
--- /dev/null
+++ b/fs/ceph/super.h
@@ -0,0 +1,901 @@
1#ifndef _FS_CEPH_SUPER_H
2#define _FS_CEPH_SUPER_H
3
4#include "ceph_debug.h"
5
6#include <asm/unaligned.h>
7#include <linux/backing-dev.h>
8#include <linux/completion.h>
9#include <linux/exportfs.h>
10#include <linux/fs.h>
11#include <linux/mempool.h>
12#include <linux/pagemap.h>
13#include <linux/wait.h>
14#include <linux/writeback.h>
15
16#include "types.h"
17#include "messenger.h"
18#include "msgpool.h"
19#include "mon_client.h"
20#include "mds_client.h"
21#include "osd_client.h"
22#include "ceph_fs.h"
23
24/* f_type in struct statfs */
25#define CEPH_SUPER_MAGIC 0x00c36400
26
27/* large granularity for statfs utilization stats to facilitate
28 * large volume sizes on 32-bit machines. */
29#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
30#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
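/*
 * e.g. with 1 MB blocks, a 32-bit statfs f_blocks field can describe up
 * to 2^32 MB = 4 PB of storage.
 */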
31
32/*
33 * mount options
34 */
35#define CEPH_OPT_FSID (1<<0)
36#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
37#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
38#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
39#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
40#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
41#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
42
43#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES)
44
45#define ceph_set_opt(client, opt) \
46 (client)->mount_args->flags |= CEPH_OPT_##opt;
47#define ceph_test_opt(client, opt) \
48 (!!((client)->mount_args->flags & CEPH_OPT_##opt))
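/*
 * e.g. ceph_set_opt(client, NOCRC) ORs CEPH_OPT_NOCRC into the mount
 * flags, and ceph_test_opt(client, NOCRC) reads it back as 0 or 1.
 */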
49
50
51struct ceph_mount_args {
52 int sb_flags;
53 int num_mon;
54 struct ceph_entity_addr *mon_addr;
55 int flags;
56 int mount_timeout;
57 int osd_idle_ttl;
58 int caps_wanted_delay_min, caps_wanted_delay_max;
59 struct ceph_fsid fsid;
60 struct ceph_entity_addr my_addr;
61 int wsize;
62 int rsize; /* max readahead */
63 int max_readdir; /* max readdir size */
64 int congestion_kb; /* writeback congestion threshold (KB) */
65 int osd_timeout;
66 int osd_keepalive_timeout;
67 char *snapdir_name; /* default ".snap" */
68 char *name;
69 char *secret;
70 int cap_release_safety;
71};
72
73/*
74 * defaults
75 */
76#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
77#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
78#define CEPH_OSD_KEEPALIVE_DEFAULT 5
79#define CEPH_OSD_IDLE_TTL_DEFAULT 60
80#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
81
82#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
83#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
84
85#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
86#define CEPH_AUTH_NAME_DEFAULT "guest"
87
88/*
89 * Delay telling the MDS we no longer want caps, in case we reopen
90 * the file. Delay a minimum amount of time, even if we send a cap
91 * message for some other reason. Otherwise, take the opportunity to
92 * update the mds to avoid sending another message later.
93 */
94#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
95#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
96
97
98/* mount state */
99enum {
100 CEPH_MOUNT_MOUNTING,
101 CEPH_MOUNT_MOUNTED,
102 CEPH_MOUNT_UNMOUNTING,
103 CEPH_MOUNT_UNMOUNTED,
104 CEPH_MOUNT_SHUTDOWN,
105};
106
107/*
108 * subtract jiffies
109 */
110static inline unsigned long time_sub(unsigned long a, unsigned long b)
111{
112 BUG_ON(time_after(b, a));
113 return (long)a - (long)b;
114}
115
116/*
117 * per-filesystem client state
118 *
119 * possibly shared by multiple mount points, if they are
120 * mounting the same ceph filesystem/cluster.
121 */
122struct ceph_client {
123 struct ceph_fsid fsid;
124 bool have_fsid;
125
126 struct mutex mount_mutex; /* serialize mount attempts */
127 struct ceph_mount_args *mount_args;
128
129 struct super_block *sb;
130
131 unsigned long mount_state;
132 wait_queue_head_t auth_wq;
133
134 int auth_err;
135
136 int min_caps; /* min caps i added */
137
138 struct ceph_messenger *msgr; /* messenger instance */
139 struct ceph_mon_client monc;
140 struct ceph_mds_client mdsc;
141 struct ceph_osd_client osdc;
142
143 /* writeback */
144 mempool_t *wb_pagevec_pool;
145 struct workqueue_struct *wb_wq;
146 struct workqueue_struct *pg_inv_wq;
147 struct workqueue_struct *trunc_wq;
148 atomic_long_t writeback_count;
149
150 struct backing_dev_info backing_dev_info;
151
152#ifdef CONFIG_DEBUG_FS
153 struct dentry *debugfs_monmap;
154 struct dentry *debugfs_mdsmap, *debugfs_osdmap;
155 struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
156 struct dentry *debugfs_congestion_kb;
157 struct dentry *debugfs_bdi;
158#endif
159};
160
161static inline struct ceph_client *ceph_client(struct super_block *sb)
162{
163 return sb->s_fs_info;
164}
165
166
167/*
168 * File i/o capability. This tracks shared state with the metadata
169 * server that allows us to cache or writeback attributes or to read
170 * and write data. For any given inode, we should have one or more
171 * capabilities, one issued by each metadata server, and our
172 * cumulative access is the OR of all issued capabilities.
173 *
174 * Each cap is referenced by the inode's i_caps rbtree and by per-mds
175 * session capability lists.
176 */
177struct ceph_cap {
178 struct ceph_inode_info *ci;
179 struct rb_node ci_node; /* per-ci cap tree */
180 struct ceph_mds_session *session;
181 struct list_head session_caps; /* per-session caplist */
182 int mds;
183 u64 cap_id; /* unique cap id (mds provided) */
184 int issued; /* latest, from the mds */
185 int implemented; /* implemented superset of issued (for revocation) */
186 int mds_wanted;
187 u32 seq, issue_seq, mseq;
188 u32 cap_gen; /* active/stale cycle */
189 unsigned long last_used;
190 struct list_head caps_item;
191};
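/*
 * Illustrative sketch (not an in-tree helper): per the comment above,
 * cumulative access is the OR of the "issued" bits of every cap in an
 * inode's i_caps tree (the real __ceph_caps_issued() in caps.c also
 * reports the "implemented" superset). Assumes the caller holds the
 * inode's i_lock while walking the tree.
 */
static inline int caps_issued_sketch(struct rb_root *caps)
{
	struct rb_node *p;
	int issued = 0;

	for (p = rb_first(caps); p; p = rb_next(p))
		issued |= rb_entry(p, struct ceph_cap, ci_node)->issued;
	return issued;
}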
192
193#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
194#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
195#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
196
197/*
198 * Snapped cap state that is pending flush to mds. When a snapshot occurs,
199 * we first complete any in-process sync writes and writeback any dirty
200 * data before flushing the snapped state (tracked here) back to the MDS.
201 */
202struct ceph_cap_snap {
203 atomic_t nref;
204 struct ceph_inode_info *ci;
205 struct list_head ci_item, flushing_item;
206
207 u64 follows, flush_tid;
208 int issued, dirty;
209 struct ceph_snap_context *context;
210
211 mode_t mode;
212 uid_t uid;
213 gid_t gid;
214
215 void *xattr_blob;
216 int xattr_len;
217 u64 xattr_version;
218
219 u64 size;
220 struct timespec mtime, atime, ctime;
221 u64 time_warp_seq;
222 int writing; /* a sync write is still in progress */
223 int dirty_pages; /* dirty pages awaiting writeback */
224};
225
226static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
227{
228 if (atomic_dec_and_test(&capsnap->nref))
229 kfree(capsnap);
230}
231
232/*
233 * The frag tree describes how a directory is fragmented, potentially across
234 * multiple metadata servers. It is also used to indicate points where
235 * metadata authority is delegated, and whether/where metadata is replicated.
236 *
237 * A _leaf_ frag will be present in the i_fragtree IFF there is
238 * delegation info. That is, if mds >= 0 || ndist > 0.
239 */
240#define CEPH_MAX_DIRFRAG_REP 4
241
242struct ceph_inode_frag {
243 struct rb_node node;
244
245 /* fragtree state */
246 u32 frag;
247 int split_by; /* i.e. 2^(split_by) children */
248
249 /* delegation and replication info */
250 int mds; /* -1 if same authority as parent */
251 int ndist; /* >0 if replicated */
252 int dist[CEPH_MAX_DIRFRAG_REP];
253};
254
255/*
256 * We cache inode xattrs as an encoded blob until they are first used,
257 * at which point we parse them into an rbtree.
258 */
259struct ceph_inode_xattr {
260 struct rb_node node;
261
262 const char *name;
263 int name_len;
264 const char *val;
265 int val_len;
266 int dirty;
267
268 int should_free_name;
269 int should_free_val;
270};
271
272struct ceph_inode_xattrs_info {
273 /*
274 * (still encoded) xattr blob. we avoid the overhead of parsing
275 * this until someone actually calls getxattr, etc.
276 *
277 * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
278 * NULL means we don't know.
279 */
280 struct ceph_buffer *blob, *prealloc_blob;
281
282 struct rb_root index;
283 bool dirty;
284 int count;
285 int names_size;
286 int vals_size;
287 u64 version, index_version;
288};
289
290/*
291 * Ceph inode.
292 */
293#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
294#define CEPH_I_NODELAY 4 /* do not delay cap release */
295#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
296#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
297
298struct ceph_inode_info {
299 struct ceph_vino i_vino; /* ceph ino + snap */
300
301 u64 i_version;
302 u32 i_time_warp_seq;
303
304 unsigned i_ceph_flags;
305 unsigned long i_release_count;
306
307 struct ceph_file_layout i_layout;
308 char *i_symlink;
309
310 /* for dirs */
311 struct timespec i_rctime;
312 u64 i_rbytes, i_rfiles, i_rsubdirs;
313 u64 i_files, i_subdirs;
314 u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */
315
316 struct rb_root i_fragtree;
317 struct mutex i_fragtree_mutex;
318
319 struct ceph_inode_xattrs_info i_xattrs;
320
321 /* capabilities. protected _both_ by i_lock and cap->session's
322 * s_mutex. */
323 struct rb_root i_caps; /* cap list */
324 struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
325 unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
326 struct list_head i_dirty_item, i_flushing_item;
327 u64 i_cap_flush_seq;
328 /* we need to track cap writeback on a per-cap-bit basis, to allow
329 * overlapping, pipelined cap flushes to the mds. we can probably
330 * reduce the tid to 8 bits if we're concerned about inode size. */
331 u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
332 wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
333 unsigned long i_hold_caps_min; /* jiffies */
334 unsigned long i_hold_caps_max; /* jiffies */
335 struct list_head i_cap_delay_list; /* for delayed cap release to mds */
336 int i_cap_exporting_mds; /* to handle cap migration between */
337 unsigned i_cap_exporting_mseq; /* mds's. */
338 unsigned i_cap_exporting_issued;
339 struct ceph_cap_reservation i_cap_migration_resv;
340 struct list_head i_cap_snaps; /* snapped state pending flush to mds */
341 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */
342 unsigned i_snap_caps; /* cap bits for snapped files */
343
344 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
345
346 u32 i_truncate_seq; /* last truncate to smaller size */
347 u64 i_truncate_size; /* and the size we last truncated down to */
348 int i_truncate_pending; /* still need to call vmtruncate */
349
350 u64 i_max_size; /* max file size authorized by mds */
351 u64 i_reported_size; /* (max_)size reported to or requested of mds */
352 u64 i_wanted_max_size; /* offset we'd like to write to */
353 u64 i_requested_max_size; /* max_size we've requested */
354
355 /* held references to caps */
356 int i_pin_ref;
357 int i_rd_ref, i_rdcache_ref, i_wr_ref;
358 int i_wrbuffer_ref, i_wrbuffer_ref_head;
359 u32 i_shared_gen; /* increment each time we get FILE_SHARED */
360 u32 i_rdcache_gen; /* we increment this each time we get
361 FILE_CACHE. If it's non-zero, we
362 _may_ have cached pages. */
363 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
364
365 struct list_head i_unsafe_writes; /* uncommitted sync writes */
366 struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
367 spinlock_t i_unsafe_lock;
368
369 struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
370 int i_snap_realm_counter; /* snap realm (if caps) */
371 struct list_head i_snap_realm_item;
372 struct list_head i_snap_flush_item;
373
374 struct work_struct i_wb_work; /* writeback work */
375 struct work_struct i_pg_inv_work; /* page invalidation work */
376
377 struct work_struct i_vmtruncate_work;
378
379 struct inode vfs_inode; /* at end */
380};
381
382static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
383{
384 return container_of(inode, struct ceph_inode_info, vfs_inode);
385}
386
387static inline void ceph_i_clear(struct inode *inode, unsigned mask)
388{
389 struct ceph_inode_info *ci = ceph_inode(inode);
390
391 spin_lock(&inode->i_lock);
392 ci->i_ceph_flags &= ~mask;
393 spin_unlock(&inode->i_lock);
394}
395
396static inline void ceph_i_set(struct inode *inode, unsigned mask)
397{
398 struct ceph_inode_info *ci = ceph_inode(inode);
399
400 spin_lock(&inode->i_lock);
401 ci->i_ceph_flags |= mask;
402 spin_unlock(&inode->i_lock);
403}
404
405static inline bool ceph_i_test(struct inode *inode, unsigned mask)
406{
407 struct ceph_inode_info *ci = ceph_inode(inode);
408 bool r;
409
410 smp_mb();
411 r = (ci->i_ceph_flags & mask) == mask;
412 return r;
413}
414
415
416/* find a specific frag @f */
417extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
418 u32 f);
419
420/*
421 * choose fragment for value @v. copy frag content to pfrag, if leaf
422 * exists
423 */
424extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
425 struct ceph_inode_frag *pfrag,
426 int *found);
427
428/*
429 * Ceph dentry state
430 */
431struct ceph_dentry_info {
432 struct ceph_mds_session *lease_session;
433 u32 lease_gen, lease_shared_gen;
434 u32 lease_seq;
435 unsigned long lease_renew_after, lease_renew_from;
436 struct list_head lru;
437 struct dentry *dentry;
438 u64 time;
439 u64 offset;
440};
441
442static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
443{
444 return (struct ceph_dentry_info *)dentry->d_fsdata;
445}
446
447static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
448{
449 return ((loff_t)frag << 32) | (loff_t)off;
450}
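/*
 * e.g. ceph_make_fpos(0x2, 5) == 0x0000000200000005: the fragment
 * occupies the high 32 bits of the file position, the within-frag
 * offset the low 32 bits.
 */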
451
452/*
453 * ino_t is <64 bits on many architectures, blech.
454 *
455 * don't include snap in ino hash, at least for now.
456 */
457static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
458{
459 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
460#if BITS_PER_LONG == 32
461 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
462 if (!ino)
463 ino = 1;
464#endif
465 return ino;
466}
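/*
 * Worked example (assuming a 4-byte ino_t on a 32-bit machine): for
 * vino.ino == 0x123456789, the truncated ino is 0x23456789, the
 * folded-in high word is 0x123456789 >> 32 == 0x1, and the result is
 * 0x23456789 ^ 0x1 == 0x23456788.
 */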
467
468static inline int ceph_set_ino_cb(struct inode *inode, void *data)
469{
470 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
471 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
472 return 0;
473}
474
475static inline struct ceph_vino ceph_vino(struct inode *inode)
476{
477 return ceph_inode(inode)->i_vino;
478}
479
480/* for printf-style formatting */
481#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
482
483static inline u64 ceph_ino(struct inode *inode)
484{
485 return ceph_inode(inode)->i_vino.ino;
486}
487static inline u64 ceph_snap(struct inode *inode)
488{
489 return ceph_inode(inode)->i_vino.snap;
490}
491
492static inline int ceph_ino_compare(struct inode *inode, void *data)
493{
494 struct ceph_vino *pvino = (struct ceph_vino *)data;
495 struct ceph_inode_info *ci = ceph_inode(inode);
496 return ci->i_vino.ino == pvino->ino &&
497 ci->i_vino.snap == pvino->snap;
498}
499
500static inline struct inode *ceph_find_inode(struct super_block *sb,
501 struct ceph_vino vino)
502{
503 ino_t t = ceph_vino_to_ino(vino);
504 return ilookup5(sb, t, ceph_ino_compare, &vino);
505}
506
507
508/*
509 * caps helpers
510 */
511static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
512{
513 return !RB_EMPTY_ROOT(&ci->i_caps);
514}
515
516extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
517extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
518extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
519 struct ceph_cap *cap);
520
521static inline int ceph_caps_issued(struct ceph_inode_info *ci)
522{
523 int issued;
524 spin_lock(&ci->vfs_inode.i_lock);
525 issued = __ceph_caps_issued(ci, NULL);
526 spin_unlock(&ci->vfs_inode.i_lock);
527 return issued;
528}
529
530static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
531 int touch)
532{
533 int r;
534 spin_lock(&ci->vfs_inode.i_lock);
535 r = __ceph_caps_issued_mask(ci, mask, touch);
536 spin_unlock(&ci->vfs_inode.i_lock);
537 return r;
538}
539
540static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
541{
542 return ci->i_dirty_caps | ci->i_flushing_caps;
543}
544extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
545
546extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
547extern int __ceph_caps_used(struct ceph_inode_info *ci);
548
549extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
550
551/*
552 * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
553 */
554static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
555{
556 int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
557 if (w & CEPH_CAP_FILE_BUFFER)
558 w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
559 return w;
560}
561
562/* what the mds thinks we want */
563extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
564
565extern void ceph_caps_init(void);
566extern void ceph_caps_finalize(void);
567extern void ceph_adjust_min_caps(int delta);
568extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
569extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
570extern void ceph_reservation_status(struct ceph_client *client,
571 int *total, int *avail, int *used,
572 int *reserved, int *min);
573
574static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
575{
576 return (struct ceph_client *)inode->i_sb->s_fs_info;
577}
578
579static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
580{
581 return (struct ceph_client *)sb->s_fs_info;
582}
583
584
585/*
586 * we keep buffered readdir results attached to file->private_data
587 */
588struct ceph_file_info {
589 int fmode; /* initialized on open */
590
591 /* readdir: position within the dir */
592 u32 frag;
593 struct ceph_mds_request *last_readdir;
594 int at_end;
595
596 /* readdir: position within a frag */
597 unsigned offset; /* offset of last chunk, adjusted for . and .. */
598 u64 next_offset; /* offset of next chunk (last_name's + 1) */
599 char *last_name; /* last entry in previous chunk */
600 struct dentry *dentry; /* next dentry (for dcache readdir) */
601 unsigned long dir_release_count;
602
603 /* used for -o dirstat read() on directories */
604 char *dir_info;
605 int dir_info_len;
606};
607
608
609
610/*
611 * snapshots
612 */
613
614/*
615 * A "snap context" is the set of existing snapshots when we
616 * write data. It is used by the OSD to guide its COW behavior.
617 *
618 * The ceph_snap_context is refcounted, and attached to each dirty
619 * page, indicating which context the dirty data belonged to when it was
620 * dirtied.
621 */
622struct ceph_snap_context {
623 atomic_t nref;
624 u64 seq;
625 int num_snaps;
626 u64 snaps[];
627};
628
629static inline struct ceph_snap_context *
630ceph_get_snap_context(struct ceph_snap_context *sc)
631{
632 /*
633 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
634 atomic_read(&sc->nref)+1);
635 */
636 if (sc)
637 atomic_inc(&sc->nref);
638 return sc;
639}
640
641static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
642{
643 if (!sc)
644 return;
645 /*
646 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
647 atomic_read(&sc->nref)-1);
648 */
649 if (atomic_dec_and_test(&sc->nref)) {
650 /*printk(" deleting snap_context %p\n", sc);*/
651 kfree(sc);
652 }
653}
654
655/*
656 * A "snap realm" describes a subset of the file hierarchy sharing
657 * the same set of snapshots that apply to it. The realms themselves
658 * are organized into a hierarchy, such that children inherit (some of)
659 * the snapshots of their parents.
660 *
661 * All inodes within the realm that have capabilities are linked into a
662 * per-realm list.
663 */
664struct ceph_snap_realm {
665 u64 ino;
666 atomic_t nref;
667 struct rb_node node;
668
669 u64 created, seq;
670 u64 parent_ino;
671 u64 parent_since; /* snapid when our current parent became so */
672
673 u64 *prior_parent_snaps; /* snaps inherited from any parents we */
674 int num_prior_parent_snaps; /* had prior to parent_since */
675 u64 *snaps; /* snaps specific to this realm */
676 int num_snaps;
677
678 struct ceph_snap_realm *parent;
679 struct list_head children; /* list of child realms */
680 struct list_head child_item;
681
682 struct list_head empty_item; /* if i have ref==0 */
683
684 /* the current set of snaps for this realm */
685 struct ceph_snap_context *cached_context;
686
687 struct list_head inodes_with_caps;
688 spinlock_t inodes_with_caps_lock;
689};
690
691
692
693/*
694 * calculate the number of pages a given length and offset map onto,
695 * if we align the data.
696 */
697static inline int calc_pages_for(u64 off, u64 len)
698{
699 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
700 (off >> PAGE_CACHE_SHIFT);
701}
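/*
 * Worked example (assuming 4 KB pages): calc_pages_for(100, 5000)
 * == ((100 + 5000 + 4095) >> 12) - (100 >> 12) == 2 - 0 == 2, i.e. the
 * 5000 bytes starting at offset 100 touch two pages.
 */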
702
703
704
705/* snap.c */
706struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
707 u64 ino);
708extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
709 struct ceph_snap_realm *realm);
710extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
711 struct ceph_snap_realm *realm);
712extern int ceph_update_snap_trace(struct ceph_mds_client *m,
713 void *p, void *e, bool deletion);
714extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
715 struct ceph_mds_session *session,
716 struct ceph_msg *msg);
717extern void ceph_queue_cap_snap(struct ceph_inode_info *ci,
718 struct ceph_snap_context *snapc);
719extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
720 struct ceph_cap_snap *capsnap);
721extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
722
723/*
724 * a cap_snap is "pending" if it is still awaiting an in-progress
725 * sync write (that may/may not still update size, mtime, etc.).
726 */
727static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
728{
729 return !list_empty(&ci->i_cap_snaps) &&
730 list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
731 ci_item)->writing;
732}
733
734
735/* super.c */
736extern struct kmem_cache *ceph_inode_cachep;
737extern struct kmem_cache *ceph_cap_cachep;
738extern struct kmem_cache *ceph_dentry_cachep;
739extern struct kmem_cache *ceph_file_cachep;
740
741extern const char *ceph_msg_type_name(int type);
742extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
743
744#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
745 "%02x%02x%02x%02x%02x%02x"
746#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
747 (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \
748 (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
749 (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
750
751/* inode.c */
752extern const struct inode_operations ceph_file_iops;
753
754extern struct inode *ceph_alloc_inode(struct super_block *sb);
755extern void ceph_destroy_inode(struct inode *inode);
756
757extern struct inode *ceph_get_inode(struct super_block *sb,
758 struct ceph_vino vino);
759extern struct inode *ceph_get_snapdir(struct inode *parent);
760extern int ceph_fill_file_size(struct inode *inode, int issued,
761 u32 truncate_seq, u64 truncate_size, u64 size);
762extern void ceph_fill_file_time(struct inode *inode, int issued,
763 u64 time_warp_seq, struct timespec *ctime,
764 struct timespec *mtime, struct timespec *atime);
765extern int ceph_fill_trace(struct super_block *sb,
766 struct ceph_mds_request *req,
767 struct ceph_mds_session *session);
768extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
769 struct ceph_mds_session *session);
770
771extern int ceph_inode_holds_cap(struct inode *inode, int mask);
772
773extern int ceph_inode_set_size(struct inode *inode, loff_t size);
774extern void __ceph_do_pending_vmtruncate(struct inode *inode);
775extern void ceph_queue_vmtruncate(struct inode *inode);
776
777extern void ceph_queue_invalidate(struct inode *inode);
778extern void ceph_queue_writeback(struct inode *inode);
779
780extern int ceph_do_getattr(struct inode *inode, int mask);
781extern int ceph_permission(struct inode *inode, int mask);
782extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
783extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
784 struct kstat *stat);
785
786/* xattr.c */
787extern int ceph_setxattr(struct dentry *, const char *, const void *,
788 size_t, int);
789extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
790extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
791extern int ceph_removexattr(struct dentry *, const char *);
792extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
793extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
794
795/* caps.c */
796extern const char *ceph_cap_string(int c);
797extern void ceph_handle_caps(struct ceph_mds_session *session,
798 struct ceph_msg *msg);
799extern int ceph_add_cap(struct inode *inode,
800 struct ceph_mds_session *session, u64 cap_id,
801 int fmode, unsigned issued, unsigned wanted,
802 unsigned cap, unsigned seq, u64 realmino, int flags,
803 struct ceph_cap_reservation *caps_reservation);
804extern void __ceph_remove_cap(struct ceph_cap *cap);
805static inline void ceph_remove_cap(struct ceph_cap *cap)
806{
807 struct inode *inode = &cap->ci->vfs_inode;
808 spin_lock(&inode->i_lock);
809 __ceph_remove_cap(cap);
810 spin_unlock(&inode->i_lock);
811}
812extern void ceph_put_cap(struct ceph_cap *cap);
813
814extern void ceph_queue_caps_release(struct inode *inode);
815extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
816extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
817extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
818 struct ceph_mds_session *session);
819extern int ceph_get_cap_mds(struct inode *inode);
820extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
821extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
822extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
823 struct ceph_snap_context *snapc);
824extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
825 struct ceph_mds_session **psession);
826extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
827 struct ceph_mds_session *session);
828extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
829extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
830
831extern int ceph_encode_inode_release(void **p, struct inode *inode,
832 int mds, int drop, int unless, int force);
833extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
834 int mds, int drop, int unless);
835
836extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
837 int *got, loff_t endoff);
838
839/* for counting open files by mode */
840static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
841{
842 ci->i_nr_by_mode[mode]++;
843}
844extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
845
846/* addr.c */
847extern const struct address_space_operations ceph_aops;
848extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
849
850/* file.c */
851extern const struct file_operations ceph_file_fops;
852extern const struct address_space_operations ceph_aops;
853extern int ceph_open(struct inode *inode, struct file *file);
854extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
855 struct nameidata *nd, int mode,
856 int locked_dir);
857extern int ceph_release(struct inode *inode, struct file *filp);
858extern void ceph_release_page_vector(struct page **pages, int num_pages);
859
860/* dir.c */
861extern const struct file_operations ceph_dir_fops;
862extern const struct inode_operations ceph_dir_iops;
863extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
864 ceph_snapdir_dentry_ops;
865
866extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
867extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
868 struct dentry *dentry, int err);
869
870extern void ceph_dentry_lru_add(struct dentry *dn);
871extern void ceph_dentry_lru_touch(struct dentry *dn);
872extern void ceph_dentry_lru_del(struct dentry *dn);
873
874/*
875 * our d_ops vary depending on whether the inode is live,
876 * snapshotted (read-only), or a virtual ".snap" directory.
877 */
878int ceph_init_dentry(struct dentry *dentry);
879
880
881/* ioctl.c */
882extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
883
884/* export.c */
885extern const struct export_operations ceph_export_ops;
886
887/* debugfs.c */
888extern int ceph_debugfs_init(void);
889extern void ceph_debugfs_cleanup(void);
890extern int ceph_debugfs_client_init(struct ceph_client *client);
891extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
892
893static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
894{
895 if (dentry && dentry->d_parent)
896 return dentry->d_parent->d_inode;
897
898 return NULL;
899}
900
901#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
new file mode 100644
index 000000000000..28b35a005ec2
--- /dev/null
+++ b/fs/ceph/types.h
@@ -0,0 +1,29 @@
1#ifndef _FS_CEPH_TYPES_H
2#define _FS_CEPH_TYPES_H
3
4/* needed before including ceph_fs.h */
5#include <linux/in.h>
6#include <linux/types.h>
7#include <linux/fcntl.h>
8#include <linux/string.h>
9
10#include "ceph_fs.h"
11#include "ceph_frag.h"
12#include "ceph_hash.h"
13
14/*
15 * Identify inodes by both their ino AND snapshot id (a u64).
16 */
17struct ceph_vino {
18 u64 ino;
19 u64 snap;
20};
21
22
23/* context for the caps reservation mechanism */
24struct ceph_cap_reservation {
25 int count;
26};
27
28
29#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
new file mode 100644
index 000000000000..37d6ce645691
--- /dev/null
+++ b/fs/ceph/xattr.c
@@ -0,0 +1,844 @@
1#include "ceph_debug.h"
2#include "super.h"
3#include "decode.h"
4
5#include <linux/xattr.h>
6
7static bool ceph_is_valid_xattr(const char *name)
8{
9 return !strncmp(name, XATTR_SECURITY_PREFIX,
10 XATTR_SECURITY_PREFIX_LEN) ||
11 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
12 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
13}
14
15/*
16 * These define virtual xattrs exposing the recursive directory
17 * statistics and layout metadata.
18 */
19struct ceph_vxattr_cb {
20 bool readonly;
21 char *name;
22 size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
23 size_t size);
24};
25
26/* directories */
27
28static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
29 size_t size)
30{
31 return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
32}
33
34static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
35 size_t size)
36{
37 return snprintf(val, size, "%lld", ci->i_files);
38}
39
40static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
41 size_t size)
42{
43 return snprintf(val, size, "%lld", ci->i_subdirs);
44}
45
46static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
47 size_t size)
48{
49 return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
50}
51
52static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
53 size_t size)
54{
55 return snprintf(val, size, "%lld", ci->i_rfiles);
56}
57
58static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
59 size_t size)
60{
61 return snprintf(val, size, "%lld", ci->i_rsubdirs);
62}
63
64static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
65 size_t size)
66{
67 return snprintf(val, size, "%lld", ci->i_rbytes);
68}
69
70static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
71 size_t size)
72{
73 return snprintf(val, size, "%ld.%09ld", (long)ci->i_rctime.tv_sec,
74 (long)ci->i_rctime.tv_nsec);
75}
76
77static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
78 { true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
79 { true, "user.ceph.dir.files", ceph_vxattrcb_files},
80 { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
81 { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
82 { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
83 { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
84 { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
85 { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
86 { true, NULL, NULL }
87};
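/*
 * Example usage (hypothetical mount point): the virtual xattrs above are
 * read like ordinary xattrs, e.g.
 *
 *   $ getfattr -n user.ceph.dir.rbytes --only-values /mnt/ceph/dir
 *
 * which returns the recursive byte count maintained by the MDS.
 */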
88
89/* files */
90
91static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
92 size_t size)
93{
94 int ret;
95
96 ret = snprintf(val, size,
97 "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
98 (unsigned long long)ceph_file_layout_su(ci->i_layout),
99 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
100 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
101 if (ceph_file_layout_pg_preferred(ci->i_layout))
102 ret += snprintf(val + ret, size > ret ? size - ret : 0, "preferred_osd=%lld\n",
103 (unsigned long long)ceph_file_layout_pg_preferred(
104 ci->i_layout));
105 return ret;
106}
107
108static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
109 { true, "user.ceph.layout", ceph_vxattrcb_layout},
110 { true, NULL, NULL }
111};
112
113static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
114{
115 if (S_ISDIR(inode->i_mode))
116 return ceph_dir_vxattrs;
117 else if (S_ISREG(inode->i_mode))
118 return ceph_file_vxattrs;
119 return NULL;
120}
121
122static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
123 const char *name)
124{
125 do {
126 if (strcmp(vxattr->name, name) == 0)
127 return vxattr;
128 vxattr++;
129 } while (vxattr->name);
130 return NULL;
131}
132
133static int __set_xattr(struct ceph_inode_info *ci,
134 const char *name, int name_len,
135 const char *val, int val_len,
136 int dirty,
137 int should_free_name, int should_free_val,
138 struct ceph_inode_xattr **newxattr)
139{
140 struct rb_node **p;
141 struct rb_node *parent = NULL;
142 struct ceph_inode_xattr *xattr = NULL;
143 int c;
144 int new = 0;
145
146 p = &ci->i_xattrs.index.rb_node;
147 while (*p) {
148 parent = *p;
149 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
150 c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
151 if (c < 0)
152 p = &(*p)->rb_left;
153 else if (c > 0)
154 p = &(*p)->rb_right;
155 else {
156 if (name_len == xattr->name_len)
157 break;
158 else if (name_len < xattr->name_len)
159 p = &(*p)->rb_left;
160 else
161 p = &(*p)->rb_right;
162 }
163 xattr = NULL;
164 }
165
166 if (!xattr) {
167 new = 1;
168 xattr = *newxattr;
169 xattr->name = name;
170 xattr->name_len = name_len;
171 xattr->should_free_name = should_free_name;
172
173 ci->i_xattrs.count++;
174 dout("__set_xattr count=%d\n", ci->i_xattrs.count);
175 } else {
176 kfree(*newxattr);
177 *newxattr = NULL;
178 if (xattr->should_free_val)
179 kfree((void *)xattr->val);
180
181 if (should_free_name) {
182 kfree((void *)name);
183 name = xattr->name;
184 }
185 ci->i_xattrs.names_size -= xattr->name_len;
186 ci->i_xattrs.vals_size -= xattr->val_len;
187 }
188 if (!xattr) {
189 pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
190 &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
191 val);
192 return -ENOMEM;
193 }
194 ci->i_xattrs.names_size += name_len;
195 ci->i_xattrs.vals_size += val_len;
196 if (val)
197 xattr->val = val;
198 else
199 xattr->val = "";
200
201 xattr->val_len = val_len;
202 xattr->dirty = dirty;
203 xattr->should_free_val = (val && should_free_val);
204
205 if (new) {
206 rb_link_node(&xattr->node, parent, p);
207 rb_insert_color(&xattr->node, &ci->i_xattrs.index);
208 dout("__set_xattr_val p=%p\n", p);
209 }
210
211 dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
212 ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
213
214 return 0;
215}
216
217static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
218 const char *name)
219{
220 struct rb_node **p;
221 struct rb_node *parent = NULL;
222 struct ceph_inode_xattr *xattr = NULL;
223 int c;
224
225 p = &ci->i_xattrs.index.rb_node;
226 while (*p) {
227 parent = *p;
228 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
229 c = strncmp(name, xattr->name, xattr->name_len);
230 if (c < 0)
231 p = &(*p)->rb_left;
232 else if (c > 0)
233 p = &(*p)->rb_right;
234 else {
235 dout("__get_xattr %s: found %.*s\n", name,
236 xattr->val_len, xattr->val);
237 return xattr;
238 }
239 }
240
241 dout("__get_xattr %s: not found\n", name);
242
243 return NULL;
244}
245
246static void __free_xattr(struct ceph_inode_xattr *xattr)
247{
248 BUG_ON(!xattr);
249
250 if (xattr->should_free_name)
251 kfree((void *)xattr->name);
252 if (xattr->should_free_val)
253 kfree((void *)xattr->val);
254
255 kfree(xattr);
256}
257
258static int __remove_xattr(struct ceph_inode_info *ci,
259 struct ceph_inode_xattr *xattr)
260{
261 if (!xattr)
262 return -ENODATA;
263
264 rb_erase(&xattr->node, &ci->i_xattrs.index);
265
266 if (xattr->should_free_name)
267 kfree((void *)xattr->name);
268 if (xattr->should_free_val)
269 kfree((void *)xattr->val);
270
271 ci->i_xattrs.names_size -= xattr->name_len;
272 ci->i_xattrs.vals_size -= xattr->val_len;
273 ci->i_xattrs.count--;
274 kfree(xattr);
275
276 return 0;
277}
278
279static int __remove_xattr_by_name(struct ceph_inode_info *ci,
280 const char *name)
281{
282 struct rb_node **p;
283 struct ceph_inode_xattr *xattr;
284 int err;
285
286 p = &ci->i_xattrs.index.rb_node;
287 xattr = __get_xattr(ci, name);
288 err = __remove_xattr(ci, xattr);
289 return err;
290}
291
292static char *__copy_xattr_names(struct ceph_inode_info *ci,
293 char *dest)
294{
295 struct rb_node *p;
296 struct ceph_inode_xattr *xattr = NULL;
297
298 p = rb_first(&ci->i_xattrs.index);
299 dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
300
301 while (p) {
302 xattr = rb_entry(p, struct ceph_inode_xattr, node);
303 memcpy(dest, xattr->name, xattr->name_len);
304 dest[xattr->name_len] = '\0';
305
306 dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
307 xattr->name_len, ci->i_xattrs.names_size);
308
309 dest += xattr->name_len + 1;
310 p = rb_next(p);
311 }
312
313 return dest;
314}
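/*
 * Sketch of the resulting buffer (assumed example values): for cached
 * names "user.a" and "user.bc", dest receives "user.a\0user.bc\0", i.e.
 * names_size (6 + 7) bytes plus count (2) trailing nulls -- the same
 * arithmetic ceph_listxattr() uses below to size the name list.
 */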
315
316void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
317{
318 struct rb_node *p, *tmp;
319 struct ceph_inode_xattr *xattr = NULL;
320
321 p = rb_first(&ci->i_xattrs.index);
322
323 dout("__ceph_destroy_xattrs p=%p\n", p);
324
325 while (p) {
326 xattr = rb_entry(p, struct ceph_inode_xattr, node);
327 tmp = p;
328 p = rb_next(tmp);
329 dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
330 xattr->name_len, xattr->name);
331 rb_erase(tmp, &ci->i_xattrs.index);
332
333 __free_xattr(xattr);
334 }
335
336 ci->i_xattrs.names_size = 0;
337 ci->i_xattrs.vals_size = 0;
338 ci->i_xattrs.index_version = 0;
339 ci->i_xattrs.count = 0;
340 ci->i_xattrs.index = RB_ROOT;
341}
342
343static int __build_xattrs(struct inode *inode)
344{
345 u32 namelen;
346 u32 numattr = 0;
347 void *p, *end;
348 u32 len;
349 const char *name, *val;
350 struct ceph_inode_info *ci = ceph_inode(inode);
351 int xattr_version;
352 struct ceph_inode_xattr **xattrs = NULL;
353 int err = 0;
354 int i;
355
356 dout("__build_xattrs() len=%d\n",
357 ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
358
359 if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
360 return 0; /* already built */
361
362 __ceph_destroy_xattrs(ci);
363
364start:
365 /* update the internal xattr rb tree */
366 if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
367 p = ci->i_xattrs.blob->vec.iov_base;
368 end = p + ci->i_xattrs.blob->vec.iov_len;
369 ceph_decode_32_safe(&p, end, numattr, bad);
370 xattr_version = ci->i_xattrs.version;
371 spin_unlock(&inode->i_lock);
372
373 xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *),
374 GFP_NOFS);
375 err = -ENOMEM;
376 if (!xattrs)
377 goto bad_lock;
378 /* kcalloc has already zeroed the pointer array */
379 for (i = 0; i < numattr; i++) {
380 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
381 GFP_NOFS);
382 if (!xattrs[i])
383 goto bad_lock;
384 }
385
386 spin_lock(&inode->i_lock);
387 if (ci->i_xattrs.version != xattr_version) {
388 /* lost a race, retry */
389 for (i = 0; i < numattr; i++)
390 kfree(xattrs[i]);
391 kfree(xattrs);
392 goto start;
393 }
394 err = -EIO;
395 while (numattr--) {
396 ceph_decode_32_safe(&p, end, len, bad);
397 namelen = len;
398 name = p;
399 p += len;
400 ceph_decode_32_safe(&p, end, len, bad);
401 val = p;
402 p += len;
403
404 err = __set_xattr(ci, name, namelen, val, len,
405 0, 0, 0, &xattrs[numattr]);
406
407 if (err < 0)
408 goto bad;
409 }
410 kfree(xattrs);
411 }
412 ci->i_xattrs.index_version = ci->i_xattrs.version;
413 ci->i_xattrs.dirty = false;
414
415 return err;
416bad_lock:
417 spin_lock(&inode->i_lock);
418bad:
419 if (xattrs) {
420 for (i = 0; i < numattr; i++)
421 kfree(xattrs[i]);
422 kfree(xattrs);
423 }
424 ci->i_xattrs.names_size = 0;
425 return err;
426}
427
428static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
429 int val_size)
430{
431 /*
432 * 4 bytes for the length, plus 4 bytes per xattr name and
433 * 4 bytes per value
434 */
435 int size = 4 + ci->i_xattrs.count*(4 + 4) +
436 ci->i_xattrs.names_size +
437 ci->i_xattrs.vals_size;
438 dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
439 ci->i_xattrs.count, ci->i_xattrs.names_size,
440 ci->i_xattrs.vals_size);
441
442 if (name_size)
443 size += 4 + 4 + name_size + val_size;
444
445 return size;
446}
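/*
 * Worked example (hypothetical values): with two cached xattrs
 * "user.a"="1" and "user.bc"="23", count = 2, names_size = 6 + 7 = 13
 * and vals_size = 1 + 2 = 3, so the blob needs
 * 4 + 2*(4 + 4) + 13 + 3 = 36 bytes; passing name_size=6/val_size=1
 * for a pending "user.d"="4" adds 4 + 4 + 6 + 1 = 15 more.
 */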
447
448/*
449 * If there are dirty xattrs, reencode xattrs into the prealloc_blob
450 * and swap into place.
451 */
452void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
453{
454 struct rb_node *p;
455 struct ceph_inode_xattr *xattr = NULL;
456 void *dest;
457
458 dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
459 if (ci->i_xattrs.dirty) {
460 int need = __get_required_blob_size(ci, 0, 0);
461
462 BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
463
464 p = rb_first(&ci->i_xattrs.index);
465 dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
466
467 ceph_encode_32(&dest, ci->i_xattrs.count);
468 while (p) {
469 xattr = rb_entry(p, struct ceph_inode_xattr, node);
470
471 ceph_encode_32(&dest, xattr->name_len);
472 memcpy(dest, xattr->name, xattr->name_len);
473 dest += xattr->name_len;
474 ceph_encode_32(&dest, xattr->val_len);
475 memcpy(dest, xattr->val, xattr->val_len);
476 dest += xattr->val_len;
477
478 p = rb_next(p);
479 }
480
481 /* adjust buffer len; it may be larger than we need */
482 ci->i_xattrs.prealloc_blob->vec.iov_len =
483 dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
484
485 if (ci->i_xattrs.blob)
486 ceph_buffer_put(ci->i_xattrs.blob);
487 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
488 ci->i_xattrs.prealloc_blob = NULL;
489 ci->i_xattrs.dirty = false;
490 }
491}
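/*
 * Resulting wire format, as encoded above (sketch):
 *
 *	le32 count | le32 name_len, name bytes | le32 val_len, val bytes | ...
 *
 * which is exactly the layout __build_xattrs() decodes on the way back in.
 */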
492
493ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
494 size_t size)
495{
496 struct inode *inode = dentry->d_inode;
497 struct ceph_inode_info *ci = ceph_inode(inode);
498 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
499 int err;
500 struct ceph_inode_xattr *xattr;
501 struct ceph_vxattr_cb *vxattr = NULL;
502
503 if (!ceph_is_valid_xattr(name))
504 return -ENODATA;
505
506 /* let's see if a virtual xattr was requested */
507 if (vxattrs)
508 vxattr = ceph_match_vxattr(vxattrs, name);
509
510 spin_lock(&inode->i_lock);
511 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
512 ci->i_xattrs.version, ci->i_xattrs.index_version);
513
514 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
515 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
516 goto get_xattr;
517 } else {
518 spin_unlock(&inode->i_lock);
519 /* get xattrs from mds (if we don't already have them) */
520 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
521 if (err)
522 return err;
523 }
524
525 spin_lock(&inode->i_lock);
526
527 if (vxattr && vxattr->readonly) {
528 err = vxattr->getxattr_cb(ci, value, size);
529 goto out;
530 }
531
532 err = __build_xattrs(inode);
533 if (err < 0)
534 goto out;
535
536get_xattr:
537 err = -ENODATA; /* == ENOATTR */
538 xattr = __get_xattr(ci, name);
539 if (!xattr) {
540 if (vxattr)
541 err = vxattr->getxattr_cb(ci, value, size);
542 goto out;
543 }
544
545 err = -ERANGE;
546 if (size && size < xattr->val_len)
547 goto out;
548
549 err = xattr->val_len;
550 if (size == 0)
551 goto out;
552
553 memcpy(value, xattr->val, xattr->val_len);
554
555out:
556 spin_unlock(&inode->i_lock);
557 return err;
558}
559
560ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
561{
562 struct inode *inode = dentry->d_inode;
563 struct ceph_inode_info *ci = ceph_inode(inode);
564 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
565 u32 vir_namelen = 0;
566 u32 namelen;
567 int err;
568 u32 len;
569 int i;
570
571 spin_lock(&inode->i_lock);
572 dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
573 ci->i_xattrs.version, ci->i_xattrs.index_version);
574
575 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
576 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
577 goto list_xattr;
578 } else {
579 spin_unlock(&inode->i_lock);
580 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
581 if (err)
582 return err;
583 }
584
585 spin_lock(&inode->i_lock);
586
587 err = __build_xattrs(inode);
588 if (err < 0)
589 goto out;
590
591list_xattr:
592 vir_namelen = 0;
593 /* include virtual xattrs */
594 if (vxattrs)
595 for (i = 0; vxattrs[i].name; i++)
596 vir_namelen += strlen(vxattrs[i].name) + 1;
597 /* add one byte per name for the trailing null */
598 namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
599 err = -ERANGE;
600 if (size && namelen > size)
601 goto out;
602
603 err = namelen;
604 if (size == 0)
605 goto out;
606
607 names = __copy_xattr_names(ci, names);
608
609 /* virtual xattr names, too */
610 if (vxattrs)
611 for (i = 0; vxattrs[i].name; i++) {
612 len = sprintf(names, "%s", vxattrs[i].name);
613 names += len + 1;
614 }
615
616out:
617 spin_unlock(&inode->i_lock);
618 return err;
619}
620
621static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
622 const char *value, size_t size, int flags)
623{
624 struct ceph_client *client = ceph_client(dentry->d_sb);
625 struct inode *inode = dentry->d_inode;
626 struct ceph_inode_info *ci = ceph_inode(inode);
627 struct inode *parent_inode = dentry->d_parent->d_inode;
628 struct ceph_mds_request *req;
629 struct ceph_mds_client *mdsc = &client->mdsc;
630 int err;
631 int i, nr_pages;
632 struct page **pages = NULL;
633 void *kaddr;
634
635 /* copy value into some pages */
636 nr_pages = calc_pages_for(0, size);
637 if (nr_pages) {
638 pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
639 if (!pages)
640 return -ENOMEM;
641 err = -ENOMEM;
642 for (i = 0; i < nr_pages; i++) {
643 pages[i] = alloc_page(GFP_NOFS);
644 if (!pages[i]) {
645 nr_pages = i;
646 goto out;
647 }
648 kaddr = kmap(pages[i]);
649 memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
650 min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
651 }
652 }
653
654 dout("setxattr value=%.*s\n", (int)size, value);
655
656 /* do request */
657 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
658 USE_AUTH_MDS);
659 if (IS_ERR(req)) {
660 err = PTR_ERR(req);
661 goto out;
662 }
663 req->r_inode = igrab(inode);
664 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
665 req->r_num_caps = 1;
666 req->r_args.setxattr.flags = cpu_to_le32(flags);
667 req->r_path2 = kstrdup(name, GFP_NOFS);
668
669 req->r_pages = pages;
670 req->r_num_pages = nr_pages;
671 req->r_data_len = size;
672
673 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
674 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
675 ceph_mdsc_put_request(req);
676 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
677
678out:
679 if (pages) {
680 for (i = 0; i < nr_pages; i++)
681 __free_page(pages[i]);
682 kfree(pages);
683 }
684 return err;
685}
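/*
 * Page math above (example, assuming PAGE_CACHE_SIZE = 4096): a
 * 5000-byte value yields nr_pages = 2; page 0 gets bytes 0..4095 and
 * page 1 the remaining 904 via min(PAGE_CACHE_SIZE, size - i*PAGE_CACHE_SIZE).
 */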
686
687int ceph_setxattr(struct dentry *dentry, const char *name,
688 const void *value, size_t size, int flags)
689{
690 struct inode *inode = dentry->d_inode;
691 struct ceph_inode_info *ci = ceph_inode(inode);
692 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
693 int err;
694 int name_len = strlen(name);
695 int val_len = size;
696 char *newname = NULL;
697 char *newval = NULL;
698 struct ceph_inode_xattr *xattr = NULL;
699 int issued;
700 int required_blob_size;
701
702 if (ceph_snap(inode) != CEPH_NOSNAP)
703 return -EROFS;
704
705 if (!ceph_is_valid_xattr(name))
706 return -EOPNOTSUPP;
707
708 if (vxattrs) {
709 struct ceph_vxattr_cb *vxattr =
710 ceph_match_vxattr(vxattrs, name);
711 if (vxattr && vxattr->readonly)
712 return -EOPNOTSUPP;
713 }
714
715 /* preallocate memory for xattr name, value, index node */
716 err = -ENOMEM;
717 newname = kmalloc(name_len + 1, GFP_NOFS);
718 if (!newname)
719 goto out;
720 memcpy(newname, name, name_len + 1);
721
722 if (val_len) {
723 newval = kmalloc(val_len + 1, GFP_NOFS);
724 if (!newval)
725 goto out;
726 memcpy(newval, value, val_len);
727 newval[val_len] = '\0';
728 }
729
730 xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
731 if (!xattr)
732 goto out;
733
734 spin_lock(&inode->i_lock);
735retry:
736 issued = __ceph_caps_issued(ci, NULL);
737 if (!(issued & CEPH_CAP_XATTR_EXCL))
738 goto do_sync;
739 __build_xattrs(inode);
740
741 required_blob_size = __get_required_blob_size(ci, name_len, val_len);
742
743 if (!ci->i_xattrs.prealloc_blob ||
744 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
745 struct ceph_buffer *blob = NULL;
746
747 spin_unlock(&inode->i_lock);
748 dout(" preaallocating new blob size=%d\n", required_blob_size);
749 blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
750 if (!blob)
751 goto out;
752 spin_lock(&inode->i_lock);
753 if (ci->i_xattrs.prealloc_blob)
754 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
755 ci->i_xattrs.prealloc_blob = blob;
756 goto retry;
757 }
758
759 dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
760 err = __set_xattr(ci, newname, name_len, newval,
761 val_len, 1, 1, 1, &xattr);
762 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
763 ci->i_xattrs.dirty = true;
764 inode->i_ctime = CURRENT_TIME;
765 spin_unlock(&inode->i_lock);
766
767 return err;
768
769do_sync:
770 spin_unlock(&inode->i_lock);
771 err = ceph_sync_setxattr(dentry, name, value, size, flags);
772out:
773 kfree(newname);
774 kfree(newval);
775 kfree(xattr);
776 return err;
777}
778
779static int ceph_send_removexattr(struct dentry *dentry, const char *name)
780{
781 struct ceph_client *client = ceph_client(dentry->d_sb);
782 struct ceph_mds_client *mdsc = &client->mdsc;
783 struct inode *inode = dentry->d_inode;
784 struct inode *parent_inode = dentry->d_parent->d_inode;
785 struct ceph_mds_request *req;
786 int err;
787
788 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
789 USE_AUTH_MDS);
790 if (IS_ERR(req))
791 return PTR_ERR(req);
792 req->r_inode = igrab(inode);
793 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
794 req->r_num_caps = 1;
795 req->r_path2 = kstrdup(name, GFP_NOFS);
796
797 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
798 ceph_mdsc_put_request(req);
799 return err;
800}
801
802int ceph_removexattr(struct dentry *dentry, const char *name)
803{
804 struct inode *inode = dentry->d_inode;
805 struct ceph_inode_info *ci = ceph_inode(inode);
806 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
807 int issued;
808 int err;
809
810 if (ceph_snap(inode) != CEPH_NOSNAP)
811 return -EROFS;
812
813 if (!ceph_is_valid_xattr(name))
814 return -EOPNOTSUPP;
815
816 if (vxattrs) {
817 struct ceph_vxattr_cb *vxattr =
818 ceph_match_vxattr(vxattrs, name);
819 if (vxattr && vxattr->readonly)
820 return -EOPNOTSUPP;
821 }
822
823 spin_lock(&inode->i_lock);
824 __build_xattrs(inode);
825 issued = __ceph_caps_issued(ci, NULL);
826 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
827
828 if (!(issued & CEPH_CAP_XATTR_EXCL))
829 goto do_sync;
830
831 err = __remove_xattr_by_name(ceph_inode(inode), name);
832 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
833 ci->i_xattrs.dirty = true;
834 inode->i_ctime = CURRENT_TIME;
835
836 spin_unlock(&inode->i_lock);
837
838 return err;
839do_sync:
840 spin_unlock(&inode->i_lock);
841 err = ceph_send_removexattr(dentry, name);
842 return err;
843}
844
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 49503d2edc7e..bc0025cdd1c9 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,6 +1,7 @@
 Version 1.62
 ------------
-Add sockopt=TCP_NODELAY mount option.
+Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened
+to more strictly handle corrupt frames.
 
 Version 1.61
 ------------
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 20692fbfdb24..a20bea598933 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -136,7 +136,7 @@ asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val)
 		return 0;
 	}
 
-	ch = *(ctx->pointer)++;	/* ch has 0xa, ptr points to lenght octet */
+	ch = *(ctx->pointer)++;	/* ch has 0xa, ptr points to length octet */
 	if ((ch) == ASN1_ENUM)	/* if ch value is ENUM, 0xa */
 		*val = *(++(ctx->pointer));	/* value has enum value */
 	else
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index b44ce0a0711c..b1d61d0bdfc7 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -54,7 +54,7 @@ void cifs_dfs_release_automount_timer(void)
  * Extracts sharename form full UNC.
  * i.e. strips from UNC trailing path that is not part of share
  * name and fixup missing '\' in the begining of DFS node refferal
- * if neccessary.
+ * if necessary.
  * Returns pointer to share name on success or ERR_PTR on error.
  * Caller is responsible for freeing returned string.
  */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8c6a03627176..5183bc2a1916 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -312,6 +312,7 @@ cifs_alloc_inode(struct super_block *sb)
 	cifs_inode->clientCanCacheRead = false;
 	cifs_inode->clientCanCacheAll = false;
 	cifs_inode->delete_pending = false;
+	cifs_inode->invalid_mapping = false;
 	cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
 	cifs_inode->server_eof = 0;
 
@@ -638,7 +639,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 		   setting the revalidate time to zero */
 		CIFS_I(file->f_path.dentry->d_inode)->time = 0;
 
-		retval = cifs_revalidate(file->f_path.dentry);
+		retval = cifs_revalidate_file(file);
 		if (retval < 0)
 			return (loff_t)retval;
 	}
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 78c1b86d55f6..7aa57ecdc437 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,7 +61,8 @@ extern int cifs_mkdir(struct inode *, struct dentry *, int);
 extern int cifs_rmdir(struct inode *, struct dentry *);
 extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
 		       struct dentry *);
-extern int cifs_revalidate(struct dentry *);
+extern int cifs_revalidate_file(struct file *filp);
+extern int cifs_revalidate_dentry(struct dentry *);
 extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int cifs_setattr(struct dentry *, struct iattr *);
 
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index ed751bb657db..63c89d1d70b5 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -205,7 +205,7 @@ struct cifsUidInfo {
 struct cifsSesInfo {
 	struct list_head smb_ses_list;
 	struct list_head tcon_list;
-	struct semaphore sesSem;
+	struct mutex session_mutex;
 #if 0
 	struct cifsUidInfo *uidInfo;	/* pointer to user info */
 #endif
@@ -389,6 +389,7 @@ struct cifsInodeInfo {
 	bool clientCanCacheRead:1;	/* read oplock */
 	bool clientCanCacheAll:1;	/* read and writebehind oplock */
 	bool delete_pending:1;		/* DELETE_ON_CLOSE is set */
+	bool invalid_mapping:1;		/* pagecache is invalid */
 	u64 server_eof;			/* current file size on server */
 	u64 uniqueid;			/* server inode number */
 	struct inode vfs_inode;
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 3877737f96a6..14d036d8db11 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -415,10 +415,10 @@ struct smb_hdr {
 	__u8 WordCount;
 } __attribute__((packed));
 /* given a pointer to an smb_hdr retrieve the value of byte count */
-#define BCC(smb_var) (*(__u16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount)))
-#define BCC_LE(smb_var) (*(__le16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount)))
+#define BCC(smb_var) (*(__u16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount)))
+#define BCC_LE(smb_var) (*(__le16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount)))
 /* given a pointer to an smb_hdr retrieve the pointer to the byte area */
-#define pByteArea(smb_var) ((unsigned char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount) + 2)
+#define pByteArea(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount) + 2)
 
 /*
  * Computer Name Length (since Netbios name was length 16 with last byte 0x20)
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 5646727e33f5..39e47f46dea5 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -104,10 +104,12 @@ extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
 extern struct inode *cifs_iget(struct super_block *sb,
 			       struct cifs_fattr *fattr);
 
+extern int cifs_get_file_info(struct file *filp);
 extern int cifs_get_inode_info(struct inode **pinode,
 			const unsigned char *search_path,
 			FILE_ALL_INFO *pfile_info,
 			struct super_block *sb, int xid, const __u16 *pfid);
+extern int cifs_get_file_info_unix(struct file *filp);
 extern int cifs_get_inode_info_unix(struct inode **pinode,
 			const unsigned char *search_path,
 			struct super_block *sb, int xid);
@@ -142,6 +144,8 @@ extern int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
 			const __u16 search_handle);
 
+extern int CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
+			u16 netfid, FILE_ALL_INFO *pFindData);
 extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
 			const unsigned char *searchName,
 			FILE_ALL_INFO *findData,
@@ -152,6 +156,8 @@ extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
 			FILE_ALL_INFO *findData,
 			const struct nls_table *nls_codepage, int remap);
 
+extern int CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
+			u16 netfid, FILE_UNIX_BASIC_INFO *pFindData);
 extern int CIFSSMBUnixQPathInfo(const int xid,
 			struct cifsTconInfo *tcon,
 			const unsigned char *searchName,
@@ -363,13 +369,10 @@ extern int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
 		__u32 filter, struct file *file, int multishot,
 		const struct nls_table *nls_codepage);
 extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
-		const unsigned char *searchName, char *EAData,
+		const unsigned char *searchName,
+		const unsigned char *ea_name, char *EAData,
 		size_t bufsize, const struct nls_table *nls_codepage,
 		int remap_special_chars);
-extern ssize_t CIFSSMBQueryEA(const int xid, struct cifsTconInfo *tcon,
-		const unsigned char *searchName, const unsigned char *ea_name,
-		unsigned char *ea_value, size_t buf_size,
-		const struct nls_table *nls_codepage, int remap_special_chars);
 extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon,
 		const char *fileName, const char *ea_name,
 		const void *ea_value, const __u16 ea_value_len,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 941441d3e386..7cc7f83e9314 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -170,19 +170,19 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 	 * need to prevent multiple threads trying to simultaneously
 	 * reconnect the same SMB session
 	 */
-	down(&ses->sesSem);
+	mutex_lock(&ses->session_mutex);
 	if (ses->need_reconnect)
 		rc = cifs_setup_session(0, ses, nls_codepage);
 
 	/* do we need to reconnect tcon? */
 	if (rc || !tcon->need_reconnect) {
-		up(&ses->sesSem);
+		mutex_unlock(&ses->session_mutex);
 		goto out;
 	}
 
 	mark_open_files_invalid(tcon);
 	rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
-	up(&ses->sesSem);
+	mutex_unlock(&ses->session_mutex);
 	cFYI(1, ("reconnect tcon rc = %d", rc));
 
 	if (rc)
@@ -500,7 +500,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	} else if (pSMBr->hdr.WordCount == 13) {
 		cERROR(1, ("mount failed, cifs module not built "
 			   "with CIFS_WEAK_PW_HASH support"));
-			rc = -EOPNOTSUPP;
+		rc = -EOPNOTSUPP;
 #endif	/* WEAK_PW_HASH */
 		goto neg_err_exit;
 	} else if (pSMBr->hdr.WordCount != 17) {
@@ -700,13 +700,13 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 	if (!ses || !ses->server)
 		return -EIO;
 
-	down(&ses->sesSem);
+	mutex_lock(&ses->session_mutex);
 	if (ses->need_reconnect)
 		goto session_already_dead; /* no need to send SMBlogoff if uid
 					      already closed due to reconnect */
 	rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB);
 	if (rc) {
-		up(&ses->sesSem);
+		mutex_unlock(&ses->session_mutex);
 		return rc;
 	}
 
@@ -721,7 +721,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 	pSMB->AndXCommand = 0xFF;
 	rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0);
 session_already_dead:
-	up(&ses->sesSem);
+	mutex_unlock(&ses->session_mutex);
 
 	/* if session dead then we do not need to do ulogoff,
 		since server closed smb session, no sense reporting
@@ -3230,8 +3230,72 @@ QInfRetry:
 	return rc;
 }
 
+int
+CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
+		 u16 netfid, FILE_ALL_INFO *pFindData)
+{
+	struct smb_t2_qfi_req *pSMB = NULL;
+	struct smb_t2_qfi_rsp *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned;
+	__u16 params, byte_count;
+
+QFileInfoRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	params = 2 /* level */ + 2 /* fid */;
+	pSMB->t2.TotalDataCount = 0;
+	pSMB->t2.MaxParameterCount = cpu_to_le16(4);
+	/* BB find exact max data count below from sess structure BB */
+	pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
+	pSMB->t2.MaxSetupCount = 0;
+	pSMB->t2.Reserved = 0;
+	pSMB->t2.Flags = 0;
+	pSMB->t2.Timeout = 0;
+	pSMB->t2.Reserved2 = 0;
+	pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
+					       Fid) - 4);
+	pSMB->t2.DataCount = 0;
+	pSMB->t2.DataOffset = 0;
+	pSMB->t2.SetupCount = 1;
+	pSMB->t2.Reserved3 = 0;
+	pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
+	byte_count = params + 1 /* pad */ ;
+	pSMB->t2.TotalParameterCount = cpu_to_le16(params);
+	pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
+	pSMB->Pad = 0;
+	pSMB->Fid = netfid;
+	pSMB->hdr.smb_buf_length += byte_count;
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cFYI(1, ("Send error in QFileInfo = %d", rc));
+	} else {		/* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
+		if (rc) /* BB add auto retry on EOPNOTSUPP? */
+			rc = -EIO;
+		else if (pSMBr->ByteCount < 40)
+			rc = -EIO;	/* bad smb */
+		else if (pFindData) {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			memcpy((char *) pFindData,
+			       (char *) &pSMBr->hdr.Protocol +
+			       data_offset, sizeof(FILE_ALL_INFO));
+		} else
+			rc = -ENOMEM;
+	}
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto QFileInfoRetry;
 
+	return rc;
+}
 
 int
 CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
@@ -3335,6 +3399,75 @@ QPathInfoRetry:
 }
 
 int
+CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
+		     u16 netfid, FILE_UNIX_BASIC_INFO *pFindData)
+{
+	struct smb_t2_qfi_req *pSMB = NULL;
+	struct smb_t2_qfi_rsp *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned;
+	__u16 params, byte_count;
+
+UnixQFileInfoRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	params = 2 /* level */ + 2 /* fid */;
+	pSMB->t2.TotalDataCount = 0;
+	pSMB->t2.MaxParameterCount = cpu_to_le16(4);
+	/* BB find exact max data count below from sess structure BB */
+	pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
+	pSMB->t2.MaxSetupCount = 0;
+	pSMB->t2.Reserved = 0;
+	pSMB->t2.Flags = 0;
+	pSMB->t2.Timeout = 0;
+	pSMB->t2.Reserved2 = 0;
+	pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
+					       Fid) - 4);
+	pSMB->t2.DataCount = 0;
+	pSMB->t2.DataOffset = 0;
+	pSMB->t2.SetupCount = 1;
+	pSMB->t2.Reserved3 = 0;
+	pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
+	byte_count = params + 1 /* pad */ ;
+	pSMB->t2.TotalParameterCount = cpu_to_le16(params);
+	pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
+	pSMB->Pad = 0;
+	pSMB->Fid = netfid;
+	pSMB->hdr.smb_buf_length += byte_count;
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cFYI(1, ("Send error in UnixQFileInfo = %d", rc));
+	} else {		/* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+		if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
+			cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n"
+				   "Unix Extensions can be disabled on mount "
+				   "by specifying the nosfu mount option."));
+			rc = -EIO;	/* bad smb */
+		} else {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			memcpy((char *) pFindData,
+			       (char *) &pSMBr->hdr.Protocol +
+			       data_offset,
+			       sizeof(FILE_UNIX_BASIC_INFO));
+		}
+	}
+
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto UnixQFileInfoRetry;
+
+	return rc;
+}
+
+int
 CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
 		     const unsigned char *searchName,
 		     FILE_UNIX_BASIC_INFO *pFindData,
@@ -3886,7 +4019,7 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 		goto parse_DFS_referrals_exit;
 	}
 
-	/* collect neccessary data from referrals */
+	/* collect necessary data from referrals */
 	for (i = 0; i < *num_of_nodes; i++) {
 		char *temp;
 		int max_len;
@@ -5269,22 +5402,34 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
 	cifs_buf_release(pSMB);
 	return rc;
 }
+
 #ifdef CONFIG_CIFS_XATTR
+/*
+ * Do a path-based QUERY_ALL_EAS call and parse the result. This is a common
+ * function used by listxattr and getxattr type calls. When ea_name is set,
+ * it looks for that attribute name and stuffs that value into the EAData
+ * buffer. When ea_name is NULL, it stuffs a list of attribute names into the
+ * buffer. In both cases, the return value is either the length of the
+ * resulting data or a negative error code. If EAData is a NULL pointer then
+ * the data isn't copied to it, but the length is returned.
+ */
 ssize_t
 CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
-	       const unsigned char *searchName,
+	       const unsigned char *searchName, const unsigned char *ea_name,
 	       char *EAData, size_t buf_size,
 	       const struct nls_table *nls_codepage, int remap)
 {
 	/* BB assumes one setup word */
 	TRANSACTION2_QPI_REQ *pSMB = NULL;
 	TRANSACTION2_QPI_RSP *pSMBr = NULL;
 	int rc = 0;
 	int bytes_returned;
-	int name_len;
+	int list_len;
+	struct fealist *ea_response_data;
 	struct fea *temp_fea;
 	char *temp_ptr;
-	__u16 params, byte_count;
+	char *end_of_smb;
+	__u16 params, byte_count, data_offset;
 
 	cFYI(1, ("In Query All EAs path %s", searchName));
 QAllEAsRetry:
@@ -5294,22 +5439,22 @@ QAllEAsRetry:
 		return rc;
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-		name_len =
+		list_len =
 		    cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
 				     PATH_MAX, nls_codepage, remap);
-		name_len++;	/* trailing null */
-		name_len *= 2;
+		list_len++;	/* trailing null */
+		list_len *= 2;
 	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(searchName, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->FileName, searchName, name_len);
+		list_len = strnlen(searchName, PATH_MAX);
+		list_len++;	/* trailing null */
+		strncpy(pSMB->FileName, searchName, list_len);
 	}
 
-	params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */;
+	params = 2 /* level */ + 4 /* reserved */ + list_len /* includes NUL */;
 	pSMB->TotalDataCount = 0;
 	pSMB->MaxParameterCount = cpu_to_le16(2);
 	/* BB find exact max SMB PDU from sess structure BB */
-	pSMB->MaxDataCount = cpu_to_le16(4000);
+	pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
 	pSMB->MaxSetupCount = 0;
 	pSMB->Reserved = 0;
 	pSMB->Flags = 0;
@@ -5334,237 +5479,117 @@ QAllEAsRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
 		cFYI(1, ("Send error in QueryAllEAs = %d", rc));
-	} else {		/* decode response */
-		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
-
-		/* BB also check enough total bytes returned */
-		/* BB we need to improve the validity checking
-		   of these trans2 responses */
-		if (rc || (pSMBr->ByteCount < 4))
-			rc = -EIO;	/* bad smb */
-	   /* else if (pFindData){
-			memcpy((char *) pFindData,
-			       (char *) &pSMBr->hdr.Protocol +
-			       data_offset, kl);
-		}*/ else {
-			/* check that length of list is not more than bcc */
-			/* check that each entry does not go beyond length
-			   of list */
-			/* check that each element of each entry does not
-			   go beyond end of list */
-			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
-			struct fealist *ea_response_data;
-			rc = 0;
-			/* validate_trans2_offsets() */
-			/* BB check if start of smb + data_offset > &bcc+ bcc */
-			ea_response_data = (struct fealist *)
-				(((char *) &pSMBr->hdr.Protocol) +
-				data_offset);
-			name_len = le32_to_cpu(ea_response_data->list_len);
-			cFYI(1, ("ea length %d", name_len));
-			if (name_len <= 8) {
-			/* returned EA size zeroed at top of function */
-				cFYI(1, ("empty EA list returned from server"));
-			} else {
-				/* account for ea list len */
-				name_len -= 4;
-				temp_fea = ea_response_data->list;
-				temp_ptr = (char *)temp_fea;
-				while (name_len > 0) {
-					__u16 value_len;
-					name_len -= 4;
-					temp_ptr += 4;
-					rc += temp_fea->name_len;
-					/* account for prefix user. and trailing null */
-					rc = rc + 5 + 1;
-					if (rc < (int)buf_size) {
-						memcpy(EAData, "user.", 5);
-						EAData += 5;
-						memcpy(EAData, temp_ptr,
-						       temp_fea->name_len);
-						EAData += temp_fea->name_len;
-						/* null terminate name */
-						*EAData = 0;
-						EAData = EAData + 1;
-					} else if (buf_size == 0) {
-						/* skip copy - calc size only */
-					} else {
-						/* stop before overrun buffer */
-						rc = -ERANGE;
-						break;
-					}
-					name_len -= temp_fea->name_len;
-					temp_ptr += temp_fea->name_len;
-					/* account for trailing null */
-					name_len--;
-					temp_ptr++;
-					value_len =
-						le16_to_cpu(temp_fea->value_len);
-					name_len -= value_len;
-					temp_ptr += value_len;
-					/* BB check that temp_ptr is still
-					   within the SMB BB*/
-
-					/* no trailing null to account for
-					   in value len */
-					/* go on to next EA */
-					temp_fea = (struct fea *)temp_ptr;
-				}
-			}
-		}
-	}
-	cifs_buf_release(pSMB);
-	if (rc == -EAGAIN)
-		goto QAllEAsRetry;
-
-	return (ssize_t)rc;
-}
-
-ssize_t CIFSSMBQueryEA(const int xid, struct cifsTconInfo *tcon,
-		const unsigned char *searchName, const unsigned char *ea_name,
-		unsigned char *ea_value, size_t buf_size,
-		const struct nls_table *nls_codepage, int remap)
-{
-	TRANSACTION2_QPI_REQ *pSMB = NULL;
-	TRANSACTION2_QPI_RSP *pSMBr = NULL;
-	int rc = 0;
-	int bytes_returned;
-	int name_len;
-	struct fea *temp_fea;
-	char *temp_ptr;
-	__u16 params, byte_count;
-
-	cFYI(1, ("In Query EA path %s", searchName));
-QEARetry:
-	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
-		      (void **) &pSMBr);
-	if (rc)
-		return rc;
-
-	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-		name_len =
-		    cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
-				     PATH_MAX, nls_codepage, remap);
-		name_len++;	/* trailing null */
-		name_len *= 2;
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(searchName, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->FileName, searchName, name_len);
-	}
-
-	params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */;
-	pSMB->TotalDataCount = 0;
-	pSMB->MaxParameterCount = cpu_to_le16(2);
-	/* BB find exact max SMB PDU from sess structure BB */
-	pSMB->MaxDataCount = cpu_to_le16(4000);
-	pSMB->MaxSetupCount = 0;
-	pSMB->Reserved = 0;
-	pSMB->Flags = 0;
-	pSMB->Timeout = 0;
-	pSMB->Reserved2 = 0;
-	pSMB->ParameterOffset = cpu_to_le16(offsetof(
-		struct smb_com_transaction2_qpi_req, InformationLevel) - 4);
-	pSMB->DataCount = 0;
-	pSMB->DataOffset = 0;
-	pSMB->SetupCount = 1;
-	pSMB->Reserved3 = 0;
-	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION);
-	byte_count = params + 1 /* pad */ ;
-	pSMB->TotalParameterCount = cpu_to_le16(params);
-	pSMB->ParameterCount = pSMB->TotalParameterCount;
-	pSMB->InformationLevel = cpu_to_le16(SMB_INFO_QUERY_ALL_EAS);
-	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
-	pSMB->ByteCount = cpu_to_le16(byte_count);
-
-	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
-	if (rc) {
-		cFYI(1, ("Send error in Query EA = %d", rc));
-	} else {	/* decode response */
-		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
-
-		/* BB also check enough total bytes returned */
-		/* BB we need to improve the validity checking
-		   of these trans2 responses */
-		if (rc || (pSMBr->ByteCount < 4))
-			rc = -EIO;	/* bad smb */
-	   /* else if (pFindData){
-			memcpy((char *) pFindData,
-			       (char *) &pSMBr->hdr.Protocol +
-			       data_offset, kl);
-		}*/ else {
-			/* check that length of list is not more than bcc */
-			/* check that each entry does not go beyond length
-			   of list */
-			/* check that each element of each entry does not
-			   go beyond end of list */
-			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
-			struct fealist *ea_response_data;
-			rc = -ENODATA;
-			/* validate_trans2_offsets() */
-			/* BB check if start of smb + data_offset > &bcc+ bcc*/
-			ea_response_data = (struct fealist *)
-				(((char *) &pSMBr->hdr.Protocol) +
-				data_offset);
-			name_len = le32_to_cpu(ea_response_data->list_len);
-			cFYI(1, ("ea length %d", name_len));
-			if (name_len <= 8) {
-			/* returned EA size zeroed at top of function */
-				cFYI(1, ("empty EA list returned from server"));
-			} else {
-				/* account for ea list len */
-				name_len -= 4;
-				temp_fea = ea_response_data->list;
-				temp_ptr = (char *)temp_fea;
-				/* loop through checking if we have a matching
-				   name and then return the associated value */
-				while (name_len > 0) {
-					__u16 value_len;
-					name_len -= 4;
-					temp_ptr += 4;
-					value_len =
-						le16_to_cpu(temp_fea->value_len);
-					/* BB validate that value_len falls within SMB,
-					   even though maximum for name_len is 255 */
-					if (memcmp(temp_fea->name, ea_name,
-						   temp_fea->name_len) == 0) {
-						/* found a match */
-						rc = value_len;
-				/* account for prefix user. and trailing null */
-						if (rc <= (int)buf_size) {
-							memcpy(ea_value,
-							       temp_fea->name+temp_fea->name_len+1,
-							       rc);
-							/* ea values, unlike ea
-							   names, are not null
-							   terminated */
-						} else if (buf_size == 0) {
-							/* skip copy - calc size only */
-						} else {
-							/* stop before overrun buffer */
-							rc = -ERANGE;
-						}
-						break;
-					}
-					name_len -= temp_fea->name_len;
-					temp_ptr += temp_fea->name_len;
-					/* account for trailing null */
-					name_len--;
-					temp_ptr++;
-					name_len -= value_len;
-					temp_ptr += value_len;
-					/* No trailing null to account for in
-					   value_len. Go on to next EA */
-					temp_fea = (struct fea *)temp_ptr;
-				}
-			}
-		}
-	}
+		goto QAllEAsOut;
+	}
+
+
+	/* BB also check enough total bytes returned */
+	/* BB we need to improve the validity checking
+	   of these trans2 responses */
+
+	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+	if (rc || (pSMBr->ByteCount < 4)) {
+		rc = -EIO;	/* bad smb */
+		goto QAllEAsOut;
+	}
+
+	/* check that length of list is not more than bcc */
+	/* check that each entry does not go beyond length
+	   of list */
+	/* check that each element of each entry does not
+	   go beyond end of list */
+	/* validate_trans2_offsets() */
+	/* BB check if start of smb + data_offset > &bcc+ bcc */
+
+	data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+	ea_response_data = (struct fealist *)
+				(((char *) &pSMBr->hdr.Protocol) + data_offset);
+
+	list_len = le32_to_cpu(ea_response_data->list_len);
+	cFYI(1, ("ea length %d", list_len));
+	if (list_len <= 8) {
+		cFYI(1, ("empty EA list returned from server"));
+		goto QAllEAsOut;
+	}
+
+	/* make sure list_len doesn't go past end of SMB */
+	end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr);
+	if ((char *)ea_response_data + list_len > end_of_smb) {
+		cFYI(1, ("EA list appears to go beyond SMB"));
+		rc = -EIO;
+		goto QAllEAsOut;
+	}
+
+	/* account for ea list len */
+	list_len -= 4;
+	temp_fea = ea_response_data->list;
+	temp_ptr = (char *)temp_fea;
+	while (list_len > 0) {
+		unsigned int name_len;
+		__u16 value_len;
+
+		list_len -= 4;
+		temp_ptr += 4;
+		/* make sure we can read name_len and value_len */
+		if (list_len < 0) {
+			cFYI(1, ("EA entry goes beyond length of list"));
+			rc = -EIO;
+			goto QAllEAsOut;
+		}
+
+		name_len = temp_fea->name_len;
+		value_len = le16_to_cpu(temp_fea->value_len);
+		list_len -= name_len + 1 + value_len;
+		if (list_len < 0) {
+			cFYI(1, ("EA entry goes beyond length of list"));
+			rc = -EIO;
+			goto QAllEAsOut;
+		}
+
+		if (ea_name) {
+			if (strncmp(ea_name, temp_ptr, name_len) == 0) {
+				temp_ptr += name_len + 1;
+				rc = value_len;
+				if (buf_size == 0)
+					goto QAllEAsOut;
+				if ((size_t)value_len > buf_size) {
+					rc = -ERANGE;
+					goto QAllEAsOut;
+				}
+				memcpy(EAData, temp_ptr, value_len);
+				goto QAllEAsOut;
+			}
+		} else {
+			/* account for prefix user. and trailing null */
+			rc += (5 + 1 + name_len);
+			if (rc < (int) buf_size) {
+				memcpy(EAData, "user.", 5);
+				EAData += 5;
+				memcpy(EAData, temp_ptr, name_len);
+				EAData += name_len;
+				/* null terminate name */
+				*EAData = 0;
+				++EAData;
+			} else if (buf_size == 0) {
+				/* skip copy - calc size only */
+			} else {
+				/* stop before overrun buffer */
+				rc = -ERANGE;
+				break;
+			}
+		}
+		temp_ptr += name_len + 1 + value_len;
+		temp_fea = (struct fea *)temp_ptr;
+	}
+
+	/* didn't find the named attribute */
+	if (ea_name)
+		rc = -ENODATA;
+
+QAllEAsOut:
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
-		goto QEARetry;
+		goto QAllEAsRetry;
 
 	return (ssize_t)rc;
 }
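/*
 * Usage sketch for the consolidated helper above (hypothetical callers,
 * following the doc comment): a getxattr-style lookup passes ea_name,
 * while listxattr passes NULL --
 *
 *	rc = CIFSSMBQAllEAs(xid, tcon, path, "SomeEA", value, size,
 *			    nls_codepage, remap);   (* one EA's value *)
 *	rc = CIFSSMBQAllEAs(xid, tcon, path, NULL, list, size,
 *			    nls_codepage, remap);   (* "user."-prefixed names *)
 *
 * Either way rc is the resulting data length, or a negative errno such
 * as -ENODATA or -ERANGE.
 */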
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2e9e09ca0e30..45eb6cba793f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2388,13 +2388,13 @@ try_mount_again:
 		 */
 		cifs_put_tcp_session(srvTcp);
 
-		down(&pSesInfo->sesSem);
+		mutex_lock(&pSesInfo->session_mutex);
 		if (pSesInfo->need_reconnect) {
 			cFYI(1, ("Session needs reconnect"));
 			rc = cifs_setup_session(xid, pSesInfo,
 						cifs_sb->local_nls);
 		}
-		up(&pSesInfo->sesSem);
+		mutex_unlock(&pSesInfo->session_mutex);
 	} else if (!rc) {
 		cFYI(1, ("Existing smb sess not found"));
 		pSesInfo = sesInfoAlloc();
@@ -2437,12 +2437,12 @@ try_mount_again:
 		}
 		pSesInfo->linux_uid = volume_info->linux_uid;
 		pSesInfo->overrideSecFlg = volume_info->secFlg;
-		down(&pSesInfo->sesSem);
+		mutex_lock(&pSesInfo->session_mutex);
 
 		/* BB FIXME need to pass vol->secFlgs BB */
 		rc = cifs_setup_session(xid, pSesInfo,
 					cifs_sb->local_nls);
-		up(&pSesInfo->sesSem);
+		mutex_unlock(&pSesInfo->session_mutex);
 	}
 
 	/* search for existing tcon to this server share */
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 6ccf7262d1b7..e9f7ecc2714b 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -739,7 +739,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
 	int isValid = 1;
 
 	if (direntry->d_inode) {
-		if (cifs_revalidate(direntry))
+		if (cifs_revalidate_dentry(direntry))
 			return 0;
 	} else {
 		cFYI(1, ("neg dentry 0x%p name = %s",
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 057e1dae12ab..ca2ba7a0193c 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -219,8 +219,8 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 			cFYI(1, ("inode unchanged on server"));
 	} else {
 		if (file->f_path.dentry->d_inode->i_mapping) {
-		/* BB no need to lock inode until after invalidate
-		   since namei code should already have it locked? */
+			/* BB no need to lock inode until after invalidate
+			   since namei code should already have it locked? */
 			rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
 			if (rc != 0)
 				CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
@@ -1890,11 +1890,10 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 
 int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	struct dentry *dentry = file->f_path.dentry;
 	int rc, xid;
 
 	xid = GetXid();
-	rc = cifs_revalidate(dentry);
+	rc = cifs_revalidate_file(file);
 	if (rc) {
 		cFYI(1, ("Validation prior to mmap failed, error=%d", rc));
 		FreeXid(xid);
@@ -2289,9 +2288,9 @@ cifs_oplock_break(struct slow_work *work)
 	if (inode && S_ISREG(inode->i_mode)) {
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 		if (cinode->clientCanCacheAll == 0)
-			break_lease(inode, FMODE_READ);
+			break_lease(inode, O_RDONLY);
 		else if (cinode->clientCanCacheRead == 0)
-			break_lease(inode, FMODE_WRITE);
+			break_lease(inode, O_WRONLY);
 #endif
 		rc = filemap_fdatawrite(inode->i_mapping);
 		if (cinode->clientCanCacheRead == 0) {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index e3fda978f481..723daaccbd0e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -77,6 +77,41 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 	}
 }
 
+/* check inode attributes against fattr. If they don't match, tag the
+ * inode for cache invalidation
+ */
+static void
+cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
+{
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+
+	cFYI(1, ("%s: revalidating inode %llu", __func__, cifs_i->uniqueid));
+
+	if (inode->i_state & I_NEW) {
+		cFYI(1, ("%s: inode %llu is new", __func__, cifs_i->uniqueid));
+		return;
+	}
+
+	/* don't bother with revalidation if we have an oplock */
+	if (cifs_i->clientCanCacheRead) {
+		cFYI(1, ("%s: inode %llu is oplocked", __func__,
+			 cifs_i->uniqueid));
+		return;
+	}
+
+	/* revalidate if mtime or size have changed */
+	if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
+	    cifs_i->server_eof == fattr->cf_eof) {
+		cFYI(1, ("%s: inode %llu is unchanged", __func__,
+			 cifs_i->uniqueid));
+		return;
+	}
+
+	cFYI(1, ("%s: invalidating inode %llu mapping", __func__,
+		 cifs_i->uniqueid));
+	cifs_i->invalid_mapping = true;
+}
+
 /* populate an inode with info from a cifs_fattr struct */
 void
 cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
@@ -85,6 +120,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	unsigned long oldtime = cifs_i->time;
 
+	cifs_revalidate_cache(inode, fattr);
+
 	inode->i_atime = fattr->cf_atime;
 	inode->i_mtime = fattr->cf_mtime;
 	inode->i_ctime = fattr->cf_ctime;
@@ -111,6 +148,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 
 	cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
 
+	cifs_i->server_eof = fattr->cf_eof;
 	/*
 	 * Can't safely change the file size here if the client is writing to
 	 * it due to potential races.
@@ -230,6 +268,31 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
 	fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
 }
 
+int cifs_get_file_info_unix(struct file *filp)
+{
+	int rc;
+	int xid;
+	FILE_UNIX_BASIC_INFO find_data;
+	struct cifs_fattr fattr;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+
+	xid = GetXid();
+	rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
+	if (!rc) {
+		cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
+	} else if (rc == -EREMOTE) {
+		cifs_create_dfs_fattr(&fattr, inode->i_sb);
+		rc = 0;
+	}
+
+	cifs_fattr_to_inode(inode, &fattr);
+	FreeXid(xid);
+	return rc;
+}
+
 int cifs_get_inode_info_unix(struct inode **pinode,
 			     const unsigned char *full_path,
 			     struct super_block *sb, int xid)
@@ -366,7 +429,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
 	char ea_value[4];
 	__u32 mode;
 
-	rc = CIFSSMBQueryEA(xid, cifs_sb->tcon, path, "SETFILEBITS",
+	rc = CIFSSMBQAllEAs(xid, cifs_sb->tcon, path, "SETFILEBITS",
 			    ea_value, 4 /* size of buf */, cifs_sb->local_nls,
 			    cifs_sb->mnt_cifs_flags &
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -431,6 +494,47 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 	fattr->cf_gid = cifs_sb->mnt_gid;
 }
 
+int cifs_get_file_info(struct file *filp)
+{
+	int rc;
+	int xid;
+	FILE_ALL_INFO find_data;
+	struct cifs_fattr fattr;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+
+	xid = GetXid();
+	rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
+	if (rc == -EOPNOTSUPP || rc == -EINVAL) {
+		/*
+		 * FIXME: legacy server -- fall back to path-based call?
+		 * for now, just skip revalidating and mark inode for
+		 * immediate reval.
+		 */
+		rc = 0;
+		CIFS_I(inode)->time = 0;
+		goto cgfi_exit;
+	} else if (rc == -EREMOTE) {
+		cifs_create_dfs_fattr(&fattr, inode->i_sb);
+		rc = 0;
+	} else if (rc)
+		goto cgfi_exit;
+
+	/*
+	 * don't bother with SFU junk here -- just mark inode as needing
+	 * revalidation.
+	 */
+	cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
+	fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
+	fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
+	cifs_fattr_to_inode(inode, &fattr);
+cgfi_exit:
+	FreeXid(xid);
+	return rc;
536}
537
434int cifs_get_inode_info(struct inode **pinode, 538int cifs_get_inode_info(struct inode **pinode,
435 const unsigned char *full_path, FILE_ALL_INFO *pfindData, 539 const unsigned char *full_path, FILE_ALL_INFO *pfindData,
436 struct super_block *sb, int xid, const __u16 *pfid) 540 struct super_block *sb, int xid, const __u16 *pfid)
@@ -1388,135 +1492,103 @@ cifs_rename_exit:
1388 return rc; 1492 return rc;
1389} 1493}
1390 1494
1391int cifs_revalidate(struct dentry *direntry) 1495static bool
1496cifs_inode_needs_reval(struct inode *inode)
1392{ 1497{
1393 int xid; 1498 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
1394 int rc = 0, wbrc = 0;
1395 char *full_path;
1396 struct cifs_sb_info *cifs_sb;
1397 struct cifsInodeInfo *cifsInode;
1398 loff_t local_size;
1399 struct timespec local_mtime;
1400 bool invalidate_inode = false;
1401 1499
1402 if (direntry->d_inode == NULL) 1500 if (cifs_i->clientCanCacheRead)
1403 return -ENOENT; 1501 return false;
1404 1502
1405 cifsInode = CIFS_I(direntry->d_inode); 1503 if (!lookupCacheEnabled)
1504 return true;
1406 1505
1407 if (cifsInode == NULL) 1506 if (cifs_i->time == 0)
1408 return -ENOENT; 1507 return true;
1409 1508
1410 /* no sense revalidating inode info on file that no one can write */ 1509 /* FIXME: the actimeo should be tunable */
1411 if (CIFS_I(direntry->d_inode)->clientCanCacheRead) 1510 if (time_after_eq(jiffies, cifs_i->time + HZ))
1412 return rc; 1511 return true;
1512
1513 return false;
1514}
1515
1516/* check invalid_mapping flag and zap the cache if it's set */
1517static void
1518cifs_invalidate_mapping(struct inode *inode)
1519{
1520 int rc;
1521 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
1522
1523 cifs_i->invalid_mapping = false;
1524
1525 /* write back any cached data */
1526 if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
1527 rc = filemap_write_and_wait(inode->i_mapping);
1528 if (rc)
1529 cifs_i->write_behind_rc = rc;
1530 }
1531 invalidate_remote_inode(inode);
1532}
1533
1534int cifs_revalidate_file(struct file *filp)
1535{
1536 int rc = 0;
1537 struct inode *inode = filp->f_path.dentry->d_inode;
1538
1539 if (!cifs_inode_needs_reval(inode))
1540 goto check_inval;
1541
1542 if (CIFS_SB(inode->i_sb)->tcon->unix_ext)
1543 rc = cifs_get_file_info_unix(filp);
1544 else
1545 rc = cifs_get_file_info(filp);
1546
1547check_inval:
1548 if (CIFS_I(inode)->invalid_mapping)
1549 cifs_invalidate_mapping(inode);
1550
1551 return rc;
1552}
1553
1554/* revalidate a dentry's inode attributes */
1555int cifs_revalidate_dentry(struct dentry *dentry)
1556{
1557 int xid;
1558 int rc = 0;
1559 char *full_path = NULL;
1560 struct inode *inode = dentry->d_inode;
1561 struct super_block *sb = dentry->d_sb;
1562
1563 if (inode == NULL)
1564 return -ENOENT;
1413 1565
1414 xid = GetXid(); 1566 xid = GetXid();
1415 1567
1416 cifs_sb = CIFS_SB(direntry->d_sb); 1568 if (!cifs_inode_needs_reval(inode))
1569 goto check_inval;
1417 1570
1418 /* can not safely grab the rename sem here if rename calls revalidate 1571 /* can not safely grab the rename sem here if rename calls revalidate
1419 since that would deadlock */ 1572 since that would deadlock */
1420 full_path = build_path_from_dentry(direntry); 1573 full_path = build_path_from_dentry(dentry);
1421 if (full_path == NULL) { 1574 if (full_path == NULL) {
1422 rc = -ENOMEM; 1575 rc = -ENOMEM;
1423 FreeXid(xid); 1576 goto check_inval;
1424 return rc;
1425 }
1426 cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
1427 "jiffies %ld", full_path, direntry->d_inode,
1428 direntry->d_inode->i_count.counter, direntry,
1429 direntry->d_time, jiffies));
1430
1431 if (cifsInode->time == 0) {
1432 /* was set to zero previously to force revalidate */
1433 } else if (time_before(jiffies, cifsInode->time + HZ) &&
1434 lookupCacheEnabled) {
1435 if ((S_ISREG(direntry->d_inode->i_mode) == 0) ||
1436 (direntry->d_inode->i_nlink == 1)) {
1437 kfree(full_path);
1438 FreeXid(xid);
1439 return rc;
1440 } else {
1441 cFYI(1, ("Have to revalidate file due to hardlinks"));
1442 }
1443 }
1444
1445 /* save mtime and size */
1446 local_mtime = direntry->d_inode->i_mtime;
1447 local_size = direntry->d_inode->i_size;
1448
1449 if (cifs_sb->tcon->unix_ext) {
1450 rc = cifs_get_inode_info_unix(&direntry->d_inode, full_path,
1451 direntry->d_sb, xid);
1452 if (rc) {
1453 cFYI(1, ("error on getting revalidate info %d", rc));
1454/* if (rc != -ENOENT)
1455 rc = 0; */ /* BB should we cache info on
1456 certain errors? */
1457 }
1458 } else {
1459 rc = cifs_get_inode_info(&direntry->d_inode, full_path, NULL,
1460 direntry->d_sb, xid, NULL);
1461 if (rc) {
1462 cFYI(1, ("error on getting revalidate info %d", rc));
1463/* if (rc != -ENOENT)
1464 rc = 0; */ /* BB should we cache info on
1465 certain errors? */
1466 }
1467 } 1577 }
1468 /* should we remap certain errors, access denied?, to zero */
1469 1578
1470 /* if not oplocked, we invalidate inode pages if mtime or file size 1579 cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
1471 had changed on server */ 1580 "jiffies %ld", full_path, inode, inode->i_count.counter,
1581 dentry, dentry->d_time, jiffies));
1472 1582
1473 if (timespec_equal(&local_mtime, &direntry->d_inode->i_mtime) && 1583 if (CIFS_SB(sb)->tcon->unix_ext)
1474 (local_size == direntry->d_inode->i_size)) { 1584 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
1475 cFYI(1, ("cifs_revalidate - inode unchanged")); 1585 else
1476 } else { 1586 rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
1477 /* file may have changed on server */ 1587 xid, NULL);
1478 if (cifsInode->clientCanCacheRead) {
1479 /* no need to invalidate inode pages since we were the
1480 only ones who could have modified the file and the
1481 server copy is staler than ours */
1482 } else {
1483 invalidate_inode = true;
1484 }
1485 }
1486 1588
1487 /* can not grab this sem since kernel filesys locking documentation 1589check_inval:
1488 indicates i_mutex may be taken by the kernel on lookup and rename 1590 if (CIFS_I(inode)->invalid_mapping)
1489 which could deadlock if we grab the i_mutex here as well */ 1591 cifs_invalidate_mapping(inode);
1490/* mutex_lock(&direntry->d_inode->i_mutex);*/
1491 /* need to write out dirty pages here */
1492 if (direntry->d_inode->i_mapping) {
1493 /* do we need to lock inode until after invalidate completes
1494 below? */
1495 wbrc = filemap_fdatawrite(direntry->d_inode->i_mapping);
1496 if (wbrc)
1497 CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
1498 }
1499 if (invalidate_inode) {
1500 /* shrink_dcache not necessary now that cifs dentry ops
1501 are exported for negative dentries */
1502/* if (S_ISDIR(direntry->d_inode->i_mode))
1503 shrink_dcache_parent(direntry); */
1504 if (S_ISREG(direntry->d_inode->i_mode)) {
1505 if (direntry->d_inode->i_mapping) {
1506 wbrc = filemap_fdatawait(direntry->d_inode->i_mapping);
1507 if (wbrc)
1508 CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
1509 }
1510 /* may eventually have to do this for open files too */
1511 if (list_empty(&(cifsInode->openFileList))) {
1512 /* changed on server - flush read ahead pages */
1513 cFYI(1, ("Invalidating read ahead data on "
1514 "closed file"));
1515 invalidate_remote_inode(direntry->d_inode);
1516 }
1517 }
1518 }
1519/* mutex_unlock(&direntry->d_inode->i_mutex); */
1520 1592
1521 kfree(full_path); 1593 kfree(full_path);
1522 FreeXid(xid); 1594 FreeXid(xid);
@@ -1526,7 +1598,7 @@ int cifs_revalidate(struct dentry *direntry)
1526int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, 1598int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1527 struct kstat *stat) 1599 struct kstat *stat)
1528{ 1600{
1529 int err = cifs_revalidate(dentry); 1601 int err = cifs_revalidate_dentry(dentry);
1530 if (!err) { 1602 if (!err) {
1531 generic_fillattr(dentry->d_inode, stat); 1603 generic_fillattr(dentry->d_inode, stat);
1532 stat->blksize = CIFS_MAX_MSGSIZE; 1604 stat->blksize = CIFS_MAX_MSGSIZE;
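With cifs_revalidate() split into file and dentry variants, both now funnel through the same two steps: cifs_inode_needs_reval() decides whether to fetch fresh attributes at all, and the check_inval tail zaps the mapping if the fetch flagged it. A hedged userspace model of the gating logic follows; names are illustrative, and the plain >= comparison ignores the jiffies wraparound that time_after_eq() handles in the kernel:

#include <stdbool.h>

struct reval_state {
        bool client_can_cache_read;   /* read oplock held */
        bool lookup_cache_enabled;    /* global lookupCacheEnabled switch */
        unsigned long time_stamp;     /* last reval time; 0 forces a reval */
};

static bool inode_needs_reval(const struct reval_state *s,
                              unsigned long now, unsigned long hz)
{
        if (s->client_can_cache_read)
                return false;   /* oplock: server can't have changed it */
        if (!s->lookup_cache_enabled)
                return true;
        if (s->time_stamp == 0)
                return true;    /* previously tagged for forced reval */
        return now >= s->time_stamp + hz;  /* 1s window; a FIXME wants it tunable */
}

int main(void)
{
        struct reval_state s = { false, true, 1000 };
        /* stamped at 1000 with HZ=250: attributes go stale at 1250 */
        return !inode_needs_reval(&s, 1250, 250);
}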
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index d27d4ec6579b..d1474996a812 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -79,7 +79,7 @@ sesInfoAlloc(void)
79 ++ret_buf->ses_count; 79 ++ret_buf->ses_count;
80 INIT_LIST_HEAD(&ret_buf->smb_ses_list); 80 INIT_LIST_HEAD(&ret_buf->smb_ses_list);
81 INIT_LIST_HEAD(&ret_buf->tcon_list); 81 INIT_LIST_HEAD(&ret_buf->tcon_list);
82 init_MUTEX(&ret_buf->sesSem); 82 mutex_init(&ret_buf->session_mutex);
83 } 83 }
84 return ret_buf; 84 return ret_buf;
85} 85}
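sesSem was one of the remaining init_MUTEX() users; the hunk converts it to a proper struct mutex under the new name session_mutex. A minimal kernel-style sketch of the resulting pairing, with struct cifsSesInfo trimmed to the one field this hunk touches:

#include <linux/mutex.h>

/* Trimmed stand-in for struct cifsSesInfo. */
struct ses_fragment {
        struct mutex session_mutex;
};

static void ses_fragment_use(struct ses_fragment *ses)
{
        mutex_init(&ses->session_mutex);   /* once, where sesInfoAlloc() runs */

        mutex_lock(&ses->session_mutex);
        /* session setup / reconnect work happens under the lock */
        mutex_unlock(&ses->session_mutex);
}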
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a75afa3dd9e1..3e2ef0de1209 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -244,7 +244,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
244 /* revalidate/getattr then populate from inode */ 244 /* revalidate/getattr then populate from inode */
245 } /* BB add else when above is implemented */ 245 } /* BB add else when above is implemented */
246 ea_name += 5; /* skip past user. prefix */ 246 ea_name += 5; /* skip past user. prefix */
247 rc = CIFSSMBQueryEA(xid, pTcon, full_path, ea_name, ea_value, 247 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
248 buf_size, cifs_sb->local_nls, 248 buf_size, cifs_sb->local_nls,
249 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 249 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
250 } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) { 250 } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) {
@@ -252,7 +252,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
252 goto get_ea_exit; 252 goto get_ea_exit;
253 253
254 ea_name += 4; /* skip past os2. prefix */ 254 ea_name += 4; /* skip past os2. prefix */
255 rc = CIFSSMBQueryEA(xid, pTcon, full_path, ea_name, ea_value, 255 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
256 buf_size, cifs_sb->local_nls, 256 buf_size, cifs_sb->local_nls,
257 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 257 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
258 } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, 258 } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
@@ -364,8 +364,8 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
364 /* if proc/fs/cifs/streamstoxattr is set then 364 /* if proc/fs/cifs/streamstoxattr is set then
365 search server for EAs or streams to 365 search server for EAs or streams to
366 returns as xattrs */ 366 returns as xattrs */
367 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, data, buf_size, 367 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data,
368 cifs_sb->local_nls, 368 buf_size, cifs_sb->local_nls,
369 cifs_sb->mnt_cifs_flags & 369 cifs_sb->mnt_cifs_flags &
370 CIFS_MOUNT_MAP_SPECIAL_CHR); 370 CIFS_MOUNT_MAP_SPECIAL_CHR);
371 371
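Both the fetch-one and list-all paths now go through CIFSSMBQAllEAs(); judging from the call sites above, a non-NULL ea_name retrieves a single value while NULL returns the whole EA list. The prototype below is inferred from those call sites rather than quoted from cifsproto.h, so treat it as an assumption:

/* Inferred prototype -- verify against cifsproto.h before relying on it. */
ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
                       const unsigned char *path, const char *ea_name,
                       char *ea_data, size_t buf_size,
                       const struct nls_table *nls_codepage, int remap);

/* fetch one named EA */
rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, buf_size,
                    cifs_sb->local_nls, remap);

/* list every EA on the file */
rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data, buf_size,
                    cifs_sb->local_nls, remap);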
diff --git a/fs/compat.c b/fs/compat.c
index 00d90c2e66f0..030602d453b7 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1795,6 +1795,24 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
1795 return ret; 1795 return ret;
1796} 1796}
1797 1797
1798struct compat_sel_arg_struct {
1799 compat_ulong_t n;
1800 compat_uptr_t inp;
1801 compat_uptr_t outp;
1802 compat_uptr_t exp;
1803 compat_uptr_t tvp;
1804};
1805
1806asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg)
1807{
1808 struct compat_sel_arg_struct a;
1809
1810 if (copy_from_user(&a, arg, sizeof(a)))
1811 return -EFAULT;
1812 return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
1813 compat_ptr(a.exp), compat_ptr(a.tvp));
1814}
1815
1798#ifdef HAVE_SET_RESTORE_SIGMASK 1816#ifdef HAVE_SET_RESTORE_SIGMASK
1799static long do_compat_pselect(int n, compat_ulong_t __user *inp, 1817static long do_compat_pselect(int n, compat_ulong_t __user *inp,
1800 compat_ulong_t __user *outp, compat_ulong_t __user *exp, 1818 compat_ulong_t __user *outp, compat_ulong_t __user *exp,
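compat_sys_old_select() services the ancient one-pointer select(2) ABI: userspace passes a single block of five 32-bit slots, which the shim copies in with copy_from_user() and unpacks into an ordinary compat_sys_select() call. A runnable userspace illustration of that block's shape (the struct name here is illustrative):

#include <stdint.h>
#include <stdio.h>

/* Userspace view of the argument block; mirrors compat_sel_arg_struct. */
struct old_sel_arg {
        uint32_t n, inp, outp, exp, tvp;  /* fd count + four 32-bit pointers */
};

int main(void)
{
        /* The kernel shim copies exactly this block and then dispatches
         * with the unpacked fields. */
        printf("old-select arg block: %zu bytes\n", sizeof(struct old_sel_arg));
        return 0;
}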
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 0adced2f296f..112e45a17e99 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -28,10 +28,12 @@
28 28
29#undef elfhdr 29#undef elfhdr
30#undef elf_phdr 30#undef elf_phdr
31#undef elf_shdr
31#undef elf_note 32#undef elf_note
32#undef elf_addr_t 33#undef elf_addr_t
33#define elfhdr elf32_hdr 34#define elfhdr elf32_hdr
34#define elf_phdr elf32_phdr 35#define elf_phdr elf32_phdr
36#define elf_shdr elf32_shdr
35#define elf_note elf32_note 37#define elf_note elf32_note
36#define elf_addr_t Elf32_Addr 38#define elf_addr_t Elf32_Addr
37 39
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 0ca9ec4a79c3..6d55b61bfa79 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -545,7 +545,7 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
545 kcmd = MTIOCPOS; 545 kcmd = MTIOCPOS;
546 karg = &pos; 546 karg = &pos;
547 break; 547 break;
548 case MTIOCGET32: 548 default: /* MTIOCGET32 */
549 kcmd = MTIOCGET; 549 kcmd = MTIOCGET;
550 karg = &get; 550 karg = &get;
551 break; 551 break;
@@ -663,7 +663,7 @@ static int raw_ioctl(unsigned fd, unsigned cmd,
663 663
664 switch (cmd) { 664 switch (cmd) {
665 case RAW_SETBIND: 665 case RAW_SETBIND:
666 case RAW_GETBIND: { 666 default: { /* RAW_GETBIND */
667 struct raw_config_request req; 667 struct raw_config_request req;
668 mm_segment_t oldfs = get_fs(); 668 mm_segment_t oldfs = get_fs();
669 669
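The two hunks above relabel the last reachable case of each switch as default:. The ioctl dispatch tables only route the listed commands to these handlers, so behavior is unchanged; the point, presumably, is that the compiler can now prove the output variable is assigned on every path. A minimal model of the pattern:

#include <stdio.h>

static int route(unsigned cmd)
{
        int karg;                 /* no dummy initializer needed */

        switch (cmd) {
        case 1:
                karg = 10;
                break;
        default:                  /* the only other value callers pass */
                karg = 20;
                break;
        }
        return karg;
}

int main(void)
{
        printf("%d %d\n", route(1), route(2));
        return 0;
}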
diff --git a/fs/dcache.c b/fs/dcache.c
index 953173a293a9..f1358e5c3a59 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -257,6 +257,7 @@ kill_it:
257 if (dentry) 257 if (dentry)
258 goto repeat; 258 goto repeat;
259} 259}
260EXPORT_SYMBOL(dput);
260 261
261/** 262/**
262 * d_invalidate - invalidate a dentry 263 * d_invalidate - invalidate a dentry
@@ -314,6 +315,7 @@ int d_invalidate(struct dentry * dentry)
314 spin_unlock(&dcache_lock); 315 spin_unlock(&dcache_lock);
315 return 0; 316 return 0;
316} 317}
318EXPORT_SYMBOL(d_invalidate);
317 319
318/* This should be called _only_ with dcache_lock held */ 320/* This should be called _only_ with dcache_lock held */
319 321
@@ -328,6 +330,7 @@ struct dentry * dget_locked(struct dentry *dentry)
328{ 330{
329 return __dget_locked(dentry); 331 return __dget_locked(dentry);
330} 332}
333EXPORT_SYMBOL(dget_locked);
331 334
332/** 335/**
333 * d_find_alias - grab a hashed alias of inode 336 * d_find_alias - grab a hashed alias of inode
@@ -384,6 +387,7 @@ struct dentry * d_find_alias(struct inode *inode)
384 } 387 }
385 return de; 388 return de;
386} 389}
390EXPORT_SYMBOL(d_find_alias);
387 391
388/* 392/*
389 * Try to kill dentries associated with this inode. 393 * Try to kill dentries associated with this inode.
@@ -408,6 +412,7 @@ restart:
408 } 412 }
409 spin_unlock(&dcache_lock); 413 spin_unlock(&dcache_lock);
410} 414}
415EXPORT_SYMBOL(d_prune_aliases);
411 416
412/* 417/*
413 * Throw away a dentry - free the inode, dput the parent. This requires that 418 * Throw away a dentry - free the inode, dput the parent. This requires that
@@ -610,6 +615,7 @@ void shrink_dcache_sb(struct super_block * sb)
610{ 615{
611 __shrink_dcache_sb(sb, NULL, 0); 616 __shrink_dcache_sb(sb, NULL, 0);
612} 617}
618EXPORT_SYMBOL(shrink_dcache_sb);
613 619
614/* 620/*
615 * destroy a single subtree of dentries for unmount 621 * destroy a single subtree of dentries for unmount
@@ -792,6 +798,7 @@ positive:
792 spin_unlock(&dcache_lock); 798 spin_unlock(&dcache_lock);
793 return 1; 799 return 1;
794} 800}
801EXPORT_SYMBOL(have_submounts);
795 802
796/* 803/*
797 * Search the dentry child list for the specified parent, 804 * Search the dentry child list for the specified parent,
@@ -876,6 +883,7 @@ void shrink_dcache_parent(struct dentry * parent)
876 while ((found = select_parent(parent)) != 0) 883 while ((found = select_parent(parent)) != 0)
877 __shrink_dcache_sb(sb, &found, 0); 884 __shrink_dcache_sb(sb, &found, 0);
878} 885}
886EXPORT_SYMBOL(shrink_dcache_parent);
879 887
880/* 888/*
881 * Scan `nr' dentries and return the number which remain. 889 * Scan `nr' dentries and return the number which remain.
@@ -968,6 +976,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
968 976
969 return dentry; 977 return dentry;
970} 978}
979EXPORT_SYMBOL(d_alloc);
971 980
972struct dentry *d_alloc_name(struct dentry *parent, const char *name) 981struct dentry *d_alloc_name(struct dentry *parent, const char *name)
973{ 982{
@@ -1012,6 +1021,7 @@ void d_instantiate(struct dentry *entry, struct inode * inode)
1012 spin_unlock(&dcache_lock); 1021 spin_unlock(&dcache_lock);
1013 security_d_instantiate(entry, inode); 1022 security_d_instantiate(entry, inode);
1014} 1023}
1024EXPORT_SYMBOL(d_instantiate);
1015 1025
1016/** 1026/**
1017 * d_instantiate_unique - instantiate a non-aliased dentry 1027 * d_instantiate_unique - instantiate a non-aliased dentry
@@ -1108,6 +1118,7 @@ struct dentry * d_alloc_root(struct inode * root_inode)
1108 } 1118 }
1109 return res; 1119 return res;
1110} 1120}
1121EXPORT_SYMBOL(d_alloc_root);
1111 1122
1112static inline struct hlist_head *d_hash(struct dentry *parent, 1123static inline struct hlist_head *d_hash(struct dentry *parent,
1113 unsigned long hash) 1124 unsigned long hash)
@@ -1211,7 +1222,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1211 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); 1222 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
1212 spin_unlock(&dcache_lock); 1223 spin_unlock(&dcache_lock);
1213 security_d_instantiate(new, inode); 1224 security_d_instantiate(new, inode);
1214 d_rehash(dentry);
1215 d_move(new, dentry); 1225 d_move(new, dentry);
1216 iput(inode); 1226 iput(inode);
1217 } else { 1227 } else {
@@ -1225,6 +1235,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1225 d_add(dentry, inode); 1235 d_add(dentry, inode);
1226 return new; 1236 return new;
1227} 1237}
1238EXPORT_SYMBOL(d_splice_alias);
1228 1239
1229/** 1240/**
1230 * d_add_ci - lookup or allocate new dentry with case-exact name 1241 * d_add_ci - lookup or allocate new dentry with case-exact name
@@ -1314,6 +1325,7 @@ err_out:
1314 iput(inode); 1325 iput(inode);
1315 return ERR_PTR(error); 1326 return ERR_PTR(error);
1316} 1327}
1328EXPORT_SYMBOL(d_add_ci);
1317 1329
1318/** 1330/**
1319 * d_lookup - search for a dentry 1331 * d_lookup - search for a dentry
@@ -1357,6 +1369,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
1357 } while (read_seqretry(&rename_lock, seq)); 1369 } while (read_seqretry(&rename_lock, seq));
1358 return dentry; 1370 return dentry;
1359} 1371}
1372EXPORT_SYMBOL(d_lookup);
1360 1373
1361struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) 1374struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1362{ 1375{
@@ -1483,6 +1496,7 @@ int d_validate(struct dentry *dentry, struct dentry *dparent)
1483out: 1496out:
1484 return 0; 1497 return 0;
1485} 1498}
1499EXPORT_SYMBOL(d_validate);
1486 1500
1487/* 1501/*
1488 * When a file is deleted, we have two options: 1502 * When a file is deleted, we have two options:
@@ -1528,6 +1542,7 @@ void d_delete(struct dentry * dentry)
1528 1542
1529 fsnotify_nameremove(dentry, isdir); 1543 fsnotify_nameremove(dentry, isdir);
1530} 1544}
1545EXPORT_SYMBOL(d_delete);
1531 1546
1532static void __d_rehash(struct dentry * entry, struct hlist_head *list) 1547static void __d_rehash(struct dentry * entry, struct hlist_head *list)
1533{ 1548{
@@ -1556,6 +1571,7 @@ void d_rehash(struct dentry * entry)
1556 spin_unlock(&entry->d_lock); 1571 spin_unlock(&entry->d_lock);
1557 spin_unlock(&dcache_lock); 1572 spin_unlock(&dcache_lock);
1558} 1573}
1574EXPORT_SYMBOL(d_rehash);
1559 1575
1560/* 1576/*
1561 * When switching names, the actual string doesn't strictly have to 1577 * When switching names, the actual string doesn't strictly have to
@@ -1702,6 +1718,7 @@ void d_move(struct dentry * dentry, struct dentry * target)
1702 d_move_locked(dentry, target); 1718 d_move_locked(dentry, target);
1703 spin_unlock(&dcache_lock); 1719 spin_unlock(&dcache_lock);
1704} 1720}
1721EXPORT_SYMBOL(d_move);
1705 1722
1706/** 1723/**
1707 * d_ancestor - search for an ancestor 1724 * d_ancestor - search for an ancestor
@@ -1868,6 +1885,7 @@ shouldnt_be_hashed:
1868 spin_unlock(&dcache_lock); 1885 spin_unlock(&dcache_lock);
1869 BUG(); 1886 BUG();
1870} 1887}
1888EXPORT_SYMBOL_GPL(d_materialise_unique);
1871 1889
1872static int prepend(char **buffer, int *buflen, const char *str, int namelen) 1890static int prepend(char **buffer, int *buflen, const char *str, int namelen)
1873{ 1891{
@@ -2005,6 +2023,7 @@ char *d_path(const struct path *path, char *buf, int buflen)
2005 path_put(&root); 2023 path_put(&root);
2006 return res; 2024 return res;
2007} 2025}
2026EXPORT_SYMBOL(d_path);
2008 2027
2009/* 2028/*
2010 * Helper function for dentry_operations.d_dname() members 2029 * Helper function for dentry_operations.d_dname() members
@@ -2171,6 +2190,30 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
2171 return result; 2190 return result;
2172} 2191}
2173 2192
2193int path_is_under(struct path *path1, struct path *path2)
2194{
2195 struct vfsmount *mnt = path1->mnt;
2196 struct dentry *dentry = path1->dentry;
2197 int res;
2198 spin_lock(&vfsmount_lock);
2199 if (mnt != path2->mnt) {
2200 for (;;) {
2201 if (mnt->mnt_parent == mnt) {
2202 spin_unlock(&vfsmount_lock);
2203 return 0;
2204 }
2205 if (mnt->mnt_parent == path2->mnt)
2206 break;
2207 mnt = mnt->mnt_parent;
2208 }
2209 dentry = mnt->mnt_mountpoint;
2210 }
2211 res = is_subdir(dentry, path2->dentry);
2212 spin_unlock(&vfsmount_lock);
2213 return res;
2214}
2215EXPORT_SYMBOL(path_is_under);
2216
2174void d_genocide(struct dentry *root) 2217void d_genocide(struct dentry *root)
2175{ 2218{
2176 struct dentry *this_parent = root; 2219 struct dentry *this_parent = root;
@@ -2228,6 +2271,7 @@ ino_t find_inode_number(struct dentry *dir, struct qstr *name)
2228 } 2271 }
2229 return ino; 2272 return ino;
2230} 2273}
2274EXPORT_SYMBOL(find_inode_number);
2231 2275
2232static __initdata unsigned long dhash_entries; 2276static __initdata unsigned long dhash_entries;
2233static int __init set_dhash_entries(char *str) 2277static int __init set_dhash_entries(char *str)
@@ -2297,6 +2341,7 @@ static void __init dcache_init(void)
2297 2341
2298/* SLAB cache for __getname() consumers */ 2342/* SLAB cache for __getname() consumers */
2299struct kmem_cache *names_cachep __read_mostly; 2343struct kmem_cache *names_cachep __read_mostly;
2344EXPORT_SYMBOL(names_cachep);
2300 2345
2301EXPORT_SYMBOL(d_genocide); 2346EXPORT_SYMBOL(d_genocide);
2302 2347
@@ -2326,26 +2371,3 @@ void __init vfs_caches_init(unsigned long mempages)
2326 bdev_cache_init(); 2371 bdev_cache_init();
2327 chrdev_init(); 2372 chrdev_init();
2328} 2373}
2329
2330EXPORT_SYMBOL(d_alloc);
2331EXPORT_SYMBOL(d_alloc_root);
2332EXPORT_SYMBOL(d_delete);
2333EXPORT_SYMBOL(d_find_alias);
2334EXPORT_SYMBOL(d_instantiate);
2335EXPORT_SYMBOL(d_invalidate);
2336EXPORT_SYMBOL(d_lookup);
2337EXPORT_SYMBOL(d_move);
2338EXPORT_SYMBOL_GPL(d_materialise_unique);
2339EXPORT_SYMBOL(d_path);
2340EXPORT_SYMBOL(d_prune_aliases);
2341EXPORT_SYMBOL(d_rehash);
2342EXPORT_SYMBOL(d_splice_alias);
2343EXPORT_SYMBOL(d_add_ci);
2344EXPORT_SYMBOL(d_validate);
2345EXPORT_SYMBOL(dget_locked);
2346EXPORT_SYMBOL(dput);
2347EXPORT_SYMBOL(find_inode_number);
2348EXPORT_SYMBOL(have_submounts);
2349EXPORT_SYMBOL(names_cachep);
2350EXPORT_SYMBOL(shrink_dcache_parent);
2351EXPORT_SYMBOL(shrink_dcache_sb);
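Besides moving the EXPORT_SYMBOL lines next to their definitions, the dcache diff adds path_is_under(), which generalizes is_subdir() across mounts: climb path1's chain of parent mounts until the walk either reaches path2's mount (then compare dentries at the crossing mountpoint) or the self-parented root mount (then path1 cannot be under path2). A toy, lock-free restatement with illustrative types:

#include <stdbool.h>

/* Toy stand-ins; the real types carry far more state. Roots are
 * self-parented, matching the kernel convention. */
struct toy_dentry { struct toy_dentry *parent; };
struct toy_mount  {
        struct toy_mount *parent;
        struct toy_dentry *mountpoint;   /* where this mount hangs in its parent */
};

static bool toy_is_subdir(struct toy_dentry *d, struct toy_dentry *anc)
{
        for (;;) {
                if (d == anc)
                        return true;
                if (d->parent == d)
                        return false;    /* hit the filesystem root */
                d = d->parent;
        }
}

bool toy_path_is_under(struct toy_mount *m1, struct toy_dentry *d1,
                       struct toy_mount *m2, struct toy_dentry *d2)
{
        while (m1 != m2) {
                if (m1->parent == m1)    /* ran off the top of the mount tree */
                        return false;
                d1 = m1->mountpoint;     /* crossing point into the parent */
                m1 = m1->parent;
        }
        return toy_is_subdir(d1, d2);
}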
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 274ac865bae8..049d6c36da09 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -496,7 +496,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
496 } 496 }
497 d_move(old_dentry, dentry); 497 d_move(old_dentry, dentry);
498 fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name, 498 fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name,
499 old_dentry->d_name.name, S_ISDIR(old_dentry->d_inode->i_mode), 499 S_ISDIR(old_dentry->d_inode->i_mode),
500 NULL, old_dentry); 500 NULL, old_dentry);
501 fsnotify_oldname_free(old_name); 501 fsnotify_oldname_free(old_name);
502 unlock_rename(new_dir, old_dir); 502 unlock_rename(new_dir, old_dir);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 26a8bd40400a..f994a7dfda85 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -148,7 +148,7 @@ static void lockspace_kobj_release(struct kobject *k)
148 kfree(ls); 148 kfree(ls);
149} 149}
150 150
151static struct sysfs_ops dlm_attr_ops = { 151static const struct sysfs_ops dlm_attr_ops = {
152 .show = dlm_attr_show, 152 .show = dlm_attr_show,
153 .store = dlm_attr_store, 153 .store = dlm_attr_store,
154}; 154};
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 84f70bfb0baf..b12532e553f8 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -312,7 +312,7 @@ int dlm_ls_stop(struct dlm_ls *ls)
312 /* 312 /*
313 * This in_recovery lock does two things: 313 * This in_recovery lock does two things:
314 * 1) Keeps this function from returning until all threads are out 314 * 1) Keeps this function from returning until all threads are out
315 * of locking routines and locking is truely stopped. 315 * of locking routines and locking is truly stopped.
316 * 2) Keeps any new requests from being processed until it's unlocked 316 * 2) Keeps any new requests from being processed until it's unlocked
317 * when recovery is complete. 317 * when recovery is complete.
318 */ 318 */
diff --git a/fs/exec.c b/fs/exec.c
index cce6bbdbdbb1..49cdaa19e5b9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -195,7 +195,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
195 * to work from. 195 * to work from.
196 */ 196 */
197 rlim = current->signal->rlim; 197 rlim = current->signal->rlim;
198 if (size > rlim[RLIMIT_STACK].rlim_cur / 4) { 198 if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) {
199 put_page(page); 199 put_page(page);
200 return NULL; 200 return NULL;
201 } 201 }
@@ -246,6 +246,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
246 vma->vm_start = vma->vm_end - PAGE_SIZE; 246 vma->vm_start = vma->vm_end - PAGE_SIZE;
247 vma->vm_flags = VM_STACK_FLAGS; 247 vma->vm_flags = VM_STACK_FLAGS;
248 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 248 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
249 INIT_LIST_HEAD(&vma->anon_vma_chain);
249 err = insert_vm_struct(mm, vma); 250 err = insert_vm_struct(mm, vma);
250 if (err) 251 if (err)
251 goto err; 252 goto err;
@@ -516,7 +517,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
516 /* 517 /*
517 * cover the whole range: [new_start, old_end) 518 * cover the whole range: [new_start, old_end)
518 */ 519 */
519 vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL); 520 if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
521 return -ENOMEM;
520 522
521 /* 523 /*
522 * move the page tables downwards, on failure we rely on 524 * move the page tables downwards, on failure we rely on
@@ -547,15 +549,13 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
547 tlb_finish_mmu(tlb, new_end, old_end); 549 tlb_finish_mmu(tlb, new_end, old_end);
548 550
549 /* 551 /*
550 * shrink the vma to just the new range. 552 * Shrink the vma to just the new range. Always succeeds.
551 */ 553 */
552 vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); 554 vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
553 555
554 return 0; 556 return 0;
555} 557}
556 558
557#define EXTRA_STACK_VM_PAGES 20 /* random */
558
559/* 559/*
560 * Finalizes the stack vm_area_struct. The flags and permissions are updated, 560 * Finalizes the stack vm_area_struct. The flags and permissions are updated,
561 * the stack is optionally relocated, and some extra space is added. 561 * the stack is optionally relocated, and some extra space is added.
@@ -577,7 +577,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
577 577
578#ifdef CONFIG_STACK_GROWSUP 578#ifdef CONFIG_STACK_GROWSUP
579 /* Limit stack size to 1GB */ 579 /* Limit stack size to 1GB */
580 stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max; 580 stack_base = rlimit_max(RLIMIT_STACK);
581 if (stack_base > (1 << 30)) 581 if (stack_base > (1 << 30))
582 stack_base = 1 << 30; 582 stack_base = 1 << 30;
583 583
@@ -630,7 +630,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
630 goto out_unlock; 630 goto out_unlock;
631 } 631 }
632 632
633 stack_expand = EXTRA_STACK_VM_PAGES * PAGE_SIZE; 633 stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
634 stack_size = vma->vm_end - vma->vm_start; 634 stack_size = vma->vm_end - vma->vm_start;
635 /* 635 /*
636 * Align this down to a page boundary as expand_stack 636 * Align this down to a page boundary as expand_stack
@@ -718,6 +718,7 @@ static int exec_mmap(struct mm_struct *mm)
718 /* Notify parent that we're no longer interested in the old VM */ 718 /* Notify parent that we're no longer interested in the old VM */
719 tsk = current; 719 tsk = current;
720 old_mm = current->mm; 720 old_mm = current->mm;
721 sync_mm_rss(tsk, old_mm);
721 mm_release(tsk, old_mm); 722 mm_release(tsk, old_mm);
722 723
723 if (old_mm) { 724 if (old_mm) {
@@ -1532,7 +1533,7 @@ static int format_corename(char *corename, long signr)
1532 /* core limit size */ 1533 /* core limit size */
1533 case 'c': 1534 case 'c':
1534 rc = snprintf(out_ptr, out_end - out_ptr, 1535 rc = snprintf(out_ptr, out_end - out_ptr,
1535 "%lu", current->signal->rlim[RLIMIT_CORE].rlim_cur); 1536 "%lu", rlimit(RLIMIT_CORE));
1536 if (rc > out_end - out_ptr) 1537 if (rc > out_end - out_ptr)
1537 goto out; 1538 goto out;
1538 out_ptr += rc; 1539 out_ptr += rc;
@@ -1560,12 +1561,13 @@ out:
1560 return ispipe; 1561 return ispipe;
1561} 1562}
1562 1563
1563static int zap_process(struct task_struct *start) 1564static int zap_process(struct task_struct *start, int exit_code)
1564{ 1565{
1565 struct task_struct *t; 1566 struct task_struct *t;
1566 int nr = 0; 1567 int nr = 0;
1567 1568
1568 start->signal->flags = SIGNAL_GROUP_EXIT; 1569 start->signal->flags = SIGNAL_GROUP_EXIT;
1570 start->signal->group_exit_code = exit_code;
1569 start->signal->group_stop_count = 0; 1571 start->signal->group_stop_count = 0;
1570 1572
1571 t = start; 1573 t = start;
@@ -1590,8 +1592,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
1590 spin_lock_irq(&tsk->sighand->siglock); 1592 spin_lock_irq(&tsk->sighand->siglock);
1591 if (!signal_group_exit(tsk->signal)) { 1593 if (!signal_group_exit(tsk->signal)) {
1592 mm->core_state = core_state; 1594 mm->core_state = core_state;
1593 tsk->signal->group_exit_code = exit_code; 1595 nr = zap_process(tsk, exit_code);
1594 nr = zap_process(tsk);
1595 } 1596 }
1596 spin_unlock_irq(&tsk->sighand->siglock); 1597 spin_unlock_irq(&tsk->sighand->siglock);
1597 if (unlikely(nr < 0)) 1598 if (unlikely(nr < 0))
@@ -1640,7 +1641,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
1640 if (p->mm) { 1641 if (p->mm) {
1641 if (unlikely(p->mm == mm)) { 1642 if (unlikely(p->mm == mm)) {
1642 lock_task_sighand(p, &flags); 1643 lock_task_sighand(p, &flags);
1643 nr += zap_process(p); 1644 nr += zap_process(p, exit_code);
1644 unlock_task_sighand(p, &flags); 1645 unlock_task_sighand(p, &flags);
1645 } 1646 }
1646 break; 1647 break;
@@ -1747,14 +1748,19 @@ void set_dumpable(struct mm_struct *mm, int value)
1747 } 1748 }
1748} 1749}
1749 1750
1750int get_dumpable(struct mm_struct *mm) 1751static int __get_dumpable(unsigned long mm_flags)
1751{ 1752{
1752 int ret; 1753 int ret;
1753 1754
1754 ret = mm->flags & 0x3; 1755 ret = mm_flags & MMF_DUMPABLE_MASK;
1755 return (ret >= 2) ? 2 : ret; 1756 return (ret >= 2) ? 2 : ret;
1756} 1757}
1757 1758
1759int get_dumpable(struct mm_struct *mm)
1760{
1761 return __get_dumpable(mm->flags);
1762}
1763
1758static void wait_for_dump_helpers(struct file *file) 1764static void wait_for_dump_helpers(struct file *file)
1759{ 1765{
1760 struct pipe_inode_info *pipe; 1766 struct pipe_inode_info *pipe;
@@ -1797,7 +1803,13 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1797 struct coredump_params cprm = { 1803 struct coredump_params cprm = {
1798 .signr = signr, 1804 .signr = signr,
1799 .regs = regs, 1805 .regs = regs,
1800 .limit = current->signal->rlim[RLIMIT_CORE].rlim_cur, 1806 .limit = rlimit(RLIMIT_CORE),
1807 /*
1808 * We must use the same mm->flags while dumping core to avoid
1809 * inconsistency of bit flags, since this flag is not protected
1810 * by any locks.
1811 */
1812 .mm_flags = mm->flags,
1801 }; 1813 };
1802 1814
1803 audit_core_dumps(signr); 1815 audit_core_dumps(signr);
@@ -1816,7 +1828,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1816 /* 1828 /*
1817 * If another thread got here first, or we are not dumpable, bail out. 1829 * If another thread got here first, or we are not dumpable, bail out.
1818 */ 1830 */
1819 if (mm->core_state || !get_dumpable(mm)) { 1831 if (mm->core_state || !__get_dumpable(cprm.mm_flags)) {
1820 up_write(&mm->mmap_sem); 1832 up_write(&mm->mmap_sem);
1821 put_cred(cred); 1833 put_cred(cred);
1822 goto fail; 1834 goto fail;
@@ -1827,7 +1839,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1827 * process nor do we know its entire history. We only know it 1839 * process nor do we know its entire history. We only know it
1828 * was tainted so we dump it as root in mode 2. 1840 * was tainted so we dump it as root in mode 2.
1829 */ 1841 */
1830 if (get_dumpable(mm) == 2) { /* Setuid core dump mode */ 1842 if (__get_dumpable(cprm.mm_flags) == 2) {
1843 /* Setuid core dump mode */
1831 flag = O_EXCL; /* Stop rewrite attacks */ 1844 flag = O_EXCL; /* Stop rewrite attacks */
1832 cred->fsuid = 0; /* Dump root private */ 1845 cred->fsuid = 0; /* Dump root private */
1833 } 1846 }
@@ -1923,8 +1936,9 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1923 /* 1936 /*
1924 * Dont allow local users get cute and trick others to coredump 1937 * Dont allow local users get cute and trick others to coredump
1925 * into their pre-created files: 1938 * into their pre-created files:
1939 * Note, this is not relevant for pipes
1926 */ 1940 */
1927 if (inode->i_uid != current_fsuid()) 1941 if (!ispipe && (inode->i_uid != current_fsuid()))
1928 goto close_fail; 1942 goto close_fail;
1929 if (!cprm.file->f_op) 1943 if (!cprm.file->f_op)
1930 goto close_fail; 1944 goto close_fail;
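The coredump changes snapshot mm->flags into cprm.mm_flags once, and every later dumpability test runs against that copy, so a concurrent, unlocked update of the flags cannot make the checks disagree mid-dump; the magic 0x3 also gains a name, MMF_DUMPABLE_MASK. A runnable model of __get_dumpable() and its clamping:

#include <stdio.h>

#define MMF_DUMPABLE_MASK 0x3UL   /* low two bits of mm->flags, per the hunk */

/* Mirrors __get_dumpable(): derive dumpability from a flags snapshot. */
static int get_dumpable_from(unsigned long mm_flags)
{
        int ret = mm_flags & MMF_DUMPABLE_MASK;
        return (ret >= 2) ? 2 : ret;   /* 3 is clamped to "suid-safe" mode 2 */
}

int main(void)
{
        unsigned long f;
        for (f = 0; f < 4; f++)
                printf("flags=%lu -> dumpable=%d\n", f, get_dumpable_from(f));
        return 0;
}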
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index b1b178e61718..f0d520312d8b 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -55,6 +55,8 @@
55/* exofs Application specific page/attribute */ 55/* exofs Application specific page/attribute */
56# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) 56# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
57# define EXOFS_ATTR_INODE_DATA 1 57# define EXOFS_ATTR_INODE_DATA 1
58# define EXOFS_ATTR_INODE_FILE_LAYOUT 2
59# define EXOFS_ATTR_INODE_DIR_LAYOUT 3
58 60
59/* 61/*
60 * The maximum number of files we can have is limited by the size of the 62 * The maximum number of files we can have is limited by the size of the
@@ -206,4 +208,41 @@ enum {
206 (((name_len) + offsetof(struct exofs_dir_entry, name) + \ 208 (((name_len) + offsetof(struct exofs_dir_entry, name) + \
207 EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND) 209 EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
208 210
211/*
212 * The on-disk (optional) layout structure.
213 * sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT
214 * attribute, attached to any inode, usually to a directory.
215 */
216
217enum exofs_inode_layout_gen_functions {
218 LAYOUT_MOVING_WINDOW = 0,
219 LAYOUT_IMPLICT = 1,
220};
221
222struct exofs_on_disk_inode_layout {
223 __le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */
224 __le16 pad;
225 union {
226 /* gen_func == LAYOUT_MOVING_WINDOW (default) */
227 struct exofs_layout_sliding_window {
228 __le32 num_devices; /* first n devices in global-table*/
229 } sliding_window __packed;
230
231 /* gen_func == LAYOUT_IMPLICT */
232 struct exofs_layout_implict_list {
233 struct exofs_dt_data_map data_map;
234 /* Variable array of size data_map.cb_num_comps. These
235 * are device indexes of the devices in the global table
236 */
237 __le32 dev_indexes[];
238 } implict __packed;
239 };
240} __packed;
241
242static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs)
243{
244 return sizeof(struct exofs_on_disk_inode_layout) +
245 max_devs * sizeof(__le32);
246}
247
209#endif /*ifndef __EXOFS_COM_H__*/ 248#endif /*ifndef __EXOFS_COM_H__*/
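exofs_on_disk_inode_layout_size() is plain header-plus-table arithmetic: the fixed attribute header followed by one __le32 device index per device. A runnable userspace mock of the same calculation (the union payload is collapsed to one 32-bit slot, so the absolute numbers differ from the kernel struct's):

#include <stdint.h>
#include <stdio.h>

/* Simplified, packed mock of exofs_on_disk_inode_layout. */
struct mock_layout {
        uint16_t gen_func;
        uint16_t pad;
        uint32_t payload;   /* stands in for the union above */
} __attribute__((packed));

static size_t mock_layout_size(unsigned max_devs)
{
        return sizeof(struct mock_layout) + max_devs * sizeof(uint32_t);
}

int main(void)
{
        printf("1 dev: %zu bytes, 8 devs: %zu bytes\n",
               mock_layout_size(1), mock_layout_size(8));
        return 0;
}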
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index c35fd4623986..8442e353309f 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -55,12 +55,28 @@
55/* u64 has problems with printk this will cast it to unsigned long long */ 55/* u64 has problems with printk this will cast it to unsigned long long */
56#define _LLU(x) (unsigned long long)(x) 56#define _LLU(x) (unsigned long long)(x)
57 57
58struct exofs_layout {
59 osd_id s_pid; /* partition ID of file system*/
60
61 /* Our way of looking at the data_map */
62 unsigned stripe_unit;
63 unsigned mirrors_p1;
64
65 unsigned group_width;
66 u64 group_depth;
67 unsigned group_count;
68
69 enum exofs_inode_layout_gen_functions lay_func;
70
71 unsigned s_numdevs; /* Num of devices in array */
72 struct osd_dev *s_ods[0]; /* Variable length */
73};
74
58/* 75/*
59 * our extension to the in-memory superblock 76 * our extension to the in-memory superblock
60 */ 77 */
61struct exofs_sb_info { 78struct exofs_sb_info {
62 struct exofs_fscb s_fscb; /* Written often, pre-allocate*/ 79 struct exofs_fscb s_fscb; /* Written often, pre-allocate*/
63 osd_id s_pid; /* partition ID of file system*/
64 int s_timeout; /* timeout for OSD operations */ 80 int s_timeout; /* timeout for OSD operations */
65 uint64_t s_nextid; /* highest object ID used */ 81 uint64_t s_nextid; /* highest object ID used */
66 uint32_t s_numfiles; /* number of files on fs */ 82 uint32_t s_numfiles; /* number of files on fs */
@@ -69,22 +85,27 @@ struct exofs_sb_info {
69 atomic_t s_curr_pending; /* number of pending commands */ 85 atomic_t s_curr_pending; /* number of pending commands */
70 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ 86 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */
71 87
72 struct pnfs_osd_data_map data_map; /* Default raid to use */ 88 struct pnfs_osd_data_map data_map; /* Default raid to use
73 unsigned s_numdevs; /* Num of devices in array */ 89 * FIXME: Needed ?
74 struct osd_dev *s_ods[1]; /* Variable length, minimum 1 */ 90 */
91/* struct exofs_layout dir_layout;*/ /* Default dir layout */
92 struct exofs_layout layout; /* Default files layout,
93 * contains the variable osd_dev
94 * array. Keep last */
95 struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */
75}; 96};
76 97
77/* 98/*
78 * our extension to the in-memory inode 99 * our extension to the in-memory inode
79 */ 100 */
80struct exofs_i_info { 101struct exofs_i_info {
102 struct inode vfs_inode; /* normal in-memory inode */
103 wait_queue_head_t i_wq; /* wait queue for inode */
81 unsigned long i_flags; /* various atomic flags */ 104 unsigned long i_flags; /* various atomic flags */
82 uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ 105 uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/
83 uint32_t i_dir_start_lookup; /* which page to start lookup */ 106 uint32_t i_dir_start_lookup; /* which page to start lookup */
84 wait_queue_head_t i_wq; /* wait queue for inode */
85 uint64_t i_commit_size; /* the object's written length */ 107 uint64_t i_commit_size; /* the object's written length */
86 uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ 108 uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */
87 struct inode vfs_inode; /* normal in-memory inode */
88}; 109};
89 110
90static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) 111static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
@@ -101,7 +122,7 @@ struct exofs_io_state {
101 void *private; 122 void *private;
102 exofs_io_done_fn done; 123 exofs_io_done_fn done;
103 124
104 struct exofs_sb_info *sbi; 125 struct exofs_layout *layout;
105 struct osd_obj_id obj; 126 struct osd_obj_id obj;
106 u8 *cred; 127 u8 *cred;
107 128
@@ -109,7 +130,11 @@ struct exofs_io_state {
109 loff_t offset; 130 loff_t offset;
110 unsigned long length; 131 unsigned long length;
111 void *kern_buff; 132 void *kern_buff;
112 struct bio *bio; 133
134 struct page **pages;
135 unsigned nr_pages;
136 unsigned pgbase;
137 unsigned pages_consumed;
113 138
114 /* Attributes */ 139 /* Attributes */
115 unsigned in_attr_len; 140 unsigned in_attr_len;
@@ -122,6 +147,9 @@ struct exofs_io_state {
122 struct exofs_per_dev_state { 147 struct exofs_per_dev_state {
123 struct osd_request *or; 148 struct osd_request *or;
124 struct bio *bio; 149 struct bio *bio;
150 loff_t offset;
151 unsigned length;
152 unsigned dev;
125 } per_dev[]; 153 } per_dev[];
126}; 154};
127 155
@@ -175,6 +203,12 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
175} 203}
176 204
177/* 205/*
206 * Given a layout, object_number and stripe_index return the associated global
207 * dev_index
208 */
209unsigned exofs_layout_od_id(struct exofs_layout *layout,
210 osd_id obj_no, unsigned layout_index);
211/*
178 * Maximum count of links to a file 212 * Maximum count of links to a file
179 */ 213 */
180#define EXOFS_LINK_MAX 32000 214#define EXOFS_LINK_MAX 32000
@@ -189,7 +223,8 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
189int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, 223int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
190 u64 offset, void *p, unsigned length); 224 u64 offset, void *p, unsigned length);
191 225
192int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** ios); 226int exofs_get_io_state(struct exofs_layout *layout,
227 struct exofs_io_state **ios);
193void exofs_put_io_state(struct exofs_io_state *ios); 228void exofs_put_io_state(struct exofs_io_state *ios);
194 229
195int exofs_check_io(struct exofs_io_state *ios, u64 *resid); 230int exofs_check_io(struct exofs_io_state *ios, u64 *resid);
@@ -226,7 +261,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
226 struct page **pagep, void **fsdata); 261 struct page **pagep, void **fsdata);
227extern struct inode *exofs_iget(struct super_block *, unsigned long); 262extern struct inode *exofs_iget(struct super_block *, unsigned long);
228struct inode *exofs_new_inode(struct inode *, int); 263struct inode *exofs_new_inode(struct inode *, int);
229extern int exofs_write_inode(struct inode *, int); 264extern int exofs_write_inode(struct inode *, struct writeback_control *wbc);
230extern void exofs_delete_inode(struct inode *); 265extern void exofs_delete_inode(struct inode *);
231 266
232/* dir.c: */ 267/* dir.c: */
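The io_state reshuffle replaces the caller-built bio with a bare page array plus offset and length, and exofs_get_io_state() now keys off a struct exofs_layout rather than the whole sb_info. A hedged kernel-style sketch of the resulting submission pattern; exofs_sbi_read() as the entry point and its wait-when-no-callback behavior are assumptions about ios.c, not something this header guarantees:

/* Sketch only: error handling trimmed, field names per exofs.h above. */
static int sketch_read_pages(struct exofs_layout *layout, u8 *cred,
                             struct osd_obj_id obj, struct page **pages,
                             unsigned nr_pages, loff_t offset,
                             unsigned long length)
{
        struct exofs_io_state *ios;
        int ret;

        ret = exofs_get_io_state(layout, &ios);
        if (unlikely(ret))
                return ret;

        ios->obj = obj;
        ios->cred = cred;
        ios->pages = pages;         /* plain page array, no bio to build */
        ios->nr_pages = nr_pages;
        ios->offset = offset;
        ios->length = length;

        ret = exofs_sbi_read(ios);  /* assumed synchronous when ios->done is NULL */

        exofs_put_io_state(ios);
        return ret;
}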
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 2afbcebeda71..a17e4b733e35 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -41,16 +41,18 @@
41 41
42enum { BIO_MAX_PAGES_KMALLOC = 42enum { BIO_MAX_PAGES_KMALLOC =
43 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), 43 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
44 MAX_PAGES_KMALLOC =
45 PAGE_SIZE / sizeof(struct page *),
44}; 46};
45 47
46struct page_collect { 48struct page_collect {
47 struct exofs_sb_info *sbi; 49 struct exofs_sb_info *sbi;
48 struct request_queue *req_q;
49 struct inode *inode; 50 struct inode *inode;
50 unsigned expected_pages; 51 unsigned expected_pages;
51 struct exofs_io_state *ios; 52 struct exofs_io_state *ios;
52 53
53 struct bio *bio; 54 struct page **pages;
55 unsigned alloc_pages;
54 unsigned nr_pages; 56 unsigned nr_pages;
55 unsigned long length; 57 unsigned long length;
56 loff_t pg_first; /* keep 64bit also in 32-arches */ 58 loff_t pg_first; /* keep 64bit also in 32-arches */
@@ -62,15 +64,12 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
62 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; 64 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
63 65
64 pcol->sbi = sbi; 66 pcol->sbi = sbi;
65 /* Create master bios on first Q, later on cloning, each clone will be
66 * allocated on it's destination Q
67 */
68 pcol->req_q = osd_request_queue(sbi->s_ods[0]);
69 pcol->inode = inode; 67 pcol->inode = inode;
70 pcol->expected_pages = expected_pages; 68 pcol->expected_pages = expected_pages;
71 69
72 pcol->ios = NULL; 70 pcol->ios = NULL;
73 pcol->bio = NULL; 71 pcol->pages = NULL;
72 pcol->alloc_pages = 0;
74 pcol->nr_pages = 0; 73 pcol->nr_pages = 0;
75 pcol->length = 0; 74 pcol->length = 0;
76 pcol->pg_first = -1; 75 pcol->pg_first = -1;
@@ -80,7 +79,8 @@ static void _pcol_reset(struct page_collect *pcol)
80{ 79{
81 pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages); 80 pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
82 81
83 pcol->bio = NULL; 82 pcol->pages = NULL;
83 pcol->alloc_pages = 0;
84 pcol->nr_pages = 0; 84 pcol->nr_pages = 0;
85 pcol->length = 0; 85 pcol->length = 0;
86 pcol->pg_first = -1; 86 pcol->pg_first = -1;
@@ -90,38 +90,43 @@ static void _pcol_reset(struct page_collect *pcol)
90 * it might not end here. don't be left with nothing 90 * it might not end here. don't be left with nothing
91 */ 91 */
92 if (!pcol->expected_pages) 92 if (!pcol->expected_pages)
93 pcol->expected_pages = BIO_MAX_PAGES_KMALLOC; 93 pcol->expected_pages = MAX_PAGES_KMALLOC;
94} 94}
95 95
96static int pcol_try_alloc(struct page_collect *pcol) 96static int pcol_try_alloc(struct page_collect *pcol)
97{ 97{
98 int pages = min_t(unsigned, pcol->expected_pages, 98 unsigned pages = min_t(unsigned, pcol->expected_pages,
99 BIO_MAX_PAGES_KMALLOC); 99 MAX_PAGES_KMALLOC);
100 100
101 if (!pcol->ios) { /* First time allocate io_state */ 101 if (!pcol->ios) { /* First time allocate io_state */
102 int ret = exofs_get_io_state(pcol->sbi, &pcol->ios); 102 int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios);
103 103
104 if (ret) 104 if (ret)
105 return ret; 105 return ret;
106 } 106 }
107 107
108 /* TODO: easily support bio chaining */
109 pages = min_t(unsigned, pages,
110 pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC);
111
108 for (; pages; pages >>= 1) { 112 for (; pages; pages >>= 1) {
109 pcol->bio = bio_kmalloc(GFP_KERNEL, pages); 113 pcol->pages = kmalloc(pages * sizeof(struct page *),
110 if (likely(pcol->bio)) 114 GFP_KERNEL);
115 if (likely(pcol->pages)) {
116 pcol->alloc_pages = pages;
111 return 0; 117 return 0;
118 }
112 } 119 }
113 120
114 EXOFS_ERR("Failed to bio_kmalloc expected_pages=%u\n", 121 EXOFS_ERR("Failed to kmalloc expected_pages=%u\n",
115 pcol->expected_pages); 122 pcol->expected_pages);
116 return -ENOMEM; 123 return -ENOMEM;
117} 124}
118 125
119static void pcol_free(struct page_collect *pcol) 126static void pcol_free(struct page_collect *pcol)
120{ 127{
121 if (pcol->bio) { 128 kfree(pcol->pages);
122 bio_put(pcol->bio); 129 pcol->pages = NULL;
123 pcol->bio = NULL;
124 }
125 130
126 if (pcol->ios) { 131 if (pcol->ios) {
127 exofs_put_io_state(pcol->ios); 132 exofs_put_io_state(pcol->ios);
@@ -132,11 +137,10 @@ static void pcol_free(struct page_collect *pcol)
132static int pcol_add_page(struct page_collect *pcol, struct page *page, 137static int pcol_add_page(struct page_collect *pcol, struct page *page,
133 unsigned len) 138 unsigned len)
134{ 139{
135 int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0); 140 if (unlikely(pcol->nr_pages >= pcol->alloc_pages))
136 if (unlikely(len != added_len))
137 return -ENOMEM; 141 return -ENOMEM;
138 142
139 ++pcol->nr_pages; 143 pcol->pages[pcol->nr_pages++] = page;
140 pcol->length += len; 144 pcol->length += len;
141 return 0; 145 return 0;
142} 146}
@@ -181,7 +185,6 @@ static void update_write_page(struct page *page, int ret)
181 */ 185 */
182static int __readpages_done(struct page_collect *pcol, bool do_unlock) 186static int __readpages_done(struct page_collect *pcol, bool do_unlock)
183{ 187{
184 struct bio_vec *bvec;
185 int i; 188 int i;
186 u64 resid; 189 u64 resid;
187 u64 good_bytes; 190 u64 good_bytes;
@@ -193,13 +196,13 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
193 else 196 else
194 good_bytes = pcol->length - resid; 197 good_bytes = pcol->length - resid;
195 198
196 EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx" 199 EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
197 " length=0x%lx nr_pages=%u\n", 200 " length=0x%lx nr_pages=%u\n",
198 pcol->inode->i_ino, _LLU(good_bytes), pcol->length, 201 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
199 pcol->nr_pages); 202 pcol->nr_pages);
200 203
201 __bio_for_each_segment(bvec, pcol->bio, i, 0) { 204 for (i = 0; i < pcol->nr_pages; i++) {
202 struct page *page = bvec->bv_page; 205 struct page *page = pcol->pages[i];
203 struct inode *inode = page->mapping->host; 206 struct inode *inode = page->mapping->host;
204 int page_stat; 207 int page_stat;
205 208
@@ -218,11 +221,11 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
218 ret = update_read_page(page, page_stat); 221 ret = update_read_page(page, page_stat);
219 if (do_unlock) 222 if (do_unlock)
220 unlock_page(page); 223 unlock_page(page);
221 length += bvec->bv_len; 224 length += PAGE_SIZE;
222 } 225 }
223 226
224 pcol_free(pcol); 227 pcol_free(pcol);
225 EXOFS_DBGMSG("readpages_done END\n"); 228 EXOFS_DBGMSG2("readpages_done END\n");
226 return ret; 229 return ret;
227} 230}
228 231
@@ -238,11 +241,10 @@ static void readpages_done(struct exofs_io_state *ios, void *p)
238 241
 static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
 {
-	struct bio_vec *bvec;
 	int i;

-	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
-		struct page *page = bvec->bv_page;
+	for (i = 0; i < pcol->nr_pages; i++) {
+		struct page *page = pcol->pages[i];

 		if (rw == READ)
 			update_read_page(page, ret);
@@ -260,13 +262,14 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
 	struct page_collect *pcol_copy = NULL;
 	int ret;

-	if (!pcol->bio)
+	if (!pcol->pages)
 		return 0;

 	/* see comment in _readpage() about sync reads */
 	WARN_ON(is_sync && (pcol->nr_pages != 1));

-	ios->bio = pcol->bio;
+	ios->pages = pcol->pages;
+	ios->nr_pages = pcol->nr_pages;
 	ios->length = pcol->length;
 	ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;

@@ -290,7 +293,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync)

 	atomic_inc(&pcol->sbi->s_curr_pending);

-	EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
+	EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
 		  ios->obj.id, _LLU(ios->offset), pcol->length);

 	/* pages ownership was passed to pcol_copy */
@@ -366,7 +369,7 @@ try_again:
 		goto try_again;
 	}

-	if (!pcol->bio) {
+	if (!pcol->pages) {
 		ret = pcol_try_alloc(pcol);
 		if (unlikely(ret))
 			goto fail;
@@ -448,7 +451,6 @@ static int exofs_readpage(struct file *file, struct page *page)
 static void writepages_done(struct exofs_io_state *ios, void *p)
 {
 	struct page_collect *pcol = p;
-	struct bio_vec *bvec;
 	int i;
 	u64 resid;
 	u64 good_bytes;
@@ -462,13 +464,13 @@ static void writepages_done(struct exofs_io_state *ios, void *p)
 	else
 		good_bytes = pcol->length - resid;

-	EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx"
+	EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
 		     " length=0x%lx nr_pages=%u\n",
 		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
 		     pcol->nr_pages);

-	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
-		struct page *page = bvec->bv_page;
+	for (i = 0; i < pcol->nr_pages; i++) {
+		struct page *page = pcol->pages[i];
 		struct inode *inode = page->mapping->host;
 		int page_stat;

@@ -485,12 +487,12 @@ static void writepages_done(struct exofs_io_state *ios, void *p)
 		EXOFS_DBGMSG2("  writepages_done(0x%lx, 0x%lx) status=%d\n",
 			  inode->i_ino, page->index, page_stat);

-		length += bvec->bv_len;
+		length += PAGE_SIZE;
 	}

 	pcol_free(pcol);
 	kfree(pcol);
-	EXOFS_DBGMSG("writepages_done END\n");
+	EXOFS_DBGMSG2("writepages_done END\n");
 }

 static int write_exec(struct page_collect *pcol)
@@ -500,7 +502,7 @@ static int write_exec(struct page_collect *pcol)
 	struct page_collect *pcol_copy = NULL;
 	int ret;

-	if (!pcol->bio)
+	if (!pcol->pages)
 		return 0;

 	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -512,9 +514,8 @@ static int write_exec(struct page_collect *pcol)

 	*pcol_copy = *pcol;

-	pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */
-
-	ios->bio = pcol_copy->bio;
+	ios->pages = pcol_copy->pages;
+	ios->nr_pages = pcol_copy->nr_pages;
 	ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT;
 	ios->length = pcol_copy->length;
 	ios->done = writepages_done;
@@ -527,7 +528,7 @@ static int write_exec(struct page_collect *pcol)
 	}

 	atomic_inc(&pcol->sbi->s_curr_pending);
-	EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
+	EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
 		  pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
 		  pcol->length);
 	/* pages ownership was passed to pcol_copy */
@@ -605,7 +606,7 @@ try_again:
 		goto try_again;
 	}

-	if (!pcol->bio) {
+	if (!pcol->pages) {
 		ret = pcol_try_alloc(pcol);
 		if (unlikely(ret))
 			goto fail;
@@ -616,7 +617,7 @@ try_again:

 	ret = pcol_add_page(pcol, page, len);
 	if (unlikely(ret)) {
-		EXOFS_DBGMSG("Failed pcol_add_page "
+		EXOFS_DBGMSG2("Failed pcol_add_page "
 			     "nr_pages=%u total_length=0x%lx\n",
 			     pcol->nr_pages, pcol->length);

@@ -663,7 +664,7 @@ static int exofs_writepages(struct address_space *mapping,
 	if (expected_pages < 32L)
 		expected_pages = 32L;

-	EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
+	EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
 		  "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n",
 		  mapping->host->i_ino, wbc->range_start, wbc->range_end,
 		  mapping->nrpages, start, end, expected_pages);
@@ -859,20 +860,33 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
 	return error;
 }

+static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF(
+	EXOFS_APAGE_FS_DATA,
+	EXOFS_ATTR_INODE_FILE_LAYOUT,
+	0);
+static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF(
+	EXOFS_APAGE_FS_DATA,
+	EXOFS_ATTR_INODE_DIR_LAYOUT,
+	0);
+
 /*
- * Read an inode from the OSD, and return it as is. We also return the size
- * attribute in the 'obj_size' argument.
+ * Read the Linux inode info from the OSD, and return it as is. In exofs the
+ * inode info is in an application specific page/attribute of the osd-object.
 */
 static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
-		    struct exofs_fcb *inode, uint64_t *obj_size)
+		    struct exofs_fcb *inode)
 {
 	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct osd_attr attrs[2];
+	struct osd_attr attrs[] = {
+		[0] = g_attr_inode_data,
+		[1] = g_attr_inode_file_layout,
+		[2] = g_attr_inode_dir_layout,
+	};
 	struct exofs_io_state *ios;
+	struct exofs_on_disk_inode_layout *layout;
 	int ret;

-	*obj_size = ~0;
-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
 		return ret;
@@ -882,14 +896,25 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
 	exofs_make_credential(oi->i_cred, &ios->obj);
 	ios->cred = oi->i_cred;

-	attrs[0] = g_attr_inode_data;
-	attrs[1] = g_attr_logical_length;
+	attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
+	attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
+
 	ios->in_attr = attrs;
 	ios->in_attr_len = ARRAY_SIZE(attrs);

 	ret = exofs_sbi_read(ios);
-	if (ret)
+	if (unlikely(ret)) {
+		EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n",
+			  _LLU(ios->obj.id), ret);
+		memset(inode, 0, sizeof(*inode));
+		inode->i_mode = 0040000 | (0777 & ~022);
+		/* If object is lost on target we might as well enable it's
+		 * delete.
+		 */
+		if ((ret == -ENOENT) || (ret == -EINVAL))
+			ret = 0;
 		goto out;
+	}

 	ret = extract_attr_from_ios(ios, &attrs[0]);
 	if (ret) {
@@ -901,11 +926,33 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,

 	ret = extract_attr_from_ios(ios, &attrs[1]);
 	if (ret) {
-		EXOFS_ERR("%s: extract_attr of logical_length failed\n",
-			__func__);
+		EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
+		goto out;
+	}
+	if (attrs[1].len) {
+		layout = attrs[1].val_ptr;
+		if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
+			EXOFS_ERR("%s: unsupported files layout %d\n",
+				__func__, layout->gen_func);
+			ret = -ENOTSUPP;
+			goto out;
+		}
+	}
+
+	ret = extract_attr_from_ios(ios, &attrs[2]);
+	if (ret) {
+		EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
 		goto out;
 	}
-	*obj_size = get_unaligned_be64(attrs[1].val_ptr);
+	if (attrs[2].len) {
+		layout = attrs[2].val_ptr;
+		if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
+			EXOFS_ERR("%s: unsupported meta-data layout %d\n",
+				__func__, layout->gen_func);
+			ret = -ENOTSUPP;
+			goto out;
+		}
+	}

 out:
 	exofs_put_io_state(ios);
@@ -925,7 +972,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 	struct exofs_i_info *oi;
 	struct exofs_fcb fcb;
 	struct inode *inode;
-	uint64_t obj_size;
 	int ret;

 	inode = iget_locked(sb, ino);
@@ -937,7 +983,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 	__oi_init(oi);

 	/* read the inode from the osd */
-	ret = exofs_get_inode(sb, oi, &fcb, &obj_size);
+	ret = exofs_get_inode(sb, oi, &fcb);
 	if (ret)
 		goto bad_inode;

@@ -958,13 +1004,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 	inode->i_blkbits = EXOFS_BLKSHIFT;
 	inode->i_generation = le32_to_cpu(fcb.i_generation);

-	if ((inode->i_size != obj_size) &&
-	    (!exofs_inode_is_fast_symlink(inode))) {
-		EXOFS_ERR("WARNING: Size of inode=%llu != object=%llu\n",
-			  inode->i_size, _LLU(obj_size));
-		/* FIXME: call exofs_inode_recovery() */
-	}
-
 	oi->i_dir_start_lookup = 0;

 	if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
@@ -1043,7 +1082,7 @@ static void create_done(struct exofs_io_state *ios, void *p)

 	if (unlikely(ret)) {
 		EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
-			  _LLU(exofs_oi_objno(oi)), _LLU(sbi->s_pid));
+			  _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid));
 		/*TODO: When FS is corrupted creation can fail, object already
 		 * exist. Get rid of this asynchronous creation, if exist
 		 * increment the obj counter and try the next object. Until we
@@ -1104,7 +1143,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)

 	mark_inode_dirty(inode);

-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n");
 		return ERR_PTR(ret);
@@ -1170,8 +1209,10 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 	int ret;

 	args = kzalloc(sizeof(*args), GFP_KERNEL);
-	if (!args)
+	if (!args) {
+		EXOFS_DBGMSG("Faild kzalloc of args\n");
 		return -ENOMEM;
+	}

 	fcb = &args->fcb;

@@ -1200,7 +1241,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 	} else
 		memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));

-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
 		goto free_args;
@@ -1234,13 +1275,14 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 free_args:
 	kfree(args);
 out:
-	EXOFS_DBGMSG("ret=>%d\n", ret);
+	EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n",
+		     inode->i_ino, do_sync, ret);
 	return ret;
 }

-int exofs_write_inode(struct inode *inode, int wait)
+int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	return exofs_update_inode(inode, wait);
+	return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
 }

 /*
@@ -1283,7 +1325,7 @@ void exofs_delete_inode(struct inode *inode)

 	clear_inode(inode);

-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__);
 		return;
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 5bad01fa1f9f..5293bc411d17 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -23,9 +23,13 @@
  */

 #include <scsi/scsi_device.h>
+#include <asm/div64.h>

 #include "exofs.h"

+#define EXOFS_DBGMSG2(M...) do {} while (0)
+/* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */
+
 void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
 {
 	osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
@@ -64,21 +68,24 @@ out:
 	return ret;
 }

-int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios)
+int exofs_get_io_state(struct exofs_layout *layout,
+		       struct exofs_io_state **pios)
 {
 	struct exofs_io_state *ios;

 	/*TODO: Maybe use kmem_cach per sbi of size
-	 * exofs_io_state_size(sbi->s_numdevs)
+	 * exofs_io_state_size(layout->s_numdevs)
 	 */
-	ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL);
+	ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL);
 	if (unlikely(!ios)) {
+		EXOFS_DBGMSG("Faild kzalloc bytes=%d\n",
+			     exofs_io_state_size(layout->s_numdevs));
 		*pios = NULL;
 		return -ENOMEM;
 	}

-	ios->sbi = sbi;
-	ios->obj.partition = sbi->s_pid;
+	ios->layout = layout;
+	ios->obj.partition = layout->s_pid;
 	*pios = ios;
 	return 0;
 }
@@ -101,6 +108,29 @@ void exofs_put_io_state(struct exofs_io_state *ios)
 	}
 }

+unsigned exofs_layout_od_id(struct exofs_layout *layout,
+			    osd_id obj_no, unsigned layout_index)
+{
+/*	switch (layout->lay_func) {
+	case LAYOUT_MOVING_WINDOW:
+	{*/
+		unsigned dev_mod = obj_no;
+
+		return (layout_index + dev_mod * layout->mirrors_p1) %
+							  layout->s_numdevs;
+/*	}
+	case LAYOUT_FUNC_IMPLICT:
+		return layout->devs[layout_index];
+	}*/
+}
+
+static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios,
+					   unsigned layout_index)
+{
+	return ios->layout->s_ods[
+		exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)];
+}
+
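A quick userspace check of the mapping above, with assumed numbers that are not part of the patch (s_numdevs = 6, mirrors_p1 = 2): the moving-window function walks whole mirror sets around the device table per object number, so object 0 lands on devices 0 and 1, object 1 on 2 and 3, object 2 on 4 and 5, and object 3 wraps back to 0 and 1.

#include <stdio.h>

/* Userspace model of exofs_layout_od_id(): round-robin of mirror
 * sets across the device table, keyed by object number.
 * All values below are assumptions for illustration. */
static unsigned od_id(unsigned layout_index, unsigned long long obj_no,
		      unsigned mirrors_p1, unsigned s_numdevs)
{
	unsigned dev_mod = obj_no;	/* LAYOUT_MOVING_WINDOW case */

	return (layout_index + dev_mod * mirrors_p1) % s_numdevs;
}

int main(void)
{
	unsigned long long obj;

	for (obj = 0; obj < 4; obj++)	/* 6 devices, 2-way mirror */
		printf("obj %llu -> devs %u,%u\n", obj,
		       od_id(0, obj, 2, 6), od_id(1, obj, 2, 6));
	return 0;
}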
 static void _sync_done(struct exofs_io_state *ios, void *p)
 {
 	struct completion *waiting = p;
@@ -168,6 +198,21 @@ static int exofs_io_execute(struct exofs_io_state *ios)
 	return ret;
 }

+static void _clear_bio(struct bio *bio)
+{
+	struct bio_vec *bv;
+	unsigned i;
+
+	__bio_for_each_segment(bv, bio, i, 0) {
+		unsigned this_count = bv->bv_len;
+
+		if (likely(PAGE_SIZE == this_count))
+			clear_highpage(bv->bv_page);
+		else
+			zero_user(bv->bv_page, bv->bv_offset, this_count);
+	}
+}
+
 int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
 {
 	enum osd_err_priority acumulated_osd_err = 0;
@@ -176,16 +221,25 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)

 	for (i = 0; i < ios->numdevs; i++) {
 		struct osd_sense_info osi;
-		int ret = osd_req_decode_sense(ios->per_dev[i].or, &osi);
+		struct osd_request *or = ios->per_dev[i].or;
+		int ret;
+
+		if (unlikely(!or))
+			continue;

+		ret = osd_req_decode_sense(or, &osi);
 		if (likely(!ret))
 			continue;

-		if (unlikely(ret == -EFAULT)) {
-			EXOFS_DBGMSG("%s: EFAULT Need page clear\n", __func__);
-			/*FIXME: All the pages in this device range should:
-			 *	clear_highpage(page);
-			 */
+		if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
+			/* start read offset passed endof file */
+			_clear_bio(ios->per_dev[i].bio);
+			EXOFS_DBGMSG("start read offset passed end of file "
+				"offset=0x%llx, length=0x%llx\n",
+				_LLU(ios->per_dev[i].offset),
+				_LLU(ios->per_dev[i].length));
+
+			continue; /* we recovered */
 		}

 		if (osi.osd_err_pri >= acumulated_osd_err) {
@@ -205,14 +259,259 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
 	return acumulated_lin_err;
 }

+/*
+ * L - logical offset into the file
+ *
+ * U - The number of bytes in a stripe within a group
+ *
+ *	U = stripe_unit * group_width
+ *
+ * T - The number of bytes striped within a group of component objects
+ *     (before advancing to the next group)
+ *
+ *	T = stripe_unit * group_width * group_depth
+ *
+ * S - The number of bytes striped across all component objects
+ *     before the pattern repeats
+ *
+ *	S = stripe_unit * group_width * group_depth * group_count
+ *
+ * M - The "major" (i.e., across all components) stripe number
+ *
+ *	M = L / S
+ *
+ * G - Counts the groups from the beginning of the major stripe
+ *
+ *	G = (L - (M * S)) / T [or (L % S) / T]
+ *
+ * H - The byte offset within the group
+ *
+ *	H = (L - (M * S)) % T [or (L % S) % T]
+ *
+ * N - The "minor" (i.e., across the group) stripe number
+ *
+ *	N = H / U
+ *
+ * C - The component index coresponding to L
+ *
+ *	C = (H - (N * U)) / stripe_unit + G * group_width
+ *	[or (L % U) / stripe_unit + G * group_width]
+ *
+ * O - The component offset coresponding to L
+ *
+ *	O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
+ */
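To make the striping formulas concrete, here is a small worked example under assumed layout values (stripe_unit = 64 KiB, group_width = 4, group_depth = 2, group_count = 2 -- none of these come from the patch). It evaluates the arithmetic above for file offset L = 1.25 MiB and prints the component index C and component offset O; plain C division stands in for the kernel's div64 helpers.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t su = 65536, gw = 4, gd = 2, gc = 2; /* assumed layout */
	uint64_t L = 1310720;                        /* offset: 1.25 MiB */

	uint64_t U = su * gw;               /* 256 KiB per group-stripe */
	uint64_t T = U * gd;                /* 512 KiB per group */
	uint64_t S = T * gc;                /* 1 MiB before pattern repeats */
	uint64_t M = L / S;                 /* major stripe: 1 */
	uint64_t G = (L % S) / T;           /* group: 0 */
	uint64_t H = (L % S) % T;           /* offset within group: 256 KiB */
	uint64_t N = H / U;                 /* minor stripe: 1 */
	uint64_t C = (H - N * U) / su + G * gw;     /* component: 0 */
	uint64_t O = L % su + N * su + M * gd * su; /* offset: 192 KiB */

	printf("L=0x%llx -> C=%llu O=0x%llx\n",
	       (unsigned long long)L, (unsigned long long)C,
	       (unsigned long long)O);
	return 0;
}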
+struct _striping_info {
+	u64 obj_offset;
+	u64 group_length;
+	u64 total_group_length;
+	u64 Major;
+	unsigned dev;
+	unsigned unit_off;
+};
+
+static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
+			      struct _striping_info *si)
+{
+	u32 stripe_unit = ios->layout->stripe_unit;
+	u32 group_width = ios->layout->group_width;
+	u64 group_depth = ios->layout->group_depth;
+
+	u32 U = stripe_unit * group_width;
+	u64 T = U * group_depth;
+	u64 S = T * ios->layout->group_count;
+	u64 M = div64_u64(file_offset, S);
+
+	/*
+	G = (L - (M * S)) / T
+	H = (L - (M * S)) % T
+	*/
+	u64 LmodS = file_offset - M * S;
+	u32 G = div64_u64(LmodS, T);
+	u64 H = LmodS - G * T;
+
+	u32 N = div_u64(H, U);
+
+	/* "H - (N * U)" is just "H % U" so it's bound to u32 */
+	si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
+	si->dev *= ios->layout->mirrors_p1;
+
+	div_u64_rem(file_offset, stripe_unit, &si->unit_off);
+
+	si->obj_offset = si->unit_off + (N * stripe_unit) +
+				  (M * group_depth * stripe_unit);
+
+	si->group_length = T - H;
+	si->total_group_length = T;
+	si->Major = M;
+}
+
+static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
+		unsigned pgbase, struct exofs_per_dev_state *per_dev,
+		int cur_len)
+{
+	unsigned pg = *cur_pg;
+	struct request_queue *q =
+			osd_request_queue(exofs_ios_od(ios, per_dev->dev));
+
+	per_dev->length += cur_len;
+
+	if (per_dev->bio == NULL) {
+		unsigned pages_in_stripe = ios->layout->group_width *
+					(ios->layout->stripe_unit / PAGE_SIZE);
+		unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
+						ios->layout->group_width;
+
+		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
+		if (unlikely(!per_dev->bio)) {
+			EXOFS_DBGMSG("Faild to allocate BIO size=%u\n",
+				     bio_size);
+			return -ENOMEM;
+		}
+	}
+
+	while (cur_len > 0) {
+		unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
+		unsigned added_len;
+
+		BUG_ON(ios->nr_pages <= pg);
+		cur_len -= pglen;
+
+		added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg],
+					    pglen, pgbase);
+		if (unlikely(pglen != added_len))
+			return -ENOMEM;
+		pgbase = 0;
+		++pg;
+	}
+	BUG_ON(cur_len);
+
+	*cur_pg = pg;
+	return 0;
+}
+
+static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
+			      struct _striping_info *si, unsigned first_comp)
+{
+	unsigned stripe_unit = ios->layout->stripe_unit;
+	unsigned mirrors_p1 = ios->layout->mirrors_p1;
+	unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
+	unsigned dev = si->dev;
+	unsigned first_dev = dev - (dev % devs_in_group);
+	unsigned comp = first_comp + (dev - first_dev);
+	unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
+	unsigned cur_pg = ios->pages_consumed;
+	int ret = 0;
+
+	while (length) {
+		struct exofs_per_dev_state *per_dev = &ios->per_dev[comp];
+		unsigned cur_len, page_off = 0;
+
+		if (!per_dev->length) {
+			per_dev->dev = dev;
+			if (dev < si->dev) {
+				per_dev->offset = si->obj_offset + stripe_unit -
+								   si->unit_off;
+				cur_len = stripe_unit;
+			} else if (dev == si->dev) {
+				per_dev->offset = si->obj_offset;
+				cur_len = stripe_unit - si->unit_off;
+				page_off = si->unit_off & ~PAGE_MASK;
+				BUG_ON(page_off && (page_off != ios->pgbase));
+			} else { /* dev > si->dev */
+				per_dev->offset = si->obj_offset - si->unit_off;
+				cur_len = stripe_unit;
+			}
+
+			if (max_comp < comp)
+				max_comp = comp;
+
+			dev += mirrors_p1;
+			dev = (dev % devs_in_group) + first_dev;
+		} else {
+			cur_len = stripe_unit;
+		}
+		if (cur_len >= length)
+			cur_len = length;
+
+		ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
+				       cur_len);
+		if (unlikely(ret))
+			goto out;
+
+		comp += mirrors_p1;
+		comp = (comp % devs_in_group) + first_comp;
+
+		length -= cur_len;
+	}
+out:
+	ios->numdevs = max_comp + mirrors_p1;
+	ios->pages_consumed = cur_pg;
+	return ret;
+}
+
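The three offset cases in _prepare_one_group() can be traced with a small sketch under assumed values (stripe_unit = 64 KiB, group_width = 4, no mirrors -- not from the patch). An I/O starting at file offset 72 KiB gives si.dev = 1 and si.obj_offset = si.unit_off = 8 KiB; the starting device begins mid-unit, later devices in the same stripe row begin at the top of the row, and earlier devices begin at the next row:

#include <stdio.h>

int main(void)
{
	unsigned su = 65536;	/* assumed stripe_unit */
	unsigned si_dev = 1, obj_offset = 8192, unit_off = 8192;
	unsigned dev;

	for (dev = 0; dev < 4; dev++) {
		unsigned off;

		if (dev < si_dev)	/* next stripe row: 64 KiB */
			off = obj_offset + su - unit_off;
		else if (dev == si_dev)	/* mid-unit start: 8 KiB */
			off = obj_offset;
		else			/* same row, from its top: 0 */
			off = obj_offset - unit_off;
		printf("dev %u starts at object offset 0x%x\n", dev, off);
	}
	return 0;
}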
+static int _prepare_for_striping(struct exofs_io_state *ios)
+{
+	u64 length = ios->length;
+	struct _striping_info si;
+	unsigned devs_in_group = ios->layout->group_width *
+				 ios->layout->mirrors_p1;
+	unsigned first_comp = 0;
+	int ret = 0;
+
+	_calc_stripe_info(ios, ios->offset, &si);
+
+	if (!ios->pages) {
+		if (ios->kern_buff) {
+			struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
+
+			per_dev->offset = si.obj_offset;
+			per_dev->dev = si.dev;
+
+			/* no cross device without page array */
+			BUG_ON((ios->layout->group_width > 1) &&
+			       (si.unit_off + ios->length >
+				ios->layout->stripe_unit));
+		}
+		ios->numdevs = ios->layout->mirrors_p1;
+		return 0;
+	}
+
+	while (length) {
+		if (length < si.group_length)
+			si.group_length = length;
+
+		ret = _prepare_one_group(ios, si.group_length, &si, first_comp);
+		if (unlikely(ret))
+			goto out;
+
+		length -= si.group_length;
+
+		si.group_length = si.total_group_length;
+		si.unit_off = 0;
+		++si.Major;
+		si.obj_offset = si.Major * ios->layout->stripe_unit *
+						ios->layout->group_depth;
+
+		si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group;
+		si.dev %= ios->layout->s_numdevs;
+
+		first_comp += devs_in_group;
+		first_comp %= ios->layout->s_numdevs;
+	}

+out:
+	return ret;
+}
+
 int exofs_sbi_create(struct exofs_io_state *ios)
 {
 	int i, ret;

-	for (i = 0; i < ios->sbi->s_numdevs; i++) {
+	for (i = 0; i < ios->layout->s_numdevs; i++) {
 		struct osd_request *or;

-		or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+		or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -233,10 +532,10 @@ int exofs_sbi_remove(struct exofs_io_state *ios)
 {
 	int i, ret;

-	for (i = 0; i < ios->sbi->s_numdevs; i++) {
+	for (i = 0; i < ios->layout->s_numdevs; i++) {
 		struct osd_request *or;

-		or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+		or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -253,51 +552,74 @@ out:
 	return ret;
 }

-int exofs_sbi_write(struct exofs_io_state *ios)
+static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
 {
-	int i, ret;
+	struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp];
+	unsigned dev = ios->per_dev[cur_comp].dev;
+	unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
+	int ret = 0;

-	for (i = 0; i < ios->sbi->s_numdevs; i++) {
+	if (ios->pages && !master_dev->length)
+		return 0; /* Just an empty slot */
+
+	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
+		struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
 		struct osd_request *or;

-		or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+		or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
 			goto out;
 		}
-		ios->per_dev[i].or = or;
-		ios->numdevs++;
+		per_dev->or = or;
+		per_dev->offset = master_dev->offset;

-		if (ios->bio) {
+		if (ios->pages) {
 			struct bio *bio;

-			if (i != 0) {
+			if (per_dev != master_dev) {
 				bio = bio_kmalloc(GFP_KERNEL,
-						  ios->bio->bi_max_vecs);
+						  master_dev->bio->bi_max_vecs);
 				if (unlikely(!bio)) {
+					EXOFS_DBGMSG(
+					      "Faild to allocate BIO size=%u\n",
+					      master_dev->bio->bi_max_vecs);
 					ret = -ENOMEM;
 					goto out;
 				}

-				__bio_clone(bio, ios->bio);
+				__bio_clone(bio, master_dev->bio);
 				bio->bi_bdev = NULL;
 				bio->bi_next = NULL;
-				ios->per_dev[i].bio = bio;
+				per_dev->length = master_dev->length;
+				per_dev->bio = bio;
+				per_dev->dev = dev;
 			} else {
-				bio = ios->bio;
+				bio = master_dev->bio;
+				/* FIXME: bio_set_dir() */
+				bio->bi_rw |= (1 << BIO_RW);
 			}

-			osd_req_write(or, &ios->obj, ios->offset, bio,
-				      ios->length);
-/*			EXOFS_DBGMSG("write sync=%d\n", sync);*/
+			osd_req_write(or, &ios->obj, per_dev->offset, bio,
+				      per_dev->length);
+			EXOFS_DBGMSG("write(0x%llx) offset=0x%llx "
+				"length=0x%llx dev=%d\n",
+				_LLU(ios->obj.id), _LLU(per_dev->offset),
+				_LLU(per_dev->length), dev);
 		} else if (ios->kern_buff) {
-			osd_req_write_kern(or, &ios->obj, ios->offset,
+			ret = osd_req_write_kern(or, &ios->obj, per_dev->offset,
 					   ios->kern_buff, ios->length);
-/*			EXOFS_DBGMSG("write_kern sync=%d\n", sync);*/
+			if (unlikely(ret))
+				goto out;
+			EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
+				"length=0x%llx dev=%d\n",
+				_LLU(ios->obj.id), _LLU(per_dev->offset),
+				_LLU(ios->length), dev);
 		} else {
 			osd_req_set_attributes(or, &ios->obj);
-/*			EXOFS_DBGMSG("set_attributes sync=%d\n", sync);*/
+			EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
+				_LLU(ios->obj.id), ios->out_attr_len, dev);
 		}

 		if (ios->out_attr)
@@ -308,54 +630,93 @@ int exofs_sbi_write(struct exofs_io_state *ios)
 			osd_req_add_get_attr_list(or, ios->in_attr,
 						  ios->in_attr_len);
 	}
-	ret = exofs_io_execute(ios);

 out:
 	return ret;
 }

-int exofs_sbi_read(struct exofs_io_state *ios)
+int exofs_sbi_write(struct exofs_io_state *ios)
 {
-	int i, ret;
+	int i;
+	int ret;

-	for (i = 0; i < 1; i++) {
-		struct osd_request *or;
-		unsigned first_dev = (unsigned)ios->obj.id;
+	ret = _prepare_for_striping(ios);
+	if (unlikely(ret))
+		return ret;

-		first_dev %= ios->sbi->s_numdevs;
-		or = osd_start_request(ios->sbi->s_ods[first_dev], GFP_KERNEL);
-		if (unlikely(!or)) {
-			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
-			ret = -ENOMEM;
-			goto out;
-		}
-		ios->per_dev[i].or = or;
-		ios->numdevs++;
+	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
+		ret = _sbi_write_mirror(ios, i);
+		if (unlikely(ret))
+			return ret;
+	}

-		if (ios->bio) {
-			osd_req_read(or, &ios->obj, ios->offset, ios->bio,
-				     ios->length);
-/*			EXOFS_DBGMSG("read sync=%d\n", sync);*/
-		} else if (ios->kern_buff) {
-			osd_req_read_kern(or, &ios->obj, ios->offset,
-					  ios->kern_buff, ios->length);
-/*			EXOFS_DBGMSG("read_kern sync=%d\n", sync);*/
-		} else {
-			osd_req_get_attributes(or, &ios->obj);
-/*			EXOFS_DBGMSG("get_attributes sync=%d\n", sync);*/
-		}
+	ret = exofs_io_execute(ios);
+	return ret;
+}

-		if (ios->out_attr)
-			osd_req_add_set_attr_list(or, ios->out_attr,
-						  ios->out_attr_len);
+static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp)
+{
+	struct osd_request *or;
+	struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
+	unsigned first_dev = (unsigned)ios->obj.id;

-		if (ios->in_attr)
-			osd_req_add_get_attr_list(or, ios->in_attr,
-						  ios->in_attr_len);
+	if (ios->pages && !per_dev->length)
+		return 0; /* Just an empty slot */
+
+	first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
+	or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+		return -ENOMEM;
 	}
-	ret = exofs_io_execute(ios);
+	per_dev->or = or;
+
+	if (ios->pages) {
+		osd_req_read(or, &ios->obj, per_dev->offset,
+			     per_dev->bio, per_dev->length);
+		EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
+			     " dev=%d\n", _LLU(ios->obj.id),
+			     _LLU(per_dev->offset), _LLU(per_dev->length),
+			     first_dev);
+	} else if (ios->kern_buff) {
+		int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset,
+					    ios->kern_buff, ios->length);
+		EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
+			      "length=0x%llx dev=%d ret=>%d\n",
+			      _LLU(ios->obj.id), _LLU(per_dev->offset),
+			      _LLU(ios->length), first_dev, ret);
+		if (unlikely(ret))
+			return ret;
+	} else {
+		osd_req_get_attributes(or, &ios->obj);
+		EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
+			      _LLU(ios->obj.id), ios->in_attr_len, first_dev);
+	}
+	if (ios->out_attr)
+		osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);

-out:
+	if (ios->in_attr)
+		osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);
+
+	return 0;
+}
+
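Note how _sbi_read_mirror() picks a replica: first_dev = per_dev->dev + obj.id % mirrors_p1, so consecutive object ids alternate between the mirrors of a component and read load spreads across replicas. A tiny sketch with assumed numbers (2-way mirror, component slot on device 4 -- illustration only):

#include <stdio.h>

int main(void)
{
	unsigned mirrors_p1 = 2, comp_dev = 4;	/* assumed layout */
	unsigned long long obj_id;

	/* even object ids hit the primary, odd ones its mirror */
	for (obj_id = 6; obj_id <= 7; obj_id++)
		printf("obj 0x%llx reads from dev %llu\n", obj_id,
		       comp_dev + obj_id % mirrors_p1);
	return 0;
}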
+int exofs_sbi_read(struct exofs_io_state *ios)
+{
+	int i;
+	int ret;
+
+	ret = _prepare_for_striping(ios);
+	if (unlikely(ret))
+		return ret;
+
+	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
+		ret = _sbi_read_mirror(ios, i);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	ret = exofs_io_execute(ios);
 	return ret;
 }

@@ -380,42 +741,82 @@ int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
 	return -EIO;
 }

+static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp,
+			     struct osd_attr *attr)
+{
+	int last_comp = cur_comp + ios->layout->mirrors_p1;
+
+	for (; cur_comp < last_comp; ++cur_comp) {
+		struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
+		struct osd_request *or;
+
+		or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL);
+		if (unlikely(!or)) {
+			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+			return -ENOMEM;
+		}
+		per_dev->or = or;
+
+		osd_req_set_attributes(or, &ios->obj);
+		osd_req_add_set_attr_list(or, attr, 1);
+	}
+
+	return 0;
+}
+
 int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
 {
 	struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info;
 	struct exofs_io_state *ios;
-	struct osd_attr attr;
-	__be64 newsize;
+	struct exofs_trunc_attr {
+		struct osd_attr attr;
+		__be64 newsize;
+	} *size_attrs;
+	struct _striping_info si;
 	int i, ret;

-	if (exofs_get_io_state(sbi, &ios))
-		return -ENOMEM;
+	ret = exofs_get_io_state(&sbi->layout, &ios);
+	if (unlikely(ret))
+		return ret;
+
+	size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs),
+			     GFP_KERNEL);
+	if (unlikely(!size_attrs)) {
+		ret = -ENOMEM;
+		goto out;
+	}

 	ios->obj.id = exofs_oi_objno(oi);
 	ios->cred = oi->i_cred;

-	newsize = cpu_to_be64(size);
-	attr = g_attr_logical_length;
-	attr.val_ptr = &newsize;
+	ios->numdevs = ios->layout->s_numdevs;
+	_calc_stripe_info(ios, size, &si);

-	for (i = 0; i < sbi->s_numdevs; i++) {
-		struct osd_request *or;
+	for (i = 0; i < ios->layout->group_width; ++i) {
+		struct exofs_trunc_attr *size_attr = &size_attrs[i];
+		u64 obj_size;

-		or = osd_start_request(sbi->s_ods[i], GFP_KERNEL);
-		if (unlikely(!or)) {
-			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
-			ret = -ENOMEM;
-			goto out;
-		}
-		ios->per_dev[i].or = or;
-		ios->numdevs++;
+		if (i < si.dev)
+			obj_size = si.obj_offset +
+					ios->layout->stripe_unit - si.unit_off;
+		else if (i == si.dev)
+			obj_size = si.obj_offset;
+		else /* i > si.dev */
+			obj_size = si.obj_offset - si.unit_off;

-		osd_req_set_attributes(or, &ios->obj);
-		osd_req_add_set_attr_list(or, &attr, 1);
+		size_attr->newsize = cpu_to_be64(obj_size);
+		size_attr->attr = g_attr_logical_length;
+		size_attr->attr.val_ptr = &size_attr->newsize;
+
+		ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
+					&size_attr->attr);
+		if (unlikely(ret))
+			goto out;
 	}
 	ret = exofs_io_execute(ios);

 out:
+	kfree(size_attrs);
 	exofs_put_io_state(ios);
 	return ret;
 }
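The per-component sizes computed in the loop above add back up to the requested file size. Reusing the assumed layout from the earlier sketches (stripe_unit = 64 KiB, group_width = 4, no mirrors -- not from the patch), truncating to 200 KiB yields si.dev = 3 and si.obj_offset = si.unit_off = 8 KiB, so components 0..2 keep a full 64 KiB unit and component 3 keeps 8 KiB:

#include <stdio.h>

int main(void)
{
	unsigned su = 65536;	/* assumed stripe_unit */
	unsigned si_dev = 3, obj_offset = 8192, unit_off = 8192;
	unsigned long long total = 0;
	unsigned i;

	for (i = 0; i < 4; i++) {
		unsigned long long obj_size;

		if (i < si_dev)
			obj_size = obj_offset + su - unit_off;
		else if (i == si_dev)
			obj_size = obj_offset;
		else
			obj_size = obj_offset - unit_off;
		total += obj_size;
		printf("component %u -> 0x%llx bytes\n", i, obj_size);
	}
	printf("sum = %llu (= 200 KiB)\n", total);	/* 204800 */
	return 0;
}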
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index a1d1e77b12eb..6cf5e4e84d61 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -210,7 +210,7 @@ int exofs_sync_fs(struct super_block *sb, int wait)
 	sbi = sb->s_fs_info;
 	fscb = &sbi->s_fscb;

-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (ret)
 		goto out;

@@ -264,12 +264,12 @@ static void _exofs_print_device(const char *msg, const char *dev_path,

 void exofs_free_sbi(struct exofs_sb_info *sbi)
 {
-	while (sbi->s_numdevs) {
-		int i = --sbi->s_numdevs;
-		struct osd_dev *od = sbi->s_ods[i];
+	while (sbi->layout.s_numdevs) {
+		int i = --sbi->layout.s_numdevs;
+		struct osd_dev *od = sbi->layout.s_ods[i];

 		if (od) {
-			sbi->s_ods[i] = NULL;
+			sbi->layout.s_ods[i] = NULL;
 			osduld_put_device(od);
 		}
 	}
@@ -298,7 +298,8 @@ static void exofs_put_super(struct super_block *sb)
 				  msecs_to_jiffies(100));
 	}

-	_exofs_print_device("Unmounting", NULL, sbi->s_ods[0], sbi->s_pid);
+	_exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0],
+			    sbi->layout.s_pid);

 	exofs_free_sbi(sbi);
 	sb->s_fs_info = NULL;
@@ -307,6 +308,8 @@ static void exofs_put_super(struct super_block *sb)
 static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
 				    struct exofs_device_table *dt)
 {
+	u64 stripe_length;
+
 	sbi->data_map.odm_num_comps =
 		le32_to_cpu(dt->dt_data_map.cb_num_comps);
 	sbi->data_map.odm_stripe_unit =
@@ -320,14 +323,63 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
 	sbi->data_map.odm_raid_algorithm =
 		le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);

-/* FIXME: Hard coded mirror only for now. if not so do not mount */
-	if ((sbi->data_map.odm_num_comps != numdevs) ||
-	    (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE) ||
-	    (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) ||
-	    (sbi->data_map.odm_mirror_cnt != (numdevs - 1)))
+/* FIXME: Only raid0 for now. if not so, do not mount */
+	if (sbi->data_map.odm_num_comps != numdevs) {
+		EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
+			  sbi->data_map.odm_num_comps, numdevs);
 		return -EINVAL;
-	else
-		return 0;
+	}
+	if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) {
+		EXOFS_ERR("Only RAID_0 for now\n");
+		return -EINVAL;
+	}
+	if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) {
+		EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n",
+			  numdevs, sbi->data_map.odm_mirror_cnt);
+		return -EINVAL;
+	}
+
+	if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
+		EXOFS_ERR("Stripe Unit(0x%llx)"
+			  " must be Multples of PAGE_SIZE(0x%lx)\n",
+			  _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE);
+		return -EINVAL;
+	}
+
+	sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
+	sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
+
+	if (sbi->data_map.odm_group_width) {
+		sbi->layout.group_width = sbi->data_map.odm_group_width;
+		sbi->layout.group_depth = sbi->data_map.odm_group_depth;
+		if (!sbi->layout.group_depth) {
+			EXOFS_ERR("group_depth == 0 && group_width != 0\n");
+			return -EINVAL;
+		}
+		sbi->layout.group_count = sbi->data_map.odm_num_comps /
+						sbi->layout.mirrors_p1 /
+						sbi->data_map.odm_group_width;
+	} else {
+		if (sbi->data_map.odm_group_depth) {
+			printk(KERN_NOTICE "Warning: group_depth ignored "
+				"group_width == 0 && group_depth == %d\n",
+				sbi->data_map.odm_group_depth);
+			sbi->data_map.odm_group_depth = 0;
+		}
+		sbi->layout.group_width = sbi->data_map.odm_num_comps /
+					       sbi->layout.mirrors_p1;
+		sbi->layout.group_depth = -1;
+		sbi->layout.group_count = 1;
+	}
+
+	stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit;
+	if (stripe_length >= (1ULL << 32)) {
+		EXOFS_ERR("Total Stripe length(0x%llx)"
+			  " >= 32bit is not supported\n", _LLU(stripe_length));
+		return -EINVAL;
+	}
+
+	return 0;
 }

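The two branches above derive the in-memory layout from the on-disk data map. A worked example with assumed table values (not from the patch): 12 components with odm_mirror_cnt = 1 give mirrors_p1 = 2; with odm_group_width = 3 that is two groups of three mirror pairs, while odm_group_width = 0 collapses to a single group of six mirror pairs with unbounded depth.

#include <stdio.h>

int main(void)
{
	/* assumed device-table values */
	unsigned num_comps = 12, mirror_cnt = 1, group_width = 3;
	unsigned mirrors_p1 = mirror_cnt + 1;

	if (group_width) {
		unsigned group_count = num_comps / mirrors_p1 / group_width;

		printf("%u groups of %u mirror-pairs\n",
		       group_count, group_width);	/* 2 groups of 3 */
	} else {
		printf("one group, width %u, depth unbounded\n",
		       num_comps / mirrors_p1);		/* width 6 */
	}
	return 0;
}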
 /* @odi is valid only as long as @fscb_dev is valid */
@@ -361,7 +413,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 {
 	struct exofs_sb_info *sbi = *psbi;
 	struct osd_dev *fscb_od;
-	struct osd_obj_id obj = {.partition = sbi->s_pid,
+	struct osd_obj_id obj = {.partition = sbi->layout.s_pid,
 				 .id = EXOFS_DEVTABLE_ID};
 	struct exofs_device_table *dt;
 	unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
@@ -376,9 +428,9 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 		return -ENOMEM;
 	}

-	fscb_od = sbi->s_ods[0];
-	sbi->s_ods[0] = NULL;
-	sbi->s_numdevs = 0;
+	fscb_od = sbi->layout.s_ods[0];
+	sbi->layout.s_ods[0] = NULL;
+	sbi->layout.s_numdevs = 0;
 	ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes);
 	if (unlikely(ret)) {
 		EXOFS_ERR("ERROR: reading device table\n");
@@ -397,14 +449,15 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 		goto out;

 	if (likely(numdevs > 1)) {
-		unsigned size = numdevs * sizeof(sbi->s_ods[0]);
+		unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]);

 		sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL);
 		if (unlikely(!sbi)) {
 			ret = -ENOMEM;
 			goto out;
 		}
-		memset(&sbi->s_ods[1], 0, size - sizeof(sbi->s_ods[0]));
+		memset(&sbi->layout.s_ods[1], 0,
+		       size - sizeof(sbi->layout.s_ods[0]));
 		*psbi = sbi;
 	}

@@ -427,8 +480,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 		 * line. We always keep them in device-table order.
 		 */
 		if (fscb_od && osduld_device_same(fscb_od, &odi)) {
-			sbi->s_ods[i] = fscb_od;
-			++sbi->s_numdevs;
+			sbi->layout.s_ods[i] = fscb_od;
+			++sbi->layout.s_numdevs;
 			fscb_od = NULL;
 			continue;
 		}
@@ -441,8 +494,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 			goto out;
 		}

-		sbi->s_ods[i] = od;
-		++sbi->s_numdevs;
+		sbi->layout.s_ods[i] = od;
+		++sbi->layout.s_numdevs;

 		/* Read the fscb of the other devices to make sure the FS
 		 * partition is there.
@@ -499,9 +552,15 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 	}

-	sbi->s_ods[0] = od;
-	sbi->s_numdevs = 1;
-	sbi->s_pid = opts->pid;
+	/* Default layout in case we do not have a device-table */
+	sbi->layout.stripe_unit = PAGE_SIZE;
+	sbi->layout.mirrors_p1 = 1;
+	sbi->layout.group_width = 1;
+	sbi->layout.group_depth = -1;
+	sbi->layout.group_count = 1;
+	sbi->layout.s_ods[0] = od;
+	sbi->layout.s_numdevs = 1;
+	sbi->layout.s_pid = opts->pid;
 	sbi->s_timeout = opts->timeout;

 	/* fill in some other data by hand */
@@ -514,7 +573,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_bdev = NULL;
 	sb->s_dev = 0;

-	obj.partition = sbi->s_pid;
+	obj.partition = sbi->layout.s_pid;
 	obj.id = EXOFS_SUPER_ID;
 	exofs_make_credential(sbi->s_cred, &obj);

@@ -578,13 +637,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 	}

-	_exofs_print_device("Mounting", opts->dev_name, sbi->s_ods[0],
-			    sbi->s_pid);
+	_exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0],
+			    sbi->layout.s_pid);
 	return 0;

 free_sbi:
 	EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
-		  opts->dev_name, sbi->s_pid, ret);
+		  opts->dev_name, sbi->layout.s_pid, ret);
 	exofs_free_sbi(sbi);
 	return ret;
 }
@@ -627,7 +686,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	uint8_t cred_a[OSD_CAP_LEN];
 	int ret;

-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (ret) {
 		EXOFS_DBGMSG("exofs_get_io_state failed.\n");
 		return ret;
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 7f8d2e5a7ea6..1d081f0cfec2 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -570,7 +570,7 @@ do_more:
 error_return:
 	brelse(bitmap_bh);
 	release_blocks(sb, freed);
-	vfs_dq_free_block(inode, freed);
+	dquot_free_block(inode, freed);
 }

 /**
@@ -1236,6 +1236,7 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
 	unsigned short windowsz = 0;
 	unsigned long ngroups;
 	unsigned long num = *count;
+	int ret;

 	*errp = -ENOSPC;
 	sb = inode->i_sb;
@@ -1247,8 +1248,9 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
 	/*
 	 * Check quota for allocation of this block.
 	 */
-	if (vfs_dq_alloc_block(inode, num)) {
-		*errp = -EDQUOT;
+	ret = dquot_alloc_block(inode, num);
+	if (ret) {
+		*errp = ret;
 		return 0;
 	}

@@ -1409,7 +1411,7 @@ allocated:

 	*errp = 0;
 	brelse(bitmap_bh);
-	vfs_dq_free_block(inode, *count-num);
+	dquot_free_block(inode, *count-num);
 	*count = num;
 	return ret_block;

@@ -1420,7 +1422,7 @@ out:
 	 * Undo the block allocation
 	 */
 	if (!performed_allocation)
-		vfs_dq_free_block(inode, *count);
+		dquot_free_block(inode, *count);
 	brelse(bitmap_bh);
 	return 0;
 }
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 061914add3cf..0b038e47ad2f 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -118,7 +118,7 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned);

 /* inode.c */
 extern struct inode *ext2_iget (struct super_block *, unsigned long);
-extern int ext2_write_inode (struct inode *, int);
+extern int ext2_write_inode (struct inode *, struct writeback_control *);
 extern void ext2_delete_inode (struct inode *);
 extern int ext2_sync_inode (struct inode *);
 extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 586e3589d4c2..5d198d0697fb 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -20,6 +20,7 @@

 #include <linux/time.h>
 #include <linux/pagemap.h>
+#include <linux/quotaops.h>
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
@@ -70,7 +71,7 @@ const struct file_operations ext2_file_operations = {
 	.compat_ioctl	= ext2_compat_ioctl,
 #endif
 	.mmap		= generic_file_mmap,
-	.open		= generic_file_open,
+	.open		= dquot_file_open,
 	.release	= ext2_release_file,
 	.fsync		= ext2_fsync,
 	.splice_read	= generic_file_splice_read,
@@ -87,7 +88,7 @@ const struct file_operations ext2_xip_file_operations = {
 	.compat_ioctl	= ext2_compat_ioctl,
 #endif
 	.mmap		= xip_file_mmap,
-	.open		= generic_file_open,
+	.open		= dquot_file_open,
 	.release	= ext2_release_file,
 	.fsync		= ext2_fsync,
 };
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 15387c9c17d8..ad7d572ee8dc 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -121,8 +121,8 @@ void ext2_free_inode (struct inode * inode)
 	if (!is_bad_inode(inode)) {
 		/* Quota is already initialized in iput() */
 		ext2_xattr_delete_inode(inode);
-		vfs_dq_free_inode(inode);
-		vfs_dq_drop(inode);
+		dquot_free_inode(inode);
+		dquot_drop(inode);
 	}

 	es = EXT2_SB(sb)->s_es;
@@ -586,10 +586,10 @@ got:
 		goto fail_drop;
 	}

-	if (vfs_dq_alloc_inode(inode)) {
-		err = -EDQUOT;
+	dquot_initialize(inode);
+	err = dquot_alloc_inode(inode);
+	if (err)
 		goto fail_drop;
-	}

 	err = ext2_init_acl(inode, dir);
 	if (err)
@@ -605,10 +605,10 @@ got:
 	return inode;

 fail_free_drop:
-	vfs_dq_free_inode(inode);
+	dquot_free_inode(inode);

 fail_drop:
-	vfs_dq_drop(inode);
+	dquot_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
 	unlock_new_inode(inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 71b032c65a02..fc13cc119aad 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -41,6 +41,8 @@ MODULE_AUTHOR("Remy Card and others");
 MODULE_DESCRIPTION("Second Extended Filesystem");
 MODULE_LICENSE("GPL");

+static int __ext2_write_inode(struct inode *inode, int do_sync);
+
 /*
 * Test whether an inode is a fast symlink.
 */
@@ -58,13 +60,15 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode)
 */
 void ext2_delete_inode (struct inode * inode)
 {
+	if (!is_bad_inode(inode))
+		dquot_initialize(inode);
 	truncate_inode_pages(&inode->i_data, 0);

 	if (is_bad_inode(inode))
 		goto no_delete;
 	EXT2_I(inode)->i_dtime	= get_seconds();
 	mark_inode_dirty(inode);
-	ext2_write_inode(inode, inode_needs_sync(inode));
+	__ext2_write_inode(inode, inode_needs_sync(inode));

 	inode->i_size = 0;
 	if (inode->i_blocks)
@@ -1335,7 +1339,7 @@ bad_inode:
 	return ERR_PTR(ret);
 }

-int ext2_write_inode(struct inode *inode, int do_sync)
+static int __ext2_write_inode(struct inode *inode, int do_sync)
 {
 	struct ext2_inode_info *ei = EXT2_I(inode);
 	struct super_block *sb = inode->i_sb;
@@ -1440,6 +1444,11 @@ int ext2_write_inode(struct inode *inode, int do_sync)
 	return err;
 }

+int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+}
+
 int ext2_sync_inode(struct inode *inode)
 {
 	struct writeback_control wbc = {
@@ -1457,9 +1466,12 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 	error = inode_change_ok(inode, iattr);
 	if (error)
 		return error;
+
+	if (iattr->ia_valid & ATTR_SIZE)
+		dquot_initialize(inode);
 	if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
 	    (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
-		error = vfs_dq_transfer(inode, iattr) ? -EDQUOT : 0;
+		error = dquot_transfer(inode, iattr);
 		if (error)
 			return error;
 	}
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index dd7175ce5606..71efb0e9a3f2 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <linux/pagemap.h> 33#include <linux/pagemap.h>
34#include <linux/quotaops.h>
34#include "ext2.h" 35#include "ext2.h"
35#include "xattr.h" 36#include "xattr.h"
36#include "acl.h" 37#include "acl.h"
@@ -99,24 +100,27 @@ struct dentry *ext2_get_parent(struct dentry *child)
99 */ 100 */
100static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) 101static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd)
101{ 102{
102 struct inode * inode = ext2_new_inode (dir, mode); 103 struct inode *inode;
103 int err = PTR_ERR(inode); 104
104 if (!IS_ERR(inode)) { 105 dquot_initialize(dir);
105 inode->i_op = &ext2_file_inode_operations; 106
106 if (ext2_use_xip(inode->i_sb)) { 107 inode = ext2_new_inode(dir, mode);
107 inode->i_mapping->a_ops = &ext2_aops_xip; 108 if (IS_ERR(inode))
108 inode->i_fop = &ext2_xip_file_operations; 109 return PTR_ERR(inode);
109 } else if (test_opt(inode->i_sb, NOBH)) { 110
110 inode->i_mapping->a_ops = &ext2_nobh_aops; 111 inode->i_op = &ext2_file_inode_operations;
111 inode->i_fop = &ext2_file_operations; 112 if (ext2_use_xip(inode->i_sb)) {
112 } else { 113 inode->i_mapping->a_ops = &ext2_aops_xip;
113 inode->i_mapping->a_ops = &ext2_aops; 114 inode->i_fop = &ext2_xip_file_operations;
114 inode->i_fop = &ext2_file_operations; 115 } else if (test_opt(inode->i_sb, NOBH)) {
115 } 116 inode->i_mapping->a_ops = &ext2_nobh_aops;
116 mark_inode_dirty(inode); 117 inode->i_fop = &ext2_file_operations;
117 err = ext2_add_nondir(dentry, inode); 118 } else {
119 inode->i_mapping->a_ops = &ext2_aops;
120 inode->i_fop = &ext2_file_operations;
118 } 121 }
119 return err; 122 mark_inode_dirty(inode);
123 return ext2_add_nondir(dentry, inode);
120} 124}
121 125
122static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) 126static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev)
@@ -127,6 +131,8 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_
127 if (!new_valid_dev(rdev)) 131 if (!new_valid_dev(rdev))
128 return -EINVAL; 132 return -EINVAL;
129 133
134 dquot_initialize(dir);
135
130 inode = ext2_new_inode (dir, mode); 136 inode = ext2_new_inode (dir, mode);
131 err = PTR_ERR(inode); 137 err = PTR_ERR(inode);
132 if (!IS_ERR(inode)) { 138 if (!IS_ERR(inode)) {
@@ -151,6 +157,8 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
151 if (l > sb->s_blocksize) 157 if (l > sb->s_blocksize)
152 goto out; 158 goto out;
153 159
160 dquot_initialize(dir);
161
154 inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO); 162 inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO);
155 err = PTR_ERR(inode); 163 err = PTR_ERR(inode);
156 if (IS_ERR(inode)) 164 if (IS_ERR(inode))
@@ -194,6 +202,8 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
194 if (inode->i_nlink >= EXT2_LINK_MAX) 202 if (inode->i_nlink >= EXT2_LINK_MAX)
195 return -EMLINK; 203 return -EMLINK;
196 204
205 dquot_initialize(dir);
206
197 inode->i_ctime = CURRENT_TIME_SEC; 207 inode->i_ctime = CURRENT_TIME_SEC;
198 inode_inc_link_count(inode); 208 inode_inc_link_count(inode);
199 atomic_inc(&inode->i_count); 209 atomic_inc(&inode->i_count);
@@ -216,6 +226,8 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
216 if (dir->i_nlink >= EXT2_LINK_MAX) 226 if (dir->i_nlink >= EXT2_LINK_MAX)
217 goto out; 227 goto out;
218 228
229 dquot_initialize(dir);
230
219 inode_inc_link_count(dir); 231 inode_inc_link_count(dir);
220 232
221 inode = ext2_new_inode (dir, S_IFDIR | mode); 233 inode = ext2_new_inode (dir, S_IFDIR | mode);
@@ -262,6 +274,8 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry)
262 struct page * page; 274 struct page * page;
263 int err = -ENOENT; 275 int err = -ENOENT;
264 276
277 dquot_initialize(dir);
278
265 de = ext2_find_entry (dir, &dentry->d_name, &page); 279 de = ext2_find_entry (dir, &dentry->d_name, &page);
266 if (!de) 280 if (!de)
267 goto out; 281 goto out;
@@ -304,6 +318,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
304 struct ext2_dir_entry_2 * old_de; 318 struct ext2_dir_entry_2 * old_de;
305 int err = -ENOENT; 319 int err = -ENOENT;
306 320
321 dquot_initialize(old_dir);
322 dquot_initialize(new_dir);
323
307 old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page); 324 old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page);
308 if (!old_de) 325 if (!old_de)
309 goto out; 326 goto out;
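
Each directory-modifying entry point above (create, mknod, symlink, link, mkdir, unlink; rename initializes both directories) now begins with an explicit dquot_initialize() call, since quota initialization moved out of the generic VFS path and into the filesystems. A condensed sketch of the shape these ext2 operations take after the patch, with the directory-entry handling elided:

	static int ext2_unlink(struct inode *dir, struct dentry *dentry)
	{
		struct ext2_dir_entry_2 *de;
		struct page *page;
		int err = -ENOENT;

		/* load dir's dquots before the operation charges them */
		dquot_initialize(dir);

		de = ext2_find_entry(dir, &dentry->d_name, &page);
		if (!de)
			goto out;
		/* ... delete the entry and drop the link count ... */
	out:
		return err;
	}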
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f9cb54a585ce..42e4a303b675 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -194,6 +194,8 @@ static void destroy_inodecache(void)
194static void ext2_clear_inode(struct inode *inode) 194static void ext2_clear_inode(struct inode *inode)
195{ 195{
196 struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info; 196 struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info;
197
198 dquot_drop(inode);
197 ext2_discard_reservation(inode); 199 ext2_discard_reservation(inode);
198 EXT2_I(inode)->i_block_alloc_info = NULL; 200 EXT2_I(inode)->i_block_alloc_info = NULL;
199 if (unlikely(rsv)) 201 if (unlikely(rsv))
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 904f00642f84..e44dc92609be 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -644,8 +644,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
644 the inode. */ 644 the inode. */
645 ea_bdebug(new_bh, "reusing block"); 645 ea_bdebug(new_bh, "reusing block");
646 646
647 error = -EDQUOT; 647 error = dquot_alloc_block(inode, 1);
648 if (vfs_dq_alloc_block(inode, 1)) { 648 if (error) {
649 unlock_buffer(new_bh); 649 unlock_buffer(new_bh);
650 goto cleanup; 650 goto cleanup;
651 } 651 }
@@ -702,7 +702,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
702 * as if nothing happened and clean up the unused block */ 702 * as if nothing happened and clean up the unused block */
703 if (error && error != -ENOSPC) { 703 if (error && error != -ENOSPC) {
704 if (new_bh && new_bh != old_bh) 704 if (new_bh && new_bh != old_bh)
705 vfs_dq_free_block(inode, 1); 705 dquot_free_block(inode, 1);
706 goto cleanup; 706 goto cleanup;
707 } 707 }
708 } else 708 } else
@@ -734,7 +734,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
734 le32_add_cpu(&HDR(old_bh)->h_refcount, -1); 734 le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
735 if (ce) 735 if (ce)
736 mb_cache_entry_release(ce); 736 mb_cache_entry_release(ce);
737 vfs_dq_free_block(inode, 1); 737 dquot_free_block(inode, 1);
738 mark_buffer_dirty(old_bh); 738 mark_buffer_dirty(old_bh);
739 ea_bdebug(old_bh, "refcount now=%d", 739 ea_bdebug(old_bh, "refcount now=%d",
740 le32_to_cpu(HDR(old_bh)->h_refcount)); 740 le32_to_cpu(HDR(old_bh)->h_refcount));
@@ -797,7 +797,7 @@ ext2_xattr_delete_inode(struct inode *inode)
797 mark_buffer_dirty(bh); 797 mark_buffer_dirty(bh);
798 if (IS_SYNC(inode)) 798 if (IS_SYNC(inode))
799 sync_dirty_buffer(bh); 799 sync_dirty_buffer(bh);
800 vfs_dq_free_block(inode, 1); 800 dquot_free_block(inode, 1);
801 } 801 }
802 EXT2_I(inode)->i_file_acl = 0; 802 EXT2_I(inode)->i_file_acl = 0;
803 803
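
The xattr hunks also show the unwind side of the errno-returning API: a successful dquot_alloc_block() charge has to be paired with dquot_free_block() on any later failure. In outline (do_block_update() is a hypothetical stand-in for the buffer work):

	error = dquot_alloc_block(inode, 1);	/* charge one block to quota */
	if (error)
		goto cleanup;			/* -EDQUOT or -EIO, passed through */

	error = do_block_update(inode);		/* hypothetical: write the xattr block */
	if (error) {
		dquot_free_block(inode, 1);	/* undo the charge */
		goto cleanup;
	}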
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 27967f92e820..161da2d3f890 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -676,7 +676,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
676 } 676 }
677 ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); 677 ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
678 if (dquot_freed_blocks) 678 if (dquot_freed_blocks)
679 vfs_dq_free_block(inode, dquot_freed_blocks); 679 dquot_free_block(inode, dquot_freed_blocks);
680 return; 680 return;
681} 681}
682 682
@@ -1502,8 +1502,9 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
1502 /* 1502 /*
1503 * Check quota for allocation of this block. 1503 * Check quota for allocation of this block.
1504 */ 1504 */
1505 if (vfs_dq_alloc_block(inode, num)) { 1505 err = dquot_alloc_block(inode, num);
1506 *errp = -EDQUOT; 1506 if (err) {
1507 *errp = err;
1507 return 0; 1508 return 0;
1508 } 1509 }
1509 1510
@@ -1713,7 +1714,7 @@ allocated:
1713 1714
1714 *errp = 0; 1715 *errp = 0;
1715 brelse(bitmap_bh); 1716 brelse(bitmap_bh);
1716 vfs_dq_free_block(inode, *count-num); 1717 dquot_free_block(inode, *count-num);
1717 *count = num; 1718 *count = num;
1718 return ret_block; 1719 return ret_block;
1719 1720
@@ -1728,7 +1729,7 @@ out:
1728 * Undo the block allocation 1729 * Undo the block allocation
1729 */ 1730 */
1730 if (!performed_allocation) 1731 if (!performed_allocation)
1731 vfs_dq_free_block(inode, *count); 1732 dquot_free_block(inode, *count);
1732 brelse(bitmap_bh); 1733 brelse(bitmap_bh);
1733 return 0; 1734 return 0;
1734} 1735}
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 388bbdfa0b4e..f55df0e61cbd 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -21,6 +21,7 @@
21#include <linux/time.h> 21#include <linux/time.h>
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/jbd.h> 23#include <linux/jbd.h>
24#include <linux/quotaops.h>
24#include <linux/ext3_fs.h> 25#include <linux/ext3_fs.h>
25#include <linux/ext3_jbd.h> 26#include <linux/ext3_jbd.h>
26#include "xattr.h" 27#include "xattr.h"
@@ -33,9 +34,9 @@
33 */ 34 */
34static int ext3_release_file (struct inode * inode, struct file * filp) 35static int ext3_release_file (struct inode * inode, struct file * filp)
35{ 36{
36 if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) { 37 if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) {
37 filemap_flush(inode->i_mapping); 38 filemap_flush(inode->i_mapping);
38 EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE; 39 ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
39 } 40 }
40 /* if we are the last writer on the inode, drop the block reservation */ 41 /* if we are the last writer on the inode, drop the block reservation */
41 if ((filp->f_mode & FMODE_WRITE) && 42 if ((filp->f_mode & FMODE_WRITE) &&
@@ -62,7 +63,7 @@ const struct file_operations ext3_file_operations = {
62 .compat_ioctl = ext3_compat_ioctl, 63 .compat_ioctl = ext3_compat_ioctl,
63#endif 64#endif
64 .mmap = generic_file_mmap, 65 .mmap = generic_file_mmap,
65 .open = generic_file_open, 66 .open = dquot_file_open,
66 .release = ext3_release_file, 67 .release = ext3_release_file,
67 .fsync = ext3_sync_file, 68 .fsync = ext3_sync_file,
68 .splice_read = generic_file_splice_read, 69 .splice_read = generic_file_splice_read,
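
This file also introduces the EXT3_STATE_* accessor conversion that recurs through the ialloc, inode and xattr hunks below: open-coded `ei->i_state & FLAG` tests become helper calls so the flags can live in a separate i_state_flags word and be manipulated with atomic bitops. The helpers themselves are defined outside this diff; presumably they look roughly like:

	static inline int ext3_test_inode_state(struct inode *inode, int bit)
	{
		return test_bit(bit, &EXT3_I(inode)->i_state_flags);
	}

	static inline void ext3_set_inode_state(struct inode *inode, int bit)
	{
		set_bit(bit, &EXT3_I(inode)->i_state_flags);
	}

	static inline void ext3_clear_inode_state(struct inode *inode, int bit)
	{
		clear_bit(bit, &EXT3_I(inode)->i_state_flags);
	}

Using set_bit()/clear_bit() rather than |=/&= means concurrent updates to different flags cannot lose each other's stores.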
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index b39991285136..0d0e97ed3ff6 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -123,10 +123,10 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
123 * Note: we must free any quota before locking the superblock, 123 * Note: we must free any quota before locking the superblock,
124 * as writing the quota to disk may need the lock as well. 124 * as writing the quota to disk may need the lock as well.
125 */ 125 */
126 vfs_dq_init(inode); 126 dquot_initialize(inode);
127 ext3_xattr_delete_inode(handle, inode); 127 ext3_xattr_delete_inode(handle, inode);
128 vfs_dq_free_inode(inode); 128 dquot_free_inode(inode);
129 vfs_dq_drop(inode); 129 dquot_drop(inode);
130 130
131 is_directory = S_ISDIR(inode->i_mode); 131 is_directory = S_ISDIR(inode->i_mode);
132 132
@@ -582,16 +582,18 @@ got:
582 inode->i_generation = sbi->s_next_generation++; 582 inode->i_generation = sbi->s_next_generation++;
583 spin_unlock(&sbi->s_next_gen_lock); 583 spin_unlock(&sbi->s_next_gen_lock);
584 584
585 ei->i_state = EXT3_STATE_NEW; 585 ei->i_state_flags = 0;
586 ext3_set_inode_state(inode, EXT3_STATE_NEW);
587
586 ei->i_extra_isize = 588 ei->i_extra_isize =
587 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? 589 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
588 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; 590 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
589 591
590 ret = inode; 592 ret = inode;
591 if (vfs_dq_alloc_inode(inode)) { 593 dquot_initialize(inode);
592 err = -EDQUOT; 594 err = dquot_alloc_inode(inode);
595 if (err)
593 goto fail_drop; 596 goto fail_drop;
594 }
595 597
596 err = ext3_init_acl(handle, inode, dir); 598 err = ext3_init_acl(handle, inode, dir);
597 if (err) 599 if (err)
@@ -619,10 +621,10 @@ really_out:
619 return ret; 621 return ret;
620 622
621fail_free_drop: 623fail_free_drop:
622 vfs_dq_free_inode(inode); 624 dquot_free_inode(inode);
623 625
624fail_drop: 626fail_drop:
625 vfs_dq_drop(inode); 627 dquot_drop(inode);
626 inode->i_flags |= S_NOQUOTA; 628 inode->i_flags |= S_NOQUOTA;
627 inode->i_nlink = 0; 629 inode->i_nlink = 0;
628 unlock_new_inode(inode); 630 unlock_new_inode(inode);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 455e6e6e5cb9..ea33bdf0a300 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -196,6 +196,9 @@ void ext3_delete_inode (struct inode * inode)
196{ 196{
197 handle_t *handle; 197 handle_t *handle;
198 198
199 if (!is_bad_inode(inode))
200 dquot_initialize(inode);
201
199 truncate_inode_pages(&inode->i_data, 0); 202 truncate_inode_pages(&inode->i_data, 0);
200 203
201 if (is_bad_inode(inode)) 204 if (is_bad_inode(inode))
@@ -1378,7 +1381,7 @@ static int ext3_journalled_write_end(struct file *file,
1378 */ 1381 */
1379 if (pos + len > inode->i_size && ext3_can_truncate(inode)) 1382 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1380 ext3_orphan_add(handle, inode); 1383 ext3_orphan_add(handle, inode);
1381 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; 1384 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1382 if (inode->i_size > EXT3_I(inode)->i_disksize) { 1385 if (inode->i_size > EXT3_I(inode)->i_disksize) {
1383 EXT3_I(inode)->i_disksize = inode->i_size; 1386 EXT3_I(inode)->i_disksize = inode->i_size;
1384 ret2 = ext3_mark_inode_dirty(handle, inode); 1387 ret2 = ext3_mark_inode_dirty(handle, inode);
@@ -1417,7 +1420,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1417 journal_t *journal; 1420 journal_t *journal;
1418 int err; 1421 int err;
1419 1422
1420 if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) { 1423 if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) {
1421 /* 1424 /*
1422 * This is a REALLY heavyweight approach, but the use of 1425 * This is a REALLY heavyweight approach, but the use of
1423 * bmap on dirty files is expected to be extremely rare: 1426 * bmap on dirty files is expected to be extremely rare:
@@ -1436,7 +1439,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1436 * everything they get. 1439 * everything they get.
1437 */ 1440 */
1438 1441
1439 EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA; 1442 ext3_clear_inode_state(inode, EXT3_STATE_JDATA);
1440 journal = EXT3_JOURNAL(inode); 1443 journal = EXT3_JOURNAL(inode);
1441 journal_lock_updates(journal); 1444 journal_lock_updates(journal);
1442 err = journal_flush(journal); 1445 err = journal_flush(journal);
@@ -1528,6 +1531,7 @@ static int ext3_ordered_writepage(struct page *page,
1528 int err; 1531 int err;
1529 1532
1530 J_ASSERT(PageLocked(page)); 1533 J_ASSERT(PageLocked(page));
1534 WARN_ON_ONCE(IS_RDONLY(inode));
1531 1535
1532 /* 1536 /*
1533 * We give up here if we're reentered, because it might be for a 1537 * We give up here if we're reentered, because it might be for a
@@ -1600,6 +1604,9 @@ static int ext3_writeback_writepage(struct page *page,
1600 int ret = 0; 1604 int ret = 0;
1601 int err; 1605 int err;
1602 1606
1607 J_ASSERT(PageLocked(page));
1608 WARN_ON_ONCE(IS_RDONLY(inode));
1609
1603 if (ext3_journal_current_handle()) 1610 if (ext3_journal_current_handle())
1604 goto out_fail; 1611 goto out_fail;
1605 1612
@@ -1642,6 +1649,9 @@ static int ext3_journalled_writepage(struct page *page,
1642 int ret = 0; 1649 int ret = 0;
1643 int err; 1650 int err;
1644 1651
1652 J_ASSERT(PageLocked(page));
1653 WARN_ON_ONCE(IS_RDONLY(inode));
1654
1645 if (ext3_journal_current_handle()) 1655 if (ext3_journal_current_handle())
1646 goto no_write; 1656 goto no_write;
1647 1657
@@ -1670,7 +1680,7 @@ static int ext3_journalled_writepage(struct page *page,
1670 PAGE_CACHE_SIZE, NULL, write_end_fn); 1680 PAGE_CACHE_SIZE, NULL, write_end_fn);
1671 if (ret == 0) 1681 if (ret == 0)
1672 ret = err; 1682 ret = err;
1673 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; 1683 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1674 unlock_page(page); 1684 unlock_page(page);
1675 } else { 1685 } else {
1676 /* 1686 /*
@@ -1785,8 +1795,9 @@ retry:
1785 handle = ext3_journal_start(inode, 2); 1795 handle = ext3_journal_start(inode, 2);
1786 if (IS_ERR(handle)) { 1796 if (IS_ERR(handle)) {
1787 /* This is really bad luck. We've written the data 1797 /* This is really bad luck. We've written the data
1788 * but cannot extend i_size. Bail out and pretend 1798 * but cannot extend i_size. Truncate allocated blocks
1789 * the write failed... */ 1799 * and pretend the write failed... */
1800 ext3_truncate(inode);
1790 ret = PTR_ERR(handle); 1801 ret = PTR_ERR(handle);
1791 goto out; 1802 goto out;
1792 } 1803 }
@@ -2402,7 +2413,7 @@ void ext3_truncate(struct inode *inode)
2402 goto out_notrans; 2413 goto out_notrans;
2403 2414
2404 if (inode->i_size == 0 && ext3_should_writeback_data(inode)) 2415 if (inode->i_size == 0 && ext3_should_writeback_data(inode))
2405 ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE; 2416 ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
2406 2417
2407 /* 2418 /*
2408 * We have to lock the EOF page here, because lock_page() nests 2419 * We have to lock the EOF page here, because lock_page() nests
@@ -2721,7 +2732,7 @@ int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
2721{ 2732{
2722 /* We have all inode data except xattrs in memory here. */ 2733 /* We have all inode data except xattrs in memory here. */
2723 return __ext3_get_inode_loc(inode, iloc, 2734 return __ext3_get_inode_loc(inode, iloc,
2724 !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)); 2735 !ext3_test_inode_state(inode, EXT3_STATE_XATTR));
2725} 2736}
2726 2737
2727void ext3_set_inode_flags(struct inode *inode) 2738void ext3_set_inode_flags(struct inode *inode)
@@ -2800,7 +2811,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2800 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); 2811 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
2801 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; 2812 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2802 2813
2803 ei->i_state = 0; 2814 ei->i_state_flags = 0;
2804 ei->i_dir_start_lookup = 0; 2815 ei->i_dir_start_lookup = 0;
2805 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 2816 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2806 /* We now have enough fields to check if the inode was active or not. 2817 /* We now have enough fields to check if the inode was active or not.
@@ -2893,7 +2904,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2893 EXT3_GOOD_OLD_INODE_SIZE + 2904 EXT3_GOOD_OLD_INODE_SIZE +
2894 ei->i_extra_isize; 2905 ei->i_extra_isize;
2895 if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) 2906 if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
2896 ei->i_state |= EXT3_STATE_XATTR; 2907 ext3_set_inode_state(inode, EXT3_STATE_XATTR);
2897 } 2908 }
2898 } else 2909 } else
2899 ei->i_extra_isize = 0; 2910 ei->i_extra_isize = 0;
@@ -2955,7 +2966,7 @@ again:
2955 2966
2956 /* For fields not tracked in the in-memory inode, 2967 /* For fields not tracked in the in-memory inode,
2957 * initialise them to zero for new inodes. */ 2968 * initialise them to zero for new inodes. */
2958 if (ei->i_state & EXT3_STATE_NEW) 2969 if (ext3_test_inode_state(inode, EXT3_STATE_NEW))
2959 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); 2970 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
2960 2971
2961 ext3_get_inode_flags(ei); 2972 ext3_get_inode_flags(ei);
@@ -3052,7 +3063,7 @@ again:
3052 rc = ext3_journal_dirty_metadata(handle, bh); 3063 rc = ext3_journal_dirty_metadata(handle, bh);
3053 if (!err) 3064 if (!err)
3054 err = rc; 3065 err = rc;
3055 ei->i_state &= ~EXT3_STATE_NEW; 3066 ext3_clear_inode_state(inode, EXT3_STATE_NEW);
3056 3067
3057 atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); 3068 atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
3058out_brelse: 3069out_brelse:
@@ -3096,7 +3107,7 @@ out_brelse:
3096 * `stuff()' is running, and the new i_size will be lost. Plus the inode 3107 * `stuff()' is running, and the new i_size will be lost. Plus the inode
3097 * will no longer be on the superblock's dirty inode list. 3108 * will no longer be on the superblock's dirty inode list.
3098 */ 3109 */
3099int ext3_write_inode(struct inode *inode, int wait) 3110int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
3100{ 3111{
3101 if (current->flags & PF_MEMALLOC) 3112 if (current->flags & PF_MEMALLOC)
3102 return 0; 3113 return 0;
@@ -3107,7 +3118,7 @@ int ext3_write_inode(struct inode *inode, int wait)
3107 return -EIO; 3118 return -EIO;
3108 } 3119 }
3109 3120
3110 if (!wait) 3121 if (wbc->sync_mode != WB_SYNC_ALL)
3111 return 0; 3122 return 0;
3112 3123
3113 return ext3_force_commit(inode->i_sb); 3124 return ext3_force_commit(inode->i_sb);
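
->write_inode() now receives the writeback_control that the flusher code already carries instead of a bare `wait` flag, so the decision to force a commit keys off wbc->sync_mode. Reassembled around the changed lines (guard clauses abbreviated):

	int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
	{
		/* ... PF_MEMALLOC and recursion guards unchanged ... */

		if (wbc->sync_mode != WB_SYNC_ALL)	/* formerly: if (!wait) */
			return 0;

		return ext3_force_commit(inode->i_sb);
	}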
@@ -3140,6 +3151,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
3140 if (error) 3151 if (error)
3141 return error; 3152 return error;
3142 3153
3154 if (ia_valid & ATTR_SIZE)
3155 dquot_initialize(inode);
3143 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 3156 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
3144 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 3157 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
3145 handle_t *handle; 3158 handle_t *handle;
@@ -3152,7 +3165,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
3152 error = PTR_ERR(handle); 3165 error = PTR_ERR(handle);
3153 goto err_out; 3166 goto err_out;
3154 } 3167 }
3155 error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; 3168 error = dquot_transfer(inode, attr);
3156 if (error) { 3169 if (error) {
3157 ext3_journal_stop(handle); 3170 ext3_journal_stop(handle);
3158 return error; 3171 return error;
@@ -3237,7 +3250,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode)
3237 ret = 2 * (bpp + indirects) + 2; 3250 ret = 2 * (bpp + indirects) + 2;
3238 3251
3239#ifdef CONFIG_QUOTA 3252#ifdef CONFIG_QUOTA
3240 /* We know that structure was already allocated during vfs_dq_init so 3253 /* We know that structure was already allocated during dquot_initialize so
3241 * we will be updating only the data blocks + inodes */ 3254 * we will be updating only the data blocks + inodes */
3242 ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); 3255 ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
3243#endif 3256#endif
@@ -3328,7 +3341,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3328 * i_size has been changed by generic_commit_write() and we thus need 3341 * i_size has been changed by generic_commit_write() and we thus need
3329 * to include the updated inode in the current transaction. 3342 * to include the updated inode in the current transaction.
3330 * 3343 *
3331 * Also, vfs_dq_alloc_space() will always dirty the inode when blocks 3344 * Also, dquot_alloc_space() will always dirty the inode when blocks
3332 * are allocated to the file. 3345 * are allocated to the file.
3333 * 3346 *
3334 * If the inode is marked synchronous, we don't honour that here - doing 3347 * If the inode is marked synchronous, we don't honour that here - doing
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 7b0e44f7d66f..ee184084ca42 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1696,6 +1696,8 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
1696 struct inode * inode; 1696 struct inode * inode;
1697 int err, retries = 0; 1697 int err, retries = 0;
1698 1698
1699 dquot_initialize(dir);
1700
1699retry: 1701retry:
1700 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 1702 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1701 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1703 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1730,6 +1732,8 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry,
1730 if (!new_valid_dev(rdev)) 1732 if (!new_valid_dev(rdev))
1731 return -EINVAL; 1733 return -EINVAL;
1732 1734
1735 dquot_initialize(dir);
1736
1733retry: 1737retry:
1734 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 1738 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1735 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1739 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1766,6 +1770,8 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
1766 if (dir->i_nlink >= EXT3_LINK_MAX) 1770 if (dir->i_nlink >= EXT3_LINK_MAX)
1767 return -EMLINK; 1771 return -EMLINK;
1768 1772
1773 dquot_initialize(dir);
1774
1769retry: 1775retry:
1770 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 1776 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1771 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1777 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -2060,7 +2066,9 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
2060 2066
2061 /* Initialize quotas before so that eventual writes go in 2067 /* Initialize quotas before so that eventual writes go in
2062 * separate transaction */ 2068 * separate transaction */
2063 vfs_dq_init(dentry->d_inode); 2069 dquot_initialize(dir);
2070 dquot_initialize(dentry->d_inode);
2071
2064 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); 2072 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
2065 if (IS_ERR(handle)) 2073 if (IS_ERR(handle))
2066 return PTR_ERR(handle); 2074 return PTR_ERR(handle);
@@ -2119,7 +2127,9 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
2119 2127
2120 /* Initialize quotas before so that eventual writes go 2128 /* Initialize quotas before so that eventual writes go
2121 * in separate transaction */ 2129 * in separate transaction */
2122 vfs_dq_init(dentry->d_inode); 2130 dquot_initialize(dir);
2131 dquot_initialize(dentry->d_inode);
2132
2123 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); 2133 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
2124 if (IS_ERR(handle)) 2134 if (IS_ERR(handle))
2125 return PTR_ERR(handle); 2135 return PTR_ERR(handle);
@@ -2174,6 +2184,8 @@ static int ext3_symlink (struct inode * dir,
2174 if (l > dir->i_sb->s_blocksize) 2184 if (l > dir->i_sb->s_blocksize)
2175 return -ENAMETOOLONG; 2185 return -ENAMETOOLONG;
2176 2186
2187 dquot_initialize(dir);
2188
2177retry: 2189retry:
2178 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 2190 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2179 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + 2191 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
@@ -2228,6 +2240,9 @@ static int ext3_link (struct dentry * old_dentry,
2228 2240
2229 if (inode->i_nlink >= EXT3_LINK_MAX) 2241 if (inode->i_nlink >= EXT3_LINK_MAX)
2230 return -EMLINK; 2242 return -EMLINK;
2243
2244 dquot_initialize(dir);
2245
2231 /* 2246 /*
2232 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing 2247 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2233 * otherwise has the potential to corrupt the orphan inode list. 2248 * otherwise has the potential to corrupt the orphan inode list.
@@ -2278,12 +2293,15 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2278 struct ext3_dir_entry_2 * old_de, * new_de; 2293 struct ext3_dir_entry_2 * old_de, * new_de;
2279 int retval, flush_file = 0; 2294 int retval, flush_file = 0;
2280 2295
2296 dquot_initialize(old_dir);
2297 dquot_initialize(new_dir);
2298
2281 old_bh = new_bh = dir_bh = NULL; 2299 old_bh = new_bh = dir_bh = NULL;
2282 2300
2283 /* Initialize quotas before so that eventual writes go 2301 /* Initialize quotas before so that eventual writes go
2284 * in separate transaction */ 2302 * in separate transaction */
2285 if (new_dentry->d_inode) 2303 if (new_dentry->d_inode)
2286 vfs_dq_init(new_dentry->d_inode); 2304 dquot_initialize(new_dentry->d_inode);
2287 handle = ext3_journal_start(old_dir, 2 * 2305 handle = ext3_journal_start(old_dir, 2 *
2288 EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + 2306 EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2289 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); 2307 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index afa2b569da10..1bee604cc6cd 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -164,7 +164,7 @@ void ext3_msg(struct super_block *sb, const char *prefix,
164 * write out the superblock safely. 164 * write out the superblock safely.
165 * 165 *
166 * We'll just use the journal_abort() error code to record an error in 166 * We'll just use the journal_abort() error code to record an error in
167 * the journal instead. On recovery, the journal will compain about 167 * the journal instead. On recovery, the journal will complain about
168 * that error until we've noted it down and cleared it. 168 * that error until we've noted it down and cleared it.
169 */ 169 */
170 170
@@ -181,7 +181,7 @@ static void ext3_handle_error(struct super_block *sb)
181 if (!test_opt (sb, ERRORS_CONT)) { 181 if (!test_opt (sb, ERRORS_CONT)) {
182 journal_t *journal = EXT3_SB(sb)->s_journal; 182 journal_t *journal = EXT3_SB(sb)->s_journal;
183 183
184 EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; 184 set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
185 if (journal) 185 if (journal)
186 journal_abort(journal, -EIO); 186 journal_abort(journal, -EIO);
187 } 187 }
@@ -296,7 +296,7 @@ void ext3_abort (struct super_block * sb, const char * function,
296 "error: remounting filesystem read-only"); 296 "error: remounting filesystem read-only");
297 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; 297 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
298 sb->s_flags |= MS_RDONLY; 298 sb->s_flags |= MS_RDONLY;
299 EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; 299 set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
300 if (EXT3_SB(sb)->s_journal) 300 if (EXT3_SB(sb)->s_journal)
301 journal_abort(EXT3_SB(sb)->s_journal, -EIO); 301 journal_abort(EXT3_SB(sb)->s_journal, -EIO);
302} 302}
@@ -528,6 +528,8 @@ static void destroy_inodecache(void)
528static void ext3_clear_inode(struct inode *inode) 528static void ext3_clear_inode(struct inode *inode)
529{ 529{
530 struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info; 530 struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
531
532 dquot_drop(inode);
531 ext3_discard_reservation(inode); 533 ext3_discard_reservation(inode);
532 EXT3_I(inode)->i_block_alloc_info = NULL; 534 EXT3_I(inode)->i_block_alloc_info = NULL;
533 if (unlikely(rsv)) 535 if (unlikely(rsv))
@@ -562,10 +564,10 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl
562 if (sbi->s_qf_names[GRPQUOTA]) 564 if (sbi->s_qf_names[GRPQUOTA])
563 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); 565 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
564 566
565 if (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) 567 if (test_opt(sb, USRQUOTA))
566 seq_puts(seq, ",usrquota"); 568 seq_puts(seq, ",usrquota");
567 569
568 if (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) 570 if (test_opt(sb, GRPQUOTA))
569 seq_puts(seq, ",grpquota"); 571 seq_puts(seq, ",grpquota");
570#endif 572#endif
571} 573}
@@ -656,8 +658,7 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
656 if (test_opt(sb, NOBH)) 658 if (test_opt(sb, NOBH))
657 seq_puts(seq, ",nobh"); 659 seq_puts(seq, ",nobh");
658 660
659 seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt & 661 seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS)));
660 EXT3_MOUNT_DATA_FLAGS));
661 if (test_opt(sb, DATA_ERR_ABORT)) 662 if (test_opt(sb, DATA_ERR_ABORT))
662 seq_puts(seq, ",data_err=abort"); 663 seq_puts(seq, ",data_err=abort");
663 664
@@ -751,13 +752,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
751 const char *data, size_t len, loff_t off); 752 const char *data, size_t len, loff_t off);
752 753
753static const struct dquot_operations ext3_quota_operations = { 754static const struct dquot_operations ext3_quota_operations = {
754 .initialize = dquot_initialize,
755 .drop = dquot_drop,
756 .alloc_space = dquot_alloc_space,
757 .alloc_inode = dquot_alloc_inode,
758 .free_space = dquot_free_space,
759 .free_inode = dquot_free_inode,
760 .transfer = dquot_transfer,
761 .write_dquot = ext3_write_dquot, 755 .write_dquot = ext3_write_dquot,
762 .acquire_dquot = ext3_acquire_dquot, 756 .acquire_dquot = ext3_acquire_dquot,
763 .release_dquot = ext3_release_dquot, 757 .release_dquot = ext3_release_dquot,
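
Dropping the first seven entries works because the quota core no longer dispatches the generic accounting operations through this table at all: ext3 now calls dquot_initialize(), dquot_drop(), dquot_alloc_block() and friends directly, as the hunks throughout this patch show. What remains is only the journalling-specific glue; the trimmed table is essentially:

	static const struct dquot_operations ext3_quota_operations = {
		/* generic initialize/drop/alloc/free/transfer hooks removed */
		.write_dquot	= ext3_write_dquot,
		.acquire_dquot	= ext3_acquire_dquot,
		.release_dquot	= ext3_release_dquot,
		/* ... remaining journalled-quota callbacks unchanged ... */
	};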
@@ -896,6 +890,63 @@ static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb)
896 return sb_block; 890 return sb_block;
897} 891}
898 892
893#ifdef CONFIG_QUOTA
894static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
895{
896 struct ext3_sb_info *sbi = EXT3_SB(sb);
897 char *qname;
898
899 if (sb_any_quota_loaded(sb) &&
900 !sbi->s_qf_names[qtype]) {
901 ext3_msg(sb, KERN_ERR,
902 "Cannot change journaled "
903 "quota options when quota turned on");
904 return 0;
905 }
906 qname = match_strdup(args);
907 if (!qname) {
908 ext3_msg(sb, KERN_ERR,
909 "Not enough memory for storing quotafile name");
910 return 0;
911 }
912 if (sbi->s_qf_names[qtype] &&
913 strcmp(sbi->s_qf_names[qtype], qname)) {
914 ext3_msg(sb, KERN_ERR,
915 "%s quota file already specified", QTYPE2NAME(qtype));
916 kfree(qname);
917 return 0;
918 }
919 sbi->s_qf_names[qtype] = qname;
920 if (strchr(sbi->s_qf_names[qtype], '/')) {
921 ext3_msg(sb, KERN_ERR,
922 "quotafile must be on filesystem root");
923 kfree(sbi->s_qf_names[qtype]);
924 sbi->s_qf_names[qtype] = NULL;
925 return 0;
926 }
927 set_opt(sbi->s_mount_opt, QUOTA);
928 return 1;
929}
930
931static int clear_qf_name(struct super_block *sb, int qtype)
932{
933 struct ext3_sb_info *sbi = EXT3_SB(sb);
934
935 if (sb_any_quota_loaded(sb) &&
936 sbi->s_qf_names[qtype]) {
937 ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options"
938 " when quota turned on");
939 return 0;
940 }
941 /*
942 * The space will be released later when all options are confirmed
943 * to be correct
944 */
945 sbi->s_qf_names[qtype] = NULL;
946 return 1;
947}
948#endif
949
899static int parse_options (char *options, struct super_block *sb, 950static int parse_options (char *options, struct super_block *sb,
900 unsigned int *inum, unsigned long *journal_devnum, 951 unsigned int *inum, unsigned long *journal_devnum,
901 ext3_fsblk_t *n_blocks_count, int is_remount) 952 ext3_fsblk_t *n_blocks_count, int is_remount)
@@ -906,8 +957,7 @@ static int parse_options (char *options, struct super_block *sb,
906 int data_opt = 0; 957 int data_opt = 0;
907 int option; 958 int option;
908#ifdef CONFIG_QUOTA 959#ifdef CONFIG_QUOTA
909 int qtype, qfmt; 960 int qfmt;
910 char *qname;
911#endif 961#endif
912 962
913 if (!options) 963 if (!options)
@@ -1065,20 +1115,19 @@ static int parse_options (char *options, struct super_block *sb,
1065 data_opt = EXT3_MOUNT_WRITEBACK_DATA; 1115 data_opt = EXT3_MOUNT_WRITEBACK_DATA;
1066 datacheck: 1116 datacheck:
1067 if (is_remount) { 1117 if (is_remount) {
1068 if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS) 1118 if (test_opt(sb, DATA_FLAGS) == data_opt)
1069 == data_opt)
1070 break; 1119 break;
1071 ext3_msg(sb, KERN_ERR, 1120 ext3_msg(sb, KERN_ERR,
1072 "error: cannot change " 1121 "error: cannot change "
1073 "data mode on remount. The filesystem " 1122 "data mode on remount. The filesystem "
1074 "is mounted in data=%s mode and you " 1123 "is mounted in data=%s mode and you "
1075 "try to remount it in data=%s mode.", 1124 "try to remount it in data=%s mode.",
1076 data_mode_string(sbi->s_mount_opt & 1125 data_mode_string(test_opt(sb,
1077 EXT3_MOUNT_DATA_FLAGS), 1126 DATA_FLAGS)),
1078 data_mode_string(data_opt)); 1127 data_mode_string(data_opt));
1079 return 0; 1128 return 0;
1080 } else { 1129 } else {
1081 sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS; 1130 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
1082 sbi->s_mount_opt |= data_opt; 1131 sbi->s_mount_opt |= data_opt;
1083 } 1132 }
1084 break; 1133 break;
@@ -1090,62 +1139,20 @@ static int parse_options (char *options, struct super_block *sb,
1090 break; 1139 break;
1091#ifdef CONFIG_QUOTA 1140#ifdef CONFIG_QUOTA
1092 case Opt_usrjquota: 1141 case Opt_usrjquota:
1093 qtype = USRQUOTA; 1142 if (!set_qf_name(sb, USRQUOTA, &args[0]))
1094 goto set_qf_name;
1095 case Opt_grpjquota:
1096 qtype = GRPQUOTA;
1097set_qf_name:
1098 if (sb_any_quota_loaded(sb) &&
1099 !sbi->s_qf_names[qtype]) {
1100 ext3_msg(sb, KERN_ERR,
1101 "error: cannot change journaled "
1102 "quota options when quota turned on.");
1103 return 0;
1104 }
1105 qname = match_strdup(&args[0]);
1106 if (!qname) {
1107 ext3_msg(sb, KERN_ERR,
1108 "error: not enough memory for "
1109 "storing quotafile name.");
1110 return 0; 1143 return 0;
1111 } 1144 break;
1112 if (sbi->s_qf_names[qtype] && 1145 case Opt_grpjquota:
1113 strcmp(sbi->s_qf_names[qtype], qname)) { 1146 if (!set_qf_name(sb, GRPQUOTA, &args[0]))
1114 ext3_msg(sb, KERN_ERR,
1115 "error: %s quota file already "
1116 "specified.", QTYPE2NAME(qtype));
1117 kfree(qname);
1118 return 0;
1119 }
1120 sbi->s_qf_names[qtype] = qname;
1121 if (strchr(sbi->s_qf_names[qtype], '/')) {
1122 ext3_msg(sb, KERN_ERR,
1123 "error: quotafile must be on "
1124 "filesystem root.");
1125 kfree(sbi->s_qf_names[qtype]);
1126 sbi->s_qf_names[qtype] = NULL;
1127 return 0; 1147 return 0;
1128 }
1129 set_opt(sbi->s_mount_opt, QUOTA);
1130 break; 1148 break;
1131 case Opt_offusrjquota: 1149 case Opt_offusrjquota:
1132 qtype = USRQUOTA; 1150 if (!clear_qf_name(sb, USRQUOTA))
1133 goto clear_qf_name; 1151 return 0;
1152 break;
1134 case Opt_offgrpjquota: 1153 case Opt_offgrpjquota:
1135 qtype = GRPQUOTA; 1154 if (!clear_qf_name(sb, GRPQUOTA))
1136clear_qf_name:
1137 if (sb_any_quota_loaded(sb) &&
1138 sbi->s_qf_names[qtype]) {
1139 ext3_msg(sb, KERN_ERR, "error: cannot change "
1140 "journaled quota options when "
1141 "quota turned on.");
1142 return 0; 1155 return 0;
1143 }
1144 /*
1145 * The space will be released later when all options
1146 * are confirmed to be correct
1147 */
1148 sbi->s_qf_names[qtype] = NULL;
1149 break; 1156 break;
1150 case Opt_jqfmt_vfsold: 1157 case Opt_jqfmt_vfsold:
1151 qfmt = QFMT_VFS_OLD; 1158 qfmt = QFMT_VFS_OLD;
@@ -1244,18 +1251,12 @@ set_qf_format:
1244 } 1251 }
1245#ifdef CONFIG_QUOTA 1252#ifdef CONFIG_QUOTA
1246 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 1253 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1247 if ((sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) && 1254 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
1248 sbi->s_qf_names[USRQUOTA])
1249 clear_opt(sbi->s_mount_opt, USRQUOTA); 1255 clear_opt(sbi->s_mount_opt, USRQUOTA);
1250 1256 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
1251 if ((sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) &&
1252 sbi->s_qf_names[GRPQUOTA])
1253 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1257 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1254 1258
1255 if ((sbi->s_qf_names[USRQUOTA] && 1259 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
1256 (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) ||
1257 (sbi->s_qf_names[GRPQUOTA] &&
1258 (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) {
1259 ext3_msg(sb, KERN_ERR, "error: old and new quota " 1260 ext3_msg(sb, KERN_ERR, "error: old and new quota "
1260 "format mixing."); 1261 "format mixing.");
1261 return 0; 1262 return 0;
@@ -1478,7 +1479,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1478 } 1479 }
1479 1480
1480 list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); 1481 list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
1481 vfs_dq_init(inode); 1482 dquot_initialize(inode);
1482 if (inode->i_nlink) { 1483 if (inode->i_nlink) {
1483 printk(KERN_DEBUG 1484 printk(KERN_DEBUG
1484 "%s: truncating inode %lu to %Ld bytes\n", 1485 "%s: truncating inode %lu to %Ld bytes\n",
@@ -1671,11 +1672,11 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1671 set_opt(sbi->s_mount_opt, POSIX_ACL); 1672 set_opt(sbi->s_mount_opt, POSIX_ACL);
1672#endif 1673#endif
1673 if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA) 1674 if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA)
1674 sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA; 1675 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
1675 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED) 1676 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED)
1676 sbi->s_mount_opt |= EXT3_MOUNT_ORDERED_DATA; 1677 set_opt(sbi->s_mount_opt, ORDERED_DATA);
1677 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK) 1678 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK)
1678 sbi->s_mount_opt |= EXT3_MOUNT_WRITEBACK_DATA; 1679 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
1679 1680
1680 if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC) 1681 if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC)
1681 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 1682 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
@@ -1694,7 +1695,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1694 goto failed_mount; 1695 goto failed_mount;
1695 1696
1696 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 1697 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
1697 ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 1698 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
1698 1699
1699 if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV && 1700 if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
1700 (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || 1701 (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
@@ -2561,11 +2562,11 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2561 goto restore_opts; 2562 goto restore_opts;
2562 } 2563 }
2563 2564
2564 if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) 2565 if (test_opt(sb, ABORT))
2565 ext3_abort(sb, __func__, "Abort forced by user"); 2566 ext3_abort(sb, __func__, "Abort forced by user");
2566 2567
2567 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 2568 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
2568 ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 2569 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
2569 2570
2570 es = sbi->s_es; 2571 es = sbi->s_es;
2571 2572
@@ -2573,7 +2574,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2573 2574
2574 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || 2575 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
2575 n_blocks_count > le32_to_cpu(es->s_blocks_count)) { 2576 n_blocks_count > le32_to_cpu(es->s_blocks_count)) {
2576 if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) { 2577 if (test_opt(sb, ABORT)) {
2577 err = -EROFS; 2578 err = -EROFS;
2578 goto restore_opts; 2579 goto restore_opts;
2579 } 2580 }
@@ -2734,7 +2735,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
2734 * Process 1 Process 2 2735 * Process 1 Process 2
2735 * ext3_create() quota_sync() 2736 * ext3_create() quota_sync()
2736 * journal_start() write_dquot() 2737 * journal_start() write_dquot()
2737 * vfs_dq_init() down(dqio_mutex) 2738 * dquot_initialize() down(dqio_mutex)
2738 * down(dqio_mutex) journal_start() 2739 * down(dqio_mutex) journal_start()
2739 * 2740 *
2740 */ 2741 */
@@ -2942,9 +2943,7 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
2942 sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); 2943 sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
2943 int err = 0; 2944 int err = 0;
2944 int offset = off & (sb->s_blocksize - 1); 2945 int offset = off & (sb->s_blocksize - 1);
2945 int tocopy;
2946 int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL; 2946 int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL;
2947 size_t towrite = len;
2948 struct buffer_head *bh; 2947 struct buffer_head *bh;
2949 handle_t *handle = journal_current_handle(); 2948 handle_t *handle = journal_current_handle();
2950 2949
@@ -2955,53 +2954,54 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
2955 (unsigned long long)off, (unsigned long long)len); 2954 (unsigned long long)off, (unsigned long long)len);
2956 return -EIO; 2955 return -EIO;
2957 } 2956 }
2957
2958 /*
2959 * Since we account for only one data block in the transaction
2960 * credits, it is impossible to cross a block boundary.
2961 */
2962 if (sb->s_blocksize - offset < len) {
2963 ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
2964 " cancelled because not block aligned",
2965 (unsigned long long)off, (unsigned long long)len);
2966 return -EIO;
2967 }
2958 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); 2968 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2959 while (towrite > 0) { 2969 bh = ext3_bread(handle, inode, blk, 1, &err);
2960 tocopy = sb->s_blocksize - offset < towrite ? 2970 if (!bh)
2961 sb->s_blocksize - offset : towrite; 2971 goto out;
2962 bh = ext3_bread(handle, inode, blk, 1, &err); 2972 if (journal_quota) {
2963 if (!bh) 2973 err = ext3_journal_get_write_access(handle, bh);
2974 if (err) {
2975 brelse(bh);
2964 goto out; 2976 goto out;
2965 if (journal_quota) {
2966 err = ext3_journal_get_write_access(handle, bh);
2967 if (err) {
2968 brelse(bh);
2969 goto out;
2970 }
2971 }
2972 lock_buffer(bh);
2973 memcpy(bh->b_data+offset, data, tocopy);
2974 flush_dcache_page(bh->b_page);
2975 unlock_buffer(bh);
2976 if (journal_quota)
2977 err = ext3_journal_dirty_metadata(handle, bh);
2978 else {
2979 /* Always do at least ordered writes for quotas */
2980 err = ext3_journal_dirty_data(handle, bh);
2981 mark_buffer_dirty(bh);
2982 } 2977 }
2983 brelse(bh);
2984 if (err)
2985 goto out;
2986 offset = 0;
2987 towrite -= tocopy;
2988 data += tocopy;
2989 blk++;
2990 } 2978 }
2979 lock_buffer(bh);
2980 memcpy(bh->b_data+offset, data, len);
2981 flush_dcache_page(bh->b_page);
2982 unlock_buffer(bh);
2983 if (journal_quota)
2984 err = ext3_journal_dirty_metadata(handle, bh);
2985 else {
2986 /* Always do at least ordered writes for quotas */
2987 err = ext3_journal_dirty_data(handle, bh);
2988 mark_buffer_dirty(bh);
2989 }
2990 brelse(bh);
2991out: 2991out:
2992 if (len == towrite) { 2992 if (err) {
2993 mutex_unlock(&inode->i_mutex); 2993 mutex_unlock(&inode->i_mutex);
2994 return err; 2994 return err;
2995 } 2995 }
2996 if (inode->i_size < off+len-towrite) { 2996 if (inode->i_size < off + len) {
2997 i_size_write(inode, off+len-towrite); 2997 i_size_write(inode, off + len);
2998 EXT3_I(inode)->i_disksize = inode->i_size; 2998 EXT3_I(inode)->i_disksize = inode->i_size;
2999 } 2999 }
3000 inode->i_version++; 3000 inode->i_version++;
3001 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 3001 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
3002 ext3_mark_inode_dirty(handle, inode); 3002 ext3_mark_inode_dirty(handle, inode);
3003 mutex_unlock(&inode->i_mutex); 3003 mutex_unlock(&inode->i_mutex);
3004 return len - towrite; 3004 return len;
3005} 3005}
3006 3006
3007#endif 3007#endif
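
The rewritten ext3_quota_write() above drops the multi-block copy loop: the surrounding transaction reserves credits for exactly one data block, so a write spanning two blocks is now refused with -EIO instead of looped over. The block arithmetic it relies on is simple mask-and-shift; for a 4096-byte block size, a quota write at byte offset 8200 lands in block 2 at in-block offset 8:

	sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);	/* 8200 >> 12 == 2 */
	int offset = off & (sb->s_blocksize - 1);	/* 8200 & 4095 == 8 */

	/* only one block of credits, so a boundary crossing is refused */
	if (sb->s_blocksize - offset < len)
		return -EIO;			/* 4096 - 8 = 4088 bytes fit */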
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 66895ccf76c7..534a94c3a933 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -274,7 +274,7 @@ ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
274 void *end; 274 void *end;
275 int error; 275 int error;
276 276
277 if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)) 277 if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
278 return -ENODATA; 278 return -ENODATA;
279 error = ext3_get_inode_loc(inode, &iloc); 279 error = ext3_get_inode_loc(inode, &iloc);
280 if (error) 280 if (error)
@@ -403,7 +403,7 @@ ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
403 void *end; 403 void *end;
404 int error; 404 int error;
405 405
406 if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)) 406 if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
407 return 0; 407 return 0;
408 error = ext3_get_inode_loc(inode, &iloc); 408 error = ext3_get_inode_loc(inode, &iloc);
409 if (error) 409 if (error)
@@ -500,7 +500,7 @@ ext3_xattr_release_block(handle_t *handle, struct inode *inode,
500 error = ext3_journal_dirty_metadata(handle, bh); 500 error = ext3_journal_dirty_metadata(handle, bh);
501 if (IS_SYNC(inode)) 501 if (IS_SYNC(inode))
502 handle->h_sync = 1; 502 handle->h_sync = 1;
503 vfs_dq_free_block(inode, 1); 503 dquot_free_block(inode, 1);
504 ea_bdebug(bh, "refcount now=%d; releasing", 504 ea_bdebug(bh, "refcount now=%d; releasing",
505 le32_to_cpu(BHDR(bh)->h_refcount)); 505 le32_to_cpu(BHDR(bh)->h_refcount));
506 if (ce) 506 if (ce)
@@ -775,8 +775,8 @@ inserted:
775 else { 775 else {
776 /* The old block is released after updating 776 /* The old block is released after updating
777 the inode. */ 777 the inode. */
778 error = -EDQUOT; 778 error = dquot_alloc_block(inode, 1);
779 if (vfs_dq_alloc_block(inode, 1)) 779 if (error)
780 goto cleanup; 780 goto cleanup;
781 error = ext3_journal_get_write_access(handle, 781 error = ext3_journal_get_write_access(handle,
782 new_bh); 782 new_bh);
@@ -850,7 +850,7 @@ cleanup:
850 return error; 850 return error;
851 851
852cleanup_dquot: 852cleanup_dquot:
853 vfs_dq_free_block(inode, 1); 853 dquot_free_block(inode, 1);
854 goto cleanup; 854 goto cleanup;
855 855
856bad_block: 856bad_block:
@@ -882,7 +882,7 @@ ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i,
882 is->s.base = is->s.first = IFIRST(header); 882 is->s.base = is->s.first = IFIRST(header);
883 is->s.here = is->s.first; 883 is->s.here = is->s.first;
884 is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; 884 is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
885 if (EXT3_I(inode)->i_state & EXT3_STATE_XATTR) { 885 if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) {
886 error = ext3_xattr_check_names(IFIRST(header), is->s.end); 886 error = ext3_xattr_check_names(IFIRST(header), is->s.end);
887 if (error) 887 if (error)
888 return error; 888 return error;
@@ -914,10 +914,10 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
914 header = IHDR(inode, ext3_raw_inode(&is->iloc)); 914 header = IHDR(inode, ext3_raw_inode(&is->iloc));
915 if (!IS_LAST_ENTRY(s->first)) { 915 if (!IS_LAST_ENTRY(s->first)) {
916 header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); 916 header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
917 EXT3_I(inode)->i_state |= EXT3_STATE_XATTR; 917 ext3_set_inode_state(inode, EXT3_STATE_XATTR);
918 } else { 918 } else {
919 header->h_magic = cpu_to_le32(0); 919 header->h_magic = cpu_to_le32(0);
920 EXT3_I(inode)->i_state &= ~EXT3_STATE_XATTR; 920 ext3_clear_inode_state(inode, EXT3_STATE_XATTR);
921 } 921 }
922 return 0; 922 return 0;
923} 923}
@@ -967,10 +967,10 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
967 if (error) 967 if (error)
968 goto cleanup; 968 goto cleanup;
969 969
970 if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) { 970 if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) {
971 struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc); 971 struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc);
972 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); 972 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
973 EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW; 973 ext3_clear_inode_state(inode, EXT3_STATE_NEW);
974 } 974 }
975 975
976 error = ext3_xattr_ibody_find(inode, &i, &is); 976 error = ext3_xattr_ibody_find(inode, &i, &is);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 22bc7435d913..d2f37a5516c7 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -97,8 +97,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
97 /* If checksum is bad mark all blocks used to prevent allocation 97 /* If checksum is bad mark all blocks used to prevent allocation
98 * essentially implementing a per-group read-only flag. */ 98 * essentially implementing a per-group read-only flag. */
99 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 99 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
100 ext4_error(sb, __func__, 100 ext4_error(sb, "Checksum bad for group %u",
101 "Checksum bad for group %u", block_group); 101 block_group);
102 ext4_free_blks_set(sb, gdp, 0); 102 ext4_free_blks_set(sb, gdp, 0);
103 ext4_free_inodes_set(sb, gdp, 0); 103 ext4_free_inodes_set(sb, gdp, 0);
104 ext4_itable_unused_set(sb, gdp, 0); 104 ext4_itable_unused_set(sb, gdp, 0);
@@ -130,8 +130,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
130 * to make sure we calculate the right free blocks 130 * to make sure we calculate the right free blocks
131 */ 131 */
132 group_blocks = ext4_blocks_count(sbi->s_es) - 132 group_blocks = ext4_blocks_count(sbi->s_es) -
133 le32_to_cpu(sbi->s_es->s_first_data_block) - 133 ext4_group_first_block_no(sb, ngroups - 1);
134 (EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
135 } else { 134 } else {
136 group_blocks = EXT4_BLOCKS_PER_GROUP(sb); 135 group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
137 } 136 }
@@ -189,9 +188,6 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
189 * when a file system is mounted (see ext4_fill_super). 188 * when a file system is mounted (see ext4_fill_super).
190 */ 189 */
191 190
192
193#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
194
195/** 191/**
196 * ext4_get_group_desc() -- load group descriptor from disk 192 * ext4_get_group_desc() -- load group descriptor from disk
197 * @sb: super block 193 * @sb: super block
@@ -210,10 +206,8 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
210 struct ext4_sb_info *sbi = EXT4_SB(sb); 206 struct ext4_sb_info *sbi = EXT4_SB(sb);
211 207
212 if (block_group >= ngroups) { 208 if (block_group >= ngroups) {
213 ext4_error(sb, "ext4_get_group_desc", 209 ext4_error(sb, "block_group >= groups_count - block_group = %u,"
214 "block_group >= groups_count - " 210 " groups_count = %u", block_group, ngroups);
215 "block_group = %u, groups_count = %u",
216 block_group, ngroups);
217 211
218 return NULL; 212 return NULL;
219 } 213 }
@@ -221,8 +215,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
221 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); 215 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
222 offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); 216 offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
223 if (!sbi->s_group_desc[group_desc]) { 217 if (!sbi->s_group_desc[group_desc]) {
224 ext4_error(sb, "ext4_get_group_desc", 218 ext4_error(sb, "Group descriptor not loaded - "
225 "Group descriptor not loaded - "
226 "block_group = %u, group_desc = %u, desc = %u", 219 "block_group = %u, group_desc = %u, desc = %u",
227 block_group, group_desc, offset); 220 block_group, group_desc, offset);
228 return NULL; 221 return NULL;
@@ -282,9 +275,7 @@ static int ext4_valid_block_bitmap(struct super_block *sb,
282 return 1; 275 return 1;
283 276
284err_out: 277err_out:
285 ext4_error(sb, __func__, 278 ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu",
286 "Invalid block bitmap - "
287 "block_group = %d, block = %llu",
288 block_group, bitmap_blk); 279 block_group, bitmap_blk);
289 return 0; 280 return 0;
290} 281}
@@ -311,8 +302,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
311 bitmap_blk = ext4_block_bitmap(sb, desc); 302 bitmap_blk = ext4_block_bitmap(sb, desc);
312 bh = sb_getblk(sb, bitmap_blk); 303 bh = sb_getblk(sb, bitmap_blk);
313 if (unlikely(!bh)) { 304 if (unlikely(!bh)) {
314 ext4_error(sb, __func__, 305 ext4_error(sb, "Cannot read block bitmap - "
315 "Cannot read block bitmap - "
316 "block_group = %u, block_bitmap = %llu", 306 "block_group = %u, block_bitmap = %llu",
317 block_group, bitmap_blk); 307 block_group, bitmap_blk);
318 return NULL; 308 return NULL;
@@ -354,8 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
354 set_bitmap_uptodate(bh); 344 set_bitmap_uptodate(bh);
355 if (bh_submit_read(bh) < 0) { 345 if (bh_submit_read(bh) < 0) {
356 put_bh(bh); 346 put_bh(bh);
357 ext4_error(sb, __func__, 347 ext4_error(sb, "Cannot read block bitmap - "
358 "Cannot read block bitmap - "
359 "block_group = %u, block_bitmap = %llu", 348 "block_group = %u, block_bitmap = %llu",
360 block_group, bitmap_blk); 349 block_group, bitmap_blk);
361 return NULL; 350 return NULL;
@@ -419,8 +408,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
419 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || 408 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
420 in_range(block + count - 1, ext4_inode_table(sb, desc), 409 in_range(block + count - 1, ext4_inode_table(sb, desc),
421 sbi->s_itb_per_group)) { 410 sbi->s_itb_per_group)) {
422 ext4_error(sb, __func__, 411 ext4_error(sb, "Adding blocks in system zones - "
423 "Adding blocks in system zones - "
424 "Block = %llu, count = %lu", 412 "Block = %llu, count = %lu",
425 block, count); 413 block, count);
426 goto error_return; 414 goto error_return;
@@ -453,8 +441,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
453 BUFFER_TRACE(bitmap_bh, "clear bit"); 441 BUFFER_TRACE(bitmap_bh, "clear bit");
454 if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), 442 if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
455 bit + i, bitmap_bh->b_data)) { 443 bit + i, bitmap_bh->b_data)) {
456 ext4_error(sb, __func__, 444 ext4_error(sb, "bit already cleared for block %llu",
457 "bit already cleared for block %llu",
458 (ext4_fsblk_t)(block + i)); 445 (ext4_fsblk_t)(block + i));
459 BUFFER_TRACE(bitmap_bh, "bit already cleared"); 446 BUFFER_TRACE(bitmap_bh, "bit already cleared");
460 } else { 447 } else {
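
These ext4_error() call sites lose their explicit function-name argument because the name is now captured by a macro wrapping __ext4_error() (the macro lives in ext4.h and is not part of this excerpt). The arrangement is presumably along these lines:

	/* sketch -- the exact definition is outside this diff */
	extern void __ext4_error(struct super_block *sb, const char *function,
				 const char *fmt, ...)
		__attribute__((format(printf, 3, 4)));

	#define ext4_error(sb, fmt, ...) \
		__ext4_error((sb), __func__, (fmt), ## __VA_ARGS__)

Call sites shrink to ext4_error(sb, "Checksum bad for group %u", block_group) while the logged message still carries the function name.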
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index a60ab9aad57d..983f0e127493 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -205,14 +205,14 @@ void ext4_release_system_zone(struct super_block *sb)
205 entry = rb_entry(n, struct ext4_system_zone, node); 205 entry = rb_entry(n, struct ext4_system_zone, node);
206 kmem_cache_free(ext4_system_zone_cachep, entry); 206 kmem_cache_free(ext4_system_zone_cachep, entry);
207 if (!parent) 207 if (!parent)
208 EXT4_SB(sb)->system_blks.rb_node = NULL; 208 EXT4_SB(sb)->system_blks = RB_ROOT;
209 else if (parent->rb_left == n) 209 else if (parent->rb_left == n)
210 parent->rb_left = NULL; 210 parent->rb_left = NULL;
211 else if (parent->rb_right == n) 211 else if (parent->rb_right == n)
212 parent->rb_right = NULL; 212 parent->rb_right = NULL;
213 n = parent; 213 n = parent;
214 } 214 }
215 EXT4_SB(sb)->system_blks.rb_node = NULL; 215 EXT4_SB(sb)->system_blks = RB_ROOT;
216} 216}
217 217
218/* 218/*
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 9dc93168e262..86cb6d86a048 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,10 +83,12 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
83 error_msg = "inode out of bounds"; 83 error_msg = "inode out of bounds";
84 84
85 if (error_msg != NULL) 85 if (error_msg != NULL)
86 ext4_error(dir->i_sb, function, 86 __ext4_error(dir->i_sb, function,
87 "bad entry in directory #%lu: %s - " 87 "bad entry in directory #%lu: %s - block=%llu"
88 "offset=%u, inode=%u, rec_len=%d, name_len=%d", 88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
89 dir->i_ino, error_msg, offset, 89 dir->i_ino, error_msg,
90 (unsigned long long) bh->b_blocknr,
91 (unsigned) (offset%bh->b_size), offset,
90 le32_to_cpu(de->inode), 92 le32_to_cpu(de->inode),
91 rlen, de->name_len); 93 rlen, de->name_len);
92 return error_msg == NULL ? 1 : 0; 94 return error_msg == NULL ? 1 : 0;
@@ -150,7 +152,7 @@ static int ext4_readdir(struct file *filp,
150 */ 152 */
151 if (!bh) { 153 if (!bh) {
152 if (!dir_has_error) { 154 if (!dir_has_error) {
153 ext4_error(sb, __func__, "directory #%lu " 155 ext4_error(sb, "directory #%lu "
154 "contains a hole at offset %Lu", 156 "contains a hole at offset %Lu",
155 inode->i_ino, 157 inode->i_ino,
156 (unsigned long long) filp->f_pos); 158 (unsigned long long) filp->f_pos);
@@ -303,7 +305,7 @@ static void free_rb_tree_fname(struct rb_root *root)
303 kfree(old); 305 kfree(old);
304 } 306 }
305 if (!parent) 307 if (!parent)
306 root->rb_node = NULL; 308 *root = RB_ROOT;
307 else if (parent->rb_left == n) 309 else if (parent->rb_left == n)
308 parent->rb_left = NULL; 310 parent->rb_left = NULL;
309 else if (parent->rb_right == n) 311 else if (parent->rb_right == n)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 874d169a193e..bf938cf7c5f0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -53,6 +53,12 @@
 #define ext4_debug(f, a...)	do {} while (0)
 #endif
 
+#define EXT4_ERROR_INODE(inode, fmt, a...) \
+	ext4_error_inode(__func__, (inode), (fmt), ## a);
+
+#define EXT4_ERROR_FILE(file, fmt, a...)	\
+	ext4_error_file(__func__, (file), (fmt), ## a);
+
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
 
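
Aside: the two macros above exist so that __func__ is evaluated at each call site rather than inside a helper, tagging every report with its real caller. A user-space analogue, using the same GNU named-variadic-macro style as this header (all names below are illustrative, not kernel API):

	#include <stdarg.h>
	#include <stdio.h>

	static void error_inode(const char *function, unsigned long ino,
				const char *fmt, ...)
	{
		va_list ap;

		fprintf(stderr, "error in %s (inode #%lu): ", function, ino);
		va_start(ap, fmt);
		vfprintf(stderr, fmt, ap);
		va_end(ap);
		fputc('\n', stderr);
	}

	#define ERROR_INODE(ino, fmt, a...) \
		error_inode(__func__, (ino), (fmt), ## a)

	int main(void)
	{
		ERROR_INODE(12UL, "bad extent at block %llu", 42ULL);
		return 0;
	}
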
@@ -133,14 +139,14 @@ struct mpage_da_data {
 	int pages_written;
 	int retval;
 };
-#define	DIO_AIO_UNWRITTEN	0x1
+#define	EXT4_IO_UNWRITTEN	0x1
 typedef struct ext4_io_end {
 	struct list_head	list;	/* per-file finished AIO list */
 	struct inode		*inode;	/* file being written to */
 	unsigned int		flag;	/* unwritten or not */
-	int			error;	/* I/O error code */
-	ext4_lblk_t		offset;	/* offset in the file */
-	size_t			size;	/* size of the extent */
+	struct page		*page;	/* page struct for buffer write */
+	loff_t			offset;	/* offset in the file */
+	ssize_t			size;	/* size of the extent */
 	struct work_struct	work;	/* data work queue */
 } ext4_io_end_t;
 
@@ -284,10 +290,12 @@ struct flex_groups {
 #define EXT4_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
 #define EXT4_HUGE_FILE_FL		0x00040000 /* Set to each huge file */
 #define EXT4_EXTENTS_FL			0x00080000 /* Inode uses extents */
+#define EXT4_EA_INODE_FL		0x00200000 /* Inode used for large EA */
+#define EXT4_EOFBLOCKS_FL		0x00400000 /* Blocks allocated beyond EOF */
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
-#define EXT4_FL_USER_VISIBLE		0x000BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE	0x000B80FF /* User modifiable flags */
+#define EXT4_FL_USER_VISIBLE		0x004BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE	0x004B80FF /* User modifiable flags */
 
 /* Flags that should be inherited by new inodes from their parent. */
 #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
@@ -313,17 +321,6 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
 	return flags & EXT4_OTHER_FLMASK;
 }
 
-/*
- * Inode dynamic state flags
- */
-#define EXT4_STATE_JDATA		0x00000001 /* journaled data exists */
-#define EXT4_STATE_NEW			0x00000002 /* inode is newly created */
-#define EXT4_STATE_XATTR		0x00000004 /* has in-inode xattrs */
-#define EXT4_STATE_NO_EXPAND		0x00000008 /* No space for expansion */
-#define EXT4_STATE_DA_ALLOC_CLOSE	0x00000010 /* Alloc DA blks on close */
-#define EXT4_STATE_EXT_MIGRATE		0x00000020 /* Inode is migrating */
-#define EXT4_STATE_DIO_UNWRITTEN	0x00000040 /* need convert on dio done*/
-
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
 	__u32 group;		/* Group number for this data */
@@ -364,19 +361,20 @@ struct ext4_new_group_data {
 	/* caller is from the direct IO path, request to creation of an
 	unitialized extents if not allocated, split the uninitialized
 	extent if blocks has been preallocated already*/
-#define EXT4_GET_BLOCKS_DIO			0x0008
+#define EXT4_GET_BLOCKS_PRE_IO			0x0008
 #define EXT4_GET_BLOCKS_CONVERT			0x0010
-#define EXT4_GET_BLOCKS_DIO_CREATE_EXT		(EXT4_GET_BLOCKS_DIO|\
+#define EXT4_GET_BLOCKS_IO_CREATE_EXT		(EXT4_GET_BLOCKS_PRE_IO|\
+					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+	/* Convert extent to initialized after IO complete */
+#define EXT4_GET_BLOCKS_IO_CONVERT_EXT	(EXT4_GET_BLOCKS_CONVERT|\
 					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
-	/* Convert extent to initialized after direct IO complete */
-#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT	(EXT4_GET_BLOCKS_CONVERT|\
-					 EXT4_GET_BLOCKS_DIO_CREATE_EXT)
 
 /*
  * Flags used by ext4_free_blocks
  */
 #define EXT4_FREE_BLOCKS_METADATA	0x0001
 #define EXT4_FREE_BLOCKS_FORGET		0x0002
+#define EXT4_FREE_BLOCKS_VALIDATED	0x0004
 
 /*
  * ioctl commands
@@ -630,7 +628,7 @@ struct ext4_inode_info {
 	 * near to their parent directory's inode.
 	 */
 	ext4_group_t	i_block_group;
-	__u32	i_state;		/* Dynamic state flags for ext4 */
+	unsigned long	i_state_flags;	/* Dynamic state flags */
 
 	ext4_lblk_t		i_dir_start_lookup;
 #ifdef CONFIG_EXT4_FS_XATTR
@@ -708,8 +706,9 @@ struct ext4_inode_info {
 	qsize_t i_reserved_quota;
 #endif
 
-	/* completed async DIOs that might need unwritten extents handling */
-	struct list_head i_aio_dio_complete_list;
+	/* completed IOs that might need unwritten extents handling */
+	struct list_head i_completed_io_list;
+	spinlock_t i_completed_io_lock;
 	/* current io_end structure for async DIO write*/
 	ext4_io_end_t *cur_aio_dio;
 
@@ -760,6 +759,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT4_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
+#define EXT4_MOUNT_DIOREAD_NOLOCK	0x400000 /* Enable support for dio read nolocking */
#define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION		0x2000000 /* i_version support */
@@ -1014,7 +1014,7 @@ struct ext4_sb_info {
 	atomic_t s_lock_busy;
 
 	/* locality groups */
-	struct ext4_locality_group *s_locality_groups;
+	struct ext4_locality_group __percpu *s_locality_groups;
 
 	/* for write statistics */
 	unsigned long s_sectors_written_start;
@@ -1050,6 +1050,34 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 		(ino >= EXT4_FIRST_INO(sb) &&
 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
 }
+
+/*
+ * Inode dynamic state flags
+ */
+enum {
+	EXT4_STATE_JDATA,		/* journaled data exists */
+	EXT4_STATE_NEW,			/* inode is newly created */
+	EXT4_STATE_XATTR,		/* has in-inode xattrs */
+	EXT4_STATE_NO_EXPAND,		/* No space for expansion */
+	EXT4_STATE_DA_ALLOC_CLOSE,	/* Alloc DA blks on close */
+	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
+	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
+};
+
+static inline int ext4_test_inode_state(struct inode *inode, int bit)
+{
+	return test_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
+
+static inline void ext4_set_inode_state(struct inode *inode, int bit)
+{
+	set_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
+
+static inline void ext4_clear_inode_state(struct inode *inode, int bit)
+{
+	clear_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
  * a kernel struct super_block.  This will allow us to call the feature-test
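
Aside: the state flags move from a __u32 bitmask updated with plain "|=" and "&=" to bit numbers used with test_bit()/set_bit()/clear_bit(), which are atomic and therefore safe against concurrent updates of unrelated bits in the same word. A user-space sketch of the same pattern, with the GCC builtin __sync_fetch_and_or() standing in for the kernel's set_bit():

	#include <stdio.h>

	enum { STATE_JDATA, STATE_NEW, STATE_XATTR };	/* bit numbers */

	static unsigned long state_flags;

	static void set_state(int bit)
	{
		/* atomic read-modify-write, like set_bit() */
		__sync_fetch_and_or(&state_flags, 1UL << bit);
	}

	static int test_state(int bit)
	{
		return (state_flags >> bit) & 1UL;
	}

	int main(void)
	{
		set_state(STATE_NEW);
		printf("NEW=%d XATTR=%d\n",
		       test_state(STATE_NEW), test_state(STATE_XATTR));
		return 0;
	}
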
@@ -1126,6 +1154,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_FEATURE_INCOMPAT_64BIT		0x0080
 #define EXT4_FEATURE_INCOMPAT_MMP		0x0100
 #define EXT4_FEATURE_INCOMPAT_FLEX_BG		0x0200
+#define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
+#define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
 
 #define EXT4_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT4_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1416,7 +1446,7 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create);
 
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
-extern int  ext4_write_inode(struct inode *, int);
+extern int  ext4_write_inode(struct inode *, struct writeback_control *);
 extern int  ext4_setattr(struct dentry *, struct iattr *);
 extern int  ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 				struct kstat *stat);
@@ -1439,7 +1469,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
-extern int flush_aio_dio_completed_IO(struct inode *inode);
+extern int flush_completed_IO(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
 					int used, int quota_claim);
 /* ioctl.c */
@@ -1465,13 +1495,20 @@ extern int ext4_group_extend(struct super_block *sb,
 				ext4_fsblk_t n_blocks_count);
 
 /* super.c */
-extern void ext4_error(struct super_block *, const char *, const char *, ...)
+extern void __ext4_error(struct super_block *, const char *, const char *, ...)
+	__attribute__ ((format (printf, 3, 4)));
+#define ext4_error(sb, message...)	__ext4_error(sb, __func__, ## message)
+extern void ext4_error_inode(const char *, struct inode *, const char *, ...)
+	__attribute__ ((format (printf, 3, 4)));
+extern void ext4_error_file(const char *, struct file *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
 extern void __ext4_std_error(struct super_block *, const char *, int);
 extern void ext4_abort(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
-extern void ext4_warning(struct super_block *, const char *, const char *, ...)
+extern void __ext4_warning(struct super_block *, const char *,
+			  const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
+#define ext4_warning(sb, message...)	__ext4_warning(sb, __func__, ## message)
 extern void ext4_msg(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
 extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
@@ -1744,7 +1781,7 @@ extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
 			  loff_t len);
 extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-			  loff_t len);
+			  ssize_t len);
 extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
 			   sector_t block, unsigned int max_blocks,
 			   struct buffer_head *bh, int flags);
@@ -1756,6 +1793,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 len, __u64 *moved_len);
 
 
+/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+enum ext4_state_bits {
+	BH_Uninit	/* blocks are allocated but uninitialized on disk */
+	 = BH_JBDPrivateStart,
+};
+
+BUFFER_FNS(Uninit, uninit)
+TAS_BUFFER_FNS(Uninit, uninit)
+
 /*
  * Add new method to test wether block and inode bitmaps are properly
  * initialized. With uninit_bg reading the block from disk is not enough
@@ -1773,6 +1819,8 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
 	set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
 }
 
+#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
+
 #endif	/* __KERNEL__ */
 
 #endif	/* _EXT4_H */
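
Aside: the new in_range() macro tests the closed interval [first, first + len - 1]; the extents code later in this patch switches its open-coded range checks over to it. Because it is a macro, each argument may be evaluated more than once, so callers must avoid side-effecting expressions. Exercised in user space:

	#include <assert.h>

	#define in_range(b, first, len) \
		((b) >= (first) && (b) <= (first) + (len) - 1)

	int main(void)
	{
		unsigned long long first = 100, len = 8;

		assert(in_range(100ULL, first, len));	/* first block */
		assert(in_range(107ULL, first, len));	/* last block  */
		assert(!in_range(108ULL, first, len));	/* one past end */
		return 0;
	}
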
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index b57e5c711b6d..53d2764d71ca 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -125,14 +125,14 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
 		ext4_journal_abort_handle(where, __func__, bh,
 					  handle, err);
 	} else {
-		if (inode && bh)
+		if (inode)
 			mark_buffer_dirty_inode(bh, inode);
 		else
 			mark_buffer_dirty(bh);
 		if (inode && inode_needs_sync(inode)) {
 			sync_dirty_buffer(bh);
 			if (buffer_req(bh) && !buffer_uptodate(bh)) {
-				ext4_error(inode->i_sb, __func__,
+				ext4_error(inode->i_sb,
 					   "IO error syncing inode, "
 					   "inode=%lu, block=%llu",
 					   inode->i_ino,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 05eca817d704..b79ad5126468 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -304,4 +304,28 @@ static inline int ext4_should_writeback_data(struct inode *inode)
 	return 0;
 }
 
+/*
+ * This function controls whether or not we should try to go down the
+ * dioread_nolock code paths, which makes it safe to avoid taking
+ * i_mutex for direct I/O reads.  This only works for extent-based
+ * files, and it doesn't work for nobh or if data journaling is
+ * enabled, since the dioread_nolock code uses b_private to pass
+ * information back to the I/O completion handler, and this conflicts
+ * with the jbd's use of b_private.
+ */
+static inline int ext4_should_dioread_nolock(struct inode *inode)
+{
+	if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
+		return 0;
+	if (test_opt(inode->i_sb, NOBH))
+		return 0;
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+		return 0;
+	if (ext4_should_journal_data(inode))
+		return 0;
+	return 1;
+}
+
 #endif	/* _EXT4_JBD2_H */
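
Aside: ext4_should_dioread_nolock() gates the new lockless direct-I/O read path added by this series; it only fires for extent-mapped regular files without nobh or data journaling. From user space the feature is requested per mount. A hypothetical sketch (device and mount point are placeholders, and the option only exists on kernels carrying this patch):

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		if (mount("/dev/sdb1", "/mnt/scratch", "ext4", 0,
			  "dioread_nolock") != 0) {
			perror("mount");
			return 1;
		}
		return 0;
	}
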
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 765a4826b118..94c8ee81f5e1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -195,8 +195,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 		if (S_ISREG(inode->i_mode))
 			block_group++;
 	}
-	bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
-		le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
+	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
 	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 
 	/*
@@ -440,7 +439,7 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
 	return 0;
 
 corrupted:
-	ext4_error(inode->i_sb, function,
+	__ext4_error(inode->i_sb, function,
 			"bad header/extent in inode #%lu: %s - magic %x, "
 			"entries %u, max %u(%u), depth %u(%u)",
 			inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
@@ -703,7 +702,12 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 		}
 		eh = ext_block_hdr(bh);
 		ppos++;
-		BUG_ON(ppos > depth);
+		if (unlikely(ppos > depth)) {
+			put_bh(bh);
+			EXT4_ERROR_INODE(inode,
+					 "ppos %d > depth %d", ppos, depth);
+			goto err;
+		}
 		path[ppos].p_bh = bh;
 		path[ppos].p_hdr = eh;
 		i--;
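
Aside: this hunk is the first of many in this patch with the same shape: a BUG_ON() that would panic the machine on corrupted on-disk metadata becomes an EXT4_ERROR_INODE() report plus an -EIO return, so the corruption is surfaced instead of crashing. The bare pattern, in plain C (validate() is an illustrative stand-in, not a kernel function):

	#include <errno.h>
	#include <stdio.h>

	static int validate(int ppos, int depth)
	{
		if (ppos > depth) {	/* was: BUG_ON(ppos > depth) */
			fprintf(stderr, "ppos %d > depth %d\n", ppos, depth);
			return -EIO;	/* degrade gracefully */
		}
		return 0;
	}

	int main(void)
	{
		return validate(3, 2) == -EIO ? 0 : 1;
	}
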
@@ -749,7 +753,12 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 	if (err)
 		return err;
 
-	BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
+	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
+		EXT4_ERROR_INODE(inode,
+				 "logical %d == ei_block %d!",
+				 logical, le32_to_cpu(curp->p_idx->ei_block));
+		return -EIO;
+	}
 	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
 	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
 		/* insert after */
@@ -779,9 +788,17 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 	ext4_idx_store_pblock(ix, ptr);
 	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
 
-	BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
-			     > le16_to_cpu(curp->p_hdr->eh_max));
-	BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));
+	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
+			     > le16_to_cpu(curp->p_hdr->eh_max))) {
+		EXT4_ERROR_INODE(inode,
+				 "logical %d == ei_block %d!",
+				 logical, le32_to_cpu(curp->p_idx->ei_block));
+		return -EIO;
+	}
+	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
+		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
+		return -EIO;
+	}
 
 	err = ext4_ext_dirty(handle, inode, curp);
 	ext4_std_error(inode->i_sb, err);
@@ -819,7 +836,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
 	/* if current leaf will be split, then we should use
 	 * border from split point */
-	BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr));
+	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
+		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
+		return -EIO;
+	}
 	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
 		border = path[depth].p_ext[1].ee_block;
 		ext_debug("leaf will be split."
@@ -860,7 +880,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
 	/* initialize new leaf */
 	newblock = ablocks[--a];
-	BUG_ON(newblock == 0);
+	if (unlikely(newblock == 0)) {
+		EXT4_ERROR_INODE(inode, "newblock == 0!");
+		err = -EIO;
+		goto cleanup;
+	}
 	bh = sb_getblk(inode->i_sb, newblock);
 	if (!bh) {
 		err = -EIO;
@@ -880,7 +904,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	ex = EXT_FIRST_EXTENT(neh);
 
 	/* move remainder of path[depth] to the new leaf */
-	BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max);
+	if (unlikely(path[depth].p_hdr->eh_entries !=
+		     path[depth].p_hdr->eh_max)) {
+		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
+				 path[depth].p_hdr->eh_entries,
+				 path[depth].p_hdr->eh_max);
+		err = -EIO;
+		goto cleanup;
+	}
 	/* start copy from next extent */
 	/* TODO: we could do it by single memmove */
 	m = 0;
@@ -927,7 +958,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
 	/* create intermediate indexes */
 	k = depth - at - 1;
-	BUG_ON(k < 0);
+	if (unlikely(k < 0)) {
+		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
+		err = -EIO;
+		goto cleanup;
+	}
 	if (k)
 		ext_debug("create %d intermediate indices\n", k);
 	/* insert new index into current index block */
@@ -964,8 +999,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
 		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
 				EXT_MAX_INDEX(path[i].p_hdr));
-		BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
-				EXT_LAST_INDEX(path[i].p_hdr));
+		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
+					EXT_LAST_INDEX(path[i].p_hdr))) {
+			EXT4_ERROR_INODE(inode,
+					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
+					 le32_to_cpu(path[i].p_ext->ee_block));
+			err = -EIO;
+			goto cleanup;
+		}
 		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
 			ext_debug("%d: move %d:%llu in new index %llu\n", i,
 					le32_to_cpu(path[i].p_idx->ei_block),
@@ -1203,7 +1244,10 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
 	struct ext4_extent *ex;
 	int depth, ee_len;
 
-	BUG_ON(path == NULL);
+	if (unlikely(path == NULL)) {
+		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
+		return -EIO;
+	}
 	depth = path->p_depth;
 	*phys = 0;
 
@@ -1217,15 +1261,33 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
 	ex = path[depth].p_ext;
 	ee_len = ext4_ext_get_actual_len(ex);
 	if (*logical < le32_to_cpu(ex->ee_block)) {
-		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
+		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
+			EXT4_ERROR_INODE(inode,
+					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
+					 *logical, le32_to_cpu(ex->ee_block));
+			return -EIO;
+		}
 		while (--depth >= 0) {
 			ix = path[depth].p_idx;
-			BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
+			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
+				EXT4_ERROR_INODE(inode,
+				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
+				  ix != NULL ? ix->ei_block : 0,
+				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
+					 EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
+				  depth);
+				return -EIO;
+			}
 		}
 		return 0;
 	}
 
-	BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
+	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
+		EXT4_ERROR_INODE(inode,
+				 "logical %d < ee_block %d + ee_len %d!",
+				 *logical, le32_to_cpu(ex->ee_block), ee_len);
+		return -EIO;
+	}
 
 	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
 	*phys = ext_pblock(ex) + ee_len - 1;
@@ -1251,7 +1313,10 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 	int depth;	/* Note, NOT eh_depth; depth from top of tree */
 	int ee_len;
 
-	BUG_ON(path == NULL);
+	if (unlikely(path == NULL)) {
+		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
+		return -EIO;
+	}
 	depth = path->p_depth;
 	*phys = 0;
 
@@ -1265,17 +1330,32 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 	ex = path[depth].p_ext;
 	ee_len = ext4_ext_get_actual_len(ex);
 	if (*logical < le32_to_cpu(ex->ee_block)) {
-		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
+		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
+			EXT4_ERROR_INODE(inode,
+					 "first_extent(path[%d].p_hdr) != ex",
+					 depth);
+			return -EIO;
+		}
 		while (--depth >= 0) {
 			ix = path[depth].p_idx;
-			BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
+			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
+				EXT4_ERROR_INODE(inode,
+						 "ix != EXT_FIRST_INDEX *logical %d!",
+						 *logical);
+				return -EIO;
+			}
 		}
 		*logical = le32_to_cpu(ex->ee_block);
 		*phys = ext_pblock(ex);
 		return 0;
 	}
 
-	BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
+	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
+		EXT4_ERROR_INODE(inode,
+				 "logical %d < ee_block %d + ee_len %d!",
+				 *logical, le32_to_cpu(ex->ee_block), ee_len);
+		return -EIO;
+	}
 
 	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
 		/* next allocated block in this leaf */
@@ -1414,8 +1494,12 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
 
 	eh = path[depth].p_hdr;
 	ex = path[depth].p_ext;
-	BUG_ON(ex == NULL);
-	BUG_ON(eh == NULL);
+
+	if (unlikely(ex == NULL || eh == NULL)) {
+		EXT4_ERROR_INODE(inode,
+				 "ex %p == NULL or eh %p == NULL", ex, eh);
+		return -EIO;
+	}
 
 	if (depth == 0) {
 		/* there is no tree at all */
@@ -1538,8 +1622,9 @@ int ext4_ext_try_to_merge(struct inode *inode,
 		merge_done = 1;
 		WARN_ON(eh->eh_entries == 0);
 		if (!eh->eh_entries)
-			ext4_error(inode->i_sb, "ext4_ext_try_to_merge",
-			   "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
+			ext4_error(inode->i_sb,
+				   "inode#%lu, eh->eh_entries = 0!",
+				   inode->i_ino);
 	}
 
 	return merge_done;
@@ -1612,13 +1697,19 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	ext4_lblk_t next;
 	unsigned uninitialized = 0;
 
-	BUG_ON(ext4_ext_get_actual_len(newext) == 0);
+	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
+		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
+		return -EIO;
+	}
 	depth = ext_depth(inode);
 	ex = path[depth].p_ext;
-	BUG_ON(path[depth].p_hdr == NULL);
+	if (unlikely(path[depth].p_hdr == NULL)) {
+		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+		return -EIO;
+	}
 
 	/* try to insert block into found extent and return */
-	if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
 		&& ext4_can_extents_be_merged(inode, ex, newext)) {
 		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
 				ext4_ext_is_uninitialized(newext),
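
Aside: note the test change from "flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT" to "!(flag & EXT4_GET_BLOCKS_PRE_IO)". Once the get_blocks flags are treated as independently combinable bits, an equality test would miss any combination that carries extra bits. A small demonstration (the constants are illustrative values, not copied from the header):

	#include <assert.h>

	#define GB_CREATE_UNINIT_EXT	0x0002
	#define GB_PRE_IO		0x0008

	int main(void)
	{
		int flag = GB_PRE_IO | GB_CREATE_UNINIT_EXT;

		assert(flag != GB_PRE_IO);	/* equality test misses it */
		assert(flag & GB_PRE_IO);	/* bit test still fires    */
		return 0;
	}
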
@@ -1739,7 +1830,7 @@ has_space:
 
 merge:
 	/* try to merge extents to the right */
-	if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
 		ext4_ext_try_to_merge(inode, path, nearex);
 
 	/* try to merge extents to the left */
@@ -1787,7 +1878,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 		}
 
 		depth = ext_depth(inode);
-		BUG_ON(path[depth].p_hdr == NULL);
+		if (unlikely(path[depth].p_hdr == NULL)) {
+			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+			err = -EIO;
+			break;
+		}
 		ex = path[depth].p_ext;
 		next = ext4_ext_next_allocated_block(path);
 
@@ -1838,7 +1933,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
 		}
 
-		BUG_ON(cbex.ec_len == 0);
+		if (unlikely(cbex.ec_len == 0)) {
+			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
+			err = -EIO;
+			break;
+		}
 		err = func(inode, path, &cbex, ex, cbdata);
 		ext4_ext_drop_refs(path);
 
@@ -1952,7 +2051,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 
 	BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
 			cex->ec_type != EXT4_EXT_CACHE_EXTENT);
-	if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
+	if (in_range(block, cex->ec_block, cex->ec_len)) {
 		ex->ee_block = cpu_to_le32(cex->ec_block);
 		ext4_ext_store_pblock(ex, cex->ec_start);
 		ex->ee_len = cpu_to_le16(cex->ec_len);
@@ -1981,7 +2080,10 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 	/* free index block */
 	path--;
 	leaf = idx_pblock(path->p_idx);
-	BUG_ON(path->p_hdr->eh_entries == 0);
+	if (unlikely(path->p_hdr->eh_entries == 0)) {
+		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
+		return -EIO;
+	}
 	err = ext4_ext_get_access(handle, inode, path);
 	if (err)
 		return err;
@@ -2119,8 +2221,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	if (!path[depth].p_hdr)
 		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
 	eh = path[depth].p_hdr;
-	BUG_ON(eh == NULL);
-
+	if (unlikely(path[depth].p_hdr == NULL)) {
+		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+		return -EIO;
+	}
 	/* find where to start removing */
 	ex = EXT_LAST_EXTENT(eh);
 
@@ -2983,7 +3087,7 @@ fix_extent_len:
 	ext4_ext_dirty(handle, inode, path + depth);
 	return err;
 }
-static int ext4_convert_unwritten_extents_dio(handle_t *handle,
+static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 					    struct inode *inode,
 					    struct ext4_ext_path *path)
 {
@@ -3063,8 +3167,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		  flags, allocated);
 	ext4_ext_show_leaf(inode, path);
 
-	/* DIO get_block() before submit the IO, split the extent */
-	if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+	/* get_block() before submit the IO, split the extent */
+	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
 		ret = ext4_split_unwritten_extents(handle,
 						inode, path, iblock,
 						max_blocks, flags);
@@ -3074,14 +3178,16 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		 * completed
 		 */
 		if (io)
-			io->flag = DIO_AIO_UNWRITTEN;
+			io->flag = EXT4_IO_UNWRITTEN;
 		else
-			EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
+			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+		if (ext4_should_dioread_nolock(inode))
+			set_buffer_uninit(bh_result);
 		goto out;
 	}
-	/* async DIO end_io complete, convert the filled extent to written */
-	if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
-		ret = ext4_convert_unwritten_extents_dio(handle, inode,
+	/* IO end_io complete, convert the filled extent to written */
+	if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
+		ret = ext4_convert_unwritten_extents_endio(handle, inode,
 							path);
 		if (ret >= 0)
 			ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -3185,7 +3291,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 {
 	struct ext4_ext_path *path = NULL;
 	struct ext4_extent_header *eh;
-	struct ext4_extent newex, *ex;
+	struct ext4_extent newex, *ex, *last_ex;
 	ext4_fsblk_t newblock;
 	int err = 0, depth, ret, cache_type;
 	unsigned int allocated = 0;
@@ -3237,10 +3343,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 * this situation is possible, though, _during_ tree modification;
 	 * this is why assert can't be put in ext4_ext_find_extent()
 	 */
-	if (path[depth].p_ext == NULL && depth != 0) {
-		ext4_error(inode->i_sb, __func__, "bad extent address "
-			   "inode: %lu, iblock: %d, depth: %d",
-			   inode->i_ino, iblock, depth);
+	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
+		EXT4_ERROR_INODE(inode, "bad extent address "
+				 "iblock: %d, depth: %d pblock %lld",
+				 iblock, depth, path[depth].p_block);
 		err = -EIO;
 		goto out2;
 	}
@@ -3258,7 +3364,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		 */
 		ee_len = ext4_ext_get_actual_len(ex);
 		/* if found extent covers block, simply return it */
-		if (iblock >= ee_block && iblock < ee_block + ee_len) {
+		if (in_range(iblock, ee_block, ee_len)) {
 			newblock = iblock - ee_block + ee_start;
 			/* number of remaining blocks in the extent */
 			allocated = ee_len - (iblock - ee_block);
@@ -3350,21 +3456,35 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
 		ext4_ext_mark_uninitialized(&newex);
 		/*
-		 * io_end structure was created for every async
-		 * direct IO write to the middle of the file.
-		 * To avoid unecessary convertion for every aio dio rewrite
-		 * to the mid of file, here we flag the IO that is really
-		 * need the convertion.
+		 * io_end structure was created for every IO write to an
+		 * uninitialized extent. To avoid unecessary conversion,
+		 * here we flag the IO that really needs the conversion.
 		 * For non asycn direct IO case, flag the inode state
 		 * that we need to perform convertion when IO is done.
 		 */
-		if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
 			if (io)
-				io->flag = DIO_AIO_UNWRITTEN;
+				io->flag = EXT4_IO_UNWRITTEN;
 			else
-				EXT4_I(inode)->i_state |=
-					EXT4_STATE_DIO_UNWRITTEN;;
+				ext4_set_inode_state(inode,
+						     EXT4_STATE_DIO_UNWRITTEN);
+		}
+		if (ext4_should_dioread_nolock(inode))
+			set_buffer_uninit(bh_result);
+	}
+
+	if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
+		if (unlikely(!eh->eh_entries)) {
+			EXT4_ERROR_INODE(inode,
+					 "eh->eh_entries == 0 ee_block %d",
+					 ex->ee_block);
+			err = -EIO;
+			goto out2;
 		}
+		last_ex = EXT_LAST_EXTENT(eh);
+		if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
+		    + ext4_ext_get_actual_len(last_ex))
+			EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
 	}
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
 	if (err) {
@@ -3499,6 +3619,13 @@ static void ext4_falloc_update_inode(struct inode *inode,
 			i_size_write(inode, new_size);
 		if (new_size > EXT4_I(inode)->i_disksize)
 			ext4_update_i_disksize(inode, new_size);
+	} else {
+		/*
+		 * Mark that we allocate beyond EOF so the subsequent truncate
+		 * can proceed even if the new size is the same as i_size.
+		 */
+		if (new_size > i_size_read(inode))
+			EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL;
 	}
 
 }
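
Aside: ext4_falloc_update_inode() now sets EXT4_EOFBLOCKS_FL when fallocate() leaves allocated blocks past i_size, and ext4_ext_get_blocks() above clears it once an allocation reaches past the last extent. The user-visible scenario, sketched with the standard Linux syscalls (the path is a placeholder):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/scratch/prealloc", O_CREAT | O_RDWR, 0644);

		/* allocate 1 MiB beyond EOF; i_size stays 0 */
		if (fd < 0 || fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20)) {
			perror("fallocate");
			return 1;
		}
		/* new size equals i_size, but the flag lets ext4 trim the
		 * preallocated blocks anyway */
		ftruncate(fd, 0);
		close(fd);
		return 0;
	}
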
@@ -3603,7 +3730,7 @@ retry:
  * Returns 0 on success.
  */
 int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-				    loff_t len)
+				    ssize_t len)
 {
 	handle_t *handle;
 	ext4_lblk_t block;
@@ -3635,7 +3762,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 		map_bh.b_state = 0;
 		ret = ext4_get_blocks(handle, inode, block,
 				      max_blocks, &map_bh,
-				      EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
+				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
 		if (ret <= 0) {
 			WARN_ON(ret <= 0);
 			printk(KERN_ERR "%s: ext4_ext_get_blocks "
@@ -3739,7 +3866,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
 	int error = 0;
 
 	/* in-inode? */
-	if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
 		struct ext4_iloc iloc;
 		int offset;	/* offset of xattr in inode */
 
@@ -3767,7 +3894,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len)
 {
 	ext4_lblk_t start_blk;
-	ext4_lblk_t len_blks;
 	int error = 0;
 
 	/* fallback to generic here if not in extents fmt */
@@ -3781,8 +3907,14 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
 		error = ext4_xattr_fiemap(inode, fieinfo);
 	} else {
+		ext4_lblk_t len_blks;
+		__u64 last_blk;
+
 		start_blk = start >> inode->i_sb->s_blocksize_bits;
-		len_blks = len >> inode->i_sb->s_blocksize_bits;
+		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
+		if (last_blk >= EXT_MAX_BLOCK)
+			last_blk = EXT_MAX_BLOCK-1;
+		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
 
 		/*
 		 * Walk the extent tree gathering extent information.
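
Aside: the old computation "len_blks = len >> blocksize_bits" truncated the byte range downward, so a request straddling a block boundary could cover zero blocks, and nothing clamped the result to EXT_MAX_BLOCK. The rewritten math shifts the inclusive last byte instead and clamps. The same arithmetic in isolation (MAX_BLOCK stands in for EXT_MAX_BLOCK):

	#include <stdio.h>

	#define MAX_BLOCK 0xffffffffULL

	int main(void)
	{
		unsigned int blkbits = 12;			/* 4 KiB blocks */
		unsigned long long start = 4095, len = 2;	/* crosses a boundary */
		unsigned long long start_blk = start >> blkbits;
		unsigned long long last_blk = (start + len - 1) >> blkbits;

		if (last_blk >= MAX_BLOCK)
			last_blk = MAX_BLOCK - 1;
		/* old math: len >> blkbits == 0 blocks; new math: 2 blocks */
		printf("blocks %llu..%llu (%llu total)\n", start_blk,
		       last_blk, last_blk - start_blk + 1);
		return 0;
	}
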
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 9630583cef28..d0776e410f34 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -23,6 +23,7 @@
 #include <linux/jbd2.h>
 #include <linux/mount.h>
 #include <linux/path.h>
+#include <linux/quotaops.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -35,9 +36,9 @@
  */
 static int ext4_release_file(struct inode *inode, struct file *filp)
 {
-	if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
+	if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
 		ext4_alloc_da_blocks(inode);
-		EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
+		ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
 	}
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
@@ -116,18 +117,16 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 		 * devices or filesystem images.
 		 */
 		memset(buf, 0, sizeof(buf));
-		path.mnt = mnt->mnt_parent;
-		path.dentry = mnt->mnt_mountpoint;
-		path_get(&path);
+		path.mnt = mnt;
+		path.dentry = mnt->mnt_root;
 		cp = d_path(&path, buf, sizeof(buf));
-		path_put(&path);
 		if (!IS_ERR(cp)) {
 			memcpy(sbi->s_es->s_last_mounted, cp,
 			       sizeof(sbi->s_es->s_last_mounted));
 			sb->s_dirt = 1;
 		}
 	}
-	return generic_file_open(inode, filp);
+	return dquot_file_open(inode, filp);
 }
 
 const struct file_operations ext4_file_operations = {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 98bd140aad01..0d0c3239c1cd 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -63,7 +63,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	if (inode->i_sb->s_flags & MS_RDONLY)
 		return 0;
 
-	ret = flush_aio_dio_completed_IO(inode);
+	ret = flush_completed_IO(inode);
 	if (ret < 0)
 		return ret;
 
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f3624ead4f6c..57f6eef6ccd6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -76,8 +76,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
 	/* If checksum is bad mark all blocks and inodes use to prevent
 	 * allocation, essentially implementing a per-group read-only flag. */
 	if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
-		ext4_error(sb, __func__, "Checksum bad for group %u",
-			   block_group);
+		ext4_error(sb, "Checksum bad for group %u", block_group);
 		ext4_free_blks_set(sb, gdp, 0);
 		ext4_free_inodes_set(sb, gdp, 0);
 		ext4_itable_unused_set(sb, gdp, 0);
@@ -111,8 +110,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 	bitmap_blk = ext4_inode_bitmap(sb, desc);
 	bh = sb_getblk(sb, bitmap_blk);
 	if (unlikely(!bh)) {
-		ext4_error(sb, __func__,
-			    "Cannot read inode bitmap - "
+		ext4_error(sb, "Cannot read inode bitmap - "
 			    "block_group = %u, inode_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
@@ -153,8 +151,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 	set_bitmap_uptodate(bh);
 	if (bh_submit_read(bh) < 0) {
 		put_bh(bh);
-		ext4_error(sb, __func__,
-			    "Cannot read inode bitmap - "
+		ext4_error(sb, "Cannot read inode bitmap - "
 			    "block_group = %u, inode_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
@@ -217,10 +214,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	 * Note: we must free any quota before locking the superblock,
 	 * as writing the quota to disk may need the lock as well.
 	 */
-	vfs_dq_init(inode);
+	dquot_initialize(inode);
 	ext4_xattr_delete_inode(handle, inode);
-	vfs_dq_free_inode(inode);
-	vfs_dq_drop(inode);
+	dquot_free_inode(inode);
+	dquot_drop(inode);
 
 	is_directory = S_ISDIR(inode->i_mode);
 
@@ -229,8 +226,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 
 	es = EXT4_SB(sb)->s_es;
 	if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
-		ext4_error(sb, "ext4_free_inode",
-			   "reserved or nonexistent inode %lu", ino);
+		ext4_error(sb, "reserved or nonexistent inode %lu", ino);
 		goto error_return;
 	}
 	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
@@ -248,8 +244,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
 					bit, bitmap_bh->b_data);
 	if (!cleared)
-		ext4_error(sb, "ext4_free_inode",
-			   "bit already cleared for inode %lu", ino);
+		ext4_error(sb, "bit already cleared for inode %lu", ino);
 	else {
 		gdp = ext4_get_group_desc(sb, block_group, &bh2);
 
@@ -268,7 +263,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 			ext4_group_t f;
 
 			f = ext4_flex_group(sbi, block_group);
-			atomic_dec(&sbi->s_flex_groups[f].free_inodes);
+			atomic_dec(&sbi->s_flex_groups[f].used_dirs);
 		}
 
 	}
@@ -736,8 +731,7 @@ static int ext4_claim_inode(struct super_block *sb,
 	if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
 			ino > EXT4_INODES_PER_GROUP(sb)) {
 		ext4_unlock_group(sb, group);
-		ext4_error(sb, __func__,
-			   "reserved inode or inode > inodes count - "
+		ext4_error(sb, "reserved inode or inode > inodes count - "
 			   "block_group = %u, inode=%lu", group,
 			   ino + group * EXT4_INODES_PER_GROUP(sb));
 		return 1;
@@ -779,7 +773,7 @@ static int ext4_claim_inode(struct super_block *sb,
 		if (sbi->s_log_groups_per_flex) {
 			ext4_group_t f = ext4_flex_group(sbi, group);
 
-			atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
 		}
 	}
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
@@ -904,7 +898,7 @@ repeat_in_this_group:
 				BUFFER_TRACE(inode_bitmap_bh,
 					"call ext4_handle_dirty_metadata");
 				err = ext4_handle_dirty_metadata(handle,
-								 inode,
+								 NULL,
 							inode_bitmap_bh);
 				if (err)
 					goto fail;
@@ -1029,15 +1023,16 @@ got:
 	inode->i_generation = sbi->s_next_generation++;
 	spin_unlock(&sbi->s_next_gen_lock);
 
-	ei->i_state = EXT4_STATE_NEW;
+	ei->i_state_flags = 0;
+	ext4_set_inode_state(inode, EXT4_STATE_NEW);
 
 	ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
 
 	ret = inode;
-	if (vfs_dq_alloc_inode(inode)) {
-		err = -EDQUOT;
+	dquot_initialize(inode);
+	err = dquot_alloc_inode(inode);
+	if (err)
 		goto fail_drop;
-	}
 
 	err = ext4_init_acl(handle, inode, dir);
 	if (err)
@@ -1074,10 +1069,10 @@ really_out:
 	return ret;
 
 fail_free_drop:
-	vfs_dq_free_inode(inode);
+	dquot_free_inode(inode);
 
 fail_drop:
-	vfs_dq_drop(inode);
+	dquot_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
 	unlock_new_inode(inode);
@@ -1098,8 +1093,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
 
 	/* Error cases - e2fsck has already cleaned up for us */
 	if (ino > max_ino) {
-		ext4_warning(sb, __func__,
-			     "bad orphan ino %lu! e2fsck was run?", ino);
+		ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino);
 		goto error;
 	}
 
@@ -1107,8 +1101,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
 	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
 	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
 	if (!bitmap_bh) {
-		ext4_warning(sb, __func__,
-			     "inode bitmap error for orphan %lu", ino);
+		ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
 		goto error;
 	}
 
@@ -1140,8 +1133,7 @@ iget_failed:
 	err = PTR_ERR(inode);
 	inode = NULL;
 bad_orphan:
-	ext4_warning(sb, __func__,
-		     "bad orphan inode %lu! e2fsck was run?", ino);
+	ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino);
 	printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
 	       bit, (unsigned long long)bitmap_bh->b_blocknr,
 	       ext4_test_bit(bit, bitmap_bh->b_data));
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e11952404e02..11119e07233b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -38,6 +38,7 @@
38#include <linux/uio.h> 38#include <linux/uio.h>
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h> 40#include <linux/workqueue.h>
41#include <linux/kernel.h>
41 42
42#include "ext4_jbd2.h" 43#include "ext4_jbd2.h"
43#include "xattr.h" 44#include "xattr.h"
@@ -170,6 +171,9 @@ void ext4_delete_inode(struct inode *inode)
170 handle_t *handle; 171 handle_t *handle;
171 int err; 172 int err;
172 173
174 if (!is_bad_inode(inode))
175 dquot_initialize(inode);
176
173 if (ext4_should_order_data(inode)) 177 if (ext4_should_order_data(inode))
174 ext4_begin_ordered_truncate(inode, 0); 178 ext4_begin_ordered_truncate(inode, 0);
175 truncate_inode_pages(&inode->i_data, 0); 179 truncate_inode_pages(&inode->i_data, 0);
@@ -194,7 +198,7 @@ void ext4_delete_inode(struct inode *inode)
194 inode->i_size = 0; 198 inode->i_size = 0;
195 err = ext4_mark_inode_dirty(handle, inode); 199 err = ext4_mark_inode_dirty(handle, inode);
196 if (err) { 200 if (err) {
197 ext4_warning(inode->i_sb, __func__, 201 ext4_warning(inode->i_sb,
198 "couldn't mark inode dirty (err %d)", err); 202 "couldn't mark inode dirty (err %d)", err);
199 goto stop_handle; 203 goto stop_handle;
200 } 204 }
@@ -212,7 +216,7 @@ void ext4_delete_inode(struct inode *inode)
212 if (err > 0) 216 if (err > 0)
213 err = ext4_journal_restart(handle, 3); 217 err = ext4_journal_restart(handle, 3);
214 if (err != 0) { 218 if (err != 0) {
215 ext4_warning(inode->i_sb, __func__, 219 ext4_warning(inode->i_sb,
216 "couldn't extend journal (err %d)", err); 220 "couldn't extend journal (err %d)", err);
217 stop_handle: 221 stop_handle:
218 ext4_journal_stop(handle); 222 ext4_journal_stop(handle);
@@ -323,8 +327,7 @@ static int ext4_block_to_path(struct inode *inode,
323 offsets[n++] = i_block & (ptrs - 1); 327 offsets[n++] = i_block & (ptrs - 1);
324 final = ptrs; 328 final = ptrs;
325 } else { 329 } else {
326 ext4_warning(inode->i_sb, "ext4_block_to_path", 330 ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
327 "block %lu > max in inode %lu",
328 i_block + direct_blocks + 331 i_block + direct_blocks +
329 indirect_blocks + double_blocks, inode->i_ino); 332 indirect_blocks + double_blocks, inode->i_ino);
330 } 333 }
@@ -344,7 +347,7 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
344 if (blk && 347 if (blk &&
345 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 348 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
346 blk, 1))) { 349 blk, 1))) {
347 ext4_error(inode->i_sb, function, 350 __ext4_error(inode->i_sb, function,
348 "invalid block reference %u " 351 "invalid block reference %u "
349 "in inode #%lu", blk, inode->i_ino); 352 "in inode #%lu", blk, inode->i_ino);
350 return -EIO; 353 return -EIO;
@@ -607,7 +610,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
607 if (*err) 610 if (*err)
608 goto failed_out; 611 goto failed_out;
609 612
610 BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS); 613 if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
614 EXT4_ERROR_INODE(inode,
615 "current_block %llu + count %lu > %d!",
616 current_block, count,
617 EXT4_MAX_BLOCK_FILE_PHYS);
618 *err = -EIO;
619 goto failed_out;
620 }
611 621
612 target -= count; 622 target -= count;
613 /* allocate blocks for indirect blocks */ 623 /* allocate blocks for indirect blocks */
@@ -643,7 +653,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
643 ar.flags = EXT4_MB_HINT_DATA; 653 ar.flags = EXT4_MB_HINT_DATA;
644 654
645 current_block = ext4_mb_new_blocks(handle, &ar, err); 655 current_block = ext4_mb_new_blocks(handle, &ar, err);
646 BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS); 656 if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
657 EXT4_ERROR_INODE(inode,
658 "current_block %llu + ar.len %d > %d!",
659 current_block, ar.len,
660 EXT4_MAX_BLOCK_FILE_PHYS);
661 *err = -EIO;
662 goto failed_out;
663 }
647 664
648 if (*err && (target == blks)) { 665 if (*err && (target == blks)) {
649 /* 666 /*
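
The two hunks above downgrade a BUG_ON() into a reported error: a corrupted on-disk bitmap can hand the allocator blocks past EXT4_MAX_BLOCK_FILE_PHYS, and that should fail the allocation with -EIO rather than crash the machine. A minimal userspace sketch of the same conversion (claim_range() and report() are invented stand-ins, not ext4 functions):

#include <errno.h>
#include <stdio.h>

/* stand-in for EXT4_ERROR_INODE(); just logs */
static void report(unsigned long long block, unsigned long count,
                   unsigned long long max)
{
        fprintf(stderr, "block %llu + count %lu > %llu!\n",
                block, count, max);
}

/* was: BUG_ON(block + count > max_phys).  Bad on-disk data can trip
 * this, so report and fail the allocation instead of panicking. */
static int claim_range(unsigned long long block, unsigned long count,
                       unsigned long long max_phys)
{
        if (block + count > max_phys) {
                report(block, count, max_phys);
                return -EIO;
        }
        return 0;
}

int main(void)
{
        printf("%d\n", claim_range(100, 10, 1000));  /* 0 */
        printf("%d\n", claim_range(995, 10, 1000));  /* -5 (EIO) */
        return 0;
}
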
@@ -1018,7 +1035,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1018 sector_t lblock) 1035 sector_t lblock)
1019{ 1036{
1020 struct ext4_inode_info *ei = EXT4_I(inode); 1037 struct ext4_inode_info *ei = EXT4_I(inode);
1021 int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1; 1038 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
1022 int blk_bits; 1039 int blk_bits;
1023 1040
1024 if (lblock < EXT4_NDIR_BLOCKS) 1041 if (lblock < EXT4_NDIR_BLOCKS)
@@ -1033,7 +1050,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1033 } 1050 }
1034 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; 1051 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
1035 ei->i_da_metadata_calc_len = 1; 1052 ei->i_da_metadata_calc_len = 1;
1036 blk_bits = roundup_pow_of_two(lblock + 1); 1053 blk_bits = order_base_2(lblock);
1037 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; 1054 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
1038} 1055}
1039 1056
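
Two fixes land in the hunk above: dind_mask previously kept the low bits (an offset inside the indirect block) instead of clearing them (an aligned boundary), with the sector_t cast keeping the inverted mask from truncating on 32-bit; and roundup_pow_of_two() returns the power of two itself where the division below needs a bit count, which order_base_2() provides. A sketch assuming 1024 pointers per indirect block; the helpers here are local stand-ins for the kernel ones:

#include <stdio.h>
#include <stdint.h>

#define ADDR_PER_BLOCK      1024ULL   /* pointers per indirect block */
#define ADDR_PER_BLOCK_BITS 10        /* log2 of the above */

static unsigned order_base_2(uint64_t n)   /* ceil(log2(n)) for n >= 1 */
{
        unsigned bits = 0;

        while ((1ULL << bits) < n)
                bits++;
        return bits;
}

int main(void)
{
        uint64_t lblock = 1234567;

        /* old mask kept the offset inside the block ... */
        uint64_t lo_mask = ADDR_PER_BLOCK - 1;
        /* ... the fixed mask rounds down to a boundary; the 64-bit
         * type matters, or the inversion would drop the high bits */
        uint64_t dind_mask = ~(ADDR_PER_BLOCK - 1);

        printf("offset   = %llu\n", (unsigned long long)(lblock & lo_mask));
        printf("boundary = %llu\n", (unsigned long long)(lblock & dind_mask));

        /* order_base_2() gives 21 here; roundup_pow_of_two(lblock + 1)
         * would give 2097152, which is not a bit count at all */
        printf("metadata blocks = %u\n",
               order_base_2(lblock) / ADDR_PER_BLOCK_BITS + 1);
        return 0;
}
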
@@ -1061,6 +1078,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
1061 int mdb_free = 0, allocated_meta_blocks = 0; 1078 int mdb_free = 0, allocated_meta_blocks = 0;
1062 1079
1063 spin_lock(&ei->i_block_reservation_lock); 1080 spin_lock(&ei->i_block_reservation_lock);
1081 trace_ext4_da_update_reserve_space(inode, used);
1064 if (unlikely(used > ei->i_reserved_data_blocks)) { 1082 if (unlikely(used > ei->i_reserved_data_blocks)) {
1065 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " 1083 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
1066 "with only %d reserved data blocks\n", 1084 "with only %d reserved data blocks\n",
@@ -1093,9 +1111,9 @@ void ext4_da_update_reserve_space(struct inode *inode,
1093 1111
1094 /* Update quota subsystem */ 1112 /* Update quota subsystem */
1095 if (quota_claim) { 1113 if (quota_claim) {
1096 vfs_dq_claim_block(inode, used); 1114 dquot_claim_block(inode, used);
1097 if (mdb_free) 1115 if (mdb_free)
1098 vfs_dq_release_reservation_block(inode, mdb_free); 1116 dquot_release_reservation_block(inode, mdb_free);
1099 } else { 1117 } else {
1100 /* 1118 /*
1101 * We did fallocate with an offset that is already delayed 1119 * We did fallocate with an offset that is already delayed
@@ -1106,8 +1124,8 @@ void ext4_da_update_reserve_space(struct inode *inode,
1106 * that 1124 * that
1107 */ 1125 */
1108 if (allocated_meta_blocks) 1126 if (allocated_meta_blocks)
1109 vfs_dq_claim_block(inode, allocated_meta_blocks); 1127 dquot_claim_block(inode, allocated_meta_blocks);
1110 vfs_dq_release_reservation_block(inode, mdb_free + used); 1128 dquot_release_reservation_block(inode, mdb_free + used);
1111 } 1129 }
1112 1130
1113 /* 1131 /*
@@ -1124,7 +1142,7 @@ static int check_block_validity(struct inode *inode, const char *msg,
1124 sector_t logical, sector_t phys, int len) 1142 sector_t logical, sector_t phys, int len)
1125{ 1143{
1126 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1144 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
1127 ext4_error(inode->i_sb, msg, 1145 __ext4_error(inode->i_sb, msg,
1128 "inode #%lu logical block %llu mapped to %llu " 1146 "inode #%lu logical block %llu mapped to %llu "
1129 "(size %d)", inode->i_ino, 1147 "(size %d)", inode->i_ino,
1130 (unsigned long long) logical, 1148 (unsigned long long) logical,
@@ -1306,7 +1324,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1306 * i_data's format changing. Force the migrate 1324 * i_data's format changing. Force the migrate
1307 * to fail by clearing migrate flags 1325 * to fail by clearing migrate flags
1308 */ 1326 */
1309 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; 1327 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
1310 } 1328 }
1311 1329
1312 /* 1330 /*
@@ -1534,6 +1552,8 @@ static void ext4_truncate_failed_write(struct inode *inode)
1534 ext4_truncate(inode); 1552 ext4_truncate(inode);
1535} 1553}
1536 1554
1555static int ext4_get_block_write(struct inode *inode, sector_t iblock,
1556 struct buffer_head *bh_result, int create);
1537static int ext4_write_begin(struct file *file, struct address_space *mapping, 1557static int ext4_write_begin(struct file *file, struct address_space *mapping,
1538 loff_t pos, unsigned len, unsigned flags, 1558 loff_t pos, unsigned len, unsigned flags,
1539 struct page **pagep, void **fsdata) 1559 struct page **pagep, void **fsdata)
@@ -1575,8 +1595,12 @@ retry:
1575 } 1595 }
1576 *pagep = page; 1596 *pagep = page;
1577 1597
1578 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1598 if (ext4_should_dioread_nolock(inode))
1579 ext4_get_block); 1599 ret = block_write_begin(file, mapping, pos, len, flags, pagep,
1600 fsdata, ext4_get_block_write);
1601 else
1602 ret = block_write_begin(file, mapping, pos, len, flags, pagep,
1603 fsdata, ext4_get_block);
1580 1604
1581 if (!ret && ext4_should_journal_data(inode)) { 1605 if (!ret && ext4_should_journal_data(inode)) {
1582 ret = walk_page_buffers(handle, page_buffers(page), 1606 ret = walk_page_buffers(handle, page_buffers(page),
@@ -1793,7 +1817,7 @@ static int ext4_journalled_write_end(struct file *file,
1793 new_i_size = pos + copied; 1817 new_i_size = pos + copied;
1794 if (new_i_size > inode->i_size) 1818 if (new_i_size > inode->i_size)
1795 i_size_write(inode, pos+copied); 1819 i_size_write(inode, pos+copied);
1796 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 1820 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1797 if (new_i_size > EXT4_I(inode)->i_disksize) { 1821 if (new_i_size > EXT4_I(inode)->i_disksize) {
1798 ext4_update_i_disksize(inode, new_i_size); 1822 ext4_update_i_disksize(inode, new_i_size);
1799 ret2 = ext4_mark_inode_dirty(handle, inode); 1823 ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -1836,6 +1860,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1836 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1860 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1837 struct ext4_inode_info *ei = EXT4_I(inode); 1861 struct ext4_inode_info *ei = EXT4_I(inode);
1838 unsigned long md_needed, md_reserved; 1862 unsigned long md_needed, md_reserved;
1863 int ret;
1839 1864
1840 /* 1865 /*
1841 * recalculate the amount of metadata blocks to reserve 1866 * recalculate the amount of metadata blocks to reserve
@@ -1846,6 +1871,7 @@ repeat:
1846 spin_lock(&ei->i_block_reservation_lock); 1871 spin_lock(&ei->i_block_reservation_lock);
1847 md_reserved = ei->i_reserved_meta_blocks; 1872 md_reserved = ei->i_reserved_meta_blocks;
1848 md_needed = ext4_calc_metadata_amount(inode, lblock); 1873 md_needed = ext4_calc_metadata_amount(inode, lblock);
1874 trace_ext4_da_reserve_space(inode, md_needed);
1849 spin_unlock(&ei->i_block_reservation_lock); 1875 spin_unlock(&ei->i_block_reservation_lock);
1850 1876
1851 /* 1877 /*
@@ -1853,11 +1879,12 @@ repeat:
1853 * later. Real quota accounting is done at pages writeout 1879 * later. Real quota accounting is done at pages writeout
1854 * time. 1880 * time.
1855 */ 1881 */
1856 if (vfs_dq_reserve_block(inode, md_needed + 1)) 1882 ret = dquot_reserve_block(inode, md_needed + 1);
1857 return -EDQUOT; 1883 if (ret)
1884 return ret;
1858 1885
1859 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1886 if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
1860 vfs_dq_release_reservation_block(inode, md_needed + 1); 1887 dquot_release_reservation_block(inode, md_needed + 1);
1861 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1888 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1862 yield(); 1889 yield();
1863 goto repeat; 1890 goto repeat;
@@ -1914,7 +1941,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1914 1941
1915 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1942 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1916 1943
1917 vfs_dq_release_reservation_block(inode, to_free); 1944 dquot_release_reservation_block(inode, to_free);
1918} 1945}
1919 1946
1920static void ext4_da_page_release_reservation(struct page *page, 1947static void ext4_da_page_release_reservation(struct page *page,
@@ -2091,6 +2118,8 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2091 } else if (buffer_mapped(bh)) 2118 } else if (buffer_mapped(bh))
2092 BUG_ON(bh->b_blocknr != pblock); 2119 BUG_ON(bh->b_blocknr != pblock);
2093 2120
2121 if (buffer_uninit(exbh))
2122 set_buffer_uninit(bh);
2094 cur_logical++; 2123 cur_logical++;
2095 pblock++; 2124 pblock++;
2096 } while ((bh = bh->b_this_page) != head); 2125 } while ((bh = bh->b_this_page) != head);
@@ -2133,17 +2162,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2133 break; 2162 break;
2134 for (i = 0; i < nr_pages; i++) { 2163 for (i = 0; i < nr_pages; i++) {
2135 struct page *page = pvec.pages[i]; 2164 struct page *page = pvec.pages[i];
2136 index = page->index; 2165 if (page->index > end)
2137 if (index > end)
2138 break; 2166 break;
2139 index++;
2140
2141 BUG_ON(!PageLocked(page)); 2167 BUG_ON(!PageLocked(page));
2142 BUG_ON(PageWriteback(page)); 2168 BUG_ON(PageWriteback(page));
2143 block_invalidatepage(page, 0); 2169 block_invalidatepage(page, 0);
2144 ClearPageUptodate(page); 2170 ClearPageUptodate(page);
2145 unlock_page(page); 2171 unlock_page(page);
2146 } 2172 }
2173 index = pvec.pages[nr_pages - 1]->index + 1;
2174 pagevec_release(&pvec);
2147 } 2175 }
2148 return; 2176 return;
2149} 2177}
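
The invalidatepages fix above switches to the usual batched-lookup idiom: advance the cursor past the last page the lookup actually returned, and release the batch's page references each round. A self-contained sketch of that cursor pattern, with find_items() as a hypothetical stand-in for pagevec_lookup():

#include <stdio.h>

#define NR_ITEMS 10
#define BATCH    4

/* pretend lookup: returns up to 'max' indices >= start */
static int find_items(unsigned long start, unsigned long *out, int max)
{
        int n = 0;

        while (n < max && start + n < NR_ITEMS) {
                out[n] = start + n;
                n++;
        }
        return n;
}

int main(void)
{
        unsigned long index = 0, end = NR_ITEMS - 1;
        unsigned long batch[BATCH];
        int i, nr;

        while (index <= end) {
                nr = find_items(index, batch, BATCH);
                if (nr == 0)
                        break;
                for (i = 0; i < nr; i++) {
                        if (batch[i] > end)
                                break;
                        printf("invalidate page %lu\n", batch[i]);
                }
                /* advance past the last page actually returned; the old
                 * per-page "index++" drifted when the lookup skipped
                 * holes, and the batch references were never dropped */
                index = batch[nr - 1] + 1;
                /* pagevec_release() would drop the references here */
        }
        return 0;
}
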
@@ -2220,6 +2248,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2220 */ 2248 */
2221 new.b_state = 0; 2249 new.b_state = 0;
2222 get_blocks_flags = EXT4_GET_BLOCKS_CREATE; 2250 get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2251 if (ext4_should_dioread_nolock(mpd->inode))
2252 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2223 if (mpd->b_state & (1 << BH_Delay)) 2253 if (mpd->b_state & (1 << BH_Delay))
2224 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2254 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2225 2255
@@ -2630,11 +2660,14 @@ static int __ext4_journalled_writepage(struct page *page,
2630 ret = err; 2660 ret = err;
2631 2661
2632 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); 2662 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
2633 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 2663 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
2634out: 2664out:
2635 return ret; 2665 return ret;
2636} 2666}
2637 2667
2668static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
2669static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2670
2638/* 2671/*
2639 * Note that we don't need to start a transaction unless we're journaling data 2672 * Note that we don't need to start a transaction unless we're journaling data
2640 * because we should have holes filled from ext4_page_mkwrite(). We even don't 2673 * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -2682,7 +2715,7 @@ static int ext4_writepage(struct page *page,
2682 int ret = 0; 2715 int ret = 0;
2683 loff_t size; 2716 loff_t size;
2684 unsigned int len; 2717 unsigned int len;
2685 struct buffer_head *page_bufs; 2718 struct buffer_head *page_bufs = NULL;
2686 struct inode *inode = page->mapping->host; 2719 struct inode *inode = page->mapping->host;
2687 2720
2688 trace_ext4_writepage(inode, page); 2721 trace_ext4_writepage(inode, page);
@@ -2758,7 +2791,11 @@ static int ext4_writepage(struct page *page,
2758 2791
2759 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2792 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
2760 ret = nobh_writepage(page, noalloc_get_block_write, wbc); 2793 ret = nobh_writepage(page, noalloc_get_block_write, wbc);
2761 else 2794 else if (page_bufs && buffer_uninit(page_bufs)) {
2795 ext4_set_bh_endio(page_bufs, inode);
2796 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2797 wbc, ext4_end_io_buffer_write);
2798 } else
2762 ret = block_write_full_page(page, noalloc_get_block_write, 2799 ret = block_write_full_page(page, noalloc_get_block_write,
2763 wbc); 2800 wbc);
2764 2801
@@ -3301,7 +3338,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3301 filemap_write_and_wait(mapping); 3338 filemap_write_and_wait(mapping);
3302 } 3339 }
3303 3340
3304 if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 3341 if (EXT4_JOURNAL(inode) &&
3342 ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
3305 /* 3343 /*
3306 * This is a REALLY heavyweight approach, but the use of 3344 * This is a REALLY heavyweight approach, but the use of
3307 * bmap on dirty files is expected to be extremely rare: 3345 * bmap on dirty files is expected to be extremely rare:
@@ -3320,7 +3358,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3320 * everything they get. 3358 * everything they get.
3321 */ 3359 */
3322 3360
3323 EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; 3361 ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
3324 journal = EXT4_JOURNAL(inode); 3362 journal = EXT4_JOURNAL(inode);
3325 jbd2_journal_lock_updates(journal); 3363 jbd2_journal_lock_updates(journal);
3326 err = jbd2_journal_flush(journal); 3364 err = jbd2_journal_flush(journal);
@@ -3345,11 +3383,45 @@ ext4_readpages(struct file *file, struct address_space *mapping,
3345 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3383 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3346} 3384}
3347 3385
3386static void ext4_free_io_end(ext4_io_end_t *io)
3387{
3388 BUG_ON(!io);
3389 if (io->page)
3390 put_page(io->page);
3391 iput(io->inode);
3392 kfree(io);
3393}
3394
3395static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
3396{
3397 struct buffer_head *head, *bh;
3398 unsigned int curr_off = 0;
3399
3400 if (!page_has_buffers(page))
3401 return;
3402 head = bh = page_buffers(page);
3403 do {
3404 if (offset <= curr_off && test_clear_buffer_uninit(bh)
3405 && bh->b_private) {
3406 ext4_free_io_end(bh->b_private);
3407 bh->b_private = NULL;
3408 bh->b_end_io = NULL;
3409 }
3410 curr_off = curr_off + bh->b_size;
3411 bh = bh->b_this_page;
3412 } while (bh != head);
3413}
3414
3348static void ext4_invalidatepage(struct page *page, unsigned long offset) 3415static void ext4_invalidatepage(struct page *page, unsigned long offset)
3349{ 3416{
3350 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3417 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3351 3418
3352 /* 3419 /*
3420 * free any io_end structure allocated for buffers to be discarded
3421 */
3422 if (ext4_should_dioread_nolock(page->mapping->host))
3423 ext4_invalidatepage_free_endio(page, offset);
3424 /*
3353 * If it's a full truncate we just forget about the pending dirtying 3425 * If it's a full truncate we just forget about the pending dirtying
3354 */ 3426 */
3355 if (offset == 0) 3427 if (offset == 0)
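
ext4_invalidatepage_free_endio(), added above, walks the page's circular buffer ring (linked through b_this_page), tracking each buffer's byte offset so io_ends attached at or past the invalidation offset can be freed. A self-contained sketch of that traversal with invented types:

#include <stdio.h>

/* stand-in buffer_head: size in bytes, circular b_this_page link */
struct buf {
        struct buf *next;
        unsigned size;
        int uninit;             /* has a pending io_end attached */
};

static void free_endio(struct buf *head, unsigned offset)
{
        struct buf *bh = head;
        unsigned curr_off = 0;

        do {
                if (offset <= curr_off && bh->uninit) {
                        printf("discard io_end at offset %u\n", curr_off);
                        bh->uninit = 0;
                }
                curr_off += bh->size;
                bh = bh->next;
        } while (bh != head);
}

int main(void)
{
        struct buf a, b, c;

        a = (struct buf){ &b, 1024, 0 };
        b = (struct buf){ &c, 1024, 1 };
        c = (struct buf){ &a, 1024, 1 };
        free_endio(&a, 1024);   /* discards io_ends at 1024 and 2048 */
        return 0;
}
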
@@ -3420,7 +3492,14 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3420 } 3492 }
3421 3493
3422retry: 3494retry:
3423 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3495 if (rw == READ && ext4_should_dioread_nolock(inode))
3496 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
3497 inode->i_sb->s_bdev, iov,
3498 offset, nr_segs,
3499 ext4_get_block, NULL);
3500 else
3501 ret = blockdev_direct_IO(rw, iocb, inode,
3502 inode->i_sb->s_bdev, iov,
3424 offset, nr_segs, 3503 offset, nr_segs,
3425 ext4_get_block, NULL); 3504 ext4_get_block, NULL);
3426 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3505 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3436,6 +3515,9 @@ retry:
3436 * but cannot extend i_size. Bail out and pretend 3515 * but cannot extend i_size. Bail out and pretend
3437 * the write failed... */ 3516 * the write failed... */
3438 ret = PTR_ERR(handle); 3517 ret = PTR_ERR(handle);
3518 if (inode->i_nlink)
3519 ext4_orphan_del(NULL, inode);
3520
3439 goto out; 3521 goto out;
3440 } 3522 }
3441 if (inode->i_nlink) 3523 if (inode->i_nlink)
@@ -3463,75 +3545,63 @@ out:
3463 return ret; 3545 return ret;
3464} 3546}
3465 3547
3466static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, 3548static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3467 struct buffer_head *bh_result, int create) 3549 struct buffer_head *bh_result, int create)
3468{ 3550{
3469 handle_t *handle = NULL; 3551 handle_t *handle = ext4_journal_current_handle();
3470 int ret = 0; 3552 int ret = 0;
3471 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 3553 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3472 int dio_credits; 3554 int dio_credits;
3555 int started = 0;
3473 3556
3474 ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", 3557 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3475 inode->i_ino, create); 3558 inode->i_ino, create);
3476 /* 3559 /*
3477 * DIO VFS code passes create = 0 flag for write to 3560 * ext4_get_block in prepare for a DIO write or buffer write.
3478 * the middle of file. It does this to avoid block 3561 * We allocate an uinitialized extent if blocks haven't been allocated.
3479 * allocation for holes, to prevent expose stale data 3562 * The extent will be converted to initialized after IO complete.
3480 * out when there is parallel buffered read (which does
3481 * not hold the i_mutex lock) while direct IO write has
3482 * not completed. DIO request on holes finally falls back
3483 * to buffered IO for this reason.
3484 *
3485 * For ext4 extent based file, since we support fallocate,
3486 * new allocated extent as uninitialized, for holes, we
3487 * could fallocate blocks for holes, thus parallel
3488 * buffered IO read will zero out the page when read on
3489 * a hole while parallel DIO write to the hole has not completed.
3490 *
3491 * when we come here, we know it's a direct IO write to
3492 * to the middle of file (<i_size)
3493 * so it's safe to override the create flag from VFS.
3494 */ 3563 */
3495 create = EXT4_GET_BLOCKS_DIO_CREATE_EXT; 3564 create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
3496 3565
3497 if (max_blocks > DIO_MAX_BLOCKS) 3566 if (!handle) {
3498 max_blocks = DIO_MAX_BLOCKS; 3567 if (max_blocks > DIO_MAX_BLOCKS)
3499 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); 3568 max_blocks = DIO_MAX_BLOCKS;
3500 handle = ext4_journal_start(inode, dio_credits); 3569 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3501 if (IS_ERR(handle)) { 3570 handle = ext4_journal_start(inode, dio_credits);
3502 ret = PTR_ERR(handle); 3571 if (IS_ERR(handle)) {
3503 goto out; 3572 ret = PTR_ERR(handle);
3573 goto out;
3574 }
3575 started = 1;
3504 } 3576 }
3577
3505 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, 3578 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3506 create); 3579 create);
3507 if (ret > 0) { 3580 if (ret > 0) {
3508 bh_result->b_size = (ret << inode->i_blkbits); 3581 bh_result->b_size = (ret << inode->i_blkbits);
3509 ret = 0; 3582 ret = 0;
3510 } 3583 }
3511 ext4_journal_stop(handle); 3584 if (started)
3585 ext4_journal_stop(handle);
3512out: 3586out:
3513 return ret; 3587 return ret;
3514} 3588}
3515 3589
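
The rewritten ext4_get_block_write() above joins an already-running transaction via ext4_journal_current_handle() and only starts, and therefore only stops, its own handle when called outside one, so the buffered-write path can reuse the caller's handle. A hedged userspace sketch of that started-flag idiom; the types and helpers below are invented, not the jbd2 API:

#include <stdio.h>
#include <stdlib.h>

/* stand-in transaction handle; the real code uses a jbd2 handle_t */
struct handle { int credits; };

static struct handle *cur;                /* per-task current handle */

static struct handle *journal_start(int credits)
{
        struct handle *h = malloc(sizeof(*h));

        if (h) {
                h->credits = credits;
                cur = h;
        }
        return h;
}

static void journal_stop(struct handle *h)
{
        cur = NULL;
        free(h);
}

/* join the running transaction if there is one, else run our own */
static int do_block_alloc(void)
{
        struct handle *h = cur;
        int started = 0;

        if (!h) {
                h = journal_start(8);
                if (!h)
                        return -1;
                started = 1;
        }

        printf("allocating under handle with %d credits\n", h->credits);

        if (started)            /* only stop what we started */
                journal_stop(h);
        return 0;
}

int main(void)
{
        do_block_alloc();               /* opens and closes its own */
        cur = journal_start(4);
        do_block_alloc();               /* joins the caller's handle */
        journal_stop(cur);
        return 0;
}
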
3516static void ext4_free_io_end(ext4_io_end_t *io) 3590static void dump_completed_IO(struct inode * inode)
3517{
3518 BUG_ON(!io);
3519 iput(io->inode);
3520 kfree(io);
3521}
3522static void dump_aio_dio_list(struct inode * inode)
3523{ 3591{
3524#ifdef EXT4_DEBUG 3592#ifdef EXT4_DEBUG
3525 struct list_head *cur, *before, *after; 3593 struct list_head *cur, *before, *after;
3526 ext4_io_end_t *io, *io0, *io1; 3594 ext4_io_end_t *io, *io0, *io1;
3595 unsigned long flags;
3527 3596
3528 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ 3597 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
3529 ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); 3598 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
3530 return; 3599 return;
3531 } 3600 }
3532 3601
3533 ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); 3602 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
3534 list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ 3603 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3604 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
3535 cur = &io->list; 3605 cur = &io->list;
3536 before = cur->prev; 3606 before = cur->prev;
3537 io0 = container_of(before, ext4_io_end_t, list); 3607 io0 = container_of(before, ext4_io_end_t, list);
@@ -3541,32 +3611,31 @@ static void dump_aio_dio_list(struct inode * inode)
3541 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", 3611 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3542 io, inode->i_ino, io0, io1); 3612 io, inode->i_ino, io0, io1);
3543 } 3613 }
3614 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3544#endif 3615#endif
3545} 3616}
3546 3617
3547/* 3618/*
3548 * check a range of space and convert unwritten extents to written. 3619 * check a range of space and convert unwritten extents to written.
3549 */ 3620 */
3550static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) 3621static int ext4_end_io_nolock(ext4_io_end_t *io)
3551{ 3622{
3552 struct inode *inode = io->inode; 3623 struct inode *inode = io->inode;
3553 loff_t offset = io->offset; 3624 loff_t offset = io->offset;
3554 size_t size = io->size; 3625 ssize_t size = io->size;
3555 int ret = 0; 3626 int ret = 0;
3556 3627
3557 ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," 3628 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
3558 "list->prev 0x%p\n", 3629 "list->prev 0x%p\n",
3559 io, inode->i_ino, io->list.next, io->list.prev); 3630 io, inode->i_ino, io->list.next, io->list.prev);
3560 3631
3561 if (list_empty(&io->list)) 3632 if (list_empty(&io->list))
3562 return ret; 3633 return ret;
3563 3634
3564 if (io->flag != DIO_AIO_UNWRITTEN) 3635 if (io->flag != EXT4_IO_UNWRITTEN)
3565 return ret; 3636 return ret;
3566 3637
3567 if (offset + size <= i_size_read(inode)) 3638 ret = ext4_convert_unwritten_extents(inode, offset, size);
3568 ret = ext4_convert_unwritten_extents(inode, offset, size);
3569
3570 if (ret < 0) { 3639 if (ret < 0) {
3571 printk(KERN_EMERG "%s: failed to convert unwritten " 3640 printk(KERN_EMERG "%s: failed to convert unwritten "
3572 "extents to written extents, error is %d" 3641 "extents to written extents, error is %d"
@@ -3579,50 +3648,64 @@ static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
3579 io->flag = 0; 3648 io->flag = 0;
3580 return ret; 3649 return ret;
3581} 3650}
3651
3582/* 3652/*
3583 * work on completed aio dio IO, to convert unwritten extents to written extents 3653 * work on completed aio dio IO, to convert unwritten extents to written extents
3584 */ 3654 */
3585static void ext4_end_aio_dio_work(struct work_struct *work) 3655static void ext4_end_io_work(struct work_struct *work)
3586{ 3656{
3587 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); 3657 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3588 struct inode *inode = io->inode; 3658 struct inode *inode = io->inode;
3589 int ret = 0; 3659 struct ext4_inode_info *ei = EXT4_I(inode);
3660 unsigned long flags;
3661 int ret;
3590 3662
3591 mutex_lock(&inode->i_mutex); 3663 mutex_lock(&inode->i_mutex);
3592 ret = ext4_end_aio_dio_nolock(io); 3664 ret = ext4_end_io_nolock(io);
3593 if (ret >= 0) { 3665 if (ret < 0) {
3594 if (!list_empty(&io->list)) 3666 mutex_unlock(&inode->i_mutex);
3595 list_del_init(&io->list); 3667 return;
3596 ext4_free_io_end(io);
3597 } 3668 }
3669
3670 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3671 if (!list_empty(&io->list))
3672 list_del_init(&io->list);
3673 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3598 mutex_unlock(&inode->i_mutex); 3674 mutex_unlock(&inode->i_mutex);
3675 ext4_free_io_end(io);
3599} 3676}
3677
3600/* 3678/*
3601 * This function is called from ext4_sync_file(). 3679 * This function is called from ext4_sync_file().
3602 * 3680 *
3603 * When AIO DIO IO is completed, the work to convert unwritten 3681 * When IO is completed, the work to convert unwritten extents to
3604 * extents to written is queued on workqueue but may not get immediately 3682 * written is queued on workqueue but may not get immediately
3605 * scheduled. When fsync is called, we need to ensure the 3683 * scheduled. When fsync is called, we need to ensure the
3606 * conversion is complete before fsync returns. 3684 * conversion is complete before fsync returns.
3607 * The inode keeps track of a list of completed AIO from DIO path 3685 * The inode keeps track of a list of pending/completed IO that
3608 * that might needs to do the conversion. This function walks through 3686 * might need to do the conversion. This function walks through
3609 * the list and convert the related unwritten extents to written. 3687 * the list and converts the related unwritten extents for completed IO
3688 * to written.
3689 * The function returns 0 on success, or a negative error on failure.
3610 */ 3690 */
3611int flush_aio_dio_completed_IO(struct inode *inode) 3691int flush_completed_IO(struct inode *inode)
3612{ 3692{
3613 ext4_io_end_t *io; 3693 ext4_io_end_t *io;
3694 struct ext4_inode_info *ei = EXT4_I(inode);
3695 unsigned long flags;
3614 int ret = 0; 3696 int ret = 0;
3615 int ret2 = 0; 3697 int ret2 = 0;
3616 3698
3617 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) 3699 if (list_empty(&ei->i_completed_io_list))
3618 return ret; 3700 return ret;
3619 3701
3620 dump_aio_dio_list(inode); 3702 dump_completed_IO(inode);
3621 while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ 3703 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3622 io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, 3704 while (!list_empty(&ei->i_completed_io_list)){
3705 io = list_entry(ei->i_completed_io_list.next,
3623 ext4_io_end_t, list); 3706 ext4_io_end_t, list);
3624 /* 3707 /*
3625 * Calling ext4_end_aio_dio_nolock() to convert completed 3708 * Calling ext4_end_io_nolock() to convert completed
3626 * IO to written. 3709 * IO to written.
3627 * 3710 *
3628 * When ext4_sync_file() is called, run_queue() may already 3711 * When ext4_sync_file() is called, run_queue() may already
@@ -3635,20 +3718,23 @@ int flush_aio_dio_completed_IO(struct inode *inode)
3635 * avoid double converting from both fsync and background work 3718 * avoid double converting from both fsync and background work
3636 * queue work. 3719 * queue work.
3637 */ 3720 */
3638 ret = ext4_end_aio_dio_nolock(io); 3721 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3722 ret = ext4_end_io_nolock(io);
3723 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3639 if (ret < 0) 3724 if (ret < 0)
3640 ret2 = ret; 3725 ret2 = ret;
3641 else 3726 else
3642 list_del_init(&io->list); 3727 list_del_init(&io->list);
3643 } 3728 }
3729 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3644 return (ret2 < 0) ? ret2 : 0; 3730 return (ret2 < 0) ? ret2 : 0;
3645} 3731}
3646 3732
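
flush_completed_IO() above takes the new i_completed_io_lock, but must drop it around ext4_end_io_nolock(), which can sleep. A simplified pthread sketch of the drop-and-relock pattern; unlike the kernel code it detaches each entry before unlocking, and a plain mutex stands in for spin_lock_irqsave():

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct io_end { struct io_end *next; int id; };

static struct io_end *completed;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* may sleep, so it must run without list_lock held */
static int convert(struct io_end *io)
{
        printf("converting io_end %d\n", io->id);
        return 0;
}

static int flush_all(void)
{
        int err, ret = 0;

        pthread_mutex_lock(&list_lock);
        while (completed) {
                struct io_end *io = completed;

                completed = io->next;           /* detach under the lock */
                pthread_mutex_unlock(&list_lock);
                err = convert(io);              /* blocking work, unlocked */
                pthread_mutex_lock(&list_lock);
                if (err < 0)
                        ret = err;
                free(io);
        }
        pthread_mutex_unlock(&list_lock);
        return ret;
}

int main(void)
{
        for (int i = 0; i < 3; i++) {
                struct io_end *io = malloc(sizeof(*io));

                io->id = i;
                io->next = completed;
                completed = io;
        }
        return flush_all();
}
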
3647static ext4_io_end_t *ext4_init_io_end (struct inode *inode) 3733static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
3648{ 3734{
3649 ext4_io_end_t *io = NULL; 3735 ext4_io_end_t *io = NULL;
3650 3736
3651 io = kmalloc(sizeof(*io), GFP_NOFS); 3737 io = kmalloc(sizeof(*io), flags);
3652 3738
3653 if (io) { 3739 if (io) {
3654 igrab(inode); 3740 igrab(inode);
@@ -3656,8 +3742,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
3656 io->flag = 0; 3742 io->flag = 0;
3657 io->offset = 0; 3743 io->offset = 0;
3658 io->size = 0; 3744 io->size = 0;
3659 io->error = 0; 3745 io->page = NULL;
3660 INIT_WORK(&io->work, ext4_end_aio_dio_work); 3746 INIT_WORK(&io->work, ext4_end_io_work);
3661 INIT_LIST_HEAD(&io->list); 3747 INIT_LIST_HEAD(&io->list);
3662 } 3748 }
3663 3749
@@ -3669,6 +3755,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3669{ 3755{
3670 ext4_io_end_t *io_end = iocb->private; 3756 ext4_io_end_t *io_end = iocb->private;
3671 struct workqueue_struct *wq; 3757 struct workqueue_struct *wq;
3758 unsigned long flags;
3759 struct ext4_inode_info *ei;
3672 3760
3673 /* if not async direct IO or dio with 0 bytes write, just return */ 3761 /* if not async direct IO or dio with 0 bytes write, just return */
3674 if (!io_end || !size) 3762 if (!io_end || !size)
@@ -3680,7 +3768,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3680 size); 3768 size);
3681 3769
3682 /* if not aio dio with unwritten extents, just free io and return */ 3770 /* if not aio dio with unwritten extents, just free io and return */
3683 if (io_end->flag != DIO_AIO_UNWRITTEN){ 3771 if (io_end->flag != EXT4_IO_UNWRITTEN){
3684 ext4_free_io_end(io_end); 3772 ext4_free_io_end(io_end);
3685 iocb->private = NULL; 3773 iocb->private = NULL;
3686 return; 3774 return;
@@ -3688,16 +3776,85 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3688 3776
3689 io_end->offset = offset; 3777 io_end->offset = offset;
3690 io_end->size = size; 3778 io_end->size = size;
3779 io_end->flag = EXT4_IO_UNWRITTEN;
3691 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3780 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3692 3781
3693 /* queue the work to convert unwritten extents to written */ 3782 /* queue the work to convert unwritten extents to written */
3694 queue_work(wq, &io_end->work); 3783 queue_work(wq, &io_end->work);
3695 3784
3696 /* Add the io_end to per-inode completed aio dio list*/ 3785 /* Add the io_end to per-inode completed aio dio list*/
3697 list_add_tail(&io_end->list, 3786 ei = EXT4_I(io_end->inode);
3698 &EXT4_I(io_end->inode)->i_aio_dio_complete_list); 3787 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3788 list_add_tail(&io_end->list, &ei->i_completed_io_list);
3789 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3699 iocb->private = NULL; 3790 iocb->private = NULL;
3700} 3791}
3792
3793static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3794{
3795 ext4_io_end_t *io_end = bh->b_private;
3796 struct workqueue_struct *wq;
3797 struct inode *inode;
3798 unsigned long flags;
3799
3800 if (!test_clear_buffer_uninit(bh) || !io_end)
3801 goto out;
3802
3803 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
3804 printk("sb umounted, discard end_io request for inode %lu\n",
3805 io_end->inode->i_ino);
3806 ext4_free_io_end(io_end);
3807 goto out;
3808 }
3809
3810 io_end->flag = EXT4_IO_UNWRITTEN;
3811 inode = io_end->inode;
3812
3813 /* Add the io_end to per-inode completed io list*/
3814 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3815 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
3816 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3817
3818 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
3819 /* queue the work to convert unwritten extents to written */
3820 queue_work(wq, &io_end->work);
3821out:
3822 bh->b_private = NULL;
3823 bh->b_end_io = NULL;
3824 clear_buffer_uninit(bh);
3825 end_buffer_async_write(bh, uptodate);
3826}
3827
3828static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3829{
3830 ext4_io_end_t *io_end;
3831 struct page *page = bh->b_page;
3832 loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
3833 size_t size = bh->b_size;
3834
3835retry:
3836 io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3837 if (!io_end) {
3838 if (printk_ratelimit())
3839 printk(KERN_WARNING "%s: allocation failed\n", __func__);
3840 schedule();
3841 goto retry;
3842 }
3843 io_end->offset = offset;
3844 io_end->size = size;
3845 /*
3846 * We need to hold a reference to the page to make sure it
3847 * doesn't get evicted before ext4_end_io_work() has a chance
3848 * to convert the extent from unwritten to written.
3849 */
3850 io_end->page = page;
3851 get_page(io_end->page);
3852
3853 bh->b_private = io_end;
3854 bh->b_end_io = ext4_end_io_buffer_write;
3855 return 0;
3856}
3857
3701/* 3858/*
3702 * For ext4 extent files, ext4 will do direct-io write to holes, 3859 * For ext4 extent files, ext4 will do direct-io write to holes,
3703 * preallocated extents, and those writes extend the file, no need to 3860 * preallocated extents, and those writes extend the file, no need to
@@ -3751,7 +3908,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3751 iocb->private = NULL; 3908 iocb->private = NULL;
3752 EXT4_I(inode)->cur_aio_dio = NULL; 3909 EXT4_I(inode)->cur_aio_dio = NULL;
3753 if (!is_sync_kiocb(iocb)) { 3910 if (!is_sync_kiocb(iocb)) {
3754 iocb->private = ext4_init_io_end(inode); 3911 iocb->private = ext4_init_io_end(inode, GFP_NOFS);
3755 if (!iocb->private) 3912 if (!iocb->private)
3756 return -ENOMEM; 3913 return -ENOMEM;
3757 /* 3914 /*
@@ -3767,7 +3924,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3767 ret = blockdev_direct_IO(rw, iocb, inode, 3924 ret = blockdev_direct_IO(rw, iocb, inode,
3768 inode->i_sb->s_bdev, iov, 3925 inode->i_sb->s_bdev, iov,
3769 offset, nr_segs, 3926 offset, nr_segs,
3770 ext4_get_block_dio_write, 3927 ext4_get_block_write,
3771 ext4_end_io_dio); 3928 ext4_end_io_dio);
3772 if (iocb->private) 3929 if (iocb->private)
3773 EXT4_I(inode)->cur_aio_dio = NULL; 3930 EXT4_I(inode)->cur_aio_dio = NULL;
@@ -3788,8 +3945,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3788 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3945 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3789 ext4_free_io_end(iocb->private); 3946 ext4_free_io_end(iocb->private);
3790 iocb->private = NULL; 3947 iocb->private = NULL;
3791 } else if (ret > 0 && (EXT4_I(inode)->i_state & 3948 } else if (ret > 0 && ext4_test_inode_state(inode,
3792 EXT4_STATE_DIO_UNWRITTEN)) { 3949 EXT4_STATE_DIO_UNWRITTEN)) {
3793 int err; 3950 int err;
3794 /* 3951 /*
3795 * for non AIO case, since the IO is already 3952 * for non AIO case, since the IO is already
@@ -3799,7 +3956,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3799 offset, ret); 3956 offset, ret);
3800 if (err < 0) 3957 if (err < 0)
3801 ret = err; 3958 ret = err;
3802 EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; 3959 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3803 } 3960 }
3804 return ret; 3961 return ret;
3805 } 3962 }
@@ -4130,18 +4287,27 @@ no_top:
4130 * We release `count' blocks on disk, but (last - first) may be greater 4287 * We release `count' blocks on disk, but (last - first) may be greater
4131 * than `count' because there can be holes in there. 4288 * than `count' because there can be holes in there.
4132 */ 4289 */
4133static void ext4_clear_blocks(handle_t *handle, struct inode *inode, 4290static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4134 struct buffer_head *bh, 4291 struct buffer_head *bh,
4135 ext4_fsblk_t block_to_free, 4292 ext4_fsblk_t block_to_free,
4136 unsigned long count, __le32 *first, 4293 unsigned long count, __le32 *first,
4137 __le32 *last) 4294 __le32 *last)
4138{ 4295{
4139 __le32 *p; 4296 __le32 *p;
4140 int flags = EXT4_FREE_BLOCKS_FORGET; 4297 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4141 4298
4142 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 4299 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4143 flags |= EXT4_FREE_BLOCKS_METADATA; 4300 flags |= EXT4_FREE_BLOCKS_METADATA;
4144 4301
4302 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4303 count)) {
4304 ext4_error(inode->i_sb, "inode #%lu: "
4305 "attempt to clear blocks %llu len %lu, invalid",
4306 inode->i_ino, (unsigned long long) block_to_free,
4307 count);
4308 return 1;
4309 }
4310
4145 if (try_to_extend_transaction(handle, inode)) { 4311 if (try_to_extend_transaction(handle, inode)) {
4146 if (bh) { 4312 if (bh) {
4147 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4313 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
@@ -4160,6 +4326,7 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
4160 *p = 0; 4326 *p = 0;
4161 4327
4162 ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); 4328 ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
4329 return 0;
4163} 4330}
4164 4331
4165/** 4332/**
@@ -4215,9 +4382,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4215 } else if (nr == block_to_free + count) { 4382 } else if (nr == block_to_free + count) {
4216 count++; 4383 count++;
4217 } else { 4384 } else {
4218 ext4_clear_blocks(handle, inode, this_bh, 4385 if (ext4_clear_blocks(handle, inode, this_bh,
4219 block_to_free, 4386 block_to_free, count,
4220 count, block_to_free_p, p); 4387 block_to_free_p, p))
4388 break;
4221 block_to_free = nr; 4389 block_to_free = nr;
4222 block_to_free_p = p; 4390 block_to_free_p = p;
4223 count = 1; 4391 count = 1;
@@ -4241,7 +4409,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4241 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 4409 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4242 ext4_handle_dirty_metadata(handle, inode, this_bh); 4410 ext4_handle_dirty_metadata(handle, inode, this_bh);
4243 else 4411 else
4244 ext4_error(inode->i_sb, __func__, 4412 ext4_error(inode->i_sb,
4245 "circular indirect block detected, " 4413 "circular indirect block detected, "
4246 "inode=%lu, block=%llu", 4414 "inode=%lu, block=%llu",
4247 inode->i_ino, 4415 inode->i_ino,
@@ -4281,6 +4449,16 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4281 if (!nr) 4449 if (!nr)
4282 continue; /* A hole */ 4450 continue; /* A hole */
4283 4451
4452 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4453 nr, 1)) {
4454 ext4_error(inode->i_sb,
4455 "indirect mapped block in inode "
4456 "#%lu invalid (level %d, blk #%lu)",
4457 inode->i_ino, depth,
4458 (unsigned long) nr);
4459 break;
4460 }
4461
4284 /* Go read the buffer for the next level down */ 4462 /* Go read the buffer for the next level down */
4285 bh = sb_bread(inode->i_sb, nr); 4463 bh = sb_bread(inode->i_sb, nr);
4286 4464
@@ -4289,7 +4467,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4289 * (should be rare). 4467 * (should be rare).
4290 */ 4468 */
4291 if (!bh) { 4469 if (!bh) {
4292 ext4_error(inode->i_sb, "ext4_free_branches", 4470 ext4_error(inode->i_sb,
4293 "Read failure, inode=%lu, block=%llu", 4471 "Read failure, inode=%lu, block=%llu",
4294 inode->i_ino, nr); 4472 inode->i_ino, nr);
4295 continue; 4473 continue;
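
Both validity checks added above call ext4_data_block_valid() before trusting on-disk block numbers in the truncate path. A sketch of the general shape of such a range check (bounds only; the real ext4 helper also rejects ranges that overlap block-group metadata):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool data_block_valid(uint64_t first_data_block,
                             uint64_t blocks_count,
                             uint64_t start, uint32_t count)
{
        if (count == 0)
                return false;
        if (start < first_data_block)           /* inside sb/metadata */
                return false;
        if (start + count < start)              /* wraparound */
                return false;
        if (start + count > blocks_count)       /* past end of fs */
                return false;
        return true;
}

int main(void)
{
        printf("%d\n", data_block_valid(1, 1000, 10, 5));   /* 1 */
        printf("%d\n", data_block_valid(1, 1000, 998, 5));  /* 0 */
        printf("%d\n", data_block_valid(1, 1000, 0, 1));    /* 0 */
        return 0;
}
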
@@ -4433,8 +4611,10 @@ void ext4_truncate(struct inode *inode)
4433 if (!ext4_can_truncate(inode)) 4611 if (!ext4_can_truncate(inode))
4434 return; 4612 return;
4435 4613
4614 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
4615
4436 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4616 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4437 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; 4617 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4438 4618
4439 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 4619 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
4440 ext4_ext_truncate(inode); 4620 ext4_ext_truncate(inode);
@@ -4604,9 +4784,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
4604 4784
4605 bh = sb_getblk(sb, block); 4785 bh = sb_getblk(sb, block);
4606 if (!bh) { 4786 if (!bh) {
4607 ext4_error(sb, "ext4_get_inode_loc", "unable to read " 4787 ext4_error(sb, "unable to read inode block - "
4608 "inode block - inode=%lu, block=%llu", 4788 "inode=%lu, block=%llu", inode->i_ino, block);
4609 inode->i_ino, block);
4610 return -EIO; 4789 return -EIO;
4611 } 4790 }
4612 if (!buffer_uptodate(bh)) { 4791 if (!buffer_uptodate(bh)) {
@@ -4704,9 +4883,8 @@ make_io:
4704 submit_bh(READ_META, bh); 4883 submit_bh(READ_META, bh);
4705 wait_on_buffer(bh); 4884 wait_on_buffer(bh);
4706 if (!buffer_uptodate(bh)) { 4885 if (!buffer_uptodate(bh)) {
4707 ext4_error(sb, __func__, 4886 ext4_error(sb, "unable to read inode block - inode=%lu,"
4708 "unable to read inode block - inode=%lu, " 4887 " block=%llu", inode->i_ino, block);
4709 "block=%llu", inode->i_ino, block);
4710 brelse(bh); 4888 brelse(bh);
4711 return -EIO; 4889 return -EIO;
4712 } 4890 }
@@ -4720,7 +4898,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
4720{ 4898{
4721 /* We have all inode data except xattrs in memory here. */ 4899 /* We have all inode data except xattrs in memory here. */
4722 return __ext4_get_inode_loc(inode, iloc, 4900 return __ext4_get_inode_loc(inode, iloc,
4723 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); 4901 !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
4724} 4902}
4725 4903
4726void ext4_set_inode_flags(struct inode *inode) 4904void ext4_set_inode_flags(struct inode *inode)
@@ -4814,7 +4992,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4814 } 4992 }
4815 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4993 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
4816 4994
4817 ei->i_state = 0; 4995 ei->i_state_flags = 0;
4818 ei->i_dir_start_lookup = 0; 4996 ei->i_dir_start_lookup = 0;
4819 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4997 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
4820 /* We now have enough fields to check if the inode was active or not. 4998 /* We now have enough fields to check if the inode was active or not.
@@ -4897,7 +5075,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4897 EXT4_GOOD_OLD_INODE_SIZE + 5075 EXT4_GOOD_OLD_INODE_SIZE +
4898 ei->i_extra_isize; 5076 ei->i_extra_isize;
4899 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 5077 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
4900 ei->i_state |= EXT4_STATE_XATTR; 5078 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
4901 } 5079 }
4902 } else 5080 } else
4903 ei->i_extra_isize = 0; 5081 ei->i_extra_isize = 0;
@@ -4917,8 +5095,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4917 ret = 0; 5095 ret = 0;
4918 if (ei->i_file_acl && 5096 if (ei->i_file_acl &&
4919 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 5097 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
4920 ext4_error(sb, __func__, 5098 ext4_error(sb, "bad extended attribute block %llu in inode #%lu",
4921 "bad extended attribute block %llu in inode #%lu",
4922 ei->i_file_acl, inode->i_ino); 5099 ei->i_file_acl, inode->i_ino);
4923 ret = -EIO; 5100 ret = -EIO;
4924 goto bad_inode; 5101 goto bad_inode;
@@ -4964,8 +5141,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4964 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5141 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4965 } else { 5142 } else {
4966 ret = -EIO; 5143 ret = -EIO;
4967 ext4_error(inode->i_sb, __func__, 5144 ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu",
4968 "bogus i_mode (%o) for inode=%lu",
4969 inode->i_mode, inode->i_ino); 5145 inode->i_mode, inode->i_ino);
4970 goto bad_inode; 5146 goto bad_inode;
4971 } 5147 }
@@ -5037,7 +5213,7 @@ static int ext4_do_update_inode(handle_t *handle,
5037 5213
5038 /* For fields not tracked in the in-memory inode, 5214 /* For fields not tracked in the in-memory inode,
5039 * initialise them to zero for new inodes. */ 5215 * initialise them to zero for new inodes. */
5040 if (ei->i_state & EXT4_STATE_NEW) 5216 if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
5041 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 5217 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
5042 5218
5043 ext4_get_inode_flags(ei); 5219 ext4_get_inode_flags(ei);
@@ -5101,7 +5277,7 @@ static int ext4_do_update_inode(handle_t *handle,
5101 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 5277 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
5102 sb->s_dirt = 1; 5278 sb->s_dirt = 1;
5103 ext4_handle_sync(handle); 5279 ext4_handle_sync(handle);
5104 err = ext4_handle_dirty_metadata(handle, inode, 5280 err = ext4_handle_dirty_metadata(handle, NULL,
5105 EXT4_SB(sb)->s_sbh); 5281 EXT4_SB(sb)->s_sbh);
5106 } 5282 }
5107 } 5283 }
@@ -5130,10 +5306,10 @@ static int ext4_do_update_inode(handle_t *handle,
5130 } 5306 }
5131 5307
5132 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 5308 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
5133 rc = ext4_handle_dirty_metadata(handle, inode, bh); 5309 rc = ext4_handle_dirty_metadata(handle, NULL, bh);
5134 if (!err) 5310 if (!err)
5135 err = rc; 5311 err = rc;
5136 ei->i_state &= ~EXT4_STATE_NEW; 5312 ext4_clear_inode_state(inode, EXT4_STATE_NEW);
5137 5313
5138 ext4_update_inode_fsync_trans(handle, inode, 0); 5314 ext4_update_inode_fsync_trans(handle, inode, 0);
5139out_brelse: 5315out_brelse:
@@ -5177,7 +5353,7 @@ out_brelse:
5177 * `stuff()' is running, and the new i_size will be lost. Plus the inode 5353 * `stuff()' is running, and the new i_size will be lost. Plus the inode
5178 * will no longer be on the superblock's dirty inode list. 5354 * will no longer be on the superblock's dirty inode list.
5179 */ 5355 */
5180int ext4_write_inode(struct inode *inode, int wait) 5356int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5181{ 5357{
5182 int err; 5358 int err;
5183 5359
@@ -5191,7 +5367,7 @@ int ext4_write_inode(struct inode *inode, int wait)
5191 return -EIO; 5367 return -EIO;
5192 } 5368 }
5193 5369
5194 if (!wait) 5370 if (wbc->sync_mode != WB_SYNC_ALL)
5195 return 0; 5371 return 0;
5196 5372
5197 err = ext4_force_commit(inode->i_sb); 5373 err = ext4_force_commit(inode->i_sb);
@@ -5201,13 +5377,11 @@ int ext4_write_inode(struct inode *inode, int wait)
5201 err = ext4_get_inode_loc(inode, &iloc); 5377 err = ext4_get_inode_loc(inode, &iloc);
5202 if (err) 5378 if (err)
5203 return err; 5379 return err;
5204 if (wait) 5380 if (wbc->sync_mode == WB_SYNC_ALL)
5205 sync_dirty_buffer(iloc.bh); 5381 sync_dirty_buffer(iloc.bh);
5206 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5382 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5207 ext4_error(inode->i_sb, __func__, 5383 ext4_error(inode->i_sb, "IO error syncing inode, "
5208 "IO error syncing inode, " 5384 "inode=%lu, block=%llu", inode->i_ino,
5209 "inode=%lu, block=%llu",
5210 inode->i_ino,
5211 (unsigned long long)iloc.bh->b_blocknr); 5385 (unsigned long long)iloc.bh->b_blocknr);
5212 err = -EIO; 5386 err = -EIO;
5213 } 5387 }
@@ -5249,6 +5423,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5249 if (error) 5423 if (error)
5250 return error; 5424 return error;
5251 5425
5426 if (ia_valid & ATTR_SIZE)
5427 dquot_initialize(inode);
5252 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 5428 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
5253 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 5429 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
5254 handle_t *handle; 5430 handle_t *handle;
@@ -5261,7 +5437,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5261 error = PTR_ERR(handle); 5437 error = PTR_ERR(handle);
5262 goto err_out; 5438 goto err_out;
5263 } 5439 }
5264 error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; 5440 error = dquot_transfer(inode, attr);
5265 if (error) { 5441 if (error) {
5266 ext4_journal_stop(handle); 5442 ext4_journal_stop(handle);
5267 return error; 5443 return error;
@@ -5288,7 +5464,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5288 } 5464 }
5289 5465
5290 if (S_ISREG(inode->i_mode) && 5466 if (S_ISREG(inode->i_mode) &&
5291 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { 5467 attr->ia_valid & ATTR_SIZE &&
5468 (attr->ia_size < inode->i_size ||
5469 (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
5292 handle_t *handle; 5470 handle_t *handle;
5293 5471
5294 handle = ext4_journal_start(inode, 3); 5472 handle = ext4_journal_start(inode, 3);
@@ -5319,6 +5497,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5319 goto err_out; 5497 goto err_out;
5320 } 5498 }
5321 } 5499 }
5500 /* ext4_truncate will clear the flag */
5501 if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
5502 ext4_truncate(inode);
5322 } 5503 }
5323 5504
5324 rc = inode_setattr(inode, attr); 5505 rc = inode_setattr(inode, attr);
@@ -5557,8 +5738,8 @@ static int ext4_expand_extra_isize(struct inode *inode,
5557 entry = IFIRST(header); 5738 entry = IFIRST(header);
5558 5739
5559 /* No extended attributes present */ 5740 /* No extended attributes present */
5560 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || 5741 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
5561 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 5742 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
5562 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, 5743 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
5563 new_extra_isize); 5744 new_extra_isize);
5564 EXT4_I(inode)->i_extra_isize = new_extra_isize; 5745 EXT4_I(inode)->i_extra_isize = new_extra_isize;
@@ -5602,7 +5783,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5602 err = ext4_reserve_inode_write(handle, inode, &iloc); 5783 err = ext4_reserve_inode_write(handle, inode, &iloc);
5603 if (ext4_handle_valid(handle) && 5784 if (ext4_handle_valid(handle) &&
5604 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5785 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
5605 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { 5786 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
5606 /* 5787 /*
5607 * We need extra buffer credits since we may write into EA block 5788 * We need extra buffer credits since we may write into EA block
5608 * with this same handle. If journal_extend fails, then it will 5789 * with this same handle. If journal_extend fails, then it will
@@ -5616,10 +5797,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5616 sbi->s_want_extra_isize, 5797 sbi->s_want_extra_isize,
5617 iloc, handle); 5798 iloc, handle);
5618 if (ret) { 5799 if (ret) {
5619 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; 5800 ext4_set_inode_state(inode,
5801 EXT4_STATE_NO_EXPAND);
5620 if (mnt_count != 5802 if (mnt_count !=
5621 le16_to_cpu(sbi->s_es->s_mnt_count)) { 5803 le16_to_cpu(sbi->s_es->s_mnt_count)) {
5622 ext4_warning(inode->i_sb, __func__, 5804 ext4_warning(inode->i_sb,
5623 "Unable to expand inode %lu. Delete" 5805 "Unable to expand inode %lu. Delete"
5624 " some EAs or run e2fsck.", 5806 " some EAs or run e2fsck.",
5625 inode->i_ino); 5807 inode->i_ino);
@@ -5641,7 +5823,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5641 * i_size has been changed by generic_commit_write() and we thus need 5823 * i_size has been changed by generic_commit_write() and we thus need
5642 * to include the updated inode in the current transaction. 5824 * to include the updated inode in the current transaction.
5643 * 5825 *
5644 * Also, vfs_dq_alloc_block() will always dirty the inode when blocks 5826 * Also, dquot_alloc_block() will always dirty the inode when blocks
5645 * are allocated to the file. 5827 * are allocated to the file.
5646 * 5828 *
5647 * If the inode is marked synchronous, we don't honour that here - doing 5829 * If the inode is marked synchronous, we don't honour that here - doing
@@ -5683,7 +5865,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
5683 err = jbd2_journal_get_write_access(handle, iloc.bh); 5865 err = jbd2_journal_get_write_access(handle, iloc.bh);
5684 if (!err) 5866 if (!err)
5685 err = ext4_handle_dirty_metadata(handle, 5867 err = ext4_handle_dirty_metadata(handle,
5686 inode, 5868 NULL,
5687 iloc.bh); 5869 iloc.bh);
5688 brelse(iloc.bh); 5870 brelse(iloc.bh);
5689 } 5871 }
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index b63d193126db..016d0249294f 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -92,6 +92,15 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
92 flags &= ~EXT4_EXTENTS_FL; 92 flags &= ~EXT4_EXTENTS_FL;
93 } 93 }
94 94
95 if (flags & EXT4_EOFBLOCKS_FL) {
96 /* we don't support adding EOFBLOCKS flag */
97 if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
98 err = -EOPNOTSUPP;
99 goto flags_out;
100 }
101 } else if (oldflags & EXT4_EOFBLOCKS_FL)
102 ext4_truncate(inode);
103
95 handle = ext4_journal_start(inode, 1); 104 handle = ext4_journal_start(inode, 1);
96 if (IS_ERR(handle)) { 105 if (IS_ERR(handle)) {
97 err = PTR_ERR(handle); 106 err = PTR_ERR(handle);
@@ -249,7 +258,8 @@ setversion_out:
249 if (me.moved_len > 0) 258 if (me.moved_len > 0)
250 file_remove_suid(donor_filp); 259 file_remove_suid(donor_filp);
251 260
252 if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) 261 if (copy_to_user((struct move_extent __user *)arg,
262 &me, sizeof(me)))
253 err = -EFAULT; 263 err = -EFAULT;
254 mext_out: 264 mext_out:
255 fput(donor_filp); 265 fput(donor_filp);
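
Note: two independent fixes in the ioctl hunks above. First, EXT4_EOFBLOCKS_FL (blocks preallocated past EOF) may only be cleared from userspace, never set, and clearing it truncates the over-allocated blocks away. Second, the copy_to_user() cast gains a __user annotation so sparse can type-check the pointer. A sketch of the flag-transition rule in isolation; check_eofblocks_transition() is a hypothetical helper for illustration, not part of the patch:

	static int check_eofblocks_transition(struct inode *inode,
					      unsigned int oldflags,
					      unsigned int newflags)
	{
		if (newflags & EXT4_EOFBLOCKS_FL) {
			/* setting the flag from userspace is unsupported */
			if (!(oldflags & EXT4_EOFBLOCKS_FL))
				return -EOPNOTSUPP;
		} else if (oldflags & EXT4_EOFBLOCKS_FL) {
			/* clearing it: drop the blocks past i_size */
			ext4_truncate(inode);
		}
		return 0;
	}
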
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d34afad3e137..54df209d2eed 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -69,7 +69,7 @@
69 * 69 *
70 * pa_lstart -> the logical start block for this prealloc space 70 * pa_lstart -> the logical start block for this prealloc space
71 * pa_pstart -> the physical start block for this prealloc space 71 * pa_pstart -> the physical start block for this prealloc space
72 * pa_len -> lenght for this prealloc space 72 * pa_len -> length for this prealloc space
73 * pa_free -> free space available in this prealloc space 73 * pa_free -> free space available in this prealloc space
74 * 74 *
75 * The inode preallocation space is used looking at the _logical_ start 75 * The inode preallocation space is used looking at the _logical_ start
@@ -441,10 +441,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
441 for (i = 0; i < count; i++) { 441 for (i = 0; i < count; i++) {
442 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { 442 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
443 ext4_fsblk_t blocknr; 443 ext4_fsblk_t blocknr;
444 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); 444
445 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
445 blocknr += first + i; 446 blocknr += first + i;
446 blocknr +=
447 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
448 ext4_grp_locked_error(sb, e4b->bd_group, 447 ext4_grp_locked_error(sb, e4b->bd_group,
449 __func__, "double-free of inode" 448 __func__, "double-free of inode"
450 " %lu's block %llu(bit %u in group %u)", 449 " %lu's block %llu(bit %u in group %u)",
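
Note: the hunk above, and several more in this file, fold the recurring computation "group * EXT4_BLOCKS_PER_GROUP(sb) + le32_to_cpu(s_first_data_block)" into the ext4_group_first_block_no() helper. A self-contained sketch of the arithmetic being factored out; the constants are illustrative, not read from a real superblock:

	#include <stdint.h>
	#include <stdio.h>

	#define BLOCKS_PER_GROUP 32768u  /* 8 * block size; typical for 4K blocks */
	#define FIRST_DATA_BLOCK 0u      /* 1 on 1K-block filesystems */

	static uint64_t group_first_block_no(uint32_t group)
	{
		/* widen before multiplying so large groups don't overflow */
		return (uint64_t)group * BLOCKS_PER_GROUP + FIRST_DATA_BLOCK;
	}

	int main(void)
	{
		/* bit 100 in group 3's bitmap names this absolute block */
		printf("%llu\n",
		       (unsigned long long)(group_first_block_no(3) + 100));
		return 0;
	}

Centralizing the formula also removes the per-call-site le32_to_cpu() of s_first_data_block, which the old sites got right only by repetition.
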
@@ -1255,10 +1254,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1255 1254
1256 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { 1255 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
1257 ext4_fsblk_t blocknr; 1256 ext4_fsblk_t blocknr;
1258 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); 1257
1258 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1259 blocknr += block; 1259 blocknr += block;
1260 blocknr +=
1261 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
1262 ext4_grp_locked_error(sb, e4b->bd_group, 1260 ext4_grp_locked_error(sb, e4b->bd_group,
1263 __func__, "double-free of inode" 1261 __func__, "double-free of inode"
1264 " %lu's block %llu(bit %u in group %u)", 1262 " %lu's block %llu(bit %u in group %u)",
@@ -1631,7 +1629,6 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1631 int max; 1629 int max;
1632 int err; 1630 int err;
1633 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1631 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1634 struct ext4_super_block *es = sbi->s_es;
1635 struct ext4_free_extent ex; 1632 struct ext4_free_extent ex;
1636 1633
1637 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) 1634 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
@@ -1648,8 +1645,8 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1648 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { 1645 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1649 ext4_fsblk_t start; 1646 ext4_fsblk_t start;
1650 1647
1651 start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + 1648 start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
1652 ex.fe_start + le32_to_cpu(es->s_first_data_block); 1649 ex.fe_start;
1653 /* use do_div to get remainder (would be 64-bit modulo) */ 1650 /* use do_div to get remainder (would be 64-bit modulo) */
1654 if (do_div(start, sbi->s_stripe) == 0) { 1651 if (do_div(start, sbi->s_stripe) == 0) {
1655 ac->ac_found++; 1652 ac->ac_found++;
@@ -1803,8 +1800,8 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1803 BUG_ON(sbi->s_stripe == 0); 1800 BUG_ON(sbi->s_stripe == 0);
1804 1801
1805 /* find first stripe-aligned block in group */ 1802 /* find first stripe-aligned block in group */
1806 first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb) 1803 first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
1807 + le32_to_cpu(sbi->s_es->s_first_data_block); 1804
1808 a = first_group_block + sbi->s_stripe - 1; 1805 a = first_group_block + sbi->s_stripe - 1;
1809 do_div(a, sbi->s_stripe); 1806 do_div(a, sbi->s_stripe);
1810 i = (a * sbi->s_stripe) - first_group_block; 1807 i = (a * sbi->s_stripe) - first_group_block;
@@ -2256,7 +2253,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2256 2253
2257 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2254 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2258 init_rwsem(&meta_group_info[i]->alloc_sem); 2255 init_rwsem(&meta_group_info[i]->alloc_sem);
2259 meta_group_info[i]->bb_free_root.rb_node = NULL; 2256 meta_group_info[i]->bb_free_root = RB_ROOT;
2260 2257
2261 #ifdef DOUBLE_CHECK 2258 #ifdef DOUBLE_CHECK
2262 { 2259 {
@@ -2560,12 +2557,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2560 ext4_unlock_group(sb, entry->group); 2557 ext4_unlock_group(sb, entry->group);
2561 if (test_opt(sb, DISCARD)) { 2558 if (test_opt(sb, DISCARD)) {
2562 ext4_fsblk_t discard_block; 2559 ext4_fsblk_t discard_block;
2563 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
2564 2560
2565 discard_block = (ext4_fsblk_t)entry->group * 2561 discard_block = entry->start_blk +
2566 EXT4_BLOCKS_PER_GROUP(sb) 2562 ext4_group_first_block_no(sb, entry->group);
2567 + entry->start_blk
2568 + le32_to_cpu(es->s_first_data_block);
2569 trace_ext4_discard_blocks(sb, 2563 trace_ext4_discard_blocks(sb,
2570 (unsigned long long)discard_block, 2564 (unsigned long long)discard_block,
2571 entry->count); 2565 entry->count);
@@ -2703,14 +2697,11 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2703 if (err) 2697 if (err)
2704 goto out_err; 2698 goto out_err;
2705 2699
2706 block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb) 2700 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
2707 + ac->ac_b_ex.fe_start
2708 + le32_to_cpu(es->s_first_data_block);
2709 2701
2710 len = ac->ac_b_ex.fe_len; 2702 len = ac->ac_b_ex.fe_len;
2711 if (!ext4_data_block_valid(sbi, block, len)) { 2703 if (!ext4_data_block_valid(sbi, block, len)) {
2712 ext4_error(sb, __func__, 2704 ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2713 "Allocating blocks %llu-%llu which overlap "
2714 "fs metadata\n", block, block+len); 2705 "fs metadata\n", block, block+len);
2715 /* File system mounted not to panic on error 2706 /* File system mounted not to panic on error
2716 * Fix the bitmap and repeat the block allocation 2707 * Fix the bitmap and repeat the block allocation
@@ -3161,9 +3152,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3161 /* The max size of hash table is PREALLOC_TB_SIZE */ 3152 /* The max size of hash table is PREALLOC_TB_SIZE */
3162 order = PREALLOC_TB_SIZE - 1; 3153 order = PREALLOC_TB_SIZE - 1;
3163 3154
3164 goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + 3155 goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
3165 ac->ac_g_ex.fe_start +
3166 le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
3167 /* 3156 /*
3168 * search for the prealloc space that has the 3157 * search for the prealloc space that has the
3169 * minimal distance from the goal block. 3158 * minimal distance from the goal block.
@@ -3526,8 +3515,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3526 if (bit >= end) 3515 if (bit >= end)
3527 break; 3516 break;
3528 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3517 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3529 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + 3518 start = ext4_group_first_block_no(sb, group) + bit;
3530 le32_to_cpu(sbi->s_es->s_first_data_block);
3531 mb_debug(1, " free preallocated %u/%u in group %u\n", 3519 mb_debug(1, " free preallocated %u/%u in group %u\n",
3532 (unsigned) start, (unsigned) next - bit, 3520 (unsigned) start, (unsigned) next - bit,
3533 (unsigned) group); 3521 (unsigned) group);
@@ -3623,15 +3611,13 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3623 3611
3624 bitmap_bh = ext4_read_block_bitmap(sb, group); 3612 bitmap_bh = ext4_read_block_bitmap(sb, group);
3625 if (bitmap_bh == NULL) { 3613 if (bitmap_bh == NULL) {
3626 ext4_error(sb, __func__, "Error in reading block " 3614 ext4_error(sb, "Error reading block bitmap for %u", group);
3627 "bitmap for %u", group);
3628 return 0; 3615 return 0;
3629 } 3616 }
3630 3617
3631 err = ext4_mb_load_buddy(sb, group, &e4b); 3618 err = ext4_mb_load_buddy(sb, group, &e4b);
3632 if (err) { 3619 if (err) {
3633 ext4_error(sb, __func__, "Error in loading buddy " 3620 ext4_error(sb, "Error loading buddy information for %u", group);
3634 "information for %u", group);
3635 put_bh(bitmap_bh); 3621 put_bh(bitmap_bh);
3636 return 0; 3622 return 0;
3637 } 3623 }
@@ -3804,15 +3790,15 @@ repeat:
3804 3790
3805 err = ext4_mb_load_buddy(sb, group, &e4b); 3791 err = ext4_mb_load_buddy(sb, group, &e4b);
3806 if (err) { 3792 if (err) {
3807 ext4_error(sb, __func__, "Error in loading buddy " 3793 ext4_error(sb, "Error loading buddy information for %u",
3808 "information for %u", group); 3794 group);
3809 continue; 3795 continue;
3810 } 3796 }
3811 3797
3812 bitmap_bh = ext4_read_block_bitmap(sb, group); 3798 bitmap_bh = ext4_read_block_bitmap(sb, group);
3813 if (bitmap_bh == NULL) { 3799 if (bitmap_bh == NULL) {
3814 ext4_error(sb, __func__, "Error in reading block " 3800 ext4_error(sb, "Error reading block bitmap for %u",
3815 "bitmap for %u", group); 3801 group);
3816 ext4_mb_release_desc(&e4b); 3802 ext4_mb_release_desc(&e4b);
3817 continue; 3803 continue;
3818 } 3804 }
@@ -3938,7 +3924,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
3938 3924
3939 /* don't use group allocation for large files */ 3925 /* don't use group allocation for large files */
3940 size = max(size, isize); 3926 size = max(size, isize);
3941 if (size >= sbi->s_mb_stream_request) { 3927 if (size > sbi->s_mb_stream_request) {
3942 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 3928 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
3943 return; 3929 return;
3944 } 3930 }
@@ -4077,8 +4063,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4077 4063
4078 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4064 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
4079 if (ext4_mb_load_buddy(sb, group, &e4b)) { 4065 if (ext4_mb_load_buddy(sb, group, &e4b)) {
4080 ext4_error(sb, __func__, "Error in loading buddy " 4066 ext4_error(sb, "Error loading buddy information for %u",
4081 "information for %u", group); 4067 group);
4082 continue; 4068 continue;
4083 } 4069 }
4084 ext4_lock_group(sb, group); 4070 ext4_lock_group(sb, group);
@@ -4254,7 +4240,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4254 return 0; 4240 return 0;
4255 } 4241 }
4256 reserv_blks = ar->len; 4242 reserv_blks = ar->len;
4257 while (ar->len && vfs_dq_alloc_block(ar->inode, ar->len)) { 4243 while (ar->len && dquot_alloc_block(ar->inode, ar->len)) {
4258 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4244 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4259 ar->len--; 4245 ar->len--;
4260 } 4246 }
@@ -4331,7 +4317,7 @@ out2:
4331 kmem_cache_free(ext4_ac_cachep, ac); 4317 kmem_cache_free(ext4_ac_cachep, ac);
4332 out1: 4318 out1:
4333 if (inquota && ar->len < inquota) 4319 if (inquota && ar->len < inquota)
4334 vfs_dq_free_block(ar->inode, inquota - ar->len); 4320 dquot_free_block(ar->inode, inquota - ar->len);
4335 out3: 4321 out3:
4336 if (!ar->len) { 4322 if (!ar->len) {
4337 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4323 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
@@ -4476,10 +4462,10 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4476 4462
4477 sbi = EXT4_SB(sb); 4463 sbi = EXT4_SB(sb);
4478 es = EXT4_SB(sb)->s_es; 4464 es = EXT4_SB(sb)->s_es;
4479 if (!ext4_data_block_valid(sbi, block, count)) { 4465 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4480 ext4_error(sb, __func__, 4466 !ext4_data_block_valid(sbi, block, count)) {
4481 "Freeing blocks not in datazone - " 4467 ext4_error(sb, "Freeing blocks not in datazone - "
4482 "block = %llu, count = %lu", block, count); 4468 "block = %llu, count = %lu", block, count);
4483 goto error_return; 4469 goto error_return;
4484 } 4470 }
4485 4471
@@ -4547,8 +4533,7 @@ do_more:
4547 in_range(block + count - 1, ext4_inode_table(sb, gdp), 4533 in_range(block + count - 1, ext4_inode_table(sb, gdp),
4548 EXT4_SB(sb)->s_itb_per_group)) { 4534 EXT4_SB(sb)->s_itb_per_group)) {
4549 4535
4550 ext4_error(sb, __func__, 4536 ext4_error(sb, "Freeing blocks in system zone - "
4551 "Freeing blocks in system zone - "
4552 "Block = %llu, count = %lu", block, count); 4537 "Block = %llu, count = %lu", block, count);
4553 /* err = 0. ext4_std_error should be a no op */ 4538 /* err = 0. ext4_std_error should be a no op */
4554 goto error_return; 4539 goto error_return;
@@ -4646,7 +4631,7 @@ do_more:
4646 sb->s_dirt = 1; 4631 sb->s_dirt = 1;
4647 error_return: 4632 error_return:
4648 if (freed) 4633 if (freed)
4649 vfs_dq_free_block(inode, freed); 4634 dquot_free_block(inode, freed);
4650 brelse(bitmap_bh); 4635 brelse(bitmap_bh);
4651 ext4_std_error(sb, err); 4636 ext4_std_error(sb, err);
4652 if (ac) 4637 if (ac)
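
Note: the vfs_dq_* calls in this file become dquot_* calls as part of the VFS quota API consolidation. The retry loop in ext4_mb_new_blocks() keeps its shape: shrink the request until the quota charge succeeds or nothing is left to ask for. Annotated restatement of the hunks above (return convention assumed: nonzero from dquot_alloc_block() means the charge failed):

	/* Shrink the allocation request until quota admits it. */
	while (ar->len && dquot_alloc_block(ar->inode, ar->len)) {
		ar->flags |= EXT4_MB_HINT_NOPREALLOC;	/* don't over-reserve */
		ar->len--;				/* retry, one block less */
	}

	/* On the way out, refund whatever was charged but not used. */
	if (inquota && ar->len < inquota)
		dquot_free_block(ar->inode, inquota - ar->len);
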
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 436521cae456..b619322c76f0 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -220,16 +220,9 @@ struct ext4_buddy {
220 #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 220 #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
221 #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 221 #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
222 222
223 #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
224
225 static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 223 static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
226 struct ext4_free_extent *fex) 224 struct ext4_free_extent *fex)
227 { 225 {
228 ext4_fsblk_t block; 226 return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start;
229
230 block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb)
231 + fex->fe_start
232 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
233 return block;
234 } 227 }
235 #endif 228 #endif
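
Note: with the superblock term absorbed into ext4_group_first_block_no(), the inline above collapses to a single expression, and the local in_range() macro leaves this header (its remaining users presumably pick it up from a shared header). Usage sketch with made-up field values:

	/* (group, offset) pair -> absolute filesystem block number */
	struct ext4_free_extent fex = {
		.fe_group = 3,		/* block group */
		.fe_start = 100,	/* offset within the group */
	};
	ext4_fsblk_t block = ext4_grp_offs_to_block(sb, &fex);
	/* equivalent to ext4_group_first_block_no(sb, 3) + 100 */
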
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 81415814b00b..8b87bd0eac95 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -365,12 +365,12 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
365 * happened after we started the migrate. We need to 365 * happened after we started the migrate. We need to
366 * fail the migrate 366 * fail the migrate
367 */ 367 */
368 if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) { 368 if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) {
369 retval = -EAGAIN; 369 retval = -EAGAIN;
370 up_write(&EXT4_I(inode)->i_data_sem); 370 up_write(&EXT4_I(inode)->i_data_sem);
371 goto err_out; 371 goto err_out;
372 } else 372 } else
373 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; 373 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
374 /* 374 /*
375 * We have the extent map build with the tmp inode. 375 * We have the extent map build with the tmp inode.
376 * Now copy the i_data across 376 * Now copy the i_data across
@@ -503,14 +503,10 @@ int ext4_ext_migrate(struct inode *inode)
503 } 503 }
504 i_size_write(tmp_inode, i_size_read(inode)); 504 i_size_write(tmp_inode, i_size_read(inode));
505 /* 505 /*
506 * We don't want the inode to be reclaimed 506 * Set the i_nlink to zero so it will be deleted later
507 * if we got interrupted in between. We have 507 * when we drop inode reference.
508 * this tmp inode carrying reference to the
509 * data blocks of the original file. We set
510 * the i_nlink to zero at the last stage after
511 * switching the original file to extent format
512 */ 508 */
513 tmp_inode->i_nlink = 1; 509 tmp_inode->i_nlink = 0;
514 510
515 ext4_ext_tree_init(handle, tmp_inode); 511 ext4_ext_tree_init(handle, tmp_inode);
516 ext4_orphan_add(handle, tmp_inode); 512 ext4_orphan_add(handle, tmp_inode);
@@ -533,10 +529,20 @@ int ext4_ext_migrate(struct inode *inode)
533 * allocation. 529 * allocation.
534 */ 530 */
535 down_read((&EXT4_I(inode)->i_data_sem)); 531 down_read((&EXT4_I(inode)->i_data_sem));
536 EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE; 532 ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
537 up_read((&EXT4_I(inode)->i_data_sem)); 533 up_read((&EXT4_I(inode)->i_data_sem));
538 534
539 handle = ext4_journal_start(inode, 1); 535 handle = ext4_journal_start(inode, 1);
536 if (IS_ERR(handle)) {
537 /*
538 * It is impossible to update on-disk structures without
539 * a handle, so just roll back in-core changes and leave other
540 * work to orphan_list_cleanup()
541 */
542 ext4_orphan_del(NULL, tmp_inode);
543 retval = PTR_ERR(handle);
544 goto out;
545 }
540 546
541 ei = EXT4_I(inode); 547 ei = EXT4_I(inode);
542 i_data = ei->i_data; 548 i_data = ei->i_data;
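
Note: the added IS_ERR() check closes a path where a failed ext4_journal_start() result was previously used as a live handle. With no handle, nothing on disk can have changed, so the only in-core side effect so far (the orphan-list insertion) is undone and the error propagated. The pattern, commented:

	handle = ext4_journal_start(inode, 1);
	if (IS_ERR(handle)) {
		/* No handle: on-disk state is untouched. Undo the in-core
		 * orphan insertion (a NULL handle is valid for removal) and
		 * let orphan cleanup pick up anything left behind. */
		ext4_orphan_del(NULL, tmp_inode);
		retval = PTR_ERR(handle);
		goto out;
	}
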
@@ -618,15 +624,8 @@ err_out:
618 624
619 /* Reset the extent details */ 625 /* Reset the extent details */
620 ext4_ext_tree_init(handle, tmp_inode); 626 ext4_ext_tree_init(handle, tmp_inode);
621
622 /*
623 * Set the i_nlink to zero so that
624 * generic_drop_inode really deletes the
625 * inode
626 */
627 tmp_inode->i_nlink = 0;
628
629 ext4_journal_stop(handle); 627 ext4_journal_stop(handle);
628 out:
630 unlock_new_inode(tmp_inode); 629 unlock_new_inode(tmp_inode);
631 iput(tmp_inode); 630 iput(tmp_inode);
632 631
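
Note: together with the earlier hunk, the temporary inode now starts with i_nlink == 0 and stays on the orphan list for its whole lifetime, so the final iput() deletes it even if the migration is interrupted; previously i_nlink was only zeroed on the error path. Lifecycle sketch:

	tmp_inode->i_nlink = 0;			/* never linked into a directory */
	ext4_orphan_add(handle, tmp_inode);	/* crash-safe: recovery reaps it */
	/* ... build the extent tree, swap i_data into the real inode ... */
	unlock_new_inode(tmp_inode);
	iput(tmp_inode);			/* last reference: inode is freed */
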
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 82c415be87a4..aa5fe28d180f 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -152,12 +152,12 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2,
152 int ret = 0; 152 int ret = 0;
153 153
154 if (inode1 == NULL) { 154 if (inode1 == NULL) {
155 ext4_error(inode2->i_sb, function, 155 __ext4_error(inode2->i_sb, function,
156 "Both inodes should not be NULL: " 156 "Both inodes should not be NULL: "
157 "inode1 NULL inode2 %lu", inode2->i_ino); 157 "inode1 NULL inode2 %lu", inode2->i_ino);
158 ret = -EIO; 158 ret = -EIO;
159 } else if (inode2 == NULL) { 159 } else if (inode2 == NULL) {
160 ext4_error(inode1->i_sb, function, 160 __ext4_error(inode1->i_sb, function,
161 "Both inodes should not be NULL: " 161 "Both inodes should not be NULL: "
162 "inode1 %lu inode2 NULL", inode1->i_ino); 162 "inode1 %lu inode2 NULL", inode1->i_ino);
163 ret = -EIO; 163 ret = -EIO;
@@ -252,6 +252,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
252 } 252 }
253 253
254 o_start->ee_len = start_ext->ee_len; 254 o_start->ee_len = start_ext->ee_len;
255 eblock = le32_to_cpu(start_ext->ee_block);
255 new_flag = 1; 256 new_flag = 1;
256 257
257 } else if (start_ext->ee_len && new_ext->ee_len && 258 } else if (start_ext->ee_len && new_ext->ee_len &&
@@ -262,6 +263,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
262 * orig |------------------------------| 263 * orig |------------------------------|
263 */ 264 */
264 o_start->ee_len = start_ext->ee_len; 265 o_start->ee_len = start_ext->ee_len;
266 eblock = le32_to_cpu(start_ext->ee_block);
265 new_flag = 1; 267 new_flag = 1;
266 268
267 } else if (!start_ext->ee_len && new_ext->ee_len && 269 } else if (!start_ext->ee_len && new_ext->ee_len &&
@@ -475,7 +477,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
475 struct ext4_extent *oext, *o_start, *o_end, *prev_ext; 477 struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
476 struct ext4_extent new_ext, start_ext, end_ext; 478 struct ext4_extent new_ext, start_ext, end_ext;
477 ext4_lblk_t new_ext_end; 479 ext4_lblk_t new_ext_end;
478 ext4_fsblk_t new_phys_end;
479 int oext_alen, new_ext_alen, end_ext_alen; 480 int oext_alen, new_ext_alen, end_ext_alen;
480 int depth = ext_depth(orig_inode); 481 int depth = ext_depth(orig_inode);
481 int ret; 482 int ret;
@@ -489,7 +490,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
489 new_ext.ee_len = dext->ee_len; 490 new_ext.ee_len = dext->ee_len;
490 new_ext_alen = ext4_ext_get_actual_len(&new_ext); 491 new_ext_alen = ext4_ext_get_actual_len(&new_ext);
491 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; 492 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
492 new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1;
493 493
494 /* 494 /*
495 * Case: original extent is first 495 * Case: original extent is first
@@ -502,6 +502,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
502 le32_to_cpu(oext->ee_block) + oext_alen) { 502 le32_to_cpu(oext->ee_block) + oext_alen) {
503 start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - 503 start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
504 le32_to_cpu(oext->ee_block)); 504 le32_to_cpu(oext->ee_block));
505 start_ext.ee_block = oext->ee_block;
505 copy_extent_status(oext, &start_ext); 506 copy_extent_status(oext, &start_ext);
506 } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { 507 } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
507 prev_ext = oext - 1; 508 prev_ext = oext - 1;
@@ -515,6 +516,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
515 start_ext.ee_len = cpu_to_le16( 516 start_ext.ee_len = cpu_to_le16(
516 ext4_ext_get_actual_len(prev_ext) + 517 ext4_ext_get_actual_len(prev_ext) +
517 new_ext_alen); 518 new_ext_alen);
519 start_ext.ee_block = oext->ee_block;
518 copy_extent_status(prev_ext, &start_ext); 520 copy_extent_status(prev_ext, &start_ext);
519 new_ext.ee_len = 0; 521 new_ext.ee_len = 0;
520 } 522 }
@@ -526,7 +528,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
526 * new_ext |-------| 528 * new_ext |-------|
527 */ 529 */
528 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { 530 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
529 ext4_error(orig_inode->i_sb, __func__, 531 ext4_error(orig_inode->i_sb,
530 "new_ext_end(%u) should be less than or equal to " 532 "new_ext_end(%u) should be less than or equal to "
531 "oext->ee_block(%u) + oext_alen(%d) - 1", 533 "oext->ee_block(%u) + oext_alen(%d) - 1",
532 new_ext_end, le32_to_cpu(oext->ee_block), 534 new_ext_end, le32_to_cpu(oext->ee_block),
@@ -689,12 +691,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
689 while (1) { 691 while (1) {
690 /* The extent for donor must be found. */ 692 /* The extent for donor must be found. */
691 if (!dext) { 693 if (!dext) {
692 ext4_error(donor_inode->i_sb, __func__, 694 ext4_error(donor_inode->i_sb,
693 "The extent for donor must be found"); 695 "The extent for donor must be found");
694 *err = -EIO; 696 *err = -EIO;
695 goto out; 697 goto out;
696 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { 698 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
697 ext4_error(donor_inode->i_sb, __func__, 699 ext4_error(donor_inode->i_sb,
698 "Donor offset(%u) and the first block of donor " 700 "Donor offset(%u) and the first block of donor "
699 "extent(%u) should be equal", 701 "extent(%u) should be equal",
700 donor_off, 702 donor_off,
@@ -928,7 +930,7 @@ out2:
928 } 930 }
929 931
930 /** 932 /**
931 * mext_check_argumants - Check whether move extent can be done 933 * mext_check_arguments - Check whether move extent can be done
932 * 934 *
933 * @orig_inode: original inode 935 * @orig_inode: original inode
934 * @donor_inode: donor inode 936 * @donor_inode: donor inode
@@ -949,14 +951,6 @@ mext_check_arguments(struct inode *orig_inode,
949 unsigned int blkbits = orig_inode->i_blkbits; 951 unsigned int blkbits = orig_inode->i_blkbits;
950 unsigned int blocksize = 1 << blkbits; 952 unsigned int blocksize = 1 << blkbits;
951 953
952 /* Regular file check */
953 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
954 ext4_debug("ext4 move extent: The argument files should be "
955 "regular file [ino:orig %lu, donor %lu]\n",
956 orig_inode->i_ino, donor_inode->i_ino);
957 return -EINVAL;
958 }
959
960 if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { 954 if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
961 ext4_debug("ext4 move extent: suid or sgid is set" 955 ext4_debug("ext4 move extent: suid or sgid is set"
962 " to donor file [ino:orig %lu, donor %lu]\n", 956 " to donor file [ino:orig %lu, donor %lu]\n",
@@ -1204,6 +1198,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1204 return -EINVAL; 1198 return -EINVAL;
1205 } 1199 }
1206 1200
1201 /* Regular file check */
1202 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
1203 ext4_debug("ext4 move extent: The argument files should be "
1204 "regular file [ino:orig %lu, donor %lu]\n",
1205 orig_inode->i_ino, donor_inode->i_ino);
1206 return -EINVAL;
1207 }
1208
1207 /* Protect orig and donor inodes against a truncate */ 1209 /* Protect orig and donor inodes against a truncate */
1208 ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1210 ret1 = mext_inode_double_lock(orig_inode, donor_inode);
1209 if (ret1 < 0) 1211 if (ret1 < 0)
@@ -1351,7 +1353,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1351 if (ret1 < 0) 1353 if (ret1 < 0)
1352 break; 1354 break;
1353 if (*moved_len > len) { 1355 if (*moved_len > len) {
1354 ext4_error(orig_inode->i_sb, __func__, 1356 ext4_error(orig_inode->i_sb,
1355 "We replaced blocks too much! " 1357 "We replaced blocks too much! "
1356 "sum of replaced: %llu requested: %llu", 1358 "sum of replaced: %llu requested: %llu",
1357 *moved_len, len); 1359 *moved_len, len);
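
Note: throughout this series ext4_error() and ext4_warning() lose their explicit __func__ argument; the calling function's name is captured at the call site by a macro, and code that forwards a caller-supplied name, as mext_check_null_inode() does above, calls the double-underscore variant directly. The assumed arrangement, sketched (the real declarations live in ext4.h):

	extern void __ext4_error(struct super_block *sb, const char *function,
				 const char *fmt, ...);

	#define ext4_error(sb, fmt, ...) \
		__ext4_error((sb), __func__, (fmt), ##__VA_ARGS__)
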
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 17a17e10dd60..0c070fabd108 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -383,8 +383,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
383 if (root->info.hash_version != DX_HASH_TEA && 383 if (root->info.hash_version != DX_HASH_TEA &&
384 root->info.hash_version != DX_HASH_HALF_MD4 && 384 root->info.hash_version != DX_HASH_HALF_MD4 &&
385 root->info.hash_version != DX_HASH_LEGACY) { 385 root->info.hash_version != DX_HASH_LEGACY) {
386 ext4_warning(dir->i_sb, __func__, 386 ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
387 "Unrecognised inode hash code %d",
388 root->info.hash_version); 387 root->info.hash_version);
389 brelse(bh); 388 brelse(bh);
390 *err = ERR_BAD_DX_DIR; 389 *err = ERR_BAD_DX_DIR;
@@ -399,8 +398,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
399 hash = hinfo->hash; 398 hash = hinfo->hash;
400 399
401 if (root->info.unused_flags & 1) { 400 if (root->info.unused_flags & 1) {
402 ext4_warning(dir->i_sb, __func__, 401 ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
403 "Unimplemented inode hash flags: %#06x",
404 root->info.unused_flags); 402 root->info.unused_flags);
405 brelse(bh); 403 brelse(bh);
406 *err = ERR_BAD_DX_DIR; 404 *err = ERR_BAD_DX_DIR;
@@ -408,8 +406,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
408 } 406 }
409 407
410 if ((indirect = root->info.indirect_levels) > 1) { 408 if ((indirect = root->info.indirect_levels) > 1) {
411 ext4_warning(dir->i_sb, __func__, 409 ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
412 "Unimplemented inode hash depth: %#06x",
413 root->info.indirect_levels); 410 root->info.indirect_levels);
414 brelse(bh); 411 brelse(bh);
415 *err = ERR_BAD_DX_DIR; 412 *err = ERR_BAD_DX_DIR;
@@ -421,8 +418,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
421 418
422 if (dx_get_limit(entries) != dx_root_limit(dir, 419 if (dx_get_limit(entries) != dx_root_limit(dir,
423 root->info.info_length)) { 420 root->info.info_length)) {
424 ext4_warning(dir->i_sb, __func__, 421 ext4_warning(dir->i_sb, "dx entry: limit != root limit");
425 "dx entry: limit != root limit");
426 brelse(bh); 422 brelse(bh);
427 *err = ERR_BAD_DX_DIR; 423 *err = ERR_BAD_DX_DIR;
428 goto fail; 424 goto fail;
@@ -433,7 +429,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
433 { 429 {
434 count = dx_get_count(entries); 430 count = dx_get_count(entries);
435 if (!count || count > dx_get_limit(entries)) { 431 if (!count || count > dx_get_limit(entries)) {
436 ext4_warning(dir->i_sb, __func__, 432 ext4_warning(dir->i_sb,
437 "dx entry: no count or count > limit"); 433 "dx entry: no count or count > limit");
438 brelse(bh); 434 brelse(bh);
439 *err = ERR_BAD_DX_DIR; 435 *err = ERR_BAD_DX_DIR;
@@ -478,7 +474,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
478 goto fail2; 474 goto fail2;
479 at = entries = ((struct dx_node *) bh->b_data)->entries; 475 at = entries = ((struct dx_node *) bh->b_data)->entries;
480 if (dx_get_limit(entries) != dx_node_limit (dir)) { 476 if (dx_get_limit(entries) != dx_node_limit (dir)) {
481 ext4_warning(dir->i_sb, __func__, 477 ext4_warning(dir->i_sb,
482 "dx entry: limit != node limit"); 478 "dx entry: limit != node limit");
483 brelse(bh); 479 brelse(bh);
484 *err = ERR_BAD_DX_DIR; 480 *err = ERR_BAD_DX_DIR;
@@ -494,7 +490,7 @@ fail2:
494 } 490 }
495 fail: 491 fail:
496 if (*err == ERR_BAD_DX_DIR) 492 if (*err == ERR_BAD_DX_DIR)
497 ext4_warning(dir->i_sb, __func__, 493 ext4_warning(dir->i_sb,
498 "Corrupt dir inode %ld, running e2fsck is " 494 "Corrupt dir inode %ld, running e2fsck is "
499 "recommended.", dir->i_ino); 495 "recommended.", dir->i_ino);
500 return NULL; 496 return NULL;
@@ -947,9 +943,8 @@ restart:
947 wait_on_buffer(bh); 943 wait_on_buffer(bh);
948 if (!buffer_uptodate(bh)) { 944 if (!buffer_uptodate(bh)) {
949 /* read error, skip block & hope for the best */ 945 /* read error, skip block & hope for the best */
950 ext4_error(sb, __func__, "reading directory #%lu " 946 ext4_error(sb, "reading directory #%lu offset %lu",
951 "offset %lu", dir->i_ino, 947 dir->i_ino, (unsigned long)block);
952 (unsigned long)block);
953 brelse(bh); 948 brelse(bh);
954 goto next; 949 goto next;
955 } 950 }
@@ -1041,7 +1036,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
1041 retval = ext4_htree_next_block(dir, hash, frame, 1036 retval = ext4_htree_next_block(dir, hash, frame,
1042 frames, NULL); 1037 frames, NULL);
1043 if (retval < 0) { 1038 if (retval < 0) {
1044 ext4_warning(sb, __func__, 1039 ext4_warning(sb,
1045 "error reading index page in directory #%lu", 1040 "error reading index page in directory #%lu",
1046 dir->i_ino); 1041 dir->i_ino);
1047 *err = retval; 1042 *err = retval;
@@ -1071,14 +1066,13 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1071 __u32 ino = le32_to_cpu(de->inode); 1066 __u32 ino = le32_to_cpu(de->inode);
1072 brelse(bh); 1067 brelse(bh);
1073 if (!ext4_valid_inum(dir->i_sb, ino)) { 1068 if (!ext4_valid_inum(dir->i_sb, ino)) {
1074 ext4_error(dir->i_sb, "ext4_lookup", 1069 ext4_error(dir->i_sb, "bad inode number: %u", ino);
1075 "bad inode number: %u", ino);
1076 return ERR_PTR(-EIO); 1070 return ERR_PTR(-EIO);
1077 } 1071 }
1078 inode = ext4_iget(dir->i_sb, ino); 1072 inode = ext4_iget(dir->i_sb, ino);
1079 if (unlikely(IS_ERR(inode))) { 1073 if (unlikely(IS_ERR(inode))) {
1080 if (PTR_ERR(inode) == -ESTALE) { 1074 if (PTR_ERR(inode) == -ESTALE) {
1081 ext4_error(dir->i_sb, __func__, 1075 ext4_error(dir->i_sb,
1082 "deleted inode referenced: %u", 1076 "deleted inode referenced: %u",
1083 ino); 1077 ino);
1084 return ERR_PTR(-EIO); 1078 return ERR_PTR(-EIO);
@@ -1110,7 +1104,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
1110 brelse(bh); 1104 brelse(bh);
1111 1105
1112 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { 1106 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1113 ext4_error(child->d_inode->i_sb, "ext4_get_parent", 1107 ext4_error(child->d_inode->i_sb,
1114 "bad inode number: %u", ino); 1108 "bad inode number: %u", ino);
1115 return ERR_PTR(-EIO); 1109 return ERR_PTR(-EIO);
1116 } 1110 }
@@ -1410,7 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1410 de = (struct ext4_dir_entry_2 *)((char *)fde + 1404 de = (struct ext4_dir_entry_2 *)((char *)fde +
1411 ext4_rec_len_from_disk(fde->rec_len, blocksize)); 1405 ext4_rec_len_from_disk(fde->rec_len, blocksize));
1412 if ((char *) de >= (((char *) root) + blocksize)) { 1406 if ((char *) de >= (((char *) root) + blocksize)) {
1413 ext4_error(dir->i_sb, __func__, 1407 ext4_error(dir->i_sb,
1414 "invalid rec_len for '..' in inode %lu", 1408 "invalid rec_len for '..' in inode %lu",
1415 dir->i_ino); 1409 dir->i_ino);
1416 brelse(bh); 1410 brelse(bh);
@@ -1575,8 +1569,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1575 1569
1576 if (levels && (dx_get_count(frames->entries) == 1570 if (levels && (dx_get_count(frames->entries) ==
1577 dx_get_limit(frames->entries))) { 1571 dx_get_limit(frames->entries))) {
1578 ext4_warning(sb, __func__, 1572 ext4_warning(sb, "Directory index full!");
1579 "Directory index full!");
1580 err = -ENOSPC; 1573 err = -ENOSPC;
1581 goto cleanup; 1574 goto cleanup;
1582 } 1575 }
@@ -1766,6 +1759,8 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
1766 struct inode *inode; 1759 struct inode *inode;
1767 int err, retries = 0; 1760 int err, retries = 0;
1768 1761
1762 dquot_initialize(dir);
1763
1769 retry: 1764 retry:
1770 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1765 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1771 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1766 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1800,6 +1795,8 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
1800 if (!new_valid_dev(rdev)) 1795 if (!new_valid_dev(rdev))
1801 return -EINVAL; 1796 return -EINVAL;
1802 1797
1798 dquot_initialize(dir);
1799
1803 retry: 1800 retry:
1804 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1801 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1805 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1802 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1837,6 +1834,8 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1837 if (EXT4_DIR_LINK_MAX(dir)) 1834 if (EXT4_DIR_LINK_MAX(dir))
1838 return -EMLINK; 1835 return -EMLINK;
1839 1836
1837 dquot_initialize(dir);
1838
1840 retry: 1839 retry:
1841 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1840 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1842 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1841 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1916,11 +1915,11 @@ static int empty_dir(struct inode *inode)
1916 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || 1915 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
1917 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { 1916 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
1918 if (err) 1917 if (err)
1919 ext4_error(inode->i_sb, __func__, 1918 ext4_error(inode->i_sb,
1920 "error %d reading directory #%lu offset 0", 1919 "error %d reading directory #%lu offset 0",
1921 err, inode->i_ino); 1920 err, inode->i_ino);
1922 else 1921 else
1923 ext4_warning(inode->i_sb, __func__, 1922 ext4_warning(inode->i_sb,
1924 "bad directory (dir #%lu) - no data block", 1923 "bad directory (dir #%lu) - no data block",
1925 inode->i_ino); 1924 inode->i_ino);
1926 return 1; 1925 return 1;
@@ -1931,7 +1930,7 @@ static int empty_dir(struct inode *inode)
1931 !le32_to_cpu(de1->inode) || 1930 !le32_to_cpu(de1->inode) ||
1932 strcmp(".", de->name) || 1931 strcmp(".", de->name) ||
1933 strcmp("..", de1->name)) { 1932 strcmp("..", de1->name)) {
1934 ext4_warning(inode->i_sb, "empty_dir", 1933 ext4_warning(inode->i_sb,
1935 "bad directory (dir #%lu) - no `.' or `..'", 1934 "bad directory (dir #%lu) - no `.' or `..'",
1936 inode->i_ino); 1935 inode->i_ino);
1937 brelse(bh); 1936 brelse(bh);
@@ -1949,7 +1948,7 @@ static int empty_dir(struct inode *inode)
1949 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); 1948 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
1950 if (!bh) { 1949 if (!bh) {
1951 if (err) 1950 if (err)
1952 ext4_error(sb, __func__, 1951 ext4_error(sb,
1953 "error %d reading directory" 1952 "error %d reading directory"
1954 " #%lu offset %u", 1953 " #%lu offset %u",
1955 err, inode->i_ino, offset); 1954 err, inode->i_ino, offset);
@@ -2020,11 +2019,18 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
2020 err = ext4_reserve_inode_write(handle, inode, &iloc); 2019 err = ext4_reserve_inode_write(handle, inode, &iloc);
2021 if (err) 2020 if (err)
2022 goto out_unlock; 2021 goto out_unlock;
2022 /*
2023 * Due to previous errors the inode may already be part of the on-disk
2024 * orphan list. If so, skip the on-disk list modification.
2025 */
2026 if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
2027 (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
2028 goto mem_insert;
2023 2029
2024 /* Insert this inode at the head of the on-disk orphan list... */ 2030 /* Insert this inode at the head of the on-disk orphan list... */
2025 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); 2031 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
2026 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); 2032 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
2027 err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh); 2033 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
2028 rc = ext4_mark_iloc_dirty(handle, inode, &iloc); 2034 rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
2029 if (!err) 2035 if (!err)
2030 err = rc; 2036 err = rc;
@@ -2037,6 +2043,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
2037 * 2043 *
2038 * This is safe: on error we're going to ignore the orphan list 2044 * This is safe: on error we're going to ignore the orphan list
2039 * anyway on the next recovery. */ 2045 * anyway on the next recovery. */
2046 mem_insert:
2040 if (!err) 2047 if (!err)
2041 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); 2048 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
2042 2049
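
Note: the new guard keeps an inode from being linked into the on-disk orphan chain twice after an earlier error, which would cycle the list: if NEXT_ORPHAN() already holds a plausible inode number, only the in-memory list_add() is redone. Condensed:

	/* Already on the on-disk chain? Only redo the in-memory insert. */
	if (NEXT_ORPHAN(inode) &&
	    NEXT_ORPHAN(inode) <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
		goto mem_insert;	/* skip relinking through the superblock */
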
@@ -2096,7 +2103,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2096 if (err) 2103 if (err)
2097 goto out_brelse; 2104 goto out_brelse;
2098 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); 2105 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
2099 err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh); 2106 err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
2100 } else { 2107 } else {
2101 struct ext4_iloc iloc2; 2108 struct ext4_iloc iloc2;
2102 struct inode *i_prev = 2109 struct inode *i_prev =
@@ -2136,7 +2143,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2136 2143
2137 /* Initialize quotas before so that eventual writes go in 2144 /* Initialize quotas before so that eventual writes go in
2138 * separate transaction */ 2145 * separate transaction */
2139 vfs_dq_init(dentry->d_inode); 2146 dquot_initialize(dir);
2147 dquot_initialize(dentry->d_inode);
2148
2140 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); 2149 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2141 if (IS_ERR(handle)) 2150 if (IS_ERR(handle))
2142 return PTR_ERR(handle); 2151 return PTR_ERR(handle);
@@ -2163,7 +2172,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2163 if (retval) 2172 if (retval)
2164 goto end_rmdir; 2173 goto end_rmdir;
2165 if (!EXT4_DIR_LINK_EMPTY(inode)) 2174 if (!EXT4_DIR_LINK_EMPTY(inode))
2166 ext4_warning(inode->i_sb, "ext4_rmdir", 2175 ext4_warning(inode->i_sb,
2167 "empty directory has too many links (%d)", 2176 "empty directory has too many links (%d)",
2168 inode->i_nlink); 2177 inode->i_nlink);
2169 inode->i_version++; 2178 inode->i_version++;
@@ -2195,7 +2204,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2195 2204
2196 /* Initialize quotas before so that eventual writes go 2205 /* Initialize quotas before so that eventual writes go
2197 * in separate transaction */ 2206 * in separate transaction */
2198 vfs_dq_init(dentry->d_inode); 2207 dquot_initialize(dir);
2208 dquot_initialize(dentry->d_inode);
2209
2199 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); 2210 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2200 if (IS_ERR(handle)) 2211 if (IS_ERR(handle))
2201 return PTR_ERR(handle); 2212 return PTR_ERR(handle);
@@ -2215,7 +2226,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2215 goto end_unlink; 2226 goto end_unlink;
2216 2227
2217 if (!inode->i_nlink) { 2228 if (!inode->i_nlink) {
2218 ext4_warning(inode->i_sb, "ext4_unlink", 2229 ext4_warning(inode->i_sb,
2219 "Deleting nonexistent file (%lu), %d", 2230 "Deleting nonexistent file (%lu), %d",
2220 inode->i_ino, inode->i_nlink); 2231 inode->i_ino, inode->i_nlink);
2221 inode->i_nlink = 1; 2232 inode->i_nlink = 1;
@@ -2250,6 +2261,8 @@ static int ext4_symlink(struct inode *dir,
2250 if (l > dir->i_sb->s_blocksize) 2261 if (l > dir->i_sb->s_blocksize)
2251 return -ENAMETOOLONG; 2262 return -ENAMETOOLONG;
2252 2263
2264 dquot_initialize(dir);
2265
2253retry: 2266retry:
2254 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2267 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2255 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + 2268 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
@@ -2308,6 +2321,8 @@ static int ext4_link(struct dentry *old_dentry,
2308 if (inode->i_nlink >= EXT4_LINK_MAX) 2321 if (inode->i_nlink >= EXT4_LINK_MAX)
2309 return -EMLINK; 2322 return -EMLINK;
2310 2323
2324 dquot_initialize(dir);
2325
2311 /* 2326 /*
2312 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing 2327 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2313 * otherwise has the potential to corrupt the orphan inode list. 2328 * otherwise has the potential to corrupt the orphan inode list.
@@ -2358,12 +2373,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2358 struct ext4_dir_entry_2 *old_de, *new_de; 2373 struct ext4_dir_entry_2 *old_de, *new_de;
2359 int retval, force_da_alloc = 0; 2374 int retval, force_da_alloc = 0;
2360 2375
2376 dquot_initialize(old_dir);
2377 dquot_initialize(new_dir);
2378
2361 old_bh = new_bh = dir_bh = NULL; 2379 old_bh = new_bh = dir_bh = NULL;
2362 2380
2363 /* Initialize quotas before so that eventual writes go 2381 /* Initialize quotas before so that eventual writes go
2364 * in separate transaction */ 2382 * in separate transaction */
2365 if (new_dentry->d_inode) 2383 if (new_dentry->d_inode)
2366 vfs_dq_init(new_dentry->d_inode); 2384 dquot_initialize(new_dentry->d_inode);
2367 handle = ext4_journal_start(old_dir, 2 * 2385 handle = ext4_journal_start(old_dir, 2 *
2368 EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + 2386 EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2369 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); 2387 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
@@ -2462,7 +2480,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2462 } 2480 }
2463 } 2481 }
2464 if (retval) { 2482 if (retval) {
2465 ext4_warning(old_dir->i_sb, "ext4_rename", 2483 ext4_warning(old_dir->i_sb,
2466 "Deleting old file (%lu), %d, error=%d", 2484 "Deleting old file (%lu), %d, error=%d",
2467 old_dir->i_ino, old_dir->i_nlink, retval); 2485 old_dir->i_ino, old_dir->i_nlink, retval);
2468 } 2486 }
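
Note: every namespace operation in this file now calls dquot_initialize() up front for each inode it may charge, replacing the narrower vfs_dq_init() calls; as the retained comments say, this keeps quota-file writes in a transaction separate from the directory update. Ordering for the unlink/rmdir case:

	/* Quota init first: it may itself read and write the quota file. */
	dquot_initialize(dir);
	dquot_initialize(dentry->d_inode);

	/* Only then start this operation's own transaction. */
	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
	if (IS_ERR(handle))
		return PTR_ERR(handle);
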
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3b2c5541d8a6..5692c48754a0 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -48,65 +48,54 @@ static int verify_group_input(struct super_block *sb,
48 48
49 ext4_get_group_no_and_offset(sb, start, NULL, &offset); 49 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
50 if (group != sbi->s_groups_count) 50 if (group != sbi->s_groups_count)
51 ext4_warning(sb, __func__, 51 ext4_warning(sb, "Cannot add at group %u (only %u groups)",
52 "Cannot add at group %u (only %u groups)",
53 input->group, sbi->s_groups_count); 52 input->group, sbi->s_groups_count);
54 else if (offset != 0) 53 else if (offset != 0)
55 ext4_warning(sb, __func__, "Last group not full"); 54 ext4_warning(sb, "Last group not full");
56 else if (input->reserved_blocks > input->blocks_count / 5) 55 else if (input->reserved_blocks > input->blocks_count / 5)
57 ext4_warning(sb, __func__, "Reserved blocks too high (%u)", 56 ext4_warning(sb, "Reserved blocks too high (%u)",
58 input->reserved_blocks); 57 input->reserved_blocks);
59 else if (free_blocks_count < 0) 58 else if (free_blocks_count < 0)
60 ext4_warning(sb, __func__, "Bad blocks count %u", 59 ext4_warning(sb, "Bad blocks count %u",
61 input->blocks_count); 60 input->blocks_count);
62 else if (!(bh = sb_bread(sb, end - 1))) 61 else if (!(bh = sb_bread(sb, end - 1)))
63 ext4_warning(sb, __func__, 62 ext4_warning(sb, "Cannot read last block (%llu)",
64 "Cannot read last block (%llu)",
65 end - 1); 63 end - 1);
66 else if (outside(input->block_bitmap, start, end)) 64 else if (outside(input->block_bitmap, start, end))
67 ext4_warning(sb, __func__, 65 ext4_warning(sb, "Block bitmap not in group (block %llu)",
68 "Block bitmap not in group (block %llu)",
69 (unsigned long long)input->block_bitmap); 66 (unsigned long long)input->block_bitmap);
70 else if (outside(input->inode_bitmap, start, end)) 67 else if (outside(input->inode_bitmap, start, end))
71 ext4_warning(sb, __func__, 68 ext4_warning(sb, "Inode bitmap not in group (block %llu)",
72 "Inode bitmap not in group (block %llu)",
73 (unsigned long long)input->inode_bitmap); 69 (unsigned long long)input->inode_bitmap);
74 else if (outside(input->inode_table, start, end) || 70 else if (outside(input->inode_table, start, end) ||
75 outside(itend - 1, start, end)) 71 outside(itend - 1, start, end))
76 ext4_warning(sb, __func__, 72 ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)",
77 "Inode table not in group (blocks %llu-%llu)",
78 (unsigned long long)input->inode_table, itend - 1); 73 (unsigned long long)input->inode_table, itend - 1);
79 else if (input->inode_bitmap == input->block_bitmap) 74 else if (input->inode_bitmap == input->block_bitmap)
80 ext4_warning(sb, __func__, 75 ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)",
81 "Block bitmap same as inode bitmap (%llu)",
82 (unsigned long long)input->block_bitmap); 76 (unsigned long long)input->block_bitmap);
83 else if (inside(input->block_bitmap, input->inode_table, itend)) 77 else if (inside(input->block_bitmap, input->inode_table, itend))
84 ext4_warning(sb, __func__, 78 ext4_warning(sb, "Block bitmap (%llu) in inode table "
85 "Block bitmap (%llu) in inode table (%llu-%llu)", 79 "(%llu-%llu)",
86 (unsigned long long)input->block_bitmap, 80 (unsigned long long)input->block_bitmap,
87 (unsigned long long)input->inode_table, itend - 1); 81 (unsigned long long)input->inode_table, itend - 1);
88 else if (inside(input->inode_bitmap, input->inode_table, itend)) 82 else if (inside(input->inode_bitmap, input->inode_table, itend))
89 ext4_warning(sb, __func__, 83 ext4_warning(sb, "Inode bitmap (%llu) in inode table "
90 "Inode bitmap (%llu) in inode table (%llu-%llu)", 84 "(%llu-%llu)",
91 (unsigned long long)input->inode_bitmap, 85 (unsigned long long)input->inode_bitmap,
92 (unsigned long long)input->inode_table, itend - 1); 86 (unsigned long long)input->inode_table, itend - 1);
93 else if (inside(input->block_bitmap, start, metaend)) 87 else if (inside(input->block_bitmap, start, metaend))
94 ext4_warning(sb, __func__, 88 ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)",
95 "Block bitmap (%llu) in GDT table"
96 " (%llu-%llu)",
97 (unsigned long long)input->block_bitmap, 89 (unsigned long long)input->block_bitmap,
98 start, metaend - 1); 90 start, metaend - 1);
99 else if (inside(input->inode_bitmap, start, metaend)) 91 else if (inside(input->inode_bitmap, start, metaend))
100 ext4_warning(sb, __func__, 92 ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)",
101 "Inode bitmap (%llu) in GDT table"
102 " (%llu-%llu)",
103 (unsigned long long)input->inode_bitmap, 93 (unsigned long long)input->inode_bitmap,
104 start, metaend - 1); 94 start, metaend - 1);
105 else if (inside(input->inode_table, start, metaend) || 95 else if (inside(input->inode_table, start, metaend) ||
106 inside(itend - 1, start, metaend)) 96 inside(itend - 1, start, metaend))
107 ext4_warning(sb, __func__, 97 ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table "
108 "Inode table (%llu-%llu) overlaps" 98 "(%llu-%llu)",
109 "GDT table (%llu-%llu)",
110 (unsigned long long)input->inode_table, 99 (unsigned long long)input->inode_table,
111 itend - 1, start, metaend - 1); 100 itend - 1, start, metaend - 1);
112 else 101 else
@@ -364,8 +353,7 @@ static int verify_reserved_gdb(struct super_block *sb,
364 while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) { 353 while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) {
365 if (le32_to_cpu(*p++) != 354 if (le32_to_cpu(*p++) !=
366 grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){ 355 grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){
367 ext4_warning(sb, __func__, 356 ext4_warning(sb, "reserved GDT %llu"
368 "reserved GDT %llu"
369 " missing grp %d (%llu)", 357 " missing grp %d (%llu)",
370 blk, grp, 358 blk, grp,
371 grp * 359 grp *
@@ -420,8 +408,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
420 */ 408 */
421 if (EXT4_SB(sb)->s_sbh->b_blocknr != 409 if (EXT4_SB(sb)->s_sbh->b_blocknr !=
422 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { 410 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
423 ext4_warning(sb, __func__, 411 ext4_warning(sb, "won't resize using backup superblock at %llu",
424 "won't resize using backup superblock at %llu",
425 (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); 412 (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
426 return -EPERM; 413 return -EPERM;
427 } 414 }
@@ -444,8 +431,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
444 431
445 data = (__le32 *)dind->b_data; 432 data = (__le32 *)dind->b_data;
446 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { 433 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
447 ext4_warning(sb, __func__, 434 ext4_warning(sb, "new group %u GDT block %llu not reserved",
448 "new group %u GDT block %llu not reserved",
449 input->group, gdblock); 435 input->group, gdblock);
450 err = -EINVAL; 436 err = -EINVAL;
451 goto exit_dind; 437 goto exit_dind;
@@ -468,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
468 GFP_NOFS); 454 GFP_NOFS);
469 if (!n_group_desc) { 455 if (!n_group_desc) {
470 err = -ENOMEM; 456 err = -ENOMEM;
471 ext4_warning(sb, __func__, 457 ext4_warning(sb,
472 "not enough memory for %lu groups", gdb_num + 1); 458 "not enough memory for %lu groups", gdb_num + 1);
473 goto exit_inode; 459 goto exit_inode;
474 } 460 }
@@ -567,8 +553,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
567 /* Get each reserved primary GDT block and verify it holds backups */ 553 /* Get each reserved primary GDT block and verify it holds backups */
568 for (res = 0; res < reserved_gdb; res++, blk++) { 554 for (res = 0; res < reserved_gdb; res++, blk++) {
569 if (le32_to_cpu(*data) != blk) { 555 if (le32_to_cpu(*data) != blk) {
570 ext4_warning(sb, __func__, 556 ext4_warning(sb, "reserved block %llu"
571 "reserved block %llu"
572 " not at offset %ld", 557 " not at offset %ld",
573 blk, 558 blk,
574 (long)(data - (__le32 *)dind->b_data)); 559 (long)(data - (__le32 *)dind->b_data));
@@ -713,8 +698,7 @@ static void update_backups(struct super_block *sb,
713 */ 698 */
714 exit_err: 699 exit_err:
715 if (err) { 700 if (err) {
716 ext4_warning(sb, __func__, 701 ext4_warning(sb, "can't update backup for group %u (err %d), "
717 "can't update backup for group %u (err %d), "
718 "forcing fsck on next reboot", group, err); 702 "forcing fsck on next reboot", group, err);
719 sbi->s_mount_state &= ~EXT4_VALID_FS; 703 sbi->s_mount_state &= ~EXT4_VALID_FS;
720 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); 704 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -753,20 +737,19 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
753 737
754 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, 738 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
755 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { 739 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
756 ext4_warning(sb, __func__, 740 ext4_warning(sb, "Can't resize non-sparse filesystem further");
757 "Can't resize non-sparse filesystem further");
758 return -EPERM; 741 return -EPERM;
759 } 742 }
760 743
761 if (ext4_blocks_count(es) + input->blocks_count < 744 if (ext4_blocks_count(es) + input->blocks_count <
762 ext4_blocks_count(es)) { 745 ext4_blocks_count(es)) {
763 ext4_warning(sb, __func__, "blocks_count overflow"); 746 ext4_warning(sb, "blocks_count overflow");
764 return -EINVAL; 747 return -EINVAL;
765 } 748 }
766 749
767 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < 750 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
768 le32_to_cpu(es->s_inodes_count)) { 751 le32_to_cpu(es->s_inodes_count)) {
769 ext4_warning(sb, __func__, "inodes_count overflow"); 752 ext4_warning(sb, "inodes_count overflow");
770 return -EINVAL; 753 return -EINVAL;
771 } 754 }
772 755
@@ -774,14 +757,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
774 if (!EXT4_HAS_COMPAT_FEATURE(sb, 757 if (!EXT4_HAS_COMPAT_FEATURE(sb,
775 EXT4_FEATURE_COMPAT_RESIZE_INODE) 758 EXT4_FEATURE_COMPAT_RESIZE_INODE)
776 || !le16_to_cpu(es->s_reserved_gdt_blocks)) { 759 || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
777 ext4_warning(sb, __func__, 760 ext4_warning(sb,
778 "No reserved GDT blocks, can't resize"); 761 "No reserved GDT blocks, can't resize");
779 return -EPERM; 762 return -EPERM;
780 } 763 }
781 inode = ext4_iget(sb, EXT4_RESIZE_INO); 764 inode = ext4_iget(sb, EXT4_RESIZE_INO);
782 if (IS_ERR(inode)) { 765 if (IS_ERR(inode)) {
783 ext4_warning(sb, __func__, 766 ext4_warning(sb, "Error opening resize inode");
784 "Error opening resize inode");
785 return PTR_ERR(inode); 767 return PTR_ERR(inode);
786 } 768 }
787 } 769 }
@@ -810,8 +792,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
810 792
811 mutex_lock(&sbi->s_resize_lock); 793 mutex_lock(&sbi->s_resize_lock);
812 if (input->group != sbi->s_groups_count) { 794 if (input->group != sbi->s_groups_count) {
813 ext4_warning(sb, __func__, 795 ext4_warning(sb, "multiple resizers run on filesystem!");
814 "multiple resizers run on filesystem!");
815 err = -EBUSY; 796 err = -EBUSY;
816 goto exit_journal; 797 goto exit_journal;
817 } 798 }
@@ -997,13 +978,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
997 " too large to resize to %llu blocks safely\n", 978 " too large to resize to %llu blocks safely\n",
998 sb->s_id, n_blocks_count); 979 sb->s_id, n_blocks_count);
999 if (sizeof(sector_t) < 8) 980 if (sizeof(sector_t) < 8)
1000 ext4_warning(sb, __func__, "CONFIG_LBDAF not enabled"); 981 ext4_warning(sb, "CONFIG_LBDAF not enabled");
1001 return -EINVAL; 982 return -EINVAL;
1002 } 983 }
1003 984
1004 if (n_blocks_count < o_blocks_count) { 985 if (n_blocks_count < o_blocks_count) {
1005 ext4_warning(sb, __func__, 986 ext4_warning(sb, "can't shrink FS - resize aborted");
1006 "can't shrink FS - resize aborted");
1007 return -EBUSY; 987 return -EBUSY;
1008 } 988 }
1009 989
@@ -1011,15 +991,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1011 ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); 991 ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
1012 992
1013 if (last == 0) { 993 if (last == 0) {
1014 ext4_warning(sb, __func__, 994 ext4_warning(sb, "need to use ext2online to resize further");
1015 "need to use ext2online to resize further");
1016 return -EPERM; 995 return -EPERM;
1017 } 996 }
1018 997
1019 add = EXT4_BLOCKS_PER_GROUP(sb) - last; 998 add = EXT4_BLOCKS_PER_GROUP(sb) - last;
1020 999
1021 if (o_blocks_count + add < o_blocks_count) { 1000 if (o_blocks_count + add < o_blocks_count) {
1022 ext4_warning(sb, __func__, "blocks_count overflow"); 1001 ext4_warning(sb, "blocks_count overflow");
1023 return -EINVAL; 1002 return -EINVAL;
1024 } 1003 }
1025 1004
@@ -1027,16 +1006,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1027 add = n_blocks_count - o_blocks_count; 1006 add = n_blocks_count - o_blocks_count;
1028 1007
1029 if (o_blocks_count + add < n_blocks_count) 1008 if (o_blocks_count + add < n_blocks_count)
1030 ext4_warning(sb, __func__, 1009 ext4_warning(sb, "will only finish group (%llu blocks, %u new)",
1031 "will only finish group (%llu"
1032 " blocks, %u new)",
1033 o_blocks_count + add, add); 1010 o_blocks_count + add, add);
1034 1011
1035 /* See if the device is actually as big as what was requested */ 1012 /* See if the device is actually as big as what was requested */
1036 bh = sb_bread(sb, o_blocks_count + add - 1); 1013 bh = sb_bread(sb, o_blocks_count + add - 1);
1037 if (!bh) { 1014 if (!bh) {
1038 ext4_warning(sb, __func__, 1015 ext4_warning(sb, "can't read last block, resize aborted");
1039 "can't read last block, resize aborted");
1040 return -ENOSPC; 1016 return -ENOSPC;
1041 } 1017 }
1042 brelse(bh); 1018 brelse(bh);
@@ -1047,14 +1023,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1047 handle = ext4_journal_start_sb(sb, 3); 1023 handle = ext4_journal_start_sb(sb, 3);
1048 if (IS_ERR(handle)) { 1024 if (IS_ERR(handle)) {
1049 err = PTR_ERR(handle); 1025 err = PTR_ERR(handle);
1050 ext4_warning(sb, __func__, "error %d on journal start", err); 1026 ext4_warning(sb, "error %d on journal start", err);
1051 goto exit_put; 1027 goto exit_put;
1052 } 1028 }
1053 1029
1054 mutex_lock(&EXT4_SB(sb)->s_resize_lock); 1030 mutex_lock(&EXT4_SB(sb)->s_resize_lock);
1055 if (o_blocks_count != ext4_blocks_count(es)) { 1031 if (o_blocks_count != ext4_blocks_count(es)) {
1056 ext4_warning(sb, __func__, 1032 ext4_warning(sb, "multiple resizers run on filesystem!");
1057 "multiple resizers run on filesystem!");
1058 mutex_unlock(&EXT4_SB(sb)->s_resize_lock); 1033 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1059 ext4_journal_stop(handle); 1034 ext4_journal_stop(handle);
1060 err = -EBUSY; 1035 err = -EBUSY;
@@ -1063,8 +1038,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1063 1038
1064 if ((err = ext4_journal_get_write_access(handle, 1039 if ((err = ext4_journal_get_write_access(handle,
1065 EXT4_SB(sb)->s_sbh))) { 1040 EXT4_SB(sb)->s_sbh))) {
1066 ext4_warning(sb, __func__, 1041 ext4_warning(sb, "error %d on journal write access", err);
1067 "error %d on journal write access", err);
1068 mutex_unlock(&EXT4_SB(sb)->s_resize_lock); 1042 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1069 ext4_journal_stop(handle); 1043 ext4_journal_stop(handle);
1070 goto exit_put; 1044 goto exit_put;
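
Every warning call in these resize.c hunks loses its explicit __func__ argument. The call sites can stay that terse because ext4_warning() becomes a macro wrapping the renamed __ext4_warning() (see the super.c hunks below); the wrapper supplies the caller's name automatically. The macro lives in fs/ext4/ext4.h, which is not part of this diff, so the following is only a sketch of the assumed pattern:

	void __ext4_warning(struct super_block *sb, const char *function,
			    const char *fmt, ...);

	#define ext4_warning(sb, fmt, ...)	\
		__ext4_warning((sb), __func__, (fmt), ##__VA_ARGS__)

With that in place, ext4_warning(sb, "blocks_count overflow") still logs which function reported the problem, one line shorter at every call site.
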
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 735c20d5fd56..e14d22c170d5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -68,7 +68,21 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int ext4_unfreeze(struct super_block *sb);
 static void ext4_write_super(struct super_block *sb);
 static int ext4_freeze(struct super_block *sb);
+static int ext4_get_sb(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data, struct vfsmount *mnt);
 
+#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext3_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ext3",
+	.get_sb		= ext4_get_sb,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
+#else
+#define IS_EXT3_SB(sb) (0)
+#endif
 
 ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
 			       struct ext4_group_desc *bg)
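
Hoisting ext3_fs_type to the top of the file (the matching removal is at the end of super.c below) lets mount-time code ask whether this superblock was claimed through the "ext3" compatibility alias: bd_holder of the backing block device points at the file_system_type that mounted it. A later hunk in this diff uses exactly that:

	/* Usage from the fill_super hunk below: keep ext3-alias mounts
	 * on ext3-like defaults by not enabling delayed allocation. */
	if (!IS_EXT3_SB(sb))
		set_opt(sbi->s_mount_opt, DELALLOC);
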
@@ -302,7 +316,7 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
  * write out the superblock safely.
  *
  * We'll just use the jbd2_journal_abort() error code to record an error in
- * the journal instead. On recovery, the journal will compain about
+ * the journal instead. On recovery, the journal will complain about
  * that error until we've noted it down and cleared it.
  */
 
@@ -333,7 +347,7 @@ static void ext4_handle_error(struct super_block *sb)
 		sb->s_id);
 }
 
-void ext4_error(struct super_block *sb, const char *function,
+void __ext4_error(struct super_block *sb, const char *function,
 		const char *fmt, ...)
 {
 	va_list args;
@@ -347,6 +361,42 @@ void ext4_error(struct super_block *sb, const char *function,
 	ext4_handle_error(sb);
 }
 
+void ext4_error_inode(const char *function, struct inode *inode,
+		      const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ",
+	       inode->i_sb->s_id, function, inode->i_ino, current->comm);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+
+	ext4_handle_error(inode->i_sb);
+}
+
+void ext4_error_file(const char *function, struct file *file,
+		     const char *fmt, ...)
+{
+	va_list args;
+	struct inode *inode = file->f_dentry->d_inode;
+	char pathname[80], *path;
+
+	va_start(args, fmt);
+	path = d_path(&(file->f_path), pathname, sizeof(pathname));
+	if (!path)
+		path = "(unknown)";
+	printk(KERN_CRIT
+	       "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ",
+	       inode->i_sb->s_id, function, inode->i_ino, current->comm, path);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+
+	ext4_handle_error(inode->i_sb);
+}
+
 static const char *ext4_decode_error(struct super_block *sb, int errno,
 				     char nbuf[16])
 {
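
Both new helpers take the reporting function's name as their first parameter, mirroring __ext4_error()/__ext4_warning(). Call sites presumably reach them through wrapper macros in ext4.h that splice in __func__; the assumed shape (macro names are a guess, the definitions are not part of this diff):

	#define EXT4_ERROR_INODE(inode, fmt, a...) \
		ext4_error_inode(__func__, (inode), (fmt), ## a)
	#define EXT4_ERROR_FILE(file, fmt, a...) \
		ext4_error_file(__func__, (file), (fmt), ## a)
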
@@ -450,7 +500,7 @@ void ext4_msg (struct super_block * sb, const char *prefix,
 	va_end(args);
 }
 
-void ext4_warning(struct super_block *sb, const char *function,
+void __ext4_warning(struct super_block *sb, const char *function,
 		  const char *fmt, ...)
 {
 	va_list args;
@@ -507,7 +557,7 @@ void ext4_update_dynamic_rev(struct super_block *sb)
 	if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
 		return;
 
-	ext4_warning(sb, __func__,
+	ext4_warning(sb,
 		     "updating to rev %d because of new feature flag, "
 		     "running e2fsck is recommended",
 		     EXT4_DYNAMIC_REV);
@@ -708,7 +758,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 #ifdef CONFIG_QUOTA
 	ei->i_reserved_quota = 0;
 #endif
-	INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
+	INIT_LIST_HEAD(&ei->i_completed_io_list);
+	spin_lock_init(&ei->i_completed_io_lock);
 	ei->cur_aio_dio = NULL;
 	ei->i_sync_tid = 0;
 	ei->i_datasync_tid = 0;
@@ -761,6 +812,7 @@ static void destroy_inodecache(void)
 
 static void ext4_clear_inode(struct inode *inode)
 {
+	dquot_drop(inode);
 	ext4_discard_preallocations(inode);
 	if (EXT4_JOURNAL(inode))
 		jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
@@ -796,10 +848,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
 	if (sbi->s_qf_names[GRPQUOTA])
 		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
 
-	if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA)
+	if (test_opt(sb, USRQUOTA))
 		seq_puts(seq, ",usrquota");
 
-	if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)
+	if (test_opt(sb, GRPQUOTA))
 		seq_puts(seq, ",grpquota");
 #endif
 }
@@ -926,6 +978,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (test_opt(sb, NOLOAD))
 		seq_puts(seq, ",norecovery");
 
+	if (test_opt(sb, DIOREAD_NOLOCK))
+		seq_puts(seq, ",dioread_nolock");
+
 	ext4_show_quota_options(seq, sb);
 
 	return 0;
@@ -1012,19 +1067,9 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 			       const char *data, size_t len, loff_t off);
 
 static const struct dquot_operations ext4_quota_operations = {
-	.initialize	= dquot_initialize,
-	.drop		= dquot_drop,
-	.alloc_space	= dquot_alloc_space,
-	.reserve_space	= dquot_reserve_space,
-	.claim_space	= dquot_claim_space,
-	.release_rsv	= dquot_release_reserved_space,
 #ifdef CONFIG_QUOTA
 	.get_reserved_space = ext4_get_reserved_space,
 #endif
-	.alloc_inode	= dquot_alloc_inode,
-	.free_space	= dquot_free_space,
-	.free_inode	= dquot_free_inode,
-	.transfer	= dquot_transfer,
 	.write_dquot	= ext4_write_dquot,
 	.acquire_dquot	= ext4_acquire_dquot,
 	.release_dquot	= ext4_release_dquot,
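
Every entry deleted here was a plain forward to the matching generic dquot_* helper. After the 2.6.34 quota rework the VFS no longer routes these calls through the filesystem's dquot_operations; filesystems invoke the helpers directly, as other hunks in this very diff already do:

	dquot_initialize(inode);	/* ext4_orphan_cleanup(), was .initialize */
	dquot_drop(inode);		/* ext4_clear_inode(), was .drop */
	dquot_free_block(inode, 1);	/* ext4_xattr_release_block() */

Only the genuinely ext4-specific operations stay in the table.
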
@@ -1109,6 +1154,7 @@ enum {
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc,
 	Opt_block_validity, Opt_noblock_validity,
 	Opt_inode_readahead_blks, Opt_journal_ioprio,
+	Opt_dioread_nolock, Opt_dioread_lock,
 	Opt_discard, Opt_nodiscard,
 };
 
@@ -1176,6 +1222,8 @@ static const match_table_t tokens = {
 	{Opt_auto_da_alloc, "auto_da_alloc=%u"},
 	{Opt_auto_da_alloc, "auto_da_alloc"},
 	{Opt_noauto_da_alloc, "noauto_da_alloc"},
+	{Opt_dioread_nolock, "dioread_nolock"},
+	{Opt_dioread_lock, "dioread_lock"},
 	{Opt_discard, "discard"},
 	{Opt_nodiscard, "nodiscard"},
 	{Opt_err, NULL},
@@ -1205,6 +1253,66 @@ static ext4_fsblk_t get_sb_block(void **data)
 }
 
 #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n"
+	"Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
+
+#ifdef CONFIG_QUOTA
+static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	char *qname;
+
+	if (sb_any_quota_loaded(sb) &&
+	    !sbi->s_qf_names[qtype]) {
+		ext4_msg(sb, KERN_ERR,
+			 "Cannot change journaled "
+			 "quota options when quota turned on");
+		return 0;
+	}
+	qname = match_strdup(args);
+	if (!qname) {
+		ext4_msg(sb, KERN_ERR,
+			 "Not enough memory for storing quotafile name");
+		return 0;
+	}
+	if (sbi->s_qf_names[qtype] &&
+	    strcmp(sbi->s_qf_names[qtype], qname)) {
+		ext4_msg(sb, KERN_ERR,
+			 "%s quota file already specified", QTYPE2NAME(qtype));
+		kfree(qname);
+		return 0;
+	}
+	sbi->s_qf_names[qtype] = qname;
+	if (strchr(sbi->s_qf_names[qtype], '/')) {
+		ext4_msg(sb, KERN_ERR,
+			 "quotafile must be on filesystem root");
+		kfree(sbi->s_qf_names[qtype]);
+		sbi->s_qf_names[qtype] = NULL;
+		return 0;
+	}
+	set_opt(sbi->s_mount_opt, QUOTA);
+	return 1;
+}
+
+static int clear_qf_name(struct super_block *sb, int qtype)
+{
+
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (sb_any_quota_loaded(sb) &&
+	    sbi->s_qf_names[qtype]) {
+		ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
+			 " when quota turned on");
+		return 0;
+	}
+	/*
+	 * The space will be released later when all options are confirmed
+	 * to be correct
+	 */
+	sbi->s_qf_names[qtype] = NULL;
+	return 1;
+}
+#endif
 
 static int parse_options(char *options, struct super_block *sb,
 			 unsigned long *journal_devnum,
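
set_qf_name() receives the substring_t that match_token() filled in for the usrjquota=%s / grpjquota=%s patterns and copies it with match_strdup(), which returns a kmalloc'd string. A minimal, self-contained sketch of that parser idiom (hypothetical token table, not ext4 code):

	#include <linux/parser.h>
	#include <linux/slab.h>

	enum { Opt_name, Opt_err };

	static const match_table_t demo_tokens = {
		{Opt_name, "name=%s"},
		{Opt_err, NULL},
	};

	static void demo_parse(char *opt)
	{
		substring_t args[MAX_OPT_ARGS];
		char *val;

		if (match_token(opt, demo_tokens, args) != Opt_name)
			return;
		val = match_strdup(&args[0]);	/* kmalloc'd copy of the %s match */
		if (val) {
			/* ... use val ... */
			kfree(val);
		}
	}
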
@@ -1217,8 +1325,7 @@ static int parse_options(char *options, struct super_block *sb,
 	int data_opt = 0;
 	int option;
 #ifdef CONFIG_QUOTA
-	int qtype, qfmt;
-	char *qname;
+	int qfmt;
 #endif
 
 	if (!options)
@@ -1229,19 +1336,31 @@ static int parse_options(char *options, struct super_block *sb,
 		if (!*p)
 			continue;
 
+		/*
+		 * Initialize args struct so we know whether arg was
+		 * found; some options take optional arguments.
+		 */
+		args[0].to = args[0].from = 0;
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_bsd_df:
+			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
 			clear_opt(sbi->s_mount_opt, MINIX_DF);
 			break;
 		case Opt_minix_df:
+			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
 			set_opt(sbi->s_mount_opt, MINIX_DF);
+
 			break;
 		case Opt_grpid:
+			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
 			set_opt(sbi->s_mount_opt, GRPID);
+
 			break;
 		case Opt_nogrpid:
+			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
 			clear_opt(sbi->s_mount_opt, GRPID);
+
 			break;
 		case Opt_resuid:
			if (match_int(&args[0], &option))
@@ -1378,14 +1497,13 @@ static int parse_options(char *options, struct super_block *sb,
 			data_opt = EXT4_MOUNT_WRITEBACK_DATA;
 	datacheck:
 		if (is_remount) {
-			if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
-					!= data_opt) {
+			if (test_opt(sb, DATA_FLAGS) != data_opt) {
 				ext4_msg(sb, KERN_ERR,
 					"Cannot change data mode on remount");
 				return 0;
 			}
 		} else {
-			sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS;
+			clear_opt(sbi->s_mount_opt, DATA_FLAGS);
 			sbi->s_mount_opt |= data_opt;
 		}
 		break;
@@ -1397,63 +1515,22 @@ static int parse_options(char *options, struct super_block *sb,
 			break;
 #ifdef CONFIG_QUOTA
 		case Opt_usrjquota:
-			qtype = USRQUOTA;
-			goto set_qf_name;
-		case Opt_grpjquota:
-			qtype = GRPQUOTA;
-set_qf_name:
-			if (sb_any_quota_loaded(sb) &&
-			    !sbi->s_qf_names[qtype]) {
-				ext4_msg(sb, KERN_ERR,
-					"Cannot change journaled "
-					"quota options when quota turned on");
+			if (!set_qf_name(sb, USRQUOTA, &args[0]))
 				return 0;
-			}
-			qname = match_strdup(&args[0]);
-			if (!qname) {
-				ext4_msg(sb, KERN_ERR,
-					"Not enough memory for "
-					"storing quotafile name");
-				return 0;
-			}
-			if (sbi->s_qf_names[qtype] &&
-			    strcmp(sbi->s_qf_names[qtype], qname)) {
-				ext4_msg(sb, KERN_ERR,
-					"%s quota file already "
-					"specified", QTYPE2NAME(qtype));
-				kfree(qname);
-				return 0;
-			}
-			sbi->s_qf_names[qtype] = qname;
-			if (strchr(sbi->s_qf_names[qtype], '/')) {
-				ext4_msg(sb, KERN_ERR,
-					"quotafile must be on "
-					"filesystem root");
-				kfree(sbi->s_qf_names[qtype]);
-				sbi->s_qf_names[qtype] = NULL;
-				return 0;
-			}
-			set_opt(sbi->s_mount_opt, QUOTA);
+			break;
+		case Opt_grpjquota:
+			if (!set_qf_name(sb, GRPQUOTA, &args[0]))
+				return 0;
 			break;
 		case Opt_offusrjquota:
-			qtype = USRQUOTA;
-			goto clear_qf_name;
+			if (!clear_qf_name(sb, USRQUOTA))
+				return 0;
+			break;
 		case Opt_offgrpjquota:
-			qtype = GRPQUOTA;
-clear_qf_name:
-			if (sb_any_quota_loaded(sb) &&
-			    sbi->s_qf_names[qtype]) {
-				ext4_msg(sb, KERN_ERR, "Cannot change "
-					"journaled quota options when "
-					"quota turned on");
+			if (!clear_qf_name(sb, GRPQUOTA))
 				return 0;
-			}
-			/*
-			 * The space will be released later when all options
-			 * are confirmed to be correct
-			 */
-			sbi->s_qf_names[qtype] = NULL;
 			break;
+
 		case Opt_jqfmt_vfsold:
 			qfmt = QFMT_VFS_OLD;
 			goto set_qf_format;
@@ -1518,10 +1595,11 @@ set_qf_format:
 			clear_opt(sbi->s_mount_opt, BARRIER);
 			break;
 		case Opt_barrier:
-			if (match_int(&args[0], &option)) {
-				set_opt(sbi->s_mount_opt, BARRIER);
-				break;
-			}
+			if (args[0].from) {
+				if (match_int(&args[0], &option))
+					return 0;
+			} else
+				option = 1;	/* No argument, default to 1 */
 			if (option)
 				set_opt(sbi->s_mount_opt, BARRIER);
 			else
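
barrier and barrier=%u map to the same Opt_barrier token, so the parser must detect whether a value was actually present. That is what the args[0].to = args[0].from = 0 initialization added earlier in parse_options() is for: match_token() only fills the substring_t when the %u pattern matched, so a still-zero from means the bare form was given. A condensed sketch of the idiom (hypothetical helper, not part of the patch):

	/* Returns 0 and sets *option; a bare flag defaults to 1. */
	static int parse_opt_with_default(substring_t *args, int *option)
	{
		if (args[0].from) {		/* "barrier=N" was given */
			if (match_int(&args[0], option))
				return -EINVAL;	/* malformed number */
		} else {
			*option = 1;		/* bare "barrier" */
		}
		return 0;
	}

Note the behavior change this carries along: a malformed value now fails the mount (parse_options returns 0) instead of being silently treated as "enable".
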
@@ -1594,10 +1672,11 @@ set_qf_format:
 			set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
 			break;
 		case Opt_auto_da_alloc:
-			if (match_int(&args[0], &option)) {
-				clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
-				break;
-			}
+			if (args[0].from) {
+				if (match_int(&args[0], &option))
+					return 0;
+			} else
+				option = 1;	/* No argument, default to 1 */
 			if (option)
 				clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
 			else
@@ -1609,6 +1688,12 @@ set_qf_format:
 		case Opt_nodiscard:
 			clear_opt(sbi->s_mount_opt, DISCARD);
 			break;
+		case Opt_dioread_nolock:
+			set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+			break;
+		case Opt_dioread_lock:
+			clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+			break;
 		default:
 			ext4_msg(sb, KERN_ERR,
 			       "Unrecognized mount option \"%s\" "
@@ -1618,18 +1703,13 @@ set_qf_format:
 	}
 #ifdef CONFIG_QUOTA
 	if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
-		if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) &&
-		     sbi->s_qf_names[USRQUOTA])
+		if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
 			clear_opt(sbi->s_mount_opt, USRQUOTA);
 
-		if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) &&
-		     sbi->s_qf_names[GRPQUOTA])
+		if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
 			clear_opt(sbi->s_mount_opt, GRPQUOTA);
 
-		if ((sbi->s_qf_names[USRQUOTA] &&
-		     (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
-		    (sbi->s_qf_names[GRPQUOTA] &&
-		     (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
+		if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
 			ext4_msg(sb, KERN_ERR, "old and new quota "
 					"format mixing");
 			return 0;
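
The conversions from raw sbi->s_mount_opt & EXT4_MOUNT_xxx tests to test_opt() throughout this file rely on one-line macros in ext4.h (not shown in this diff); the presumed definitions:

	#define set_opt(o, opt)		((o) |= EXT4_MOUNT_##opt)
	#define clear_opt(o, opt)	((o) &= ~EXT4_MOUNT_##opt)
	#define test_opt(sb, opt)	(EXT4_SB(sb)->s_mount_opt & EXT4_MOUNT_##opt)
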
@@ -1939,7 +2019,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 	}
 
 	list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
-	vfs_dq_init(inode);
+	dquot_initialize(inode);
 	if (inode->i_nlink) {
 		ext4_msg(sb, KERN_DEBUG,
 			"%s: truncating inode %lu to %lld bytes",
@@ -2292,7 +2372,7 @@ static void ext4_sb_release(struct kobject *kobj)
 }
 
 
-static struct sysfs_ops ext4_attr_ops = {
+static const struct sysfs_ops ext4_attr_ops = {
 	.show	= ext4_attr_show,
 	.store	= ext4_attr_store,
 };
@@ -2432,8 +2512,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
 	if (def_mount_opts & EXT4_DEFM_DEBUG)
 		set_opt(sbi->s_mount_opt, DEBUG);
-	if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
+	if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
+		ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
+			 "2.6.38");
 		set_opt(sbi->s_mount_opt, GRPID);
+	}
 	if (def_mount_opts & EXT4_DEFM_UID16)
 		set_opt(sbi->s_mount_opt, NO_UID32);
 #ifdef CONFIG_EXT4_FS_XATTR
@@ -2445,11 +2528,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		set_opt(sbi->s_mount_opt, POSIX_ACL);
 #endif
 	if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
-		sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
+		set_opt(sbi->s_mount_opt, JOURNAL_DATA);
 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
-		sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
+		set_opt(sbi->s_mount_opt, ORDERED_DATA);
 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
-		sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA;
+		set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
 
 	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
 		set_opt(sbi->s_mount_opt, ERRORS_PANIC);
@@ -2470,14 +2553,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 * enable delayed allocation by default
 	 * Use -o nodelalloc to turn it off
 	 */
-	set_opt(sbi->s_mount_opt, DELALLOC);
+	if (!IS_EXT3_SB(sb))
+		set_opt(sbi->s_mount_opt, DELALLOC);
 
 	if (!parse_options((char *) data, sb, &journal_devnum,
 			   &journal_ioprio, NULL, 0))
 		goto failed_mount;
 
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-		((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
 
 	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
 	    (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
@@ -2766,7 +2850,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		    EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
 			ext4_msg(sb, KERN_ERR, "required journal recovery "
 			       "suppressed and not mounted read-only");
-			goto failed_mount4;
+			goto failed_mount_wq;
 		} else {
 			clear_opt(sbi->s_mount_opt, DATA_FLAGS);
 			set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
@@ -2779,7 +2863,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
 				       JBD2_FEATURE_INCOMPAT_64BIT)) {
 		ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
-		goto failed_mount4;
+		goto failed_mount_wq;
 	}
 
 	if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
@@ -2818,7 +2902,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
 			ext4_msg(sb, KERN_ERR, "Journal does not support "
 			       "requested data journaling mode");
-			goto failed_mount4;
+			goto failed_mount_wq;
 		}
 	default:
 		break;
@@ -2826,13 +2910,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
 
 no_journal:
-
 	if (test_opt(sb, NOBH)) {
 		if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
 			ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
 				"its supported only with writeback mode");
 			clear_opt(sbi->s_mount_opt, NOBH);
 		}
+		if (test_opt(sb, DIOREAD_NOLOCK)) {
+			ext4_msg(sb, KERN_WARNING, "dioread_nolock option is "
+				"not supported with nobh mode");
+			goto failed_mount_wq;
+		}
 	}
 	EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
 	if (!EXT4_SB(sb)->dio_unwritten_wq) {
@@ -2897,6 +2985,18 @@ no_journal:
 		       "requested data journaling mode");
 		clear_opt(sbi->s_mount_opt, DELALLOC);
 	}
+	if (test_opt(sb, DIOREAD_NOLOCK)) {
+		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+			ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
+				"option - requested data journaling mode");
+			clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+		}
+		if (sb->s_blocksize < PAGE_SIZE) {
+			ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
+				"option - block size is too small");
+			clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+		}
+	}
 
 	err = ext4_setup_system_zone(sb);
 	if (err) {
@@ -3360,10 +3460,9 @@ static void ext4_clear_journal_err(struct super_block *sb,
 	char nbuf[16];
 
 	errstr = ext4_decode_error(sb, j_errno, nbuf);
-	ext4_warning(sb, __func__, "Filesystem error recorded "
+	ext4_warning(sb, "Filesystem error recorded "
 		     "from previous mount: %s", errstr);
-	ext4_warning(sb, __func__, "Marking fs in need of "
-		     "filesystem check.");
+	ext4_warning(sb, "Marking fs in need of filesystem check.");
 
 	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 	es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
@@ -3514,7 +3613,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 		ext4_abort(sb, __func__, "Abort forced by user");
 
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-		((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
 
 	es = sbi->s_es;
 
@@ -3708,7 +3807,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
  * Process 1                         Process 2
  * ext4_create()                     quota_sync()
  *   jbd2_journal_start()              write_dquot()
- *   vfs_dq_init()                     down(dqio_mutex)
+ *   dquot_initialize()                down(dqio_mutex)
  *     down(dqio_mutex)                jbd2_journal_start()
 *
 */
@@ -3917,9 +4016,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
 	int err = 0;
 	int offset = off & (sb->s_blocksize - 1);
-	int tocopy;
 	int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
-	size_t towrite = len;
 	struct buffer_head *bh;
 	handle_t *handle = journal_current_handle();
 
@@ -3929,52 +4026,53 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 		    (unsigned long long)off, (unsigned long long)len);
 		return -EIO;
 	}
+	/*
+	 * Since we account only one data block in transaction credits,
+	 * then it is impossible to cross a block boundary.
+	 */
+	if (sb->s_blocksize - offset < len) {
+		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+			" cancelled because not block aligned",
+			(unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
+
 	mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
-	while (towrite > 0) {
-		tocopy = sb->s_blocksize - offset < towrite ?
-				sb->s_blocksize - offset : towrite;
-		bh = ext4_bread(handle, inode, blk, 1, &err);
-		if (!bh)
-			goto out;
-		if (journal_quota) {
-			err = ext4_journal_get_write_access(handle, bh);
-			if (err) {
-				brelse(bh);
-				goto out;
-			}
-		}
-		lock_buffer(bh);
-		memcpy(bh->b_data+offset, data, tocopy);
-		flush_dcache_page(bh->b_page);
-		unlock_buffer(bh);
-		if (journal_quota)
-			err = ext4_handle_dirty_metadata(handle, NULL, bh);
-		else {
-			/* Always do at least ordered writes for quotas */
-			err = ext4_jbd2_file_inode(handle, inode);
-			mark_buffer_dirty(bh);
-		}
-		brelse(bh);
-		if (err)
+	bh = ext4_bread(handle, inode, blk, 1, &err);
+	if (!bh)
+		goto out;
+	if (journal_quota) {
+		err = ext4_journal_get_write_access(handle, bh);
+		if (err) {
+			brelse(bh);
 			goto out;
-		offset = 0;
-		towrite -= tocopy;
-		data += tocopy;
-		blk++;
+		}
 	}
+	lock_buffer(bh);
+	memcpy(bh->b_data+offset, data, len);
+	flush_dcache_page(bh->b_page);
+	unlock_buffer(bh);
+	if (journal_quota)
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+	else {
+		/* Always do at least ordered writes for quotas */
+		err = ext4_jbd2_file_inode(handle, inode);
+		mark_buffer_dirty(bh);
+	}
+	brelse(bh);
 out:
-	if (len == towrite) {
+	if (err) {
 		mutex_unlock(&inode->i_mutex);
 		return err;
 	}
-	if (inode->i_size < off+len-towrite) {
-		i_size_write(inode, off+len-towrite);
+	if (inode->i_size < off + len) {
+		i_size_write(inode, off + len);
 		EXT4_I(inode)->i_disksize = inode->i_size;
 	}
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	ext4_mark_inode_dirty(handle, inode);
 	mutex_unlock(&inode->i_mutex);
-	return len - towrite;
+	return len;
 }
 
 #endif
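
The loop could go away because a quota write reaching ext4_quota_write() runs in a transaction that reserved credits for exactly one data block, so a write that would span two blocks has to be refused rather than iterated. The new guard is plain block arithmetic:

	/* Illustrative numbers, assuming a 4096-byte block:
	 * off = 4090, len = 16
	 * offset = off & (blocksize - 1) = 4090
	 * blocksize - offset = 6 < len, so the write would cross a
	 * block boundary and is rejected with -EIO.
	 */

The error path gets simpler too: with no partial progress possible, out: only has to distinguish complete success (return len) from failure (return err).
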
@@ -3985,7 +4083,7 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags,
 	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
 }
 
-#if !defined(CONTIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext2_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext2",
@@ -4012,15 +4110,7 @@ static inline void register_as_ext2(void) { }
 static inline void unregister_as_ext2(void) { }
 #endif
 
-#if !defined(CONTIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
-static struct file_system_type ext3_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "ext3",
-	.get_sb		= ext4_get_sb,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-
+#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static inline void register_as_ext3(void)
 {
 	int err = register_filesystem(&ext3_fs_type);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index f3a2f7ed45aa..b4c5aa8489d8 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -227,7 +227,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext4_xattr_check_block(bh)) {
-bad_block:	ext4_error(inode->i_sb, __func__,
+bad_block:
+		ext4_error(inode->i_sb,
 			   "inode %lu: bad block %llu", inode->i_ino,
 			   EXT4_I(inode)->i_file_acl);
 		error = -EIO;
@@ -267,7 +268,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
 	void *end;
 	int error;
 
-	if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
+	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
 		return -ENODATA;
 	error = ext4_get_inode_loc(inode, &iloc);
 	if (error)
@@ -371,7 +372,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext4_xattr_check_block(bh)) {
-		ext4_error(inode->i_sb, __func__,
+		ext4_error(inode->i_sb,
 			   "inode %lu: bad block %llu", inode->i_ino,
 			   EXT4_I(inode)->i_file_acl);
 		error = -EIO;
@@ -396,7 +397,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 	void *end;
 	int error;
 
-	if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
+	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
 		return 0;
 	error = ext4_get_inode_loc(inode, &iloc);
 	if (error)
@@ -494,7 +495,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 	error = ext4_handle_dirty_metadata(handle, inode, bh);
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
-	vfs_dq_free_block(inode, 1);
+	dquot_free_block(inode, 1);
 	ea_bdebug(bh, "refcount now=%d; releasing",
 		  le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ce)
@@ -665,9 +666,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
 			atomic_read(&(bs->bh->b_count)),
 			le32_to_cpu(BHDR(bs->bh)->h_refcount));
 		if (ext4_xattr_check_block(bs->bh)) {
-			ext4_error(sb, __func__,
-				"inode %lu: bad block %llu", inode->i_ino,
-				EXT4_I(inode)->i_file_acl);
+			ext4_error(sb, "inode %lu: bad block %llu",
+				   inode->i_ino, EXT4_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
 		}
@@ -787,8 +787,8 @@ inserted:
 			else {
 				/* The old block is released after updating
 				   the inode. */
-				error = -EDQUOT;
-				if (vfs_dq_alloc_block(inode, 1))
+				error = dquot_alloc_block(inode, 1);
+				if (error)
 					goto cleanup;
 				error = ext4_journal_get_write_access(handle,
 								      new_bh);
@@ -876,13 +876,12 @@ cleanup:
 	return error;
 
 cleanup_dquot:
-	vfs_dq_free_block(inode, 1);
+	dquot_free_block(inode, 1);
 	goto cleanup;
 
 bad_block:
-	ext4_error(inode->i_sb, __func__,
-		   "inode %lu: bad block %llu", inode->i_ino,
-		   EXT4_I(inode)->i_file_acl);
+	ext4_error(inode->i_sb, "inode %lu: bad block %llu",
+		   inode->i_ino, EXT4_I(inode)->i_file_acl);
 	goto cleanup;
 
 #undef header
@@ -908,7 +907,7 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
 	is->s.base = is->s.first = IFIRST(header);
 	is->s.here = is->s.first;
 	is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
-	if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
 		error = ext4_xattr_check_names(IFIRST(header), is->s.end);
 		if (error)
 			return error;
@@ -940,10 +939,10 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 	header = IHDR(inode, ext4_raw_inode(&is->iloc));
 	if (!IS_LAST_ENTRY(s->first)) {
 		header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
-		EXT4_I(inode)->i_state |= EXT4_STATE_XATTR;
+		ext4_set_inode_state(inode, EXT4_STATE_XATTR);
 	} else {
 		header->h_magic = cpu_to_le32(0);
-		EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR;
+		ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
 	}
 	return 0;
 }
@@ -986,8 +985,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	if (strlen(name) > 255)
 		return -ERANGE;
 	down_write(&EXT4_I(inode)->xattr_sem);
-	no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
-	EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+	no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
+	ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
 
 	error = ext4_get_inode_loc(inode, &is.iloc);
 	if (error)
@@ -997,10 +996,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	if (error)
 		goto cleanup;
 
-	if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
+	if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) {
 		struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
 		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
-		EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW;
+		ext4_clear_inode_state(inode, EXT4_STATE_NEW);
 	}
 
 	error = ext4_xattr_ibody_find(inode, &i, &is);
@@ -1052,7 +1051,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	ext4_xattr_update_super_block(handle, inode->i_sb);
 	inode->i_ctime = ext4_current_time(inode);
 	if (!value)
-		EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
+		ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
 	error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
 	/*
 	 * The bh is consumed by ext4_mark_iloc_dirty, even with
@@ -1067,7 +1066,7 @@ cleanup:
 	brelse(is.iloc.bh);
 	brelse(bs.bh);
 	if (no_expand == 0)
-		EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
+		ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
 	up_write(&EXT4_I(inode)->xattr_sem);
 	return error;
 }
@@ -1195,9 +1194,8 @@ retry:
 		if (!bh)
 			goto cleanup;
 		if (ext4_xattr_check_block(bh)) {
-			ext4_error(inode->i_sb, __func__,
-				"inode %lu: bad block %llu", inode->i_ino,
-				EXT4_I(inode)->i_file_acl);
+			ext4_error(inode->i_sb, "inode %lu: bad block %llu",
+				   inode->i_ino, EXT4_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
 		}
@@ -1302,6 +1300,8 @@ retry:
 
 	/* Remove the chosen entry from the inode */
 	error = ext4_xattr_ibody_set(handle, inode, &i, is);
+	if (error)
+		goto cleanup;
 
 	entry = IFIRST(header);
 	if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
@@ -1372,16 +1372,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
 		goto cleanup;
 	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
 	if (!bh) {
-		ext4_error(inode->i_sb, __func__,
-			"inode %lu: block %llu read error", inode->i_ino,
-			EXT4_I(inode)->i_file_acl);
+		ext4_error(inode->i_sb, "inode %lu: block %llu read error",
+			   inode->i_ino, EXT4_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
 	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
-		ext4_error(inode->i_sb, __func__,
-			"inode %lu: bad block %llu", inode->i_ino,
-			EXT4_I(inode)->i_file_acl);
+		ext4_error(inode->i_sb, "inode %lu: bad block %llu",
+			   inode->i_ino, EXT4_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	ext4_xattr_release_block(handle, inode, bh);
@@ -1506,7 +1504,7 @@ again:
 		}
 		bh = sb_bread(inode->i_sb, ce->e_block);
 		if (!bh) {
-			ext4_error(inode->i_sb, __func__,
+			ext4_error(inode->i_sb,
 				"inode %lu: block %lu read error",
 				inode->i_ino, (unsigned long) ce->e_block);
 		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
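
All of the open-coded EXT4_I(inode)->i_state bit twiddling in this file becomes ext4_test/set/clear_inode_state(). The helpers are defined in ext4.h, outside this diff; they presumably wrap the atomic bitops on the inode's dynamic-state word, along these lines (field name assumed):

	static inline int ext4_test_inode_state(struct inode *inode, int bit)
	{
		return test_bit(bit, &EXT4_I(inode)->i_state_flags);
	}

	static inline void ext4_set_inode_state(struct inode *inode, int bit)
	{
		set_bit(bit, &EXT4_I(inode)->i_state_flags);
	}

	static inline void ext4_clear_inode_state(struct inode *inode, int bit)
	{
		clear_bit(bit, &EXT4_I(inode)->i_state_flags);
	}

Atomic bitops make the state flags safe to update concurrently, which the plain |= and &= read-modify-write sequences were not.
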
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 14da530b05ca..0ce143bd7d56 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -558,7 +558,7 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
558 buf->f_bavail = sbi->free_clusters; 558 buf->f_bavail = sbi->free_clusters;
559 buf->f_fsid.val[0] = (u32)id; 559 buf->f_fsid.val[0] = (u32)id;
560 buf->f_fsid.val[1] = (u32)(id >> 32); 560 buf->f_fsid.val[1] = (u32)(id >> 32);
561 buf->f_namelen = sbi->options.isvfat ? 260 : 12; 561 buf->f_namelen = sbi->options.isvfat ? FAT_LFN_LEN : 12;
562 562
563 return 0; 563 return 0;
564} 564}
@@ -577,7 +577,7 @@ static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi,
577 return i_pos; 577 return i_pos;
578} 578}
579 579
580static int fat_write_inode(struct inode *inode, int wait) 580static int __fat_write_inode(struct inode *inode, int wait)
581{ 581{
582 struct super_block *sb = inode->i_sb; 582 struct super_block *sb = inode->i_sb;
583 struct msdos_sb_info *sbi = MSDOS_SB(sb); 583 struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -634,9 +634,14 @@ retry:
634 return err; 634 return err;
635} 635}
636 636
637static int fat_write_inode(struct inode *inode, struct writeback_control *wbc)
638{
639 return __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
640}
641
637int fat_sync_inode(struct inode *inode) 642int fat_sync_inode(struct inode *inode)
638{ 643{
639 return fat_write_inode(inode, 1); 644 return __fat_write_inode(inode, 1);
640} 645}
641 646
642EXPORT_SYMBOL_GPL(fat_sync_inode); 647EXPORT_SYMBOL_GPL(fat_sync_inode);
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index f565f24019b5..6fcc7e71fbaa 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -309,7 +309,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
 {
 	struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
 	wchar_t *ip, *ext_start, *end, *name_start;
-	unsigned char base[9], ext[4], buf[8], *p;
+	unsigned char base[9], ext[4], buf[5], *p;
 	unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
 	int chl, chi;
 	int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
@@ -467,7 +467,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
 		return 0;
 	}
 
-	i = jiffies & 0xffff;
+	i = jiffies;
 	sz = (jiffies >> 16) & 0x7;
 	if (baselen > 2) {
 		baselen = numtail2_baselen;
@@ -476,7 +476,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
 	name_res[baselen + 4] = '~';
 	name_res[baselen + 5] = '1' + sz;
 	while (1) {
-		sprintf(buf, "%04X", i);
+		snprintf(buf, sizeof(buf), "%04X", i & 0xffff);
 		memcpy(&name_res[baselen], buf, 4);
 		if (vfat_find_form(dir, name_res) < 0)
 			break;
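
The old code printed four hex digits into buf with an unbounded sprintf(); the new code shrinks buf to 5 bytes and makes the bound explicit with snprintf(). The sizing is exact: "%04X" on a value masked to 16 bits always yields four characters plus the terminating NUL:

	unsigned char buf[5];			/* "FFFF" plus '\0' */
	snprintf(buf, sizeof(buf), "%04X", i & 0xffff);
	/* buf now holds exactly four hex digits, e.g. "5678" */

Moving the & 0xffff mask from the assignment to the print site (the i = jiffies hunk above) keeps the numeric-tail retry loop working on the full value while the rendered name stays within four digits.
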
@@ -502,14 +502,14 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
 		*outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname);
 		if (*outlen < 0)
 			return *outlen;
-		else if (*outlen > 255)
+		else if (*outlen > FAT_LFN_LEN)
 			return -ENAMETOOLONG;
 
 		op = &outname[*outlen * sizeof(wchar_t)];
 	} else {
 		if (nls) {
 			for (i = 0, ip = name, op = outname, *outlen = 0;
-			     i < len && *outlen <= 255;
+			     i < len && *outlen <= FAT_LFN_LEN;
 			     *outlen += 1)
 			{
 				if (escape && (*ip == ':')) {
@@ -549,7 +549,7 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
 				return -ENAMETOOLONG;
 		} else {
 			for (i = 0, ip = name, op = outname, *outlen = 0;
-			     i < len && *outlen <= 255;
+			     i < len && *outlen <= FAT_LFN_LEN;
 			     i++, *outlen += 1)
 			{
 				*op++ = *ip++;
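
The magic 255 becomes FAT_LFN_LEN, the VFAT long-name limit in UTF-16 code units (the same constant fat_statfs() now reports as f_namelen). The definition itself is presumably added to include/linux/msdos_fs.h, outside this diff:

	/* Assumed definition, not shown in this diff. */
	#define FAT_LFN_LEN	255	/* maximum long name length */
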
@@ -701,6 +701,15 @@ static int vfat_find(struct inode *dir, struct qstr *qname,
 	return fat_search_long(dir, qname->name, len, sinfo);
 }
 
+/*
+ * (nfsd's) anonymous disconnected dentry?
+ * NOTE: !IS_ROOT() is not anonymous (I.e. d_splice_alias() did the job).
+ */
+static int vfat_d_anon_disconn(struct dentry *dentry)
+{
+	return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED);
+}
+
 static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 				  struct nameidata *nd)
 {
@@ -729,11 +738,11 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 	}
 
 	alias = d_find_alias(inode);
-	if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) {
+	if (alias && !vfat_d_anon_disconn(alias)) {
 		/*
-		 * This inode has non DCACHE_DISCONNECTED dentry. This
-		 * means, the user did ->lookup() by an another name
-		 * (longname vs 8.3 alias of it) in past.
+		 * This inode has non anonymous-DCACHE_DISCONNECTED
+		 * dentry. This means, the user did ->lookup() by an
+		 * another name (longname vs 8.3 alias of it) in past.
 		 *
 		 * Switch to new one for reason of locality if possible.
 		 */
@@ -743,7 +752,9 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 		iput(inode);
 		unlock_super(sb);
 		return alias;
-	}
+	} else
+		dput(alias);
+
 out:
 	unlock_super(sb);
 	dentry->d_op = sb->s_root->d_op;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 97e01dc0d95f..452d02f9075e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -344,7 +344,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 	switch (cmd) {
 	case F_DUPFD:
 	case F_DUPFD_CLOEXEC:
-		if (arg >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
+		if (arg >= rlimit(RLIMIT_NOFILE))
 			break;
 		err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0);
 		if (err >= 0) {
diff --git a/fs/file.c b/fs/file.c
index 38039af67663..34bb7f71d994 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -257,7 +257,7 @@ int expand_files(struct files_struct *files, int nr)
 	 * N.B. For clone tasks sharing a files structure, this test
 	 * will limit the total number of files that can be opened.
 	 */
-	if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
+	if (nr >= rlimit(RLIMIT_NOFILE))
 		return -EMFILE;
 
 	/* Do we need to expand? */
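
Both this hunk and the fcntl.c one above replace the hand-rolled current->signal->rlim[...].rlim_cur dereference with the rlimit() accessor introduced in this merge window. A sketch of the assumed helpers from include/linux/sched.h:

	static inline unsigned long task_rlimit(const struct task_struct *tsk,
						unsigned int limit)
	{
		return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur);
	}

	static inline unsigned long rlimit(unsigned int limit)
	{
		return task_rlimit(current, limit);
	}

Reading through ACCESS_ONCE() matters because another thread can change the limit concurrently via setrlimit().
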
diff --git a/fs/file_table.c b/fs/file_table.c
index b98404b54383..32d12b78bac8 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -393,7 +393,9 @@ retry:
 			continue;
 		if (!(f->f_mode & FMODE_WRITE))
 			continue;
+		spin_lock(&f->f_lock);
 		f->f_mode &= ~FMODE_WRITE;
+		spin_unlock(&f->f_lock);
 		if (file_check_writeable(f) != 0)
 			continue;
 		file_release_write(f);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1a7c42c64ff4..76fc4d594acb 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -381,10 +381,10 @@ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
381 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 381 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
382} 382}
383 383
384static int write_inode(struct inode *inode, int sync) 384static int write_inode(struct inode *inode, struct writeback_control *wbc)
385{ 385{
386 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) 386 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
387 return inode->i_sb->s_op->write_inode(inode, sync); 387 return inode->i_sb->s_op->write_inode(inode, wbc);
388 return 0; 388 return 0;
389} 389}
390 390
@@ -421,7 +421,6 @@ static int
421writeback_single_inode(struct inode *inode, struct writeback_control *wbc) 421writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
422{ 422{
423 struct address_space *mapping = inode->i_mapping; 423 struct address_space *mapping = inode->i_mapping;
424 int wait = wbc->sync_mode == WB_SYNC_ALL;
425 unsigned dirty; 424 unsigned dirty;
426 int ret; 425 int ret;
427 426
@@ -439,7 +438,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
439 * We'll have another go at writing back this inode when we 438 * We'll have another go at writing back this inode when we
440 * completed a full scan of b_io. 439 * completed a full scan of b_io.
441 */ 440 */
442 if (!wait) { 441 if (wbc->sync_mode != WB_SYNC_ALL) {
443 requeue_io(inode); 442 requeue_io(inode);
444 return 0; 443 return 0;
445 } 444 }
@@ -461,15 +460,20 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
461 460
462 ret = do_writepages(mapping, wbc); 461 ret = do_writepages(mapping, wbc);
463 462
464 /* Don't write the inode if only I_DIRTY_PAGES was set */ 463 /*
465 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 464 * Make sure to wait on the data before writing out the metadata.
466 int err = write_inode(inode, wait); 465 * This is important for filesystems that modify metadata on data
466 * I/O completion.
467 */
468 if (wbc->sync_mode == WB_SYNC_ALL) {
469 int err = filemap_fdatawait(mapping);
467 if (ret == 0) 470 if (ret == 0)
468 ret = err; 471 ret = err;
469 } 472 }
470 473
471 if (wait) { 474 /* Don't write the inode if only I_DIRTY_PAGES was set */
472 int err = filemap_fdatawait(mapping); 475 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
476 int err = write_inode(inode, wbc);
473 if (ret == 0) 477 if (ret == 0)
474 ret = err; 478 ret = err;
475 } 479 }
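The fs-writeback.c change reorders the sync path: under WB_SYNC_ALL the data wait now happens before write_inode(), since some filesystems dirty metadata when data I/O completes. A toy model of the new ordering, with stub functions standing in for the real I/O:

#include <stdio.h>

enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

struct writeback_control { enum sync_mode sync_mode; };

static int do_writepages(void)     { puts("write data pages");  return 0; }
static int filemap_fdatawait(void) { puts("wait for data I/O"); return 0; }

static int write_inode(const struct writeback_control *wbc)
{
        printf("write metadata (%s)\n",
               wbc->sync_mode == WB_SYNC_ALL ? "sync" : "async");
        return 0;
}

/* Mirrors the reordered hunk: wait on data first under WB_SYNC_ALL,
 * then write the (possibly just-dirtied) inode metadata. */
static int writeback_single_inode(struct writeback_control *wbc)
{
        int ret = do_writepages();

        if (wbc->sync_mode == WB_SYNC_ALL) {
                int err = filemap_fdatawait();
                if (ret == 0)
                        ret = err;
        }

        {
                int err = write_inode(wbc);
                if (ret == 0)
                        ret = err;
        }
        return ret;
}

int main(void)
{
        struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL };
        return writeback_single_inode(&wbc);
}
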
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index 864dac20a242..cc94bb9563f2 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -1,7 +1,6 @@
1 1
2config FSCACHE 2config FSCACHE
3 tristate "General filesystem local caching manager" 3 tristate "General filesystem local caching manager"
4 depends on EXPERIMENTAL
5 select SLOW_WORK 4 select SLOW_WORK
6 help 5 help
7 This option enables a generic filesystem caching manager that can be 6 This option enables a generic filesystem caching manager that can be
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index e513ac599c8e..0b589a9b4ffc 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -53,7 +53,7 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
53static void fscache_object_slow_work_put_ref(struct slow_work *); 53static void fscache_object_slow_work_put_ref(struct slow_work *);
54static int fscache_object_slow_work_get_ref(struct slow_work *); 54static int fscache_object_slow_work_get_ref(struct slow_work *);
55static void fscache_object_slow_work_execute(struct slow_work *); 55static void fscache_object_slow_work_execute(struct slow_work *);
56#ifdef CONFIG_SLOW_WORK_PROC 56#ifdef CONFIG_SLOW_WORK_DEBUG
57static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *); 57static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
58#endif 58#endif
59static void fscache_initialise_object(struct fscache_object *); 59static void fscache_initialise_object(struct fscache_object *);
@@ -69,7 +69,7 @@ const struct slow_work_ops fscache_object_slow_work_ops = {
69 .get_ref = fscache_object_slow_work_get_ref, 69 .get_ref = fscache_object_slow_work_get_ref,
70 .put_ref = fscache_object_slow_work_put_ref, 70 .put_ref = fscache_object_slow_work_put_ref,
71 .execute = fscache_object_slow_work_execute, 71 .execute = fscache_object_slow_work_execute,
72#ifdef CONFIG_SLOW_WORK_PROC 72#ifdef CONFIG_SLOW_WORK_DEBUG
73 .desc = fscache_object_slow_work_desc, 73 .desc = fscache_object_slow_work_desc,
74#endif 74#endif
75}; 75};
@@ -364,7 +364,7 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
364/* 364/*
365 * describe an object for slow-work debugging 365 * describe an object for slow-work debugging
366 */ 366 */
367#ifdef CONFIG_SLOW_WORK_PROC 367#ifdef CONFIG_SLOW_WORK_DEBUG
368static void fscache_object_slow_work_desc(struct slow_work *work, 368static void fscache_object_slow_work_desc(struct slow_work *work,
369 struct seq_file *m) 369 struct seq_file *m)
370{ 370{
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 313e79a14266..9f6c928d4586 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -500,7 +500,7 @@ static void fscache_op_execute(struct slow_work *work)
500/* 500/*
501 * describe an operation for slow-work debugging 501 * describe an operation for slow-work debugging
502 */ 502 */
503#ifdef CONFIG_SLOW_WORK_PROC 503#ifdef CONFIG_SLOW_WORK_DEBUG
504static void fscache_op_desc(struct slow_work *work, struct seq_file *m) 504static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
505{ 505{
506 struct fscache_operation *op = 506 struct fscache_operation *op =
@@ -517,7 +517,7 @@ const struct slow_work_ops fscache_op_slow_work_ops = {
517 .get_ref = fscache_op_get_ref, 517 .get_ref = fscache_op_get_ref,
518 .put_ref = fscache_op_put_ref, 518 .put_ref = fscache_op_put_ref,
519 .execute = fscache_op_execute, 519 .execute = fscache_op_execute,
520#ifdef CONFIG_SLOW_WORK_PROC 520#ifdef CONFIG_SLOW_WORK_DEBUG
521 .desc = fscache_op_desc, 521 .desc = fscache_op_desc,
522#endif 522#endif
523}; 523};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index c598ea4c4e7d..69809024d71d 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -881,6 +881,7 @@ submit_failed:
881 goto nobufs; 881 goto nobufs;
882 882
883nobufs_unlock_obj: 883nobufs_unlock_obj:
884 spin_unlock(&cookie->stores_lock);
884 spin_unlock(&object->lock); 885 spin_unlock(&object->lock);
885nobufs: 886nobufs:
886 spin_unlock(&cookie->lock); 887 spin_unlock(&cookie->lock);
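The page.c hunk fixes an error path that released object->lock but leaked cookie->stores_lock; the added line drops both, in reverse acquisition order. A hedged pthread sketch of the corrected path, with illustrative lock names:

#include <pthread.h>

static pthread_mutex_t object_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t stores_lock = PTHREAD_MUTEX_INITIALIZER;

/* The bail-out path must release every lock it holds, innermost first,
 * as the added spin_unlock(&cookie->stores_lock) above now ensures. */
static int try_store(int ok)
{
        pthread_mutex_lock(&object_lock);
        pthread_mutex_lock(&stores_lock);

        if (!ok) {
                pthread_mutex_unlock(&stores_lock);
                pthread_mutex_unlock(&object_lock);
                return -1;
        }
        /* ... queue the store ... */
        pthread_mutex_unlock(&stores_lock);
        pthread_mutex_unlock(&object_lock);
        return 0;
}

int main(void) { return try_store(1); }
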
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 51d9e33d634f..eb7e9423691f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -865,13 +865,10 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
865 865
866 down_read(&fc->killsb); 866 down_read(&fc->killsb);
867 err = -ENOENT; 867 err = -ENOENT;
868 if (!fc->sb) 868 if (fc->sb) {
869 goto err_unlock; 869 err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
870 870 outarg.off, outarg.len);
871 err = fuse_reverse_inval_inode(fc->sb, outarg.ino, 871 }
872 outarg.off, outarg.len);
873
874err_unlock:
875 up_read(&fc->killsb); 872 up_read(&fc->killsb);
876 return err; 873 return err;
877 874
@@ -884,10 +881,15 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
884 struct fuse_copy_state *cs) 881 struct fuse_copy_state *cs)
885{ 882{
886 struct fuse_notify_inval_entry_out outarg; 883 struct fuse_notify_inval_entry_out outarg;
887 int err = -EINVAL; 884 int err = -ENOMEM;
888 char buf[FUSE_NAME_MAX+1]; 885 char *buf;
889 struct qstr name; 886 struct qstr name;
890 887
888 buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
889 if (!buf)
890 goto err;
891
892 err = -EINVAL;
891 if (size < sizeof(outarg)) 893 if (size < sizeof(outarg))
892 goto err; 894 goto err;
893 895
@@ -910,16 +912,14 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
910 912
911 down_read(&fc->killsb); 913 down_read(&fc->killsb);
912 err = -ENOENT; 914 err = -ENOENT;
913 if (!fc->sb) 915 if (fc->sb)
914 goto err_unlock; 916 err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
915
916 err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
917
918err_unlock:
919 up_read(&fc->killsb); 917 up_read(&fc->killsb);
918 kfree(buf);
920 return err; 919 return err;
921 920
922err: 921err:
922 kfree(buf);
923 fuse_copy_finish(cs); 923 fuse_copy_finish(cs);
924 return err; 924 return err;
925} 925}
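A char buf[FUSE_NAME_MAX + 1] array is about 1 KiB, which is a lot for a kernel stack, so the dev.c hunk moves it to the heap and frees it on every exit path. A userspace sketch of the same goto-based cleanup shape; calloc/free stand in for kzalloc/kfree, and the FUSE_NAME_MAX value and error constants here are illustrative:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define FUSE_NAME_MAX 1024      /* illustrative; the real value is in fuse.h */
#define ENOMEM_ERR    (-12)
#define EINVAL_ERR    (-22)

static int handle_inval_entry(const char *name_in)
{
        char *buf;
        int err = ENOMEM_ERR;

        buf = calloc(1, FUSE_NAME_MAX + 1);
        if (!buf)
                goto err;

        err = EINVAL_ERR;
        if (strlen(name_in) > FUSE_NAME_MAX)
                goto err;

        memcpy(buf, name_in, strlen(name_in));
        printf("invalidate entry: %s\n", buf);
        err = 0;
err:
        free(buf);              /* every exit path releases the buffer */
        return err;
}

int main(void)
{
        return handle_inval_entry("lost+found");
}
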
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1a822ce2b24b..ec14d19ce501 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -850,7 +850,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
850 req->in.args[0].size = sizeof(*arg); 850 req->in.args[0].size = sizeof(*arg);
851 req->in.args[0].value = arg; 851 req->in.args[0].value = arg;
852 req->out.numargs = 1; 852 req->out.numargs = 1;
853 /* Variable length arguement used for backward compatibility 853 /* Variable length argument used for backward compatibility
854 with interface version < 7.5. Rest of init_out is zeroed 854 with interface version < 7.5. Rest of init_out is zeroed
855 by do_get_request(), so a short reply is not a problem */ 855 by do_get_request(), so a short reply is not a problem */
856 req->out.argvar = 1; 856 req->out.argvar = 1;
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 4dcddf83326f..a47b43107112 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -8,7 +8,6 @@ config GFS2_FS
8 select FS_POSIX_ACL 8 select FS_POSIX_ACL
9 select CRC32 9 select CRC32
10 select SLOW_WORK 10 select SLOW_WORK
11 select QUOTA
12 select QUOTACTL 11 select QUOTACTL
13 help 12 help
14 A cluster filesystem. 13 A cluster filesystem.
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 7b8da9415267..0c1d0b82dcf1 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1061,8 +1061,8 @@ out:
1061 1061
1062int gfs2_releasepage(struct page *page, gfp_t gfp_mask) 1062int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
1063{ 1063{
1064 struct inode *aspace = page->mapping->host; 1064 struct address_space *mapping = page->mapping;
1065 struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info; 1065 struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
1066 struct buffer_head *bh, *head; 1066 struct buffer_head *bh, *head;
1067 struct gfs2_bufdata *bd; 1067 struct gfs2_bufdata *bd;
1068 1068
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index a6abbae8a278..e6dd2aec6f82 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -640,7 +640,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
640 640
641 if (!(fl->fl_flags & FL_POSIX)) 641 if (!(fl->fl_flags & FL_POSIX))
642 return -ENOLCK; 642 return -ENOLCK;
643 if (__mandatory_lock(&ip->i_inode)) 643 if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK)
644 return -ENOLCK; 644 return -ENOLCK;
645 645
646 if (cmd == F_CANCELLK) { 646 if (cmd == F_CANCELLK) {
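The file.c hunk stops refusing unlock requests under mandatory locking; presumably a lock holder must always be able to release its lock even when mandatory locking would reject new ones. A tiny sketch of the amended check (constants illustrative):

#include <stdio.h>

#define F_UNLCK 2

/* Refuse POSIX locks under mandatory locking, but always let
 * unlock requests through, as the amended test above does. */
static int lock_allowed(int mandatory, int fl_type)
{
        if (mandatory && fl_type != F_UNLCK)
                return 0;   /* -ENOLCK */
        return 1;
}

int main(void)
{
        printf("lock under mandatory:   %d\n", lock_allowed(1, 0));
        printf("unlock under mandatory: %d\n", lock_allowed(1, F_UNLCK));
        return 0;
}
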
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index f42663325931..454d4b4eb36b 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -19,7 +19,6 @@
19#include <linux/list.h> 19#include <linux/list.h>
20#include <linux/wait.h> 20#include <linux/wait.h>
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/rwsem.h>
23#include <asm/uaccess.h> 22#include <asm/uaccess.h>
24#include <linux/seq_file.h> 23#include <linux/seq_file.h>
25#include <linux/debugfs.h> 24#include <linux/debugfs.h>
@@ -60,7 +59,6 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
60#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) 59#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
61static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); 60static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
62 61
63static DECLARE_RWSEM(gfs2_umount_flush_sem);
64static struct dentry *gfs2_root; 62static struct dentry *gfs2_root;
65static struct workqueue_struct *glock_workqueue; 63static struct workqueue_struct *glock_workqueue;
66struct workqueue_struct *gfs2_delete_workqueue; 64struct workqueue_struct *gfs2_delete_workqueue;
@@ -154,12 +152,14 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp,
154static void glock_free(struct gfs2_glock *gl) 152static void glock_free(struct gfs2_glock *gl)
155{ 153{
156 struct gfs2_sbd *sdp = gl->gl_sbd; 154 struct gfs2_sbd *sdp = gl->gl_sbd;
157 struct inode *aspace = gl->gl_aspace; 155 struct address_space *mapping = gfs2_glock2aspace(gl);
156 struct kmem_cache *cachep = gfs2_glock_cachep;
158 157
159 if (aspace) 158 GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
160 gfs2_aspace_put(aspace);
161 trace_gfs2_glock_put(gl); 159 trace_gfs2_glock_put(gl);
162 sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl); 160 if (mapping)
161 cachep = gfs2_glock_aspace_cachep;
162 sdp->sd_lockstruct.ls_ops->lm_put_lock(cachep, gl);
163} 163}
164 164
165/** 165/**
@@ -712,7 +712,6 @@ static void glock_work_func(struct work_struct *work)
712 finish_xmote(gl, gl->gl_reply); 712 finish_xmote(gl, gl->gl_reply);
713 drop_ref = 1; 713 drop_ref = 1;
714 } 714 }
715 down_read(&gfs2_umount_flush_sem);
716 spin_lock(&gl->gl_spin); 715 spin_lock(&gl->gl_spin);
717 if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && 716 if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
718 gl->gl_state != LM_ST_UNLOCKED && 717 gl->gl_state != LM_ST_UNLOCKED &&
@@ -725,7 +724,6 @@ static void glock_work_func(struct work_struct *work)
725 } 724 }
726 run_queue(gl, 0); 725 run_queue(gl, 0);
727 spin_unlock(&gl->gl_spin); 726 spin_unlock(&gl->gl_spin);
728 up_read(&gfs2_umount_flush_sem);
729 if (!delay || 727 if (!delay ||
730 queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) 728 queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
731 gfs2_glock_put(gl); 729 gfs2_glock_put(gl);
@@ -750,10 +748,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
750 const struct gfs2_glock_operations *glops, int create, 748 const struct gfs2_glock_operations *glops, int create,
751 struct gfs2_glock **glp) 749 struct gfs2_glock **glp)
752{ 750{
751 struct super_block *s = sdp->sd_vfs;
753 struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type }; 752 struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
754 struct gfs2_glock *gl, *tmp; 753 struct gfs2_glock *gl, *tmp;
755 unsigned int hash = gl_hash(sdp, &name); 754 unsigned int hash = gl_hash(sdp, &name);
756 int error; 755 struct address_space *mapping;
757 756
758 read_lock(gl_lock_addr(hash)); 757 read_lock(gl_lock_addr(hash));
759 gl = search_bucket(hash, sdp, &name); 758 gl = search_bucket(hash, sdp, &name);
@@ -765,7 +764,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
765 if (!create) 764 if (!create)
766 return -ENOENT; 765 return -ENOENT;
767 766
768 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL); 767 if (glops->go_flags & GLOF_ASPACE)
768 gl = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_KERNEL);
769 else
770 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
769 if (!gl) 771 if (!gl)
770 return -ENOMEM; 772 return -ENOMEM;
771 773
@@ -784,18 +786,18 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
784 gl->gl_tchange = jiffies; 786 gl->gl_tchange = jiffies;
785 gl->gl_object = NULL; 787 gl->gl_object = NULL;
786 gl->gl_sbd = sdp; 788 gl->gl_sbd = sdp;
787 gl->gl_aspace = NULL;
788 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); 789 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
789 INIT_WORK(&gl->gl_delete, delete_work_func); 790 INIT_WORK(&gl->gl_delete, delete_work_func);
790 791
791 /* If this glock protects actual on-disk data or metadata blocks, 792 mapping = gfs2_glock2aspace(gl);
792 create a VFS inode to manage the pages/buffers holding them. */ 793 if (mapping) {
793 if (glops == &gfs2_inode_glops || glops == &gfs2_rgrp_glops) { 794 mapping->a_ops = &gfs2_meta_aops;
794 gl->gl_aspace = gfs2_aspace_get(sdp); 795 mapping->host = s->s_bdev->bd_inode;
795 if (!gl->gl_aspace) { 796 mapping->flags = 0;
796 error = -ENOMEM; 797 mapping_set_gfp_mask(mapping, GFP_NOFS);
797 goto fail; 798 mapping->assoc_mapping = NULL;
798 } 799 mapping->backing_dev_info = s->s_bdi;
800 mapping->writeback_index = 0;
799 } 801 }
800 802
801 write_lock(gl_lock_addr(hash)); 803 write_lock(gl_lock_addr(hash));
@@ -812,10 +814,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
812 *glp = gl; 814 *glp = gl;
813 815
814 return 0; 816 return 0;
815
816fail:
817 kmem_cache_free(gfs2_glock_cachep, gl);
818 return error;
819} 817}
820 818
821/** 819/**
@@ -1510,35 +1508,10 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1510 1508
1511void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) 1509void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
1512{ 1510{
1513 unsigned long t;
1514 unsigned int x; 1511 unsigned int x;
1515 int cont;
1516 1512
1517 t = jiffies; 1513 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1518 1514 examine_bucket(clear_glock, sdp, x);
1519 for (;;) {
1520 cont = 0;
1521 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
1522 if (examine_bucket(clear_glock, sdp, x))
1523 cont = 1;
1524 }
1525
1526 if (!cont)
1527 break;
1528
1529 if (time_after_eq(jiffies,
1530 t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
1531 fs_warn(sdp, "Unmount seems to be stalled. "
1532 "Dumping lock state...\n");
1533 gfs2_dump_lockstate(sdp);
1534 t = jiffies;
1535 }
1536
1537 down_write(&gfs2_umount_flush_sem);
1538 invalidate_inodes(sdp->sd_vfs);
1539 up_write(&gfs2_umount_flush_sem);
1540 msleep(10);
1541 }
1542 flush_workqueue(glock_workqueue); 1515 flush_workqueue(glock_workqueue);
1543 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); 1516 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
1544 gfs2_dump_lockstate(sdp); 1517 gfs2_dump_lockstate(sdp);
@@ -1685,7 +1658,7 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1685 dtime *= 1000000/HZ; /* demote time in uSec */ 1658 dtime *= 1000000/HZ; /* demote time in uSec */
1686 if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) 1659 if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
1687 dtime = 0; 1660 dtime = 0;
1688 gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu a:%d r:%d\n", 1661 gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d r:%d\n",
1689 state2str(gl->gl_state), 1662 state2str(gl->gl_state),
1690 gl->gl_name.ln_type, 1663 gl->gl_name.ln_type,
1691 (unsigned long long)gl->gl_name.ln_number, 1664 (unsigned long long)gl->gl_name.ln_number,
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index c0262faf4725..2bda1911b156 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -180,6 +180,13 @@ static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
180 return gl->gl_state == LM_ST_SHARED; 180 return gl->gl_state == LM_ST_SHARED;
181} 181}
182 182
183static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
184{
185 if (gl->gl_ops->go_flags & GLOF_ASPACE)
186 return (struct address_space *)(gl + 1);
187 return NULL;
188}
189
183int gfs2_glock_get(struct gfs2_sbd *sdp, 190int gfs2_glock_get(struct gfs2_sbd *sdp,
184 u64 number, const struct gfs2_glock_operations *glops, 191 u64 number, const struct gfs2_glock_operations *glops,
185 int create, struct gfs2_glock **glp); 192 int create, struct gfs2_glock **glp);
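gfs2_glock2aspace() relies on the address_space being co-allocated immediately after the glock in a single slab object, so gl + 1 points at it. A userspace sketch of the trailing-allocation trick; the struct names and fields are illustrative:

#include <stdio.h>
#include <stdlib.h>

struct mapping { long nrpages; };

struct glock {
        unsigned long flags;
};
#define GLOF_ASPACE 1

/* The trailing object lives immediately after the glock, so gl + 1
 * points at it; same arithmetic as gfs2_glock2aspace() above. */
static struct mapping *glock2mapping(struct glock *gl)
{
        if (gl->flags & GLOF_ASPACE)
                return (struct mapping *)(gl + 1);
        return NULL;
}

int main(void)
{
        struct glock *gl = calloc(1, sizeof(*gl) + sizeof(struct mapping));

        if (!gl)
                return 1;
        gl->flags = GLOF_ASPACE;
        glock2mapping(gl)->nrpages = 42;
        printf("nrpages = %ld\n", glock2mapping(gl)->nrpages);
        free(gl);
        return 0;
}
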
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 78554acc0605..38e3749d476c 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -87,7 +87,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
87 87
88static void rgrp_go_sync(struct gfs2_glock *gl) 88static void rgrp_go_sync(struct gfs2_glock *gl)
89{ 89{
90 struct address_space *metamapping = gl->gl_aspace->i_mapping; 90 struct address_space *metamapping = gfs2_glock2aspace(gl);
91 int error; 91 int error;
92 92
93 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) 93 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
@@ -113,7 +113,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
113 113
114static void rgrp_go_inval(struct gfs2_glock *gl, int flags) 114static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
115{ 115{
116 struct address_space *mapping = gl->gl_aspace->i_mapping; 116 struct address_space *mapping = gfs2_glock2aspace(gl);
117 117
118 BUG_ON(!(flags & DIO_METADATA)); 118 BUG_ON(!(flags & DIO_METADATA));
119 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); 119 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
@@ -134,7 +134,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
134static void inode_go_sync(struct gfs2_glock *gl) 134static void inode_go_sync(struct gfs2_glock *gl)
135{ 135{
136 struct gfs2_inode *ip = gl->gl_object; 136 struct gfs2_inode *ip = gl->gl_object;
137 struct address_space *metamapping = gl->gl_aspace->i_mapping; 137 struct address_space *metamapping = gfs2_glock2aspace(gl);
138 int error; 138 int error;
139 139
140 if (ip && !S_ISREG(ip->i_inode.i_mode)) 140 if (ip && !S_ISREG(ip->i_inode.i_mode))
@@ -183,7 +183,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
183 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); 183 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
184 184
185 if (flags & DIO_METADATA) { 185 if (flags & DIO_METADATA) {
186 struct address_space *mapping = gl->gl_aspace->i_mapping; 186 struct address_space *mapping = gfs2_glock2aspace(gl);
187 truncate_inode_pages(mapping, 0); 187 truncate_inode_pages(mapping, 0);
188 if (ip) { 188 if (ip) {
189 set_bit(GIF_INVALID, &ip->i_flags); 189 set_bit(GIF_INVALID, &ip->i_flags);
@@ -282,7 +282,8 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
282 282
283static int rgrp_go_demote_ok(const struct gfs2_glock *gl) 283static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
284{ 284{
285 return !gl->gl_aspace->i_mapping->nrpages; 285 const struct address_space *mapping = (const struct address_space *)(gl + 1);
286 return !mapping->nrpages;
286} 287}
287 288
288/** 289/**
@@ -387,8 +388,7 @@ static void iopen_go_callback(struct gfs2_glock *gl)
387 struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; 388 struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
388 389
389 if (gl->gl_demote_state == LM_ST_UNLOCKED && 390 if (gl->gl_demote_state == LM_ST_UNLOCKED &&
390 gl->gl_state == LM_ST_SHARED && 391 gl->gl_state == LM_ST_SHARED && ip) {
391 ip && test_bit(GIF_USER, &ip->i_flags)) {
392 gfs2_glock_hold(gl); 392 gfs2_glock_hold(gl);
393 if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) 393 if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
394 gfs2_glock_put_nolock(gl); 394 gfs2_glock_put_nolock(gl);
@@ -407,6 +407,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
407 .go_dump = inode_go_dump, 407 .go_dump = inode_go_dump,
408 .go_type = LM_TYPE_INODE, 408 .go_type = LM_TYPE_INODE,
409 .go_min_hold_time = HZ / 5, 409 .go_min_hold_time = HZ / 5,
410 .go_flags = GLOF_ASPACE,
410}; 411};
411 412
412const struct gfs2_glock_operations gfs2_rgrp_glops = { 413const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -418,6 +419,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
418 .go_dump = gfs2_rgrp_dump, 419 .go_dump = gfs2_rgrp_dump,
419 .go_type = LM_TYPE_RGRP, 420 .go_type = LM_TYPE_RGRP,
420 .go_min_hold_time = HZ / 5, 421 .go_min_hold_time = HZ / 5,
422 .go_flags = GLOF_ASPACE,
421}; 423};
422 424
423const struct gfs2_glock_operations gfs2_trans_glops = { 425const struct gfs2_glock_operations gfs2_trans_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index bc0ad158e6b4..3aac46f6853e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -162,6 +162,8 @@ struct gfs2_glock_operations {
162 void (*go_callback) (struct gfs2_glock *gl); 162 void (*go_callback) (struct gfs2_glock *gl);
163 const int go_type; 163 const int go_type;
164 const unsigned long go_min_hold_time; 164 const unsigned long go_min_hold_time;
165 const unsigned long go_flags;
166#define GLOF_ASPACE 1
165}; 167};
166 168
167enum { 169enum {
@@ -225,7 +227,6 @@ struct gfs2_glock {
225 227
226 struct gfs2_sbd *gl_sbd; 228 struct gfs2_sbd *gl_sbd;
227 229
228 struct inode *gl_aspace;
229 struct list_head gl_ail_list; 230 struct list_head gl_ail_list;
230 atomic_t gl_ail_count; 231 atomic_t gl_ail_count;
231 struct delayed_work gl_work; 232 struct delayed_work gl_work;
@@ -258,7 +259,6 @@ enum {
258 GIF_INVALID = 0, 259 GIF_INVALID = 0,
259 GIF_QD_LOCKED = 1, 260 GIF_QD_LOCKED = 1,
260 GIF_SW_PAGED = 3, 261 GIF_SW_PAGED = 3,
261 GIF_USER = 4, /* user inode, not metadata addr space */
262}; 262};
263 263
264 264
@@ -451,7 +451,6 @@ struct gfs2_tune {
451 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ 451 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
452 unsigned int gt_new_files_jdata; 452 unsigned int gt_new_files_jdata;
453 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ 453 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
454 unsigned int gt_stall_secs; /* Detects trouble! */
455 unsigned int gt_complain_secs; 454 unsigned int gt_complain_secs;
456 unsigned int gt_statfs_quantum; 455 unsigned int gt_statfs_quantum;
457 unsigned int gt_statfs_slow; 456 unsigned int gt_statfs_slow;
@@ -617,7 +616,7 @@ struct gfs2_sbd {
617 unsigned int sd_log_blks_reserved; 616 unsigned int sd_log_blks_reserved;
618 unsigned int sd_log_commited_buf; 617 unsigned int sd_log_commited_buf;
619 unsigned int sd_log_commited_databuf; 618 unsigned int sd_log_commited_databuf;
620 unsigned int sd_log_commited_revoke; 619 int sd_log_commited_revoke;
621 620
622 unsigned int sd_log_num_buf; 621 unsigned int sd_log_num_buf;
623 unsigned int sd_log_num_revoke; 622 unsigned int sd_log_num_revoke;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6e220f4eee7d..b1bf2694fb2b 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -45,7 +45,7 @@ static int iget_test(struct inode *inode, void *opaque)
45 struct gfs2_inode *ip = GFS2_I(inode); 45 struct gfs2_inode *ip = GFS2_I(inode);
46 u64 *no_addr = opaque; 46 u64 *no_addr = opaque;
47 47
48 if (ip->i_no_addr == *no_addr && test_bit(GIF_USER, &ip->i_flags)) 48 if (ip->i_no_addr == *no_addr)
49 return 1; 49 return 1;
50 50
51 return 0; 51 return 0;
@@ -58,7 +58,6 @@ static int iget_set(struct inode *inode, void *opaque)
58 58
59 inode->i_ino = (unsigned long)*no_addr; 59 inode->i_ino = (unsigned long)*no_addr;
60 ip->i_no_addr = *no_addr; 60 ip->i_no_addr = *no_addr;
61 set_bit(GIF_USER, &ip->i_flags);
62 return 0; 61 return 0;
63} 62}
64 63
@@ -84,7 +83,7 @@ static int iget_skip_test(struct inode *inode, void *opaque)
84 struct gfs2_inode *ip = GFS2_I(inode); 83 struct gfs2_inode *ip = GFS2_I(inode);
85 struct gfs2_skip_data *data = opaque; 84 struct gfs2_skip_data *data = opaque;
86 85
87 if (ip->i_no_addr == data->no_addr && test_bit(GIF_USER, &ip->i_flags)){ 86 if (ip->i_no_addr == data->no_addr) {
88 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){ 87 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){
89 data->skipped = 1; 88 data->skipped = 1;
90 return 0; 89 return 0;
@@ -103,7 +102,6 @@ static int iget_skip_set(struct inode *inode, void *opaque)
103 return 1; 102 return 1;
104 inode->i_ino = (unsigned long)(data->no_addr); 103 inode->i_ino = (unsigned long)(data->no_addr);
105 ip->i_no_addr = data->no_addr; 104 ip->i_no_addr = data->no_addr;
106 set_bit(GIF_USER, &ip->i_flags);
107 return 0; 105 return 0;
108} 106}
109 107
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 0e5e0e7022e5..569b46240f61 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -30,7 +30,10 @@ static void gdlm_ast(void *arg)
30 30
31 switch (gl->gl_lksb.sb_status) { 31 switch (gl->gl_lksb.sb_status) {
32 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ 32 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
33 kmem_cache_free(gfs2_glock_cachep, gl); 33 if (gl->gl_ops->go_flags & GLOF_ASPACE)
34 kmem_cache_free(gfs2_glock_aspace_cachep, gl);
35 else
36 kmem_cache_free(gfs2_glock_cachep, gl);
34 if (atomic_dec_and_test(&sdp->sd_glock_disposal)) 37 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
35 wake_up(&sdp->sd_glock_wait); 38 wake_up(&sdp->sd_glock_wait);
36 return; 39 return;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 4511b08fc451..e5bf4b59d46e 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -417,7 +417,7 @@ static unsigned int calc_reserved(struct gfs2_sbd *sdp)
417 databufhdrs_needed = (sdp->sd_log_commited_databuf + 417 databufhdrs_needed = (sdp->sd_log_commited_databuf +
418 (dbuf_limit - 1)) / dbuf_limit; 418 (dbuf_limit - 1)) / dbuf_limit;
419 419
420 if (sdp->sd_log_commited_revoke) 420 if (sdp->sd_log_commited_revoke > 0)
421 revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, 421 revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
422 sizeof(u64)); 422 sizeof(u64));
423 423
@@ -790,7 +790,6 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
790 gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) || 790 gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) ||
791 (((int)sdp->sd_log_commited_databuf) >= 0)); 791 (((int)sdp->sd_log_commited_databuf) >= 0));
792 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; 792 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
793 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
794 reserved = calc_reserved(sdp); 793 reserved = calc_reserved(sdp);
795 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); 794 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved);
796 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; 795 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
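Together with the incore.h hunk that turns sd_log_commited_revoke into a plain int, the '> 0' test here avoids the classic unsigned-wrap trap: a transient negative sum would previously read as a huge nonzero value. A small demonstration of the difference:

#include <stdio.h>

int main(void)
{
        unsigned int u = 0;
        int s = 0;

        u -= 1;   /* wraps to a huge value */
        s -= 1;   /* stays -1 */

        /* With the unsigned counter, 'if (u)' is true after the wrap;
         * the signed counter plus '> 0' avoids reserving log blocks
         * for revokes that net out to zero or less. */
        printf("unsigned after -1: %u (truthy = %d)\n", u, u != 0);
        printf("signed   after -1: %d (> 0   = %d)\n", s, s > 0);
        return 0;
}
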
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index de97632ba32f..adc260fbea90 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -528,9 +528,9 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
528 gfs2_pin(sdp, bd->bd_bh); 528 gfs2_pin(sdp, bd->bd_bh);
529 tr->tr_num_databuf_new++; 529 tr->tr_num_databuf_new++;
530 sdp->sd_log_num_databuf++; 530 sdp->sd_log_num_databuf++;
531 list_add(&le->le_list, &sdp->sd_log_le_databuf); 531 list_add_tail(&le->le_list, &sdp->sd_log_le_databuf);
532 } else { 532 } else {
533 list_add(&le->le_list, &sdp->sd_log_le_ordered); 533 list_add_tail(&le->le_list, &sdp->sd_log_le_ordered);
534 } 534 }
535out: 535out:
536 gfs2_log_unlock(sdp); 536 gfs2_log_unlock(sdp);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 5b31f7741a8f..a88fadc704bb 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -52,6 +52,22 @@ static void gfs2_init_glock_once(void *foo)
52 atomic_set(&gl->gl_ail_count, 0); 52 atomic_set(&gl->gl_ail_count, 0);
53} 53}
54 54
55static void gfs2_init_gl_aspace_once(void *foo)
56{
57 struct gfs2_glock *gl = foo;
58 struct address_space *mapping = (struct address_space *)(gl + 1);
59
60 gfs2_init_glock_once(gl);
61 memset(mapping, 0, sizeof(*mapping));
62 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
63 spin_lock_init(&mapping->tree_lock);
64 spin_lock_init(&mapping->i_mmap_lock);
65 INIT_LIST_HEAD(&mapping->private_list);
66 spin_lock_init(&mapping->private_lock);
67 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
68 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
69}
70
55/** 71/**
56 * init_gfs2_fs - Register GFS2 as a filesystem 72 * init_gfs2_fs - Register GFS2 as a filesystem
57 * 73 *
@@ -78,6 +94,14 @@ static int __init init_gfs2_fs(void)
78 if (!gfs2_glock_cachep) 94 if (!gfs2_glock_cachep)
79 goto fail; 95 goto fail;
80 96
97 gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock (aspace)",
98 sizeof(struct gfs2_glock) +
99 sizeof(struct address_space),
100 0, 0, gfs2_init_gl_aspace_once);
101
102 if (!gfs2_glock_aspace_cachep)
103 goto fail;
104
81 gfs2_inode_cachep = kmem_cache_create("gfs2_inode", 105 gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
82 sizeof(struct gfs2_inode), 106 sizeof(struct gfs2_inode),
83 0, SLAB_RECLAIM_ACCOUNT| 107 0, SLAB_RECLAIM_ACCOUNT|
@@ -144,6 +168,9 @@ fail:
144 if (gfs2_inode_cachep) 168 if (gfs2_inode_cachep)
145 kmem_cache_destroy(gfs2_inode_cachep); 169 kmem_cache_destroy(gfs2_inode_cachep);
146 170
171 if (gfs2_glock_aspace_cachep)
172 kmem_cache_destroy(gfs2_glock_aspace_cachep);
173
147 if (gfs2_glock_cachep) 174 if (gfs2_glock_cachep)
148 kmem_cache_destroy(gfs2_glock_cachep); 175 kmem_cache_destroy(gfs2_glock_cachep);
149 176
@@ -169,6 +196,7 @@ static void __exit exit_gfs2_fs(void)
169 kmem_cache_destroy(gfs2_rgrpd_cachep); 196 kmem_cache_destroy(gfs2_rgrpd_cachep);
170 kmem_cache_destroy(gfs2_bufdata_cachep); 197 kmem_cache_destroy(gfs2_bufdata_cachep);
171 kmem_cache_destroy(gfs2_inode_cachep); 198 kmem_cache_destroy(gfs2_inode_cachep);
199 kmem_cache_destroy(gfs2_glock_aspace_cachep);
172 kmem_cache_destroy(gfs2_glock_cachep); 200 kmem_cache_destroy(gfs2_glock_cachep);
173 201
174 gfs2_sys_uninit(); 202 gfs2_sys_uninit();
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 6f68a5f18eb8..0bb12c80937a 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -93,49 +93,13 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
93 return err; 93 return err;
94} 94}
95 95
96static const struct address_space_operations aspace_aops = { 96const struct address_space_operations gfs2_meta_aops = {
97 .writepage = gfs2_aspace_writepage, 97 .writepage = gfs2_aspace_writepage,
98 .releasepage = gfs2_releasepage, 98 .releasepage = gfs2_releasepage,
99 .sync_page = block_sync_page, 99 .sync_page = block_sync_page,
100}; 100};
101 101
102/** 102/**
103 * gfs2_aspace_get - Create and initialize a struct inode structure
104 * @sdp: the filesystem the aspace is in
105 *
106 * Right now a struct inode is just a struct inode. Maybe Linux
107 * will supply a more lightweight address space construct (that works)
108 * in the future.
109 *
110 * Make sure pages/buffers in this aspace aren't in high memory.
111 *
112 * Returns: the aspace
113 */
114
115struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
116{
117 struct inode *aspace;
118 struct gfs2_inode *ip;
119
120 aspace = new_inode(sdp->sd_vfs);
121 if (aspace) {
122 mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS);
123 aspace->i_mapping->a_ops = &aspace_aops;
124 aspace->i_size = MAX_LFS_FILESIZE;
125 ip = GFS2_I(aspace);
126 clear_bit(GIF_USER, &ip->i_flags);
127 insert_inode_hash(aspace);
128 }
129 return aspace;
130}
131
132void gfs2_aspace_put(struct inode *aspace)
133{
134 remove_inode_hash(aspace);
135 iput(aspace);
136}
137
138/**
139 * gfs2_meta_sync - Sync all buffers associated with a glock 103 * gfs2_meta_sync - Sync all buffers associated with a glock
140 * @gl: The glock 104 * @gl: The glock
141 * 105 *
@@ -143,7 +107,7 @@ void gfs2_aspace_put(struct inode *aspace)
143 107
144void gfs2_meta_sync(struct gfs2_glock *gl) 108void gfs2_meta_sync(struct gfs2_glock *gl)
145{ 109{
146 struct address_space *mapping = gl->gl_aspace->i_mapping; 110 struct address_space *mapping = gfs2_glock2aspace(gl);
147 int error; 111 int error;
148 112
149 filemap_fdatawrite(mapping); 113 filemap_fdatawrite(mapping);
@@ -164,7 +128,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
164 128
165struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) 129struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
166{ 130{
167 struct address_space *mapping = gl->gl_aspace->i_mapping; 131 struct address_space *mapping = gfs2_glock2aspace(gl);
168 struct gfs2_sbd *sdp = gl->gl_sbd; 132 struct gfs2_sbd *sdp = gl->gl_sbd;
169 struct page *page; 133 struct page *page;
170 struct buffer_head *bh; 134 struct buffer_head *bh;
@@ -344,8 +308,10 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
344 308
345void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta) 309void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta)
346{ 310{
347 struct gfs2_sbd *sdp = GFS2_SB(bh->b_page->mapping->host); 311 struct address_space *mapping = bh->b_page->mapping;
312 struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
348 struct gfs2_bufdata *bd = bh->b_private; 313 struct gfs2_bufdata *bd = bh->b_private;
314
349 if (test_clear_buffer_pinned(bh)) { 315 if (test_clear_buffer_pinned(bh)) {
350 list_del_init(&bd->bd_le.le_list); 316 list_del_init(&bd->bd_le.le_list);
351 if (meta) { 317 if (meta) {
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index de270c2f9b63..6a1d9ba16411 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -37,8 +37,16 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
37 0, from_head - to_head); 37 0, from_head - to_head);
38} 38}
39 39
40struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp); 40extern const struct address_space_operations gfs2_meta_aops;
41void gfs2_aspace_put(struct inode *aspace); 41
42static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
43{
44 struct inode *inode = mapping->host;
45 if (mapping->a_ops == &gfs2_meta_aops)
46 return (((struct gfs2_glock *)mapping) - 1)->gl_sbd;
47 else
48 return inode->i_sb->s_fs_info;
49}
42 50
43void gfs2_meta_sync(struct gfs2_glock *gl); 51void gfs2_meta_sync(struct gfs2_glock *gl);
44 52
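gfs2_mapping2sbd() inverts the co-allocation used by gfs2_glock2aspace(): when the a_ops pointer identifies a metadata mapping, stepping back one glock-sized object recovers the owning glock. A sketch of that back-pointer arithmetic under the same layout assumption (names illustrative):

#include <stdio.h>
#include <stdlib.h>

struct mapping { const void *a_ops; };
struct glock   { long sbd_id; };

static const char meta_aops_tag;  /* stand-in for &gfs2_meta_aops */

/* A metadata mapping sits immediately after its glock, so one
 * glock-sized step backwards finds the owner, as above. */
static struct glock *mapping2glock(struct mapping *m)
{
        if (m->a_ops == &meta_aops_tag)
                return ((struct glock *)(void *)m) - 1;
        return NULL;
}

int main(void)
{
        struct glock *gl = calloc(1, sizeof(struct glock) +
                                     sizeof(struct mapping));
        struct mapping *m;

        if (!gl)
                return 1;
        gl->sbd_id = 7;
        m = (struct mapping *)(gl + 1);
        m->a_ops = &meta_aops_tag;
        printf("owner id = %ld\n", mapping2glock(m)->sbd_id);
        free(gl);
        return 0;
}
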
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index a86ed6381566..c1309ed1c496 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -65,7 +65,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
65 gt->gt_quota_scale_den = 1; 65 gt->gt_quota_scale_den = 1;
66 gt->gt_new_files_jdata = 0; 66 gt->gt_new_files_jdata = 0;
67 gt->gt_max_readahead = 1 << 18; 67 gt->gt_max_readahead = 1 << 18;
68 gt->gt_stall_secs = 600;
69 gt->gt_complain_secs = 10; 68 gt->gt_complain_secs = 10;
70} 69}
71 70
@@ -1002,7 +1001,7 @@ static const struct lm_lockops nolock_ops = {
1002/** 1001/**
1003 * gfs2_lm_mount - mount a locking protocol 1002 * gfs2_lm_mount - mount a locking protocol
1004 * @sdp: the filesystem 1003 * @sdp: the filesystem
1005 * @args: mount arguements 1004 * @args: mount arguments
1006 * @silent: if 1, don't complain if the FS isn't a GFS2 fs 1005 * @silent: if 1, don't complain if the FS isn't a GFS2 fs
1007 * 1006 *
1008 * Returns: errno 1007 * Returns: errno
@@ -1241,10 +1240,9 @@ fail_sb:
1241fail_locking: 1240fail_locking:
1242 init_locking(sdp, &mount_gh, UNDO); 1241 init_locking(sdp, &mount_gh, UNDO);
1243fail_lm: 1242fail_lm:
1243 invalidate_inodes(sb);
1244 gfs2_gl_hash_clear(sdp); 1244 gfs2_gl_hash_clear(sdp);
1245 gfs2_lm_unmount(sdp); 1245 gfs2_lm_unmount(sdp);
1246 while (invalidate_inodes(sb))
1247 yield();
1248fail_sys: 1246fail_sys:
1249 gfs2_sys_fs_del(sdp); 1247 gfs2_sys_fs_del(sdp);
1250fail: 1248fail:
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 84350e1be66d..4e64352d49de 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -976,122 +976,62 @@ out:
976} 976}
977 977
978/** 978/**
979 * gfs2_readlinki - return the contents of a symlink 979 * gfs2_follow_link - Follow a symbolic link
980 * @ip: the symlink's inode 980 * @dentry: The dentry of the link
981 * @buf: a pointer to the buffer to be filled 981 * @nd: Data that we pass to vfs_follow_link()
982 * @len: a pointer to the length of @buf
983 * 982 *
984 * If @buf is too small, a piece of memory is kmalloc()ed and needs 983 * This can handle symlinks of any size.
985 * to be freed by the caller.
986 * 984 *
987 * Returns: errno 985 * Returns: 0 on success or error code
988 */ 986 */
989 987
990static int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) 988static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
991{ 989{
990 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
992 struct gfs2_holder i_gh; 991 struct gfs2_holder i_gh;
993 struct buffer_head *dibh; 992 struct buffer_head *dibh;
994 unsigned int x; 993 unsigned int x;
994 char *buf;
995 int error; 995 int error;
996 996
997 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); 997 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
998 error = gfs2_glock_nq(&i_gh); 998 error = gfs2_glock_nq(&i_gh);
999 if (error) { 999 if (error) {
1000 gfs2_holder_uninit(&i_gh); 1000 gfs2_holder_uninit(&i_gh);
1001 return error; 1001 nd_set_link(nd, ERR_PTR(error));
1002 return NULL;
1002 } 1003 }
1003 1004
1004 if (!ip->i_disksize) { 1005 if (!ip->i_disksize) {
1005 gfs2_consist_inode(ip); 1006 gfs2_consist_inode(ip);
1006 error = -EIO; 1007 buf = ERR_PTR(-EIO);
1007 goto out; 1008 goto out;
1008 } 1009 }
1009 1010
1010 error = gfs2_meta_inode_buffer(ip, &dibh); 1011 error = gfs2_meta_inode_buffer(ip, &dibh);
1011 if (error) 1012 if (error) {
1013 buf = ERR_PTR(error);
1012 goto out; 1014 goto out;
1013
1014 x = ip->i_disksize + 1;
1015 if (x > *len) {
1016 *buf = kmalloc(x, GFP_NOFS);
1017 if (!*buf) {
1018 error = -ENOMEM;
1019 goto out_brelse;
1020 }
1021 } 1015 }
1022 1016
1023 memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x); 1017 x = ip->i_disksize + 1;
1024 *len = x; 1018 buf = kmalloc(x, GFP_NOFS);
1025 1019 if (!buf)
1026out_brelse: 1020 buf = ERR_PTR(-ENOMEM);
1021 else
1022 memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
1027 brelse(dibh); 1023 brelse(dibh);
1028out: 1024out:
1029 gfs2_glock_dq_uninit(&i_gh); 1025 gfs2_glock_dq_uninit(&i_gh);
1030 return error; 1026 nd_set_link(nd, buf);
1031} 1027 return NULL;
1032
1033/**
1034 * gfs2_readlink - Read the value of a symlink
1035 * @dentry: the symlink
1036 * @buf: the buffer to read the symlink data into
1037 * @size: the size of the buffer
1038 *
1039 * Returns: errno
1040 */
1041
1042static int gfs2_readlink(struct dentry *dentry, char __user *user_buf,
1043 int user_size)
1044{
1045 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
1046 char array[GFS2_FAST_NAME_SIZE], *buf = array;
1047 unsigned int len = GFS2_FAST_NAME_SIZE;
1048 int error;
1049
1050 error = gfs2_readlinki(ip, &buf, &len);
1051 if (error)
1052 return error;
1053
1054 if (user_size > len - 1)
1055 user_size = len - 1;
1056
1057 if (copy_to_user(user_buf, buf, user_size))
1058 error = -EFAULT;
1059 else
1060 error = user_size;
1061
1062 if (buf != array)
1063 kfree(buf);
1064
1065 return error;
1066} 1028}
1067 1029
1068/** 1030static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1069 * gfs2_follow_link - Follow a symbolic link
1070 * @dentry: The dentry of the link
1071 * @nd: Data that we pass to vfs_follow_link()
1072 *
1073 * This can handle symlinks of any size. It is optimised for symlinks
1074 * under GFS2_FAST_NAME_SIZE.
1075 *
1076 * Returns: 0 on success or error code
1077 */
1078
1079static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
1080{ 1031{
1081 struct gfs2_inode *ip = GFS2_I(dentry->d_inode); 1032 char *s = nd_get_link(nd);
1082 char array[GFS2_FAST_NAME_SIZE], *buf = array; 1033 if (!IS_ERR(s))
1083 unsigned int len = GFS2_FAST_NAME_SIZE; 1034 kfree(s);
1084 int error;
1085
1086 error = gfs2_readlinki(ip, &buf, &len);
1087 if (!error) {
1088 error = vfs_follow_link(nd, buf);
1089 if (buf != array)
1090 kfree(buf);
1091 } else
1092 path_put(&nd->path);
1093
1094 return ERR_PTR(error);
1095} 1035}
1096 1036
1097/** 1037/**
@@ -1426,8 +1366,9 @@ const struct inode_operations gfs2_dir_iops = {
1426}; 1366};
1427 1367
1428const struct inode_operations gfs2_symlink_iops = { 1368const struct inode_operations gfs2_symlink_iops = {
1429 .readlink = gfs2_readlink, 1369 .readlink = generic_readlink,
1430 .follow_link = gfs2_follow_link, 1370 .follow_link = gfs2_follow_link,
1371 .put_link = gfs2_put_link,
1431 .permission = gfs2_permission, 1372 .permission = gfs2_permission,
1432 .setattr = gfs2_setattr, 1373 .setattr = gfs2_setattr,
1433 .getattr = gfs2_getattr, 1374 .getattr = gfs2_getattr,
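The symlink rework replaces the kmalloc-and-copy readlink helpers with the follow_link/put_link pairing: follow_link allocates the target and parks it via nd_set_link(), and the VFS later calls put_link to free it. A userspace model of that ownership hand-off (the nameidata struct here is a stand-in):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct nameidata { char *saved_link; };

static void  nd_set_link(struct nameidata *nd, char *l) { nd->saved_link = l; }
static char *nd_get_link(struct nameidata *nd)          { return nd->saved_link; }

/* follow_link: allocate the target and hand it to the caller via nd. */
static void follow_link(struct nameidata *nd, const char *target_on_disk)
{
        nd_set_link(nd, strdup(target_on_disk));
}

/* put_link: called when the caller is done; free what follow_link made. */
static void put_link(struct nameidata *nd)
{
        free(nd_get_link(nd));
}

int main(void)
{
        struct nameidata nd = { 0 };

        follow_link(&nd, "/some/target");
        if (!nd_get_link(&nd))
                return 1;
        printf("link -> %s\n", nd_get_link(&nd));
        put_link(&nd);
        return 0;
}
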
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index e3bf6eab8750..6dbcbad6ab17 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1083,7 +1083,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
1083 } 1083 }
1084} 1084}
1085 1085
1086int gfs2_quota_sync(struct super_block *sb, int type) 1086int gfs2_quota_sync(struct super_block *sb, int type, int wait)
1087{ 1087{
1088 struct gfs2_sbd *sdp = sb->s_fs_info; 1088 struct gfs2_sbd *sdp = sb->s_fs_info;
1089 struct gfs2_quota_data **qda; 1089 struct gfs2_quota_data **qda;
@@ -1127,6 +1127,11 @@ int gfs2_quota_sync(struct super_block *sb, int type)
1127 return error; 1127 return error;
1128} 1128}
1129 1129
1130static int gfs2_quota_sync_timeo(struct super_block *sb, int type)
1131{
1132 return gfs2_quota_sync(sb, type, 0);
1133}
1134
1130int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id) 1135int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
1131{ 1136{
1132 struct gfs2_quota_data *qd; 1137 struct gfs2_quota_data *qd;
@@ -1382,7 +1387,7 @@ int gfs2_quotad(void *data)
1382 &tune->gt_statfs_quantum); 1387 &tune->gt_statfs_quantum);
1383 1388
1384 /* Update quota file */ 1389 /* Update quota file */
1385 quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, 1390 quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t,
1386 &quotad_timeo, &tune->gt_quota_quantum); 1391 &quotad_timeo, &tune->gt_quota_quantum);
1387 1392
1388 /* Check for & recover partially truncated inodes */ 1393 /* Check for & recover partially truncated inodes */
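gfs2_quota_sync() grew a wait flag, but quotad_check_timeo() still expects the old two-argument callback, so gfs2_quota_sync_timeo() adapts the signature by pinning wait to 0. The same adapter pattern in miniature:

#include <stdio.h>

/* New API takes an extra 'wait' flag. */
static int quota_sync(int type, int wait)
{
        printf("quota sync: type=%d wait=%d\n", type, wait);
        return 0;
}

/* Old-style callback expected by the periodic worker. */
typedef int (*sync_fn)(int type);

/* Thin adapter pinning wait=0, like gfs2_quota_sync_timeo() above. */
static int quota_sync_timeo(int type)
{
        return quota_sync(type, 0);
}

static void check_timeo(sync_fn fn) { fn(0); }

int main(void)
{
        check_timeo(quota_sync_timeo);
        return 0;
}
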
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index e271fa07ad02..195f60c8bd14 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -25,7 +25,7 @@ extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
25extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, 25extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
26 u32 uid, u32 gid); 26 u32 uid, u32 gid);
27 27
28extern int gfs2_quota_sync(struct super_block *sb, int type); 28extern int gfs2_quota_sync(struct super_block *sb, int type, int wait);
29extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); 29extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
30 30
31extern int gfs2_quota_init(struct gfs2_sbd *sdp); 31extern int gfs2_quota_init(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b9dd3da22c0a..50aac606b990 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -22,6 +22,7 @@
22#include <linux/crc32.h> 22#include <linux/crc32.h>
23#include <linux/time.h> 23#include <linux/time.h>
24#include <linux/wait.h> 24#include <linux/wait.h>
25#include <linux/writeback.h>
25 26
26#include "gfs2.h" 27#include "gfs2.h"
27#include "incore.h" 28#include "incore.h"
@@ -711,7 +712,7 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
711 * Returns: errno 712 * Returns: errno
712 */ 713 */
713 714
714static int gfs2_write_inode(struct inode *inode, int sync) 715static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
715{ 716{
716 struct gfs2_inode *ip = GFS2_I(inode); 717 struct gfs2_inode *ip = GFS2_I(inode);
717 struct gfs2_sbd *sdp = GFS2_SB(inode); 718 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -722,8 +723,7 @@ static int gfs2_write_inode(struct inode *inode, int sync)
722 int ret = 0; 723 int ret = 0;
723 724
724 /* Check this is a "normal" inode, etc */ 725 /* Check this is a "normal" inode, etc */
725 if (!test_bit(GIF_USER, &ip->i_flags) || 726 if (current->flags & PF_MEMALLOC)
726 (current->flags & PF_MEMALLOC))
727 return 0; 727 return 0;
728 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 728 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
729 if (ret) 729 if (ret)
@@ -746,7 +746,7 @@ static int gfs2_write_inode(struct inode *inode, int sync)
746do_unlock: 746do_unlock:
747 gfs2_glock_dq_uninit(&gh); 747 gfs2_glock_dq_uninit(&gh);
748do_flush: 748do_flush:
749 if (sync != 0) 749 if (wbc->sync_mode == WB_SYNC_ALL)
750 gfs2_log_flush(GFS2_SB(inode), ip->i_gl); 750 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
751 return ret; 751 return ret;
752} 752}
@@ -764,7 +764,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
764 int error; 764 int error;
765 765
766 flush_workqueue(gfs2_delete_workqueue); 766 flush_workqueue(gfs2_delete_workqueue);
767 gfs2_quota_sync(sdp->sd_vfs, 0); 767 gfs2_quota_sync(sdp->sd_vfs, 0, 1);
768 gfs2_statfs_sync(sdp->sd_vfs, 0); 768 gfs2_statfs_sync(sdp->sd_vfs, 0);
769 769
770 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, 770 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
@@ -860,6 +860,7 @@ restart:
860 gfs2_clear_rgrpd(sdp); 860 gfs2_clear_rgrpd(sdp);
861 gfs2_jindex_free(sdp); 861 gfs2_jindex_free(sdp);
862 /* Take apart glock structures and buffer lists */ 862 /* Take apart glock structures and buffer lists */
863 invalidate_inodes(sdp->sd_vfs);
863 gfs2_gl_hash_clear(sdp); 864 gfs2_gl_hash_clear(sdp);
864 /* Unmount the locking protocol */ 865 /* Unmount the locking protocol */
865 gfs2_lm_unmount(sdp); 866 gfs2_lm_unmount(sdp);
@@ -1194,7 +1195,7 @@ static void gfs2_drop_inode(struct inode *inode)
1194{ 1195{
1195 struct gfs2_inode *ip = GFS2_I(inode); 1196 struct gfs2_inode *ip = GFS2_I(inode);
1196 1197
1197 if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) { 1198 if (inode->i_nlink) {
1198 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; 1199 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
1199 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) 1200 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
1200 clear_nlink(inode); 1201 clear_nlink(inode);
@@ -1212,18 +1213,12 @@ static void gfs2_clear_inode(struct inode *inode)
1212{ 1213{
1213 struct gfs2_inode *ip = GFS2_I(inode); 1214 struct gfs2_inode *ip = GFS2_I(inode);
1214 1215
1215 /* This tells us its a "real" inode and not one which only 1216 ip->i_gl->gl_object = NULL;
1216 * serves to contain an address space (see rgrp.c, meta_io.c) 1217 gfs2_glock_put(ip->i_gl);
1217 * which therefore doesn't have its own glocks. 1218 ip->i_gl = NULL;
1218 */ 1219 if (ip->i_iopen_gh.gh_gl) {
1219 if (test_bit(GIF_USER, &ip->i_flags)) { 1220 ip->i_iopen_gh.gh_gl->gl_object = NULL;
1220 ip->i_gl->gl_object = NULL; 1221 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
1221 gfs2_glock_put(ip->i_gl);
1222 ip->i_gl = NULL;
1223 if (ip->i_iopen_gh.gh_gl) {
1224 ip->i_iopen_gh.gh_gl->gl_object = NULL;
1225 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
1226 }
1227 } 1222 }
1228} 1223}
1229 1224
@@ -1358,9 +1353,6 @@ static void gfs2_delete_inode(struct inode *inode)
1358 struct gfs2_holder gh; 1353 struct gfs2_holder gh;
1359 int error; 1354 int error;
1360 1355
1361 if (!test_bit(GIF_USER, &ip->i_flags))
1362 goto out;
1363
1364 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 1356 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1365 if (unlikely(error)) { 1357 if (unlikely(error)) {
1366 gfs2_glock_dq_uninit(&ip->i_iopen_gh); 1358 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 0dc34621f6a6..419042f7f0b6 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -49,7 +49,7 @@ static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
49 return a->store ? a->store(sdp, buf, len) : len; 49 return a->store ? a->store(sdp, buf, len) : len;
50} 50}
51 51
52static struct sysfs_ops gfs2_attr_ops = { 52static const struct sysfs_ops gfs2_attr_ops = {
53 .show = gfs2_attr_show, 53 .show = gfs2_attr_show,
54 .store = gfs2_attr_store, 54 .store = gfs2_attr_store,
55}; 55};
@@ -167,7 +167,7 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
167 if (simple_strtol(buf, NULL, 0) != 1) 167 if (simple_strtol(buf, NULL, 0) != 1)
168 return -EINVAL; 168 return -EINVAL;
169 169
170 gfs2_quota_sync(sdp->sd_vfs, 0); 170 gfs2_quota_sync(sdp->sd_vfs, 0, 1);
171 return len; 171 return len;
172} 172}
173 173
@@ -478,7 +478,6 @@ TUNE_ATTR(complain_secs, 0);
478TUNE_ATTR(statfs_slow, 0); 478TUNE_ATTR(statfs_slow, 0);
479TUNE_ATTR(new_files_jdata, 0); 479TUNE_ATTR(new_files_jdata, 0);
480TUNE_ATTR(quota_simul_sync, 1); 480TUNE_ATTR(quota_simul_sync, 1);
481TUNE_ATTR(stall_secs, 1);
482TUNE_ATTR(statfs_quantum, 1); 481TUNE_ATTR(statfs_quantum, 1);
483TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); 482TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
484 483
@@ -491,7 +490,6 @@ static struct attribute *tune_attrs[] = {
491 &tune_attr_complain_secs.attr, 490 &tune_attr_complain_secs.attr,
492 &tune_attr_statfs_slow.attr, 491 &tune_attr_statfs_slow.attr,
493 &tune_attr_quota_simul_sync.attr, 492 &tune_attr_quota_simul_sync.attr,
494 &tune_attr_stall_secs.attr,
495 &tune_attr_statfs_quantum.attr, 493 &tune_attr_statfs_quantum.attr,
496 &tune_attr_quota_scale.attr, 494 &tune_attr_quota_scale.attr,
497 &tune_attr_new_files_jdata.attr, 495 &tune_attr_new_files_jdata.attr,
@@ -576,7 +574,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
576 return 0; 574 return 0;
577} 575}
578 576
579static struct kset_uevent_ops gfs2_uevent_ops = { 577static const struct kset_uevent_ops gfs2_uevent_ops = {
580 .uevent = gfs2_uevent, 578 .uevent = gfs2_uevent,
581}; 579};
582 580
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index f6a7efa34eb9..226f2bfbf16a 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -21,6 +21,7 @@
21#include "util.h" 21#include "util.h"
22 22
23struct kmem_cache *gfs2_glock_cachep __read_mostly; 23struct kmem_cache *gfs2_glock_cachep __read_mostly;
24struct kmem_cache *gfs2_glock_aspace_cachep __read_mostly;
24struct kmem_cache *gfs2_inode_cachep __read_mostly; 25struct kmem_cache *gfs2_inode_cachep __read_mostly;
25struct kmem_cache *gfs2_bufdata_cachep __read_mostly; 26struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
26struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; 27struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 33e96b0ce9ab..b432e04600de 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -145,6 +145,7 @@ gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__);
145 145
146 146
147extern struct kmem_cache *gfs2_glock_cachep; 147extern struct kmem_cache *gfs2_glock_cachep;
148extern struct kmem_cache *gfs2_glock_aspace_cachep;
148extern struct kmem_cache *gfs2_inode_cachep; 149extern struct kmem_cache *gfs2_inode_cachep;
149extern struct kmem_cache *gfs2_bufdata_cachep; 150extern struct kmem_cache *gfs2_bufdata_cachep;
150extern struct kmem_cache *gfs2_rgrpd_cachep; 151extern struct kmem_cache *gfs2_rgrpd_cachep;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 052387e11671..fe35e3b626c4 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -188,7 +188,7 @@ extern const struct address_space_operations hfs_btree_aops;
188 188
189extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int); 189extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int);
190extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); 190extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
191extern int hfs_write_inode(struct inode *, int); 191extern int hfs_write_inode(struct inode *, struct writeback_control *);
192extern int hfs_inode_setattr(struct dentry *, struct iattr *); 192extern int hfs_inode_setattr(struct dentry *, struct iattr *);
193extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, 193extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
194 __be32 log_size, __be32 phys_size, u32 clump_size); 194 __be32 log_size, __be32 phys_size, u32 clump_size);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index a1cbff2b4d99..14f5cb1b9fdc 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -381,7 +381,7 @@ void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext,
381 HFS_SB(inode->i_sb)->alloc_blksz); 381 HFS_SB(inode->i_sb)->alloc_blksz);
382} 382}
383 383
384int hfs_write_inode(struct inode *inode, int unused) 384int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
385{ 385{
386 struct inode *main_inode = inode; 386 struct inode *main_inode = inode;
387 struct hfs_find_data fd; 387 struct hfs_find_data fd;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 43022f3d5148..74b473a8ef92 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -87,7 +87,8 @@ bad_inode:
87 return ERR_PTR(err); 87 return ERR_PTR(err);
88} 88}
89 89
90static int hfsplus_write_inode(struct inode *inode, int unused) 90static int hfsplus_write_inode(struct inode *inode,
91 struct writeback_control *wbc)
91{ 92{
92 struct hfsplus_vh *vhdr; 93 struct hfsplus_vh *vhdr;
93 int ret = 0; 94 int ret = 0;
diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c
index 1aa88c4e0964..6a2f04bf3df0 100644
--- a/fs/hpfs/anode.c
+++ b/fs/hpfs/anode.c
@@ -353,7 +353,7 @@ int hpfs_ea_read(struct super_block *s, secno a, int ano, unsigned pos,
353} 353}
354 354
355int hpfs_ea_write(struct super_block *s, secno a, int ano, unsigned pos, 355int hpfs_ea_write(struct super_block *s, secno a, int ano, unsigned pos,
356 unsigned len, char *buf) 356 unsigned len, const char *buf)
357{ 357{
358 struct buffer_head *bh; 358 struct buffer_head *bh;
359 char *data; 359 char *data;
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 940d6d150bee..67d9d36b3d5f 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -20,8 +20,8 @@ static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
20 20
21 if (l == 1) if (qstr->name[0]=='.') goto x; 21 if (l == 1) if (qstr->name[0]=='.') goto x;
22 if (l == 2) if (qstr->name[0]=='.' || qstr->name[1]=='.') goto x; 22 if (l == 2) if (qstr->name[0]=='.' || qstr->name[1]=='.') goto x;
23 hpfs_adjust_length((char *)qstr->name, &l); 23 hpfs_adjust_length(qstr->name, &l);
24 /*if (hpfs_chk_name((char *)qstr->name,&l))*/ 24 /*if (hpfs_chk_name(qstr->name,&l))*/
25 /*return -ENAMETOOLONG;*/ 25 /*return -ENAMETOOLONG;*/
26 /*return -ENOENT;*/ 26 /*return -ENOENT;*/
27 x: 27 x:
@@ -38,14 +38,16 @@ static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qst
38{ 38{
39 unsigned al=a->len; 39 unsigned al=a->len;
40 unsigned bl=b->len; 40 unsigned bl=b->len;
41 hpfs_adjust_length((char *)a->name, &al); 41 hpfs_adjust_length(a->name, &al);
42 /*hpfs_adjust_length((char *)b->name, &bl);*/ 42 /*hpfs_adjust_length(b->name, &bl);*/
43 /* 'a' is the qstr of an already existing dentry, so the name 43 /* 'a' is the qstr of an already existing dentry, so the name
44 * must be valid. 'b' must be validated first. 44 * must be valid. 'b' must be validated first.
45 */ 45 */
46 46
47 if (hpfs_chk_name((char *)b->name, &bl)) return 1; 47 if (hpfs_chk_name(b->name, &bl))
48 if (hpfs_compare_names(dentry->d_sb, (char *)a->name, al, (char *)b->name, bl, 0)) return 1; 48 return 1;
49 if (hpfs_compare_names(dentry->d_sb, a->name, al, b->name, bl, 0))
50 return 1;
49 return 0; 51 return 0;
50} 52}
51 53
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 8865c94f55f6..26e3964a4b8c 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -59,7 +59,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
59 struct hpfs_dirent *de; 59 struct hpfs_dirent *de;
60 int lc; 60 int lc;
61 long old_pos; 61 long old_pos;
62 char *tempname; 62 unsigned char *tempname;
63 int c1, c2 = 0; 63 int c1, c2 = 0;
64 int ret = 0; 64 int ret = 0;
65 65
@@ -158,11 +158,11 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
158 tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3); 158 tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3);
159 if (filldir(dirent, tempname, de->namelen, old_pos, de->fnode, DT_UNKNOWN) < 0) { 159 if (filldir(dirent, tempname, de->namelen, old_pos, de->fnode, DT_UNKNOWN) < 0) {
160 filp->f_pos = old_pos; 160 filp->f_pos = old_pos;
161 if (tempname != (char *)de->name) kfree(tempname); 161 if (tempname != de->name) kfree(tempname);
162 hpfs_brelse4(&qbh); 162 hpfs_brelse4(&qbh);
163 goto out; 163 goto out;
164 } 164 }
165 if (tempname != (char *)de->name) kfree(tempname); 165 if (tempname != de->name) kfree(tempname);
166 hpfs_brelse4(&qbh); 166 hpfs_brelse4(&qbh);
167 } 167 }
168out: 168out:
@@ -187,7 +187,7 @@ out:
187 187
188struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 188struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
189{ 189{
190 const char *name = dentry->d_name.name; 190 const unsigned char *name = dentry->d_name.name;
191 unsigned len = dentry->d_name.len; 191 unsigned len = dentry->d_name.len;
192 struct quad_buffer_head qbh; 192 struct quad_buffer_head qbh;
193 struct hpfs_dirent *de; 193 struct hpfs_dirent *de;
@@ -197,7 +197,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
197 struct hpfs_inode_info *hpfs_result; 197 struct hpfs_inode_info *hpfs_result;
198 198
199 lock_kernel(); 199 lock_kernel();
200 if ((err = hpfs_chk_name((char *)name, &len))) { 200 if ((err = hpfs_chk_name(name, &len))) {
201 if (err == -ENAMETOOLONG) { 201 if (err == -ENAMETOOLONG) {
202 unlock_kernel(); 202 unlock_kernel();
203 return ERR_PTR(-ENAMETOOLONG); 203 return ERR_PTR(-ENAMETOOLONG);
@@ -209,7 +209,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
209 * '.' and '..' will never be passed here. 209 * '.' and '..' will never be passed here.
210 */ 210 */
211 211
212 de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *) name, len, NULL, &qbh); 212 de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, NULL, &qbh);
213 213
214 /* 214 /*
215 * This is not really a bailout, just means file not found. 215 * This is not really a bailout, just means file not found.
@@ -250,7 +250,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
250 hpfs_result = hpfs_i(result); 250 hpfs_result = hpfs_i(result);
251 if (!de->directory) hpfs_result->i_parent_dir = dir->i_ino; 251 if (!de->directory) hpfs_result->i_parent_dir = dir->i_ino;
252 252
253 hpfs_decide_conv(result, (char *)name, len); 253 hpfs_decide_conv(result, name, len);
254 254
255 if (de->has_acl || de->has_xtd_perm) if (!(dir->i_sb->s_flags & MS_RDONLY)) { 255 if (de->has_acl || de->has_xtd_perm) if (!(dir->i_sb->s_flags & MS_RDONLY)) {
256 hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures"); 256 hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures");
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c
index fe83c2b7d2d8..9b2ffadfc8c4 100644
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -158,7 +158,8 @@ static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno
158 158
159/* Add an entry to dnode and don't care if it grows over 2048 bytes */ 159/* Add an entry to dnode and don't care if it grows over 2048 bytes */
160 160
161struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d, unsigned char *name, 161struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d,
162 const unsigned char *name,
162 unsigned namelen, secno down_ptr) 163 unsigned namelen, secno down_ptr)
163{ 164{
164 struct hpfs_dirent *de; 165 struct hpfs_dirent *de;
@@ -223,7 +224,7 @@ static void fix_up_ptrs(struct super_block *s, struct dnode *d)
223/* Add an entry to dnode and do dnode splitting if required */ 224/* Add an entry to dnode and do dnode splitting if required */
224 225
225static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, 226static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
226 unsigned char *name, unsigned namelen, 227 const unsigned char *name, unsigned namelen,
227 struct hpfs_dirent *new_de, dnode_secno down_ptr) 228 struct hpfs_dirent *new_de, dnode_secno down_ptr)
228{ 229{
229 struct quad_buffer_head qbh, qbh1, qbh2; 230 struct quad_buffer_head qbh, qbh1, qbh2;
@@ -231,7 +232,7 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
231 dnode_secno adno, rdno; 232 dnode_secno adno, rdno;
232 struct hpfs_dirent *de; 233 struct hpfs_dirent *de;
233 struct hpfs_dirent nde; 234 struct hpfs_dirent nde;
234 char *nname; 235 unsigned char *nname;
235 int h; 236 int h;
236 int pos; 237 int pos;
237 struct buffer_head *bh; 238 struct buffer_head *bh;
@@ -305,7 +306,9 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
305 pos++; 306 pos++;
306 } 307 }
307 copy_de(new_de = &nde, de); 308 copy_de(new_de = &nde, de);
308 memcpy(name = nname, de->name, namelen = de->namelen); 309 memcpy(nname, de->name, de->namelen);
310 name = nname;
311 namelen = de->namelen;
309 for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, 4); 312 for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, 4);
310 down_ptr = adno; 313 down_ptr = adno;
311 set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0); 314 set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0);
@@ -368,7 +371,8 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
368 * I hope, now it's finally bug-free. 371 * I hope, now it's finally bug-free.
369 */ 372 */
370 373
371int hpfs_add_dirent(struct inode *i, unsigned char *name, unsigned namelen, 374int hpfs_add_dirent(struct inode *i,
375 const unsigned char *name, unsigned namelen,
372 struct hpfs_dirent *new_de, int cdepth) 376 struct hpfs_dirent *new_de, int cdepth)
373{ 377{
374 struct hpfs_inode_info *hpfs_inode = hpfs_i(i); 378 struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
@@ -897,7 +901,8 @@ struct hpfs_dirent *map_pos_dirent(struct inode *inode, loff_t *posp,
897 901
898/* Find a dirent in tree */ 902/* Find a dirent in tree */
899 903
900struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno, char *name, unsigned len, 904struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno,
905 const unsigned char *name, unsigned len,
901 dnode_secno *dd, struct quad_buffer_head *qbh) 906 dnode_secno *dd, struct quad_buffer_head *qbh)
902{ 907{
903 struct dnode *dnode; 908 struct dnode *dnode;
@@ -988,8 +993,8 @@ void hpfs_remove_dtree(struct super_block *s, dnode_secno dno)
988struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, 993struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno,
989 struct fnode *f, struct quad_buffer_head *qbh) 994 struct fnode *f, struct quad_buffer_head *qbh)
990{ 995{
991 char *name1; 996 unsigned char *name1;
992 char *name2; 997 unsigned char *name2;
993 int name1len, name2len; 998 int name1len, name2len;
994 struct dnode *d; 999 struct dnode *d;
995 dnode_secno dno, downd; 1000 dnode_secno dno, downd;
diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c
index 547a8384571f..45e53d972b42 100644
--- a/fs/hpfs/ea.c
+++ b/fs/hpfs/ea.c
@@ -62,8 +62,8 @@ static char *get_indirect_ea(struct super_block *s, int ano, secno a, int size)
62 return ret; 62 return ret;
63} 63}
64 64
65static void set_indirect_ea(struct super_block *s, int ano, secno a, char *data, 65static void set_indirect_ea(struct super_block *s, int ano, secno a,
66 int size) 66 const char *data, int size)
67{ 67{
68 hpfs_ea_write(s, a, ano, 0, size, data); 68 hpfs_ea_write(s, a, ano, 0, size, data);
69} 69}
@@ -186,7 +186,8 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
186 * This driver can't change sizes of eas ('cause I just don't need it). 186 * This driver can't change sizes of eas ('cause I just don't need it).
187 */ 187 */
188 188
189void hpfs_set_ea(struct inode *inode, struct fnode *fnode, char *key, char *data, int size) 189void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
190 const char *data, int size)
190{ 191{
191 fnode_secno fno = inode->i_ino; 192 fnode_secno fno = inode->i_ino;
192 struct super_block *s = inode->i_sb; 193 struct super_block *s = inode->i_sb;
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 701ca54c0867..97bf738cd5d6 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -215,7 +215,7 @@ secno hpfs_bplus_lookup(struct super_block *, struct inode *, struct bplus_heade
215secno hpfs_add_sector_to_btree(struct super_block *, secno, int, unsigned); 215secno hpfs_add_sector_to_btree(struct super_block *, secno, int, unsigned);
216void hpfs_remove_btree(struct super_block *, struct bplus_header *); 216void hpfs_remove_btree(struct super_block *, struct bplus_header *);
217int hpfs_ea_read(struct super_block *, secno, int, unsigned, unsigned, char *); 217int hpfs_ea_read(struct super_block *, secno, int, unsigned, unsigned, char *);
218int hpfs_ea_write(struct super_block *, secno, int, unsigned, unsigned, char *); 218int hpfs_ea_write(struct super_block *, secno, int, unsigned, unsigned, const char *);
219void hpfs_ea_remove(struct super_block *, secno, int, unsigned); 219void hpfs_ea_remove(struct super_block *, secno, int, unsigned);
220void hpfs_truncate_btree(struct super_block *, secno, int, unsigned); 220void hpfs_truncate_btree(struct super_block *, secno, int, unsigned);
221void hpfs_remove_fnode(struct super_block *, fnode_secno fno); 221void hpfs_remove_fnode(struct super_block *, fnode_secno fno);
@@ -244,13 +244,17 @@ extern const struct file_operations hpfs_dir_ops;
244 244
245void hpfs_add_pos(struct inode *, loff_t *); 245void hpfs_add_pos(struct inode *, loff_t *);
246void hpfs_del_pos(struct inode *, loff_t *); 246void hpfs_del_pos(struct inode *, loff_t *);
247struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *, unsigned char *, unsigned, secno); 247struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *,
248int hpfs_add_dirent(struct inode *, unsigned char *, unsigned, struct hpfs_dirent *, int); 248 const unsigned char *, unsigned, secno);
249int hpfs_add_dirent(struct inode *, const unsigned char *, unsigned,
250 struct hpfs_dirent *, int);
249int hpfs_remove_dirent(struct inode *, dnode_secno, struct hpfs_dirent *, struct quad_buffer_head *, int); 251int hpfs_remove_dirent(struct inode *, dnode_secno, struct hpfs_dirent *, struct quad_buffer_head *, int);
250void hpfs_count_dnodes(struct super_block *, dnode_secno, int *, int *, int *); 252void hpfs_count_dnodes(struct super_block *, dnode_secno, int *, int *, int *);
251dnode_secno hpfs_de_as_down_as_possible(struct super_block *, dnode_secno dno); 253dnode_secno hpfs_de_as_down_as_possible(struct super_block *, dnode_secno dno);
252struct hpfs_dirent *map_pos_dirent(struct inode *, loff_t *, struct quad_buffer_head *); 254struct hpfs_dirent *map_pos_dirent(struct inode *, loff_t *, struct quad_buffer_head *);
253struct hpfs_dirent *map_dirent(struct inode *, dnode_secno, char *, unsigned, dnode_secno *, struct quad_buffer_head *); 255struct hpfs_dirent *map_dirent(struct inode *, dnode_secno,
256 const unsigned char *, unsigned, dnode_secno *,
257 struct quad_buffer_head *);
254void hpfs_remove_dtree(struct super_block *, dnode_secno); 258void hpfs_remove_dtree(struct super_block *, dnode_secno);
255struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct fnode *, struct quad_buffer_head *); 259struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct fnode *, struct quad_buffer_head *);
256 260
@@ -259,7 +263,8 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct f
259void hpfs_ea_ext_remove(struct super_block *, secno, int, unsigned); 263void hpfs_ea_ext_remove(struct super_block *, secno, int, unsigned);
260int hpfs_read_ea(struct super_block *, struct fnode *, char *, char *, int); 264int hpfs_read_ea(struct super_block *, struct fnode *, char *, char *, int);
261char *hpfs_get_ea(struct super_block *, struct fnode *, char *, int *); 265char *hpfs_get_ea(struct super_block *, struct fnode *, char *, int *);
262void hpfs_set_ea(struct inode *, struct fnode *, char *, char *, int); 266void hpfs_set_ea(struct inode *, struct fnode *, const char *,
267 const char *, int);
263 268
264/* file.c */ 269/* file.c */
265 270
@@ -282,7 +287,7 @@ void hpfs_delete_inode(struct inode *);
282 287
283unsigned *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *); 288unsigned *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
284unsigned *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *); 289unsigned *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
285char *hpfs_load_code_page(struct super_block *, secno); 290unsigned char *hpfs_load_code_page(struct super_block *, secno);
286secno *hpfs_load_bitmap_directory(struct super_block *, secno bmp); 291secno *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
287struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **); 292struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **);
288struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **); 293struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **);
@@ -292,12 +297,13 @@ dnode_secno hpfs_fnode_dno(struct super_block *s, ino_t ino);
292/* name.c */ 297/* name.c */
293 298
294unsigned char hpfs_upcase(unsigned char *, unsigned char); 299unsigned char hpfs_upcase(unsigned char *, unsigned char);
295int hpfs_chk_name(unsigned char *, unsigned *); 300int hpfs_chk_name(const unsigned char *, unsigned *);
296char *hpfs_translate_name(struct super_block *, unsigned char *, unsigned, int, int); 301unsigned char *hpfs_translate_name(struct super_block *, unsigned char *, unsigned, int, int);
297int hpfs_compare_names(struct super_block *, unsigned char *, unsigned, unsigned char *, unsigned, int); 302int hpfs_compare_names(struct super_block *, const unsigned char *, unsigned,
298int hpfs_is_name_long(unsigned char *, unsigned); 303 const unsigned char *, unsigned, int);
299void hpfs_adjust_length(unsigned char *, unsigned *); 304int hpfs_is_name_long(const unsigned char *, unsigned);
300void hpfs_decide_conv(struct inode *, unsigned char *, unsigned); 305void hpfs_adjust_length(const unsigned char *, unsigned *);
306void hpfs_decide_conv(struct inode *, const unsigned char *, unsigned);
301 307
302/* namei.c */ 308/* namei.c */
303 309
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index fe703ae46bc7..ff90affb94e1 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -46,7 +46,7 @@ void hpfs_read_inode(struct inode *i)
46 struct fnode *fnode; 46 struct fnode *fnode;
47 struct super_block *sb = i->i_sb; 47 struct super_block *sb = i->i_sb;
48 struct hpfs_inode_info *hpfs_inode = hpfs_i(i); 48 struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
49 unsigned char *ea; 49 void *ea;
50 int ea_size; 50 int ea_size;
51 51
52 if (!(fnode = hpfs_map_fnode(sb, i->i_ino, &bh))) { 52 if (!(fnode = hpfs_map_fnode(sb, i->i_ino, &bh))) {
@@ -112,7 +112,7 @@ void hpfs_read_inode(struct inode *i)
112 } 112 }
113 } 113 }
114 if (fnode->dirflag) { 114 if (fnode->dirflag) {
115 unsigned n_dnodes, n_subdirs; 115 int n_dnodes, n_subdirs;
116 i->i_mode |= S_IFDIR; 116 i->i_mode |= S_IFDIR;
117 i->i_op = &hpfs_dir_iops; 117 i->i_op = &hpfs_dir_iops;
118 i->i_fop = &hpfs_dir_ops; 118 i->i_fop = &hpfs_dir_ops;
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index c4724589b2eb..840d033ecee8 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -35,7 +35,7 @@ unsigned int *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
35 * lowercasing table 35 * lowercasing table
36 */ 36 */
37 37
38char *hpfs_load_code_page(struct super_block *s, secno cps) 38unsigned char *hpfs_load_code_page(struct super_block *s, secno cps)
39{ 39{
40 struct buffer_head *bh; 40 struct buffer_head *bh;
41 secno cpds; 41 secno cpds;
@@ -71,7 +71,7 @@ char *hpfs_load_code_page(struct super_block *s, secno cps)
71 brelse(bh); 71 brelse(bh);
72 return NULL; 72 return NULL;
73 } 73 }
74 ptr = (char *)cpd + cpd->offs[cpi] + 6; 74 ptr = (unsigned char *)cpd + cpd->offs[cpi] + 6;
75 if (!(cp_table = kmalloc(256, GFP_KERNEL))) { 75 if (!(cp_table = kmalloc(256, GFP_KERNEL))) {
76 printk("HPFS: out of memory for code page table\n"); 76 printk("HPFS: out of memory for code page table\n");
77 brelse(bh); 77 brelse(bh);
@@ -217,7 +217,7 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno,
217 if ((dnode = hpfs_map_4sectors(s, secno, qbh, DNODE_RD_AHEAD))) 217 if ((dnode = hpfs_map_4sectors(s, secno, qbh, DNODE_RD_AHEAD)))
218 if (hpfs_sb(s)->sb_chk) { 218 if (hpfs_sb(s)->sb_chk) {
219 unsigned p, pp = 0; 219 unsigned p, pp = 0;
220 unsigned char *d = (char *)dnode; 220 unsigned char *d = (unsigned char *)dnode;
221 int b = 0; 221 int b = 0;
222 if (dnode->magic != DNODE_MAGIC) { 222 if (dnode->magic != DNODE_MAGIC) {
223 hpfs_error(s, "bad magic on dnode %08x", secno); 223 hpfs_error(s, "bad magic on dnode %08x", secno);
diff --git a/fs/hpfs/name.c b/fs/hpfs/name.c
index 1f4a964384eb..f24736d7a439 100644
--- a/fs/hpfs/name.c
+++ b/fs/hpfs/name.c
@@ -8,16 +8,16 @@
8 8
9#include "hpfs_fn.h" 9#include "hpfs_fn.h"
10 10
11static char *text_postfix[]={ 11static const char *text_postfix[]={
12".ASM", ".BAS", ".BAT", ".C", ".CC", ".CFG", ".CMD", ".CON", ".CPP", ".DEF", 12".ASM", ".BAS", ".BAT", ".C", ".CC", ".CFG", ".CMD", ".CON", ".CPP", ".DEF",
13".DOC", ".DPR", ".ERX", ".H", ".HPP", ".HTM", ".HTML", ".JAVA", ".LOG", ".PAS", 13".DOC", ".DPR", ".ERX", ".H", ".HPP", ".HTM", ".HTML", ".JAVA", ".LOG", ".PAS",
14".RC", ".TEX", ".TXT", ".Y", ""}; 14".RC", ".TEX", ".TXT", ".Y", ""};
15 15
16static char *text_prefix[]={ 16static const char *text_prefix[]={
17"AUTOEXEC.", "CHANGES", "COPYING", "CONFIG.", "CREDITS", "FAQ", "FILE_ID.DIZ", 17"AUTOEXEC.", "CHANGES", "COPYING", "CONFIG.", "CREDITS", "FAQ", "FILE_ID.DIZ",
18"MAKEFILE", "READ.ME", "README", "TERMCAP", ""}; 18"MAKEFILE", "READ.ME", "README", "TERMCAP", ""};
19 19
20void hpfs_decide_conv(struct inode *inode, unsigned char *name, unsigned len) 20void hpfs_decide_conv(struct inode *inode, const unsigned char *name, unsigned len)
21{ 21{
22 struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); 22 struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
23 int i; 23 int i;
@@ -71,7 +71,7 @@ static inline unsigned char locase(unsigned char *dir, unsigned char a)
71 return dir[a]; 71 return dir[a];
72} 72}
73 73
74int hpfs_chk_name(unsigned char *name, unsigned *len) 74int hpfs_chk_name(const unsigned char *name, unsigned *len)
75{ 75{
76 int i; 76 int i;
77 if (*len > 254) return -ENAMETOOLONG; 77 if (*len > 254) return -ENAMETOOLONG;
@@ -83,10 +83,10 @@ int hpfs_chk_name(unsigned char *name, unsigned *len)
83 return 0; 83 return 0;
84} 84}
85 85
86char *hpfs_translate_name(struct super_block *s, unsigned char *from, 86unsigned char *hpfs_translate_name(struct super_block *s, unsigned char *from,
87 unsigned len, int lc, int lng) 87 unsigned len, int lc, int lng)
88{ 88{
89 char *to; 89 unsigned char *to;
90 int i; 90 int i;
91 if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) { 91 if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) {
92 printk("HPFS: Long name flag mismatch - name "); 92 printk("HPFS: Long name flag mismatch - name ");
@@ -103,8 +103,9 @@ char *hpfs_translate_name(struct super_block *s, unsigned char *from,
103 return to; 103 return to;
104} 104}
105 105
106int hpfs_compare_names(struct super_block *s, unsigned char *n1, unsigned l1, 106int hpfs_compare_names(struct super_block *s,
107 unsigned char *n2, unsigned l2, int last) 107 const unsigned char *n1, unsigned l1,
108 const unsigned char *n2, unsigned l2, int last)
108{ 109{
109 unsigned l = l1 < l2 ? l1 : l2; 110 unsigned l = l1 < l2 ? l1 : l2;
110 unsigned i; 111 unsigned i;
@@ -120,7 +121,7 @@ int hpfs_compare_names(struct super_block *s, unsigned char *n1, unsigned l1,
120 return 0; 121 return 0;
121} 122}
122 123
123int hpfs_is_name_long(unsigned char *name, unsigned len) 124int hpfs_is_name_long(const unsigned char *name, unsigned len)
124{ 125{
125 int i,j; 126 int i,j;
126 for (i = 0; i < len && name[i] != '.'; i++) 127 for (i = 0; i < len && name[i] != '.'; i++)
@@ -134,7 +135,7 @@ int hpfs_is_name_long(unsigned char *name, unsigned len)
134 135
135/* OS/2 clears dots and spaces at the end of file name, so we have to */ 136/* OS/2 clears dots and spaces at the end of file name, so we have to */
136 137
137void hpfs_adjust_length(unsigned char *name, unsigned *len) 138void hpfs_adjust_length(const unsigned char *name, unsigned *len)
138{ 139{
139 if (!*len) return; 140 if (!*len) return;
140 if (*len == 1 && name[0] == '.') return; 141 if (*len == 1 && name[0] == '.') return;
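The hpfs_adjust_length() constification above touches the helper behind the OS/2 rule quoted in the hunk: trailing dots and spaces are stripped from a name before it is used, while "." and ".." are left alone. A minimal user-space sketch of that rule as this editor reads it (adjust_len and the demo main are hypothetical illustrations, not the kernel code):

#include <stdio.h>

/* Sketch of the trimming rule; not the kernel helper itself. */
static void adjust_len(const unsigned char *name, unsigned *len)
{
	if (!*len)
		return;
	if (*len == 1 && name[0] == '.')
		return;				/* keep "." */
	if (*len == 2 && name[0] == '.' && name[1] == '.')
		return;				/* keep ".." (assumed) */
	while (*len && (name[*len - 1] == '.' || name[*len - 1] == ' '))
		(*len)--;			/* drop OS/2 padding */
}

int main(void)
{
	const unsigned char n[] = "README. . ";
	unsigned l = sizeof(n) - 1;

	adjust_len(n, &l);
	printf("%.*s (len %u)\n", (int)l, (const char *)n, l);
	return 0;				/* prints: README (len 6) */
}

Because the helper only ever shrinks *len and never writes to the name, the const qualifier added in this series is the natural signature, and it is what lets the many (char *) casts on dentry->d_name.name be dropped at the call sites.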
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 82b9c4ba9ed0..11c2b4080f65 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -11,7 +11,7 @@
11 11
12static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 12static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
13{ 13{
14 const char *name = dentry->d_name.name; 14 const unsigned char *name = dentry->d_name.name;
15 unsigned len = dentry->d_name.len; 15 unsigned len = dentry->d_name.len;
16 struct quad_buffer_head qbh0; 16 struct quad_buffer_head qbh0;
17 struct buffer_head *bh; 17 struct buffer_head *bh;
@@ -24,7 +24,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
24 int r; 24 int r;
25 struct hpfs_dirent dee; 25 struct hpfs_dirent dee;
26 int err; 26 int err;
27 if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err; 27 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
28 lock_kernel(); 28 lock_kernel();
29 err = -ENOSPC; 29 err = -ENOSPC;
30 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); 30 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
@@ -62,7 +62,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
62 result->i_mode &= ~0222; 62 result->i_mode &= ~0222;
63 63
64 mutex_lock(&hpfs_i(dir)->i_mutex); 64 mutex_lock(&hpfs_i(dir)->i_mutex);
65 r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0); 65 r = hpfs_add_dirent(dir, name, len, &dee, 0);
66 if (r == 1) 66 if (r == 1)
67 goto bail3; 67 goto bail3;
68 if (r == -1) { 68 if (r == -1) {
@@ -121,7 +121,7 @@ bail:
121 121
122static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) 122static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
123{ 123{
124 const char *name = dentry->d_name.name; 124 const unsigned char *name = dentry->d_name.name;
125 unsigned len = dentry->d_name.len; 125 unsigned len = dentry->d_name.len;
126 struct inode *result = NULL; 126 struct inode *result = NULL;
127 struct buffer_head *bh; 127 struct buffer_head *bh;
@@ -130,7 +130,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
130 int r; 130 int r;
131 struct hpfs_dirent dee; 131 struct hpfs_dirent dee;
132 int err; 132 int err;
133 if ((err = hpfs_chk_name((char *)name, &len))) 133 if ((err = hpfs_chk_name(name, &len)))
134 return err==-ENOENT ? -EINVAL : err; 134 return err==-ENOENT ? -EINVAL : err;
135 lock_kernel(); 135 lock_kernel();
136 err = -ENOSPC; 136 err = -ENOSPC;
@@ -155,7 +155,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
155 result->i_op = &hpfs_file_iops; 155 result->i_op = &hpfs_file_iops;
156 result->i_fop = &hpfs_file_ops; 156 result->i_fop = &hpfs_file_ops;
157 result->i_nlink = 1; 157 result->i_nlink = 1;
158 hpfs_decide_conv(result, (char *)name, len); 158 hpfs_decide_conv(result, name, len);
159 hpfs_i(result)->i_parent_dir = dir->i_ino; 159 hpfs_i(result)->i_parent_dir = dir->i_ino;
160 result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, dee.creation_date); 160 result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, dee.creation_date);
161 result->i_ctime.tv_nsec = 0; 161 result->i_ctime.tv_nsec = 0;
@@ -170,7 +170,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
170 hpfs_i(result)->mmu_private = 0; 170 hpfs_i(result)->mmu_private = 0;
171 171
172 mutex_lock(&hpfs_i(dir)->i_mutex); 172 mutex_lock(&hpfs_i(dir)->i_mutex);
173 r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0); 173 r = hpfs_add_dirent(dir, name, len, &dee, 0);
174 if (r == 1) 174 if (r == 1)
175 goto bail2; 175 goto bail2;
176 if (r == -1) { 176 if (r == -1) {
@@ -211,7 +211,7 @@ bail:
211 211
212static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) 212static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
213{ 213{
214 const char *name = dentry->d_name.name; 214 const unsigned char *name = dentry->d_name.name;
215 unsigned len = dentry->d_name.len; 215 unsigned len = dentry->d_name.len;
216 struct buffer_head *bh; 216 struct buffer_head *bh;
217 struct fnode *fnode; 217 struct fnode *fnode;
@@ -220,7 +220,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
220 struct hpfs_dirent dee; 220 struct hpfs_dirent dee;
221 struct inode *result = NULL; 221 struct inode *result = NULL;
222 int err; 222 int err;
223 if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err; 223 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
224 if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM; 224 if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM;
225 if (!new_valid_dev(rdev)) 225 if (!new_valid_dev(rdev))
226 return -EINVAL; 226 return -EINVAL;
@@ -256,7 +256,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
256 init_special_inode(result, mode, rdev); 256 init_special_inode(result, mode, rdev);
257 257
258 mutex_lock(&hpfs_i(dir)->i_mutex); 258 mutex_lock(&hpfs_i(dir)->i_mutex);
259 r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0); 259 r = hpfs_add_dirent(dir, name, len, &dee, 0);
260 if (r == 1) 260 if (r == 1)
261 goto bail2; 261 goto bail2;
262 if (r == -1) { 262 if (r == -1) {
@@ -289,7 +289,7 @@ bail:
289 289
290static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *symlink) 290static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *symlink)
291{ 291{
292 const char *name = dentry->d_name.name; 292 const unsigned char *name = dentry->d_name.name;
293 unsigned len = dentry->d_name.len; 293 unsigned len = dentry->d_name.len;
294 struct buffer_head *bh; 294 struct buffer_head *bh;
295 struct fnode *fnode; 295 struct fnode *fnode;
@@ -298,7 +298,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
298 struct hpfs_dirent dee; 298 struct hpfs_dirent dee;
299 struct inode *result; 299 struct inode *result;
300 int err; 300 int err;
301 if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err; 301 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
302 lock_kernel(); 302 lock_kernel();
303 if (hpfs_sb(dir->i_sb)->sb_eas < 2) { 303 if (hpfs_sb(dir->i_sb)->sb_eas < 2) {
304 unlock_kernel(); 304 unlock_kernel();
@@ -335,7 +335,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
335 result->i_data.a_ops = &hpfs_symlink_aops; 335 result->i_data.a_ops = &hpfs_symlink_aops;
336 336
337 mutex_lock(&hpfs_i(dir)->i_mutex); 337 mutex_lock(&hpfs_i(dir)->i_mutex);
338 r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0); 338 r = hpfs_add_dirent(dir, name, len, &dee, 0);
339 if (r == 1) 339 if (r == 1)
340 goto bail2; 340 goto bail2;
341 if (r == -1) { 341 if (r == -1) {
@@ -345,7 +345,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
345 fnode->len = len; 345 fnode->len = len;
346 memcpy(fnode->name, name, len > 15 ? 15 : len); 346 memcpy(fnode->name, name, len > 15 ? 15 : len);
347 fnode->up = dir->i_ino; 347 fnode->up = dir->i_ino;
348 hpfs_set_ea(result, fnode, "SYMLINK", (char *)symlink, strlen(symlink)); 348 hpfs_set_ea(result, fnode, "SYMLINK", symlink, strlen(symlink));
349 mark_buffer_dirty(bh); 349 mark_buffer_dirty(bh);
350 brelse(bh); 350 brelse(bh);
351 351
@@ -369,7 +369,7 @@ bail:
369 369
370static int hpfs_unlink(struct inode *dir, struct dentry *dentry) 370static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
371{ 371{
372 const char *name = dentry->d_name.name; 372 const unsigned char *name = dentry->d_name.name;
373 unsigned len = dentry->d_name.len; 373 unsigned len = dentry->d_name.len;
374 struct quad_buffer_head qbh; 374 struct quad_buffer_head qbh;
375 struct hpfs_dirent *de; 375 struct hpfs_dirent *de;
@@ -381,12 +381,12 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
381 int err; 381 int err;
382 382
383 lock_kernel(); 383 lock_kernel();
384 hpfs_adjust_length((char *)name, &len); 384 hpfs_adjust_length(name, &len);
385again: 385again:
386 mutex_lock(&hpfs_i(inode)->i_parent_mutex); 386 mutex_lock(&hpfs_i(inode)->i_parent_mutex);
387 mutex_lock(&hpfs_i(dir)->i_mutex); 387 mutex_lock(&hpfs_i(dir)->i_mutex);
388 err = -ENOENT; 388 err = -ENOENT;
389 de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *)name, len, &dno, &qbh); 389 de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
390 if (!de) 390 if (!de)
391 goto out; 391 goto out;
392 392
@@ -413,22 +413,25 @@ again:
413 413
414 mutex_unlock(&hpfs_i(dir)->i_mutex); 414 mutex_unlock(&hpfs_i(dir)->i_mutex);
415 mutex_unlock(&hpfs_i(inode)->i_parent_mutex); 415 mutex_unlock(&hpfs_i(inode)->i_parent_mutex);
416 d_drop(dentry); 416 dentry_unhash(dentry);
417 spin_lock(&dentry->d_lock); 417 if (!d_unhashed(dentry)) {
418 if (atomic_read(&dentry->d_count) > 1 || 418 dput(dentry);
419 generic_permission(inode, MAY_WRITE, NULL) || 419 unlock_kernel();
420 return -ENOSPC;
421 }
422 if (generic_permission(inode, MAY_WRITE, NULL) ||
420 !S_ISREG(inode->i_mode) || 423 !S_ISREG(inode->i_mode) ||
421 get_write_access(inode)) { 424 get_write_access(inode)) {
422 spin_unlock(&dentry->d_lock);
423 d_rehash(dentry); 425 d_rehash(dentry);
426 dput(dentry);
424 } else { 427 } else {
425 struct iattr newattrs; 428 struct iattr newattrs;
426 spin_unlock(&dentry->d_lock);
427 /*printk("HPFS: truncating file before delete.\n");*/ 429 /*printk("HPFS: truncating file before delete.\n");*/
428 newattrs.ia_size = 0; 430 newattrs.ia_size = 0;
429 newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; 431 newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
430 err = notify_change(dentry, &newattrs); 432 err = notify_change(dentry, &newattrs);
431 put_write_access(inode); 433 put_write_access(inode);
434 dput(dentry);
432 if (!err) 435 if (!err)
433 goto again; 436 goto again;
434 } 437 }
@@ -451,7 +454,7 @@ out:
451 454
452static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) 455static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
453{ 456{
454 const char *name = dentry->d_name.name; 457 const unsigned char *name = dentry->d_name.name;
455 unsigned len = dentry->d_name.len; 458 unsigned len = dentry->d_name.len;
456 struct quad_buffer_head qbh; 459 struct quad_buffer_head qbh;
457 struct hpfs_dirent *de; 460 struct hpfs_dirent *de;
@@ -462,12 +465,12 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
462 int err; 465 int err;
463 int r; 466 int r;
464 467
465 hpfs_adjust_length((char *)name, &len); 468 hpfs_adjust_length(name, &len);
466 lock_kernel(); 469 lock_kernel();
467 mutex_lock(&hpfs_i(inode)->i_parent_mutex); 470 mutex_lock(&hpfs_i(inode)->i_parent_mutex);
468 mutex_lock(&hpfs_i(dir)->i_mutex); 471 mutex_lock(&hpfs_i(dir)->i_mutex);
469 err = -ENOENT; 472 err = -ENOENT;
470 de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *)name, len, &dno, &qbh); 473 de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
471 if (!de) 474 if (!de)
472 goto out; 475 goto out;
473 476
@@ -546,10 +549,10 @@ const struct address_space_operations hpfs_symlink_aops = {
546static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, 549static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
547 struct inode *new_dir, struct dentry *new_dentry) 550 struct inode *new_dir, struct dentry *new_dentry)
548{ 551{
549 char *old_name = (char *)old_dentry->d_name.name; 552 const unsigned char *old_name = old_dentry->d_name.name;
550 int old_len = old_dentry->d_name.len; 553 unsigned old_len = old_dentry->d_name.len;
551 char *new_name = (char *)new_dentry->d_name.name; 554 const unsigned char *new_name = new_dentry->d_name.name;
552 int new_len = new_dentry->d_name.len; 555 unsigned new_len = new_dentry->d_name.len;
553 struct inode *i = old_dentry->d_inode; 556 struct inode *i = old_dentry->d_inode;
554 struct inode *new_inode = new_dentry->d_inode; 557 struct inode *new_inode = new_dentry->d_inode;
555 struct quad_buffer_head qbh, qbh1; 558 struct quad_buffer_head qbh, qbh1;
@@ -560,9 +563,9 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
560 struct buffer_head *bh; 563 struct buffer_head *bh;
561 struct fnode *fnode; 564 struct fnode *fnode;
562 int err; 565 int err;
563 if ((err = hpfs_chk_name((char *)new_name, &new_len))) return err; 566 if ((err = hpfs_chk_name(new_name, &new_len))) return err;
564 err = 0; 567 err = 0;
565 hpfs_adjust_length((char *)old_name, &old_len); 568 hpfs_adjust_length(old_name, &old_len);
566 569
567 lock_kernel(); 570 lock_kernel();
568 /* order doesn't matter, due to VFS exclusion */ 571 /* order doesn't matter, due to VFS exclusion */
@@ -579,7 +582,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
579 goto end1; 582 goto end1;
580 } 583 }
581 584
582 if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, (char *)old_name, old_len, &dno, &qbh))) { 585 if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) {
583 hpfs_error(i->i_sb, "lookup succeeded but map dirent failed"); 586 hpfs_error(i->i_sb, "lookup succeeded but map dirent failed");
584 err = -ENOENT; 587 err = -ENOENT;
585 goto end1; 588 goto end1;
@@ -590,7 +593,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
590 if (new_inode) { 593 if (new_inode) {
591 int r; 594 int r;
592 if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 1)) != 2) { 595 if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 1)) != 2) {
593 if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, (char *)new_name, new_len, NULL, &qbh1))) { 596 if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, new_name, new_len, NULL, &qbh1))) {
594 clear_nlink(new_inode); 597 clear_nlink(new_inode);
595 copy_de(nde, &de); 598 copy_de(nde, &de);
596 memcpy(nde->name, new_name, new_len); 599 memcpy(nde->name, new_name, new_len);
@@ -618,7 +621,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
618 } 621 }
619 622
620 if (new_dir == old_dir) 623 if (new_dir == old_dir)
621 if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, (char *)old_name, old_len, &dno, &qbh))) { 624 if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) {
622 hpfs_unlock_creation(i->i_sb); 625 hpfs_unlock_creation(i->i_sb);
623 hpfs_error(i->i_sb, "lookup succeeded but map dirent failed at #2"); 626 hpfs_error(i->i_sb, "lookup succeeded but map dirent failed at #2");
624 err = -ENOENT; 627 err = -ENOENT;
@@ -648,7 +651,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
648 brelse(bh); 651 brelse(bh);
649 } 652 }
650 hpfs_i(i)->i_conv = hpfs_sb(i->i_sb)->sb_conv; 653 hpfs_i(i)->i_conv = hpfs_sb(i->i_sb)->sb_conv;
651 hpfs_decide_conv(i, (char *)new_name, new_len); 654 hpfs_decide_conv(i, new_name, new_len);
652end1: 655end1:
653 if (old_dir != new_dir) 656 if (old_dir != new_dir)
654 mutex_unlock(&hpfs_i(new_dir)->i_mutex); 657 mutex_unlock(&hpfs_i(new_dir)->i_mutex);
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 7239efc690d8..2e4dfa8593da 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -718,7 +718,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
718 struct vfsmount *proc_mnt; 718 struct vfsmount *proc_mnt;
719 int err = -ENOENT; 719 int err = -ENOENT;
720 720
721 proc_mnt = do_kern_mount("proc", 0, "proc", NULL); 721 proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt);
722 if (IS_ERR(proc_mnt)) 722 if (IS_ERR(proc_mnt))
723 goto out; 723 goto out;
724 724
diff --git a/fs/inode.c b/fs/inode.c
index 03dfeb2e3928..407bf392e20a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -8,7 +8,6 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/dcache.h> 9#include <linux/dcache.h>
10#include <linux/init.h> 10#include <linux/init.h>
11#include <linux/quotaops.h>
12#include <linux/slab.h> 11#include <linux/slab.h>
13#include <linux/writeback.h> 12#include <linux/writeback.h>
14#include <linux/module.h> 13#include <linux/module.h>
@@ -314,7 +313,6 @@ void clear_inode(struct inode *inode)
314 BUG_ON(!(inode->i_state & I_FREEING)); 313 BUG_ON(!(inode->i_state & I_FREEING));
315 BUG_ON(inode->i_state & I_CLEAR); 314 BUG_ON(inode->i_state & I_CLEAR);
316 inode_sync_wait(inode); 315 inode_sync_wait(inode);
317 vfs_dq_drop(inode);
318 if (inode->i_sb->s_op->clear_inode) 316 if (inode->i_sb->s_op->clear_inode)
319 inode->i_sb->s_op->clear_inode(inode); 317 inode->i_sb->s_op->clear_inode(inode);
320 if (S_ISBLK(inode->i_mode) && inode->i_bdev) 318 if (S_ISBLK(inode->i_mode) && inode->i_bdev)
@@ -1211,8 +1209,6 @@ void generic_delete_inode(struct inode *inode)
1211 1209
1212 if (op->delete_inode) { 1210 if (op->delete_inode) {
1213 void (*delete)(struct inode *) = op->delete_inode; 1211 void (*delete)(struct inode *) = op->delete_inode;
1214 if (!is_bad_inode(inode))
1215 vfs_dq_init(inode);
1216 /* Filesystems implementing their own 1212 /* Filesystems implementing their own
1217 * s_op->delete_inode are required to call 1213 * s_op->delete_inode are required to call
1218 * truncate_inode_pages and clear_inode() 1214 * truncate_inode_pages and clear_inode()
diff --git a/fs/internal.h b/fs/internal.h
index e96a1667d749..8a03a5447bdf 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -70,6 +70,8 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
70 70
71extern void __init mnt_init(void); 71extern void __init mnt_init(void);
72 72
73extern spinlock_t vfsmount_lock;
74
73/* 75/*
74 * fs_struct.c 76 * fs_struct.c
75 */ 77 */
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 4bd882548c45..2c90e3ef625f 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -862,12 +862,12 @@ restart_loop:
862 /* A buffer which has been freed while still being 862 /* A buffer which has been freed while still being
863 * journaled by a previous transaction may end up still 863 * journaled by a previous transaction may end up still
864 * being dirty here, but we want to avoid writing back 864 * being dirty here, but we want to avoid writing back
865 * that buffer in the future now that the last use has 865 * that buffer in the future after the "add to orphan"
866 * been committed. That's not only a performance gain, 866 * operation has been committed. That's not only a performance
867 * it also stops aliasing problems if the buffer is left 867 * gain, it also stops aliasing problems if the buffer is
868 * behind for writeback and gets reallocated for another 868 * left behind for writeback and gets reallocated for another
869 * use in a different page. */ 869 * use in a different page. */
870 if (buffer_freed(bh)) { 870 if (buffer_freed(bh) && !jh->b_next_transaction) {
871 clear_buffer_freed(bh); 871 clear_buffer_freed(bh);
872 clear_buffer_jbddirty(bh); 872 clear_buffer_jbddirty(bh);
873 } 873 }
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 006f9ad838a2..5ae71e75a491 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1398,7 +1398,7 @@ int journal_stop(handle_t *handle)
1398 * the case where our storage is so fast that it is more optimal to go 1398 * the case where our storage is so fast that it is more optimal to go
1399 * ahead and force a flush and wait for the transaction to be committed 1399 * ahead and force a flush and wait for the transaction to be committed
1400 * than it is to wait for an arbitrary amount of time for new writers to 1400 * than it is to wait for an arbitrary amount of time for new writers to
1401 * join the transaction. We acheive this by measuring how long it takes 1401 * join the transaction. We achieve this by measuring how long it takes
1402 * to commit a transaction, and compare it with how long this 1402 * to commit a transaction, and compare it with how long this
1403 * transaction has been running, and if run time < commit time then we 1403 * transaction has been running, and if run time < commit time then we
1404 * sleep for the delta and commit. This greatly helps super fast disks 1404 * sleep for the delta and commit. This greatly helps super fast disks
@@ -1864,6 +1864,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1864 if (!jh) 1864 if (!jh)
1865 goto zap_buffer_no_jh; 1865 goto zap_buffer_no_jh;
1866 1866
1867 /*
1868 * We cannot remove the buffer from checkpoint lists until the
1869 * transaction adding the inode to the orphan list (let's call it T)
1870 * is committed. Otherwise, if the transaction changing the
1871 * buffer were cleaned from the journal before T is
1872 * committed, a crash could cause the correct contents of
1873 * the buffer to be lost. On the other hand, we have to
1874 * clear the buffer dirty bit at the latest at the moment the
1875 * transaction marking the buffer as freed in the filesystem
1876 * structures is committed, because from that moment on the
1877 * buffer can be reallocated and used by a different page.
1878 * Since the block hasn't been freed yet but the inode has
1879 * already been added to the orphan list, it is safe for us to add
1880 * the buffer to BJ_Forget list of the newest transaction.
1881 */
1867 transaction = jh->b_transaction; 1882 transaction = jh->b_transaction;
1868 if (transaction == NULL) { 1883 if (transaction == NULL) {
1869 /* First case: not on any transaction. If it 1884 /* First case: not on any transaction. If it
@@ -1929,16 +1944,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1929 goto zap_buffer; 1944 goto zap_buffer;
1930 } 1945 }
1931 /* 1946 /*
1932 * If it is committing, we simply cannot touch it. We 1947 * The buffer is committing; we simply cannot touch
1933 * can remove it's next_transaction pointer from the 1948 * it. So we just set b_next_transaction to the
1934 * running transaction if that is set, but nothing 1949 * running transaction (if there is one) and mark
1935 * else. */ 1950 * the buffer as freed so that the commit code knows it should
1951 * clear dirty bits when it is done with the buffer.
1952 */
1936 set_buffer_freed(bh); 1953 set_buffer_freed(bh);
1937 if (jh->b_next_transaction) { 1954 if (journal->j_running_transaction && buffer_jbddirty(bh))
1938 J_ASSERT(jh->b_next_transaction == 1955 jh->b_next_transaction = journal->j_running_transaction;
1939 journal->j_running_transaction);
1940 jh->b_next_transaction = NULL;
1941 }
1942 journal_put_journal_head(jh); 1956 journal_put_journal_head(jh);
1943 spin_unlock(&journal->j_list_lock); 1957 spin_unlock(&journal->j_list_lock);
1944 jbd_unlock_bh_state(bh); 1958 jbd_unlock_bh_state(bh);
@@ -2120,7 +2134,7 @@ void journal_file_buffer(struct journal_head *jh,
2120 */ 2134 */
2121void __journal_refile_buffer(struct journal_head *jh) 2135void __journal_refile_buffer(struct journal_head *jh)
2122{ 2136{
2123 int was_dirty; 2137 int was_dirty, jlist;
2124 struct buffer_head *bh = jh2bh(jh); 2138 struct buffer_head *bh = jh2bh(jh);
2125 2139
2126 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); 2140 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
@@ -2142,8 +2156,13 @@ void __journal_refile_buffer(struct journal_head *jh)
2142 __journal_temp_unlink_buffer(jh); 2156 __journal_temp_unlink_buffer(jh);
2143 jh->b_transaction = jh->b_next_transaction; 2157 jh->b_transaction = jh->b_next_transaction;
2144 jh->b_next_transaction = NULL; 2158 jh->b_next_transaction = NULL;
2145 __journal_file_buffer(jh, jh->b_transaction, 2159 if (buffer_freed(bh))
2146 jh->b_modified ? BJ_Metadata : BJ_Reserved); 2160 jlist = BJ_Forget;
2161 else if (jh->b_modified)
2162 jlist = BJ_Metadata;
2163 else
2164 jlist = BJ_Reserved;
2165 __journal_file_buffer(jh, jh->b_transaction, jlist);
2147 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); 2166 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
2148 2167
2149 if (was_dirty) 2168 if (was_dirty)
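The last hunk above changes __journal_refile_buffer() from a two-way choice (BJ_Metadata vs. BJ_Reserved) to a three-way one that routes freed buffers to BJ_Forget. The rule in isolation, restated as a self-contained sketch (enum jlist and struct jh_state are hypothetical stand-ins for the kernel's journal_head state, not its real types):

#include <stdio.h>

enum jlist { BJ_RESERVED, BJ_METADATA, BJ_FORGET };

struct jh_state {
	int freed;	/* stands in for buffer_freed(bh) */
	int modified;	/* stands in for jh->b_modified   */
};

static enum jlist pick_list(const struct jh_state *jh)
{
	if (jh->freed)
		return BJ_FORGET;	/* commit clears the dirty bits */
	if (jh->modified)
		return BJ_METADATA;	/* needs writing by the next commit */
	return BJ_RESERVED;		/* reserved but never dirtied */
}

int main(void)
{
	struct jh_state freed_jh = { .freed = 1, .modified = 1 };

	printf("%d\n", pick_list(&freed_jh));	/* 2, i.e. BJ_FORGET */
	return 0;
}

Freed wins over modified here, which is the point of the fix: a buffer that was journaled and then freed must not be written back, so BJ_Forget takes priority over BJ_Metadata.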
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 886849370950..30beb11ef928 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -507,6 +507,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
507 if (blocknr < journal->j_tail) 507 if (blocknr < journal->j_tail)
508 freed = freed + journal->j_last - journal->j_first; 508 freed = freed + journal->j_last - journal->j_first;
509 509
510 trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed);
510 jbd_debug(1, 511 jbd_debug(1,
511 "Cleaning journal tail from %d to %d (offset %lu), " 512 "Cleaning journal tail from %d to %d (offset %lu), "
512 "freeing %lu\n", 513 "freeing %lu\n",
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 1bc74b6f26d2..671da7fb7ffd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -883,8 +883,7 @@ restart_loop:
883 spin_unlock(&journal->j_list_lock); 883 spin_unlock(&journal->j_list_lock);
884 bh = jh2bh(jh); 884 bh = jh2bh(jh);
885 jbd_lock_bh_state(bh); 885 jbd_lock_bh_state(bh);
886 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || 886 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
887 jh->b_transaction == journal->j_running_transaction);
888 887
889 /* 888 /*
890 * If there is undo-protected committed data against 889 * If there is undo-protected committed data against
@@ -930,12 +929,12 @@ restart_loop:
930 /* A buffer which has been freed while still being 929 /* A buffer which has been freed while still being
931 * journaled by a previous transaction may end up still 930 * journaled by a previous transaction may end up still
932 * being dirty here, but we want to avoid writing back 931 * being dirty here, but we want to avoid writing back
933 * that buffer in the future now that the last use has 932 * that buffer in the future after the "add to orphan"
934 * been committed. That's not only a performance gain, 933 * operation has been committed. That's not only a performance
935 * it also stops aliasing problems if the buffer is left 934 * gain, it also stops aliasing problems if the buffer is
936 * behind for writeback and gets reallocated for another 935 * left behind for writeback and gets reallocated for another
937 * use in a different page. */ 936 * use in a different page. */
938 if (buffer_freed(bh)) { 937 if (buffer_freed(bh) && !jh->b_next_transaction) {
939 clear_buffer_freed(bh); 938 clear_buffer_freed(bh);
940 clear_buffer_jbddirty(bh); 939 clear_buffer_jbddirty(bh);
941 } 940 }
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index ac0d027595d0..c03d4dce4d76 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -39,6 +39,8 @@
39#include <linux/seq_file.h> 39#include <linux/seq_file.h>
40#include <linux/math64.h> 40#include <linux/math64.h>
41#include <linux/hash.h> 41#include <linux/hash.h>
42#include <linux/log2.h>
43#include <linux/vmalloc.h>
42 44
43#define CREATE_TRACE_POINTS 45#define CREATE_TRACE_POINTS
44#include <trace/events/jbd2.h> 46#include <trace/events/jbd2.h>
@@ -93,6 +95,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
93 95
94static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 96static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
95static void __journal_abort_soft (journal_t *journal, int errno); 97static void __journal_abort_soft (journal_t *journal, int errno);
98static int jbd2_journal_create_slab(size_t slab_size);
96 99
97/* 100/*
98 * Helper function used to manage commit timeouts 101 * Helper function used to manage commit timeouts
@@ -1248,6 +1251,13 @@ int jbd2_journal_load(journal_t *journal)
1248 } 1251 }
1249 } 1252 }
1250 1253
1254 /*
1255 * Create a slab for this blocksize
1256 */
1257 err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
1258 if (err)
1259 return err;
1260
1251 /* Let the recovery code check whether it needs to recover any 1261 /* Let the recovery code check whether it needs to recover any
1252 * data from the journal. */ 1262 * data from the journal. */
1253 if (jbd2_journal_recover(journal)) 1263 if (jbd2_journal_recover(journal))
@@ -1807,6 +1817,127 @@ size_t journal_tag_bytes(journal_t *journal)
1807} 1817}
1808 1818
1809/* 1819/*
1820 * JBD memory management
1821 *
1822 * These functions are used to allocate block-sized chunks of memory
1823 * used for making copies of buffer_head data. Very often it will be
1824 * page-sized chunks of data, but sometimes it will be in
1825 * sub-page-size chunks. (For example, 16k pages on Power systems
1826 * with a 4k block file system.) For blocks smaller than a page, we
1827 * use a SLAB allocator. There are slab caches for each block size,
1828 * which are allocated at mount time, if necessary, and we only free
1829 * (all of) the slab caches when/if the jbd2 module is unloaded. For
1830 * this reason we don't need a mutex to protect access to
1831 * jbd2_slab[] when allocating or releasing memory; only
1832 * jbd2_journal_create_slab() needs one.
1833 */
1834#define JBD2_MAX_SLABS 8
1835static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
1836static DECLARE_MUTEX(jbd2_slab_create_sem);
1837
1838static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
1839 "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
1840 "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
1841};
1842
1843
1844static void jbd2_journal_destroy_slabs(void)
1845{
1846 int i;
1847
1848 for (i = 0; i < JBD2_MAX_SLABS; i++) {
1849 if (jbd2_slab[i])
1850 kmem_cache_destroy(jbd2_slab[i]);
1851 jbd2_slab[i] = NULL;
1852 }
1853}
1854
1855static int jbd2_journal_create_slab(size_t size)
1856{
1857 int i = order_base_2(size) - 10;
1858 size_t slab_size;
1859
1860 if (size == PAGE_SIZE)
1861 return 0;
1862
1863 if (i >= JBD2_MAX_SLABS)
1864 return -EINVAL;
1865
1866 if (unlikely(i < 0))
1867 i = 0;
1868 down(&jbd2_slab_create_sem);
1869 if (jbd2_slab[i]) {
1870 up(&jbd2_slab_create_sem);
1871 return 0; /* Already created */
1872 }
1873
1874 slab_size = 1 << (i+10);
1875 jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
1876 slab_size, 0, NULL);
1877 up(&jbd2_slab_create_sem);
1878 if (!jbd2_slab[i]) {
1879 printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
1880 return -ENOMEM;
1881 }
1882 return 0;
1883}
1884
1885static struct kmem_cache *get_slab(size_t size)
1886{
1887 int i = order_base_2(size) - 10;
1888
1889 BUG_ON(i >= JBD2_MAX_SLABS);
1890 if (unlikely(i < 0))
1891 i = 0;
1892 BUG_ON(jbd2_slab[i] == 0);
1893 return jbd2_slab[i];
1894}
1895
1896void *jbd2_alloc(size_t size, gfp_t flags)
1897{
1898 void *ptr;
1899
1900 BUG_ON(size & (size-1)); /* Must be a power of 2 */
1901
1902 flags |= __GFP_REPEAT;
1903 if (size == PAGE_SIZE)
1904 ptr = (void *)__get_free_pages(flags, 0);
1905 else if (size > PAGE_SIZE) {
1906 int order = get_order(size);
1907
1908 if (order < 3)
1909 ptr = (void *)__get_free_pages(flags, order);
1910 else
1911 ptr = vmalloc(size);
1912 } else
1913 ptr = kmem_cache_alloc(get_slab(size), flags);
1914
1915 /* Check alignment; SLUB has gotten this wrong in the past,
1916 * and this can lead to user data corruption! */
1917 BUG_ON(((unsigned long) ptr) & (size-1));
1918
1919 return ptr;
1920}
1921
1922void jbd2_free(void *ptr, size_t size)
1923{
1924 if (size == PAGE_SIZE) {
1925 free_pages((unsigned long)ptr, 0);
1926 return;
1927 }
1928 if (size > PAGE_SIZE) {
1929 int order = get_order(size);
1930
1931 if (order < 3)
1932 free_pages((unsigned long)ptr, order);
1933 else
1934 vfree(ptr);
1935 return;
1936 }
1937 kmem_cache_free(get_slab(size), ptr);
1938};
1939
1940/*
1810 * Journal_head storage management 1941 * Journal_head storage management
1811 */ 1942 */
1812static struct kmem_cache *jbd2_journal_head_cache; 1943static struct kmem_cache *jbd2_journal_head_cache;
@@ -2204,6 +2335,7 @@ static void jbd2_journal_destroy_caches(void)
2204 jbd2_journal_destroy_revoke_caches(); 2335 jbd2_journal_destroy_revoke_caches();
2205 jbd2_journal_destroy_jbd2_journal_head_cache(); 2336 jbd2_journal_destroy_jbd2_journal_head_cache();
2206 jbd2_journal_destroy_handle_cache(); 2337 jbd2_journal_destroy_handle_cache();
2338 jbd2_journal_destroy_slabs();
2207} 2339}
2208 2340
2209static int __init journal_init(void) 2341static int __init journal_init(void)
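The slab bookkeeping added above keys one cache off each block size: order_base_2(size) - 10 maps 1k..128k blocks onto jbd2_slab[0..7] and the matching jbd2_slab_names[] entry. A quick user-space check of that index computation (ceil_log2 is a portable stand-in for the kernel's order_base_2(); the program is an illustration, not kernel code):

#include <stdio.h>

/* Round-up log2, standing in for the kernel's order_base_2(). */
static int ceil_log2(unsigned long n)
{
	int bits = 0;

	while ((1UL << bits) < n)
		bits++;
	return bits;
}

int main(void)
{
	static const char *names[] = {
		"jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
		"jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
	};
	unsigned long size;

	for (size = 1024; size <= 131072; size <<= 1) {
		int i = ceil_log2(size) - 10;	/* order_base_2(size) - 10 */
		printf("%7lu -> jbd2_slab[%d] (%s)\n", size, i, names[i]);
	}
	return 0;
}

PAGE_SIZE blocks skip the slabs entirely (jbd2_journal_create_slab() returns 0 for them), so on a 4k-page machine only non-4k block sizes ever populate the array.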
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a0512700542f..bfc70f57900f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1727,6 +1727,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1727 if (!jh) 1727 if (!jh)
1728 goto zap_buffer_no_jh; 1728 goto zap_buffer_no_jh;
1729 1729
1730 /*
1731 * We cannot remove the buffer from checkpoint lists until the
1732 * transaction adding the inode to the orphan list (let's call it T)
1733 * is committed. Otherwise, if the transaction changing the
1734 * buffer were cleaned from the journal before T is
1735 * committed, a crash could cause the correct contents of
1736 * the buffer to be lost. On the other hand, we have to
1737 * clear the buffer dirty bit at the latest at the moment the
1738 * transaction marking the buffer as freed in the filesystem
1739 * structures is committed, because from that moment on the
1740 * buffer can be reallocated and used by a different page.
1741 * Since the block hasn't been freed yet but the inode has
1742 * already been added to the orphan list, it is safe for us to add
1743 * the buffer to BJ_Forget list of the newest transaction.
1744 */
1730 transaction = jh->b_transaction; 1745 transaction = jh->b_transaction;
1731 if (transaction == NULL) { 1746 if (transaction == NULL) {
1732 /* First case: not on any transaction. If it 1747 /* First case: not on any transaction. If it
@@ -1783,16 +1798,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1783 } else if (transaction == journal->j_committing_transaction) { 1798 } else if (transaction == journal->j_committing_transaction) {
1784 JBUFFER_TRACE(jh, "on committing transaction"); 1799 JBUFFER_TRACE(jh, "on committing transaction");
1785 /* 1800 /*
1786 * If it is committing, we simply cannot touch it. We 1801 * The buffer is committing, we simply cannot touch
1787 * can remove it's next_transaction pointer from the 1802 * it. So we just set j_next_transaction to the
1788 * running transaction if that is set, but nothing 1803 * running transaction (if there is one) and mark
1789 * else. */ 1804 * buffer as freed so that commit code knows it should
1805 * clear dirty bits when it is done with the buffer.
1806 */
1790 set_buffer_freed(bh); 1807 set_buffer_freed(bh);
1791 if (jh->b_next_transaction) { 1808 if (journal->j_running_transaction && buffer_jbddirty(bh))
1792 J_ASSERT(jh->b_next_transaction == 1809 jh->b_next_transaction = journal->j_running_transaction;
1793 journal->j_running_transaction);
1794 jh->b_next_transaction = NULL;
1795 }
1796 jbd2_journal_put_journal_head(jh); 1810 jbd2_journal_put_journal_head(jh);
1797 spin_unlock(&journal->j_list_lock); 1811 spin_unlock(&journal->j_list_lock);
1798 jbd_unlock_bh_state(bh); 1812 jbd_unlock_bh_state(bh);
@@ -1969,7 +1983,7 @@ void jbd2_journal_file_buffer(struct journal_head *jh,
1969 */ 1983 */
1970void __jbd2_journal_refile_buffer(struct journal_head *jh) 1984void __jbd2_journal_refile_buffer(struct journal_head *jh)
1971{ 1985{
1972 int was_dirty; 1986 int was_dirty, jlist;
1973 struct buffer_head *bh = jh2bh(jh); 1987 struct buffer_head *bh = jh2bh(jh);
1974 1988
1975 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); 1989 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
@@ -1991,8 +2005,13 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
1991 __jbd2_journal_temp_unlink_buffer(jh); 2005 __jbd2_journal_temp_unlink_buffer(jh);
1992 jh->b_transaction = jh->b_next_transaction; 2006 jh->b_transaction = jh->b_next_transaction;
1993 jh->b_next_transaction = NULL; 2007 jh->b_next_transaction = NULL;
1994 __jbd2_journal_file_buffer(jh, jh->b_transaction, 2008 if (buffer_freed(bh))
1995 jh->b_modified ? BJ_Metadata : BJ_Reserved); 2009 jlist = BJ_Forget;
2010 else if (jh->b_modified)
2011 jlist = BJ_Metadata;
2012 else
2013 jlist = BJ_Reserved;
2014 __jbd2_journal_file_buffer(jh, jh->b_transaction, jlist);
1996 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); 2015 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
1997 2016
1998 if (was_dirty) 2017 if (was_dirty)
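The refile change above turns a two-way choice into a three-way one. Read in isolation, the decision is: a buffer whose block has been freed goes to the forget list so commit can drop it, a modified buffer carries metadata to journal, and anything else merely holds its reservation. A minimal restatement (using jbd2's BJ_* constants; the flag parameters stand in for buffer_freed(bh) and jh->b_modified):

static int pick_jlist(int freed, int modified)
{
	if (freed)
		return BJ_Forget;	/* block is gone: let commit drop it */
	if (modified)
		return BJ_Metadata;	/* dirty metadata must be journaled */
	return BJ_Reserved;		/* credits held, nothing written yet */
}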
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index e22de8397b74..d32ee9412cb9 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -567,7 +567,7 @@ static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
567 else BUG(); 567 else BUG();
568 } 568 }
569 } 569 }
570 list->rb_node = NULL; 570 *list = RB_ROOT;
571} 571}
572 572
573static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd) 573static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd)
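The one-liner above swaps direct field poking for the RB_ROOT initializer. The idiom is worth copying: assigning RB_ROOT resets the whole struct rb_root, so it stays correct even if the structure ever grows new fields. A minimal sketch:

#include <linux/rbtree.h>

/* Reset an rbtree after its nodes have been freed; equivalent to
 * the *list = RB_ROOT assignment in the hunk above. */
static void reset_tree(struct rb_root *root)
{
	*root = RB_ROOT;
}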
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index d66477c34306..213169780b6c 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -20,7 +20,6 @@
20 20
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/quotaops.h>
24#include <linux/posix_acl_xattr.h> 23#include <linux/posix_acl_xattr.h>
25#include "jfs_incore.h" 24#include "jfs_incore.h"
26#include "jfs_txnmgr.h" 25#include "jfs_txnmgr.h"
@@ -174,7 +173,7 @@ cleanup:
174 return rc; 173 return rc;
175} 174}
176 175
177static int jfs_acl_chmod(struct inode *inode) 176int jfs_acl_chmod(struct inode *inode)
178{ 177{
179 struct posix_acl *acl, *clone; 178 struct posix_acl *acl, *clone;
180 int rc; 179 int rc;
@@ -205,26 +204,3 @@ static int jfs_acl_chmod(struct inode *inode)
205 posix_acl_release(clone); 204 posix_acl_release(clone);
206 return rc; 205 return rc;
207} 206}
208
209int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
210{
211 struct inode *inode = dentry->d_inode;
212 int rc;
213
214 rc = inode_change_ok(inode, iattr);
215 if (rc)
216 return rc;
217
218 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
219 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
220 if (vfs_dq_transfer(inode, iattr))
221 return -EDQUOT;
222 }
223
224 rc = inode_setattr(inode, iattr);
225
226 if (!rc && (iattr->ia_valid & ATTR_MODE))
227 rc = jfs_acl_chmod(inode);
228
229 return rc;
230}
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 2b70fa78e4a7..14ba982b3f24 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/quotaops.h>
21#include "jfs_incore.h" 22#include "jfs_incore.h"
22#include "jfs_inode.h" 23#include "jfs_inode.h"
23#include "jfs_dmap.h" 24#include "jfs_dmap.h"
@@ -47,7 +48,7 @@ static int jfs_open(struct inode *inode, struct file *file)
47{ 48{
48 int rc; 49 int rc;
49 50
50 if ((rc = generic_file_open(inode, file))) 51 if ((rc = dquot_file_open(inode, file)))
51 return rc; 52 return rc;
52 53
53 /* 54 /*
@@ -88,14 +89,40 @@ static int jfs_release(struct inode *inode, struct file *file)
88 return 0; 89 return 0;
89} 90}
90 91
92int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
93{
94 struct inode *inode = dentry->d_inode;
95 int rc;
96
97 rc = inode_change_ok(inode, iattr);
98 if (rc)
99 return rc;
100
101 if (iattr->ia_valid & ATTR_SIZE)
102 dquot_initialize(inode);
103 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
104 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
105 rc = dquot_transfer(inode, iattr);
106 if (rc)
107 return rc;
108 }
109
110 rc = inode_setattr(inode, iattr);
111
112 if (!rc && (iattr->ia_valid & ATTR_MODE))
113 rc = jfs_acl_chmod(inode);
114
115 return rc;
116}
117
91const struct inode_operations jfs_file_inode_operations = { 118const struct inode_operations jfs_file_inode_operations = {
92 .truncate = jfs_truncate, 119 .truncate = jfs_truncate,
93 .setxattr = jfs_setxattr, 120 .setxattr = jfs_setxattr,
94 .getxattr = jfs_getxattr, 121 .getxattr = jfs_getxattr,
95 .listxattr = jfs_listxattr, 122 .listxattr = jfs_listxattr,
96 .removexattr = jfs_removexattr, 123 .removexattr = jfs_removexattr,
97#ifdef CONFIG_JFS_POSIX_ACL
98 .setattr = jfs_setattr, 124 .setattr = jfs_setattr,
125#ifdef CONFIG_JFS_POSIX_ACL
99 .check_acl = jfs_check_acl, 126 .check_acl = jfs_check_acl,
100#endif 127#endif
101}; 128};
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index b2ae190a77ba..9dd126276c9f 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -22,6 +22,7 @@
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/pagemap.h> 23#include <linux/pagemap.h>
24#include <linux/quotaops.h> 24#include <linux/quotaops.h>
25#include <linux/writeback.h>
25#include "jfs_incore.h" 26#include "jfs_incore.h"
26#include "jfs_inode.h" 27#include "jfs_inode.h"
27#include "jfs_filsys.h" 28#include "jfs_filsys.h"
@@ -120,8 +121,10 @@ int jfs_commit_inode(struct inode *inode, int wait)
120 return rc; 121 return rc;
121} 122}
122 123
123int jfs_write_inode(struct inode *inode, int wait) 124int jfs_write_inode(struct inode *inode, struct writeback_control *wbc)
124{ 125{
126 int wait = wbc->sync_mode == WB_SYNC_ALL;
127
125 if (test_cflag(COMMIT_Nolink, inode)) 128 if (test_cflag(COMMIT_Nolink, inode))
126 return 0; 129 return 0;
127 /* 130 /*
@@ -146,6 +149,9 @@ void jfs_delete_inode(struct inode *inode)
146{ 149{
147 jfs_info("In jfs_delete_inode, inode = 0x%p", inode); 150 jfs_info("In jfs_delete_inode, inode = 0x%p", inode);
148 151
152 if (!is_bad_inode(inode))
153 dquot_initialize(inode);
154
149 if (!is_bad_inode(inode) && 155 if (!is_bad_inode(inode) &&
150 (JFS_IP(inode)->fileset == FILESYSTEM_I)) { 156 (JFS_IP(inode)->fileset == FILESYSTEM_I)) {
151 truncate_inode_pages(&inode->i_data, 0); 157 truncate_inode_pages(&inode->i_data, 0);
@@ -158,9 +164,9 @@ void jfs_delete_inode(struct inode *inode)
158 /* 164 /*
159 * Free the inode from the quota allocation. 165 * Free the inode from the quota allocation.
160 */ 166 */
161 vfs_dq_init(inode); 167 dquot_initialize(inode);
162 vfs_dq_free_inode(inode); 168 dquot_free_inode(inode);
163 vfs_dq_drop(inode); 169 dquot_drop(inode);
164 } 170 }
165 171
166 clear_inode(inode); 172 clear_inode(inode);
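The ->write_inode change above tracks a VFS-wide interface update: the bare 'wait' flag becomes a struct writeback_control, and synchronous behaviour is requested via WB_SYNC_ALL. A minimal sketch of an implementation adapting to the new signature (the commit helper is hypothetical):

#include <linux/fs.h>
#include <linux/writeback.h>

static int example_write_inode(struct inode *inode,
			       struct writeback_control *wbc)
{
	/* WB_SYNC_ALL corresponds to the old wait == 1 case. */
	int wait = wbc->sync_mode == WB_SYNC_ALL;

	return example_commit_inode(inode, wait);	/* hypothetical */
}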
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index b07bd417ef85..54e07559878d 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -22,7 +22,7 @@
22 22
23int jfs_check_acl(struct inode *, int); 23int jfs_check_acl(struct inode *, int);
24int jfs_init_acl(tid_t, struct inode *, struct inode *); 24int jfs_init_acl(tid_t, struct inode *, struct inode *);
25int jfs_setattr(struct dentry *, struct iattr *); 25int jfs_acl_chmod(struct inode *inode);
26 26
27#else 27#else
28 28
@@ -32,5 +32,10 @@ static inline int jfs_init_acl(tid_t tid, struct inode *inode,
32 return 0; 32 return 0;
33} 33}
34 34
35static inline int jfs_acl_chmod(struct inode *inode)
36{
37 return 0;
38}
39
35#endif 40#endif
36#endif /* _H_JFS_ACL */ 41#endif /* _H_JFS_ACL */
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 925871e9887b..0e4623be70ce 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -381,10 +381,10 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
381 * It's time to move the inline table to an external 381 * It's time to move the inline table to an external
382 * page and begin to build the xtree 382 * page and begin to build the xtree
383 */ 383 */
384 if (vfs_dq_alloc_block(ip, sbi->nbperpage)) 384 if (dquot_alloc_block(ip, sbi->nbperpage))
385 goto clean_up; 385 goto clean_up;
386 if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) { 386 if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) {
387 vfs_dq_free_block(ip, sbi->nbperpage); 387 dquot_free_block(ip, sbi->nbperpage);
388 goto clean_up; 388 goto clean_up;
389 } 389 }
390 390
@@ -408,7 +408,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
408 memcpy(&jfs_ip->i_dirtable, temp_table, 408 memcpy(&jfs_ip->i_dirtable, temp_table,
409 sizeof (temp_table)); 409 sizeof (temp_table));
410 dbFree(ip, xaddr, sbi->nbperpage); 410 dbFree(ip, xaddr, sbi->nbperpage);
411 vfs_dq_free_block(ip, sbi->nbperpage); 411 dquot_free_block(ip, sbi->nbperpage);
412 goto clean_up; 412 goto clean_up;
413 } 413 }
414 ip->i_size = PSIZE; 414 ip->i_size = PSIZE;
@@ -1027,10 +1027,9 @@ static int dtSplitUp(tid_t tid,
1027 n = xlen; 1027 n = xlen;
1028 1028
1029 /* Allocate blocks to quota. */ 1029 /* Allocate blocks to quota. */
1030 if (vfs_dq_alloc_block(ip, n)) { 1030 rc = dquot_alloc_block(ip, n);
1031 rc = -EDQUOT; 1031 if (rc)
1032 goto extendOut; 1032 goto extendOut;
1033 }
1034 quota_allocation += n; 1033 quota_allocation += n;
1035 1034
1036 if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, 1035 if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen,
@@ -1308,7 +1307,7 @@ static int dtSplitUp(tid_t tid,
1308 1307
1309 /* Rollback quota allocation */ 1308 /* Rollback quota allocation */
1310 if (rc && quota_allocation) 1309 if (rc && quota_allocation)
1311 vfs_dq_free_block(ip, quota_allocation); 1310 dquot_free_block(ip, quota_allocation);
1312 1311
1313 dtSplitUp_Exit: 1312 dtSplitUp_Exit:
1314 1313
@@ -1369,9 +1368,10 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1369 return -EIO; 1368 return -EIO;
1370 1369
1371 /* Allocate blocks to quota. */ 1370 /* Allocate blocks to quota. */
1372 if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { 1371 rc = dquot_alloc_block(ip, lengthPXD(pxd));
1372 if (rc) {
1373 release_metapage(rmp); 1373 release_metapage(rmp);
1374 return -EDQUOT; 1374 return rc;
1375 } 1375 }
1376 1376
1377 jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); 1377 jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);
@@ -1892,6 +1892,7 @@ static int dtSplitRoot(tid_t tid,
1892 struct dt_lock *dtlck; 1892 struct dt_lock *dtlck;
1893 struct tlock *tlck; 1893 struct tlock *tlck;
1894 struct lv *lv; 1894 struct lv *lv;
1895 int rc;
1895 1896
1896 /* get split root page */ 1897 /* get split root page */
1897 smp = split->mp; 1898 smp = split->mp;
@@ -1916,9 +1917,10 @@ static int dtSplitRoot(tid_t tid,
1916 rp = rmp->data; 1917 rp = rmp->data;
1917 1918
1918 /* Allocate blocks to quota. */ 1919 /* Allocate blocks to quota. */
1919 if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { 1920 rc = dquot_alloc_block(ip, lengthPXD(pxd));
1921 if (rc) {
1920 release_metapage(rmp); 1922 release_metapage(rmp);
1921 return -EDQUOT; 1923 return rc;
1922 } 1924 }
1923 1925
1924 BT_MARK_DIRTY(rmp, ip); 1926 BT_MARK_DIRTY(rmp, ip);
@@ -2287,7 +2289,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2287 xlen = lengthPXD(&fp->header.self); 2289 xlen = lengthPXD(&fp->header.self);
2288 2290
2289 /* Free quota allocation. */ 2291 /* Free quota allocation. */
2290 vfs_dq_free_block(ip, xlen); 2292 dquot_free_block(ip, xlen);
2291 2293
2292 /* free/invalidate its buffer page */ 2294 /* free/invalidate its buffer page */
2293 discard_metapage(fmp); 2295 discard_metapage(fmp);
@@ -2363,7 +2365,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2363 xlen = lengthPXD(&p->header.self); 2365 xlen = lengthPXD(&p->header.self);
2364 2366
2365 /* Free quota allocation */ 2367 /* Free quota allocation */
2366 vfs_dq_free_block(ip, xlen); 2368 dquot_free_block(ip, xlen);
2367 2369
2368 /* free/invalidate its buffer page */ 2370 /* free/invalidate its buffer page */
2369 discard_metapage(mp); 2371 discard_metapage(mp);
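Every vfs_dq_alloc_block() site in this file is converted the same way: the old helper returned a boolean and callers had to hard-code -EDQUOT, while dquot_alloc_block() returns a real errno that is simply passed up. A sketch of the resulting call-site shape (names are placeholders):

#include <linux/quotaops.h>

/* Post-conversion pattern: propagate whatever errno the quota layer
 * reports instead of assuming -EDQUOT. */
static int charge_blocks(struct inode *ip, qsize_t n)
{
	int rc = dquot_alloc_block(ip, n);

	if (rc)
		return rc;	/* previously: return -EDQUOT; */
	/* ... perform the on-disk allocation, rolling back with
	 * dquot_free_block(ip, n) on failure ... */
	return 0;
}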
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 41d6045dbeb0..5d3bbd10f8db 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -141,10 +141,11 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
141 } 141 }
142 142
143 /* Allocate blocks to quota. */ 143 /* Allocate blocks to quota. */
144 if (vfs_dq_alloc_block(ip, nxlen)) { 144 rc = dquot_alloc_block(ip, nxlen);
145 if (rc) {
145 dbFree(ip, nxaddr, (s64) nxlen); 146 dbFree(ip, nxaddr, (s64) nxlen);
146 mutex_unlock(&JFS_IP(ip)->commit_mutex); 147 mutex_unlock(&JFS_IP(ip)->commit_mutex);
147 return -EDQUOT; 148 return rc;
148 } 149 }
149 150
150 /* determine the value of the extent flag */ 151 /* determine the value of the extent flag */
@@ -164,7 +165,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
164 */ 165 */
165 if (rc) { 166 if (rc) {
166 dbFree(ip, nxaddr, nxlen); 167 dbFree(ip, nxaddr, nxlen);
167 vfs_dq_free_block(ip, nxlen); 168 dquot_free_block(ip, nxlen);
168 mutex_unlock(&JFS_IP(ip)->commit_mutex); 169 mutex_unlock(&JFS_IP(ip)->commit_mutex);
169 return (rc); 170 return (rc);
170 } 171 }
@@ -256,10 +257,11 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
256 goto exit; 257 goto exit;
257 258
258 /* Allocat blocks to quota. */ 259 /* Allocat blocks to quota. */
259 if (vfs_dq_alloc_block(ip, nxlen)) { 260 rc = dquot_alloc_block(ip, nxlen);
261 if (rc) {
260 dbFree(ip, nxaddr, (s64) nxlen); 262 dbFree(ip, nxaddr, (s64) nxlen);
261 mutex_unlock(&JFS_IP(ip)->commit_mutex); 263 mutex_unlock(&JFS_IP(ip)->commit_mutex);
262 return -EDQUOT; 264 return rc;
263 } 265 }
264 266
265 delta = nxlen - xlen; 267 delta = nxlen - xlen;
@@ -297,7 +299,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
297 /* extend the extent */ 299 /* extend the extent */
298 if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) { 300 if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) {
299 dbFree(ip, xaddr + xlen, delta); 301 dbFree(ip, xaddr + xlen, delta);
300 vfs_dq_free_block(ip, nxlen); 302 dquot_free_block(ip, nxlen);
301 goto exit; 303 goto exit;
302 } 304 }
303 } else { 305 } else {
@@ -308,7 +310,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
308 */ 310 */
309 if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) { 311 if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) {
310 dbFree(ip, nxaddr, nxlen); 312 dbFree(ip, nxaddr, nxlen);
311 vfs_dq_free_block(ip, nxlen); 313 dquot_free_block(ip, nxlen);
312 goto exit; 314 goto exit;
313 } 315 }
314 } 316 }
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index dc0e02159ac9..829921b67765 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -116,10 +116,10 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
116 /* 116 /*
117 * Allocate inode to quota. 117 * Allocate inode to quota.
118 */ 118 */
119 if (vfs_dq_alloc_inode(inode)) { 119 dquot_initialize(inode);
120 rc = -EDQUOT; 120 rc = dquot_alloc_inode(inode);
121 if (rc)
121 goto fail_drop; 122 goto fail_drop;
122 }
123 123
124 inode->i_mode = mode; 124 inode->i_mode = mode;
125 /* inherit flags from parent */ 125 /* inherit flags from parent */
@@ -162,7 +162,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
162 return inode; 162 return inode;
163 163
164fail_drop: 164fail_drop:
165 vfs_dq_drop(inode); 165 dquot_drop(inode);
166 inode->i_flags |= S_NOQUOTA; 166 inode->i_flags |= S_NOQUOTA;
167fail_unlock: 167fail_unlock:
168 inode->i_nlink = 0; 168 inode->i_nlink = 0;
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 1eff7db34d63..79e2c79661df 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -26,7 +26,7 @@ extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); 26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
27extern struct inode *jfs_iget(struct super_block *, unsigned long); 27extern struct inode *jfs_iget(struct super_block *, unsigned long);
28extern int jfs_commit_inode(struct inode *, int); 28extern int jfs_commit_inode(struct inode *, int);
29extern int jfs_write_inode(struct inode*, int); 29extern int jfs_write_inode(struct inode *, struct writeback_control *);
30extern void jfs_delete_inode(struct inode *); 30extern void jfs_delete_inode(struct inode *);
31extern void jfs_dirty_inode(struct inode *); 31extern void jfs_dirty_inode(struct inode *);
32extern void jfs_truncate(struct inode *); 32extern void jfs_truncate(struct inode *);
@@ -40,6 +40,7 @@ extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid,
40 int fh_len, int fh_type); 40 int fh_len, int fh_type);
41extern void jfs_set_inode_flags(struct inode *); 41extern void jfs_set_inode_flags(struct inode *);
42extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); 42extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
43extern int jfs_setattr(struct dentry *, struct iattr *);
43 44
44extern const struct address_space_operations jfs_aops; 45extern const struct address_space_operations jfs_aops;
45extern const struct inode_operations jfs_dir_inode_operations; 46extern const struct inode_operations jfs_dir_inode_operations;
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index d654a6458648..6c50871e6220 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -585,10 +585,10 @@ int xtInsert(tid_t tid, /* transaction id */
585 hint = addressXAD(xad) + lengthXAD(xad) - 1; 585 hint = addressXAD(xad) + lengthXAD(xad) - 1;
586 } else 586 } else
587 hint = 0; 587 hint = 0;
588 if ((rc = vfs_dq_alloc_block(ip, xlen))) 588 if ((rc = dquot_alloc_block(ip, xlen)))
589 goto out; 589 goto out;
590 if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) { 590 if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) {
591 vfs_dq_free_block(ip, xlen); 591 dquot_free_block(ip, xlen);
592 goto out; 592 goto out;
593 } 593 }
594 } 594 }
@@ -617,7 +617,7 @@ int xtInsert(tid_t tid, /* transaction id */
617 /* undo data extent allocation */ 617 /* undo data extent allocation */
618 if (*xaddrp == 0) { 618 if (*xaddrp == 0) {
619 dbFree(ip, xaddr, (s64) xlen); 619 dbFree(ip, xaddr, (s64) xlen);
620 vfs_dq_free_block(ip, xlen); 620 dquot_free_block(ip, xlen);
621 } 621 }
622 return rc; 622 return rc;
623 } 623 }
@@ -985,10 +985,9 @@ xtSplitPage(tid_t tid, struct inode *ip,
985 rbn = addressPXD(pxd); 985 rbn = addressPXD(pxd);
986 986
987 /* Allocate blocks to quota. */ 987 /* Allocate blocks to quota. */
988 if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { 988 rc = dquot_alloc_block(ip, lengthPXD(pxd));
989 rc = -EDQUOT; 989 if (rc)
990 goto clean_up; 990 goto clean_up;
991 }
992 991
993 quota_allocation += lengthPXD(pxd); 992 quota_allocation += lengthPXD(pxd);
994 993
@@ -1195,7 +1194,7 @@ xtSplitPage(tid_t tid, struct inode *ip,
1195 1194
1196 /* Rollback quota allocation. */ 1195 /* Rollback quota allocation. */
1197 if (quota_allocation) 1196 if (quota_allocation)
1198 vfs_dq_free_block(ip, quota_allocation); 1197 dquot_free_block(ip, quota_allocation);
1199 1198
1200 return (rc); 1199 return (rc);
1201} 1200}
@@ -1235,6 +1234,7 @@ xtSplitRoot(tid_t tid,
1235 struct pxdlist *pxdlist; 1234 struct pxdlist *pxdlist;
1236 struct tlock *tlck; 1235 struct tlock *tlck;
1237 struct xtlock *xtlck; 1236 struct xtlock *xtlck;
1237 int rc;
1238 1238
1239 sp = &JFS_IP(ip)->i_xtroot; 1239 sp = &JFS_IP(ip)->i_xtroot;
1240 1240
@@ -1252,9 +1252,10 @@ xtSplitRoot(tid_t tid,
1252 return -EIO; 1252 return -EIO;
1253 1253
1254 /* Allocate blocks to quota. */ 1254 /* Allocate blocks to quota. */
1255 if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { 1255 rc = dquot_alloc_block(ip, lengthPXD(pxd));
1256 if (rc) {
1256 release_metapage(rmp); 1257 release_metapage(rmp);
1257 return -EDQUOT; 1258 return rc;
1258 } 1259 }
1259 1260
1260 jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp); 1261 jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp);
@@ -3680,7 +3681,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3680 ip->i_size = newsize; 3681 ip->i_size = newsize;
3681 3682
3682 /* update quota allocation to reflect freed blocks */ 3683 /* update quota allocation to reflect freed blocks */
3683 vfs_dq_free_block(ip, nfreed); 3684 dquot_free_block(ip, nfreed);
3684 3685
3685 /* 3686 /*
3686 * free tlock of invalidated pages 3687 * free tlock of invalidated pages
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index c79a4270f083..4a3e9f39c21d 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -85,6 +85,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
85 85
86 jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name); 86 jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name);
87 87
88 dquot_initialize(dip);
89
88 /* 90 /*
89 * search parent directory for entry/freespace 91 * search parent directory for entry/freespace
90 * (dtSearch() returns parent directory page pinned) 92 * (dtSearch() returns parent directory page pinned)
@@ -215,6 +217,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
215 217
216 jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name); 218 jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name);
217 219
220 dquot_initialize(dip);
221
218 /* link count overflow on parent directory ? */ 222 /* link count overflow on parent directory ? */
219 if (dip->i_nlink == JFS_LINK_MAX) { 223 if (dip->i_nlink == JFS_LINK_MAX) {
220 rc = -EMLINK; 224 rc = -EMLINK;
@@ -356,7 +360,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
356 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); 360 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
357 361
358 /* Init inode for quota operations. */ 362 /* Init inode for quota operations. */
359 vfs_dq_init(ip); 363 dquot_initialize(dip);
364 dquot_initialize(ip);
360 365
361 /* directory must be empty to be removed */ 366 /* directory must be empty to be removed */
362 if (!dtEmpty(ip)) { 367 if (!dtEmpty(ip)) {
@@ -483,7 +488,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
483 jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name); 488 jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name);
484 489
485 /* Init inode for quota operations. */ 490 /* Init inode for quota operations. */
486 vfs_dq_init(ip); 491 dquot_initialize(dip);
492 dquot_initialize(ip);
487 493
488 if ((rc = get_UCSname(&dname, dentry))) 494 if ((rc = get_UCSname(&dname, dentry)))
489 goto out; 495 goto out;
@@ -805,6 +811,8 @@ static int jfs_link(struct dentry *old_dentry,
805 if (ip->i_nlink == 0) 811 if (ip->i_nlink == 0)
806 return -ENOENT; 812 return -ENOENT;
807 813
814 dquot_initialize(dir);
815
808 tid = txBegin(ip->i_sb, 0); 816 tid = txBegin(ip->i_sb, 0);
809 817
810 mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); 818 mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT);
@@ -896,6 +904,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
896 904
897 jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name); 905 jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name);
898 906
907 dquot_initialize(dip);
908
899 ssize = strlen(name) + 1; 909 ssize = strlen(name) + 1;
900 910
901 /* 911 /*
@@ -1087,6 +1097,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1087 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, 1097 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
1088 new_dentry->d_name.name); 1098 new_dentry->d_name.name);
1089 1099
1100 dquot_initialize(old_dir);
1101 dquot_initialize(new_dir);
1102
1090 old_ip = old_dentry->d_inode; 1103 old_ip = old_dentry->d_inode;
1091 new_ip = new_dentry->d_inode; 1104 new_ip = new_dentry->d_inode;
1092 1105
@@ -1136,7 +1149,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1136 } else if (new_ip) { 1149 } else if (new_ip) {
1137 IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); 1150 IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL);
1138 /* Init inode for quota operations. */ 1151 /* Init inode for quota operations. */
1139 vfs_dq_init(new_ip); 1152 dquot_initialize(new_ip);
1140 } 1153 }
1141 1154
1142 /* 1155 /*
@@ -1360,6 +1373,8 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1360 1373
1361 jfs_info("jfs_mknod: %s", dentry->d_name.name); 1374 jfs_info("jfs_mknod: %s", dentry->d_name.name);
1362 1375
1376 dquot_initialize(dir);
1377
1363 if ((rc = get_UCSname(&dname, dentry))) 1378 if ((rc = get_UCSname(&dname, dentry)))
1364 goto out; 1379 goto out;
1365 1380
@@ -1541,8 +1556,8 @@ const struct inode_operations jfs_dir_inode_operations = {
1541 .getxattr = jfs_getxattr, 1556 .getxattr = jfs_getxattr,
1542 .listxattr = jfs_listxattr, 1557 .listxattr = jfs_listxattr,
1543 .removexattr = jfs_removexattr, 1558 .removexattr = jfs_removexattr,
1544#ifdef CONFIG_JFS_POSIX_ACL
1545 .setattr = jfs_setattr, 1559 .setattr = jfs_setattr,
1560#ifdef CONFIG_JFS_POSIX_ACL
1546 .check_acl = jfs_check_acl, 1561 .check_acl = jfs_check_acl,
1547#endif 1562#endif
1548}; 1563};
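Taken together, the namei.c hunks apply one rule: initialise quotas for every inode whose usage a directory operation may change, before the operation starts. That is the parent directory in all cases, plus the victim inode when something is being removed. A condensed sketch (the unlink body is elided):

#include <linux/quotaops.h>

static int example_unlink(struct inode *dir, struct dentry *dentry)
{
	dquot_initialize(dir);			/* parent always */
	dquot_initialize(dentry->d_inode);	/* victim loses blocks/inode */

	/* ... filesystem-specific unlink work ... */
	return 0;
}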
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index d929a822a74e..266699deb1c6 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -131,6 +131,11 @@ static void jfs_destroy_inode(struct inode *inode)
131 kmem_cache_free(jfs_inode_cachep, ji); 131 kmem_cache_free(jfs_inode_cachep, ji);
132} 132}
133 133
134static void jfs_clear_inode(struct inode *inode)
135{
136 dquot_drop(inode);
137}
138
134static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) 139static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
135{ 140{
136 struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb); 141 struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb);
@@ -745,6 +750,7 @@ static const struct super_operations jfs_super_operations = {
745 .dirty_inode = jfs_dirty_inode, 750 .dirty_inode = jfs_dirty_inode,
746 .write_inode = jfs_write_inode, 751 .write_inode = jfs_write_inode,
747 .delete_inode = jfs_delete_inode, 752 .delete_inode = jfs_delete_inode,
753 .clear_inode = jfs_clear_inode,
748 .put_super = jfs_put_super, 754 .put_super = jfs_put_super,
749 .sync_fs = jfs_sync_fs, 755 .sync_fs = jfs_sync_fs,
750 .freeze_fs = jfs_freeze, 756 .freeze_fs = jfs_freeze,
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index fad364548bc9..1f594ab21895 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -260,14 +260,14 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
260 nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits; 260 nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits;
261 261
262 /* Allocate new blocks to quota. */ 262 /* Allocate new blocks to quota. */
263 if (vfs_dq_alloc_block(ip, nblocks)) { 263 rc = dquot_alloc_block(ip, nblocks);
264 return -EDQUOT; 264 if (rc)
265 } 265 return rc;
266 266
267 rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno); 267 rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno);
268 if (rc) { 268 if (rc) {
269 /*Rollback quota allocation. */ 269 /*Rollback quota allocation. */
270 vfs_dq_free_block(ip, nblocks); 270 dquot_free_block(ip, nblocks);
271 return rc; 271 return rc;
272 } 272 }
273 273
@@ -332,7 +332,7 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
332 332
333 failed: 333 failed:
334 /* Rollback quota allocation. */ 334 /* Rollback quota allocation. */
335 vfs_dq_free_block(ip, nblocks); 335 dquot_free_block(ip, nblocks);
336 336
337 dbFree(ip, blkno, nblocks); 337 dbFree(ip, blkno, nblocks);
338 return rc; 338 return rc;
@@ -538,7 +538,8 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
538 538
539 if (blocks_needed > current_blocks) { 539 if (blocks_needed > current_blocks) {
540 /* Allocate new blocks to quota. */ 540 /* Allocate new blocks to quota. */
541 if (vfs_dq_alloc_block(inode, blocks_needed)) 541 rc = dquot_alloc_block(inode, blocks_needed);
542 if (rc)
542 return -EDQUOT; 543 return -EDQUOT;
543 544
544 quota_allocation = blocks_needed; 545 quota_allocation = blocks_needed;
@@ -602,7 +603,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
602 clean_up: 603 clean_up:
603 /* Rollback quota allocation */ 604 /* Rollback quota allocation */
604 if (quota_allocation) 605 if (quota_allocation)
605 vfs_dq_free_block(inode, quota_allocation); 606 dquot_free_block(inode, quota_allocation);
606 607
607 return (rc); 608 return (rc);
608} 609}
@@ -677,7 +678,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf,
677 678
678 /* If old blocks exist, they must be removed from quota allocation. */ 679 /* If old blocks exist, they must be removed from quota allocation. */
679 if (old_blocks) 680 if (old_blocks)
680 vfs_dq_free_block(inode, old_blocks); 681 dquot_free_block(inode, old_blocks);
681 682
682 inode->i_ctime = CURRENT_TIME; 683 inode->i_ctime = CURRENT_TIME;
683 684
diff --git a/fs/libfs.c b/fs/libfs.c
index 6e8d17e1dc4c..9e50bcf55857 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -338,28 +338,14 @@ int simple_readpage(struct file *file, struct page *page)
338 return 0; 338 return 0;
339} 339}
340 340
341int simple_prepare_write(struct file *file, struct page *page,
342 unsigned from, unsigned to)
343{
344 if (!PageUptodate(page)) {
345 if (to - from != PAGE_CACHE_SIZE)
346 zero_user_segments(page,
347 0, from,
348 to, PAGE_CACHE_SIZE);
349 }
350 return 0;
351}
352
353int simple_write_begin(struct file *file, struct address_space *mapping, 341int simple_write_begin(struct file *file, struct address_space *mapping,
354 loff_t pos, unsigned len, unsigned flags, 342 loff_t pos, unsigned len, unsigned flags,
355 struct page **pagep, void **fsdata) 343 struct page **pagep, void **fsdata)
356{ 344{
357 struct page *page; 345 struct page *page;
358 pgoff_t index; 346 pgoff_t index;
359 unsigned from;
360 347
361 index = pos >> PAGE_CACHE_SHIFT; 348 index = pos >> PAGE_CACHE_SHIFT;
362 from = pos & (PAGE_CACHE_SIZE - 1);
363 349
364 page = grab_cache_page_write_begin(mapping, index, flags); 350 page = grab_cache_page_write_begin(mapping, index, flags);
365 if (!page) 351 if (!page)
@@ -367,43 +353,59 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
367 353
368 *pagep = page; 354 *pagep = page;
369 355
370 return simple_prepare_write(file, page, from, from+len); 356 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
371} 357 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
372
373static int simple_commit_write(struct file *file, struct page *page,
374 unsigned from, unsigned to)
375{
376 struct inode *inode = page->mapping->host;
377 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
378 358
379 if (!PageUptodate(page)) 359 zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE);
380 SetPageUptodate(page); 360 }
381 /*
382 * No need to use i_size_read() here, the i_size
383 * cannot change under us because we hold the i_mutex.
384 */
385 if (pos > inode->i_size)
386 i_size_write(inode, pos);
387 set_page_dirty(page);
388 return 0; 361 return 0;
389} 362}
390 363
364/**
365 * simple_write_end - .write_end helper for non-block-device FSes
366 * @available: See .write_end of address_space_operations
367 * @file: "
368 * @mapping: "
369 * @pos: "
370 * @len: "
371 * @copied: "
372 * @page: "
373 * @fsdata: "
374 *
375 * simple_write_end does the minimum needed for updating a page after writing is
376 * done. It has the same API signature as the .write_end of
377 * address_space_operations vector. So it can just be set onto .write_end for
378 * FSes that don't need any other processing. i_mutex is assumed to be held.
379 * Block based filesystems should use generic_write_end().
380 * NOTE: Even though i_size might get updated by this function, mark_inode_dirty
381 * is not called, so a filesystem that actually does store data in .write_inode
382 * should extend on what's done here with a call to mark_inode_dirty() in the
383 * case that i_size has changed.
384 */
391int simple_write_end(struct file *file, struct address_space *mapping, 385int simple_write_end(struct file *file, struct address_space *mapping,
392 loff_t pos, unsigned len, unsigned copied, 386 loff_t pos, unsigned len, unsigned copied,
393 struct page *page, void *fsdata) 387 struct page *page, void *fsdata)
394{ 388{
395 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 389 struct inode *inode = page->mapping->host;
390 loff_t last_pos = pos + copied;
396 391
397 /* zero the stale part of the page if we did a short copy */ 392 /* zero the stale part of the page if we did a short copy */
398 if (copied < len) { 393 if (copied < len) {
399 void *kaddr = kmap_atomic(page, KM_USER0); 394 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
400 memset(kaddr + from + copied, 0, len - copied); 395
401 flush_dcache_page(page); 396 zero_user(page, from + copied, len - copied);
402 kunmap_atomic(kaddr, KM_USER0);
403 } 397 }
404 398
405 simple_commit_write(file, page, from, from+copied); 399 if (!PageUptodate(page))
400 SetPageUptodate(page);
401 /*
402 * No need to use i_size_read() here, the i_size
403 * cannot change under us because we hold the i_mutex.
404 */
405 if (last_pos > inode->i_size)
406 i_size_write(inode, last_pos);
406 407
408 set_page_dirty(page);
407 unlock_page(page); 409 unlock_page(page);
408 page_cache_release(page); 410 page_cache_release(page);
409 411
@@ -853,7 +855,6 @@ EXPORT_SYMBOL(simple_getattr);
853EXPORT_SYMBOL(simple_link); 855EXPORT_SYMBOL(simple_link);
854EXPORT_SYMBOL(simple_lookup); 856EXPORT_SYMBOL(simple_lookup);
855EXPORT_SYMBOL(simple_pin_fs); 857EXPORT_SYMBOL(simple_pin_fs);
856EXPORT_UNUSED_SYMBOL(simple_prepare_write);
857EXPORT_SYMBOL(simple_readpage); 858EXPORT_SYMBOL(simple_readpage);
858EXPORT_SYMBOL(simple_release_fs); 859EXPORT_SYMBOL(simple_release_fs);
859EXPORT_SYMBOL(simple_rename); 860EXPORT_SYMBOL(simple_rename);
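With simple_prepare_write() gone, the libfs write path is fully expressed by the write_begin/write_end pair, and a RAM-backed filesystem can wire its address space operations straight to the helpers, ramfs-style:

#include <linux/fs.h>

/* Minimal sketch of an aops table built purely from libfs helpers;
 * suitable only for filesystems that keep all data in the page
 * cache and need no block mapping. */
static const struct address_space_operations example_aops = {
	.readpage	= simple_readpage,
	.write_begin	= simple_write_begin,
	.write_end	= simple_write_end,
};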
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 4600c2037b8b..bb464d12104c 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -479,8 +479,8 @@ again: mutex_lock(&nlm_host_mutex);
479 } 479 }
480 } 480 }
481 } 481 }
482
483 mutex_unlock(&nlm_host_mutex); 482 mutex_unlock(&nlm_host_mutex);
483 nsm_release(nsm);
484} 484}
485 485
486/* 486/*
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index f956651d0f65..fefa4df3f005 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -349,9 +349,9 @@ retry:
349 * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle 349 * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
350 * @info: pointer to NLMPROC_SM_NOTIFY arguments 350 * @info: pointer to NLMPROC_SM_NOTIFY arguments
351 * 351 *
352 * Returns a matching nsm_handle if found in the nsm cache; the returned 352 * Returns a matching nsm_handle if found in the nsm cache. The returned
353 * nsm_handle's reference count is bumped and sm_monitored is cleared. 353 * nsm_handle's reference count is bumped. Otherwise returns NULL if some
354 * Otherwise returns NULL if some error occurred. 354 * error occurred.
355 */ 355 */
356struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info) 356struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
357{ 357{
@@ -370,12 +370,6 @@ struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
370 atomic_inc(&cached->sm_count); 370 atomic_inc(&cached->sm_count);
371 spin_unlock(&nsm_lock); 371 spin_unlock(&nsm_lock);
372 372
373 /*
374 * During subsequent lock activity, force a fresh
375 * notification to be set up for this host.
376 */
377 cached->sm_monitored = 0;
378
379 dprintk("lockd: host %s (%s) rebooted, cnt %d\n", 373 dprintk("lockd: host %s (%s) rebooted, cnt %d\n",
380 cached->sm_name, cached->sm_addrbuf, 374 cached->sm_name, cached->sm_addrbuf,
381 atomic_read(&cached->sm_count)); 375 atomic_read(&cached->sm_count));
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index e50cfa3d9654..7d150517ddf0 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -243,11 +243,9 @@ static int make_socks(struct svc_serv *serv)
243 if (err < 0) 243 if (err < 0)
244 goto out_err; 244 goto out_err;
245 245
246#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
247 err = create_lockd_family(serv, PF_INET6); 246 err = create_lockd_family(serv, PF_INET6);
248 if (err < 0 && err != -EAFNOSUPPORT) 247 if (err < 0 && err != -EAFNOSUPPORT)
249 goto out_err; 248 goto out_err;
250#endif /* CONFIG_IPV6 || CONFIG_IPV6_MODULE */
251 249
252 warned = 0; 250 warned = 0;
253 return 0; 251 return 0;
diff --git a/fs/locks.c b/fs/locks.c
index a8794f233bc9..ab24d49fc048 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1182,8 +1182,9 @@ int __break_lease(struct inode *inode, unsigned int mode)
1182 struct file_lock *fl; 1182 struct file_lock *fl;
1183 unsigned long break_time; 1183 unsigned long break_time;
1184 int i_have_this_lease = 0; 1184 int i_have_this_lease = 0;
1185 int want_write = (mode & O_ACCMODE) != O_RDONLY;
1185 1186
1186 new_fl = lease_alloc(NULL, mode & FMODE_WRITE ? F_WRLCK : F_RDLCK); 1187 new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
1187 1188
1188 lock_kernel(); 1189 lock_kernel();
1189 1190
@@ -1197,7 +1198,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
1197 if (fl->fl_owner == current->files) 1198 if (fl->fl_owner == current->files)
1198 i_have_this_lease = 1; 1199 i_have_this_lease = 1;
1199 1200
1200 if (mode & FMODE_WRITE) { 1201 if (want_write) {
1201 /* If we want write access, we have to revoke any lease. */ 1202 /* If we want write access, we have to revoke any lease. */
1202 future = F_UNLCK | F_INPROGRESS; 1203 future = F_UNLCK | F_INPROGRESS;
1203 } else if (flock->fl_type & F_INPROGRESS) { 1204 } else if (flock->fl_type & F_INPROGRESS) {
@@ -1454,7 +1455,7 @@ EXPORT_SYMBOL(generic_setlease);
1454 * leases held by processes on this node. 1455 * leases held by processes on this node.
1455 * 1456 *
1456 * There is also no break_lease method; filesystems that 1457 * There is also no break_lease method; filesystems that
1457 * handle their own leases shoud break leases themselves from the 1458 * handle their own leases should break leases themselves from the
1458 * filesystem's open, create, and (on truncate) setattr methods. 1459 * filesystem's open, create, and (on truncate) setattr methods.
1459 * 1460 *
1460 * Warning: the only current setlease methods exist only to disable 1461 * Warning: the only current setlease methods exist only to disable
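The __break_lease() fix above hinges on which flag namespace 'mode' lives in. O_RDONLY is zero (and FMODE_WRITE happens to coincide numerically with O_RDWR), so testing open flags against FMODE_WRITE misclassifies opens; only an O_ACCMODE comparison is reliable. In miniature:

#include <fcntl.h>

/* Sketch: deciding write intent from O_* open flags. O_RDONLY == 0,
 * so a bitwise test against it can never fire; mask with O_ACCMODE
 * and compare instead. */
static int wants_write(unsigned int mode)
{
	return (mode & O_ACCMODE) != O_RDONLY;
}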
diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig
new file mode 100644
index 000000000000..daf9a9b32dd3
--- /dev/null
+++ b/fs/logfs/Kconfig
@@ -0,0 +1,17 @@
1config LOGFS
2 tristate "LogFS file system (EXPERIMENTAL)"
3 depends on (MTD || BLOCK) && EXPERIMENTAL
4 select ZLIB_INFLATE
5 select ZLIB_DEFLATE
6 select CRC32
7 select BTREE
8 help
9 Flash filesystem aimed to scale efficiently to large devices.
10 In comparison to JFFS2 it offers significantly faster mount
11 times and potentially less RAM usage, although the latter has
12 not been measured yet.
13
14 In its current state it is still very experimental and should
15 not be used for other than testing purposes.
16
17 If unsure, say N.
diff --git a/fs/logfs/Makefile b/fs/logfs/Makefile
new file mode 100644
index 000000000000..4820027787ee
--- /dev/null
+++ b/fs/logfs/Makefile
@@ -0,0 +1,13 @@
1obj-$(CONFIG_LOGFS) += logfs.o
2
3logfs-y += compr.o
4logfs-y += dir.o
5logfs-y += file.o
6logfs-y += gc.o
7logfs-y += inode.o
8logfs-y += journal.o
9logfs-y += readwrite.o
10logfs-y += segment.o
11logfs-y += super.o
12logfs-$(CONFIG_BLOCK) += dev_bdev.o
13logfs-$(CONFIG_MTD) += dev_mtd.o
diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c
new file mode 100644
index 000000000000..44bbfd249abc
--- /dev/null
+++ b/fs/logfs/compr.c
@@ -0,0 +1,95 @@
1/*
2 * fs/logfs/compr.c - compression routines
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/vmalloc.h>
10#include <linux/zlib.h>
11
12#define COMPR_LEVEL 3
13
14static DEFINE_MUTEX(compr_mutex);
15static struct z_stream_s stream;
16
17int logfs_compress(void *in, void *out, size_t inlen, size_t outlen)
18{
19 int err, ret;
20
21 ret = -EIO;
22 mutex_lock(&compr_mutex);
23 err = zlib_deflateInit(&stream, COMPR_LEVEL);
24 if (err != Z_OK)
25 goto error;
26
27 stream.next_in = in;
28 stream.avail_in = inlen;
29 stream.total_in = 0;
30 stream.next_out = out;
31 stream.avail_out = outlen;
32 stream.total_out = 0;
33
34 err = zlib_deflate(&stream, Z_FINISH);
35 if (err != Z_STREAM_END)
36 goto error;
37
38 err = zlib_deflateEnd(&stream);
39 if (err != Z_OK)
40 goto error;
41
42 if (stream.total_out >= stream.total_in)
43 goto error;
44
45 ret = stream.total_out;
46error:
47 mutex_unlock(&compr_mutex);
48 return ret;
49}
50
51int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen)
52{
53 int err, ret;
54
55 ret = -EIO;
56 mutex_lock(&compr_mutex);
57 err = zlib_inflateInit(&stream);
58 if (err != Z_OK)
59 goto error;
60
61 stream.next_in = in;
62 stream.avail_in = inlen;
63 stream.total_in = 0;
64 stream.next_out = out;
65 stream.avail_out = outlen;
66 stream.total_out = 0;
67
68 err = zlib_inflate(&stream, Z_FINISH);
69 if (err != Z_STREAM_END)
70 goto error;
71
72 err = zlib_inflateEnd(&stream);
73 if (err != Z_OK)
74 goto error;
75
76 ret = 0;
77error:
78 mutex_unlock(&compr_mutex);
79 return ret;
80}
81
82int __init logfs_compr_init(void)
83{
84 size_t size = max(zlib_deflate_workspacesize(),
85 zlib_inflate_workspacesize());
86 stream.workspace = vmalloc(size);
87 if (!stream.workspace)
88 return -ENOMEM;
89 return 0;
90}
91
92void logfs_compr_exit(void)
93{
94 vfree(stream.workspace);
95}
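A usage note on the pair above: logfs_compress() returns the compressed length on success and -EIO both on zlib errors and when the output would not be smaller than the input, so a caller can treat any negative return as "store the block uncompressed". A hedged sketch of such a caller (not from logfs itself):

#include <linux/string.h>

/* Try to compress 'len' bytes from in to out (both at least 'len'
 * bytes); fall back to a raw copy when the data is incompressible.
 * Returns the number of bytes now held in 'out'. */
static int compress_or_raw(void *in, void *out, size_t len)
{
	int ret = logfs_compress(in, out, len, len);

	if (ret < 0) {
		memcpy(out, in, len);
		return len;
	}
	return ret;
}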
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
new file mode 100644
index 000000000000..a5d0c56d3ebc
--- /dev/null
+++ b/fs/logfs/dev_bdev.c
@@ -0,0 +1,332 @@
1/*
2 * fs/logfs/dev_bdev.c - Device access methods for block devices
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/bio.h>
10#include <linux/blkdev.h>
11#include <linux/buffer_head.h>
12
13#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
14
15static void request_complete(struct bio *bio, int err)
16{
17 complete((struct completion *)bio->bi_private);
18}
19
20static int sync_request(struct page *page, struct block_device *bdev, int rw)
21{
22 struct bio bio;
23 struct bio_vec bio_vec;
24 struct completion complete;
25
26 bio_init(&bio);
27 bio.bi_io_vec = &bio_vec;
28 bio_vec.bv_page = page;
29 bio_vec.bv_len = PAGE_SIZE;
30 bio_vec.bv_offset = 0;
31 bio.bi_vcnt = 1;
32 bio.bi_idx = 0;
33 bio.bi_size = PAGE_SIZE;
34 bio.bi_bdev = bdev;
35 bio.bi_sector = page->index * (PAGE_SIZE >> 9);
36 init_completion(&complete);
37 bio.bi_private = &complete;
38 bio.bi_end_io = request_complete;
39
40 submit_bio(rw, &bio);
41 generic_unplug_device(bdev_get_queue(bdev));
42 wait_for_completion(&complete);
43 return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
44}
45
46static int bdev_readpage(void *_sb, struct page *page)
47{
48 struct super_block *sb = _sb;
49 struct block_device *bdev = logfs_super(sb)->s_bdev;
50 int err;
51
52 err = sync_request(page, bdev, READ);
53 if (err) {
54 ClearPageUptodate(page);
55 SetPageError(page);
56 } else {
57 SetPageUptodate(page);
58 ClearPageError(page);
59 }
60 unlock_page(page);
61 return err;
62}
63
64static DECLARE_WAIT_QUEUE_HEAD(wq);
65
66static void writeseg_end_io(struct bio *bio, int err)
67{
68 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
69 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
70 struct super_block *sb = bio->bi_private;
71 struct logfs_super *super = logfs_super(sb);
72 struct page *page;
73
74 BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
75 BUG_ON(err);
76 BUG_ON(bio->bi_vcnt == 0);
77 do {
78 page = bvec->bv_page;
79 if (--bvec >= bio->bi_io_vec)
80 prefetchw(&bvec->bv_page->flags);
81
82 end_page_writeback(page);
83 page_cache_release(page);
84 } while (bvec >= bio->bi_io_vec);
85 bio_put(bio);
86 if (atomic_dec_and_test(&super->s_pending_writes))
87 wake_up(&wq);
88}
89
90static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
91 size_t nr_pages)
92{
93 struct logfs_super *super = logfs_super(sb);
94 struct address_space *mapping = super->s_mapping_inode->i_mapping;
95 struct bio *bio;
96 struct page *page;
97 struct request_queue *q = bdev_get_queue(sb->s_bdev);
98 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
99 int i;
100
101 if (max_pages > BIO_MAX_PAGES)
102 max_pages = BIO_MAX_PAGES;
103 bio = bio_alloc(GFP_NOFS, max_pages);
104 BUG_ON(!bio);
105
106 for (i = 0; i < nr_pages; i++) {
107 if (i >= max_pages) {
108 /* Block layer cannot split bios :( */
109 bio->bi_vcnt = i;
110 bio->bi_idx = 0;
111 bio->bi_size = i * PAGE_SIZE;
112 bio->bi_bdev = super->s_bdev;
113 bio->bi_sector = ofs >> 9;
114 bio->bi_private = sb;
115 bio->bi_end_io = writeseg_end_io;
116 atomic_inc(&super->s_pending_writes);
117 submit_bio(WRITE, bio);
118
119 ofs += i * PAGE_SIZE;
120 index += i;
121 nr_pages -= i;
122 i = 0;
123
124 bio = bio_alloc(GFP_NOFS, max_pages);
125 BUG_ON(!bio);
126 }
127 page = find_lock_page(mapping, index + i);
128 BUG_ON(!page);
129 bio->bi_io_vec[i].bv_page = page;
130 bio->bi_io_vec[i].bv_len = PAGE_SIZE;
131 bio->bi_io_vec[i].bv_offset = 0;
132
133 BUG_ON(PageWriteback(page));
134 set_page_writeback(page);
135 unlock_page(page);
136 }
137 bio->bi_vcnt = nr_pages;
138 bio->bi_idx = 0;
139 bio->bi_size = nr_pages * PAGE_SIZE;
140 bio->bi_bdev = super->s_bdev;
141 bio->bi_sector = ofs >> 9;
142 bio->bi_private = sb;
143 bio->bi_end_io = writeseg_end_io;
144 atomic_inc(&super->s_pending_writes);
145 submit_bio(WRITE, bio);
146 return 0;
147}
148
149static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
150{
151 struct logfs_super *super = logfs_super(sb);
152 int head;
153
154 BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO);
155
156 if (len == 0) {
157 /* This can happen when an object fits perfectly into a
158 * segment: the segment gets written out on sync and subsequently
159 * closed.
160 */
161 return;
162 }
163 head = ofs & (PAGE_SIZE - 1);
164 if (head) {
165 ofs -= head;
166 len += head;
167 }
168 len = PAGE_ALIGN(len);
169 __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
170 generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev));
171}
172
173
174static void erase_end_io(struct bio *bio, int err)
175{
176 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
177 struct super_block *sb = bio->bi_private;
178 struct logfs_super *super = logfs_super(sb);
179
180 BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
181 BUG_ON(err);
182 BUG_ON(bio->bi_vcnt == 0);
183 bio_put(bio);
184 if (atomic_dec_and_test(&super->s_pending_writes))
185 wake_up(&wq);
186}
187
188static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
189 size_t nr_pages)
190{
191 struct logfs_super *super = logfs_super(sb);
192 struct bio *bio;
193 struct request_queue *q = bdev_get_queue(sb->s_bdev);
194 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
195 int i;
196
197 if (max_pages > BIO_MAX_PAGES)
198 max_pages = BIO_MAX_PAGES;
199 bio = bio_alloc(GFP_NOFS, max_pages);
200 BUG_ON(!bio);
201
202 for (i = 0; i < nr_pages; i++) {
203 if (i >= max_pages) {
204 /* Block layer cannot split bios :( */
205 bio->bi_vcnt = i;
206 bio->bi_idx = 0;
207 bio->bi_size = i * PAGE_SIZE;
208 bio->bi_bdev = super->s_bdev;
209 bio->bi_sector = ofs >> 9;
210 bio->bi_private = sb;
211 bio->bi_end_io = erase_end_io;
212 atomic_inc(&super->s_pending_writes);
213 submit_bio(WRITE, bio);
214
215 ofs += i * PAGE_SIZE;
216 index += i;
217 nr_pages -= i;
218 i = 0;
219
220 bio = bio_alloc(GFP_NOFS, max_pages);
221 BUG_ON(!bio);
222 }
223 bio->bi_io_vec[i].bv_page = super->s_erase_page;
224 bio->bi_io_vec[i].bv_len = PAGE_SIZE;
225 bio->bi_io_vec[i].bv_offset = 0;
226 }
227 bio->bi_vcnt = nr_pages;
228 bio->bi_idx = 0;
229 bio->bi_size = nr_pages * PAGE_SIZE;
230 bio->bi_bdev = super->s_bdev;
231 bio->bi_sector = ofs >> 9;
232 bio->bi_private = sb;
233 bio->bi_end_io = erase_end_io;
234 atomic_inc(&super->s_pending_writes);
235 submit_bio(WRITE, bio);
236 return 0;
237}
238
239static int bdev_erase(struct super_block *sb, loff_t to, size_t len,
240 int ensure_write)
241{
242 struct logfs_super *super = logfs_super(sb);
243
244 BUG_ON(to & (PAGE_SIZE - 1));
245 BUG_ON(len & (PAGE_SIZE - 1));
246
247 if (super->s_flags & LOGFS_SB_FLAG_RO)
248 return -EROFS;
249
250 if (ensure_write) {
251 /*
252 * Object store doesn't care whether erases happen or not.
253 * But for the journal they are required. Otherwise a scan
254 * can find an old commit entry and assume it is the current
255 * one, travelling back in time.
256 */
257 do_erase(sb, to, to >> PAGE_SHIFT, len >> PAGE_SHIFT);
258 }
259
260 return 0;
261}
262
263static void bdev_sync(struct super_block *sb)
264{
265 struct logfs_super *super = logfs_super(sb);
266
267 wait_event(wq, atomic_read(&super->s_pending_writes) == 0);
268}
269
270static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs)
271{
272 struct logfs_super *super = logfs_super(sb);
273 struct address_space *mapping = super->s_mapping_inode->i_mapping;
274 filler_t *filler = bdev_readpage;
275
276 *ofs = 0;
277 return read_cache_page(mapping, 0, filler, sb);
278}
279
280static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs)
281{
282 struct logfs_super *super = logfs_super(sb);
283 struct address_space *mapping = super->s_mapping_inode->i_mapping;
284 filler_t *filler = bdev_readpage;
285 u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000;
286 pgoff_t index = pos >> PAGE_SHIFT;
287
288 *ofs = pos;
289 return read_cache_page(mapping, index, filler, sb);
290}
291
292static int bdev_write_sb(struct super_block *sb, struct page *page)
293{
294 struct block_device *bdev = logfs_super(sb)->s_bdev;
295
296 /* Nothing special to do for block devices. */
297 return sync_request(page, bdev, WRITE);
298}
299
300static void bdev_put_device(struct super_block *sb)
301{
302 close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
303}
304
305static const struct logfs_device_ops bd_devops = {
306 .find_first_sb = bdev_find_first_sb,
307 .find_last_sb = bdev_find_last_sb,
308 .write_sb = bdev_write_sb,
309 .readpage = bdev_readpage,
310 .writeseg = bdev_writeseg,
311 .erase = bdev_erase,
312 .sync = bdev_sync,
313 .put_device = bdev_put_device,
314};
315
316int logfs_get_sb_bdev(struct file_system_type *type, int flags,
317 const char *devname, struct vfsmount *mnt)
318{
319 struct block_device *bdev;
320
321 bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type);
322 if (IS_ERR(bdev))
323 return PTR_ERR(bdev);
324
325 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
326 int mtdnr = MINOR(bdev->bd_dev);
327 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
328 return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
329 }
330
331 return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt);
332}
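One detail of bdev_writeseg() above deserves restating: since the bio path works in whole pages, a write starting mid-page is widened backwards to the page boundary and the length rounded up. Isolated, the arithmetic is:

/* Widen an (ofs, len) range to whole pages, as bdev_writeseg() does
 * before calling __bdev_writeseg(). */
static void align_to_pages(u64 *ofs, size_t *len)
{
	int head = *ofs & (PAGE_SIZE - 1);

	if (head) {
		*ofs -= head;	/* back up to the page start */
		*len += head;	/* and keep covering the same bytes */
	}
	*len = PAGE_ALIGN(*len);
}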
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
new file mode 100644
index 000000000000..cafb6ef2e05b
--- /dev/null
+++ b/fs/logfs/dev_mtd.c
@@ -0,0 +1,254 @@
1/*
2 * fs/logfs/dev_mtd.c - Device access methods for MTD
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/completion.h>
10#include <linux/mount.h>
11#include <linux/sched.h>
12
13#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
14
15static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf)
16{
17 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
18 size_t retlen;
19 int ret;
20
21 ret = mtd->read(mtd, ofs, len, &retlen, buf);
22 BUG_ON(ret == -EINVAL);
23 if (ret)
24 return ret;
25
26 /* Not sure if we should loop instead. */
27 if (retlen != len)
28 return -EIO;
29
30 return 0;
31}
32
33static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
34{
35 struct logfs_super *super = logfs_super(sb);
36 struct mtd_info *mtd = super->s_mtd;
37 size_t retlen;
38 loff_t page_start, page_end;
39 int ret;
40
41 if (super->s_flags & LOGFS_SB_FLAG_RO)
42 return -EROFS;
43
44 BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs));
45 BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift);
46 BUG_ON(len > PAGE_CACHE_SIZE);
47 page_start = ofs & PAGE_CACHE_MASK;
48 page_end = PAGE_CACHE_ALIGN(ofs + len) - 1;
49 ret = mtd->write(mtd, ofs, len, &retlen, buf);
50 if (ret || (retlen != len))
51 return -EIO;
52
53 return 0;
54}
55
56/*
57 * For as long as I can remember (since about 2001) mtd->erase has been an
58 * asynchronous interface, though no driver has yet actually used the
59 * asynchronous properties. So just to prevent the first implementor of such
60 * a thing from breaking logfs in 2350, we do the usual pointless dance to
61 * declare a completion variable and wait for completion before returning
62 * from mtd_erase(). What an exercise in futility!
63 */
64static void logfs_erase_callback(struct erase_info *ei)
65{
66 complete((struct completion *)ei->priv);
67}
68
69static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len)
70{
71 struct logfs_super *super = logfs_super(sb);
72 struct address_space *mapping = super->s_mapping_inode->i_mapping;
73 struct page *page;
74 pgoff_t index = ofs >> PAGE_SHIFT;
75
76 for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) {
77 page = find_get_page(mapping, index);
78 if (!page)
79 continue;
80 memset(page_address(page), 0xFF, PAGE_SIZE);
81 page_cache_release(page);
82 }
83 return 0;
84}
85
86static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len,
87 int ensure_write)
88{
89 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
90 struct erase_info ei;
91 DECLARE_COMPLETION_ONSTACK(complete);
92 int ret;
93
94 BUG_ON(len % mtd->erasesize);
95 if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO)
96 return -EROFS;
97
98 memset(&ei, 0, sizeof(ei));
99 ei.mtd = mtd;
100 ei.addr = ofs;
101 ei.len = len;
102 ei.callback = logfs_erase_callback;
103 ei.priv = (long)&complete;
104 ret = mtd->erase(mtd, &ei);
105 if (ret)
106 return -EIO;
107
108 wait_for_completion(&complete);
109 if (ei.state != MTD_ERASE_DONE)
110 return -EIO;
111 return mtd_erase_mapping(sb, ofs, len);
112}
113
114static void mtd_sync(struct super_block *sb)
115{
116 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
117
118 if (mtd->sync)
119 mtd->sync(mtd);
120}
121
122static int mtd_readpage(void *_sb, struct page *page)
123{
124 struct super_block *sb = _sb;
125 int err;
126
127 err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
128 page_address(page));
129 if (err == -EUCLEAN) {
130 err = 0;
131 /* FIXME: force GC this segment */
132 }
133 if (err) {
134 ClearPageUptodate(page);
135 SetPageError(page);
136 } else {
137 SetPageUptodate(page);
138 ClearPageError(page);
139 }
140 unlock_page(page);
141 return err;
142}
143
144static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs)
145{
146 struct logfs_super *super = logfs_super(sb);
147 struct address_space *mapping = super->s_mapping_inode->i_mapping;
148 filler_t *filler = mtd_readpage;
149 struct mtd_info *mtd = super->s_mtd;
150
151 if (!mtd->block_isbad)
152 return NULL;
153
154 *ofs = 0;
155 while (mtd->block_isbad(mtd, *ofs)) {
156 *ofs += mtd->erasesize;
157 if (*ofs >= mtd->size)
158 return NULL;
159 }
160 BUG_ON(*ofs & ~PAGE_MASK);
161 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
162}
163
164static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs)
165{
166 struct logfs_super *super = logfs_super(sb);
167 struct address_space *mapping = super->s_mapping_inode->i_mapping;
168 filler_t *filler = mtd_readpage;
169 struct mtd_info *mtd = super->s_mtd;
170
171 if (!mtd->block_isbad)
172 return NULL;
173
174 *ofs = mtd->size - mtd->erasesize;
175 while (mtd->block_isbad(mtd, *ofs)) {
176 *ofs -= mtd->erasesize;
177 if (*ofs <= 0)
178 return NULL;
179 }
180 *ofs = *ofs + mtd->erasesize - 0x1000;
181 BUG_ON(*ofs & ~PAGE_MASK);
182 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
183}
184
185static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
186 size_t nr_pages)
187{
188 struct logfs_super *super = logfs_super(sb);
189 struct address_space *mapping = super->s_mapping_inode->i_mapping;
190 struct page *page;
191 int i, err;
192
193 for (i = 0; i < nr_pages; i++) {
194 page = find_lock_page(mapping, index + i);
195 BUG_ON(!page);
196
197 err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
198 page_address(page));
199 unlock_page(page);
200 page_cache_release(page);
201 if (err)
202 return err;
203 }
204 return 0;
205}
206
207static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
208{
209 struct logfs_super *super = logfs_super(sb);
210 int head;
211
212 if (super->s_flags & LOGFS_SB_FLAG_RO)
213 return;
214
215 if (len == 0) {
216 /* This can happen when the object fit perfectly into a
217 * segment, the segment gets written per sync and subsequently
218 * closed.
219 */
220 return;
221 }
222 head = ofs & (PAGE_SIZE - 1);
223 if (head) {
224 ofs -= head;
225 len += head;
226 }
227 len = PAGE_ALIGN(len);
228 __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
229}
230
231static void mtd_put_device(struct super_block *sb)
232{
233 put_mtd_device(logfs_super(sb)->s_mtd);
234}
235
236static const struct logfs_device_ops mtd_devops = {
237 .find_first_sb = mtd_find_first_sb,
238 .find_last_sb = mtd_find_last_sb,
239 .readpage = mtd_readpage,
240 .writeseg = mtd_writeseg,
241 .erase = mtd_erase,
242 .sync = mtd_sync,
243 .put_device = mtd_put_device,
244};
245
246int logfs_get_sb_mtd(struct file_system_type *type, int flags,
247 int mtdnr, struct vfsmount *mnt)
248{
249 struct mtd_info *mtd;
250 const struct logfs_device_ops *devops = &mtd_devops;
251
252 mtd = get_mtd_device(NULL, mtdnr);
253 return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
254}
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
new file mode 100644
index 000000000000..c76b4b5c7ff6
--- /dev/null
+++ b/fs/logfs/dir.c
@@ -0,0 +1,827 @@
1/*
2 * fs/logfs/dir.c - directory-related code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9
10
11/*
12 * Atomic dir operations
13 *
14 * Directory operations are by default not atomic. Dentries and Inodes are
15 * created/removed/altered in separate operations. Therefore we need to do
16 * a small amount of journaling.
17 *
18 * Create, link, mkdir, mknod and symlink all share the same function to do
19 * the work: __logfs_create. This function works in two atomic steps:
20 * 1. allocate inode (remember in journal)
21 * 2. allocate dentry (clear journal)
22 *
23 * As we can only get interrupted between the two, the inode we just
24 * created is simply stored in the anchor. On next mount, if we were
25 * interrupted, we delete the inode. From a user's point of view the
26 * operation never happened.
27 *
28 * Unlink and rmdir also share the same function: unlink. Again, this
29 * function works in two atomic steps
30 * 1. remove dentry (remember inode in journal)
31 * 2. unlink inode (clear journal)
32 *
33 * And again, on the next mount, if we were interrupted, we delete the inode.
34 * From a user's point of view the operation succeeded.
35 *
36 * Rename is the real pain to deal with, harder than all the other methods
37 * combined. Depending on the circumstances we can run into three cases.
38 * A "target rename" where the target dentry already existed, a "local
39 * rename" where both parent directories are identical or a "cross-directory
40 * rename" in the remaining case.
41 *
42 * Local rename is atomic, as the old dentry is simply rewritten with a new
43 * name.
44 *
45 * Cross-directory rename works in two steps, similar to __logfs_create and
46 * logfs_unlink:
47 * 1. Write new dentry (remember old dentry in journal)
48 * 2. Remove old dentry (clear journal)
49 *
50 * Here we remember a dentry instead of an inode. On next mount, if we were
51 * interrupted, we delete the dentry. From a user's point of view, the
52 * operation succeeded.
53 *
54 * Target rename works in three atomic steps:
55 * 1. Attach old inode to new dentry (remember old dentry and new inode)
56 * 2. Remove old dentry (still remember the new inode)
57 * 3. Remove victim inode
58 *
59 * Here we remember both an inode and a dentry. If we get interrupted
60 * between steps 1 and 2, we delete both the dentry and the inode. If
61 * we get interrupted between steps 2 and 3, we delete just the inode.
62 * In either case, the remaining objects are deleted on next mount. From
63 * a user's point of view, the operation succeeded.
64 */
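/*
 * Summary (illustrative, merely restating the rules above): what the
 * journal remembers while an operation is in flight, and what gets
 * deleted on the next mount if we died at that point.
 *
 *	create/link/mkdir/...	inode remembered   -> delete inode (undone)
 *	unlink/rmdir		inode remembered   -> delete inode (completed)
 *	cross-directory rename	old dentry	   -> delete dentry (completed)
 *	target rename		dentry and inode   -> delete both, or only the
 *						      inode once step 2 is done
 */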
65
66static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd,
67 loff_t pos)
68{
69 return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL);
70}
71
72static int write_inode(struct inode *inode)
73{
74 return __logfs_write_inode(inode, WF_LOCK);
75}
76
77static s64 dir_seek_data(struct inode *inode, s64 pos)
78{
79 s64 new_pos = logfs_seek_data(inode, pos);
80
81 return max(pos, new_pos - 1);
82}
83
84static int beyond_eof(struct inode *inode, loff_t bix)
85{
86 loff_t pos = bix << inode->i_sb->s_blocksize_bits;
87 return pos >= i_size_read(inode);
88}
89
90/*
91 * Prime value was chosen to be roughly 256 + 26. r5 hash uses 11,
92 * so short names (len <= 9) don't even occupy the complete 32bit name
93 * space. A prime >256 ensures short names quickly spread the 32bit
94 * name space. Add about 26 for the estimated amount of information
95 * of each character and pick a prime nearby, preferably a bit-sparse
96 * one.
97 */
98static u32 hash_32(const char *s, int len, u32 seed)
99{
100 u32 hash = seed;
101 int i;
102
103 for (i = 0; i < len; i++)
104 hash = hash * 293 + s[i];
105 return hash;
106}
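/*
 * Worked example (illustrative): hash_32("ab", 2, 0)
 *	= (0 * 293 + 'a') * 293 + 'b'
 *	= 97 * 293 + 98
 *	= 28519
 * Each extra character multiplies the running hash by 293, so even names
 * only a few characters long quickly spread across the 32bit name space.
 */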
107
108/*
109 * We have to satisfy several conflicting requirements here. Small
110 * directories should stay fairly compact and not require too many
111 * indirect blocks. The number of possible locations for a given hash
112 * should be small to make lookup() fast. And we should try hard not
113 * to overflow the 32bit name space or nfs and 32bit host systems will
114 * be unhappy.
115 *
116 * So we use the following scheme. First we reduce the hash to 0..15
117 * and try a direct block. If that is occupied we reduce the hash to
118 * 16..255 and try an indirect block. Same for 2x and 3x indirect
119 * blocks. Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff,
120 * but use buckets containing 16 entries instead of a single one.
121 *
122 * Using 16 entries should allow for a reasonable amount of hash
123 * collisions, so the 32bit name space can be packed fairly tight
124 * before overflowing. Oh and currently we don't overflow but return
125 * an error.
126 *
127 * How likely are collisions? Doing the appropriate math is beyond me
128 * and the Bronstein textbook. But running a test program to brute
129 * force collisions for a couple of days showed that on average the
130 * first collision occurs after 598M entries, with 290M being the
131 * smallest result. Obviously 21 entries could already cause a
132 * collision if all entries are carefully chosen.
133 */
134static pgoff_t hash_index(u32 hash, int round)
135{
136 u32 i0_blocks = I0_BLOCKS;
137 u32 i1_blocks = I1_BLOCKS;
138 u32 i2_blocks = I2_BLOCKS;
139 u32 i3_blocks = I3_BLOCKS;
140
141 switch (round) {
142 case 0:
143 return hash % i0_blocks;
144 case 1:
145 return i0_blocks + hash % (i1_blocks - i0_blocks);
146 case 2:
147 return i1_blocks + hash % (i2_blocks - i1_blocks);
148 case 3:
149 return i2_blocks + hash % (i3_blocks - i2_blocks);
150 case 4 ... 19:
151 return i3_blocks + 16 * (hash % (((1<<31) - i3_blocks) / 16))
152 + round - 4;
153 }
154 BUG();
155}
156
157static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry)
158{
159 struct qstr *name = &dentry->d_name;
160 struct page *page;
161 struct logfs_disk_dentry *dd;
162 u32 hash = hash_32(name->name, name->len, 0);
163 pgoff_t index;
164 int round;
165
166 if (name->len > LOGFS_MAX_NAMELEN)
167 return ERR_PTR(-ENAMETOOLONG);
168
169 for (round = 0; round < 20; round++) {
170 index = hash_index(hash, round);
171
172 if (beyond_eof(dir, index))
173 return NULL;
174 if (!logfs_exist_block(dir, index))
175 continue;
176 page = read_cache_page(dir->i_mapping, index,
177 (filler_t *)logfs_readpage, NULL);
178 if (IS_ERR(page))
179 return page;
180 dd = kmap_atomic(page, KM_USER0);
181 BUG_ON(dd->namelen == 0);
182
183 if (name->len != be16_to_cpu(dd->namelen) ||
184 memcmp(name->name, dd->name, name->len)) {
185 kunmap_atomic(dd, KM_USER0);
186 page_cache_release(page);
187 continue;
188 }
189
190 kunmap_atomic(dd, KM_USER0);
191 return page;
192 }
193 return NULL;
194}
195
196static int logfs_remove_inode(struct inode *inode)
197{
198 int ret;
199
200 inode->i_nlink--;
201 ret = write_inode(inode);
202 LOGFS_BUG_ON(ret, inode->i_sb);
203 return ret;
204}
205
206static void abort_transaction(struct inode *inode, struct logfs_transaction *ta)
207{
208 if (logfs_inode(inode)->li_block)
209 logfs_inode(inode)->li_block->ta = NULL;
210 kfree(ta);
211}
212
213static int logfs_unlink(struct inode *dir, struct dentry *dentry)
214{
215 struct logfs_super *super = logfs_super(dir->i_sb);
216 struct inode *inode = dentry->d_inode;
217 struct logfs_transaction *ta;
218 struct page *page;
219 pgoff_t index;
220 int ret;
221
222 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
223 if (!ta)
224 return -ENOMEM;
225
226 ta->state = UNLINK_1;
227 ta->ino = inode->i_ino;
228
229 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
230
231 page = logfs_get_dd_page(dir, dentry);
232 if (!page) {
233 kfree(ta);
234 return -ENOENT;
235 }
236 if (IS_ERR(page)) {
237 kfree(ta);
238 return PTR_ERR(page);
239 }
240 index = page->index;
241 page_cache_release(page);
242
243 mutex_lock(&super->s_dirop_mutex);
244 logfs_add_transaction(dir, ta);
245
246 ret = logfs_delete(dir, index, NULL);
247 if (!ret)
248 ret = write_inode(dir);
249
250 if (ret) {
251 abort_transaction(dir, ta);
252 printk(KERN_ERR"LOGFS: unable to delete inode\n");
253 goto out;
254 }
255
256 ta->state = UNLINK_2;
257 logfs_add_transaction(inode, ta);
258 ret = logfs_remove_inode(inode);
259out:
260 mutex_unlock(&super->s_dirop_mutex);
261 return ret;
262}
263
264static inline int logfs_empty_dir(struct inode *dir)
265{
266 u64 data;
267
268 data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits;
269 return data >= i_size_read(dir);
270}
271
272static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
273{
274 struct inode *inode = dentry->d_inode;
275
276 if (!logfs_empty_dir(inode))
277 return -ENOTEMPTY;
278
279 return logfs_unlink(dir, dentry);
280}
281
282/* FIXME: readdir currently has its own dir_walk code. I don't see a good
283 * way to combine the two copies */
284#define IMPLICIT_NODES 2
285static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
286{
287 struct inode *dir = file->f_dentry->d_inode;
288 loff_t pos = file->f_pos - IMPLICIT_NODES;
289 struct page *page;
290 struct logfs_disk_dentry *dd;
291 int full;
292
293 BUG_ON(pos < 0);
294 for (;; pos++) {
295 if (beyond_eof(dir, pos))
296 break;
297 if (!logfs_exist_block(dir, pos)) {
298 /* deleted dentry */
299 pos = dir_seek_data(dir, pos);
300 continue;
301 }
302 page = read_cache_page(dir->i_mapping, pos,
303 (filler_t *)logfs_readpage, NULL);
304 if (IS_ERR(page))
305 return PTR_ERR(page);
306 dd = kmap(page);
307 BUG_ON(dd->namelen == 0);
308
309 full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
310 pos, be64_to_cpu(dd->ino), dd->type);
311 kunmap(page);
312 page_cache_release(page);
313 if (full)
314 break;
315 }
316
317 file->f_pos = pos + IMPLICIT_NODES;
318 return 0;
319}
320
321static int logfs_readdir(struct file *file, void *buf, filldir_t filldir)
322{
323 struct inode *inode = file->f_dentry->d_inode;
324 ino_t pino = parent_ino(file->f_dentry);
325 int err;
326
327 if (file->f_pos < 0)
328 return -EINVAL;
329
330 if (file->f_pos == 0) {
331 if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0)
332 return 0;
333 file->f_pos++;
334 }
335 if (file->f_pos == 1) {
336 if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0)
337 return 0;
338 file->f_pos++;
339 }
340
341 err = __logfs_readdir(file, buf, filldir);
342 return err;
343}
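/*
 * Position mapping, illustrative: f_pos 0 and 1 are the implicit "." and
 * ".." entries, so a dentry stored in directory block N is reported at
 * f_pos N + IMPLICIT_NODES, and __logfs_readdir() subtracts the offset
 * again on entry.
 */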
344
345static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name)
346{
347 dd->namelen = cpu_to_be16(name->len);
348 memcpy(dd->name, name->name, name->len);
349}
350
351static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry,
352 struct nameidata *nd)
353{
354 struct page *page;
355 struct logfs_disk_dentry *dd;
356 pgoff_t index;
357 u64 ino = 0;
358 struct inode *inode;
359
360 page = logfs_get_dd_page(dir, dentry);
361 if (IS_ERR(page))
362 return ERR_CAST(page);
363 if (!page) {
364 d_add(dentry, NULL);
365 return NULL;
366 }
367 index = page->index;
368 dd = kmap_atomic(page, KM_USER0);
369 ino = be64_to_cpu(dd->ino);
370 kunmap_atomic(dd, KM_USER0);
371 page_cache_release(page);
372
373 inode = logfs_iget(dir->i_sb, ino);
374 if (IS_ERR(inode)) {
375 printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)\n",
376 ino, dir->i_ino, index);
377 return ERR_CAST(inode);
378 }
379 return d_splice_alias(inode, dentry);
380}
381
382static void grow_dir(struct inode *dir, loff_t index)
383{
384 index = (index + 1) << dir->i_sb->s_blocksize_bits;
385 if (i_size_read(dir) < index)
386 i_size_write(dir, index);
387}
388
389static int logfs_write_dir(struct inode *dir, struct dentry *dentry,
390 struct inode *inode)
391{
392 struct page *page;
393 struct logfs_disk_dentry *dd;
394 u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0);
395 pgoff_t index;
396 int round, err;
397
398 for (round = 0; round < 20; round++) {
399 index = hash_index(hash, round);
400
401 if (logfs_exist_block(dir, index))
402 continue;
403 page = find_or_create_page(dir->i_mapping, index, GFP_KERNEL);
404 if (!page)
405 return -ENOMEM;
406
407 dd = kmap_atomic(page, KM_USER0);
408 memset(dd, 0, sizeof(*dd));
409 dd->ino = cpu_to_be64(inode->i_ino);
410 dd->type = logfs_type(inode);
411 logfs_set_name(dd, &dentry->d_name);
412 kunmap_atomic(dd, KM_USER0);
413
414 err = logfs_write_buf(dir, page, WF_LOCK);
415 unlock_page(page);
416 page_cache_release(page);
417 if (!err)
418 grow_dir(dir, index);
419 return err;
420 }
421 /* FIXME: Is there a better return value? In most cases neither
422 * the filesystem nor the directory are full. But we have had
423 * too many collisions for this particular hash and no fallback.
424 */
425 return -ENOSPC;
426}
427
428static int __logfs_create(struct inode *dir, struct dentry *dentry,
429 struct inode *inode, const char *dest, long destlen)
430{
431 struct logfs_super *super = logfs_super(dir->i_sb);
432 struct logfs_inode *li = logfs_inode(inode);
433 struct logfs_transaction *ta;
434 int ret;
435
436 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
437 if (!ta)
438 return -ENOMEM;
439
440 ta->state = CREATE_1;
441 ta->ino = inode->i_ino;
442 mutex_lock(&super->s_dirop_mutex);
443 logfs_add_transaction(inode, ta);
444
445 if (dest) {
446 /* symlink */
447 ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL);
448 if (!ret)
449 ret = write_inode(inode);
450 } else {
451 /* creat/mkdir/mknod */
452 ret = write_inode(inode);
453 }
454 if (ret) {
455 abort_transaction(inode, ta);
456 li->li_flags |= LOGFS_IF_STILLBORN;
457 /* FIXME: truncate symlink */
458 inode->i_nlink--;
459 iput(inode);
460 goto out;
461 }
462
463 ta->state = CREATE_2;
464 logfs_add_transaction(dir, ta);
465 ret = logfs_write_dir(dir, dentry, inode);
466 /* sync directory */
467 if (!ret)
468 ret = write_inode(dir);
469
470 if (ret) {
471 logfs_del_transaction(dir, ta);
472 ta->state = CREATE_2;
473 logfs_add_transaction(inode, ta);
474 logfs_remove_inode(inode);
475 iput(inode);
476 goto out;
477 }
478 d_instantiate(dentry, inode);
479out:
480 mutex_unlock(&super->s_dirop_mutex);
481 return ret;
482}
483
484static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
485{
486 struct inode *inode;
487
488 /*
489 * FIXME: why do we have to fill in S_IFDIR, while the mode is
490 * correct for mknod, creat, etc.? Smells like the vfs *should*
491 * do it for us but for some reason fails to do so.
492 */
493 inode = logfs_new_inode(dir, S_IFDIR | mode);
494 if (IS_ERR(inode))
495 return PTR_ERR(inode);
496
497 inode->i_op = &logfs_dir_iops;
498 inode->i_fop = &logfs_dir_fops;
499
500 return __logfs_create(dir, dentry, inode, NULL, 0);
501}
502
503static int logfs_create(struct inode *dir, struct dentry *dentry, int mode,
504 struct nameidata *nd)
505{
506 struct inode *inode;
507
508 inode = logfs_new_inode(dir, mode);
509 if (IS_ERR(inode))
510 return PTR_ERR(inode);
511
512 inode->i_op = &logfs_reg_iops;
513 inode->i_fop = &logfs_reg_fops;
514 inode->i_mapping->a_ops = &logfs_reg_aops;
515
516 return __logfs_create(dir, dentry, inode, NULL, 0);
517}
518
519static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
520 dev_t rdev)
521{
522 struct inode *inode;
523
524 if (dentry->d_name.len > LOGFS_MAX_NAMELEN)
525 return -ENAMETOOLONG;
526
527 inode = logfs_new_inode(dir, mode);
528 if (IS_ERR(inode))
529 return PTR_ERR(inode);
530
531 init_special_inode(inode, mode, rdev);
532
533 return __logfs_create(dir, dentry, inode, NULL, 0);
534}
535
536static int logfs_symlink(struct inode *dir, struct dentry *dentry,
537 const char *target)
538{
539 struct inode *inode;
540 size_t destlen = strlen(target) + 1;
541
542 if (destlen > dir->i_sb->s_blocksize)
543 return -ENAMETOOLONG;
544
545 inode = logfs_new_inode(dir, S_IFLNK | 0777);
546 if (IS_ERR(inode))
547 return PTR_ERR(inode);
548
549 inode->i_op = &logfs_symlink_iops;
550 inode->i_mapping->a_ops = &logfs_reg_aops;
551
552 return __logfs_create(dir, dentry, inode, target, destlen);
553}
554
555static int logfs_permission(struct inode *inode, int mask)
556{
557 return generic_permission(inode, mask, NULL);
558}
559
560static int logfs_link(struct dentry *old_dentry, struct inode *dir,
561 struct dentry *dentry)
562{
563 struct inode *inode = old_dentry->d_inode;
564
565 if (inode->i_nlink >= LOGFS_LINK_MAX)
566 return -EMLINK;
567
568 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
569 atomic_inc(&inode->i_count);
570 inode->i_nlink++;
571 mark_inode_dirty_sync(inode);
572
573 return __logfs_create(dir, dentry, inode, NULL, 0);
574}
575
576static int logfs_get_dd(struct inode *dir, struct dentry *dentry,
577 struct logfs_disk_dentry *dd, loff_t *pos)
578{
579 struct page *page;
580 void *map;
581
582 page = logfs_get_dd_page(dir, dentry);
583 if (IS_ERR(page))
584 return PTR_ERR(page);
585 *pos = page->index;
586 map = kmap_atomic(page, KM_USER0);
587 memcpy(dd, map, sizeof(*dd));
588 kunmap_atomic(map, KM_USER0);
589 page_cache_release(page);
590 return 0;
591}
592
593static int logfs_delete_dd(struct inode *dir, loff_t pos)
594{
595 /*
596 * Getting called with pos somewhere beyond eof is either a goofup
597 * within this file or means someone maliciously edited the
598 * (crc-protected) journal.
599 */
600 BUG_ON(beyond_eof(dir, pos));
601 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
602 log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos);
603 return logfs_delete(dir, pos, NULL);
604}
605
606/*
607 * Cross-directory rename, target does not exist. Just a little nasty.
608 * Create a new dentry in the target dir, then remove the old dentry,
609 * all the while taking care to remember our operation in the journal.
610 */
611static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
612 struct inode *new_dir, struct dentry *new_dentry)
613{
614 struct logfs_super *super = logfs_super(old_dir->i_sb);
615 struct logfs_disk_dentry dd;
616 struct logfs_transaction *ta;
617 loff_t pos;
618 int err;
619
620 /* 1. locate source dd */
621 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
622 if (err)
623 return err;
624
625 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
626 if (!ta)
627 return -ENOMEM;
628
629 ta->state = CROSS_RENAME_1;
630 ta->dir = old_dir->i_ino;
631 ta->pos = pos;
632
633 /* 2. write target dd */
634 mutex_lock(&super->s_dirop_mutex);
635 logfs_add_transaction(new_dir, ta);
636 err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode);
637 if (!err)
638 err = write_inode(new_dir);
639
640 if (err) {
641 super->s_rename_dir = 0;
642 super->s_rename_pos = 0;
643 abort_transaction(new_dir, ta);
644 goto out;
645 }
646
647 /* 3. remove source dd */
648 ta->state = CROSS_RENAME_2;
649 logfs_add_transaction(old_dir, ta);
650 err = logfs_delete_dd(old_dir, pos);
651 if (!err)
652 err = write_inode(old_dir);
653 LOGFS_BUG_ON(err, old_dir->i_sb);
654out:
655 mutex_unlock(&super->s_dirop_mutex);
656 return err;
657}
658
659static int logfs_replace_inode(struct inode *dir, struct dentry *dentry,
660 struct logfs_disk_dentry *dd, struct inode *inode)
661{
662 loff_t pos;
663 int err;
664
665 err = logfs_get_dd(dir, dentry, dd, &pos);
666 if (err)
667 return err;
668 dd->ino = cpu_to_be64(inode->i_ino);
669 dd->type = logfs_type(inode);
670
671 err = write_dir(dir, dd, pos);
672 if (err)
673 return err;
674 log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos,
675 dd->name, be64_to_cpu(dd->ino));
676 return write_inode(dir);
677}
678
679/* Target dentry exists - the worst case. We need to attach the source
680 * inode to the target dentry, then remove the orphaned target inode and
681 * source dentry.
682 */
683static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry,
684 struct inode *new_dir, struct dentry *new_dentry)
685{
686 struct logfs_super *super = logfs_super(old_dir->i_sb);
687 struct inode *old_inode = old_dentry->d_inode;
688 struct inode *new_inode = new_dentry->d_inode;
689 int isdir = S_ISDIR(old_inode->i_mode);
690 struct logfs_disk_dentry dd;
691 struct logfs_transaction *ta;
692 loff_t pos;
693 int err;
694
695 BUG_ON(isdir != S_ISDIR(new_inode->i_mode));
696 if (isdir) {
697 if (!logfs_empty_dir(new_inode))
698 return -ENOTEMPTY;
699 }
700
701 /* 1. locate source dd */
702 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
703 if (err)
704 return err;
705
706 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
707 if (!ta)
708 return -ENOMEM;
709
710 ta->state = TARGET_RENAME_1;
711 ta->dir = old_dir->i_ino;
712 ta->pos = pos;
713 ta->ino = new_inode->i_ino;
714
715 /* 2. attach source inode to target dd */
716 mutex_lock(&super->s_dirop_mutex);
717 logfs_add_transaction(new_dir, ta);
718 err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode);
719 if (err) {
720 super->s_rename_dir = 0;
721 super->s_rename_pos = 0;
722 super->s_victim_ino = 0;
723 abort_transaction(new_dir, ta);
724 goto out;
725 }
726
727 /* 3. remove source dd */
728 ta->state = TARGET_RENAME_2;
729 logfs_add_transaction(old_dir, ta);
730 err = logfs_delete_dd(old_dir, pos);
731 if (!err)
732 err = write_inode(old_dir);
733 LOGFS_BUG_ON(err, old_dir->i_sb);
734
735 /* 4. remove target inode */
736 ta->state = TARGET_RENAME_3;
737 logfs_add_transaction(new_inode, ta);
738 err = logfs_remove_inode(new_inode);
739
740out:
741 mutex_unlock(&super->s_dirop_mutex);
742 return err;
743}
744
745static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry,
746 struct inode *new_dir, struct dentry *new_dentry)
747{
748 if (new_dentry->d_inode)
749 return logfs_rename_target(old_dir, old_dentry,
750 new_dir, new_dentry);
751 return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry);
752}
753
754/* No locking done here, as this is called before .get_sb() returns. */
755int logfs_replay_journal(struct super_block *sb)
756{
757 struct logfs_super *super = logfs_super(sb);
758 struct inode *inode;
759 u64 ino, pos;
760 int err;
761
762 if (super->s_victim_ino) {
763 /* delete victim inode */
764 ino = super->s_victim_ino;
765 printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino);
766 inode = logfs_iget(sb, ino);
767 if (IS_ERR(inode))
768 goto fail;
769
770 LOGFS_BUG_ON(i_size_read(inode) > 0, sb);
771 super->s_victim_ino = 0;
772 err = logfs_remove_inode(inode);
773 iput(inode);
774 if (err) {
775 super->s_victim_ino = ino;
776 goto fail;
777 }
778 }
779 if (super->s_rename_dir) {
780 /* delete old dd from rename */
781 ino = super->s_rename_dir;
782 pos = super->s_rename_pos;
783 printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n",
784 ino, pos);
785 inode = logfs_iget(sb, ino);
786 if (IS_ERR(inode))
787 goto fail;
788
789 super->s_rename_dir = 0;
790 super->s_rename_pos = 0;
791 err = logfs_delete_dd(inode, pos);
792 iput(inode);
793 if (err) {
794 super->s_rename_dir = ino;
795 super->s_rename_pos = pos;
796 goto fail;
797 }
798 }
799 return 0;
800fail:
801 LOGFS_BUG(sb);
802 return -EIO;
803}
804
805const struct inode_operations logfs_symlink_iops = {
806 .readlink = generic_readlink,
807 .follow_link = page_follow_link_light,
808};
809
810const struct inode_operations logfs_dir_iops = {
811 .create = logfs_create,
812 .link = logfs_link,
813 .lookup = logfs_lookup,
814 .mkdir = logfs_mkdir,
815 .mknod = logfs_mknod,
816 .rename = logfs_rename,
817 .rmdir = logfs_rmdir,
818 .permission = logfs_permission,
819 .symlink = logfs_symlink,
820 .unlink = logfs_unlink,
821};
822const struct file_operations logfs_dir_fops = {
823 .fsync = logfs_fsync,
824 .ioctl = logfs_ioctl,
825 .readdir = logfs_readdir,
826 .read = generic_read_dir,
827};
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
new file mode 100644
index 000000000000..370f367a933e
--- /dev/null
+++ b/fs/logfs/file.c
@@ -0,0 +1,263 @@
1/*
2 * fs/logfs/file.c - prepare_write, commit_write and friends
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/sched.h>
10#include <linux/writeback.h>
11
12static int logfs_write_begin(struct file *file, struct address_space *mapping,
13 loff_t pos, unsigned len, unsigned flags,
14 struct page **pagep, void **fsdata)
15{
16 struct inode *inode = mapping->host;
17 struct page *page;
18 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
19
20 page = grab_cache_page_write_begin(mapping, index, flags);
21 if (!page)
22 return -ENOMEM;
23 *pagep = page;
24
25 if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
26 return 0;
27 if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
28 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
29 unsigned end = start + len;
30
31 /* Reading beyond i_size is simple: memset to zero */
32 zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
33 return 0;
34 }
35 return logfs_readpage_nolock(page);
36}
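/*
 * Worked example (illustrative): a write of len 50 at pos 100 into a page
 * that lies entirely beyond i_size gives start == 100 and end == 150, so
 * zero_user_segments() clears [0,100) and [150,PAGE_CACHE_SIZE) and only
 * the middle 50 bytes need to come from the caller.
 */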
37
38static int logfs_write_end(struct file *file, struct address_space *mapping,
39 loff_t pos, unsigned len, unsigned copied, struct page *page,
40 void *fsdata)
41{
42 struct inode *inode = mapping->host;
43 pgoff_t index = page->index;
44 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
45 unsigned end = start + copied;
46 int ret = 0;
47
48 BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize);
49 BUG_ON(page->index > I3_BLOCKS);
50
51 if (copied < len) {
52 /*
53 * Short write of a non-initialized page. Just tell userspace
54 * to retry the entire page.
55 */
56 if (!PageUptodate(page)) {
57 copied = 0;
58 goto out;
59 }
60 }
61 if (copied == 0)
62 goto out; /* FIXME: do we need to update inode? */
63
64 if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) {
65 i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end);
66 mark_inode_dirty_sync(inode);
67 }
68
69 SetPageUptodate(page);
70 if (!PageDirty(page)) {
71 if (!get_page_reserve(inode, page))
72 __set_page_dirty_nobuffers(page);
73 else
74 ret = logfs_write_buf(inode, page, WF_LOCK);
75 }
76out:
77 unlock_page(page);
78 page_cache_release(page);
79 return ret ? ret : copied;
80}
81
82int logfs_readpage(struct file *file, struct page *page)
83{
84 int ret;
85
86 ret = logfs_readpage_nolock(page);
87 unlock_page(page);
88 return ret;
89}
90
91/* Clear the page's dirty flag in the radix tree. */
92/* TODO: mucking with PageWriteback is silly. Add a generic function to clear
93 * the dirty bit from the radix tree for filesystems that don't have to wait
94 * for page writeback to finish (i.e. any compressing filesystem).
95 */
96static void clear_radix_tree_dirty(struct page *page)
97{
98 BUG_ON(PagePrivate(page) || page->private);
99 set_page_writeback(page);
100 end_page_writeback(page);
101}
102
103static int __logfs_writepage(struct page *page)
104{
105 struct inode *inode = page->mapping->host;
106 int err;
107
108 err = logfs_write_buf(inode, page, WF_LOCK);
109 if (err)
110 set_page_dirty(page);
111 else
112 clear_radix_tree_dirty(page);
113 unlock_page(page);
114 return err;
115}
116
117static int logfs_writepage(struct page *page, struct writeback_control *wbc)
118{
119 struct inode *inode = page->mapping->host;
120 loff_t i_size = i_size_read(inode);
121 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
122 unsigned offset;
123 u64 bix;
124 level_t level;
125
126 log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index,
127 page);
128
129 logfs_unpack_index(page->index, &bix, &level);
130
131 /* Indirect blocks are never truncated */
132 if (level != 0)
133 return __logfs_writepage(page);
134
135 /*
136 * TODO: everything below is a near-verbatim copy of nobh_writepage().
137 * The relevant bits should be factored out after logfs is merged.
138 */
139
140 /* Is the page fully inside i_size? */
141 if (bix < end_index)
142 return __logfs_writepage(page);
143
144 /* Is the page fully outside i_size? (truncate in progress) */
145 offset = i_size & (PAGE_CACHE_SIZE-1);
146 if (bix > end_index || offset == 0) {
147 unlock_page(page);
148 return 0; /* don't care */
149 }
150
151 /*
152 * The page straddles i_size. It must be zeroed out on each and every
153 * writepage invocation because it may be mmapped. "A file is mapped
154 * in multiples of the page size. For a file that is not a multiple of
155 * the page size, the remaining memory is zeroed when mapped, and
156 * writes to that region are not written out to the file."
157 */
158 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
159 return __logfs_writepage(page);
160}
161
162static void logfs_invalidatepage(struct page *page, unsigned long offset)
163{
164 move_page_to_btree(page);
165 BUG_ON(PagePrivate(page) || page->private);
166}
167
168static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
169{
170 return 0; /* None of these are easy to release */
171}
172
173
174int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
175 unsigned long arg)
176{
177 struct logfs_inode *li = logfs_inode(inode);
178 unsigned int oldflags, flags;
179 int err;
180
181 switch (cmd) {
182 case FS_IOC_GETFLAGS:
183 flags = li->li_flags & LOGFS_FL_USER_VISIBLE;
184 return put_user(flags, (int __user *)arg);
185 case FS_IOC_SETFLAGS:
186 if (IS_RDONLY(inode))
187 return -EROFS;
188
189 if (!is_owner_or_cap(inode))
190 return -EACCES;
191
192 err = get_user(flags, (int __user *)arg);
193 if (err)
194 return err;
195
196 mutex_lock(&inode->i_mutex);
197 oldflags = li->li_flags;
198 flags &= LOGFS_FL_USER_MODIFIABLE;
199 flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE;
200 li->li_flags = flags;
201 mutex_unlock(&inode->i_mutex);
202
203 inode->i_ctime = CURRENT_TIME;
204 mark_inode_dirty_sync(inode);
205 return 0;
206
207 default:
208 return -ENOTTY;
209 }
210}
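/*
 * Illustrative userspace usage, not part of the patch: these are the
 * standard FS_IOC_GETFLAGS/FS_IOC_SETFLAGS ioctls from <linux/fs.h>, so
 * chattr-style code would look roughly like this (which bits actually
 * stick is filtered by LOGFS_FL_USER_MODIFIABLE above):
 *
 *	int flags;
 *
 *	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0) {
 *		flags |= FS_NODUMP_FL;	// assuming this flag is supported
 *		ioctl(fd, FS_IOC_SETFLAGS, &flags);
 *	}
 */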
211
212int logfs_fsync(struct file *file, struct dentry *dentry, int datasync)
213{
214 struct super_block *sb = dentry->d_inode->i_sb;
215 struct logfs_super *super = logfs_super(sb);
216
217 /* FIXME: write anchor */
218 super->s_devops->sync(sb);
219 return 0;
220}
221
222static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
223{
224 struct inode *inode = dentry->d_inode;
225 int err = 0;
226
227 if (attr->ia_valid & ATTR_SIZE)
228 err = logfs_truncate(inode, attr->ia_size);
229 attr->ia_valid &= ~ATTR_SIZE;
230
231 if (!err)
232 err = inode_change_ok(inode, attr);
233 if (!err)
234 err = inode_setattr(inode, attr);
235 return err;
236}
237
238const struct inode_operations logfs_reg_iops = {
239 .setattr = logfs_setattr,
240};
241
242const struct file_operations logfs_reg_fops = {
243 .aio_read = generic_file_aio_read,
244 .aio_write = generic_file_aio_write,
245 .fsync = logfs_fsync,
246 .ioctl = logfs_ioctl,
247 .llseek = generic_file_llseek,
248 .mmap = generic_file_readonly_mmap,
249 .open = generic_file_open,
250 .read = do_sync_read,
251 .write = do_sync_write,
252};
253
254const struct address_space_operations logfs_reg_aops = {
255 .invalidatepage = logfs_invalidatepage,
256 .readpage = logfs_readpage,
257 .releasepage = logfs_releasepage,
258 .set_page_dirty = __set_page_dirty_nobuffers,
259 .writepage = logfs_writepage,
260 .writepages = generic_writepages,
261 .write_begin = logfs_write_begin,
262 .write_end = logfs_write_end,
263};
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
new file mode 100644
index 000000000000..92949f95a901
--- /dev/null
+++ b/fs/logfs/gc.c
@@ -0,0 +1,730 @@
1/*
2 * fs/logfs/gc.c - garbage collection code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/sched.h>
10
11/*
12 * Wear leveling needs to kick in when the difference between low erase
13 * counts and high erase counts gets too big. A good value for "too big"
14 * may be somewhat below 10% of maximum erase count for the device.
15 * Why not 397, to pick a nice round number with no specific meaning? :)
16 *
17 * WL_RATELIMIT is the minimum time between two wear level events. A huge
18 * number of segments may fulfil the requirements for wear leveling at the
19 * same time. If that happens we don't want to cause a latency from hell,
20 * but just gently pick one segment every so often and minimize overhead.
21 */
22#define WL_DELTA 397
23#define WL_RATELIMIT 100
24#define MAX_OBJ_ALIASES 2600
25#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */
26#define LIST_SIZE 64 /* base size of candidate lists */
27#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */
28#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */
29
30static int no_free_segments(struct super_block *sb)
31{
32 struct logfs_super *super = logfs_super(sb);
33
34 return super->s_free_list.count;
35}
36
37/* journal has distance -1, top-most ifile layer distance 0 */
38static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
39{
40 struct logfs_super *super = logfs_super(sb);
41 u8 gc_level = (__force u8)__gc_level;
42
43 switch (gc_level) {
44 case 0: /* fall through */
45 case 1: /* fall through */
46 case 2: /* fall through */
47 case 3:
48 /* file data or indirect blocks */
49 return super->s_ifile_levels + super->s_iblock_levels - gc_level;
50 case 6: /* fall through */
51 case 7: /* fall through */
52 case 8: /* fall through */
53 case 9:
54 /* inode file data or indirect blocks */
55 return super->s_ifile_levels - (gc_level - 6);
56 default:
57 printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
58 gc_level);
59 WARN_ON(1);
60 return super->s_ifile_levels + super->s_iblock_levels;
61 }
62}
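/*
 * Worked example (illustrative numbers): assuming s_ifile_levels == 3 and
 * s_iblock_levels == 4, plain file data (gc_level 0) has distance 7, a
 * level-3 indirect block distance 4, inode file data (gc_level 6)
 * distance 3, and the topmost ifile level (gc_level 9) distance 0, with
 * the journal at distance -1 as noted above.
 */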
63
64static int segment_is_reserved(struct super_block *sb, u32 segno)
65{
66 struct logfs_super *super = logfs_super(sb);
67 struct logfs_area *area;
68 void *reserved;
69 int i;
70
71 /* Some segments are reserved. Just pretend they were all valid */
72 reserved = btree_lookup32(&super->s_reserved_segments, segno);
73 if (reserved)
74 return 1;
75
76 /* Currently open segments */
77 for_each_area(i) {
78 area = super->s_area[i];
79 if (area->a_is_open && area->a_segno == segno)
80 return 1;
81 }
82
83 return 0;
84}
85
86static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
87{
88 BUG();
89}
90
91/*
92 * Returns the bytes consumed by valid objects in this segment. Object headers
93 * are counted, the segment header is not.
94 */
95static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
96 gc_level_t *gc_level)
97{
98 struct logfs_segment_entry se;
99 u32 ec_level;
100
101 logfs_get_segment_entry(sb, segno, &se);
102 if (se.ec_level == cpu_to_be32(BADSEG) ||
103 se.valid == cpu_to_be32(RESERVED))
104 return RESERVED;
105
106 ec_level = be32_to_cpu(se.ec_level);
107 *ec = ec_level >> 4;
108 *gc_level = GC_LEVEL(ec_level & 0xf);
109 return be32_to_cpu(se.valid);
110}
111
112static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
113 u64 bix, gc_level_t gc_level)
114{
115 struct inode *inode;
116 int err, cookie;
117
118 inode = logfs_safe_iget(sb, ino, &cookie);
119 err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
120 BUG_ON(err);
121 logfs_safe_iput(inode, cookie);
122}
123
124static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist)
125{
126 struct logfs_super *super = logfs_super(sb);
127 struct logfs_segment_header sh;
128 struct logfs_object_header oh;
129 u64 ofs, ino, bix;
130 u32 seg_ofs, logical_segno, cleaned = 0;
131 int err, len, valid;
132 gc_level_t gc_level;
133
134 LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);
135
136 btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
137 err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
138 BUG_ON(err);
139 gc_level = GC_LEVEL(sh.level);
140 logical_segno = be32_to_cpu(sh.segno);
141 if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
142 logfs_mark_segment_bad(sb, segno);
143 cleaned = -1;
144 goto out;
145 }
146
147 for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
148 seg_ofs + sizeof(oh) < super->s_segsize; ) {
149 ofs = dev_ofs(sb, logical_segno, seg_ofs);
150 err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
151 &oh);
152 BUG_ON(err);
153
154 if (!memchr_inv(&oh, 0xff, sizeof(oh)))
155 break;
156
157 if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
158 logfs_mark_segment_bad(sb, segno);
159 cleaned = super->s_segsize - 1;
160 goto out;
161 }
162
163 ino = be64_to_cpu(oh.ino);
164 bix = be64_to_cpu(oh.bix);
165 len = sizeof(oh) + be16_to_cpu(oh.len);
166 valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
167 if (valid == 1) {
168 logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
169 cleaned += len;
170 } else if (valid == 2) {
171 /* Will be invalid upon journal commit */
172 cleaned += len;
173 }
174 seg_ofs += len;
175 }
176out:
177 btree_remove32(&super->s_reserved_segments, segno);
178 return cleaned;
179}
180
181static struct gc_candidate *add_list(struct gc_candidate *cand,
182 struct candidate_list *list)
183{
184 struct rb_node **p = &list->rb_tree.rb_node;
185 struct rb_node *parent = NULL;
186 struct gc_candidate *cur;
187 int comp;
188
189 cand->list = list;
190 while (*p) {
191 parent = *p;
192 cur = rb_entry(parent, struct gc_candidate, rb_node);
193
194 if (list->sort_by_ec)
195 comp = cand->erase_count < cur->erase_count;
196 else
197 comp = cand->valid < cur->valid;
198
199 if (comp)
200 p = &parent->rb_left;
201 else
202 p = &parent->rb_right;
203 }
204 rb_link_node(&cand->rb_node, parent, p);
205 rb_insert_color(&cand->rb_node, &list->rb_tree);
206
207 if (list->count <= list->maxcount) {
208 list->count++;
209 return NULL;
210 }
211 cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
212 rb_erase(&cand->rb_node, &list->rb_tree);
213 cand->list = NULL;
214 return cand;
215}
216
217static void remove_from_list(struct gc_candidate *cand)
218{
219 struct candidate_list *list = cand->list;
220
221 rb_erase(&cand->rb_node, &list->rb_tree);
222 list->count--;
223}
224
225static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
226{
227 struct logfs_super *super = logfs_super(sb);
228
229 btree_remove32(&super->s_cand_tree, cand->segno);
230 kfree(cand);
231}
232
233u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
234{
235 struct gc_candidate *cand;
236 u32 segno;
237
238 BUG_ON(list->count == 0);
239
240 cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
241 remove_from_list(cand);
242 segno = cand->segno;
243 if (ec)
244 *ec = cand->erase_count;
245 free_candidate(sb, cand);
246 return segno;
247}
248
249/*
250 * We have several lists to manage segments with. The reserve_list is used to
251 * deal with bad blocks. We try to keep the best (lowest ec) segments on this
252 * list.
253 * The free_list contains free segments for normal usage. It usually gets the
254 * second pick after the reserve_list. But when the free_list is running short
255 * it is more important to keep the free_list full than to keep a reserve.
256 *
257 * Segments that are not free are put onto a per-level low_list. If we have
258 * to run garbage collection, we pick a candidate from there. All segments on
259 * those lists should have at least some free space so GC will make progress.
260 *
261 * And last we have the ec_list, which is used to pick segments for wear
262 * leveling.
263 *
264 * If all appropriate lists are full, we simply free the candidate and forget
265 * about that segment for a while. We have better candidates for each purpose.
266 */
267static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
268{
269 struct logfs_super *super = logfs_super(sb);
270 u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;
271
272 if (cand->valid == 0) {
273 /* 100% free segments */
274 log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
275 cand->segno, cand->erase_count,
276 dev_ofs(sb, cand->segno, 0));
277 cand = add_list(cand, &super->s_reserve_list);
278 if (cand) {
279 log_gc_noisy("add free segment %x (ec %x) at %llx\n",
280 cand->segno, cand->erase_count,
281 dev_ofs(sb, cand->segno, 0));
282 cand = add_list(cand, &super->s_free_list);
283 }
284 } else {
285 /* good candidates for Garbage Collection */
286 if (cand->valid < full)
287 cand = add_list(cand, &super->s_low_list[cand->dist]);
288 /* good candidates for wear leveling,
289 * segments that were recently written get ignored */
290 if (cand)
291 cand = add_list(cand, &super->s_ec_list);
292 }
293 if (cand)
294 free_candidate(sb, cand);
295}
296
297static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
298 u8 dist)
299{
300 struct logfs_super *super = logfs_super(sb);
301 struct gc_candidate *cand;
302
303 cand = kmalloc(sizeof(*cand), GFP_NOFS);
304 if (!cand)
305 return -ENOMEM;
306
307 cand->segno = segno;
308 cand->valid = valid;
309 cand->erase_count = ec;
310 cand->dist = dist;
311
312 btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
313 __add_candidate(sb, cand);
314 return 0;
315}
316
317static void remove_segment_from_lists(struct super_block *sb, u32 segno)
318{
319 struct logfs_super *super = logfs_super(sb);
320 struct gc_candidate *cand;
321
322 cand = btree_lookup32(&super->s_cand_tree, segno);
323 if (cand) {
324 remove_from_list(cand);
325 free_candidate(sb, cand);
326 }
327}
328
329static void scan_segment(struct super_block *sb, u32 segno)
330{
331 u32 valid, ec = 0;
332 gc_level_t gc_level = 0;
333 u8 dist;
334
335 if (segment_is_reserved(sb, segno))
336 return;
337
338 remove_segment_from_lists(sb, segno);
339 valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
340 if (valid == RESERVED)
341 return;
342
343 dist = root_distance(sb, gc_level);
344 add_candidate(sb, segno, valid, ec, dist);
345}
346
347static struct gc_candidate *first_in_list(struct candidate_list *list)
348{
349 if (list->count == 0)
350 return NULL;
351 return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
352}
353
354/*
355 * Find the best segment for garbage collection. Main criterion is
356 * the segment requiring the least effort to clean. Secondary
357 * criterion is to GC on the lowest level available.
358 *
359 * So we search the least effort segment on the lowest level first,
360 * then move up and pick another segment iff it requires significantly
361 * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
362 */
363static struct gc_candidate *get_candidate(struct super_block *sb)
364{
365 struct logfs_super *super = logfs_super(sb);
366 int i, max_dist;
367 struct gc_candidate *cand = NULL, *this;
368
369 max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS);
370
371 for (i = max_dist; i >= 0; i--) {
372 this = first_in_list(&super->s_low_list[i]);
373 if (!this)
374 continue;
375 if (!cand)
376 cand = this;
377 if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
378 cand = this;
379 }
380 return cand;
381}
382
383static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
384{
385 struct logfs_super *super = logfs_super(sb);
386 gc_level_t gc_level;
387 u32 cleaned, valid, segno, ec;
388 u8 dist;
389
390 if (!cand) {
391 log_gc("GC attempted, but no candidate found\n");
392 return 0;
393 }
394
395 segno = cand->segno;
396 dist = cand->dist;
397 valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
398 free_candidate(sb, cand);
399 log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
400 segno, (u64)segno << super->s_segshift,
401 dist, no_free_segments(sb), valid,
402 super->s_free_bytes);
403 cleaned = logfs_gc_segment(sb, segno, dist);
404 log_gc("GC segment #%02x complete - now %x valid\n", segno,
405 valid - cleaned);
406 BUG_ON(cleaned != valid);
407 return 1;
408}
409
410static int logfs_gc_once(struct super_block *sb)
411{
412 struct gc_candidate *cand;
413
414 cand = get_candidate(sb);
415 if (cand)
416 remove_from_list(cand);
417 return __logfs_gc_once(sb, cand);
418}
419
420/* returns 1 if a wrap occurs, 0 otherwise */
421static int logfs_scan_some(struct super_block *sb)
422{
423 struct logfs_super *super = logfs_super(sb);
424 u32 segno;
425 int i, ret = 0;
426
427 segno = super->s_sweeper;
428 for (i = SCAN_RATIO; i > 0; i--) {
429 segno++;
430 if (segno >= super->s_no_segs) {
431 segno = 0;
432 ret = 1;
433 /* Break out of the loop. We want to read a single
434 * block from the segment size on next invocation if
435 * SCAN_RATIO is set to match block size
436 */
437 break;
438 }
439
440 scan_segment(sb, segno);
441 }
442 super->s_sweeper = segno;
443 return ret;
444}
445
446/*
447 * In principle, this function should loop forever, looking for GC candidates
448 * and moving data. LogFS is designed in such a way that this loop is
449 * guaranteed to terminate.
450 *
451 * Limiting the loop to some iterations serves purely to catch cases when
452 * these guarantees have failed. An actual endless loop is an obvious bug
453 * and should be reported as such.
454 */
455static void __logfs_gc_pass(struct super_block *sb, int target)
456{
457 struct logfs_super *super = logfs_super(sb);
458 struct logfs_block *block;
459 int round, progress, last_progress = 0;
460
461 if (no_free_segments(sb) >= target &&
462 super->s_no_object_aliases < MAX_OBJ_ALIASES)
463 return;
464
465 log_gc("__logfs_gc_pass(%x)\n", target);
466 for (round = 0; round < SCAN_ROUNDS; ) {
467 if (no_free_segments(sb) >= target)
468 goto write_alias;
469
470 /* Sync in-memory state with on-medium state in case they
471 * diverged */
472 logfs_write_anchor(sb);
473 round += logfs_scan_some(sb);
474 if (no_free_segments(sb) >= target)
475 goto write_alias;
476 progress = logfs_gc_once(sb);
477 if (progress)
478 last_progress = round;
479 else if (round - last_progress > 2)
480 break;
481 continue;
482
483 /*
484 * The goto logic is nasty, I just don't know a better way to
485 * code it. GC is supposed to ensure two things:
486 * 1. Enough free segments are available.
487 * 2. The number of aliases is bounded.
488 * When 1. is achieved, we take a look at 2. and write back
489 * some alias-containing blocks, if necessary. However, after
490 * each such write we need to go back to 1., as writes can
491 * consume free segments.
492 */
493write_alias:
494 if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
495 return;
496 if (list_empty(&super->s_object_alias)) {
497 /* All aliases are still in btree */
498 return;
499 }
500 log_gc("Write back one alias\n");
501 block = list_entry(super->s_object_alias.next,
502 struct logfs_block, alias_list);
503 block->ops->write_block(block);
504 /*
505 * To round off the nasty goto logic, we reset round here. It
506 * is a safety-net for GC not making any progress and limited
507 * to something reasonably small. If we incremented it for every
508 * single alias, the loop could terminate rather quickly.
509 */
510 round = 0;
511 }
512 LOGFS_BUG(sb);
513}
514
515static int wl_ratelimit(struct super_block *sb, u64 *next_event)
516{
517 struct logfs_super *super = logfs_super(sb);
518
519 if (*next_event < super->s_gec) {
520 *next_event = super->s_gec + WL_RATELIMIT;
521 return 0;
522 }
523 return 1;
524}
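/*
 * Illustrative: assuming s_gec counts erases globally, wl_ratelimit()
 * returns 0 ("go ahead") at most once per WL_RATELIMIT (100) erases and
 * makes callers back off in between, no matter how many segments would
 * currently qualify for wear leveling.
 */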
525
526static void logfs_wl_pass(struct super_block *sb)
527{
528 struct logfs_super *super = logfs_super(sb);
529 struct gc_candidate *wl_cand, *free_cand;
530
531 if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
532 return;
533
534 wl_cand = first_in_list(&super->s_ec_list);
535 if (!wl_cand)
536 return;
537 free_cand = first_in_list(&super->s_free_list);
538 if (!free_cand)
539 return;
540
541 if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
542 remove_from_list(wl_cand);
543 __logfs_gc_once(sb, wl_cand);
544 }
545}
546
547/*
548 * The journal needs wear leveling as well. But moving the journal is an
549 * expensive operation so we try to avoid it as much as possible. And if we
550 * have to do it, we move the whole journal, not individual segments.
551 *
552 * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
553 * calculations. First we check whether moving the journal would be a
554 * significant improvement. That means that a) the current journal segments
555 * have more wear than the future journal segments and b) the current journal
556 * segments have more wear than normal ostore segments.
557 * Rationale for b) is that we don't have to move the journal if it is aging
558 * less than the ostore, even if the reserve segments age even less (they are
559 * excluded from wear leveling, after all).
560 * Next we check that the superblocks have less wear than the journal. Since
561 * moving the journal requires writing the superblocks, we have to protect the
562 * superblocks even more than the journal.
563 *
564 * Also we double the acceptable wear difference, compared to ostore wear
565 * leveling. Journal data is read and rewritten rapidly, comparatively. So
566 * soft errors have much less time to accumulate and we allow the journal to
567 * be a bit worse than the ostore.
568 */
569static void logfs_journal_wl_pass(struct super_block *sb)
570{
571 struct logfs_super *super = logfs_super(sb);
572 struct gc_candidate *cand;
573 u32 min_journal_ec = -1, max_reserve_ec = 0;
574 int i;
575
576 if (wl_ratelimit(sb, &super->s_wl_gec_journal))
577 return;
578
579 if (super->s_reserve_list.count < super->s_no_journal_segs) {
580 /* Reserve is not full enough to move complete journal */
581 return;
582 }
583
584 journal_for_each(i)
585 if (super->s_journal_seg[i])
586 min_journal_ec = min(min_journal_ec,
587 super->s_journal_ec[i]);
588 cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
589 struct gc_candidate, rb_node);
590 max_reserve_ec = cand->erase_count;
591 for (i = 0; i < 2; i++) {
592 struct logfs_segment_entry se;
593 u32 segno = seg_no(sb, super->s_sb_ofs[i]);
594 u32 ec;
595
596 logfs_get_segment_entry(sb, segno, &se);
597 ec = be32_to_cpu(se.ec_level) >> 4;
598 max_reserve_ec = max(max_reserve_ec, ec);
599 }
600
601 if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
602 do_logfs_journal_wl_pass(sb);
603 }
604}
605
606void logfs_gc_pass(struct super_block *sb)
607{
608 struct logfs_super *super = logfs_super(sb);
609
610 //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
611 /* Write journal before free space is getting saturated with dirty
612 * objects.
613 */
614 if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
615 + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
616 logfs_write_anchor(sb);
617 __logfs_gc_pass(sb, super->s_total_levels);
618 logfs_wl_pass(sb);
619 logfs_journal_wl_pass(sb);
620}
621
622static int check_area(struct super_block *sb, int i)
623{
624 struct logfs_super *super = logfs_super(sb);
625 struct logfs_area *area = super->s_area[i];
626 struct logfs_object_header oh;
627 u32 segno = area->a_segno;
628 u32 ofs = area->a_used_bytes;
629 __be32 crc;
630 int err;
631
632 if (!area->a_is_open)
633 return 0;
634
635 for (ofs = area->a_used_bytes;
636 ofs <= super->s_segsize - sizeof(oh);
637 ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) {
638 err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh);
639 if (err)
640 return err;
641
642 if (!memchr_inv(&oh, 0xff, sizeof(oh)))
643 break;
644
645 crc = logfs_crc32(&oh, sizeof(oh) - 4, 4);
646 if (crc != oh.crc) {
647 printk(KERN_INFO "interrupted header at %llx\n",
648 dev_ofs(sb, segno, ofs));
649 return 0;
650 }
651 }
652 if (ofs != area->a_used_bytes) {
653 printk(KERN_INFO "%x bytes unaccounted data found at %llx\n",
654 ofs - area->a_used_bytes,
655 dev_ofs(sb, segno, area->a_used_bytes));
656 area->a_used_bytes = ofs;
657 }
658 return 0;
659}
660
661int logfs_check_areas(struct super_block *sb)
662{
663 int i, err;
664
665 for_each_area(i) {
666 err = check_area(sb, i);
667 if (err)
668 return err;
669 }
670 return 0;
671}
672
673static void logfs_init_candlist(struct candidate_list *list, int maxcount,
674 int sort_by_ec)
675{
676 list->count = 0;
677 list->maxcount = maxcount;
678 list->sort_by_ec = sort_by_ec;
679 list->rb_tree = RB_ROOT;
680}
681
682int logfs_init_gc(struct super_block *sb)
683{
684 struct logfs_super *super = logfs_super(sb);
685 int i;
686
687 btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
688 logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
689 logfs_init_candlist(&super->s_reserve_list,
690 super->s_bad_seg_reserve, 1);
691 for_each_area(i)
692 logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
693 logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
694 return 0;
695}
696
697static void logfs_cleanup_list(struct super_block *sb,
698 struct candidate_list *list)
699{
700 struct gc_candidate *cand;
701
702 while (list->count) {
703 cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
704 rb_node);
705 remove_from_list(cand);
706 free_candidate(sb, cand);
707 }
708 BUG_ON(list->rb_tree.rb_node);
709}
710
711void logfs_cleanup_gc(struct super_block *sb)
712{
713 struct logfs_super *super = logfs_super(sb);
714 int i;
715
716 if (!super->s_free_list.count)
717 return;
718
719 /*
720 * FIXME: The btree may still contain a single empty node. So we
721 * call the grim visitor to clean up that mess. Btree code should
722 * do it for us, really.
723 */
724 btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
725 logfs_cleanup_list(sb, &super->s_free_list);
726 logfs_cleanup_list(sb, &super->s_reserve_list);
727 for_each_area(i)
728 logfs_cleanup_list(sb, &super->s_low_list[i]);
729 logfs_cleanup_list(sb, &super->s_ec_list);
730}
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
new file mode 100644
index 000000000000..33ec1aeaeec4
--- /dev/null
+++ b/fs/logfs/inode.c
@@ -0,0 +1,417 @@
1/*
2 * fs/logfs/inode.c - inode handling code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/writeback.h>
10#include <linux/backing-dev.h>
11
12/*
13 * How soon to reuse old inode numbers? LogFS doesn't store deleted inodes
14 * on the medium. It therefore also lacks a method to store the previous
15 * generation number for deleted inodes. Instead a single generation number
16 * is stored which will be used for new inodes. Being just a 32bit counter,
17 * this can obviously wrap relatively quickly. So we only reuse inodes if we
18 * know that a fair number of inodes can be created before we have to increment
19 * the generation again - effectively adding some bits to the counter.
20 * But being too aggressive here means we keep a very large and very sparse
21 * inode file, wasting space on indirect blocks.
22 * So what is a good value? Beats me. 64k seems moderately bad on both
23 * fronts, so let's use that for now...
24 *
25 * NFS sucks, as everyone already knows.
26 */
27#define INOS_PER_WRAP (0x10000)
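/*
 * Editor's note - worked arithmetic, not part of the original comment:
 * with a 32bit generation bumped once per 0x10000 allocations, a given
 * (ino, i_generation) pair can only repeat after roughly
 * 2^32 * 2^16 = 2^48 inode creations, effectively a 48bit counter, at
 * the cost of an inode file that may stay up to 64k slots sparse.
 */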
28
29/*
30 * Logfs' requirement to read inodes for garbage collection makes life a bit
31 * harder. GC may have to read inodes that are in I_FREEING state, when they
32 * are being written out - and waiting for GC to make progress, naturally.
33 *
34 * So we cannot just call iget() or some variant of it, but first have to check
35 * whether the inode in question might be in I_FREEING state. Therefore we
36 * maintain our own per-sb list of "almost deleted" inodes and check against
37 * that list first. Normally this should be at most 1-2 entries long.
38 *
39 * Also, inodes have logfs-specific reference counting on top of what the vfs
40 * does. When .destroy_inode is called, normally the reference count will drop
41 * to zero and the inode gets deleted. But if GC accessed the inode, its
42 * refcount will remain nonzero and final deletion will have to wait.
43 *
44 * As a result we have two sets of functions to get/put inodes:
45 * logfs_safe_iget/logfs_safe_iput - safe to call from GC context
46 * logfs_iget/iput - normal version
47 */
48static struct kmem_cache *logfs_inode_cache;
49
50static DEFINE_SPINLOCK(logfs_inode_lock);
51
52static void logfs_inode_setops(struct inode *inode)
53{
54 switch (inode->i_mode & S_IFMT) {
55 case S_IFDIR:
56 inode->i_op = &logfs_dir_iops;
57 inode->i_fop = &logfs_dir_fops;
58 inode->i_mapping->a_ops = &logfs_reg_aops;
59 break;
60 case S_IFREG:
61 inode->i_op = &logfs_reg_iops;
62 inode->i_fop = &logfs_reg_fops;
63 inode->i_mapping->a_ops = &logfs_reg_aops;
64 break;
65 case S_IFLNK:
66 inode->i_op = &logfs_symlink_iops;
67 inode->i_mapping->a_ops = &logfs_reg_aops;
68 break;
69 case S_IFSOCK: /* fall through */
70 case S_IFBLK: /* fall through */
71 case S_IFCHR: /* fall through */
72 case S_IFIFO:
73 init_special_inode(inode, inode->i_mode, inode->i_rdev);
74 break;
75 default:
76 BUG();
77 }
78}
79
80static struct inode *__logfs_iget(struct super_block *sb, ino_t ino)
81{
82 struct inode *inode = iget_locked(sb, ino);
83 int err;
84
85 if (!inode)
86 return ERR_PTR(-ENOMEM);
87 if (!(inode->i_state & I_NEW))
88 return inode;
89
90 err = logfs_read_inode(inode);
91 if (err || inode->i_nlink == 0) {
92 /* inode->i_nlink == 0 can be true when called from
93 * block validator */
94 /* set i_nlink to 0 to prevent caching */
95 inode->i_nlink = 0;
96 logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE;
97 iget_failed(inode);
98 if (!err)
99 err = -ENOENT;
100 return ERR_PTR(err);
101 }
102
103 logfs_inode_setops(inode);
104 unlock_new_inode(inode);
105 return inode;
106}
107
108struct inode *logfs_iget(struct super_block *sb, ino_t ino)
109{
110 BUG_ON(ino == LOGFS_INO_MASTER);
111 BUG_ON(ino == LOGFS_INO_SEGFILE);
112 return __logfs_iget(sb, ino);
113}
114
115/*
116 * is_cached is set to 1 if we hand out a cached inode, 0 otherwise.
117 * this allows logfs_iput to do the right thing later
118 */
119struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
120{
121 struct logfs_super *super = logfs_super(sb);
122 struct logfs_inode *li;
123
124 if (ino == LOGFS_INO_MASTER)
125 return super->s_master_inode;
126 if (ino == LOGFS_INO_SEGFILE)
127 return super->s_segfile_inode;
128
129 spin_lock(&logfs_inode_lock);
130 list_for_each_entry(li, &super->s_freeing_list, li_freeing_list)
131 if (li->vfs_inode.i_ino == ino) {
132 li->li_refcount++;
133 spin_unlock(&logfs_inode_lock);
134 *is_cached = 1;
135 return &li->vfs_inode;
136 }
137 spin_unlock(&logfs_inode_lock);
138
139 *is_cached = 0;
140 return __logfs_iget(sb, ino);
141}
142
143static void __logfs_destroy_inode(struct inode *inode)
144{
145 struct logfs_inode *li = logfs_inode(inode);
146
147 BUG_ON(li->li_block);
148 list_del(&li->li_freeing_list);
149 kmem_cache_free(logfs_inode_cache, li);
150}
151
152static void logfs_destroy_inode(struct inode *inode)
153{
154 struct logfs_inode *li = logfs_inode(inode);
155
156 BUG_ON(list_empty(&li->li_freeing_list));
157 spin_lock(&logfs_inode_lock);
158 li->li_refcount--;
159 if (li->li_refcount == 0)
160 __logfs_destroy_inode(inode);
161 spin_unlock(&logfs_inode_lock);
162}
163
164void logfs_safe_iput(struct inode *inode, int is_cached)
165{
166 if (inode->i_ino == LOGFS_INO_MASTER)
167 return;
168 if (inode->i_ino == LOGFS_INO_SEGFILE)
169 return;
170
171 if (is_cached) {
172 logfs_destroy_inode(inode);
173 return;
174 }
175
176 iput(inode);
177}
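/*
 * Editor's sketch, not part of the original patch: how a GC-context
 * caller would pair the safe get/put variants described in the comment
 * at the top of this file. The helper name gc_touch_inode() is
 * hypothetical.
 */
static int gc_touch_inode(struct super_block *sb, u64 ino)
{
	struct inode *inode;
	int is_cached;

	inode = logfs_safe_iget(sb, ino, &is_cached);
	if (IS_ERR(inode))
		return PTR_ERR(inode);
	/* ... read whatever GC needs from the inode ... */
	logfs_safe_iput(inode, is_cached);
	return 0;
}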
178
179static void logfs_init_inode(struct super_block *sb, struct inode *inode)
180{
181 struct logfs_inode *li = logfs_inode(inode);
182 int i;
183
184 li->li_flags = 0;
185 li->li_height = 0;
186 li->li_used_bytes = 0;
187 li->li_block = NULL;
188 inode->i_uid = 0;
189 inode->i_gid = 0;
190 inode->i_size = 0;
191 inode->i_blocks = 0;
192 inode->i_ctime = CURRENT_TIME;
193 inode->i_mtime = CURRENT_TIME;
194 inode->i_nlink = 1;
195 INIT_LIST_HEAD(&li->li_freeing_list);
196
197 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
198 li->li_data[i] = 0;
199
200 return;
201}
202
203static struct inode *logfs_alloc_inode(struct super_block *sb)
204{
205 struct logfs_inode *li;
206
207 li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS);
208 if (!li)
209 return NULL;
210 logfs_init_inode(sb, &li->vfs_inode);
211 return &li->vfs_inode;
212}
213
214/*
215 * In logfs inodes are written to an inode file. The inode file, like any
216 * other file, is managed with an inode. The inode file's inode, aka master
217 * inode, requires special handling in several respects. First, it cannot be
218 * written to the inode file, so it is stored in the journal instead.
219 *
220 * Secondly, this inode cannot be written back and destroyed before all other
221 * inodes have been written. The ordering is important. Linux' VFS is happily
222 * unaware of the ordering constraint and would ordinarily destroy the master
223 * inode at umount time while other inodes are still in use and dirty. Not
224 * good.
225 *
226 * So logfs makes sure the master inode is not written until all other inodes
227 * have been destroyed. Sadly, this method has another side-effect. The VFS
228 * will notice one remaining inode and print a frightening warning message.
229 * Worse, it is impossible to judge whether such a warning was caused by the
230 * master inode or whether other inodes have leaked as well.
231 *
232 * Our attempt at solving this is logfs_new_meta_inode() below. Its
233 * purpose is to create a new inode that will not trigger the warning if such
234 * an inode is still in use. An ugly hack, no doubt. Suggestions for
235 * improvement are welcome.
236 */
237struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino)
238{
239 struct inode *inode;
240
241 inode = logfs_alloc_inode(sb);
242 if (!inode)
243 return ERR_PTR(-ENOMEM);
244
245 inode->i_mode = S_IFREG;
246 inode->i_ino = ino;
247 inode->i_sb = sb;
248
249 /* This is a blatant copy of alloc_inode code. We'd need alloc_inode
250 * to be nonstatic, alas. */
251 {
252 struct address_space * const mapping = &inode->i_data;
253
254 mapping->a_ops = &logfs_reg_aops;
255 mapping->host = inode;
256 mapping->flags = 0;
257 mapping_set_gfp_mask(mapping, GFP_NOFS);
258 mapping->assoc_mapping = NULL;
259 mapping->backing_dev_info = &default_backing_dev_info;
260 inode->i_mapping = mapping;
261 inode->i_nlink = 1;
262 }
263
264 return inode;
265}
266
267struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino)
268{
269 struct inode *inode;
270 int err;
271
272 inode = logfs_new_meta_inode(sb, ino);
273 if (IS_ERR(inode))
274 return inode;
275
276 err = logfs_read_inode(inode);
277 if (err) {
278 destroy_meta_inode(inode);
279 return ERR_PTR(err);
280 }
281 logfs_inode_setops(inode);
282 return inode;
283}
284
285static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
286{
287 int ret;
288 long flags = WF_LOCK;
289
290 /* Can only happen if creat() failed. Safe to skip. */
291 if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
292 return 0;
293
294 ret = __logfs_write_inode(inode, flags);
295 LOGFS_BUG_ON(ret, inode->i_sb);
296 return ret;
297}
298
299void destroy_meta_inode(struct inode *inode)
300{
301 if (inode) {
302 if (inode->i_data.nrpages)
303 truncate_inode_pages(&inode->i_data, 0);
304 logfs_clear_inode(inode);
305 kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
306 }
307}
308
309/* called with inode_lock held */
310static void logfs_drop_inode(struct inode *inode)
311{
312 struct logfs_super *super = logfs_super(inode->i_sb);
313 struct logfs_inode *li = logfs_inode(inode);
314
315 spin_lock(&logfs_inode_lock);
316 list_move(&li->li_freeing_list, &super->s_freeing_list);
317 spin_unlock(&logfs_inode_lock);
318 generic_drop_inode(inode);
319}
320
321static void logfs_set_ino_generation(struct super_block *sb,
322 struct inode *inode)
323{
324 struct logfs_super *super = logfs_super(sb);
325 u64 ino;
326
327 mutex_lock(&super->s_journal_mutex);
328 ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino);
329 super->s_last_ino = ino;
330 super->s_inos_till_wrap--;
331 if (super->s_inos_till_wrap < 0) {
332 super->s_last_ino = LOGFS_RESERVED_INOS;
333 super->s_generation++;
334 super->s_inos_till_wrap = INOS_PER_WRAP;
335 }
336 inode->i_ino = ino;
337 inode->i_generation = super->s_generation;
338 mutex_unlock(&super->s_journal_mutex);
339}
340
341struct inode *logfs_new_inode(struct inode *dir, int mode)
342{
343 struct super_block *sb = dir->i_sb;
344 struct inode *inode;
345
346 inode = new_inode(sb);
347 if (!inode)
348 return ERR_PTR(-ENOMEM);
349
350 logfs_init_inode(sb, inode);
351
352 /* inherit parent flags */
353 logfs_inode(inode)->li_flags |=
354 logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED;
355
356 inode->i_mode = mode;
357 logfs_set_ino_generation(sb, inode);
358
359 inode->i_uid = current_fsuid();
360 inode->i_gid = current_fsgid();
361 if (dir->i_mode & S_ISGID) {
362 inode->i_gid = dir->i_gid;
363 if (S_ISDIR(mode))
364 inode->i_mode |= S_ISGID;
365 }
366
367 logfs_inode_setops(inode);
368 insert_inode_hash(inode);
369
370 return inode;
371}
372
373static void logfs_init_once(void *_li)
374{
375 struct logfs_inode *li = _li;
376 int i;
377
378 li->li_flags = 0;
379 li->li_used_bytes = 0;
380 li->li_refcount = 1;
381 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
382 li->li_data[i] = 0;
383 inode_init_once(&li->vfs_inode);
384}
385
386static int logfs_sync_fs(struct super_block *sb, int wait)
387{
388 /* FIXME: write anchor */
389 logfs_super(sb)->s_devops->sync(sb);
390 return 0;
391}
392
393const struct super_operations logfs_super_operations = {
394 .alloc_inode = logfs_alloc_inode,
395 .clear_inode = logfs_clear_inode,
396 .delete_inode = logfs_delete_inode,
397 .destroy_inode = logfs_destroy_inode,
398 .drop_inode = logfs_drop_inode,
399 .write_inode = logfs_write_inode,
400 .statfs = logfs_statfs,
401 .sync_fs = logfs_sync_fs,
402};
403
404int logfs_init_inode_cache(void)
405{
406 logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
407 sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
408 logfs_init_once);
409 if (!logfs_inode_cache)
410 return -ENOMEM;
411 return 0;
412}
413
414void logfs_destroy_inode_cache(void)
415{
416 kmem_cache_destroy(logfs_inode_cache);
417}
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
new file mode 100644
index 000000000000..d57c7b07b60b
--- /dev/null
+++ b/fs/logfs/journal.c
@@ -0,0 +1,890 @@
1/*
2 * fs/logfs/journal.c - journal handling code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9
10static void logfs_calc_free(struct super_block *sb)
11{
12 struct logfs_super *super = logfs_super(sb);
13 u64 reserve, no_segs = super->s_no_segs;
14 s64 free;
15 int i;
16
17 /* superblock segments */
18 no_segs -= 2;
19 super->s_no_journal_segs = 0;
20 /* journal */
21 journal_for_each(i)
22 if (super->s_journal_seg[i]) {
23 no_segs--;
24 super->s_no_journal_segs++;
25 }
26
27 /* open segments plus one extra per level for GC */
28 no_segs -= 2 * super->s_total_levels;
29
30 free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE);
31 free -= super->s_used_bytes;
32 /* just a bit extra */
33 free -= super->s_total_levels * 4096;
34
35 /* Bad blocks are 'paid' for with speed reserve - the filesystem
36 * simply gets slower as bad blocks accumulate. Until the bad blocks
37 * exceed the speed reserve - then the filesystem gets smaller.
38 */
39 reserve = super->s_bad_segments + super->s_bad_seg_reserve;
40 reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE;
41 reserve = max(reserve, super->s_speed_reserve);
42 free -= reserve;
43 if (free < 0)
44 free = 0;
45
46 super->s_free_bytes = free;
47}
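/*
 * Editor's worked example with hypothetical numbers, not from the
 * patch: for s_no_segs = 1024, s_segsize = 128 KiB, two journal
 * segments and s_total_levels = 10, the code above leaves
 *	no_segs = 1024 - 2 (superblocks) - 2 (journal) - 2 * 10 = 1000
 * segments, so before used bytes and reserves are subtracted,
 *	free = 1000 * (131072 - LOGFS_SEGMENT_RESERVE) - 10 * 4096.
 */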
48
49static void reserve_sb_and_journal(struct super_block *sb)
50{
51 struct logfs_super *super = logfs_super(sb);
52 struct btree_head32 *head = &super->s_reserved_segments;
53 int i, err;
54
55 err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1,
56 GFP_KERNEL);
57 BUG_ON(err);
58
59 err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1,
60 GFP_KERNEL);
61 BUG_ON(err);
62
63 journal_for_each(i) {
64 if (!super->s_journal_seg[i])
65 continue;
66 err = btree_insert32(head, super->s_journal_seg[i], (void *)1,
67 GFP_KERNEL);
68 BUG_ON(err);
69 }
70}
71
72static void read_dynsb(struct super_block *sb,
73 struct logfs_je_dynsb *dynsb)
74{
75 struct logfs_super *super = logfs_super(sb);
76
77 super->s_gec = be64_to_cpu(dynsb->ds_gec);
78 super->s_sweeper = be64_to_cpu(dynsb->ds_sweeper);
79 super->s_victim_ino = be64_to_cpu(dynsb->ds_victim_ino);
80 super->s_rename_dir = be64_to_cpu(dynsb->ds_rename_dir);
81 super->s_rename_pos = be64_to_cpu(dynsb->ds_rename_pos);
82 super->s_used_bytes = be64_to_cpu(dynsb->ds_used_bytes);
83 super->s_generation = be32_to_cpu(dynsb->ds_generation);
84}
85
86static void read_anchor(struct super_block *sb,
87 struct logfs_je_anchor *da)
88{
89 struct logfs_super *super = logfs_super(sb);
90 struct inode *inode = super->s_master_inode;
91 struct logfs_inode *li = logfs_inode(inode);
92 int i;
93
94 super->s_last_ino = be64_to_cpu(da->da_last_ino);
95 li->li_flags = 0;
96 li->li_height = da->da_height;
97 i_size_write(inode, be64_to_cpu(da->da_size));
98 li->li_used_bytes = be64_to_cpu(da->da_used_bytes);
99
100 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
101 li->li_data[i] = be64_to_cpu(da->da_data[i]);
102}
103
104static void read_erasecount(struct super_block *sb,
105 struct logfs_je_journal_ec *ec)
106{
107 struct logfs_super *super = logfs_super(sb);
108 int i;
109
110 journal_for_each(i)
111 super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]);
112}
113
114static int read_area(struct super_block *sb, struct logfs_je_area *a)
115{
116 struct logfs_super *super = logfs_super(sb);
117 struct logfs_area *area = super->s_area[a->gc_level];
118 u64 ofs;
119 u32 writemask = ~(super->s_writesize - 1);
120
121 if (a->gc_level >= LOGFS_NO_AREAS)
122 return -EIO;
123 if (a->vim != VIM_DEFAULT)
124 return -EIO; /* TODO: close area and continue */
125
126 area->a_used_bytes = be32_to_cpu(a->used_bytes);
127 area->a_written_bytes = area->a_used_bytes & writemask;
128 area->a_segno = be32_to_cpu(a->segno);
129 if (area->a_segno)
130 area->a_is_open = 1;
131
132 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
133 if (super->s_writesize > 1)
134 logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
135 else
136 logfs_buf_recover(area, ofs, NULL, 0);
137 return 0;
138}
139
140static void *unpack(void *from, void *to)
141{
142 struct logfs_journal_header *jh = from;
143 void *data = from + sizeof(struct logfs_journal_header);
144 int err;
145 size_t inlen, outlen;
146
147 inlen = be16_to_cpu(jh->h_len);
148 outlen = be16_to_cpu(jh->h_datalen);
149
150 if (jh->h_compr == COMPR_NONE)
151 memcpy(to, data, inlen);
152 else {
153 err = logfs_uncompress(data, to, inlen, outlen);
154 BUG_ON(err);
155 }
156 return to;
157}
158
159static int __read_je_header(struct super_block *sb, u64 ofs,
160 struct logfs_journal_header *jh)
161{
162 struct logfs_super *super = logfs_super(sb);
163 size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
164 + MAX_JOURNAL_HEADER;
165 u16 type, len, datalen;
166 int err;
167
168 /* read header only */
169 err = wbuf_read(sb, ofs, sizeof(*jh), jh);
170 if (err)
171 return err;
172 type = be16_to_cpu(jh->h_type);
173 len = be16_to_cpu(jh->h_len);
174 datalen = be16_to_cpu(jh->h_datalen);
175 if (len > sb->s_blocksize)
176 return -EIO;
177 if ((type < JE_FIRST) || (type > JE_LAST))
178 return -EIO;
179 if (datalen > bufsize)
180 return -EIO;
181 return 0;
182}
183
184static int __read_je_payload(struct super_block *sb, u64 ofs,
185 struct logfs_journal_header *jh)
186{
187 u16 len;
188 int err;
189
190 len = be16_to_cpu(jh->h_len);
191 err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1);
192 if (err)
193 return err;
194 if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) {
195 /* Old code was confused. It forgot about the header length
196 * and stopped calculating the crc 16 bytes before the end
197 * of data - ick!
198 * FIXME: Remove this hack once the old code is fixed.
199 */
200 if (jh->h_crc == logfs_crc32(jh, len, 4))
201 WARN_ON_ONCE(1);
202 else
203 return -EIO;
204 }
205 return 0;
206}
207
208/*
209 * jh needs to be large enough to hold the complete entry, not just the header
210 */
211static int __read_je(struct super_block *sb, u64 ofs,
212 struct logfs_journal_header *jh)
213{
214 int err;
215
216 err = __read_je_header(sb, ofs, jh);
217 if (err)
218 return err;
219 return __read_je_payload(sb, ofs, jh);
220}
221
222static int read_je(struct super_block *sb, u64 ofs)
223{
224 struct logfs_super *super = logfs_super(sb);
225 struct logfs_journal_header *jh = super->s_compressed_je;
226 void *scratch = super->s_je;
227 u16 type, datalen;
228 int err;
229
230 err = __read_je(sb, ofs, jh);
231 if (err)
232 return err;
233 type = be16_to_cpu(jh->h_type);
234 datalen = be16_to_cpu(jh->h_datalen);
235
236 switch (type) {
237 case JE_DYNSB:
238 read_dynsb(sb, unpack(jh, scratch));
239 break;
240 case JE_ANCHOR:
241 read_anchor(sb, unpack(jh, scratch));
242 break;
243 case JE_ERASECOUNT:
244 read_erasecount(sb, unpack(jh, scratch));
245 break;
246 case JE_AREA:
247 read_area(sb, unpack(jh, scratch));
248 break;
249 case JE_OBJ_ALIAS:
250 err = logfs_load_object_aliases(sb, unpack(jh, scratch),
251 datalen);
252 break;
253 default:
254 WARN_ON_ONCE(1);
255 return -EIO;
256 }
257 return err;
258}
259
260static int logfs_read_segment(struct super_block *sb, u32 segno)
261{
262 struct logfs_super *super = logfs_super(sb);
263 struct logfs_journal_header *jh = super->s_compressed_je;
264 u64 ofs, seg_ofs = dev_ofs(sb, segno, 0);
265 u32 h_ofs, last_ofs = 0;
266 u16 len, datalen, last_len = 0;
267 int i, err;
268
269 /* search for most recent commit */
270 for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) {
271 ofs = seg_ofs + h_ofs;
272 err = __read_je_header(sb, ofs, jh);
273 if (err)
274 continue;
275 if (jh->h_type != cpu_to_be16(JE_COMMIT))
276 continue;
277 err = __read_je_payload(sb, ofs, jh);
278 if (err)
279 continue;
280 len = be16_to_cpu(jh->h_len);
281 datalen = be16_to_cpu(jh->h_datalen);
282 if ((datalen > sizeof(super->s_je_array)) ||
283 (datalen % sizeof(__be64)))
284 continue;
285 last_ofs = h_ofs;
286 last_len = datalen;
287 h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh);
288 }
289 /* read commit */
290 if (last_ofs == 0)
291 return -ENOENT;
292 ofs = seg_ofs + last_ofs;
293 log_journal("Read commit from %llx\n", ofs);
294 err = __read_je(sb, ofs, jh);
295 BUG_ON(err); /* We should have caught it in the scan loop already */
296 if (err)
297 return err;
298 /* uncompress */
299 unpack(jh, super->s_je_array);
300 super->s_no_je = last_len / sizeof(__be64);
301 /* iterate over array */
302 for (i = 0; i < super->s_no_je; i++) {
303 err = read_je(sb, be64_to_cpu(super->s_je_array[i]));
304 if (err)
305 return err;
306 }
307 super->s_journal_area->a_segno = segno;
308 return 0;
309}
310
311static u64 read_gec(struct super_block *sb, u32 segno)
312{
313 struct logfs_segment_header sh;
314 __be32 crc;
315 int err;
316
317 if (!segno)
318 return 0;
319 err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
320 if (err)
321 return 0;
322 crc = logfs_crc32(&sh, sizeof(sh), 4);
323 if (crc != sh.crc) {
324 WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull));
325 /* Most likely it was just erased */
326 return 0;
327 }
328 return be64_to_cpu(sh.gec);
329}
330
331static int logfs_read_journal(struct super_block *sb)
332{
333 struct logfs_super *super = logfs_super(sb);
334 u64 gec[LOGFS_JOURNAL_SEGS], max;
335 u32 segno;
336 int i, max_i;
337
338 max = 0;
339 max_i = -1;
340 journal_for_each(i) {
341 segno = super->s_journal_seg[i];
342 gec[i] = read_gec(sb, super->s_journal_seg[i]);
343 if (gec[i] > max) {
344 max = gec[i];
345 max_i = i;
346 }
347 }
348 if (max_i == -1)
349 return -EIO;
350 /* FIXME: Try older segments in case of error */
351 return logfs_read_segment(sb, super->s_journal_seg[max_i]);
352}
353
354/*
355 * First search the current segment (outer loop), then pick the next segment
356 * in the array, skipping any zero entries (inner loop).
357 */
358static void journal_get_free_segment(struct logfs_area *area)
359{
360 struct logfs_super *super = logfs_super(area->a_sb);
361 int i;
362
363 journal_for_each(i) {
364 if (area->a_segno != super->s_journal_seg[i])
365 continue;
366
367 do {
368 i++;
369 if (i == LOGFS_JOURNAL_SEGS)
370 i = 0;
371 } while (!super->s_journal_seg[i]);
372
373 area->a_segno = super->s_journal_seg[i];
374 area->a_erase_count = ++(super->s_journal_ec[i]);
375 log_journal("Journal now at %x (ec %x)\n", area->a_segno,
376 area->a_erase_count);
377 return;
378 }
379 BUG();
380}
381
382static void journal_get_erase_count(struct logfs_area *area)
383{
384 /* erase count is stored globally and incremented in
385 * journal_get_free_segment() - nothing to do here */
386}
387
388static int journal_erase_segment(struct logfs_area *area)
389{
390 struct super_block *sb = area->a_sb;
391 struct logfs_segment_header sh;
392 u64 ofs;
393 int err;
394
395 err = logfs_erase_segment(sb, area->a_segno, 1);
396 if (err)
397 return err;
398
399 sh.pad = 0;
400 sh.type = SEG_JOURNAL;
401 sh.level = 0;
402 sh.segno = cpu_to_be32(area->a_segno);
403 sh.ec = cpu_to_be32(area->a_erase_count);
404 sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
405 sh.crc = logfs_crc32(&sh, sizeof(sh), 4);
406
407 /* Enabling this would trigger a bug in segment.c, so leave it off for now. */
408 //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0);
409
410 ofs = dev_ofs(sb, area->a_segno, 0);
411 area->a_used_bytes = ALIGN(sizeof(sh), 16);
412 logfs_buf_write(area, ofs, &sh, sizeof(sh));
413 return 0;
414}
415
416static size_t __logfs_write_header(struct logfs_super *super,
417 struct logfs_journal_header *jh, size_t len, size_t datalen,
418 u16 type, u8 compr)
419{
420 jh->h_len = cpu_to_be16(len);
421 jh->h_type = cpu_to_be16(type);
422 jh->h_datalen = cpu_to_be16(datalen);
423 jh->h_compr = compr;
424 jh->h_pad[0] = 'H';
425 jh->h_pad[1] = 'E';
426 jh->h_pad[2] = 'A';
427 jh->h_pad[3] = 'D';
428 jh->h_pad[4] = 'R';
429 jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4);
430 return ALIGN(len, 16) + sizeof(*jh);
431}
432
433static size_t logfs_write_header(struct logfs_super *super,
434 struct logfs_journal_header *jh, size_t datalen, u16 type)
435{
436 size_t len = datalen;
437
438 return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE);
439}
440
441static inline size_t logfs_journal_erasecount_size(struct logfs_super *super)
442{
443 return LOGFS_JOURNAL_SEGS * sizeof(__be32);
444}
445
446static void *logfs_write_erasecount(struct super_block *sb, void *_ec,
447 u16 *type, size_t *len)
448{
449 struct logfs_super *super = logfs_super(sb);
450 struct logfs_je_journal_ec *ec = _ec;
451 int i;
452
453 journal_for_each(i)
454 ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]);
455 *type = JE_ERASECOUNT;
456 *len = logfs_journal_erasecount_size(super);
457 return ec;
458}
459
460static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore,
461 size_t ignore2)
462{
463 struct logfs_shadow *shadow = _shadow;
464 struct super_block *sb = (void *)_sb;
465 struct logfs_super *super = logfs_super(sb);
466
467 /* consume new space */
468 super->s_free_bytes -= shadow->new_len;
469 super->s_used_bytes += shadow->new_len;
470 super->s_dirty_used_bytes -= shadow->new_len;
471
472 /* free up old space */
473 super->s_free_bytes += shadow->old_len;
474 super->s_used_bytes -= shadow->old_len;
475 super->s_dirty_free_bytes -= shadow->old_len;
476
477 logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len);
478 logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len);
479
480 log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n",
481 shadow->ino, shadow->bix, shadow->gc_level,
482 shadow->old_ofs, shadow->new_ofs,
483 shadow->old_len, shadow->new_len);
484 mempool_free(shadow, super->s_shadow_pool);
485}
486
487static void account_shadows(struct super_block *sb)
488{
489 struct logfs_super *super = logfs_super(sb);
490 struct inode *inode = super->s_master_inode;
491 struct logfs_inode *li = logfs_inode(inode);
492 struct shadow_tree *tree = &super->s_shadow_tree;
493
494 btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow);
495 btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow);
496
497 if (li->li_block) {
498 /*
499 * We never actually use the structure, when attached to the
500 * master inode. But it is easier to always free it here than
501 * to have checks in several places elsewhere when allocating
502 * it.
503 */
504 li->li_block->ops->free_block(sb, li->li_block);
505 }
506 BUG_ON((s64)li->li_used_bytes < 0);
507}
508
509static void *__logfs_write_anchor(struct super_block *sb, void *_da,
510 u16 *type, size_t *len)
511{
512 struct logfs_super *super = logfs_super(sb);
513 struct logfs_je_anchor *da = _da;
514 struct inode *inode = super->s_master_inode;
515 struct logfs_inode *li = logfs_inode(inode);
516 int i;
517
518 da->da_height = li->li_height;
519 da->da_last_ino = cpu_to_be64(super->s_last_ino);
520 da->da_size = cpu_to_be64(i_size_read(inode));
521 da->da_used_bytes = cpu_to_be64(li->li_used_bytes);
522 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
523 da->da_data[i] = cpu_to_be64(li->li_data[i]);
524 *type = JE_ANCHOR;
525 *len = sizeof(*da);
526 return da;
527}
528
529static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb,
530 u16 *type, size_t *len)
531{
532 struct logfs_super *super = logfs_super(sb);
533 struct logfs_je_dynsb *dynsb = _dynsb;
534
535 dynsb->ds_gec = cpu_to_be64(super->s_gec);
536 dynsb->ds_sweeper = cpu_to_be64(super->s_sweeper);
537 dynsb->ds_victim_ino = cpu_to_be64(super->s_victim_ino);
538 dynsb->ds_rename_dir = cpu_to_be64(super->s_rename_dir);
539 dynsb->ds_rename_pos = cpu_to_be64(super->s_rename_pos);
540 dynsb->ds_used_bytes = cpu_to_be64(super->s_used_bytes);
541 dynsb->ds_generation = cpu_to_be32(super->s_generation);
542 *type = JE_DYNSB;
543 *len = sizeof(*dynsb);
544 return dynsb;
545}
546
547static void write_wbuf(struct super_block *sb, struct logfs_area *area,
548 void *wbuf)
549{
550 struct logfs_super *super = logfs_super(sb);
551 struct address_space *mapping = super->s_mapping_inode->i_mapping;
552 u64 ofs;
553 pgoff_t index;
554 int page_ofs;
555 struct page *page;
556
557 ofs = dev_ofs(sb, area->a_segno,
558 area->a_used_bytes & ~(super->s_writesize - 1));
559 index = ofs >> PAGE_SHIFT;
560 page_ofs = ofs & (PAGE_SIZE - 1);
561
562 page = find_lock_page(mapping, index);
563 BUG_ON(!page);
564 memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize);
565 unlock_page(page);
566}
567
568static void *logfs_write_area(struct super_block *sb, void *_a,
569 u16 *type, size_t *len)
570{
571 struct logfs_super *super = logfs_super(sb);
572 struct logfs_area *area = super->s_area[super->s_sum_index];
573 struct logfs_je_area *a = _a;
574
575 a->vim = VIM_DEFAULT;
576 a->gc_level = super->s_sum_index;
577 a->used_bytes = cpu_to_be32(area->a_used_bytes);
578 a->segno = cpu_to_be32(area->a_segno);
579 if (super->s_writesize > 1)
580 write_wbuf(sb, area, a + 1);
581
582 *type = JE_AREA;
583 *len = sizeof(*a) + super->s_writesize;
584 return a;
585}
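/*
 * Editor's note, inferred from read_area() above: on devices with
 * s_writesize > 1 the JE_AREA entry carries a copy of the open
 * segment's not-yet-flushed tail, captured by write_wbuf(). At mount
 * time read_area() hands that copy back to logfs_buf_recover(), so the
 * area can be reopened in the middle of a write-size unit.
 */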
586
587static void *logfs_write_commit(struct super_block *sb, void *h,
588 u16 *type, size_t *len)
589{
590 struct logfs_super *super = logfs_super(sb);
591
592 *type = JE_COMMIT;
593 *len = super->s_no_je * sizeof(__be64);
594 return super->s_je_array;
595}
596
597static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
598 size_t len)
599{
600 struct logfs_super *super = logfs_super(sb);
601 void *header = super->s_compressed_je;
602 void *data = header + sizeof(struct logfs_journal_header);
603 ssize_t compr_len, pad_len;
604 u8 compr = COMPR_ZLIB;
605
606 if (len == 0)
607 return logfs_write_header(super, header, 0, type);
608
609 compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
610 if (compr_len < 0 || type == JE_ANCHOR) {
611 BUG_ON(len > sb->s_blocksize);
612 memcpy(data, buf, len);
613 compr_len = len;
614 compr = COMPR_NONE;
615 }
616
617 pad_len = ALIGN(compr_len, 16);
618 memset(data + compr_len, 0, pad_len - compr_len);
619
620 return __logfs_write_header(super, header, compr_len, len, type, compr);
621}
622
623static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes,
624 int must_pad)
625{
626 u32 writesize = logfs_super(area->a_sb)->s_writesize;
627 s32 ofs;
628 int ret;
629
630 ret = logfs_open_area(area, *bytes);
631 if (ret)
632 return -EAGAIN;
633
634 ofs = area->a_used_bytes;
635 area->a_used_bytes += *bytes;
636
637 if (must_pad) {
638 area->a_used_bytes = ALIGN(area->a_used_bytes, writesize);
639 *bytes = area->a_used_bytes - ofs;
640 }
641
642 return dev_ofs(area->a_sb, area->a_segno, ofs);
643}
644
645static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type,
646 size_t buf_len)
647{
648 struct logfs_super *super = logfs_super(sb);
649 struct logfs_area *area = super->s_journal_area;
650 struct logfs_journal_header *jh = super->s_compressed_je;
651 size_t len;
652 int must_pad = 0;
653 s64 ofs;
654
655 len = __logfs_write_je(sb, buf, type, buf_len);
656 if (jh->h_type == cpu_to_be16(JE_COMMIT))
657 must_pad = 1;
658
659 ofs = logfs_get_free_bytes(area, &len, must_pad);
660 if (ofs < 0)
661 return ofs;
662 logfs_buf_write(area, ofs, super->s_compressed_je, len);
663 super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs);
664 return 0;
665}
666
667static int logfs_write_je(struct super_block *sb,
668 void* (*write)(struct super_block *sb, void *scratch,
669 u16 *type, size_t *len))
670{
671 void *buf;
672 size_t len;
673 u16 type;
674
675 buf = write(sb, logfs_super(sb)->s_je, &type, &len);
676 return logfs_write_je_buf(sb, buf, type, len);
677}
678
679int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
680 level_t level, int child_no, __be64 val)
681{
682 struct logfs_super *super = logfs_super(sb);
683 struct logfs_obj_alias *oa = super->s_je;
684 int err = 0, fill = super->s_je_fill;
685
686 log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n",
687 fill, ino, bix, level, child_no, be64_to_cpu(val));
688 oa[fill].ino = cpu_to_be64(ino);
689 oa[fill].bix = cpu_to_be64(bix);
690 oa[fill].val = val;
691 oa[fill].level = (__force u8)level;
692 oa[fill].child_no = cpu_to_be16(child_no);
693 fill++;
694 if (fill >= sb->s_blocksize / sizeof(*oa)) {
695 err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize);
696 fill = 0;
697 }
698
699 super->s_je_fill = fill;
700 return err;
701}
702
703static int logfs_write_obj_aliases(struct super_block *sb)
704{
705 struct logfs_super *super = logfs_super(sb);
706 int err;
707
708 log_journal("logfs_write_obj_aliases: %d aliases to write\n",
709 super->s_no_object_aliases);
710 super->s_je_fill = 0;
711 err = logfs_write_obj_aliases_pagecache(sb);
712 if (err)
713 return err;
714
715 if (super->s_je_fill)
716 err = logfs_write_je_buf(sb, super->s_je, JE_OBJ_ALIAS,
717 super->s_je_fill
718 * sizeof(struct logfs_obj_alias));
719 return err;
720}
721
722/*
723 * Write all journal entries. The goto logic ensures that all journal entries
724 * are written whenever a new segment is used. It is ugly and potentially a
725 * bit wasteful, but robustness is more important. With this we can *always*
726 * erase all journal segments except the one containing the most recent commit.
727 */
728void logfs_write_anchor(struct super_block *sb)
729{
730 struct logfs_super *super = logfs_super(sb);
731 struct logfs_area *area = super->s_journal_area;
732 int i, err;
733
734 if (!(super->s_flags & LOGFS_SB_FLAG_DIRTY))
735 return;
736 super->s_flags &= ~LOGFS_SB_FLAG_DIRTY;
737
738 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
739 mutex_lock(&super->s_journal_mutex);
740
741 /* Do this first or suffer corruption */
742 logfs_sync_segments(sb);
743 account_shadows(sb);
744
745again:
746 super->s_no_je = 0;
747 for_each_area(i) {
748 if (!super->s_area[i]->a_is_open)
749 continue;
750 super->s_sum_index = i;
751 err = logfs_write_je(sb, logfs_write_area);
752 if (err)
753 goto again;
754 }
755 err = logfs_write_obj_aliases(sb);
756 if (err)
757 goto again;
758 err = logfs_write_je(sb, logfs_write_erasecount);
759 if (err)
760 goto again;
761 err = logfs_write_je(sb, __logfs_write_anchor);
762 if (err)
763 goto again;
764 err = logfs_write_je(sb, logfs_write_dynsb);
765 if (err)
766 goto again;
767 /*
768 * Order is imperative. First we sync all writes, including the
769 * non-committed journal writes. Then we write the final commit and
770 * sync the current journal segment.
771 * There is a theoretical bug here. Syncing the journal segment will
772 * write a number of journal entries and the final commit. All these
773 * are written in a single operation. If the device layer writes the
774 * data back-to-front, the commit will precede the other journal
775 * entries, leaving a race window.
776 * Two fixes are possible. Preferred is to fix the device layer to
777 * ensure writes happen front-to-back. Alternatively we can insert
778 * another logfs_sync_area() super->s_devops->sync() combo before
779 * writing the commit.
780 */
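	/*
	 * Editor's sketch, not in the original patch: the alternative fix
	 * mentioned above would amount to inserting, right here,
	 *
	 *	logfs_sync_area(area);
	 *	super->s_devops->sync(sb);
	 *
	 * so all prior journal entries reach the medium before the commit
	 * entry below is written.
	 */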
781 /*
782 * On another subject, super->s_devops->sync is usually not necessary.
783 * Unless called from sys_sync or friends, a barrier would suffice.
784 */
785 super->s_devops->sync(sb);
786 err = logfs_write_je(sb, logfs_write_commit);
787 if (err)
788 goto again;
789 log_journal("Write commit to %llx\n",
790 be64_to_cpu(super->s_je_array[super->s_no_je - 1]));
791 logfs_sync_area(area);
792 BUG_ON(area->a_used_bytes != area->a_written_bytes);
793 super->s_devops->sync(sb);
794
795 mutex_unlock(&super->s_journal_mutex);
796 return;
797}
798
799void do_logfs_journal_wl_pass(struct super_block *sb)
800{
801 struct logfs_super *super = logfs_super(sb);
802 struct logfs_area *area = super->s_journal_area;
803 struct btree_head32 *head = &super->s_reserved_segments;
804 u32 segno, ec;
805 int i, err;
806
807 log_journal("Journal requires wear-leveling.\n");
808 /* Drop old segments */
809 journal_for_each(i)
810 if (super->s_journal_seg[i]) {
811 btree_remove32(head, super->s_journal_seg[i]);
812 logfs_set_segment_unreserved(sb,
813 super->s_journal_seg[i],
814 super->s_journal_ec[i]);
815 super->s_journal_seg[i] = 0;
816 super->s_journal_ec[i] = 0;
817 }
818 /* Get new segments */
819 for (i = 0; i < super->s_no_journal_segs; i++) {
820 segno = get_best_cand(sb, &super->s_reserve_list, &ec);
821 super->s_journal_seg[i] = segno;
822 super->s_journal_ec[i] = ec;
823 logfs_set_segment_reserved(sb, segno);
824 err = btree_insert32(head, segno, (void *)1, GFP_KERNEL);
825 BUG_ON(err); /* mempool should prevent this */
826 err = logfs_erase_segment(sb, segno, 1);
827 BUG_ON(err); /* FIXME: remount-ro would be nicer */
828 }
829 /* Manually move journal_area */
830 freeseg(sb, area->a_segno);
831 area->a_segno = super->s_journal_seg[0];
832 area->a_is_open = 0;
833 area->a_used_bytes = 0;
834 /* Write journal */
835 logfs_write_anchor(sb);
836 /* Write superblocks */
837 err = logfs_write_sb(sb);
838 BUG_ON(err);
839}
840
841static const struct logfs_area_ops journal_area_ops = {
842 .get_free_segment = journal_get_free_segment,
843 .get_erase_count = journal_get_erase_count,
844 .erase_segment = journal_erase_segment,
845};
846
847int logfs_init_journal(struct super_block *sb)
848{
849 struct logfs_super *super = logfs_super(sb);
850 size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
851 + MAX_JOURNAL_HEADER;
852 int ret = -ENOMEM;
853
854 mutex_init(&super->s_journal_mutex);
855 btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool);
856
857 super->s_je = kzalloc(bufsize, GFP_KERNEL);
858 if (!super->s_je)
859 return ret;
860
861 super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL);
862 if (!super->s_compressed_je)
863 return ret;
864
865 super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER);
866 if (IS_ERR(super->s_master_inode))
867 return PTR_ERR(super->s_master_inode);
868
869 ret = logfs_read_journal(sb);
870 if (ret)
871 return -EIO;
872
873 reserve_sb_and_journal(sb);
874 logfs_calc_free(sb);
875
876 super->s_journal_area->a_ops = &journal_area_ops;
877 return 0;
878}
879
880void logfs_cleanup_journal(struct super_block *sb)
881{
882 struct logfs_super *super = logfs_super(sb);
883
884 btree_grim_visitor32(&super->s_reserved_segments, 0, NULL);
885 destroy_meta_inode(super->s_master_inode);
886 super->s_master_inode = NULL;
887
888 kfree(super->s_compressed_je);
889 kfree(super->s_je);
890}
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
new file mode 100644
index 000000000000..b84b0eec6024
--- /dev/null
+++ b/fs/logfs/logfs.h
@@ -0,0 +1,725 @@
1/*
2 * fs/logfs/logfs.h
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Private header for logfs.
9 */
10#ifndef FS_LOGFS_LOGFS_H
11#define FS_LOGFS_LOGFS_H
12
13#undef __CHECK_ENDIAN__
14#define __CHECK_ENDIAN__
15
16#include <linux/btree.h>
17#include <linux/crc32.h>
18#include <linux/fs.h>
19#include <linux/kernel.h>
20#include <linux/mempool.h>
21#include <linux/pagemap.h>
22#include <linux/mtd/mtd.h>
23#include "logfs_abi.h"
24
25#define LOGFS_DEBUG_SUPER (0x0001)
26#define LOGFS_DEBUG_SEGMENT (0x0002)
27#define LOGFS_DEBUG_JOURNAL (0x0004)
28#define LOGFS_DEBUG_DIR (0x0008)
29#define LOGFS_DEBUG_FILE (0x0010)
30#define LOGFS_DEBUG_INODE (0x0020)
31#define LOGFS_DEBUG_READWRITE (0x0040)
32#define LOGFS_DEBUG_GC (0x0080)
33#define LOGFS_DEBUG_GC_NOISY (0x0100)
34#define LOGFS_DEBUG_ALIASES (0x0200)
35#define LOGFS_DEBUG_BLOCKMOVE (0x0400)
36#define LOGFS_DEBUG_ALL (0xffffffff)
37
38/* #define LOGFS_DEBUG (0x01) */
39/*
40 * To enable specific log messages, simply define LOGFS_DEBUG to match any
41 * or all of the above.
42 */
43#ifndef LOGFS_DEBUG
44#define LOGFS_DEBUG (0)
45#endif
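/*
 * Editor's example, illustrative only: to get journal and GC messages,
 * the definition above would read
 *
 *	#define LOGFS_DEBUG	(LOGFS_DEBUG_JOURNAL | LOGFS_DEBUG_GC)
 */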
46
47#define log_cond(cond, fmt, arg...) do { \
48 if (cond) \
49 printk(KERN_DEBUG fmt, ##arg); \
50} while (0)
51
52#define log_super(fmt, arg...) \
53 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg)
54#define log_segment(fmt, arg...) \
55 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg)
56#define log_journal(fmt, arg...) \
57 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg)
58#define log_dir(fmt, arg...) \
59 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg)
60#define log_file(fmt, arg...) \
61 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg)
62#define log_inode(fmt, arg...) \
63 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg)
64#define log_readwrite(fmt, arg...) \
65 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg)
66#define log_gc(fmt, arg...) \
67 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg)
68#define log_gc_noisy(fmt, arg...) \
69 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg)
70#define log_aliases(fmt, arg...) \
71 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg)
72#define log_blockmove(fmt, arg...) \
73 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg)
74
75#define PG_pre_locked PG_owner_priv_1
76#define PagePreLocked(page) test_bit(PG_pre_locked, &(page)->flags)
77#define SetPagePreLocked(page) set_bit(PG_pre_locked, &(page)->flags)
78#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags)
79
80/* FIXME: This should really be somewhere in the 64bit area. */
81#define LOGFS_LINK_MAX (1<<30)
82
83/* Read-only filesystem */
84#define LOGFS_SB_FLAG_RO 0x0001
85#define LOGFS_SB_FLAG_DIRTY 0x0002
86#define LOGFS_SB_FLAG_OBJ_ALIAS 0x0004
87#define LOGFS_SB_FLAG_SHUTDOWN 0x0008
88
89/* Write Control Flags */
90#define WF_LOCK 0x01 /* take write lock */
91#define WF_WRITE 0x02 /* write block */
92#define WF_DELETE 0x04 /* delete old block */
93
94typedef u8 __bitwise level_t;
95typedef u8 __bitwise gc_level_t;
96
97#define LEVEL(level) ((__force level_t)(level))
98#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level))
99
100#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)), \
101 (__force level_t)((__force u8)(level) - 1) )
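/*
 * Editor's note, an assumption about intent: the dummy comparison with
 * LEVEL(1) compiles to nothing; it exists so that sparse's __bitwise
 * checking complains when a caller passes a plain integer instead of a
 * level_t.
 */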
102
103/**
104 * struct logfs_area - area management information
105 *
106 * @a_sb: the superblock this area belongs to
107 * @a_is_open: 1 if the area is currently open, else 0
108 * @a_segno: segment number of area
109 * @a_written_bytes: number of bytes already written back
110 * @a_used_bytes: number of used bytes
111 * @a_ops: area operations (either journal or ostore)
112 * @a_erase_count: erase count
113 * @a_level: GC level
114 */
115struct logfs_area { /* a segment open for writing */
116 struct super_block *a_sb;
117 int a_is_open;
118 u32 a_segno;
119 u32 a_written_bytes;
120 u32 a_used_bytes;
121 const struct logfs_area_ops *a_ops;
122 u32 a_erase_count;
123 gc_level_t a_level;
124};
125
126/**
127 * struct logfs_area_ops - area operations
128 *
129 * @get_free_segment: fill area->a_segno with the number of a free segment
130 * @get_erase_count: fill area->a_erase_count (needs area->a_segno)
131 * @erase_segment: erase and setup segment
132 */
133struct logfs_area_ops {
134 void (*get_free_segment)(struct logfs_area *area);
135 void (*get_erase_count)(struct logfs_area *area);
136 int (*erase_segment)(struct logfs_area *area);
137};
138
139/**
140 * struct logfs_device_ops - device access operations
141 *
142 * @find_first_sb/@find_last_sb: locate the first/last superblock
143 * @write_sb: write back a superblock page
144 * @readpage: read one page (mm page)
145 * @writeseg: write one segment; may be a partial segment
146 * @erase: erase part of the device
147 */
148struct logfs_device_ops {
149 struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
150 struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs);
151 int (*write_sb)(struct super_block *sb, struct page *page);
152 int (*readpage)(void *_sb, struct page *page);
153 void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
154 int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
155 int ensure_write);
156 void (*sync)(struct super_block *sb);
157 void (*put_device)(struct super_block *sb);
158};
159
160/**
161 * struct candidate_list - list of similar candidates
162 */
163struct candidate_list {
164 struct rb_root rb_tree;
165 int count;
166 int maxcount;
167 int sort_by_ec;
168};
169
170/**
171 * struct gc_candidate - "candidate" segment to be garbage collected next
172 *
173 * @list: list (either free or low)
174 * @segno: segment number
175 * @valid: number of valid bytes
176 * @erase_count: erase count of segment
177 * @dist: distance from tree root
178 *
179 * Candidates can be on two lists. The free list contains electees rather
180 * than candidates - segments that no longer contain any valid data. The
181 * low list contains candidates to be picked for GC. It should be kept
182 * short. It is not required to always pick a perfect candidate. In the
183 * worst case GC will have to move more data than absolutely necessary.
184 */
185struct gc_candidate {
186 struct rb_node rb_node;
187 struct candidate_list *list;
188 u32 segno;
189 u32 valid;
190 u32 erase_count;
191 u8 dist;
192};
193
194/**
195 * struct logfs_journal_entry - temporary structure used during journal scan
196 *
197 * @used: nonzero if this entry is in use
198 * @version: normalized version
199 * @len: length
200 * @offset: offset
201 */
202struct logfs_journal_entry {
203 int used;
204 s16 version;
205 u16 len;
206 u16 datalen;
207 u64 offset;
208};
209
210enum transaction_state {
211 CREATE_1 = 1,
212 CREATE_2,
213 UNLINK_1,
214 UNLINK_2,
215 CROSS_RENAME_1,
216 CROSS_RENAME_2,
217 TARGET_RENAME_1,
218 TARGET_RENAME_2,
219 TARGET_RENAME_3
220};
221
222/**
223 * struct logfs_transaction - essential fields to support atomic dirops
224 *
225 * @ino: target inode
226 * @dir: inode of directory containing dentry
227 * @pos: pos of dentry in directory
228 */
229struct logfs_transaction {
230 enum transaction_state state;
231 u64 ino;
232 u64 dir;
233 u64 pos;
234};
235
236/**
237 * struct logfs_shadow - old block in the shadow of a not-yet-committed new one
238 * @old_ofs: offset of old block on medium
239 * @new_ofs: offset of new block on medium
240 * @ino: inode number
241 * @bix: block index
242 * @old_len: size of old block, including header
243 * @new_len: size of new block, including header
244 * @gc_level: GC level of the block
245 */
246struct logfs_shadow {
247 u64 old_ofs;
248 u64 new_ofs;
249 u64 ino;
250 u64 bix;
251 int old_len;
252 int new_len;
253 gc_level_t gc_level;
254};
255
256/**
257 * struct shadow_tree
258 * @new: shadows where old_ofs==0, indexed by new_ofs
259 * @old: shadows where old_ofs!=0, indexed by old_ofs
260 */
261struct shadow_tree {
262 struct btree_head64 new;
263 struct btree_head64 old;
264};
265
266struct object_alias_item {
267 struct list_head list;
268 __be64 val;
269 int child_no;
270};
271
272/**
273 * struct logfs_block - contains any block state
274 * @type: indirect block or inode
275 * @full: number of fully populated children
276 * @partial: number of partially populated children
277 *
278 * Most blocks are directly represented by page cache pages. But when a block
279 * becomes dirty, is part of a transaction, contains aliases or is otherwise
280 * special, a struct logfs_block is allocated to track the additional state.
281 * Inodes are very similar to indirect blocks, so they can also get one of
282 * these structures added when appropriate.
283 */
284#define BLOCK_INDIRECT 1 /* Indirect block */
285#define BLOCK_INODE 2 /* Inode */
286struct logfs_block_ops;
287struct logfs_block {
288 struct list_head alias_list;
289 struct list_head item_list;
290 struct super_block *sb;
291 u64 ino;
292 u64 bix;
293 level_t level;
294 struct page *page;
295 struct inode *inode;
296 struct logfs_transaction *ta;
297 unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG];
298 struct logfs_block_ops *ops;
299 int full;
300 int partial;
301 int reserved_bytes;
302};
303
304typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
305 level_t level, int child_no, __be64 val);
306struct logfs_block_ops {
307 void (*write_block)(struct logfs_block *block);
308 gc_level_t (*block_level)(struct logfs_block *block);
309 void (*free_block)(struct super_block *sb, struct logfs_block *block);
310 int (*write_alias)(struct super_block *sb,
311 struct logfs_block *block,
312 write_alias_t *write_one_alias);
313};
314
315struct logfs_super {
316 struct mtd_info *s_mtd; /* underlying device */
317 struct block_device *s_bdev; /* underlying device */
318 const struct logfs_device_ops *s_devops;/* device access */
319 struct inode *s_master_inode; /* inode file */
320 struct inode *s_segfile_inode; /* segment file */
321 struct inode *s_mapping_inode; /* device mapping */
322 atomic_t s_pending_writes; /* outstanding bios */
323 long s_flags;
324 mempool_t *s_btree_pool; /* for btree nodes */
325 mempool_t *s_alias_pool; /* aliases in segment.c */
326 u64 s_feature_incompat;
327 u64 s_feature_ro_compat;
328 u64 s_feature_compat;
329 u64 s_feature_flags;
330 u64 s_sb_ofs[2];
331 struct page *s_erase_page; /* for dev_bdev.c */
332 /* alias.c fields */
333 struct btree_head32 s_segment_alias; /* remapped segments */
334 int s_no_object_aliases;
335 struct list_head s_object_alias; /* remapped objects */
336 struct btree_head128 s_object_alias_tree; /* remapped objects */
337 struct mutex s_object_alias_mutex;
338 /* dir.c fields */
339 struct mutex s_dirop_mutex; /* for creat/unlink/rename */
340 u64 s_victim_ino; /* used for atomic dir-ops */
341 u64 s_rename_dir; /* source directory ino */
342 u64 s_rename_pos; /* position of source dd */
343 /* gc.c fields */
344 long s_segsize; /* size of a segment */
345 int s_segshift; /* log2 of segment size */
346 long s_segmask; /* (1 << s_segshift) - 1 */
347 long s_no_segs; /* segments on device */
348 long s_no_journal_segs; /* segments used for journal */
349 long s_no_blocks; /* blocks per segment */
350 long s_writesize; /* minimum write size */
351 int s_writeshift; /* log2 of write size */
352 u64 s_size; /* filesystem size */
353 struct logfs_area *s_area[LOGFS_NO_AREAS]; /* open segment array */
354 u64 s_gec; /* global erase count */
355 u64 s_wl_gec_ostore; /* time of last wl event */
356 u64 s_wl_gec_journal; /* time of last wl event */
357 u64 s_sweeper; /* current sweeper pos */
358 u8 s_ifile_levels; /* max level of ifile */
359 u8 s_iblock_levels; /* max level of regular files */
360 u8 s_data_levels; /* # of segments to leaf block*/
361 u8 s_total_levels; /* sum of above three */
362 struct btree_head32 s_cand_tree; /* all candidates */
363 struct candidate_list s_free_list; /* 100% free segments */
364 struct candidate_list s_reserve_list; /* Bad segment reserve */
365 struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */
366 struct candidate_list s_ec_list; /* wear level candidates */
367 struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc. */
368 /* inode.c fields */
369 u64 s_last_ino; /* highest ino used */
370 long s_inos_till_wrap;
371 u32 s_generation; /* i_generation for new files */
372 struct list_head s_freeing_list; /* inodes being freed */
373 /* journal.c fields */
374 struct mutex s_journal_mutex;
375 void *s_je; /* journal entry to compress */
376 void *s_compressed_je; /* block to write to journal */
377 u32 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */
378 u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */
379 u64 s_last_version;
380 struct logfs_area *s_journal_area; /* open journal segment */
381 __be64 s_je_array[64];
382 int s_no_je;
383
384 int s_sum_index; /* for the 12 summaries */
385 struct shadow_tree s_shadow_tree;
386 int s_je_fill; /* index of current je */
387 /* readwrite.c fields */
388 struct mutex s_write_mutex;
389 int s_lock_count;
390 mempool_t *s_block_pool; /* struct logfs_block pool */
391 mempool_t *s_shadow_pool; /* struct logfs_shadow pool */
392 /*
393 * Space accounting:
394 * - s_used_bytes specifies space used to store valid data objects.
395 * - s_dirty_used_bytes is space used to store non-committed data
396 * objects. Those objects have already been written themselves,
397 * but they don't become valid until all indirect blocks up to the
398 * journal have been written as well.
399 * - s_dirty_free_bytes is space used to store the old copy of a
400 * replaced object, as long as the replacement is non-committed.
401 * In other words, it is the amount of space freed when all dirty
402 * blocks are written back.
403 * - s_free_bytes is the amount of free space available for any
404 * purpose.
405 * - s_root_reserve is the amount of free space available only to
406 * the root user. Non-privileged users can no longer write once
407 * this watermark has been reached.
408 * - s_speed_reserve is space which remains unused to speed up
409 * garbage collection performance.
410 * - s_dirty_pages is the space reserved for currently dirty pages.
411 * It is a pessimistic estimate, so some/most will get freed on
412 * page writeback.
413 *
414 * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size
415 */
416 u64 s_free_bytes;
417 u64 s_used_bytes;
418 u64 s_dirty_free_bytes;
419 u64 s_dirty_used_bytes;
420 u64 s_root_reserve;
421 u64 s_speed_reserve;
422 u64 s_dirty_pages;
423 /* Bad block handling:
424 * - s_bad_seg_reserve is a number of segments usually kept
425 * free. When encountering bad blocks, the affected segment's data
426 * is _temporarily_ moved to a reserved segment.
427 * - s_bad_segments is the number of known bad segments.
428 */
429 u32 s_bad_seg_reserve;
430 u32 s_bad_segments;
431};
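/*
 * Editor's sketch, not part of the original patch: the accounting
 * invariant documented inside struct logfs_super, expressed as a debug
 * check. The total_usable parameter is hypothetical; a caller would
 * pass in the usable size computed at mount time.
 */
static inline void logfs_check_space(struct logfs_super *super,
		u64 total_usable)
{
	WARN_ON(super->s_used_bytes + super->s_free_bytes +
			super->s_speed_reserve != total_usable);
}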
432
433/**
434 * struct logfs_inode - in-memory inode
435 *
436 * @vfs_inode: struct inode
437 * @li_data: data pointers
438 * @li_used_bytes: number of used bytes
439 * @li_freeing_list: used to track inodes currently being freed
440 * @li_flags: inode flags
441 * @li_refcount: number of internal (GC-induced) references
442 */
443struct logfs_inode {
444 struct inode vfs_inode;
445 u64 li_data[LOGFS_EMBEDDED_FIELDS];
446 u64 li_used_bytes;
447 struct list_head li_freeing_list;
448 struct logfs_block *li_block;
449 u32 li_flags;
450 u8 li_height;
451 int li_refcount;
452};
453
454#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++)
455#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++)
456#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--)
457
458/* compr.c */
459int logfs_compress(void *in, void *out, size_t inlen, size_t outlen);
460int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen);
461int __init logfs_compr_init(void);
462void logfs_compr_exit(void);
463
464/* dev_bdev.c */
465#ifdef CONFIG_BLOCK
466int logfs_get_sb_bdev(struct file_system_type *type, int flags,
467 const char *devname, struct vfsmount *mnt);
468#else
469static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
470 const char *devname, struct vfsmount *mnt)
471{
472 return -ENODEV;
473}
474#endif
475
476/* dev_mtd.c */
477#ifdef CONFIG_MTD
478int logfs_get_sb_mtd(struct file_system_type *type, int flags,
479 int mtdnr, struct vfsmount *mnt);
480#else
481static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags,
482 int mtdnr, struct vfsmount *mnt)
483{
484 return -ENODEV;
485}
486#endif
487
488/* dir.c */
489extern const struct inode_operations logfs_symlink_iops;
490extern const struct inode_operations logfs_dir_iops;
491extern const struct file_operations logfs_dir_fops;
492int logfs_replay_journal(struct super_block *sb);
493
494/* file.c */
495extern const struct inode_operations logfs_reg_iops;
496extern const struct file_operations logfs_reg_fops;
497extern const struct address_space_operations logfs_reg_aops;
498int logfs_readpage(struct file *file, struct page *page);
499int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
500 unsigned long arg);
501int logfs_fsync(struct file *file, struct dentry *dentry, int datasync);
502
503/* gc.c */
504u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
505void logfs_gc_pass(struct super_block *sb);
506int logfs_check_areas(struct super_block *sb);
507int logfs_init_gc(struct super_block *sb);
508void logfs_cleanup_gc(struct super_block *sb);
509
510/* inode.c */
511extern const struct super_operations logfs_super_operations;
512struct inode *logfs_iget(struct super_block *sb, ino_t ino);
513struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie);
514void logfs_safe_iput(struct inode *inode, int cookie);
515struct inode *logfs_new_inode(struct inode *dir, int mode);
516struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino);
517struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino);
518int logfs_init_inode_cache(void);
519void logfs_destroy_inode_cache(void);
520void destroy_meta_inode(struct inode *inode);
521void logfs_set_blocks(struct inode *inode, u64 no);
522/* these logically belong into inode.c but actually reside in readwrite.c */
523int logfs_read_inode(struct inode *inode);
524int __logfs_write_inode(struct inode *inode, long flags);
525void logfs_delete_inode(struct inode *inode);
526void logfs_clear_inode(struct inode *inode);
527
528/* journal.c */
529void logfs_write_anchor(struct super_block *sb);
530int logfs_init_journal(struct super_block *sb);
531void logfs_cleanup_journal(struct super_block *sb);
532int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
533 level_t level, int child_no, __be64 val);
534void do_logfs_journal_wl_pass(struct super_block *sb);
535
536/* readwrite.c */
537pgoff_t logfs_pack_index(u64 bix, level_t level);
538void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level);
539int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
540 loff_t bix, long flags, struct shadow_tree *shadow_tree);
541int logfs_readpage_nolock(struct page *page);
542int logfs_write_buf(struct inode *inode, struct page *page, long flags);
543int logfs_delete(struct inode *inode, pgoff_t index,
544 struct shadow_tree *shadow_tree);
545int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
546 gc_level_t gc_level, long flags);
547int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
548 gc_level_t gc_level);
549int logfs_truncate(struct inode *inode, u64 size);
550u64 logfs_seek_hole(struct inode *inode, u64 bix);
551u64 logfs_seek_data(struct inode *inode, u64 bix);
552int logfs_open_segfile(struct super_block *sb);
553int logfs_init_rw(struct super_block *sb);
554void logfs_cleanup_rw(struct super_block *sb);
555void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta);
556void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta);
557void logfs_write_block(struct logfs_block *block, long flags);
558int logfs_write_obj_aliases_pagecache(struct super_block *sb);
559void logfs_get_segment_entry(struct super_block *sb, u32 segno,
560 struct logfs_segment_entry *se);
561void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment);
562void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
563 gc_level_t gc_level);
564void logfs_set_segment_reserved(struct super_block *sb, u32 segno);
565void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec);
566struct logfs_block *__alloc_block(struct super_block *sb,
567 u64 ino, u64 bix, level_t level);
568void __free_block(struct super_block *sb, struct logfs_block *block);
569void btree_write_block(struct logfs_block *block);
570void initialize_block_counters(struct page *page, struct logfs_block *block,
571 __be64 *array, int page_is_empty);
572int logfs_exist_block(struct inode *inode, u64 bix);
573int get_page_reserve(struct inode *inode, struct page *page);
574extern struct logfs_block_ops indirect_block_ops;
575
576/* segment.c */
577int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase);
578int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf);
579int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix,
580 level_t level);
581int logfs_segment_write(struct inode *inode, struct page *page,
582 struct logfs_shadow *shadow);
583int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow);
584int logfs_load_object_aliases(struct super_block *sb,
585 struct logfs_obj_alias *oa, int count);
586void move_page_to_btree(struct page *page);
587int logfs_init_mapping(struct super_block *sb);
588void logfs_sync_area(struct logfs_area *area);
589void logfs_sync_segments(struct super_block *sb);
590void freeseg(struct super_block *sb, u32 segno);
591
592/* area handling */
593int logfs_init_areas(struct super_block *sb);
594void logfs_cleanup_areas(struct super_block *sb);
595int logfs_open_area(struct logfs_area *area, size_t bytes);
596void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
597 int use_filler);
598
599static inline void logfs_buf_write(struct logfs_area *area, u64 ofs,
600 void *buf, size_t len)
601{
602 __logfs_buf_write(area, ofs, buf, len, 0);
603}
604
605static inline void logfs_buf_recover(struct logfs_area *area, u64 ofs,
606 void *buf, size_t len)
607{
608 __logfs_buf_write(area, ofs, buf, len, 1);
609}
610
611/* super.c */
612struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index);
613void emergency_read_end(struct page *page);
614void logfs_crash_dump(struct super_block *sb);
615void *memchr_inv(const void *s, int c, size_t n);
616int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
617int logfs_get_sb_device(struct file_system_type *type, int flags,
618 struct mtd_info *mtd, struct block_device *bdev,
619 const struct logfs_device_ops *devops, struct vfsmount *mnt);
620int logfs_check_ds(struct logfs_disk_super *ds);
621int logfs_write_sb(struct super_block *sb);
622
623static inline struct logfs_super *logfs_super(struct super_block *sb)
624{
625 return sb->s_fs_info;
626}
627
628static inline struct logfs_inode *logfs_inode(struct inode *inode)
629{
630 return container_of(inode, struct logfs_inode, vfs_inode);
631}
632
633static inline void logfs_set_ro(struct super_block *sb)
634{
635 logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO;
636}
637
638#define LOGFS_BUG(sb) do { \
639 struct super_block *__sb = sb; \
640 logfs_crash_dump(__sb); \
641 logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO; \
642 BUG(); \
643} while (0)
644
645#define LOGFS_BUG_ON(condition, sb) \
646 do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0)
647
648static inline __be32 logfs_crc32(void *data, size_t len, size_t skip)
649{
650 return cpu_to_be32(crc32(~0, data+skip, len-skip));
651}
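
/*
 * A hedged usage sketch: the skip parameter lets a header checksum
 * itself.  For a structure whose crc field occupies the first four
 * bytes, the stored value could be computed as
 *
 *	header->crc = logfs_crc32(header, sizeof(*header), 4);
 *
 * covering everything after the crc field, in the spirit of the
 * "crc32 of structure starting with the next field" convention used
 * by struct logfs_disk_super.
 */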
652
653static inline u8 logfs_type(struct inode *inode)
654{
655 return (inode->i_mode >> 12) & 15;
656}
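
/*
 * Worked example: logfs_type() extracts the file type nibble that the
 * S_IFMT macros encode in bits 12..15 of i_mode.  A regular file has
 * S_IFREG == 0x8000, so (0x8000 >> 12) & 15 == 8 == DT_REG; a directory
 * has S_IFDIR == 0x4000, yielding 4 == DT_DIR.
 */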
657
658static inline pgoff_t logfs_index(struct super_block *sb, u64 pos)
659{
660 return pos >> sb->s_blocksize_bits;
661}
662
663static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs)
664{
665 return ((u64)segno << logfs_super(sb)->s_segshift) + ofs;
666}
667
668static inline u32 seg_no(struct super_block *sb, u64 ofs)
669{
670 return ofs >> logfs_super(sb)->s_segshift;
671}
672
673static inline u32 seg_ofs(struct super_block *sb, u64 ofs)
674{
675 return ofs & logfs_super(sb)->s_segmask;
676}
677
678static inline u64 seg_align(struct super_block *sb, u64 ofs)
679{
680 return ofs & ~logfs_super(sb)->s_segmask;
681}
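
/*
 * Worked example for the four helpers above, assuming 128KiB segments
 * (s_segshift == 17, s_segmask == 0x1ffff; the real values come from
 * ds_segment_shift in the superblock): for segno 5 and in-segment
 * offset 0x100, dev_ofs() returns (5 << 17) + 0x100 == 0xa0100.
 * Feeding that back, seg_no() recovers 5, seg_ofs() recovers 0x100 and
 * seg_align() returns 0xa0000, the start of the segment.
 */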
682
683static inline struct logfs_block *logfs_block(struct page *page)
684{
685 return (void *)page->private;
686}
687
688static inline level_t shrink_level(gc_level_t __level)
689{
690 u8 level = (__force u8)__level;
691
692 if (level >= LOGFS_MAX_LEVELS)
693 level -= LOGFS_MAX_LEVELS;
694 return (__force level_t)level;
695}
696
697static inline gc_level_t expand_level(u64 ino, level_t __level)
698{
699 u8 level = (__force u8)__level;
700
701 if (ino == LOGFS_INO_MASTER) {
702		/* ifile has separate areas */
703 level += LOGFS_MAX_LEVELS;
704 }
705 return (__force gc_level_t)level;
706}
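
/*
 * Example of the two level spaces: a regular file's i3 indirect block
 * has level_t 3 and expand_level() keeps it at gc_level_t 3.  The same
 * block in the ifile (ino == LOGFS_INO_MASTER) lives in the separate
 * ifile areas, so expand_level() returns 3 + LOGFS_MAX_LEVELS == 9 and
 * shrink_level() maps 9 back to level_t 3.
 */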
707
708static inline int logfs_block_shift(struct super_block *sb, level_t level)
709{
710 level = shrink_level((__force gc_level_t)level);
711 return (__force int)level * (sb->s_blocksize_bits - 3);
712}
713
714static inline u64 logfs_block_mask(struct super_block *sb, level_t level)
715{
716 return ~0ull << logfs_block_shift(sb, level);
717}
718
719static inline struct logfs_area *get_area(struct super_block *sb,
720 gc_level_t gc_level)
721{
722 return logfs_super(sb)->s_area[(__force u8)gc_level];
723}
724
725#endif
diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h
new file mode 100644
index 000000000000..f674725663fe
--- /dev/null
+++ b/fs/logfs/logfs_abi.h
@@ -0,0 +1,629 @@
1/*
2 * fs/logfs/logfs_abi.h
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Public header for logfs.
9 */
10#ifndef FS_LOGFS_LOGFS_ABI_H
11#define FS_LOGFS_LOGFS_ABI_H
12
13/* For out-of-kernel compiles */
14#ifndef BUILD_BUG_ON
15#define BUILD_BUG_ON(condition) /**/
16#endif
17
18#define SIZE_CHECK(type, size) \
19static inline void check_##type(void) \
20{ \
21 BUILD_BUG_ON(sizeof(struct type) != (size)); \
22}
23
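/*
 * SIZE_CHECK pins the on-medium layout at compile time.  For instance,
 * SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE) below
 * expands to
 *
 *	static inline void check_logfs_segment_header(void)
 *	{
 *		BUILD_BUG_ON(sizeof(struct logfs_segment_header)
 *				!= LOGFS_SEGMENT_HEADERSIZE);
 *	}
 *
 * (0x18 == 24 bytes), so a change that alters a structure's size breaks
 * the build instead of silently corrupting the medium format.
 */
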
24/*
25 * Throughout the logfs code, we're constantly dealing with blocks at
26 * various positions or offsets. To remove confusion, we strictly
27 * distinguish between a "position" - the logical position within a
28 * file and an "offset" - the physical location within the device.
29 *
30 * Any usage of the term offset for a logical location or position for
31 * a physical one is a bug and should get fixed.
32 */
33
34/*
35 * Blocks are allocated in one of several segments depending on their
36 * level. The following levels are used:
37 * 0 - regular data block
38 * 1 - i1 indirect blocks
39 * 2 - i2 indirect blocks
40 * 3 - i3 indirect blocks
41 * 4 - i4 indirect blocks
42 * 5 - i5 indirect blocks
43 * 6 - ifile data blocks
44 * 7 - ifile i1 indirect blocks
45 * 8 - ifile i2 indirect blocks
46 * 9 - ifile i3 indirect blocks
47 * 10 - ifile i4 indirect blocks
48 * 11 - ifile i5 indirect blocks
49 * Potential levels to be used in the future:
50 * 12 - gc recycled blocks, long-lived data
51 * 13 - replacement blocks, short-lived data
52 *
53 * Levels 1-11 are necessary for robust gc operations and help separate
54 * short-lived metadata from longer-lived file data. In the future,
55 * file data should get separated into several segments based on simple
56 * heuristics. Old data recycled during gc operation is expected to be
57 * long-lived. New data is of uncertain life expectancy. New data
58 * used to replace older blocks in existing files is expected to be
59 * short-lived.
60 */
61
62
63/* Magic numbers. 64bit for superblock, 32bit for statfs f_type */
64#define LOGFS_MAGIC 0x7a3a8e5cb9d5bf67ull
65#define LOGFS_MAGIC_U32 0xc97e8168u
66
67/*
68 * Various blocksize related macros. Blocksize is currently fixed at 4KiB.
69 * Sooner or later that should become configurable and the macros replaced
70 * by something superblock-dependent. Pointers in indirect blocks are and
71 * will remain 64bit.
72 *
73 * LOGFS_BLOCKSIZE	- self-explanatory
74 * LOGFS_BLOCK_FACTOR - number of pointers per indirect block
75 * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts
76 */
77#define LOGFS_BLOCKSIZE (4096ull)
78#define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64))
79#define LOGFS_BLOCK_BITS (9)
80
81/*
82 * Number of blocks at various levels of indirection. There are 16 direct
83 * block pointers plus a single indirect pointer.
84 */
85#define I0_BLOCKS (16)
86#define I1_BLOCKS LOGFS_BLOCK_FACTOR
87#define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS)
88#define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS)
89#define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS)
90#define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS)
91
92#define INDIRECT_INDEX I0_BLOCKS
93#define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1)
94
95/*
96 * Sizes at which files require another level of indirection. Files smaller
97 * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
98 * similar to ext2 fast symlinks.
99 *
100 * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
101 * direct pointers, else through the 1x indirect pointer and so forth.
102 */
103#define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64))
104#define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE)
105#define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE)
106#define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE)
107#define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE)
108#define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE)
109#define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE)
110
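/*
 * In numbers, with the 4KiB blocksize above (LOGFS_BLOCK_FACTOR == 512),
 * each level of indirection multiplies the reach by 512:
 *
 *	LOGFS_I0_SIZE =    16 * 4KiB =  64KiB	(direct pointers)
 *	LOGFS_I1_SIZE =   512 * 4KiB =   2MiB
 *	LOGFS_I2_SIZE = 512^2 * 4KiB =   1GiB
 *	LOGFS_I3_SIZE = 512^3 * 4KiB = 512GiB
 *	LOGFS_I4_SIZE = 512^4 * 4KiB = 256TiB
 *	LOGFS_I5_SIZE = 512^5 * 4KiB = 128PiB
 */
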
111/*
112 * Each indirect block pointer must have this flag set if all block pointers
113 * behind it are set, i.e. there is no hole hidden in the shadow of this
114 * indirect block pointer.
115 */
116#define LOGFS_FULLY_POPULATED (1ULL << 63)
117#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED)
118
119/*
120 * LogFS needs to separate data into levels. Each level is defined as the
121 * maximal possible distance from the master inode (inode of the inode file).
122 * Data blocks reside on level 0, 1x indirect block on level 1, etc.
123 * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
124 * This effort is necessary to guarantee garbage collection to always make
125 * progress.
126 *
127 * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks,
128 * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is
129 * the maximal number of levels for one file.
130 * LOGFS_NO_AREAS is twice that, as the inode file and regular files are
131 * effectively stacked on top of each other.
132 */
133#define LOGFS_MAX_INDIRECT (5)
134#define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1)
135#define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS)
136
137/* Maximum size of filenames */
138#define LOGFS_MAX_NAMELEN (255)
139
140/* Number of segments in the primary journal. */
141#define LOGFS_JOURNAL_SEGS (16)
142
143/* Maximum number of free/erased/etc. segments in journal entries */
144#define MAX_CACHED_SEGS (64)
145
146
147/*
148 * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store,
149 * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including
150 * its header,
151 * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for
152 * its segment header and the padded space at the end when no further objects
153 * fit.
154 */
155#define LOGFS_OBJECT_HEADERSIZE (0x1c)
156#define LOGFS_SEGMENT_HEADERSIZE (0x18)
157#define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
158#define LOGFS_SEGMENT_RESERVE \
159 (LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
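
/*
 * In numbers: LOGFS_MAX_OBJECTSIZE == 0x1c + 4096 == 4124 bytes and
 * LOGFS_SEGMENT_RESERVE == 0x18 + 4124 - 1 == 4147 bytes.  The "- 1"
 * reflects that the padded tail of a segment can be at most one byte
 * short of a full maximum-sized object - anything larger would still
 * have fit.
 */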
160
161/*
162 * Segment types:
163 * SEG_SUPER	- segment containing a superblock copy
164 * SEG_JOURNAL	- journal segment
165 * SEG_OSTORE	- object store segment
166 */
167enum {
168 SEG_SUPER = 0x01,
169 SEG_JOURNAL = 0x02,
170 SEG_OSTORE = 0x03,
171};
172
173/**
174 * struct logfs_segment_header - per-segment header in the ostore
175 *
176 * @crc: crc32 of header (there is no data)
177 * @pad: unused, must be 0
178 * @type: segment type, see above
179 * @level: GC level for all objects in this segment
180 * @segno: segment number
181 * @ec: erase count for this segment
182 * @gec: global erase count at time of writing
183 */
184struct logfs_segment_header {
185 __be32 crc;
186 __be16 pad;
187 __u8 type;
188 __u8 level;
189 __be32 segno;
190 __be32 ec;
191 __be64 gec;
192};
193
194SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
195
196#define LOGFS_FEATURES_INCOMPAT (0ull)
197#define LOGFS_FEATURES_RO_COMPAT (0ull)
198#define LOGFS_FEATURES_COMPAT (0ull)
199
200/**
201 * struct logfs_disk_super - on-medium superblock
202 *
203 * @ds_magic: magic number, must equal LOGFS_MAGIC
204 * @ds_crc: crc32 of structure starting with the next field
205 * @ds_ifile_levels: maximum number of levels for ifile
206 * @ds_iblock_levels: maximum number of levels for regular files
207 * @ds_data_levels:		number of separate levels for data
208 * @pad0: reserved, must be 0
209 * @ds_feature_incompat: incompatible filesystem features
210 * @ds_feature_ro_compat: read-only compatible filesystem features
211 * @ds_feature_compat: compatible filesystem features
212 * @ds_feature_flags:		feature flags
213 * @ds_segment_shift: log2 of segment size
214 * @ds_block_shift: log2 of block size
215 * @ds_write_shift: log2 of write size
216 * @ds_filesystem_size:	size of the filesystem
 * @ds_segment_size:		segment size in bytes
217 * @ds_journal_seg: segments used by primary journal
218 * @ds_root_reserve: bytes reserved for the superuser
219 * @ds_speed_reserve: bytes reserved to speed up GC
220 * @ds_bad_seg_reserve: number of segments reserved to handle bad blocks
221 * @ds_super_ofs:		device offsets of the two superblock copies
222 * @pad3: reserved, must be 0
223 *
224 * Contains only read-only fields. Read-write fields like the amount of
225 * used space are tracked in the dynamic superblock, stored in the journal.
226 */
227struct logfs_disk_super {
228 struct logfs_segment_header ds_sh;
229 __be64 ds_magic;
230
231 __be32 ds_crc;
232 __u8 ds_ifile_levels;
233 __u8 ds_iblock_levels;
234 __u8 ds_data_levels;
235 __u8 ds_segment_shift;
236 __u8 ds_block_shift;
237 __u8 ds_write_shift;
238 __u8 pad0[6];
239
240 __be64 ds_filesystem_size;
241 __be32 ds_segment_size;
242 __be32 ds_bad_seg_reserve;
243
244 __be64 ds_feature_incompat;
245 __be64 ds_feature_ro_compat;
246
247 __be64 ds_feature_compat;
248 __be64 ds_feature_flags;
249
250 __be64 ds_root_reserve;
251 __be64 ds_speed_reserve;
252
253 __be32 ds_journal_seg[LOGFS_JOURNAL_SEGS];
254
255 __be64 ds_super_ofs[2];
256 __be64 pad3[8];
257};
258
259SIZE_CHECK(logfs_disk_super, 256);
260
261/*
262 * Object types:
263 * OBJ_BLOCK - Data or indirect block
264 * OBJ_INODE - Inode
265 * OBJ_DENTRY - Dentry
266 */
267enum {
268 OBJ_BLOCK = 0x04,
269 OBJ_INODE = 0x05,
270 OBJ_DENTRY = 0x06,
271};
272
273/**
274 * struct logfs_object_header - per-object header in the ostore
275 *
276 * @crc: crc32 of header, excluding data_crc
277 * @len: length of data
278 * @type: object type, see above
279 * @compr: compression type
280 * @ino: inode number
281 * @bix: block index
282 * @data_crc: crc32 of payload
283 */
284struct logfs_object_header {
285 __be32 crc;
286 __be16 len;
287 __u8 type;
288 __u8 compr;
289 __be64 ino;
290 __be64 bix;
291 __be32 data_crc;
292} __attribute__((packed));
293
294SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE);
295
296/*
297 * Reserved inode numbers:
 * LOGFS_INO_MAPPING - meta inode backing the device's page cache
298 * LOGFS_INO_MASTER - master inode (for inode file)
299 * LOGFS_INO_ROOT - root directory
300 * LOGFS_INO_SEGFILE - per-segment used bytes and erase count
301 */
302enum {
303 LOGFS_INO_MAPPING = 0x00,
304 LOGFS_INO_MASTER = 0x01,
305 LOGFS_INO_ROOT = 0x02,
306 LOGFS_INO_SEGFILE = 0x03,
307 LOGFS_RESERVED_INOS = 0x10,
308};
309
310/*
311 * Inode flags. High bits should never be written to the medium. They are
312 * reserved for in-memory usage.
313 * Low bits should either remain in sync with the corresponding FS_*_FL or
314 * reuse slots that obviously don't make sense for logfs.
315 *
316 * LOGFS_IF_DIRTY Inode must be written back
317 * LOGFS_IF_ZOMBIE Inode has been deleted
318 * LOGFS_IF_STILLBORN -ENOSPC happened when creating inode
319 */
320#define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */
321#define LOGFS_IF_DIRTY 0x20000000
322#define LOGFS_IF_ZOMBIE 0x40000000
323#define LOGFS_IF_STILLBORN 0x80000000
324
325/* Flags available to chattr */
326#define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED)
327#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED)
328/* Flags inherited from parent directory on file/directory creation */
329#define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED)
330
331/**
332 * struct logfs_disk_inode - on-medium inode
333 *
334 * @di_mode:	file mode
 * @di_height:	height of the data tree (number of indirection levels)
335 * @di_pad:	reserved, must be 0
336 * @di_flags:	inode flags, see above
337 * @di_uid:	user id
338 * @di_gid:	group id
339 * @di_ctime:	change time
340 * @di_mtime:	modify time
 * @di_atime:	access time
341 * @di_refcount: reference count (aka nlink or link count)
342 * @di_generation: inode generation, for nfs
343 * @di_used_bytes: number of bytes used
344 * @di_size: file size
345 * @di_data: data pointers
346 */
347struct logfs_disk_inode {
348 __be16 di_mode;
349 __u8 di_height;
350 __u8 di_pad;
351 __be32 di_flags;
352 __be32 di_uid;
353 __be32 di_gid;
354
355 __be64 di_ctime;
356 __be64 di_mtime;
357
358 __be64 di_atime;
359 __be32 di_refcount;
360 __be32 di_generation;
361
362 __be64 di_used_bytes;
363 __be64 di_size;
364
365 __be64 di_data[LOGFS_EMBEDDED_FIELDS];
366};
367
368SIZE_CHECK(logfs_disk_inode, 200);
369
370#define INODE_POINTER_OFS \
371 (offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64))
372#define INODE_USED_OFS \
373 (offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64))
374#define INODE_SIZE_OFS \
375 (offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64))
376#define INODE_HEIGHT_OFS (0)
377
378/**
379 * struct logfs_disk_dentry - on-medium dentry structure
380 *
381 * @ino: inode number
382 * @namelen: length of file name
383 * @type: file type, identical to bits 12..15 of mode
384 * @name: file name
385 */
386/* FIXME: add 6 bytes of padding to remove the __packed */
387struct logfs_disk_dentry {
388 __be64 ino;
389 __be16 namelen;
390 __u8 type;
391 __u8 name[LOGFS_MAX_NAMELEN];
392} __attribute__((packed));
393
394SIZE_CHECK(logfs_disk_dentry, 266);
395
396#define RESERVED 0xffffffff
397#define BADSEG 0xffffffff
398/**
399 * struct logfs_segment_entry - segment file entry
400 *
401 * @ec_level: erase count and level
402 * @valid: number of valid bytes
403 *
404 * The segment file contains one entry for every segment. ec_level contains
405 * the erase count in the upper 28 bits and the level in the lower 4 bits. An
406 * ec_level of BADSEG (-1) identifies bad segments. valid contains the number
407 * of valid bytes or RESERVED (-1 again) if the segment is used for either the
408 * superblock or the journal, or when the segment is bad.
409 */
410struct logfs_segment_entry {
411 __be32 ec_level;
412 __be32 valid;
413};
414
415SIZE_CHECK(logfs_segment_entry, 8);
416
417/**
418 * struct logfs_journal_header - header for journal entries (JEs)
419 *
420 * @h_crc: crc32 of journal entry
421 * @h_len: length of compressed journal entry,
422 * not including header
423 * @h_datalen: length of uncompressed data
424 * @h_type: JE type
425 * @h_compr: compression type
426 * @h_pad: reserved
427 */
428struct logfs_journal_header {
429 __be32 h_crc;
430 __be16 h_len;
431 __be16 h_datalen;
432 __be16 h_type;
433 __u8 h_compr;
434 __u8 h_pad[5];
435};
436
437SIZE_CHECK(logfs_journal_header, 16);
438
439/*
440 * Life expectancy of data.
441 * VIM_DEFAULT - default vim
442 * VIM_SEGFILE - for segment file only - very short-living
443 * VIM_GC	- GC'd data - likely long-living (reserved; not yet in the enum)
444 */
445enum logfs_vim {
446 VIM_DEFAULT = 0,
447 VIM_SEGFILE = 1,
448};
449
450/**
451 * struct logfs_je_area - wbuf header
452 *
453 * @segno: segment number of area
454 * @used_bytes: number of bytes already used
455 * @gc_level: GC level
456 * @vim: life expectancy of data
457 *
458 * "Areas" are segments currently being used for writing. There is at least
459 * one area per GC level. Several may be used to separate long-living from
460 * short-living data. If an area with unknown vim is encountered, it can
461 * simply be closed.
462 * The write buffer immediately follows this header.
463 */
464struct logfs_je_area {
465 __be32 segno;
466 __be32 used_bytes;
467 __u8 gc_level;
468 __u8 vim;
469} __attribute__((packed));
470
471SIZE_CHECK(logfs_je_area, 10);
472
473#define MAX_JOURNAL_HEADER \
474 (sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))
475
476/**
477 * struct logfs_je_dynsb - dynamic superblock
478 *
479 * @ds_gec: global erase count
480 * @ds_sweeper: current position of GC "sweeper"
481 * @ds_rename_dir: source directory ino (see dir.c documentation)
482 * @ds_rename_pos: position of source dd (see dir.c documentation)
483 * @ds_victim_ino:	victim of an incomplete dir operation (see dir.c)
484 * @ds_victim_parent:	parent inode of the victim (see dir.c)
485 * @ds_used_bytes: number of used bytes
486 */
487struct logfs_je_dynsb {
488 __be64 ds_gec;
489 __be64 ds_sweeper;
490
491 __be64 ds_rename_dir;
492 __be64 ds_rename_pos;
493
494 __be64 ds_victim_ino;
495 __be64 ds_victim_parent; /* XXX */
496
497 __be64 ds_used_bytes;
498 __be32 ds_generation;
499 __be32 pad;
500};
501
502SIZE_CHECK(logfs_je_dynsb, 64);
503
504/**
505 * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
506 *
507 * @da_size: size of inode file
508 * @da_last_ino: last created inode
509 * @da_used_bytes: number of bytes used
510 * @da_data: data pointers
511 */
512struct logfs_je_anchor {
513 __be64 da_size;
514 __be64 da_last_ino;
515
516 __be64 da_used_bytes;
517 u8 da_height;
518 u8 pad[7];
519
520 __be64 da_data[LOGFS_EMBEDDED_FIELDS];
521};
522
523SIZE_CHECK(logfs_je_anchor, 168);
524
525/**
526 * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
527 *
528 * @so_segment: segments used for 2nd journal
529 *
530 * Length of the array is given by h_len field in the header.
531 */
532struct logfs_je_spillout {
533 __be64 so_segment[0];
534};
535
536SIZE_CHECK(logfs_je_spillout, 0);
537
538/**
539 * struct logfs_je_journal_ec - erase counts for all journal segments
540 *
541 * @ec: erase count
542 *
543 * Length of the array is given by h_len field in the header.
544 */
545struct logfs_je_journal_ec {
546 __be32 ec[0];
547};
548
549SIZE_CHECK(logfs_je_journal_ec, 0);
550
551/**
552 * struct logfs_je_free_segments - list of free segments with erase count
553 */
554struct logfs_je_free_segments {
555 __be32 segno;
556 __be32 ec;
557};
558
559SIZE_CHECK(logfs_je_free_segments, 8);
560
561/**
562 * struct logfs_seg_alias - list of segment aliases
563 */
564struct logfs_seg_alias {
565 __be32 old_segno;
566 __be32 new_segno;
567};
568
569SIZE_CHECK(logfs_seg_alias, 8);
570
571/**
572 * struct logfs_obj_alias - list of object aliases
573 */
574struct logfs_obj_alias {
575 __be64 ino;
576 __be64 bix;
577 __be64 val;
578 u8 level;
579 u8 pad[5];
580 __be16 child_no;
581};
582
583SIZE_CHECK(logfs_obj_alias, 32);
584
585/**
586 * Compression types.
587 *
588 * COMPR_NONE - uncompressed
589 * COMPR_ZLIB - compressed with zlib
590 */
591enum {
592 COMPR_NONE = 0,
593 COMPR_ZLIB = 1,
594};
595
596/*
597 * Journal entries come in groups of 16. The first group contains unique
598 * entries; the following groups contain one entry per level.
599 *
600 * JE_FIRST - smallest possible journal entry number
601 *
602 * JEG_BASE - base group, containing unique entries
603 * JE_COMMIT - commit entry, validates all previous entries
604 * JE_DYNSB - dynamic superblock, anything that ought to be in the
605 * superblock but cannot because it is read-write data
606 * JE_ANCHOR - anchor aka master inode aka inode file's inode
607 * JE_ERASECOUNT	- erase counts for all journal segments
608 * JE_SPILLOUT - unused
609 * JE_OBJ_ALIAS	- object aliases
610 * JE_AREA - area description
611 *
612 * JE_LAST - largest possible journal entry number
613 */
614enum {
615 JE_FIRST = 0x01,
616
617 JEG_BASE = 0x00,
618 JE_COMMIT = 0x02,
619 JE_DYNSB = 0x03,
620 JE_ANCHOR = 0x04,
621 JE_ERASECOUNT = 0x05,
622 JE_SPILLOUT = 0x06,
623 JE_OBJ_ALIAS = 0x0d,
624 JE_AREA = 0x0e,
625
626 JE_LAST = 0x0e,
627};
628
629#endif
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
new file mode 100644
index 000000000000..c3a3a6814b84
--- /dev/null
+++ b/fs/logfs/readwrite.c
@@ -0,0 +1,2257 @@
1/*
2 * fs/logfs/readwrite.c
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 *
9 * Actually contains eight sets of very similar functions:
10 * read read blocks from a file
11 * seek_hole find next hole
12 * seek_data find next data block
13 * valid check whether a block still belongs to a file
14 * write write blocks to a file
15 * delete delete a block (for directories and ifile)
16 * rewrite move existing blocks of a file to a new location (gc helper)
17 * truncate truncate a file
18 */
19#include "logfs.h"
20#include <linux/sched.h>
21
22static u64 adjust_bix(u64 bix, level_t level)
23{
24 switch (level) {
25 case 0:
26 return bix;
27 case LEVEL(1):
28 return max_t(u64, bix, I0_BLOCKS);
29 case LEVEL(2):
30 return max_t(u64, bix, I1_BLOCKS);
31 case LEVEL(3):
32 return max_t(u64, bix, I2_BLOCKS);
33 case LEVEL(4):
34 return max_t(u64, bix, I3_BLOCKS);
35 case LEVEL(5):
36 return max_t(u64, bix, I4_BLOCKS);
37 default:
38 WARN_ON(1);
39 return bix;
40 }
41}
42
43static inline u64 maxbix(u8 height)
44{
45 return 1ULL << (LOGFS_BLOCK_BITS * height);
46}
47
48/**
49 * The inode address space is cut in two halves. Lower half belongs to data
50 * pages, upper half to indirect blocks. If the high bit (INDIRECT_BIT) is
51 * set, the actual block index (bix) and level can be derived from the page
52 * index.
53 *
54 * The lowest three bits of the block index are set to 0 after packing and
55 * unpacking. Since the lowest n bits (9 for 4KiB blocksize) are ignored
56 * anyway this is harmless.
57 */
58#define ARCH_SHIFT (BITS_PER_LONG - 32)
59#define INDIRECT_BIT (0x80000000UL << ARCH_SHIFT)
60#define LEVEL_SHIFT (28 + ARCH_SHIFT)
61static inline pgoff_t first_indirect_block(void)
62{
63 return INDIRECT_BIT | (1ULL << LEVEL_SHIFT);
64}
65
66pgoff_t logfs_pack_index(u64 bix, level_t level)
67{
68 pgoff_t index;
69
70 BUG_ON(bix >= INDIRECT_BIT);
71 if (level == 0)
72 return bix;
73
74 index = INDIRECT_BIT;
75 index |= (__force long)level << LEVEL_SHIFT;
76 index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS);
77 return index;
78}
79
80void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level)
81{
82 u8 __level;
83
84 if (!(index & INDIRECT_BIT)) {
85 *bix = index;
86 *level = 0;
87 return;
88 }
89
90 __level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT;
91 *level = LEVEL(__level);
92 *bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT;
93 *bix = adjust_bix(*bix, *level);
94 return;
95}
96#undef ARCH_SHIFT
97#undef INDIRECT_BIT
98#undef LEVEL_SHIFT
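
/*
 * Worked example on a 64-bit machine (ARCH_SHIFT == 32, so INDIRECT_BIT
 * is bit 63 and the level sits at bits 60 and up): packing
 * bix == 0x100000 at level 2 gives
 *
 *	index = (1ULL << 63) | (2ULL << 60) | (0x100000 >> 18)
 *	      = 0xa000000000000004
 *
 * Unpacking shifts the low part back up by 2 * LOGFS_BLOCK_BITS,
 * recovering bix == 0x100000 and level == 2.  Any bits lost in the
 * right shift come back as zeroes, which is the harmless truncation
 * described in the comment above.
 */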
99
100/*
101 * Time is stored as nanoseconds since the epoch.
102 */
103static struct timespec be64_to_timespec(__be64 betime)
104{
105 return ns_to_timespec(be64_to_cpu(betime));
106}
107
108static __be64 timespec_to_be64(struct timespec tsp)
109{
110 return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec);
111}
112
113static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode *inode)
114{
115 struct logfs_inode *li = logfs_inode(inode);
116 int i;
117
118 inode->i_mode = be16_to_cpu(di->di_mode);
119 li->li_height = di->di_height;
120 li->li_flags = be32_to_cpu(di->di_flags);
121 inode->i_uid = be32_to_cpu(di->di_uid);
122 inode->i_gid = be32_to_cpu(di->di_gid);
123 inode->i_size = be64_to_cpu(di->di_size);
124 logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes));
125 inode->i_atime = be64_to_timespec(di->di_atime);
126 inode->i_ctime = be64_to_timespec(di->di_ctime);
127 inode->i_mtime = be64_to_timespec(di->di_mtime);
128 inode->i_nlink = be32_to_cpu(di->di_refcount);
129 inode->i_generation = be32_to_cpu(di->di_generation);
130
131 switch (inode->i_mode & S_IFMT) {
132 case S_IFSOCK: /* fall through */
133 case S_IFBLK: /* fall through */
134 case S_IFCHR: /* fall through */
135 case S_IFIFO:
136 inode->i_rdev = be64_to_cpu(di->di_data[0]);
137 break;
138 case S_IFDIR: /* fall through */
139 case S_IFREG: /* fall through */
140 case S_IFLNK:
141 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
142 li->li_data[i] = be64_to_cpu(di->di_data[i]);
143 break;
144 default:
145 BUG();
146 }
147}
148
149static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode *di)
150{
151 struct logfs_inode *li = logfs_inode(inode);
152 int i;
153
154 di->di_mode = cpu_to_be16(inode->i_mode);
155 di->di_height = li->li_height;
156 di->di_pad = 0;
157 di->di_flags = cpu_to_be32(li->li_flags);
158 di->di_uid = cpu_to_be32(inode->i_uid);
159 di->di_gid = cpu_to_be32(inode->i_gid);
160 di->di_size = cpu_to_be64(i_size_read(inode));
161 di->di_used_bytes = cpu_to_be64(li->li_used_bytes);
162 di->di_atime = timespec_to_be64(inode->i_atime);
163 di->di_ctime = timespec_to_be64(inode->i_ctime);
164 di->di_mtime = timespec_to_be64(inode->i_mtime);
165 di->di_refcount = cpu_to_be32(inode->i_nlink);
166 di->di_generation = cpu_to_be32(inode->i_generation);
167
168 switch (inode->i_mode & S_IFMT) {
169 case S_IFSOCK: /* fall through */
170 case S_IFBLK: /* fall through */
171 case S_IFCHR: /* fall through */
172 case S_IFIFO:
173 di->di_data[0] = cpu_to_be64(inode->i_rdev);
174 break;
175 case S_IFDIR: /* fall through */
176 case S_IFREG: /* fall through */
177 case S_IFLNK:
178 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
179 di->di_data[i] = cpu_to_be64(li->li_data[i]);
180 break;
181 default:
182 BUG();
183 }
184}
185
186static void __logfs_set_blocks(struct inode *inode)
187{
188 struct super_block *sb = inode->i_sb;
189 struct logfs_inode *li = logfs_inode(inode);
190
191 inode->i_blocks = ULONG_MAX;
192 if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX)
193 inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9;
194}
195
196void logfs_set_blocks(struct inode *inode, u64 bytes)
197{
198 struct logfs_inode *li = logfs_inode(inode);
199
200 li->li_used_bytes = bytes;
201 __logfs_set_blocks(inode);
202}
203
204static void prelock_page(struct super_block *sb, struct page *page, int lock)
205{
206 struct logfs_super *super = logfs_super(sb);
207
208 BUG_ON(!PageLocked(page));
209 if (lock) {
210 BUG_ON(PagePreLocked(page));
211 SetPagePreLocked(page);
212 } else {
213 /* We are in GC path. */
214 if (PagePreLocked(page))
215 super->s_lock_count++;
216 else
217 SetPagePreLocked(page);
218 }
219}
220
221static void preunlock_page(struct super_block *sb, struct page *page, int lock)
222{
223 struct logfs_super *super = logfs_super(sb);
224
225 BUG_ON(!PageLocked(page));
226 if (lock)
227 ClearPagePreLocked(page);
228 else {
229 /* We are in GC path. */
230 BUG_ON(!PagePreLocked(page));
231 if (super->s_lock_count)
232 super->s_lock_count--;
233 else
234 ClearPagePreLocked(page);
235 }
236}
237
238/*
239 * Logfs is prone to an AB-BA deadlock where one task tries to acquire
240 * s_write_mutex with a locked page and GC tries to get that page while holding
241 * s_write_mutex.
242 * To solve this issue logfs will ignore the page lock iff the page in question
243 * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked
244 * in addition to PG_locked.
245 */
246static void logfs_get_wblocks(struct super_block *sb, struct page *page,
247 int lock)
248{
249 struct logfs_super *super = logfs_super(sb);
250
251 if (page)
252 prelock_page(sb, page, lock);
253
254 if (lock) {
255 mutex_lock(&super->s_write_mutex);
256 logfs_gc_pass(sb);
257 /* FIXME: We also have to check for shadowed space
258 * and mempool fill grade */
259 }
260}
261
262static void logfs_put_wblocks(struct super_block *sb, struct page *page,
263 int lock)
264{
265 struct logfs_super *super = logfs_super(sb);
266
267 if (page)
268 preunlock_page(sb, page, lock);
269 /* Order matters - we must clear PG_pre_locked before releasing
270 * s_write_mutex or we could race against another task. */
271 if (lock)
272 mutex_unlock(&super->s_write_mutex);
273}
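
/*
 * A sketch of the intended sequence (get_page_reserve() below is one
 * real caller):
 *
 *	lock_page(page);
 *	logfs_get_wblocks(sb, page, WF_LOCK);
 *		(sets PG_pre_locked, takes s_write_mutex, runs GC)
 *	... modify the block ...
 *	logfs_put_wblocks(sb, page, WF_LOCK);
 *		(clears PG_pre_locked, drops s_write_mutex)
 *	unlock_page(page);
 *
 * If GC needs the same page in the meantime, logfs_lock_write_page()
 * sees PG_pre_locked and proceeds without the page lock, breaking the
 * AB-BA cycle described above.
 */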
274
275static struct page *logfs_get_read_page(struct inode *inode, u64 bix,
276 level_t level)
277{
278 return find_or_create_page(inode->i_mapping,
279 logfs_pack_index(bix, level), GFP_NOFS);
280}
281
282static void logfs_put_read_page(struct page *page)
283{
284 unlock_page(page);
285 page_cache_release(page);
286}
287
288static void logfs_lock_write_page(struct page *page)
289{
290 int loop = 0;
291
292 while (unlikely(!trylock_page(page))) {
293 if (loop++ > 0x1000) {
294 /* Has been observed once so far... */
295 printk(KERN_ERR "stack at %p\n", &loop);
296 BUG();
297 }
298 if (PagePreLocked(page)) {
299 /* Holder of page lock is waiting for us, it
300 * is safe to use this page. */
301 break;
302 }
303 /* Some other process has this page locked and has
304 * nothing to do with us. Wait for it to finish.
305 */
306 schedule();
307 }
308 BUG_ON(!PageLocked(page));
309}
310
311static struct page *logfs_get_write_page(struct inode *inode, u64 bix,
312 level_t level)
313{
314 struct address_space *mapping = inode->i_mapping;
315 pgoff_t index = logfs_pack_index(bix, level);
316 struct page *page;
317 int err;
318
319repeat:
320 page = find_get_page(mapping, index);
321 if (!page) {
322 page = __page_cache_alloc(GFP_NOFS);
323 if (!page)
324 return NULL;
325 err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS);
326 if (unlikely(err)) {
327 page_cache_release(page);
328 if (err == -EEXIST)
329 goto repeat;
330 return NULL;
331 }
332 } else logfs_lock_write_page(page);
333 BUG_ON(!PageLocked(page));
334 return page;
335}
336
337static void logfs_unlock_write_page(struct page *page)
338{
339 if (!PagePreLocked(page))
340 unlock_page(page);
341}
342
343static void logfs_put_write_page(struct page *page)
344{
345 logfs_unlock_write_page(page);
346 page_cache_release(page);
347}
348
349static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level,
350 int rw)
351{
352 if (rw == READ)
353 return logfs_get_read_page(inode, bix, level);
354 else
355 return logfs_get_write_page(inode, bix, level);
356}
357
358static void logfs_put_page(struct page *page, int rw)
359{
360 if (rw == READ)
361 logfs_put_read_page(page);
362 else
363 logfs_put_write_page(page);
364}
365
366static unsigned long __get_bits(u64 val, int skip, int no)
367{
368 u64 ret = val;
369
370 ret >>= skip * no;
371 ret <<= 64 - no;
372 ret >>= 64 - no;
373 return ret;
374}
375
376static unsigned long get_bits(u64 val, level_t skip)
377{
378 return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS);
379}
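
/*
 * Example: get_bits() extracts the 9-bit pointer slot for one level of
 * indirection.  For bix == (5 << 9) | 7,
 *
 *	get_bits(bix, LEVEL(0)) == 7	(slot within the i1 block)
 *	get_bits(bix, LEVEL(1)) == 5	(slot within the i2 block)
 *
 * which is how logfs_read_loop() below picks the pointer at each step
 * via get_bits(bix, SUBLEVEL(level)).
 */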
380
381static inline void init_shadow_tree(struct super_block *sb,
382 struct shadow_tree *tree)
383{
384 struct logfs_super *super = logfs_super(sb);
385
386 btree_init_mempool64(&tree->new, super->s_btree_pool);
387 btree_init_mempool64(&tree->old, super->s_btree_pool);
388}
389
390static void indirect_write_block(struct logfs_block *block)
391{
392 struct page *page;
393 struct inode *inode;
394 int ret;
395
396 page = block->page;
397 inode = page->mapping->host;
398 logfs_lock_write_page(page);
399 ret = logfs_write_buf(inode, page, 0);
400 logfs_unlock_write_page(page);
401 /*
402 * This needs some rework. Unless you want your filesystem to run
403 * completely synchronously (you don't), the filesystem will always
404 * report writes as 'successful' before the actual work has been
405 * done. The actual work gets done here and this is where any errors
406 * will show up. And there isn't much we can do about it, really.
407 *
408 * Some attempts to fix the errors (move from bad blocks, retry io,...)
409 * have already been done, so anything left should be either a broken
410 * device or a bug somewhere in logfs itself. Being relatively new,
411 * the odds currently favor a bug, so for now the line below isn't
412	 * entirely tasteless.
413 */
414 BUG_ON(ret);
415}
416
417static void inode_write_block(struct logfs_block *block)
418{
419 struct inode *inode;
420 int ret;
421
422 inode = block->inode;
423 if (inode->i_ino == LOGFS_INO_MASTER)
424 logfs_write_anchor(inode->i_sb);
425 else {
426 ret = __logfs_write_inode(inode, 0);
427 /* see indirect_write_block comment */
428 BUG_ON(ret);
429 }
430}
431
432static gc_level_t inode_block_level(struct logfs_block *block)
433{
434 BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER);
435 return GC_LEVEL(LOGFS_MAX_LEVELS);
436}
437
438static gc_level_t indirect_block_level(struct logfs_block *block)
439{
440 struct page *page;
441 struct inode *inode;
442 u64 bix;
443 level_t level;
444
445 page = block->page;
446 inode = page->mapping->host;
447 logfs_unpack_index(page->index, &bix, &level);
448 return expand_level(inode->i_ino, level);
449}
450
451/*
452 * This silences a false, yet annoying gcc warning. I hate it when my editor
453 * jumps into bitops.h each time I recompile this file.
454 * TODO: Complain to gcc folks about this and upgrade compiler.
455 */
456static unsigned long fnb(const unsigned long *addr,
457 unsigned long size, unsigned long offset)
458{
459 return find_next_bit(addr, size, offset);
460}
461
462static __be64 inode_val0(struct inode *inode)
463{
464 struct logfs_inode *li = logfs_inode(inode);
465 u64 val;
466
467 /*
468 * Explicit shifting generates good code, but must match the format
469 * of the structure. Add some paranoia just in case.
470 */
471 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0);
472 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2);
473 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4);
474
475 val = (u64)inode->i_mode << 48 |
476 (u64)li->li_height << 40 |
477 (u64)li->li_flags;
478 return cpu_to_be64(val);
479}
480
481static int inode_write_alias(struct super_block *sb,
482 struct logfs_block *block, write_alias_t *write_one_alias)
483{
484 struct inode *inode = block->inode;
485 struct logfs_inode *li = logfs_inode(inode);
486 unsigned long pos;
487	u64 ino, bix;
488 __be64 val;
489 level_t level;
490 int err;
491
492 for (pos = 0; ; pos++) {
493 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
494 if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS)
495 return 0;
496
497 switch (pos) {
498 case INODE_HEIGHT_OFS:
499 val = inode_val0(inode);
500 break;
501 case INODE_USED_OFS:
502			val = cpu_to_be64(li->li_used_bytes);
503 break;
504 case INODE_SIZE_OFS:
505 val = cpu_to_be64(i_size_read(inode));
506 break;
507 case INODE_POINTER_OFS ... INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1:
508 val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]);
509 break;
510 default:
511 BUG();
512 }
513
514 ino = LOGFS_INO_MASTER;
515 bix = inode->i_ino;
516 level = LEVEL(0);
517 err = write_one_alias(sb, ino, bix, level, pos, val);
518 if (err)
519 return err;
520 }
521}
522
523static int indirect_write_alias(struct super_block *sb,
524 struct logfs_block *block, write_alias_t *write_one_alias)
525{
526 unsigned long pos;
527 struct page *page = block->page;
528	u64 ino, bix;
529 __be64 *child, val;
530 level_t level;
531 int err;
532
533 for (pos = 0; ; pos++) {
534 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
535 if (pos >= LOGFS_BLOCK_FACTOR)
536 return 0;
537
538 ino = page->mapping->host->i_ino;
539 logfs_unpack_index(page->index, &bix, &level);
540 child = kmap_atomic(page, KM_USER0);
541 val = child[pos];
542 kunmap_atomic(child, KM_USER0);
543 err = write_one_alias(sb, ino, bix, level, pos, val);
544 if (err)
545 return err;
546 }
547}
548
549int logfs_write_obj_aliases_pagecache(struct super_block *sb)
550{
551 struct logfs_super *super = logfs_super(sb);
552 struct logfs_block *block;
553 int err;
554
555 list_for_each_entry(block, &super->s_object_alias, alias_list) {
556 err = block->ops->write_alias(sb, block, write_alias_journal);
557 if (err)
558 return err;
559 }
560 return 0;
561}
562
563void __free_block(struct super_block *sb, struct logfs_block *block)
564{
565 BUG_ON(!list_empty(&block->item_list));
566 list_del(&block->alias_list);
567 mempool_free(block, logfs_super(sb)->s_block_pool);
568}
569
570static void inode_free_block(struct super_block *sb, struct logfs_block *block)
571{
572 struct inode *inode = block->inode;
573
574 logfs_inode(inode)->li_block = NULL;
575 __free_block(sb, block);
576}
577
578static void indirect_free_block(struct super_block *sb,
579 struct logfs_block *block)
580{
581 ClearPagePrivate(block->page);
582 block->page->private = 0;
583 __free_block(sb, block);
584}
585
586
587static struct logfs_block_ops inode_block_ops = {
588 .write_block = inode_write_block,
589 .block_level = inode_block_level,
590 .free_block = inode_free_block,
591 .write_alias = inode_write_alias,
592};
593
594struct logfs_block_ops indirect_block_ops = {
595 .write_block = indirect_write_block,
596 .block_level = indirect_block_level,
597 .free_block = indirect_free_block,
598 .write_alias = indirect_write_alias,
599};
600
601struct logfs_block *__alloc_block(struct super_block *sb,
602 u64 ino, u64 bix, level_t level)
603{
604 struct logfs_super *super = logfs_super(sb);
605 struct logfs_block *block;
606
607 block = mempool_alloc(super->s_block_pool, GFP_NOFS);
608 memset(block, 0, sizeof(*block));
609 INIT_LIST_HEAD(&block->alias_list);
610 INIT_LIST_HEAD(&block->item_list);
611 block->sb = sb;
612 block->ino = ino;
613 block->bix = bix;
614 block->level = level;
615 return block;
616}
617
618static void alloc_inode_block(struct inode *inode)
619{
620 struct logfs_inode *li = logfs_inode(inode);
621 struct logfs_block *block;
622
623 if (li->li_block)
624 return;
625
626 block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0);
627 block->inode = inode;
628 li->li_block = block;
629 block->ops = &inode_block_ops;
630}
631
632void initialize_block_counters(struct page *page, struct logfs_block *block,
633 __be64 *array, int page_is_empty)
634{
635 u64 ptr;
636 int i, start;
637
638 block->partial = 0;
639 block->full = 0;
640 start = 0;
641 if (page->index < first_indirect_block()) {
642 /* Counters are pointless on level 0 */
643 return;
644 }
645 if (page->index == first_indirect_block()) {
646 /* Skip unused pointers */
647 start = I0_BLOCKS;
648 block->full = I0_BLOCKS;
649 }
650 if (!page_is_empty) {
651 for (i = start; i < LOGFS_BLOCK_FACTOR; i++) {
652 ptr = be64_to_cpu(array[i]);
653 if (ptr)
654 block->partial++;
655 if (ptr & LOGFS_FULLY_POPULATED)
656 block->full++;
657 }
658 }
659}
660
661static void alloc_data_block(struct inode *inode, struct page *page)
662{
663 struct logfs_block *block;
664 u64 bix;
665 level_t level;
666
667 if (PagePrivate(page))
668 return;
669
670 logfs_unpack_index(page->index, &bix, &level);
671 block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);
672 block->page = page;
673 SetPagePrivate(page);
674 page->private = (unsigned long)block;
675 block->ops = &indirect_block_ops;
676}
677
678static void alloc_indirect_block(struct inode *inode, struct page *page,
679 int page_is_empty)
680{
681 struct logfs_block *block;
682 __be64 *array;
683
684 if (PagePrivate(page))
685 return;
686
687 alloc_data_block(inode, page);
688
689 block = logfs_block(page);
690 array = kmap_atomic(page, KM_USER0);
691 initialize_block_counters(page, block, array, page_is_empty);
692 kunmap_atomic(array, KM_USER0);
693}
694
695static void block_set_pointer(struct page *page, int index, u64 ptr)
696{
697 struct logfs_block *block = logfs_block(page);
698 __be64 *array;
699 u64 oldptr;
700
701 BUG_ON(!block);
702 array = kmap_atomic(page, KM_USER0);
703 oldptr = be64_to_cpu(array[index]);
704 array[index] = cpu_to_be64(ptr);
705 kunmap_atomic(array, KM_USER0);
706 SetPageUptodate(page);
707
708 block->full += !!(ptr & LOGFS_FULLY_POPULATED)
709 - !!(oldptr & LOGFS_FULLY_POPULATED);
710 block->partial += !!ptr - !!oldptr;
711}
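
/*
 * The !! arithmetic above updates both counters in one step: replacing
 * a NULL pointer with a fully populated one adds 1 to both
 * block->partial and block->full, while replacing a populated pointer
 * with 0 subtracts 1 from both.
 */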
712
713static u64 block_get_pointer(struct page *page, int index)
714{
715 __be64 *block;
716 u64 ptr;
717
718 block = kmap_atomic(page, KM_USER0);
719 ptr = be64_to_cpu(block[index]);
720 kunmap_atomic(block, KM_USER0);
721 return ptr;
722}
723
724static int logfs_read_empty(struct page *page)
725{
726 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
727 return 0;
728}
729
730static int logfs_read_direct(struct inode *inode, struct page *page)
731{
732 struct logfs_inode *li = logfs_inode(inode);
733 pgoff_t index = page->index;
734 u64 block;
735
736 block = li->li_data[index];
737 if (!block)
738 return logfs_read_empty(page);
739
740 return logfs_segment_read(inode, page, block, index, 0);
741}
742
743static int logfs_read_loop(struct inode *inode, struct page *page,
744 int rw_context)
745{
746 struct logfs_inode *li = logfs_inode(inode);
747 u64 bix, bofs = li->li_data[INDIRECT_INDEX];
748 level_t level, target_level;
749 int ret;
750 struct page *ipage;
751
752 logfs_unpack_index(page->index, &bix, &target_level);
753 if (!bofs)
754 return logfs_read_empty(page);
755
756 if (bix >= maxbix(li->li_height))
757 return logfs_read_empty(page);
758
759 for (level = LEVEL(li->li_height);
760 (__force u8)level > (__force u8)target_level;
761			level = SUBLEVEL(level)) {
762 ipage = logfs_get_page(inode, bix, level, rw_context);
763 if (!ipage)
764 return -ENOMEM;
765
766 ret = logfs_segment_read(inode, ipage, bofs, bix, level);
767 if (ret) {
768 logfs_put_read_page(ipage);
769 return ret;
770 }
771
772 bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
773 logfs_put_page(ipage, rw_context);
774 if (!bofs)
775 return logfs_read_empty(page);
776 }
777
778 return logfs_segment_read(inode, page, bofs, bix, 0);
779}
780
781static int logfs_read_block(struct inode *inode, struct page *page,
782 int rw_context)
783{
784 pgoff_t index = page->index;
785
786 if (index < I0_BLOCKS)
787 return logfs_read_direct(inode, page);
788 return logfs_read_loop(inode, page, rw_context);
789}
790
791static int logfs_exist_loop(struct inode *inode, u64 bix)
792{
793 struct logfs_inode *li = logfs_inode(inode);
794 u64 bofs = li->li_data[INDIRECT_INDEX];
795 level_t level;
796 int ret;
797 struct page *ipage;
798
799 if (!bofs)
800 return 0;
801 if (bix >= maxbix(li->li_height))
802 return 0;
803
804 for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
805 ipage = logfs_get_read_page(inode, bix, level);
806 if (!ipage)
807 return -ENOMEM;
808
809 ret = logfs_segment_read(inode, ipage, bofs, bix, level);
810 if (ret) {
811 logfs_put_read_page(ipage);
812 return ret;
813 }
814
815 bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
816 logfs_put_read_page(ipage);
817 if (!bofs)
818 return 0;
819 }
820
821 return 1;
822}
823
824int logfs_exist_block(struct inode *inode, u64 bix)
825{
826 struct logfs_inode *li = logfs_inode(inode);
827
828 if (bix < I0_BLOCKS)
829 return !!li->li_data[bix];
830 return logfs_exist_loop(inode, bix);
831}
832
833static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data)
834{
835 struct logfs_inode *li = logfs_inode(inode);
836
837 for (; bix < I0_BLOCKS; bix++)
838 if (data ^ (li->li_data[bix] == 0))
839 return bix;
840 return I0_BLOCKS;
841}
842
843static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data)
844{
845 struct logfs_inode *li = logfs_inode(inode);
846 __be64 *rblock;
847 u64 increment, bofs = li->li_data[INDIRECT_INDEX];
848 level_t level;
849 int ret, slot;
850 struct page *page;
851
852 BUG_ON(!bofs);
853
854 for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
855 increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1));
856 page = logfs_get_read_page(inode, bix, level);
857 if (!page)
858 return bix;
859
860 ret = logfs_segment_read(inode, page, bofs, bix, level);
861 if (ret) {
862 logfs_put_read_page(page);
863 return bix;
864 }
865
866 slot = get_bits(bix, SUBLEVEL(level));
867 rblock = kmap_atomic(page, KM_USER0);
868 while (slot < LOGFS_BLOCK_FACTOR) {
869 if (data && (rblock[slot] != 0))
870 break;
871 if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED))
872 break;
873 slot++;
874 bix += increment;
875 bix &= ~(increment - 1);
876 }
877 if (slot >= LOGFS_BLOCK_FACTOR) {
878 kunmap_atomic(rblock, KM_USER0);
879 logfs_put_read_page(page);
880 return bix;
881 }
882 bofs = be64_to_cpu(rblock[slot]);
883 kunmap_atomic(rblock, KM_USER0);
884 logfs_put_read_page(page);
885 if (!bofs) {
886 BUG_ON(data);
887 return bix;
888 }
889 }
890 return bix;
891}
892
893/**
894 * logfs_seek_hole - find next hole starting at a given block index
895 * @inode: inode to search in
896 * @bix: block index to start searching
897 *
898 * Returns next hole. If the file doesn't contain any further holes, the
899 * block address next to eof is returned instead.
900 */
901u64 logfs_seek_hole(struct inode *inode, u64 bix)
902{
903 struct logfs_inode *li = logfs_inode(inode);
904
905 if (bix < I0_BLOCKS) {
906 bix = seek_holedata_direct(inode, bix, 0);
907 if (bix < I0_BLOCKS)
908 return bix;
909 }
910
911 if (!li->li_data[INDIRECT_INDEX])
912 return bix;
913 else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
914 bix = maxbix(li->li_height);
915 else {
916 bix = seek_holedata_loop(inode, bix, 0);
917 if (bix < maxbix(li->li_height))
918 return bix;
919 /* Should not happen anymore. But if some port writes semi-
920 * corrupt images (as this one used to) we might run into it.
921 */
922 WARN_ON_ONCE(bix == maxbix(li->li_height));
923 }
924
925 return bix;
926}
927
928static u64 __logfs_seek_data(struct inode *inode, u64 bix)
929{
930 struct logfs_inode *li = logfs_inode(inode);
931
932 if (bix < I0_BLOCKS) {
933 bix = seek_holedata_direct(inode, bix, 1);
934 if (bix < I0_BLOCKS)
935 return bix;
936 }
937
938 if (bix < maxbix(li->li_height)) {
939 if (!li->li_data[INDIRECT_INDEX])
940 bix = maxbix(li->li_height);
941 else
942 return seek_holedata_loop(inode, bix, 1);
943 }
944
945 return bix;
946}
947
948/**
949 * logfs_seek_data - find next data block after a given block index
950 * @inode: inode to search in
951 * @bix: block index to start searching
952 *
953 * Returns next data block. If the file doesn't contain any further data
954 * blocks, the last block in the file is returned instead.
955 */
956u64 logfs_seek_data(struct inode *inode, u64 bix)
957{
958 struct super_block *sb = inode->i_sb;
959 u64 ret, end;
960
961 ret = __logfs_seek_data(inode, bix);
962 end = i_size_read(inode) >> sb->s_blocksize_bits;
963 if (ret >= end)
964 ret = max(bix, end);
965 return ret;
966}
967
968static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs)
969{
970 return pure_ofs(li->li_data[bix]) == ofs;
971}
972
973static int __logfs_is_valid_loop(struct inode *inode, u64 bix,
974 u64 ofs, u64 bofs)
975{
976 struct logfs_inode *li = logfs_inode(inode);
977 level_t level;
978 int ret;
979 struct page *page;
980
981	for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
982 page = logfs_get_write_page(inode, bix, level);
983 BUG_ON(!page);
984
985 ret = logfs_segment_read(inode, page, bofs, bix, level);
986 if (ret) {
987 logfs_put_write_page(page);
988 return 0;
989 }
990
991 bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level)));
992 logfs_put_write_page(page);
993 if (!bofs)
994 return 0;
995
996 if (pure_ofs(bofs) == ofs)
997 return 1;
998 }
999 return 0;
1000}
1001
1002static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs)
1003{
1004 struct logfs_inode *li = logfs_inode(inode);
1005 u64 bofs = li->li_data[INDIRECT_INDEX];
1006
1007 if (!bofs)
1008 return 0;
1009
1010 if (bix >= maxbix(li->li_height))
1011 return 0;
1012
1013 if (pure_ofs(bofs) == ofs)
1014 return 1;
1015
1016 return __logfs_is_valid_loop(inode, bix, ofs, bofs);
1017}
1018
1019static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs)
1020{
1021 struct logfs_inode *li = logfs_inode(inode);
1022
1023 if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1)
1024 return 0;
1025
1026 if (bix < I0_BLOCKS)
1027 return logfs_is_valid_direct(li, bix, ofs);
1028 return logfs_is_valid_loop(inode, bix, ofs);
1029}
1030
1031/**
1032 * logfs_is_valid_block - check whether this block is still valid
1033 *
1034 * @sb:	superblock
1035 * @ofs:	block physical offset
1036 * @ino:	block inode number
1037 * @bix:	block index
1038 * @gc_level:	block level
1039 *
1040 * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will
1041 * become invalid once the journal is written.
1042 */
1043int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
1044 gc_level_t gc_level)
1045{
1046 struct logfs_super *super = logfs_super(sb);
1047 struct inode *inode;
1048 int ret, cookie;
1049
1050 /* Umount closes a segment with free blocks remaining. Those
1051 * blocks are by definition invalid. */
1052 if (ino == -1)
1053 return 0;
1054
1055 LOGFS_BUG_ON((u64)(u_long)ino != ino, sb);
1056
1057 inode = logfs_safe_iget(sb, ino, &cookie);
1058 if (IS_ERR(inode))
1059 goto invalid;
1060
1061 ret = __logfs_is_valid_block(inode, bix, ofs);
1062 logfs_safe_iput(inode, cookie);
1063 if (ret)
1064 return ret;
1065
1066invalid:
1067 /* Block is nominally invalid, but may still sit in the shadow tree,
1068 * waiting for a journal commit.
1069 */
1070 if (btree_lookup64(&super->s_shadow_tree.old, ofs))
1071 return 2;
1072 return 0;
1073}
1074
1075int logfs_readpage_nolock(struct page *page)
1076{
1077 struct inode *inode = page->mapping->host;
1078 int ret = -EIO;
1079
1080 ret = logfs_read_block(inode, page, READ);
1081
1082 if (ret) {
1083 ClearPageUptodate(page);
1084 SetPageError(page);
1085 } else {
1086 SetPageUptodate(page);
1087 ClearPageError(page);
1088 }
1089 flush_dcache_page(page);
1090
1091 return ret;
1092}
1093
1094static int logfs_reserve_bytes(struct inode *inode, int bytes)
1095{
1096 struct logfs_super *super = logfs_super(inode->i_sb);
1097 u64 available = super->s_free_bytes + super->s_dirty_free_bytes
1098 - super->s_dirty_used_bytes - super->s_dirty_pages;
1099
1100 if (!bytes)
1101 return 0;
1102
1103 if (available < bytes)
1104 return -ENOSPC;
1105
1106 if (available < bytes + super->s_root_reserve &&
1107 !capable(CAP_SYS_RESOURCE))
1108 return -ENOSPC;
1109
1110 return 0;
1111}
1112
1113int get_page_reserve(struct inode *inode, struct page *page)
1114{
1115 struct logfs_super *super = logfs_super(inode->i_sb);
1116 int ret;
1117
1118 if (logfs_block(page) && logfs_block(page)->reserved_bytes)
1119 return 0;
1120
1121 logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
1122 ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE);
1123 if (!ret) {
1124 alloc_data_block(inode, page);
1125 logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
1126 super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
1127 }
1128 logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
1129 return ret;
1130}
1131
1132/*
1133 * We are protected by write lock. Push victims up to superblock level
1134 * and release transaction when appropriate.
1135 */
1136/* FIXME: This is currently called from the wrong spots. */
1137static void logfs_handle_transaction(struct inode *inode,
1138 struct logfs_transaction *ta)
1139{
1140 struct logfs_super *super = logfs_super(inode->i_sb);
1141
1142 if (!ta)
1143 return;
1144 logfs_inode(inode)->li_block->ta = NULL;
1145
1146 if (inode->i_ino != LOGFS_INO_MASTER) {
1147 BUG(); /* FIXME: Yes, this needs more thought */
1148 /* just remember the transaction until inode is written */
1149 //BUG_ON(logfs_inode(inode)->li_transaction);
1150 //logfs_inode(inode)->li_transaction = ta;
1151 return;
1152 }
1153
1154 switch (ta->state) {
1155 case CREATE_1: /* fall through */
1156 case UNLINK_1:
1157 BUG_ON(super->s_victim_ino);
1158 super->s_victim_ino = ta->ino;
1159 break;
1160 case CREATE_2: /* fall through */
1161 case UNLINK_2:
1162 BUG_ON(super->s_victim_ino != ta->ino);
1163 super->s_victim_ino = 0;
1164 /* transaction ends here - free it */
1165 kfree(ta);
1166 break;
1167 case CROSS_RENAME_1:
1168 BUG_ON(super->s_rename_dir);
1169 BUG_ON(super->s_rename_pos);
1170 super->s_rename_dir = ta->dir;
1171 super->s_rename_pos = ta->pos;
1172 break;
1173 case CROSS_RENAME_2:
1174 BUG_ON(super->s_rename_dir != ta->dir);
1175 BUG_ON(super->s_rename_pos != ta->pos);
1176 super->s_rename_dir = 0;
1177 super->s_rename_pos = 0;
1178 kfree(ta);
1179 break;
1180 case TARGET_RENAME_1:
1181 BUG_ON(super->s_rename_dir);
1182 BUG_ON(super->s_rename_pos);
1183 BUG_ON(super->s_victim_ino);
1184 super->s_rename_dir = ta->dir;
1185 super->s_rename_pos = ta->pos;
1186 super->s_victim_ino = ta->ino;
1187 break;
1188 case TARGET_RENAME_2:
1189 BUG_ON(super->s_rename_dir != ta->dir);
1190 BUG_ON(super->s_rename_pos != ta->pos);
1191 BUG_ON(super->s_victim_ino != ta->ino);
1192 super->s_rename_dir = 0;
1193 super->s_rename_pos = 0;
1194 break;
1195 case TARGET_RENAME_3:
1196 BUG_ON(super->s_rename_dir);
1197 BUG_ON(super->s_rename_pos);
1198 BUG_ON(super->s_victim_ino != ta->ino);
1199 super->s_victim_ino = 0;
1200 kfree(ta);
1201 break;
1202 default:
1203 BUG();
1204 }
1205}
1206
1207/*
1208 * Not strictly a reservation, but rather a check that we still have enough
1209 * space to satisfy the write.
1210 */
1211static int logfs_reserve_blocks(struct inode *inode, int blocks)
1212{
1213 return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE);
1214}
1215
1216struct write_control {
1217 u64 ofs;
1218 long flags;
1219};
1220
1221static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix,
1222 level_t level, u64 old_ofs)
1223{
1224 struct logfs_super *super = logfs_super(inode->i_sb);
1225 struct logfs_shadow *shadow;
1226
1227 shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS);
1228 memset(shadow, 0, sizeof(*shadow));
1229 shadow->ino = inode->i_ino;
1230 shadow->bix = bix;
1231 shadow->gc_level = expand_level(inode->i_ino, level);
1232 shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED;
1233 return shadow;
1234}
1235
1236static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
1237{
1238 struct logfs_super *super = logfs_super(inode->i_sb);
1239
1240 mempool_free(shadow, super->s_shadow_pool);
1241}
1242
1243/**
1244 * fill_shadow_tree - Propagate shadow tree changes due to a write
1245 * @inode: Inode owning the page
1246 * @page: Struct page that was written
1247 * @shadow: Shadow for the current write
1248 *
1249 * Writes in logfs can result in two semi-valid objects. The old object
1250 * is still valid as long as it can be reached by following pointers on
1251 * the medium. Only when writes propagate all the way up to the journal
1252 * has the new object safely replaced the old one.
1253 *
1254 * To handle this problem, a struct logfs_shadow is used to represent
1255 * every single write. It is attached to the indirect block, which is
1256 * marked dirty. When the indirect block is written, its shadows are
1257 * handed up to the next indirect block (or inode).  Ultimately they
1258 * will reach the master inode and be freed upon journal commit.
1259 *
1260 * This function handles a single step in the propagation. It adds the
1261 * shadow for the current write to the tree, along with any shadows in
1262 * the page's tree, in case it was an indirect block. If a page is
1263 * written, the inode parameter is left NULL; if an inode is written,
1264 * the page parameter is left NULL.
1265 */
1266static void fill_shadow_tree(struct inode *inode, struct page *page,
1267 struct logfs_shadow *shadow)
1268{
1269 struct logfs_super *super = logfs_super(inode->i_sb);
1270 struct logfs_block *block = logfs_block(page);
1271 struct shadow_tree *tree = &super->s_shadow_tree;
1272
1273 if (PagePrivate(page)) {
1274 if (block->alias_map)
1275 super->s_no_object_aliases -= bitmap_weight(
1276 block->alias_map, LOGFS_BLOCK_FACTOR);
1277 logfs_handle_transaction(inode, block->ta);
1278 block->ops->free_block(inode->i_sb, block);
1279 }
1280 if (shadow) {
1281 if (shadow->old_ofs)
1282 btree_insert64(&tree->old, shadow->old_ofs, shadow,
1283 GFP_NOFS);
1284 else
1285 btree_insert64(&tree->new, shadow->new_ofs, shadow,
1286 GFP_NOFS);
1287
1288 super->s_dirty_used_bytes += shadow->new_len;
1289 super->s_dirty_free_bytes += shadow->old_len;
1290 }
1291}
1292
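As a standalone sketch of that bookkeeping (plain structs stand in for the btrees): a rewrite of an existing object is filed in the old tree under the offset that goes stale, while a brand-new object is filed in the new tree under its fresh offset:

#include <stdint.h>
#include <stdio.h>

struct shadow { uint64_t old_ofs, new_ofs, old_len, new_len; };

static const char *tree_for(const struct shadow *s)
{
	/* Mirrors the branch in fill_shadow_tree() above. */
	return s->old_ofs ? "tree->old (keyed by old_ofs)"
			  : "tree->new (keyed by new_ofs)";
}

int main(void)
{
	struct shadow rewrite = { .old_ofs = 0x4000, .new_ofs = 0x8000 };
	struct shadow create  = { .old_ofs = 0,      .new_ofs = 0x9000 };

	printf("rewrite -> %s\n", tree_for(&rewrite));
	printf("create  -> %s\n", tree_for(&create));
	return 0;
}
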
1293static void logfs_set_alias(struct super_block *sb, struct logfs_block *block,
1294 long child_no)
1295{
1296 struct logfs_super *super = logfs_super(sb);
1297
1298 if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) {
1299 /* Aliases in the master inode are pointless. */
1300 return;
1301 }
1302
1303 if (!test_bit(child_no, block->alias_map)) {
1304 set_bit(child_no, block->alias_map);
1305 super->s_no_object_aliases++;
1306 }
1307 list_move_tail(&block->alias_list, &super->s_object_alias);
1308}
1309
1310/*
1311 * Object aliases can and often do change the size and occupied space of a
1312 * file. So not only do we have to change the pointers, we also have to
1313 * change inode->i_size and li->li_used_bytes. Which is done by setting
1314 * another two object aliases for the inode itself.
1315 */
1316static void set_iused(struct inode *inode, struct logfs_shadow *shadow)
1317{
1318 struct logfs_inode *li = logfs_inode(inode);
1319
1320 if (shadow->new_len == shadow->old_len)
1321 return;
1322
1323 alloc_inode_block(inode);
1324 li->li_used_bytes += shadow->new_len - shadow->old_len;
1325 __logfs_set_blocks(inode);
1326 logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS);
1327 logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS);
1328}
1329
1330static int logfs_write_i0(struct inode *inode, struct page *page,
1331 struct write_control *wc)
1332{
1333 struct logfs_shadow *shadow;
1334 u64 bix;
1335 level_t level;
1336 int full, err = 0;
1337
1338 logfs_unpack_index(page->index, &bix, &level);
1339 if (wc->ofs == 0)
1340 if (logfs_reserve_blocks(inode, 1))
1341 return -ENOSPC;
1342
1343 shadow = alloc_shadow(inode, bix, level, wc->ofs);
1344 if (wc->flags & WF_WRITE)
1345 err = logfs_segment_write(inode, page, shadow);
1346 if (wc->flags & WF_DELETE)
1347 logfs_segment_delete(inode, shadow);
1348 if (err) {
1349 free_shadow(inode, shadow);
1350 return err;
1351 }
1352
1353 set_iused(inode, shadow);
1354 full = 1;
1355 if (level != 0) {
1356 alloc_indirect_block(inode, page, 0);
1357 full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR;
1358 }
1359 fill_shadow_tree(inode, page, shadow);
1360 wc->ofs = shadow->new_ofs;
1361 if (wc->ofs && full)
1362 wc->ofs |= LOGFS_FULLY_POPULATED;
1363 return 0;
1364}
1365
1366static int logfs_write_direct(struct inode *inode, struct page *page,
1367 long flags)
1368{
1369 struct logfs_inode *li = logfs_inode(inode);
1370 struct write_control wc = {
1371 .ofs = li->li_data[page->index],
1372 .flags = flags,
1373 };
1374 int err;
1375
1376 alloc_inode_block(inode);
1377
1378 err = logfs_write_i0(inode, page, &wc);
1379 if (err)
1380 return err;
1381
1382 li->li_data[page->index] = wc.ofs;
1383 logfs_set_alias(inode->i_sb, li->li_block,
1384 page->index + INODE_POINTER_OFS);
1385 return 0;
1386}
1387
1388static int ptr_change(u64 ofs, struct page *page)
1389{
1390 struct logfs_block *block = logfs_block(page);
1391 int empty0, empty1, full0, full1;
1392
1393 empty0 = ofs == 0;
1394 empty1 = block->partial == 0;
1395 if (empty0 != empty1)
1396 return 1;
1397
1398 /* The !! is necessary to shrink result to int */
1399 full0 = !!(ofs & LOGFS_FULLY_POPULATED);
1400 full1 = block->full == LOGFS_BLOCK_FACTOR;
1401 if (full0 != full1)
1402 return 1;
1403 return 0;
1404}
1405
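A userspace model of ptr_change(), assuming for the sketch that LOGFS_FULLY_POPULATED occupies the low bit of a pointer: the parent only needs a rewrite when the child flips between empty and non-empty or between partial and full:

#include <assert.h>
#include <stdint.h>

#define FULLY_POPULATED 1ULL	/* assumed flag bit for this sketch */

static int ptr_change_model(uint64_t ofs, int partial, int full_count,
			    int block_factor)
{
	if ((ofs == 0) != (partial == 0))
		return 1;	/* empty <-> non-empty transition */
	return !!(ofs & FULLY_POPULATED) != (full_count == block_factor);
}

int main(void)
{
	/* child gained its first pointer: parent must be rewritten */
	assert(ptr_change_model(0, 1, 0, 512) == 1);
	/* still partially filled: no state change */
	assert(ptr_change_model(0x4000, 5, 5, 512) == 0);
	/* just became fully populated: parent must be rewritten */
	assert(ptr_change_model(0x4000, 512, 512, 512) == 1);
	return 0;
}
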
1406static int __logfs_write_rec(struct inode *inode, struct page *page,
1407 struct write_control *this_wc,
1408 pgoff_t bix, level_t target_level, level_t level)
1409{
1410 int ret, page_empty = 0;
1411 int child_no = get_bits(bix, SUBLEVEL(level));
1412 struct page *ipage;
1413 struct write_control child_wc = {
1414 .flags = this_wc->flags,
1415 };
1416
1417 ipage = logfs_get_write_page(inode, bix, level);
1418 if (!ipage)
1419 return -ENOMEM;
1420
1421 if (this_wc->ofs) {
1422 ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
1423 if (ret)
1424 goto out;
1425 } else if (!PageUptodate(ipage)) {
1426 page_empty = 1;
1427 logfs_read_empty(ipage);
1428 }
1429
1430 child_wc.ofs = block_get_pointer(ipage, child_no);
1431
1432 if ((__force u8)level-1 > (__force u8)target_level)
1433 ret = __logfs_write_rec(inode, page, &child_wc, bix,
1434 target_level, SUBLEVEL(level));
1435 else
1436 ret = logfs_write_i0(inode, page, &child_wc);
1437
1438 if (ret)
1439 goto out;
1440
1441 alloc_indirect_block(inode, ipage, page_empty);
1442 block_set_pointer(ipage, child_no, child_wc.ofs);
1443 /* FIXME: first condition seems superfluous */
1444 if (child_wc.ofs || logfs_block(ipage)->partial)
1445 this_wc->flags |= WF_WRITE;
1446 /* the condition on this_wc->ofs ensures that we won't consume extra
1447 * space for indirect blocks in the future, which we cannot reserve */
1448 if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage))
1449 ret = logfs_write_i0(inode, ipage, this_wc);
1450 else
1451 logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no);
1452out:
1453 logfs_put_write_page(ipage);
1454 return ret;
1455}
1456
1457static int logfs_write_rec(struct inode *inode, struct page *page,
1458 pgoff_t bix, level_t target_level, long flags)
1459{
1460 struct logfs_inode *li = logfs_inode(inode);
1461 struct write_control wc = {
1462 .ofs = li->li_data[INDIRECT_INDEX],
1463 .flags = flags,
1464 };
1465 int ret;
1466
1467 alloc_inode_block(inode);
1468
1469 if (li->li_height > (__force u8)target_level)
1470 ret = __logfs_write_rec(inode, page, &wc, bix, target_level,
1471 LEVEL(li->li_height));
1472 else
1473 ret = logfs_write_i0(inode, page, &wc);
1474 if (ret)
1475 return ret;
1476
1477 if (li->li_data[INDIRECT_INDEX] != wc.ofs) {
1478 li->li_data[INDIRECT_INDEX] = wc.ofs;
1479 logfs_set_alias(inode->i_sb, li->li_block,
1480 INDIRECT_INDEX + INODE_POINTER_OFS);
1481 }
1482 return ret;
1483}
1484
1485void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta)
1486{
1487 alloc_inode_block(inode);
1488 logfs_inode(inode)->li_block->ta = ta;
1489}
1490
1491void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta)
1492{
1493 struct logfs_block *block = logfs_inode(inode)->li_block;
1494
1495 if (block && block->ta)
1496 block->ta = NULL;
1497}
1498
1499static int grow_inode(struct inode *inode, u64 bix, level_t level)
1500{
1501 struct logfs_inode *li = logfs_inode(inode);
1502 u8 height = (__force u8)level;
1503 struct page *page;
1504 struct write_control wc = {
1505 .flags = WF_WRITE,
1506 };
1507 int err;
1508
1509 BUG_ON(height > 5 || li->li_height > 5);
1510 while (height > li->li_height || bix >= maxbix(li->li_height)) {
1511 page = logfs_get_write_page(inode, I0_BLOCKS + 1,
1512 LEVEL(li->li_height + 1));
1513 if (!page)
1514 return -ENOMEM;
1515 logfs_read_empty(page);
1516 alloc_indirect_block(inode, page, 1);
1517 block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]);
1518 err = logfs_write_i0(inode, page, &wc);
1519 logfs_put_write_page(page);
1520 if (err)
1521 return err;
1522 li->li_data[INDIRECT_INDEX] = wc.ofs;
1523 wc.ofs = 0;
1524 li->li_height++;
1525 logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS);
1526 }
1527 return 0;
1528}
1529
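Each pass of the loop above adds one level, wrapping the old tree top as child 0 of the new one. A sketch of the resulting height calculation, assuming a 512-pointer fanout per indirect block (4096-byte blocks, 8-byte pointers):

#include <stdio.h>
#include <stdint.h>

#define FANOUT 512	/* assumed pointers per indirect block */

int main(void)
{
	uint64_t bix = 300000;		/* target block index, made up */
	uint64_t capacity = FANOUT;	/* blocks addressable at height 1 */
	unsigned height;

	for (height = 1; bix >= capacity; height++)
		capacity *= FANOUT;
	printf("bix %llu needs tree height %u\n",
	       (unsigned long long)bix, height);
	return 0;
}
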
1530static int __logfs_write_buf(struct inode *inode, struct page *page, long flags)
1531{
1532 struct logfs_super *super = logfs_super(inode->i_sb);
1533 pgoff_t index = page->index;
1534 u64 bix;
1535 level_t level;
1536 int err;
1537
1538 flags |= WF_WRITE | WF_DELETE;
1539 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1540
1541 logfs_unpack_index(index, &bix, &level);
1542 if (logfs_block(page) && logfs_block(page)->reserved_bytes)
1543 super->s_dirty_pages -= logfs_block(page)->reserved_bytes;
1544
1545 if (index < I0_BLOCKS)
1546 return logfs_write_direct(inode, page, flags);
1547
1548 bix = adjust_bix(bix, level);
1549 err = grow_inode(inode, bix, level);
1550 if (err)
1551 return err;
1552 return logfs_write_rec(inode, page, bix, level, flags);
1553}
1554
1555int logfs_write_buf(struct inode *inode, struct page *page, long flags)
1556{
1557 struct super_block *sb = inode->i_sb;
1558 int ret;
1559
1560 logfs_get_wblocks(sb, page, flags & WF_LOCK);
1561 ret = __logfs_write_buf(inode, page, flags);
1562 logfs_put_wblocks(sb, page, flags & WF_LOCK);
1563 return ret;
1564}
1565
1566static int __logfs_delete(struct inode *inode, struct page *page)
1567{
1568 long flags = WF_DELETE;
1569
1570 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1571
1572 if (page->index < I0_BLOCKS)
1573 return logfs_write_direct(inode, page, flags);
1574 return logfs_write_rec(inode, page, page->index, 0, flags);
1575}
1576
1577int logfs_delete(struct inode *inode, pgoff_t index,
1578 struct shadow_tree *shadow_tree)
1579{
1580 struct super_block *sb = inode->i_sb;
1581 struct page *page;
1582 int ret;
1583
1584 page = logfs_get_read_page(inode, index, 0);
1585 if (!page)
1586 return -ENOMEM;
1587
1588 logfs_get_wblocks(sb, page, 1);
1589 ret = __logfs_delete(inode, page);
1590 logfs_put_wblocks(sb, page, 1);
1591
1592 logfs_put_read_page(page);
1593
1594 return ret;
1595}
1596
1597int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
1598 gc_level_t gc_level, long flags)
1599{
1600 level_t level = shrink_level(gc_level);
1601 struct page *page;
1602 int err;
1603
1604 page = logfs_get_write_page(inode, bix, level);
1605 if (!page)
1606 return -ENOMEM;
1607
1608 err = logfs_segment_read(inode, page, ofs, bix, level);
1609 if (!err) {
1610 if (level != 0)
1611 alloc_indirect_block(inode, page, 0);
1612 err = logfs_write_buf(inode, page, flags);
1613 if (!err && shrink_level(gc_level) == 0) {
1614 /* Rewrite cannot mark the inode dirty but has to
1615			 * write it immediately.
1616 * Q: Can't we just create an alias for the inode
1617 * instead? And if not, why not?
1618 */
1619 if (inode->i_ino == LOGFS_INO_MASTER)
1620 logfs_write_anchor(inode->i_sb);
1621 else {
1622 err = __logfs_write_inode(inode, flags);
1623 }
1624 }
1625 }
1626 logfs_put_write_page(page);
1627 return err;
1628}
1629
1630static int truncate_data_block(struct inode *inode, struct page *page,
1631 u64 ofs, struct logfs_shadow *shadow, u64 size)
1632{
1633 loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits;
1634 u64 bix;
1635 level_t level;
1636 int err;
1637
1638 /* Does truncation happen within this page? */
1639 if (size <= pageofs || size - pageofs >= PAGE_SIZE)
1640 return 0;
1641
1642 logfs_unpack_index(page->index, &bix, &level);
1643 BUG_ON(level != 0);
1644
1645 err = logfs_segment_read(inode, page, ofs, bix, level);
1646 if (err)
1647 return err;
1648
1649 zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE);
1650 return logfs_segment_write(inode, page, shadow);
1651}
1652
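The range test above singles out the one block that straddles the new end of file: earlier blocks survive, later ones are deleted, and only the straddling block gets its tail zeroed. A standalone sketch, assuming 4096-byte blocks:

#include <stdio.h>
#include <stdint.h>

#define BLKSIZE 4096ULL	/* assumed block size */

int main(void)
{
	uint64_t size = 10000;	/* new file size, made up */
	uint64_t pageofs;

	for (pageofs = 0; pageofs < 4 * BLKSIZE; pageofs += BLKSIZE) {
		if (size <= pageofs)
			printf("block at %llu: delete\n",
			       (unsigned long long)pageofs);
		else if (size - pageofs >= BLKSIZE)
			printf("block at %llu: keep\n",
			       (unsigned long long)pageofs);
		else	/* straddles EOF: zero the tail, rewrite */
			printf("block at %llu: zero from byte %llu\n",
			       (unsigned long long)pageofs,
			       (unsigned long long)(size - pageofs));
	}
	return 0;
}
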
1653static int logfs_truncate_i0(struct inode *inode, struct page *page,
1654 struct write_control *wc, u64 size)
1655{
1656 struct logfs_shadow *shadow;
1657 u64 bix;
1658 level_t level;
1659 int err = 0;
1660
1661 logfs_unpack_index(page->index, &bix, &level);
1662 BUG_ON(level != 0);
1663 shadow = alloc_shadow(inode, bix, level, wc->ofs);
1664
1665 err = truncate_data_block(inode, page, wc->ofs, shadow, size);
1666 if (err) {
1667 free_shadow(inode, shadow);
1668 return err;
1669 }
1670
1671 logfs_segment_delete(inode, shadow);
1672 set_iused(inode, shadow);
1673 fill_shadow_tree(inode, page, shadow);
1674 wc->ofs = shadow->new_ofs;
1675 return 0;
1676}
1677
1678static int logfs_truncate_direct(struct inode *inode, u64 size)
1679{
1680 struct logfs_inode *li = logfs_inode(inode);
1681 struct write_control wc;
1682 struct page *page;
1683 int e;
1684 int err;
1685
1686 alloc_inode_block(inode);
1687
1688 for (e = I0_BLOCKS - 1; e >= 0; e--) {
1689 if (size > (e+1) * LOGFS_BLOCKSIZE)
1690 break;
1691
1692 wc.ofs = li->li_data[e];
1693 if (!wc.ofs)
1694 continue;
1695
1696 page = logfs_get_write_page(inode, e, 0);
1697 if (!page)
1698 return -ENOMEM;
1699 err = logfs_segment_read(inode, page, wc.ofs, e, 0);
1700 if (err) {
1701 logfs_put_write_page(page);
1702 return err;
1703 }
1704 err = logfs_truncate_i0(inode, page, &wc, size);
1705 logfs_put_write_page(page);
1706 if (err)
1707 return err;
1708
1709 li->li_data[e] = wc.ofs;
1710 }
1711 return 0;
1712}
1713
1714/* FIXME: these need to become per-sb once we support different blocksizes */
1715static u64 __logfs_step[] = {
1716 1,
1717 I1_BLOCKS,
1718 I2_BLOCKS,
1719 I3_BLOCKS,
1720};
1721
1722static u64 __logfs_start_index[] = {
1723 I0_BLOCKS,
1724 I1_BLOCKS,
1725 I2_BLOCKS,
1726 I3_BLOCKS
1727};
1728
1729static inline u64 logfs_step(level_t level)
1730{
1731 return __logfs_step[(__force u8)level];
1732}
1733
1734static inline u64 logfs_factor(u8 level)
1735{
1736 return __logfs_step[level] * LOGFS_BLOCKSIZE;
1737}
1738
1739static inline u64 logfs_start_index(level_t level)
1740{
1741 return __logfs_start_index[(__force u8)level];
1742}
1743
1744static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level)
1745{
1746 logfs_unpack_index(index, bix, level);
1747 if (*bix <= logfs_start_index(SUBLEVEL(*level)))
1748 *bix = 0;
1749}
1750
1751static int __logfs_truncate_rec(struct inode *inode, struct page *ipage,
1752 struct write_control *this_wc, u64 size)
1753{
1754 int truncate_happened = 0;
1755 int e, err = 0;
1756 u64 bix, child_bix, next_bix;
1757 level_t level;
1758 struct page *page;
1759 struct write_control child_wc = { /* FIXME: flags */ };
1760
1761 logfs_unpack_raw_index(ipage->index, &bix, &level);
1762 err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
1763 if (err)
1764 return err;
1765
1766 for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) {
1767 child_bix = bix + e * logfs_step(SUBLEVEL(level));
1768 next_bix = child_bix + logfs_step(SUBLEVEL(level));
1769 if (size > next_bix * LOGFS_BLOCKSIZE)
1770 break;
1771
1772 child_wc.ofs = pure_ofs(block_get_pointer(ipage, e));
1773 if (!child_wc.ofs)
1774 continue;
1775
1776 page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level));
1777 if (!page)
1778 return -ENOMEM;
1779
1780 if ((__force u8)level > 1)
1781 err = __logfs_truncate_rec(inode, page, &child_wc, size);
1782 else
1783 err = logfs_truncate_i0(inode, page, &child_wc, size);
1784 logfs_put_write_page(page);
1785 if (err)
1786 return err;
1787
1788 truncate_happened = 1;
1789 alloc_indirect_block(inode, ipage, 0);
1790 block_set_pointer(ipage, e, child_wc.ofs);
1791 }
1792
1793 if (!truncate_happened) {
1794		printk(KERN_WARNING "ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size);
1795 return 0;
1796 }
1797
1798 this_wc->flags = WF_DELETE;
1799 if (logfs_block(ipage)->partial)
1800 this_wc->flags |= WF_WRITE;
1801
1802 return logfs_write_i0(inode, ipage, this_wc);
1803}
1804
1805static int logfs_truncate_rec(struct inode *inode, u64 size)
1806{
1807 struct logfs_inode *li = logfs_inode(inode);
1808 struct write_control wc = {
1809 .ofs = li->li_data[INDIRECT_INDEX],
1810 };
1811 struct page *page;
1812 int err;
1813
1814 alloc_inode_block(inode);
1815
1816 if (!wc.ofs)
1817 return 0;
1818
1819 page = logfs_get_write_page(inode, 0, LEVEL(li->li_height));
1820 if (!page)
1821 return -ENOMEM;
1822
1823 err = __logfs_truncate_rec(inode, page, &wc, size);
1824 logfs_put_write_page(page);
1825 if (err)
1826 return err;
1827
1828 if (li->li_data[INDIRECT_INDEX] != wc.ofs)
1829 li->li_data[INDIRECT_INDEX] = wc.ofs;
1830 return 0;
1831}
1832
1833static int __logfs_truncate(struct inode *inode, u64 size)
1834{
1835 int ret;
1836
1837 if (size >= logfs_factor(logfs_inode(inode)->li_height))
1838 return 0;
1839
1840 ret = logfs_truncate_rec(inode, size);
1841 if (ret)
1842 return ret;
1843
1844 return logfs_truncate_direct(inode, size);
1845}
1846
1847int logfs_truncate(struct inode *inode, u64 size)
1848{
1849 struct super_block *sb = inode->i_sb;
1850 int err;
1851
1852 logfs_get_wblocks(sb, NULL, 1);
1853 err = __logfs_truncate(inode, size);
1854 if (!err)
1855 err = __logfs_write_inode(inode, 0);
1856 logfs_put_wblocks(sb, NULL, 1);
1857
1858 if (!err)
1859 err = vmtruncate(inode, size);
1860
1861 /* I don't trust error recovery yet. */
1862 WARN_ON(err);
1863 return err;
1864}
1865
1866static void move_page_to_inode(struct inode *inode, struct page *page)
1867{
1868 struct logfs_inode *li = logfs_inode(inode);
1869 struct logfs_block *block = logfs_block(page);
1870
1871 if (!block)
1872 return;
1873
1874 log_blockmove("move_page_to_inode(%llx, %llx, %x)\n",
1875 block->ino, block->bix, block->level);
1876 BUG_ON(li->li_block);
1877 block->ops = &inode_block_ops;
1878 block->inode = inode;
1879 li->li_block = block;
1880
1881 block->page = NULL;
1882 page->private = 0;
1883 ClearPagePrivate(page);
1884}
1885
1886static void move_inode_to_page(struct page *page, struct inode *inode)
1887{
1888 struct logfs_inode *li = logfs_inode(inode);
1889 struct logfs_block *block = li->li_block;
1890
1891 if (!block)
1892 return;
1893
1894 log_blockmove("move_inode_to_page(%llx, %llx, %x)\n",
1895 block->ino, block->bix, block->level);
1896 BUG_ON(PagePrivate(page));
1897 block->ops = &indirect_block_ops;
1898 block->page = page;
1899 page->private = (unsigned long)block;
1900 SetPagePrivate(page);
1901
1902 block->inode = NULL;
1903 li->li_block = NULL;
1904}
1905
1906int logfs_read_inode(struct inode *inode)
1907{
1908 struct super_block *sb = inode->i_sb;
1909 struct logfs_super *super = logfs_super(sb);
1910 struct inode *master_inode = super->s_master_inode;
1911 struct page *page;
1912 struct logfs_disk_inode *di;
1913 u64 ino = inode->i_ino;
1914
1915 if (ino << sb->s_blocksize_bits > i_size_read(master_inode))
1916 return -ENODATA;
1917 if (!logfs_exist_block(master_inode, ino))
1918 return -ENODATA;
1919
1920 page = read_cache_page(master_inode->i_mapping, ino,
1921 (filler_t *)logfs_readpage, NULL);
1922 if (IS_ERR(page))
1923 return PTR_ERR(page);
1924
1925 di = kmap_atomic(page, KM_USER0);
1926 logfs_disk_to_inode(di, inode);
1927 kunmap_atomic(di, KM_USER0);
1928 move_page_to_inode(inode, page);
1929 page_cache_release(page);
1930 return 0;
1931}
1932
1933/* Caller must logfs_put_write_page(page); */
1934static struct page *inode_to_page(struct inode *inode)
1935{
1936 struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode;
1937 struct logfs_disk_inode *di;
1938 struct page *page;
1939
1940 BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
1941
1942 page = logfs_get_write_page(master_inode, inode->i_ino, 0);
1943 if (!page)
1944 return NULL;
1945
1946 di = kmap_atomic(page, KM_USER0);
1947 logfs_inode_to_disk(inode, di);
1948 kunmap_atomic(di, KM_USER0);
1949 move_inode_to_page(page, inode);
1950 return page;
1951}
1952
1953/* Cheaper version of write_inode. All changes are concealed in
1954 * aliases, which are moved back. No write to the medium happens.
1955 */
1956void logfs_clear_inode(struct inode *inode)
1957{
1958 struct super_block *sb = inode->i_sb;
1959 struct logfs_inode *li = logfs_inode(inode);
1960 struct logfs_block *block = li->li_block;
1961 struct page *page;
1962
1963 /* Only deleted files may be dirty at this point */
1964 BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
1965 if (!block)
1966 return;
1967 if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
1968 block->ops->free_block(inode->i_sb, block);
1969 return;
1970 }
1971
1972 BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
1973 page = inode_to_page(inode);
1974 BUG_ON(!page); /* FIXME: Use emergency page */
1975 logfs_put_write_page(page);
1976}
1977
1978static int do_write_inode(struct inode *inode)
1979{
1980 struct super_block *sb = inode->i_sb;
1981 struct inode *master_inode = logfs_super(sb)->s_master_inode;
1982 loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits;
1983 struct page *page;
1984 int err;
1985
1986 BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
1987 /* FIXME: lock inode */
1988
1989 if (i_size_read(master_inode) < size)
1990 i_size_write(master_inode, size);
1991
1992 /* TODO: Tell vfs this inode is clean now */
1993
1994 page = inode_to_page(inode);
1995 if (!page)
1996 return -ENOMEM;
1997
1998 /* FIXME: transaction is part of logfs_block now. Is that enough? */
1999 err = logfs_write_buf(master_inode, page, 0);
2000 logfs_put_write_page(page);
2001 return err;
2002}
2003
2004static void logfs_mod_segment_entry(struct super_block *sb, u32 segno,
2005 int write,
2006 void (*change_se)(struct logfs_segment_entry *, long),
2007 long arg)
2008{
2009 struct logfs_super *super = logfs_super(sb);
2010 struct inode *inode;
2011 struct page *page;
2012 struct logfs_segment_entry *se;
2013 pgoff_t page_no;
2014 int child_no;
2015
2016 page_no = segno >> (sb->s_blocksize_bits - 3);
2017 child_no = segno & ((sb->s_blocksize >> 3) - 1);
2018
2019 inode = super->s_segfile_inode;
2020 page = logfs_get_write_page(inode, page_no, 0);
2021 BUG_ON(!page); /* FIXME: We need some reserve page for this case */
2022 if (!PageUptodate(page))
2023 logfs_read_block(inode, page, WRITE);
2024
2025 if (write)
2026 alloc_indirect_block(inode, page, 0);
2027 se = kmap_atomic(page, KM_USER0);
2028 change_se(se + child_no, arg);
2029 if (write) {
2030 logfs_set_alias(sb, logfs_block(page), child_no);
2031 BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize);
2032 }
2033 kunmap_atomic(se, KM_USER0);
2034
2035 logfs_put_write_page(page);
2036}
2037
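To sanity-check the index arithmetic above: assuming 8-byte segment entries and 4096-byte blocks (s_blocksize_bits of 12), each segfile block holds 512 entries, so segment 1234 lands in block 2, slot 210. A standalone sketch:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned blocksize_bits = 12;	/* assumed 4096-byte blocks */
	uint32_t blocksize = 1u << blocksize_bits;
	uint32_t segno = 1234;
	uint64_t page_no = segno >> (blocksize_bits - 3);
	uint32_t child_no = segno & ((blocksize >> 3) - 1);

	/* 1234 = 2 * 512 + 210: entry 210 in segfile block 2 */
	assert(page_no == 2 && child_no == 210);
	printf("segno %u -> block %llu, slot %u\n",
	       segno, (unsigned long long)page_no, child_no);
	return 0;
}
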
2038static void __get_segment_entry(struct logfs_segment_entry *se, long _target)
2039{
2040 struct logfs_segment_entry *target = (void *)_target;
2041
2042 *target = *se;
2043}
2044
2045void logfs_get_segment_entry(struct super_block *sb, u32 segno,
2046 struct logfs_segment_entry *se)
2047{
2048 logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se);
2049}
2050
2051static void __set_segment_used(struct logfs_segment_entry *se, long increment)
2052{
2053 u32 valid;
2054
2055 valid = be32_to_cpu(se->valid);
2056 valid += increment;
2057 se->valid = cpu_to_be32(valid);
2058}
2059
2060void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment)
2061{
2062 struct logfs_super *super = logfs_super(sb);
2063 u32 segno = ofs >> super->s_segshift;
2064
2065 if (!increment)
2066 return;
2067
2068 logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment);
2069}
2070
2071static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level)
2072{
2073 se->ec_level = cpu_to_be32(ec_level);
2074}
2075
2076void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
2077 gc_level_t gc_level)
2078{
2079 u32 ec_level = ec << 4 | (__force u8)gc_level;
2080
2081 logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level);
2082}
2083
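The ec_level value packs the erase count into the upper bits and the gc level into the low 4; ostore_get_erase_count() below performs the matching unpack. A minimal round-trip sketch:

#include <assert.h>
#include <stdint.h>

static uint32_t pack_ec_level(uint32_t ec, uint8_t gc_level)
{
	return ec << 4 | (gc_level & 0xf);
}

int main(void)
{
	uint32_t ec_level = pack_ec_level(7, 3);

	assert((ec_level >> 4) == 7);	/* erase count */
	assert((ec_level & 0xf) == 3);	/* gc level */
	return 0;
}
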
2084static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore)
2085{
2086 se->valid = cpu_to_be32(RESERVED);
2087}
2088
2089void logfs_set_segment_reserved(struct super_block *sb, u32 segno)
2090{
2091 logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0);
2092}
2093
2094static void __set_segment_unreserved(struct logfs_segment_entry *se,
2095 long ec_level)
2096{
2097 se->valid = 0;
2098 se->ec_level = cpu_to_be32(ec_level);
2099}
2100
2101void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)
2102{
2103 u32 ec_level = ec << 4;
2104
2105 logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved,
2106 ec_level);
2107}
2108
2109int __logfs_write_inode(struct inode *inode, long flags)
2110{
2111 struct super_block *sb = inode->i_sb;
2112 int ret;
2113
2114 logfs_get_wblocks(sb, NULL, flags & WF_LOCK);
2115 ret = do_write_inode(inode);
2116 logfs_put_wblocks(sb, NULL, flags & WF_LOCK);
2117 return ret;
2118}
2119
2120static int do_delete_inode(struct inode *inode)
2121{
2122 struct super_block *sb = inode->i_sb;
2123 struct inode *master_inode = logfs_super(sb)->s_master_inode;
2124 struct page *page;
2125 int ret;
2126
2127 page = logfs_get_write_page(master_inode, inode->i_ino, 0);
2128 if (!page)
2129 return -ENOMEM;
2130
2131 move_inode_to_page(page, inode);
2132
2133 logfs_get_wblocks(sb, page, 1);
2134 ret = __logfs_delete(master_inode, page);
2135 logfs_put_wblocks(sb, page, 1);
2136
2137 logfs_put_write_page(page);
2138 return ret;
2139}
2140
2141/*
2142 * ZOMBIE inodes have already been deleted and, were it not for the
2143 * validity check, would simply remain dead. No need to kill them again here.
2144 */
2145void logfs_delete_inode(struct inode *inode)
2146{
2147 struct logfs_inode *li = logfs_inode(inode);
2148
2149 if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
2150 li->li_flags |= LOGFS_IF_ZOMBIE;
2151 if (i_size_read(inode) > 0)
2152 logfs_truncate(inode, 0);
2153 do_delete_inode(inode);
2154 }
2155 truncate_inode_pages(&inode->i_data, 0);
2156 clear_inode(inode);
2157}
2158
2159void btree_write_block(struct logfs_block *block)
2160{
2161 struct inode *inode;
2162 struct page *page;
2163 int err, cookie;
2164
2165 inode = logfs_safe_iget(block->sb, block->ino, &cookie);
2166 page = logfs_get_write_page(inode, block->bix, block->level);
2167
2168 err = logfs_readpage_nolock(page);
2169 BUG_ON(err);
2170 BUG_ON(!PagePrivate(page));
2171 BUG_ON(logfs_block(page) != block);
2172 err = __logfs_write_buf(inode, page, 0);
2173 BUG_ON(err);
2174 BUG_ON(PagePrivate(page) || page->private);
2175
2176 logfs_put_write_page(page);
2177 logfs_safe_iput(inode, cookie);
2178}
2179
2180/**
2181 * logfs_inode_write - write inode or dentry objects
2182 *
2183 * @inode: parent inode (ifile or directory)
2184 * @buf: object to write (inode or dentry)
2185 * @count: object size
2186 * @bix: object number (file position in blocks/objects)
2187 * @flags: write flags; WF_LOCK requests taking the write lock here,
2188 *	leave it clear if the write lock is already taken
2189 * @shadow_tree: shadow below this inode
2190 *
2191 * FIXME: All callers of this put a 200-300 byte variable on the stack,
2192 * only to call here and do a memcpy from that stack variable. A good
2193 * example of wasted performance and stack space.
2194 */
2195int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
2196 loff_t bix, long flags, struct shadow_tree *shadow_tree)
2197{
2198 loff_t pos = bix << inode->i_sb->s_blocksize_bits;
2199 int err;
2200 struct page *page;
2201 void *pagebuf;
2202
2203 BUG_ON(pos & (LOGFS_BLOCKSIZE-1));
2204 BUG_ON(count > LOGFS_BLOCKSIZE);
2205 page = logfs_get_write_page(inode, bix, 0);
2206 if (!page)
2207 return -ENOMEM;
2208
2209 pagebuf = kmap_atomic(page, KM_USER0);
2210 memcpy(pagebuf, buf, count);
2211 flush_dcache_page(page);
2212 kunmap_atomic(pagebuf, KM_USER0);
2213
2214 if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE)
2215 i_size_write(inode, pos + LOGFS_BLOCKSIZE);
2216
2217 err = logfs_write_buf(inode, page, flags);
2218 logfs_put_write_page(page);
2219 return err;
2220}
2221
2222int logfs_open_segfile(struct super_block *sb)
2223{
2224 struct logfs_super *super = logfs_super(sb);
2225 struct inode *inode;
2226
2227 inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE);
2228 if (IS_ERR(inode))
2229 return PTR_ERR(inode);
2230 super->s_segfile_inode = inode;
2231 return 0;
2232}
2233
2234int logfs_init_rw(struct super_block *sb)
2235{
2236 struct logfs_super *super = logfs_super(sb);
2237 int min_fill = 3 * super->s_no_blocks;
2238
2239 INIT_LIST_HEAD(&super->s_object_alias);
2240 mutex_init(&super->s_write_mutex);
2241 super->s_block_pool = mempool_create_kmalloc_pool(min_fill,
2242 sizeof(struct logfs_block));
2243 super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill,
2244 sizeof(struct logfs_shadow));
2245 return 0;
2246}
2247
2248void logfs_cleanup_rw(struct super_block *sb)
2249{
2250 struct logfs_super *super = logfs_super(sb);
2251
2252 destroy_meta_inode(super->s_segfile_inode);
2253 if (super->s_block_pool)
2254 mempool_destroy(super->s_block_pool);
2255 if (super->s_shadow_pool)
2256 mempool_destroy(super->s_shadow_pool);
2257}
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
new file mode 100644
index 000000000000..0ecd8f07c11e
--- /dev/null
+++ b/fs/logfs/segment.c
@@ -0,0 +1,935 @@
1/*
2 * fs/logfs/segment.c - Handling the Object Store
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Object store or ostore makes up the complete device with exception of
9 * the superblock and journal areas. Apart from its own metadata it stores
10 * three kinds of objects: inodes, dentries and blocks, both data and indirect.
11 */
12#include "logfs.h"
13
14static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
15{
16 struct logfs_super *super = logfs_super(sb);
17 struct btree_head32 *head = &super->s_reserved_segments;
18 int err;
19
20 err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
21 if (err)
22 return err;
23 logfs_super(sb)->s_bad_segments++;
24 /* FIXME: write to journal */
25 return 0;
26}
27
28int logfs_erase_segment(struct super_block *sb, u32 segno, int ensure_erase)
29{
30 struct logfs_super *super = logfs_super(sb);
31
32 super->s_gec++;
33
34 return super->s_devops->erase(sb, (u64)segno << super->s_segshift,
35 super->s_segsize, ensure_erase);
36}
37
38static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes)
39{
40 s32 ofs;
41
42 logfs_open_area(area, bytes);
43
44 ofs = area->a_used_bytes;
45 area->a_used_bytes += bytes;
46 BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize);
47
48 return dev_ofs(area->a_sb, area->a_segno, ofs);
49}
50
51static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
52 int use_filler)
53{
54 struct logfs_super *super = logfs_super(sb);
55 struct address_space *mapping = super->s_mapping_inode->i_mapping;
56 filler_t *filler = super->s_devops->readpage;
57 struct page *page;
58
59 BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS);
60 if (use_filler)
61 page = read_cache_page(mapping, index, filler, sb);
62 else {
63 page = find_or_create_page(mapping, index, GFP_NOFS);
64 unlock_page(page);
65 }
66 return page;
67}
68
69void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
70 int use_filler)
71{
72 pgoff_t index = ofs >> PAGE_SHIFT;
73 struct page *page;
74 long offset = ofs & (PAGE_SIZE-1);
75 long copylen;
76
77 /* Only logfs_wbuf_recover may use len==0 */
78 BUG_ON(!len && !use_filler);
79 do {
80 copylen = min((ulong)len, PAGE_SIZE - offset);
81
82 page = get_mapping_page(area->a_sb, index, use_filler);
83		BUG_ON(!page); /* FIXME: reserve a pool */
84		SetPageUptodate(page);
85 memcpy(page_address(page) + offset, buf, copylen);
86 SetPagePrivate(page);
87 page_cache_release(page);
88
89 buf += copylen;
90 len -= copylen;
91 offset = 0;
92 index++;
93 } while (len);
94}
95
96static void pad_partial_page(struct logfs_area *area)
97{
98 struct super_block *sb = area->a_sb;
99 struct page *page;
100 u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
101 pgoff_t index = ofs >> PAGE_SHIFT;
102 long offset = ofs & (PAGE_SIZE-1);
103 u32 len = PAGE_SIZE - offset;
104
105 if (len % PAGE_SIZE) {
106 page = get_mapping_page(sb, index, 0);
107 BUG_ON(!page); /* FIXME: reserve a pool */
108 memset(page_address(page) + offset, 0xff, len);
109 SetPagePrivate(page);
110 page_cache_release(page);
111 }
112}
113
114static void pad_full_pages(struct logfs_area *area)
115{
116 struct super_block *sb = area->a_sb;
117 struct logfs_super *super = logfs_super(sb);
118 u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
119 u32 len = super->s_segsize - area->a_used_bytes;
120 pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT;
121 pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT;
122 struct page *page;
123
124 while (no_indizes) {
125 page = get_mapping_page(sb, index, 0);
126 BUG_ON(!page); /* FIXME: reserve a pool */
127 SetPageUptodate(page);
128 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
129 SetPagePrivate(page);
130 page_cache_release(page);
131 index++;
132 no_indizes--;
133 }
134}
135
136/*
137 * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
138 * Also make sure we allocate (and memset) all pages for final writeout.
139 */
140static void pad_wbuf(struct logfs_area *area, int final)
141{
142 pad_partial_page(area);
143 if (final)
144 pad_full_pages(area);
145}
146
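A standalone sketch of the two padding steps with made-up sizes (128KiB segments, 4096-byte pages): pad_partial_page() fills the tail of the current page with 0xff, and pad_full_pages() covers the remaining whole pages on final writeout:

#include <stdio.h>
#include <stdint.h>

#define SEGSIZE	(128 * 1024)	/* assumed segment size */
#define PGSIZE	4096		/* assumed page size */

int main(void)
{
	uint32_t used_bytes = 5000;	/* write position, made up */
	uint32_t in_page = used_bytes % PGSIZE;
	uint32_t partial = in_page ? PGSIZE - in_page : 0;
	uint32_t full = SEGSIZE - (used_bytes + partial);

	printf("pad partial page: %u bytes of 0xff\n", partial);
	printf("pad on final writeout: %u full pages\n", full / PGSIZE);
	return 0;
}
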
147/*
148 * We have to be careful with the alias tree. Since lookup is done by bix,
149 * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with
150 * indirect blocks. So always use it through accessor functions.
151 */
152static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix,
153 level_t level)
154{
155 struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
156 pgoff_t index = logfs_pack_index(bix, level);
157
158 return btree_lookup128(head, ino, index);
159}
160
161static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix,
162 level_t level, void *val)
163{
164 struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
165 pgoff_t index = logfs_pack_index(bix, level);
166
167 return btree_insert128(head, ino, index, val, GFP_NOFS);
168}
169
170static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
171 write_alias_t *write_one_alias)
172{
173 struct object_alias_item *item;
174 int err;
175
176 list_for_each_entry(item, &block->item_list, list) {
177 err = write_alias_journal(sb, block->ino, block->bix,
178 block->level, item->child_no, item->val);
179 if (err)
180 return err;
181 }
182 return 0;
183}
184
185static gc_level_t btree_block_level(struct logfs_block *block)
186{
187 return expand_level(block->ino, block->level);
188}
189
190static struct logfs_block_ops btree_block_ops = {
191 .write_block = btree_write_block,
192 .block_level = btree_block_level,
193 .free_block = __free_block,
194 .write_alias = btree_write_alias,
195};
196
197int logfs_load_object_aliases(struct super_block *sb,
198 struct logfs_obj_alias *oa, int count)
199{
200 struct logfs_super *super = logfs_super(sb);
201 struct logfs_block *block;
202 struct object_alias_item *item;
203 u64 ino, bix;
204 level_t level;
205 int i, err;
206
207 super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
208 count /= sizeof(*oa);
209 for (i = 0; i < count; i++) {
210 item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
211 if (!item)
212 return -ENOMEM;
213 memset(item, 0, sizeof(*item));
214
215 super->s_no_object_aliases++;
216 item->val = oa[i].val;
217 item->child_no = be16_to_cpu(oa[i].child_no);
218
219 ino = be64_to_cpu(oa[i].ino);
220 bix = be64_to_cpu(oa[i].bix);
221 level = LEVEL(oa[i].level);
222
223 log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n",
224 ino, bix, level, item->child_no,
225 be64_to_cpu(item->val));
226 block = alias_tree_lookup(sb, ino, bix, level);
227 if (!block) {
228 block = __alloc_block(sb, ino, bix, level);
229 block->ops = &btree_block_ops;
230 err = alias_tree_insert(sb, ino, bix, level, block);
231 BUG_ON(err); /* mempool empty */
232 }
233 if (test_and_set_bit(item->child_no, block->alias_map)) {
234 printk(KERN_ERR"LogFS: Alias collision detected\n");
235 return -EIO;
236 }
237 list_move_tail(&block->alias_list, &super->s_object_alias);
238 list_add(&item->list, &block->item_list);
239 }
240 return 0;
241}
242
243static void kill_alias(void *_block, unsigned long ignore0,
244 u64 ignore1, u64 ignore2, size_t ignore3)
245{
246 struct logfs_block *block = _block;
247 struct super_block *sb = block->sb;
248 struct logfs_super *super = logfs_super(sb);
249 struct object_alias_item *item;
250
251 while (!list_empty(&block->item_list)) {
252 item = list_entry(block->item_list.next, typeof(*item), list);
253 list_del(&item->list);
254 mempool_free(item, super->s_alias_pool);
255 }
256 block->ops->free_block(sb, block);
257}
258
259static int obj_type(struct inode *inode, level_t level)
260{
261 if (level == 0) {
262 if (S_ISDIR(inode->i_mode))
263 return OBJ_DENTRY;
264 if (inode->i_ino == LOGFS_INO_MASTER)
265 return OBJ_INODE;
266 }
267 return OBJ_BLOCK;
268}
269
270static int obj_len(struct super_block *sb, int obj_type)
271{
272 switch (obj_type) {
273 case OBJ_DENTRY:
274 return sizeof(struct logfs_disk_dentry);
275 case OBJ_INODE:
276 return sizeof(struct logfs_disk_inode);
277 case OBJ_BLOCK:
278 return sb->s_blocksize;
279 default:
280 BUG();
281 }
282}
283
284static int __logfs_segment_write(struct inode *inode, void *buf,
285 struct logfs_shadow *shadow, int type, int len, int compr)
286{
287 struct logfs_area *area;
288 struct super_block *sb = inode->i_sb;
289 s64 ofs;
290 struct logfs_object_header h;
291 int acc_len;
292
293 if (shadow->gc_level == 0)
294 acc_len = len;
295 else
296 acc_len = obj_len(sb, type);
297
298 area = get_area(sb, shadow->gc_level);
299 ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE);
300 LOGFS_BUG_ON(ofs <= 0, sb);
301 /*
302 * Order is important. logfs_get_free_bytes(), by modifying the
303 * segment file, may modify the content of the very page we're about
304 * to write now. Which is fine, as long as the calculated crc and
305 * written data still match. So do the modifications _before_
306 * calculating the crc.
307 */
308
309 h.len = cpu_to_be16(len);
310 h.type = type;
311 h.compr = compr;
312 h.ino = cpu_to_be64(inode->i_ino);
313 h.bix = cpu_to_be64(shadow->bix);
314 h.crc = logfs_crc32(&h, sizeof(h) - 4, 4);
315 h.data_crc = logfs_crc32(buf, len, 0);
316
317 logfs_buf_write(area, ofs, &h, sizeof(h));
318 logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len);
319
320 shadow->new_ofs = ofs;
321 shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE;
322
323 return 0;
324}
325
326static s64 logfs_segment_write_compress(struct inode *inode, void *buf,
327 struct logfs_shadow *shadow, int type, int len)
328{
329 struct super_block *sb = inode->i_sb;
330 void *compressor_buf = logfs_super(sb)->s_compressed_je;
331 ssize_t compr_len;
332 int ret;
333
334 mutex_lock(&logfs_super(sb)->s_journal_mutex);
335 compr_len = logfs_compress(buf, compressor_buf, len, len);
336
337 if (compr_len >= 0) {
338 ret = __logfs_segment_write(inode, compressor_buf, shadow,
339 type, compr_len, COMPR_ZLIB);
340 } else {
341 ret = __logfs_segment_write(inode, buf, shadow, type, len,
342 COMPR_NONE);
343 }
344 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
345 return ret;
346}
347
348/**
349 * logfs_segment_write - write data block to object store
350 * @inode: inode containing data
351 *
352 * Returns an errno or zero.
353 */
354int logfs_segment_write(struct inode *inode, struct page *page,
355 struct logfs_shadow *shadow)
356{
357 struct super_block *sb = inode->i_sb;
358 struct logfs_super *super = logfs_super(sb);
359 int do_compress, type, len;
360 int ret;
361 void *buf;
362
363 super->s_flags |= LOGFS_SB_FLAG_DIRTY;
364 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
365 do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED;
366 if (shadow->gc_level != 0) {
367 /* temporarily disable compression for indirect blocks */
368 do_compress = 0;
369 }
370
371 type = obj_type(inode, shrink_level(shadow->gc_level));
372 len = obj_len(sb, type);
373 buf = kmap(page);
374 if (do_compress)
375 ret = logfs_segment_write_compress(inode, buf, shadow, type,
376 len);
377 else
378 ret = __logfs_segment_write(inode, buf, shadow, type, len,
379 COMPR_NONE);
380 kunmap(page);
381
382 log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n",
383 shadow->ino, shadow->bix, shadow->gc_level,
384 shadow->old_ofs, shadow->new_ofs,
385 shadow->old_len, shadow->new_len);
386 /* this BUG_ON did catch a locking bug. useful */
387 BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1)));
388 return ret;
389}
390
391int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf)
392{
393 pgoff_t index = ofs >> PAGE_SHIFT;
394 struct page *page;
395 long offset = ofs & (PAGE_SIZE-1);
396 long copylen;
397
398 while (len) {
399 copylen = min((ulong)len, PAGE_SIZE - offset);
400
401 page = get_mapping_page(sb, index, 1);
402 if (IS_ERR(page))
403 return PTR_ERR(page);
404 memcpy(buf, page_address(page) + offset, copylen);
405 page_cache_release(page);
406
407 buf += copylen;
408 len -= copylen;
409 offset = 0;
410 index++;
411 }
412 return 0;
413}
414
415/*
416 * The "position" of indirect blocks is ambiguous. It can be the position
417 * of any data block somewhere behind this indirect block. So we need to
418 * normalize the positions through logfs_block_mask() before comparing.
419 */
420static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level)
421{
422 return (pos1 & logfs_block_mask(sb, level)) !=
423 (pos2 & logfs_block_mask(sb, level));
424}
425
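A sketch of the normalization, modeling logfs_block_mask() for a 512-way fanout (an assumption for the sketch; the real mask depends on the block size): all block indexes under the same indirect block compare equal after masking:

#include <assert.h>
#include <stdint.h>

static uint64_t block_mask_model(unsigned level)
{
	/* level 0: compare exact; each level ignores 9 more index bits */
	return ~(((uint64_t)1 << (9 * level)) - 1);
}

int main(void)
{
	/* bix 14, 15, 16 sit under the same level-1 indirect block... */
	assert((14 & block_mask_model(1)) == (16 & block_mask_model(1)));
	/* ...while 600 belongs to the next one */
	assert((14 & block_mask_model(1)) != (600 & block_mask_model(1)));
	return 0;
}
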
426#if 0
427static int read_seg_header(struct super_block *sb, u64 ofs,
428 struct logfs_segment_header *sh)
429{
430 __be32 crc;
431 int err;
432
433 err = wbuf_read(sb, ofs, sizeof(*sh), sh);
434 if (err)
435 return err;
436 crc = logfs_crc32(sh, sizeof(*sh), 4);
437 if (crc != sh->crc) {
438 printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
439 "got %x\n", ofs, be32_to_cpu(sh->crc),
440 be32_to_cpu(crc));
441 return -EIO;
442 }
443 return 0;
444}
445#endif
446
447static int read_obj_header(struct super_block *sb, u64 ofs,
448 struct logfs_object_header *oh)
449{
450 __be32 crc;
451 int err;
452
453 err = wbuf_read(sb, ofs, sizeof(*oh), oh);
454 if (err)
455 return err;
456 crc = logfs_crc32(oh, sizeof(*oh) - 4, 4);
457 if (crc != oh->crc) {
458 printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
459 "got %x\n", ofs, be32_to_cpu(oh->crc),
460 be32_to_cpu(crc));
461 return -EIO;
462 }
463 return 0;
464}
465
466static void move_btree_to_page(struct inode *inode, struct page *page,
467 __be64 *data)
468{
469 struct super_block *sb = inode->i_sb;
470 struct logfs_super *super = logfs_super(sb);
471 struct btree_head128 *head = &super->s_object_alias_tree;
472 struct logfs_block *block;
473 struct object_alias_item *item, *next;
474
475 if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS))
476 return;
477
478 block = btree_remove128(head, inode->i_ino, page->index);
479 if (!block)
480 return;
481
482 log_blockmove("move_btree_to_page(%llx, %llx, %x)\n",
483 block->ino, block->bix, block->level);
484 list_for_each_entry_safe(item, next, &block->item_list, list) {
485 data[item->child_no] = item->val;
486 list_del(&item->list);
487 mempool_free(item, super->s_alias_pool);
488 }
489 block->page = page;
490 SetPagePrivate(page);
491 page->private = (unsigned long)block;
492 block->ops = &indirect_block_ops;
493 initialize_block_counters(page, block, data, 0);
494}
495
496/*
497 * This silences a false, yet annoying gcc warning. I hate it when my editor
498 * jumps into bitops.h each time I recompile this file.
499 * TODO: Complain to gcc folks about this and upgrade compiler.
500 */
501static unsigned long fnb(const unsigned long *addr,
502 unsigned long size, unsigned long offset)
503{
504 return find_next_bit(addr, size, offset);
505}
506
507void move_page_to_btree(struct page *page)
508{
509 struct logfs_block *block = logfs_block(page);
510 struct super_block *sb = block->sb;
511 struct logfs_super *super = logfs_super(sb);
512 struct object_alias_item *item;
513 unsigned long pos;
514 __be64 *child;
515 int err;
516
517 if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) {
518 block->ops->free_block(sb, block);
519 return;
520 }
521 log_blockmove("move_page_to_btree(%llx, %llx, %x)\n",
522 block->ino, block->bix, block->level);
523 super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
524
525 for (pos = 0; ; pos++) {
526 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
527 if (pos >= LOGFS_BLOCK_FACTOR)
528 break;
529
530 item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
531 BUG_ON(!item); /* mempool empty */
532 memset(item, 0, sizeof(*item));
533
534 child = kmap_atomic(page, KM_USER0);
535 item->val = child[pos];
536 kunmap_atomic(child, KM_USER0);
537 item->child_no = pos;
538 list_add(&item->list, &block->item_list);
539 }
540 block->page = NULL;
541 ClearPagePrivate(page);
542 page->private = 0;
543 block->ops = &btree_block_ops;
544 err = alias_tree_insert(block->sb, block->ino, block->bix, block->level,
545 block);
546 BUG_ON(err); /* mempool empty */
547 ClearPageUptodate(page);
548}
549
550static int __logfs_segment_read(struct inode *inode, void *buf,
551 u64 ofs, u64 bix, level_t level)
552{
553 struct super_block *sb = inode->i_sb;
554 void *compressor_buf = logfs_super(sb)->s_compressed_je;
555 struct logfs_object_header oh;
556 __be32 crc;
557 u16 len;
558 int err, block_len;
559
560 block_len = obj_len(sb, obj_type(inode, level));
561 err = read_obj_header(sb, ofs, &oh);
562 if (err)
563 goto out_err;
564
565 err = -EIO;
566 if (be64_to_cpu(oh.ino) != inode->i_ino
567 || check_pos(sb, be64_to_cpu(oh.bix), bix, level)) {
568 printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: "
569 "expected (%lx, %llx), got (%llx, %llx)\n",
570 ofs, inode->i_ino, bix,
571 be64_to_cpu(oh.ino), be64_to_cpu(oh.bix));
572 goto out_err;
573 }
574
575 len = be16_to_cpu(oh.len);
576
577 switch (oh.compr) {
578 case COMPR_NONE:
579 err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf);
580 if (err)
581 goto out_err;
582 crc = logfs_crc32(buf, len, 0);
583 if (crc != oh.data_crc) {
584 printk(KERN_ERR"LOGFS: uncompressed data crc error at "
585 "%llx: expected %x, got %x\n", ofs,
586 be32_to_cpu(oh.data_crc),
587 be32_to_cpu(crc));
588 goto out_err;
589 }
590 break;
591 case COMPR_ZLIB:
592 mutex_lock(&logfs_super(sb)->s_journal_mutex);
593 err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len,
594 compressor_buf);
595 if (err) {
596 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
597 goto out_err;
598 }
599 crc = logfs_crc32(compressor_buf, len, 0);
600 if (crc != oh.data_crc) {
601 printk(KERN_ERR"LOGFS: compressed data crc error at "
602 "%llx: expected %x, got %x\n", ofs,
603 be32_to_cpu(oh.data_crc),
604 be32_to_cpu(crc));
605 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
606 goto out_err;
607 }
608 err = logfs_uncompress(compressor_buf, buf, len, block_len);
609 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
610 if (err) {
611 printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs);
612 goto out_err;
613 }
614 break;
615 default:
616 LOGFS_BUG(sb);
617 err = -EIO;
618 goto out_err;
619 }
620 return 0;
621
622out_err:
623 logfs_set_ro(sb);
624 printk(KERN_ERR"LOGFS: device is read-only now\n");
625 LOGFS_BUG(sb);
626 return err;
627}
628
629/**
630 * logfs_segment_read - read data block from object store
631 * @inode: inode containing data
632 * @buf: data buffer
633 * @ofs: physical data offset
634 * @bix: block index
635 * @level: block level
636 *
637 * Returns 0 on success or a negative errno.
638 */
639int logfs_segment_read(struct inode *inode, struct page *page,
640 u64 ofs, u64 bix, level_t level)
641{
642 int err;
643 void *buf;
644
645 if (PageUptodate(page))
646 return 0;
647
648 ofs &= ~LOGFS_FULLY_POPULATED;
649
650 buf = kmap(page);
651 err = __logfs_segment_read(inode, buf, ofs, bix, level);
652 if (!err) {
653 move_btree_to_page(inode, page, buf);
654 SetPageUptodate(page);
655 }
656 kunmap(page);
657 log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n",
658 inode->i_ino, bix, level, ofs, err);
659 return err;
660}
661
662int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
663{
664 struct super_block *sb = inode->i_sb;
665 struct logfs_super *super = logfs_super(sb);
666 struct logfs_object_header h;
667 u16 len;
668 int err;
669
670 super->s_flags |= LOGFS_SB_FLAG_DIRTY;
671 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
672 BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED);
673 if (!shadow->old_ofs)
674 return 0;
675
676 log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n",
677 shadow->ino, shadow->bix, shadow->gc_level,
678 shadow->old_ofs, shadow->new_ofs,
679 shadow->old_len, shadow->new_len);
680 err = read_obj_header(sb, shadow->old_ofs, &h);
681 LOGFS_BUG_ON(err, sb);
682 LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb);
683 LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix),
684 shrink_level(shadow->gc_level)), sb);
685
686 if (shadow->gc_level == 0)
687 len = be16_to_cpu(h.len);
688 else
689 len = obj_len(sb, h.type);
690 shadow->old_len = len + sizeof(h);
691 return 0;
692}
693
694void freeseg(struct super_block *sb, u32 segno)
695{
696 struct logfs_super *super = logfs_super(sb);
697 struct address_space *mapping = super->s_mapping_inode->i_mapping;
698 struct page *page;
699 u64 ofs, start, end;
700
701 start = dev_ofs(sb, segno, 0);
702 end = dev_ofs(sb, segno + 1, 0);
703 for (ofs = start; ofs < end; ofs += PAGE_SIZE) {
704 page = find_get_page(mapping, ofs >> PAGE_SHIFT);
705 if (!page)
706 continue;
707 ClearPagePrivate(page);
708 page_cache_release(page);
709 }
710}
711
712int logfs_open_area(struct logfs_area *area, size_t bytes)
713{
714 struct super_block *sb = area->a_sb;
715 struct logfs_super *super = logfs_super(sb);
716 int err, closed = 0;
717
718 if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize)
719 return 0;
720
721 if (area->a_is_open) {
722 u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
723 u32 len = super->s_segsize - area->a_written_bytes;
724
725 log_gc("logfs_close_area(%x)\n", area->a_segno);
726 pad_wbuf(area, 1);
727 super->s_devops->writeseg(area->a_sb, ofs, len);
728 freeseg(sb, area->a_segno);
729 closed = 1;
730 }
731
732 area->a_used_bytes = 0;
733 area->a_written_bytes = 0;
734again:
735 area->a_ops->get_free_segment(area);
736 area->a_ops->get_erase_count(area);
737
738 log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level);
739 err = area->a_ops->erase_segment(area);
740 if (err) {
741 printk(KERN_WARNING "LogFS: Error erasing segment %x\n",
742 area->a_segno);
743 logfs_mark_segment_bad(sb, area->a_segno);
744 goto again;
745 }
746 area->a_is_open = 1;
747 return closed;
748}
749
750void logfs_sync_area(struct logfs_area *area)
751{
752 struct super_block *sb = area->a_sb;
753 struct logfs_super *super = logfs_super(sb);
754 u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
755 u32 len = (area->a_used_bytes - area->a_written_bytes);
756
757 if (super->s_writesize)
758 len &= ~(super->s_writesize - 1);
759 if (len == 0)
760 return;
761 pad_wbuf(area, 0);
762 super->s_devops->writeseg(sb, ofs, len);
763 area->a_written_bytes += len;
764}

void logfs_sync_segments(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	for_each_area(i)
		logfs_sync_area(super->s_area[i]);
}

/*
 * Pick a free segment to be used for this area. Effectively takes a
 * candidate from the free list (not really a candidate anymore).
 */
static void ostore_get_free_segment(struct logfs_area *area)
{
	struct super_block *sb = area->a_sb;
	struct logfs_super *super = logfs_super(sb);

	if (super->s_free_list.count == 0) {
		printk(KERN_ERR"LOGFS: ran out of free segments\n");
		LOGFS_BUG(sb);
	}

	area->a_segno = get_best_cand(sb, &super->s_free_list, NULL);
}

static void ostore_get_erase_count(struct logfs_area *area)
{
	struct logfs_segment_entry se;
	u32 ec_level;

	logfs_get_segment_entry(area->a_sb, area->a_segno, &se);
	BUG_ON(se.ec_level == cpu_to_be32(BADSEG) ||
			se.valid == cpu_to_be32(RESERVED));

	ec_level = be32_to_cpu(se.ec_level);
	area->a_erase_count = (ec_level >> 4) + 1;
}

static int ostore_erase_segment(struct logfs_area *area)
{
	struct super_block *sb = area->a_sb;
	struct logfs_segment_header sh;
	u64 ofs;
	int err;

	err = logfs_erase_segment(sb, area->a_segno, 0);
	if (err)
		return err;

	sh.pad = 0;
	sh.type = SEG_OSTORE;
	sh.level = (__force u8)area->a_level;
	sh.segno = cpu_to_be32(area->a_segno);
	sh.ec = cpu_to_be32(area->a_erase_count);
	sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
	sh.crc = logfs_crc32(&sh, sizeof(sh), 4);

	logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count,
			area->a_level);

	ofs = dev_ofs(sb, area->a_segno, 0);
	area->a_used_bytes = sizeof(sh);
	logfs_buf_write(area, ofs, &sh, sizeof(sh));
	return 0;
}

static const struct logfs_area_ops ostore_area_ops = {
	.get_free_segment	= ostore_get_free_segment,
	.get_erase_count	= ostore_get_erase_count,
	.erase_segment		= ostore_erase_segment,
};

static void free_area(struct logfs_area *area)
{
	if (area)
		freeseg(area->a_sb, area->a_segno);
	kfree(area);
}

static struct logfs_area *alloc_area(struct super_block *sb)
{
	struct logfs_area *area;

	area = kzalloc(sizeof(*area), GFP_KERNEL);
	if (!area)
		return NULL;

	area->a_sb = sb;
	return area;
}

static void map_invalidatepage(struct page *page, unsigned long l)
{
	BUG();
}

static int map_releasepage(struct page *page, gfp_t g)
{
	/* Don't release these pages */
	return 0;
}

static const struct address_space_operations mapping_aops = {
	.invalidatepage = map_invalidatepage,
	.releasepage	= map_releasepage,
	.set_page_dirty = __set_page_dirty_nobuffers,
};

int logfs_init_mapping(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct address_space *mapping;
	struct inode *inode;

	inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING);
	if (IS_ERR(inode))
		return PTR_ERR(inode);
	super->s_mapping_inode = inode;
	mapping = inode->i_mapping;
	mapping->a_ops = &mapping_aops;
	/* Would it be possible to use __GFP_HIGHMEM as well? */
	mapping_set_gfp_mask(mapping, GFP_NOFS);
	return 0;
}

int logfs_init_areas(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i = -1;

	super->s_alias_pool = mempool_create_kmalloc_pool(600,
			sizeof(struct object_alias_item));
	if (!super->s_alias_pool)
		return -ENOMEM;

	super->s_journal_area = alloc_area(sb);
	if (!super->s_journal_area)
		goto err;

	for_each_area(i) {
		super->s_area[i] = alloc_area(sb);
		if (!super->s_area[i])
			goto err;
		super->s_area[i]->a_level = GC_LEVEL(i);
		super->s_area[i]->a_ops = &ostore_area_ops;
	}
	btree_init_mempool128(&super->s_object_alias_tree,
			super->s_btree_pool);
	return 0;

err:
	for (i--; i >= 0; i--)
		free_area(super->s_area[i]);
	free_area(super->s_journal_area);
	mempool_destroy(super->s_alias_pool);
	return -ENOMEM;
}

void logfs_cleanup_areas(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias);
	for_each_area(i)
		free_area(super->s_area[i]);
	free_area(super->s_journal_area);
	destroy_meta_inode(super->s_mapping_inode);
}
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
new file mode 100644
index 000000000000..9d856c49afc5
--- /dev/null
+++ b/fs/logfs/super.c
@@ -0,0 +1,649 @@
/*
 * fs/logfs/super.c
 *
 * As should be obvious for Linux kernel code, license is GPLv2
 *
 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
 *
 * Generally contains mount/umount code and also serves as a dump area for
 * any functions that don't fit elsewhere and neither justify a file of their
 * own.
 */
#include "logfs.h"
#include <linux/bio.h>
#include <linux/mtd/mtd.h>
#include <linux/statfs.h>
#include <linux/buffer_head.h>

static DEFINE_MUTEX(emergency_mutex);
static struct page *emergency_page;

struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index)
{
	filler_t *filler = (filler_t *)mapping->a_ops->readpage;
	struct page *page;
	int err;

	page = read_cache_page(mapping, index, filler, NULL);
	if (page)
		return page;

	/* No more pages available, switch to emergency page */
	printk(KERN_INFO"Logfs: Using emergency page\n");
	mutex_lock(&emergency_mutex);
	err = filler(NULL, emergency_page);
	if (err) {
		mutex_unlock(&emergency_mutex);
		printk(KERN_EMERG"Logfs: Error reading emergency page\n");
		return ERR_PTR(err);
	}
	return emergency_page;
}

void emergency_read_end(struct page *page)
{
	if (page == emergency_page)
		mutex_unlock(&emergency_mutex);
	else
		page_cache_release(page);
}

static void dump_segfile(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_segment_entry se;
	u32 segno;

	for (segno = 0; segno < super->s_no_segs; segno++) {
		logfs_get_segment_entry(sb, segno, &se);
		printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level),
				be32_to_cpu(se.valid));
		if (++segno < super->s_no_segs) {
			logfs_get_segment_entry(sb, segno, &se);
			printk(" %6x %8x", be32_to_cpu(se.ec_level),
					be32_to_cpu(se.valid));
		}
		if (++segno < super->s_no_segs) {
			logfs_get_segment_entry(sb, segno, &se);
			printk(" %6x %8x", be32_to_cpu(se.ec_level),
					be32_to_cpu(se.valid));
		}
		if (++segno < super->s_no_segs) {
			logfs_get_segment_entry(sb, segno, &se);
			printk(" %6x %8x", be32_to_cpu(se.ec_level),
					be32_to_cpu(se.valid));
		}
		printk("\n");
	}
}

/*
 * logfs_crash_dump - dump debug information to device
 *
 * The LogFS superblock only occupies part of a segment. This function will
 * write as much debug information as it can gather into the spare space.
 */
void logfs_crash_dump(struct super_block *sb)
{
	dump_segfile(sb);
}

/*
 * TODO: move to lib/string.c
 */
/**
 * memchr_inv - Find a character in an area of memory.
 * @s: The memory area
 * @c: The byte to search for
 * @n: The size of the area.
 *
 * returns the address of the first character other than @c, or %NULL
 * if the whole buffer contains just @c.
 */
void *memchr_inv(const void *s, int c, size_t n)
{
	const unsigned char *p = s;
	while (n-- != 0)
		if ((unsigned char)c != *p++)
			return (void *)(p - 1);

	return NULL;
}
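
A typical use for a helper like memchr_inv() in a flash filesystem is verifying that an erased region really reads back as all 0xFF before reusing it. A userspace sketch of that check, re-implementing the helper above so it runs standalone:

#include <stdio.h>
#include <string.h>

/* same semantics as memchr_inv() above */
static void *my_memchr_inv(const void *s, int c, size_t n)
{
	const unsigned char *p = s;
	while (n-- != 0)
		if ((unsigned char)c != *p++)
			return (void *)(p - 1);
	return NULL;
}

int main(void)
{
	unsigned char seg[16];
	unsigned char *bad;

	memset(seg, 0xFF, sizeof(seg));
	seg[9] = 0x00;	/* simulate a stray write into an "erased" segment */

	bad = my_memchr_inv(seg, 0xFF, sizeof(seg));
	if (bad)
		printf("dirty byte at offset %td\n", bad - seg);
	return 0;
}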

/*
 * FIXME: There should be a reserve for root, similar to ext2.
 */
int logfs_statfs(struct dentry *dentry, struct kstatfs *stats)
{
	struct super_block *sb = dentry->d_sb;
	struct logfs_super *super = logfs_super(sb);

	stats->f_type	= LOGFS_MAGIC_U32;
	stats->f_bsize	= sb->s_blocksize;
	stats->f_blocks	= super->s_size >> LOGFS_BLOCK_BITS >> 3;
	stats->f_bfree	= super->s_free_bytes >> sb->s_blocksize_bits;
	stats->f_bavail	= super->s_free_bytes >> sb->s_blocksize_bits;
	stats->f_files	= 0;
	stats->f_ffree	= 0;
	stats->f_namelen = LOGFS_MAX_NAMELEN;
	return 0;
}
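
To make the shifts concrete: assuming LOGFS_BLOCK_BITS is 12 (4 KiB blocks — an assumption, not spelled out in this hunk), a 1 GiB filesystem reports f_blocks = 2^30 >> 12 >> 3 = 32768, i.e. one reported block per eight actual blocks, while f_bfree and f_bavail are simply s_free_bytes converted to block units. The asymmetry between the two calculations is inherited from the code as merged; the FIXME above suggests the accounting was still considered provisional.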

static int logfs_sb_set(struct super_block *sb, void *_super)
{
	struct logfs_super *super = _super;

	sb->s_fs_info = super;
	sb->s_mtd = super->s_mtd;
	sb->s_bdev = super->s_bdev;
	return 0;
}

static int logfs_sb_test(struct super_block *sb, void *_super)
{
	struct logfs_super *super = _super;
	struct mtd_info *mtd = super->s_mtd;

	if (mtd && sb->s_mtd == mtd)
		return 1;
	if (super->s_bdev && sb->s_bdev == super->s_bdev)
		return 1;
	return 0;
}

static void set_segment_header(struct logfs_segment_header *sh, u8 type,
		u8 level, u32 segno, u32 ec)
{
	sh->pad = 0;
	sh->type = type;
	sh->level = level;
	sh->segno = cpu_to_be32(segno);
	sh->ec = cpu_to_be32(ec);
	sh->gec = cpu_to_be64(segno);
	sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4);
}
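
The trailing 4 in the logfs_crc32() call is the byte offset at which checksumming starts: the crc field occupies the first four bytes of the segment header, so the checksum must exclude itself. A userspace sketch of the same self-excluding-checksum convention, assuming a zlib-style crc32() (the kernel helper's exact seed and polynomial are not shown by this patch):

#include <stdint.h>
#include <zlib.h>	/* crc32(); assumed available */

struct hdr {
	uint32_t crc;		/* covers everything after this field */
	uint8_t  payload[28];
};

static void hdr_seal(struct hdr *h)
{
	h->crc = crc32(0, (const unsigned char *)h + 4, sizeof(*h) - 4);
}

static int hdr_ok(const struct hdr *h)
{
	return h->crc == crc32(0, (const unsigned char *)h + 4, sizeof(*h) - 4);
}

int main(void)
{
	struct hdr h = { 0 };

	hdr_seal(&h);
	return hdr_ok(&h) ? 0 : 1;	/* exits 0 when the seal verifies */
}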

static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds,
		u32 segno, u32 ec)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_segment_header *sh = &ds->ds_sh;
	int i;

	memset(ds, 0, sizeof(*ds));
	set_segment_header(sh, SEG_SUPER, 0, segno, ec);

	ds->ds_ifile_levels	= super->s_ifile_levels;
	ds->ds_iblock_levels	= super->s_iblock_levels;
	ds->ds_data_levels	= super->s_data_levels; /* XXX: Remove */
	ds->ds_segment_shift	= super->s_segshift;
	ds->ds_block_shift	= sb->s_blocksize_bits;
	ds->ds_write_shift	= super->s_writeshift;
	ds->ds_filesystem_size	= cpu_to_be64(super->s_size);
	ds->ds_segment_size	= cpu_to_be32(super->s_segsize);
	ds->ds_bad_seg_reserve	= cpu_to_be32(super->s_bad_seg_reserve);
	ds->ds_feature_incompat	= cpu_to_be64(super->s_feature_incompat);
	ds->ds_feature_ro_compat= cpu_to_be64(super->s_feature_ro_compat);
	ds->ds_feature_compat	= cpu_to_be64(super->s_feature_compat);
	ds->ds_feature_flags	= cpu_to_be64(super->s_feature_flags);
	ds->ds_root_reserve	= cpu_to_be64(super->s_root_reserve);
	ds->ds_speed_reserve	= cpu_to_be64(super->s_speed_reserve);
	journal_for_each(i)
		ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]);
	ds->ds_magic		= cpu_to_be64(LOGFS_MAGIC);
	ds->ds_crc = logfs_crc32(ds, sizeof(*ds),
			LOGFS_SEGMENT_HEADERSIZE + 12);
}

static int write_one_sb(struct super_block *sb,
		struct page *(*find_sb)(struct super_block *sb, u64 *ofs))
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_disk_super *ds;
	struct logfs_segment_entry se;
	struct page *page;
	u64 ofs;
	u32 ec, segno;
	int err;

	page = find_sb(sb, &ofs);
	if (!page)
		return -EIO;
	ds = page_address(page);
	segno = seg_no(sb, ofs);
	logfs_get_segment_entry(sb, segno, &se);
	ec = be32_to_cpu(se.ec_level) >> 4;
	ec++;
	logfs_set_segment_erased(sb, segno, ec, 0);
	logfs_write_ds(sb, ds, segno, ec);
	err = super->s_devops->write_sb(sb, page);
	page_cache_release(page);
	return err;
}

int logfs_write_sb(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int err;

	/* First superblock */
	err = write_one_sb(sb, super->s_devops->find_first_sb);
	if (err)
		return err;

	/* Last superblock */
	err = write_one_sb(sb, super->s_devops->find_last_sb);
	if (err)
		return err;
	return 0;
}

static int ds_cmp(const void *ds0, const void *ds1)
{
	size_t len = sizeof(struct logfs_disk_super);

	/* We know the segment headers differ, so ignore them */
	len -= LOGFS_SEGMENT_HEADERSIZE;
	ds0 += LOGFS_SEGMENT_HEADERSIZE;
	ds1 += LOGFS_SEGMENT_HEADERSIZE;
	return memcmp(ds0, ds1, len);
}

static int logfs_recover_sb(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_disk_super _ds0, *ds0 = &_ds0;
	struct logfs_disk_super _ds1, *ds1 = &_ds1;
	int err, valid0, valid1;

	/* read first superblock */
	err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0);
	if (err)
		return err;
	/* read last superblock */
	err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1);
	if (err)
		return err;
	valid0 = logfs_check_ds(ds0) == 0;
	valid1 = logfs_check_ds(ds1) == 0;

	if (!valid0 && valid1) {
		printk(KERN_INFO"First superblock is invalid - fixing.\n");
		return write_one_sb(sb, super->s_devops->find_first_sb);
	}
	if (valid0 && !valid1) {
		printk(KERN_INFO"Last superblock is invalid - fixing.\n");
		return write_one_sb(sb, super->s_devops->find_last_sb);
	}
	if (valid0 && valid1 && ds_cmp(ds0, ds1)) {
		printk(KERN_INFO"Superblocks don't match - fixing.\n");
		return logfs_write_sb(sb);
	}
	/* If neither is valid now, something's wrong. Didn't we properly
	 * check them before?!? */
	BUG_ON(!valid0 && !valid1);
	return 0;
}

static int logfs_make_writeable(struct super_block *sb)
{
	int err;

	err = logfs_open_segfile(sb);
	if (err)
		return err;

	/* Repair any broken superblock copies */
	err = logfs_recover_sb(sb);
	if (err)
		return err;

	/* Check areas for trailing unaccounted data */
	err = logfs_check_areas(sb);
	if (err)
		return err;

	/* Do one GC pass before any data gets dirtied */
	logfs_gc_pass(sb);

	/* after all initializations are done, replay the journal
	 * for rw-mounts, if necessary */
	err = logfs_replay_journal(sb);
	if (err)
		return err;

	return 0;
}

static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
{
	struct logfs_super *super = logfs_super(sb);
	struct inode *rootdir;
	int err;

	/* root dir */
	rootdir = logfs_iget(sb, LOGFS_INO_ROOT);
	if (IS_ERR(rootdir))
		goto fail;

	sb->s_root = d_alloc_root(rootdir);
	if (!sb->s_root)
		goto fail2;

	super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
	if (!super->s_erase_page)
		goto fail2;
	memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);

	/* FIXME: check for read-only mounts */
	err = logfs_make_writeable(sb);
	if (err)
		goto fail3;

	log_super("LogFS: Finished mounting\n");
	simple_set_mnt(mnt, sb);
	return 0;

fail3:
	__free_page(super->s_erase_page);
fail2:
	iput(rootdir);
fail:
	iput(logfs_super(sb)->s_master_inode);
	return -EIO;
}

int logfs_check_ds(struct logfs_disk_super *ds)
{
	struct logfs_segment_header *sh = &ds->ds_sh;

	if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC))
		return -EINVAL;
	if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4))
		return -EINVAL;
	if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds),
			LOGFS_SEGMENT_HEADERSIZE + 12))
		return -EINVAL;
	return 0;
}

static struct page *find_super_block(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct page *first, *last;

	first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]);
	if (!first || IS_ERR(first))
		return NULL;
	last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
	if (!last || IS_ERR(last)) {
		page_cache_release(first);
		return NULL;
	}

	if (!logfs_check_ds(page_address(first))) {
		page_cache_release(last);
		return first;
	}

	/* First one didn't work, try the second superblock */
	if (!logfs_check_ds(page_address(last))) {
		page_cache_release(first);
		return last;
	}

	/* Neither worked, sorry folks */
	page_cache_release(first);
	page_cache_release(last);
	return NULL;
}
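
The two-copy scheme here is worth noting: find_super_block() prefers the superblock at the start of the device and falls back to the one at the end, so a torn update of either copy still leaves a mountable filesystem; logfs_recover_sb() earlier in this file then rewrites whichever copy turned out stale or mismatched.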

static int __logfs_read_sb(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct page *page;
	struct logfs_disk_super *ds;
	int i;

	page = find_super_block(sb);
	if (!page)
		return -EIO;

	ds = page_address(page);
	super->s_size = be64_to_cpu(ds->ds_filesystem_size);
	super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve);
	super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve);
	super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve);
	super->s_segsize = 1 << ds->ds_segment_shift;
	super->s_segmask = (1 << ds->ds_segment_shift) - 1;
	super->s_segshift = ds->ds_segment_shift;
	sb->s_blocksize = 1 << ds->ds_block_shift;
	sb->s_blocksize_bits = ds->ds_block_shift;
	super->s_writesize = 1 << ds->ds_write_shift;
	super->s_writeshift = ds->ds_write_shift;
	super->s_no_segs = super->s_size >> super->s_segshift;
	super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits;
	super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat);
	super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat);
	super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat);
	super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags);

	journal_for_each(i)
		super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]);

	super->s_ifile_levels = ds->ds_ifile_levels;
	super->s_iblock_levels = ds->ds_iblock_levels;
	super->s_data_levels = ds->ds_data_levels;
	super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels
		+ super->s_data_levels;
	page_cache_release(page);
	return 0;
}

static int logfs_read_sb(struct super_block *sb, int read_only)
{
	struct logfs_super *super = logfs_super(sb);
	int ret;

	super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL);
	if (!super->s_btree_pool)
		return -ENOMEM;

	btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool);
	btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool);

	ret = logfs_init_mapping(sb);
	if (ret)
		return ret;

	ret = __logfs_read_sb(sb);
	if (ret)
		return ret;

	if (super->s_feature_incompat & ~LOGFS_FEATURES_INCOMPAT)
		return -EIO;
	if ((super->s_feature_ro_compat & ~LOGFS_FEATURES_RO_COMPAT) &&
			!read_only)
		return -EIO;

	mutex_init(&super->s_dirop_mutex);
	mutex_init(&super->s_object_alias_mutex);
	INIT_LIST_HEAD(&super->s_freeing_list);

	ret = logfs_init_rw(sb);
	if (ret)
		return ret;

	ret = logfs_init_areas(sb);
	if (ret)
		return ret;

	ret = logfs_init_gc(sb);
	if (ret)
		return ret;

	ret = logfs_init_journal(sb);
	if (ret)
		return ret;

	return 0;
}

static void logfs_kill_sb(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	log_super("LogFS: Start unmounting\n");
	/* Alias entries slow down mount, so evict as many as possible */
	sync_filesystem(sb);
	logfs_write_anchor(sb);

	/*
	 * From this point on alias entries are simply dropped - and any
	 * writes to the object store are considered bugs.
	 */
	super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
	log_super("LogFS: Now in shutdown\n");
	generic_shutdown_super(sb);

	BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes);

	logfs_cleanup_gc(sb);
	logfs_cleanup_journal(sb);
	logfs_cleanup_areas(sb);
	logfs_cleanup_rw(sb);
	if (super->s_erase_page)
		__free_page(super->s_erase_page);
	super->s_devops->put_device(sb);
	mempool_destroy(super->s_btree_pool);
	mempool_destroy(super->s_alias_pool);
	kfree(super);
	log_super("LogFS: Finished unmounting\n");
}

int logfs_get_sb_device(struct file_system_type *type, int flags,
		struct mtd_info *mtd, struct block_device *bdev,
		const struct logfs_device_ops *devops, struct vfsmount *mnt)
{
	struct logfs_super *super;
	struct super_block *sb;
	int err = -ENOMEM;
	static int mount_count;

	log_super("LogFS: Start mount %x\n", mount_count++);
	super = kzalloc(sizeof(*super), GFP_KERNEL);
	if (!super)
		goto err0;

	super->s_mtd = mtd;
	super->s_bdev = bdev;
	err = -EINVAL;
	sb = sget(type, logfs_sb_test, logfs_sb_set, super);
	if (IS_ERR(sb))
		goto err0;

	if (sb->s_root) {
		/* Device is already in use */
		err = 0;
		simple_set_mnt(mnt, sb);
		goto err0;
	}

	super->s_devops = devops;

	/*
	 * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache
	 * only covers 16TB and the upper 8TB are used for indirect blocks.
	 * On 64bit system we could bump up the limit, but that would make
	 * the filesystem incompatible with 32bit systems.
	 */
	sb->s_maxbytes	= (1ull << 43) - 1;
	sb->s_op	= &logfs_super_operations;
	sb->s_flags	= flags | MS_NOATIME;

	err = logfs_read_sb(sb, sb->s_flags & MS_RDONLY);
	if (err)
		goto err1;

	sb->s_flags |= MS_ACTIVE;
	err = logfs_get_sb_final(sb, mnt);
	if (err)
		goto err1;
	return 0;

err1:
	deactivate_locked_super(sb);
	return err;
err0:
	kfree(super);
	//devops->put_device(sb);
	return err;
}

static int logfs_get_sb(struct file_system_type *type, int flags,
		const char *devname, void *data, struct vfsmount *mnt)
{
	ulong mtdnr;

	if (!devname)
		return logfs_get_sb_bdev(type, flags, devname, mnt);
	if (strncmp(devname, "mtd", 3))
		return logfs_get_sb_bdev(type, flags, devname, mnt);

	{
		char *garbage;
		mtdnr = simple_strtoul(devname+3, &garbage, 0);
		if (*garbage)
			return -EINVAL;
	}

	return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
}
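
The devname convention accepted above — literally "mtd" followed by a number and nothing else — is simple to mirror in userspace when generating mount arguments. A hypothetical helper (strtol in place of the kernel's simple_strtoul; none of this is from the patch):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* returns the mtd number, or -1 if devname is not of the form "mtdN" */
static long parse_mtd_devname(const char *devname)
{
	char *end;
	long nr;

	if (!devname || strncmp(devname, "mtd", 3) != 0)
		return -1;
	nr = strtol(devname + 3, &end, 0);
	if (end == devname + 3 || *end != '\0')
		return -1;
	return nr;
}

int main(void)
{
	/* prints "0 -1": trailing garbage is rejected, as in the kernel code */
	printf("%ld %ld\n", parse_mtd_devname("mtd0"), parse_mtd_devname("mtd0x"));
	return 0;
}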

static struct file_system_type logfs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "logfs",
	.get_sb		= logfs_get_sb,
	.kill_sb	= logfs_kill_sb,
	.fs_flags	= FS_REQUIRES_DEV,

};

static int __init logfs_init(void)
{
	int ret;

	emergency_page = alloc_pages(GFP_KERNEL, 0);
	if (!emergency_page)
		return -ENOMEM;

	ret = logfs_compr_init();
	if (ret)
		goto out1;

	ret = logfs_init_inode_cache();
	if (ret)
		goto out2;

	return register_filesystem(&logfs_fs_type);
out2:
	logfs_compr_exit();
out1:
	__free_pages(emergency_page, 0);
	return ret;
}

static void __exit logfs_exit(void)
{
	unregister_filesystem(&logfs_fs_type);
	logfs_destroy_inode_cache();
	logfs_compr_exit();
	__free_pages(emergency_page, 0);
}

module_init(logfs_init);
module_exit(logfs_exit);

MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
MODULE_DESCRIPTION("scalable flash filesystem");
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 74ea82d72164..756f8c93780c 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -17,8 +17,10 @@
 #include <linux/init.h>
 #include <linux/highuid.h>
 #include <linux/vfs.h>
+#include <linux/writeback.h>
 
-static int minix_write_inode(struct inode * inode, int wait);
+static int minix_write_inode(struct inode *inode,
+		struct writeback_control *wbc);
 static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int minix_remount (struct super_block * sb, int * flags, char * data);
 
@@ -552,7 +554,7 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
 	return bh;
 }
 
-static int minix_write_inode(struct inode *inode, int wait)
+static int minix_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int err = 0;
 	struct buffer_head *bh;
@@ -563,7 +565,7 @@ static int minix_write_inode(struct inode *inode, int wait)
 	bh = V2_minix_update_inode(inode);
 	if (!bh)
 		return -EIO;
-	if (wait && buffer_dirty(bh)) {
+	if (wbc->sync_mode == WB_SYNC_ALL && buffer_dirty(bh)) {
 		sync_dirty_buffer(bh);
 		if (buffer_req(bh) && !buffer_uptodate(bh)) {
 			printk("IO error syncing minix inode [%s:%08lx]\n",
diff --git a/fs/mpage.c b/fs/mpage.c
index 42381bd6543b..598d54e200eb 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -561,7 +561,7 @@ page_is_mapped:
 	if (page->index >= end_index) {
 		/*
 		 * The page straddles i_size. It must be zeroed out on each
-		 * and every writepage invokation because it may be mmapped.
+		 * and every writepage invocation because it may be mmapped.
 		 * "A file is mapped in multiples of the page size. For a file
 		 * that is not a multiple of the page size, the remaining memory
 		 * is zeroed when mapped, and writes to that region are not
diff --git a/fs/namei.c b/fs/namei.c
index a4855af776a8..a7dce91a7e42 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -19,7 +19,6 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/namei.h>
-#include <linux/quotaops.h>
 #include <linux/pagemap.h>
 #include <linux/fsnotify.h>
 #include <linux/personality.h>
@@ -498,8 +497,6 @@ static int link_path_walk(const char *, struct nameidata *);
 
 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
 {
-	int res = 0;
-	char *name;
 	if (IS_ERR(link))
 		goto fail;
 
@@ -510,22 +507,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
 		path_get(&nd->root);
 	}
 
-	res = link_path_walk(link, nd);
-	if (nd->depth || res || nd->last_type!=LAST_NORM)
-		return res;
-	/*
-	 * If it is an iterative symlinks resolution in open_namei() we
-	 * have to copy the last component. And all that crap because of
-	 * bloody create() on broken symlinks. Furrfu...
-	 */
-	name = __getname();
-	if (unlikely(!name)) {
-		path_put(&nd->path);
-		return -ENOMEM;
-	}
-	strcpy(name, nd->last.name);
-	nd->last.name = name;
-	return 0;
+	return link_path_walk(link, nd);
 fail:
 	path_put(&nd->path);
 	return PTR_ERR(link);
@@ -547,10 +529,10 @@ static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
 	nd->path.dentry = path->dentry;
 }
 
-static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd)
+static __always_inline int
+__do_follow_link(struct path *path, struct nameidata *nd, void **p)
 {
 	int error;
-	void *cookie;
 	struct dentry *dentry = path->dentry;
 
 	touch_atime(path->mnt, dentry);
@@ -562,9 +544,9 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
 	}
 	mntget(path->mnt);
 	nd->last_type = LAST_BIND;
-	cookie = dentry->d_inode->i_op->follow_link(dentry, nd);
-	error = PTR_ERR(cookie);
-	if (!IS_ERR(cookie)) {
+	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
+	error = PTR_ERR(*p);
+	if (!IS_ERR(*p)) {
 		char *s = nd_get_link(nd);
 		error = 0;
 		if (s)
@@ -574,8 +556,6 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
 			if (error)
 				path_put(&nd->path);
 		}
-		if (dentry->d_inode->i_op->put_link)
-			dentry->d_inode->i_op->put_link(dentry, nd, cookie);
 	}
 	return error;
 }
@@ -589,6 +569,7 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
  */
 static inline int do_follow_link(struct path *path, struct nameidata *nd)
 {
+	void *cookie;
 	int err = -ELOOP;
 	if (current->link_count >= MAX_NESTED_LINKS)
 		goto loop;
@@ -602,7 +583,9 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd)
 	current->link_count++;
 	current->total_link_count++;
 	nd->depth++;
-	err = __do_follow_link(path, nd);
+	err = __do_follow_link(path, nd, &cookie);
+	if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
+		path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
 	path_put(path);
 	current->link_count--;
 	nd->depth--;
@@ -689,33 +672,20 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
 	set_root(nd);
 
 	while(1) {
-		struct vfsmount *parent;
 		struct dentry *old = nd->path.dentry;
 
 		if (nd->path.dentry == nd->root.dentry &&
 		    nd->path.mnt == nd->root.mnt) {
 			break;
 		}
-		spin_lock(&dcache_lock);
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
-			nd->path.dentry = dget(nd->path.dentry->d_parent);
-			spin_unlock(&dcache_lock);
+			/* rare case of legitimate dget_parent()... */
+			nd->path.dentry = dget_parent(nd->path.dentry);
 			dput(old);
 			break;
 		}
-		spin_unlock(&dcache_lock);
-		spin_lock(&vfsmount_lock);
-		parent = nd->path.mnt->mnt_parent;
-		if (parent == nd->path.mnt) {
-			spin_unlock(&vfsmount_lock);
+		if (!follow_up(&nd->path))
 			break;
-		}
-		mntget(parent);
-		nd->path.dentry = dget(nd->path.mnt->mnt_mountpoint);
-		spin_unlock(&vfsmount_lock);
-		dput(old);
-		mntput(nd->path.mnt);
-		nd->path.mnt = parent;
 	}
 	follow_mount(&nd->path);
 }
@@ -1347,7 +1317,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
 		return -ENOENT;
 
 	BUG_ON(victim->d_parent->d_inode != dir);
-	audit_inode_child(victim->d_name.name, victim, dir);
+	audit_inode_child(victim, dir);
 
 	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
 	if (error)
@@ -1388,22 +1358,6 @@ static inline int may_create(struct inode *dir, struct dentry *child)
 	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
 }
 
-/*
- * O_DIRECTORY translates into forcing a directory lookup.
- */
-static inline int lookup_flags(unsigned int f)
-{
-	unsigned long retval = LOOKUP_FOLLOW;
-
-	if (f & O_NOFOLLOW)
-		retval &= ~LOOKUP_FOLLOW;
-
-	if (f & O_DIRECTORY)
-		retval |= LOOKUP_DIRECTORY;
-
-	return retval;
-}
-
 /*
  * p1 and p2 should be directories on the same fs.
  */
@@ -1461,7 +1415,6 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	error = security_inode_create(dir, dentry, mode);
 	if (error)
 		return error;
-	vfs_dq_init(dir);
 	error = dir->i_op->create(dir, dentry, mode, nd);
 	if (!error)
 		fsnotify_create(dir, dentry);
@@ -1503,7 +1456,7 @@ int may_open(struct path *path, int acc_mode, int flag)
 	 * An append-only file must be opened in append mode for writing.
 	 */
 	if (IS_APPEND(inode)) {
-		if ((flag & FMODE_WRITE) && !(flag & O_APPEND))
+		if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
 			return -EPERM;
 		if (flag & O_TRUNC)
 			return -EPERM;
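
The rewritten test means any open that can write — O_WRONLY or O_RDWR — must also carry O_APPEND when the inode is append-only, where the old FMODE_WRITE test could be confused by the flag encoding. The effect is visible from userspace on a file marked append-only (for example via chattr +a); a small demonstration, assuming /tmp/log already has the attribute set:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	int fd = open("/tmp/log", O_WRONLY);		/* expected: EPERM */
	if (fd < 0)
		printf("plain O_WRONLY: %s\n", strerror(errno));

	fd = open("/tmp/log", O_WRONLY | O_APPEND);	/* expected: succeeds */
	if (fd >= 0)
		printf("O_WRONLY|O_APPEND: ok\n");
	return 0;
}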
@@ -1547,7 +1500,7 @@ static int handle_truncate(struct path *path)
  * what get passed to sys_open().
  */
 static int __open_namei_create(struct nameidata *nd, struct path *path,
-				int flag, int mode)
+				int open_flag, int mode)
 {
 	int error;
 	struct dentry *dir = nd->path.dentry;
@@ -1565,7 +1518,7 @@ out_unlock:
 	if (error)
 		return error;
 	/* Don't check for write permission, don't truncate */
-	return may_open(&nd->path, 0, flag & ~O_TRUNC);
+	return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
 }
 
 /*
@@ -1603,129 +1556,133 @@ static int open_will_truncate(int flag, struct inode *inode)
 	return (flag & O_TRUNC);
 }
 
-/*
- * Note that the low bits of the passed in "open_flag"
- * are not the same as in the local variable "flag". See
- * open_to_namei_flags() for more details.
- */
-struct file *do_filp_open(int dfd, const char *pathname,
-		int open_flag, int mode, int acc_mode)
+static struct file *finish_open(struct nameidata *nd,
+				int open_flag, int acc_mode)
 {
 	struct file *filp;
-	struct nameidata nd;
-	int error;
-	struct path path;
-	struct dentry *dir;
-	int count = 0;
 	int will_truncate;
-	int flag = open_to_namei_flags(open_flag);
-	int force_reval = 0;
+	int error;
 
+	will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
+	if (will_truncate) {
+		error = mnt_want_write(nd->path.mnt);
+		if (error)
+			goto exit;
+	}
+	error = may_open(&nd->path, acc_mode, open_flag);
+	if (error) {
+		if (will_truncate)
+			mnt_drop_write(nd->path.mnt);
+		goto exit;
+	}
+	filp = nameidata_to_filp(nd);
+	if (!IS_ERR(filp)) {
+		error = ima_file_check(filp, acc_mode);
+		if (error) {
+			fput(filp);
+			filp = ERR_PTR(error);
+		}
+	}
+	if (!IS_ERR(filp)) {
+		if (will_truncate) {
+			error = handle_truncate(&nd->path);
+			if (error) {
+				fput(filp);
+				filp = ERR_PTR(error);
+			}
+		}
+	}
 	/*
-	 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
-	 * check for O_DSYNC if the need any syncing at all we enforce it's
-	 * always set instead of having to deal with possibly weird behaviour
-	 * for malicious applications setting only __O_SYNC.
+	 * It is now safe to drop the mnt write
+	 * because the filp has had a write taken
+	 * on its behalf.
 	 */
-	if (open_flag & __O_SYNC)
-		open_flag |= O_DSYNC;
-
-	if (!acc_mode)
-		acc_mode = MAY_OPEN | ACC_MODE(open_flag);
+	if (will_truncate)
+		mnt_drop_write(nd->path.mnt);
+	return filp;
 
-	/* O_TRUNC implies we need access checks for write permissions */
-	if (flag & O_TRUNC)
-		acc_mode |= MAY_WRITE;
+exit:
+	if (!IS_ERR(nd->intent.open.file))
+		release_open_intent(nd);
+	path_put(&nd->path);
+	return ERR_PTR(error);
+}
 
-	/* Allow the LSM permission hook to distinguish append
-	   access from general write access. */
-	if (flag & O_APPEND)
-		acc_mode |= MAY_APPEND;
+static struct file *do_last(struct nameidata *nd, struct path *path,
+			    int open_flag, int acc_mode,
+			    int mode, const char *pathname)
+{
+	struct dentry *dir = nd->path.dentry;
+	struct file *filp;
+	int error = -EISDIR;
 
-	/*
-	 * The simplest case - just a plain lookup.
-	 */
-	if (!(flag & O_CREAT)) {
-		filp = get_empty_filp();
-
-		if (filp == NULL)
-			return ERR_PTR(-ENFILE);
-		nd.intent.open.file = filp;
-		filp->f_flags = open_flag;
-		nd.intent.open.flags = flag;
-		nd.intent.open.create_mode = 0;
-		error = do_path_lookup(dfd, pathname,
-					lookup_flags(flag)|LOOKUP_OPEN, &nd);
-		if (IS_ERR(nd.intent.open.file)) {
-			if (error == 0) {
-				error = PTR_ERR(nd.intent.open.file);
-				path_put(&nd.path);
+	switch (nd->last_type) {
+	case LAST_DOTDOT:
+		follow_dotdot(nd);
+		dir = nd->path.dentry;
+		if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
+			if (!dir->d_op->d_revalidate(dir, nd)) {
+				error = -ESTALE;
+				goto exit;
 			}
-		} else if (error)
-			release_open_intent(&nd);
-		if (error)
-			return ERR_PTR(error);
+		}
+		/* fallthrough */
+	case LAST_DOT:
+	case LAST_ROOT:
+		if (open_flag & O_CREAT)
+			goto exit;
+		/* fallthrough */
+	case LAST_BIND:
+		audit_inode(pathname, dir);
 		goto ok;
 	}
 
-	/*
-	 * Create - we need to know the parent.
-	 */
-reval:
-	error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
-	if (error)
-		return ERR_PTR(error);
-	if (force_reval)
-		nd.flags |= LOOKUP_REVAL;
-	error = path_walk(pathname, &nd);
-	if (error) {
-		if (nd.root.mnt)
-			path_put(&nd.root);
-		return ERR_PTR(error);
+	/* trailing slashes? */
+	if (nd->last.name[nd->last.len]) {
+		if (open_flag & O_CREAT)
+			goto exit;
+		nd->flags |= LOOKUP_DIRECTORY;
 	}
-	if (unlikely(!audit_dummy_context()))
-		audit_inode(pathname, nd.path.dentry);
 
-	/*
-	 * We have the parent and last component. First of all, check
-	 * that we are not asked to creat(2) an obvious directory - that
-	 * will not do.
-	 */
-	error = -EISDIR;
-	if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])
-		goto exit_parent;
+	/* just plain open? */
+	if (!(open_flag & O_CREAT)) {
+		error = do_lookup(nd, &nd->last, path);
+		if (error)
+			goto exit;
+		error = -ENOENT;
+		if (!path->dentry->d_inode)
+			goto exit_dput;
+		if (path->dentry->d_inode->i_op->follow_link)
+			return NULL;
+		error = -ENOTDIR;
+		if (nd->flags & LOOKUP_DIRECTORY) {
+			if (!path->dentry->d_inode->i_op->lookup)
+				goto exit_dput;
+		}
+		path_to_nameidata(path, nd);
+		audit_inode(pathname, nd->path.dentry);
+		goto ok;
+	}
 
-	error = -ENFILE;
-	filp = get_empty_filp();
-	if (filp == NULL)
-		goto exit_parent;
-	nd.intent.open.file = filp;
-	filp->f_flags = open_flag;
-	nd.intent.open.flags = flag;
-	nd.intent.open.create_mode = mode;
-	dir = nd.path.dentry;
-	nd.flags &= ~LOOKUP_PARENT;
-	nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
-	if (flag & O_EXCL)
-		nd.flags |= LOOKUP_EXCL;
+	/* OK, it's O_CREAT */
 	mutex_lock(&dir->d_inode->i_mutex);
-	path.dentry = lookup_hash(&nd);
-	path.mnt = nd.path.mnt;
 
-do_last:
-	error = PTR_ERR(path.dentry);
-	if (IS_ERR(path.dentry)) {
+	path->dentry = lookup_hash(nd);
+	path->mnt = nd->path.mnt;
+
+	error = PTR_ERR(path->dentry);
+	if (IS_ERR(path->dentry)) {
 		mutex_unlock(&dir->d_inode->i_mutex);
 		goto exit;
 	}
 
-	if (IS_ERR(nd.intent.open.file)) {
-		error = PTR_ERR(nd.intent.open.file);
+	if (IS_ERR(nd->intent.open.file)) {
+		error = PTR_ERR(nd->intent.open.file);
 		goto exit_mutex_unlock;
 	}
 
 	/* Negative dentry, just create the file */
-	if (!path.dentry->d_inode) {
+	if (!path->dentry->d_inode) {
 		/*
 		 * This write is needed to ensure that a
 		 * ro->rw transition does not occur between
@@ -1733,18 +1690,16 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		 * a permanent write count is taken through
 		 * the 'struct file' in nameidata_to_filp().
 		 */
-		error = mnt_want_write(nd.path.mnt);
+		error = mnt_want_write(nd->path.mnt);
 		if (error)
 			goto exit_mutex_unlock;
-		error = __open_namei_create(&nd, &path, flag, mode);
+		error = __open_namei_create(nd, path, open_flag, mode);
 		if (error) {
-			mnt_drop_write(nd.path.mnt);
+			mnt_drop_write(nd->path.mnt);
 			goto exit;
 		}
-		filp = nameidata_to_filp(&nd);
-		mnt_drop_write(nd.path.mnt);
-		if (nd.root.mnt)
-			path_put(&nd.root);
+		filp = nameidata_to_filp(nd);
+		mnt_drop_write(nd->path.mnt);
 		if (!IS_ERR(filp)) {
 			error = ima_file_check(filp, acc_mode);
 			if (error) {
@@ -1759,150 +1714,182 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	 * It already exists.
 	 */
 	mutex_unlock(&dir->d_inode->i_mutex);
-	audit_inode(pathname, path.dentry);
+	audit_inode(pathname, path->dentry);
 
 	error = -EEXIST;
-	if (flag & O_EXCL)
+	if (open_flag & O_EXCL)
 		goto exit_dput;
 
-	if (__follow_mount(&path)) {
+	if (__follow_mount(path)) {
 		error = -ELOOP;
-		if (flag & O_NOFOLLOW)
+		if (open_flag & O_NOFOLLOW)
 			goto exit_dput;
 	}
 
 	error = -ENOENT;
-	if (!path.dentry->d_inode)
+	if (!path->dentry->d_inode)
 		goto exit_dput;
-	if (path.dentry->d_inode->i_op->follow_link)
-		goto do_link;
 
-	path_to_nameidata(&path, &nd);
+	if (path->dentry->d_inode->i_op->follow_link)
+		return NULL;
+
+	path_to_nameidata(path, nd);
 	error = -EISDIR;
-	if (S_ISDIR(path.dentry->d_inode->i_mode))
+	if (S_ISDIR(path->dentry->d_inode->i_mode))
 		goto exit;
 ok:
+	filp = finish_open(nd, open_flag, acc_mode);
+	return filp;
+
+exit_mutex_unlock:
+	mutex_unlock(&dir->d_inode->i_mutex);
+exit_dput:
+	path_put_conditional(path, nd);
+exit:
+	if (!IS_ERR(nd->intent.open.file))
+		release_open_intent(nd);
+	path_put(&nd->path);
+	return ERR_PTR(error);
+}
+
+/*
+ * Note that the low bits of the passed in "open_flag"
+ * are not the same as in the local variable "flag". See
+ * open_to_namei_flags() for more details.
+ */
+struct file *do_filp_open(int dfd, const char *pathname,
+		int open_flag, int mode, int acc_mode)
+{
+	struct file *filp;
+	struct nameidata nd;
+	int error;
+	struct path path;
+	int count = 0;
+	int flag = open_to_namei_flags(open_flag);
+	int force_reval = 0;
+
+	if (!(open_flag & O_CREAT))
+		mode = 0;
+
 	/*
-	 * Consider:
-	 * 1. may_open() truncates a file
-	 * 2. a rw->ro mount transition occurs
-	 * 3. nameidata_to_filp() fails due to
-	 *    the ro mount.
-	 * That would be inconsistent, and should
-	 * be avoided. Taking this mnt write here
-	 * ensures that (2) can not occur.
+	 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
+	 * check for O_DSYNC if the need any syncing at all we enforce it's
+	 * always set instead of having to deal with possibly weird behaviour
+	 * for malicious applications setting only __O_SYNC.
 	 */
-	will_truncate = open_will_truncate(flag, nd.path.dentry->d_inode);
-	if (will_truncate) {
-		error = mnt_want_write(nd.path.mnt);
-		if (error)
-			goto exit;
-	}
-	error = may_open(&nd.path, acc_mode, flag);
+	if (open_flag & __O_SYNC)
+		open_flag |= O_DSYNC;
+
+	if (!acc_mode)
+		acc_mode = MAY_OPEN | ACC_MODE(open_flag);
+
+	/* O_TRUNC implies we need access checks for write permissions */
+	if (open_flag & O_TRUNC)
+		acc_mode |= MAY_WRITE;
+
+	/* Allow the LSM permission hook to distinguish append
+	   access from general write access. */
+	if (open_flag & O_APPEND)
+		acc_mode |= MAY_APPEND;
+
+	/* find the parent */
+reval:
+	error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
+	if (error)
+		return ERR_PTR(error);
+	if (force_reval)
+		nd.flags |= LOOKUP_REVAL;
+
+	current->total_link_count = 0;
+	error = link_path_walk(pathname, &nd);
 	if (error) {
-		if (will_truncate)
-			mnt_drop_write(nd.path.mnt);
-		goto exit;
-	}
-	filp = nameidata_to_filp(&nd);
-	if (!IS_ERR(filp)) {
-		error = ima_file_check(filp, acc_mode);
-		if (error) {
-			fput(filp);
-			filp = ERR_PTR(error);
-		}
+		filp = ERR_PTR(error);
+		goto out;
 	}
-	if (!IS_ERR(filp)) {
-		if (acc_mode & MAY_WRITE)
-			vfs_dq_init(nd.path.dentry->d_inode);
+	if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT))
+		audit_inode(pathname, nd.path.dentry);
 
-		if (will_truncate) {
-			error = handle_truncate(&nd.path);
-			if (error) {
-				fput(filp);
-				filp = ERR_PTR(error);
-			}
-		}
-	}
 	/*
-	 * It is now safe to drop the mnt write
-	 * because the filp has had a write taken
-	 * on its behalf.
+	 * We have the parent and last component.
 	 */
-	if (will_truncate)
-		mnt_drop_write(nd.path.mnt);
+
+	error = -ENFILE;
+	filp = get_empty_filp();
+	if (filp == NULL)
+		goto exit_parent;
+	nd.intent.open.file = filp;
+	filp->f_flags = open_flag;
+	nd.intent.open.flags = flag;
+	nd.intent.open.create_mode = mode;
+	nd.flags &= ~LOOKUP_PARENT;
+	nd.flags |= LOOKUP_OPEN;
+	if (open_flag & O_CREAT) {
+		nd.flags |= LOOKUP_CREATE;
+		if (open_flag & O_EXCL)
+			nd.flags |= LOOKUP_EXCL;
+	}
+	if (open_flag & O_DIRECTORY)
+		nd.flags |= LOOKUP_DIRECTORY;
+	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
+	while (unlikely(!filp)) { /* trailing symlink */
+		struct path holder;
+		struct inode *inode = path.dentry->d_inode;
+		void *cookie;
+		error = -ELOOP;
+		/* S_ISDIR part is a temporary automount kludge */
+		if ((open_flag & O_NOFOLLOW) && !S_ISDIR(inode->i_mode))
+			goto exit_dput;
+		if (count++ == 32)
+			goto exit_dput;
+		/*
+		 * This is subtle. Instead of calling do_follow_link() we do
+		 * the thing by hands. The reason is that this way we have zero
+		 * link_count and path_walk() (called from ->follow_link)
+		 * honoring LOOKUP_PARENT. After that we have the parent and
+		 * last component, i.e. we are in the same situation as after
+		 * the first path_walk(). Well, almost - if the last component
+		 * is normal we get its copy stored in nd->last.name and we will
+		 * have to putname() it when we are done. Procfs-like symlinks
+		 * just set LAST_BIND.
+		 */
+		nd.flags |= LOOKUP_PARENT;
+		error = security_inode_follow_link(path.dentry, &nd);
+		if (error)
+			goto exit_dput;
+		error = __do_follow_link(&path, &nd, &cookie);
+		if (unlikely(error)) {
+			/* nd.path had been dropped */
+			if (!IS_ERR(cookie) && inode->i_op->put_link)
+				inode->i_op->put_link(path.dentry, &nd, cookie);
+			path_put(&path);
+			release_open_intent(&nd);
+			filp = ERR_PTR(error);
+			goto out;
+		}
+		holder = path;
+		nd.flags &= ~LOOKUP_PARENT;
+		filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
+		if (inode->i_op->put_link)
+			inode->i_op->put_link(holder.dentry, &nd, cookie);
+		path_put(&holder);
+	}
+out:
 	if (nd.root.mnt)
 		path_put(&nd.root);
+	if (filp == ERR_PTR(-ESTALE) && !force_reval) {
+		force_reval = 1;
+		goto reval;
+	}
 	return filp;
 
-exit_mutex_unlock:
-	mutex_unlock(&dir->d_inode->i_mutex);
 exit_dput:
 	path_put_conditional(&path, &nd);
-exit:
 	if (!IS_ERR(nd.intent.open.file))
 		release_open_intent(&nd);
 exit_parent:
-	if (nd.root.mnt)
-		path_put(&nd.root);
 	path_put(&nd.path);
-	return ERR_PTR(error);
-
-do_link:
-	error = -ELOOP;
-	if (flag & O_NOFOLLOW)
-		goto exit_dput;
-	/*
-	 * This is subtle. Instead of calling do_follow_link() we do the
-	 * thing by hands. The reason is that this way we have zero link_count
-	 * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
-	 * After that we have the parent and last component, i.e.
-	 * we are in the same situation as after the first path_walk().
-	 * Well, almost - if the last component is normal we get its copy
-	 * stored in nd->last.name and we will have to putname() it when we
-	 * are done. Procfs-like symlinks just set LAST_BIND.
-	 */
-	nd.flags |= LOOKUP_PARENT;
-	error = security_inode_follow_link(path.dentry, &nd);
-	if (error)
-		goto exit_dput;
-	error = __do_follow_link(&path, &nd);
-	path_put(&path);
-	if (error) {
-		/* Does someone understand code flow here? Or it is only
-		 * me so stupid? Anathema to whoever designed this non-sense
-		 * with "intent.open".
-		 */
-		release_open_intent(&nd);
-		if (nd.root.mnt)
-			path_put(&nd.root);
-		if (error == -ESTALE && !force_reval) {
-			force_reval = 1;
-			goto reval;
-		}
-		return ERR_PTR(error);
-	}
-	nd.flags &= ~LOOKUP_PARENT;
-	if (nd.last_type == LAST_BIND)
-		goto ok;
-	error = -EISDIR;
-	if (nd.last_type != LAST_NORM)
-		goto exit;
-	if (nd.last.name[nd.last.len]) {
-		__putname(nd.last.name);
-		goto exit;
-	}
-	error = -ELOOP;
-	if (count++==32) {
-		__putname(nd.last.name);
-		goto exit;
-	}
-	dir = nd.path.dentry;
-	mutex_lock(&dir->d_inode->i_mutex);
-	path.dentry = lookup_hash(&nd);
-	path.mnt = nd.path.mnt;
-	__putname(nd.last.name);
-	goto do_last;
+	filp = ERR_PTR(error);
+	goto out;
 }
 
 /**
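
In the rewritten do_filp_open(), do_last() returns NULL to signal a trailing symlink and the while loop above walks it by hand, giving up with -ELOOP after 32 iterations. From userspace the cap surfaces as ELOOP once the final component chains through too many links; a rough demonstration (the exact cut-off is kernel-internal, so the 40 links here are merely "more than enough"):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char prev[64] = "link0", cur[64];
	int i;

	close(open("link0", O_CREAT | O_WRONLY, 0644));	/* chain target */
	for (i = 1; i <= 40; i++) {
		snprintf(cur, sizeof(cur), "link%d", i);
		unlink(cur);
		symlink(prev, cur);	/* linkN -> linkN-1 */
		strcpy(prev, cur);
	}
	if (open("link40", O_RDONLY) < 0)
		printf("open: %s\n", strerror(errno));	/* ELOOP expected */
	return 0;
}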
@@ -1996,7 +1983,6 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 	if (error)
 		return error;
 
-	vfs_dq_init(dir);
 	error = dir->i_op->mknod(dir, dentry, mode, dev);
 	if (!error)
 		fsnotify_create(dir, dentry);
@@ -2095,7 +2081,6 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (error)
 		return error;
 
-	vfs_dq_init(dir);
 	error = dir->i_op->mkdir(dir, dentry, mode);
 	if (!error)
 		fsnotify_mkdir(dir, dentry);
@@ -2181,8 +2166,6 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!dir->i_op->rmdir)
 		return -EPERM;
 
-	vfs_dq_init(dir);
-
 	mutex_lock(&dentry->d_inode->i_mutex);
 	dentry_unhash(dentry);
 	if (d_mountpoint(dentry))
@@ -2268,15 +2251,16 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 	if (!dir->i_op->unlink)
 		return -EPERM;
 
-	vfs_dq_init(dir);
-
 	mutex_lock(&dentry->d_inode->i_mutex);
 	if (d_mountpoint(dentry))
 		error = -EBUSY;
 	else {
 		error = security_inode_unlink(dir, dentry);
-		if (!error)
+		if (!error) {
 			error = dir->i_op->unlink(dir, dentry);
+			if (!error)
+				dentry->d_inode->i_flags |= S_DEAD;
+		}
 	}
 	mutex_unlock(&dentry->d_inode->i_mutex);
 
@@ -2379,7 +2363,6 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
 	if (error)
 		return error;
 
-	vfs_dq_init(dir);
 	error = dir->i_op->symlink(dir, dentry, oldname);
 	if (!error)
 		fsnotify_create(dir, dentry);
@@ -2463,7 +2446,6 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 		return error;
 
 	mutex_lock(&inode->i_mutex);
-	vfs_dq_init(dir);
 	error = dir->i_op->link(old_dentry, dir, new_dentry);
 	mutex_unlock(&inode->i_mutex);
 	if (!error)
@@ -2564,7 +2546,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
  * e) conversion from fhandle to dentry may come in the wrong moment - when
  *    we are removing the target. Solution: we will have to grab ->i_mutex
  *    in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
- *    ->i_mutex on parents, which works but leads to some truely excessive
+ *    ->i_mutex on parents, which works but leads to some truly excessive
  *    locking].
  */
 static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
@@ -2629,6 +2611,8 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
 	else
 		error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
 	if (!error) {
+		if (target)
+			target->i_flags |= S_DEAD;
 		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
 			d_move(old_dentry, new_dentry);
 	}
@@ -2662,20 +2646,15 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (!old_dir->i_op->rename)
 		return -EPERM;
 
-	vfs_dq_init(old_dir);
-	vfs_dq_init(new_dir);
-
 	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
 
 	if (is_dir)
 		error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
 	else
 		error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
-	if (!error) {
-		const char *new_name = old_dentry->d_name.name;
-		fsnotify_move(old_dir, new_dir, old_name, new_name, is_dir,
+	if (!error)
+		fsnotify_move(old_dir, new_dir, old_name, is_dir,
 			      new_dentry->d_inode, old_dentry);
-	}
 	fsnotify_oldname_free(old_name);
 
 	return error;
diff --git a/fs/namespace.c b/fs/namespace.c
index c768f733c8d6..8174c8ab5c70 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -573,7 +573,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 		mnt->mnt_master = old;
 		CLEAR_MNT_SHARED(mnt);
 	} else if (!(flag & CL_PRIVATE)) {
-		if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
+		if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
 			list_add(&mnt->mnt_share, &old->mnt_share);
 		if (IS_MNT_SLAVE(old))
 			list_add(&mnt->mnt_slave, &old->mnt_slave);
@@ -737,6 +737,21 @@ static void m_stop(struct seq_file *m, void *v)
 	up_read(&namespace_sem);
 }
 
+int mnt_had_events(struct proc_mounts *p)
+{
+	struct mnt_namespace *ns = p->ns;
+	int res = 0;
+
+	spin_lock(&vfsmount_lock);
+	if (p->event != ns->event) {
+		p->event = ns->event;
+		res = 1;
+	}
+	spin_unlock(&vfsmount_lock);
+
+	return res;
+}
+
 struct proc_fs_info {
 	int flag;
 	const char *str;
@@ -1121,8 +1136,15 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
1121{ 1136{
1122 struct path path; 1137 struct path path;
1123 int retval; 1138 int retval;
1139 int lookup_flags = 0;
1124 1140
1125 retval = user_path(name, &path); 1141 if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
1142 return -EINVAL;
1143
1144 if (!(flags & UMOUNT_NOFOLLOW))
1145 lookup_flags |= LOOKUP_FOLLOW;
1146
1147 retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
1126 if (retval) 1148 if (retval)
1127 goto out; 1149 goto out;
1128 retval = -EINVAL; 1150 retval = -EINVAL;
@@ -1246,6 +1268,21 @@ void drop_collected_mounts(struct vfsmount *mnt)
1246 release_mounts(&umount_list); 1268 release_mounts(&umount_list);
1247} 1269}
1248 1270
1271int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
1272 struct vfsmount *root)
1273{
1274 struct vfsmount *mnt;
1275 int res = f(root, arg);
1276 if (res)
1277 return res;
1278 list_for_each_entry(mnt, &root->mnt_list, mnt_list) {
1279 res = f(mnt, arg);
1280 if (res)
1281 return res;
1282 }
1283 return 0;
1284}
1285
1249static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) 1286static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end)
1250{ 1287{
1251 struct vfsmount *p; 1288 struct vfsmount *p;
@@ -1538,7 +1575,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1538 err = do_remount_sb(sb, flags, data, 0); 1575 err = do_remount_sb(sb, flags, data, 0);
1539 if (!err) { 1576 if (!err) {
1540 spin_lock(&vfsmount_lock); 1577 spin_lock(&vfsmount_lock);
1541 mnt_flags |= path->mnt->mnt_flags & MNT_PNODE_MASK; 1578 mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
1542 path->mnt->mnt_flags = mnt_flags; 1579 path->mnt->mnt_flags = mnt_flags;
1543 spin_unlock(&vfsmount_lock); 1580 spin_unlock(&vfsmount_lock);
1544 } 1581 }
@@ -1671,7 +1708,7 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1671{ 1708{
1672 int err; 1709 int err;
1673 1710
1674 mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD); 1711 mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
1675 1712
1676 down_write(&namespace_sem); 1713 down_write(&namespace_sem);
1677 /* Something was mounted here while we slept */ 1714 /* Something was mounted here while we slept */
@@ -2314,17 +2351,13 @@ void __init mnt_init(void)
2314 2351
2315void put_mnt_ns(struct mnt_namespace *ns) 2352void put_mnt_ns(struct mnt_namespace *ns)
2316{ 2353{
2317 struct vfsmount *root;
2318 LIST_HEAD(umount_list); 2354 LIST_HEAD(umount_list);
2319 2355
2320 if (!atomic_dec_and_lock(&ns->count, &vfsmount_lock)) 2356 if (!atomic_dec_and_test(&ns->count))
2321 return; 2357 return;
2322 root = ns->root;
2323 ns->root = NULL;
2324 spin_unlock(&vfsmount_lock);
2325 down_write(&namespace_sem); 2358 down_write(&namespace_sem);
2326 spin_lock(&vfsmount_lock); 2359 spin_lock(&vfsmount_lock);
2327 umount_tree(root, 0, &umount_list); 2360 umount_tree(ns->root, 0, &umount_list);
2328 spin_unlock(&vfsmount_lock); 2361 spin_unlock(&vfsmount_lock);
2329 up_write(&namespace_sem); 2362 up_write(&namespace_sem);
2330 release_mounts(&umount_list); 2363 release_mounts(&umount_list);
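The umount hunk above makes the syscall reject undefined flag bits outright and adds UMOUNT_NOFOLLOW so the mount-point lookup can refuse to follow a final symlink. A minimal userspace sketch of how a caller might exercise this, assuming the 0x00000008 flag value from the kernel headers of this series and an arbitrary example mount point:

/* Sketch: unmount without following a (possibly attacker-controlled)
 * symlink at the mount point. Needs CAP_SYS_ADMIN to succeed. */
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <sys/mount.h>

#ifndef UMOUNT_NOFOLLOW
#define UMOUNT_NOFOLLOW 0x00000008	/* value introduced by this series */
#endif

int main(void)
{
	/* "/mnt/usb" is an arbitrary example path. */
	if (umount2("/mnt/usb", UMOUNT_NOFOLLOW | MNT_DETACH) < 0)
		fprintf(stderr, "umount2: %s\n", strerror(errno));

	/* An undefined flag bit now fails fast with EINVAL instead of
	 * being silently ignored. */
	if (umount2("/mnt/usb", 0x40000000) < 0 && errno == EINVAL)
		fprintf(stderr, "bogus flags rejected with EINVAL\n");
	return 0;
}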
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 73ab220354df..36dfdae95123 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -118,7 +118,6 @@ nfs4_callback_up(struct svc_serv *serv)
118 dprintk("NFS: Callback listener port = %u (af %u)\n", 118 dprintk("NFS: Callback listener port = %u (af %u)\n",
119 nfs_callback_tcpport, PF_INET); 119 nfs_callback_tcpport, PF_INET);
120 120
121#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
122 ret = svc_create_xprt(serv, "tcp", PF_INET6, 121 ret = svc_create_xprt(serv, "tcp", PF_INET6,
123 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); 122 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
124 if (ret > 0) { 123 if (ret > 0) {
@@ -129,7 +128,6 @@ nfs4_callback_up(struct svc_serv *serv)
129 ret = 0; 128 ret = 0;
130 else 129 else
131 goto out_err; 130 goto out_err;
132#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
133 131
134 return svc_prepare_thread(serv, &serv->sv_pools[0]); 132 return svc_prepare_thread(serv, &serv->sv_pools[0]);
135 133
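With the CONFIG_IPV6 guards removed above, the callback listener always attempts a PF_INET6 transport and copes at runtime with kernels that lack IPv6. A standalone sketch of the same try-IPv6-then-fall-back pattern in plain socket code, illustrative only:

/* Sketch of the "try IPv6, tolerate its absence" pattern the callback
 * hunk above now relies on at runtime instead of compile time. */
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(PF_INET6, SOCK_STREAM, 0);

	if (fd < 0) {
		if (errno != EAFNOSUPPORT) {
			perror("socket");
			return 1;
		}
		/* No IPv6 support: fall back, don't fail hard. */
		fd = socket(PF_INET, SOCK_STREAM, 0);
		if (fd < 0) { perror("socket"); return 1; }
		puts("IPv6 unavailable, using IPv4 listener");
	} else {
		puts("IPv6 listener created");
	}
	close(fd);
	return 0;
}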
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index d4036be0b589..85a7cfd1b8dd 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -119,6 +119,14 @@ struct cb_recallanyargs {
119}; 119};
120 120
121extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy); 121extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy);
122
123struct cb_recallslotargs {
124 struct sockaddr *crsa_addr;
125 uint32_t crsa_target_max_slots;
126};
127extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args,
128 void *dummy);
129
122#endif /* CONFIG_NFS_V4_1 */ 130#endif /* CONFIG_NFS_V4_1 */
123 131
124extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); 132extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index defa9b4c470e..84761b5bb8e2 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -143,44 +143,49 @@ int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const n
143 * Return success if the sequenceID is one more than what we last saw on 143 * Return success if the sequenceID is one more than what we last saw on
144 * this slot, accounting for wraparound. Increments the slot's sequence. 144 * this slot, accounting for wraparound. Increments the slot's sequence.
145 * 145 *
146 * We don't yet implement a duplicate request cache, so at this time 146 * We don't yet implement a duplicate request cache; instead we set the
147 * we will log replays, and process them as if we had not seen them before, 147 * back channel ca_maxresponsesize_cached to zero. This is OK for now
148 * but we don't bump the sequence in the slot. Not too worried about it,
149 * since we only currently implement idempotent callbacks anyway. 148 * since we only currently implement idempotent callbacks anyway.
150 * 149 *
151 * We have a single slot backchannel at this time, so we don't bother 150 * We have a single slot backchannel at this time, so we don't bother
152 * checking the used_slots bit array on the table. The lower layer guarantees 151 * checking the used_slots bit array on the table. The lower layer guarantees
153 * a single outstanding callback request at a time. 152 * a single outstanding callback request at a time.
154 */ 153 */
155static int 154static __be32
156validate_seqid(struct nfs4_slot_table *tbl, u32 slotid, u32 seqid) 155validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
157{ 156{
158 struct nfs4_slot *slot; 157 struct nfs4_slot *slot;
159 158
160 dprintk("%s enter. slotid %d seqid %d\n", 159 dprintk("%s enter. slotid %d seqid %d\n",
161 __func__, slotid, seqid); 160 __func__, args->csa_slotid, args->csa_sequenceid);
162 161
163 if (slotid > NFS41_BC_MAX_CALLBACKS) 162 if (args->csa_slotid > NFS41_BC_MAX_CALLBACKS)
164 return htonl(NFS4ERR_BADSLOT); 163 return htonl(NFS4ERR_BADSLOT);
165 164
166 slot = tbl->slots + slotid; 165 slot = tbl->slots + args->csa_slotid;
167 dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr); 166 dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr);
168 167
169 /* Normal */ 168 /* Normal */
170 if (likely(seqid == slot->seq_nr + 1)) { 169 if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
171 slot->seq_nr++; 170 slot->seq_nr++;
172 return htonl(NFS4_OK); 171 return htonl(NFS4_OK);
173 } 172 }
174 173
175 /* Replay */ 174 /* Replay */
176 if (seqid == slot->seq_nr) { 175 if (args->csa_sequenceid == slot->seq_nr) {
177 dprintk("%s seqid %d is a replay - no DRC available\n", 176 dprintk("%s seqid %d is a replay\n",
178 __func__, seqid); 177 __func__, args->csa_sequenceid);
179 return htonl(NFS4_OK); 178 /* Signal process_op to set this error on next op */
179 if (args->csa_cachethis == 0)
180 return htonl(NFS4ERR_RETRY_UNCACHED_REP);
181
182 /* The ca_maxresponsesize_cached is 0 with no DRC */
183 else if (args->csa_cachethis == 1)
184 return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
180 } 185 }
181 186
182 /* Wraparound */ 187 /* Wraparound */
183 if (seqid == 1 && (slot->seq_nr + 1) == 0) { 188 if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) {
184 slot->seq_nr = 1; 189 slot->seq_nr = 1;
185 return htonl(NFS4_OK); 190 return htonl(NFS4_OK);
186 } 191 }
@@ -225,27 +230,87 @@ validate_seqid(struct nfs4_slot_table *tbl, u32 slotid, u32 seqid)
225 return NULL; 230 return NULL;
226} 231}
227 232
228/* FIXME: referring calls should be processed */ 233/*
229unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, 234 * For each referring call triple, check the session's slot table for
235 * a match. If the slot is in use and the sequence numbers match, the
236 * client is still waiting for a response to the original request.
237 */
238static bool referring_call_exists(struct nfs_client *clp,
239 uint32_t nrclists,
240 struct referring_call_list *rclists)
241{
242 bool status = 0;
243 int i, j;
244 struct nfs4_session *session;
245 struct nfs4_slot_table *tbl;
246 struct referring_call_list *rclist;
247 struct referring_call *ref;
248
249 /*
250 * XXX When client trunking is implemented, this becomes
251 * a session lookup from within the loop
252 */
253 session = clp->cl_session;
254 tbl = &session->fc_slot_table;
255
256 for (i = 0; i < nrclists; i++) {
257 rclist = &rclists[i];
258 if (memcmp(session->sess_id.data,
259 rclist->rcl_sessionid.data,
260 NFS4_MAX_SESSIONID_LEN) != 0)
261 continue;
262
263 for (j = 0; j < rclist->rcl_nrefcalls; j++) {
264 ref = &rclist->rcl_refcalls[j];
265
266 dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u "
267 "slotid %u\n", __func__,
268 ((u32 *)&rclist->rcl_sessionid.data)[0],
269 ((u32 *)&rclist->rcl_sessionid.data)[1],
270 ((u32 *)&rclist->rcl_sessionid.data)[2],
271 ((u32 *)&rclist->rcl_sessionid.data)[3],
272 ref->rc_sequenceid, ref->rc_slotid);
273
274 spin_lock(&tbl->slot_tbl_lock);
275 status = (test_bit(ref->rc_slotid, tbl->used_slots) &&
276 tbl->slots[ref->rc_slotid].seq_nr ==
277 ref->rc_sequenceid);
278 spin_unlock(&tbl->slot_tbl_lock);
279 if (status)
280 goto out;
281 }
282 }
283
284out:
285 return status;
286}
287
288__be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
230 struct cb_sequenceres *res) 289 struct cb_sequenceres *res)
231{ 290{
232 struct nfs_client *clp; 291 struct nfs_client *clp;
233 int i, status; 292 int i;
234 293 __be32 status;
235 for (i = 0; i < args->csa_nrclists; i++)
236 kfree(args->csa_rclists[i].rcl_refcalls);
237 kfree(args->csa_rclists);
238 294
239 status = htonl(NFS4ERR_BADSESSION); 295 status = htonl(NFS4ERR_BADSESSION);
240 clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid); 296 clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid);
241 if (clp == NULL) 297 if (clp == NULL)
242 goto out; 298 goto out;
243 299
244 status = validate_seqid(&clp->cl_session->bc_slot_table, 300 status = validate_seqid(&clp->cl_session->bc_slot_table, args);
245 args->csa_slotid, args->csa_sequenceid);
246 if (status) 301 if (status)
247 goto out_putclient; 302 goto out_putclient;
248 303
304 /*
305 * Check for pending referring calls. If a match is found, a
306 * related callback was received before the response to the original
307 * call.
308 */
309 if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
310 status = htonl(NFS4ERR_DELAY);
311 goto out_putclient;
312 }
313
249 memcpy(&res->csr_sessionid, &args->csa_sessionid, 314 memcpy(&res->csr_sessionid, &args->csa_sessionid,
250 sizeof(res->csr_sessionid)); 315 sizeof(res->csr_sessionid));
251 res->csr_sequenceid = args->csa_sequenceid; 316 res->csr_sequenceid = args->csa_sequenceid;
@@ -256,15 +321,23 @@ unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
256out_putclient: 321out_putclient:
257 nfs_put_client(clp); 322 nfs_put_client(clp);
258out: 323out:
259 dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); 324 for (i = 0; i < args->csa_nrclists; i++)
260 res->csr_status = status; 325 kfree(args->csa_rclists[i].rcl_refcalls);
261 return res->csr_status; 326 kfree(args->csa_rclists);
327
328 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP))
329 res->csr_status = 0;
330 else
331 res->csr_status = status;
332 dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
333 ntohl(status), ntohl(res->csr_status));
334 return status;
262} 335}
263 336
264unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) 337__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy)
265{ 338{
266 struct nfs_client *clp; 339 struct nfs_client *clp;
267 int status; 340 __be32 status;
268 fmode_t flags = 0; 341 fmode_t flags = 0;
269 342
270 status = htonl(NFS4ERR_OP_NOT_IN_SESSION); 343 status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
@@ -289,4 +362,40 @@ out:
289 dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); 362 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
290 return status; 363 return status;
291} 364}
365
366/* Reduce the fore channel's max_slots to the target value */
367__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy)
368{
369 struct nfs_client *clp;
370 struct nfs4_slot_table *fc_tbl;
371 __be32 status;
372
373 status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
374 clp = nfs_find_client(args->crsa_addr, 4);
375 if (clp == NULL)
376 goto out;
377
378 dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",
379 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
380 args->crsa_target_max_slots);
381
382 fc_tbl = &clp->cl_session->fc_slot_table;
383
384 status = htonl(NFS4ERR_BAD_HIGH_SLOT);
385 if (args->crsa_target_max_slots > fc_tbl->max_slots ||
386 args->crsa_target_max_slots < 1)
387 goto out_putclient;
388
389 status = htonl(NFS4_OK);
390 if (args->crsa_target_max_slots == fc_tbl->max_slots)
391 goto out_putclient;
392
393 fc_tbl->target_max_slots = args->crsa_target_max_slots;
394 nfs41_handle_recall_slot(clp);
395out_putclient:
396 nfs_put_client(clp); /* balance nfs_find_client */
397out:
398 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
399 return status;
400}
292#endif /* CONFIG_NFS_V4_1 */ 401#endif /* CONFIG_NFS_V4_1 */
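The reworked validate_seqid() above distinguishes three cases on the backchannel slot: the normal next sequence number, a replay of the last one, and 32-bit wraparound. A self-contained sketch of just that decision logic, with the slot table and locking stripped away:

/* Standalone sketch of the three-way sequence-ID check: normal
 * increment, replay (slot not bumped), and 32-bit wraparound. */
#include <stdio.h>
#include <stdint.h>

enum seq_result { SEQ_OK, SEQ_REPLAY, SEQ_MISORDERED };

static enum seq_result check_seqid(uint32_t *slot_seq_nr, uint32_t seqid)
{
	if (seqid == *slot_seq_nr + 1) {	/* normal: one past last seen */
		(*slot_seq_nr)++;
		return SEQ_OK;
	}
	if (seqid == *slot_seq_nr)		/* replay of the last request */
		return SEQ_REPLAY;
	if (seqid == 1 && *slot_seq_nr + 1 == 0) {	/* 32-bit wraparound */
		*slot_seq_nr = 1;
		return SEQ_OK;
	}
	return SEQ_MISORDERED;
}

int main(void)
{
	uint32_t seq = 5;
	enum seq_result r;

	r = check_seqid(&seq, 6);	/* SEQ_OK, slot advances to 6 */
	printf("normal:     %d (slot %u)\n", r, (unsigned)seq);
	r = check_seqid(&seq, 6);	/* SEQ_REPLAY, slot stays at 6 */
	printf("replay:     %d (slot %u)\n", r, (unsigned)seq);
	seq = UINT32_MAX;		/* slot about to wrap */
	r = check_seqid(&seq, 1);	/* SEQ_OK again, slot resets to 1 */
	printf("wraparound: %d (slot %u)\n", r, (unsigned)seq);
	return 0;
}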
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 8e1a2511c8be..a2b8b4df125d 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -24,10 +24,14 @@
24#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ 24#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
25 4 + 1 + 3) 25 4 + 1 + 3)
26#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) 26#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
27#define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
27#endif /* CONFIG_NFS_V4_1 */ 28#endif /* CONFIG_NFS_V4_1 */
28 29
29#define NFSDBG_FACILITY NFSDBG_CALLBACK 30#define NFSDBG_FACILITY NFSDBG_CALLBACK
30 31
32/* Internal error code */
33#define NFS4ERR_RESOURCE_HDR 11050
34
31typedef __be32 (*callback_process_op_t)(void *, void *); 35typedef __be32 (*callback_process_op_t)(void *, void *);
32typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); 36typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
33typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); 37typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
@@ -173,7 +177,7 @@ static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
173 __be32 *p; 177 __be32 *p;
174 p = read_buf(xdr, 4); 178 p = read_buf(xdr, 4);
175 if (unlikely(p == NULL)) 179 if (unlikely(p == NULL))
176 return htonl(NFS4ERR_RESOURCE); 180 return htonl(NFS4ERR_RESOURCE_HDR);
177 *op = ntohl(*p); 181 *op = ntohl(*p);
178 return 0; 182 return 0;
179} 183}
@@ -215,10 +219,10 @@ out:
215 219
216#if defined(CONFIG_NFS_V4_1) 220#if defined(CONFIG_NFS_V4_1)
217 221
218static unsigned decode_sessionid(struct xdr_stream *xdr, 222static __be32 decode_sessionid(struct xdr_stream *xdr,
219 struct nfs4_sessionid *sid) 223 struct nfs4_sessionid *sid)
220{ 224{
221 uint32_t *p; 225 __be32 *p;
222 int len = NFS4_MAX_SESSIONID_LEN; 226 int len = NFS4_MAX_SESSIONID_LEN;
223 227
224 p = read_buf(xdr, len); 228 p = read_buf(xdr, len);
@@ -229,12 +233,12 @@ static unsigned decode_sessionid(struct xdr_stream *xdr,
229 return 0; 233 return 0;
230} 234}
231 235
232static unsigned decode_rc_list(struct xdr_stream *xdr, 236static __be32 decode_rc_list(struct xdr_stream *xdr,
233 struct referring_call_list *rc_list) 237 struct referring_call_list *rc_list)
234{ 238{
235 uint32_t *p; 239 __be32 *p;
236 int i; 240 int i;
237 unsigned status; 241 __be32 status;
238 242
239 status = decode_sessionid(xdr, &rc_list->rcl_sessionid); 243 status = decode_sessionid(xdr, &rc_list->rcl_sessionid);
240 if (status) 244 if (status)
@@ -267,13 +271,13 @@ out:
267 return status; 271 return status;
268} 272}
269 273
270static unsigned decode_cb_sequence_args(struct svc_rqst *rqstp, 274static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
271 struct xdr_stream *xdr, 275 struct xdr_stream *xdr,
272 struct cb_sequenceargs *args) 276 struct cb_sequenceargs *args)
273{ 277{
274 uint32_t *p; 278 __be32 *p;
275 int i; 279 int i;
276 unsigned status; 280 __be32 status;
277 281
278 status = decode_sessionid(xdr, &args->csa_sessionid); 282 status = decode_sessionid(xdr, &args->csa_sessionid);
279 if (status) 283 if (status)
@@ -327,11 +331,11 @@ out_free:
327 goto out; 331 goto out;
328} 332}
329 333
330static unsigned decode_recallany_args(struct svc_rqst *rqstp, 334static __be32 decode_recallany_args(struct svc_rqst *rqstp,
331 struct xdr_stream *xdr, 335 struct xdr_stream *xdr,
332 struct cb_recallanyargs *args) 336 struct cb_recallanyargs *args)
333{ 337{
334 uint32_t *p; 338 __be32 *p;
335 339
336 args->craa_addr = svc_addr(rqstp); 340 args->craa_addr = svc_addr(rqstp);
337 p = read_buf(xdr, 4); 341 p = read_buf(xdr, 4);
@@ -346,6 +350,20 @@ static unsigned decode_recallany_args(struct svc_rqst *rqstp,
346 return 0; 350 return 0;
347} 351}
348 352
353static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
354 struct xdr_stream *xdr,
355 struct cb_recallslotargs *args)
356{
357 __be32 *p;
358
359 args->crsa_addr = svc_addr(rqstp);
360 p = read_buf(xdr, 4);
361 if (unlikely(p == NULL))
362 return htonl(NFS4ERR_BADXDR);
363 args->crsa_target_max_slots = ntohl(*p++);
364 return 0;
365}
366
349#endif /* CONFIG_NFS_V4_1 */ 367#endif /* CONFIG_NFS_V4_1 */
350 368
351static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) 369static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
@@ -465,7 +483,7 @@ static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res)
465 483
466 p = xdr_reserve_space(xdr, 8); 484 p = xdr_reserve_space(xdr, 8);
467 if (unlikely(p == NULL)) 485 if (unlikely(p == NULL))
468 return htonl(NFS4ERR_RESOURCE); 486 return htonl(NFS4ERR_RESOURCE_HDR);
469 *p++ = htonl(op); 487 *p++ = htonl(op);
470 *p = res; 488 *p = res;
471 return 0; 489 return 0;
@@ -499,10 +517,10 @@ out:
499 517
500#if defined(CONFIG_NFS_V4_1) 518#if defined(CONFIG_NFS_V4_1)
501 519
502static unsigned encode_sessionid(struct xdr_stream *xdr, 520static __be32 encode_sessionid(struct xdr_stream *xdr,
503 const struct nfs4_sessionid *sid) 521 const struct nfs4_sessionid *sid)
504{ 522{
505 uint32_t *p; 523 __be32 *p;
506 int len = NFS4_MAX_SESSIONID_LEN; 524 int len = NFS4_MAX_SESSIONID_LEN;
507 525
508 p = xdr_reserve_space(xdr, len); 526 p = xdr_reserve_space(xdr, len);
@@ -513,11 +531,11 @@ static unsigned encode_sessionid(struct xdr_stream *xdr,
513 return 0; 531 return 0;
514} 532}
515 533
516static unsigned encode_cb_sequence_res(struct svc_rqst *rqstp, 534static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp,
517 struct xdr_stream *xdr, 535 struct xdr_stream *xdr,
518 const struct cb_sequenceres *res) 536 const struct cb_sequenceres *res)
519{ 537{
520 uint32_t *p; 538 __be32 *p;
521 unsigned status = res->csr_status; 539 unsigned status = res->csr_status;
522 540
523 if (unlikely(status != 0)) 541 if (unlikely(status != 0))
@@ -554,6 +572,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
554 case OP_CB_RECALL: 572 case OP_CB_RECALL:
555 case OP_CB_SEQUENCE: 573 case OP_CB_SEQUENCE:
556 case OP_CB_RECALL_ANY: 574 case OP_CB_RECALL_ANY:
575 case OP_CB_RECALL_SLOT:
557 *op = &callback_ops[op_nr]; 576 *op = &callback_ops[op_nr];
558 break; 577 break;
559 578
@@ -562,7 +581,6 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
562 case OP_CB_NOTIFY: 581 case OP_CB_NOTIFY:
563 case OP_CB_PUSH_DELEG: 582 case OP_CB_PUSH_DELEG:
564 case OP_CB_RECALLABLE_OBJ_AVAIL: 583 case OP_CB_RECALLABLE_OBJ_AVAIL:
565 case OP_CB_RECALL_SLOT:
566 case OP_CB_WANTS_CANCELLED: 584 case OP_CB_WANTS_CANCELLED:
567 case OP_CB_NOTIFY_LOCK: 585 case OP_CB_NOTIFY_LOCK:
568 return htonl(NFS4ERR_NOTSUPP); 586 return htonl(NFS4ERR_NOTSUPP);
@@ -602,20 +620,18 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
602static __be32 process_op(uint32_t minorversion, int nop, 620static __be32 process_op(uint32_t minorversion, int nop,
603 struct svc_rqst *rqstp, 621 struct svc_rqst *rqstp,
604 struct xdr_stream *xdr_in, void *argp, 622 struct xdr_stream *xdr_in, void *argp,
605 struct xdr_stream *xdr_out, void *resp) 623 struct xdr_stream *xdr_out, void *resp, int* drc_status)
606{ 624{
607 struct callback_op *op = &callback_ops[0]; 625 struct callback_op *op = &callback_ops[0];
608 unsigned int op_nr = OP_CB_ILLEGAL; 626 unsigned int op_nr;
609 __be32 status; 627 __be32 status;
610 long maxlen; 628 long maxlen;
611 __be32 res; 629 __be32 res;
612 630
613 dprintk("%s: start\n", __func__); 631 dprintk("%s: start\n", __func__);
614 status = decode_op_hdr(xdr_in, &op_nr); 632 status = decode_op_hdr(xdr_in, &op_nr);
615 if (unlikely(status)) { 633 if (unlikely(status))
616 status = htonl(NFS4ERR_OP_ILLEGAL); 634 return status;
617 goto out;
618 }
619 635
620 dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", 636 dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
621 __func__, minorversion, nop, op_nr); 637 __func__, minorversion, nop, op_nr);
@@ -624,19 +640,32 @@ static __be32 process_op(uint32_t minorversion, int nop,
624 preprocess_nfs4_op(op_nr, &op); 640 preprocess_nfs4_op(op_nr, &op);
625 if (status == htonl(NFS4ERR_OP_ILLEGAL)) 641 if (status == htonl(NFS4ERR_OP_ILLEGAL))
626 op_nr = OP_CB_ILLEGAL; 642 op_nr = OP_CB_ILLEGAL;
627out: 643 if (status)
644 goto encode_hdr;
645
646 if (*drc_status) {
647 status = *drc_status;
648 goto encode_hdr;
649 }
650
628 maxlen = xdr_out->end - xdr_out->p; 651 maxlen = xdr_out->end - xdr_out->p;
629 if (maxlen > 0 && maxlen < PAGE_SIZE) { 652 if (maxlen > 0 && maxlen < PAGE_SIZE) {
630 if (likely(status == 0 && op->decode_args != NULL)) 653 status = op->decode_args(rqstp, xdr_in, argp);
631 status = op->decode_args(rqstp, xdr_in, argp); 654 if (likely(status == 0))
632 if (likely(status == 0 && op->process_op != NULL))
633 status = op->process_op(argp, resp); 655 status = op->process_op(argp, resp);
634 } else 656 } else
635 status = htonl(NFS4ERR_RESOURCE); 657 status = htonl(NFS4ERR_RESOURCE);
636 658
659 /* Only set by OP_CB_SEQUENCE processing */
660 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
661 *drc_status = status;
662 status = 0;
663 }
664
665encode_hdr:
637 res = encode_op_hdr(xdr_out, op_nr, status); 666 res = encode_op_hdr(xdr_out, op_nr, status);
638 if (status == 0) 667 if (unlikely(res))
639 status = res; 668 return res;
640 if (op->encode_res != NULL && status == 0) 669 if (op->encode_res != NULL && status == 0)
641 status = op->encode_res(rqstp, xdr_out, resp); 670 status = op->encode_res(rqstp, xdr_out, resp);
642 dprintk("%s: done, status = %d\n", __func__, ntohl(status)); 671 dprintk("%s: done, status = %d\n", __func__, ntohl(status));
@@ -652,7 +681,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
652 struct cb_compound_hdr_res hdr_res = { NULL }; 681 struct cb_compound_hdr_res hdr_res = { NULL };
653 struct xdr_stream xdr_in, xdr_out; 682 struct xdr_stream xdr_in, xdr_out;
654 __be32 *p; 683 __be32 *p;
655 __be32 status; 684 __be32 status, drc_status = 0;
656 unsigned int nops = 0; 685 unsigned int nops = 0;
657 686
658 dprintk("%s: start\n", __func__); 687 dprintk("%s: start\n", __func__);
@@ -672,11 +701,18 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
672 return rpc_system_err; 701 return rpc_system_err;
673 702
674 while (status == 0 && nops != hdr_arg.nops) { 703 while (status == 0 && nops != hdr_arg.nops) {
675 status = process_op(hdr_arg.minorversion, nops, 704 status = process_op(hdr_arg.minorversion, nops, rqstp,
676 rqstp, &xdr_in, argp, &xdr_out, resp); 705 &xdr_in, argp, &xdr_out, resp, &drc_status);
677 nops++; 706 nops++;
678 } 707 }
679 708
 709	/* Buffer overflow in decode_op_hdr or encode_op_hdr. Return
 710	 * resource error in cb_compound status without returning the op */
711 if (unlikely(status == htonl(NFS4ERR_RESOURCE_HDR))) {
712 status = htonl(NFS4ERR_RESOURCE);
713 nops--;
714 }
715
680 *hdr_res.status = status; 716 *hdr_res.status = status;
681 *hdr_res.nops = htonl(nops); 717 *hdr_res.nops = htonl(nops);
682 dprintk("%s: done, status = %u\n", __func__, ntohl(status)); 718 dprintk("%s: done, status = %u\n", __func__, ntohl(status));
@@ -713,6 +749,11 @@ static struct callback_op callback_ops[] = {
713 .decode_args = (callback_decode_arg_t)decode_recallany_args, 749 .decode_args = (callback_decode_arg_t)decode_recallany_args,
714 .res_maxsize = CB_OP_RECALLANY_RES_MAXSZ, 750 .res_maxsize = CB_OP_RECALLANY_RES_MAXSZ,
715 }, 751 },
752 [OP_CB_RECALL_SLOT] = {
753 .process_op = (callback_process_op_t)nfs4_callback_recallslot,
754 .decode_args = (callback_decode_arg_t)decode_recallslot_args,
755 .res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ,
756 },
716#endif /* CONFIG_NFS_V4_1 */ 757#endif /* CONFIG_NFS_V4_1 */
717}; 758};
718 759
@@ -741,6 +782,7 @@ struct svc_version nfs4_callback_version1 = {
741 .vs_proc = nfs4_callback_procedures1, 782 .vs_proc = nfs4_callback_procedures1,
742 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, 783 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
743 .vs_dispatch = NULL, 784 .vs_dispatch = NULL,
785 .vs_hidden = 1,
744}; 786};
745 787
746struct svc_version nfs4_callback_version4 = { 788struct svc_version nfs4_callback_version4 = {
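The drc_status threading added to process_op() above means that once CB_SEQUENCE detects an uncached replay, every later op in the compound is answered with NFS4ERR_RETRY_UNCACHED_REP without being decoded or processed. A standalone sketch of that control flow — process_one() and the op numbering are hypothetical stand-ins; 10068 is the RFC 5661 value for the error:

/* Sketch of the drc_status threading: CB_SEQUENCE itself succeeds, but
 * poisons every subsequent op in the same compound. */
#include <stdio.h>

#define NFS4ERR_RETRY_UNCACHED_REP 10068	/* RFC 5661 value */

/* Hypothetical handler: op 1 stands in for CB_SEQUENCE and pretends it
 * detected a replay with nothing cached. */
static int process_one(int op_nr, int *drc_status)
{
	int status;

	if (*drc_status)
		return *drc_status;	/* later ops: skip real processing */

	status = (op_nr == 1) ? NFS4ERR_RETRY_UNCACHED_REP : 0;
	if (status == NFS4ERR_RETRY_UNCACHED_REP) {
		*drc_status = status;	/* poison the rest of the compound */
		status = 0;		/* ...but CB_SEQUENCE itself succeeds */
	}
	return status;
}

int main(void)
{
	int ops[] = { 1, 2, 3 };	/* CB_SEQUENCE then two more ops */
	int drc_status = 0;

	for (int i = 0; i < 3; i++)
		printf("op %d -> status %d\n", ops[i],
		       process_one(ops[i], &drc_status));
	return 0;
}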
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ee77713ce68b..2274f1737336 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -164,30 +164,7 @@ error_0:
164 return ERR_PTR(err); 164 return ERR_PTR(err);
165} 165}
166 166
167static void nfs4_shutdown_client(struct nfs_client *clp)
168{
169#ifdef CONFIG_NFS_V4
170 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
171 nfs4_kill_renewd(clp);
172 BUG_ON(!RB_EMPTY_ROOT(&clp->cl_state_owners));
173 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
174 nfs_idmap_delete(clp);
175
176 rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
177#endif
178}
179
180/*
181 * Destroy the NFS4 callback service
182 */
183static void nfs4_destroy_callback(struct nfs_client *clp)
184{
185#ifdef CONFIG_NFS_V4 167#ifdef CONFIG_NFS_V4
186 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
187 nfs_callback_down(clp->cl_minorversion);
188#endif /* CONFIG_NFS_V4 */
189}
190
191/* 168/*
192 * Clears/puts all minor version specific parts from an nfs_client struct 169 * Clears/puts all minor version specific parts from an nfs_client struct
193 * reverting it to minorversion 0. 170 * reverting it to minorversion 0.
@@ -202,9 +179,33 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
202 179
203 clp->cl_call_sync = _nfs4_call_sync; 180 clp->cl_call_sync = _nfs4_call_sync;
204#endif /* CONFIG_NFS_V4_1 */ 181#endif /* CONFIG_NFS_V4_1 */
182}
205 183
184/*
185 * Destroy the NFS4 callback service
186 */
187static void nfs4_destroy_callback(struct nfs_client *clp)
188{
189 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
190 nfs_callback_down(clp->cl_minorversion);
191}
192
193static void nfs4_shutdown_client(struct nfs_client *clp)
194{
195 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
196 nfs4_kill_renewd(clp);
197 nfs4_clear_client_minor_version(clp);
206 nfs4_destroy_callback(clp); 198 nfs4_destroy_callback(clp);
199 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
200 nfs_idmap_delete(clp);
201
202 rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
207} 203}
204#else
205static void nfs4_shutdown_client(struct nfs_client *clp)
206{
207}
208#endif /* CONFIG_NFS_V4 */
208 209
209/* 210/*
210 * Destroy a shared client record 211 * Destroy a shared client record
@@ -213,7 +214,6 @@ static void nfs_free_client(struct nfs_client *clp)
213{ 214{
214 dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version); 215 dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version);
215 216
216 nfs4_clear_client_minor_version(clp);
217 nfs4_shutdown_client(clp); 217 nfs4_shutdown_client(clp);
218 218
219 nfs_fscache_release_client_cookie(clp); 219 nfs_fscache_release_client_cookie(clp);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 944b627ec6e1..69e7b8140122 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -71,4 +71,10 @@ static inline int nfs_inode_return_delegation(struct inode *inode)
71} 71}
72#endif 72#endif
73 73
74static inline int nfs_have_delegated_attributes(struct inode *inode)
75{
76 return nfs_have_delegation(inode, FMODE_READ) &&
77 !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED);
78}
79
74#endif 80#endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3c7f03b669fb..c6f2750648f4 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -560,7 +560,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
560 desc->entry = &my_entry; 560 desc->entry = &my_entry;
561 561
562 nfs_block_sillyrename(dentry); 562 nfs_block_sillyrename(dentry);
563 res = nfs_revalidate_mapping_nolock(inode, filp->f_mapping); 563 res = nfs_revalidate_mapping(inode, filp->f_mapping);
564 if (res < 0) 564 if (res < 0)
565 goto out; 565 goto out;
566 566
@@ -1789,7 +1789,7 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
1789 cache = nfs_access_search_rbtree(inode, cred); 1789 cache = nfs_access_search_rbtree(inode, cred);
1790 if (cache == NULL) 1790 if (cache == NULL)
1791 goto out; 1791 goto out;
1792 if (!nfs_have_delegation(inode, FMODE_READ) && 1792 if (!nfs_have_delegated_attributes(inode) &&
1793 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) 1793 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
1794 goto out_stale; 1794 goto out_stale;
1795 res->jiffies = cache->jiffies; 1795 res->jiffies = cache->jiffies;
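The dir.c hunk above keeps an access-cache entry valid either under a read delegation with fresh attributes or while it is within the attribute-timeout window. A userspace sketch of the freshness test alone, using a monotonic clock in place of jiffies and time_in_range_open(); the 3-second timeout is an arbitrary example:

/* Sketch of the attribute-timeout test: an entry is fresh if its
 * timestamp is within "timeo" of now. */
#include <stdio.h>
#include <time.h>

static int entry_is_fresh(struct timespec stamp, long timeo_ms)
{
	struct timespec now;
	long age_ms;

	clock_gettime(CLOCK_MONOTONIC, &now);
	age_ms = (now.tv_sec - stamp.tv_sec) * 1000
	       + (now.tv_nsec - stamp.tv_nsec) / 1000000;
	return age_ms >= 0 && age_ms <= timeo_ms;
}

int main(void)
{
	struct timespec stamp;

	clock_gettime(CLOCK_MONOTONIC, &stamp);
	printf("fresh now?       %d\n", entry_is_fresh(stamp, 3000));
	stamp.tv_sec -= 10;	/* pretend the entry is 10s old */
	printf("fresh after 10s? %d\n", entry_is_fresh(stamp, 3000));
	return 0;
}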
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 95e1ca765d47..3f0cd4dfddaf 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -36,6 +36,19 @@ struct nfs_dns_ent {
36}; 36};
37 37
38 38
39static void nfs_dns_ent_update(struct cache_head *cnew,
40 struct cache_head *ckey)
41{
42 struct nfs_dns_ent *new;
43 struct nfs_dns_ent *key;
44
45 new = container_of(cnew, struct nfs_dns_ent, h);
46 key = container_of(ckey, struct nfs_dns_ent, h);
47
48 memcpy(&new->addr, &key->addr, key->addrlen);
49 new->addrlen = key->addrlen;
50}
51
39static void nfs_dns_ent_init(struct cache_head *cnew, 52static void nfs_dns_ent_init(struct cache_head *cnew,
40 struct cache_head *ckey) 53 struct cache_head *ckey)
41{ 54{
@@ -49,8 +62,7 @@ static void nfs_dns_ent_init(struct cache_head *cnew,
49 new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL); 62 new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL);
50 if (new->hostname) { 63 if (new->hostname) {
51 new->namelen = key->namelen; 64 new->namelen = key->namelen;
52 memcpy(&new->addr, &key->addr, key->addrlen); 65 nfs_dns_ent_update(cnew, ckey);
53 new->addrlen = key->addrlen;
54 } else { 66 } else {
55 new->namelen = 0; 67 new->namelen = 0;
56 new->addrlen = 0; 68 new->addrlen = 0;
@@ -234,7 +246,7 @@ static struct cache_detail nfs_dns_resolve = {
234 .cache_show = nfs_dns_show, 246 .cache_show = nfs_dns_show,
235 .match = nfs_dns_match, 247 .match = nfs_dns_match,
236 .init = nfs_dns_ent_init, 248 .init = nfs_dns_ent_init,
237 .update = nfs_dns_ent_init, 249 .update = nfs_dns_ent_update,
238 .alloc = nfs_dns_ent_alloc, 250 .alloc = nfs_dns_ent_alloc,
239}; 251};
240 252
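The dns_resolve.c change above splits cache-entry initialization from update: init duplicates the immutable hostname key once, while update refreshes only the resolved address, so a refresh can never re-duplicate or leak the key. A small standalone sketch of that factoring — struct dns_ent here is a simplified stand-in, not the kernel structure:

/* Sketch of the init/update split: init copies the key once, update
 * refreshes only the volatile payload. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct dns_ent {
	char *hostname;		/* key: set once at init */
	char addr[64];		/* payload: refreshed on update */
};

static void ent_update(struct dns_ent *ent, const struct dns_ent *key)
{
	/* Only the address changes on a refresh. */
	memcpy(ent->addr, key->addr, sizeof(ent->addr));
}

static int ent_init(struct dns_ent *ent, const struct dns_ent *key)
{
	ent->hostname = strdup(key->hostname);
	if (!ent->hostname)
		return -1;
	ent_update(ent, key);	/* share the payload-copy logic */
	return 0;
}

int main(void)
{
	struct dns_ent key = { .hostname = "fileserver" }, ent;

	snprintf(key.addr, sizeof(key.addr), "192.0.2.1");
	if (ent_init(&ent, &key))
		return 1;
	snprintf(key.addr, sizeof(key.addr), "192.0.2.7");	/* re-resolve */
	ent_update(&ent, &key);		/* hostname pointer untouched */
	printf("%s -> %s\n", ent.hostname, ent.addr);
	free(ent.hostname);
	return 0;
}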
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 63f2071d6445..ae0d92736531 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -123,11 +123,11 @@ nfs_file_open(struct inode *inode, struct file *filp)
123 filp->f_path.dentry->d_parent->d_name.name, 123 filp->f_path.dentry->d_parent->d_name.name,
124 filp->f_path.dentry->d_name.name); 124 filp->f_path.dentry->d_name.name);
125 125
126 nfs_inc_stats(inode, NFSIOS_VFSOPEN);
126 res = nfs_check_flags(filp->f_flags); 127 res = nfs_check_flags(filp->f_flags);
127 if (res) 128 if (res)
128 return res; 129 return res;
129 130
130 nfs_inc_stats(inode, NFSIOS_VFSOPEN);
131 res = nfs_open(inode, filp); 131 res = nfs_open(inode, filp);
132 return res; 132 return res;
133} 133}
@@ -237,9 +237,9 @@ nfs_file_flush(struct file *file, fl_owner_t id)
237 dentry->d_parent->d_name.name, 237 dentry->d_parent->d_name.name,
238 dentry->d_name.name); 238 dentry->d_name.name);
239 239
240 nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
240 if ((file->f_mode & FMODE_WRITE) == 0) 241 if ((file->f_mode & FMODE_WRITE) == 0)
241 return 0; 242 return 0;
242 nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
243 243
244 /* Flush writes to the server and return any errors */ 244 /* Flush writes to the server and return any errors */
245 return nfs_do_fsync(ctx, inode); 245 return nfs_do_fsync(ctx, inode);
@@ -262,9 +262,11 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
262 (unsigned long) count, (unsigned long) pos); 262 (unsigned long) count, (unsigned long) pos);
263 263
264 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); 264 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
265 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count); 265 if (!result) {
266 if (!result)
267 result = generic_file_aio_read(iocb, iov, nr_segs, pos); 266 result = generic_file_aio_read(iocb, iov, nr_segs, pos);
267 if (result > 0)
268 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
269 }
268 return result; 270 return result;
269} 271}
270 272
@@ -282,8 +284,11 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
282 (unsigned long) count, (unsigned long long) *ppos); 284 (unsigned long) count, (unsigned long long) *ppos);
283 285
284 res = nfs_revalidate_mapping(inode, filp->f_mapping); 286 res = nfs_revalidate_mapping(inode, filp->f_mapping);
285 if (!res) 287 if (!res) {
286 res = generic_file_splice_read(filp, ppos, pipe, count, flags); 288 res = generic_file_splice_read(filp, ppos, pipe, count, flags);
289 if (res > 0)
290 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
291 }
287 return res; 292 return res;
288} 293}
289 294
@@ -486,7 +491,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
486{ 491{
487 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 492 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
488 493
489 if (gfp & __GFP_WAIT) 494 /* Only do I/O if gfp is a superset of GFP_KERNEL */
495 if ((gfp & GFP_KERNEL) == GFP_KERNEL)
490 nfs_wb_page(page->mapping->host, page); 496 nfs_wb_page(page->mapping->host, page);
491 /* If PagePrivate() is set, then the page is not freeable */ 497 /* If PagePrivate() is set, then the page is not freeable */
492 if (PagePrivate(page)) 498 if (PagePrivate(page))
@@ -596,6 +602,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
596{ 602{
597 struct dentry * dentry = iocb->ki_filp->f_path.dentry; 603 struct dentry * dentry = iocb->ki_filp->f_path.dentry;
598 struct inode * inode = dentry->d_inode; 604 struct inode * inode = dentry->d_inode;
605 unsigned long written = 0;
599 ssize_t result; 606 ssize_t result;
600 size_t count = iov_length(iov, nr_segs); 607 size_t count = iov_length(iov, nr_segs);
601 608
@@ -622,14 +629,18 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
622 if (!count) 629 if (!count)
623 goto out; 630 goto out;
624 631
625 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
626 result = generic_file_aio_write(iocb, iov, nr_segs, pos); 632 result = generic_file_aio_write(iocb, iov, nr_segs, pos);
633 if (result > 0)
634 written = result;
635
627 /* Return error values for O_DSYNC and IS_SYNC() */ 636 /* Return error values for O_DSYNC and IS_SYNC() */
628 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { 637 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
629 int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode); 638 int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
630 if (err < 0) 639 if (err < 0)
631 result = err; 640 result = err;
632 } 641 }
642 if (result > 0)
643 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
633out: 644out:
634 return result; 645 return result;
635 646
@@ -644,6 +655,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
644{ 655{
645 struct dentry *dentry = filp->f_path.dentry; 656 struct dentry *dentry = filp->f_path.dentry;
646 struct inode *inode = dentry->d_inode; 657 struct inode *inode = dentry->d_inode;
658 unsigned long written = 0;
647 ssize_t ret; 659 ssize_t ret;
648 660
649 dprintk("NFS splice_write(%s/%s, %lu@%llu)\n", 661 dprintk("NFS splice_write(%s/%s, %lu@%llu)\n",
@@ -654,14 +666,17 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
654 * The combination of splice and an O_APPEND destination is disallowed. 666 * The combination of splice and an O_APPEND destination is disallowed.
655 */ 667 */
656 668
657 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
658
659 ret = generic_file_splice_write(pipe, filp, ppos, count, flags); 669 ret = generic_file_splice_write(pipe, filp, ppos, count, flags);
670 if (ret > 0)
671 written = ret;
672
660 if (ret >= 0 && nfs_need_sync_write(filp, inode)) { 673 if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
661 int err = nfs_do_fsync(nfs_file_open_context(filp), inode); 674 int err = nfs_do_fsync(nfs_file_open_context(filp), inode);
662 if (err < 0) 675 if (err < 0)
663 ret = err; 676 ret = err;
664 } 677 }
678 if (ret > 0)
679 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
665 return ret; 680 return ret;
666} 681}
667 682
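The file.c hunks above move the NFSIOS_NORMALREADBYTES/NFSIOS_NORMALWRITTENBYTES accounting to after the I/O completes, so the counters reflect bytes actually transferred rather than the requested count. A minimal userspace sketch of the same pattern around read(2); the counter name and example file are arbitrary:

/* Sketch: bump the byte counter with the amount actually transferred,
 * after the I/O, instead of the requested count before it. */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

static unsigned long bytes_read_stat;

static ssize_t counted_read(int fd, void *buf, size_t count)
{
	ssize_t result = read(fd, buf, count);

	if (result > 0)			/* short reads counted exactly */
		bytes_read_stat += result;
	return result;			/* errors add nothing */
}

int main(void)
{
	char buf[4096];
	int fd = open("/etc/hostname", O_RDONLY);	/* example file */

	if (fd < 0) { perror("open"); return 1; }
	while (counted_read(fd, buf, sizeof(buf)) > 0)
		;
	close(fd);
	printf("accounted %lu bytes\n", bytes_read_stat);
	return 0;
}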
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index f141bde7756a..e358df75a6ad 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -97,22 +97,6 @@ u64 nfs_compat_user_ino64(u64 fileid)
97 return ino; 97 return ino;
98} 98}
99 99
100int nfs_write_inode(struct inode *inode, int sync)
101{
102 int ret;
103
104 if (sync) {
105 ret = filemap_fdatawait(inode->i_mapping);
106 if (ret == 0)
107 ret = nfs_commit_inode(inode, FLUSH_SYNC);
108 } else
109 ret = nfs_commit_inode(inode, 0);
110 if (ret >= 0)
111 return 0;
112 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
113 return ret;
114}
115
116void nfs_clear_inode(struct inode *inode) 100void nfs_clear_inode(struct inode *inode)
117{ 101{
118 /* 102 /*
@@ -130,16 +114,12 @@ void nfs_clear_inode(struct inode *inode)
130 */ 114 */
131int nfs_sync_mapping(struct address_space *mapping) 115int nfs_sync_mapping(struct address_space *mapping)
132{ 116{
133 int ret; 117 int ret = 0;
134 118
135 if (mapping->nrpages == 0) 119 if (mapping->nrpages != 0) {
136 return 0; 120 unmap_mapping_range(mapping, 0, 0, 0);
137 unmap_mapping_range(mapping, 0, 0, 0); 121 ret = nfs_wb_all(mapping->host);
138 ret = filemap_write_and_wait(mapping); 122 }
139 if (ret != 0)
140 goto out;
141 ret = nfs_wb_all(mapping->host);
142out:
143 return ret; 123 return ret;
144} 124}
145 125
@@ -511,17 +491,11 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
511 int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; 491 int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
512 int err; 492 int err;
513 493
514 /* 494 /* Flush out writes to the server in order to update c/mtime. */
515 * Flush out writes to the server in order to update c/mtime.
516 *
517 * Hold the i_mutex to suspend application writes temporarily;
518 * this prevents long-running writing applications from blocking
519 * nfs_wb_nocommit.
520 */
521 if (S_ISREG(inode->i_mode)) { 495 if (S_ISREG(inode->i_mode)) {
522 mutex_lock(&inode->i_mutex); 496 err = filemap_write_and_wait(inode->i_mapping);
523 nfs_wb_nocommit(inode); 497 if (err)
524 mutex_unlock(&inode->i_mutex); 498 goto out;
525 } 499 }
526 500
527 /* 501 /*
@@ -545,6 +519,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
545 generic_fillattr(inode, stat); 519 generic_fillattr(inode, stat);
546 stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); 520 stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
547 } 521 }
522out:
548 return err; 523 return err;
549} 524}
550 525
@@ -574,14 +549,14 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
574 nfs_revalidate_inode(server, inode); 549 nfs_revalidate_inode(server, inode);
575} 550}
576 551
577static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred) 552static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred)
578{ 553{
579 struct nfs_open_context *ctx; 554 struct nfs_open_context *ctx;
580 555
581 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 556 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
582 if (ctx != NULL) { 557 if (ctx != NULL) {
583 ctx->path.dentry = dget(dentry); 558 ctx->path = *path;
584 ctx->path.mnt = mntget(mnt); 559 path_get(&ctx->path);
585 ctx->cred = get_rpccred(cred); 560 ctx->cred = get_rpccred(cred);
586 ctx->state = NULL; 561 ctx->state = NULL;
587 ctx->lockowner = current->files; 562 ctx->lockowner = current->files;
@@ -620,11 +595,6 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
620 __put_nfs_open_context(ctx, 0); 595 __put_nfs_open_context(ctx, 0);
621} 596}
622 597
623static void put_nfs_open_context_sync(struct nfs_open_context *ctx)
624{
625 __put_nfs_open_context(ctx, 1);
626}
627
628/* 598/*
629 * Ensure that mmap has a recent RPC credential for use when writing out 599 * Ensure that mmap has a recent RPC credential for use when writing out
630 * shared pages 600 * shared pages
@@ -671,7 +641,7 @@ static void nfs_file_clear_open_context(struct file *filp)
671 spin_lock(&inode->i_lock); 641 spin_lock(&inode->i_lock);
672 list_move_tail(&ctx->list, &NFS_I(inode)->open_files); 642 list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
673 spin_unlock(&inode->i_lock); 643 spin_unlock(&inode->i_lock);
674 put_nfs_open_context_sync(ctx); 644 __put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1);
675 } 645 }
676} 646}
677 647
@@ -686,7 +656,7 @@ int nfs_open(struct inode *inode, struct file *filp)
686 cred = rpc_lookup_cred(); 656 cred = rpc_lookup_cred();
687 if (IS_ERR(cred)) 657 if (IS_ERR(cred))
688 return PTR_ERR(cred); 658 return PTR_ERR(cred);
689 ctx = alloc_nfs_open_context(filp->f_path.mnt, filp->f_path.dentry, cred); 659 ctx = alloc_nfs_open_context(&filp->f_path, cred);
690 put_rpccred(cred); 660 put_rpccred(cred);
691 if (ctx == NULL) 661 if (ctx == NULL)
692 return -ENOMEM; 662 return -ENOMEM;
@@ -759,7 +729,7 @@ int nfs_attribute_timeout(struct inode *inode)
759{ 729{
760 struct nfs_inode *nfsi = NFS_I(inode); 730 struct nfs_inode *nfsi = NFS_I(inode);
761 731
762 if (nfs_have_delegation(inode, FMODE_READ)) 732 if (nfs_have_delegated_attributes(inode))
763 return 0; 733 return 0;
764 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); 734 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
765} 735}
@@ -779,7 +749,7 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
779 return __nfs_revalidate_inode(server, inode); 749 return __nfs_revalidate_inode(server, inode);
780} 750}
781 751
782static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_space *mapping) 752static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
783{ 753{
784 struct nfs_inode *nfsi = NFS_I(inode); 754 struct nfs_inode *nfsi = NFS_I(inode);
785 755
@@ -800,49 +770,10 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa
800 return 0; 770 return 0;
801} 771}
802 772
803static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
804{
805 int ret = 0;
806
807 mutex_lock(&inode->i_mutex);
808 if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_DATA) {
809 ret = nfs_sync_mapping(mapping);
810 if (ret == 0)
811 ret = nfs_invalidate_mapping_nolock(inode, mapping);
812 }
813 mutex_unlock(&inode->i_mutex);
814 return ret;
815}
816
817/**
818 * nfs_revalidate_mapping_nolock - Revalidate the pagecache
819 * @inode - pointer to host inode
820 * @mapping - pointer to mapping
821 */
822int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *mapping)
823{
824 struct nfs_inode *nfsi = NFS_I(inode);
825 int ret = 0;
826
827 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
828 || nfs_attribute_timeout(inode) || NFS_STALE(inode)) {
829 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
830 if (ret < 0)
831 goto out;
832 }
833 if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
834 ret = nfs_invalidate_mapping_nolock(inode, mapping);
835out:
836 return ret;
837}
838
839/** 773/**
840 * nfs_revalidate_mapping - Revalidate the pagecache 774 * nfs_revalidate_mapping - Revalidate the pagecache
841 * @inode - pointer to host inode 775 * @inode - pointer to host inode
842 * @mapping - pointer to mapping 776 * @mapping - pointer to mapping
843 *
844 * This version of the function will take the inode->i_mutex and attempt to
845 * flush out all dirty data if it needs to invalidate the page cache.
846 */ 777 */
847int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) 778int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
848{ 779{
@@ -1420,6 +1351,7 @@ static void init_once(void *foo)
1420 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); 1351 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
1421 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); 1352 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
1422 nfsi->npages = 0; 1353 nfsi->npages = 0;
1354 nfsi->ncommit = 0;
1423 atomic_set(&nfsi->silly_count, 1); 1355 atomic_set(&nfsi->silly_count, 1);
1424 INIT_HLIST_HEAD(&nfsi->silly_list); 1356 INIT_HLIST_HEAD(&nfsi->silly_list);
1425 init_waitqueue_head(&nfsi->waitqueue); 1357 init_waitqueue_head(&nfsi->waitqueue);
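The nfs_getattr() change above replaces the i_mutex-protected nfs_wb_nocommit() with a plain filemap_write_and_wait(): flush dirty pages before reporting attributes so c/mtime are current, without stalling writers on the inode mutex. A rough userspace analog of that flush-before-stat ordering; the file path is an arbitrary example:

/* Sketch: flush dirty data first, then ask for attributes, so the
 * reported size and mtime are current. */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>

int main(void)
{
	struct stat st;
	int fd = open("/tmp/example-file", O_CREAT | O_WRONLY, 0644);

	if (fd < 0) { perror("open"); return 1; }
	if (write(fd, "hello\n", 6) != 6)
		perror("write");
	if (fsync(fd) < 0)	/* flush, like filemap_write_and_wait() */
		perror("fsync");
	if (fstat(fd, &st) == 0)
		printf("size after flush: %lld\n", (long long)st.st_size);
	close(fd);
	unlink("/tmp/example-file");
	return 0;
}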
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 29e464d23b32..11f82f03c5de 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -211,7 +211,7 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
211extern struct workqueue_struct *nfsiod_workqueue; 211extern struct workqueue_struct *nfsiod_workqueue;
212extern struct inode *nfs_alloc_inode(struct super_block *sb); 212extern struct inode *nfs_alloc_inode(struct super_block *sb);
213extern void nfs_destroy_inode(struct inode *); 213extern void nfs_destroy_inode(struct inode *);
214extern int nfs_write_inode(struct inode *,int); 214extern int nfs_write_inode(struct inode *, struct writeback_control *);
215extern void nfs_clear_inode(struct inode *); 215extern void nfs_clear_inode(struct inode *);
216#ifdef CONFIG_NFS_V4 216#ifdef CONFIG_NFS_V4
217extern void nfs4_clear_inode(struct inode *); 217extern void nfs4_clear_inode(struct inode *);
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 46d779abafd3..1d8d5c813b01 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -57,12 +57,12 @@ static inline void nfs_add_fscache_stats(struct inode *inode,
57} 57}
58#endif 58#endif
59 59
60static inline struct nfs_iostats *nfs_alloc_iostats(void) 60static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void)
61{ 61{
62 return alloc_percpu(struct nfs_iostats); 62 return alloc_percpu(struct nfs_iostats);
63} 63}
64 64
65static inline void nfs_free_iostats(struct nfs_iostats *stats) 65static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats)
66{ 66{
67 if (stats != NULL) 67 if (stats != NULL)
68 free_percpu(stats); 68 free_percpu(stats);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 3f8881d1a050..24992f0a29f2 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -22,14 +22,14 @@
22 22
23#define NFSDBG_FACILITY NFSDBG_PROC 23#define NFSDBG_FACILITY NFSDBG_PROC
24 24
25/* A wrapper to handle the EJUKEBOX error message */ 25/* A wrapper to handle the EJUKEBOX and EKEYEXPIRED error messages */
26static int 26static int
27nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) 27nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
28{ 28{
29 int res; 29 int res;
30 do { 30 do {
31 res = rpc_call_sync(clnt, msg, flags); 31 res = rpc_call_sync(clnt, msg, flags);
32 if (res != -EJUKEBOX) 32 if (res != -EJUKEBOX && res != -EKEYEXPIRED)
33 break; 33 break;
34 schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); 34 schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
35 res = -ERESTARTSYS; 35 res = -ERESTARTSYS;
@@ -42,9 +42,10 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
42static int 42static int
43nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode) 43nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode)
44{ 44{
45 if (task->tk_status != -EJUKEBOX) 45 if (task->tk_status != -EJUKEBOX && task->tk_status != -EKEYEXPIRED)
46 return 0; 46 return 0;
47 nfs_inc_stats(inode, NFSIOS_DELAY); 47 if (task->tk_status == -EJUKEBOX)
48 nfs_inc_stats(inode, NFSIOS_DELAY);
48 task->tk_status = 0; 49 task->tk_status = 0;
49 rpc_restart_call(task); 50 rpc_restart_call(task);
50 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); 51 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
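The nfs3_rpc_wrapper() hunk above extends the retry set so both EJUKEBOX and EKEYEXPIRED mean "delay and try again" while any other result is final. A standalone sketch of that wrapper shape — flaky_call() is a hypothetical stand-in that fails twice, and EAGAIN substitutes for the NFS-internal EJUKEBOX:

/* Sketch of the retry-on-transient-error wrapper: a fixed set of errno
 * values triggers a sleep-and-retry, everything else returns. */
#include <stdio.h>
#include <errno.h>
#include <unistd.h>

static int flaky_call(void)
{
	static int attempts;

	return (++attempts < 3) ? -EAGAIN : 0;	/* succeed on 3rd try */
}

static int call_with_retry(int (*fn)(void), unsigned int delay_secs)
{
	int res;

	for (;;) {
		res = fn();
		if (res != -EAGAIN && res != -EKEYEXPIRED)
			break;			/* final result */
		sleep(delay_secs);		/* cf. NFS_JUKEBOX_RETRY_TIME */
	}
	return res;
}

int main(void)
{
	printf("final result: %d\n", call_with_retry(flaky_call, 1));
	return 0;
}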
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 0c6fda33d66e..a187200a7aac 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -46,6 +46,7 @@ enum nfs4_client_state {
46 NFS4CLNT_DELEGRETURN, 46 NFS4CLNT_DELEGRETURN,
47 NFS4CLNT_SESSION_RESET, 47 NFS4CLNT_SESSION_RESET,
48 NFS4CLNT_SESSION_DRAINING, 48 NFS4CLNT_SESSION_DRAINING,
49 NFS4CLNT_RECALL_SLOT,
49}; 50};
50 51
51/* 52/*
@@ -280,6 +281,7 @@ extern void nfs4_schedule_state_manager(struct nfs_client *);
280extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state); 281extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state);
281extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state); 282extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state);
282extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); 283extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
284extern void nfs41_handle_recall_slot(struct nfs_client *clp);
283extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 285extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
284extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 286extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
285extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); 287extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 375f0fae2c6a..f9254fb0c9d0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -281,6 +281,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
281 } 281 }
282 case -NFS4ERR_GRACE: 282 case -NFS4ERR_GRACE:
283 case -NFS4ERR_DELAY: 283 case -NFS4ERR_DELAY:
284 case -EKEYEXPIRED:
284 ret = nfs4_delay(server->client, &exception->timeout); 285 ret = nfs4_delay(server->client, &exception->timeout);
285 if (ret != 0) 286 if (ret != 0)
286 break; 287 break;
@@ -418,7 +419,8 @@ static void nfs41_sequence_done(struct nfs_client *clp,
418 clp->cl_last_renewal = timestamp; 419 clp->cl_last_renewal = timestamp;
419 spin_unlock(&clp->cl_lock); 420 spin_unlock(&clp->cl_lock);
420 /* Check sequence flags */ 421 /* Check sequence flags */
421 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); 422 if (atomic_read(&clp->cl_count) > 1)
423 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
422 } 424 }
423out: 425out:
424 /* The session may be reset by one of the error handlers. */ 426 /* The session may be reset by one of the error handlers. */
@@ -724,8 +726,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
724 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); 726 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid);
725 if (p->o_arg.seqid == NULL) 727 if (p->o_arg.seqid == NULL)
726 goto err_free; 728 goto err_free;
727 p->path.mnt = mntget(path->mnt); 729 path_get(path);
728 p->path.dentry = dget(path->dentry); 730 p->path = *path;
729 p->dir = parent; 731 p->dir = parent;
730 p->owner = sp; 732 p->owner = sp;
731 atomic_inc(&sp->so_count); 733 atomic_inc(&sp->so_count);
@@ -1163,7 +1165,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
1163 int err; 1165 int err;
1164 do { 1166 do {
1165 err = _nfs4_do_open_reclaim(ctx, state); 1167 err = _nfs4_do_open_reclaim(ctx, state);
1166 if (err != -NFS4ERR_DELAY) 1168 if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED)
1167 break; 1169 break;
1168 nfs4_handle_exception(server, err, &exception); 1170 nfs4_handle_exception(server, err, &exception);
1169 } while (exception.retry); 1171 } while (exception.retry);
@@ -1582,6 +1584,7 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state
1582 goto out; 1584 goto out;
1583 case -NFS4ERR_GRACE: 1585 case -NFS4ERR_GRACE:
1584 case -NFS4ERR_DELAY: 1586 case -NFS4ERR_DELAY:
1587 case -EKEYEXPIRED:
1585 nfs4_handle_exception(server, err, &exception); 1588 nfs4_handle_exception(server, err, &exception);
1586 err = 0; 1589 err = 0;
1587 } 1590 }
@@ -1944,8 +1947,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1944 calldata->res.seqid = calldata->arg.seqid; 1947 calldata->res.seqid = calldata->arg.seqid;
1945 calldata->res.server = server; 1948 calldata->res.server = server;
1946 calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; 1949 calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
1947 calldata->path.mnt = mntget(path->mnt); 1950 path_get(path);
1948 calldata->path.dentry = dget(path->dentry); 1951 calldata->path = *path;
1949 1952
1950 msg.rpc_argp = &calldata->arg, 1953 msg.rpc_argp = &calldata->arg,
1951 msg.rpc_resp = &calldata->res, 1954 msg.rpc_resp = &calldata->res,
@@ -3145,10 +3148,19 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
3145 * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special 3148 * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special
3146 * standalone procedure for queueing an asynchronous RENEW. 3149 * standalone procedure for queueing an asynchronous RENEW.
3147 */ 3150 */
3151static void nfs4_renew_release(void *data)
3152{
3153 struct nfs_client *clp = data;
3154
3155 if (atomic_read(&clp->cl_count) > 1)
3156 nfs4_schedule_state_renewal(clp);
3157 nfs_put_client(clp);
3158}
3159
3148static void nfs4_renew_done(struct rpc_task *task, void *data) 3160static void nfs4_renew_done(struct rpc_task *task, void *data)
3149{ 3161{
3150 struct nfs_client *clp = (struct nfs_client *)task->tk_msg.rpc_argp; 3162 struct nfs_client *clp = data;
3151 unsigned long timestamp = (unsigned long)data; 3163 unsigned long timestamp = task->tk_start;
3152 3164
3153 if (task->tk_status < 0) { 3165 if (task->tk_status < 0) {
3154 /* Unless we're shutting down, schedule state recovery! */ 3166 /* Unless we're shutting down, schedule state recovery! */
@@ -3164,6 +3176,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *data)
3164 3176
3165static const struct rpc_call_ops nfs4_renew_ops = { 3177static const struct rpc_call_ops nfs4_renew_ops = {
3166 .rpc_call_done = nfs4_renew_done, 3178 .rpc_call_done = nfs4_renew_done,
3179 .rpc_release = nfs4_renew_release,
3167}; 3180};
3168 3181
3169int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) 3182int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
@@ -3174,8 +3187,10 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
3174 .rpc_cred = cred, 3187 .rpc_cred = cred,
3175 }; 3188 };
3176 3189
3190 if (!atomic_inc_not_zero(&clp->cl_count))
3191 return -EIO;
3177 return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, 3192 return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
3178 &nfs4_renew_ops, (void *)jiffies); 3193 &nfs4_renew_ops, clp);
3179} 3194}
3180 3195
3181int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) 3196int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
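[The pair of hunks above closes a use-after-free window in the async RENEW path: the nfs_client is pinned with atomic_inc_not_zero() before the RPC is queued, and the new rpc_release callback drops that reference, rescheduling the next renewal only while other users remain. A rough userspace sketch of this conditional-get/put-on-completion idiom, with hypothetical names and a simulated completion:]

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct client { atomic_int refcount; };    /* 0 == being torn down */

    /* Take a reference only if the object is still live
     * (cf. atomic_inc_not_zero()). */
    static int client_get_not_zero(struct client *clp)
    {
        int old = atomic_load(&clp->refcount);
        while (old != 0)
            if (atomic_compare_exchange_weak(&clp->refcount, &old, old + 1))
                return 1;
        return 0;
    }

    static void client_put(struct client *clp)
    {
        if (atomic_fetch_sub(&clp->refcount, 1) == 1)
            free(clp);                          /* last reference gone */
    }

    /* Runs after the async call finishes (cf. nfs4_renew_release()). */
    static void renew_release(struct client *clp)
    {
        if (atomic_load(&clp->refcount) > 1)
            puts("client still in use: schedule next renewal");
        client_put(clp);                        /* drop the submit-time ref */
    }

    static int submit_async_renew(struct client *clp)
    {
        if (!client_get_not_zero(clp))
            return -5;                          /* cf. returning -EIO */
        renew_release(clp);                     /* pretend the call completed */
        return 0;
    }

    int main(void)
    {
        struct client *clp = malloc(sizeof(*clp));
        atomic_init(&clp->refcount, 1);         /* main() holds one reference */
        submit_async_renew(clp);
        client_put(clp);                        /* frees the client */
        return 0;
    }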
@@ -3452,6 +3467,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3452 if (server) 3467 if (server)
3453 nfs_inc_server_stats(server, NFSIOS_DELAY); 3468 nfs_inc_server_stats(server, NFSIOS_DELAY);
3454 case -NFS4ERR_GRACE: 3469 case -NFS4ERR_GRACE:
3470 case -EKEYEXPIRED:
3455 rpc_delay(task, NFS4_POLL_RETRY_MAX); 3471 rpc_delay(task, NFS4_POLL_RETRY_MAX);
3456 task->tk_status = 0; 3472 task->tk_status = 0;
3457 return -EAGAIN; 3473 return -EAGAIN;
@@ -3564,6 +3580,7 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
3564 case -NFS4ERR_RESOURCE: 3580 case -NFS4ERR_RESOURCE:
3565 /* The IBM lawyers misread another document! */ 3581 /* The IBM lawyers misread another document! */
3566 case -NFS4ERR_DELAY: 3582 case -NFS4ERR_DELAY:
3583 case -EKEYEXPIRED:
3567 err = nfs4_delay(clp->cl_rpcclient, &timeout); 3584 err = nfs4_delay(clp->cl_rpcclient, &timeout);
3568 } 3585 }
3569 } while (err == 0); 3586 } while (err == 0);
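[Here and in the surrounding hunks, -EKEYEXPIRED (typically a lapsed Kerberos TGT) joins the set of errors treated like NFS4ERR_DELAY: back off and retry rather than fail the operation. A minimal sketch of that retry shape, with an assumed error constant and stub RPC; the real code delays via nfs4_delay()/rpc_delay():]

    #include <errno.h>
    #include <unistd.h>

    #define ERR_DELAY 10008                    /* stand-in for NFS4ERR_DELAY */

    static int fake_rpc(void) { return 0; }    /* stub that succeeds at once */

    /* Retry only on transient errors; hand anything else to the caller. */
    static int call_with_retry(int (*do_rpc)(void))
    {
        int err;

        for (;;) {
            err = do_rpc();
            if (err != -ERR_DELAY && err != -EKEYEXPIRED)
                return err;
            sleep(1);                          /* illustrative backoff */
        }
    }

    int main(void) { return call_with_retry(fake_rpc); }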
@@ -4179,7 +4196,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
4179 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) 4196 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
4180 return 0; 4197 return 0;
4181 err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); 4198 err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
4182 if (err != -NFS4ERR_DELAY) 4199 if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED)
4183 break; 4200 break;
4184 nfs4_handle_exception(server, err, &exception); 4201 nfs4_handle_exception(server, err, &exception);
4185 } while (exception.retry); 4202 } while (exception.retry);
@@ -4204,6 +4221,7 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
4204 goto out; 4221 goto out;
4205 case -NFS4ERR_GRACE: 4222 case -NFS4ERR_GRACE:
4206 case -NFS4ERR_DELAY: 4223 case -NFS4ERR_DELAY:
4224 case -EKEYEXPIRED:
4207 nfs4_handle_exception(server, err, &exception); 4225 nfs4_handle_exception(server, err, &exception);
4208 err = 0; 4226 err = 0;
4209 } 4227 }
@@ -4355,6 +4373,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4355 err = 0; 4373 err = 0;
4356 goto out; 4374 goto out;
4357 case -NFS4ERR_DELAY: 4375 case -NFS4ERR_DELAY:
4376 case -EKEYEXPIRED:
4358 break; 4377 break;
4359 } 4378 }
4360 err = nfs4_handle_exception(server, err, &exception); 4379 err = nfs4_handle_exception(server, err, &exception);
@@ -4500,7 +4519,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4500 4519
4501 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); 4520 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
4502 4521
4503 if (status != NFS4ERR_CLID_INUSE) 4522 if (status != -NFS4ERR_CLID_INUSE)
4504 break; 4523 break;
4505 4524
4506 if (signalled()) 4525 if (signalled())
@@ -4554,6 +4573,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
4554 switch (task->tk_status) { 4573 switch (task->tk_status) {
4555 case -NFS4ERR_DELAY: 4574 case -NFS4ERR_DELAY:
4556 case -NFS4ERR_GRACE: 4575 case -NFS4ERR_GRACE:
4576 case -EKEYEXPIRED:
4557 dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); 4577 dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
4558 rpc_delay(task, NFS4_POLL_RETRY_MIN); 4578 rpc_delay(task, NFS4_POLL_RETRY_MIN);
4559 task->tk_status = 0; 4579 task->tk_status = 0;
@@ -4611,26 +4631,32 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
4611/* 4631/*
4612 * Reset a slot table 4632 * Reset a slot table
4613 */ 4633 */
4614static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, int max_slots, 4634static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
4615 int old_max_slots, int ivalue) 4635 int ivalue)
4616{ 4636{
4637 struct nfs4_slot *new = NULL;
4617 int i; 4638 int i;
4618 int ret = 0; 4639 int ret = 0;
4619 4640
4620 dprintk("--> %s: max_reqs=%u, tbl %p\n", __func__, max_slots, tbl); 4641 dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__,
4642 max_reqs, tbl->max_slots);
4621 4643
4622 /* 4644 /* Does the newly negotiated max_reqs match the existing slot table? */
4623 * Until we have dynamic slot table adjustment, insist 4645 if (max_reqs != tbl->max_slots) {
4624 * upon the same slot table size 4646 ret = -ENOMEM;
4625 */ 4647 new = kmalloc(max_reqs * sizeof(struct nfs4_slot),
4626 if (max_slots != old_max_slots) { 4648 GFP_KERNEL);
4627 dprintk("%s reset slot table does't match old\n", 4649 if (!new)
4628 __func__); 4650 goto out;
4629 ret = -EINVAL; /*XXX NFS4ERR_REQ_TOO_BIG ? */ 4651 ret = 0;
4630 goto out; 4652 kfree(tbl->slots);
4631 } 4653 }
4632 spin_lock(&tbl->slot_tbl_lock); 4654 spin_lock(&tbl->slot_tbl_lock);
4633 for (i = 0; i < max_slots; ++i) 4655 if (new) {
4656 tbl->slots = new;
4657 tbl->max_slots = max_reqs;
4658 }
4659 for (i = 0; i < tbl->max_slots; ++i)
4634 tbl->slots[i].seq_nr = ivalue; 4660 tbl->slots[i].seq_nr = ivalue;
4635 spin_unlock(&tbl->slot_tbl_lock); 4661 spin_unlock(&tbl->slot_tbl_lock);
4636 dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, 4662 dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
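[The rewritten nfs4_reset_slot_table() drops the old "sizes must match" restriction: if the server negotiated a different max_reqs, a new array is allocated before the spinlock is taken, then published and reinitialised under it. A userspace sketch of the allocate-outside-the-lock, publish-under-it shape (names illustrative; the kernel version can free the old array early because the session is drained):]

    #include <pthread.h>
    #include <stdlib.h>

    struct slot { unsigned seq_nr; };

    struct slot_table {
        pthread_mutex_t lock;
        struct slot *slots;
        unsigned max_slots;
    };

    static int reset_slot_table(struct slot_table *tbl, unsigned max_reqs,
                                unsigned ivalue)
    {
        struct slot *new = NULL;
        unsigned i;

        if (max_reqs != tbl->max_slots) {      /* size changed: reallocate */
            new = malloc(max_reqs * sizeof(*new));
            if (!new)
                return -1;                     /* cf. -ENOMEM */
        }
        pthread_mutex_lock(&tbl->lock);
        if (new) {
            free(tbl->slots);                  /* safe: no requests in flight */
            tbl->slots = new;
            tbl->max_slots = max_reqs;
        }
        for (i = 0; i < tbl->max_slots; i++)   /* reset every sequence number */
            tbl->slots[i].seq_nr = ivalue;
        pthread_mutex_unlock(&tbl->lock);
        return 0;
    }

    int main(void)
    {
        struct slot_table tbl = { PTHREAD_MUTEX_INITIALIZER, NULL, 0 };
        int ret = reset_slot_table(&tbl, 16, 1);
        free(tbl.slots);
        return ret;
    }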
@@ -4648,16 +4674,12 @@ static int nfs4_reset_slot_tables(struct nfs4_session *session)
4648 int status; 4674 int status;
4649 4675
4650 status = nfs4_reset_slot_table(&session->fc_slot_table, 4676 status = nfs4_reset_slot_table(&session->fc_slot_table,
4651 session->fc_attrs.max_reqs, 4677 session->fc_attrs.max_reqs, 1);
4652 session->fc_slot_table.max_slots,
4653 1);
4654 if (status) 4678 if (status)
4655 return status; 4679 return status;
4656 4680
4657 status = nfs4_reset_slot_table(&session->bc_slot_table, 4681 status = nfs4_reset_slot_table(&session->bc_slot_table,
4658 session->bc_attrs.max_reqs, 4682 session->bc_attrs.max_reqs, 0);
4659 session->bc_slot_table.max_slots,
4660 0);
4661 return status; 4683 return status;
4662} 4684}
4663 4685
@@ -4798,16 +4820,14 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
4798 args->fc_attrs.headerpadsz = 0; 4820 args->fc_attrs.headerpadsz = 0;
4799 args->fc_attrs.max_rqst_sz = mxrqst_sz; 4821 args->fc_attrs.max_rqst_sz = mxrqst_sz;
4800 args->fc_attrs.max_resp_sz = mxresp_sz; 4822 args->fc_attrs.max_resp_sz = mxresp_sz;
4801 args->fc_attrs.max_resp_sz_cached = mxresp_sz;
4802 args->fc_attrs.max_ops = NFS4_MAX_OPS; 4823 args->fc_attrs.max_ops = NFS4_MAX_OPS;
4803 args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs; 4824 args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs;
4804 4825
4805 dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u " 4826 dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u "
4806 "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", 4827 "max_ops=%u max_reqs=%u\n",
4807 __func__, 4828 __func__,
4808 args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz, 4829 args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz,
4809 args->fc_attrs.max_resp_sz_cached, args->fc_attrs.max_ops, 4830 args->fc_attrs.max_ops, args->fc_attrs.max_reqs);
4810 args->fc_attrs.max_reqs);
4811 4831
4812 /* Back channel attributes */ 4832 /* Back channel attributes */
4813 args->bc_attrs.headerpadsz = 0; 4833 args->bc_attrs.headerpadsz = 0;
@@ -5016,7 +5036,16 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
5016 &res, args.sa_cache_this, 1); 5036 &res, args.sa_cache_this, 1);
5017} 5037}
5018 5038
5019void nfs41_sequence_call_done(struct rpc_task *task, void *data) 5039static void nfs41_sequence_release(void *data)
5040{
5041 struct nfs_client *clp = (struct nfs_client *)data;
5042
5043 if (atomic_read(&clp->cl_count) > 1)
5044 nfs4_schedule_state_renewal(clp);
5045 nfs_put_client(clp);
5046}
5047
5048static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
5020{ 5049{
5021 struct nfs_client *clp = (struct nfs_client *)data; 5050 struct nfs_client *clp = (struct nfs_client *)data;
5022 5051
@@ -5024,6 +5053,8 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data)
5024 5053
5025 if (task->tk_status < 0) { 5054 if (task->tk_status < 0) {
5026 dprintk("%s ERROR %d\n", __func__, task->tk_status); 5055 dprintk("%s ERROR %d\n", __func__, task->tk_status);
5056 if (atomic_read(&clp->cl_count) == 1)
5057 goto out;
5027 5058
5028 if (_nfs4_async_handle_error(task, NULL, clp, NULL) 5059 if (_nfs4_async_handle_error(task, NULL, clp, NULL)
5029 == -EAGAIN) { 5060 == -EAGAIN) {
@@ -5032,7 +5063,7 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data)
5032 } 5063 }
5033 } 5064 }
5034 dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); 5065 dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
5035 5066out:
5036 kfree(task->tk_msg.rpc_argp); 5067 kfree(task->tk_msg.rpc_argp);
5037 kfree(task->tk_msg.rpc_resp); 5068 kfree(task->tk_msg.rpc_resp);
5038 5069
@@ -5057,6 +5088,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
5057static const struct rpc_call_ops nfs41_sequence_ops = { 5088static const struct rpc_call_ops nfs41_sequence_ops = {
5058 .rpc_call_done = nfs41_sequence_call_done, 5089 .rpc_call_done = nfs41_sequence_call_done,
5059 .rpc_call_prepare = nfs41_sequence_prepare, 5090 .rpc_call_prepare = nfs41_sequence_prepare,
5091 .rpc_release = nfs41_sequence_release,
5060}; 5092};
5061 5093
5062static int nfs41_proc_async_sequence(struct nfs_client *clp, 5094static int nfs41_proc_async_sequence(struct nfs_client *clp,
@@ -5069,12 +5101,14 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp,
5069 .rpc_cred = cred, 5101 .rpc_cred = cred,
5070 }; 5102 };
5071 5103
5104 if (!atomic_inc_not_zero(&clp->cl_count))
5105 return -EIO;
5072 args = kzalloc(sizeof(*args), GFP_KERNEL); 5106 args = kzalloc(sizeof(*args), GFP_KERNEL);
5073 if (!args)
5074 return -ENOMEM;
5075 res = kzalloc(sizeof(*res), GFP_KERNEL); 5107 res = kzalloc(sizeof(*res), GFP_KERNEL);
5076 if (!res) { 5108 if (!args || !res) {
5077 kfree(args); 5109 kfree(args);
5110 kfree(res);
5111 nfs_put_client(clp);
5078 return -ENOMEM; 5112 return -ENOMEM;
5079 } 5113 }
5080 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 5114 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
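[The reworked nfs41_proc_async_sequence() pins the client first, then performs both allocations and checks them with a single branch: kfree()/free() accept NULL, so one cleanup path covers whichever allocation failed, and the client reference is dropped there too. Roughly, with stubbed types and refcounting:]

    #include <stdlib.h>

    struct seq_args { int slotid; };   /* stand-ins for the SEQUENCE XDR types */
    struct seq_res  { int slotid; };

    static int client_get(void) { return 1; } /* stub; cf. atomic_inc_not_zero */
    static void client_put(void) { }

    static int setup_sequence_call(void)
    {
        struct seq_args *args;
        struct seq_res *res;

        if (!client_get())
            return -5;                         /* cf. -EIO: client going away */
        args = calloc(1, sizeof(*args));
        res = calloc(1, sizeof(*res));
        if (!args || !res) {                   /* one branch for both failures */
            free(args);                        /* free(NULL) is a no-op */
            free(res);
            client_put();
            return -12;                        /* cf. -ENOMEM */
        }
        /* the real code hands args/res to the async task, which frees
         * them in its release callback; the sketch just cleans up */
        free(args);
        free(res);
        client_put();
        return 0;
    }

    int main(void) { return setup_sequence_call(); }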
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 0156c01c212c..d87f10327b72 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -36,11 +36,6 @@
36 * as an rpc_task, not a real kernel thread, so it always runs in rpciod's 36 * as an rpc_task, not a real kernel thread, so it always runs in rpciod's
37 * context. There is one renewd per nfs_server. 37 * context. There is one renewd per nfs_server.
38 * 38 *
39 * TODO: If the send queue gets backlogged (e.g., if the server goes down),
40 * we will keep filling the queue with periodic RENEW requests. We need a
41 * mechanism for ensuring that if renewd successfully sends off a request,
42 * then it only wakes up when the request is finished. Maybe use the
43 * child task framework of the RPC layer?
44 */ 39 */
45 40
46#include <linux/mm.h> 41#include <linux/mm.h>
@@ -63,7 +58,7 @@ nfs4_renew_state(struct work_struct *work)
63 struct nfs_client *clp = 58 struct nfs_client *clp =
64 container_of(work, struct nfs_client, cl_renewd.work); 59 container_of(work, struct nfs_client, cl_renewd.work);
65 struct rpc_cred *cred; 60 struct rpc_cred *cred;
66 long lease, timeout; 61 long lease;
67 unsigned long last, now; 62 unsigned long last, now;
68 63
69 ops = nfs4_state_renewal_ops[clp->cl_minorversion]; 64 ops = nfs4_state_renewal_ops[clp->cl_minorversion];
@@ -75,7 +70,6 @@ nfs4_renew_state(struct work_struct *work)
75 lease = clp->cl_lease_time; 70 lease = clp->cl_lease_time;
76 last = clp->cl_last_renewal; 71 last = clp->cl_last_renewal;
77 now = jiffies; 72 now = jiffies;
78 timeout = (2 * lease) / 3 + (long)last - (long)now;
79 /* Are we close to a lease timeout? */ 73 /* Are we close to a lease timeout? */
80 if (time_after(now, last + lease/3)) { 74 if (time_after(now, last + lease/3)) {
81 cred = ops->get_state_renewal_cred_locked(clp); 75 cred = ops->get_state_renewal_cred_locked(clp);
@@ -90,19 +84,15 @@ nfs4_renew_state(struct work_struct *work)
90 /* Queue an asynchronous RENEW. */ 84 /* Queue an asynchronous RENEW. */
91 ops->sched_state_renewal(clp, cred); 85 ops->sched_state_renewal(clp, cred);
92 put_rpccred(cred); 86 put_rpccred(cred);
87 goto out_exp;
93 } 88 }
94 timeout = (2 * lease) / 3; 89 } else {
95 spin_lock(&clp->cl_lock);
96 } else
97 dprintk("%s: failed to call renewd. Reason: lease not expired \n", 90 dprintk("%s: failed to call renewd. Reason: lease not expired \n",
98 __func__); 91 __func__);
99 if (timeout < 5 * HZ) /* safeguard */ 92 spin_unlock(&clp->cl_lock);
100 timeout = 5 * HZ; 93 }
101 dprintk("%s: requeueing work. Lease period = %ld\n", 94 nfs4_schedule_state_renewal(clp);
102 __func__, (timeout + HZ - 1) / HZ); 95out_exp:
103 cancel_delayed_work(&clp->cl_renewd);
104 schedule_delayed_work(&clp->cl_renewd, timeout);
105 spin_unlock(&clp->cl_lock);
106 nfs_expire_unreferenced_delegations(clp); 96 nfs_expire_unreferenced_delegations(clp);
107out: 97out:
108 dprintk("%s: done\n", __func__); 98 dprintk("%s: done\n", __func__);
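[With this rewrite nfs4_renew_state() no longer computes a timeout and requeues itself; it either fires the async RENEW, whose new release callback schedules the next run, or calls nfs4_schedule_state_renewal() directly. Re-arming only on completion is what retires the deleted TODO about flooding a backlogged transport. The shape, as a sketch with hypothetical helpers:]

    #include <stdio.h>

    static long lease = 90;                    /* seconds; illustrative */

    static void renew_worker(void);

    /* Stub one-shot timer standing in for schedule_delayed_work(). */
    static void schedule_after(void (*fn)(void), long secs)
    {
        (void)fn;
        printf("next renewal in %lds\n", secs);
    }

    /* Completion callback: the only place the next run is armed, so a
     * slow or dead server cannot pile up RENEW requests. */
    static void renew_done(void)
    {
        schedule_after(renew_worker, lease / 3);
    }

    static void renew_worker(void)
    {
        /* fire the async RENEW; renew_done() runs when it completes */
        renew_done();
    }

    int main(void) { renew_worker(); return 0; }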
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index c1e2733f4fa4..6c5ed51f105e 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1249,26 +1249,65 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
1249} 1249}
1250 1250
1251#ifdef CONFIG_NFS_V4_1 1251#ifdef CONFIG_NFS_V4_1
1252void nfs41_handle_recall_slot(struct nfs_client *clp)
1253{
1254 set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
1255 nfs4_schedule_state_recovery(clp);
1256}
1257
1258static void nfs4_reset_all_state(struct nfs_client *clp)
1259{
1260 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
1261 clp->cl_boot_time = CURRENT_TIME;
1262 nfs4_state_start_reclaim_nograce(clp);
1263 nfs4_schedule_state_recovery(clp);
1264 }
1265}
1266
1267static void nfs41_handle_server_reboot(struct nfs_client *clp)
1268{
1269 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
1270 nfs4_state_start_reclaim_reboot(clp);
1271 nfs4_schedule_state_recovery(clp);
1272 }
1273}
1274
1275static void nfs41_handle_state_revoked(struct nfs_client *clp)
1276{
1277 /* Temporary */
1278 nfs4_reset_all_state(clp);
1279}
1280
1281static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp)
1282{
1283 /* This will need to handle layouts too */
1284 nfs_expire_all_delegations(clp);
1285}
1286
1287static void nfs41_handle_cb_path_down(struct nfs_client *clp)
1288{
1289 nfs_expire_all_delegations(clp);
1290 if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0)
1291 nfs4_schedule_state_recovery(clp);
1292}
1293
1252void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) 1294void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
1253{ 1295{
1254 if (!flags) 1296 if (!flags)
1255 return; 1297 return;
1256 else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) { 1298 else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)
1257 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 1299 nfs41_handle_server_reboot(clp);
1258 nfs4_state_start_reclaim_reboot(clp); 1300 else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
1259 nfs4_schedule_state_recovery(clp);
1260 } else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
1261 SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | 1301 SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED |
1262 SEQ4_STATUS_ADMIN_STATE_REVOKED | 1302 SEQ4_STATUS_ADMIN_STATE_REVOKED |
1263 SEQ4_STATUS_RECALLABLE_STATE_REVOKED | 1303 SEQ4_STATUS_LEASE_MOVED))
1264 SEQ4_STATUS_LEASE_MOVED)) { 1304 nfs41_handle_state_revoked(clp);
1265 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 1305 else if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
1266 nfs4_state_start_reclaim_nograce(clp); 1306 nfs41_handle_recallable_state_revoked(clp);
1267 nfs4_schedule_state_recovery(clp); 1307 else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
1268 } else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
1269 SEQ4_STATUS_BACKCHANNEL_FAULT | 1308 SEQ4_STATUS_BACKCHANNEL_FAULT |
1270 SEQ4_STATUS_CB_PATH_DOWN_SESSION)) 1309 SEQ4_STATUS_CB_PATH_DOWN_SESSION))
1271 nfs_expire_all_delegations(clp); 1310 nfs41_handle_cb_path_down(clp);
1272} 1311}
1273 1312
1274static int nfs4_reset_session(struct nfs_client *clp) 1313static int nfs4_reset_session(struct nfs_client *clp)
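[The SEQ4_STATUS dispatch above is refactored from open-coded set_bit/schedule sequences into one named helper per recovery action, checked in order of severity. A compact sketch of that first-match dispatch, with illustrative flag values and actions:]

    #include <stdio.h>

    #define FLAG_REBOOT  0x01u   /* stand-ins for the SEQ4_STATUS_* bits */
    #define FLAG_REVOKED 0x06u
    #define FLAG_CB_DOWN 0x38u

    static void handle_server_reboot(void) { puts("reclaim after reboot"); }
    static void handle_state_revoked(void) { puts("reclaim state, no grace"); }
    static void handle_cb_path_down(void)  { puts("return delegations"); }

    /* Most serious condition wins; later bits are ignored for this reply. */
    static void handle_sequence_flags(unsigned flags)
    {
        if (!flags)
            return;
        if (flags & FLAG_REBOOT)
            handle_server_reboot();
        else if (flags & FLAG_REVOKED)
            handle_state_revoked();
        else if (flags & FLAG_CB_DOWN)
            handle_cb_path_down();
    }

    int main(void) { handle_sequence_flags(FLAG_REVOKED); return 0; }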
@@ -1285,23 +1324,52 @@ static int nfs4_reset_session(struct nfs_client *clp)
1285 1324
1286 memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN); 1325 memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN);
1287 status = nfs4_proc_create_session(clp); 1326 status = nfs4_proc_create_session(clp);
1288 if (status) 1327 if (status) {
1289 status = nfs4_recovery_handle_error(clp, status); 1328 status = nfs4_recovery_handle_error(clp, status);
1329 goto out;
1330 }
1331 /* create_session negotiated new slot table */
1332 clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
1290 1333
1291out: 1334 /* Let the state manager reestablish state */
1292 /* 1335 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
1293 * Let the state manager reestablish state
1294 */
1295 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
1296 status == 0)
1297 nfs41_setup_state_renewal(clp); 1336 nfs41_setup_state_renewal(clp);
1298 1337out:
1299 return status; 1338 return status;
1300} 1339}
1301 1340
1341static int nfs4_recall_slot(struct nfs_client *clp)
1342{
1343 struct nfs4_slot_table *fc_tbl = &clp->cl_session->fc_slot_table;
1344 struct nfs4_channel_attrs *fc_attrs = &clp->cl_session->fc_attrs;
1345 struct nfs4_slot *new, *old;
1346 int i;
1347
1348 nfs4_begin_drain_session(clp);
1349 new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot),
1350 GFP_KERNEL);
1351 if (!new)
1352 return -ENOMEM;
1353
1354 spin_lock(&fc_tbl->slot_tbl_lock);
1355 for (i = 0; i < fc_tbl->target_max_slots; i++)
1356 new[i].seq_nr = fc_tbl->slots[i].seq_nr;
1357 old = fc_tbl->slots;
1358 fc_tbl->slots = new;
1359 fc_tbl->max_slots = fc_tbl->target_max_slots;
1360 fc_tbl->target_max_slots = 0;
1361 fc_attrs->max_reqs = fc_tbl->max_slots;
1362 spin_unlock(&fc_tbl->slot_tbl_lock);
1363
1364 kfree(old);
1365 nfs4_end_drain_session(clp);
1366 return 0;
1367}
1368
1302#else /* CONFIG_NFS_V4_1 */ 1369#else /* CONFIG_NFS_V4_1 */
1303static int nfs4_reset_session(struct nfs_client *clp) { return 0; } 1370static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
1304static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; } 1371static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; }
1372static int nfs4_recall_slot(struct nfs_client *clp) { return 0; }
1305#endif /* CONFIG_NFS_V4_1 */ 1373#endif /* CONFIG_NFS_V4_1 */
1306 1374
1307/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors 1375/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors
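[nfs4_recall_slot() above shrinks the fore-channel slot table when the server recalls slots: drain the session, allocate the smaller array outside the lock, copy the surviving sequence numbers and swap pointers under it, and free the old array only after unlocking. A sketch of the copy-and-swap portion (the drain steps are elided; names illustrative):]

    #include <pthread.h>
    #include <stdlib.h>

    struct slot { unsigned seq_nr; };

    struct table {
        pthread_mutex_t lock;
        struct slot *slots;
        unsigned max_slots, target_max_slots;
    };

    static int recall_slots(struct table *tbl)
    {
        struct slot *new, *old;
        unsigned i;

        new = malloc(tbl->target_max_slots * sizeof(*new));
        if (!new)
            return -1;                         /* cf. -ENOMEM */

        pthread_mutex_lock(&tbl->lock);
        for (i = 0; i < tbl->target_max_slots; i++)
            new[i].seq_nr = tbl->slots[i].seq_nr;  /* keep live seqids */
        old = tbl->slots;
        tbl->slots = new;
        tbl->max_slots = tbl->target_max_slots;
        tbl->target_max_slots = 0;
        pthread_mutex_unlock(&tbl->lock);

        free(old);                             /* never free under the lock */
        return 0;
    }

    int main(void)
    {
        struct table tbl = { PTHREAD_MUTEX_INITIALIZER, NULL, 16, 8 };
        tbl.slots = calloc(16, sizeof(*tbl.slots));
        if (!tbl.slots || recall_slots(&tbl))
            return 1;
        free(tbl.slots);
        return 0;
    }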
@@ -1314,6 +1382,7 @@ static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
1314 case -NFS4ERR_DELAY: 1382 case -NFS4ERR_DELAY:
1315 case -NFS4ERR_CLID_INUSE: 1383 case -NFS4ERR_CLID_INUSE:
1316 case -EAGAIN: 1384 case -EAGAIN:
1385 case -EKEYEXPIRED:
1317 break; 1386 break;
1318 1387
1319 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery 1388 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
@@ -1397,6 +1466,15 @@ static void nfs4_state_manager(struct nfs_client *clp)
1397 nfs_client_return_marked_delegations(clp); 1466 nfs_client_return_marked_delegations(clp);
1398 continue; 1467 continue;
1399 } 1468 }
1469 /* Recall session slots */
1470 if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state)
1471 && nfs4_has_session(clp)) {
1472 status = nfs4_recall_slot(clp);
1473 if (status < 0)
1474 goto out_error;
1475 continue;
1476 }
1477
1400 1478
1401 nfs4_clear_state_manager_bit(clp); 1479 nfs4_clear_state_manager_bit(clp);
1402 /* Did we race with an attempt to give us more work? */ 1480 /* Did we race with an attempt to give us more work? */
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 5cd5184b56db..dd17713413a5 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1578,6 +1578,14 @@ static void encode_create_session(struct xdr_stream *xdr,
1578 char machine_name[NFS4_MAX_MACHINE_NAME_LEN]; 1578 char machine_name[NFS4_MAX_MACHINE_NAME_LEN];
1579 uint32_t len; 1579 uint32_t len;
1580 struct nfs_client *clp = args->client; 1580 struct nfs_client *clp = args->client;
1581 u32 max_resp_sz_cached;
1582
1583 /*
1584 * Assumes OPEN is the biggest non-idempotent compound.
1585 * 2 is the verifier.
1586 */
1587 max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE +
1588 RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT;
1581 1589
1582 len = scnprintf(machine_name, sizeof(machine_name), "%s", 1590 len = scnprintf(machine_name, sizeof(machine_name), "%s",
1583 clp->cl_ipaddr); 1591 clp->cl_ipaddr);
@@ -1592,7 +1600,7 @@ static void encode_create_session(struct xdr_stream *xdr,
1592 *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ 1600 *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */
1593 *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */ 1601 *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */
1594 *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */ 1602 *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */
1595 *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ 1603 *p++ = cpu_to_be32(max_resp_sz_cached); /* Max resp sz cached */
1596 *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */ 1604 *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */
1597 *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */ 1605 *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */
1598 *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ 1606 *p++ = cpu_to_be32(0); /* rdmachannel_attrs */
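[Rather than asking the server to cache a full max_resp_sz reply, the client now derives max_resp_sz_cached from the largest reply it would actually replay: a decoded OPEN plus the RPC reply header, the maximum auth/verifier, and two verifier words, all in 4-byte XDR units. A back-of-envelope version with assumed word counts (the real NFS4_dec_open_sz, RPC_REPHDRSIZE, and RPC_MAX_AUTH_SIZE values differ):]

    #include <stdio.h>

    #define XDR_UNIT           4     /* bytes per XDR word */
    #define DEC_OPEN_WORDS     170   /* assumed decoded-OPEN reply size */
    #define RPC_REPHDR_WORDS   6     /* assumed RPC reply header size */
    #define RPC_MAX_AUTH_WORDS 104   /* assumed max verifier/auth size */

    int main(void)
    {
        unsigned cached = (DEC_OPEN_WORDS + RPC_REPHDR_WORDS +
                           RPC_MAX_AUTH_WORDS + 2) * XDR_UNIT;
        printf("max_resp_sz_cached = %u bytes\n", cached);
        return 0;
    }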
@@ -5544,6 +5552,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
5544 if (status != 0) 5552 if (status != 0)
5545 goto out; 5553 goto out;
5546 status = decode_delegreturn(&xdr); 5554 status = decode_delegreturn(&xdr);
5555 if (status != 0)
5556 goto out;
5547 decode_getfattr(&xdr, res->fattr, res->server, 5557 decode_getfattr(&xdr, res->fattr, res->server,
5548 !RPC_IS_ASYNC(rqstp->rq_task)); 5558 !RPC_IS_ASYNC(rqstp->rq_task));
5549out: 5559out:
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index a12c45b65dd4..29d9d36cd5f4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -112,12 +112,10 @@ void nfs_unlock_request(struct nfs_page *req)
112 */ 112 */
113int nfs_set_page_tag_locked(struct nfs_page *req) 113int nfs_set_page_tag_locked(struct nfs_page *req)
114{ 114{
115 struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
116
117 if (!nfs_lock_request_dontget(req)) 115 if (!nfs_lock_request_dontget(req))
118 return 0; 116 return 0;
119 if (req->wb_page != NULL) 117 if (req->wb_page != NULL)
120 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); 118 radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
121 return 1; 119 return 1;
122} 120}
123 121
@@ -126,10 +124,10 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
126 */ 124 */
127void nfs_clear_page_tag_locked(struct nfs_page *req) 125void nfs_clear_page_tag_locked(struct nfs_page *req)
128{ 126{
129 struct inode *inode = req->wb_context->path.dentry->d_inode;
130 struct nfs_inode *nfsi = NFS_I(inode);
131
132 if (req->wb_page != NULL) { 127 if (req->wb_page != NULL) {
128 struct inode *inode = req->wb_context->path.dentry->d_inode;
129 struct nfs_inode *nfsi = NFS_I(inode);
130
133 spin_lock(&inode->i_lock); 131 spin_lock(&inode->i_lock);
134 radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); 132 radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
135 nfs_unlock_request(req); 133 nfs_unlock_request(req);
@@ -142,16 +140,22 @@ void nfs_clear_page_tag_locked(struct nfs_page *req)
142 * nfs_clear_request - Free up all resources allocated to the request 140 * nfs_clear_request - Free up all resources allocated to the request
143 * @req: 141 * @req:
144 * 142 *
145 * Release page resources associated with a write request after it 143 * Release page and open context resources associated with a read/write
146 * has completed. 144 * request after it has completed.
147 */ 145 */
148void nfs_clear_request(struct nfs_page *req) 146void nfs_clear_request(struct nfs_page *req)
149{ 147{
150 struct page *page = req->wb_page; 148 struct page *page = req->wb_page;
149 struct nfs_open_context *ctx = req->wb_context;
150
151 if (page != NULL) { 151 if (page != NULL) {
152 page_cache_release(page); 152 page_cache_release(page);
153 req->wb_page = NULL; 153 req->wb_page = NULL;
154 } 154 }
155 if (ctx != NULL) {
156 put_nfs_open_context(ctx);
157 req->wb_context = NULL;
158 }
155} 159}
156 160
157 161
@@ -165,9 +169,8 @@ static void nfs_free_request(struct kref *kref)
165{ 169{
166 struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); 170 struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
167 171
168 /* Release struct file or cached credential */ 172 /* Release struct file and open context */
169 nfs_clear_request(req); 173 nfs_clear_request(req);
170 put_nfs_open_context(req->wb_context);
171 nfs_page_free(req); 174 nfs_page_free(req);
172} 175}
173 176
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ef583854d8d0..c752d944fe9e 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -47,6 +47,39 @@
47#define NFSDBG_FACILITY NFSDBG_PROC 47#define NFSDBG_FACILITY NFSDBG_PROC
48 48
49/* 49/*
50 * wrapper to handle the -EKEYEXPIRED error message. This should generally
51 * only happen if using krb5 auth and a user's TGT expires. NFSv2 doesn't
52 * support the NFSERR_JUKEBOX error code, but we handle this situation in the
53 * same way that we handle that error with NFSv3.
54 */
55static int
56nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
57{
58 int res;
59 do {
60 res = rpc_call_sync(clnt, msg, flags);
61 if (res != -EKEYEXPIRED)
62 break;
63 schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
64 res = -ERESTARTSYS;
65 } while (!fatal_signal_pending(current));
66 return res;
67}
68
69#define rpc_call_sync(clnt, msg, flags) nfs_rpc_wrapper(clnt, msg, flags)
70
71static int
72nfs_async_handle_expired_key(struct rpc_task *task)
73{
74 if (task->tk_status != -EKEYEXPIRED)
75 return 0;
76 task->tk_status = 0;
77 rpc_restart_call(task);
78 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
79 return 1;
80}
81
82/*
50 * Bare-bones access to getattr: this is for nfs_read_super. 83 * Bare-bones access to getattr: this is for nfs_read_super.
51 */ 84 */
52static int 85static int
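[The hunk above interposes on every sync call in the file with a single #define: nfs_rpc_wrapper() adds the -EKEYEXPIRED retry policy, and the macro that follows rewrites all later textual uses of rpc_call_sync() to go through the wrapper, with no edits at the call sites. The trick only works because the macro is defined after the wrapper's own body. A standalone illustration with made-up names:]

    #include <stdio.h>

    /* The real call we want to interpose on (cf. rpc_call_sync()). */
    static int transport_call(const char *proc)
    {
        printf("calling %s\n", proc);
        return 0;
    }

    /* Wrapper adding a retry policy around the real call. */
    static int transport_call_wrapper(const char *proc)
    {
        int res;

        do {
            res = transport_call(proc);    /* still the real function here */
            /* ...retry/backoff on transient errors would go here... */
        } while (0);
        return res;
    }

    /* From here down, every textual transport_call() resolves to the
     * wrapper; defining it earlier would make the wrapper call itself. */
    #define transport_call(proc) transport_call_wrapper(proc)

    int main(void)
    {
        return transport_call("GETATTR");  /* routed through the wrapper */
    }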
@@ -307,6 +340,8 @@ nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
307 340
308static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) 341static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
309{ 342{
343 if (nfs_async_handle_expired_key(task))
344 return 0;
310 nfs_mark_for_revalidate(dir); 345 nfs_mark_for_revalidate(dir);
311 return 1; 346 return 1;
312} 347}
@@ -560,6 +595,9 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
560 595
561static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) 596static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
562{ 597{
598 if (nfs_async_handle_expired_key(task))
599 return -EAGAIN;
600
563 nfs_invalidate_atime(data->inode); 601 nfs_invalidate_atime(data->inode);
564 if (task->tk_status >= 0) { 602 if (task->tk_status >= 0) {
565 nfs_refresh_inode(data->inode, data->res.fattr); 603 nfs_refresh_inode(data->inode, data->res.fattr);
@@ -579,6 +617,9 @@ static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *
579 617
580static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) 618static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
581{ 619{
620 if (nfs_async_handle_expired_key(task))
621 return -EAGAIN;
622
582 if (task->tk_status >= 0) 623 if (task->tk_status >= 0)
583 nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr); 624 nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);
584 return 0; 625 return 0;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f1afee4eea77..6baf9a393466 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2214,7 +2214,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2214 } else { 2214 } else {
2215 error = nfs_bdi_register(server); 2215 error = nfs_bdi_register(server);
2216 if (error) 2216 if (error)
2217 goto error_splat_super; 2217 goto error_splat_bdi;
2218 } 2218 }
2219 2219
2220 if (!s->s_root) { 2220 if (!s->s_root) {
@@ -2256,6 +2256,9 @@ out_err_nosb:
2256error_splat_root: 2256error_splat_root:
2257 dput(mntroot); 2257 dput(mntroot);
2258error_splat_super: 2258error_splat_super:
2259 if (server && !s->s_root)
2260 bdi_unregister(&server->backing_dev_info);
2261error_splat_bdi:
2259 deactivate_locked_super(s); 2262 deactivate_locked_super(s);
2260 goto out; 2263 goto out;
2261} 2264}
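[This hunk, and the matching ones in the other get_sb variants below, split the error path: a failure after nfs_bdi_register() must unregister the bdi before the superblock is torn down, while a failure inside nfs_bdi_register() must skip that step. The classic goto-ladder shape, sketched with stubbed helpers (fill_super() is forced to fail so the unwind runs):]

    #include <stdlib.h>

    static int register_bdi(void)    { return 0; }
    static void unregister_bdi(void) { }
    static int fill_super(void)      { return -1; } /* force the error path */
    static void kill_super(void)     { }

    /* Each label undoes exactly the steps that succeeded before the
     * failure, which is why the patch adds error_splat_bdi as a separate
     * landing point below error_splat_super. */
    static int get_sb(void)
    {
        int err;

        err = register_bdi();
        if (err)
            goto out_super;     /* bdi never registered: skip unregister */
        err = fill_super();
        if (err)
            goto out_bdi;
        return 0;
    out_bdi:
        unregister_bdi();
    out_super:
        kill_super();
        return err;
    }

    int main(void) { return get_sb() ? EXIT_FAILURE : EXIT_SUCCESS; }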
@@ -2326,7 +2329,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
2326 } else { 2329 } else {
2327 error = nfs_bdi_register(server); 2330 error = nfs_bdi_register(server);
2328 if (error) 2331 if (error)
2329 goto error_splat_super; 2332 goto error_splat_bdi;
2330 } 2333 }
2331 2334
2332 if (!s->s_root) { 2335 if (!s->s_root) {
@@ -2363,6 +2366,9 @@ out_err_noserver:
2363 return error; 2366 return error;
2364 2367
2365error_splat_super: 2368error_splat_super:
2369 if (server && !s->s_root)
2370 bdi_unregister(&server->backing_dev_info);
2371error_splat_bdi:
2366 deactivate_locked_super(s); 2372 deactivate_locked_super(s);
2367 dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error); 2373 dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
2368 return error; 2374 return error;
@@ -2578,7 +2584,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
2578 } else { 2584 } else {
2579 error = nfs_bdi_register(server); 2585 error = nfs_bdi_register(server);
2580 if (error) 2586 if (error)
2581 goto error_splat_super; 2587 goto error_splat_bdi;
2582 } 2588 }
2583 2589
2584 if (!s->s_root) { 2590 if (!s->s_root) {
@@ -2616,6 +2622,9 @@ out_free:
2616error_splat_root: 2622error_splat_root:
2617 dput(mntroot); 2623 dput(mntroot);
2618error_splat_super: 2624error_splat_super:
2625 if (server && !s->s_root)
2626 bdi_unregister(&server->backing_dev_info);
2627error_splat_bdi:
2619 deactivate_locked_super(s); 2628 deactivate_locked_super(s);
2620 goto out; 2629 goto out;
2621} 2630}
@@ -2811,7 +2820,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
2811 } else { 2820 } else {
2812 error = nfs_bdi_register(server); 2821 error = nfs_bdi_register(server);
2813 if (error) 2822 if (error)
2814 goto error_splat_super; 2823 goto error_splat_bdi;
2815 } 2824 }
2816 2825
2817 if (!s->s_root) { 2826 if (!s->s_root) {
@@ -2847,6 +2856,9 @@ out_err_noserver:
2847 return error; 2856 return error;
2848 2857
2849error_splat_super: 2858error_splat_super:
2859 if (server && !s->s_root)
2860 bdi_unregister(&server->backing_dev_info);
2861error_splat_bdi:
2850 deactivate_locked_super(s); 2862 deactivate_locked_super(s);
2851 dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error); 2863 dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
2852 return error; 2864 return error;
@@ -2893,7 +2905,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
2893 } else { 2905 } else {
2894 error = nfs_bdi_register(server); 2906 error = nfs_bdi_register(server);
2895 if (error) 2907 if (error)
2896 goto error_splat_super; 2908 goto error_splat_bdi;
2897 } 2909 }
2898 2910
2899 if (!s->s_root) { 2911 if (!s->s_root) {
@@ -2929,6 +2941,9 @@ out_err_noserver:
2929 return error; 2941 return error;
2930 2942
2931error_splat_super: 2943error_splat_super:
2944 if (server && !s->s_root)
2945 bdi_unregister(&server->backing_dev_info);
2946error_splat_bdi:
2932 deactivate_locked_super(s); 2947 deactivate_locked_super(s);
2933 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); 2948 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
2934 return error; 2949 return error;
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 412738dbfbc7..2ea9e5c27e55 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -50,7 +50,7 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
50 struct page *page; 50 struct page *page;
51 void *err; 51 void *err;
52 52
53 err = ERR_PTR(nfs_revalidate_mapping_nolock(inode, inode->i_mapping)); 53 err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
54 if (err) 54 if (err)
55 goto read_failed; 55 goto read_failed;
56 page = read_cache_page(&inode->i_data, 0, 56 page = read_cache_page(&inode->i_data, 0,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d63d964a0392..53ff70e23993 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -438,6 +438,7 @@ nfs_mark_request_commit(struct nfs_page *req)
438 radix_tree_tag_set(&nfsi->nfs_page_tree, 438 radix_tree_tag_set(&nfsi->nfs_page_tree,
439 req->wb_index, 439 req->wb_index,
440 NFS_PAGE_TAG_COMMIT); 440 NFS_PAGE_TAG_COMMIT);
441 nfsi->ncommit++;
441 spin_unlock(&inode->i_lock); 442 spin_unlock(&inode->i_lock);
442 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 443 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
443 inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); 444 inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
@@ -501,57 +502,6 @@ int nfs_reschedule_unstable_write(struct nfs_page *req)
501} 502}
502#endif 503#endif
503 504
504/*
505 * Wait for a request to complete.
506 *
507 * Interruptible by fatal signals only.
508 */
509static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, unsigned int npages)
510{
511 struct nfs_inode *nfsi = NFS_I(inode);
512 struct nfs_page *req;
513 pgoff_t idx_end, next;
514 unsigned int res = 0;
515 int error;
516
517 if (npages == 0)
518 idx_end = ~0;
519 else
520 idx_end = idx_start + npages - 1;
521
522 next = idx_start;
523 while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_LOCKED)) {
524 if (req->wb_index > idx_end)
525 break;
526
527 next = req->wb_index + 1;
528 BUG_ON(!NFS_WBACK_BUSY(req));
529
530 kref_get(&req->wb_kref);
531 spin_unlock(&inode->i_lock);
532 error = nfs_wait_on_request(req);
533 nfs_release_request(req);
534 spin_lock(&inode->i_lock);
535 if (error < 0)
536 return error;
537 res++;
538 }
539 return res;
540}
541
542static void nfs_cancel_commit_list(struct list_head *head)
543{
544 struct nfs_page *req;
545
546 while(!list_empty(head)) {
547 req = nfs_list_entry(head->next);
548 nfs_list_remove_request(req);
549 nfs_clear_request_commit(req);
550 nfs_inode_remove_request(req);
551 nfs_unlock_request(req);
552 }
553}
554
555#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 505#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
556static int 506static int
557nfs_need_commit(struct nfs_inode *nfsi) 507nfs_need_commit(struct nfs_inode *nfsi)
@@ -573,11 +523,17 @@ static int
573nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) 523nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
574{ 524{
575 struct nfs_inode *nfsi = NFS_I(inode); 525 struct nfs_inode *nfsi = NFS_I(inode);
526 int ret;
576 527
577 if (!nfs_need_commit(nfsi)) 528 if (!nfs_need_commit(nfsi))
578 return 0; 529 return 0;
579 530
580 return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); 531 ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
532 if (ret > 0)
533 nfsi->ncommit -= ret;
534 if (nfs_need_commit(NFS_I(inode)))
535 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
536 return ret;
581} 537}
582#else 538#else
583static inline int nfs_need_commit(struct nfs_inode *nfsi) 539static inline int nfs_need_commit(struct nfs_inode *nfsi)
@@ -642,9 +598,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
642 spin_lock(&inode->i_lock); 598 spin_lock(&inode->i_lock);
643 } 599 }
644 600
645 if (nfs_clear_request_commit(req)) 601 if (nfs_clear_request_commit(req) &&
646 radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, 602 radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
647 req->wb_index, NFS_PAGE_TAG_COMMIT); 603 req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL)
604 NFS_I(inode)->ncommit--;
648 605
649 /* Okay, the request matches. Update the region */ 606 /* Okay, the request matches. Update the region */
650 if (offset < req->wb_offset) { 607 if (offset < req->wb_offset) {
@@ -1391,7 +1348,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
1391 .rpc_release = nfs_commit_release, 1348 .rpc_release = nfs_commit_release,
1392}; 1349};
1393 1350
1394int nfs_commit_inode(struct inode *inode, int how) 1351static int nfs_commit_inode(struct inode *inode, int how)
1395{ 1352{
1396 LIST_HEAD(head); 1353 LIST_HEAD(head);
1397 int res; 1354 int res;
@@ -1406,92 +1363,51 @@ int nfs_commit_inode(struct inode *inode, int how)
1406 } 1363 }
1407 return res; 1364 return res;
1408} 1365}
1409#else
1410static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1411{
1412 return 0;
1413}
1414#endif
1415 1366
1416long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) 1367static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1417{ 1368{
1418 struct inode *inode = mapping->host; 1369 struct nfs_inode *nfsi = NFS_I(inode);
1419 pgoff_t idx_start, idx_end; 1370 int flags = FLUSH_SYNC;
1420 unsigned int npages = 0; 1371 int ret = 0;
1421 LIST_HEAD(head); 1372
1422 int nocommit = how & FLUSH_NOCOMMIT; 1373 /* Don't commit yet if this is a non-blocking flush and there are
1423 long pages, ret; 1374 * lots of outstanding writes for this mapping.
1424 1375 */
1425 /* FIXME */ 1376 if (wbc->sync_mode == WB_SYNC_NONE &&
1426 if (wbc->range_cyclic) 1377 nfsi->ncommit <= (nfsi->npages >> 1))
1427 idx_start = 0; 1378 goto out_mark_dirty;
1428 else { 1379
1429 idx_start = wbc->range_start >> PAGE_CACHE_SHIFT; 1380 if (wbc->nonblocking || wbc->for_background)
1430 idx_end = wbc->range_end >> PAGE_CACHE_SHIFT; 1381 flags = 0;
1431 if (idx_end > idx_start) { 1382 ret = nfs_commit_inode(inode, flags);
1432 pgoff_t l_npages = 1 + idx_end - idx_start; 1383 if (ret >= 0) {
1433 npages = l_npages; 1384 if (wbc->sync_mode == WB_SYNC_NONE) {
1434 if (sizeof(npages) != sizeof(l_npages) && 1385 if (ret < wbc->nr_to_write)
1435 (pgoff_t)npages != l_npages) 1386 wbc->nr_to_write -= ret;
1436 npages = 0; 1387 else
1388 wbc->nr_to_write = 0;
1437 } 1389 }
1390 return 0;
1438 } 1391 }
1439 how &= ~FLUSH_NOCOMMIT; 1392out_mark_dirty:
1440 spin_lock(&inode->i_lock); 1393 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1441 do {
1442 ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
1443 if (ret != 0)
1444 continue;
1445 if (nocommit)
1446 break;
1447 pages = nfs_scan_commit(inode, &head, idx_start, npages);
1448 if (pages == 0)
1449 break;
1450 if (how & FLUSH_INVALIDATE) {
1451 spin_unlock(&inode->i_lock);
1452 nfs_cancel_commit_list(&head);
1453 ret = pages;
1454 spin_lock(&inode->i_lock);
1455 continue;
1456 }
1457 pages += nfs_scan_commit(inode, &head, 0, 0);
1458 spin_unlock(&inode->i_lock);
1459 ret = nfs_commit_list(inode, &head, how);
1460 spin_lock(&inode->i_lock);
1461
1462 } while (ret >= 0);
1463 spin_unlock(&inode->i_lock);
1464 return ret; 1394 return ret;
1465} 1395}
1466 1396#else
1467static int __nfs_write_mapping(struct address_space *mapping, struct writeback_control *wbc, int how) 1397static int nfs_commit_inode(struct inode *inode, int how)
1468{ 1398{
1469 int ret;
1470
1471 ret = nfs_writepages(mapping, wbc);
1472 if (ret < 0)
1473 goto out;
1474 ret = nfs_sync_mapping_wait(mapping, wbc, how);
1475 if (ret < 0)
1476 goto out;
1477 return 0; 1399 return 0;
1478out:
1479 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1480 return ret;
1481} 1400}
1482 1401
1483/* Two pass sync: first using WB_SYNC_NONE, then WB_SYNC_ALL */ 1402static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1484static int nfs_write_mapping(struct address_space *mapping, int how)
1485{ 1403{
1486 struct writeback_control wbc = { 1404 return 0;
1487 .bdi = mapping->backing_dev_info, 1405}
1488 .sync_mode = WB_SYNC_ALL, 1406#endif
1489 .nr_to_write = LONG_MAX,
1490 .range_start = 0,
1491 .range_end = LLONG_MAX,
1492 };
1493 1407
1494 return __nfs_write_mapping(mapping, &wbc, how); 1408int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1409{
1410 return nfs_commit_unstable_pages(inode, wbc);
1495} 1411}
1496 1412
1497/* 1413/*
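[nfs_commit_unstable_pages() above carries the policy that the new per-inode ncommit counter (maintained in the earlier write.c hunks) makes cheap: for a non-blocking flush, skip the COMMIT until at least half of the inode's cached pages are waiting for one, and credit committed pages against wbc->nr_to_write. In isolation the decision might look like this; the field names and one-half threshold mirror the hunk, the rest is illustrative:]

    struct inode_counts {
        long ncommit;   /* pages awaiting COMMIT */
        long npages;    /* pages cached for the inode */
    };

    static int should_commit_now(const struct inode_counts *c, int blocking)
    {
        if (blocking)
            return 1;                           /* WB_SYNC_ALL: always */
        return c->ncommit > (c->npages >> 1);   /* enough to be worth it */
    }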
@@ -1499,37 +1415,26 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
1499 */ 1415 */
1500int nfs_wb_all(struct inode *inode) 1416int nfs_wb_all(struct inode *inode)
1501{ 1417{
1502 return nfs_write_mapping(inode->i_mapping, 0); 1418 struct writeback_control wbc = {
1503} 1419 .sync_mode = WB_SYNC_ALL,
1420 .nr_to_write = LONG_MAX,
1421 .range_start = 0,
1422 .range_end = LLONG_MAX,
1423 };
1504 1424
1505int nfs_wb_nocommit(struct inode *inode) 1425 return sync_inode(inode, &wbc);
1506{
1507 return nfs_write_mapping(inode->i_mapping, FLUSH_NOCOMMIT);
1508} 1426}
1509 1427
1510int nfs_wb_page_cancel(struct inode *inode, struct page *page) 1428int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1511{ 1429{
1512 struct nfs_page *req; 1430 struct nfs_page *req;
1513 loff_t range_start = page_offset(page);
1514 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
1515 struct writeback_control wbc = {
1516 .bdi = page->mapping->backing_dev_info,
1517 .sync_mode = WB_SYNC_ALL,
1518 .nr_to_write = LONG_MAX,
1519 .range_start = range_start,
1520 .range_end = range_end,
1521 };
1522 int ret = 0; 1431 int ret = 0;
1523 1432
1524 BUG_ON(!PageLocked(page)); 1433 BUG_ON(!PageLocked(page));
1525 for (;;) { 1434 for (;;) {
1526 req = nfs_page_find_request(page); 1435 req = nfs_page_find_request(page);
1527 if (req == NULL) 1436 if (req == NULL)
1528 goto out;
1529 if (test_bit(PG_CLEAN, &req->wb_flags)) {
1530 nfs_release_request(req);
1531 break; 1437 break;
1532 }
1533 if (nfs_lock_request_dontget(req)) { 1438 if (nfs_lock_request_dontget(req)) {
1534 nfs_inode_remove_request(req); 1439 nfs_inode_remove_request(req);
1535 /* 1440 /*
@@ -1543,54 +1448,54 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1543 ret = nfs_wait_on_request(req); 1448 ret = nfs_wait_on_request(req);
1544 nfs_release_request(req); 1449 nfs_release_request(req);
1545 if (ret < 0) 1450 if (ret < 0)
1546 goto out; 1451 break;
1547 } 1452 }
1548 if (!PagePrivate(page))
1549 return 0;
1550 ret = nfs_sync_mapping_wait(page->mapping, &wbc, FLUSH_INVALIDATE);
1551out:
1552 return ret; 1453 return ret;
1553} 1454}
1554 1455
1555static int nfs_wb_page_priority(struct inode *inode, struct page *page, 1456/*
1556 int how) 1457 * Write back all requests on one page - we do this before reading it.
1458 */
1459int nfs_wb_page(struct inode *inode, struct page *page)
1557{ 1460{
1558 loff_t range_start = page_offset(page); 1461 loff_t range_start = page_offset(page);
1559 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); 1462 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
1560 struct writeback_control wbc = { 1463 struct writeback_control wbc = {
1561 .bdi = page->mapping->backing_dev_info,
1562 .sync_mode = WB_SYNC_ALL, 1464 .sync_mode = WB_SYNC_ALL,
1563 .nr_to_write = LONG_MAX, 1465 .nr_to_write = 0,
1564 .range_start = range_start, 1466 .range_start = range_start,
1565 .range_end = range_end, 1467 .range_end = range_end,
1566 }; 1468 };
1469 struct nfs_page *req;
1470 int need_commit;
1567 int ret; 1471 int ret;
1568 1472
1569 do { 1473 while(PagePrivate(page)) {
1570 if (clear_page_dirty_for_io(page)) { 1474 if (clear_page_dirty_for_io(page)) {
1571 ret = nfs_writepage_locked(page, &wbc); 1475 ret = nfs_writepage_locked(page, &wbc);
1572 if (ret < 0) 1476 if (ret < 0)
1573 goto out_error; 1477 goto out_error;
1574 } else if (!PagePrivate(page)) 1478 }
1479 req = nfs_find_and_lock_request(page);
1480 if (!req)
1575 break; 1481 break;
1576 ret = nfs_sync_mapping_wait(page->mapping, &wbc, how); 1482 if (IS_ERR(req)) {
1577 if (ret < 0) 1483 ret = PTR_ERR(req);
1578 goto out_error; 1484 goto out_error;
1579 } while (PagePrivate(page)); 1485 }
1486 need_commit = test_bit(PG_CLEAN, &req->wb_flags);
1487 nfs_clear_page_tag_locked(req);
1488 if (need_commit) {
1489 ret = nfs_commit_inode(inode, FLUSH_SYNC);
1490 if (ret < 0)
1491 goto out_error;
1492 }
1493 }
1580 return 0; 1494 return 0;
1581out_error: 1495out_error:
1582 __mark_inode_dirty(inode, I_DIRTY_PAGES);
1583 return ret; 1496 return ret;
1584} 1497}
1585 1498
1586/*
1587 * Write back all requests on one page - we do this before reading it.
1588 */
1589int nfs_wb_page(struct inode *inode, struct page* page)
1590{
1591 return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
1592}
1593
1594#ifdef CONFIG_MIGRATION 1499#ifdef CONFIG_MIGRATION
1595int nfs_migrate_page(struct address_space *mapping, struct page *newpage, 1500int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1596 struct page *page) 1501 struct page *page)
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index d3854d94b7cf..bf9cbd242ddd 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -36,10 +36,9 @@ static struct file *do_open(char *name, int flags)
36 return ERR_PTR(error); 36 return ERR_PTR(error);
37 37
38 if (flags == O_RDWR) 38 if (flags == O_RDWR)
39 error = may_open(&nd.path, MAY_READ|MAY_WRITE, 39 error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags);
40 FMODE_READ|FMODE_WRITE);
41 else 40 else
42 error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE); 41 error = may_open(&nd.path, MAY_WRITE, flags);
43 42
44 if (!error) 43 if (!error)
45 return dentry_open(nd.path.dentry, nd.path.mnt, flags, 44 return dentry_open(nd.path.dentry, nd.path.mnt, flags,
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c6eed2a3b093..4bc22c763de7 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -525,6 +525,8 @@ static struct rpc_cred *callback_cred;
525 525
526int set_callback_cred(void) 526int set_callback_cred(void)
527{ 527{
528 if (callback_cred)
529 return 0;
528 callback_cred = rpc_lookup_machine_cred(); 530 callback_cred = rpc_lookup_machine_cred();
529 if (!callback_cred) 531 if (!callback_cred)
530 return -ENOMEM; 532 return -ENOMEM;
@@ -542,7 +544,8 @@ void do_probe_callback(struct nfs4_client *clp)
542 }; 544 };
543 int status; 545 int status;
544 546
545 status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT, 547 status = rpc_call_async(cb->cb_client, &msg,
548 RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
546 &nfsd4_cb_probe_ops, (void *)clp); 549 &nfsd4_cb_probe_ops, (void *)clp);
547 if (status) { 550 if (status) {
548 warn_no_callback_path(clp, status); 551 warn_no_callback_path(clp, status);
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 5a754f7b71ed..98fb98e330b4 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -119,9 +119,7 @@ out_no_tfm:
119static void 119static void
120nfsd4_sync_rec_dir(void) 120nfsd4_sync_rec_dir(void)
121{ 121{
122 mutex_lock(&rec_dir.dentry->d_inode->i_mutex); 122 vfs_fsync(NULL, rec_dir.dentry, 0);
123 nfsd_sync_dir(rec_dir.dentry);
124 mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
125} 123}
126 124
127int 125int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f19ed866c95f..c97fddbd17db 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1998,7 +1998,9 @@ nfs4_file_downgrade(struct file *filp, unsigned int share_access)
1998{ 1998{
1999 if (share_access & NFS4_SHARE_ACCESS_WRITE) { 1999 if (share_access & NFS4_SHARE_ACCESS_WRITE) {
2000 drop_file_write_access(filp); 2000 drop_file_write_access(filp);
2001 spin_lock(&filp->f_lock);
2001 filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE; 2002 filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE;
2003 spin_unlock(&filp->f_lock);
2002 } 2004 }
2003} 2005}
2004 2006
@@ -2480,8 +2482,10 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
2480 } 2482 }
2481 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); 2483 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
2482 2484
2483 if (nfsd4_has_session(&resp->cstate)) 2485 if (nfsd4_has_session(&resp->cstate)) {
2484 open->op_stateowner->so_confirmed = 1; 2486 open->op_stateowner->so_confirmed = 1;
2487 nfsd4_create_clid_dir(open->op_stateowner->so_client);
2488 }
2485 2489
2486 /* 2490 /*
2487 * Attempt to hand out a delegation. No error return, because the 2491 * Attempt to hand out a delegation. No error return, because the
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index a8587e90fd5a..c47b4d7bafa7 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1434,7 +1434,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1434 } 1434 }
1435 op->opnum = ntohl(*argp->p++); 1435 op->opnum = ntohl(*argp->p++);
1436 1436
1437 if (op->opnum >= OP_ACCESS && op->opnum < ops->nops) 1437 if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP)
1438 op->status = ops->decoders[op->opnum](argp, &op->u); 1438 op->status = ops->decoders[op->opnum](argp, &op->u);
1439 else { 1439 else {
1440 op->opnum = OP_ILLEGAL; 1440 op->opnum = OP_ILLEGAL;
@@ -1528,7 +1528,7 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
1528 } } while (0); 1528 } } while (0);
1529 1529
1530/* Encode as an array of strings the string given with components 1530/* Encode as an array of strings the string given with components
1531 * seperated @sep. 1531 * separated @sep.
1532 */ 1532 */
1533static __be32 nfsd4_encode_components(char sep, char *components, 1533static __be32 nfsd4_encode_components(char sep, char *components,
1534 __be32 **pp, int *buflen) 1534 __be32 **pp, int *buflen)
@@ -2121,9 +2121,15 @@ out_acl:
2121 * and this is the root of a cross-mounted filesystem. 2121 * and this is the root of a cross-mounted filesystem.
2122 */ 2122 */
2123 if (ignore_crossmnt == 0 && 2123 if (ignore_crossmnt == 0 &&
2124 exp->ex_path.mnt->mnt_root->d_inode == dentry->d_inode) { 2124 dentry == exp->ex_path.mnt->mnt_root) {
2125 err = vfs_getattr(exp->ex_path.mnt->mnt_parent, 2125 struct path path = exp->ex_path;
2126 exp->ex_path.mnt->mnt_mountpoint, &stat); 2126 path_get(&path);
2127 while (follow_up(&path)) {
2128 if (path.dentry != path.mnt->mnt_root)
2129 break;
2130 }
2131 err = vfs_getattr(path.mnt, path.dentry, &stat);
2132 path_put(&path);
2127 if (err) 2133 if (err)
2128 goto out_nfserr; 2134 goto out_nfserr;
2129 } 2135 }
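[For the root of a cross-mounted filesystem, nfsd now takes its own reference on the export path and repeatedly calls follow_up() until it leaves the stack of mounts covering that point, instead of reaching one level up by hand through mnt_parent/mnt_mountpoint. The loop's shape, reduced to a toy mount chain:]

    struct mount {
        struct mount *covering;  /* stand-in for the follow_up() step */
        int at_root;             /* nonzero while dentry == mnt_root */
    };

    /* Climb until the position is no longer the root of a mount (or the
     * chain ends); the real code then calls vfs_getattr() there. */
    static struct mount *walk_to_covering_mount(struct mount *m)
    {
        while (m->at_root && m->covering)
            m = m->covering;
        return m;
    }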
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 2604c3e70ea5..0f0e77f2012f 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -988,6 +988,7 @@ static ssize_t __write_ports_delfd(char *buf)
988static ssize_t __write_ports_addxprt(char *buf) 988static ssize_t __write_ports_addxprt(char *buf)
989{ 989{
990 char transport[16]; 990 char transport[16];
991 struct svc_xprt *xprt;
991 int port, err; 992 int port, err;
992 993
993 if (sscanf(buf, "%15s %4u", transport, &port) != 2) 994 if (sscanf(buf, "%15s %4u", transport, &port) != 2)
@@ -1002,13 +1003,24 @@ static ssize_t __write_ports_addxprt(char *buf)
1002 1003
1003 err = svc_create_xprt(nfsd_serv, transport, 1004 err = svc_create_xprt(nfsd_serv, transport,
1004 PF_INET, port, SVC_SOCK_ANONYMOUS); 1005 PF_INET, port, SVC_SOCK_ANONYMOUS);
1005 if (err < 0) { 1006 if (err < 0)
1006 /* Give a reasonable perror msg for bad transport string */ 1007 goto out_err;
1007 if (err == -ENOENT) 1008
1008 err = -EPROTONOSUPPORT; 1009 err = svc_create_xprt(nfsd_serv, transport,
1009 return err; 1010 PF_INET6, port, SVC_SOCK_ANONYMOUS);
1010 } 1011 if (err < 0 && err != -EAFNOSUPPORT)
1012 goto out_close;
1011 return 0; 1013 return 0;
1014out_close:
1015 xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port);
1016 if (xprt != NULL) {
1017 svc_close_xprt(xprt);
1018 svc_xprt_put(xprt);
1019 }
1020out_err:
1021 /* Decrease the count, but don't shut down the service */
1022 nfsd_serv->sv_nrthreads--;
1023 return err;
1012} 1024}
1013 1025
1014/* 1026/*
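[__write_ports_addxprt() now listens on both address families: the PF_INET transport is created first, then PF_INET6; if the v6 attempt fails for any reason other than the host lacking IPv6 support, the already-created v4 listener is looked up and closed, so the operation is all-or-nothing. A sketch with stubbed socket helpers and assumed family constants:]

    #include <errno.h>
    #include <stdio.h>

    static int create_listener(int family, int port)
    {
        printf("listener: family %d, port %d\n", family, port);
        return 0;                              /* stub always succeeds */
    }

    static void close_listener(int family, int port)
    {
        (void)family; (void)port;              /* stub */
    }

    /* All-or-nothing dual-family setup (cf. the out_close unwind above). */
    static int add_listeners(int port)
    {
        int err = create_listener(2 /* PF_INET */, port);

        if (err < 0)
            return err;
        err = create_listener(10 /* PF_INET6 */, port);
        if (err < 0 && err != -EAFNOSUPPORT) {
            close_listener(2, port);           /* roll back the v4 one */
            return err;
        }
        return 0;                              /* "no IPv6 here" is fine */
    }

    int main(void) { return add_listeners(2049); }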
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 8715d194561a..a11b0e8678ee 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -20,13 +20,14 @@
20#include <linux/fcntl.h> 20#include <linux/fcntl.h>
21#include <linux/namei.h> 21#include <linux/namei.h>
22#include <linux/delay.h> 22#include <linux/delay.h>
23#include <linux/quotaops.h>
24#include <linux/fsnotify.h> 23#include <linux/fsnotify.h>
25#include <linux/posix_acl_xattr.h> 24#include <linux/posix_acl_xattr.h>
26#include <linux/xattr.h> 25#include <linux/xattr.h>
27#include <linux/jhash.h> 26#include <linux/jhash.h>
28#include <linux/ima.h> 27#include <linux/ima.h>
29#include <asm/uaccess.h> 28#include <asm/uaccess.h>
29#include <linux/exportfs.h>
30#include <linux/writeback.h>
30 31
31#ifdef CONFIG_NFSD_V3 32#ifdef CONFIG_NFSD_V3
32#include "xdr3.h" 33#include "xdr3.h"
@@ -271,6 +272,32 @@ out:
271 return err; 272 return err;
272} 273}
273 274
275/*
276 * Commit metadata changes to stable storage.
277 */
278static int
279commit_metadata(struct svc_fh *fhp)
280{
281 struct inode *inode = fhp->fh_dentry->d_inode;
282 const struct export_operations *export_ops = inode->i_sb->s_export_op;
283 int error = 0;
284
285 if (!EX_ISSYNC(fhp->fh_export))
286 return 0;
287
288 if (export_ops->commit_metadata) {
289 error = export_ops->commit_metadata(inode);
290 } else {
291 struct writeback_control wbc = {
292 .sync_mode = WB_SYNC_ALL,
293 .nr_to_write = 0, /* metadata only */
294 };
295
296 error = sync_inode(inode, &wbc);
297 }
298
299 return error;
300}
274 301
275/* 302/*
276 * Set various file attributes. 303 * Set various file attributes.
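[commit_metadata() above gives exports two ways to make metadata durable: a filesystem-provided ->commit_metadata() fast path, or the generic fallback of sync_inode() with a WB_SYNC_ALL writeback_control whose nr_to_write is 0, the VFS idiom for "write the inode, not its data pages". The structure, with stubbed types standing in for the kernel ones:]

    struct wb_control { int sync_all; long nr_to_write; };

    struct export_ops {
        int (*commit_metadata)(void *inode);   /* optional fs fast path */
    };

    static int sync_inode_stub(void *inode, struct wb_control *wbc)
    {
        (void)inode; (void)wbc;                /* stand-in for sync_inode() */
        return 0;
    }

    static int commit_metadata(void *inode, const struct export_ops *ops)
    {
        struct wb_control wbc = { 1 /* WB_SYNC_ALL */, 0 /* metadata only */ };

        if (ops && ops->commit_metadata)
            return ops->commit_metadata(inode);
        return sync_inode_stub(inode, &wbc);
    }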
@@ -361,7 +388,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
361 * If we are changing the size of the file, then 388 * If we are changing the size of the file, then
362 * we need to break all leases. 389 * we need to break all leases.
363 */ 390 */
364 host_err = break_lease(inode, FMODE_WRITE | O_NONBLOCK); 391 host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
365 if (host_err == -EWOULDBLOCK) 392 if (host_err == -EWOULDBLOCK)
366 host_err = -ETIMEDOUT; 393 host_err = -ETIMEDOUT;
367 if (host_err) /* ENOMEM or EWOULDBLOCK */ 394 if (host_err) /* ENOMEM or EWOULDBLOCK */
@@ -377,7 +404,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
377 put_write_access(inode); 404 put_write_access(inode);
378 goto out_nfserr; 405 goto out_nfserr;
379 } 406 }
380 vfs_dq_init(inode);
381 } 407 }
382 408
383 /* sanitize the mode change */ 409 /* sanitize the mode change */
@@ -734,7 +760,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
734 * Check to see if there are any leases on this file. 760 * Check to see if there are any leases on this file.
735 * This may block while leases are broken. 761 * This may block while leases are broken.
736 */ 762 */
737 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? FMODE_WRITE : 0)); 763 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
738 if (host_err == -EWOULDBLOCK) 764 if (host_err == -EWOULDBLOCK)
739 host_err = -ETIMEDOUT; 765 host_err = -ETIMEDOUT;
740 if (host_err) /* NOMEM or WOULDBLOCK */ 766 if (host_err) /* NOMEM or WOULDBLOCK */
@@ -745,8 +771,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
745 flags = O_RDWR|O_LARGEFILE; 771 flags = O_RDWR|O_LARGEFILE;
746 else 772 else
747 flags = O_WRONLY|O_LARGEFILE; 773 flags = O_WRONLY|O_LARGEFILE;
748
749 vfs_dq_init(inode);
750 } 774 }
751 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt), 775 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt),
752 flags, current_cred()); 776 flags, current_cred());
@@ -771,43 +795,6 @@ nfsd_close(struct file *filp)
771} 795}
772 796
773/* 797/*
774 * Sync a file
775 * As this calls fsync (not fdatasync) there is no need for a write_inode
776 * after it.
777 */
778static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
779 const struct file_operations *fop)
780{
781 struct inode *inode = dp->d_inode;
782 int (*fsync) (struct file *, struct dentry *, int);
783 int err;
784
785 err = filemap_write_and_wait(inode->i_mapping);
786 if (err == 0 && fop && (fsync = fop->fsync))
787 err = fsync(filp, dp, 0);
788 return err;
789}
790
791static int
792nfsd_sync(struct file *filp)
793{
794 int err;
795 struct inode *inode = filp->f_path.dentry->d_inode;
796 dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name);
797 mutex_lock(&inode->i_mutex);
798 err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op);
799 mutex_unlock(&inode->i_mutex);
800
801 return err;
802}
803
804int
805nfsd_sync_dir(struct dentry *dp)
806{
807 return nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
808}
809
810/*
811 * Obtain the readahead parameters for the file 798 * Obtain the readahead parameters for the file
812 * specified by (dev, ino). 799 * specified by (dev, ino).
813 */ 800 */
@@ -1010,7 +997,7 @@ static int wait_for_concurrent_writes(struct file *file)
1010 997
1011 if (inode->i_state & I_DIRTY) { 998 if (inode->i_state & I_DIRTY) {
1012 dprintk("nfsd: write sync %d\n", task_pid_nr(current)); 999 dprintk("nfsd: write sync %d\n", task_pid_nr(current));
1013 err = nfsd_sync(file); 1000 err = vfs_fsync(file, file->f_path.dentry, 0);
1014 } 1001 }
1015 last_ino = inode->i_ino; 1002 last_ino = inode->i_ino;
1016 last_dev = inode->i_sb->s_dev; 1003 last_dev = inode->i_sb->s_dev;
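The write-gathering heuristic above now calls the generic helper directly instead of the private nfsd_sync() (removed below). At this point in the kernel vfs_fsync() still takes the dentry as a separate argument, and the last parameter selects a data-only sync. A minimal sketch of both variants, assuming an open struct file *file:

	int err;

	/* data + metadata, equivalent to fsync(2) */
	err = vfs_fsync(file, file->f_path.dentry, 0);

	/* data only, equivalent to fdatasync(2) */
	err = vfs_fsync(file, file->f_path.dentry, 1);
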
@@ -1158,8 +1145,9 @@ out:
1158#ifdef CONFIG_NFSD_V3 1145#ifdef CONFIG_NFSD_V3
1159/* 1146/*
1160 * Commit all pending writes to stable storage. 1147 * Commit all pending writes to stable storage.
1161 * Strictly speaking, we could sync just the indicated file region here, 1148 *
1162 * but there's currently no way we can ask the VFS to do so. 1149 * Note: we only guarantee that data that lies within the range specified
1150 * by the 'offset' and 'count' parameters will be synced.
1163 * 1151 *
1164 * Unfortunately we cannot lock the file to make sure we return full WCC 1152 * Unfortunately we cannot lock the file to make sure we return full WCC
1165 * data to the client, as locking happens lower down in the filesystem. 1153 * data to the client, as locking happens lower down in the filesystem.
@@ -1169,23 +1157,32 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
1169 loff_t offset, unsigned long count) 1157 loff_t offset, unsigned long count)
1170{ 1158{
1171 struct file *file; 1159 struct file *file;
1172 __be32 err; 1160 loff_t end = LLONG_MAX;
1161 __be32 err = nfserr_inval;
1173 1162
1174 if ((u64)count > ~(u64)offset) 1163 if (offset < 0)
1175 return nfserr_inval; 1164 goto out;
1165 if (count != 0) {
1166 end = offset + (loff_t)count - 1;
1167 if (end < offset)
1168 goto out;
1169 }
1176 1170
1177 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); 1171 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
1178 if (err) 1172 if (err)
1179 return err; 1173 goto out;
1180 if (EX_ISSYNC(fhp->fh_export)) { 1174 if (EX_ISSYNC(fhp->fh_export)) {
1181 if (file->f_op && file->f_op->fsync) { 1175 int err2 = vfs_fsync_range(file, file->f_path.dentry,
1182 err = nfserrno(nfsd_sync(file)); 1176 offset, end, 0);
1183 } else { 1177
1178 if (err2 != -EINVAL)
1179 err = nfserrno(err2);
1180 else
1184 err = nfserr_notsupp; 1181 err = nfserr_notsupp;
1185 }
1186 } 1182 }
1187 1183
1188 nfsd_close(file); 1184 nfsd_close(file);
1185out:
1189 return err; 1186 return err;
1190} 1187}
1191#endif /* CONFIG_NFSD_V3 */ 1188#endif /* CONFIG_NFSD_V3 */
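nfsd_commit() now honours the byte range of an NFSv3 COMMIT through vfs_fsync_range() instead of syncing the whole file, and the new guards reject a negative offset or an inclusive end that wraps past LLONG_MAX. The same range computation as a standalone helper (a sketch, not part of the patch):

	/* Validate a COMMIT range; count == 0 means "to end of file". */
	static int commit_range(loff_t offset, unsigned long count, loff_t *endp)
	{
		loff_t end = LLONG_MAX;

		if (offset < 0)
			return -EINVAL;
		if (count != 0) {
			end = offset + (loff_t)count - 1;
			if (end < offset)	/* wrapped past LLONG_MAX */
				return -EINVAL;
		}
		*endp = end;
		return 0;
	}
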
@@ -1338,12 +1335,14 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1338 goto out_nfserr; 1335 goto out_nfserr;
1339 } 1336 }
1340 1337
1341 if (EX_ISSYNC(fhp->fh_export)) { 1338 err = nfsd_create_setattr(rqstp, resfhp, iap);
1342 err = nfserrno(nfsd_sync_dir(dentry));
1343 write_inode_now(dchild->d_inode, 1);
1344 }
1345 1339
1346 err2 = nfsd_create_setattr(rqstp, resfhp, iap); 1340 /*
1341 * nfsd_setattr already committed the child. Transactional filesystems
1342 * had a chance to commit changes for both parent and child
1343 * simultaneously making the following commit_metadata a noop.
1344 */
1345 err2 = nfserrno(commit_metadata(fhp));
1347 if (err2) 1346 if (err2)
1348 err = err2; 1347 err = err2;
1349 mnt_drop_write(fhp->fh_export->ex_path.mnt); 1348 mnt_drop_write(fhp->fh_export->ex_path.mnt);
@@ -1375,7 +1374,6 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1375 struct dentry *dentry, *dchild = NULL; 1374 struct dentry *dentry, *dchild = NULL;
1376 struct inode *dirp; 1375 struct inode *dirp;
1377 __be32 err; 1376 __be32 err;
1378 __be32 err2;
1379 int host_err; 1377 int host_err;
1380 __u32 v_mtime=0, v_atime=0; 1378 __u32 v_mtime=0, v_atime=0;
1381 1379
@@ -1470,11 +1468,6 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1470 if (created) 1468 if (created)
1471 *created = 1; 1469 *created = 1;
1472 1470
1473 if (EX_ISSYNC(fhp->fh_export)) {
1474 err = nfserrno(nfsd_sync_dir(dentry));
1475 /* setattr will sync the child (or not) */
1476 }
1477
1478 nfsd_check_ignore_resizing(iap); 1471 nfsd_check_ignore_resizing(iap);
1479 1472
1480 if (createmode == NFS3_CREATE_EXCLUSIVE) { 1473 if (createmode == NFS3_CREATE_EXCLUSIVE) {
@@ -1489,9 +1482,13 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1489 } 1482 }
1490 1483
1491 set_attr: 1484 set_attr:
1492 err2 = nfsd_create_setattr(rqstp, resfhp, iap); 1485 err = nfsd_create_setattr(rqstp, resfhp, iap);
1493 if (err2) 1486
1494 err = err2; 1487 /*
1488 * nfsd_setattr already committed the child (and possibly also the parent).
1489 */
1490 if (!err)
1491 err = nfserrno(commit_metadata(fhp));
1495 1492
1496 mnt_drop_write(fhp->fh_export->ex_path.mnt); 1493 mnt_drop_write(fhp->fh_export->ex_path.mnt);
1497 /* 1494 /*
@@ -1606,12 +1603,9 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1606 } 1603 }
1607 } else 1604 } else
1608 host_err = vfs_symlink(dentry->d_inode, dnew, path); 1605 host_err = vfs_symlink(dentry->d_inode, dnew, path);
1609
1610 if (!host_err) {
1611 if (EX_ISSYNC(fhp->fh_export))
1612 host_err = nfsd_sync_dir(dentry);
1613 }
1614 err = nfserrno(host_err); 1606 err = nfserrno(host_err);
1607 if (!err)
1608 err = nfserrno(commit_metadata(fhp));
1615 fh_unlock(fhp); 1609 fh_unlock(fhp);
1616 1610
1617 mnt_drop_write(fhp->fh_export->ex_path.mnt); 1611 mnt_drop_write(fhp->fh_export->ex_path.mnt);
@@ -1673,11 +1667,9 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1673 } 1667 }
1674 host_err = vfs_link(dold, dirp, dnew); 1668 host_err = vfs_link(dold, dirp, dnew);
1675 if (!host_err) { 1669 if (!host_err) {
1676 if (EX_ISSYNC(ffhp->fh_export)) { 1670 err = nfserrno(commit_metadata(ffhp));
1677 err = nfserrno(nfsd_sync_dir(ddir)); 1671 if (!err)
1678 write_inode_now(dest, 1); 1672 err = nfserrno(commit_metadata(tfhp));
1679 }
1680 err = 0;
1681 } else { 1673 } else {
1682 if (host_err == -EXDEV && rqstp->rq_vers == 2) 1674 if (host_err == -EXDEV && rqstp->rq_vers == 2)
1683 err = nfserr_acces; 1675 err = nfserr_acces;
@@ -1773,10 +1765,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1773 goto out_dput_new; 1765 goto out_dput_new;
1774 1766
1775 host_err = vfs_rename(fdir, odentry, tdir, ndentry); 1767 host_err = vfs_rename(fdir, odentry, tdir, ndentry);
1776 if (!host_err && EX_ISSYNC(tfhp->fh_export)) { 1768 if (!host_err) {
1777 host_err = nfsd_sync_dir(tdentry); 1769 host_err = commit_metadata(tfhp);
1778 if (!host_err) 1770 if (!host_err)
1779 host_err = nfsd_sync_dir(fdentry); 1771 host_err = commit_metadata(ffhp);
1780 } 1772 }
1781 1773
1782 mnt_drop_write(ffhp->fh_export->ex_path.mnt); 1774 mnt_drop_write(ffhp->fh_export->ex_path.mnt);
@@ -1857,12 +1849,9 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1857 1849
1858 dput(rdentry); 1850 dput(rdentry);
1859 1851
1860 if (host_err) 1852 if (!host_err)
1861 goto out_drop; 1853 host_err = commit_metadata(fhp);
1862 if (EX_ISSYNC(fhp->fh_export))
1863 host_err = nfsd_sync_dir(dentry);
1864 1854
1865out_drop:
1866 mnt_drop_write(fhp->fh_export->ex_path.mnt); 1855 mnt_drop_write(fhp->fh_export->ex_path.mnt);
1867out_nfserr: 1856out_nfserr:
1868 err = nfserrno(host_err); 1857 err = nfserrno(host_err);
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index f4543ac4f560..5cccf874d692 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -42,7 +42,7 @@ void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
42 const struct buffer_head *, void *); 42 const struct buffer_head *, void *);
43 43
44/** 44/**
45 * nilfs_palloc_req - persistent alloctor request and reply 45 * nilfs_palloc_req - persistent allocator request and reply
46 * @pr_entry_nr: entry number (vblocknr or inode number) 46 * @pr_entry_nr: entry number (vblocknr or inode number)
47 * @pr_desc_bh: buffer head of the buffer containing block group descriptors 47 * @pr_desc_bh: buffer head of the buffer containing block group descriptors
48 * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap 48 * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 187dd07ba86c..013146755683 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -288,7 +288,7 @@ int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr)
288 * @vblocknrs and @nitems. 288 * @vblocknrs and @nitems.
289 * 289 *
290 * Return Value: On success, 0 is returned. On error, one of the following 290 * Return Value: On success, 0 is returned. On error, one of the following
291 * nagative error codes is returned. 291 * negative error codes is returned.
292 * 292 *
293 * %-EIO - I/O error. 293 * %-EIO - I/O error.
294 * 294 *
@@ -388,8 +388,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
388 ret = -ENOENT; 388 ret = -ENOENT;
389 goto out; 389 goto out;
390 } 390 }
391 if (blocknrp != NULL) 391 *blocknrp = blocknr;
392 *blocknrp = blocknr;
393 392
394 out: 393 out:
395 kunmap_atomic(kaddr, KM_USER0); 394 kunmap_atomic(kaddr, KM_USER0);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 76d803e060a9..85c89dfc71f0 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -224,7 +224,7 @@ fail:
224 * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller. 224 * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller.
225 */ 225 */
226static int 226static int
227nilfs_match(int len, const char * const name, struct nilfs_dir_entry *de) 227nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de)
228{ 228{
229 if (len != de->name_len) 229 if (len != de->name_len)
230 return 0; 230 return 0;
@@ -349,11 +349,11 @@ done:
349 * Entry is guaranteed to be valid. 349 * Entry is guaranteed to be valid.
350 */ 350 */
351struct nilfs_dir_entry * 351struct nilfs_dir_entry *
352nilfs_find_entry(struct inode *dir, struct dentry *dentry, 352nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
353 struct page **res_page) 353 struct page **res_page)
354{ 354{
355 const char *name = dentry->d_name.name; 355 const unsigned char *name = qstr->name;
356 int namelen = dentry->d_name.len; 356 int namelen = qstr->len;
357 unsigned reclen = NILFS_DIR_REC_LEN(namelen); 357 unsigned reclen = NILFS_DIR_REC_LEN(namelen);
358 unsigned long start, n; 358 unsigned long start, n;
359 unsigned long npages = dir_pages(dir); 359 unsigned long npages = dir_pages(dir);
@@ -396,7 +396,7 @@ nilfs_find_entry(struct inode *dir, struct dentry *dentry,
396 /* next page is past the blocks we've got */ 396 /* next page is past the blocks we've got */
397 if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) { 397 if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
398 nilfs_error(dir->i_sb, __func__, 398 nilfs_error(dir->i_sb, __func__,
399 "dir %lu size %lld exceeds block cout %llu", 399 "dir %lu size %lld exceeds block count %llu",
400 dir->i_ino, dir->i_size, 400 dir->i_ino, dir->i_size,
401 (unsigned long long)dir->i_blocks); 401 (unsigned long long)dir->i_blocks);
402 goto out; 402 goto out;
@@ -424,13 +424,13 @@ struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p)
424 return de; 424 return de;
425} 425}
426 426
427ino_t nilfs_inode_by_name(struct inode *dir, struct dentry *dentry) 427ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
428{ 428{
429 ino_t res = 0; 429 ino_t res = 0;
430 struct nilfs_dir_entry *de; 430 struct nilfs_dir_entry *de;
431 struct page *page; 431 struct page *page;
432 432
433 de = nilfs_find_entry(dir, dentry, &page); 433 de = nilfs_find_entry(dir, qstr, &page);
434 if (de) { 434 if (de) {
435 res = le64_to_cpu(de->inode); 435 res = le64_to_cpu(de->inode);
436 kunmap(page); 436 kunmap(page);
@@ -465,7 +465,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
465int nilfs_add_link(struct dentry *dentry, struct inode *inode) 465int nilfs_add_link(struct dentry *dentry, struct inode *inode)
466{ 466{
467 struct inode *dir = dentry->d_parent->d_inode; 467 struct inode *dir = dentry->d_parent->d_inode;
468 const char *name = dentry->d_name.name; 468 const unsigned char *name = dentry->d_name.name;
469 int namelen = dentry->d_name.len; 469 int namelen = dentry->d_name.len;
470 unsigned chunk_size = nilfs_chunk_size(dir); 470 unsigned chunk_size = nilfs_chunk_size(dir);
471 unsigned reclen = NILFS_DIR_REC_LEN(namelen); 471 unsigned reclen = NILFS_DIR_REC_LEN(namelen);
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index e16a6664dfa2..8880a9e281e7 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -28,10 +28,10 @@
28 * gcinodes), and this file provides lookup function of the dummy 28 * gcinodes), and this file provides lookup function of the dummy
29 * inodes and their buffer read function. 29 * inodes and their buffer read function.
30 * 30 *
31 * Since NILFS2 keeps up multiple checkpoints/snapshots accross GC, it 31 * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it
32 * has to treat blocks that belong to a same file but have different 32 * has to treat blocks that belong to a same file but have different
33 * checkpoint numbers. To avoid interference among generations, dummy 33 * checkpoint numbers. To avoid interference among generations, dummy
34 * inodes are managed separatly from actual inodes, and their lookup 34 * inodes are managed separately from actual inodes, and their lookup
35 * function (nilfs_gc_iget) is designed to be specified with a 35 * function (nilfs_gc_iget) is designed to be specified with a
36 * checkpoint number argument as well as an inode number. 36 * checkpoint number argument as well as an inode number.
37 * 37 *
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index d6b2b83de363..313d0a21da48 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -26,6 +26,7 @@
26#include <linux/capability.h> /* capable() */ 26#include <linux/capability.h> /* capable() */
27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ 27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
28#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
29#include <linux/mount.h> /* mnt_want_write(), mnt_drop_write() */
29#include <linux/nilfs2_fs.h> 30#include <linux/nilfs2_fs.h>
30#include "nilfs.h" 31#include "nilfs.h"
31#include "segment.h" 32#include "segment.h"
@@ -107,20 +108,28 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
107 108
108 if (!capable(CAP_SYS_ADMIN)) 109 if (!capable(CAP_SYS_ADMIN))
109 return -EPERM; 110 return -EPERM;
111
112 ret = mnt_want_write(filp->f_path.mnt);
113 if (ret)
114 return ret;
115
116 ret = -EFAULT;
110 if (copy_from_user(&cpmode, argp, sizeof(cpmode))) 117 if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
111 return -EFAULT; 118 goto out;
112 119
113 mutex_lock(&nilfs->ns_mount_mutex); 120 mutex_lock(&nilfs->ns_mount_mutex);
121
114 nilfs_transaction_begin(inode->i_sb, &ti, 0); 122 nilfs_transaction_begin(inode->i_sb, &ti, 0);
115 ret = nilfs_cpfile_change_cpmode( 123 ret = nilfs_cpfile_change_cpmode(
116 cpfile, cpmode.cm_cno, cpmode.cm_mode); 124 cpfile, cpmode.cm_cno, cpmode.cm_mode);
117 if (unlikely(ret < 0)) { 125 if (unlikely(ret < 0))
118 nilfs_transaction_abort(inode->i_sb); 126 nilfs_transaction_abort(inode->i_sb);
119 mutex_unlock(&nilfs->ns_mount_mutex); 127 else
120 return ret; 128 nilfs_transaction_commit(inode->i_sb); /* never fails */
121 } 129
122 nilfs_transaction_commit(inode->i_sb); /* never fails */
123 mutex_unlock(&nilfs->ns_mount_mutex); 130 mutex_unlock(&nilfs->ns_mount_mutex);
131out:
132 mnt_drop_write(filp->f_path.mnt);
124 return ret; 133 return ret;
125} 134}
126 135
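Each administrative ioctl in this file now takes a write reference on the vfsmount before copying in its arguments, so the operation fails cleanly on a read-only mount instead of modifying it. The bracket in isolation (the do_modification() helper is hypothetical):

	static int example_ioctl(struct inode *inode, struct file *filp,
				 void __user *argp)
	{
		int ret;

		ret = mnt_want_write(filp->f_path.mnt); /* fails if read-only */
		if (ret)
			return ret;

		ret = do_modification(inode, argp);	/* hypothetical */

		mnt_drop_write(filp->f_path.mnt);	/* drop the write ref */
		return ret;
	}
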
@@ -135,16 +144,23 @@ nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
135 144
136 if (!capable(CAP_SYS_ADMIN)) 145 if (!capable(CAP_SYS_ADMIN))
137 return -EPERM; 146 return -EPERM;
147
148 ret = mnt_want_write(filp->f_path.mnt);
149 if (ret)
150 return ret;
151
152 ret = -EFAULT;
138 if (copy_from_user(&cno, argp, sizeof(cno))) 153 if (copy_from_user(&cno, argp, sizeof(cno)))
139 return -EFAULT; 154 goto out;
140 155
141 nilfs_transaction_begin(inode->i_sb, &ti, 0); 156 nilfs_transaction_begin(inode->i_sb, &ti, 0);
142 ret = nilfs_cpfile_delete_checkpoint(cpfile, cno); 157 ret = nilfs_cpfile_delete_checkpoint(cpfile, cno);
143 if (unlikely(ret < 0)) { 158 if (unlikely(ret < 0))
144 nilfs_transaction_abort(inode->i_sb); 159 nilfs_transaction_abort(inode->i_sb);
145 return ret; 160 else
146 } 161 nilfs_transaction_commit(inode->i_sb); /* never fails */
147 nilfs_transaction_commit(inode->i_sb); /* never fails */ 162out:
163 mnt_drop_write(filp->f_path.mnt);
148 return ret; 164 return ret;
149} 165}
150 166
@@ -496,12 +512,19 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
496 if (!capable(CAP_SYS_ADMIN)) 512 if (!capable(CAP_SYS_ADMIN))
497 return -EPERM; 513 return -EPERM;
498 514
515 ret = mnt_want_write(filp->f_path.mnt);
516 if (ret)
517 return ret;
518
519 ret = -EFAULT;
499 if (copy_from_user(argv, argp, sizeof(argv))) 520 if (copy_from_user(argv, argp, sizeof(argv)))
500 return -EFAULT; 521 goto out;
501 522
523 ret = -EINVAL;
502 nsegs = argv[4].v_nmembs; 524 nsegs = argv[4].v_nmembs;
503 if (argv[4].v_size != argsz[4]) 525 if (argv[4].v_size != argsz[4])
504 return -EINVAL; 526 goto out;
527
505 /* 528 /*
506 * argv[4] points to segment numbers this ioctl cleans. We 529 * argv[4] points to segment numbers this ioctl cleans. We
507 * use kmalloc() for its buffer because memory used for the 530 * use kmalloc() for its buffer because memory used for the
@@ -509,9 +532,10 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
509 */ 532 */
510 kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base, 533 kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base,
511 nsegs * sizeof(__u64)); 534 nsegs * sizeof(__u64));
512 if (IS_ERR(kbufs[4])) 535 if (IS_ERR(kbufs[4])) {
513 return PTR_ERR(kbufs[4]); 536 ret = PTR_ERR(kbufs[4]);
514 537 goto out;
538 }
515 nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 539 nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
516 540
517 for (n = 0; n < 4; n++) { 541 for (n = 0; n < 4; n++) {
@@ -563,10 +587,12 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
563 nilfs_remove_all_gcinode(nilfs); 587 nilfs_remove_all_gcinode(nilfs);
564 clear_nilfs_gc_running(nilfs); 588 clear_nilfs_gc_running(nilfs);
565 589
566 out_free: 590out_free:
567 while (--n >= 0) 591 while (--n >= 0)
568 vfree(kbufs[n]); 592 vfree(kbufs[n]);
569 kfree(kbufs[4]); 593 kfree(kbufs[4]);
594out:
595 mnt_drop_write(filp->f_path.mnt);
570 return ret; 596 return ret;
571} 597}
572 598
@@ -575,13 +601,17 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
575{ 601{
576 __u64 cno; 602 __u64 cno;
577 int ret; 603 int ret;
604 struct the_nilfs *nilfs;
578 605
579 ret = nilfs_construct_segment(inode->i_sb); 606 ret = nilfs_construct_segment(inode->i_sb);
580 if (ret < 0) 607 if (ret < 0)
581 return ret; 608 return ret;
582 609
583 if (argp != NULL) { 610 if (argp != NULL) {
584 cno = NILFS_SB(inode->i_sb)->s_nilfs->ns_cno - 1; 611 nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
612 down_read(&nilfs->ns_segctor_sem);
613 cno = nilfs->ns_cno - 1;
614 up_read(&nilfs->ns_segctor_sem);
585 if (copy_to_user(argp, &cno, sizeof(cno))) 615 if (copy_to_user(argp, &cno, sizeof(cno)))
586 return -EFAULT; 616 return -EFAULT;
587 } 617 }
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 07ba838ef089..ad6ed2cf19b4 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -67,7 +67,7 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
67 if (dentry->d_name.len > NILFS_NAME_LEN) 67 if (dentry->d_name.len > NILFS_NAME_LEN)
68 return ERR_PTR(-ENAMETOOLONG); 68 return ERR_PTR(-ENAMETOOLONG);
69 69
70 ino = nilfs_inode_by_name(dir, dentry); 70 ino = nilfs_inode_by_name(dir, &dentry->d_name);
71 inode = NULL; 71 inode = NULL;
72 if (ino) { 72 if (ino) {
73 inode = nilfs_iget(dir->i_sb, ino); 73 inode = nilfs_iget(dir->i_sb, ino);
@@ -81,10 +81,7 @@ struct dentry *nilfs_get_parent(struct dentry *child)
81{ 81{
82 unsigned long ino; 82 unsigned long ino;
83 struct inode *inode; 83 struct inode *inode;
84 struct dentry dotdot; 84 struct qstr dotdot = {.name = "..", .len = 2};
85
86 dotdot.d_name.name = "..";
87 dotdot.d_name.len = 2;
88 85
89 ino = nilfs_inode_by_name(child->d_inode, &dotdot); 86 ino = nilfs_inode_by_name(child->d_inode, &dotdot);
90 if (!ino) 87 if (!ino)
@@ -296,7 +293,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
296 int err; 293 int err;
297 294
298 err = -ENOENT; 295 err = -ENOENT;
299 de = nilfs_find_entry(dir, dentry, &page); 296 de = nilfs_find_entry(dir, &dentry->d_name, &page);
300 if (!de) 297 if (!de)
301 goto out; 298 goto out;
302 299
@@ -389,7 +386,7 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
389 return err; 386 return err;
390 387
391 err = -ENOENT; 388 err = -ENOENT;
392 old_de = nilfs_find_entry(old_dir, old_dentry, &old_page); 389 old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_page);
393 if (!old_de) 390 if (!old_de)
394 goto out; 391 goto out;
395 392
@@ -409,7 +406,7 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
409 goto out_dir; 406 goto out_dir;
410 407
411 err = -ENOENT; 408 err = -ENOENT;
412 new_de = nilfs_find_entry(new_dir, new_dentry, &new_page); 409 new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page);
413 if (!new_de) 410 if (!new_de)
414 goto out_dir; 411 goto out_dir;
415 inc_nlink(old_inode); 412 inc_nlink(old_inode);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 4da6f67e9a91..8723e5bfd071 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -217,10 +217,10 @@ static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
217 217
218/* dir.c */ 218/* dir.c */
219extern int nilfs_add_link(struct dentry *, struct inode *); 219extern int nilfs_add_link(struct dentry *, struct inode *);
220extern ino_t nilfs_inode_by_name(struct inode *, struct dentry *); 220extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *);
221extern int nilfs_make_empty(struct inode *, struct inode *); 221extern int nilfs_make_empty(struct inode *, struct inode *);
222extern struct nilfs_dir_entry * 222extern struct nilfs_dir_entry *
223nilfs_find_entry(struct inode *, struct dentry *, struct page **); 223nilfs_find_entry(struct inode *, const struct qstr *, struct page **);
224extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *); 224extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *);
225extern int nilfs_empty_dir(struct inode *); 225extern int nilfs_empty_dir(struct inode *);
226extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **); 226extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **);
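Taking a const struct qstr * instead of a struct dentry * lets callers look up names for which no dentry exists. Both call styles under the new prototypes, as the namei.c hunks below use them (sketch):

	/* normal lookup: pass the qstr embedded in the dentry */
	ino = nilfs_inode_by_name(dir, &dentry->d_name);

	/* lookup without a dentry, e.g. "..": build a qstr on the stack */
	struct qstr dotdot = { .name = "..", .len = 2 };
	ino = nilfs_inode_by_name(child->d_inode, &dotdot);
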
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index a2692bbc7b50..fc246dba112a 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -292,7 +292,7 @@ void nilfs_free_private_page(struct page *page)
292 * @src: source page 292 * @src: source page
293 * @copy_dirty: flag whether to copy dirty states on the page's buffer heads. 293 * @copy_dirty: flag whether to copy dirty states on the page's buffer heads.
294 * 294 *
295 * This fuction is for both data pages and btnode pages. The dirty flag 295 * This function is for both data pages and btnode pages. The dirty flag
296 * should be treated by caller. The page must not be under i/o. 296 * should be treated by caller. The page must not be under i/o.
297 * Both src and dst page must be locked 297 * Both src and dst page must be locked
298 */ 298 */
@@ -388,7 +388,7 @@ repeat:
388} 388}
389 389
390/** 390/**
391 * nilfs_copy_back_pages -- copy back pages to orignal cache from shadow cache 391 * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache
392 * @dmap: destination page cache 392 * @dmap: destination page cache
393 * @smap: source page cache 393 * @smap: source page cache
394 * 394 *
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index c9c96c7825dc..017bedc761a0 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -39,7 +39,6 @@ enum {
39 NILFS_SEG_FAIL_IO, 39 NILFS_SEG_FAIL_IO,
40 NILFS_SEG_FAIL_MAGIC, 40 NILFS_SEG_FAIL_MAGIC,
41 NILFS_SEG_FAIL_SEQ, 41 NILFS_SEG_FAIL_SEQ,
42 NILFS_SEG_FAIL_CHECKSUM_SEGSUM,
43 NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT, 42 NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT,
44 NILFS_SEG_FAIL_CHECKSUM_FULL, 43 NILFS_SEG_FAIL_CHECKSUM_FULL,
45 NILFS_SEG_FAIL_CONSISTENCY, 44 NILFS_SEG_FAIL_CONSISTENCY,
@@ -71,10 +70,6 @@ static int nilfs_warn_segment_error(int err)
71 printk(KERN_WARNING 70 printk(KERN_WARNING
72 "NILFS warning: Sequence number mismatch\n"); 71 "NILFS warning: Sequence number mismatch\n");
73 break; 72 break;
74 case NILFS_SEG_FAIL_CHECKSUM_SEGSUM:
75 printk(KERN_WARNING
76 "NILFS warning: Checksum error in segment summary\n");
77 break;
78 case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT: 73 case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT:
79 printk(KERN_WARNING 74 printk(KERN_WARNING
80 "NILFS warning: Checksum error in super root\n"); 75 "NILFS warning: Checksum error in super root\n");
@@ -206,19 +201,15 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
206 * @pseg_start: start disk block number of partial segment 201 * @pseg_start: start disk block number of partial segment
207 * @seg_seq: sequence number requested 202 * @seg_seq: sequence number requested
208 * @ssi: pointer to nilfs_segsum_info struct to store information 203 * @ssi: pointer to nilfs_segsum_info struct to store information
209 * @full_check: full check flag
210 * (0: only checks segment summary CRC, 1: data CRC)
211 */ 204 */
212static int 205static int
213load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start, 206load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
214 u64 seg_seq, struct nilfs_segsum_info *ssi, 207 u64 seg_seq, struct nilfs_segsum_info *ssi)
215 int full_check)
216{ 208{
217 struct buffer_head *bh_sum; 209 struct buffer_head *bh_sum;
218 struct nilfs_segment_summary *sum; 210 struct nilfs_segment_summary *sum;
219 unsigned long offset, nblock; 211 unsigned long nblock;
220 u64 check_bytes; 212 u32 crc;
221 u32 crc, crc_sum;
222 int ret = NILFS_SEG_FAIL_IO; 213 int ret = NILFS_SEG_FAIL_IO;
223 214
224 bh_sum = sb_bread(sbi->s_super, pseg_start); 215 bh_sum = sb_bread(sbi->s_super, pseg_start);
@@ -237,34 +228,24 @@ load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
237 ret = NILFS_SEG_FAIL_SEQ; 228 ret = NILFS_SEG_FAIL_SEQ;
238 goto failed; 229 goto failed;
239 } 230 }
240 if (full_check) {
241 offset = sizeof(sum->ss_datasum);
242 check_bytes =
243 ((u64)ssi->nblocks << sbi->s_super->s_blocksize_bits);
244 nblock = ssi->nblocks;
245 crc_sum = le32_to_cpu(sum->ss_datasum);
246 ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
247 } else { /* only checks segment summary */
248 offset = sizeof(sum->ss_datasum) + sizeof(sum->ss_sumsum);
249 check_bytes = ssi->sumbytes;
250 nblock = ssi->nsumblk;
251 crc_sum = le32_to_cpu(sum->ss_sumsum);
252 ret = NILFS_SEG_FAIL_CHECKSUM_SEGSUM;
253 }
254 231
232 nblock = ssi->nblocks;
255 if (unlikely(nblock == 0 || 233 if (unlikely(nblock == 0 ||
256 nblock > sbi->s_nilfs->ns_blocks_per_segment)) { 234 nblock > sbi->s_nilfs->ns_blocks_per_segment)) {
257 /* This limits the number of blocks read in the CRC check */ 235 /* This limits the number of blocks read in the CRC check */
258 ret = NILFS_SEG_FAIL_CONSISTENCY; 236 ret = NILFS_SEG_FAIL_CONSISTENCY;
259 goto failed; 237 goto failed;
260 } 238 }
261 if (calc_crc_cont(sbi, bh_sum, &crc, offset, check_bytes, 239 if (calc_crc_cont(sbi, bh_sum, &crc, sizeof(sum->ss_datasum),
240 ((u64)nblock << sbi->s_super->s_blocksize_bits),
262 pseg_start, nblock)) { 241 pseg_start, nblock)) {
263 ret = NILFS_SEG_FAIL_IO; 242 ret = NILFS_SEG_FAIL_IO;
264 goto failed; 243 goto failed;
265 } 244 }
266 if (crc == crc_sum) 245 if (crc == le32_to_cpu(sum->ss_datasum))
267 ret = 0; 246 ret = 0;
247 else
248 ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
268 failed: 249 failed:
269 brelse(bh_sum); 250 brelse(bh_sum);
270 out: 251 out:
@@ -598,7 +579,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
598 579
599 while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { 580 while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
600 581
601 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); 582 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
602 if (ret) { 583 if (ret) {
603 if (ret == NILFS_SEG_FAIL_IO) { 584 if (ret == NILFS_SEG_FAIL_IO) {
604 err = -EIO; 585 err = -EIO;
@@ -821,7 +802,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
821 802
822 for (;;) { 803 for (;;) {
823 /* Load segment summary */ 804 /* Load segment summary */
824 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); 805 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
825 if (ret) { 806 if (ret) {
826 if (ret == NILFS_SEG_FAIL_IO) 807 if (ret == NILFS_SEG_FAIL_IO)
827 goto failed; 808 goto failed;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 645c78656aa0..6129a431aa34 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -32,7 +32,7 @@
32struct nilfs_write_info { 32struct nilfs_write_info {
33 struct the_nilfs *nilfs; 33 struct the_nilfs *nilfs;
34 struct bio *bio; 34 struct bio *bio;
35 int start, end; /* The region to be submitted */ 35 int start, end; /* The region to be submitted */
36 int rest_blocks; 36 int rest_blocks;
37 int max_pages; 37 int max_pages;
38 int nr_vecs; 38 int nr_vecs;
@@ -40,6 +40,11 @@ struct nilfs_write_info {
40}; 40};
41 41
42 42
43static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
44 struct the_nilfs *nilfs);
45static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
46
47
43static struct kmem_cache *nilfs_segbuf_cachep; 48static struct kmem_cache *nilfs_segbuf_cachep;
44 49
45static void nilfs_segbuf_init_once(void *obj) 50static void nilfs_segbuf_init_once(void *obj)
@@ -169,7 +174,7 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
169} 174}
170 175
171/* 176/*
172 * Setup segument summary 177 * Setup segment summary
173 */ 178 */
174void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf) 179void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
175{ 180{
@@ -302,17 +307,30 @@ void nilfs_truncate_logs(struct list_head *logs,
302 } 307 }
303} 308}
304 309
310int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs)
311{
312 struct nilfs_segment_buffer *segbuf;
313 int ret = 0;
314
315 list_for_each_entry(segbuf, logs, sb_list) {
316 ret = nilfs_segbuf_write(segbuf, nilfs);
317 if (ret)
318 break;
319 }
320 return ret;
321}
322
305int nilfs_wait_on_logs(struct list_head *logs) 323int nilfs_wait_on_logs(struct list_head *logs)
306{ 324{
307 struct nilfs_segment_buffer *segbuf; 325 struct nilfs_segment_buffer *segbuf;
308 int err; 326 int err, ret = 0;
309 327
310 list_for_each_entry(segbuf, logs, sb_list) { 328 list_for_each_entry(segbuf, logs, sb_list) {
311 err = nilfs_segbuf_wait(segbuf); 329 err = nilfs_segbuf_wait(segbuf);
312 if (err) 330 if (err && !ret)
313 return err; 331 ret = err;
314 } 332 }
315 return 0; 333 return ret;
316} 334}
317 335
318/* 336/*
@@ -452,8 +470,8 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
452 * 470 *
453 * %-ENOMEM - Insufficient memory available. 471 * %-ENOMEM - Insufficient memory available.
454 */ 472 */
455int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, 473static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
456 struct the_nilfs *nilfs) 474 struct the_nilfs *nilfs)
457{ 475{
458 struct nilfs_write_info wi; 476 struct nilfs_write_info wi;
459 struct buffer_head *bh; 477 struct buffer_head *bh;
@@ -496,7 +514,7 @@ int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
496 * 514 *
497 * %-EIO - I/O error 515 * %-EIO - I/O error
498 */ 516 */
499int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf) 517static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
500{ 518{
501 int err = 0; 519 int err = 0;
502 520
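nilfs_wait_on_logs() used to return at the first failed segment buffer, leaving the I/O on later buffers still in flight when the caller went on to tear them down; it now drains every buffer and reports the first error it saw, and submission gains the symmetric nilfs_write_logs(). The drain-all idiom in isolation (generic sketch; wait_for_one() is hypothetical):

	int err, ret = 0;

	list_for_each_entry(segbuf, logs, sb_list) {
		err = wait_for_one(segbuf);	/* hypothetical per-buffer wait */
		if (err && !ret)
			ret = err;		/* remember only the first error */
	}
	return ret;
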
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 6af1630fb401..94dfd3517bc0 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -166,13 +166,10 @@ nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf,
166 segbuf->sb_sum.nfileblk++; 166 segbuf->sb_sum.nfileblk++;
167} 167}
168 168
169int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
170 struct the_nilfs *nilfs);
171int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
172
173void nilfs_clear_logs(struct list_head *logs); 169void nilfs_clear_logs(struct list_head *logs);
174void nilfs_truncate_logs(struct list_head *logs, 170void nilfs_truncate_logs(struct list_head *logs,
175 struct nilfs_segment_buffer *last); 171 struct nilfs_segment_buffer *last);
172int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs);
176int nilfs_wait_on_logs(struct list_head *logs); 173int nilfs_wait_on_logs(struct list_head *logs);
177 174
178static inline void nilfs_destroy_logs(struct list_head *logs) 175static inline void nilfs_destroy_logs(struct list_head *logs)
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 105b508b47a8..c161d89061b5 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -141,7 +141,7 @@ int nilfs_init_transaction_cache(void)
141} 141}
142 142
143/** 143/**
144 * nilfs_detroy_transaction_cache - destroy the cache for transaction info 144 * nilfs_destroy_transaction_cache - destroy the cache for transaction info
145 * 145 *
146 * nilfs_destroy_transaction_cache() frees the slab cache for the struct 146 * nilfs_destroy_transaction_cache() frees the slab cache for the struct
147 * nilfs_transaction_info. 147 * nilfs_transaction_info.
@@ -201,7 +201,7 @@ static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
201 * This function allocates a nilfs_transaction_info struct to keep context 201 * This function allocates a nilfs_transaction_info struct to keep context
202 * information on it. It is initialized and hooked onto the current task in 202 * information on it. It is initialized and hooked onto the current task in
203 * the outermost call. If a pre-allocated struct is given to @ti, it is used 203 * the outermost call. If a pre-allocated struct is given to @ti, it is used
204 * instead; othewise a new struct is assigned from a slab. 204 * instead; otherwise a new struct is assigned from a slab.
205 * 205 *
206 * When @vacancy_check flag is set, this function will check the amount of 206 * When @vacancy_check flag is set, this function will check the amount of
207 * free space, and will wait for the GC to reclaim disk space if low capacity. 207 * free space, and will wait for the GC to reclaim disk space if low capacity.
@@ -1510,6 +1510,12 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1510 if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE) 1510 if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
1511 break; 1511 break;
1512 1512
1513 nilfs_clear_logs(&sci->sc_segbufs);
1514
1515 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1516 if (unlikely(err))
1517 return err;
1518
1513 if (sci->sc_stage.flags & NILFS_CF_SUFREED) { 1519 if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
1514 err = nilfs_sufile_cancel_freev(nilfs->ns_sufile, 1520 err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
1515 sci->sc_freesegs, 1521 sci->sc_freesegs,
@@ -1517,12 +1523,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1517 NULL); 1523 NULL);
1518 WARN_ON(err); /* do not happen */ 1524 WARN_ON(err); /* do not happen */
1519 } 1525 }
1520 nilfs_clear_logs(&sci->sc_segbufs);
1521
1522 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1523 if (unlikely(err))
1524 return err;
1525
1526 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); 1526 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
1527 sci->sc_stage = prev_stage; 1527 sci->sc_stage = prev_stage;
1528 } 1528 }
@@ -1764,14 +1764,9 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
1764static int nilfs_segctor_write(struct nilfs_sc_info *sci, 1764static int nilfs_segctor_write(struct nilfs_sc_info *sci,
1765 struct the_nilfs *nilfs) 1765 struct the_nilfs *nilfs)
1766{ 1766{
1767 struct nilfs_segment_buffer *segbuf; 1767 int ret;
1768 int ret = 0;
1769 1768
1770 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { 1769 ret = nilfs_write_logs(&sci->sc_segbufs, nilfs);
1771 ret = nilfs_segbuf_write(segbuf, nilfs);
1772 if (ret)
1773 break;
1774 }
1775 list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs); 1770 list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs);
1776 return ret; 1771 return ret;
1777} 1772}
@@ -1902,8 +1897,7 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
1902 1897
1903 list_splice_tail_init(&sci->sc_write_logs, &logs); 1898 list_splice_tail_init(&sci->sc_write_logs, &logs);
1904 ret = nilfs_wait_on_logs(&logs); 1899 ret = nilfs_wait_on_logs(&logs);
1905 if (ret) 1900 nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret ? : err);
1906 nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret);
1907 1901
1908 list_splice_tail_init(&sci->sc_segbufs, &logs); 1902 list_splice_tail_init(&sci->sc_segbufs, &logs);
1909 nilfs_cancel_segusage(&logs, nilfs->ns_sufile); 1903 nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
@@ -1937,8 +1931,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1937{ 1931{
1938 struct nilfs_segment_buffer *segbuf; 1932 struct nilfs_segment_buffer *segbuf;
1939 struct page *bd_page = NULL, *fs_page = NULL; 1933 struct page *bd_page = NULL, *fs_page = NULL;
1940 struct nilfs_sb_info *sbi = sci->sc_sbi; 1934 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
1941 struct the_nilfs *nilfs = sbi->s_nilfs;
1942 int update_sr = (sci->sc_super_root != NULL); 1935 int update_sr = (sci->sc_super_root != NULL);
1943 1936
1944 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { 1937 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
@@ -2020,7 +2013,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
2020 if (update_sr) { 2013 if (update_sr) {
2021 nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, 2014 nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
2022 segbuf->sb_sum.seg_seq, nilfs->ns_cno++); 2015 segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
2023 sbi->s_super->s_dirt = 1; 2016 set_nilfs_sb_dirty(nilfs);
2024 2017
2025 clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); 2018 clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
2026 clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); 2019 clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
@@ -2220,7 +2213,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2220} 2213}
2221 2214
2222/** 2215/**
2223 * nilfs_secgtor_start_timer - set timer of background write 2216 * nilfs_segctor_start_timer - set timer of background write
2224 * @sci: nilfs_sc_info 2217 * @sci: nilfs_sc_info
2225 * 2218 *
2226 * If the timer has already been set, it ignores the new request. 2219 * If the timer has already been set, it ignores the new request.
@@ -2425,43 +2418,43 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
2425 return err; 2418 return err;
2426} 2419}
2427 2420
2428struct nilfs_segctor_req {
2429 int mode;
2430 __u32 seq_accepted;
2431 int sc_err; /* construction failure */
2432 int sb_err; /* super block writeback failure */
2433};
2434
2435#define FLUSH_FILE_BIT (0x1) /* data file only */ 2421#define FLUSH_FILE_BIT (0x1) /* data file only */
2436#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */ 2422#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */
2437 2423
2438static void nilfs_segctor_accept(struct nilfs_sc_info *sci, 2424/**
2439 struct nilfs_segctor_req *req) 2425 * nilfs_segctor_accept - record accepted sequence count of log-write requests
2426 * @sci: segment constructor object
2427 */
2428static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
2440{ 2429{
2441 req->sc_err = req->sb_err = 0;
2442 spin_lock(&sci->sc_state_lock); 2430 spin_lock(&sci->sc_state_lock);
2443 req->seq_accepted = sci->sc_seq_request; 2431 sci->sc_seq_accepted = sci->sc_seq_request;
2444 spin_unlock(&sci->sc_state_lock); 2432 spin_unlock(&sci->sc_state_lock);
2445 2433
2446 if (sci->sc_timer) 2434 if (sci->sc_timer)
2447 del_timer_sync(sci->sc_timer); 2435 del_timer_sync(sci->sc_timer);
2448} 2436}
2449 2437
2450static void nilfs_segctor_notify(struct nilfs_sc_info *sci, 2438/**
2451 struct nilfs_segctor_req *req) 2439 * nilfs_segctor_notify - notify the result of request to caller threads
2440 * @sci: segment constructor object
2441 * @mode: mode of log forming
2442 * @err: error code to be notified
2443 */
2444static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
2452{ 2445{
2453 /* Clear requests (even when the construction failed) */ 2446 /* Clear requests (even when the construction failed) */
2454 spin_lock(&sci->sc_state_lock); 2447 spin_lock(&sci->sc_state_lock);
2455 2448
2456 if (req->mode == SC_LSEG_SR) { 2449 if (mode == SC_LSEG_SR) {
2457 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; 2450 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
2458 sci->sc_seq_done = req->seq_accepted; 2451 sci->sc_seq_done = sci->sc_seq_accepted;
2459 nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err); 2452 nilfs_segctor_wakeup(sci, err);
2460 sci->sc_flush_request = 0; 2453 sci->sc_flush_request = 0;
2461 } else { 2454 } else {
2462 if (req->mode == SC_FLUSH_FILE) 2455 if (mode == SC_FLUSH_FILE)
2463 sci->sc_flush_request &= ~FLUSH_FILE_BIT; 2456 sci->sc_flush_request &= ~FLUSH_FILE_BIT;
2464 else if (req->mode == SC_FLUSH_DAT) 2457 else if (mode == SC_FLUSH_DAT)
2465 sci->sc_flush_request &= ~FLUSH_DAT_BIT; 2458 sci->sc_flush_request &= ~FLUSH_DAT_BIT;
2466 2459
2467 /* re-enable timer if checkpoint creation was not done */ 2460 /* re-enable timer if checkpoint creation was not done */
@@ -2472,30 +2465,37 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci,
2472 spin_unlock(&sci->sc_state_lock); 2465 spin_unlock(&sci->sc_state_lock);
2473} 2466}
2474 2467
2475static int nilfs_segctor_construct(struct nilfs_sc_info *sci, 2468/**
2476 struct nilfs_segctor_req *req) 2469 * nilfs_segctor_construct - form logs and write them to disk
2470 * @sci: segment constructor object
2471 * @mode: mode of log forming
2472 */
2473static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
2477{ 2474{
2478 struct nilfs_sb_info *sbi = sci->sc_sbi; 2475 struct nilfs_sb_info *sbi = sci->sc_sbi;
2479 struct the_nilfs *nilfs = sbi->s_nilfs; 2476 struct the_nilfs *nilfs = sbi->s_nilfs;
2480 int err = 0; 2477 int err = 0;
2481 2478
2479 nilfs_segctor_accept(sci);
2480
2482 if (nilfs_discontinued(nilfs)) 2481 if (nilfs_discontinued(nilfs))
2483 req->mode = SC_LSEG_SR; 2482 mode = SC_LSEG_SR;
2484 if (!nilfs_segctor_confirm(sci)) { 2483 if (!nilfs_segctor_confirm(sci))
2485 err = nilfs_segctor_do_construct(sci, req->mode); 2484 err = nilfs_segctor_do_construct(sci, mode);
2486 req->sc_err = err; 2485
2487 }
2488 if (likely(!err)) { 2486 if (likely(!err)) {
2489 if (req->mode != SC_FLUSH_DAT) 2487 if (mode != SC_FLUSH_DAT)
2490 atomic_set(&nilfs->ns_ndirtyblks, 0); 2488 atomic_set(&nilfs->ns_ndirtyblks, 0);
2491 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && 2489 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
2492 nilfs_discontinued(nilfs)) { 2490 nilfs_discontinued(nilfs)) {
2493 down_write(&nilfs->ns_sem); 2491 down_write(&nilfs->ns_sem);
2494 req->sb_err = nilfs_commit_super(sbi, 2492 err = nilfs_commit_super(
2495 nilfs_altsb_need_update(nilfs)); 2493 sbi, nilfs_altsb_need_update(nilfs));
2496 up_write(&nilfs->ns_sem); 2494 up_write(&nilfs->ns_sem);
2497 } 2495 }
2498 } 2496 }
2497
2498 nilfs_segctor_notify(sci, mode, err);
2499 return err; 2499 return err;
2500} 2500}
2501 2501
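With accept and notify folded into nilfs_segctor_construct(), and the accepted sequence count moved into nilfs_sc_info as sc_seq_accepted, the on-stack struct nilfs_segctor_req disappears and every call site collapses to a single call, as the hunks below show. The new calling convention in brief (sketch):

	nilfs_transaction_lock(sbi, &ti, 0);
	err = nilfs_segctor_construct(sci, SC_LSEG_SR); /* accept+build+notify */
	nilfs_transaction_unlock(sbi);
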
@@ -2526,7 +2526,6 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2526 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2526 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2527 struct the_nilfs *nilfs = sbi->s_nilfs; 2527 struct the_nilfs *nilfs = sbi->s_nilfs;
2528 struct nilfs_transaction_info ti; 2528 struct nilfs_transaction_info ti;
2529 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2530 int err; 2529 int err;
2531 2530
2532 if (unlikely(!sci)) 2531 if (unlikely(!sci))
@@ -2547,10 +2546,8 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2547 list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes); 2546 list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes);
2548 2547
2549 for (;;) { 2548 for (;;) {
2550 nilfs_segctor_accept(sci, &req); 2549 err = nilfs_segctor_construct(sci, SC_LSEG_SR);
2551 err = nilfs_segctor_construct(sci, &req);
2552 nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes); 2550 nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes);
2553 nilfs_segctor_notify(sci, &req);
2554 2551
2555 if (likely(!err)) 2552 if (likely(!err))
2556 break; 2553 break;
@@ -2560,6 +2557,16 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2560 set_current_state(TASK_INTERRUPTIBLE); 2557 set_current_state(TASK_INTERRUPTIBLE);
2561 schedule_timeout(sci->sc_interval); 2558 schedule_timeout(sci->sc_interval);
2562 } 2559 }
2560 if (nilfs_test_opt(sbi, DISCARD)) {
2561 int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
2562 sci->sc_nfreesegs);
2563 if (ret) {
2564 printk(KERN_WARNING
2565 "NILFS warning: error %d on discard request, "
2566 "turning discards off for the device\n", ret);
2567 nilfs_clear_opt(sbi, DISCARD);
2568 }
2569 }
2563 2570
2564 out_unlock: 2571 out_unlock:
2565 sci->sc_freesegs = NULL; 2572 sci->sc_freesegs = NULL;
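When the new "discard" mount option is set (see the super.c hunks below), a successful cleaner pass now tells the device that the reclaimed segments are free, and the option is turned back off if the device rejects the request. nilfs_discard_segments(), whose definition begins at the end of this diff, would translate each segment into a sector range for the block layer; roughly (a sketch, with assumed variable names, and the DISCARD_FL_BARRIER flag name as of this kernel series):

	sector_t start = seg_start_block * sectors_per_block;
	sector_t nr_sects = seg_nblocks * sectors_per_block;

	ret = blkdev_issue_discard(nilfs->ns_bdev, start, nr_sects,
				   GFP_NOFS, DISCARD_FL_BARRIER);
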
@@ -2573,13 +2580,9 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
2573{ 2580{
2574 struct nilfs_sb_info *sbi = sci->sc_sbi; 2581 struct nilfs_sb_info *sbi = sci->sc_sbi;
2575 struct nilfs_transaction_info ti; 2582 struct nilfs_transaction_info ti;
2576 struct nilfs_segctor_req req = { .mode = mode };
2577 2583
2578 nilfs_transaction_lock(sbi, &ti, 0); 2584 nilfs_transaction_lock(sbi, &ti, 0);
2579 2585 nilfs_segctor_construct(sci, mode);
2580 nilfs_segctor_accept(sci, &req);
2581 nilfs_segctor_construct(sci, &req);
2582 nilfs_segctor_notify(sci, &req);
2583 2586
2584 /* 2587 /*
2585 * Unclosed segment should be retried. We do this using sc_timer. 2588 * Unclosed segment should be retried. We do this using sc_timer.
@@ -2635,6 +2638,7 @@ static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
2635static int nilfs_segctor_thread(void *arg) 2638static int nilfs_segctor_thread(void *arg)
2636{ 2639{
2637 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; 2640 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2641 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
2638 struct timer_list timer; 2642 struct timer_list timer;
2639 int timeout = 0; 2643 int timeout = 0;
2640 2644
@@ -2680,7 +2684,6 @@ static int nilfs_segctor_thread(void *arg)
2680 } else { 2684 } else {
2681 DEFINE_WAIT(wait); 2685 DEFINE_WAIT(wait);
2682 int should_sleep = 1; 2686 int should_sleep = 1;
2683 struct the_nilfs *nilfs;
2684 2687
2685 prepare_to_wait(&sci->sc_wait_daemon, &wait, 2688 prepare_to_wait(&sci->sc_wait_daemon, &wait,
2686 TASK_INTERRUPTIBLE); 2689 TASK_INTERRUPTIBLE);
@@ -2701,8 +2704,8 @@ static int nilfs_segctor_thread(void *arg)
2701 finish_wait(&sci->sc_wait_daemon, &wait); 2704 finish_wait(&sci->sc_wait_daemon, &wait);
2702 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && 2705 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2703 time_after_eq(jiffies, sci->sc_timer->expires)); 2706 time_after_eq(jiffies, sci->sc_timer->expires));
2704 nilfs = sci->sc_sbi->s_nilfs; 2707
2705 if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs)) 2708 if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs))
2706 set_nilfs_discontinued(nilfs); 2709 set_nilfs_discontinued(nilfs);
2707 } 2710 }
2708 goto loop; 2711 goto loop;
@@ -2797,12 +2800,9 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2797 do { 2800 do {
2798 struct nilfs_sb_info *sbi = sci->sc_sbi; 2801 struct nilfs_sb_info *sbi = sci->sc_sbi;
2799 struct nilfs_transaction_info ti; 2802 struct nilfs_transaction_info ti;
2800 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2801 2803
2802 nilfs_transaction_lock(sbi, &ti, 0); 2804 nilfs_transaction_lock(sbi, &ti, 0);
2803 nilfs_segctor_accept(sci, &req); 2805 ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
2804 ret = nilfs_segctor_construct(sci, &req);
2805 nilfs_segctor_notify(sci, &req);
2806 nilfs_transaction_unlock(sbi); 2806 nilfs_transaction_unlock(sbi);
2807 2807
2808 } while (ret && retrycount-- > 0); 2808 } while (ret && retrycount-- > 0);
@@ -2853,7 +2853,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2853 * @sbi: nilfs_sb_info 2853 * @sbi: nilfs_sb_info
2854 * 2854 *
2855 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info, 2855 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
2856 * initilizes it, and starts the segment constructor. 2856 * initializes it, and starts the segment constructor.
2857 * 2857 *
2858 * Return Value: On success, 0 is returned. On error, one of the following 2858 * Return Value: On success, 0 is returned. On error, one of the following
2859 * negative error code is returned. 2859 * negative error code is returned.
@@ -2865,8 +2865,15 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2865 struct the_nilfs *nilfs = sbi->s_nilfs; 2865 struct the_nilfs *nilfs = sbi->s_nilfs;
2866 int err; 2866 int err;
2867 2867
2868 /* Each field of nilfs_segctor is cleared through the initialization 2868 if (NILFS_SC(sbi)) {
2869 of super-block info */ 2869 /*
2870 * This happens if the filesystem was remounted
2871 * read/write after nilfs_error degenerated it into a
2872 * read-only mount.
2873 */
2874 nilfs_detach_segment_constructor(sbi);
2875 }
2876
2870 sbi->s_sc_info = nilfs_segctor_new(sbi); 2877 sbi->s_sc_info = nilfs_segctor_new(sbi);
2871 if (!sbi->s_sc_info) 2878 if (!sbi->s_sc_info)
2872 return -ENOMEM; 2879 return -ENOMEM;
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 3d3ab2f9864c..82dfd6a686b9 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -30,7 +30,7 @@
30#include "sb.h" 30#include "sb.h"
31 31
32/** 32/**
33 * struct nilfs_recovery_info - Recovery infomation 33 * struct nilfs_recovery_info - Recovery information
34 * @ri_need_recovery: Recovery status 34 * @ri_need_recovery: Recovery status
35 * @ri_super_root: Block number of the last super root 35 * @ri_super_root: Block number of the last super root
36 * @ri_ri_cno: Number of the last checkpoint 36 * @ri_ri_cno: Number of the last checkpoint
@@ -71,7 +71,7 @@ struct nilfs_recovery_info {
71 */ 71 */
72struct nilfs_cstage { 72struct nilfs_cstage {
73 int scnt; 73 int scnt;
74 unsigned flags; 74 unsigned flags;
75 struct nilfs_inode_info *dirty_file_ptr; 75 struct nilfs_inode_info *dirty_file_ptr;
76 struct nilfs_inode_info *gc_inode_ptr; 76 struct nilfs_inode_info *gc_inode_ptr;
77}; 77};
@@ -116,6 +116,7 @@ struct nilfs_segsum_pointer {
116 * @sc_wait_daemon: Daemon wait queue 116 * @sc_wait_daemon: Daemon wait queue
117 * @sc_wait_task: Start/end wait queue to control segctord task 117 * @sc_wait_task: Start/end wait queue to control segctord task
118 * @sc_seq_request: Request counter 118 * @sc_seq_request: Request counter
119 * @sc_seq_accept: Accepted request count
119 * @sc_seq_done: Completion counter 120 * @sc_seq_done: Completion counter
120 * @sc_sync: Request of explicit sync operation 121 * @sc_sync: Request of explicit sync operation
121 * @sc_interval: Timeout value of background construction 122 * @sc_interval: Timeout value of background construction
@@ -169,6 +170,7 @@ struct nilfs_sc_info {
169 wait_queue_head_t sc_wait_task; 170 wait_queue_head_t sc_wait_task;
170 171
171 __u32 sc_seq_request; 172 __u32 sc_seq_request;
173 __u32 sc_seq_accepted;
172 __u32 sc_seq_done; 174 __u32 sc_seq_done;
173 175
174 int sc_sync; 176 int sc_sync;
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index b6c36d0cc331..3c6cc6005c2e 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -18,7 +18,7 @@
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 * 19 *
20 * Written by Koji Sato <koji@osrg.net>. 20 * Written by Koji Sato <koji@osrg.net>.
21 * Rivised by Ryusuke Konishi <ryusuke@osrg.net>. 21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
22 */ 22 */
23 23
24#include <linux/kernel.h> 24#include <linux/kernel.h>
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 8173faee31e6..0cdbc5e7655a 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -96,9 +96,6 @@ void nilfs_error(struct super_block *sb, const char *function,
96 if (!(sb->s_flags & MS_RDONLY)) { 96 if (!(sb->s_flags & MS_RDONLY)) {
97 struct the_nilfs *nilfs = sbi->s_nilfs; 97 struct the_nilfs *nilfs = sbi->s_nilfs;
98 98
99 if (!nilfs_test_opt(sbi, ERRORS_CONT))
100 nilfs_detach_segment_constructor(sbi);
101
102 down_write(&nilfs->ns_sem); 99 down_write(&nilfs->ns_sem);
103 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { 100 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
104 nilfs->ns_mount_state |= NILFS_ERROR_FS; 101 nilfs->ns_mount_state |= NILFS_ERROR_FS;
@@ -301,7 +298,7 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
301 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); 298 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
302 nilfs->ns_sbwtime[1] = t; 299 nilfs->ns_sbwtime[1] = t;
303 } 300 }
304 sbi->s_super->s_dirt = 0; 301 clear_nilfs_sb_dirty(nilfs);
305 return nilfs_sync_super(sbi, dupsb); 302 return nilfs_sync_super(sbi, dupsb);
306} 303}
307 304
@@ -345,7 +342,7 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
345 err = nilfs_construct_segment(sb); 342 err = nilfs_construct_segment(sb);
346 343
347 down_write(&nilfs->ns_sem); 344 down_write(&nilfs->ns_sem);
348 if (sb->s_dirt) 345 if (nilfs_sb_dirty(nilfs))
349 nilfs_commit_super(sbi, 1); 346 nilfs_commit_super(sbi, 1);
350 up_write(&nilfs->ns_sem); 347 up_write(&nilfs->ns_sem);
351 348
@@ -439,7 +436,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
439 /* 436 /*
440 * Compute the overhead 437 * Compute the overhead
441 * 438 *
442 * When distributing meta data blocks outside semgent structure, 439 * When distributing meta data blocks outside segment structure,
443 * We must count them as the overhead. 440 * We must count them as the overhead.
444 */ 441 */
445 overhead = 0; 442 overhead = 0;
@@ -481,6 +478,8 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
481 seq_printf(seq, ",order=strict"); 478 seq_printf(seq, ",order=strict");
482 if (nilfs_test_opt(sbi, NORECOVERY)) 479 if (nilfs_test_opt(sbi, NORECOVERY))
483 seq_printf(seq, ",norecovery"); 480 seq_printf(seq, ",norecovery");
481 if (nilfs_test_opt(sbi, DISCARD))
482 seq_printf(seq, ",discard");
484 483
485 return 0; 484 return 0;
486} 485}
@@ -550,7 +549,7 @@ static const struct export_operations nilfs_export_ops = {
550enum { 549enum {
551 Opt_err_cont, Opt_err_panic, Opt_err_ro, 550 Opt_err_cont, Opt_err_panic, Opt_err_ro,
552 Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, 551 Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
553 Opt_err, 552 Opt_discard, Opt_err,
554}; 553};
555 554
556static match_table_t tokens = { 555static match_table_t tokens = {
@@ -561,6 +560,7 @@ static match_table_t tokens = {
561 {Opt_snapshot, "cp=%u"}, 560 {Opt_snapshot, "cp=%u"},
562 {Opt_order, "order=%s"}, 561 {Opt_order, "order=%s"},
563 {Opt_norecovery, "norecovery"}, 562 {Opt_norecovery, "norecovery"},
563 {Opt_discard, "discard"},
564 {Opt_err, NULL} 564 {Opt_err, NULL}
565}; 565};
566 566
@@ -614,6 +614,9 @@ static int parse_options(char *options, struct super_block *sb)
614 case Opt_norecovery: 614 case Opt_norecovery:
615 nilfs_set_opt(sbi, NORECOVERY); 615 nilfs_set_opt(sbi, NORECOVERY);
616 break; 616 break;
617 case Opt_discard:
618 nilfs_set_opt(sbi, DISCARD);
619 break;
617 default: 620 default:
618 printk(KERN_ERR 621 printk(KERN_ERR
619 "NILFS: Unrecognized mount option \"%s\"\n", p); 622 "NILFS: Unrecognized mount option \"%s\"\n", p);
@@ -863,7 +866,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
863 if ((*flags & MS_RDONLY) && 866 if ((*flags & MS_RDONLY) &&
864 sbi->s_snapshot_cno != old_opts.snapshot_cno) { 867 sbi->s_snapshot_cno != old_opts.snapshot_cno) {
865 printk(KERN_WARNING "NILFS (device %s): couldn't " 868 printk(KERN_WARNING "NILFS (device %s): couldn't "
866 "remount to a different snapshot. \n", 869 "remount to a different snapshot.\n",
867 sb->s_id); 870 sb->s_id);
868 err = -EINVAL; 871 err = -EINVAL;
869 goto restore_opts; 872 goto restore_opts;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 6241e1722efc..33871f7e4f01 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -386,7 +386,7 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
386 386
387 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); 387 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
388 if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { 388 if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
389 printk(KERN_ERR "NILFS: too short segment. \n"); 389 printk(KERN_ERR "NILFS: too short segment.\n");
390 return -EINVAL; 390 return -EINVAL;
391 } 391 }
392 392
@@ -646,6 +646,44 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
646 goto out; 646 goto out;
647} 647}
648 648
649int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
650 size_t nsegs)
651{
652 sector_t seg_start, seg_end;
653 sector_t start = 0, nblocks = 0;
654 unsigned int sects_per_block;
655 __u64 *sn;
656 int ret = 0;
657
658 sects_per_block = (1 << nilfs->ns_blocksize_bits) /
659 bdev_logical_block_size(nilfs->ns_bdev);
660 for (sn = segnump; sn < segnump + nsegs; sn++) {
661 nilfs_get_segment_range(nilfs, *sn, &seg_start, &seg_end);
662
663 if (!nblocks) {
664 start = seg_start;
665 nblocks = seg_end - seg_start + 1;
666 } else if (start + nblocks == seg_start) {
667 nblocks += seg_end - seg_start + 1;
668 } else {
669 ret = blkdev_issue_discard(nilfs->ns_bdev,
670 start * sects_per_block,
671 nblocks * sects_per_block,
672 GFP_NOFS,
673 DISCARD_FL_BARRIER);
674 if (ret < 0)
675 return ret;
676 nblocks = 0;
677 }
678 }
679 if (nblocks)
680 ret = blkdev_issue_discard(nilfs->ns_bdev,
681 start * sects_per_block,
682 nblocks * sects_per_block,
683 GFP_NOFS, DISCARD_FL_BARRIER);
684 return ret;
685}
686
649int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) 687int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
650{ 688{
651 struct inode *dat = nilfs_dat_inode(nilfs); 689 struct inode *dat = nilfs_dat_inode(nilfs);
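
Two details of nilfs_discard_segments() are worth noting: filesystem block addresses are converted to device sector units via sects_per_block, and runs of physically adjacent segments are coalesced so that one large discard is issued instead of many small ones. A hypothetical caller sketch, gated on the new DISCARD mount option (the wrapper name and call site are illustrative; the actual caller is not part of this hunk):

	static void nilfs_maybe_discard(struct nilfs_sb_info *sbi,
					__u64 *segnums, size_t nsegs)
	{
		if (nilfs_test_opt(sbi, DISCARD)) {
			int ret = nilfs_discard_segments(sbi->s_nilfs,
							 segnums, nsegs);
			if (ret < 0)
				printk(KERN_WARNING
				       "NILFS: failed to discard segments: %d\n",
				       ret);
		}
	}
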
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 589786e33464..e9795f1724d7 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -38,6 +38,7 @@ enum {
38 the latest checkpoint was loaded */ 38 the latest checkpoint was loaded */
39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ 39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
40 THE_NILFS_GC_RUNNING, /* gc process is running */ 40 THE_NILFS_GC_RUNNING, /* gc process is running */
41 THE_NILFS_SB_DIRTY, /* super block is dirty */
41}; 42};
42 43
43/** 44/**
@@ -197,6 +198,7 @@ THE_NILFS_FNS(INIT, init)
197THE_NILFS_FNS(LOADED, loaded) 198THE_NILFS_FNS(LOADED, loaded)
198THE_NILFS_FNS(DISCONTINUED, discontinued) 199THE_NILFS_FNS(DISCONTINUED, discontinued)
199THE_NILFS_FNS(GC_RUNNING, gc_running) 200THE_NILFS_FNS(GC_RUNNING, gc_running)
201THE_NILFS_FNS(SB_DIRTY, sb_dirty)
200 202
201/* Minimum interval of periodical update of superblocks (in seconds) */ 203/* Minimum interval of periodical update of superblocks (in seconds) */
202#define NILFS_SB_FREQ 10 204#define NILFS_SB_FREQ 10
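
The super.c hunks above rely on the accessors generated here: clear_nilfs_sb_dirty() and nilfs_sb_dirty() replace the VFS-level sb->s_dirt flag with a bit private to the_nilfs. Assuming THE_NILFS_FNS() follows the usual set_bit/clear_bit/test_bit pattern over a flags word (the ns_flags field name is an assumption), THE_NILFS_FNS(SB_DIRTY, sb_dirty) expands to roughly:

	static inline void set_nilfs_sb_dirty(struct the_nilfs *nilfs)
	{
		set_bit(THE_NILFS_SB_DIRTY, &nilfs->ns_flags);
	}

	static inline void clear_nilfs_sb_dirty(struct the_nilfs *nilfs)
	{
		clear_bit(THE_NILFS_SB_DIRTY, &nilfs->ns_flags);
	}

	static inline int nilfs_sb_dirty(struct the_nilfs *nilfs)
	{
		return test_bit(THE_NILFS_SB_DIRTY, &nilfs->ns_flags);
	}
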
@@ -221,6 +223,7 @@ struct the_nilfs *find_or_create_nilfs(struct block_device *);
221void put_nilfs(struct the_nilfs *); 223void put_nilfs(struct the_nilfs *);
222int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); 224int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
223int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); 225int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
226int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
224int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); 227int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
225struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64); 228struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
226int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); 229int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index a94e8bd8eb1f..472cdf29ef82 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -29,14 +29,12 @@
29#include <linux/init.h> /* module_init */ 29#include <linux/init.h> /* module_init */
30#include <linux/inotify.h> 30#include <linux/inotify.h>
31#include <linux/kernel.h> /* roundup() */ 31#include <linux/kernel.h> /* roundup() */
32#include <linux/magic.h> /* superblock magic number */
33#include <linux/mount.h> /* mntget */
34#include <linux/namei.h> /* LOOKUP_FOLLOW */ 32#include <linux/namei.h> /* LOOKUP_FOLLOW */
35#include <linux/path.h> /* struct path */
36#include <linux/sched.h> /* struct user */ 33#include <linux/sched.h> /* struct user */
37#include <linux/slab.h> /* struct kmem_cache */ 34#include <linux/slab.h> /* struct kmem_cache */
38#include <linux/syscalls.h> 35#include <linux/syscalls.h>
39#include <linux/types.h> 36#include <linux/types.h>
37#include <linux/anon_inodes.h>
40#include <linux/uaccess.h> 38#include <linux/uaccess.h>
41#include <linux/poll.h> 39#include <linux/poll.h>
42#include <linux/wait.h> 40#include <linux/wait.h>
@@ -45,8 +43,6 @@
45 43
46#include <asm/ioctls.h> 44#include <asm/ioctls.h>
47 45
48static struct vfsmount *inotify_mnt __read_mostly;
49
50/* these are configurable via /proc/sys/fs/inotify/ */ 46/* these are configurable via /proc/sys/fs/inotify/ */
51static int inotify_max_user_instances __read_mostly; 47static int inotify_max_user_instances __read_mostly;
52static int inotify_max_queued_events __read_mostly; 48static int inotify_max_queued_events __read_mostly;
@@ -645,9 +641,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
645{ 641{
646 struct fsnotify_group *group; 642 struct fsnotify_group *group;
647 struct user_struct *user; 643 struct user_struct *user;
648 struct file *filp; 644 int ret;
649 struct path path;
650 int fd, ret;
651 645
652 /* Check the IN_* constants for consistency. */ 646 /* Check the IN_* constants for consistency. */
653 BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC); 647 BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC);
@@ -656,10 +650,6 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
656 if (flags & ~(IN_CLOEXEC | IN_NONBLOCK)) 650 if (flags & ~(IN_CLOEXEC | IN_NONBLOCK))
657 return -EINVAL; 651 return -EINVAL;
658 652
659 fd = get_unused_fd_flags(flags & O_CLOEXEC);
660 if (fd < 0)
661 return fd;
662
663 user = get_current_user(); 653 user = get_current_user();
664 if (unlikely(atomic_read(&user->inotify_devs) >= 654 if (unlikely(atomic_read(&user->inotify_devs) >=
665 inotify_max_user_instances)) { 655 inotify_max_user_instances)) {
@@ -676,27 +666,14 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
676 666
677 atomic_inc(&user->inotify_devs); 667 atomic_inc(&user->inotify_devs);
678 668
679 path.mnt = inotify_mnt; 669 ret = anon_inode_getfd("inotify", &inotify_fops, group,
680 path.dentry = inotify_mnt->mnt_root; 670 O_RDONLY | flags);
681 path_get(&path); 671 if (ret >= 0)
682 filp = alloc_file(&path, FMODE_READ, &inotify_fops); 672 return ret;
683 if (!filp)
684 goto Enfile;
685 673
686 filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
687 filp->private_data = group;
688
689 fd_install(fd, filp);
690
691 return fd;
692
693Enfile:
694 ret = -ENFILE;
695 path_put(&path);
696 atomic_dec(&user->inotify_devs); 674 atomic_dec(&user->inotify_devs);
697out_free_uid: 675out_free_uid:
698 free_uid(user); 676 free_uid(user);
699 put_unused_fd(fd);
700 return ret; 677 return ret;
701} 678}
702 679
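
This conversion works because anon_inode_getfd() folds the removed get_unused_fd_flags()/alloc_file()/fd_install() sequence into one call: it allocates a struct file backed by a shared anonymous inode, stores the private pointer, reserves a descriptor and installs the file, returning the fd or a negative errno with nothing left to unwind. A minimal usage sketch (example_fops and example_create_fd are illustrative names, not inotify code):

	#include <linux/anon_inodes.h>
	#include <linux/fs.h>

	static const struct file_operations example_fops;	/* illustrative */

	static int example_create_fd(void *state, int flags)
	{
		/* On failure no fd was reserved and no file allocated, which
		 * is why the open-coded Enfile/put_unused_fd() error paths
		 * above could simply be deleted. */
		return anon_inode_getfd("[example]", &example_fops, state,
					O_RDONLY | flags);
	}
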
@@ -783,20 +760,6 @@ out:
783 return ret; 760 return ret;
784} 761}
785 762
786static int
787inotify_get_sb(struct file_system_type *fs_type, int flags,
788 const char *dev_name, void *data, struct vfsmount *mnt)
789{
790 return get_sb_pseudo(fs_type, "inotify", NULL,
791 INOTIFYFS_SUPER_MAGIC, mnt);
792}
793
794static struct file_system_type inotify_fs_type = {
795 .name = "inotifyfs",
796 .get_sb = inotify_get_sb,
797 .kill_sb = kill_anon_super,
798};
799
800/* 763/*
801 * inotify_user_setup - Our initialization function. Note that we cannot return 764
802 * error because we have compiled-in VFS hooks. So an (unlikely) failure here 765 * error because we have compiled-in VFS hooks. So an (unlikely) failure here
@@ -804,16 +767,6 @@ static struct file_system_type inotify_fs_type = {
804 */ 767 */
805static int __init inotify_user_setup(void) 768static int __init inotify_user_setup(void)
806{ 769{
807 int ret;
808
809 ret = register_filesystem(&inotify_fs_type);
810 if (unlikely(ret))
811 panic("inotify: register_filesystem returned %d!\n", ret);
812
813 inotify_mnt = kern_mount(&inotify_fs_type);
814 if (IS_ERR(inotify_mnt))
815 panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
816
817 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); 770 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
818 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); 771 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
819 772
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
deleted file mode 100644
index 37c11e194372..000000000000
--- a/fs/ntfs/ChangeLog
+++ /dev/null
@@ -1,1702 +0,0 @@
1ToDo/Notes:
2 - Find and fix bugs.
3 - The only places in the kernel where a file is resized are
4 ntfs_file_write*() and ntfs_truncate() for both of which i_mutex is
5 held. Just have to be careful in read-/writepage and other helpers
6 not running under i_mutex that we play nice. Also need to be careful
7 with initialized_size extension in ntfs_file_write*() and writepage.
8 UPDATE: The only things that need to be checked are the compressed
9 write and the other attribute resize/write cases like index
10 attributes, etc. For now none of these are implemented so are safe.
11 - Implement filling in of holes in aops.c::ntfs_writepage() and its
12 helpers.
13 - Implement mft.c::sync_mft_mirror_umount(). We currently will just
14 leave the volume dirty on umount if the final iput(vol->mft_ino)
15 causes a write of any mirrored mft records due to the mft mirror
16 inode having been discarded already. Whether this can actually ever
17 happen is unclear however so it is worth waiting until someone hits
18 the problem.
19
202.1.29 - Fix a deadlock at mount time.
21
22 - During mount the VFS holds s_umount lock on the superblock. So when
23 we try to empty the journal $LogFile contents by calling
24 ntfs_attr_set() when the machine does not have much memory and the
25 journal is large ntfs_attr_set() results in the VM trying to balance
26 dirty pages which in turn tries to take the s_umount lock and thus we
27 get a deadlock. The solution is to not use ntfs_attr_set() and
28 instead do the zeroing by hand at the block level rather than page
29 cache level.
30 - Fix sparse warnings.
31
322.1.28 - Fix a deadlock.
33
34 - Fix deadlock in fs/ntfs/inode.c::ntfs_put_inode(). Thanks to Sergey
35 Vlasov for the report and detailed analysis of the deadlock. The fix
36 involved getting rid of ntfs_put_inode() altogether and hence NTFS no
37 longer has a ->put_inode super operation.
38
392.1.27 - Various bug fixes and cleanups.
40
41 - Fix two compiler warnings on Alpha. Thanks to Andrew Morton for
42 reporting them.
43 - Fix an (innocent) off-by-one error in the runlist code.
44 - Fix a buggette in a "should be impossible" case handling where we
45 continued the attribute lookup loop instead of aborting it.
46 - Use buffer_migrate_page() for the ->migratepage function of all ntfs
47 address space operations.
48 - Fix comparison of $MFT and $MFTMirr to not bail out when there are
49 unused, invalid mft records which are the same in both $MFT and
50 $MFTMirr.
51 - Add support for sparse files which have a compression unit of 0.
52 - Remove all the make_bad_inode() calls. This should only be called
53 from read inode and new inode code paths.
54 - Limit name length in fs/ntfs/unistr.c::ntfs_nlstoucs() to maximum
55 allowed by NTFS, i.e. 255 Unicode characters, not including the
56 terminating NULL (which is not stored on disk).
57 - Improve comments on file attribute flags in fs/ntfs/layout.h.
58 - Fix a bug in fs/ntfs/inode.c::ntfs_read_locked_index_inode() where we
59 forgot to update a temporary variable so loading index inodes which
60 have an index allocation attribute failed.
61 - Add a missing call to flush_dcache_mft_record_page() in
62 fs/ntfs/inode.c::ntfs_write_inode().
63 - Handle the recently introduced -ENAMETOOLONG return value from
64 fs/ntfs/unistr.c::ntfs_nlstoucs() in fs/ntfs/namei.c::ntfs_lookup().
65 - Semaphore to mutex conversion. (Ingo Molnar)
66
672.1.26 - Minor bug fixes and updates.
68
69 - Fix a potential overflow in file.c where a cast to s64 was missing in
70 a left shift of a page index.
71 - The struct inode has had its i_sem semaphore changed to a mutex named
72 i_mutex.
73 - We have struct kmem_cache now so use it instead of the typedef
74 kmem_cache_t. (Pekka Enberg)
75 - Implement support for sector sizes above 512 bytes (up to the maximum
76 supported by NTFS which is 4096 bytes).
77 - Do more detailed reporting of why we cannot mount read-write by
78 special casing the VOLUME_MODIFIED_BY_CHKDSK flag.
79 - Miscellaneous updates to layout.h.
80 - Cope with attribute list attribute having invalid flags. Windows
81 copes with this and even chkdsk does not detect or fix this so we
82 have to cope with it, too. Thanks to Pawel Kot for reporting the
83 problem.
84
852.1.25 - (Almost) fully implement write(2) and truncate(2).
86
87 - Change ntfs_map_runlist_nolock(), ntfs_attr_find_vcn_nolock() and
88 {__,}ntfs_cluster_free() to also take an optional attribute search
89 context as argument. This allows calling these functions with the
90 mft record mapped. Update all callers.
91 - Fix potential deadlock in ntfs_mft_data_extend_allocation_nolock()
92 error handling by passing in the active search context when calling
93 ntfs_cluster_free().
94 - Change ntfs_cluster_alloc() to take an extra boolean parameter
95 specifying whether the cluster are being allocated to extend an
96 attribute or to fill a hole.
97 - Change ntfs_attr_make_non_resident() to call ntfs_cluster_alloc()
98 with @is_extension set to TRUE and remove the runlist terminator
99 fixup code as this is now done by ntfs_cluster_alloc().
100 - Change ntfs_attr_make_non_resident to take the attribute value size
101 as an extra parameter. This is needed since we need to know the size
102 before we can map the mft record and our callers always know it. The
103 reason we cannot simply read the size from the vfs inode i_size is
104 that this is not necessarily uptodate. This happens when
105 ntfs_attr_make_non_resident() is called in the ->truncate call path.
106 - Fix ntfs_attr_make_non_resident() to update the vfs inode i_blocks
107 which is zero for a resident attribute but should no longer be zero
108 once the attribute is non-resident as it then has real clusters
109 allocated.
110 - Add fs/ntfs/attrib.[hc]::ntfs_attr_extend_allocation(), a function to
111 extend the allocation of an attributes. Optionally, the data size,
112 but not the initialized size can be extended, too.
113 - Implement fs/ntfs/inode.[hc]::ntfs_truncate(). It only supports
114 uncompressed and unencrypted files and it never creates sparse files
115 at least for the moment (making a file sparse requires us to modify
116 its directory entries and we do not support directory operations at
117 the moment). Also, support for highly fragmented files, i.e. ones
118 whose data attribute is split across multiple extents, is severly
119 limited. When such a case is encountered, EOPNOTSUPP is returned.
120 - Enable ATTR_SIZE attribute changes in ntfs_setattr(). This completes
121 the initial implementation of file truncation. Now both open(2)ing
122 a file with the O_TRUNC flag and the {,f}truncate(2) system calls
123 will resize a file appropriately. The limitations are that only
124 uncompressed and unencrypted files are supported. Also, there is
125 only very limited support for highly fragmented files (the ones whose
126 $DATA attribute is split into multiple attribute extents).
127 - In attrib.c::ntfs_attr_set() call balance_dirty_pages_ratelimited()
128 and cond_resched() in the main loop as we could be dirtying a lot of
129 pages and this ensures we play nice with the VM and the system as a
130 whole.
131 - Implement file operations ->write, ->aio_write, ->writev for regular
132 files. This replaces the old use of generic_file_write(), et al and
133 the address space operations ->prepare_write and ->commit_write.
134 This means that both sparse and non-sparse (unencrypted and
135 uncompressed) files can now be extended using the normal write(2)
136 code path. There are two limitations at present and these are that
137 we never create sparse files and that we only have limited support
138 for highly fragmented files, i.e. ones whose data attribute is split
139 across multiple extents. When such a case is encountered,
140 EOPNOTSUPP is returned.
141 - $EA attributes can be both resident and non-resident.
142 - Use %z for size_t to fix compilation warnings. (Andrew Morton)
143 - Fix compilation warnings with gcc-4.0.2 on SUSE 10.0.
144 - Document extended attribute ($EA) NEED_EA flag. (Based on libntfs
145 patch by Yura Pakhuchiy.)
146
1472.1.24 - Lots of bug fixes and support more clean journal states.
148
149 - Support journals ($LogFile) which have been modified by chkdsk. This
150 means users can boot into Windows after we marked the volume dirty.
151 The Windows boot will run chkdsk and then reboot. The user can then
152 immediately boot into Linux rather than having to do a full Windows
153 boot first before rebooting into Linux and we will recognize such a
154 journal and empty it as it is clean by definition. Note, this only
155 works if chkdsk left the journal in an obviously clean state.
156 - Support journals ($LogFile) with only one restart page as well as
157 journals with two different restart pages. We sanity check both and
158 either use the only sane one or the more recent one of the two in the
159 case that both are valid.
160 - Add fs/ntfs/malloc.h::ntfs_malloc_nofs_nofail() which is analogous to
161 ntfs_malloc_nofs() but it performs allocations with __GFP_NOFAIL and
162 hence cannot fail.
163 - Use ntfs_malloc_nofs_nofail() in the two critical regions in
164 fs/ntfs/runlist.c::ntfs_runlists_merge(). This means we no longer
165 need to panic() if the allocation fails as it now cannot fail.
166 - Fix two nasty runlist merging bugs that had gone unnoticed so far.
167 Thanks to Stefano Picerno for the bug report.
168 - Remove two bogus BUG_ON()s from fs/ntfs/mft.c.
169 - Fix handling of valid but empty mapping pairs array in
170 fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress().
171 - Report unrepresentable inodes during ntfs_readdir() as KERN_WARNING
172 messages and include the inode number. Thanks to Yura Pakhuchiy for
173 pointing this out.
174 - Change ntfs_rl_truncate_nolock() to throw away the runlist if the new
175 length is zero.
176 - Add runlist.[hc]::ntfs_rl_punch_nolock() which punches a caller
177 specified hole into a runlist.
178 - Fix a bug in fs/ntfs/index.c::ntfs_index_lookup(). When the returned
179 index entry is in the index root, we forgot to set the @ir pointer in
180 the index context. Thanks to Yura Pakhuchiy for finding this bug.
181 - Remove bogus setting of PageError in ntfs_read_compressed_block().
182 - Add fs/ntfs/attrib.[hc]::ntfs_resident_attr_value_resize().
183 - Fix a bug in ntfs_map_runlist_nolock() where we forgot to protect
184 access to the allocated size in the ntfs inode with the size lock.
185 - Fix ntfs_attr_vcn_to_lcn_nolock() and ntfs_attr_find_vcn_nolock() to
186 return LCN_ENOENT when there is no runlist and the allocated size is
187 zero.
188 - Fix load_attribute_list() to handle the case of a NULL runlist.
189 - Fix handling of sparse attributes in ntfs_attr_make_non_resident().
190 - Add BUG() checks to ntfs_attr_make_non_resident() and ntfs_attr_set()
191 to ensure that these functions are never called for compressed or
192 encrypted attributes.
193 - Fix cluster (de)allocators to work when the runlist is NULL and more
194 importantly to take a locked runlist rather than them locking it
195 which leads to lock reversal.
196 - Truncate {a,c,m}time to the ntfs supported time granularity when
197 updating the times in the inode in ntfs_setattr().
198 - Fixup handling of sparse, compressed, and encrypted attributes in
199 fs/ntfs/inode.c::ntfs_read_locked_{,attr_,index_}inode(),
200 fs/ntfs/aops.c::ntfs_{read,write}page().
201 - Make ntfs_write_block() not instantiate sparse blocks if they contain
202 only zeroes.
203 - Optimize fs/ntfs/aops.c::ntfs_write_block() by extending the page
204 lock protection over the buffer submission for i/o which allows the
205 removal of the get_bh()/put_bh() pairs for each buffer.
206 - Fix fs/ntfs/aops.c::ntfs_{read,write}_block() to handle the case
207 where a concurrent truncate has truncated the runlist under our feet.
208 - Fix page_has_buffers()/page_buffers() handling in fs/ntfs/aops.c.
209 - In fs/ntfs/aops.c::ntfs_end_buffer_async_read(), use a bit spin lock
210 in the first buffer head instead of a driver global spin lock to
211 improve scalability.
212 - Minor fix to error handling and error message display in
213 fs/ntfs/aops.c::ntfs_prepare_nonresident_write().
214 - Change the mount options {u,f,d}mask to always parse the number as
215 an octal number to conform to how chmod(1) works, too. Thanks to
216 Giuseppe Bilotta and Horst von Brand for pointing out the errors of
217 my ways.
218 - Fix various bugs in the runlist merging code. (Based on libntfs
219 changes by Richard Russon.)
220 - Fix sparse warnings that have crept in over time.
221 - Change ntfs_cluster_free() to require a write locked runlist on entry
222 since we otherwise get into a lock reversal deadlock if a read locked
223 runlist is passed in. In the process also change it to take an ntfs
224 inode instead of a vfs inode as parameter.
225 - Fix the definition of the CHKD ntfs record magic. It had an off by
226 two error causing it to be CHKB instead of CHKD.
227 - Fix a stupid bug in __ntfs_bitmap_set_bits_in_run() which caused the
228 count to become negative and hence we had a wild memset() scribbling
229 all over the system's ram.
230
2312.1.23 - Implement extension of resident files and make writing safe as well as
232 many bug fixes, cleanups, and enhancements...
233
234 - Add printk rate limiting for ntfs_warning() and ntfs_error() when
235 compiled without debug. This avoids a possible denial of service
236 attack. Thanks to Carl-Daniel Hailfinger from SuSE for pointing this
237 out.
238 - Fix compilation warnings on ia64. (Randy Dunlap)
239 - Use i_size_{read,write}() instead of reading i_size by hand and cache
240 the value where appropriate.
241 - Add size_lock to the ntfs_inode structure. This is an rw spinlock
242 and it locks against access to the inode sizes. Note, ->size_lock
243 is also accessed from irq context so you must use the _irqsave and
244 _irqrestore lock and unlock functions, respectively. Protect all
245 accesses to allocated_size, initialized_size, and compressed_size.
246 - Minor optimization to fs/ntfs/super.c::ntfs_statfs() and its helpers.
247 - Implement extension of resident files in the regular file write code
248 paths (fs/ntfs/aops.c::ntfs_{prepare,commit}_write()). At present
249 this only works until the data attribute becomes too big for the mft
250 record after which we abort the write returning -EOPNOTSUPP from
251 ntfs_prepare_write().
252 - Add disable_sparse mount option together with a per volume sparse
253 enable bit which is set appropriately and a per inode sparse disable
254 bit which is preset on some system file inodes as appropriate.
255 - Enforce that sparse support is disabled on NTFS volumes pre 3.0.
256 - Fix a bug in fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress() in
257 the creation of the unmapped runlist element for the base attribute
258 extent.
259 - Split ntfs_map_runlist() into ntfs_map_runlist() and a non-locking
260 helper ntfs_map_runlist_nolock() which is used by ntfs_map_runlist().
261 This allows us to map runlist fragments with the runlist lock already
262 held without having to drop and reacquire it around the call. Adapt
263 all callers.
264 - Change ntfs_find_vcn() to ntfs_find_vcn_nolock() which takes a locked
265 runlist. This allows us to find runlist elements with the runlist
266 lock already held without having to drop and reacquire it around the
267 call. Adapt all callers.
268 - Change time to u64 in time.h::ntfs2utc() as it otherwise generates a
269 warning in the do_div() call on sparc32. Thanks to Meelis Roos for
270 the report and analysis of the warning.
271 - Fix a nasty runlist merge bug when merging two holes.
272 - Set the ntfs_inode->allocated_size to the real allocated size in the
273 mft record for resident attributes (fs/ntfs/inode.c).
274 - Small readability cleanup to use "a" instead of "ctx->attr"
275 everywhere (fs/ntfs/inode.c).
276 - Make fs/ntfs/namei.c::ntfs_get_{parent,dentry} static and move the
277 definition of ntfs_export_ops from fs/ntfs/super.c to namei.c. Also,
278 declare ntfs_export_ops in fs/ntfs/ntfs.h.
279 - Correct sparse file handling. The compressed values need to be
280 checked and set in the ntfs inode as done for compressed files and
281 the compressed size needs to be used for vfs inode->i_blocks instead
282 of the allocated size, again, as done for compressed files.
283 - Add AT_EA in addition to AT_DATA to whitelist for being allowed to be
284 non-resident in fs/ntfs/attrib.c::ntfs_attr_can_be_non_resident().
285 - Add fs/ntfs/attrib.c::ntfs_attr_vcn_to_lcn_nolock() used by the new
286 write code.
287 - Fix bug in fs/ntfs/attrib.c::ntfs_find_vcn_nolock() where after
288 dropping the read lock and taking the write lock we were not checking
289 whether someone else did not already do the work we wanted to do.
290 - Rename fs/ntfs/attrib.c::ntfs_find_vcn_nolock() to
291 ntfs_attr_find_vcn_nolock() and update all callers.
292 - Add fs/ntfs/attrib.[hc]::ntfs_attr_make_non_resident().
293 - Fix sign of various error return values to be negative in
294 fs/ntfs/lcnalloc.c.
295 - Modify ->readpage and ->writepage (fs/ntfs/aops.c) so they detect and
296 handle the case where an attribute is converted from resident to
297 non-resident by a concurrent file write.
298 - Remove checks for NULL before calling kfree() since kfree() does the
299 checking itself. (Jesper Juhl)
300 - Some utilities modify the boot sector but do not update the checksum.
301 Thus, relax the checking in fs/ntfs/super.c::is_boot_sector_ntfs() to
302 only emit a warning when the checksum is incorrect rather than
303 refusing the mount. Thanks to Bernd Casimir for pointing this
304 problem out.
305 - Update attribute definition handling.
306 - Add NTFS_MAX_CLUSTER_SIZE and NTFS_MAX_PAGES_PER_CLUSTER constants.
307 - Use NTFS_MAX_CLUSTER_SIZE in super.c instead of hard coding 0x10000.
308 - Use MAX_BUF_PER_PAGE instead of variable sized array allocation for
309 better code generation and one less sparse warning in fs/ntfs/aops.c.
310 - Remove spurious void pointer casts from fs/ntfs/. (Pekka Enberg)
311 - Use C99 style structure initialization after memory allocation where
312 possible (fs/ntfs/{attrib.c,index.c,super.c}). Thanks to Al Viro and
313 Pekka Enberg.
314 - Stamp the transaction log ($UsnJrnl), aka user space journal, if it
315 is active on the volume and we are mounting read-write or remounting
316 from read-only to read-write.
317 - Fix a bug in address space operations error recovery code paths where
318 if the runlist was not mapped at all and a mapping error occurred we
319 would leave the runlist locked on exit to the function so that the
320 next access to the same file would try to take the lock and deadlock.
321 - Detect the case when Windows has been suspended to disk on the volume
322 to be mounted and if this is the case do not allow (re)mounting
323 read-write. This is done by parsing hiberfil.sys if present.
324 - Fix several occurrences of a bug where we would perform 'var & ~const'
325 with a 64-bit variable and an int, i.e. 32-bit, constant. This causes
326 the higher order 32-bits of the 64-bit variable to be zeroed. To fix
327 this cast the 'const' to the same 64-bit type as 'var'.
328 - Change the runlist terminator of the newly allocated cluster(s) to
329 LCN_ENOENT in ntfs_attr_make_non_resident(). Otherwise the runlist
330 code gets confused.
331 - Add an extra parameter @last_vcn to ntfs_get_size_for_mapping_pairs()
332 and ntfs_mapping_pairs_build() to allow the runlist encoding to be
333 partial which is desirable when filling holes in sparse attributes.
334 Update all callers.
335 - Change ntfs_map_runlist_nolock() to only decompress the mapping pairs
336 if the requested vcn is inside it. Otherwise we get into problems
337 when we try to map an out of bounds vcn because we then try to map
338 the already mapped runlist fragment which causes
339 ntfs_mapping_pairs_decompress() to fail and return error. Update
340 ntfs_attr_find_vcn_nolock() accordingly.
341 - Fix a nasty deadlock that appeared in recent kernels.
342 The situation: VFS inode X on a mounted ntfs volume is dirty. For
343 same inode X, the ntfs_inode is dirty and thus corresponding on-disk
344 inode, i.e. mft record, which is in a dirty PAGE_CACHE_PAGE belonging
345 to the table of inodes, i.e. $MFT, inode 0.
346 What happens:
347 Process 1: sys_sync()/umount()/whatever... calls
348 __sync_single_inode() for $MFT -> do_writepages() -> write_page for
349 the dirty page containing the on-disk inode X, the page is now locked
350 -> ntfs_write_mst_block() which clears PageUptodate() on the page to
351 prevent anyone else getting hold of it whilst it does the write out.
352 This is necessary as the on-disk inode needs "fixups" applied before
353 the write to disk which are removed again after the write and
354 PageUptodate is then set again. It then analyses the page looking
355 for dirty on-disk inodes and when it finds one it calls
356 ntfs_may_write_mft_record() to see if it is safe to write this
357 on-disk inode. This then calls ilookup5() to check if the
358 corresponding VFS inode is in icache(). This in turn calls ifind()
359 which waits on the inode lock via wait_on_inode whilst holding the
360 global inode_lock.
361 Process 2: pdflush results in a call to __sync_single_inode for the
362 same VFS inode X on the ntfs volume. This locks the inode (I_LOCK)
363 then calls write-inode -> ntfs_write_inode -> map_mft_record() ->
364 read_cache_page() for the page (in page cache of table of inodes
365 $MFT, inode 0) containing the on-disk inode. This page has
366 PageUptodate() clear because of Process 1 (see above) so
367 read_cache_page() blocks when it tries to take the page lock for the
368 page so it can call ntfs_read_page().
369 Thus Process 1 is holding the page lock on the page containing the
370 on-disk inode X and it is waiting on the inode X to be unlocked in
371 ifind() so it can write the page out and then unlock the page.
372 And Process 2 is holding the inode lock on inode X and is waiting for
373 the page to be unlocked so it can call ntfs_readpage() or discover
374 that Process 1 set PageUptodate() again and use the page.
375 Thus we have a deadlock due to ifind() waiting on the inode lock.
376 The solution: The fix is to use the newly introduced
377 ilookup5_nowait() which does not wait on the inode's lock and hence
378 avoids the deadlock. This is safe as we do not care about the VFS
379 inode and only use the fact that it is in the VFS inode cache and the
380 fact that the vfs and ntfs inodes are one struct in memory to find
381 the ntfs inode in memory if present. Also, the ntfs inode has its
382 own locking so it does not matter if the vfs inode is locked.
383 - Fix bug in mft record writing where we forgot to set the device in
384 the buffers when mapping them after the VM had discarded them.
385 Thanks to Martin MOKREJŠ for the bug report.
386
3872.1.22 - Many bug and race fixes and error handling improvements.
388
389 - Improve error handling in fs/ntfs/inode.c::ntfs_truncate().
390 - Change fs/ntfs/inode.c::ntfs_truncate() to return an error code
391 instead of void and provide a helper ntfs_truncate_vfs() for the
392 vfs ->truncate method.
393 - Add a new ntfs inode flag NInoTruncateFailed() and modify
394 fs/ntfs/inode.c::ntfs_truncate() to set and clear it appropriately.
395 - Fix min_size and max_size definitions in ATTR_DEF structure in
396 fs/ntfs/layout.h to be signed.
397 - Add attribute definition handling helpers to fs/ntfs/attrib.[hc]:
398 ntfs_attr_size_bounds_check(), ntfs_attr_can_be_non_resident(), and
399 ntfs_attr_can_be_resident(), which in turn use the new private helper
400 ntfs_attr_find_in_attrdef().
401 - In fs/ntfs/aops.c::mark_ntfs_record_dirty(), take the
402 mapping->private_lock around the dirtying of the buffer heads
403 analogous to the way it is done in __set_page_dirty_buffers().
404 - Ensure the mft record size does not exceed the PAGE_CACHE_SIZE at
405 mount time as this cannot work with the current implementation.
406 - Check for location of attribute name and improve error handling in
407 general in fs/ntfs/inode.c::ntfs_read_locked_inode() and friends.
408 - In fs/ntfs/aops.c::ntfs_writepage(), if the page is fully outside
409 i_size, i.e. race with truncate, invalidate the buffers on the page
410 so that they become freeable and hence the page does not leak.
411 - Remove unused function fs/ntfs/runlist.c::ntfs_rl_merge(). (Adrian
412 Bunk)
413 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_find() that resulted in
414 a NULL pointer dereference in the error code path when a corrupt
415 attribute was found. (Thanks to Domen Puncer for the bug report.)
416 - Add MODULE_VERSION() to fs/ntfs/super.c.
417 - Make several functions and variables static. (Adrian Bunk)
418 - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() so it allocates
419 buffers for the page if they are not present and then marks the
420 buffers belonging to the ntfs record dirty. This causes the buffers
421 to become busy and hence they are safe from removal until the page
422 has been written out.
423 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find() in the
424 error handling code path that resulted in a BUG() due to trying to
425 unmap an extent mft record when the mapping of it had failed and it
426 thus was not mapped. (Thanks to Ken MacFerrin for the bug report.)
427 - Drop the runlist lock after the vcn has been read in
428 fs/ntfs/lcnalloc.c::__ntfs_cluster_free().
429 - Rewrite handling of multi sector transfer errors. We now do not set
430 PageError() when such errors are detected in the async i/o handler
431 fs/ntfs/aops.c::ntfs_end_buffer_async_read(). All users of mst
432 protected attributes now check the magic of each ntfs record as they
433 use it and act appropriately. This has the effect of making errors
434 granular per ntfs record rather than per page which solves the case
435 where we cannot access any of the ntfs records in a page when a
436 single one of them had an mst error. (Thanks to Ken MacFerrin for
437 the bug report.)
438 - Fix error handling in fs/ntfs/quota.c::ntfs_mark_quotas_out_of_date()
439 where we failed to release i_mutex on the $Quota/$Q attribute inode.
440 - Fix bug in handling of bad inodes in fs/ntfs/namei.c::ntfs_lookup().
441 - Add mapping of unmapped buffers to all remaining code paths, i.e.
442 fs/ntfs/aops.c::ntfs_write_mst_block(), mft.c::ntfs_sync_mft_mirror(),
443 and write_mft_record_nolock(). From now on we require that the
444 complete runlist for the mft mirror is always mapped into memory.
445 - Add creation of buffers to fs/ntfs/mft.c::ntfs_sync_mft_mirror().
446 - Improve error handling in fs/ntfs/aops.c::ntfs_{read,write}_block().
447 - Cleanup fs/ntfs/aops.c::ntfs_{read,write}page() since we know that a
448 resident attribute will be smaller than a page which makes the code
449 simpler. Also make the code more tolerant to concurrent ->truncate.
450
4512.1.21 - Fix some races and bugs, rewrite mft write code, add mft allocator.
452
453 - Implement extent mft record deallocation
454 fs/ntfs/mft.c::ntfs_extent_mft_record_free().
455 - Split runlist related functions off from attrib.[hc] to runlist.[hc].
456 - Add vol->mft_data_pos and initialize it at mount time.
457 - Rename init_runlist() to ntfs_init_runlist(), ntfs_vcn_to_lcn() to
458 ntfs_rl_vcn_to_lcn(), decompress_mapping_pairs() to
459 ntfs_mapping_pairs_decompress(), ntfs_merge_runlists() to
460 ntfs_runlists_merge() and adapt all callers.
461 - Add fs/ntfs/runlist.[hc]::ntfs_get_nr_significant_bytes(),
462 ntfs_get_size_for_mapping_pairs(), ntfs_write_significant_bytes(),
463 and ntfs_mapping_pairs_build(), adapted from libntfs.
464 - Make fs/ntfs/lcnalloc.c::ntfs_cluster_free_from_rl_nolock() not
465 static and add a declaration for it to lcnalloc.h.
466 - Add fs/ntfs/lcnalloc.h::ntfs_cluster_free_from_rl() which is a static
467 inline wrapper for ntfs_cluster_free_from_rl_nolock() which takes the
468 cluster bitmap lock for the duration of the call.
469 - Add fs/ntfs/attrib.[hc]::ntfs_attr_record_resize().
470 - Implement the equivalent of memset() for an ntfs attribute in
471 fs/ntfs/attrib.[hc]::ntfs_attr_set() and switch
472 fs/ntfs/logfile.c::ntfs_empty_logfile() to using it.
473 - Remove unnecessary casts from LCN_* constants.
474 - Implement fs/ntfs/runlist.c::ntfs_rl_truncate_nolock().
475 - Add MFT_RECORD_OLD as a copy of MFT_RECORD in fs/ntfs/layout.h and
476 change MFT_RECORD to contain the NTFS 3.1+ specific fields.
477 - Add a helper function fs/ntfs/aops.c::mark_ntfs_record_dirty() which
478 marks all buffers belonging to an ntfs record dirty, followed by
479 marking the page the ntfs record is in dirty and also marking the vfs
480 inode containing the ntfs record dirty (I_DIRTY_PAGES).
481 - Switch fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to using the
482 new helper fs/ntfs/aops.c::mark_ntfs_record_dirty() and remove the no
483 longer needed fs/ntfs/index.[hc]::__ntfs_index_entry_mark_dirty().
484 - Move ntfs_{un,}map_page() from ntfs.h to aops.h and fix resulting
485 include errors.
486 - Move the typedefs for runlist_element and runlist from types.h to
487 runlist.h and fix resulting include errors.
488 - Remove unused {__,}format_mft_record() from fs/ntfs/mft.c.
489 - Modify fs/ntfs/mft.c::__mark_mft_record_dirty() to use the helper
490 mark_ntfs_record_dirty() which also changes the behaviour in that we
491 now set the buffers belonging to the mft record dirty as well as the
492 page itself.
493 - Update fs/ntfs/mft.c::write_mft_record_nolock() and sync_mft_mirror()
494 to cope with the fact that there now are dirty buffers in mft pages.
495 - Update fs/ntfs/inode.c::ntfs_write_inode() to also use the helper
496 mark_ntfs_record_dirty() and thus to set the buffers belonging to the
497 mft record dirty as well as the page itself.
498 - Fix compiler warnings on x86-64 in fs/ntfs/dir.c. (Randy Dunlap,
499 slightly modified by me)
500 - Add fs/ntfs/mft.c::try_map_mft_record() which fails with -EALREADY if
501 the mft record is already locked and otherwise behaves the same way
502 as fs/ntfs/mft.c::map_mft_record().
503 - Modify fs/ntfs/mft.c::write_mft_record_nolock() so that it only
504 writes the mft record if the buffers belonging to it are dirty.
505 Otherwise we assume that it was written out by other means already.
506 - Attempting to write outside initialized size is _not_ a bug so remove
507 the bug check from fs/ntfs/aops.c::ntfs_write_mst_block(). It is in
508 fact required to write outside initialized size when preparing to
509 extend the initialized size.
510 - Map the page instead of using page_address() before writing to it in
511 fs/ntfs/aops.c::ntfs_mft_writepage().
512 - Provide exclusion between opening an inode / mapping an mft record
513 and accessing the mft record in fs/ntfs/mft.c::ntfs_mft_writepage()
514 by setting the page not uptodate throughout ntfs_mft_writepage().
515 - Clear the page uptodate flag in fs/ntfs/aops.c::ntfs_write_mst_block()
516 to ensure no one can see the page whilst the mst fixups are applied.
517 - Add the helper fs/ntfs/mft.c::ntfs_may_write_mft_record() which
518 checks if an mft record may be written out safely obtaining any
519 necessary locks in the process. This is used by
520 fs/ntfs/aops.c::ntfs_write_mst_block().
521 - Modify fs/ntfs/aops.c::ntfs_write_mst_block() to also work for
522 writing mft records and improve its error handling in the process.
523 Now if any of the records in the page fail to be written out, all
524 other records will be written out instead of aborting completely.
525 - Remove ntfs_mft_aops and update all users to use ntfs_mst_aops.
526 - Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to set the
527 ntfs_mst_aops for all inodes which are NInoMstProtected() and
528 ntfs_aops for all other inodes.
529 - Rename fs/ntfs/mft.c::sync_mft_mirror{,_umount}() to
530 ntfs_sync_mft_mirror{,_umount}() and change their parameters so they
531 no longer require an ntfs inode to be present. Update all callers.
532 - Cleanup the error handling in fs/ntfs/mft.c::ntfs_sync_mft_mirror().
533 - Clear the page uptodate flag in fs/ntfs/mft.c::ntfs_sync_mft_mirror()
534 to ensure no one can see the page whilst the mst fixups are applied.
535 - Remove the no longer needed fs/ntfs/mft.c::ntfs_mft_writepage() and
536 fs/ntfs/mft.c::try_map_mft_record().
537 - Fix callers of fs/ntfs/aops.c::mark_ntfs_record_dirty() to call it
538 with the ntfs inode which contains the page rather than the ntfs
539 inode the mft record of which is in the page.
540 - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by moving the
541 index inode bitmap inode release code from there to
542 fs/ntfs/inode.c::ntfs_clear_big_inode(). (Thanks to Christoph
543 Hellwig for spotting this.)
544 - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by taking the
545 inode semaphore around the code that sets ni->itype.index.bmp_ino to
546 NULL and reorganize the code to optimize it a bit. (Thanks to
547 Christoph Hellwig for spotting this.)
548 - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() to no longer take the
549 ntfs inode as a parameter as this is confusing and misleading and the
550 needed ntfs inode is available via NTFS_I(page->mapping->host).
551 Adapt all callers to this change.
552 - Modify fs/ntfs/mft.c::write_mft_record_nolock() and
553 fs/ntfs/aops.c::ntfs_write_mst_block() to only check the dirty state
554 of the first buffer in a record and to take this as the ntfs record
555 dirty state. We cannot look at the dirty state for subsequent
556 buffers because we might be racing with
557 fs/ntfs/aops.c::mark_ntfs_record_dirty().
558 - Move the static inline ntfs_init_big_inode() from fs/ntfs/inode.c to
559 inode.h and make fs/ntfs/inode.c::__ntfs_init_inode() non-static and
560 add a declaration for it to inode.h. Fix some compilation issues
561 that resulted due to #includes and header file interdependencies.
562 - Simplify setup of i_mode in fs/ntfs/inode.c::ntfs_read_locked_inode().
563 - Add helpers fs/ntfs/layout.h::MK_MREF() and MK_LE_MREF().
564 - Modify fs/ntfs/mft.c::map_extent_mft_record() to only verify the mft
565 record sequence number if it is specified (i.e. not zero).
566 - Add fs/ntfs/mft.[hc]::ntfs_mft_record_alloc() and various helper
567 functions used by it.
568 - Update Documentation/filesystems/ntfs.txt with instructions on how to
569 use the Device-Mapper driver with NTFS ftdisk/LDM raid. This removes
570 the linear raid problem with the Software RAID / MD driver when one
571 or more of the devices has an odd number of sectors.
572
5732.1.20 - Fix two stupid bugs introduced in 2.1.18 release.
574
575 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_reinit_search_ctx()
576 where we did not clear ctx->al_entry but it was still set due to
577 changes in ntfs_attr_lookup() and ntfs_external_attr_find() in
578 particular.
579 - Fix another stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find()
580 where we forgot to unmap the extent mft record when we had finished
581 enumerating an attribute which caused a bug check to trigger when the
582 VFS calls ->clear_inode.
583
5842.1.19 - Many cleanups, improvements, and a minor bug fix.
585
586 - Update ->setattr (fs/ntfs/inode.c::ntfs_setattr()) to refuse to
587 change the uid, gid, and mode of an inode as we do not support NTFS
588 ACLs yet.
589 - Remove BKL use from ntfs_setattr() syncing up with the rest of the
590 kernel.
591 - Get rid of the ugly transparent union in fs/ntfs/dir.c::ntfs_readdir()
592 and ntfs_filldir() as per suggestion from Al Viro.
593 - Change '\0' and L'\0' to simply 0 as per advice from Linus Torvalds.
594 - Update ->truncate (fs/ntfs/inode.c::ntfs_truncate()) to check if the
595 inode size has changed and to only output an error if so.
596 - Rename fs/ntfs/attrib.h::attribute_value_length() to ntfs_attr_size().
597 - Add le{16,32,64} as well as sle{16,32,64} data types to
598 fs/ntfs/types.h.
599 - Change ntfschar to be le16 instead of u16 in fs/ntfs/types.h.
600 - Add le versions of VCN, LCN, and LSN called leVCN, leLCN, and leLSN,
601 respectively, to fs/ntfs/types.h.
602 - Update endianness conversion macros in fs/ntfs/endian.h to use the
603 new types as appropriate.
604 - Do proper type casting when using sle64_to_cpup() in fs/ntfs/dir.c
605 and index.c.
606 - Add leMFT_REF data type to fs/ntfs/layout.h.
607 - Update all NTFS header files with the new little endian data types.
608 Affected files are fs/ntfs/layout.h, logfile.h, and time.h.
609 - Do proper type casting when using ntfs_is_*_recordp() in
610 fs/ntfs/logfile.c, mft.c, and super.c.
611 - Fix all the sparse bitwise warnings. Had to change all the typedef
612 enums storing little endian values to simple enums plus a typedef for
613 the datatype to make sparse happy.
614 - Fix a bug found by the new sparse bitwise warnings where the default
615 upcase table was defined as a pointer to wchar_t rather than ntfschar
616 in fs/ntfs/ntfs.h and super.c.
617 - Change {const_,}cpu_to_le{16,32}(0) to just 0 as suggested by Al Viro.
618
6192.1.18 - Fix scheduling latencies at mount time as well as an endianness bug.
620
621 - Remove vol->nr_mft_records as it was pretty meaningless and optimize
622 the calculation of total/free inodes as used by statfs().
623 - Fix scheduling latencies in ntfs_fill_super() by dropping the BKL
624 because the code itself is using the ntfs_lock semaphore which
625 provides safe locking. (Ingo Molnar)
626 - Fix a potential bug in fs/ntfs/mft.c::map_extent_mft_record() that
627 could occur in the future for when we start closing/freeing extent
628 inodes if we don't set base_ni->ext.extent_ntfs_inos to NULL after
629 we free it.
630 - Rename {find,lookup}_attr() to ntfs_attr_{find,lookup}() as well as
631 find_external_attr() to ntfs_external_attr_find() to cleanup the
632 namespace a bit and to be more consistent with libntfs.
633 - Rename {{re,}init,get,put}_attr_search_ctx() to
634 ntfs_attr_{{re,}init,get,put}_search_ctx() as well as the type
635 attr_search_context to ntfs_attr_search_ctx.
636 - Force use of ntfs_attr_find() in ntfs_attr_lookup() when searching
637 for the attribute list attribute itself.
638 - Fix endianness bug in ntfs_external_attr_find().
639 - Change ntfs_{external_,}attr_find() to return 0 on success, -ENOENT
640 if the attribute is not found, and -EIO on real error. In the case
641 of -ENOENT, the search context is updated to describe the attribute
642 before which the attribute being searched for would need to be
643 inserted if such an action were to be desired and in the case of
644 ntfs_external_attr_find() the search context is also updated to
645 indicate the attribute list entry before which the attribute list
646 entry of the attribute being searched for would need to be inserted
647 if such an action were to be desired. Also make ntfs_find_attr()
648 static and remove its prototype from attrib.h as it is not used
649 anywhere other than attrib.c. Update ntfs_attr_lookup() and all
650 callers of ntfs_{external,}attr_{find,lookup}() for the new return
651 values.
652 - Minor cleanup of fs/ntfs/inode.c::ntfs_init_locked_inode().
653
6542.1.17 - Fix bugs in mount time error code paths and other updates.
655
656 - Implement bitmap modification code (fs/ntfs/bitmap.[hc]). This
657 includes functions to set/clear a single bit or a run of bits.
658 - Add fs/ntfs/attrib.[hc]::ntfs_find_vcn() which returns the locked
659 runlist element containing a particular vcn. It also takes care of
660 mapping any needed runlist fragments.
661 - Implement cluster (de-)allocation code (fs/ntfs/lcnalloc.[hc]).
662 - Load attribute definition table from $AttrDef at mount time.
663 - Fix bugs in mount time error code paths involving (de)allocation of
664 the default and volume upcase tables.
665 - Remove ntfs_nr_mounts as it is no longer used.
666
6672.1.16 - Implement access time updates, file sync, async io, and read/writev.
668
669 - Add support for readv/writev and aio_read/aio_write (fs/ntfs/file.c).
670 This is done by setting the appropriate file operations pointers to
671 the generic helper functions provided by mm/filemap.c.
672 - Implement fsync, fdatasync, and msync both for files (fs/ntfs/file.c)
673 and directories (fs/ntfs/dir.c).
674 - Add support for {a,m,c}time updates to inode.c::ntfs_write_inode().
675 Note, except for the root directory and any other system files opened
676 by the user, the system files will not have their access times
677 updated as they are only accessed at the inode level and hence the
678 file level functions which cause the times to be updated are never
679 invoked.
680
6812.1.15 - Invalidate quotas when (re)mounting read-write.
682
683 - Add new element itype.index.collation_rule to the ntfs inode
684 structure and set it appropriately in ntfs_read_locked_inode().
685 - Implement a new inode type "index" to allow efficient access to the
686 indices found in various system files and adapt inode handling
687 accordingly (fs/ntfs/inode.[hc]). An index inode is essentially an
688 attribute inode (NInoAttr() is true) with an attribute type of
689 AT_INDEX_ALLOCATION. As such, it is no longer allowed to call
690 ntfs_attr_iget() with an attribute type of AT_INDEX_ALLOCATION as
691 there would be no way to distinguish between normal attribute inodes
692 and index inodes. The function to obtain an index inode is
693 ntfs_index_iget() and it uses the helper function
694 ntfs_read_locked_index_inode(). Note, we do not overload
695 ntfs_attr_iget() as indices consist of multiple attributes so using
696 ntfs_attr_iget() to obtain an index inode would be confusing.
697 - Ensure that there is no overflow when doing page->index <<
698 PAGE_CACHE_SHIFT by casting page->index to s64 in fs/ntfs/aops.c.
699 - Use atomic kmap instead of kmap() in fs/ntfs/aops.c::ntfs_read_page()
700 and ntfs_read_block().
701 - Use case sensitive attribute lookups instead of case insensitive ones.
702 - Lock all page cache pages belonging to mst protected attributes while
703 accessing them to ensure we never see corrupt data while the page is
704 under writeout.
705 - Add framework for generic ntfs collation (fs/ntfs/collation.[hc]).
706 We have ntfs_is_collation_rule_supported() to check if the collation
707 rule you want to use is supported and ntfs_collation() which actually
708 collates two data items. We currently only support COLLATION_BINARY
709 and COLLATION_NTOFS_ULONG but support for other collation rules will
710 be added as the need arises.
711 - Add a new type, ntfs_index_context, to allow retrieval of an index
712 entry using the corresponding index key. To get an index context,
713 use ntfs_index_ctx_get() and to release it, use ntfs_index_ctx_put().
714 This also adds a new slab cache for the index contexts. To lookup a
715 key in an index inode, use ntfs_index_lookup(). After modifying an
716 index entry, call ntfs_index_entry_flush_dcache_page() followed by
717 ntfs_index_entry_mark_dirty() to ensure the changes are written out
718 to disk. For details see fs/ntfs/index.[hc]. Note, at present, if
719 an index entry is in the index allocation attribute rather than the
720 index root attribute it will not be written out (you will get a
721 warning message about discarded changes instead).
722 - Load the quota file ($Quota) and check if quota tracking is enabled
723 and if so, mark the quotas out of date. This causes windows to
724 rescan the volume on boot and update all quota entries.
725 - Add a set_page_dirty address space operation for ntfs_m[fs]t_aops.
726 It is simply set to __set_page_dirty_nobuffers() to make sure that
727 running set_page_dirty() on a page containing mft/ntfs records will
728 not affect the dirty state of the page buffers.
729 - Add fs/ntfs/index.c::__ntfs_index_entry_mark_dirty() which sets all
730 buffers that are inside the ntfs record in the page dirty after which
731 it sets the page dirty. This allows ->writepage to only write the
732 dirty index records rather than having to write all the records in
733 the page. Modify fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to
734 use this rather than __set_page_dirty_nobuffers().
735 - Implement fs/ntfs/aops.c::ntfs_write_mst_block() which enables the
736 writing of page cache pages belonging to mst protected attributes
737 like the index allocation attribute in directory indices and other
738 indices like $Quota/$Q, etc. This means that the quota is now marked
739 out of date on all volumes rather than only on ones where the quota
740 defaults entry is in the index root attribute of the $Quota/$Q index.
741
742 2.1.14 - Fix an NFSd-caused deadlock reported by several users.
743
744 - Modify fs/ntfs/dir.c::ntfs_readdir() to copy the index root attribute value
745 to a buffer so that we can put the search context and unmap the mft
746 record before calling the filldir() callback. We need to do this
747 because of NFSd, which calls ->lookup() from its filldir() callback,
748 and this caused NTFS to deadlock: ntfs_lookup() maps the mft record
749 of the directory, but ntfs_readdir() already has it mapped, so
750 ntfs_lookup() deadlocks. The shape of the fix is sketched below.
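
        Roughly, with hypothetical variable names (the real code is in
        fs/ntfs/dir.c::ntfs_readdir()):

            /* Copy the index root value into a buffer we own... */
            ir = ntfs_malloc_nofs(ir_size);
            if (unlikely(!ir)) {
                    err = -ENOMEM;
                    goto err_out;
            }
            memcpy(ir, index_root_value, ir_size);
            /* ...so the search context and mft record can go... */
            ntfs_attr_put_search_ctx(ctx);
            unmap_mft_record(ndir);
            /*
             * ...before filldir() is called, which may re-enter the
             * filesystem via ->lookup() (as NFSd does) and map the
             * same mft record again without deadlocking.
             */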
751
7522.1.13 - Enable overwriting of resident files and housekeeping of system files.
753
754 - Implement writing of mft records (fs/ntfs/mft.[hc]), which includes
755 keeping the mft mirror in sync with the mft when mirrored mft records
756 are written. The functions are write_mft_record{,_nolock}(). The
757 implementation is quite rudimentary for now with lots of things not
758 implemented yet but I am not sure any of them can actually occur so
759 I will wait for people to hit each one and only then implement it.
760 - Commit open system inodes at umount time. This should make it
761 virtually impossible for sync_mft_mirror_umount() to ever be needed.
762 - Implement ->write_inode (fs/ntfs/inode.c::ntfs_write_inode()) for the
763 ntfs super operations. This gives us inode writing via the VFS inode
764 dirty code paths. Note: Access time updates are not implemented yet.
765 - Implement fs/ntfs/mft.[hc]::{,__}mark_mft_record_dirty() and make
766 fs/ntfs/aops.c::ntfs_writepage() and ntfs_commit_write() use it, thus
767 finally enabling resident file overwrite! (-8 This also includes a
768 placeholder for ->writepage (ntfs_mft_writepage()), which for now
769 just redirties the page and returns. Also, at umount time, we for
770 now throw away all mft data page cache pages after the last call to
771 ntfs_commit_inode() in the hope that all inodes will have been
772 written out by then and hence no dirty (meta)data will be lost. We
773 also check for this case and emit an error message telling the user
774 to run chkdsk.
775 - Use set_page_writeback() and end_page_writeback() in the resident
776 attribute code path of fs/ntfs/aops.c::ntfs_writepage(), otherwise
777 the radix-tree tag PAGECACHE_TAG_DIRTY remains set even though the
778 page is clean. (This path is sketched at the end of this entry.)
779 - Implement ntfs_mft_writepage() so it now checks if any of the mft
780 records in the page are dirty and if so redirties the page and
781 returns. Otherwise it just returns (after doing set_page_writeback(),
782 unlock_page(), and end_page_writeback(), otherwise the radix-tree tag
783 PAGECACHE_TAG_DIRTY would remain set even though the page is clean),
784 thus allowing the VM to do with the page as it pleases. Also, at umount
785 time, now only throw away dirty mft (meta)data pages if dirty inodes
786 are present and ask the user to email us if they see this happening.
787 - Add functions ntfs_{clear,set}_volume_flags(), to modify the volume
788 information flags (fs/ntfs/super.c).
789 - Mark the volume dirty when (re)mounting read-write and mark it clean
790 when unmounting or remounting read-only. If any volume errors are
791 found, the volume is left marked dirty to force chkdsk to run.
792 - Add code to set the NT4 compatibility flag when (re)mounting
793 read-write for newer NTFS versions but leave it commented out for now
794 since we do not make any modifications that are NTFS 1.2 specific yet
795 and since setting this flag breaks Captive-NTFS which is not nice.
796 This code must be enabled once we start writing NTFS 1.2 specific
797 changes, otherwise the Windows NTFS driver might crash / cause corruption.
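
        The resident code path in ntfs_writepage() now brackets the
        copy with the writeback calls, roughly (a simplified sketch;
        mapping of the mft record and error handling are elided):

            BUG_ON(PageWriteback(page));
            set_page_writeback(page);
            unlock_page(page);
            /* Copy the page into the resident attribute value here. */
            end_page_writeback(page);
            /* The radix-tree tag PAGECACHE_TAG_DIRTY is now clear. */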
798
7992.1.12 - Fix the second fix to the decompression engine and some cleanups.
800
801 - Add a new address space operations struct, ntfs_mst_aops, for mst
802 protected attributes. This is because the default ntfs_aops do not
803 make sense with mst protected data and were they to write anything to
804 such an attribute they would cause data corruption so we provide
805 ntfs_mst_aops which does not have any write related operations set.
806 - Cleanup dirty ntfs inode handling (fs/ntfs/inode.[hc]) which also
807 includes an adapted ntfs_commit_inode() and an implementation of
808 ntfs_write_inode() which for now just cleans dirty inodes without
809 writing them (it does emit a warning that this is happening).
810 - Undo the second decompression engine fix (see 2.1.9 release ChangeLog
811 entry) as it was only fixing a theoretical bug but at the same time
812 it badly broke the handling of sparse and uncompressed compression
813 blocks.
814
8152.1.11 - Driver internal cleanups.
816
817 - Only build logfile.o if building the driver with read-write support.
818 - Really final white space cleanups.
819 - Use generic_ffs() instead of ffs() in logfile.c which allows the
820 log_page_size variable to be optimized by gcc into a constant.
821 - Rename uchar_t to ntfschar everywhere as uchar_t is an unsigned
822 1-byte char as defined by POSIX and as found on some systems.
823
8242.1.10 - Force read-only (re)mounting of volumes with unsupported volume flags.
825
826 - Finish off the white space cleanups (remove trailing spaces, etc).
827 - Clean up ntfs_fill_super() and ntfs_read_inode_mount() by removing
828 the kludges around the first iget(). Instead of (re)setting ->s_op
829 we have the $MFT inode set up by explicit new_inode() / set ->i_ino /
830 insert_inode_hash() / call ntfs_read_inode_mount() directly. This
831 kills the need for a second super_operations and allows returning
832 an error from ntfs_read_inode_mount() without resorting to ugly
833 "poisoning" tricks. (Al Viro)
834 - Force read-only (re)mounting if any of the following bits are set in
835 the volume information flags:
836 VOLUME_IS_DIRTY, VOLUME_RESIZE_LOG_FILE,
837 VOLUME_UPGRADE_ON_MOUNT, VOLUME_DELETE_USN_UNDERWAY,
838 VOLUME_REPAIR_OBJECT_ID, VOLUME_MODIFIED_BY_CHKDSK
839 To make this easy we define VOLUME_MUST_MOUNT_RO_MASK with all the
840 above bits set so that a single test suffices, as sketched below.
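
        A sketch of the mask and the test (flag names as in
        fs/ntfs/layout.h; that the flags are little-endian constants
        is glossed over here):

            #define VOLUME_MUST_MOUNT_RO_MASK (VOLUME_IS_DIRTY |   \
                            VOLUME_RESIZE_LOG_FILE |                \
                            VOLUME_UPGRADE_ON_MOUNT |               \
                            VOLUME_DELETE_USN_UNDERWAY |            \
                            VOLUME_REPAIR_OBJECT_ID |               \
                            VOLUME_MODIFIED_BY_CHKDSK)

            if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
                    ntfs_error(sb, "Volume has unsupported flags set. "
                                    "Mounting read-only.");
                    sb->s_flags |= MS_RDONLY;
            }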
841
842 2.1.9 - Fix two bugs in the decompression engine.
843
844 - Fix a bug where we would not always detect that we have reached the
845 end of a compression block because we were ending at minus one byte
846 which is effectively the same as being at the end. The fix is to
847 check whether the uncompressed buffer has been fully filled and if so
848 we assume we have reached the end of the compression block. A big
849 thank you to Marcin Gibuła for the bug report, the assistance in
850 tracking down the bug and testing the fix.
851 - Fix a possible bug where when a compressed read is truncated to the
852 end of the file, the offset inside the last page was not truncated.
853
8542.1.8 - Handle $MFT mirror and $LogFile, improve time handling, and cleanups.
855
856 - Use get_bh() instead of manual atomic_inc() in fs/ntfs/compress.c.
857 - Modify fs/ntfs/time.c::ntfs2utc(), get_current_ntfs_time(), and
858 utc2ntfs() to work with struct timespec instead of time_t on the
859 Linux UTC time side thus preserving the full precision of the NTFS
860 time and only losing up to 99 nanoseconds in the Linux UTC time.
861 - Move fs/ntfs/time.c to fs/ntfs/time.h and make the time functions
862 static inline.
863 - Remove unused ntfs_dirty_inode().
864 - Cleanup super operations declaration in fs/ntfs/super.c.
865 - Wrap flush_dcache_mft_record_page() in #ifdef NTFS_RW.
866 - Add NInoTestSetFoo() and NInoTestClearFoo() macro magic to
867 fs/ntfs/inode.h and use it to declare NInoTest{Set,Clear}Dirty.
868 - Move typedefs for ntfs_attr and test_t from fs/ntfs/inode.c to
869 fs/ntfs/inode.h so they can be used elsewhere.
870 - Determine the mft mirror size as the number of mirrored mft records
871 and store it in ntfs_volume->mftmirr_size (fs/ntfs/super.c).
872 - Load the mft mirror at mount time and compare the mft records stored
873 in it to the ones in the mft. Force a read-only mount if the two do
874 not match (fs/ntfs/super.c).
875 - Fix type casting related warnings on 64-bit architectures. Thanks
876 to Meelis Roos for reporting them.
877 - Change %L to %ll as %L is for floating point and %ll is for
878 integer, which is what we want.
879 - Read the journal ($LogFile) and determine if the volume has been
880 shutdown cleanly and force a read-only mount if not (fs/ntfs/super.c
881 and fs/ntfs/logfile.c). This is a little bit of a crude check in
882 that we only look at the restart areas and not at the actual log
883 records so that there will be a very small number of cases where we
884 think that a volume is dirty when in fact it is clean. This should
885 only affect volumes that have not been shutdown cleanly and did not
886 have any pending, non-check-pointed i/o.
887 - If the $LogFile indicates a clean shutdown and a read-write (re)mount
888 is requested, empty $LogFile by overwriting it with 0xff bytes to
889 ensure that Windows cannot cause data corruption by replaying a stale
890 journal after Linux has written to the volume.
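
        Per page, the wipe looks roughly like this (a sketch; the real
        loop over all $LogFile pages lives in fs/ntfs/logfile.c):

            u8 *kaddr = kmap_atomic(page, KM_USER0);

            /* 0xff is the "empty log file" fill byte. */
            memset(kaddr, 0xff, PAGE_CACHE_SIZE);
            flush_dcache_page(page);
            kunmap_atomic(kaddr, KM_USER0);
            set_page_dirty(page);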
891
8922.1.7 - Enable NFS exporting of mounted NTFS volumes.
893
894 - Set i_generation in the VFS inode from the seq_no of the NTFS inode.
895 - Make ntfs_lookup() NFS export safe, i.e. use d_splice_alias(), etc.
896 - Implement ->get_dentry() in fs/ntfs/namei.c::ntfs_get_dentry() as the
897 default doesn't allow inode number 0 which is a valid inode on NTFS
898 and even if it did allow that it uses iget() instead of ntfs_iget()
899 which makes it useless for us.
900 - Implement ->get_parent() in fs/ntfs/namei.c::ntfs_get_parent() as the
901 default just returns -EACCES which is not very useful.
902 - Define export operations (->s_export_op) for NTFS (ntfs_export_ops)
903 and set them up in the super block at mount time (super.c). This
904 allows mounted NTFS volumes to be exported via NFS (sketched below).
905 - Add missing return -EOPNOTSUPP; in
906 fs/ntfs/aops.c::ntfs_commit_nonresident_write().
907 - Enforce no atime and no dir atime updates at mount/remount time as
908 they are not implemented yet anyway.
909 - Move a few assignments in fs/ntfs/attrib.c::load_attribute_list() to
910 after a NULL check. Thanks to Dave Jones for pointing this out.
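
        The wiring, roughly (a sketch; other export_operations members
        are left unset so the defaults apply):

            static struct export_operations ntfs_export_ops = {
                    .get_parent     = ntfs_get_parent,  /* namei.c */
                    .get_dentry     = ntfs_get_dentry,  /* namei.c */
            };

            /* In ntfs_fill_super(), at mount time: */
            sb->s_export_op = &ntfs_export_ops;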
911
9122.1.6 - Fix minor bug in handling of compressed directories.
913
914 - Fix bug in handling of compressed directories. A compressed
915 directory is not really compressed so when we set the ->i_blocks
916 field of a compressed directory inode we were setting it from the
917 non-existent field ni->itype.compressed.size which gave random
918 results... For directories we now always use ni->allocated_size.
919
9202.1.5 - Fix minor bug in attribute list attribute handling.
921
922 - Fix bug in attribute list handling. Actually it is not so much a bug
923 as too much protection: we were not allowing attribute lists which
924 waste space on disk, while Windows XP clearly allows them and in
925 fact creates such attribute lists, so our driver was failing.
926 - Update NTFS documentation ready for 2.6 kernel release.
927
9282.1.4 - Reduce compiler requirements.
929
930 - Remove all uses of unnamed structs and unions in the driver to make
931 old and newer gcc versions happy. Makes it a bit uglier IMO but at
932 least people will stop hassling me about it.
933
9342.1.3 - Important bug fixes in corner cases.
935
936 - super.c::parse_ntfs_boot_sector(): Correct the check for 64-bit
937 clusters. (Philipp Thomas)
938 - attrib.c::load_attribute_list(): Fix bug when initialized_size is a
939 multiple of the block_size but not the cluster size. (Szabolcs
940 Szakacsits)
941
942 2.1.2 - Important bug fixes alleviating the hangs in statfs.
943
944 - Fix buggy free cluster and free inode determination logic.
945
9462.1.1 - Minor updates.
947
948 - Add handling for initialized_size != data_size in compressed files.
949 - Reduce function local stack usage from 0x3d4 bytes to just noise in
950 fs/ntfs/upcase.c. (Randy Dunlap)
951 - Remove compiler warnings for newer gcc.
952 - Pages are no longer kmapped by mm/filemap.c::generic_file_write()
953 around calls to ->{prepare,commit}_write. Adapt NTFS appropriately
954 in fs/ntfs/aops.c::ntfs_prepare_nonresident_write() by using
955 kmap_atomic(KM_USER0).
956
9572.1.0 - First steps towards write support: implement file overwrite.
958
959 - Add configuration option for developmental write support with an
960 appropriately scary configuration help text.
961 - Initial implementation of fs/ntfs/aops.c::ntfs_writepage() and its
962 helper fs/ntfs/aops.c::ntfs_write_block(). This enables mmap(2) based
963 overwriting of existing files on ntfs. Note: Resident files are
964 only written into memory, and not written out to disk at present, so
965 avoid writing to files smaller than about 1kiB.
966 - Initial implementation of fs/ntfs/aops.c::ntfs_prepare_write(), its
967 helper fs/ntfs/aops.c::ntfs_prepare_nonresident_write() and their
968 counterparts, fs/ntfs/aops.c::ntfs_commit_write(), and
969 fs/ntfs/aops.c::ntfs_commit_nonresident_write(), respectively. Also,
970 add generic_file_write() to the ntfs file operations (fs/ntfs/file.c).
971 This enables write(2) based overwriting of existing files on ntfs.
972 Note: As with mmap(2) based overwriting, resident files are only
973 written into memory, and not written out to disk at present, so avoid
974 writing to files smaller than about 1kiB.
975 - Implement ->truncate (fs/ntfs/inode.c::ntfs_truncate()) and
976 ->setattr() (fs/ntfs/inode.c::ntfs_setattr()) inode operations for
977 files with the purpose of intercepting and aborting all i_size
978 changes which we do not support yet. ntfs_truncate() actually only
979 emits a warning message but AFAICS our interception of i_size changes
980 elsewhere means ntfs_truncate() never gets called for i_size changes.
981 It is only called from generic_file_write() when we fail in
982 ntfs_prepare_{,nonresident_}write() in order to discard any
983 instantiated buffers beyond i_size. Thus i_size is not actually
984 changed so our warning message is enough. Unfortunately it is not
985 possible to easily determine if i_size is being changed or not, hence
986 we just emit an appropriately worded error message.
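
        A much simplified sketch of the interception (the real
        ntfs_setattr() filters further attribute bits; inode_setattr()
        was the generic helper of the day):

            int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
            {
                    struct inode *vi = dentry->d_inode;

                    if (attr->ia_valid & ATTR_SIZE) {
                            /* i_size changes are not supported yet. */
                            ntfs_error(vi->i_sb, "Changes in i_size are "
                                            "not supported yet, sorry.");
                            return -EOPNOTSUPP;
                    }
                    return inode_setattr(vi, attr);
            }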
987
9882.0.25 - Small bug fixes and cleanups.
989
990 - Unlock the page in an out of memory error code path in
991 fs/ntfs/aops.c::ntfs_read_block().
992 - If fs/ntfs/aops.c::ntfs_read_page() is called on an uptodate page,
993 just unlock the page and return. (This can happen due to ->writepage
994 clearing PageUptodate() during write out of MstProtected()
995 attributes.)
996 - Remove leaked write code again.
997
9982.0.24 - Cleanups.
999
1000 - Treat BUG_ON() as ASSERT() not VERIFY(), i.e. do not use side effects
1001 inside BUG_ON(). (Adam J. Richter)
1002 - Split logical OR expressions inside BUG_ON() into individual BUG_ON()
1003 calls for improved debugging. (Adam J. Richter)
1004 - Add errors flag to the ntfs volume state, accessed via
1005 NVol{,Set,Clear}Errors(vol).
1006 - Do not allow read-write remounts of read-only volumes with errors.
1007 - Clarify comment for ntfs file operation sendfile which was added by
1008 Christoph Hellwig a while ago (just using generic_file_sendfile())
1009 to say that ntfs ->sendfile is only used for the case where the
1010 source data is on the ntfs partition and the destination is
1011 somewhere else, i.e. nothing we need to concern ourselves with.
1012 - Add generic_file_write() as our ntfs file write operation.
1013
10142.0.23 - Major bug fixes (races, deadlocks, non-i386 architectures).
1015
1016 - Massive internal locking changes to mft record locking. Fixes lock
1017 recursion and replaces the mrec_lock read/write semaphore with a
1018 mutex. Also removes the now superfluous mft_count. This fixes several
1019 race conditions and deadlocks, especially in the future write code.
1020 - Fix ntfs over loopback for compressed files by adding an
1021 optimization barrier. (gcc was screwing up otherwise?)
1022 - Miscellaneous cleanups all over the code and a fix or two in error
1023 handling code paths.
1024 Thanks go to Christoph Hellwig for pointing out the following two:
1025 - Remove now unused function fs/ntfs/malloc.h::vmalloc_nofs().
1026 - Fix ntfs_free() for ia64 and parisc by checking for VMALLOC_END, too.
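
        The shape of the fixed check (a sketch of
        fs/ntfs/malloc.h::ntfs_free(); the extra VMALLOC_END test
        matters on architectures where kmalloc()ed memory can lie
        above the vmalloc area):

            static inline void ntfs_free(void *addr)
            {
                    if ((unsigned long)addr >= VMALLOC_START &&
                                    (unsigned long)addr < VMALLOC_END)
                            vfree(addr);
                    else
                            kfree(addr);
            }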
1027
10282.0.22 - Cleanups, mainly to ntfs_readdir(), and use C99 initializers.
1029
1030 - Change fs/ntfs/dir.c::ntfs_readdir() to only read/write ->f_pos once
1031 at entry/exit respectively.
1032 - Use C99 initializers for structures.
1033 - Remove unused variable blocks from fs/ntfs/aops.c::ntfs_read_block().
1034
1035 2.0.21 - Check for, and refuse to work with, too large files/directories/volumes.
1036
1037 - Limit volume size at mount time to 2TiB on architectures where
1038 unsigned long is 32-bits (fs/ntfs/super.c::parse_ntfs_boot_sector()).
1039 This is the most we can do without overflowing the 32-bit limit of
1040 the block device size imposed on us by sb_bread() and sb_getblk()
1041 for the time being.
1042 - Limit file/directory size at open() time to 16TiB on architectures
1043 where unsigned long is 32-bits (fs/ntfs/file.c::ntfs_file_open() and
1044 fs/ntfs/dir.c::ntfs_dir_open()). This is the most we can do without
1045 overflowing the page cache page index.
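
        The open-time check is essentially this (a sketch close to
        fs/ntfs/file.c::ntfs_file_open(); the exact error value is an
        assumption):

            static int ntfs_file_open(struct inode *vi, struct file *filp)
            {
                    /* 16TiB is the page cache limit for a 32-bit index. */
                    if (sizeof(unsigned long) < 8) {
                            if (i_size_read(vi) > MAX_LFS_FILESIZE)
                                    return -EFBIG;
                    }
                    return generic_file_open(vi, filp);
            }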
1046
10472.0.20 - Support non-resident directory index bitmaps, fix page leak in readdir.
1048
1049 - Move the directory index bitmap to use an attribute inode instead of
1050 having special fields for it inside the ntfs inode structure. This
1051 means that the index bitmaps now use the page cache for i/o, too,
1052 and also as a side effect we get support for non-resident index
1053 bitmaps for free.
1054 - Simplify/cleanup error handling in fs/ntfs/dir.c::ntfs_readdir() and
1055 fix a page leak that manifested itself in some cases.
1056 - Add fs/ntfs/inode.c::ntfs_put_inode(), which we need to release the
1057 index bitmap inode on the final iput().
1058
10592.0.19 - Fix race condition, improvements, and optimizations in i/o interface.
1060
1061 - Apply block optimization added to fs/ntfs/aops.c::ntfs_read_block()
1062 to fs/ntfs/compress.c::ntfs_file_read_compressed_block() as well.
1063 - Drop the "file" from ntfs_file_read_compressed_block().
1064 - Rename fs/ntfs/aops.c::ntfs_end_buffer_read_async() to
1065 ntfs_end_buffer_async_read() (more like the fs/buffer.c counterpart).
1066 - Update ntfs_end_buffer_async_read() with the improved logic from
1067 its updated counterpart fs/buffer.c::end_buffer_async_read(). Apply
1068 further logic improvements to better determine when we set PageError.
1069 - Update submission of buffers in fs/ntfs/aops.c::ntfs_read_block() to
1070 check for the buffers being uptodate first in line with the updated
1071 fs/buffer.c::block_read_full_page(). This plugs a small race
1072 condition.
1073
10742.0.18 - Fix race condition in reading of compressed files.
1075
1076 - There was a narrow window between checking a buffer head for being
1077 uptodate and locking it in ntfs_file_read_compressed_block(). We now
1078 lock the buffer and then check whether it is uptodate or not.
1079
10802.0.17 - Cleanups and optimizations - shrinking the ToDo list.
1081
1082 - Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to return an error
1083 code and update callers, i.e. ntfs_iget(), to pass that error code
1084 up instead of just using -EIO.
1085 - Modifications to super.c to ensure that both mount and remount
1086 cannot set any write related options when the driver is compiled
1087 read-only.
1088 - Optimize block resolution in fs/ntfs/aops.c::ntfs_read_block() to
1089 cache the current runlist element. This should improve performance
1090 when reading very large and/or very fragmented data.
1091
10922.0.16 - Convert access to $MFT/$BITMAP to attribute inode API.
1093
1094 - Fix a stupid bug introduced in 2.0.15 where we were unmapping the
1095 wrong inode in fs/ntfs/inode.c::ntfs_attr_iget().
1096 - Fix debugging check in fs/ntfs/aops.c::ntfs_read_block().
1097 - Convert $MFT/$BITMAP access to attribute inode API and remove all
1098 remnants of the ugly mftbmp address space and operations hack. This
1099 means we finally have only one readpage function as well as only one
1100 async io completion handler. Yey! The mft bitmap is now just an
1101 attribute inode and is accessed from vol->mftbmp_ino just as if it
1102 were a normal file. Fake inodes rule. (-:
1103
11042.0.15 - Fake inodes based attribute i/o via the pagecache, fixes and cleanups.
1105
1106 - Fix silly bug in fs/ntfs/super.c::parse_options() which was causing
1107 remounts to fail when the partition had an entry in /etc/fstab and
1108 the entry specified the nls= option.
1109 - Apply same macro magic used in fs/ntfs/inode.h to fs/ntfs/volume.h to
1110 expand all the helper functions NVolFoo(), NVolSetFoo(), and
1111 NVolClearFoo().
1112 - Move copyright statement from driver initialisation message to
1113 module description (fs/ntfs/super.c). This makes the initialisation
1114 message fit on one line and fits in better with the rest of the kernel.
1115 - Update fs/ntfs/attrib.c::map_run_list() to work on both real and
1116 attribute inodes, and both for files and directories.
1117 - Implement fake attribute inodes allowing all attribute i/o to go via
1118 the page cache and to use all the normal vfs/mm functionality:
1119 - Add ntfs_attr_iget() and its helper ntfs_read_locked_attr_inode()
1120 to fs/ntfs/inode.c.
1121 - Add needed cleanup code to ntfs_clear_big_inode().
1122 - Merge address space operations for files and directories (aops.c),
1123 now just have ntfs_aops:
1124 - Rename:
1125 end_buffer_read_attr_async() -> ntfs_end_buffer_read_async(),
1126 ntfs_attr_read_block() -> ntfs_read_block(),
1127 ntfs_file_read_page() -> ntfs_readpage().
1128 - Rewrite fs/ntfs/aops.c::ntfs_readpage() to work on both real and
1129 attribute inodes, and both for files and directories.
1130 - Remove obsolete fs/ntfs/aops.c::ntfs_mst_readpage().
1131
11322.0.14 - Run list merging code cleanup, minor locking changes, typo fixes.
1133
1134 - Change fs/ntfs/super.c::ntfs_statfs() to not rely on BKL by moving
1135 the locking out of super.c::get_nr_free_mft_records() and taking and
1136 dropping the mftbmp_lock rw_semaphore in ntfs_statfs() itself.
1137 - Bring attribute runlist merging code (fs/ntfs/attrib.c) in sync with
1138 current userspace ntfs library code. This means that if a merge
1139 fails the original runlists are always left unmodified instead of
1140 being silently corrupted.
1141 - Misc typo fixes.
1142
11432.0.13 - Use iget5_locked() in preparation for fake inodes and small cleanups.
1144
1145 - Remove nr_mft_bits and the now superfluous union with nr_mft_records
1146 from ntfs_volume structure.
1147 - Remove nr_lcn_bits and the now superfluous union with nr_clusters
1148 from ntfs_volume structure.
1149 - Use iget5_locked() and friends instead of conventional iget(). Wrap
1150 the call in fs/ntfs/inode.c::ntfs_iget() and update callers of iget()
1151 to use ntfs_iget(). Leave only one iget() call at mount time so we
1152 don't need an ntfs_iget_mount(). (See the sketch below.)
1153 - Change fs/ntfs/inode.c::ntfs_new_extent_inode() to take mft_no as an
1154 additional argument.
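
        A simplified sketch of the wrapper (the real ntfs_iget() passes
        an ntfs_attr structure as the opaque data to the test/set
        callbacks in fs/ntfs/inode.c; here we key on the mft number
        alone and elide error handling):

            struct inode *ntfs_iget(struct super_block *sb,
                            unsigned long mft_no)
            {
                    struct inode *vi;

                    /* Find the inode in the cache or allocate it. */
                    vi = iget5_locked(sb, mft_no, ntfs_test_inode,
                                    ntfs_init_locked_inode, &mft_no);
                    if (vi && (vi->i_state & I_NEW)) {
                            /* New inode: read it in and unlock it. */
                            ntfs_read_locked_inode(vi);
                            unlock_new_inode(vi);
                    }
                    return vi;
            }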
1155
11562.0.12 - Initial cleanup of address space operations following 2.0.11 changes.
1157
1158 - Merge fs/ntfs/aops.c::end_buffer_read_mst_async() and
1159 fs/ntfs/aops.c::end_buffer_read_file_async() into one function
1160 fs/ntfs/aops.c::end_buffer_read_attr_async() using NInoMstProtected()
1161 to determine whether to apply mst fixups or not.
1162 - Above change allows merging fs/ntfs/aops.c::ntfs_file_read_block()
1163 and fs/ntfs/aops.c::ntfs_mst_readpage() into one function
1164 fs/ntfs/aops.c::ntfs_attr_read_block(). Also, create a tiny wrapper
1165 fs/ntfs/aops.c::ntfs_mst_readpage() to transform the parameters from
1166 the VFS readpage function prototype to the ntfs_attr_read_block()
1167 function prototype.
1168
11692.0.11 - Initial preparations for fake inode based attribute i/o.
1170
1171 - Move definition of ntfs_inode_state_bits to fs/ntfs/inode.h and
1172 do some macro magic (adapted from include/linux/buffer_head.h) to
1173 expand all the helper functions NInoFoo(), NInoSetFoo(), and
1174 NInoClearFoo(). (The pattern is sketched below.)
1175 - Add new flag to ntfs_inode_state_bits: NI_Sparse.
1176 - Add new fields to ntfs_inode structure to allow use of fake inodes
1177 for attribute i/o: type, name, name_len. Also add new state bits:
1178 NI_Attr, which, if set, indicates the inode is a fake inode, and
1179 NI_MstProtected, which, if set, indicates the attribute uses multi
1180 sector transfer protection, i.e. fixups need to be applied after
1181 reads and before/after writes.
1182 - Rename fs/ntfs/inode.c::ntfs_{new,clear,destroy}_inode() to
1183 ntfs_{new,clear,destroy}_extent_inode() and update callers.
1184 - Use ntfs_clear_extent_inode() in fs/ntfs/inode.c::__ntfs_clear_inode()
1185 instead of ntfs_destroy_extent_inode().
1186 - Cleanup memory deallocations in {__,}ntfs_clear_{,big_}inode().
1187 - Make all operations on ntfs inode state bits use the NIno* functions.
1188 - Set up the new ntfs inode fields and state bits in
1189 fs/ntfs/inode.c::ntfs_read_inode() and add appropriate cleanup of
1190 allocated memory to __ntfs_clear_inode().
1191 - Cleanup ntfs_inode structure a bit for better ordering of elements
1192 w.r.t. their size to allow better packing of the structure in memory.
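
        The macro magic follows the buffer_head.h pattern, roughly (a
        sketch; the macros in fs/ntfs/inode.h differ in detail):

            #define NINO_FNS(flag)                                  \
            static inline int NIno##flag(ntfs_inode *ni)            \
            {                                                       \
                    return test_bit(NI_##flag, &(ni)->state);       \
            }                                                       \
            static inline void NInoSet##flag(ntfs_inode *ni)        \
            {                                                       \
                    set_bit(NI_##flag, &(ni)->state);               \
            }                                                       \
            static inline void NInoClear##flag(ntfs_inode *ni)      \
            {                                                       \
                    clear_bit(NI_##flag, &(ni)->state);             \
            }

            NINO_FNS(Attr)  /* NInoAttr(), NInoSetAttr(), ... */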
1193
11942.0.10 - There can only be 2^32 - 1 inodes on an NTFS volume.
1195
1196 - Add check at mount time to verify that the number of inodes on the
1197 volume does not exceed 2^32 - 1, which is the maximum allowed for
1198 NTFS according to Microsoft. (See the sketch below.)
1199 - Change mft_no member of ntfs_inode structure to be unsigned long.
1200 Update all users. This makes ntfs_inode->mft_no just a copy of struct
1201 inode->i_ino. But we can't just always use struct inode->i_ino and
1202 remove mft_no because extent inodes do not have an attached struct
1203 inode.
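
        The mount-time check amounts to something like this (a sketch;
        the message wording and error value are assumptions):

            /* NTFS allows at most 2^32 - 1 inodes, i.e. mft records. */
            if (vol->nr_mft_records >= (1LL << 32)) {
                    ntfs_error(sb, "Volume has too many inodes "
                                    "(> 2^32 - 1).");
                    return -EINVAL;         /* Abort the mount. */
            }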
1204
12052.0.9 - Decompression engine now uses a single buffer and other cleanups.
1206
1207 - Change decompression engine to use a single buffer protected by a
1208 spin lock instead of per-CPU buffers (sketched below). (Rusty Russell)
1209 - Do not update cb_pos when handling a partial final page during
1210 decompression of a sparse compression block, as the value is later
1211 reset without being read/used. (Rusty Russell)
1212 - Switch to using the new KM_BIO_SRC_IRQ for atomic kmap()s. (Andrew
1213 Morton)
1214 - Change buffer size in ntfs_readdir()/ntfs_filldir() to use
1215 NLS_MAX_CHARSET_SIZE, which makes the buffers almost 1kiB each but
1216 it also makes everything safer so it is a good thing.
1217 - Miscellaneous minor cleanups to comments.
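
        The per-CPU buffers gave way to a single, lock-protected one,
        roughly (a sketch of the arrangement in fs/ntfs/compress.c;
        SPIN_LOCK_UNLOCKED was the static initializer of the day):

            /* One buffer, sized to a maximal compression block. */
            static u8 *ntfs_compression_buffer;
            static spinlock_t ntfs_cb_lock = SPIN_LOCK_UNLOCKED;

            spin_lock(&ntfs_cb_lock);
            /* ... decompress the compression block into the buffer,
               copying the results into the destination pages ... */
            spin_unlock(&ntfs_cb_lock);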
1218
12192.0.8 - Major updates for handling of case sensitivity and dcache aliasing.
1220
1221 Big thanks go to Al Viro and other inhabitants of #kernel for investing
1222 their time to discuss the case sensitivity and dcache aliasing issues.
1223
1224 - Remove unused source file fs/ntfs/attraops.c.
1225 - Remove show_inodes mount option(s), thus dropping support for
1226 displaying of short file names.
1227 - Remove deprecated mount option posix.
1228 - Restore show_sys_files mount option.
1229 - Add new mount option case_sensitive, to determine if the driver
1230 treats file names as case sensitive or not. If case sensitive, create
1231 file names in the POSIX namespace. Otherwise create file names in the
1232 LONG/WIN32 namespace. Note, files remain accessible via their short
1233 file name, if it exists.
1234 - Remove really dumb logic bug in boot sector recovery code.
1235 - Fix dcache aliasing issues wrt short/long file names via changes
1236 to fs/ntfs/dir.c::ntfs_lookup_inode_by_name() and
1237 fs/ntfs/namei.c::ntfs_lookup():
1238 - Add additional argument to ntfs_lookup_inode_by_name() in which we
1239 return information about the matching file name if the case is not
1240 matching or the match is a short file name. See comments above the
1241 function definition for details.
1242 - Change ntfs_lookup() to only create dcache entries for the correctly
1243 cased file name and only for the WIN32 namespace counterpart of DOS
1244 namespace file names. This ensures we have only one dentry per
1245 directory and also removes all dcache aliasing issues between short
1246 and long file names once we add write support. See comments above
1247 function for details.
1248 - Fix potential 1 byte overflow in fs/ntfs/unistr.c::ntfs_ucstonls().
1249
12502.0.7 - Minor cleanups and updates for changes in core kernel code.
1251
1252 - Remove most of the NULL struct element initializers.
1253 - Various updates to make compatible with recent kernels.
1254 - Remove defines of MAX_BUF_PER_PAGE and include linux/buffer_head.h
1255 in fs/ntfs/ntfs.h instead.
1256 - Remove no longer needed KERNEL_VERSION checks. We are now in the
1257 kernel proper so they are no longer needed.
1258
12592.0.6 - Major bugfix to make compatible with other kernel changes.
1260
1261 - Initialize the mftbmp address space properly now that there are more
1262 fields in the struct address_space. This was leading to hangs and
1263 oopses on umount since 2.5.12 because of changes to other parts of
1264 the kernel. We probably want a kernel generic init_address_space()
1265 function...
1266 - Drop BKL from ntfs_readdir() after consultation with Al Viro. The
1267 only caller of ->readdir() is vfs_readdir() which holds i_mutex
1268 during the call, and i_mutex is sufficient protection against changes
1269 in the directory inode (including ->i_size).
1270 - Use generic_file_llseek() for directories (as opposed to
1271 default_llseek()) as this downs i_mutex instead of the BKL which is
1272 what we now need for exclusion against ->f_pos changes considering we
1273 no longer take the BKL in ntfs_readdir().
1274
12752.0.5 - Major bugfix. Buffer overflow in extent inode handling.
1276
1277 - No need to set old blocksize in super.c::ntfs_fill_super() as the
1278 VFS does so via invocation of deactivate_super() calling
1279 fs->kill_super() calling block_kill_super() which does it.
1280 - BKL moved from VFS into dir.c::ntfs_readdir(). (Linus Torvalds)
1281 -> Do we really need it? I don't think so as we have exclusion on
1282 the directory ntfs_inode rw_semaphore mrec_lock. We might have to
1283 move the ->f_pos accesses under the mrec_lock though. Check this...
1284 - Fix really, really, really stupid buffer overflow in extent inode
1285 handling in mft.c::map_extent_mft_record().
1286
12872.0.4 - Cleanups and updates for kernel 2.5.11.
1288
1289 - Add documentation on how to use the MD driver to be able to use NTFS
1290 stripe and volume sets in Linux and generally cleanup documentation
1291 a bit.
1292 Remove all uses of kdev_t in favour of struct block_device *:
1293 - Change compress.c::ntfs_file_read_compressed_block() to use
1294 sb_getblk() instead of getblk().
1295 - Change super.c::ntfs_fill_super() to use bdev_hardsect_size() instead
1296 of get_hardsect_size().
1297 - No need to get old blocksize in super.c::ntfs_fill_super() as
1298 fs/super.c::get_sb_bdev() already does this.
1299 - Set bh->b_bdev instead of bh->b_dev throughout aops.c.
1300
13012.0.3 - Small bug fixes, cleanups, and performance improvements.
1302
1303 - Remove some dead code from mft.c.
1304 - Optimize readpage and read_block functions throughout aops.c so that
1305 only initialized blocks are read. Non-initialized ones have their
1306 buffer head mapped, zeroed, and set up to date, without scheduling
1307 any i/o. Thanks to Al Viro for advice on how to avoid the device i/o.
1308 Thanks go to Andrew Morton for spotting the below:
1309 - Fix buglet in allocate_compression_buffers() error code path.
1310 - Call flush_dcache_page() after modifying page cache page contents in
1311 ntfs_file_readpage().
1312 - Check for existence of page buffers throughout aops.c before calling
1313 create_empty_buffers(). This happens when an I/O error occurs and the
1314 read is retried. (It will also happen once writing is implemented,
1315 so it needed doing anyway, but I had left it for later...)
1316 - Don't BUG_ON() uptodate and/or mapped buffers throughout aops.c in
1317 readpage and read_block functions. Reasoning same as above (i.e. I/O
1318 error retries and future write code paths.)
1319
13202.0.2 - Minor updates and cleanups.
1321
1322 - Cleanup: rename mst.c::__post_read_mst_fixup to post_write_mst_fixup
1323 and cleanup the code a bit, removing the unused size parameter.
1324 - Change default fmask to 0177 and update documentation.
1325 - Change attrib.c::get_attr_search_ctx() to return the search context
1326 directly instead of taking the address of a pointer. A return value
1327 of NULL means the allocation failed. Updated all callers
1328 appropriately.
1329 - Update to 2.5.9 kernel (preserving backwards compatibility) by
1330 replacing all occurrences of page->buffers with page_buffers(page).
1331 - Fix minor bugs in runlist merging, also minor cleanup.
1332 - Updates to bootsector layout and mft mirror contents descriptions.
1333 - Small bug fix in error detection in unistr.c and some cleanups.
1334 - Grow name buffer allocations in unistr.c in aligned multiples of 64
1335 bytes.
1336
13372.0.1 - Minor updates.
1338
1339 - Make default umask correspond to documentation.
1340 - Improve documentation.
1341 - Set default mode to include execute bit. The {u,f,d}mask can be used
1342 to take it away if desired. This allows binaries to be executed from
1343 a mounted ntfs partition.
1344
13452.0.0 - New version number. Remove TNG from the name. Now in the kernel.
1346
1347 - Add kill_super, just keeping up with the vfs changes in the kernel.
1348 - Repeat some changes from tng-0.0.8 that somehow got lost on the way
1349 from the CVS import into BitKeeper.
1350 - Begin to implement proper handling of allocated_size vs
1351 initialized_size vs data_size (i.e. i_size). Done are
1352 mft.c::ntfs_mft_readpage(), aops.c::end_buffer_read_index_async(),
1353 and attrib.c::load_attribute_list().
1354 - Lock the runlist in attrib.c::load_attribute_list() while using it.
1355 - Fix memory leak in ntfs_file_read_compressed_block() and generally
1356 clean up compress.c a little, removing some uncommented/unused debug
1357 code.
1358 - Tidy up dir.c a little bit.
1359 - Don't bother getting the runlist in inode.c::ntfs_read_inode().
1360 - Merge mft.c::ntfs_mft_readpage() and aops.c::ntfs_index_readpage()
1361 creating aops.c::ntfs_mst_readpage(), improving the handling of
1362 holes and overflow in the process and implementing the correct
1363 equivalent of ntfs_file_get_block() in ntfs_mst_readpage() itself.
1364 I am aiming for correctness at the moment. Modularisation can come
1365 later.
1366 - Rename aops.c::end_buffer_read_index_async() to
1367 end_buffer_read_mst_async() and optimize the overflow checking and
1368 handling.
1369 - Use the host of the mftbmp address space mapping to hold the ntfs
1370 volume. This is needed so the async i/o completion handler can
1371 retrieve a pointer to the volume. Hopefully this will not cause
1372 problems elsewhere in the kernel... Otherwise will need to use a
1373 fake inode.
1374 - Complete implementation of proper handling of allocated_size vs
1375 initialized_size vs data_size (i.e. i_size) in whole driver.
1376 Basically aops.c is now completely rewritten.
1377 - Change NTFS driver name to just NTFS and set version number to 2.0.0
1378 to make a clear distinction from the old driver which is still on
1379 version 1.1.22.
1380
1381tng-0.0.8 - 08/03/2002 - Now using BitKeeper, http://linux-ntfs.bkbits.net/
1382
1383 - Replace bdevname(sb->s_dev) with sb->s_id.
1384 - Remove now superfluous new-line characters in all callers of
1385 ntfs_debug().
1386 - Apply kludge in ntfs_read_inode(), setting i_nlink to 1 for
1387 directories. Without this the "find" utility gets very upset which is
1388 fair enough as Linux/Unix do not support directory hard links.
1389 - Further runlist merging work. (Richard Russon)
1390 - Backwards compatibility for gcc-2.95. (Richard Russon)
1391 - Update to kernel 2.5.5-pre1 and rediff the now tiny patch.
1392 - Convert to new filesystem declaration using ->ntfs_get_sb() and
1393 replacing ntfs_read_super() with ntfs_fill_super().
1394 - Set s_maxbytes to MAX_LFS_FILESIZE to avoid page cache page index
1395 overflow on 32-bit architectures.
1396 - Cleanup upcase loading code to use ntfs_(un)map_page().
1397 - Disable/reenable preemption in critical sections of the compression engine.
1398 - Replace device size determination in ntfs_fill_super() with
1399 sb->s_bdev->bd_inode->i_size (in bytes) and remove now superfluous
1400 function super.c::get_nr_blocks().
1401 - Implement a mount time option (show_inodes) allowing choice of which
1402 types of inode names readdir() returns and modify ntfs_filldir()
1403 accordingly. There are several parameters to show_inodes:
1404 system: system files
1405 win32: long file names (including POSIX file names) [DEFAULT]
1406 long: same as win32
1407 dos: short file names only (excluding POSIX file names)
1408 short: same as dos
1409 posix: same as both win32 and dos
1410 all: all file names
1411 Note that the options are additive, i.e. specifying:
1412 -o show_inodes=system,show_inodes=win32,show_inodes=dos
1413 is the same as specifying:
1414 -o show_inodes=all
1415 Note that the "posix" and "all" options will show all directory
1416 names, BUT the link count on each directory inode entry is set to 1,
1417 due to Linux not supporting directory hard links. This may well
1418 confuse some userspace applications, since the directory names will
1419 have the same inode numbers. Thus it is NOT advisable to use the
1420 "posix" or "all" options. We provide them only for completeness sake.
1421 - Add copies of allocated_size, initialized_size, and compressed_size to
1422 the ntfs inode structure and set them up in
1423 inode.c::ntfs_read_inode(). These reflect the unnamed data attribute
1424 for files and the index allocation attribute for directories.
1425 - Add copies of allocated_size and initialized_size to ntfs inode for
1426 $BITMAP attribute of large directories and set them up in
1427 inode.c::ntfs_read_inode().
1428 - Add copies of allocated_size and initialized_size to ntfs volume for
1429 $BITMAP attribute of $MFT and set them up in
1430 super.c::load_system_files().
1431 - Parse deprecated ntfs driver options (iocharset, show_sys_files,
1432 posix, and utf8) and tell user what the new options to use are. Note
1433 we still do support them but they will be removed with kernel 2.7.x.
1434 - Change all occurrences of integer long long printf formatting to hex
1435 as printk() will not support long long integer format if/when the
1436 div64 patch goes into the kernel.
1437 - Make slab caches have stable names and change the names to what they
1438 were intended to be. These changes are required/made possible by the
1439 new slab cache name handling which removes the length limitation by
1440 requiring the caller of kmem_cache_create() to supply a stable name
1441 which is then referenced but not copied.
1442 - Rename run_list structure to run_list_element and create a new
1443 run_list structure containing a pointer to a run_list_element
1444 structure and a read/write semaphore. Adapt all users of runlists
1445 to new scheme and take and release the lock as needed. This fixes a
1446 nasty race as the run_list changes even when inodes are locked for
1447 reading and even when the inode isn't locked at all, so we really
1448 needed the serialization. We use a semaphore rather than a spinlock
1449 as memory allocations can sleep and doing everything GFP_ATOMIC
1450 would be silly. (The new structure is sketched below.)
1451 - Cleanup read_inode() removing all code checking for lowest_vcn != 0.
1452 This can never happen due to the nature of lookup_attr() and how we
1453 support attribute lists. If it did happen it would imply the inode
1454 being corrupt.
1455 - Check for lowest_vcn != 0 in ntfs_read_inode() and mark the inode as
1456 bad if found.
1457 - Update to 2.5.6-pre2 changes in struct address_space.
1458 - Use parent_ino() when accessing d_parent inode number in dir.c.
1459 - Import Sourceforge CVS repository into BitKeeper repository:
1460 http://linux-ntfs.bkbits.net/ntfs-tng-2.5
1461 - Update fs/Makefile, fs/Config.help, fs/Config.in, and
1462 Documentation/filesystems/ntfs.txt for NTFS TNG.
1463 - Create kernel configuration option controlling whether debugging
1464 is enabled or not.
1465 - Add the required export of end_buffer_io_sync() from the patches
1466 directory to the kernel code.
1467 - Update inode.c::ntfs_show_options() with show_inodes mount option.
1468 - Update errors mount option.
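
        The new runlist wrapper, roughly (a sketch; run_list_element is
        the old per-element type):

            typedef struct {
                    run_list_element *rl;           /* The runlist. */
                    struct rw_semaphore lock;       /* Serialization. */
            } run_list;

            /* Readers of a runlist now do: */
            down_read(&ni->run_list.lock);
            /* ... walk ni->run_list.rl ... */
            up_read(&ni->run_list.lock);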
1469
1470tng-0.0.7 - 13/02/2002 - The driver is now feature complete for read-only!
1471
1472 - Cleanup mft.c and its debug/error output in particular. Fix a minor
1473 bug in mapping of extent inodes. Update all the comments to fit all
1474 the recent code changes.
1475 - Modify vcn_to_lcn() to cope with entirely unmapped runlists.
1476 - Cleanups in compress.c, mostly comments and folding help.
1477 - Implement attrib.c::map_run_list() as a generic helper.
1478 - Make compress.c::ntfs_file_read_compressed_block() use map_run_list()
1479 thus making code shorter and enabling attribute list support.
1480 - Cleanup incorrect use of [su]64 with %L printf format specifier in
1481 all source files. Type casts to [unsigned] long long added to correct
1482 the mismatches (important for architectures where long long is not
1483 64 bits).
1484 - Merge async io completion handlers for directory indexes and $MFT
1485 data into one by setting the index_block_size{_bits} of the ntfs
1486 inode for $MFT to the mft_record_size{_bits} of the ntfs_volume.
1487 - Cleanup aops.c, update comments.
1488 - Make ntfs_file_get_block() use map_run_list() so all files now
1489 support attribute lists.
1490 - Make ntfs_dir_readpage() almost verbatim copy of
1491 block_read_full_page() by using ntfs_file_get_block() with only real
1492 difference being the use of our own async io completion handler
1493 rather than the default one, thus reducing the amount of code and
1494 automatically enabling attribute list support for directory indices.
1495 - Fix bug in load_attribute_list() - forgot to call brelse in error
1496 code path.
1497 - Change parameters to find_attr() and lookup_attr(). We no longer
1498 pass in the upcase table and its length. These can be gotten from
1499 ctx->ntfs_ino->vol->upcase{_len}. Update all callers.
1500 - Cleanups in attrib.c.
1501 - Implement merging of runlists, attrib.c::merge_run_lists() and its
1502 helpers. (Richard Russon)
1503 - Attribute lists part 2, attribute extents and multi part runlists:
1504 enable proper support for LCN_RL_NOT_MAPPED and automatic mapping of
1505 further runlist parts via attrib.c::map_run_list().
1506 - Tiny endianness bug fix in decompress_mapping_pairs().
1507
1508tng-0.0.6 - Encrypted directories, bug fixes, cleanups, debugging enhancements.
1509
1510 - Enable encrypted directories. (Their index root is marked encrypted
1511 to indicate that new files in that directory should be created
1512 encrypted.)
1513 - Fix bug in NInoBmpNonResident() macro. (Cut and paste error.)
1514 - Enable $Extend system directory. Most (if not all) extended system
1515 files do not have unnamed data attributes so ntfs_read_inode() had to
1516 special case them but that is ok, as the special casing recovery
1517 happens inside an error code path so there is zero slow down in the
1518 normal fast path. The special casing is done by introducing a new
1519 function inode.c::ntfs_is_extended_system_file() which checks if any
1520 of the hard links in the inode point to $Extend as being their parent
1521 directory and if they do we assume this is an extended system file.
1522 - Create a sysctl/proc interface to allow {dis,en}abling of debug output
1523 when compiled with -DDEBUG. Default is debug messages to be disabled.
1524 To enable them, one writes a non-zero value to /proc/sys/fs/ntfs-debug
1525 (if /proc is enabled) or uses sysctl(2) to effect the same (if sysctl
1526 interface is enabled). Inspired by old ntfs driver.
1527 - Add debug_msgs insmod/kernel boot parameter to set whether debug
1528 messages are {dis,en}abled. This is useful to enable debug messages
1529 during ntfs initialization and is the only way to activate debugging
1530 when the sysctl interface is not enabled.
1531 - Cleanup debug output in various places.
1532 - Remove all dollar signs ($) from the source (except comments) to
1533 enable compilation on architectures whose gcc compiler does not
1534 support dollar signs in the names of variables/constants. Attribute
1535 types now start with AT_ instead of $ and $I30 is now just I30.
1536 - Cleanup ntfs_lookup() and add consistency check of sequence numbers.
1537 - Load complete runlist for $MFT/$BITMAP during mount and cleanup
1538 access functions. This means we now cope with $MFT/$BITMAP being
1539 spread across several mft records.
1540 - Disable modification of mft_zone_multiplier on remount. We can always
1541 reenable this later on if we really want to, but we will need to make
1542 sure we readjust the mft_zone size / layout accordingly.
1543
1544tng-0.0.5 - Modernize for 2.5.x and further in line-ing with Al Viro's comments.
1545
1546 - Use sb_set_blocksize() instead of set_blocksize() and verify the
1547 return value.
1548 - Use sb_bread() instead of bread() throughout.
1549 - Add index_vcn_size{_bits} to ntfs_inode structure to store the size
1550 of a directory index block vcn. Apply resulting simplifications in
1551 dir.c everywhere.
1552 - Fix a small bug somewhere (but forgot what it was).
1553 - Change ntfs_{debug,error,warning} to enable gcc to do type checking
1554 on the printf-format parameter list and fix bugs reported by gcc
1555 as a result. (Richard Russon)
1556 - Move inode allocation strategy to Al's new stuff but maintain the
1557 divorce of ntfs_inode from struct inode. To achieve this we have two
1558 separate slab caches, one for big ntfs inodes containing a struct
1559 inode and one for pure ntfs inodes, and at the same time fix some
1560 faulty error code paths in ntfs_read_inode().
1561 - Show mount options in proc (inode.c::ntfs_show_options()).
1562
1563tng-0.0.4 - Big changes, getting in line with Al Viro's comments.
1564
1565 - Modified (un)map_mft_record functions to be common for read and write
1566 case. To specify which is which, added extra parameter at front of
1567 parameter list. Pass either READ or WRITE to this, each has the
1568 obvious meaning.
1569 - General cleanups to allow for easier folding in vi.
1570 - attrib.c::decompress_mapping_pairs() now accepts the old runlist
1571 argument, and invokes attrib.c::merge_run_lists() to merge the old
1572 and the new runlists.
1573 - Removed attrib.c::find_first_attr().
1574 - Implemented loading of attribute list and complete runlist for $MFT.
1575 This means we now cope with $MFT being spread across several mft
1576 records.
1577 - Adapt to 2.5.2-pre9 and the changed create_empty_buffers() syntax.
1578 - Adapt major/minor/kdev_t/[bk]devname stuff to new 2.5.x kernels.
1579 - Make ntfs_volume be allocated via kmalloc() instead of using a slab
1580 cache. There are too few ntfs_volume structures at any one time
1581 to justify a private slab cache.
1582 - Fix bogus kmap() use in async io completion. Now use kmap_atomic().
1583 Use KM_BIO_IRQ on advice from IRC/kernel...
1584 - Use ntfs_map_page() in map_mft_record() and create ->readpage method
1585 for reading $MFT (ntfs_mft_readpage). In the process create dedicated
1586 address space operations (ntfs_mft_aops) for $MFT inode mapping. Also
1587 removed the now superfluous exports from the kernel core patch.
1588 - Fix a bug where kfree() was used instead of ntfs_free().
1589 - Change map_mft_record() to take ntfs_inode as argument instead of
1590 vfs inode. Ditto for unmap_mft_record(). Adapt all callers.
1591 - Add pointer to ntfs_volume to ntfs_inode.
1592 - Add mft record number and sequence number to ntfs_inode. Stop using
1593 i_ino and i_generation for in-driver purposes.
1594 - Implement attrib.c::merge_run_lists(). (Richard Russon)
1595 - Remove use of proper inodes by extent inodes. Move i_ino and
1596 i_generation to ntfs_inode to do this. Apply simplifications that
1597 result and remove iget_no_wait(), etc.
1598 - Pass ntfs_inode everywhere in the driver (used to be struct inode).
1599 - Add reference counting in ntfs_inode for the ntfs inode itself and
1600 for the mapped mft record.
1601 - Extend mft record mapping so we can (un)map extent mft records (new
1602 functions (un)map_extent_mft_record), and so mappings are reference
1603 counted and don't have to happen twice if already mapped - just ref
1604 count increases.
1605 - Add -o iocharset as alias to -o nls for backwards compatibility.
1606 - The latest core patch is now tiny. In fact just a single additional
1607 export is necessary over the base kernel.
1608
1609tng-0.0.3 - Cleanups, enhancements, bug fixes.
1610
1611 - Work on attrib.c::decompress_mapping_pairs() to detect base extents
1612 and setup the runlist appropriately using knowledge provided by the
1613 sizes in the base attribute record.
1614 - Balance the get_/put_attr_search_ctx() calls so we don't leak memory
1615 any more.
1616 - Introduce ntfs_malloc_nofs() and ntfs_free() to allocate/free a single
1617 page or use vmalloc depending on the amount of memory requested
     (sketched at the end of this entry).
1618 - Cleanup error output. The __FUNCTION__ "(): " is now added
1619 automatically. Introduced a new header file debug.h to support this
1620 and also moved ntfs_debug() function into it.
1621 - Make reading of compressed files more intelligent and especially get
1622 rid of the vmalloc_nofs() from readpage(). This now uses per CPU
1623 buffers (allocated at first mount with cluster size <= 4kiB and
1624 deallocated on last umount with cluster size <= 4kiB), and
1625 asynchronous io for the compressed data using a list of buffer heads.
1626 Er, we use synchronous io as async io only works on whole pages
1627 covered by buffers and not on individual buffer heads...
1628 - Bug fix for reading compressed files with sparse compression blocks.
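
        The allocator, roughly (a sketch of
        fs/ntfs/malloc.h::ntfs_malloc_nofs(); limits and flags are
        simplified):

            static inline void *ntfs_malloc_nofs(unsigned long size)
            {
                    if (likely(size <= PAGE_SIZE))
                            return kmalloc(PAGE_SIZE, GFP_NOFS);
                    if (likely(size >> PAGE_SHIFT < num_physpages))
                            return __vmalloc(size, GFP_NOFS, PAGE_KERNEL);
                    return NULL;
            }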
1629
1630tng-0.0.2 - Now handles larger/fragmented/compressed volumes/files/dirs.
1631
1632 - Fixed handling of directories when cluster size exceeds index block
1633 size.
1634 - Hide DOS only name space directory entries from readdir() but allow
1635 them in lookup(). This should fix the problem that Linux doesn't
1636 support directory hard links, while still allowing access to entries
1637 via their short file name. This also has the benefit of mimicking
1638 what Windows users are used to, so it is the ideal solution.
1639 - Implemented sync_page everywhere so no more hangs in D state when
1640 waiting for a page.
1641 - Stop using bforget() in favour of brelse().
1642 - Stop locking buffers unnecessarily.
1643 - Implemented compressed files (inode->mapping contains uncompressed
1644 data, raw compressed data is currently bread() into a vmalloc()ed
1645 memory buffer).
1646 - Enable compressed directories. (Their index root is marked compressed
1647 to indicate that new files in that directory should be created
1648 compressed.)
1649 - Use vsnprintf rather than vsprintf in the ntfs_error and ntfs_warning
1650 functions. (Thanks to Will Dyson for pointing this out.)
1651 - Moved the ntfs_inode and ntfs_volume (the former ntfs_inode_info and
1652 ntfs_sb_info) out of the common inode and super_block structures and
1653 started using the generic_ip and generic_sbp pointers instead. This
1654 makes ntfs entirely private with respect to the kernel tree.
1655 - Detect compiler version and abort with an error message if gcc
1656 older than 2.96 is used.
1657 - Fix bug in name comparison function in unistr.c.
1658 - Implement attribute lists part 1, the infrastructure: search contexts
1659 and operations (find_external_attr(), lookup_attr()) and make the
1660 code use the infrastructure.
1661 - Fix stupid buffer overflow bug that became apparent on larger run
1662 lists containing attributes.
1663 - Fix bugs in readdir() that became apparent on larger directories.
1664
1665 The driver is now really useful and survives the test
1666 find . -type f -exec md5sum "{}" \;
1667 without any error messages on an over 1GiB sized partition with >16k
1668 files on it, including compressed files and directories and many files
1669 and directories with attribute lists.
1670
1671tng-0.0.1 - The first useful version.
1672
1673 - Added ntfs_lookup().
1674 - Added default upcase generation and handling.
1675 - Added compile options to be shown on module init.
1676 - Many bug fixes that were "hidden" before.
1677 - Update to latest kernel.
1678 - Added ntfs_readdir().
1679 - Added file operations for mmap(), read(), open() and llseek(). We just
1680 use the generic ones. The whole point of going through implementing
1681 readpage() methods and where possible get_block() call backs is that
1682 this allows us to make use of the generic high level methods provided
1683 by the kernel.
1684
1685 The driver is now actually useful! Yey. (-: It undoubtedly has got bugs
1686 though and it doesn't implement accessing compressed files yet. Also,
1687 accessing files with attribute list attributes is not implemented yet
1688 either. But for small or simple filesystems it should work and allow
1689 you to list directories, use stat on directory entries and the file
1690 system, open, read, mmap and llseek around in files. A big milestone
1691 has been reached!
1692
1693tng-0.0.0 - Initial version tag.
1694
1695 Initial driver implementation. The driver can mount and umount simple
1696 NTFS filesystems (i.e. ones without attribute lists in the system
1697 files). If the mount fails there might be problems in the error handling
1698 code paths, so be warned. Otherwise it seems to be loading the system
1699 files nicely and the mft record read mapping/unmapping seems to be
1700 working nicely, too. The proof of concept for inode metadata in the
1701 page cache and non-resident file unnamed stream data in the page
1702 cache is thus complete.
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 5a9e34475e37..9173e82a45d1 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1545,7 +1545,7 @@ static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry,
 		write_inode_now(bmp_vi, !datasync);
 		iput(bmp_vi);
 	}
-	ret = ntfs_write_inode(vi, 1);
+	ret = __ntfs_write_inode(vi, 1);
 	write_inode_now(vi, !datasync);
 	err = sync_blockdev(vi->i_sb->s_bdev);
 	if (unlikely(err && !ret))
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 43179ddd336f..b681c71d7069 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2182,7 +2182,7 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
 	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
 	BUG_ON(S_ISDIR(vi->i_mode));
 	if (!datasync || !NInoNonResident(NTFS_I(vi)))
-		ret = ntfs_write_inode(vi, 1);
+		ret = __ntfs_write_inode(vi, 1);
 	write_inode_now(vi, !datasync);
 	/*
 	 * NOTE: If we were to use mapping->private_list (see ext2 and
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index dc2505abb6d7..4b57fb1eac2a 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2957,7 +2957,7 @@ out:
  *
  * Return 0 on success and -errno on error.
  */
-int ntfs_write_inode(struct inode *vi, int sync)
+int __ntfs_write_inode(struct inode *vi, int sync)
 {
 	sle64 nt;
 	ntfs_inode *ni = NTFS_I(vi);
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 117eaf8032a3..9a113544605d 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -307,12 +307,12 @@ extern void ntfs_truncate_vfs(struct inode *vi);
 
 extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr);
 
-extern int ntfs_write_inode(struct inode *vi, int sync);
+extern int __ntfs_write_inode(struct inode *vi, int sync);
 
 static inline void ntfs_commit_inode(struct inode *vi)
 {
 	if (!is_bad_inode(vi))
-		ntfs_write_inode(vi, 1);
+		__ntfs_write_inode(vi, 1);
 	return;
 }
 
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 80b04770e8e9..0de1db6cddbf 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -31,6 +31,7 @@
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/moduleparam.h> 32#include <linux/moduleparam.h>
33#include <linux/smp_lock.h> 33#include <linux/smp_lock.h>
34#include <linux/bitmap.h>
34 35
35#include "sysctl.h" 36#include "sysctl.h"
36#include "logfile.h" 37#include "logfile.h"
@@ -39,6 +40,7 @@
39#include "dir.h" 40#include "dir.h"
40#include "debug.h" 41#include "debug.h"
41#include "index.h" 42#include "index.h"
43#include "inode.h"
42#include "aops.h" 44#include "aops.h"
43#include "layout.h" 45#include "layout.h"
44#include "malloc.h" 46#include "malloc.h"
@@ -2457,7 +2459,6 @@ static void ntfs_put_super(struct super_block *sb)
2457static s64 get_nr_free_clusters(ntfs_volume *vol) 2459static s64 get_nr_free_clusters(ntfs_volume *vol)
2458{ 2460{
2459 s64 nr_free = vol->nr_clusters; 2461 s64 nr_free = vol->nr_clusters;
2460 u32 *kaddr;
2461 struct address_space *mapping = vol->lcnbmp_ino->i_mapping; 2462 struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
2462 struct page *page; 2463 struct page *page;
2463 pgoff_t index, max_index; 2464 pgoff_t index, max_index;
@@ -2476,7 +2477,8 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2476 ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.", 2477 ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
2477 max_index, PAGE_CACHE_SIZE / 4); 2478 max_index, PAGE_CACHE_SIZE / 4);
2478 for (index = 0; index < max_index; index++) { 2479 for (index = 0; index < max_index; index++) {
2479 unsigned int i; 2480 unsigned long *kaddr;
2481
2480 /* 2482 /*
2481 * Read the page from page cache, getting it from backing store 2483 * Read the page from page cache, getting it from backing store
2482 * if necessary, and increment the use count. 2484 * if necessary, and increment the use count.
@@ -2489,16 +2491,16 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2489 nr_free -= PAGE_CACHE_SIZE * 8; 2491 nr_free -= PAGE_CACHE_SIZE * 8;
2490 continue; 2492 continue;
2491 } 2493 }
2492 kaddr = (u32*)kmap_atomic(page, KM_USER0); 2494 kaddr = kmap_atomic(page, KM_USER0);
2493 /* 2495 /*
2494 * For each 4 bytes, subtract the number of set bits. If this 2496 * Subtract the number of set bits. If this
2495 * is the last page and it is partial we don't really care as 2497 * is the last page and it is partial we don't really care as
2496 * it just means we do a little extra work but it won't affect 2498 * it just means we do a little extra work but it won't affect
2497 * the result as all out of range bytes are set to zero by 2499 * the result as all out of range bytes are set to zero by
2498 * ntfs_readpage(). 2500 * ntfs_readpage().
2499 */ 2501 */
2500 for (i = 0; i < PAGE_CACHE_SIZE / 4; i++) 2502 nr_free -= bitmap_weight(kaddr,
2501 nr_free -= (s64)hweight32(kaddr[i]); 2503 PAGE_CACHE_SIZE * BITS_PER_BYTE);
2502 kunmap_atomic(kaddr, KM_USER0); 2504 kunmap_atomic(kaddr, KM_USER0);
2503 page_cache_release(page); 2505 page_cache_release(page);
2504 } 2506 }
@@ -2537,7 +2539,6 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2537static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, 2539static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2538 s64 nr_free, const pgoff_t max_index) 2540 s64 nr_free, const pgoff_t max_index)
2539{ 2541{
2540 u32 *kaddr;
2541 struct address_space *mapping = vol->mftbmp_ino->i_mapping; 2542 struct address_space *mapping = vol->mftbmp_ino->i_mapping;
2542 struct page *page; 2543 struct page *page;
2543 pgoff_t index; 2544 pgoff_t index;
@@ -2547,7 +2548,8 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2547 ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = " 2548 ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
2548 "0x%lx.", max_index, PAGE_CACHE_SIZE / 4); 2549 "0x%lx.", max_index, PAGE_CACHE_SIZE / 4);
2549 for (index = 0; index < max_index; index++) { 2550 for (index = 0; index < max_index; index++) {
2550 unsigned int i; 2551 unsigned long *kaddr;
2552
2551 /* 2553 /*
2552 * Read the page from page cache, getting it from backing store 2554 * Read the page from page cache, getting it from backing store
2553 * if necessary, and increment the use count. 2555 * if necessary, and increment the use count.
@@ -2560,16 +2562,16 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2560 nr_free -= PAGE_CACHE_SIZE * 8; 2562 nr_free -= PAGE_CACHE_SIZE * 8;
2561 continue; 2563 continue;
2562 } 2564 }
2563 kaddr = (u32*)kmap_atomic(page, KM_USER0); 2565 kaddr = kmap_atomic(page, KM_USER0);
2564 /* 2566 /*
2565 * For each 4 bytes, subtract the number of set bits. If this 2567 * Subtract the number of set bits. If this
2566 * is the last page and it is partial we don't really care as 2568 * is the last page and it is partial we don't really care as
2567 * it just means we do a little extra work but it won't affect 2569 * it just means we do a little extra work but it won't affect
2568 * the result as all out of range bytes are set to zero by 2570 * the result as all out of range bytes are set to zero by
2569 * ntfs_readpage(). 2571 * ntfs_readpage().
2570 */ 2572 */
2571 for (i = 0; i < PAGE_CACHE_SIZE / 4; i++) 2573 nr_free -= bitmap_weight(kaddr,
2572 nr_free -= (s64)hweight32(kaddr[i]); 2574 PAGE_CACHE_SIZE * BITS_PER_BYTE);
2573 kunmap_atomic(kaddr, KM_USER0); 2575 kunmap_atomic(kaddr, KM_USER0);
2574 page_cache_release(page); 2576 page_cache_release(page);
2575 } 2577 }
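The two hunks above are the same conversion: the open-coded loop that called hweight32() on each 32-bit word of the mapped bitmap page is replaced by a single bitmap_weight() call over the whole page, which is why kaddr changes from u32 * to unsigned long * and why <linux/bitmap.h> is now included. A minimal sketch of the resulting pattern, using the page-cache and kmap names of this kernel generation (PAGE_CACHE_SIZE, KM_USER0); the helper name is illustrative, not from the patch:

	/*
	 * Count the set bits in one bitmap page and subtract them from a
	 * running count of free units, as the hunks above do.
	 */
	static s64 subtract_used_bits(struct page *page, s64 nr_free)
	{
		unsigned long *kaddr = kmap_atomic(page, KM_USER0);

		/* One bitmap_weight() call covers all PAGE_CACHE_SIZE * 8 bits. */
		nr_free -= bitmap_weight(kaddr, PAGE_CACHE_SIZE * BITS_PER_BYTE);
		kunmap_atomic(kaddr, KM_USER0);
		return nr_free;
	}

One call per page also drops the per-iteration (s64) cast; the per-page bit count cannot exceed the number of bits in a page, so the subtraction stays safe.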
@@ -2662,6 +2664,13 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
2662 return 0; 2664 return 0;
2663} 2665}
2664 2666
2667#ifdef NTFS_RW
2668static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc)
2669{
2670 return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL);
2671}
2672#endif
2673
2665/** 2674/**
2666 * The complete super operations. 2675 * The complete super operations.
2667 */ 2676 */
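With the rename to __ntfs_write_inode(), super.c gains a thin ntfs_write_inode() wrapper whose signature matches what the VFS ->write_inode callback expects here: it maps wbc->sync_mode onto the old boolean sync argument. The super_operations initializer itself is outside the visible hunks, so the wiring below is a sketch of the standard pattern, not a quote from the patch:

	/* VFS entry point: translate the writeback mode to the old sync flag. */
	static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc)
	{
		return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL);
	}

	static const struct super_operations ntfs_sops = {
	#ifdef NTFS_RW
		.write_inode	= ntfs_write_inode,	/* hypothetical wiring */
	#endif
		/* ... remaining operations elided ... */
	};

Callers inside ntfs (dir.c, file.c, inode.h above) keep explicit sync semantics by calling __ntfs_write_inode() directly.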
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 600d2d2ade11..791c0886c060 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -46,6 +46,7 @@ ocfs2_stackglue-objs := stackglue.o
46ocfs2_stack_o2cb-objs := stack_o2cb.o 46ocfs2_stack_o2cb-objs := stack_o2cb.o
47ocfs2_stack_user-objs := stack_user.o 47ocfs2_stack_user-objs := stack_user.o
48 48
49obj-$(CONFIG_OCFS2_FS) += dlmfs/
49# cluster/ is always needed when OCFS2_FS for masklog support 50# cluster/ is always needed when OCFS2_FS for masklog support
50obj-$(CONFIG_OCFS2_FS) += cluster/ 51obj-$(CONFIG_OCFS2_FS) += cluster/
51obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/ 52obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 0501974bedd0..8ccf0f8c9cc8 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -30,6 +30,8 @@
30#include "alloc.h" 30#include "alloc.h"
31#include "dlmglue.h" 31#include "dlmglue.h"
32#include "file.h" 32#include "file.h"
33#include "inode.h"
34#include "journal.h"
33#include "ocfs2_fs.h" 35#include "ocfs2_fs.h"
34 36
35#include "xattr.h" 37#include "xattr.h"
@@ -166,6 +168,60 @@ static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
166} 168}
167 169
168/* 170/*
171 * Helper function to set i_mode in memory and disk. Some call paths
172 * will not have di_bh or a journal handle to pass, in which case it
 173	 * will create its own.
174 */
175static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
176 handle_t *handle, umode_t new_mode)
177{
178 int ret, commit_handle = 0;
179 struct ocfs2_dinode *di;
180
181 if (di_bh == NULL) {
182 ret = ocfs2_read_inode_block(inode, &di_bh);
183 if (ret) {
184 mlog_errno(ret);
185 goto out;
186 }
187 } else
188 get_bh(di_bh);
189
190 if (handle == NULL) {
191 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
192 OCFS2_INODE_UPDATE_CREDITS);
193 if (IS_ERR(handle)) {
194 ret = PTR_ERR(handle);
195 mlog_errno(ret);
196 goto out_brelse;
197 }
198
199 commit_handle = 1;
200 }
201
202 di = (struct ocfs2_dinode *)di_bh->b_data;
203 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
204 OCFS2_JOURNAL_ACCESS_WRITE);
205 if (ret) {
206 mlog_errno(ret);
207 goto out_commit;
208 }
209
210 inode->i_mode = new_mode;
211 di->i_mode = cpu_to_le16(inode->i_mode);
212
213 ocfs2_journal_dirty(handle, di_bh);
214
215out_commit:
216 if (commit_handle)
217 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
218out_brelse:
219 brelse(di_bh);
220out:
221 return ret;
222}
223
224/*
169 * Set the access or default ACL of an inode. 225 * Set the access or default ACL of an inode.
170 */ 226 */
171static int ocfs2_set_acl(handle_t *handle, 227static int ocfs2_set_acl(handle_t *handle,
@@ -193,9 +249,14 @@ static int ocfs2_set_acl(handle_t *handle,
193 if (ret < 0) 249 if (ret < 0)
194 return ret; 250 return ret;
195 else { 251 else {
196 inode->i_mode = mode;
197 if (ret == 0) 252 if (ret == 0)
198 acl = NULL; 253 acl = NULL;
254
255 ret = ocfs2_acl_set_mode(inode, di_bh,
256 handle, mode);
257 if (ret)
258 return ret;
259
199 } 260 }
200 } 261 }
201 break; 262 break;
@@ -283,6 +344,7 @@ int ocfs2_init_acl(handle_t *handle,
283 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 344 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
284 struct posix_acl *acl = NULL; 345 struct posix_acl *acl = NULL;
285 int ret = 0; 346 int ret = 0;
347 mode_t mode;
286 348
287 if (!S_ISLNK(inode->i_mode)) { 349 if (!S_ISLNK(inode->i_mode)) {
288 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { 350 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
@@ -291,12 +353,17 @@ int ocfs2_init_acl(handle_t *handle,
291 if (IS_ERR(acl)) 353 if (IS_ERR(acl))
292 return PTR_ERR(acl); 354 return PTR_ERR(acl);
293 } 355 }
294 if (!acl) 356 if (!acl) {
295 inode->i_mode &= ~current_umask(); 357 mode = inode->i_mode & ~current_umask();
358 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
359 if (ret) {
360 mlog_errno(ret);
361 goto cleanup;
362 }
363 }
296 } 364 }
297 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { 365 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
298 struct posix_acl *clone; 366 struct posix_acl *clone;
299 mode_t mode;
300 367
301 if (S_ISDIR(inode->i_mode)) { 368 if (S_ISDIR(inode->i_mode)) {
302 ret = ocfs2_set_acl(handle, inode, di_bh, 369 ret = ocfs2_set_acl(handle, inode, di_bh,
@@ -313,7 +380,7 @@ int ocfs2_init_acl(handle_t *handle,
313 mode = inode->i_mode; 380 mode = inode->i_mode;
314 ret = posix_acl_create_masq(clone, &mode); 381 ret = posix_acl_create_masq(clone, &mode);
315 if (ret >= 0) { 382 if (ret >= 0) {
316 inode->i_mode = mode; 383 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
317 if (ret > 0) { 384 if (ret > 0) {
318 ret = ocfs2_set_acl(handle, inode, 385 ret = ocfs2_set_acl(handle, inode,
319 di_bh, ACL_TYPE_ACCESS, 386 di_bh, ACL_TYPE_ACCESS,
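The new ocfs2_acl_set_mode() helper centralizes the i_mode update so that callers with and without a buffer_head or transaction can share it. The refcounting is the part worth noting: when the caller passes di_bh, the helper takes its own reference with get_bh(), and when it reads the block itself, ocfs2_read_inode_block() returns one, so the single brelse() at the exit label is balanced either way. A stripped-down sketch of that idiom (names illustrative, error paths shortened):

	static int update_inode_block(struct inode *inode, struct buffer_head *bh)
	{
		int ret = 0;

		if (bh == NULL)
			ret = ocfs2_read_inode_block(inode, &bh); /* takes a ref */
		else
			get_bh(bh);	/* mirror the ref the read path would take */
		if (ret)
			return ret;	/* read failed: no reference owned */

		/* ... journal access, modify the dinode, journal dirty ... */

		brelse(bh);	/* one reference owned, one release */
		return ret;
	}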
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d17bdc718f74..9f8bd913c51e 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1050,7 +1050,8 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1050 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 1050 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
1051 eb->h_blkno = cpu_to_le64(first_blkno); 1051 eb->h_blkno = cpu_to_le64(first_blkno);
1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1053 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); 1053 eb->h_suballoc_slot =
1054 cpu_to_le16(meta_ac->ac_alloc_slot);
1054 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1055 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1055 eb->h_list.l_count = 1056 eb->h_list.l_count =
1056 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 1057 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -5712,7 +5713,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5712 goto out; 5713 goto out;
5713 } 5714 }
5714 5715
5715 vfs_dq_free_space_nodirty(inode, 5716 dquot_free_space_nodirty(inode,
5716 ocfs2_clusters_to_bytes(inode->i_sb, len)); 5717 ocfs2_clusters_to_bytes(inode->i_sb, len));
5717 5718
5718 ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc); 5719 ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
@@ -6037,7 +6038,7 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
6037 if (status < 0) 6038 if (status < 0)
6038 mlog_errno(status); 6039 mlog_errno(status);
6039 else 6040 else
6040 ocfs2_init_inode_steal_slot(osb); 6041 ocfs2_init_steal_slots(osb);
6041 6042
6042 mlog_exit(status); 6043 mlog_exit(status);
6043} 6044}
@@ -6935,7 +6936,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6935 goto bail; 6936 goto bail;
6936 } 6937 }
6937 6938
6938 vfs_dq_free_space_nodirty(inode, 6939 dquot_free_space_nodirty(inode,
6939 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del)); 6940 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6940 spin_lock(&OCFS2_I(inode)->ip_lock); 6941 spin_lock(&OCFS2_I(inode)->ip_lock);
6941 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - 6942 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
@@ -7300,11 +7301,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7300 unsigned int page_end; 7301 unsigned int page_end;
7301 u64 phys; 7302 u64 phys;
7302 7303
7303 if (vfs_dq_alloc_space_nodirty(inode, 7304 ret = dquot_alloc_space_nodirty(inode,
7304 ocfs2_clusters_to_bytes(osb->sb, 1))) { 7305 ocfs2_clusters_to_bytes(osb->sb, 1));
7305 ret = -EDQUOT; 7306 if (ret)
7306 goto out_commit; 7307 goto out_commit;
7307 }
7308 did_quota = 1; 7308 did_quota = 1;
7309 7309
7310 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 7310 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
@@ -7380,7 +7380,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7380 7380
7381out_commit: 7381out_commit:
7382 if (ret < 0 && did_quota) 7382 if (ret < 0 && did_quota)
7383 vfs_dq_free_space_nodirty(inode, 7383 dquot_free_space_nodirty(inode,
7384 ocfs2_clusters_to_bytes(osb->sb, 1)); 7384 ocfs2_clusters_to_bytes(osb->sb, 1));
7385 7385
7386 ocfs2_commit_trans(osb, handle); 7386 ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 7e9df11260f4..21441ddb5506 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -577,8 +577,9 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
577 goto bail; 577 goto bail;
578 } 578 }
579 579
580 /* We should already CoW the refcounted extent. */ 580 /* We should already CoW the refcounted extent in case of create. */
581 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); 581 BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
582
582 /* 583 /*
583 * get_more_blocks() expects us to describe a hole by clearing 584 * get_more_blocks() expects us to describe a hole by clearing
584 * the mapped bit on bh_result(). 585 * the mapped bit on bh_result().
@@ -1763,10 +1764,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1763 1764
1764 wc->w_handle = handle; 1765 wc->w_handle = handle;
1765 1766
1766 if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode, 1767 if (clusters_to_alloc) {
1767 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) { 1768 ret = dquot_alloc_space_nodirty(inode,
1768 ret = -EDQUOT; 1769 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
1769 goto out_commit; 1770 if (ret)
1771 goto out_commit;
1770 } 1772 }
1771 /* 1773 /*
1772 * We don't want this to fail in ocfs2_write_end(), so do it 1774 * We don't want this to fail in ocfs2_write_end(), so do it
@@ -1809,7 +1811,7 @@ success:
1809 return 0; 1811 return 0;
1810out_quota: 1812out_quota:
1811 if (clusters_to_alloc) 1813 if (clusters_to_alloc)
1812 vfs_dq_free_space(inode, 1814 dquot_free_space(inode,
1813 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); 1815 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
1814out_commit: 1816out_commit:
1815 ocfs2_commit_trans(osb, handle); 1817 ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 1cd2934de615..3bb928a2bf7d 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -112,6 +112,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
112 define_mask(XATTR), 112 define_mask(XATTR),
113 define_mask(QUOTA), 113 define_mask(QUOTA),
114 define_mask(REFCOUNT), 114 define_mask(REFCOUNT),
115 define_mask(BASTS),
115 define_mask(ERROR), 116 define_mask(ERROR),
116 define_mask(NOTICE), 117 define_mask(NOTICE),
117 define_mask(KTHREAD), 118 define_mask(KTHREAD),
@@ -135,7 +136,7 @@ static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
135 return mlog_mask_store(mlog_attr->mask, buf, count); 136 return mlog_mask_store(mlog_attr->mask, buf, count);
136} 137}
137 138
138static struct sysfs_ops mlog_attr_ops = { 139static const struct sysfs_ops mlog_attr_ops = {
139 .show = mlog_show, 140 .show = mlog_show,
140 .store = mlog_store, 141 .store = mlog_store,
141}; 142};
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 9b4d11726cf2..3dfddbec32f2 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -114,6 +114,7 @@
114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ 114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ 115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */ 116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */
117#define ML_BASTS 0x0000001000000000ULL /* dlmglue asts and basts */
117/* bits that are infrequently given and frequently matched in the high word */ 118/* bits that are infrequently given and frequently matched in the high word */
118#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
119#define ML_NOTICE	0x0000000200000000ULL /* sent to KERN_NOTICE */ 120#define ML_NOTICE	0x0000000200000000ULL /* sent to KERN_NOTICE */
@@ -194,9 +195,9 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
194 * previous token if args expands to nothing. 195 * previous token if args expands to nothing.
195 */ 196 */
196#define __mlog_printk(level, fmt, args...) \ 197#define __mlog_printk(level, fmt, args...) \
197 printk(level "(%u,%lu):%s:%d " fmt, task_pid_nr(current), \ 198 printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \
198 __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ , \ 199 task_pid_nr(current), __mlog_cpu_guess, \
199 ##args) 200 __PRETTY_FUNCTION__, __LINE__ , ##args)
200 201
201#define mlog(mask, fmt, args...) do { \ 202#define mlog(mask, fmt, args...) do { \
202 u64 __m = MLOG_MASK_PREFIX | (mask); \ 203 u64 __m = MLOG_MASK_PREFIX | (mask); \
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index d8d0c65ac03c..73e743eea2c8 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -72,9 +72,9 @@
72 72
73#include "tcp_internal.h" 73#include "tcp_internal.h"
74 74
75#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u" 75#define SC_NODEF_FMT "node %s (num %u) at %pI4:%u"
76#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \ 76#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \
77 NIPQUAD(sc->sc_node->nd_ipv4_address), \ 77 &sc->sc_node->nd_ipv4_address, \
78 ntohs(sc->sc_node->nd_ipv4_port) 78 ntohs(sc->sc_node->nd_ipv4_port)
79 79
80/* 80/*
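The SC_NODEF_FMT change is a straight conversion from the old NIPQUAD() macro and its "%u.%u.%u.%u" format to the %pI4 printk extension, which takes a pointer to a big-endian 32-bit address and prints it in dotted-quad form. A minimal sketch of the idiom (function and variable names illustrative):

	/* %pI4 dereferences the pointer and formats the __be32 value. */
	static void print_peer(__be32 addr, __be16 port)
	{
		printk(KERN_INFO "peer at %pI4:%u\n", &addr, ntohs(port));
	}

The address is passed by pointer, not by value, which is what lets one format specifier replace the four NIPQUAD() byte extractions.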
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 28c3ec238796..efd77d071c80 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2439,7 +2439,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2439 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 2439 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2440 memset(dx_root, 0, osb->sb->s_blocksize); 2440 memset(dx_root, 0, osb->sb->s_blocksize);
2441 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); 2441 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2442 dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num); 2442 dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
2443 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); 2443 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2444 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); 2444 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2445 dx_root->dr_blkno = cpu_to_le64(dr_blkno); 2445 dx_root->dr_blkno = cpu_to_le64(dr_blkno);
@@ -2964,12 +2964,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2964 goto out; 2964 goto out;
2965 } 2965 }
2966 2966
2967 if (vfs_dq_alloc_space_nodirty(dir, 2967 ret = dquot_alloc_space_nodirty(dir,
2968 ocfs2_clusters_to_bytes(osb->sb, 2968 ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc));
2969 alloc + dx_alloc))) { 2969 if (ret)
2970 ret = -EDQUOT;
2971 goto out_commit; 2970 goto out_commit;
2972 }
2973 did_quota = 1; 2971 did_quota = 1;
2974 2972
2975 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { 2973 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
@@ -3178,7 +3176,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3178 3176
3179out_commit: 3177out_commit:
3180 if (ret < 0 && did_quota) 3178 if (ret < 0 && did_quota)
3181 vfs_dq_free_space_nodirty(dir, bytes_allocated); 3179 dquot_free_space_nodirty(dir, bytes_allocated);
3182 3180
3183 ocfs2_commit_trans(osb, handle); 3181 ocfs2_commit_trans(osb, handle);
3184 3182
@@ -3221,11 +3219,10 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
3221 if (extend) { 3219 if (extend) {
3222 u32 offset = OCFS2_I(dir)->ip_clusters; 3220 u32 offset = OCFS2_I(dir)->ip_clusters;
3223 3221
3224 if (vfs_dq_alloc_space_nodirty(dir, 3222 status = dquot_alloc_space_nodirty(dir,
3225 ocfs2_clusters_to_bytes(sb, 1))) { 3223 ocfs2_clusters_to_bytes(sb, 1));
3226 status = -EDQUOT; 3224 if (status)
3227 goto bail; 3225 goto bail;
3228 }
3229 did_quota = 1; 3226 did_quota = 1;
3230 3227
3231 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, 3228 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
@@ -3254,7 +3251,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
3254 status = 0; 3251 status = 0;
3255bail: 3252bail:
3256 if (did_quota && status < 0) 3253 if (did_quota && status < 0)
3257 vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); 3254 dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
3258 mlog_exit(status); 3255 mlog_exit(status);
3259 return status; 3256 return status;
3260} 3257}
@@ -3889,11 +3886,10 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3889 goto out; 3886 goto out;
3890 } 3887 }
3891 3888
3892 if (vfs_dq_alloc_space_nodirty(dir, 3889 ret = dquot_alloc_space_nodirty(dir,
3893 ocfs2_clusters_to_bytes(dir->i_sb, 1))) { 3890 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3894 ret = -EDQUOT; 3891 if (ret)
3895 goto out_commit; 3892 goto out_commit;
3896 }
3897 did_quota = 1; 3893 did_quota = 1;
3898 3894
3899 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, 3895 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
@@ -3983,7 +3979,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3983 3979
3984out_commit: 3980out_commit:
3985 if (ret < 0 && did_quota) 3981 if (ret < 0 && did_quota)
3986 vfs_dq_free_space_nodirty(dir, 3982 dquot_free_space_nodirty(dir,
3987 ocfs2_clusters_to_bytes(dir->i_sb, 1)); 3983 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3988 3984
3989 ocfs2_commit_trans(osb, handle); 3985 ocfs2_commit_trans(osb, handle);
@@ -4165,11 +4161,10 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
4165 goto out; 4161 goto out;
4166 } 4162 }
4167 4163
4168 if (vfs_dq_alloc_space_nodirty(dir, 4164 ret = dquot_alloc_space_nodirty(dir,
4169 ocfs2_clusters_to_bytes(osb->sb, 1))) { 4165 ocfs2_clusters_to_bytes(osb->sb, 1));
4170 ret = -EDQUOT; 4166 if (ret)
4171 goto out_commit; 4167 goto out_commit;
4172 }
4173 did_quota = 1; 4168 did_quota = 1;
4174 4169
4175 /* 4170 /*
@@ -4229,7 +4224,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
4229 4224
4230out_commit: 4225out_commit:
4231 if (ret < 0 && did_quota) 4226 if (ret < 0 && did_quota)
4232 vfs_dq_free_space_nodirty(dir, 4227 dquot_free_space_nodirty(dir,
4233 ocfs2_clusters_to_bytes(dir->i_sb, 1)); 4228 ocfs2_clusters_to_bytes(dir->i_sb, 1));
4234 4229
4235 ocfs2_commit_trans(osb, handle); 4230 ocfs2_commit_trans(osb, handle);
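Every ocfs2 hunk touching vfs_dq_alloc_space_nodirty()/vfs_dq_free_space_nodirty() in alloc.c, aops.c and dir.c above is the same mechanical conversion: the dquot_* API returns an errno (including -EDQUOT) instead of a boolean, so callers propagate the return value rather than hardcoding -EDQUOT. Before/after sketch with context trimmed ("bytes" is illustrative):

	/* old */
	if (vfs_dq_alloc_space_nodirty(inode, bytes)) {
		ret = -EDQUOT;
		goto out_commit;
	}

	/* new */
	ret = dquot_alloc_space_nodirty(inode, bytes);
	if (ret)
		goto out_commit;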
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index 190361375700..dcebf0d920fa 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,8 +1,7 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1EXTRA_CFLAGS += -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o 3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
4 4
5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ 5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o 6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
7 7
8ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a659606dcb95..9289b4357d27 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1875,7 +1875,6 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
1875ok: 1875ok:
1876 spin_unlock(&res->spinlock); 1876 spin_unlock(&res->spinlock);
1877 } 1877 }
1878 spin_unlock(&dlm->spinlock);
1879 1878
1880 // mlog(0, "woo! got an assert_master from node %u!\n", 1879 // mlog(0, "woo! got an assert_master from node %u!\n",
1881 // assert->node_idx); 1880 // assert->node_idx);
@@ -1926,7 +1925,6 @@ ok:
1926 /* master is known, detach if not already detached. 1925 /* master is known, detach if not already detached.
1927 * ensures that only one assert_master call will happen 1926 * ensures that only one assert_master call will happen
1928 * on this mle. */ 1927 * on this mle. */
1929 spin_lock(&dlm->spinlock);
1930 spin_lock(&dlm->master_lock); 1928 spin_lock(&dlm->master_lock);
1931 1929
1932 rr = atomic_read(&mle->mle_refs.refcount); 1930 rr = atomic_read(&mle->mle_refs.refcount);
@@ -1959,7 +1957,6 @@ ok:
1959 __dlm_put_mle(mle); 1957 __dlm_put_mle(mle);
1960 } 1958 }
1961 spin_unlock(&dlm->master_lock); 1959 spin_unlock(&dlm->master_lock);
1962 spin_unlock(&dlm->spinlock);
1963 } else if (res) { 1960 } else if (res) {
1964 if (res->owner != assert->node_idx) { 1961 if (res->owner != assert->node_idx) {
1965 mlog(0, "assert_master from %u, but current " 1962 mlog(0, "assert_master from %u, but current "
@@ -1967,6 +1964,7 @@ ok:
1967 res->owner, namelen, name); 1964 res->owner, namelen, name);
1968 } 1965 }
1969 } 1966 }
1967 spin_unlock(&dlm->spinlock);
1970 1968
1971done: 1969done:
1972 ret = 0; 1970 ret = 0;
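The dlmmaster.c hunks are a locking-scope fix rather than a functional change: instead of dropping dlm->spinlock after the resource checks and retaking it around the master_lock section, dlm_assert_master_handler() now holds it across the whole mle handling and releases it once at the end. The resulting nesting, sketched with the handler logic elided:

	spin_lock(&dlm->spinlock);
	/* resource lookup and owner checks ... */
	spin_lock(&dlm->master_lock);	/* nests inside dlm->spinlock */
	/* mle refcount handling ... */
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);	/* single release on the way out */

Holding dlm->spinlock continuously removes the window that existed between the old unlock/relock pair.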
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 344bcf90cbf4..b4f99de2caf3 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -310,7 +310,7 @@ static int dlm_recovery_thread(void *data)
310 mlog(0, "dlm thread running for %s...\n", dlm->name); 310 mlog(0, "dlm thread running for %s...\n", dlm->name);
311 311
312 while (!kthread_should_stop()) { 312 while (!kthread_should_stop()) {
313 if (dlm_joined(dlm)) { 313 if (dlm_domain_fully_joined(dlm)) {
314 status = dlm_do_recovery(dlm); 314 status = dlm_do_recovery(dlm);
315 if (status == -EAGAIN) { 315 if (status == -EAGAIN) {
316 /* do not sleep, recheck immediately. */ 316 /* do not sleep, recheck immediately. */
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
new file mode 100644
index 000000000000..df69b4856d0d
--- /dev/null
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -0,0 +1,5 @@
1EXTRA_CFLAGS += -Ifs/ocfs2
2
3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
4
5ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 02bf17808bdc..1b0de157a08c 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -43,24 +43,17 @@
43#include <linux/init.h> 43#include <linux/init.h>
44#include <linux/string.h> 44#include <linux/string.h>
45#include <linux/backing-dev.h> 45#include <linux/backing-dev.h>
46#include <linux/poll.h>
46 47
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
48 49
49 50#include "stackglue.h"
50#include "cluster/nodemanager.h"
51#include "cluster/heartbeat.h"
52#include "cluster/tcp.h"
53
54#include "dlmapi.h"
55
56#include "userdlm.h" 51#include "userdlm.h"
57
58#include "dlmfsver.h" 52#include "dlmfsver.h"
59 53
60#define MLOG_MASK_PREFIX ML_DLMFS 54#define MLOG_MASK_PREFIX ML_DLMFS
61#include "cluster/masklog.h" 55#include "cluster/masklog.h"
62 56
63#include "ocfs2_lockingver.h"
64 57
65static const struct super_operations dlmfs_ops; 58static const struct super_operations dlmfs_ops;
66static const struct file_operations dlmfs_file_operations; 59static const struct file_operations dlmfs_file_operations;
@@ -71,15 +64,46 @@ static struct kmem_cache *dlmfs_inode_cache;
71 64
72struct workqueue_struct *user_dlm_worker; 65struct workqueue_struct *user_dlm_worker;
73 66
67
68
74/* 69/*
75 * This is the userdlmfs locking protocol version. 70 * These are the ABI capabilities of dlmfs.
71 *
72 * Over time, dlmfs has added some features that were not part of the
73 * initial ABI. Unfortunately, some of these features are not detectable
74 * via standard usage. For example, Linux's default poll always returns
75 * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs
76 * added poll support. Instead, we provide this list of new capabilities.
77 *
78 * Capabilities is a read-only attribute. We do it as a module parameter
79 * so we can discover it whether dlmfs is built in, loaded, or even not
80 * loaded.
76 * 81 *
77 * See fs/ocfs2/dlmglue.c for more details on locking versions. 82 * The ABI features are local to this machine's dlmfs mount. This is
83 * distinct from the locking protocol, which is concerned with inter-node
84 * interaction.
85 *
86 * Capabilities:
87 * - bast : POLLIN against the file descriptor of a held lock
88 * signifies a bast fired on the lock.
78 */ 89 */
79static const struct dlm_protocol_version user_locking_protocol = { 90#define DLMFS_CAPABILITIES "bast stackglue"
80 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, 91extern int param_set_dlmfs_capabilities(const char *val,
81 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, 92 struct kernel_param *kp)
82}; 93{
94 printk(KERN_ERR "%s: readonly parameter\n", kp->name);
95 return -EINVAL;
96}
97static int param_get_dlmfs_capabilities(char *buffer,
98 struct kernel_param *kp)
99{
100 return strlcpy(buffer, DLMFS_CAPABILITIES,
101 strlen(DLMFS_CAPABILITIES) + 1);
102}
103module_param_call(capabilities, param_set_dlmfs_capabilities,
104 param_get_dlmfs_capabilities, NULL, 0444);
105MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
106
83 107
84/* 108/*
85 * decodes a set of open flags into a valid lock level and a set of flags. 109 * decodes a set of open flags into a valid lock level and a set of flags.
@@ -179,13 +203,46 @@ static int dlmfs_file_release(struct inode *inode,
179 return 0; 203 return 0;
180} 204}
181 205
206/*
207 * We do ->setattr() just to override size changes. Our size is the size
208 * of the LVB and nothing else.
209 */
210static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
211{
212 int error;
213 struct inode *inode = dentry->d_inode;
214
215 attr->ia_valid &= ~ATTR_SIZE;
216 error = inode_change_ok(inode, attr);
217 if (!error)
218 error = inode_setattr(inode, attr);
219
220 return error;
221}
222
223static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
224{
225 int event = 0;
226 struct inode *inode = file->f_path.dentry->d_inode;
227 struct dlmfs_inode_private *ip = DLMFS_I(inode);
228
229 poll_wait(file, &ip->ip_lockres.l_event, wait);
230
231 spin_lock(&ip->ip_lockres.l_lock);
232 if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED)
233 event = POLLIN | POLLRDNORM;
234 spin_unlock(&ip->ip_lockres.l_lock);
235
236 return event;
237}
238
182static ssize_t dlmfs_file_read(struct file *filp, 239static ssize_t dlmfs_file_read(struct file *filp,
183 char __user *buf, 240 char __user *buf,
184 size_t count, 241 size_t count,
185 loff_t *ppos) 242 loff_t *ppos)
186{ 243{
187 int bytes_left; 244 int bytes_left;
188 ssize_t readlen; 245 ssize_t readlen, got;
189 char *lvb_buf; 246 char *lvb_buf;
190 struct inode *inode = filp->f_path.dentry->d_inode; 247 struct inode *inode = filp->f_path.dentry->d_inode;
191 248
@@ -211,9 +268,13 @@ static ssize_t dlmfs_file_read(struct file *filp,
211 if (!lvb_buf) 268 if (!lvb_buf)
212 return -ENOMEM; 269 return -ENOMEM;
213 270
214 user_dlm_read_lvb(inode, lvb_buf, readlen); 271 got = user_dlm_read_lvb(inode, lvb_buf, readlen);
215 bytes_left = __copy_to_user(buf, lvb_buf, readlen); 272 if (got) {
216 readlen -= bytes_left; 273 BUG_ON(got != readlen);
274 bytes_left = __copy_to_user(buf, lvb_buf, readlen);
275 readlen -= bytes_left;
276 } else
277 readlen = 0;
217 278
218 kfree(lvb_buf); 279 kfree(lvb_buf);
219 280
@@ -272,7 +333,7 @@ static void dlmfs_init_once(void *foo)
272 struct dlmfs_inode_private *ip = 333 struct dlmfs_inode_private *ip =
273 (struct dlmfs_inode_private *) foo; 334 (struct dlmfs_inode_private *) foo;
274 335
275 ip->ip_dlm = NULL; 336 ip->ip_conn = NULL;
276 ip->ip_parent = NULL; 337 ip->ip_parent = NULL;
277 338
278 inode_init_once(&ip->ip_vfs_inode); 339 inode_init_once(&ip->ip_vfs_inode);
@@ -314,14 +375,14 @@ static void dlmfs_clear_inode(struct inode *inode)
314 goto clear_fields; 375 goto clear_fields;
315 } 376 }
316 377
317 mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm); 378 mlog(0, "we're a directory, ip->ip_conn = 0x%p\n", ip->ip_conn);
318 /* we must be a directory. If required, lets unregister the 379 /* we must be a directory. If required, lets unregister the
319 * dlm context now. */ 380 * dlm context now. */
320 if (ip->ip_dlm) 381 if (ip->ip_conn)
321 user_dlm_unregister_context(ip->ip_dlm); 382 user_dlm_unregister(ip->ip_conn);
322clear_fields: 383clear_fields:
323 ip->ip_parent = NULL; 384 ip->ip_parent = NULL;
324 ip->ip_dlm = NULL; 385 ip->ip_conn = NULL;
325} 386}
326 387
327static struct backing_dev_info dlmfs_backing_dev_info = { 388static struct backing_dev_info dlmfs_backing_dev_info = {
@@ -371,7 +432,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
371 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 432 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
372 433
373 ip = DLMFS_I(inode); 434 ip = DLMFS_I(inode);
374 ip->ip_dlm = DLMFS_I(parent)->ip_dlm; 435 ip->ip_conn = DLMFS_I(parent)->ip_conn;
375 436
376 switch (mode & S_IFMT) { 437 switch (mode & S_IFMT) {
377 default: 438 default:
@@ -425,13 +486,12 @@ static int dlmfs_mkdir(struct inode * dir,
425 struct inode *inode = NULL; 486 struct inode *inode = NULL;
426 struct qstr *domain = &dentry->d_name; 487 struct qstr *domain = &dentry->d_name;
427 struct dlmfs_inode_private *ip; 488 struct dlmfs_inode_private *ip;
428 struct dlm_ctxt *dlm; 489 struct ocfs2_cluster_connection *conn;
429 struct dlm_protocol_version proto = user_locking_protocol;
430 490
431 mlog(0, "mkdir %.*s\n", domain->len, domain->name); 491 mlog(0, "mkdir %.*s\n", domain->len, domain->name);
432 492
433 /* verify that we have a proper domain */ 493 /* verify that we have a proper domain */
434 if (domain->len >= O2NM_MAX_NAME_LEN) { 494 if (domain->len >= GROUP_NAME_MAX) {
435 status = -EINVAL; 495 status = -EINVAL;
436 mlog(ML_ERROR, "invalid domain name for directory.\n"); 496 mlog(ML_ERROR, "invalid domain name for directory.\n");
437 goto bail; 497 goto bail;
@@ -446,14 +506,14 @@ static int dlmfs_mkdir(struct inode * dir,
446 506
447 ip = DLMFS_I(inode); 507 ip = DLMFS_I(inode);
448 508
449 dlm = user_dlm_register_context(domain, &proto); 509 conn = user_dlm_register(domain);
450 if (IS_ERR(dlm)) { 510 if (IS_ERR(conn)) {
451 status = PTR_ERR(dlm); 511 status = PTR_ERR(conn);
452 mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n", 512 mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
453 status, domain->len, domain->name); 513 status, domain->len, domain->name);
454 goto bail; 514 goto bail;
455 } 515 }
456 ip->ip_dlm = dlm; 516 ip->ip_conn = conn;
457 517
458 inc_nlink(dir); 518 inc_nlink(dir);
459 d_instantiate(dentry, inode); 519 d_instantiate(dentry, inode);
@@ -549,6 +609,7 @@ static int dlmfs_fill_super(struct super_block * sb,
549static const struct file_operations dlmfs_file_operations = { 609static const struct file_operations dlmfs_file_operations = {
550 .open = dlmfs_file_open, 610 .open = dlmfs_file_open,
551 .release = dlmfs_file_release, 611 .release = dlmfs_file_release,
612 .poll = dlmfs_file_poll,
552 .read = dlmfs_file_read, 613 .read = dlmfs_file_read,
553 .write = dlmfs_file_write, 614 .write = dlmfs_file_write,
554}; 615};
@@ -576,6 +637,7 @@ static const struct super_operations dlmfs_ops = {
576 637
577static const struct inode_operations dlmfs_file_inode_operations = { 638static const struct inode_operations dlmfs_file_inode_operations = {
578 .getattr = simple_getattr, 639 .getattr = simple_getattr,
640 .setattr = dlmfs_file_setattr,
579}; 641};
580 642
581static int dlmfs_get_sb(struct file_system_type *fs_type, 643static int dlmfs_get_sb(struct file_system_type *fs_type,
@@ -620,6 +682,7 @@ static int __init init_dlmfs_fs(void)
620 } 682 }
621 cleanup_worker = 1; 683 cleanup_worker = 1;
622 684
685 user_dlm_set_locking_protocol();
623 status = register_filesystem(&dlmfs_fs_type); 686 status = register_filesystem(&dlmfs_fs_type);
624bail: 687bail:
625 if (status) { 688 if (status) {
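Taken together, the dlmfs changes above give userspace a way to wait for blocking ASTs: dlmfs_file_poll() reports POLLIN | POLLRDNORM while USER_LOCK_BLOCKED is set, and the "bast" capability string lets a program detect that this support exists before relying on it. A hedged userspace sketch; the mount point and lock names are illustrative, and dlmfs encodes the requested level in the open flags, with O_RDWR taking the lock exclusive:

	#include <fcntl.h>
	#include <poll.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* Illustrative path: a lock file in a dlmfs domain directory. */
		int fd = open("/dlm/mydomain/mylock", O_RDWR);	/* EX lock */
		struct pollfd pfd = { .fd = fd, .events = POLLIN };

		if (fd < 0)
			return 1;
		/* POLLIN means a bast fired: another node wants this lock. */
		if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
			printf("blocking AST received, releasing lock\n");
		close(fd);	/* drops the cluster lock */
		return 0;
	}

The capabilities list itself can be read from the read-only module parameter (e.g. under /sys/module/ocfs2_dlmfs/parameters/ when the module is present), matching the comment block added to dlmfs.c.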
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c
index a733b3321f83..a733b3321f83 100644
--- a/fs/ocfs2/dlm/dlmfsver.c
+++ b/fs/ocfs2/dlmfs/dlmfsver.c
diff --git a/fs/ocfs2/dlm/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h
index f35eadbed25c..f35eadbed25c 100644
--- a/fs/ocfs2/dlm/dlmfsver.h
+++ b/fs/ocfs2/dlmfs/dlmfsver.h
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 4cb1d3dae250..0499e3fb7bdb 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -34,18 +34,19 @@
34#include <linux/types.h> 34#include <linux/types.h>
35#include <linux/crc32.h> 35#include <linux/crc32.h>
36 36
37 37#include "ocfs2_lockingver.h"
38#include "cluster/nodemanager.h" 38#include "stackglue.h"
39#include "cluster/heartbeat.h"
40#include "cluster/tcp.h"
41
42#include "dlmapi.h"
43
44#include "userdlm.h" 39#include "userdlm.h"
45 40
46#define MLOG_MASK_PREFIX ML_DLMFS 41#define MLOG_MASK_PREFIX ML_DLMFS
47#include "cluster/masklog.h" 42#include "cluster/masklog.h"
48 43
44
45static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
46{
47 return container_of(lksb, struct user_lock_res, l_lksb);
48}
49
49static inline int user_check_wait_flag(struct user_lock_res *lockres, 50static inline int user_check_wait_flag(struct user_lock_res *lockres,
50 int flag) 51 int flag)
51{ 52{
@@ -73,15 +74,15 @@ static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
73} 74}
74 75
75/* I heart container_of... */ 76/* I heart container_of... */
76static inline struct dlm_ctxt * 77static inline struct ocfs2_cluster_connection *
77dlm_ctxt_from_user_lockres(struct user_lock_res *lockres) 78cluster_connection_from_user_lockres(struct user_lock_res *lockres)
78{ 79{
79 struct dlmfs_inode_private *ip; 80 struct dlmfs_inode_private *ip;
80 81
81 ip = container_of(lockres, 82 ip = container_of(lockres,
82 struct dlmfs_inode_private, 83 struct dlmfs_inode_private,
83 ip_lockres); 84 ip_lockres);
84 return ip->ip_dlm; 85 return ip->ip_conn;
85} 86}
86 87
87static struct inode * 88static struct inode *
@@ -103,9 +104,9 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
103} 104}
104 105
105#define user_log_dlm_error(_func, _stat, _lockres) do { \ 106#define user_log_dlm_error(_func, _stat, _lockres) do { \
106 mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ 107 mlog(ML_ERROR, "Dlm error %d while calling %s on " \
107 "resource %.*s: %s\n", dlm_errname(_stat), _func, \ 108 "resource %.*s\n", _stat, _func, \
108 _lockres->l_namelen, _lockres->l_name, dlm_errmsg(_stat)); \ 109 _lockres->l_namelen, _lockres->l_name); \
109} while (0) 110} while (0)
110 111
111/* WARNING: This function lives in a world where the only three lock 112/* WARNING: This function lives in a world where the only three lock
@@ -113,34 +114,35 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
113 * lock types are added. */ 114 * lock types are added. */
114static inline int user_highest_compat_lock_level(int level) 115static inline int user_highest_compat_lock_level(int level)
115{ 116{
116 int new_level = LKM_EXMODE; 117 int new_level = DLM_LOCK_EX;
117 118
118 if (level == LKM_EXMODE) 119 if (level == DLM_LOCK_EX)
119 new_level = LKM_NLMODE; 120 new_level = DLM_LOCK_NL;
120 else if (level == LKM_PRMODE) 121 else if (level == DLM_LOCK_PR)
121 new_level = LKM_PRMODE; 122 new_level = DLM_LOCK_PR;
122 return new_level; 123 return new_level;
123} 124}
124 125
125static void user_ast(void *opaque) 126static void user_ast(struct ocfs2_dlm_lksb *lksb)
126{ 127{
127 struct user_lock_res *lockres = opaque; 128 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
128 struct dlm_lockstatus *lksb; 129 int status;
129 130
130 mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen, 131 mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n",
131 lockres->l_name); 132 lockres->l_namelen, lockres->l_name, lockres->l_level,
133 lockres->l_requested);
132 134
133 spin_lock(&lockres->l_lock); 135 spin_lock(&lockres->l_lock);
134 136
135 lksb = &(lockres->l_lksb); 137 status = ocfs2_dlm_lock_status(&lockres->l_lksb);
136 if (lksb->status != DLM_NORMAL) { 138 if (status) {
137 mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n", 139 mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
138 lksb->status, lockres->l_namelen, lockres->l_name); 140 status, lockres->l_namelen, lockres->l_name);
139 spin_unlock(&lockres->l_lock); 141 spin_unlock(&lockres->l_lock);
140 return; 142 return;
141 } 143 }
142 144
143 mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE, 145 mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV,
144 "Lockres %.*s, requested ivmode. flags 0x%x\n", 146 "Lockres %.*s, requested ivmode. flags 0x%x\n",
145 lockres->l_namelen, lockres->l_name, lockres->l_flags); 147 lockres->l_namelen, lockres->l_name, lockres->l_flags);
146 148
@@ -148,13 +150,13 @@ static void user_ast(void *opaque)
148 if (lockres->l_requested < lockres->l_level) { 150 if (lockres->l_requested < lockres->l_level) {
149 if (lockres->l_requested <= 151 if (lockres->l_requested <=
150 user_highest_compat_lock_level(lockres->l_blocking)) { 152 user_highest_compat_lock_level(lockres->l_blocking)) {
151 lockres->l_blocking = LKM_NLMODE; 153 lockres->l_blocking = DLM_LOCK_NL;
152 lockres->l_flags &= ~USER_LOCK_BLOCKED; 154 lockres->l_flags &= ~USER_LOCK_BLOCKED;
153 } 155 }
154 } 156 }
155 157
156 lockres->l_level = lockres->l_requested; 158 lockres->l_level = lockres->l_requested;
157 lockres->l_requested = LKM_IVMODE; 159 lockres->l_requested = DLM_LOCK_IV;
158 lockres->l_flags |= USER_LOCK_ATTACHED; 160 lockres->l_flags |= USER_LOCK_ATTACHED;
159 lockres->l_flags &= ~USER_LOCK_BUSY; 161 lockres->l_flags &= ~USER_LOCK_BUSY;
160 162
@@ -193,11 +195,11 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
193 return; 195 return;
194 196
195 switch (lockres->l_blocking) { 197 switch (lockres->l_blocking) {
196 case LKM_EXMODE: 198 case DLM_LOCK_EX:
197 if (!lockres->l_ex_holders && !lockres->l_ro_holders) 199 if (!lockres->l_ex_holders && !lockres->l_ro_holders)
198 queue = 1; 200 queue = 1;
199 break; 201 break;
200 case LKM_PRMODE: 202 case DLM_LOCK_PR:
201 if (!lockres->l_ex_holders) 203 if (!lockres->l_ex_holders)
202 queue = 1; 204 queue = 1;
203 break; 205 break;
@@ -209,12 +211,12 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
209 __user_dlm_queue_lockres(lockres); 211 __user_dlm_queue_lockres(lockres);
210} 212}
211 213
212static void user_bast(void *opaque, int level) 214static void user_bast(struct ocfs2_dlm_lksb *lksb, int level)
213{ 215{
214 struct user_lock_res *lockres = opaque; 216 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
215 217
216 mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n", 218 mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n",
217 lockres->l_namelen, lockres->l_name, level); 219 lockres->l_namelen, lockres->l_name, level, lockres->l_level);
218 220
219 spin_lock(&lockres->l_lock); 221 spin_lock(&lockres->l_lock);
220 lockres->l_flags |= USER_LOCK_BLOCKED; 222 lockres->l_flags |= USER_LOCK_BLOCKED;
@@ -227,15 +229,15 @@ static void user_bast(void *opaque, int level)
227 wake_up(&lockres->l_event); 229 wake_up(&lockres->l_event);
228} 230}
229 231
230static void user_unlock_ast(void *opaque, enum dlm_status status) 232static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status)
231{ 233{
232 struct user_lock_res *lockres = opaque; 234 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
233 235
234 mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen, 236 mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n",
235 lockres->l_name); 237 lockres->l_namelen, lockres->l_name, lockres->l_flags);
236 238
237 if (status != DLM_NORMAL && status != DLM_CANCELGRANT) 239 if (status)
238 mlog(ML_ERROR, "Dlm returns status %d\n", status); 240 mlog(ML_ERROR, "dlm returns status %d\n", status);
239 241
240 spin_lock(&lockres->l_lock); 242 spin_lock(&lockres->l_lock);
241 /* The teardown flag gets set early during the unlock process, 243 /* The teardown flag gets set early during the unlock process,
@@ -243,7 +245,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
243 * for a concurrent cancel. */ 245 * for a concurrent cancel. */
244 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN 246 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN
245 && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { 247 && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
246 lockres->l_level = LKM_IVMODE; 248 lockres->l_level = DLM_LOCK_IV;
247 } else if (status == DLM_CANCELGRANT) { 249 } else if (status == DLM_CANCELGRANT) {
248 /* We tried to cancel a convert request, but it was 250 /* We tried to cancel a convert request, but it was
249 * already granted. Don't clear the busy flag - the 251 * already granted. Don't clear the busy flag - the
@@ -254,7 +256,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
254 } else { 256 } else {
255 BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); 257 BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
256 /* Cancel succeeded, we want to re-queue */ 258 /* Cancel succeeded, we want to re-queue */
257 lockres->l_requested = LKM_IVMODE; /* cancel an 259 lockres->l_requested = DLM_LOCK_IV; /* cancel an
258 * upconvert 260 * upconvert
259 * request. */ 261 * request. */
260 lockres->l_flags &= ~USER_LOCK_IN_CANCEL; 262 lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
@@ -271,6 +273,21 @@ out_noclear:
271 wake_up(&lockres->l_event); 273 wake_up(&lockres->l_event);
272} 274}
273 275
276/*
277 * This is the userdlmfs locking protocol version.
278 *
279 * See fs/ocfs2/dlmglue.c for more details on locking versions.
280 */
281static struct ocfs2_locking_protocol user_dlm_lproto = {
282 .lp_max_version = {
283 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
284 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
285 },
286 .lp_lock_ast = user_ast,
287 .lp_blocking_ast = user_bast,
288 .lp_unlock_ast = user_unlock_ast,
289};
290
274static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres) 291static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
275{ 292{
276 struct inode *inode; 293 struct inode *inode;
@@ -283,10 +300,10 @@ static void user_dlm_unblock_lock(struct work_struct *work)
283 int new_level, status; 300 int new_level, status;
284 struct user_lock_res *lockres = 301 struct user_lock_res *lockres =
285 container_of(work, struct user_lock_res, l_work); 302 container_of(work, struct user_lock_res, l_work);
286 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 303 struct ocfs2_cluster_connection *conn =
304 cluster_connection_from_user_lockres(lockres);
287 305
288 mlog(0, "processing lockres %.*s\n", lockres->l_namelen, 306 mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
289 lockres->l_name);
290 307
291 spin_lock(&lockres->l_lock); 308 spin_lock(&lockres->l_lock);
292 309
@@ -304,17 +321,23 @@ static void user_dlm_unblock_lock(struct work_struct *work)
304 * flag, and finally we might get another bast which re-queues 321 * flag, and finally we might get another bast which re-queues
305 * us before our ast for the downconvert is called. */ 322 * us before our ast for the downconvert is called. */
306 if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { 323 if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
324 mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n",
325 lockres->l_namelen, lockres->l_name);
307 spin_unlock(&lockres->l_lock); 326 spin_unlock(&lockres->l_lock);
308 goto drop_ref; 327 goto drop_ref;
309 } 328 }
310 329
311 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { 330 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
331 mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n",
332 lockres->l_namelen, lockres->l_name);
312 spin_unlock(&lockres->l_lock); 333 spin_unlock(&lockres->l_lock);
313 goto drop_ref; 334 goto drop_ref;
314 } 335 }
315 336
316 if (lockres->l_flags & USER_LOCK_BUSY) { 337 if (lockres->l_flags & USER_LOCK_BUSY) {
317 if (lockres->l_flags & USER_LOCK_IN_CANCEL) { 338 if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
339 mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n",
340 lockres->l_namelen, lockres->l_name);
318 spin_unlock(&lockres->l_lock); 341 spin_unlock(&lockres->l_lock);
319 goto drop_ref; 342 goto drop_ref;
320 } 343 }
@@ -322,32 +345,31 @@ static void user_dlm_unblock_lock(struct work_struct *work)
322 lockres->l_flags |= USER_LOCK_IN_CANCEL; 345 lockres->l_flags |= USER_LOCK_IN_CANCEL;
323 spin_unlock(&lockres->l_lock); 346 spin_unlock(&lockres->l_lock);
324 347
325 status = dlmunlock(dlm, 348 status = ocfs2_dlm_unlock(conn, &lockres->l_lksb,
326 &lockres->l_lksb, 349 DLM_LKF_CANCEL);
327 LKM_CANCEL, 350 if (status)
328 user_unlock_ast, 351 user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
329 lockres);
330 if (status != DLM_NORMAL)
331 user_log_dlm_error("dlmunlock", status, lockres);
332 goto drop_ref; 352 goto drop_ref;
333 } 353 }
334 354
335 /* If there are still incompat holders, we can exit safely 355 /* If there are still incompat holders, we can exit safely
336 * without worrying about re-queueing this lock as that will 356 * without worrying about re-queueing this lock as that will
337 * happen on the last call to user_cluster_unlock. */ 357 * happen on the last call to user_cluster_unlock. */
338 if ((lockres->l_blocking == LKM_EXMODE) 358 if ((lockres->l_blocking == DLM_LOCK_EX)
339 && (lockres->l_ex_holders || lockres->l_ro_holders)) { 359 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
340 spin_unlock(&lockres->l_lock); 360 spin_unlock(&lockres->l_lock);
341 mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n", 361 mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n",
342 lockres->l_ro_holders, lockres->l_ex_holders); 362 lockres->l_namelen, lockres->l_name,
363 lockres->l_ex_holders, lockres->l_ro_holders);
343 goto drop_ref; 364 goto drop_ref;
344 } 365 }
345 366
346 if ((lockres->l_blocking == LKM_PRMODE) 367 if ((lockres->l_blocking == DLM_LOCK_PR)
347 && lockres->l_ex_holders) { 368 && lockres->l_ex_holders) {
348 spin_unlock(&lockres->l_lock); 369 spin_unlock(&lockres->l_lock);
349 mlog(0, "can't downconvert for pr: ex = %u\n", 370 mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n",
350 lockres->l_ex_holders); 371 lockres->l_namelen, lockres->l_name,
372 lockres->l_ex_holders);
351 goto drop_ref; 373 goto drop_ref;
352 } 374 }
353 375
@@ -355,22 +377,17 @@ static void user_dlm_unblock_lock(struct work_struct *work)
355 new_level = user_highest_compat_lock_level(lockres->l_blocking); 377 new_level = user_highest_compat_lock_level(lockres->l_blocking);
356 lockres->l_requested = new_level; 378 lockres->l_requested = new_level;
357 lockres->l_flags |= USER_LOCK_BUSY; 379 lockres->l_flags |= USER_LOCK_BUSY;
358 mlog(0, "Downconvert lock from %d to %d\n", 380 mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n",
359 lockres->l_level, new_level); 381 lockres->l_namelen, lockres->l_name, lockres->l_level, new_level);
360 spin_unlock(&lockres->l_lock); 382 spin_unlock(&lockres->l_lock);
361 383
362 /* need lock downconvert request now... */ 384 /* need lock downconvert request now... */
363 status = dlmlock(dlm, 385 status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb,
364 new_level, 386 DLM_LKF_CONVERT|DLM_LKF_VALBLK,
365 &lockres->l_lksb, 387 lockres->l_name,
366 LKM_CONVERT|LKM_VALBLK, 388 lockres->l_namelen);
367 lockres->l_name, 389 if (status) {
368 lockres->l_namelen, 390 user_log_dlm_error("ocfs2_dlm_lock", status, lockres);
369 user_ast,
370 lockres,
371 user_bast);
372 if (status != DLM_NORMAL) {
373 user_log_dlm_error("dlmlock", status, lockres);
374 user_recover_from_dlm_error(lockres); 391 user_recover_from_dlm_error(lockres);
375 } 392 }
376 393
@@ -382,10 +399,10 @@ static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
382 int level) 399 int level)
383{ 400{
384 switch(level) { 401 switch(level) {
385 case LKM_EXMODE: 402 case DLM_LOCK_EX:
386 lockres->l_ex_holders++; 403 lockres->l_ex_holders++;
387 break; 404 break;
388 case LKM_PRMODE: 405 case DLM_LOCK_PR:
389 lockres->l_ro_holders++; 406 lockres->l_ro_holders++;
390 break; 407 break;
391 default: 408 default:
@@ -410,20 +427,19 @@ int user_dlm_cluster_lock(struct user_lock_res *lockres,
410 int lkm_flags) 427 int lkm_flags)
411{ 428{
412 int status, local_flags; 429 int status, local_flags;
413 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 430 struct ocfs2_cluster_connection *conn =
431 cluster_connection_from_user_lockres(lockres);
414 432
415 if (level != LKM_EXMODE && 433 if (level != DLM_LOCK_EX &&
416 level != LKM_PRMODE) { 434 level != DLM_LOCK_PR) {
417 mlog(ML_ERROR, "lockres %.*s: invalid request!\n", 435 mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
418 lockres->l_namelen, lockres->l_name); 436 lockres->l_namelen, lockres->l_name);
419 status = -EINVAL; 437 status = -EINVAL;
420 goto bail; 438 goto bail;
421 } 439 }
422 440
423 mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n", 441 mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n",
424 lockres->l_namelen, lockres->l_name, 442 lockres->l_namelen, lockres->l_name, level, lkm_flags);
425 (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
426 lkm_flags);
427 443
428again: 444again:
429 if (signal_pending(current)) { 445 if (signal_pending(current)) {
@@ -457,35 +473,26 @@ again:
457 } 473 }
458 474
459 if (level > lockres->l_level) { 475 if (level > lockres->l_level) {
460 local_flags = lkm_flags | LKM_VALBLK; 476 local_flags = lkm_flags | DLM_LKF_VALBLK;
461 if (lockres->l_level != LKM_IVMODE) 477 if (lockres->l_level != DLM_LOCK_IV)
462 local_flags |= LKM_CONVERT; 478 local_flags |= DLM_LKF_CONVERT;
463 479
464 lockres->l_requested = level; 480 lockres->l_requested = level;
465 lockres->l_flags |= USER_LOCK_BUSY; 481 lockres->l_flags |= USER_LOCK_BUSY;
466 spin_unlock(&lockres->l_lock); 482 spin_unlock(&lockres->l_lock);
467 483
468 BUG_ON(level == LKM_IVMODE); 484 BUG_ON(level == DLM_LOCK_IV);
469 BUG_ON(level == LKM_NLMODE); 485 BUG_ON(level == DLM_LOCK_NL);
470 486
471 /* call dlm_lock to upgrade lock now */ 487 /* call dlm_lock to upgrade lock now */
472 status = dlmlock(dlm, 488 status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb,
473 level, 489 local_flags, lockres->l_name,
474 &lockres->l_lksb, 490 lockres->l_namelen);
475 local_flags, 491 if (status) {
476 lockres->l_name, 492 if ((lkm_flags & DLM_LKF_NOQUEUE) &&
477 lockres->l_namelen, 493 (status != -EAGAIN))
478 user_ast, 494 user_log_dlm_error("ocfs2_dlm_lock",
479 lockres, 495 status, lockres);
480 user_bast);
481 if (status != DLM_NORMAL) {
482 if ((lkm_flags & LKM_NOQUEUE) &&
483 (status == DLM_NOTQUEUED))
484 status = -EAGAIN;
485 else {
486 user_log_dlm_error("dlmlock", status, lockres);
487 status = -EINVAL;
488 }
489 user_recover_from_dlm_error(lockres); 496 user_recover_from_dlm_error(lockres);
490 goto bail; 497 goto bail;
491 } 498 }
@@ -506,11 +513,11 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
506 int level) 513 int level)
507{ 514{
508 switch(level) { 515 switch(level) {
509 case LKM_EXMODE: 516 case DLM_LOCK_EX:
510 BUG_ON(!lockres->l_ex_holders); 517 BUG_ON(!lockres->l_ex_holders);
511 lockres->l_ex_holders--; 518 lockres->l_ex_holders--;
512 break; 519 break;
513 case LKM_PRMODE: 520 case DLM_LOCK_PR:
514 BUG_ON(!lockres->l_ro_holders); 521 BUG_ON(!lockres->l_ro_holders);
515 lockres->l_ro_holders--; 522 lockres->l_ro_holders--;
516 break; 523 break;
@@ -522,8 +529,8 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
522void user_dlm_cluster_unlock(struct user_lock_res *lockres, 529void user_dlm_cluster_unlock(struct user_lock_res *lockres,
523 int level) 530 int level)
524{ 531{
525 if (level != LKM_EXMODE && 532 if (level != DLM_LOCK_EX &&
526 level != LKM_PRMODE) { 533 level != DLM_LOCK_PR) {
527 mlog(ML_ERROR, "lockres %.*s: invalid request!\n", 534 mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
528 lockres->l_namelen, lockres->l_name); 535 lockres->l_namelen, lockres->l_name);
529 return; 536 return;
@@ -540,33 +547,40 @@ void user_dlm_write_lvb(struct inode *inode,
540 unsigned int len) 547 unsigned int len)
541{ 548{
542 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; 549 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
543 char *lvb = lockres->l_lksb.lvb; 550 char *lvb;
544 551
545 BUG_ON(len > DLM_LVB_LEN); 552 BUG_ON(len > DLM_LVB_LEN);
546 553
547 spin_lock(&lockres->l_lock); 554 spin_lock(&lockres->l_lock);
548 555
549 BUG_ON(lockres->l_level < LKM_EXMODE); 556 BUG_ON(lockres->l_level < DLM_LOCK_EX);
557 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
550 memcpy(lvb, val, len); 558 memcpy(lvb, val, len);
551 559
552 spin_unlock(&lockres->l_lock); 560 spin_unlock(&lockres->l_lock);
553} 561}
554 562
555void user_dlm_read_lvb(struct inode *inode, 563ssize_t user_dlm_read_lvb(struct inode *inode,
556 char *val, 564 char *val,
557 unsigned int len) 565 unsigned int len)
558{ 566{
559 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; 567 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
560 char *lvb = lockres->l_lksb.lvb; 568 char *lvb;
569 ssize_t ret = len;
561 570
562 BUG_ON(len > DLM_LVB_LEN); 571 BUG_ON(len > DLM_LVB_LEN);
563 572
564 spin_lock(&lockres->l_lock); 573 spin_lock(&lockres->l_lock);
565 574
566 BUG_ON(lockres->l_level < LKM_PRMODE); 575 BUG_ON(lockres->l_level < DLM_LOCK_PR);
567 memcpy(val, lvb, len); 576 if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) {
577 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
578 memcpy(val, lvb, len);
579 } else
580 ret = 0;
568 581
569 spin_unlock(&lockres->l_lock); 582 spin_unlock(&lockres->l_lock);
583 return ret;
570} 584}
571 585
572void user_dlm_lock_res_init(struct user_lock_res *lockres, 586void user_dlm_lock_res_init(struct user_lock_res *lockres,
@@ -576,9 +590,9 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
576 590
577 spin_lock_init(&lockres->l_lock); 591 spin_lock_init(&lockres->l_lock);
578 init_waitqueue_head(&lockres->l_event); 592 init_waitqueue_head(&lockres->l_event);
579 lockres->l_level = LKM_IVMODE; 593 lockres->l_level = DLM_LOCK_IV;
580 lockres->l_requested = LKM_IVMODE; 594 lockres->l_requested = DLM_LOCK_IV;
581 lockres->l_blocking = LKM_IVMODE; 595 lockres->l_blocking = DLM_LOCK_IV;
582 596
583 /* should have been checked before getting here. */ 597 /* should have been checked before getting here. */
584 BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN); 598 BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
@@ -592,9 +606,10 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
592int user_dlm_destroy_lock(struct user_lock_res *lockres) 606int user_dlm_destroy_lock(struct user_lock_res *lockres)
593{ 607{
594 int status = -EBUSY; 608 int status = -EBUSY;
595 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 609 struct ocfs2_cluster_connection *conn =
610 cluster_connection_from_user_lockres(lockres);
596 611
597 mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name); 612 mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
598 613
599 spin_lock(&lockres->l_lock); 614 spin_lock(&lockres->l_lock);
600 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { 615 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
@@ -627,14 +642,9 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
627 lockres->l_flags |= USER_LOCK_BUSY; 642 lockres->l_flags |= USER_LOCK_BUSY;
628 spin_unlock(&lockres->l_lock); 643 spin_unlock(&lockres->l_lock);
629 644
630 status = dlmunlock(dlm, 645 status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK);
631 &lockres->l_lksb, 646 if (status) {
632 LKM_VALBLK, 647 user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
633 user_unlock_ast,
634 lockres);
635 if (status != DLM_NORMAL) {
636 user_log_dlm_error("dlmunlock", status, lockres);
637 status = -EINVAL;
638 goto bail; 648 goto bail;
639 } 649 }
640 650
@@ -645,32 +655,34 @@ bail:
645 return status; 655 return status;
646} 656}
647 657
648struct dlm_ctxt *user_dlm_register_context(struct qstr *name, 658static void user_dlm_recovery_handler_noop(int node_num,
649 struct dlm_protocol_version *proto) 659 void *recovery_data)
650{ 660{
651 struct dlm_ctxt *dlm; 661 /* We ignore recovery events */
652 u32 dlm_key; 662 return;
653 char *domain; 663}
654
655 domain = kmalloc(name->len + 1, GFP_NOFS);
656 if (!domain) {
657 mlog_errno(-ENOMEM);
658 return ERR_PTR(-ENOMEM);
659 }
660 664
661 dlm_key = crc32_le(0, name->name, name->len); 665void user_dlm_set_locking_protocol(void)
666{
667 ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version);
668}
662 669
663 snprintf(domain, name->len + 1, "%.*s", name->len, name->name); 670struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name)
671{
672 int rc;
673 struct ocfs2_cluster_connection *conn;
664 674
665 dlm = dlm_register_domain(domain, dlm_key, proto); 675 rc = ocfs2_cluster_connect_agnostic(name->name, name->len,
666 if (IS_ERR(dlm)) 676 &user_dlm_lproto,
667 mlog_errno(PTR_ERR(dlm)); 677 user_dlm_recovery_handler_noop,
678 NULL, &conn);
679 if (rc)
680 mlog_errno(rc);
668 681
669 kfree(domain); 682 return rc ? ERR_PTR(rc) : conn;
670 return dlm;
671} 683}
672 684
673void user_dlm_unregister_context(struct dlm_ctxt *dlm) 685void user_dlm_unregister(struct ocfs2_cluster_connection *conn)
674{ 686{
675 dlm_unregister_domain(dlm); 687 ocfs2_cluster_disconnect(conn, 0);
676} 688}
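The userdlm.c hunks above carry the core of this conversion: fs/dlmfs stops calling o2dlm directly and goes through the generic ocfs2 stack glue. A minimal before/after sketch of the calling convention, using only names visible in the diff (error handling abbreviated, so this is illustrative rather than a drop-in fragment):

	/* Before: o2dlm returns DLM_* status codes and takes the AST and
	 * BAST callbacks on every call. */
	status = dlmlock(dlm, new_level, &lockres->l_lksb,
			 LKM_CONVERT|LKM_VALBLK,
			 lockres->l_name, lockres->l_namelen,
			 user_ast, lockres, user_bast);
	if (status != DLM_NORMAL)
		user_log_dlm_error("dlmlock", status, lockres);

	/* After: the stack glue returns 0 or a negative errno, and the
	 * callbacks are registered once, via the ocfs2_locking_protocol
	 * passed to ocfs2_cluster_connect_agnostic() in user_dlm_register(). */
	status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb,
				DLM_LKF_CONVERT|DLM_LKF_VALBLK,
				lockres->l_name, lockres->l_namelen);
	if (status)
		user_log_dlm_error("ocfs2_dlm_lock", status, lockres);

A side effect worth noting: the hand-rolled mapping of DLM_NOTQUEUED to -EAGAIN disappears, because ocfs2_dlm_lock() already returns -EAGAIN when a DLM_LKF_NOQUEUE request cannot be granted.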
diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h
index 0c3cc03c61fa..3b42d79531d7 100644
--- a/fs/ocfs2/dlm/userdlm.h
+++ b/fs/ocfs2/dlmfs/userdlm.h
@@ -57,7 +57,7 @@ struct user_lock_res {
57 int l_level; 57 int l_level;
58 unsigned int l_ro_holders; 58 unsigned int l_ro_holders;
59 unsigned int l_ex_holders; 59 unsigned int l_ex_holders;
60 struct dlm_lockstatus l_lksb; 60 struct ocfs2_dlm_lksb l_lksb;
61 61
62 int l_requested; 62 int l_requested;
63 int l_blocking; 63 int l_blocking;
@@ -80,15 +80,15 @@ void user_dlm_cluster_unlock(struct user_lock_res *lockres,
80void user_dlm_write_lvb(struct inode *inode, 80void user_dlm_write_lvb(struct inode *inode,
81 const char *val, 81 const char *val,
82 unsigned int len); 82 unsigned int len);
83void user_dlm_read_lvb(struct inode *inode, 83ssize_t user_dlm_read_lvb(struct inode *inode,
84 char *val, 84 char *val,
85 unsigned int len); 85 unsigned int len);
86struct dlm_ctxt *user_dlm_register_context(struct qstr *name, 86struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name);
87 struct dlm_protocol_version *proto); 87void user_dlm_unregister(struct ocfs2_cluster_connection *conn);
88void user_dlm_unregister_context(struct dlm_ctxt *dlm); 88void user_dlm_set_locking_protocol(void);
89 89
90struct dlmfs_inode_private { 90struct dlmfs_inode_private {
91 struct dlm_ctxt *ip_dlm; 91 struct ocfs2_cluster_connection *ip_conn;
92 92
93 struct user_lock_res ip_lockres; /* unused for directories. */ 93 struct user_lock_res ip_lockres; /* unused for directories. */
94 struct inode *ip_parent; 94 struct inode *ip_parent;
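One interface change in this header is easy to skim past: user_dlm_read_lvb() now returns ssize_t instead of void. As the userdlm.c hunk shows, it returns len only when ocfs2_dlm_lvb_valid() reports a usable lock value block, and 0 otherwise. A hypothetical caller (not part of this patch) would therefore look like:

	char buf[DLM_LVB_LEN];
	ssize_t got = user_dlm_read_lvb(inode, buf, sizeof(buf));
	if (!got)
		return 0;	/* LVB not valid, e.g. lost across recovery */
	/* otherwise all sizeof(buf) bytes of buf are meaningful */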
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e044019cb3b1..50c4ee805da4 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -297,6 +297,11 @@ static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
297 lockres->l_type == OCFS2_LOCK_TYPE_OPEN; 297 lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
298} 298}
299 299
300static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
301{
302 return container_of(lksb, struct ocfs2_lock_res, l_lksb);
303}
304
300static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) 305static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
301{ 306{
302 BUG_ON(!ocfs2_is_inode_lock(lockres)); 307 BUG_ON(!ocfs2_is_inode_lock(lockres));
@@ -927,6 +932,10 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
927 lockres->l_blocking = level; 932 lockres->l_blocking = level;
928 } 933 }
929 934
935 mlog(ML_BASTS, "lockres %s, block %d, level %d, l_block %d, dwn %d\n",
936 lockres->l_name, level, lockres->l_level, lockres->l_blocking,
937 needs_downconvert);
938
930 if (needs_downconvert) 939 if (needs_downconvert)
931 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); 940 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
932 941
@@ -1040,18 +1049,17 @@ static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
1040 return lockres->l_pending_gen; 1049 return lockres->l_pending_gen;
1041} 1050}
1042 1051
1043 1052static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level)
1044static void ocfs2_blocking_ast(void *opaque, int level)
1045{ 1053{
1046 struct ocfs2_lock_res *lockres = opaque; 1054 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1047 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); 1055 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1048 int needs_downconvert; 1056 int needs_downconvert;
1049 unsigned long flags; 1057 unsigned long flags;
1050 1058
1051 BUG_ON(level <= DLM_LOCK_NL); 1059 BUG_ON(level <= DLM_LOCK_NL);
1052 1060
1053 mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n", 1061 mlog(ML_BASTS, "BAST fired for lockres %s, blocking %d, level %d, "
1054 lockres->l_name, level, lockres->l_level, 1062 "type %s\n", lockres->l_name, level, lockres->l_level,
1055 ocfs2_lock_type_string(lockres->l_type)); 1063 ocfs2_lock_type_string(lockres->l_type));
1056 1064
1057 /* 1065 /*
@@ -1072,9 +1080,9 @@ static void ocfs2_blocking_ast(void *opaque, int level)
1072 ocfs2_wake_downconvert_thread(osb); 1080 ocfs2_wake_downconvert_thread(osb);
1073} 1081}
1074 1082
1075static void ocfs2_locking_ast(void *opaque) 1083static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb)
1076{ 1084{
1077 struct ocfs2_lock_res *lockres = opaque; 1085 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1078 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); 1086 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1079 unsigned long flags; 1087 unsigned long flags;
1080 int status; 1088 int status;
@@ -1095,6 +1103,10 @@ static void ocfs2_locking_ast(void *opaque)
1095 return; 1103 return;
1096 } 1104 }
1097 1105
1106 mlog(ML_BASTS, "AST fired for lockres %s, action %d, unlock %d, "
1107 "level %d => %d\n", lockres->l_name, lockres->l_action,
1108 lockres->l_unlock_action, lockres->l_level, lockres->l_requested);
1109
1098 switch(lockres->l_action) { 1110 switch(lockres->l_action) {
1099 case OCFS2_AST_ATTACH: 1111 case OCFS2_AST_ATTACH:
1100 ocfs2_generic_handle_attach_action(lockres); 1112 ocfs2_generic_handle_attach_action(lockres);
@@ -1107,8 +1119,8 @@ static void ocfs2_locking_ast(void *opaque)
1107 ocfs2_generic_handle_downconvert_action(lockres); 1119 ocfs2_generic_handle_downconvert_action(lockres);
1108 break; 1120 break;
1109 default: 1121 default:
1110 mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u " 1122 mlog(ML_ERROR, "lockres %s: AST fired with invalid action: %u, "
1111 "lockres flags = 0x%lx, unlock action: %u\n", 1123 "flags 0x%lx, unlock: %u\n",
1112 lockres->l_name, lockres->l_action, lockres->l_flags, 1124 lockres->l_name, lockres->l_action, lockres->l_flags,
1113 lockres->l_unlock_action); 1125 lockres->l_unlock_action);
1114 BUG(); 1126 BUG();
@@ -1134,6 +1146,88 @@ out:
1134 spin_unlock_irqrestore(&lockres->l_lock, flags); 1146 spin_unlock_irqrestore(&lockres->l_lock, flags);
1135} 1147}
1136 1148
1149static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
1150{
1151 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1152 unsigned long flags;
1153
1154 mlog_entry_void();
1155
1156 mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n",
1157 lockres->l_name, lockres->l_unlock_action);
1158
1159 spin_lock_irqsave(&lockres->l_lock, flags);
1160 if (error) {
1161 mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
1162 "unlock_action %d\n", error, lockres->l_name,
1163 lockres->l_unlock_action);
1164 spin_unlock_irqrestore(&lockres->l_lock, flags);
1165 mlog_exit_void();
1166 return;
1167 }
1168
1169 switch(lockres->l_unlock_action) {
1170 case OCFS2_UNLOCK_CANCEL_CONVERT:
1171 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
1172 lockres->l_action = OCFS2_AST_INVALID;
1173 /* Downconvert thread may have requeued this lock, we
1174 * need to wake it. */
1175 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
1176 ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
1177 break;
1178 case OCFS2_UNLOCK_DROP_LOCK:
1179 lockres->l_level = DLM_LOCK_IV;
1180 break;
1181 default:
1182 BUG();
1183 }
1184
1185 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
1186 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
1187 wake_up(&lockres->l_event);
1188 spin_unlock_irqrestore(&lockres->l_lock, flags);
1189
1190 mlog_exit_void();
1191}
1192
1193/*
1194 * This is the filesystem locking protocol. It provides the lock handling
1195 * hooks for the underlying DLM. It has a maximum version number.
1196 * The version number allows interoperability with systems running at
1197 * the same major number and an equal or smaller minor number.
1198 *
1199 * Whenever the filesystem does new things with locks (adds or removes a
1200 * lock, orders them differently, does different things underneath a lock),
1201 * the version must be changed. The protocol is negotiated when joining
1202 * the dlm domain. A node may join the domain if its major version is
1203 * identical to all other nodes and its minor version is greater than
1204 * or equal to all other nodes. When its minor version is greater than
1205 * the other nodes, it will run at the minor version specified by the
1206 * other nodes.
1207 *
1208 * If a locking change is made that will not be compatible with older
1209 * versions, the major number must be increased and the minor version set
1210 * to zero. If a change merely adds a behavior that can be disabled when
1211 * speaking to older versions, the minor version must be increased. If a
1212 * change adds a fully backwards compatible change (eg, LVB changes that
1213 * are just ignored by older versions), the version does not need to be
1214 * updated.
1215 */
1216static struct ocfs2_locking_protocol lproto = {
1217 .lp_max_version = {
1218 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
1219 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
1220 },
1221 .lp_lock_ast = ocfs2_locking_ast,
1222 .lp_blocking_ast = ocfs2_blocking_ast,
1223 .lp_unlock_ast = ocfs2_unlock_ast,
1224};
1225
1226void ocfs2_set_locking_protocol(void)
1227{
1228 ocfs2_stack_glue_set_max_proto_version(&lproto.lp_max_version);
1229}
1230
1137static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, 1231static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
1138 int convert) 1232 int convert)
1139{ 1233{
@@ -1189,8 +1283,7 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
1189 &lockres->l_lksb, 1283 &lockres->l_lksb,
1190 dlm_flags, 1284 dlm_flags,
1191 lockres->l_name, 1285 lockres->l_name,
1192 OCFS2_LOCK_ID_MAX_LEN - 1, 1286 OCFS2_LOCK_ID_MAX_LEN - 1);
1193 lockres);
1194 lockres_clear_pending(lockres, gen, osb); 1287 lockres_clear_pending(lockres, gen, osb);
1195 if (ret) { 1288 if (ret) {
1196 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 1289 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -1412,7 +1505,7 @@ again:
1412 BUG_ON(level == DLM_LOCK_IV); 1505 BUG_ON(level == DLM_LOCK_IV);
1413 BUG_ON(level == DLM_LOCK_NL); 1506 BUG_ON(level == DLM_LOCK_NL);
1414 1507
1415 mlog(0, "lock %s, convert from %d to level = %d\n", 1508 mlog(ML_BASTS, "lockres %s, convert from %d to %d\n",
1416 lockres->l_name, lockres->l_level, level); 1509 lockres->l_name, lockres->l_level, level);
1417 1510
1418 /* call dlm_lock to upgrade lock now */ 1511 /* call dlm_lock to upgrade lock now */
@@ -1421,8 +1514,7 @@ again:
1421 &lockres->l_lksb, 1514 &lockres->l_lksb,
1422 lkm_flags, 1515 lkm_flags,
1423 lockres->l_name, 1516 lockres->l_name,
1424 OCFS2_LOCK_ID_MAX_LEN - 1, 1517 OCFS2_LOCK_ID_MAX_LEN - 1);
1425 lockres);
1426 lockres_clear_pending(lockres, gen, osb); 1518 lockres_clear_pending(lockres, gen, osb);
1427 if (ret) { 1519 if (ret) {
1428 if (!(lkm_flags & DLM_LKF_NOQUEUE) || 1520 if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
@@ -1789,7 +1881,7 @@ out:
1789 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of 1881 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
1790 * flock() calls. The locking approach this requires is sufficiently 1882 * flock() calls. The locking approach this requires is sufficiently
1791 * different from all other cluster lock types that we implement a 1883 * different from all other cluster lock types that we implement a
1792 * seperate path to the "low-level" dlm calls. In particular: 1884 * separate path to the "low-level" dlm calls. In particular:
1793 * 1885 *
1794 * - No optimization of lock levels is done - we take at exactly 1886 * - No optimization of lock levels is done - we take at exactly
1795 * what's been requested. 1887 * what's been requested.
@@ -1859,8 +1951,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1859 spin_unlock_irqrestore(&lockres->l_lock, flags); 1951 spin_unlock_irqrestore(&lockres->l_lock, flags);
1860 1952
1861 ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags, 1953 ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
1862 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, 1954 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1);
1863 lockres);
1864 if (ret) { 1955 if (ret) {
1865 if (!trylock || (ret != -EAGAIN)) { 1956 if (!trylock || (ret != -EAGAIN)) {
1866 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 1957 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -2989,7 +3080,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2989 status = ocfs2_cluster_connect(osb->osb_cluster_stack, 3080 status = ocfs2_cluster_connect(osb->osb_cluster_stack,
2990 osb->uuid_str, 3081 osb->uuid_str,
2991 strlen(osb->uuid_str), 3082 strlen(osb->uuid_str),
2992 ocfs2_do_node_down, osb, 3083 &lproto, ocfs2_do_node_down, osb,
2993 &conn); 3084 &conn);
2994 if (status) { 3085 if (status) {
2995 mlog_errno(status); 3086 mlog_errno(status);
@@ -3056,50 +3147,6 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
3056 mlog_exit_void(); 3147 mlog_exit_void();
3057} 3148}
3058 3149
3059static void ocfs2_unlock_ast(void *opaque, int error)
3060{
3061 struct ocfs2_lock_res *lockres = opaque;
3062 unsigned long flags;
3063
3064 mlog_entry_void();
3065
3066 mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
3067 lockres->l_unlock_action);
3068
3069 spin_lock_irqsave(&lockres->l_lock, flags);
3070 if (error) {
3071 mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
3072 "unlock_action %d\n", error, lockres->l_name,
3073 lockres->l_unlock_action);
3074 spin_unlock_irqrestore(&lockres->l_lock, flags);
3075 mlog_exit_void();
3076 return;
3077 }
3078
3079 switch(lockres->l_unlock_action) {
3080 case OCFS2_UNLOCK_CANCEL_CONVERT:
3081 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
3082 lockres->l_action = OCFS2_AST_INVALID;
3083 /* Downconvert thread may have requeued this lock, we
3084 * need to wake it. */
3085 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
3086 ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
3087 break;
3088 case OCFS2_UNLOCK_DROP_LOCK:
3089 lockres->l_level = DLM_LOCK_IV;
3090 break;
3091 default:
3092 BUG();
3093 }
3094
3095 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
3096 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
3097 wake_up(&lockres->l_event);
3098 spin_unlock_irqrestore(&lockres->l_lock, flags);
3099
3100 mlog_exit_void();
3101}
3102
3103static int ocfs2_drop_lock(struct ocfs2_super *osb, 3150static int ocfs2_drop_lock(struct ocfs2_super *osb,
3104 struct ocfs2_lock_res *lockres) 3151 struct ocfs2_lock_res *lockres)
3105{ 3152{
@@ -3167,8 +3214,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
3167 3214
3168 mlog(0, "lock %s\n", lockres->l_name); 3215 mlog(0, "lock %s\n", lockres->l_name);
3169 3216
3170 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags, 3217 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags);
3171 lockres);
3172 if (ret) { 3218 if (ret) {
3173 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); 3219 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
3174 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); 3220 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
@@ -3276,13 +3322,20 @@ static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
3276 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); 3322 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
3277 3323
3278 if (lockres->l_level <= new_level) { 3324 if (lockres->l_level <= new_level) {
3279 mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n", 3325 mlog(ML_ERROR, "lockres %s, lvl %d <= %d, blcklst %d, mask %d, "
3280 lockres->l_level, new_level); 3326 "type %d, flags 0x%lx, hold %d %d, act %d %d, req %d, "
3327 "block %d, pgen %d\n", lockres->l_name, lockres->l_level,
3328 new_level, list_empty(&lockres->l_blocked_list),
3329 list_empty(&lockres->l_mask_waiters), lockres->l_type,
3330 lockres->l_flags, lockres->l_ro_holders,
3331 lockres->l_ex_holders, lockres->l_action,
3332 lockres->l_unlock_action, lockres->l_requested,
3333 lockres->l_blocking, lockres->l_pending_gen);
3281 BUG(); 3334 BUG();
3282 } 3335 }
3283 3336
3284 mlog(0, "lock %s, new_level = %d, l_blocking = %d\n", 3337 mlog(ML_BASTS, "lockres %s, level %d => %d, blocking %d\n",
3285 lockres->l_name, new_level, lockres->l_blocking); 3338 lockres->l_name, lockres->l_level, new_level, lockres->l_blocking);
3286 3339
3287 lockres->l_action = OCFS2_AST_DOWNCONVERT; 3340 lockres->l_action = OCFS2_AST_DOWNCONVERT;
3288 lockres->l_requested = new_level; 3341 lockres->l_requested = new_level;
@@ -3301,6 +3354,9 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
3301 3354
3302 mlog_entry_void(); 3355 mlog_entry_void();
3303 3356
3357 mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
3358 lockres->l_level, new_level);
3359
3304 if (lvb) 3360 if (lvb)
3305 dlm_flags |= DLM_LKF_VALBLK; 3361 dlm_flags |= DLM_LKF_VALBLK;
3306 3362
@@ -3309,8 +3365,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
3309 &lockres->l_lksb, 3365 &lockres->l_lksb,
3310 dlm_flags, 3366 dlm_flags,
3311 lockres->l_name, 3367 lockres->l_name,
3312 OCFS2_LOCK_ID_MAX_LEN - 1, 3368 OCFS2_LOCK_ID_MAX_LEN - 1);
3313 lockres);
3314 lockres_clear_pending(lockres, generation, osb); 3369 lockres_clear_pending(lockres, generation, osb);
3315 if (ret) { 3370 if (ret) {
3316 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 3371 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -3331,14 +3386,12 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
3331 assert_spin_locked(&lockres->l_lock); 3386 assert_spin_locked(&lockres->l_lock);
3332 3387
3333 mlog_entry_void(); 3388 mlog_entry_void();
3334 mlog(0, "lock %s\n", lockres->l_name);
3335 3389
3336 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { 3390 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
3337 /* If we're already trying to cancel a lock conversion 3391 /* If we're already trying to cancel a lock conversion
3338 * then just drop the spinlock and allow the caller to 3392 * then just drop the spinlock and allow the caller to
3339 * requeue this lock. */ 3393 * requeue this lock. */
3340 3394 mlog(ML_BASTS, "lockres %s, skip convert\n", lockres->l_name);
3341 mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
3342 return 0; 3395 return 0;
3343 } 3396 }
3344 3397
@@ -3353,6 +3406,8 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
3353 "lock %s, invalid flags: 0x%lx\n", 3406 "lock %s, invalid flags: 0x%lx\n",
3354 lockres->l_name, lockres->l_flags); 3407 lockres->l_name, lockres->l_flags);
3355 3408
3409 mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
3410
3356 return 1; 3411 return 1;
3357} 3412}
3358 3413
@@ -3362,16 +3417,15 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
3362 int ret; 3417 int ret;
3363 3418
3364 mlog_entry_void(); 3419 mlog_entry_void();
3365 mlog(0, "lock %s\n", lockres->l_name);
3366 3420
3367 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, 3421 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
3368 DLM_LKF_CANCEL, lockres); 3422 DLM_LKF_CANCEL);
3369 if (ret) { 3423 if (ret) {
3370 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); 3424 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
3371 ocfs2_recover_from_dlm_error(lockres, 0); 3425 ocfs2_recover_from_dlm_error(lockres, 0);
3372 } 3426 }
3373 3427
3374 mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name); 3428 mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
3375 3429
3376 mlog_exit(ret); 3430 mlog_exit(ret);
3377 return ret; 3431 return ret;
@@ -3428,8 +3482,11 @@ recheck:
3428 * at the same time they set OCFS2_DLM_BUSY. They must 3482 * at the same time they set OCFS2_DLM_BUSY. They must
3429 * clear OCFS2_DLM_PENDING after dlm_lock() returns. 3483 * clear OCFS2_DLM_PENDING after dlm_lock() returns.
3430 */ 3484 */
3431 if (lockres->l_flags & OCFS2_LOCK_PENDING) 3485 if (lockres->l_flags & OCFS2_LOCK_PENDING) {
3486 mlog(ML_BASTS, "lockres %s, ReQ: Pending\n",
3487 lockres->l_name);
3432 goto leave_requeue; 3488 goto leave_requeue;
3489 }
3433 3490
3434 ctl->requeue = 1; 3491 ctl->requeue = 1;
3435 ret = ocfs2_prepare_cancel_convert(osb, lockres); 3492 ret = ocfs2_prepare_cancel_convert(osb, lockres);
@@ -3461,6 +3518,7 @@ recheck:
3461 */ 3518 */
3462 if (lockres->l_level == DLM_LOCK_NL) { 3519 if (lockres->l_level == DLM_LOCK_NL) {
3463 BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders); 3520 BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders);
3521 mlog(ML_BASTS, "lockres %s, Aborting dc\n", lockres->l_name);
3464 lockres->l_blocking = DLM_LOCK_NL; 3522 lockres->l_blocking = DLM_LOCK_NL;
3465 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); 3523 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
3466 spin_unlock_irqrestore(&lockres->l_lock, flags); 3524 spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -3470,28 +3528,41 @@ recheck:
3470 /* if we're blocking an exclusive and we have *any* holders, 3528 /* if we're blocking an exclusive and we have *any* holders,
3471 * then requeue. */ 3529 * then requeue. */
3472 if ((lockres->l_blocking == DLM_LOCK_EX) 3530 if ((lockres->l_blocking == DLM_LOCK_EX)
3473 && (lockres->l_ex_holders || lockres->l_ro_holders)) 3531 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
3532 mlog(ML_BASTS, "lockres %s, ReQ: EX/PR Holders %u,%u\n",
3533 lockres->l_name, lockres->l_ex_holders,
3534 lockres->l_ro_holders);
3474 goto leave_requeue; 3535 goto leave_requeue;
3536 }
3475 3537
3476 /* If it's a PR we're blocking, then only 3538 /* If it's a PR we're blocking, then only
3477 * requeue if we've got any EX holders */ 3539 * requeue if we've got any EX holders */
3478 if (lockres->l_blocking == DLM_LOCK_PR && 3540 if (lockres->l_blocking == DLM_LOCK_PR &&
3479 lockres->l_ex_holders) 3541 lockres->l_ex_holders) {
3542 mlog(ML_BASTS, "lockres %s, ReQ: EX Holders %u\n",
3543 lockres->l_name, lockres->l_ex_holders);
3480 goto leave_requeue; 3544 goto leave_requeue;
3545 }
3481 3546
3482 /* 3547 /*
3483 * Can we get a lock in this state if the holder counts are 3548 * Can we get a lock in this state if the holder counts are
3484 * zero? The meta data unblock code used to check this. 3549 * zero? The meta data unblock code used to check this.
3485 */ 3550 */
3486 if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) 3551 if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
3487 && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) 3552 && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) {
3553 mlog(ML_BASTS, "lockres %s, ReQ: Lock Refreshing\n",
3554 lockres->l_name);
3488 goto leave_requeue; 3555 goto leave_requeue;
3556 }
3489 3557
3490 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); 3558 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
3491 3559
3492 if (lockres->l_ops->check_downconvert 3560 if (lockres->l_ops->check_downconvert
3493 && !lockres->l_ops->check_downconvert(lockres, new_level)) 3561 && !lockres->l_ops->check_downconvert(lockres, new_level)) {
3562 mlog(ML_BASTS, "lockres %s, ReQ: Checkpointing\n",
3563 lockres->l_name);
3494 goto leave_requeue; 3564 goto leave_requeue;
3565 }
3495 3566
3496 /* If we get here, then we know that there are no more 3567 /* If we get here, then we know that there are no more
3497 * incompatible holders (and anyone asking for an incompatible 3568 * incompatible holders (and anyone asking for an incompatible
@@ -3509,13 +3580,19 @@ recheck:
3509 3580
3510 ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking); 3581 ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking);
3511 3582
3512 if (ctl->unblock_action == UNBLOCK_STOP_POST) 3583 if (ctl->unblock_action == UNBLOCK_STOP_POST) {
3584 mlog(ML_BASTS, "lockres %s, UNBLOCK_STOP_POST\n",
3585 lockres->l_name);
3513 goto leave; 3586 goto leave;
3587 }
3514 3588
3515 spin_lock_irqsave(&lockres->l_lock, flags); 3589 spin_lock_irqsave(&lockres->l_lock, flags);
3516 if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) { 3590 if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) {
3517 /* If this changed underneath us, then we can't drop 3591 /* If this changed underneath us, then we can't drop
3518 * it just yet. */ 3592 * it just yet. */
3593 mlog(ML_BASTS, "lockres %s, block=%d:%d, level=%d:%d, "
3594 "Recheck\n", lockres->l_name, blocking,
3595 lockres->l_blocking, level, lockres->l_level);
3519 goto recheck; 3596 goto recheck;
3520 } 3597 }
3521 3598
@@ -3910,45 +3987,6 @@ void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex)
3910 ocfs2_cluster_unlock(osb, lockres, level); 3987 ocfs2_cluster_unlock(osb, lockres, level);
3911} 3988}
3912 3989
3913/*
3914 * This is the filesystem locking protocol. It provides the lock handling
3915 * hooks for the underlying DLM. It has a maximum version number.
3916 * The version number allows interoperability with systems running at
3917 * the same major number and an equal or smaller minor number.
3918 *
3919 * Whenever the filesystem does new things with locks (adds or removes a
3920 * lock, orders them differently, does different things underneath a lock),
3921 * the version must be changed. The protocol is negotiated when joining
3922 * the dlm domain. A node may join the domain if its major version is
3923 * identical to all other nodes and its minor version is greater than
3924 * or equal to all other nodes. When its minor version is greater than
3925 * the other nodes, it will run at the minor version specified by the
3926 * other nodes.
3927 *
3928 * If a locking change is made that will not be compatible with older
3929 * versions, the major number must be increased and the minor version set
3930 * to zero. If a change merely adds a behavior that can be disabled when
3931 * speaking to older versions, the minor version must be increased. If a
3932 * change adds a fully backwards compatible change (eg, LVB changes that
3933 * are just ignored by older versions), the version does not need to be
3934 * updated.
3935 */
3936static struct ocfs2_locking_protocol lproto = {
3937 .lp_max_version = {
3938 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
3939 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
3940 },
3941 .lp_lock_ast = ocfs2_locking_ast,
3942 .lp_blocking_ast = ocfs2_blocking_ast,
3943 .lp_unlock_ast = ocfs2_unlock_ast,
3944};
3945
3946void ocfs2_set_locking_protocol(void)
3947{
3948 ocfs2_stack_glue_set_locking_protocol(&lproto);
3949}
3950
3951
3952static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, 3990static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3953 struct ocfs2_lock_res *lockres) 3991 struct ocfs2_lock_res *lockres)
3954{ 3992{
@@ -3965,7 +4003,7 @@ static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3965 BUG_ON(!lockres); 4003 BUG_ON(!lockres);
3966 BUG_ON(!lockres->l_ops); 4004 BUG_ON(!lockres->l_ops);
3967 4005
3968 mlog(0, "lockres %s blocked.\n", lockres->l_name); 4006 mlog(ML_BASTS, "lockres %s blocked\n", lockres->l_name);
3969 4007
3970 /* Detect whether a lock has been marked as going away while 4008 /* Detect whether a lock has been marked as going away while
3971 * the downconvert thread was processing other things. A lock can 4009 * the downconvert thread was processing other things. A lock can
@@ -3988,7 +4026,7 @@ unqueue:
3988 } else 4026 } else
3989 ocfs2_schedule_blocked_lock(osb, lockres); 4027 ocfs2_schedule_blocked_lock(osb, lockres);
3990 4028
3991 mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name, 4029 mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name,
3992 ctl.requeue ? "yes" : "no"); 4030 ctl.requeue ? "yes" : "no");
3993 spin_unlock_irqrestore(&lockres->l_lock, flags); 4031 spin_unlock_irqrestore(&lockres->l_lock, flags);
3994 4032
@@ -4010,7 +4048,7 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
4010 /* Do not schedule a lock for downconvert when it's on 4048 /* Do not schedule a lock for downconvert when it's on
4011 * the way to destruction - any nodes wanting access 4049 * the way to destruction - any nodes wanting access
4012 * to the resource will get it soon. */ 4050 * to the resource will get it soon. */
4013 mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n", 4051 mlog(ML_BASTS, "lockres %s won't be scheduled: flags 0x%lx\n",
4014 lockres->l_name, lockres->l_flags); 4052 lockres->l_name, lockres->l_flags);
4015 return; 4053 return;
4016 } 4054 }
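Two structural points in the dlmglue.c hunks deserve emphasis. First, the AST, BAST, and unlock-AST callbacks no longer receive an opaque pointer; they receive the ocfs2_dlm_lksb and recover the lock resource with container_of(), which is valid only because l_lksb is embedded in struct ocfs2_lock_res rather than referenced through a pointer:

	static inline struct ocfs2_lock_res *
	ocfs2_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
	{
		return container_of(lksb, struct ocfs2_lock_res, l_lksb);
	}

Second, the locking protocol now travels with the connection instead of being registered globally: ocfs2_dlm_init() hands &lproto to ocfs2_cluster_connect(), and ocfs2_set_locking_protocol() merely publishes the maximum protocol version for negotiation:

	status = ocfs2_cluster_connect(osb->osb_cluster_stack,
				       osb->uuid_str,
				       strlen(osb->uuid_str),
				       &lproto, ocfs2_do_node_down, osb,
				       &conn);

This per-connection protocol is what lets fs/dlmfs supply its own user_dlm_lproto over the same stack glue.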
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 5328529e7fd2..c562a7581cf9 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -453,7 +453,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
453 if (i == -1) { 453 if (i == -1) {
454 /* 454 /*
455 * Holes can be larger than the maximum size of an 455 * Holes can be larger than the maximum size of an
456 * extent, so we return their lengths in a seperate 456 * extent, so we return their lengths in a separate
457 * field. 457 * field.
458 */ 458 */
459 if (hole_len) { 459 if (hole_len) {
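(The extent_map.c hunk above is a pure spelling fix in a comment; no behavior changes.)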
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 558ce0312421..17947dc8341e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -107,6 +107,9 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
107 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 107 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
108 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); 108 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
109 109
110 if (file->f_mode & FMODE_WRITE)
111 dquot_initialize(inode);
112
110 spin_lock(&oi->ip_lock); 113 spin_lock(&oi->ip_lock);
111 114
112 /* Check that the inode hasn't been wiped from disk by another 115 /* Check that the inode hasn't been wiped from disk by another
@@ -629,11 +632,10 @@ restart_all:
629 } 632 }
630 633
631restarted_transaction: 634restarted_transaction:
632 if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 635 status = dquot_alloc_space_nodirty(inode,
633 clusters_to_add))) { 636 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
634 status = -EDQUOT; 637 if (status)
635 goto leave; 638 goto leave;
636 }
637 did_quota = 1; 639 did_quota = 1;
638 640
 639 /* reserve a write to the file entry early on - that way if we 641
@@ -674,7 +676,7 @@ restarted_transaction:
674 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 676 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
675 spin_unlock(&OCFS2_I(inode)->ip_lock); 677 spin_unlock(&OCFS2_I(inode)->ip_lock);
676 /* Release unused quota reservation */ 678 /* Release unused quota reservation */
677 vfs_dq_free_space(inode, 679 dquot_free_space(inode,
678 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 680 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
679 did_quota = 0; 681 did_quota = 0;
680 682
@@ -710,7 +712,7 @@ restarted_transaction:
710 712
711leave: 713leave:
712 if (status < 0 && did_quota) 714 if (status < 0 && did_quota)
713 vfs_dq_free_space(inode, 715 dquot_free_space(inode,
714 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 716 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
715 if (handle) { 717 if (handle) {
716 ocfs2_commit_trans(osb, handle); 718 ocfs2_commit_trans(osb, handle);
@@ -978,6 +980,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
978 980
979 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 981 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
980 if (size_change) { 982 if (size_change) {
983 dquot_initialize(inode);
984
981 status = ocfs2_rw_lock(inode, 1); 985 status = ocfs2_rw_lock(inode, 1);
982 if (status < 0) { 986 if (status < 0) {
983 mlog_errno(status); 987 mlog_errno(status);
@@ -993,10 +997,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
993 } 997 }
994 998
995 if (size_change && attr->ia_size != i_size_read(inode)) { 999 if (size_change && attr->ia_size != i_size_read(inode)) {
996 if (attr->ia_size > sb->s_maxbytes) { 1000 status = inode_newsize_ok(inode, attr->ia_size);
997 status = -EFBIG; 1001 if (status)
998 goto bail_unlock; 1002 goto bail_unlock;
999 }
1000 1003
1001 if (i_size_read(inode) > attr->ia_size) { 1004 if (i_size_read(inode) > attr->ia_size) {
1002 if (ocfs2_should_order_data(inode)) { 1005 if (ocfs2_should_order_data(inode)) {
@@ -1021,7 +1024,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1021 /* 1024 /*
1022 * Gather pointers to quota structures so that allocation / 1025 * Gather pointers to quota structures so that allocation /
1023 * freeing of quota structures happens here and not inside 1026 * freeing of quota structures happens here and not inside
1024 * vfs_dq_transfer() where we have problems with lock ordering 1027 * dquot_transfer() where we have problems with lock ordering
1025 */ 1028 */
1026 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid 1029 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
1027 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1030 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
@@ -1054,7 +1057,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1054 mlog_errno(status); 1057 mlog_errno(status);
1055 goto bail_unlock; 1058 goto bail_unlock;
1056 } 1059 }
1057 status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; 1060 status = dquot_transfer(inode, attr);
1058 if (status < 0) 1061 if (status < 0)
1059 goto bail_commit; 1062 goto bail_commit;
1060 } else { 1063 } else {
@@ -1836,6 +1839,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1836 &meta_level); 1839 &meta_level);
1837 if (has_refcount) 1840 if (has_refcount)
1838 *has_refcount = 1; 1841 *has_refcount = 1;
1842 if (direct_io)
1843 *direct_io = 0;
1839 } 1844 }
1840 1845
1841 if (ret < 0) { 1846 if (ret < 0) {
@@ -1859,10 +1864,6 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1859 break; 1864 break;
1860 } 1865 }
1861 1866
1862 if (has_refcount && *has_refcount == 1) {
1863 *direct_io = 0;
1864 break;
1865 }
1866 /* 1867 /*
1867 * Allowing concurrent direct writes means 1868 * Allowing concurrent direct writes means
1868 * i_size changes wouldn't be synchronized, so 1869 * i_size changes wouldn't be synchronized, so
@@ -2043,7 +2044,7 @@ out_dio:
2043 * async dio is going to do it in the future or an end_io after an 2044 * async dio is going to do it in the future or an end_io after an
2044 * error has already done it. 2045 * error has already done it.
2045 */ 2046 */
2046 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 2047 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2047 rw_level = -1; 2048 rw_level = -1;
2048 have_alloc_sem = 0; 2049 have_alloc_sem = 0;
2049 } 2050 }
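The quota changes in file.c establish the pattern repeated through the rest of this merge: the dquot_* API reports failure as a negative errno, so the translation from a boolean failure to -EDQUOT around the old vfs_dq_* helpers goes away. Condensed from the hunks above (the middle of the allocation path is elided):

	status = dquot_alloc_space_nodirty(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	if (status)	/* already -EDQUOT or another -errno */
		goto leave;
	did_quota = 1;
	/* ... allocation work ... */
leave:
	if (status < 0 && did_quota)
		dquot_free_space(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));

The ocfs2_setattr() hunk makes an analogous simplification for size changes, replacing the open-coded s_maxbytes/-EFBIG test with the generic inode_newsize_ok() helper.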
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 88459bdd1ff3..ab207901d32a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -665,7 +665,7 @@ static int ocfs2_remove_inode(struct inode *inode,
665 } 665 }
666 666
667 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); 667 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
668 vfs_dq_free_inode(inode); 668 dquot_free_inode(inode);
669 669
670 status = ocfs2_free_dinode(handle, inode_alloc_inode, 670 status = ocfs2_free_dinode(handle, inode_alloc_inode,
671 inode_alloc_bh, di); 671 inode_alloc_bh, di);
@@ -891,6 +891,21 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
891 /* Do some basic inode verification... */ 891 /* Do some basic inode verification... */
892 di = (struct ocfs2_dinode *) di_bh->b_data; 892 di = (struct ocfs2_dinode *) di_bh->b_data;
893 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) { 893 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
894 /*
895 * Inodes in the orphan dir must have ORPHANED_FL. The only
896 * inodes that come back out of the orphan dir are reflink
897 * targets. A reflink target may be moved out of the orphan
898 * dir between the time we scan the directory and the time we
899 * process it. This would lead to HAS_REFCOUNT_FL being set but
900 * ORPHANED_FL not.
901 */
902 if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
903 mlog(0, "Reflinked inode %llu is no longer orphaned. "
904 "it shouldn't be deleted\n",
905 (unsigned long long)oi->ip_blkno);
906 goto bail;
907 }
908
894 /* for lack of a better error? */ 909 /* for lack of a better error? */
895 status = -EEXIST; 910 status = -EEXIST;
896 mlog(ML_ERROR, 911 mlog(ML_ERROR,
@@ -971,6 +986,8 @@ void ocfs2_delete_inode(struct inode *inode)
971 goto bail; 986 goto bail;
972 } 987 }
973 988
989 dquot_initialize(inode);
990
974 if (!ocfs2_inode_is_valid_to_delete(inode)) { 991 if (!ocfs2_inode_is_valid_to_delete(inode)) {
975 /* It's probably not necessary to truncate_inode_pages 992 /* It's probably not necessary to truncate_inode_pages
976 * here but we do it for safety anyway (it will most 993 * here but we do it for safety anyway (it will most
@@ -1087,6 +1104,8 @@ void ocfs2_clear_inode(struct inode *inode)
1087 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, 1104 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
1088 "Inode=%lu\n", inode->i_ino); 1105 "Inode=%lu\n", inode->i_ino);
1089 1106
1107 dquot_drop(inode);
1108
1090 /* To prevent remote deletes we hold open lock before, now it 1109 /* To prevent remote deletes we hold open lock before, now it
1091 * is time to unlock PR and EX open locks. */ 1110 * is time to unlock PR and EX open locks. */
1092 ocfs2_open_unlock(inode); 1111 ocfs2_open_unlock(inode);
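Taken together, the inode.c hunks place the new quota hooks at each stage of inode teardown; the following summary restates the hunks above rather than adding new code:

	/* ocfs2_delete_inode(): dquot_initialize(inode)
	 *	- set up quota state before any deallocation
	 * ocfs2_remove_inode(): dquot_free_inode(inode)
	 *	- return the inode's quota charge when the dinode is freed
	 * ocfs2_clear_inode():  dquot_drop(inode)
	 *	- detach in-core dquots as the VFS inode is discarded */

The other substantive change is the reflink check in ocfs2_query_inode_wipe(): an inode found without OCFS2_ORPHANED_FL but with OCFS2_HAS_REFCOUNT_FL is assumed to be a reflink target that legitimately left the orphan dir between the scan and processing, and is skipped instead of being reported as an error.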
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index cf9a5ee30fef..0cd5323bd3f0 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -7,10 +7,10 @@
7 * 7 *
8 */ 8 */
9 9
10#ifndef OCFS2_IOCTL_H 10#ifndef OCFS2_IOCTL_PROTO_H
11#define OCFS2_IOCTL_H 11#define OCFS2_IOCTL_PROTO_H
12 12
13long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); 13long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
14long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); 14long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
15 15
16#endif /* OCFS2_IOCTL_H */ 16#endif /* OCFS2_IOCTL_PROTO_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ac10f83edb95..c983715d8d8c 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -476,7 +476,7 @@ out_mutex:
476 476
477out: 477out:
478 if (!status) 478 if (!status)
479 ocfs2_init_inode_steal_slot(osb); 479 ocfs2_init_steal_slots(osb);
480 mlog_exit(status); 480 mlog_exit(status);
481 return status; 481 return status;
482} 482}
@@ -872,8 +872,10 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
872 (unsigned long long)la_start_blk, 872 (unsigned long long)la_start_blk,
873 (unsigned long long)blkno); 873 (unsigned long long)blkno);
874 874
875 status = ocfs2_free_clusters(handle, main_bm_inode, 875 status = ocfs2_release_clusters(handle,
876 main_bm_bh, blkno, count); 876 main_bm_inode,
877 main_bm_bh, blkno,
878 count);
877 if (status < 0) { 879 if (status < 0) {
878 mlog_errno(status); 880 mlog_errno(status);
879 goto bail; 881 goto bail;
@@ -984,8 +986,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
984 } 986 }
985 987
986retry_enospc: 988retry_enospc:
987 (*ac)->ac_bits_wanted = osb->local_alloc_bits; 989 (*ac)->ac_bits_wanted = osb->local_alloc_default_bits;
988
989 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); 990 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
990 if (status == -ENOSPC) { 991 if (status == -ENOSPC) {
991 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == 992 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1061,6 +1062,7 @@ retry_enospc:
1061 OCFS2_LA_DISABLED) 1062 OCFS2_LA_DISABLED)
1062 goto bail; 1063 goto bail;
1063 1064
1065 ac->ac_bits_wanted = osb->local_alloc_default_bits;
1064 status = ocfs2_claim_clusters(osb, handle, ac, 1066 status = ocfs2_claim_clusters(osb, handle, ac,
1065 osb->local_alloc_bits, 1067 osb->local_alloc_bits,
1066 &cluster_off, 1068 &cluster_off,
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 544ac6245175..b5cb3ede9408 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -133,7 +133,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
133 133
134 if (!(fl->fl_flags & FL_POSIX)) 134 if (!(fl->fl_flags & FL_POSIX))
135 return -ENOLCK; 135 return -ENOLCK;
136 if (__mandatory_lock(inode)) 136 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
137 return -ENOLCK; 137 return -ENOLCK;
138 138
139 return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); 139 return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
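The one-line locks.c change is subtle: ocfs2_lock() previously refused every POSIX lock request on a file with mandatory locking in effect, which also blocked releasing a lock already held. Gating the refusal on fl_type keeps rejecting new locks while letting unlocks through:

	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
		return -ENOLCK;	/* reject locks, but allow F_UNLCK */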
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 50fb26a6a5f5..b1eb50ae4097 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -84,7 +84,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
84static int ocfs2_orphan_add(struct ocfs2_super *osb, 84static int ocfs2_orphan_add(struct ocfs2_super *osb,
85 handle_t *handle, 85 handle_t *handle,
86 struct inode *inode, 86 struct inode *inode,
87 struct ocfs2_dinode *fe, 87 struct buffer_head *fe_bh,
88 char *name, 88 char *name,
89 struct ocfs2_dir_lookup_result *lookup, 89 struct ocfs2_dir_lookup_result *lookup,
90 struct inode *orphan_dir_inode); 90 struct inode *orphan_dir_inode);
@@ -212,7 +212,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
212 } else 212 } else
213 inode->i_gid = current_fsgid(); 213 inode->i_gid = current_fsgid();
214 inode->i_mode = mode; 214 inode->i_mode = mode;
215 vfs_dq_init(inode); 215 dquot_initialize(inode);
216 return inode; 216 return inode;
217} 217}
218 218
@@ -244,6 +244,8 @@ static int ocfs2_mknod(struct inode *dir,
244 (unsigned long)dev, dentry->d_name.len, 244 (unsigned long)dev, dentry->d_name.len,
245 dentry->d_name.name); 245 dentry->d_name.name);
246 246
247 dquot_initialize(dir);
248
247 /* get our super block */ 249 /* get our super block */
248 osb = OCFS2_SB(dir->i_sb); 250 osb = OCFS2_SB(dir->i_sb);
249 251
@@ -348,13 +350,9 @@ static int ocfs2_mknod(struct inode *dir,
348 goto leave; 350 goto leave;
349 } 351 }
350 352
351 /* We don't use standard VFS wrapper because we don't want vfs_dq_init 353 status = dquot_alloc_inode(inode);
352 * to be called. */ 354 if (status)
353 if (sb_any_quota_active(osb->sb) &&
354 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
355 status = -EDQUOT;
356 goto leave; 355 goto leave;
357 }
358 did_quota_inode = 1; 356 did_quota_inode = 1;
359 357
360 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, 358 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
@@ -431,7 +429,7 @@ static int ocfs2_mknod(struct inode *dir,
431 status = 0; 429 status = 0;
432leave: 430leave:
433 if (status < 0 && did_quota_inode) 431 if (status < 0 && did_quota_inode)
434 vfs_dq_free_inode(inode); 432 dquot_free_inode(inode);
435 if (handle) 433 if (handle)
436 ocfs2_commit_trans(osb, handle); 434 ocfs2_commit_trans(osb, handle);
437 435
@@ -636,6 +634,8 @@ static int ocfs2_link(struct dentry *old_dentry,
636 if (S_ISDIR(inode->i_mode)) 634 if (S_ISDIR(inode->i_mode))
637 return -EPERM; 635 return -EPERM;
638 636
637 dquot_initialize(dir);
638
639 err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT); 639 err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
640 if (err < 0) { 640 if (err < 0) {
641 if (err != -ENOENT) 641 if (err != -ENOENT)
@@ -791,6 +791,8 @@ static int ocfs2_unlink(struct inode *dir,
791 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, 791 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
792 dentry->d_name.len, dentry->d_name.name); 792 dentry->d_name.len, dentry->d_name.name);
793 793
794 dquot_initialize(dir);
795
794 BUG_ON(dentry->d_parent->d_inode != dir); 796 BUG_ON(dentry->d_parent->d_inode != dir);
795 797
796 mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); 798 mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -877,7 +879,7 @@ static int ocfs2_unlink(struct inode *dir,
877 fe = (struct ocfs2_dinode *) fe_bh->b_data; 879 fe = (struct ocfs2_dinode *) fe_bh->b_data;
878 880
879 if (inode_is_unlinkable(inode)) { 881 if (inode_is_unlinkable(inode)) {
880 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, 882 status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
881 &orphan_insert, orphan_dir); 883 &orphan_insert, orphan_dir);
882 if (status < 0) { 884 if (status < 0) {
883 mlog_errno(status); 885 mlog_errno(status);
@@ -1051,6 +1053,9 @@ static int ocfs2_rename(struct inode *old_dir,
1051 old_dentry->d_name.len, old_dentry->d_name.name, 1053 old_dentry->d_name.len, old_dentry->d_name.name,
1052 new_dentry->d_name.len, new_dentry->d_name.name); 1054 new_dentry->d_name.len, new_dentry->d_name.name);
1053 1055
1056 dquot_initialize(old_dir);
1057 dquot_initialize(new_dir);
1058
1054 osb = OCFS2_SB(old_dir->i_sb); 1059 osb = OCFS2_SB(old_dir->i_sb);
1055 1060
1056 if (new_inode) { 1061 if (new_inode) {
@@ -1295,7 +1300,7 @@ static int ocfs2_rename(struct inode *old_dir,
1295 if (S_ISDIR(new_inode->i_mode) || 1300 if (S_ISDIR(new_inode->i_mode) ||
1296 (ocfs2_read_links_count(newfe) == 1)) { 1301 (ocfs2_read_links_count(newfe) == 1)) {
1297 status = ocfs2_orphan_add(osb, handle, new_inode, 1302 status = ocfs2_orphan_add(osb, handle, new_inode,
1298 newfe, orphan_name, 1303 newfe_bh, orphan_name,
1299 &orphan_insert, orphan_dir); 1304 &orphan_insert, orphan_dir);
1300 if (status < 0) { 1305 if (status < 0) {
1301 mlog_errno(status); 1306 mlog_errno(status);
@@ -1599,6 +1604,8 @@ static int ocfs2_symlink(struct inode *dir,
1599 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1604 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1600 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1605 dentry, symname, dentry->d_name.len, dentry->d_name.name);
1601 1606
1607 dquot_initialize(dir);
1608
1602 sb = dir->i_sb; 1609 sb = dir->i_sb;
1603 osb = OCFS2_SB(sb); 1610 osb = OCFS2_SB(sb);
1604 1611
@@ -1688,13 +1695,9 @@ static int ocfs2_symlink(struct inode *dir,
1688 goto bail; 1695 goto bail;
1689 } 1696 }
1690 1697
1691 /* We don't use standard VFS wrapper because we don't want vfs_dq_init 1698 status = dquot_alloc_inode(inode);
1692 * to be called. */ 1699 if (status)
1693 if (sb_any_quota_active(osb->sb) &&
1694 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
1695 status = -EDQUOT;
1696 goto bail; 1700 goto bail;
1697 }
1698 did_quota_inode = 1; 1701 did_quota_inode = 1;
1699 1702
1700 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, 1703 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry,
@@ -1716,11 +1719,10 @@ static int ocfs2_symlink(struct inode *dir,
1716 u32 offset = 0; 1719 u32 offset = 0;
1717 1720
1718 inode->i_op = &ocfs2_symlink_inode_operations; 1721 inode->i_op = &ocfs2_symlink_inode_operations;
1719 if (vfs_dq_alloc_space_nodirty(inode, 1722 status = dquot_alloc_space_nodirty(inode,
1720 ocfs2_clusters_to_bytes(osb->sb, 1))) { 1723 ocfs2_clusters_to_bytes(osb->sb, 1));
1721 status = -EDQUOT; 1724 if (status)
1722 goto bail; 1725 goto bail;
1723 }
1724 did_quota = 1; 1726 did_quota = 1;
1725 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, 1727 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
1726 new_fe_bh, 1728 new_fe_bh,
@@ -1788,10 +1790,10 @@ static int ocfs2_symlink(struct inode *dir,
1788 d_instantiate(dentry, inode); 1790 d_instantiate(dentry, inode);
1789bail: 1791bail:
1790 if (status < 0 && did_quota) 1792 if (status < 0 && did_quota)
1791 vfs_dq_free_space_nodirty(inode, 1793 dquot_free_space_nodirty(inode,
1792 ocfs2_clusters_to_bytes(osb->sb, 1)); 1794 ocfs2_clusters_to_bytes(osb->sb, 1));
1793 if (status < 0 && did_quota_inode) 1795 if (status < 0 && did_quota_inode)
1794 vfs_dq_free_inode(inode); 1796 dquot_free_inode(inode);
1795 if (handle) 1797 if (handle)
1796 ocfs2_commit_trans(osb, handle); 1798 ocfs2_commit_trans(osb, handle);
1797 1799
@@ -1909,7 +1911,7 @@ leave:
1909static int ocfs2_orphan_add(struct ocfs2_super *osb, 1911static int ocfs2_orphan_add(struct ocfs2_super *osb,
1910 handle_t *handle, 1912 handle_t *handle,
1911 struct inode *inode, 1913 struct inode *inode,
1912 struct ocfs2_dinode *fe, 1914 struct buffer_head *fe_bh,
1913 char *name, 1915 char *name,
1914 struct ocfs2_dir_lookup_result *lookup, 1916 struct ocfs2_dir_lookup_result *lookup,
1915 struct inode *orphan_dir_inode) 1917 struct inode *orphan_dir_inode)
@@ -1917,6 +1919,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1917 struct buffer_head *orphan_dir_bh = NULL; 1919 struct buffer_head *orphan_dir_bh = NULL;
1918 int status = 0; 1920 int status = 0;
1919 struct ocfs2_dinode *orphan_fe; 1921 struct ocfs2_dinode *orphan_fe;
1922 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1920 1923
1921 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 1924 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
1922 1925
@@ -1957,6 +1960,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1957 goto leave; 1960 goto leave;
1958 } 1961 }
1959 1962
1963 /*
1964 * We're going to journal the change of i_flags and i_orphaned_slot.
1965 * It's safe anyway, though some callers may duplicate the journaling.
 1966 * Journaling within the func just makes the logic look more
1967 * straightforward.
1968 */
1969 status = ocfs2_journal_access_di(handle,
1970 INODE_CACHE(inode),
1971 fe_bh,
1972 OCFS2_JOURNAL_ACCESS_WRITE);
1973 if (status < 0) {
1974 mlog_errno(status);
1975 goto leave;
1976 }
1977
1960 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); 1978 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
1961 1979
1962 /* Record which orphan dir our inode now resides 1980 /* Record which orphan dir our inode now resides
@@ -1964,6 +1982,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1964 * dir to lock. */ 1982 * dir to lock. */
1965 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); 1983 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
1966 1984
1985 ocfs2_journal_dirty(handle, fe_bh);
1986
1967 mlog(0, "Inode %llu orphaned in slot %d\n", 1987 mlog(0, "Inode %llu orphaned in slot %d\n",
1968 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); 1988 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
1969 1989
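ocfs2_orphan_add() now takes the inode's buffer_head (fe_bh) instead of a bare ocfs2_dinode pointer, so it can journal the i_flags/i_orphaned_slot update itself: declare write access on the buffer, modify the on-disk inode, then mark the buffer dirty under the same handle. A minimal sketch of that access/modify/dirty sequence, written as a hypothetical helper around the calls the hunk uses (not compilable outside fs/ocfs2):

    /* Sketch: the journaled-update pattern ocfs2_orphan_add() now follows. */
    static int example_journaled_update(handle_t *handle, struct inode *inode,
                                        struct buffer_head *di_bh, u16 slot)
    {
            struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
            int status;

            /* 1. Tell the journal this buffer is about to change. */
            status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
                                             OCFS2_JOURNAL_ACCESS_WRITE);
            if (status < 0)
                    return status;

            /* 2. Modify the in-memory copy of the disk inode. */
            le32_add_cpu(&di->i_flags, OCFS2_ORPHANED_FL);
            di->i_orphaned_slot = cpu_to_le16(slot);

            /* 3. Mark the buffer dirty within the same transaction. */
            ocfs2_journal_dirty(handle, di_bh);
            return 0;
    }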
@@ -2099,13 +2119,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2099 goto leave; 2119 goto leave;
2100 } 2120 }
2101 2121
2102 /* We don't use standard VFS wrapper because we don't want vfs_dq_init 2122 status = dquot_alloc_inode(inode);
2103 * to be called. */ 2123 if (status)
2104 if (sb_any_quota_active(osb->sb) &&
2105 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
2106 status = -EDQUOT;
2107 goto leave; 2124 goto leave;
2108 }
2109 did_quota_inode = 1; 2125 did_quota_inode = 1;
2110 2126
2111 inode->i_nlink = 0; 2127 inode->i_nlink = 0;
@@ -2125,7 +2141,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2125 } 2141 }
2126 2142
2127 di = (struct ocfs2_dinode *)new_di_bh->b_data; 2143 di = (struct ocfs2_dinode *)new_di_bh->b_data;
2128 status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name, 2144 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
2129 &orphan_insert, orphan_dir); 2145 &orphan_insert, orphan_dir);
2130 if (status < 0) { 2146 if (status < 0) {
2131 mlog_errno(status); 2147 mlog_errno(status);
@@ -2140,7 +2156,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2140 insert_inode_hash(inode); 2156 insert_inode_hash(inode);
2141leave: 2157leave:
2142 if (status < 0 && did_quota_inode) 2158 if (status < 0 && did_quota_inode)
2143 vfs_dq_free_inode(inode); 2159 dquot_free_inode(inode);
2144 if (handle) 2160 if (handle)
2145 ocfs2_commit_trans(osb, handle); 2161 ocfs2_commit_trans(osb, handle);
2146 2162
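All of the quota hunks in namei.c follow one conversion: the open-coded sb_any_quota_active()/->alloc_inode()/NO_QUOTA dance becomes a single call into the generic dquot_* API, which returns 0 or a negative errno (typically -EDQUOT) directly. A hedged sketch of the resulting charge/rollback shape, with illustrative names (not taken from this diff):

    #include <linux/quotaops.h>

    /* Sketch: charge quota up front, roll back on any later failure. */
    static int example_alloc_with_quota(struct inode *inode)
    {
            int did_quota_inode = 0;
            int status;

            status = dquot_alloc_inode(inode);      /* 0 or -EDQUOT */
            if (status)
                    goto bail;
            did_quota_inode = 1;

            /* ... allocation work that may still fail and set status ... */

    bail:
            if (status < 0 && did_quota_inode)
                    dquot_free_inode(inode);        /* undo the charge */
            return status;
    }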
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 740f448041e2..adf5e2ebc2c4 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -42,6 +42,7 @@
42 42
43#include "ocfs2_fs.h" 43#include "ocfs2_fs.h"
44#include "ocfs2_lockid.h" 44#include "ocfs2_lockid.h"
45#include "ocfs2_ioctl.h"
45 46
46/* For struct ocfs2_blockcheck_stats */ 47/* For struct ocfs2_blockcheck_stats */
47#include "blockcheck.h" 48#include "blockcheck.h"
@@ -159,7 +160,7 @@ struct ocfs2_lock_res {
159 int l_level; 160 int l_level;
160 unsigned int l_ro_holders; 161 unsigned int l_ro_holders;
161 unsigned int l_ex_holders; 162 unsigned int l_ex_holders;
162 union ocfs2_dlm_lksb l_lksb; 163 struct ocfs2_dlm_lksb l_lksb;
163 164
164 /* used from AST/BAST funcs. */ 165 /* used from AST/BAST funcs. */
165 enum ocfs2_ast_action l_action; 166 enum ocfs2_ast_action l_action;
@@ -305,7 +306,9 @@ struct ocfs2_super
305 u32 s_next_generation; 306 u32 s_next_generation;
306 unsigned long osb_flags; 307 unsigned long osb_flags;
307 s16 s_inode_steal_slot; 308 s16 s_inode_steal_slot;
309 s16 s_meta_steal_slot;
308 atomic_t s_num_inodes_stolen; 310 atomic_t s_num_inodes_stolen;
311 atomic_t s_num_meta_stolen;
309 312
310 unsigned long s_mount_opt; 313 unsigned long s_mount_opt;
311 unsigned int s_atime_quantum; 314 unsigned int s_atime_quantum;
@@ -760,35 +763,18 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
760 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); 763 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
761} 764}
762 765
763static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) 766static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
764{ 767{
765 spin_lock(&osb->osb_lock); 768 ext2_set_bit(bit, bitmap);
766 osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
767 spin_unlock(&osb->osb_lock);
768 atomic_set(&osb->s_num_inodes_stolen, 0);
769} 769}
770#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
770 771
771static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb, 772static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
772 s16 slot)
773{ 773{
774 spin_lock(&osb->osb_lock); 774 ext2_clear_bit(bit, bitmap);
775 osb->s_inode_steal_slot = slot;
776 spin_unlock(&osb->osb_lock);
777}
778
779static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
780{
781 s16 slot;
782
783 spin_lock(&osb->osb_lock);
784 slot = osb->s_inode_steal_slot;
785 spin_unlock(&osb->osb_lock);
786
787 return slot;
788} 775}
776#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
789 777
790#define ocfs2_set_bit ext2_set_bit
791#define ocfs2_clear_bit ext2_clear_bit
792#define ocfs2_test_bit ext2_test_bit 778#define ocfs2_test_bit ext2_test_bit
793#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit 779#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
794#define ocfs2_find_next_bit ext2_find_next_bit 780#define ocfs2_find_next_bit ext2_find_next_bit
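Two things happen in this header: the inode steal-slot helpers leave (they come back generalized in suballoc.c further down), and ocfs2_set_bit()/ocfs2_clear_bit() stop being bare aliases for the ext2 bitops. The new static inline wrappers exist purely to centralize the cast: ext2_set_bit() and ext2_clear_bit() want an unsigned long *, while ocfs2 callers hold bitmap pointers of assorted types. A small sketch of a hypothetical caller:

    /* Sketch: the macro hides the unsigned long * cast for any bitmap type. */
    static void example_mark_bit(struct ocfs2_group_desc *bg, unsigned int bit)
    {
            /* expands to _ocfs2_set_bit(bit, (unsigned long *)bg->bg_bitmap) */
            ocfs2_set_bit(bit, bg->bg_bitmap);
    }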
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 7638a38c32bc..bb37218a7978 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -254,63 +254,6 @@
254 * refcount tree */ 254 * refcount tree */
255 255
256/* 256/*
257 * ioctl commands
258 */
259#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
260#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long)
261#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int)
262#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
263
264/*
265 * Space reservation / allocation / free ioctls and argument structure
266 * are designed to be compatible with XFS.
267 *
268 * ALLOCSP* and FREESP* are not and will never be supported, but are
269 * included here for completeness.
270 */
271struct ocfs2_space_resv {
272 __s16 l_type;
273 __s16 l_whence;
274 __s64 l_start;
275 __s64 l_len; /* len == 0 means until end of file */
276 __s32 l_sysid;
277 __u32 l_pid;
278 __s32 l_pad[4]; /* reserve area */
279};
280
281#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
282#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
283#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
284#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
285#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
286#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
287#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
288#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
289
290/* Used to pass group descriptor data when online resize is done */
291struct ocfs2_new_group_input {
292 __u64 group; /* Group descriptor's blkno. */
293 __u32 clusters; /* Total number of clusters in this group */
294 __u32 frees; /* Total free clusters in this group */
295 __u16 chain; /* Chain for this group */
296 __u16 reserved1;
297 __u32 reserved2;
298};
299
300#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
301#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
302#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
303
304/* Used to pass 2 file names to reflink. */
305struct reflink_arguments {
306 __u64 old_path;
307 __u64 new_path;
308 __u64 preserve;
309};
310#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
311
312
313/*
314 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 257 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
315 */ 258 */
316#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ 259#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
new file mode 100644
index 000000000000..2d3420af1a83
--- /dev/null
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -0,0 +1,79 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_ioctl.h
5 *
6 * Defines OCFS2 ioctls.
7 *
8 * Copyright (C) 2010 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_IOCTL_H
21#define OCFS2_IOCTL_H
22
23/*
24 * ioctl commands
25 */
26#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
27#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long)
28#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int)
29#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
30
31/*
32 * Space reservation / allocation / free ioctls and argument structure
33 * are designed to be compatible with XFS.
34 *
35 * ALLOCSP* and FREESP* are not and will never be supported, but are
36 * included here for completeness.
37 */
38struct ocfs2_space_resv {
39 __s16 l_type;
40 __s16 l_whence;
41 __s64 l_start;
42 __s64 l_len; /* len == 0 means until end of file */
43 __s32 l_sysid;
44 __u32 l_pid;
45 __s32 l_pad[4]; /* reserve area */
46};
47
48#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
49#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
50#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
51#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
52#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
53#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
54#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
55#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
56
57/* Used to pass group descriptor data when online resize is done */
58struct ocfs2_new_group_input {
59 __u64 group; /* Group descriptor's blkno. */
60 __u32 clusters; /* Total number of clusters in this group */
61 __u32 frees; /* Total free clusters in this group */
62 __u16 chain; /* Chain for this group */
63 __u16 reserved1;
64 __u32 reserved2;
65};
66
67#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
 68#define OCFS2_IOC_GROUP_ADD _IOW('o', 2, struct ocfs2_new_group_input)
 69#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3, struct ocfs2_new_group_input)
70
71/* Used to pass 2 file names to reflink. */
72struct reflink_arguments {
73 __u64 old_path;
74 __u64 new_path;
75 __u64 preserve;
76};
77#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
78
79#endif /* OCFS2_IOCTL_H */
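With the ioctl definitions split into ocfs2_ioctl.h, userspace tools can pick up just this header instead of all of ocfs2_fs.h. A hedged userspace sketch of the GETFLAGS call (the include path depends on where your installation ships the header):

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include "ocfs2_ioctl.h"        /* installation-dependent path */

    int main(int argc, char **argv)
    {
            long flags = 0;
            int fd;

            if (argc < 2)
                    return 1;
            fd = open(argv[1], O_RDONLY);
            if (fd < 0)
                    return 1;
            if (ioctl(fd, OCFS2_IOC_GETFLAGS, &flags) == 0)
                    printf("flags: 0x%lx\n", flags);
            close(fd);
            return 0;
    }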
diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h
index 82d5eeac0fff..2e45c8d2ea7e 100644
--- a/fs/ocfs2/ocfs2_lockingver.h
+++ b/fs/ocfs2/ocfs2_lockingver.h
@@ -23,6 +23,8 @@
23/* 23/*
24 * The protocol version for ocfs2 cluster locking. See dlmglue.c for 24 * The protocol version for ocfs2 cluster locking. See dlmglue.c for
25 * more details. 25 * more details.
26 *
27 * 1.0 - Initial locking version from ocfs2 1.4.
26 */ 28 */
27#define OCFS2_LOCKING_PROTOCOL_MAJOR 1 29#define OCFS2_LOCKING_PROTOCOL_MAJOR 1
28#define OCFS2_LOCKING_PROTOCOL_MINOR 0 30#define OCFS2_LOCKING_PROTOCOL_MINOR 0
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index b437dc0c4cad..355f41d1d520 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -851,13 +851,6 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
851} 851}
852 852
853const struct dquot_operations ocfs2_quota_operations = { 853const struct dquot_operations ocfs2_quota_operations = {
854 .initialize = dquot_initialize,
855 .drop = dquot_drop,
856 .alloc_space = dquot_alloc_space,
857 .alloc_inode = dquot_alloc_inode,
858 .free_space = dquot_free_space,
859 .free_inode = dquot_free_inode,
860 .transfer = dquot_transfer,
861 .write_dquot = ocfs2_write_dquot, 854 .write_dquot = ocfs2_write_dquot,
862 .acquire_dquot = ocfs2_acquire_dquot, 855 .acquire_dquot = ocfs2_acquire_dquot,
863 .release_dquot = ocfs2_release_dquot, 856 .release_dquot = ocfs2_release_dquot,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 21f9e71223ca..a6467f3d262e 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -457,7 +457,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
457 break; 457 break;
458 } 458 }
459 dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; 459 dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
460 for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) { 460 for_each_set_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
461 qbh = NULL; 461 qbh = NULL;
462 status = ocfs2_read_quota_block(lqinode, 462 status = ocfs2_read_quota_block(lqinode,
463 ol_dqblk_block(sb, chunk, bit), 463 ol_dqblk_block(sb, chunk, bit),
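for_each_bit() was renamed tree-wide to for_each_set_bit(); the semantics are unchanged, it visits each set bit below the given size. A toy illustration (kernel context assumed):

    #include <linux/bitops.h>
    #include <linux/kernel.h>

    static void example_walk_bits(void)
    {
            unsigned long map = 0x29;       /* bits 0, 3 and 5 set */
            int bit;

            for_each_set_bit(bit, &map, BITS_PER_LONG)
                    pr_info("bit %d is set\n", bit);    /* 0, then 3, then 5 */
    }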
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 8ae65c9c020c..29405f2ff616 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -626,7 +626,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
626 rb = (struct ocfs2_refcount_block *)new_bh->b_data; 626 rb = (struct ocfs2_refcount_block *)new_bh->b_data;
627 memset(rb, 0, inode->i_sb->s_blocksize); 627 memset(rb, 0, inode->i_sb->s_blocksize);
628 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 628 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
629 rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num); 629 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); 631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
632 rb->rf_blkno = cpu_to_le64(first_blkno); 632 rb->rf_blkno = cpu_to_le64(first_blkno);
@@ -1330,7 +1330,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1330 memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize); 1330 memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
1331 1331
1332 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1332 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1333 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); 1333 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1334 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1334 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1335 new_rb->rf_blkno = cpu_to_le64(blkno); 1335 new_rb->rf_blkno = cpu_to_le64(blkno);
1336 new_rb->rf_cpos = cpu_to_le32(0); 1336 new_rb->rf_cpos = cpu_to_le32(0);
@@ -1576,7 +1576,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1576 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1576 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1577 memset(new_rb, 0, sb->s_blocksize); 1577 memset(new_rb, 0, sb->s_blocksize);
1578 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 1578 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1579 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); 1579 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1580 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1580 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1581 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 1581 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1582 new_rb->rf_blkno = cpu_to_le64(blkno); 1582 new_rb->rf_blkno = cpu_to_le64(blkno);
@@ -4075,6 +4075,7 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
4075 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features; 4075 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
4076 spin_unlock(&OCFS2_I(t_inode)->ip_lock); 4076 spin_unlock(&OCFS2_I(t_inode)->ip_lock);
4077 i_size_write(t_inode, size); 4077 i_size_write(t_inode, size);
4078 t_inode->i_blocks = s_inode->i_blocks;
4078 4079
4079 di->i_xattr_inline_size = s_di->i_xattr_inline_size; 4080 di->i_xattr_inline_size = s_di->i_xattr_inline_size;
4080 di->i_clusters = s_di->i_clusters; 4081 di->i_clusters = s_di->i_clusters;
@@ -4390,7 +4391,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
4390 } 4391 }
4391 4392
4392 mutex_lock(&inode->i_mutex); 4393 mutex_lock(&inode->i_mutex);
4393 vfs_dq_init(dir); 4394 dquot_initialize(dir);
4394 error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); 4395 error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
4395 mutex_unlock(&inode->i_mutex); 4396 mutex_unlock(&inode->i_mutex);
4396 if (!error) 4397 if (!error)
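Two substantive fixes ride along in refcounttree.c besides the quota rename: newly created refcount blocks record the slot of the allocator that actually supplied them (meta_ac->ac_alloc_slot) rather than the local osb->slot_num, which matters once metadata can be stolen from another slot's allocator, and ocfs2_complete_reflink() copies i_blocks so the target inode reports the same block count as the source. The slot stamp is what later frees rely on to find the right allocator; in sketch form (illustrative, not the exact ocfs2 call chain):

    /* Sketch: a free looks up the allocator inode by the slot stored in
     * the block itself, so the stamp must name the real source slot. */
    static struct inode *example_allocator_for(struct ocfs2_super *osb,
                                               struct ocfs2_refcount_block *rb)
    {
            u32 slot = le16_to_cpu(rb->rf_suballoc_slot);

            return ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE,
                                               slot);
    }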
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 3038c92af493..7020e1253ffa 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -161,24 +161,23 @@ static int dlm_status_to_errno(enum dlm_status status)
161 161
162static void o2dlm_lock_ast_wrapper(void *astarg) 162static void o2dlm_lock_ast_wrapper(void *astarg)
163{ 163{
164 BUG_ON(o2cb_stack.sp_proto == NULL); 164 struct ocfs2_dlm_lksb *lksb = astarg;
165 165
166 o2cb_stack.sp_proto->lp_lock_ast(astarg); 166 lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
167} 167}
168 168
169static void o2dlm_blocking_ast_wrapper(void *astarg, int level) 169static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
170{ 170{
171 BUG_ON(o2cb_stack.sp_proto == NULL); 171 struct ocfs2_dlm_lksb *lksb = astarg;
172 172
173 o2cb_stack.sp_proto->lp_blocking_ast(astarg, level); 173 lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
174} 174}
175 175
176static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) 176static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
177{ 177{
178 struct ocfs2_dlm_lksb *lksb = astarg;
178 int error = dlm_status_to_errno(status); 179 int error = dlm_status_to_errno(status);
179 180
180 BUG_ON(o2cb_stack.sp_proto == NULL);
181
182 /* 181 /*
183 * In o2dlm, you can get both the lock_ast() for the lock being 182 * In o2dlm, you can get both the lock_ast() for the lock being
184 * granted and the unlock_ast() for the CANCEL failing. A 183 * granted and the unlock_ast() for the CANCEL failing. A
@@ -193,16 +192,15 @@ static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
193 if (status == DLM_CANCELGRANT) 192 if (status == DLM_CANCELGRANT)
194 return; 193 return;
195 194
196 o2cb_stack.sp_proto->lp_unlock_ast(astarg, error); 195 lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, error);
197} 196}
198 197
199static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, 198static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
200 int mode, 199 int mode,
201 union ocfs2_dlm_lksb *lksb, 200 struct ocfs2_dlm_lksb *lksb,
202 u32 flags, 201 u32 flags,
203 void *name, 202 void *name,
204 unsigned int namelen, 203 unsigned int namelen)
205 void *astarg)
206{ 204{
207 enum dlm_status status; 205 enum dlm_status status;
208 int o2dlm_mode = mode_to_o2dlm(mode); 206 int o2dlm_mode = mode_to_o2dlm(mode);
@@ -211,28 +209,27 @@ static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
211 209
212 status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm, 210 status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
213 o2dlm_flags, name, namelen, 211 o2dlm_flags, name, namelen,
214 o2dlm_lock_ast_wrapper, astarg, 212 o2dlm_lock_ast_wrapper, lksb,
215 o2dlm_blocking_ast_wrapper); 213 o2dlm_blocking_ast_wrapper);
216 ret = dlm_status_to_errno(status); 214 ret = dlm_status_to_errno(status);
217 return ret; 215 return ret;
218} 216}
219 217
220static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn, 218static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
221 union ocfs2_dlm_lksb *lksb, 219 struct ocfs2_dlm_lksb *lksb,
222 u32 flags, 220 u32 flags)
223 void *astarg)
224{ 221{
225 enum dlm_status status; 222 enum dlm_status status;
226 int o2dlm_flags = flags_to_o2dlm(flags); 223 int o2dlm_flags = flags_to_o2dlm(flags);
227 int ret; 224 int ret;
228 225
229 status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm, 226 status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
230 o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg); 227 o2dlm_flags, o2dlm_unlock_ast_wrapper, lksb);
231 ret = dlm_status_to_errno(status); 228 ret = dlm_status_to_errno(status);
232 return ret; 229 return ret;
233} 230}
234 231
235static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 232static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
236{ 233{
237 return dlm_status_to_errno(lksb->lksb_o2dlm.status); 234 return dlm_status_to_errno(lksb->lksb_o2dlm.status);
238} 235}
@@ -242,17 +239,17 @@ static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
242 * contents, it will zero out the LVB. Thus the caller can always trust 239 * contents, it will zero out the LVB. Thus the caller can always trust
243 * the contents. 240 * the contents.
244 */ 241 */
245static int o2cb_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 242static int o2cb_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
246{ 243{
247 return 1; 244 return 1;
248} 245}
249 246
250static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb) 247static void *o2cb_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
251{ 248{
252 return (void *)(lksb->lksb_o2dlm.lvb); 249 return (void *)(lksb->lksb_o2dlm.lvb);
253} 250}
254 251
255static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb) 252static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
256{ 253{
257 dlm_print_one_lock(lksb->lksb_o2dlm.lockid); 254 dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
258} 255}
@@ -280,7 +277,7 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
280 struct dlm_protocol_version fs_version; 277 struct dlm_protocol_version fs_version;
281 278
282 BUG_ON(conn == NULL); 279 BUG_ON(conn == NULL);
283 BUG_ON(o2cb_stack.sp_proto == NULL); 280 BUG_ON(conn->cc_proto == NULL);
284 281
285 /* for now we only have one cluster/node, make sure we see it 282 /* for now we only have one cluster/node, make sure we see it
286 * in the heartbeat universe */ 283 * in the heartbeat universe */
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index da78a2a334fd..5ae8812b2864 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -25,7 +25,6 @@
25#include <linux/reboot.h> 25#include <linux/reboot.h>
26#include <asm/uaccess.h> 26#include <asm/uaccess.h>
27 27
28#include "ocfs2.h" /* For struct ocfs2_lock_res */
29#include "stackglue.h" 28#include "stackglue.h"
30 29
31#include <linux/dlm_plock.h> 30#include <linux/dlm_plock.h>
@@ -63,8 +62,8 @@
63 * negotiated by the client. The client negotiates based on the maximum 62 * negotiated by the client. The client negotiates based on the maximum
64 * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major 63 * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major
65 * number from the "SETV" message must match 64 * number from the "SETV" message must match
66 * ocfs2_user_plugin.sp_proto->lp_max_version.pv_major, and the minor number 65 * ocfs2_user_plugin.sp_max_proto.pv_major, and the minor number
 67 * must be less than or equal to ...->lp_max_version.pv_minor. 66 * must be less than or equal to ...sp_max_proto.pv_minor.
68 * 67 *
69 * Once this information has been set, mounts will be allowed. From this 68 * Once this information has been set, mounts will be allowed. From this
70 * point on, the "DOWN" message can be sent for node down notification. 69 * point on, the "DOWN" message can be sent for node down notification.
@@ -401,7 +400,7 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
401 char *ptr = NULL; 400 char *ptr = NULL;
402 struct ocfs2_control_private *p = file->private_data; 401 struct ocfs2_control_private *p = file->private_data;
403 struct ocfs2_protocol_version *max = 402 struct ocfs2_protocol_version *max =
404 &ocfs2_user_plugin.sp_proto->lp_max_version; 403 &ocfs2_user_plugin.sp_max_proto;
405 404
406 if (ocfs2_control_get_handshake_state(file) != 405 if (ocfs2_control_get_handshake_state(file) !=
407 OCFS2_CONTROL_HANDSHAKE_PROTOCOL) 406 OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
@@ -664,18 +663,10 @@ static void ocfs2_control_exit(void)
664 -rc); 663 -rc);
665} 664}
666 665
667static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg)
668{
669 struct ocfs2_lock_res *res = astarg;
670 return &res->l_lksb.lksb_fsdlm;
671}
672
673static void fsdlm_lock_ast_wrapper(void *astarg) 666static void fsdlm_lock_ast_wrapper(void *astarg)
674{ 667{
675 struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg); 668 struct ocfs2_dlm_lksb *lksb = astarg;
676 int status = lksb->sb_status; 669 int status = lksb->lksb_fsdlm.sb_status;
677
678 BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
679 670
680 /* 671 /*
681 * For now we're punting on the issue of other non-standard errors 672 * For now we're punting on the issue of other non-standard errors
@@ -688,25 +679,24 @@ static void fsdlm_lock_ast_wrapper(void *astarg)
688 */ 679 */
689 680
690 if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL) 681 if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
691 ocfs2_user_plugin.sp_proto->lp_unlock_ast(astarg, 0); 682 lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, 0);
692 else 683 else
693 ocfs2_user_plugin.sp_proto->lp_lock_ast(astarg); 684 lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
694} 685}
695 686
696static void fsdlm_blocking_ast_wrapper(void *astarg, int level) 687static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
697{ 688{
698 BUG_ON(ocfs2_user_plugin.sp_proto == NULL); 689 struct ocfs2_dlm_lksb *lksb = astarg;
699 690
700 ocfs2_user_plugin.sp_proto->lp_blocking_ast(astarg, level); 691 lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
701} 692}
702 693
703static int user_dlm_lock(struct ocfs2_cluster_connection *conn, 694static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
704 int mode, 695 int mode,
705 union ocfs2_dlm_lksb *lksb, 696 struct ocfs2_dlm_lksb *lksb,
706 u32 flags, 697 u32 flags,
707 void *name, 698 void *name,
708 unsigned int namelen, 699 unsigned int namelen)
709 void *astarg)
710{ 700{
711 int ret; 701 int ret;
712 702
@@ -716,36 +706,35 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
716 706
717 ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, 707 ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
718 flags|DLM_LKF_NODLCKWT, name, namelen, 0, 708 flags|DLM_LKF_NODLCKWT, name, namelen, 0,
719 fsdlm_lock_ast_wrapper, astarg, 709 fsdlm_lock_ast_wrapper, lksb,
720 fsdlm_blocking_ast_wrapper); 710 fsdlm_blocking_ast_wrapper);
721 return ret; 711 return ret;
722} 712}
723 713
724static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, 714static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
725 union ocfs2_dlm_lksb *lksb, 715 struct ocfs2_dlm_lksb *lksb,
726 u32 flags, 716 u32 flags)
727 void *astarg)
728{ 717{
729 int ret; 718 int ret;
730 719
731 ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, 720 ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
732 flags, &lksb->lksb_fsdlm, astarg); 721 flags, &lksb->lksb_fsdlm, lksb);
733 return ret; 722 return ret;
734} 723}
735 724
736static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 725static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
737{ 726{
738 return lksb->lksb_fsdlm.sb_status; 727 return lksb->lksb_fsdlm.sb_status;
739} 728}
740 729
741static int user_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 730static int user_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
742{ 731{
743 int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID; 732 int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID;
744 733
745 return !invalid; 734 return !invalid;
746} 735}
747 736
748static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb) 737static void *user_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
749{ 738{
750 if (!lksb->lksb_fsdlm.sb_lvbptr) 739 if (!lksb->lksb_fsdlm.sb_lvbptr)
751 lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + 740 lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
@@ -753,7 +742,7 @@ static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
753 return (void *)(lksb->lksb_fsdlm.sb_lvbptr); 742 return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
754} 743}
755 744
756static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) 745static void user_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
757{ 746{
758} 747}
759 748
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index f3df0baa9a48..39abf89697ed 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -36,7 +36,7 @@
36#define OCFS2_STACK_PLUGIN_USER "user" 36#define OCFS2_STACK_PLUGIN_USER "user"
37#define OCFS2_MAX_HB_CTL_PATH 256 37#define OCFS2_MAX_HB_CTL_PATH 256
38 38
39static struct ocfs2_locking_protocol *lproto; 39static struct ocfs2_protocol_version locking_max_version;
40static DEFINE_SPINLOCK(ocfs2_stack_lock); 40static DEFINE_SPINLOCK(ocfs2_stack_lock);
41static LIST_HEAD(ocfs2_stack_list); 41static LIST_HEAD(ocfs2_stack_list);
42static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; 42static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
@@ -176,7 +176,7 @@ int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
176 spin_lock(&ocfs2_stack_lock); 176 spin_lock(&ocfs2_stack_lock);
177 if (!ocfs2_stack_lookup(plugin->sp_name)) { 177 if (!ocfs2_stack_lookup(plugin->sp_name)) {
178 plugin->sp_count = 0; 178 plugin->sp_count = 0;
179 plugin->sp_proto = lproto; 179 plugin->sp_max_proto = locking_max_version;
180 list_add(&plugin->sp_list, &ocfs2_stack_list); 180 list_add(&plugin->sp_list, &ocfs2_stack_list);
181 printk(KERN_INFO "ocfs2: Registered cluster interface %s\n", 181 printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
182 plugin->sp_name); 182 plugin->sp_name);
@@ -213,77 +213,76 @@ void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin)
213} 213}
214EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister); 214EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister);
215 215
216void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto) 216void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto)
217{ 217{
218 struct ocfs2_stack_plugin *p; 218 struct ocfs2_stack_plugin *p;
219 219
220 BUG_ON(proto == NULL);
221
222 spin_lock(&ocfs2_stack_lock); 220 spin_lock(&ocfs2_stack_lock);
223 BUG_ON(active_stack != NULL); 221 if (memcmp(max_proto, &locking_max_version,
222 sizeof(struct ocfs2_protocol_version))) {
223 BUG_ON(locking_max_version.pv_major != 0);
224 224
225 lproto = proto; 225 locking_max_version = *max_proto;
226 list_for_each_entry(p, &ocfs2_stack_list, sp_list) { 226 list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
227 p->sp_proto = lproto; 227 p->sp_max_proto = locking_max_version;
228 }
228 } 229 }
229
230 spin_unlock(&ocfs2_stack_lock); 230 spin_unlock(&ocfs2_stack_lock);
231} 231}
232EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol); 232EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version);
233 233
234 234
235/* 235/*
236 * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take 236 * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no argument
237 * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the 237 * for the ast and bast functions. They will pass the lksb to the ast
238 * underlying stack plugins need to pilfer the lksb off of the lock_res. 238 * and bast. The caller can wrap the lksb with their own structure to
239 * If some other structure needs to be passed as an astarg, the plugins 239 * get more information.
240 * will need to be given a different avenue to the lksb.
241 */ 240 */
242int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, 241int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
243 int mode, 242 int mode,
244 union ocfs2_dlm_lksb *lksb, 243 struct ocfs2_dlm_lksb *lksb,
245 u32 flags, 244 u32 flags,
246 void *name, 245 void *name,
247 unsigned int namelen, 246 unsigned int namelen)
248 struct ocfs2_lock_res *astarg)
249{ 247{
250 BUG_ON(lproto == NULL); 248 if (!lksb->lksb_conn)
251 249 lksb->lksb_conn = conn;
250 else
251 BUG_ON(lksb->lksb_conn != conn);
252 return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags, 252 return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
253 name, namelen, astarg); 253 name, namelen);
254} 254}
255EXPORT_SYMBOL_GPL(ocfs2_dlm_lock); 255EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
256 256
257int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, 257int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
258 union ocfs2_dlm_lksb *lksb, 258 struct ocfs2_dlm_lksb *lksb,
259 u32 flags, 259 u32 flags)
260 struct ocfs2_lock_res *astarg)
261{ 260{
262 BUG_ON(lproto == NULL); 261 BUG_ON(lksb->lksb_conn == NULL);
263 262
264 return active_stack->sp_ops->dlm_unlock(conn, lksb, flags, astarg); 263 return active_stack->sp_ops->dlm_unlock(conn, lksb, flags);
265} 264}
266EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock); 265EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
267 266
268int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 267int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
269{ 268{
270 return active_stack->sp_ops->lock_status(lksb); 269 return active_stack->sp_ops->lock_status(lksb);
271} 270}
272EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status); 271EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
273 272
274int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 273int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
275{ 274{
276 return active_stack->sp_ops->lvb_valid(lksb); 275 return active_stack->sp_ops->lvb_valid(lksb);
277} 276}
278EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid); 277EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid);
279 278
280void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb) 279void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
281{ 280{
282 return active_stack->sp_ops->lock_lvb(lksb); 281 return active_stack->sp_ops->lock_lvb(lksb);
283} 282}
284EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb); 283EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
285 284
286void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) 285void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
287{ 286{
288 active_stack->sp_ops->dump_lksb(lksb); 287 active_stack->sp_ops->dump_lksb(lksb);
289} 288}
@@ -312,6 +311,7 @@ EXPORT_SYMBOL_GPL(ocfs2_plock);
312int ocfs2_cluster_connect(const char *stack_name, 311int ocfs2_cluster_connect(const char *stack_name,
313 const char *group, 312 const char *group,
314 int grouplen, 313 int grouplen,
314 struct ocfs2_locking_protocol *lproto,
315 void (*recovery_handler)(int node_num, 315 void (*recovery_handler)(int node_num,
316 void *recovery_data), 316 void *recovery_data),
317 void *recovery_data, 317 void *recovery_data,
@@ -329,6 +329,12 @@ int ocfs2_cluster_connect(const char *stack_name,
329 goto out; 329 goto out;
330 } 330 }
331 331
332 if (memcmp(&lproto->lp_max_version, &locking_max_version,
333 sizeof(struct ocfs2_protocol_version))) {
334 rc = -EINVAL;
335 goto out;
336 }
337
332 new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection), 338 new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
333 GFP_KERNEL); 339 GFP_KERNEL);
334 if (!new_conn) { 340 if (!new_conn) {
@@ -341,6 +347,7 @@ int ocfs2_cluster_connect(const char *stack_name,
341 new_conn->cc_recovery_handler = recovery_handler; 347 new_conn->cc_recovery_handler = recovery_handler;
342 new_conn->cc_recovery_data = recovery_data; 348 new_conn->cc_recovery_data = recovery_data;
343 349
350 new_conn->cc_proto = lproto;
344 /* Start the new connection at our maximum compatibility level */ 351 /* Start the new connection at our maximum compatibility level */
345 new_conn->cc_version = lproto->lp_max_version; 352 new_conn->cc_version = lproto->lp_max_version;
346 353
@@ -366,6 +373,24 @@ out:
366} 373}
367EXPORT_SYMBOL_GPL(ocfs2_cluster_connect); 374EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
368 375
376/* The caller will ensure all nodes have the same cluster stack */
377int ocfs2_cluster_connect_agnostic(const char *group,
378 int grouplen,
379 struct ocfs2_locking_protocol *lproto,
380 void (*recovery_handler)(int node_num,
381 void *recovery_data),
382 void *recovery_data,
383 struct ocfs2_cluster_connection **conn)
384{
385 char *stack_name = NULL;
386
387 if (cluster_stack_name[0])
388 stack_name = cluster_stack_name;
389 return ocfs2_cluster_connect(stack_name, group, grouplen, lproto,
390 recovery_handler, recovery_data, conn);
391}
392EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
393
369/* If hangup_pending is 0, the stack driver will be dropped */ 394/* If hangup_pending is 0, the stack driver will be dropped */
370int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, 395int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
371 int hangup_pending) 396 int hangup_pending)
@@ -453,10 +478,10 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
453 ssize_t ret = 0; 478 ssize_t ret = 0;
454 479
455 spin_lock(&ocfs2_stack_lock); 480 spin_lock(&ocfs2_stack_lock);
456 if (lproto) 481 if (locking_max_version.pv_major)
457 ret = snprintf(buf, PAGE_SIZE, "%u.%u\n", 482 ret = snprintf(buf, PAGE_SIZE, "%u.%u\n",
458 lproto->lp_max_version.pv_major, 483 locking_max_version.pv_major,
459 lproto->lp_max_version.pv_minor); 484 locking_max_version.pv_minor);
460 spin_unlock(&ocfs2_stack_lock); 485 spin_unlock(&ocfs2_stack_lock);
461 486
462 return ret; 487 return ret;
@@ -685,7 +710,10 @@ static int __init ocfs2_stack_glue_init(void)
685 710
686static void __exit ocfs2_stack_glue_exit(void) 711static void __exit ocfs2_stack_glue_exit(void)
687{ 712{
688 lproto = NULL; 713 memset(&locking_max_version, 0,
714 sizeof(struct ocfs2_protocol_version));
715 locking_max_version.pv_major = 0;
716 locking_max_version.pv_minor = 0;
689 ocfs2_sysfs_exit(); 717 ocfs2_sysfs_exit();
690 if (ocfs2_table_header) 718 if (ocfs2_table_header)
691 unregister_sysctl_table(ocfs2_table_header); 719 unregister_sysctl_table(ocfs2_table_header);
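After this change the stackglue keeps no global ocfs2_locking_protocol; it only remembers the maximum advertised version (locking_max_version), and each connection carries its own protocol in cc_proto. A caller therefore advertises its version once and then passes the protocol into every connect. A hedged sketch of the client side, with illustrative names:

    static void example_lock_ast(struct ocfs2_dlm_lksb *lksb);
    static void example_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level);
    static void example_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error);
    static void example_recovery(int node_num, void *recovery_data);

    static struct ocfs2_locking_protocol example_proto = {
            .lp_max_version  = { .pv_major = 1, .pv_minor = 0 },
            .lp_lock_ast     = example_lock_ast,
            .lp_blocking_ast = example_blocking_ast,
            .lp_unlock_ast   = example_unlock_ast,
    };

    static int example_connect(const char *group, int grouplen,
                               struct ocfs2_cluster_connection **conn)
    {
            /* Advertise before connecting; connect checks for a match. */
            ocfs2_stack_glue_set_max_proto_version(&example_proto.lp_max_version);
            return ocfs2_cluster_connect("o2cb", group, grouplen, &example_proto,
                                         example_recovery, NULL, conn);
    }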
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 03a44d60eac9..8ce7398ae1d2 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -56,17 +56,6 @@ struct ocfs2_protocol_version {
56}; 56};
57 57
58/* 58/*
59 * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
60 */
61struct ocfs2_locking_protocol {
62 struct ocfs2_protocol_version lp_max_version;
63 void (*lp_lock_ast)(void *astarg);
64 void (*lp_blocking_ast)(void *astarg, int level);
65 void (*lp_unlock_ast)(void *astarg, int error);
66};
67
68
69/*
70 * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only 59 * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
71 * has a pointer to separately allocated lvb space. This struct exists only to 60 * has a pointer to separately allocated lvb space. This struct exists only to
72 * include in the lksb union to make space for a combined dlm_lksb and lvb. 61 * include in the lksb union to make space for a combined dlm_lksb and lvb.
@@ -81,12 +70,27 @@ struct fsdlm_lksb_plus_lvb {
81 * size of the union is known. Lock status structures are embedded in 70 * size of the union is known. Lock status structures are embedded in
82 * ocfs2 inodes. 71 * ocfs2 inodes.
83 */ 72 */
84union ocfs2_dlm_lksb { 73struct ocfs2_cluster_connection;
85 struct dlm_lockstatus lksb_o2dlm; 74struct ocfs2_dlm_lksb {
86 struct dlm_lksb lksb_fsdlm; 75 union {
87 struct fsdlm_lksb_plus_lvb padding; 76 struct dlm_lockstatus lksb_o2dlm;
77 struct dlm_lksb lksb_fsdlm;
78 struct fsdlm_lksb_plus_lvb padding;
79 };
80 struct ocfs2_cluster_connection *lksb_conn;
81};
82
83/*
84 * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
85 */
86struct ocfs2_locking_protocol {
87 struct ocfs2_protocol_version lp_max_version;
88 void (*lp_lock_ast)(struct ocfs2_dlm_lksb *lksb);
89 void (*lp_blocking_ast)(struct ocfs2_dlm_lksb *lksb, int level);
90 void (*lp_unlock_ast)(struct ocfs2_dlm_lksb *lksb, int error);
88}; 91};
89 92
93
90/* 94/*
91 * A cluster connection. Mostly opaque to ocfs2, the connection holds 95 * A cluster connection. Mostly opaque to ocfs2, the connection holds
92 * state for the underlying stack. ocfs2 does use cc_version to determine 96 * state for the underlying stack. ocfs2 does use cc_version to determine
@@ -96,6 +100,7 @@ struct ocfs2_cluster_connection {
96 char cc_name[GROUP_NAME_MAX]; 100 char cc_name[GROUP_NAME_MAX];
97 int cc_namelen; 101 int cc_namelen;
98 struct ocfs2_protocol_version cc_version; 102 struct ocfs2_protocol_version cc_version;
103 struct ocfs2_locking_protocol *cc_proto;
99 void (*cc_recovery_handler)(int node_num, void *recovery_data); 104 void (*cc_recovery_handler)(int node_num, void *recovery_data);
100 void *cc_recovery_data; 105 void *cc_recovery_data;
101 void *cc_lockspace; 106 void *cc_lockspace;
@@ -155,27 +160,29 @@ struct ocfs2_stack_operations {
155 * 160 *
156 * ast and bast functions are not part of the call because the 161 * ast and bast functions are not part of the call because the
157 * stack will likely want to wrap ast and bast calls before passing 162 * stack will likely want to wrap ast and bast calls before passing
158 * them to stack->sp_proto. 163 * them to stack->sp_proto. There is no astarg. The lksb will
164 * be passed back to the ast and bast functions. The caller can
165 * use this to find their object.
159 */ 166 */
160 int (*dlm_lock)(struct ocfs2_cluster_connection *conn, 167 int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
161 int mode, 168 int mode,
162 union ocfs2_dlm_lksb *lksb, 169 struct ocfs2_dlm_lksb *lksb,
163 u32 flags, 170 u32 flags,
164 void *name, 171 void *name,
165 unsigned int namelen, 172 unsigned int namelen);
166 void *astarg);
167 173
168 /* 174 /*
169 * Call the underlying dlm unlock function. The ->dlm_unlock() 175 * Call the underlying dlm unlock function. The ->dlm_unlock()
170 * function should convert the flags as appropriate. 176 * function should convert the flags as appropriate.
171 * 177 *
172 * The unlock ast is not passed, as the stack will want to wrap 178 * The unlock ast is not passed, as the stack will want to wrap
173 * it before calling stack->sp_proto->lp_unlock_ast(). 179 * it before calling stack->sp_proto->lp_unlock_ast(). There is
180 * no astarg. The lksb will be passed back to the unlock ast
181 * function. The caller can use this to find their object.
174 */ 182 */
175 int (*dlm_unlock)(struct ocfs2_cluster_connection *conn, 183 int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
176 union ocfs2_dlm_lksb *lksb, 184 struct ocfs2_dlm_lksb *lksb,
177 u32 flags, 185 u32 flags);
178 void *astarg);
179 186
180 /* 187 /*
181 * Return the status of the current lock status block. The fs 188 * Return the status of the current lock status block. The fs
@@ -183,17 +190,17 @@ struct ocfs2_stack_operations {
183 * callback pulls out the stack-specific lksb, converts the status 190 * callback pulls out the stack-specific lksb, converts the status
184 * to a proper errno, and returns it. 191 * to a proper errno, and returns it.
185 */ 192 */
186 int (*lock_status)(union ocfs2_dlm_lksb *lksb); 193 int (*lock_status)(struct ocfs2_dlm_lksb *lksb);
187 194
188 /* 195 /*
189 * Return non-zero if the LVB is valid. 196 * Return non-zero if the LVB is valid.
190 */ 197 */
191 int (*lvb_valid)(union ocfs2_dlm_lksb *lksb); 198 int (*lvb_valid)(struct ocfs2_dlm_lksb *lksb);
192 199
193 /* 200 /*
194 * Pull the lvb pointer off of the stack-specific lksb. 201 * Pull the lvb pointer off of the stack-specific lksb.
195 */ 202 */
196 void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb); 203 void *(*lock_lvb)(struct ocfs2_dlm_lksb *lksb);
197 204
198 /* 205 /*
199 * Cluster-aware posix locks 206 * Cluster-aware posix locks
@@ -210,7 +217,7 @@ struct ocfs2_stack_operations {
 210 * This is an optional debugging hook. If provided, the 217
211 * stack can dump debugging information about this lock. 218 * stack can dump debugging information about this lock.
212 */ 219 */
213 void (*dump_lksb)(union ocfs2_dlm_lksb *lksb); 220 void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb);
214}; 221};
215 222
216/* 223/*
@@ -226,7 +233,7 @@ struct ocfs2_stack_plugin {
226 /* These are managed by the stackglue code. */ 233 /* These are managed by the stackglue code. */
227 struct list_head sp_list; 234 struct list_head sp_list;
228 unsigned int sp_count; 235 unsigned int sp_count;
229 struct ocfs2_locking_protocol *sp_proto; 236 struct ocfs2_protocol_version sp_max_proto;
230}; 237};
231 238
232 239
@@ -234,10 +241,22 @@ struct ocfs2_stack_plugin {
234int ocfs2_cluster_connect(const char *stack_name, 241int ocfs2_cluster_connect(const char *stack_name,
235 const char *group, 242 const char *group,
236 int grouplen, 243 int grouplen,
244 struct ocfs2_locking_protocol *lproto,
237 void (*recovery_handler)(int node_num, 245 void (*recovery_handler)(int node_num,
238 void *recovery_data), 246 void *recovery_data),
239 void *recovery_data, 247 void *recovery_data,
240 struct ocfs2_cluster_connection **conn); 248 struct ocfs2_cluster_connection **conn);
249/*
250 * Used by callers that don't store their stack name. They must ensure
251 * all nodes have the same stack.
252 */
253int ocfs2_cluster_connect_agnostic(const char *group,
254 int grouplen,
255 struct ocfs2_locking_protocol *lproto,
256 void (*recovery_handler)(int node_num,
257 void *recovery_data),
258 void *recovery_data,
259 struct ocfs2_cluster_connection **conn);
241int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, 260int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
242 int hangup_pending); 261 int hangup_pending);
243void ocfs2_cluster_hangup(const char *group, int grouplen); 262void ocfs2_cluster_hangup(const char *group, int grouplen);
@@ -246,26 +265,24 @@ int ocfs2_cluster_this_node(unsigned int *node);
246struct ocfs2_lock_res; 265struct ocfs2_lock_res;
247int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, 266int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
248 int mode, 267 int mode,
249 union ocfs2_dlm_lksb *lksb, 268 struct ocfs2_dlm_lksb *lksb,
250 u32 flags, 269 u32 flags,
251 void *name, 270 void *name,
252 unsigned int namelen, 271 unsigned int namelen);
253 struct ocfs2_lock_res *astarg);
254int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, 272int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
255 union ocfs2_dlm_lksb *lksb, 273 struct ocfs2_dlm_lksb *lksb,
256 u32 flags, 274 u32 flags);
257 struct ocfs2_lock_res *astarg);
258 275
259int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb); 276int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb);
260int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb); 277int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb);
261void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb); 278void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb);
262void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb); 279void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb);
263 280
264int ocfs2_stack_supports_plocks(void); 281int ocfs2_stack_supports_plocks(void);
265int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, 282int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
266 struct file *file, int cmd, struct file_lock *fl); 283 struct file *file, int cmd, struct file_lock *fl);
267 284
268void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto); 285void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto);
269 286
270 287
271/* Used by stack plugins */ 288/* Used by stack plugins */
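The new struct ocfs2_dlm_lksb wraps the stack-specific lksb union together with the owning connection, and it is the lksb itself that now comes back to lp_lock_ast() and friends. A client that embeds the lksb in its own lock structure recovers that structure with container_of(), which is the "caller can use this to find their object" arrangement the comments above describe. A minimal sketch, assuming an embedding like ocfs2's ocfs2_lock_res:

    #include <linux/kernel.h>       /* container_of() */

    struct example_lock_res {
            int                     el_level;
            struct ocfs2_dlm_lksb   el_lksb;        /* embedded, not a pointer */
    };

    static void example_lock_ast(struct ocfs2_dlm_lksb *lksb)
    {
            struct example_lock_res *res =
                    container_of(lksb, struct example_lock_res, el_lksb);

            /* ... grant handling on res; status via ocfs2_dlm_lock_status(lksb) ... */
    }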
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c30b644d9572..19ba00f28547 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -51,7 +51,7 @@
51#define ALLOC_NEW_GROUP 0x1 51#define ALLOC_NEW_GROUP 0x1
52#define ALLOC_GROUPS_FROM_GLOBAL 0x2 52#define ALLOC_GROUPS_FROM_GLOBAL 0x2
53 53
54#define OCFS2_MAX_INODES_TO_STEAL 1024 54#define OCFS2_MAX_TO_STEAL 1024
55 55
56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
@@ -95,13 +95,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
95 struct buffer_head *group_bh, 95 struct buffer_head *group_bh,
96 unsigned int bit_off, 96 unsigned int bit_off,
97 unsigned int num_bits); 97 unsigned int num_bits);
98static inline int ocfs2_block_group_clear_bits(handle_t *handle,
99 struct inode *alloc_inode,
100 struct ocfs2_group_desc *bg,
101 struct buffer_head *group_bh,
102 unsigned int bit_off,
103 unsigned int num_bits);
104
105static int ocfs2_relink_block_group(handle_t *handle, 98static int ocfs2_relink_block_group(handle_t *handle,
106 struct inode *alloc_inode, 99 struct inode *alloc_inode,
107 struct buffer_head *fe_bh, 100 struct buffer_head *fe_bh,
@@ -152,7 +145,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
152 145
153#define do_error(fmt, ...) \ 146#define do_error(fmt, ...) \
154 do{ \ 147 do{ \
155 if (clean_error) \ 148 if (resize) \
156 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ 149 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
157 else \ 150 else \
158 ocfs2_error(sb, fmt, ##__VA_ARGS__); \ 151 ocfs2_error(sb, fmt, ##__VA_ARGS__); \
@@ -160,7 +153,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
160 153
161static int ocfs2_validate_gd_self(struct super_block *sb, 154static int ocfs2_validate_gd_self(struct super_block *sb,
162 struct buffer_head *bh, 155 struct buffer_head *bh,
163 int clean_error) 156 int resize)
164{ 157{
165 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 158 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
166 159
@@ -211,7 +204,7 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
211static int ocfs2_validate_gd_parent(struct super_block *sb, 204static int ocfs2_validate_gd_parent(struct super_block *sb,
212 struct ocfs2_dinode *di, 205 struct ocfs2_dinode *di,
213 struct buffer_head *bh, 206 struct buffer_head *bh,
214 int clean_error) 207 int resize)
215{ 208{
216 unsigned int max_bits; 209 unsigned int max_bits;
217 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 210 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -233,8 +226,11 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
233 return -EINVAL; 226 return -EINVAL;
234 } 227 }
235 228
 236 if (le16_to_cpu(gd->bg_chain) >= 229 /* In resize, we may encounter the case bg_chain == cl_next_free_rec. */
237 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { 230 if ((le16_to_cpu(gd->bg_chain) >
231 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
232 ((le16_to_cpu(gd->bg_chain) ==
233 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
238 do_error("Group descriptor #%llu has bad chain %u", 234 do_error("Group descriptor #%llu has bad chain %u",
239 (unsigned long long)bh->b_blocknr, 235 (unsigned long long)bh->b_blocknr,
240 le16_to_cpu(gd->bg_chain)); 236 le16_to_cpu(gd->bg_chain));
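The relaxed bound only tolerates equality while a resize is in flight: a group being added by online resize can legitimately show bg_chain == cl_next_free_rec before the chain list is extended, whereas outside resize that equality still means corruption. The check reduces to (hypothetical helper):

    /* Sketch: validity of gd->bg_chain against the dinode's chain count. */
    static int example_chain_is_bad(u16 bg_chain, u16 next_free, int resize)
    {
            if (bg_chain > next_free)
                    return 1;               /* out of range: always bad */
            if (bg_chain == next_free && !resize)
                    return 1;               /* equality is resize-only */
            return 0;
    }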
@@ -637,12 +633,113 @@ bail:
637 return status; 633 return status;
638} 634}
639 635
636static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
637{
638 spin_lock(&osb->osb_lock);
639 osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
640 spin_unlock(&osb->osb_lock);
641 atomic_set(&osb->s_num_inodes_stolen, 0);
642}
643
644static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
645{
646 spin_lock(&osb->osb_lock);
647 osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
648 spin_unlock(&osb->osb_lock);
649 atomic_set(&osb->s_num_meta_stolen, 0);
650}
651
652void ocfs2_init_steal_slots(struct ocfs2_super *osb)
653{
654 ocfs2_init_inode_steal_slot(osb);
655 ocfs2_init_meta_steal_slot(osb);
656}
657
658static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
659{
660 spin_lock(&osb->osb_lock);
661 if (type == INODE_ALLOC_SYSTEM_INODE)
662 osb->s_inode_steal_slot = slot;
663 else if (type == EXTENT_ALLOC_SYSTEM_INODE)
664 osb->s_meta_steal_slot = slot;
665 spin_unlock(&osb->osb_lock);
666}
667
668static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
669{
670 int slot = OCFS2_INVALID_SLOT;
671
672 spin_lock(&osb->osb_lock);
673 if (type == INODE_ALLOC_SYSTEM_INODE)
674 slot = osb->s_inode_steal_slot;
675 else if (type == EXTENT_ALLOC_SYSTEM_INODE)
676 slot = osb->s_meta_steal_slot;
677 spin_unlock(&osb->osb_lock);
678
679 return slot;
680}
681
682static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
683{
684 return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
685}
686
687static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
688{
689 return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
690}
691
692static int ocfs2_steal_resource(struct ocfs2_super *osb,
693 struct ocfs2_alloc_context *ac,
694 int type)
695{
696 int i, status = -ENOSPC;
697 int slot = __ocfs2_get_steal_slot(osb, type);
698
 699 /* Start stealing resources from the first slot after ours. */
700 if (slot == OCFS2_INVALID_SLOT)
701 slot = osb->slot_num + 1;
702
703 for (i = 0; i < osb->max_slots; i++, slot++) {
704 if (slot == osb->max_slots)
705 slot = 0;
706
707 if (slot == osb->slot_num)
708 continue;
709
710 status = ocfs2_reserve_suballoc_bits(osb, ac,
711 type,
712 (u32)slot, NULL,
713 NOT_ALLOC_NEW_GROUP);
714 if (status >= 0) {
715 __ocfs2_set_steal_slot(osb, slot, type);
716 break;
717 }
718
719 ocfs2_free_ac_resource(ac);
720 }
721
722 return status;
723}
724
725static int ocfs2_steal_inode(struct ocfs2_super *osb,
726 struct ocfs2_alloc_context *ac)
727{
728 return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
729}
730
731static int ocfs2_steal_meta(struct ocfs2_super *osb,
732 struct ocfs2_alloc_context *ac)
733{
734 return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
735}
736
640int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, 737int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
641 int blocks, 738 int blocks,
642 struct ocfs2_alloc_context **ac) 739 struct ocfs2_alloc_context **ac)
643{ 740{
644 int status; 741 int status;
645 u32 slot; 742 int slot = ocfs2_get_meta_steal_slot(osb);
646 743
647 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 744 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
648 if (!(*ac)) { 745 if (!(*ac)) {
@@ -653,12 +750,34 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
653 750
654 (*ac)->ac_bits_wanted = blocks; 751 (*ac)->ac_bits_wanted = blocks;
655 (*ac)->ac_which = OCFS2_AC_USE_META; 752 (*ac)->ac_which = OCFS2_AC_USE_META;
656 slot = osb->slot_num;
657 (*ac)->ac_group_search = ocfs2_block_group_search; 753 (*ac)->ac_group_search = ocfs2_block_group_search;
658 754
755 if (slot != OCFS2_INVALID_SLOT &&
756 atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
757 goto extent_steal;
758
759 atomic_set(&osb->s_num_meta_stolen, 0);
659 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 760 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
660 EXTENT_ALLOC_SYSTEM_INODE, 761 EXTENT_ALLOC_SYSTEM_INODE,
661 slot, NULL, ALLOC_NEW_GROUP); 762 (u32)osb->slot_num, NULL,
763 ALLOC_NEW_GROUP);
764
765
766 if (status >= 0) {
767 status = 0;
768 if (slot != OCFS2_INVALID_SLOT)
769 ocfs2_init_meta_steal_slot(osb);
770 goto bail;
771 } else if (status < 0 && status != -ENOSPC) {
772 mlog_errno(status);
773 goto bail;
774 }
775
776 ocfs2_free_ac_resource(*ac);
777
778extent_steal:
779 status = ocfs2_steal_meta(osb, *ac);
780 atomic_inc(&osb->s_num_meta_stolen);
662 if (status < 0) { 781 if (status < 0) {
663 if (status != -ENOSPC) 782 if (status != -ENOSPC)
664 mlog_errno(status); 783 mlog_errno(status);
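The inode-stealing machinery is generalized here: __ocfs2_get_steal_slot()/__ocfs2_set_steal_slot() parameterize on the system-inode type, so one round-robin walk serves both the inode and extent (metadata) allocators, and ocfs2_reserve_new_metadata_blocks() gains the same try-local-then-steal structure that inode reservation already had. The walk is a wrap-around scan that skips the local slot and resumes at the slot where the last theft succeeded; a standalone sketch of the iteration order, with a hypothetical example_try_reserve():

    #include <linux/errno.h>

    static int example_try_reserve(int slot);   /* assumed: reserve from slot */

    /* Sketch: visit every slot except our own, starting at the slot we
     * last stole from (or just past ourselves). */
    static int example_steal_walk(int my_slot, int max_slots, int last_stolen)
    {
            int i, slot;

            slot = (last_stolen >= 0) ? last_stolen : my_slot + 1;
            for (i = 0; i < max_slots; i++, slot++) {
                    if (slot == max_slots)
                            slot = 0;               /* wrap around */
                    if (slot == my_slot)
                            continue;               /* never steal locally */
                    if (example_try_reserve(slot) >= 0)
                            return slot;            /* remember for next time */
            }
            return -ENOSPC;
    }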
@@ -685,43 +804,11 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
685 ac); 804 ac);
686} 805}
687 806
688static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
689 struct ocfs2_alloc_context *ac)
690{
691 int i, status = -ENOSPC;
692 s16 slot = ocfs2_get_inode_steal_slot(osb);
693
694 /* Start to steal inodes from the first slot after ours. */
695 if (slot == OCFS2_INVALID_SLOT)
696 slot = osb->slot_num + 1;
697
698 for (i = 0; i < osb->max_slots; i++, slot++) {
699 if (slot == osb->max_slots)
700 slot = 0;
701
702 if (slot == osb->slot_num)
703 continue;
704
705 status = ocfs2_reserve_suballoc_bits(osb, ac,
706 INODE_ALLOC_SYSTEM_INODE,
707 slot, NULL,
708 NOT_ALLOC_NEW_GROUP);
709 if (status >= 0) {
710 ocfs2_set_inode_steal_slot(osb, slot);
711 break;
712 }
713
714 ocfs2_free_ac_resource(ac);
715 }
716
717 return status;
718}
719
720int ocfs2_reserve_new_inode(struct ocfs2_super *osb, 807int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
721 struct ocfs2_alloc_context **ac) 808 struct ocfs2_alloc_context **ac)
722{ 809{
723 int status; 810 int status;
724 s16 slot = ocfs2_get_inode_steal_slot(osb); 811 int slot = ocfs2_get_inode_steal_slot(osb);
725 u64 alloc_group; 812 u64 alloc_group;
726 813
727 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 814 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
@@ -754,14 +841,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
754 * need to check our slots to see whether there is some space for us. 841 * need to check our slots to see whether there is some space for us.
755 */ 842 */
756 if (slot != OCFS2_INVALID_SLOT && 843 if (slot != OCFS2_INVALID_SLOT &&
757 atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL) 844 atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
758 goto inode_steal; 845 goto inode_steal;
759 846
760 atomic_set(&osb->s_num_inodes_stolen, 0); 847 atomic_set(&osb->s_num_inodes_stolen, 0);
761 alloc_group = osb->osb_inode_alloc_group; 848 alloc_group = osb->osb_inode_alloc_group;
762 status = ocfs2_reserve_suballoc_bits(osb, *ac, 849 status = ocfs2_reserve_suballoc_bits(osb, *ac,
763 INODE_ALLOC_SYSTEM_INODE, 850 INODE_ALLOC_SYSTEM_INODE,
764 osb->slot_num, 851 (u32)osb->slot_num,
765 &alloc_group, 852 &alloc_group,
766 ALLOC_NEW_GROUP | 853 ALLOC_NEW_GROUP |
767 ALLOC_GROUPS_FROM_GLOBAL); 854 ALLOC_GROUPS_FROM_GLOBAL);
@@ -789,7 +876,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
789 ocfs2_free_ac_resource(*ac); 876 ocfs2_free_ac_resource(*ac);
790 877
791inode_steal: 878inode_steal:
792 status = ocfs2_steal_inode_from_other_nodes(osb, *ac); 879 status = ocfs2_steal_inode(osb, *ac);
793 atomic_inc(&osb->s_num_inodes_stolen); 880 atomic_inc(&osb->s_num_inodes_stolen);
794 if (status < 0) { 881 if (status < 0) {
795 if (status != -ENOSPC) 882 if (status != -ENOSPC)
@@ -1884,18 +1971,18 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
1884 bits_wanted, cluster_start, num_clusters); 1971 bits_wanted, cluster_start, num_clusters);
1885} 1972}
1886 1973
1887static inline int ocfs2_block_group_clear_bits(handle_t *handle, 1974static int ocfs2_block_group_clear_bits(handle_t *handle,
1888 struct inode *alloc_inode, 1975 struct inode *alloc_inode,
1889 struct ocfs2_group_desc *bg, 1976 struct ocfs2_group_desc *bg,
1890 struct buffer_head *group_bh, 1977 struct buffer_head *group_bh,
1891 unsigned int bit_off, 1978 unsigned int bit_off,
1892 unsigned int num_bits) 1979 unsigned int num_bits,
1980 void (*undo_fn)(unsigned int bit,
1981 unsigned long *bmap))
1893{ 1982{
1894 int status; 1983 int status;
1895 unsigned int tmp; 1984 unsigned int tmp;
1896 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1897 struct ocfs2_group_desc *undo_bg = NULL; 1985 struct ocfs2_group_desc *undo_bg = NULL;
1898 int cluster_bitmap = 0;
1899 1986
1900 mlog_entry_void(); 1987 mlog_entry_void();
1901 1988
@@ -1905,20 +1992,18 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1905 1992
1906 mlog(0, "off = %u, num = %u\n", bit_off, num_bits); 1993 mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1907 1994
1908 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1995 BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
1909 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1910
1911 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1996 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1912 group_bh, journal_type); 1997 group_bh,
1998 undo_fn ?
1999 OCFS2_JOURNAL_ACCESS_UNDO :
2000 OCFS2_JOURNAL_ACCESS_WRITE);
1913 if (status < 0) { 2001 if (status < 0) {
1914 mlog_errno(status); 2002 mlog_errno(status);
1915 goto bail; 2003 goto bail;
1916 } 2004 }
1917 2005
1918 if (ocfs2_is_cluster_bitmap(alloc_inode)) 2006 if (undo_fn) {
1919 cluster_bitmap = 1;
1920
1921 if (cluster_bitmap) {
1922 jbd_lock_bh_state(group_bh); 2007 jbd_lock_bh_state(group_bh);
1923 undo_bg = (struct ocfs2_group_desc *) 2008 undo_bg = (struct ocfs2_group_desc *)
1924 bh2jh(group_bh)->b_committed_data; 2009 bh2jh(group_bh)->b_committed_data;
@@ -1929,13 +2014,13 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1929 while(tmp--) { 2014 while(tmp--) {
1930 ocfs2_clear_bit((bit_off + tmp), 2015 ocfs2_clear_bit((bit_off + tmp),
1931 (unsigned long *) bg->bg_bitmap); 2016 (unsigned long *) bg->bg_bitmap);
1932 if (cluster_bitmap) 2017 if (undo_fn)
1933 ocfs2_set_bit(bit_off + tmp, 2018 undo_fn(bit_off + tmp,
1934 (unsigned long *) undo_bg->bg_bitmap); 2019 (unsigned long *) undo_bg->bg_bitmap);
1935 } 2020 }
1936 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2021 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1937 2022
1938 if (cluster_bitmap) 2023 if (undo_fn)
1939 jbd_unlock_bh_state(group_bh); 2024 jbd_unlock_bh_state(group_bh);
1940 2025
1941 status = ocfs2_journal_dirty(handle, group_bh); 2026 status = ocfs2_journal_dirty(handle, group_bh);
@@ -1948,12 +2033,14 @@ bail:
1948/* 2033/*
1949 * expects the suballoc inode to already be locked. 2034 * expects the suballoc inode to already be locked.
1950 */ 2035 */
1951int ocfs2_free_suballoc_bits(handle_t *handle, 2036static int _ocfs2_free_suballoc_bits(handle_t *handle,
1952 struct inode *alloc_inode, 2037 struct inode *alloc_inode,
1953 struct buffer_head *alloc_bh, 2038 struct buffer_head *alloc_bh,
1954 unsigned int start_bit, 2039 unsigned int start_bit,
1955 u64 bg_blkno, 2040 u64 bg_blkno,
1956 unsigned int count) 2041 unsigned int count,
2042 void (*undo_fn)(unsigned int bit,
2043 unsigned long *bitmap))
1957{ 2044{
1958 int status = 0; 2045 int status = 0;
1959 u32 tmp_used; 2046 u32 tmp_used;
@@ -1988,7 +2075,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
1988 2075
1989 status = ocfs2_block_group_clear_bits(handle, alloc_inode, 2076 status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1990 group, group_bh, 2077 group, group_bh,
1991 start_bit, count); 2078 start_bit, count, undo_fn);
1992 if (status < 0) { 2079 if (status < 0) {
1993 mlog_errno(status); 2080 mlog_errno(status);
1994 goto bail; 2081 goto bail;
@@ -2019,6 +2106,17 @@ bail:
2019 return status; 2106 return status;
2020} 2107}
2021 2108
2109int ocfs2_free_suballoc_bits(handle_t *handle,
2110 struct inode *alloc_inode,
2111 struct buffer_head *alloc_bh,
2112 unsigned int start_bit,
2113 u64 bg_blkno,
2114 unsigned int count)
2115{
2116 return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2117 start_bit, bg_blkno, count, NULL);
2118}
2119
2022int ocfs2_free_dinode(handle_t *handle, 2120int ocfs2_free_dinode(handle_t *handle,
2023 struct inode *inode_alloc_inode, 2121 struct inode *inode_alloc_inode,
2024 struct buffer_head *inode_alloc_bh, 2122 struct buffer_head *inode_alloc_bh,
@@ -2032,11 +2130,13 @@ int ocfs2_free_dinode(handle_t *handle,
2032 inode_alloc_bh, bit, bg_blkno, 1); 2130 inode_alloc_bh, bit, bg_blkno, 1);
2033} 2131}
2034 2132
2035int ocfs2_free_clusters(handle_t *handle, 2133static int _ocfs2_free_clusters(handle_t *handle,
2036 struct inode *bitmap_inode, 2134 struct inode *bitmap_inode,
2037 struct buffer_head *bitmap_bh, 2135 struct buffer_head *bitmap_bh,
2038 u64 start_blk, 2136 u64 start_blk,
2039 unsigned int num_clusters) 2137 unsigned int num_clusters,
2138 void (*undo_fn)(unsigned int bit,
2139 unsigned long *bitmap))
2040{ 2140{
2041 int status; 2141 int status;
2042 u16 bg_start_bit; 2142 u16 bg_start_bit;
@@ -2063,9 +2163,9 @@ int ocfs2_free_clusters(handle_t *handle,
2063 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n", 2163 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2064 (unsigned long long)bg_blkno, bg_start_bit); 2164 (unsigned long long)bg_blkno, bg_start_bit);
2065 2165
2066 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, 2166 status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2067 bg_start_bit, bg_blkno, 2167 bg_start_bit, bg_blkno,
2068 num_clusters); 2168 num_clusters, undo_fn);
2069 if (status < 0) { 2169 if (status < 0) {
2070 mlog_errno(status); 2170 mlog_errno(status);
2071 goto out; 2171 goto out;
@@ -2079,6 +2179,32 @@ out:
2079 return status; 2179 return status;
2080} 2180}
2081 2181
2182int ocfs2_free_clusters(handle_t *handle,
2183 struct inode *bitmap_inode,
2184 struct buffer_head *bitmap_bh,
2185 u64 start_blk,
2186 unsigned int num_clusters)
2187{
2188 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2189 start_blk, num_clusters,
2190 _ocfs2_set_bit);
2191}
2192
2193/*
2194 * Give never-used clusters back to the global bitmap. We don't need
2195 * to protect these bits in the undo buffer.
2196 */
2197int ocfs2_release_clusters(handle_t *handle,
2198 struct inode *bitmap_inode,
2199 struct buffer_head *bitmap_bh,
2200 u64 start_blk,
2201 unsigned int num_clusters)
2202{
2203 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2204 start_blk, num_clusters,
2205 _ocfs2_clear_bit);
2206}
2207
2082static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) 2208static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2083{ 2209{
2084 printk("Block Group:\n"); 2210 printk("Block Group:\n");
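The suballoc.c refactoring above threads an optional undo_fn through _ocfs2_free_suballoc_bits() down to ocfs2_block_group_clear_bits(), replacing the old cluster_bitmap flag. A minimal caller-side sketch of the resulting distinction; the function name and the ever_used flag are hypothetical, only the two exported entry points come from this patch:

/*
 * Freeing clusters that were ever visible on disk: ocfs2_free_clusters()
 * supplies _ocfs2_set_bit as undo_fn, so the bits stay busy in the
 * journal's undo copy until the transaction commits.
 *
 * Releasing clusters allocated in this transaction and never used:
 * ocfs2_release_clusters() supplies _ocfs2_clear_bit, so the undo copy
 * hands them back immediately as well.
 */
static int example_undo_paths(handle_t *handle, struct inode *gb_inode,
			      struct buffer_head *gb_bh, u64 start_blk,
			      unsigned int num_clusters, int ever_used)
{
	if (ever_used)
		return ocfs2_free_clusters(handle, gb_inode, gb_bh,
					   start_blk, num_clusters);

	return ocfs2_release_clusters(handle, gb_inode, gb_bh,
				      start_blk, num_clusters);
}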
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 8c9a78a43164..e0f46df357e6 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -56,6 +56,7 @@ struct ocfs2_alloc_context {
56 is the same as ~0 - unlimited */ 56 is the same as ~0 - unlimited */
57}; 57};
58 58
59void ocfs2_init_steal_slots(struct ocfs2_super *osb);
59void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); 60void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
60static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac) 61static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
61{ 62{
@@ -126,6 +127,11 @@ int ocfs2_free_clusters(handle_t *handle,
126 struct buffer_head *bitmap_bh, 127 struct buffer_head *bitmap_bh,
127 u64 start_blk, 128 u64 start_blk,
128 unsigned int num_clusters); 129 unsigned int num_clusters);
130int ocfs2_release_clusters(handle_t *handle,
131 struct inode *bitmap_inode,
132 struct buffer_head *bitmap_bh,
133 u64 start_blk,
134 unsigned int num_clusters);
129 135
130static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) 136static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
131{ 137{
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 755cd49a5ef3..dee03197a494 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -69,6 +69,7 @@
69#include "xattr.h" 69#include "xattr.h"
70#include "quota.h" 70#include "quota.h"
71#include "refcounttree.h" 71#include "refcounttree.h"
72#include "suballoc.h"
72 73
73#include "buffer_head_io.h" 74#include "buffer_head_io.h"
74 75
@@ -301,9 +302,12 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
301 302
302 spin_lock(&osb->osb_lock); 303 spin_lock(&osb->osb_lock);
303 out += snprintf(buf + out, len - out, 304 out += snprintf(buf + out, len - out,
304 "%10s => Slot: %d NumStolen: %d\n", "Steal", 305 "%10s => InodeSlot: %d StolenInodes: %d, "
306 "MetaSlot: %d StolenMeta: %d\n", "Steal",
305 osb->s_inode_steal_slot, 307 osb->s_inode_steal_slot,
306 atomic_read(&osb->s_num_inodes_stolen)); 308 atomic_read(&osb->s_num_inodes_stolen),
309 osb->s_meta_steal_slot,
310 atomic_read(&osb->s_num_meta_stolen));
307 spin_unlock(&osb->osb_lock); 311 spin_unlock(&osb->osb_lock);
308 312
309 out += snprintf(buf + out, len - out, "OrphanScan => "); 313 out += snprintf(buf + out, len - out, "OrphanScan => ");
@@ -1997,7 +2001,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1997 osb->blocked_lock_count = 0; 2001 osb->blocked_lock_count = 0;
1998 spin_lock_init(&osb->osb_lock); 2002 spin_lock_init(&osb->osb_lock);
1999 spin_lock_init(&osb->osb_xattr_lock); 2003 spin_lock_init(&osb->osb_xattr_lock);
2000 ocfs2_init_inode_steal_slot(osb); 2004 ocfs2_init_steal_slots(osb);
2001 2005
2002 atomic_set(&osb->alloc_stats.moves, 0); 2006 atomic_set(&osb->alloc_stats.moves, 0);
2003 atomic_set(&osb->alloc_stats.local_data, 0); 2007 atomic_set(&osb->alloc_stats.local_data, 0);
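ocfs2_initialize_super() now calls the combined ocfs2_init_steal_slots() declared in the suballoc.h hunk above. Its body is not part of this diff's context; presumably it just resets both steal slots, along these lines (locking, if any, elided):

/* Sketch; the actual body lives in suballoc.c, outside the shown hunks. */
void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
	osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
	osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
}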
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 8fc6fb071c6d..3e7773089b96 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -116,10 +116,11 @@ static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
116}; 116};
117 117
118struct ocfs2_xattr_info { 118struct ocfs2_xattr_info {
119 int name_index; 119 int xi_name_index;
120 const char *name; 120 const char *xi_name;
121 const void *value; 121 int xi_name_len;
122 size_t value_len; 122 const void *xi_value;
123 size_t xi_value_len;
123}; 124};
124 125
125struct ocfs2_xattr_search { 126struct ocfs2_xattr_search {
@@ -137,6 +138,115 @@ struct ocfs2_xattr_search {
137 int not_found; 138 int not_found;
138}; 139};
139 140
141/* Operations on struct ocfs2_xa_entry */
142struct ocfs2_xa_loc;
143struct ocfs2_xa_loc_operations {
144 /*
145 * Journal functions
146 */
147 int (*xlo_journal_access)(handle_t *handle, struct ocfs2_xa_loc *loc,
148 int type);
149 void (*xlo_journal_dirty)(handle_t *handle, struct ocfs2_xa_loc *loc);
150
151 /*
152 * Return a pointer to the appropriate buffer in loc->xl_storage
153 * at the given offset from loc->xl_header.
154 */
155 void *(*xlo_offset_pointer)(struct ocfs2_xa_loc *loc, int offset);
156
157 /* Can we reuse the existing entry for the new value? */
158 int (*xlo_can_reuse)(struct ocfs2_xa_loc *loc,
159 struct ocfs2_xattr_info *xi);
160
161 /* How much space is needed for the new value? */
162 int (*xlo_check_space)(struct ocfs2_xa_loc *loc,
163 struct ocfs2_xattr_info *xi);
164
165 /*
166 * Return the offset of the first name+value pair. This is
167 * the start of our downward-filling free space.
168 */
169 int (*xlo_get_free_start)(struct ocfs2_xa_loc *loc);
170
171 /*
172 * Remove the name+value at this location. Do whatever is
173 * appropriate with the remaining name+value pairs.
174 */
175 void (*xlo_wipe_namevalue)(struct ocfs2_xa_loc *loc);
176
177 /* Fill xl_entry with a new entry */
178 void (*xlo_add_entry)(struct ocfs2_xa_loc *loc, u32 name_hash);
179
180 /* Add name+value storage to an entry */
181 void (*xlo_add_namevalue)(struct ocfs2_xa_loc *loc, int size);
182
183 /*
184 * Initialize the value buf's access and bh fields for this entry.
185 * ocfs2_xa_fill_value_buf() will handle the xv pointer.
186 */
187 void (*xlo_fill_value_buf)(struct ocfs2_xa_loc *loc,
188 struct ocfs2_xattr_value_buf *vb);
189};
190
191/*
192 * Describes an xattr entry location. This is a memory structure
193 * tracking the on-disk structure.
194 */
195struct ocfs2_xa_loc {
196 /* This xattr belongs to this inode */
197 struct inode *xl_inode;
198
199 /* The ocfs2_xattr_header inside the on-disk storage. Not NULL. */
200 struct ocfs2_xattr_header *xl_header;
201
202 /* Bytes from xl_header to the end of the storage */
203 int xl_size;
204
205 /*
206 * The ocfs2_xattr_entry this location describes. If this is
207 * NULL, this location describes the on-disk structure where it
208 * would have been.
209 */
210 struct ocfs2_xattr_entry *xl_entry;
211
212 /*
213 * Internal housekeeping
214 */
215
216 /* Buffer(s) containing this entry */
217 void *xl_storage;
218
219 /* Operations on the storage backing this location */
220 const struct ocfs2_xa_loc_operations *xl_ops;
221};
222
223/*
224 * Convenience functions to calculate how much space is needed for a
225 * given name+value pair
226 */
227static int namevalue_size(int name_len, uint64_t value_len)
228{
229 if (value_len > OCFS2_XATTR_INLINE_SIZE)
230 return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
231 else
232 return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
233}
234
235static int namevalue_size_xi(struct ocfs2_xattr_info *xi)
236{
237 return namevalue_size(xi->xi_name_len, xi->xi_value_len);
238}
239
240static int namevalue_size_xe(struct ocfs2_xattr_entry *xe)
241{
242 u64 value_len = le64_to_cpu(xe->xe_value_size);
243
244 BUG_ON((value_len > OCFS2_XATTR_INLINE_SIZE) &&
245 ocfs2_xattr_is_local(xe));
246 return namevalue_size(xe->xe_name_len, value_len);
247}
248
249
140static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, 250static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
141 struct ocfs2_xattr_header *xh, 251 struct ocfs2_xattr_header *xh,
142 int index, 252 int index,
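The new namevalue_size() helpers above centralize the name+value sizing rule: values up to OCFS2_XATTR_INLINE_SIZE live inline after the padded name, while larger values store only a tree root there. A worked example of the arithmetic, assuming OCFS2_XATTR_SIZE() rounds up to a 4-byte boundary and OCFS2_XATTR_INLINE_SIZE is 80, as elsewhere in ocfs2:

/*
 * name "user.foo" -> name_len = 8, OCFS2_XATTR_SIZE(8) = 8
 *
 * 10-byte value (<= OCFS2_XATTR_INLINE_SIZE):
 *	namevalue_size(8, 10) = 8 + OCFS2_XATTR_SIZE(10)
 *			      = 8 + 12 = 20 bytes, value stored inline
 *
 * 4096-byte value (> OCFS2_XATTR_INLINE_SIZE):
 *	namevalue_size(8, 4096) = 8 + OCFS2_XATTR_ROOT_SIZE
 *	(only an ocfs2_xattr_value_root is stored; the data itself
 *	 goes to external clusters)
 */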
@@ -212,14 +322,6 @@ static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
212 return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits); 322 return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
213} 323}
214 324
215static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
216{
217 u16 len = sb->s_blocksize -
218 offsetof(struct ocfs2_xattr_header, xh_entries);
219
220 return len / sizeof(struct ocfs2_xattr_entry);
221}
222
223#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr) 325#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
224#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data) 326#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
225#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0)) 327#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
@@ -463,35 +565,22 @@ static u32 ocfs2_xattr_name_hash(struct inode *inode,
463 return hash; 565 return hash;
464} 566}
465 567
466/* 568static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
467 * ocfs2_xattr_hash_entry()
468 *
469 * Compute the hash of an extended attribute.
470 */
471static void ocfs2_xattr_hash_entry(struct inode *inode,
472 struct ocfs2_xattr_header *header,
473 struct ocfs2_xattr_entry *entry)
474{ 569{
475 u32 hash = 0; 570 return namevalue_size(name_len, value_len) +
476 char *name = (char *)header + le16_to_cpu(entry->xe_name_offset); 571 sizeof(struct ocfs2_xattr_entry);
477
478 hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len);
479 entry->xe_name_hash = cpu_to_le32(hash);
480
481 return;
482} 572}
483 573
484static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len) 574static int ocfs2_xi_entry_usage(struct ocfs2_xattr_info *xi)
485{ 575{
486 int size = 0; 576 return namevalue_size_xi(xi) +
487 577 sizeof(struct ocfs2_xattr_entry);
488 if (value_len <= OCFS2_XATTR_INLINE_SIZE) 578}
489 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
490 else
491 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
492 size += sizeof(struct ocfs2_xattr_entry);
493 579
494 return size; 580static int ocfs2_xe_entry_usage(struct ocfs2_xattr_entry *xe)
581{
582 return namevalue_size_xe(xe) +
583 sizeof(struct ocfs2_xattr_entry);
495} 584}
496 585
497int ocfs2_calc_security_init(struct inode *dir, 586int ocfs2_calc_security_init(struct inode *dir,
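ocfs2_xi_entry_usage() and ocfs2_xe_entry_usage() add the entry descriptor itself to the name+value footprint. The space checks introduced later in this patch rely on a simple layout invariant, sketched here in comment form from the code of ocfs2_xa_check_space_helper() below:

/*
 * Layout of any xattr storage (inode inline area, block, or bucket):
 *
 *	[ ocfs2_xattr_header | entries --> ... free ... <-- name+values ]
 *
 * Entries grow forward from the header; name+value pairs fill
 * backward from the end of the storage.  With free_start the lowest
 * name+value offset, the usable gap is:
 *
 *	free_start - sizeof(struct ocfs2_xattr_header)
 *		   - count * sizeof(struct ocfs2_xattr_entry)
 *		   - OCFS2_XATTR_HEADER_GAP
 *
 * which is exactly what ocfs2_xa_check_space_helper() computes.
 */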
@@ -1308,452 +1397,897 @@ out:
1308 return ret; 1397 return ret;
1309} 1398}
1310 1399
1311static int ocfs2_xattr_cleanup(struct inode *inode, 1400static int ocfs2_xa_check_space_helper(int needed_space, int free_start,
1312 handle_t *handle, 1401 int num_entries)
1313 struct ocfs2_xattr_info *xi,
1314 struct ocfs2_xattr_search *xs,
1315 struct ocfs2_xattr_value_buf *vb,
1316 size_t offs)
1317{ 1402{
1318 int ret = 0; 1403 int free_space;
1319 size_t name_len = strlen(xi->name);
1320 void *val = xs->base + offs;
1321 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1322 1404
1323 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, 1405 if (!needed_space)
1324 OCFS2_JOURNAL_ACCESS_WRITE); 1406 return 0;
1325 if (ret) {
1326 mlog_errno(ret);
1327 goto out;
1328 }
1329 /* Decrease xattr count */
1330 le16_add_cpu(&xs->header->xh_count, -1);
1331 /* Remove the xattr entry and tree root which has already be set*/
1332 memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
1333 memset(val, 0, size);
1334 1407
1335 ret = ocfs2_journal_dirty(handle, vb->vb_bh); 1408 free_space = free_start -
1336 if (ret < 0) 1409 sizeof(struct ocfs2_xattr_header) -
1337 mlog_errno(ret); 1410 (num_entries * sizeof(struct ocfs2_xattr_entry)) -
1338out: 1411 OCFS2_XATTR_HEADER_GAP;
1339 return ret; 1412 if (free_space < 0)
1413 return -EIO;
1414 if (free_space < needed_space)
1415 return -ENOSPC;
1416
1417 return 0;
1340} 1418}
1341 1419
1342static int ocfs2_xattr_update_entry(struct inode *inode, 1420static int ocfs2_xa_journal_access(handle_t *handle, struct ocfs2_xa_loc *loc,
1343 handle_t *handle, 1421 int type)
1344 struct ocfs2_xattr_info *xi,
1345 struct ocfs2_xattr_search *xs,
1346 struct ocfs2_xattr_value_buf *vb,
1347 size_t offs)
1348{ 1422{
1349 int ret; 1423 return loc->xl_ops->xlo_journal_access(handle, loc, type);
1424}
1350 1425
1351 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, 1426static void ocfs2_xa_journal_dirty(handle_t *handle, struct ocfs2_xa_loc *loc)
1352 OCFS2_JOURNAL_ACCESS_WRITE); 1427{
1353 if (ret) { 1428 loc->xl_ops->xlo_journal_dirty(handle, loc);
1354 mlog_errno(ret); 1429}
1355 goto out;
1356 }
1357 1430
1358 xs->here->xe_name_offset = cpu_to_le16(offs); 1431/* Give a pointer into the storage for the given offset */
1359 xs->here->xe_value_size = cpu_to_le64(xi->value_len); 1432static void *ocfs2_xa_offset_pointer(struct ocfs2_xa_loc *loc, int offset)
1360 if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE) 1433{
1361 ocfs2_xattr_set_local(xs->here, 1); 1434 BUG_ON(offset >= loc->xl_size);
1362 else 1435 return loc->xl_ops->xlo_offset_pointer(loc, offset);
1363 ocfs2_xattr_set_local(xs->here, 0); 1436}
1364 ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
1365 1437
1366 ret = ocfs2_journal_dirty(handle, vb->vb_bh); 1438/*
1367 if (ret < 0) 1439 * Wipe the name+value pair and allow the storage to reclaim it. This
1368 mlog_errno(ret); 1440 * must be followed by either removal of the entry or a call to
1369out: 1441 * ocfs2_xa_add_namevalue().
1370 return ret; 1442 */
1443static void ocfs2_xa_wipe_namevalue(struct ocfs2_xa_loc *loc)
1444{
1445 loc->xl_ops->xlo_wipe_namevalue(loc);
1371} 1446}
1372 1447
1373/* 1448/*
1374 * ocfs2_xattr_set_value_outside() 1449 * Find lowest offset to a name+value pair. This is the start of our
1375 * 1450 * downward-growing free space.
1376 * Set large size value in B tree.
1377 */ 1451 */
1378static int ocfs2_xattr_set_value_outside(struct inode *inode, 1452static int ocfs2_xa_get_free_start(struct ocfs2_xa_loc *loc)
1379 struct ocfs2_xattr_info *xi,
1380 struct ocfs2_xattr_search *xs,
1381 struct ocfs2_xattr_set_ctxt *ctxt,
1382 struct ocfs2_xattr_value_buf *vb,
1383 size_t offs)
1384{ 1453{
1385 size_t name_len = strlen(xi->name); 1454 return loc->xl_ops->xlo_get_free_start(loc);
1386 void *val = xs->base + offs; 1455}
1387 struct ocfs2_xattr_value_root *xv = NULL;
1388 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1389 int ret = 0;
1390 1456
1391 memset(val, 0, size); 1457/* Can we reuse loc->xl_entry for xi? */
1392 memcpy(val, xi->name, name_len); 1458static int ocfs2_xa_can_reuse_entry(struct ocfs2_xa_loc *loc,
1393 xv = (struct ocfs2_xattr_value_root *) 1459 struct ocfs2_xattr_info *xi)
1394 (val + OCFS2_XATTR_SIZE(name_len)); 1460{
1395 xv->xr_clusters = 0; 1461 return loc->xl_ops->xlo_can_reuse(loc, xi);
1396 xv->xr_last_eb_blk = 0; 1462}
1397 xv->xr_list.l_tree_depth = 0; 1463
1398 xv->xr_list.l_count = cpu_to_le16(1); 1464/* How much free space is needed to set the new value */
1399 xv->xr_list.l_next_free_rec = 0; 1465static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc,
1400 vb->vb_xv = xv; 1466 struct ocfs2_xattr_info *xi)
1401 1467{
1402 ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt); 1468 return loc->xl_ops->xlo_check_space(loc, xi);
1403 if (ret < 0) { 1469}
1404 mlog_errno(ret); 1470
1405 return ret; 1471static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1472{
1473 loc->xl_ops->xlo_add_entry(loc, name_hash);
1474 loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash);
1475 /*
1476 * We can't leave the new entry's xe_name_offset at zero or
1477 * add_namevalue() will go nuts. We set it to the size of our
1478 * storage so that it can never be less than any other entry.
1479 */
1480 loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size);
1481}
1482
1483static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc,
1484 struct ocfs2_xattr_info *xi)
1485{
1486 int size = namevalue_size_xi(xi);
1487 int nameval_offset;
1488 char *nameval_buf;
1489
1490 loc->xl_ops->xlo_add_namevalue(loc, size);
1491 loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
1492 loc->xl_entry->xe_name_len = xi->xi_name_len;
1493 ocfs2_xattr_set_type(loc->xl_entry, xi->xi_name_index);
1494 ocfs2_xattr_set_local(loc->xl_entry,
1495 xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE);
1496
1497 nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1498 nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
1499 memset(nameval_buf, 0, size);
1500 memcpy(nameval_buf, xi->xi_name, xi->xi_name_len);
1501}
1502
1503static void ocfs2_xa_fill_value_buf(struct ocfs2_xa_loc *loc,
1504 struct ocfs2_xattr_value_buf *vb)
1505{
1506 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1507 int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
1508
1509 /* Value bufs are for value trees */
1510 BUG_ON(ocfs2_xattr_is_local(loc->xl_entry));
1511 BUG_ON(namevalue_size_xe(loc->xl_entry) !=
1512 (name_size + OCFS2_XATTR_ROOT_SIZE));
1513
1514 loc->xl_ops->xlo_fill_value_buf(loc, vb);
1515 vb->vb_xv =
1516 (struct ocfs2_xattr_value_root *)ocfs2_xa_offset_pointer(loc,
1517 nameval_offset +
1518 name_size);
1519}
1520
1521static int ocfs2_xa_block_journal_access(handle_t *handle,
1522 struct ocfs2_xa_loc *loc, int type)
1523{
1524 struct buffer_head *bh = loc->xl_storage;
1525 ocfs2_journal_access_func access;
1526
1527 if (loc->xl_size == (bh->b_size -
1528 offsetof(struct ocfs2_xattr_block,
1529 xb_attrs.xb_header)))
1530 access = ocfs2_journal_access_xb;
1531 else
1532 access = ocfs2_journal_access_di;
1533 return access(handle, INODE_CACHE(loc->xl_inode), bh, type);
1534}
1535
1536static void ocfs2_xa_block_journal_dirty(handle_t *handle,
1537 struct ocfs2_xa_loc *loc)
1538{
1539 struct buffer_head *bh = loc->xl_storage;
1540
1541 ocfs2_journal_dirty(handle, bh);
1542}
1543
1544static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc,
1545 int offset)
1546{
1547 return (char *)loc->xl_header + offset;
1548}
1549
1550static int ocfs2_xa_block_can_reuse(struct ocfs2_xa_loc *loc,
1551 struct ocfs2_xattr_info *xi)
1552{
1553 /*
1554 * Block storage is strict. If the sizes aren't exact, we will
1555 * remove the old one and reinsert the new.
1556 */
1557 return namevalue_size_xe(loc->xl_entry) ==
1558 namevalue_size_xi(xi);
1559}
1560
1561static int ocfs2_xa_block_get_free_start(struct ocfs2_xa_loc *loc)
1562{
1563 struct ocfs2_xattr_header *xh = loc->xl_header;
1564 int i, count = le16_to_cpu(xh->xh_count);
1565 int offset, free_start = loc->xl_size;
1566
1567 for (i = 0; i < count; i++) {
1568 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
1569 if (offset < free_start)
1570 free_start = offset;
1406 } 1571 }
1407 ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs); 1572
1408 if (ret < 0) { 1573 return free_start;
1409 mlog_errno(ret); 1574}
1410 return ret; 1575
1576static int ocfs2_xa_block_check_space(struct ocfs2_xa_loc *loc,
1577 struct ocfs2_xattr_info *xi)
1578{
1579 int count = le16_to_cpu(loc->xl_header->xh_count);
1580 int free_start = ocfs2_xa_get_free_start(loc);
1581 int needed_space = ocfs2_xi_entry_usage(xi);
1582
1583 /*
1584 * Block storage will reclaim the original entry before inserting
1585 * the new value, so we only need the difference. If the new
1586 * entry is smaller than the old one, we don't need anything.
1587 */
1588 if (loc->xl_entry) {
1589 /* Don't need space if we're reusing! */
1590 if (ocfs2_xa_can_reuse_entry(loc, xi))
1591 needed_space = 0;
1592 else
1593 needed_space -= ocfs2_xe_entry_usage(loc->xl_entry);
1411 } 1594 }
1412 ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb, 1595 if (needed_space < 0)
1413 xi->value, xi->value_len); 1596 needed_space = 0;
1414 if (ret < 0) 1597 return ocfs2_xa_check_space_helper(needed_space, free_start, count);
1415 mlog_errno(ret); 1598}
1416 1599
1417 return ret; 1600/*
1601 * Block storage for xattrs keeps the name+value pairs compacted. When
1602 * we remove one, we have to shift any that preceded it towards the end.
1603 */
1604static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
1605{
1606 int i, offset;
1607 int namevalue_offset, first_namevalue_offset, namevalue_size;
1608 struct ocfs2_xattr_entry *entry = loc->xl_entry;
1609 struct ocfs2_xattr_header *xh = loc->xl_header;
1610 int count = le16_to_cpu(xh->xh_count);
1611
1612 namevalue_offset = le16_to_cpu(entry->xe_name_offset);
1613 namevalue_size = namevalue_size_xe(entry);
1614 first_namevalue_offset = ocfs2_xa_get_free_start(loc);
1615
1616 /* Shift the name+value pairs */
1617 memmove((char *)xh + first_namevalue_offset + namevalue_size,
1618 (char *)xh + first_namevalue_offset,
1619 namevalue_offset - first_namevalue_offset);
1620 memset((char *)xh + first_namevalue_offset, 0, namevalue_size);
1621
1622 /* Now tell xh->xh_entries about it */
1623 for (i = 0; i < count; i++) {
1624 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
1625 if (offset <= namevalue_offset)
1626 le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
1627 namevalue_size);
1628 }
1629
1630 /*
1631 * Note that we don't update xh_free_start or xh_name_value_len
1632 * because they're not used in block-stored xattrs.
1633 */
1634}
1635
1636static void ocfs2_xa_block_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1637{
1638 int count = le16_to_cpu(loc->xl_header->xh_count);
1639 loc->xl_entry = &(loc->xl_header->xh_entries[count]);
1640 le16_add_cpu(&loc->xl_header->xh_count, 1);
1641 memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
1642}
1643
1644static void ocfs2_xa_block_add_namevalue(struct ocfs2_xa_loc *loc, int size)
1645{
1646 int free_start = ocfs2_xa_get_free_start(loc);
1647
1648 loc->xl_entry->xe_name_offset = cpu_to_le16(free_start - size);
1649}
1650
1651static void ocfs2_xa_block_fill_value_buf(struct ocfs2_xa_loc *loc,
1652 struct ocfs2_xattr_value_buf *vb)
1653{
1654 struct buffer_head *bh = loc->xl_storage;
1655
1656 if (loc->xl_size == (bh->b_size -
1657 offsetof(struct ocfs2_xattr_block,
1658 xb_attrs.xb_header)))
1659 vb->vb_access = ocfs2_journal_access_xb;
1660 else
1661 vb->vb_access = ocfs2_journal_access_di;
1662 vb->vb_bh = bh;
1418} 1663}
1419 1664
1420/* 1665/*
1421 * ocfs2_xattr_set_entry_local() 1666 * Operations for xattrs stored in blocks. This includes inline inode
1422 * 1667 * storage and unindexed ocfs2_xattr_blocks.
1423 * Set, replace or remove extended attribute in local.
1424 */ 1668 */
1425static void ocfs2_xattr_set_entry_local(struct inode *inode, 1669static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = {
1426 struct ocfs2_xattr_info *xi, 1670 .xlo_journal_access = ocfs2_xa_block_journal_access,
1427 struct ocfs2_xattr_search *xs, 1671 .xlo_journal_dirty = ocfs2_xa_block_journal_dirty,
1428 struct ocfs2_xattr_entry *last, 1672 .xlo_offset_pointer = ocfs2_xa_block_offset_pointer,
1429 size_t min_offs) 1673 .xlo_check_space = ocfs2_xa_block_check_space,
1674 .xlo_can_reuse = ocfs2_xa_block_can_reuse,
1675 .xlo_get_free_start = ocfs2_xa_block_get_free_start,
1676 .xlo_wipe_namevalue = ocfs2_xa_block_wipe_namevalue,
1677 .xlo_add_entry = ocfs2_xa_block_add_entry,
1678 .xlo_add_namevalue = ocfs2_xa_block_add_namevalue,
1679 .xlo_fill_value_buf = ocfs2_xa_block_fill_value_buf,
1680};
1681
1682static int ocfs2_xa_bucket_journal_access(handle_t *handle,
1683 struct ocfs2_xa_loc *loc, int type)
1430{ 1684{
1431 size_t name_len = strlen(xi->name); 1685 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1432 int i;
1433 1686
1434 if (xi->value && xs->not_found) { 1687 return ocfs2_xattr_bucket_journal_access(handle, bucket, type);
1435 /* Insert the new xattr entry. */ 1688}
1436 le16_add_cpu(&xs->header->xh_count, 1); 1689
1437 ocfs2_xattr_set_type(last, xi->name_index); 1690static void ocfs2_xa_bucket_journal_dirty(handle_t *handle,
1438 ocfs2_xattr_set_local(last, 1); 1691 struct ocfs2_xa_loc *loc)
1439 last->xe_name_len = name_len; 1692{
1440 } else { 1693 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1441 void *first_val; 1694
1442 void *val; 1695 ocfs2_xattr_bucket_journal_dirty(handle, bucket);
1443 size_t offs, size; 1696}
1444 1697
1445 first_val = xs->base + min_offs; 1698static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc,
1446 offs = le16_to_cpu(xs->here->xe_name_offset); 1699 int offset)
1447 val = xs->base + offs; 1700{
1448 1701 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1449 if (le64_to_cpu(xs->here->xe_value_size) > 1702 int block, block_offset;
1450 OCFS2_XATTR_INLINE_SIZE) 1703
1451 size = OCFS2_XATTR_SIZE(name_len) + 1704 /* The header is at the front of the bucket */
1452 OCFS2_XATTR_ROOT_SIZE; 1705 block = offset >> loc->xl_inode->i_sb->s_blocksize_bits;
1706 block_offset = offset % loc->xl_inode->i_sb->s_blocksize;
1707
1708 return bucket_block(bucket, block) + block_offset;
1709}
1710
1711static int ocfs2_xa_bucket_can_reuse(struct ocfs2_xa_loc *loc,
1712 struct ocfs2_xattr_info *xi)
1713{
1714 return namevalue_size_xe(loc->xl_entry) >=
1715 namevalue_size_xi(xi);
1716}
1717
1718static int ocfs2_xa_bucket_get_free_start(struct ocfs2_xa_loc *loc)
1719{
1720 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1721 return le16_to_cpu(bucket_xh(bucket)->xh_free_start);
1722}
1723
1724static int ocfs2_bucket_align_free_start(struct super_block *sb,
1725 int free_start, int size)
1726{
1727 /*
1728 * We need to make sure that the name+value pair fits within
1729 * one block.
1730 */
1731 if (((free_start - size) >> sb->s_blocksize_bits) !=
1732 ((free_start - 1) >> sb->s_blocksize_bits))
1733 free_start -= free_start % sb->s_blocksize;
1734
1735 return free_start;
1736}
1737
1738static int ocfs2_xa_bucket_check_space(struct ocfs2_xa_loc *loc,
1739 struct ocfs2_xattr_info *xi)
1740{
1741 int rc;
1742 int count = le16_to_cpu(loc->xl_header->xh_count);
1743 int free_start = ocfs2_xa_get_free_start(loc);
1744 int needed_space = ocfs2_xi_entry_usage(xi);
1745 int size = namevalue_size_xi(xi);
1746 struct super_block *sb = loc->xl_inode->i_sb;
1747
1748 /*
1749 * Bucket storage does not reclaim name+value pairs it cannot
1750 * reuse. They live as holes until the bucket fills, and then
1751 * the bucket is defragmented. However, the bucket can reclaim
1752 * the ocfs2_xattr_entry.
1753 */
1754 if (loc->xl_entry) {
1755 /* Don't need space if we're reusing! */
1756 if (ocfs2_xa_can_reuse_entry(loc, xi))
1757 needed_space = 0;
1453 else 1758 else
1454 size = OCFS2_XATTR_SIZE(name_len) + 1759 needed_space -= sizeof(struct ocfs2_xattr_entry);
1455 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); 1760 }
1456 1761 BUG_ON(needed_space < 0);
1457 if (xi->value && size == OCFS2_XATTR_SIZE(name_len) +
1458 OCFS2_XATTR_SIZE(xi->value_len)) {
1459 /* The old and the new value have the
1460 same size. Just replace the value. */
1461 ocfs2_xattr_set_local(xs->here, 1);
1462 xs->here->xe_value_size = cpu_to_le64(xi->value_len);
1463 /* Clear value bytes. */
1464 memset(val + OCFS2_XATTR_SIZE(name_len),
1465 0,
1466 OCFS2_XATTR_SIZE(xi->value_len));
1467 memcpy(val + OCFS2_XATTR_SIZE(name_len),
1468 xi->value,
1469 xi->value_len);
1470 return;
1471 }
1472 /* Remove the old name+value. */
1473 memmove(first_val + size, first_val, val - first_val);
1474 memset(first_val, 0, size);
1475 xs->here->xe_name_hash = 0;
1476 xs->here->xe_name_offset = 0;
1477 ocfs2_xattr_set_local(xs->here, 1);
1478 xs->here->xe_value_size = 0;
1479
1480 min_offs += size;
1481
1482 /* Adjust all value offsets. */
1483 last = xs->header->xh_entries;
1484 for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
1485 size_t o = le16_to_cpu(last->xe_name_offset);
1486
1487 if (o < offs)
1488 last->xe_name_offset = cpu_to_le16(o + size);
1489 last += 1;
1490 }
1491 1762
1492 if (!xi->value) { 1763 if (free_start < size) {
1493 /* Remove the old entry. */ 1764 if (needed_space)
1494 last -= 1; 1765 return -ENOSPC;
1495 memmove(xs->here, xs->here + 1, 1766 } else {
1496 (void *)last - (void *)xs->here); 1767 /*
1497 memset(last, 0, sizeof(struct ocfs2_xattr_entry)); 1768 * First we check if it would fit in the first place.
1498 le16_add_cpu(&xs->header->xh_count, -1); 1769 * Below, we align the free start to a block. This may
1499 } 1770 * slide us below the minimum gap. By checking unaligned
1771 * first, we avoid that error.
1772 */
1773 rc = ocfs2_xa_check_space_helper(needed_space, free_start,
1774 count);
1775 if (rc)
1776 return rc;
1777 free_start = ocfs2_bucket_align_free_start(sb, free_start,
1778 size);
1500 } 1779 }
1501 if (xi->value) { 1780 return ocfs2_xa_check_space_helper(needed_space, free_start, count);
1502 /* Insert the new name+value. */ 1781}
1503 size_t size = OCFS2_XATTR_SIZE(name_len) + 1782
1504 OCFS2_XATTR_SIZE(xi->value_len); 1783static void ocfs2_xa_bucket_wipe_namevalue(struct ocfs2_xa_loc *loc)
1505 void *val = xs->base + min_offs - size; 1784{
1785 le16_add_cpu(&loc->xl_header->xh_name_value_len,
1786 -namevalue_size_xe(loc->xl_entry));
1787}
1506 1788
1507 xs->here->xe_name_offset = cpu_to_le16(min_offs - size); 1789static void ocfs2_xa_bucket_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1508 memset(val, 0, size); 1790{
1509 memcpy(val, xi->name, name_len); 1791 struct ocfs2_xattr_header *xh = loc->xl_header;
1510 memcpy(val + OCFS2_XATTR_SIZE(name_len), 1792 int count = le16_to_cpu(xh->xh_count);
1511 xi->value, 1793 int low = 0, high = count - 1, tmp;
1512 xi->value_len); 1794 struct ocfs2_xattr_entry *tmp_xe;
1513 xs->here->xe_value_size = cpu_to_le64(xi->value_len); 1795
1514 ocfs2_xattr_set_local(xs->here, 1); 1796 /*
1515 ocfs2_xattr_hash_entry(inode, xs->header, xs->here); 1797 * We keep buckets sorted by name_hash, so we need to find
1798 * our insert place.
1799 */
1800 while (low <= high && count) {
1801 tmp = (low + high) / 2;
1802 tmp_xe = &xh->xh_entries[tmp];
1803
1804 if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
1805 low = tmp + 1;
1806 else if (name_hash < le32_to_cpu(tmp_xe->xe_name_hash))
1807 high = tmp - 1;
1808 else {
1809 low = tmp;
1810 break;
1811 }
1516 } 1812 }
1517 1813
1518 return; 1814 if (low != count)
1815 memmove(&xh->xh_entries[low + 1],
1816 &xh->xh_entries[low],
1817 ((count - low) * sizeof(struct ocfs2_xattr_entry)));
1818
1819 le16_add_cpu(&xh->xh_count, 1);
1820 loc->xl_entry = &xh->xh_entries[low];
1821 memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
1822}
1823
1824static void ocfs2_xa_bucket_add_namevalue(struct ocfs2_xa_loc *loc, int size)
1825{
1826 int free_start = ocfs2_xa_get_free_start(loc);
1827 struct ocfs2_xattr_header *xh = loc->xl_header;
1828 struct super_block *sb = loc->xl_inode->i_sb;
1829 int nameval_offset;
1830
1831 free_start = ocfs2_bucket_align_free_start(sb, free_start, size);
1832 nameval_offset = free_start - size;
1833 loc->xl_entry->xe_name_offset = cpu_to_le16(nameval_offset);
1834 xh->xh_free_start = cpu_to_le16(nameval_offset);
1835 le16_add_cpu(&xh->xh_name_value_len, size);
1836
1837}
1838
1839static void ocfs2_xa_bucket_fill_value_buf(struct ocfs2_xa_loc *loc,
1840 struct ocfs2_xattr_value_buf *vb)
1841{
1842 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1843 struct super_block *sb = loc->xl_inode->i_sb;
1844 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1845 int size = namevalue_size_xe(loc->xl_entry);
1846 int block_offset = nameval_offset >> sb->s_blocksize_bits;
1847
1848 /* Values are not allowed to straddle block boundaries */
1849 BUG_ON(block_offset !=
1850 ((nameval_offset + size - 1) >> sb->s_blocksize_bits));
1851 /* We expect the bucket to be filled in */
1852 BUG_ON(!bucket->bu_bhs[block_offset]);
1853
1854 vb->vb_access = ocfs2_journal_access;
1855 vb->vb_bh = bucket->bu_bhs[block_offset];
1856}
1857
1858/* Operations for xattrs stored in buckets. */
1859static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = {
1860 .xlo_journal_access = ocfs2_xa_bucket_journal_access,
1861 .xlo_journal_dirty = ocfs2_xa_bucket_journal_dirty,
1862 .xlo_offset_pointer = ocfs2_xa_bucket_offset_pointer,
1863 .xlo_check_space = ocfs2_xa_bucket_check_space,
1864 .xlo_can_reuse = ocfs2_xa_bucket_can_reuse,
1865 .xlo_get_free_start = ocfs2_xa_bucket_get_free_start,
1866 .xlo_wipe_namevalue = ocfs2_xa_bucket_wipe_namevalue,
1867 .xlo_add_entry = ocfs2_xa_bucket_add_entry,
1868 .xlo_add_namevalue = ocfs2_xa_bucket_add_namevalue,
1869 .xlo_fill_value_buf = ocfs2_xa_bucket_fill_value_buf,
1870};
1871
1872static unsigned int ocfs2_xa_value_clusters(struct ocfs2_xa_loc *loc)
1873{
1874 struct ocfs2_xattr_value_buf vb;
1875
1876 if (ocfs2_xattr_is_local(loc->xl_entry))
1877 return 0;
1878
1879 ocfs2_xa_fill_value_buf(loc, &vb);
1880 return le32_to_cpu(vb.vb_xv->xr_clusters);
1881}
1882
1883static int ocfs2_xa_value_truncate(struct ocfs2_xa_loc *loc, u64 bytes,
1884 struct ocfs2_xattr_set_ctxt *ctxt)
1885{
1886 int trunc_rc, access_rc;
1887 struct ocfs2_xattr_value_buf vb;
1888
1889 ocfs2_xa_fill_value_buf(loc, &vb);
1890 trunc_rc = ocfs2_xattr_value_truncate(loc->xl_inode, &vb, bytes,
1891 ctxt);
1892
1893 /*
1894 * The caller of ocfs2_xa_value_truncate() has already called
 1895 * ocfs2_xa_journal_access on the loc. However, the truncate code
1896 * calls ocfs2_extend_trans(). This may commit the previous
1897 * transaction and open a new one. If this is a bucket, truncate
1898 * could leave only vb->vb_bh set up for journaling. Meanwhile,
1899 * the caller is expecting to dirty the entire bucket. So we must
1900 * reset the journal work. We do this even if truncate has failed,
1901 * as it could have failed after committing the extend.
1902 */
1903 access_rc = ocfs2_xa_journal_access(ctxt->handle, loc,
1904 OCFS2_JOURNAL_ACCESS_WRITE);
1905
1906 /* Errors in truncate take precedence */
1907 return trunc_rc ? trunc_rc : access_rc;
1908}
1909
1910static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc)
1911{
1912 int index, count;
1913 struct ocfs2_xattr_header *xh = loc->xl_header;
1914 struct ocfs2_xattr_entry *entry = loc->xl_entry;
1915
1916 ocfs2_xa_wipe_namevalue(loc);
1917 loc->xl_entry = NULL;
1918
1919 le16_add_cpu(&xh->xh_count, -1);
1920 count = le16_to_cpu(xh->xh_count);
1921
1922 /*
1923 * Only zero out the entry if there are more remaining. This is
1924 * important for an empty bucket, as it keeps track of the
1925 * bucket's hash value. It doesn't hurt empty block storage.
1926 */
1927 if (count) {
1928 index = ((char *)entry - (char *)&xh->xh_entries) /
1929 sizeof(struct ocfs2_xattr_entry);
1930 memmove(&xh->xh_entries[index], &xh->xh_entries[index + 1],
1931 (count - index) * sizeof(struct ocfs2_xattr_entry));
1932 memset(&xh->xh_entries[count], 0,
1933 sizeof(struct ocfs2_xattr_entry));
1934 }
1519} 1935}
1520 1936
1521/* 1937/*
1522 * ocfs2_xattr_set_entry() 1938 * If we have a problem adjusting the size of an external value during
1939 * ocfs2_xa_prepare_entry() or ocfs2_xa_remove(), we may have an xattr
1940 * in an intermediate state. For example, the value may be partially
1941 * truncated.
1942 *
1943 * If the value tree hasn't changed, the extend/truncate went nowhere.
1944 * We have nothing to do. The caller can treat it as a straight error.
1523 * 1945 *
1524 * Set extended attribute entry into inode or block. 1946 * If the value tree got partially truncated, we now have a corrupted
1947 * extended attribute. We're going to wipe its entry and leak the
1948 * clusters. Better to leak some storage than leave a corrupt entry.
1525 * 1949 *
1526 * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE, 1950 * If the value tree grew, it obviously didn't grow enough for the
1527 * We first insert tree root(ocfs2_xattr_value_root) with set_entry_local(), 1951 * new entry. We're not going to try and reclaim those clusters either.
1528 * then set value in B tree with set_value_outside(). 1952 * If there was already an external value there (orig_clusters != 0),
1953 * the new clusters are attached safely and we can just leave the old
1954 * value in place. If there was no external value there, we remove
1955 * the entry.
1956 *
1957 * This way, the xattr block we store in the journal will be consistent.
1958 * If the size change broke because of the journal, no changes will hit
1959 * disk anyway.
1529 */ 1960 */
1530static int ocfs2_xattr_set_entry(struct inode *inode, 1961static void ocfs2_xa_cleanup_value_truncate(struct ocfs2_xa_loc *loc,
1531 struct ocfs2_xattr_info *xi, 1962 const char *what,
1532 struct ocfs2_xattr_search *xs, 1963 unsigned int orig_clusters)
1533 struct ocfs2_xattr_set_ctxt *ctxt, 1964{
1534 int flag) 1965 unsigned int new_clusters = ocfs2_xa_value_clusters(loc);
1535{ 1966 char *nameval_buf = ocfs2_xa_offset_pointer(loc,
1536 struct ocfs2_xattr_entry *last; 1967 le16_to_cpu(loc->xl_entry->xe_name_offset));
1537 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1968
1538 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 1969 if (new_clusters < orig_clusters) {
1539 size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name); 1970 mlog(ML_ERROR,
1540 size_t size_l = 0; 1971 "Partial truncate while %s xattr %.*s. Leaking "
1541 handle_t *handle = ctxt->handle; 1972 "%u clusters and removing the entry\n",
1542 int free, i, ret; 1973 what, loc->xl_entry->xe_name_len, nameval_buf,
1543 struct ocfs2_xattr_info xi_l = { 1974 orig_clusters - new_clusters);
1544 .name_index = xi->name_index, 1975 ocfs2_xa_remove_entry(loc);
1545 .name = xi->name, 1976 } else if (!orig_clusters) {
1546 .value = xi->value, 1977 mlog(ML_ERROR,
1547 .value_len = xi->value_len, 1978 "Unable to allocate an external value for xattr "
1548 }; 1979 "%.*s safely. Leaking %u clusters and removing the "
1549 struct ocfs2_xattr_value_buf vb = { 1980 "entry\n",
1550 .vb_bh = xs->xattr_bh, 1981 loc->xl_entry->xe_name_len, nameval_buf,
1551 .vb_access = ocfs2_journal_access_di, 1982 new_clusters - orig_clusters);
1552 }; 1983 ocfs2_xa_remove_entry(loc);
1984 } else if (new_clusters > orig_clusters)
1985 mlog(ML_ERROR,
1986 "Unable to grow xattr %.*s safely. %u new clusters "
1987 "have been added, but the value will not be "
1988 "modified\n",
1989 loc->xl_entry->xe_name_len, nameval_buf,
1990 new_clusters - orig_clusters);
1991}
1992
1993static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc,
1994 struct ocfs2_xattr_set_ctxt *ctxt)
1995{
1996 int rc = 0;
1997 unsigned int orig_clusters;
1998
1999 if (!ocfs2_xattr_is_local(loc->xl_entry)) {
2000 orig_clusters = ocfs2_xa_value_clusters(loc);
2001 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2002 if (rc) {
2003 mlog_errno(rc);
2004 /*
2005 * Since this is remove, we can return 0 if
2006 * ocfs2_xa_cleanup_value_truncate() is going to
2007 * wipe the entry anyway. So we check the
2008 * cluster count as well.
2009 */
2010 if (orig_clusters != ocfs2_xa_value_clusters(loc))
2011 rc = 0;
2012 ocfs2_xa_cleanup_value_truncate(loc, "removing",
2013 orig_clusters);
2014 if (rc)
2015 goto out;
2016 }
2017 }
1553 2018
1554 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 2019 ocfs2_xa_remove_entry(loc);
1555 BUG_ON(xs->xattr_bh == xs->inode_bh);
1556 vb.vb_access = ocfs2_journal_access_xb;
1557 } else
1558 BUG_ON(xs->xattr_bh != xs->inode_bh);
1559 2020
1560 /* Compute min_offs, last and free space. */ 2021out:
1561 last = xs->header->xh_entries; 2022 return rc;
2023}
1562 2024
1563 for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) { 2025static void ocfs2_xa_install_value_root(struct ocfs2_xa_loc *loc)
1564 size_t offs = le16_to_cpu(last->xe_name_offset); 2026{
1565 if (offs < min_offs) 2027 int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
1566 min_offs = offs; 2028 char *nameval_buf;
1567 last += 1;
1568 }
1569 2029
1570 free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP; 2030 nameval_buf = ocfs2_xa_offset_pointer(loc,
1571 if (free < 0) 2031 le16_to_cpu(loc->xl_entry->xe_name_offset));
1572 return -EIO; 2032 memcpy(nameval_buf + name_size, &def_xv, OCFS2_XATTR_ROOT_SIZE);
2033}
1573 2034
1574 if (!xs->not_found) { 2035/*
1575 size_t size = 0; 2036 * Take an existing entry and make it ready for the new value. This
1576 if (ocfs2_xattr_is_local(xs->here)) 2037 * won't allocate space, but it may free space. It should be ready for
1577 size = OCFS2_XATTR_SIZE(name_len) + 2038 * ocfs2_xa_prepare_entry() to finish the work.
1578 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); 2039 */
1579 else 2040static int ocfs2_xa_reuse_entry(struct ocfs2_xa_loc *loc,
1580 size = OCFS2_XATTR_SIZE(name_len) + 2041 struct ocfs2_xattr_info *xi,
1581 OCFS2_XATTR_ROOT_SIZE; 2042 struct ocfs2_xattr_set_ctxt *ctxt)
1582 free += (size + sizeof(struct ocfs2_xattr_entry)); 2043{
1583 } 2044 int rc = 0;
1584 /* Check free space in inode or block */ 2045 int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
1585 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2046 unsigned int orig_clusters;
1586 if (free < sizeof(struct ocfs2_xattr_entry) + 2047 char *nameval_buf;
1587 OCFS2_XATTR_SIZE(name_len) + 2048 int xe_local = ocfs2_xattr_is_local(loc->xl_entry);
1588 OCFS2_XATTR_ROOT_SIZE) { 2049 int xi_local = xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE;
1589 ret = -ENOSPC; 2050
1590 goto out; 2051 BUG_ON(OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len) !=
2052 name_size);
2053
2054 nameval_buf = ocfs2_xa_offset_pointer(loc,
2055 le16_to_cpu(loc->xl_entry->xe_name_offset));
2056 if (xe_local) {
2057 memset(nameval_buf + name_size, 0,
2058 namevalue_size_xe(loc->xl_entry) - name_size);
2059 if (!xi_local)
2060 ocfs2_xa_install_value_root(loc);
2061 } else {
2062 orig_clusters = ocfs2_xa_value_clusters(loc);
2063 if (xi_local) {
2064 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2065 if (rc < 0)
2066 mlog_errno(rc);
2067 else
2068 memset(nameval_buf + name_size, 0,
2069 namevalue_size_xe(loc->xl_entry) -
2070 name_size);
2071 } else if (le64_to_cpu(loc->xl_entry->xe_value_size) >
2072 xi->xi_value_len) {
2073 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len,
2074 ctxt);
2075 if (rc < 0)
2076 mlog_errno(rc);
1591 } 2077 }
1592 size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; 2078
1593 xi_l.value = (void *)&def_xv; 2079 if (rc) {
1594 xi_l.value_len = OCFS2_XATTR_ROOT_SIZE; 2080 ocfs2_xa_cleanup_value_truncate(loc, "reusing",
1595 } else if (xi->value) { 2081 orig_clusters);
1596 if (free < sizeof(struct ocfs2_xattr_entry) +
1597 OCFS2_XATTR_SIZE(name_len) +
1598 OCFS2_XATTR_SIZE(xi->value_len)) {
1599 ret = -ENOSPC;
1600 goto out; 2082 goto out;
1601 } 2083 }
1602 } 2084 }
1603 2085
1604 if (!xs->not_found) { 2086 loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
1605 /* For existing extended attribute */ 2087 ocfs2_xattr_set_local(loc->xl_entry, xi_local);
1606 size_t size = OCFS2_XATTR_SIZE(name_len) +
1607 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
1608 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1609 void *val = xs->base + offs;
1610 2088
1611 if (ocfs2_xattr_is_local(xs->here) && size == size_l) { 2089out:
1612 /* Replace existing local xattr with tree root */ 2090 return rc;
1613 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, 2091}
1614 ctxt, &vb, offs);
1615 if (ret < 0)
1616 mlog_errno(ret);
1617 goto out;
1618 } else if (!ocfs2_xattr_is_local(xs->here)) {
1619 /* For existing xattr which has value outside */
1620 vb.vb_xv = (struct ocfs2_xattr_value_root *)
1621 (val + OCFS2_XATTR_SIZE(name_len));
1622 2092
1623 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2093/*
1624 /* 2094 * Prepares loc->xl_entry to receive the new xattr. This includes
1625 * If new value need set outside also, 2095 * properly setting up the name+value pair region. If loc->xl_entry
1626 * first truncate old value to new value, 2096 * already exists, it will take care of modifying it appropriately.
1627 * then set new value with set_value_outside(). 2097 *
1628 */ 2098 * Note that this modifies the data. You did journal_access already,
1629 ret = ocfs2_xattr_value_truncate(inode, 2099 * right?
1630 &vb, 2100 */
1631 xi->value_len, 2101static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
1632 ctxt); 2102 struct ocfs2_xattr_info *xi,
1633 if (ret < 0) { 2103 u32 name_hash,
1634 mlog_errno(ret); 2104 struct ocfs2_xattr_set_ctxt *ctxt)
1635 goto out; 2105{
1636 } 2106 int rc = 0;
2107 unsigned int orig_clusters;
2108 __le64 orig_value_size = 0;
1637 2109
1638 ret = ocfs2_xattr_update_entry(inode, 2110 rc = ocfs2_xa_check_space(loc, xi);
1639 handle, 2111 if (rc)
1640 xi, 2112 goto out;
1641 xs,
1642 &vb,
1643 offs);
1644 if (ret < 0) {
1645 mlog_errno(ret);
1646 goto out;
1647 }
1648 2113
1649 ret = __ocfs2_xattr_set_value_outside(inode, 2114 if (loc->xl_entry) {
1650 handle, 2115 if (ocfs2_xa_can_reuse_entry(loc, xi)) {
1651 &vb, 2116 orig_value_size = loc->xl_entry->xe_value_size;
1652 xi->value, 2117 rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
1653 xi->value_len); 2118 if (rc)
1654 if (ret < 0) 2119 goto out;
1655 mlog_errno(ret); 2120 goto alloc_value;
2121 }
2122
2123 if (!ocfs2_xattr_is_local(loc->xl_entry)) {
2124 orig_clusters = ocfs2_xa_value_clusters(loc);
2125 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2126 if (rc) {
2127 mlog_errno(rc);
2128 ocfs2_xa_cleanup_value_truncate(loc,
2129 "overwriting",
2130 orig_clusters);
1656 goto out; 2131 goto out;
1657 } else {
1658 /*
1659 * If new value need set in local,
1660 * just trucate old value to zero.
1661 */
1662 ret = ocfs2_xattr_value_truncate(inode,
1663 &vb,
1664 0,
1665 ctxt);
1666 if (ret < 0)
1667 mlog_errno(ret);
1668 } 2132 }
1669 } 2133 }
2134 ocfs2_xa_wipe_namevalue(loc);
2135 } else
2136 ocfs2_xa_add_entry(loc, name_hash);
2137
2138 /*
2139 * If we get here, we have a blank entry. Fill it. We grow our
2140 * name+value pair back from the end.
2141 */
2142 ocfs2_xa_add_namevalue(loc, xi);
2143 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
2144 ocfs2_xa_install_value_root(loc);
2145
2146alloc_value:
2147 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2148 orig_clusters = ocfs2_xa_value_clusters(loc);
2149 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
2150 if (rc < 0) {
2151 /*
2152 * If we tried to grow an existing external value,
 2153 * ocfs2_xa_cleanup_value_truncate() is going to
2154 * let it stand. We have to restore its original
2155 * value size.
2156 */
2157 loc->xl_entry->xe_value_size = orig_value_size;
2158 ocfs2_xa_cleanup_value_truncate(loc, "growing",
2159 orig_clusters);
2160 mlog_errno(rc);
2161 }
1670 } 2162 }
1671 2163
1672 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), xs->inode_bh, 2164out:
2165 return rc;
2166}
2167
2168/*
2169 * Store the value portion of the name+value pair. This will skip
2170 * values that are stored externally. Their tree roots were set up
2171 * by ocfs2_xa_prepare_entry().
2172 */
2173static int ocfs2_xa_store_value(struct ocfs2_xa_loc *loc,
2174 struct ocfs2_xattr_info *xi,
2175 struct ocfs2_xattr_set_ctxt *ctxt)
2176{
2177 int rc = 0;
2178 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
2179 int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
2180 char *nameval_buf;
2181 struct ocfs2_xattr_value_buf vb;
2182
2183 nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
2184 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2185 ocfs2_xa_fill_value_buf(loc, &vb);
2186 rc = __ocfs2_xattr_set_value_outside(loc->xl_inode,
2187 ctxt->handle, &vb,
2188 xi->xi_value,
2189 xi->xi_value_len);
2190 } else
2191 memcpy(nameval_buf + name_size, xi->xi_value, xi->xi_value_len);
2192
2193 return rc;
2194}
2195
2196static int ocfs2_xa_set(struct ocfs2_xa_loc *loc,
2197 struct ocfs2_xattr_info *xi,
2198 struct ocfs2_xattr_set_ctxt *ctxt)
2199{
2200 int ret;
2201 u32 name_hash = ocfs2_xattr_name_hash(loc->xl_inode, xi->xi_name,
2202 xi->xi_name_len);
2203
2204 ret = ocfs2_xa_journal_access(ctxt->handle, loc,
1673 OCFS2_JOURNAL_ACCESS_WRITE); 2205 OCFS2_JOURNAL_ACCESS_WRITE);
1674 if (ret) { 2206 if (ret) {
1675 mlog_errno(ret); 2207 mlog_errno(ret);
1676 goto out; 2208 goto out;
1677 } 2209 }
1678 2210
1679 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1680 ret = vb.vb_access(handle, INODE_CACHE(inode), vb.vb_bh,
1681 OCFS2_JOURNAL_ACCESS_WRITE);
1682 if (ret) {
1683 mlog_errno(ret);
1684 goto out;
1685 }
1686 }
1687
1688 /* 2211 /*
1689 * Set value in local, include set tree root in local. 2212 * From here on out, everything is going to modify the buffer a
1690 * This is the first step for value size >INLINE_SIZE. 2213 * little. Errors are going to leave the xattr header in a
2214 * sane state. Thus, even with errors we dirty the sucker.
1691 */ 2215 */
1692 ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs);
1693 2216
1694 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 2217 /* Don't worry, we are never called with !xi_value and !xl_entry */
1695 ret = ocfs2_journal_dirty(handle, xs->xattr_bh); 2218 if (!xi->xi_value) {
1696 if (ret < 0) { 2219 ret = ocfs2_xa_remove(loc, ctxt);
1697 mlog_errno(ret); 2220 goto out_dirty;
1698 goto out;
1699 }
1700 } 2221 }
1701 2222
1702 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) && 2223 ret = ocfs2_xa_prepare_entry(loc, xi, name_hash, ctxt);
1703 (flag & OCFS2_INLINE_XATTR_FL)) { 2224 if (ret) {
1704 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2225 if (ret != -ENOSPC)
1705 unsigned int xattrsize = osb->s_xattr_inline_size; 2226 mlog_errno(ret);
1706 2227 goto out_dirty;
1707 /*
1708 * Adjust extent record count or inline data size
1709 * to reserve space for extended attribute.
1710 */
1711 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1712 struct ocfs2_inline_data *idata = &di->id2.i_data;
1713 le16_add_cpu(&idata->id_count, -xattrsize);
1714 } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
1715 struct ocfs2_extent_list *el = &di->id2.i_list;
1716 le16_add_cpu(&el->l_count, -(xattrsize /
1717 sizeof(struct ocfs2_extent_rec)));
1718 }
1719 di->i_xattr_inline_size = cpu_to_le16(xattrsize);
1720 } 2228 }
1721 /* Update xattr flag */
1722 spin_lock(&oi->ip_lock);
1723 oi->ip_dyn_features |= flag;
1724 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
1725 spin_unlock(&oi->ip_lock);
1726 2229
1727 ret = ocfs2_journal_dirty(handle, xs->inode_bh); 2230 ret = ocfs2_xa_store_value(loc, xi, ctxt);
1728 if (ret < 0) 2231 if (ret)
1729 mlog_errno(ret); 2232 mlog_errno(ret);
1730 2233
1731 if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2234out_dirty:
1732 /* 2235 ocfs2_xa_journal_dirty(ctxt->handle, loc);
1733 * Set value outside in B tree.
1734 * This is the second step for value size > INLINE_SIZE.
1735 */
1736 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1737 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt,
1738 &vb, offs);
1739 if (ret < 0) {
1740 int ret2;
1741 2236
1742 mlog_errno(ret);
1743 /*
1744 * If set value outside failed, we have to clean
1745 * the junk tree root we have already set in local.
1746 */
1747 ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
1748 xi, xs, &vb, offs);
1749 if (ret2 < 0)
1750 mlog_errno(ret2);
1751 }
1752 }
1753out: 2237out:
1754 return ret; 2238 return ret;
1755} 2239}
1756 2240
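Because old and new code are interleaved above, the new-side flow of ocfs2_xa_set() is easier to read condensed. A sketch of the right-hand column only (logging trimmed, not a verbatim copy):

static int ocfs2_xa_set_sketch(struct ocfs2_xa_loc *loc,
			       struct ocfs2_xattr_info *xi,
			       struct ocfs2_xattr_set_ctxt *ctxt)
{
	u32 name_hash = ocfs2_xattr_name_hash(loc->xl_inode, xi->xi_name,
					      xi->xi_name_len);
	int ret = ocfs2_xa_journal_access(ctxt->handle, loc,
					  OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret)
		return ret;

	if (!xi->xi_value)		/* NULL value means delete */
		ret = ocfs2_xa_remove(loc, ctxt);
	else {
		ret = ocfs2_xa_prepare_entry(loc, xi, name_hash, ctxt);
		if (!ret)
			ret = ocfs2_xa_store_value(loc, xi, ctxt);
	}

	/* Dirtied even on error: the header stays sane throughout. */
	ocfs2_xa_journal_dirty(ctxt->handle, loc);
	return ret;
}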
2241static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc,
2242 struct inode *inode,
2243 struct buffer_head *bh,
2244 struct ocfs2_xattr_entry *entry)
2245{
2246 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
2247
2248 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_XATTR_FL));
2249
2250 loc->xl_inode = inode;
2251 loc->xl_ops = &ocfs2_xa_block_loc_ops;
2252 loc->xl_storage = bh;
2253 loc->xl_entry = entry;
2254 loc->xl_size = le16_to_cpu(di->i_xattr_inline_size);
2255 loc->xl_header =
2256 (struct ocfs2_xattr_header *)(bh->b_data + bh->b_size -
2257 loc->xl_size);
2258}
2259
2260static void ocfs2_init_xattr_block_xa_loc(struct ocfs2_xa_loc *loc,
2261 struct inode *inode,
2262 struct buffer_head *bh,
2263 struct ocfs2_xattr_entry *entry)
2264{
2265 struct ocfs2_xattr_block *xb =
2266 (struct ocfs2_xattr_block *)bh->b_data;
2267
2268 BUG_ON(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED);
2269
2270 loc->xl_inode = inode;
2271 loc->xl_ops = &ocfs2_xa_block_loc_ops;
2272 loc->xl_storage = bh;
2273 loc->xl_header = &(xb->xb_attrs.xb_header);
2274 loc->xl_entry = entry;
2275 loc->xl_size = bh->b_size - offsetof(struct ocfs2_xattr_block,
2276 xb_attrs.xb_header);
2277}
2278
2279static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc,
2280 struct ocfs2_xattr_bucket *bucket,
2281 struct ocfs2_xattr_entry *entry)
2282{
2283 loc->xl_inode = bucket->bu_inode;
2284 loc->xl_ops = &ocfs2_xa_bucket_loc_ops;
2285 loc->xl_storage = bucket;
2286 loc->xl_header = bucket_xh(bucket);
2287 loc->xl_entry = entry;
2288 loc->xl_size = OCFS2_XATTR_BUCKET_SIZE;
2289}
2290
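All three initializers feed the same ocfs2_xa_set() entry point. The caller pattern this patch converges on, condensed from the ibody path later in the diff (error handling trimmed):

static int ibody_set_sketch(struct inode *inode, struct ocfs2_xattr_info *xi,
			    struct ocfs2_xattr_search *xs,
			    struct ocfs2_xattr_set_ctxt *ctxt)
{
	struct ocfs2_xa_loc loc;
	int ret;

	ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
				 xs->not_found ? NULL : xs->here);
	ret = ocfs2_xa_set(&loc, xi, ctxt);
	if (!ret)
		xs->here = loc.xl_entry;	/* search now points at the entry */
	return ret;
}

The per-backend differences collapse into which init function runs; the xl_ops vtable (block vs. bucket) hides where the header actually lives.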
1757/* 2291/*
1758 * In xattr remove, if it is stored outside and refcounted, we may have 2292 * In xattr remove, if it is stored outside and refcounted, we may have
1759 * the chance to split the refcount tree. So we need the allocators. 2293 * the chance to split the refcount tree. So we need the allocators.
@@ -2149,6 +2683,55 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
2149 return 0; 2683 return 0;
2150} 2684}
2151 2685
2686static int ocfs2_xattr_ibody_init(struct inode *inode,
2687 struct buffer_head *di_bh,
2688 struct ocfs2_xattr_set_ctxt *ctxt)
2689{
2690 int ret;
2691 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2692 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2693 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2694 unsigned int xattrsize = osb->s_xattr_inline_size;
2695
2696 if (!ocfs2_xattr_has_space_inline(inode, di)) {
2697 ret = -ENOSPC;
2698 goto out;
2699 }
2700
2701 ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), di_bh,
2702 OCFS2_JOURNAL_ACCESS_WRITE);
2703 if (ret) {
2704 mlog_errno(ret);
2705 goto out;
2706 }
2707
2708 /*
2709 * Adjust extent record count or inline data size
2710 * to reserve space for extended attribute.
2711 */
2712 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2713 struct ocfs2_inline_data *idata = &di->id2.i_data;
2714 le16_add_cpu(&idata->id_count, -xattrsize);
2715 } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
2716 struct ocfs2_extent_list *el = &di->id2.i_list;
2717 le16_add_cpu(&el->l_count, -(xattrsize /
2718 sizeof(struct ocfs2_extent_rec)));
2719 }
2720 di->i_xattr_inline_size = cpu_to_le16(xattrsize);
2721
2722 spin_lock(&oi->ip_lock);
2723 oi->ip_dyn_features |= OCFS2_INLINE_XATTR_FL|OCFS2_HAS_XATTR_FL;
2724 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2725 spin_unlock(&oi->ip_lock);
2726
2727 ret = ocfs2_journal_dirty(ctxt->handle, di_bh);
2728 if (ret < 0)
2729 mlog_errno(ret);
2730
2731out:
2732 return ret;
2733}
2734
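The reservation arithmetic in ocfs2_xattr_ibody_init() is worth making concrete. Assuming the common 256-byte s_xattr_inline_size and the 16-byte on-disk ocfs2_extent_rec (typical values, not read from this diff):

#include <stdio.h>

int main(void)
{
	unsigned int xattrsize = 256;	/* assumed s_xattr_inline_size */
	unsigned int rec_size  = 16;	/* assumed sizeof(ocfs2_extent_rec) */

	/* Extent-list inodes give up this many l_count records; */
	printf("extent records reserved: %u\n", xattrsize / rec_size);
	/* inline-data inodes instead shrink id_count by the full 256. */
	return 0;
}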
2152/* 2735/*
2153 * ocfs2_xattr_ibody_set() 2736 * ocfs2_xattr_ibody_set()
2154 * 2737 *
@@ -2160,9 +2743,10 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
2160 struct ocfs2_xattr_search *xs, 2743 struct ocfs2_xattr_search *xs,
2161 struct ocfs2_xattr_set_ctxt *ctxt) 2744 struct ocfs2_xattr_set_ctxt *ctxt)
2162{ 2745{
2746 int ret;
2163 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2747 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2164 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 2748 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
2165 int ret; 2749 struct ocfs2_xa_loc loc;
2166 2750
2167 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) 2751 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
2168 return -ENOSPC; 2752 return -ENOSPC;
@@ -2175,8 +2759,25 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
2175 } 2759 }
2176 } 2760 }
2177 2761
2178 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, 2762 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
2179 (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL)); 2763 ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
2764 if (ret) {
2765 if (ret != -ENOSPC)
2766 mlog_errno(ret);
2767 goto out;
2768 }
2769 }
2770
2771 ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
2772 xs->not_found ? NULL : xs->here);
2773 ret = ocfs2_xa_set(&loc, xi, ctxt);
2774 if (ret) {
2775 if (ret != -ENOSPC)
2776 mlog_errno(ret);
2777 goto out;
2778 }
2779 xs->here = loc.xl_entry;
2780
2180out: 2781out:
2181 up_write(&oi->ip_alloc_sem); 2782 up_write(&oi->ip_alloc_sem);
2182 2783
@@ -2236,12 +2837,11 @@ cleanup:
2236 return ret; 2837 return ret;
2237} 2838}
2238 2839
2239static int ocfs2_create_xattr_block(handle_t *handle, 2840static int ocfs2_create_xattr_block(struct inode *inode,
2240 struct inode *inode,
2241 struct buffer_head *inode_bh, 2841 struct buffer_head *inode_bh,
2242 struct ocfs2_alloc_context *meta_ac, 2842 struct ocfs2_xattr_set_ctxt *ctxt,
2243 struct buffer_head **ret_bh, 2843 int indexed,
2244 int indexed) 2844 struct buffer_head **ret_bh)
2245{ 2845{
2246 int ret; 2846 int ret;
2247 u16 suballoc_bit_start; 2847 u16 suballoc_bit_start;
@@ -2252,14 +2852,14 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2252 struct buffer_head *new_bh = NULL; 2852 struct buffer_head *new_bh = NULL;
2253 struct ocfs2_xattr_block *xblk; 2853 struct ocfs2_xattr_block *xblk;
2254 2854
2255 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), inode_bh, 2855 ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode),
2256 OCFS2_JOURNAL_ACCESS_CREATE); 2856 inode_bh, OCFS2_JOURNAL_ACCESS_CREATE);
2257 if (ret < 0) { 2857 if (ret < 0) {
2258 mlog_errno(ret); 2858 mlog_errno(ret);
2259 goto end; 2859 goto end;
2260 } 2860 }
2261 2861
2262 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 2862 ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1,
2263 &suballoc_bit_start, &num_got, 2863 &suballoc_bit_start, &num_got,
2264 &first_blkno); 2864 &first_blkno);
2265 if (ret < 0) { 2865 if (ret < 0) {
@@ -2270,7 +2870,7 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2270 new_bh = sb_getblk(inode->i_sb, first_blkno); 2870 new_bh = sb_getblk(inode->i_sb, first_blkno);
2271 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); 2871 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
2272 2872
2273 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), 2873 ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode),
2274 new_bh, 2874 new_bh,
2275 OCFS2_JOURNAL_ACCESS_CREATE); 2875 OCFS2_JOURNAL_ACCESS_CREATE);
2276 if (ret < 0) { 2876 if (ret < 0) {
@@ -2282,11 +2882,10 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2282 xblk = (struct ocfs2_xattr_block *)new_bh->b_data; 2882 xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
2283 memset(xblk, 0, inode->i_sb->s_blocksize); 2883 memset(xblk, 0, inode->i_sb->s_blocksize);
2284 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); 2884 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
2285 xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num); 2885 xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
2286 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); 2886 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
2287 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); 2887 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
2288 xblk->xb_blkno = cpu_to_le64(first_blkno); 2888 xblk->xb_blkno = cpu_to_le64(first_blkno);
2289
2290 if (indexed) { 2889 if (indexed) {
2291 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; 2890 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
2292 xr->xt_clusters = cpu_to_le32(1); 2891 xr->xt_clusters = cpu_to_le32(1);
@@ -2297,14 +2896,17 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2297 xr->xt_list.l_next_free_rec = cpu_to_le16(1); 2896 xr->xt_list.l_next_free_rec = cpu_to_le16(1);
2298 xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED); 2897 xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED);
2299 } 2898 }
2899 ocfs2_journal_dirty(ctxt->handle, new_bh);
2300 2900
2301 ret = ocfs2_journal_dirty(handle, new_bh); 2901 /* Add it to the inode */
2302 if (ret < 0) {
2303 mlog_errno(ret);
2304 goto end;
2305 }
2306 di->i_xattr_loc = cpu_to_le64(first_blkno); 2902 di->i_xattr_loc = cpu_to_le64(first_blkno);
2307 ocfs2_journal_dirty(handle, inode_bh); 2903
2904 spin_lock(&OCFS2_I(inode)->ip_lock);
2905 OCFS2_I(inode)->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
2906 di->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features);
2907 spin_unlock(&OCFS2_I(inode)->ip_lock);
2908
2909 ocfs2_journal_dirty(ctxt->handle, inode_bh);
2308 2910
2309 *ret_bh = new_bh; 2911 *ret_bh = new_bh;
2310 new_bh = NULL; 2912 new_bh = NULL;
@@ -2326,13 +2928,13 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2326 struct ocfs2_xattr_set_ctxt *ctxt) 2928 struct ocfs2_xattr_set_ctxt *ctxt)
2327{ 2929{
2328 struct buffer_head *new_bh = NULL; 2930 struct buffer_head *new_bh = NULL;
2329 handle_t *handle = ctxt->handle;
2330 struct ocfs2_xattr_block *xblk = NULL; 2931 struct ocfs2_xattr_block *xblk = NULL;
2331 int ret; 2932 int ret;
2933 struct ocfs2_xa_loc loc;
2332 2934
2333 if (!xs->xattr_bh) { 2935 if (!xs->xattr_bh) {
2334 ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh, 2936 ret = ocfs2_create_xattr_block(inode, xs->inode_bh, ctxt,
2335 ctxt->meta_ac, &new_bh, 0); 2937 0, &new_bh);
2336 if (ret) { 2938 if (ret) {
2337 mlog_errno(ret); 2939 mlog_errno(ret);
2338 goto end; 2940 goto end;
@@ -2348,21 +2950,25 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2348 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; 2950 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
2349 2951
2350 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { 2952 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
2351 /* Set extended attribute into external block */ 2953 ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh,
2352 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, 2954 xs->not_found ? NULL : xs->here);
2353 OCFS2_HAS_XATTR_FL);
2354 if (!ret || ret != -ENOSPC)
2355 goto end;
2356 2955
2357 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); 2956 ret = ocfs2_xa_set(&loc, xi, ctxt);
2358 if (ret) 2957 if (!ret)
2958 xs->here = loc.xl_entry;
2959 else if (ret != -ENOSPC)
2359 goto end; 2960 goto end;
2961 else {
2962 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
2963 if (ret)
2964 goto end;
2965 }
2360 } 2966 }
2361 2967
2362 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt); 2968 if (le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)
2969 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
2363 2970
2364end: 2971end:
2365
2366 return ret; 2972 return ret;
2367} 2973}
2368 2974
@@ -2371,7 +2977,6 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
2371 struct ocfs2_xattr_info *xi, 2977 struct ocfs2_xattr_info *xi,
2372 struct ocfs2_xattr_search *xs) 2978 struct ocfs2_xattr_search *xs)
2373{ 2979{
2374 u64 value_size;
2375 struct ocfs2_xattr_entry *last; 2980 struct ocfs2_xattr_entry *last;
2376 int free, i; 2981 int free, i;
2377 size_t min_offs = xs->end - xs->base; 2982 size_t min_offs = xs->end - xs->base;
@@ -2394,13 +2999,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
2394 2999
2395 BUG_ON(!xs->not_found); 3000 BUG_ON(!xs->not_found);
2396 3001
2397 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) 3002 if (free >= (sizeof(struct ocfs2_xattr_entry) + namevalue_size_xi(xi)))
2398 value_size = OCFS2_XATTR_ROOT_SIZE;
2399 else
2400 value_size = OCFS2_XATTR_SIZE(xi->value_len);
2401
2402 if (free >= sizeof(struct ocfs2_xattr_entry) +
2403 OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size)
2404 return 1; 3003 return 1;
2405 3004
2406 return 0; 3005 return 0;
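namevalue_size_xi() and namevalue_size_xe() are introduced by an earlier hunk of this patch that is not shown here. Reconstructed from the open-coded logic they replace (here and in the defrag/divide hunks below), their likely shape is:

static int namevalue_size(int name_len, u64 value_len)
{
	if (value_len > OCFS2_XATTR_INLINE_SIZE)
		return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
	else
		return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
}

static int namevalue_size_xi(struct ocfs2_xattr_info *xi)
{
	return namevalue_size(xi->xi_name_len, xi->xi_value_len);
}

static int namevalue_size_xe(struct ocfs2_xattr_entry *xe)
{
	u64 value_size = le64_to_cpu(xe->xe_value_size);

	if (ocfs2_xattr_is_local(xe))
		return namevalue_size(xe->xe_name_len, value_size);
	else
		return OCFS2_XATTR_SIZE(xe->xe_name_len) + OCFS2_XATTR_ROOT_SIZE;
}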
@@ -2424,7 +3023,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2424 char *base = NULL; 3023 char *base = NULL;
2425 int name_offset, name_len = 0; 3024 int name_offset, name_len = 0;
2426 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, 3025 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
2427 xi->value_len); 3026 xi->xi_value_len);
2428 u64 value_size; 3027 u64 value_size;
2429 3028
2430 /* 3029 /*
@@ -2432,14 +3031,14 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2432 * No matter whether we replace an old one or add a new one, 3031 * No matter whether we replace an old one or add a new one,
2433 * we need this for writing. 3032 * we need this for writing.
2434 */ 3033 */
2435 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) 3034 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
2436 credits += new_clusters * 3035 credits += new_clusters *
2437 ocfs2_clusters_to_blocks(inode->i_sb, 1); 3036 ocfs2_clusters_to_blocks(inode->i_sb, 1);
2438 3037
2439 if (xis->not_found && xbs->not_found) { 3038 if (xis->not_found && xbs->not_found) {
2440 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); 3039 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2441 3040
2442 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 3041 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2443 clusters_add += new_clusters; 3042 clusters_add += new_clusters;
2444 credits += ocfs2_calc_extend_credits(inode->i_sb, 3043 credits += ocfs2_calc_extend_credits(inode->i_sb,
2445 &def_xv.xv.xr_list, 3044 &def_xv.xv.xr_list,
@@ -2484,7 +3083,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2484 * The credits for removing the value tree will be extended 3083 * The credits for removing the value tree will be extended
2485 * by ocfs2_remove_extent itself. 3084 * by ocfs2_remove_extent itself.
2486 */ 3085 */
2487 if (!xi->value) { 3086 if (!xi->xi_value) {
2488 if (!ocfs2_xattr_is_local(xe)) 3087 if (!ocfs2_xattr_is_local(xe))
2489 credits += ocfs2_remove_extent_credits(inode->i_sb); 3088 credits += ocfs2_remove_extent_credits(inode->i_sb);
2490 3089
@@ -2514,7 +3113,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2514 } 3113 }
2515 } 3114 }
2516 3115
2517 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 3116 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2518 /* the new values will be stored outside. */ 3117 /* the new values will be stored outside. */
2519 u32 old_clusters = 0; 3118 u32 old_clusters = 0;
2520 3119
@@ -2547,9 +3146,10 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2547 * value, we don't need any allocation, otherwise we have 3146 * value, we don't need any allocation, otherwise we have
2548 * to guess metadata allocation. 3147 * to guess metadata allocation.
2549 */ 3148 */
2550 if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) || 3149 if ((ocfs2_xattr_is_local(xe) &&
3150 (value_size >= xi->xi_value_len)) ||
2551 (!ocfs2_xattr_is_local(xe) && 3151 (!ocfs2_xattr_is_local(xe) &&
2552 OCFS2_XATTR_ROOT_SIZE >= xi->value_len)) 3152 OCFS2_XATTR_ROOT_SIZE >= xi->xi_value_len))
2553 goto out; 3153 goto out;
2554 } 3154 }
2555 3155
@@ -2639,7 +3239,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
2639 3239
2640 meta_add += extra_meta; 3240 meta_add += extra_meta;
2641 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, " 3241 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
2642 "credits = %d\n", xi->name, meta_add, clusters_add, *credits); 3242 "credits = %d\n", xi->xi_name, meta_add, clusters_add, *credits);
2643 3243
2644 if (meta_add) { 3244 if (meta_add) {
2645 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, 3245 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
@@ -2679,7 +3279,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2679{ 3279{
2680 int ret = 0, credits, old_found; 3280 int ret = 0, credits, old_found;
2681 3281
2682 if (!xi->value) { 3282 if (!xi->xi_value) {
2683 /* Remove existing extended attribute */ 3283 /* Remove existing extended attribute */
2684 if (!xis->not_found) 3284 if (!xis->not_found)
2685 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt); 3285 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
@@ -2693,8 +3293,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2693 * If succeed and that extended attribute existing in 3293 * If succeed and that extended attribute existing in
2694 * external block, then we will remove it. 3294 * external block, then we will remove it.
2695 */ 3295 */
2696 xi->value = NULL; 3296 xi->xi_value = NULL;
2697 xi->value_len = 0; 3297 xi->xi_value_len = 0;
2698 3298
2699 old_found = xis->not_found; 3299 old_found = xis->not_found;
2700 xis->not_found = -ENODATA; 3300 xis->not_found = -ENODATA;
@@ -2722,8 +3322,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2722 } else if (ret == -ENOSPC) { 3322 } else if (ret == -ENOSPC) {
2723 if (di->i_xattr_loc && !xbs->xattr_bh) { 3323 if (di->i_xattr_loc && !xbs->xattr_bh) {
2724 ret = ocfs2_xattr_block_find(inode, 3324 ret = ocfs2_xattr_block_find(inode,
2725 xi->name_index, 3325 xi->xi_name_index,
2726 xi->name, xbs); 3326 xi->xi_name, xbs);
2727 if (ret) 3327 if (ret)
2728 goto out; 3328 goto out;
2729 3329
@@ -2762,8 +3362,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2762 * If succeed and that extended attribute 3362 * If succeed and that extended attribute
2763 * existing in inode, we will remove it. 3363 * existing in inode, we will remove it.
2764 */ 3364 */
2765 xi->value = NULL; 3365 xi->xi_value = NULL;
2766 xi->value_len = 0; 3366 xi->xi_value_len = 0;
2767 xbs->not_found = -ENODATA; 3367 xbs->not_found = -ENODATA;
2768 ret = ocfs2_calc_xattr_set_need(inode, 3368 ret = ocfs2_calc_xattr_set_need(inode,
2769 di, 3369 di,
@@ -2829,10 +3429,11 @@ int ocfs2_xattr_set_handle(handle_t *handle,
2829 int ret; 3429 int ret;
2830 3430
2831 struct ocfs2_xattr_info xi = { 3431 struct ocfs2_xattr_info xi = {
2832 .name_index = name_index, 3432 .xi_name_index = name_index,
2833 .name = name, 3433 .xi_name = name,
2834 .value = value, 3434 .xi_name_len = strlen(name),
2835 .value_len = value_len, 3435 .xi_value = value,
3436 .xi_value_len = value_len,
2836 }; 3437 };
2837 3438
2838 struct ocfs2_xattr_search xis = { 3439 struct ocfs2_xattr_search xis = {
@@ -2912,10 +3513,11 @@ int ocfs2_xattr_set(struct inode *inode,
2912 struct ocfs2_refcount_tree *ref_tree = NULL; 3513 struct ocfs2_refcount_tree *ref_tree = NULL;
2913 3514
2914 struct ocfs2_xattr_info xi = { 3515 struct ocfs2_xattr_info xi = {
2915 .name_index = name_index, 3516 .xi_name_index = name_index,
2916 .name = name, 3517 .xi_name = name,
2917 .value = value, 3518 .xi_name_len = strlen(name),
2918 .value_len = value_len, 3519 .xi_value = value,
3520 .xi_value_len = value_len,
2919 }; 3521 };
2920 3522
2921 struct ocfs2_xattr_search xis = { 3523 struct ocfs2_xattr_search xis = {
@@ -3759,7 +4361,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
3759 struct ocfs2_xattr_bucket *bucket) 4361 struct ocfs2_xattr_bucket *bucket)
3760{ 4362{
3761 int ret, i; 4363 int ret, i;
3762 size_t end, offset, len, value_len; 4364 size_t end, offset, len;
3763 struct ocfs2_xattr_header *xh; 4365 struct ocfs2_xattr_header *xh;
3764 char *entries, *buf, *bucket_buf = NULL; 4366 char *entries, *buf, *bucket_buf = NULL;
3765 u64 blkno = bucket_blkno(bucket); 4367 u64 blkno = bucket_blkno(bucket);
@@ -3813,12 +4415,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
3813 end = OCFS2_XATTR_BUCKET_SIZE; 4415 end = OCFS2_XATTR_BUCKET_SIZE;
3814 for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) { 4416 for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
3815 offset = le16_to_cpu(xe->xe_name_offset); 4417 offset = le16_to_cpu(xe->xe_name_offset);
3816 if (ocfs2_xattr_is_local(xe)) 4418 len = namevalue_size_xe(xe);
3817 value_len = OCFS2_XATTR_SIZE(
3818 le64_to_cpu(xe->xe_value_size));
3819 else
3820 value_len = OCFS2_XATTR_ROOT_SIZE;
3821 len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len;
3822 4419
3823 /* 4420 /*
3824 * We must make sure that the name/value pair 4421 * We must make sure that the name/value pair
@@ -4007,7 +4604,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4007 int new_bucket_head) 4604 int new_bucket_head)
4008{ 4605{
4009 int ret, i; 4606 int ret, i;
4010 int count, start, len, name_value_len = 0, xe_len, name_offset = 0; 4607 int count, start, len, name_value_len = 0, name_offset = 0;
4011 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL; 4608 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
4012 struct ocfs2_xattr_header *xh; 4609 struct ocfs2_xattr_header *xh;
4013 struct ocfs2_xattr_entry *xe; 4610 struct ocfs2_xattr_entry *xe;
@@ -4098,13 +4695,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4098 name_value_len = 0; 4695 name_value_len = 0;
4099 for (i = 0; i < start; i++) { 4696 for (i = 0; i < start; i++) {
4100 xe = &xh->xh_entries[i]; 4697 xe = &xh->xh_entries[i];
4101 xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len); 4698 name_value_len += namevalue_size_xe(xe);
4102 if (ocfs2_xattr_is_local(xe))
4103 xe_len +=
4104 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4105 else
4106 xe_len += OCFS2_XATTR_ROOT_SIZE;
4107 name_value_len += xe_len;
4108 if (le16_to_cpu(xe->xe_name_offset) < name_offset) 4699 if (le16_to_cpu(xe->xe_name_offset) < name_offset)
4109 name_offset = le16_to_cpu(xe->xe_name_offset); 4700 name_offset = le16_to_cpu(xe->xe_name_offset);
4110 } 4701 }
@@ -4134,12 +4725,6 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4134 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); 4725 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
4135 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { 4726 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
4136 xe = &xh->xh_entries[i]; 4727 xe = &xh->xh_entries[i];
4137 xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
4138 if (ocfs2_xattr_is_local(xe))
4139 xe_len +=
4140 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4141 else
4142 xe_len += OCFS2_XATTR_ROOT_SIZE;
4143 if (le16_to_cpu(xe->xe_name_offset) < 4728 if (le16_to_cpu(xe->xe_name_offset) <
4144 le16_to_cpu(xh->xh_free_start)) 4729 le16_to_cpu(xh->xh_free_start))
4145 xh->xh_free_start = xe->xe_name_offset; 4730 xh->xh_free_start = xe->xe_name_offset;
@@ -4751,195 +5336,6 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
4751} 5336}
4752 5337
4753/* 5338/*
4754 * Handle the normal xattr set, including replace, delete and new.
4755 *
4756 * Note: "local" indicates the real data's locality. So we can't
4757 * judge its bucket locality just by its length.
4758 */
4759static void ocfs2_xattr_set_entry_normal(struct inode *inode,
4760 struct ocfs2_xattr_info *xi,
4761 struct ocfs2_xattr_search *xs,
4762 u32 name_hash,
4763 int local)
4764{
4765 struct ocfs2_xattr_entry *last, *xe;
4766 int name_len = strlen(xi->name);
4767 struct ocfs2_xattr_header *xh = xs->header;
4768 u16 count = le16_to_cpu(xh->xh_count), start;
4769 size_t blocksize = inode->i_sb->s_blocksize;
4770 char *val;
4771 size_t offs, size, new_size;
4772
4773 last = &xh->xh_entries[count];
4774 if (!xs->not_found) {
4775 xe = xs->here;
4776 offs = le16_to_cpu(xe->xe_name_offset);
4777 if (ocfs2_xattr_is_local(xe))
4778 size = OCFS2_XATTR_SIZE(name_len) +
4779 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4780 else
4781 size = OCFS2_XATTR_SIZE(name_len) +
4782 OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
4783
4784 /*
4785 * If the new value will be stored outside, xi->value has been
4786 * initialized as an empty ocfs2_xattr_value_root, and the same
4787 * goes with xi->value_len, so we can set new_size safely here.
4788 * See ocfs2_xattr_set_in_bucket.
4789 */
4790 new_size = OCFS2_XATTR_SIZE(name_len) +
4791 OCFS2_XATTR_SIZE(xi->value_len);
4792
4793 le16_add_cpu(&xh->xh_name_value_len, -size);
4794 if (xi->value) {
4795 if (new_size > size)
4796 goto set_new_name_value;
4797
4798 /* Now replace the old value with new one. */
4799 if (local)
4800 xe->xe_value_size = cpu_to_le64(xi->value_len);
4801 else
4802 xe->xe_value_size = 0;
4803
4804 val = ocfs2_xattr_bucket_get_val(inode,
4805 xs->bucket, offs);
4806 memset(val + OCFS2_XATTR_SIZE(name_len), 0,
4807 size - OCFS2_XATTR_SIZE(name_len));
4808 if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
4809 memcpy(val + OCFS2_XATTR_SIZE(name_len),
4810 xi->value, xi->value_len);
4811
4812 le16_add_cpu(&xh->xh_name_value_len, new_size);
4813 ocfs2_xattr_set_local(xe, local);
4814 return;
4815 } else {
4816 /*
4817 * Remove the old entry if there is more than one.
4818 * We don't remove the last entry so that we can
4819 * use it to indicate the hash value of the empty
4820 * bucket.
4821 */
4822 last -= 1;
4823 le16_add_cpu(&xh->xh_count, -1);
4824 if (xh->xh_count) {
4825 memmove(xe, xe + 1,
4826 (void *)last - (void *)xe);
4827 memset(last, 0,
4828 sizeof(struct ocfs2_xattr_entry));
4829 } else
4830 xh->xh_free_start =
4831 cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
4832
4833 return;
4834 }
4835 } else {
4836 /* find a new entry for insert. */
4837 int low = 0, high = count - 1, tmp;
4838 struct ocfs2_xattr_entry *tmp_xe;
4839
4840 while (low <= high && count) {
4841 tmp = (low + high) / 2;
4842 tmp_xe = &xh->xh_entries[tmp];
4843
4844 if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
4845 low = tmp + 1;
4846 else if (name_hash <
4847 le32_to_cpu(tmp_xe->xe_name_hash))
4848 high = tmp - 1;
4849 else {
4850 low = tmp;
4851 break;
4852 }
4853 }
4854
4855 xe = &xh->xh_entries[low];
4856 if (low != count)
4857 memmove(xe + 1, xe, (void *)last - (void *)xe);
4858
4859 le16_add_cpu(&xh->xh_count, 1);
4860 memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
4861 xe->xe_name_hash = cpu_to_le32(name_hash);
4862 xe->xe_name_len = name_len;
4863 ocfs2_xattr_set_type(xe, xi->name_index);
4864 }
4865
4866set_new_name_value:
4867 /* Insert the new name+value. */
4868 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len);
4869
4870 /*
4871 * We must make sure that the name/value pair
4872 * exists in the same block.
4873 */
4874 offs = le16_to_cpu(xh->xh_free_start);
4875 start = offs - size;
4876
4877 if (start >> inode->i_sb->s_blocksize_bits !=
4878 (offs - 1) >> inode->i_sb->s_blocksize_bits) {
4879 offs = offs - offs % blocksize;
4880 xh->xh_free_start = cpu_to_le16(offs);
4881 }
4882
4883 val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size);
4884 xe->xe_name_offset = cpu_to_le16(offs - size);
4885
4886 memset(val, 0, size);
4887 memcpy(val, xi->name, name_len);
4888 memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len);
4889
4890 xe->xe_value_size = cpu_to_le64(xi->value_len);
4891 ocfs2_xattr_set_local(xe, local);
4892 xs->here = xe;
4893 le16_add_cpu(&xh->xh_free_start, -size);
4894 le16_add_cpu(&xh->xh_name_value_len, size);
4895
4896 return;
4897}
4898
4899/*
4900 * Set the xattr entry in the specified bucket.
4901 * The bucket is indicated by xs->bucket and it should have enough
4902 * space for the xattr insertion.
4903 */
4904static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
4905 handle_t *handle,
4906 struct ocfs2_xattr_info *xi,
4907 struct ocfs2_xattr_search *xs,
4908 u32 name_hash,
4909 int local)
4910{
4911 int ret;
4912 u64 blkno;
4913
4914 mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
4915 (unsigned long)xi->value_len, xi->name_index,
4916 (unsigned long long)bucket_blkno(xs->bucket));
4917
4918 if (!xs->bucket->bu_bhs[1]) {
4919 blkno = bucket_blkno(xs->bucket);
4920 ocfs2_xattr_bucket_relse(xs->bucket);
4921 ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
4922 if (ret) {
4923 mlog_errno(ret);
4924 goto out;
4925 }
4926 }
4927
4928 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
4929 OCFS2_JOURNAL_ACCESS_WRITE);
4930 if (ret < 0) {
4931 mlog_errno(ret);
4932 goto out;
4933 }
4934
4935 ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
4936 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
4937
4938out:
4939 return ret;
4940}
4941
4942/*
4943 * Truncate the specified xe_off entry in xattr bucket. 5339 * Truncate the specified xe_off entry in xattr bucket.
4944 * bucket is indicated by header_bh and len is the new length. 5340 * bucket is indicated by header_bh and len is the new length.
4945 * Both the ocfs2_xattr_value_root and the entry will be updated here. 5341 * Both the ocfs2_xattr_value_root and the entry will be updated here.
@@ -5009,66 +5405,6 @@ out:
5009 return ret; 5405 return ret;
5010} 5406}
5011 5407
5012static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
5013 struct ocfs2_xattr_search *xs,
5014 int len,
5015 struct ocfs2_xattr_set_ctxt *ctxt)
5016{
5017 int ret, offset;
5018 struct ocfs2_xattr_entry *xe = xs->here;
5019 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
5020
5021 BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
5022
5023 offset = xe - xh->xh_entries;
5024 ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket,
5025 offset, len, ctxt);
5026 if (ret)
5027 mlog_errno(ret);
5028
5029 return ret;
5030}
5031
5032static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
5033 handle_t *handle,
5034 struct ocfs2_xattr_search *xs,
5035 char *val,
5036 int value_len)
5037{
5038 int ret, offset, block_off;
5039 struct ocfs2_xattr_value_root *xv;
5040 struct ocfs2_xattr_entry *xe = xs->here;
5041 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
5042 void *base;
5043 struct ocfs2_xattr_value_buf vb = {
5044 .vb_access = ocfs2_journal_access,
5045 };
5046
5047 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
5048
5049 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh,
5050 xe - xh->xh_entries,
5051 &block_off,
5052 &offset);
5053 if (ret) {
5054 mlog_errno(ret);
5055 goto out;
5056 }
5057
5058 base = bucket_block(xs->bucket, block_off);
5059 xv = (struct ocfs2_xattr_value_root *)(base + offset +
5060 OCFS2_XATTR_SIZE(xe->xe_name_len));
5061
5062 vb.vb_xv = xv;
5063 vb.vb_bh = xs->bucket->bu_bhs[block_off];
5064 ret = __ocfs2_xattr_set_value_outside(inode, handle,
5065 &vb, val, value_len);
5066 if (ret)
5067 mlog_errno(ret);
5068out:
5069 return ret;
5070}
5071
5072static int ocfs2_rm_xattr_cluster(struct inode *inode, 5408static int ocfs2_rm_xattr_cluster(struct inode *inode,
5073 struct buffer_head *root_bh, 5409 struct buffer_head *root_bh,
5074 u64 blkno, 5410 u64 blkno,
@@ -5167,128 +5503,6 @@ out:
5167 return ret; 5503 return ret;
5168} 5504}
5169 5505
5170static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
5171 handle_t *handle,
5172 struct ocfs2_xattr_search *xs)
5173{
5174 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
5175 struct ocfs2_xattr_entry *last = &xh->xh_entries[
5176 le16_to_cpu(xh->xh_count) - 1];
5177 int ret = 0;
5178
5179 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
5180 OCFS2_JOURNAL_ACCESS_WRITE);
5181 if (ret) {
5182 mlog_errno(ret);
5183 return;
5184 }
5185
5186 /* Remove the old entry. */
5187 memmove(xs->here, xs->here + 1,
5188 (void *)last - (void *)xs->here);
5189 memset(last, 0, sizeof(struct ocfs2_xattr_entry));
5190 le16_add_cpu(&xh->xh_count, -1);
5191
5192 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
5193}
5194
5195/*
5196 * Set the xattr name/value in the bucket specified in xs.
5197 *
5198 * As the new value in xi may be stored in the bucket or in an outside cluster,
5199 * we divide the whole process into 4 steps:
5200 * 1. insert name/value in the bucket(ocfs2_xattr_set_entry_in_bucket)
5201 * 2. truncation of the outside cluster(ocfs2_xattr_bucket_value_truncate_xs)
5202 * 3. Set the value to the outside cluster(ocfs2_xattr_bucket_set_value_outside)
5203 * 4. If the clusters for the new outside value can't be allocated, we need
5204 * to free the xattr we allocated in set.
5205 */
5206static int ocfs2_xattr_set_in_bucket(struct inode *inode,
5207 struct ocfs2_xattr_info *xi,
5208 struct ocfs2_xattr_search *xs,
5209 struct ocfs2_xattr_set_ctxt *ctxt)
5210{
5211 int ret, local = 1;
5212 size_t value_len;
5213 char *val = (char *)xi->value;
5214 struct ocfs2_xattr_entry *xe = xs->here;
5215 u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name,
5216 strlen(xi->name));
5217
5218 if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
5219 /*
5220 * We need to truncate the xattr storage first.
5221 *
5222 * If both the old and new value are stored to
5223 * outside block, we only need to truncate
5224 * the storage and then set the value outside.
5225 *
5226 * If the new value should be stored within block,
5227 * we should free all the outside block first and
5228 * the modification to the xattr block will be done
5229 * by following steps.
5230 */
5231 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
5232 value_len = xi->value_len;
5233 else
5234 value_len = 0;
5235
5236 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
5237 value_len,
5238 ctxt);
5239 if (ret)
5240 goto out;
5241
5242 if (value_len)
5243 goto set_value_outside;
5244 }
5245
5246 value_len = xi->value_len;
5247 /* So we have to handle the inside block change now. */
5248 if (value_len > OCFS2_XATTR_INLINE_SIZE) {
5249 /*
5250 * If the new value will be stored outside of block,
5251 * initialize a new empty value root and insert it first.
5252 */
5253 local = 0;
5254 xi->value = &def_xv;
5255 xi->value_len = OCFS2_XATTR_ROOT_SIZE;
5256 }
5257
5258 ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
5259 name_hash, local);
5260 if (ret) {
5261 mlog_errno(ret);
5262 goto out;
5263 }
5264
5265 if (value_len <= OCFS2_XATTR_INLINE_SIZE)
5266 goto out;
5267
5268 /* allocate the space now for the outside block storage. */
5269 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
5270 value_len, ctxt);
5271 if (ret) {
5272 mlog_errno(ret);
5273
5274 if (xs->not_found) {
5275 /*
5276 * We can't allocate enough clusters for outside
5277 * storage and we have allocated xattr already,
5278 * so need to remove it.
5279 */
5280 ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs);
5281 }
5282 goto out;
5283 }
5284
5285set_value_outside:
5286 ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle,
5287 xs, val, value_len);
5288out:
5289 return ret;
5290}
5291
5292/* 5506/*
5293 * check whether the xattr bucket is filled up with the same hash value. 5507 * check whether the xattr bucket is filled up with the same hash value.
5294 * If we want to insert the xattr with the same hash, return -ENOSPC. 5508 * If we want to insert the xattr with the same hash, return -ENOSPC.
@@ -5317,156 +5531,116 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
5317 return 0; 5531 return 0;
5318} 5532}
5319 5533
5320static int ocfs2_xattr_set_entry_index_block(struct inode *inode, 5534/*
5321 struct ocfs2_xattr_info *xi, 5535 * Try to set the entry in the current bucket. If we fail, the caller
5322 struct ocfs2_xattr_search *xs, 5536 * will handle getting us another bucket.
5323 struct ocfs2_xattr_set_ctxt *ctxt) 5537 */
5538static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
5539 struct ocfs2_xattr_info *xi,
5540 struct ocfs2_xattr_search *xs,
5541 struct ocfs2_xattr_set_ctxt *ctxt)
5324{ 5542{
5325 struct ocfs2_xattr_header *xh; 5543 int ret;
5326 struct ocfs2_xattr_entry *xe; 5544 struct ocfs2_xa_loc loc;
5327 u16 count, header_size, xh_free_start;
5328 int free, max_free, need, old;
5329 size_t value_size = 0, name_len = strlen(xi->name);
5330 size_t blocksize = inode->i_sb->s_blocksize;
5331 int ret, allocation = 0;
5332
5333 mlog_entry("Set xattr %s in xattr index block\n", xi->name);
5334
5335try_again:
5336 xh = xs->header;
5337 count = le16_to_cpu(xh->xh_count);
5338 xh_free_start = le16_to_cpu(xh->xh_free_start);
5339 header_size = sizeof(struct ocfs2_xattr_header) +
5340 count * sizeof(struct ocfs2_xattr_entry);
5341 max_free = OCFS2_XATTR_BUCKET_SIZE - header_size -
5342 le16_to_cpu(xh->xh_name_value_len) - OCFS2_XATTR_HEADER_GAP;
5343
5344 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
5345 "of %u which exceeds block size\n",
5346 (unsigned long long)bucket_blkno(xs->bucket),
5347 header_size);
5348 5545
5349 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) 5546 mlog_entry("Set xattr %s in xattr bucket\n", xi->xi_name);
5350 value_size = OCFS2_XATTR_ROOT_SIZE;
5351 else if (xi->value)
5352 value_size = OCFS2_XATTR_SIZE(xi->value_len);
5353 5547
5354 if (xs->not_found) 5548 ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
5355 need = sizeof(struct ocfs2_xattr_entry) + 5549 xs->not_found ? NULL : xs->here);
5356 OCFS2_XATTR_SIZE(name_len) + value_size; 5550 ret = ocfs2_xa_set(&loc, xi, ctxt);
5357 else { 5551 if (!ret) {
5358 need = value_size + OCFS2_XATTR_SIZE(name_len); 5552 xs->here = loc.xl_entry;
5553 goto out;
5554 }
5555 if (ret != -ENOSPC) {
5556 mlog_errno(ret);
5557 goto out;
5558 }
5359 5559
5360 /* 5560 /* Ok, we need space. Let's try defragmenting the bucket. */
5361 * We only replace the old value if the new length is smaller 5561 ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
5362 * than the old one. Otherwise we will allocate new space in the 5562 xs->bucket);
5363 * bucket to store it. 5563 if (ret) {
5364 */ 5564 mlog_errno(ret);
5365 xe = xs->here; 5565 goto out;
5366 if (ocfs2_xattr_is_local(xe)) 5566 }
5367 old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
5368 else
5369 old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
5370 5567
5371 if (old >= value_size) 5568 ret = ocfs2_xa_set(&loc, xi, ctxt);
5372 need = 0; 5569 if (!ret) {
5570 xs->here = loc.xl_entry;
5571 goto out;
5373 } 5572 }
5573 if (ret != -ENOSPC)
5574 mlog_errno(ret);
5374 5575
5375 free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP;
5376 /*
5377 * We need to make sure the new name/value pair
5378 * can exist in the same block.
5379 */
5380 if (xh_free_start % blocksize < need)
5381 free -= xh_free_start % blocksize;
5382
5383 mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
5384 "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
5385 " %u\n", xs->not_found,
5386 (unsigned long long)bucket_blkno(xs->bucket),
5387 free, need, max_free, le16_to_cpu(xh->xh_free_start),
5388 le16_to_cpu(xh->xh_name_value_len));
5389
5390 if (free < need ||
5391 (xs->not_found &&
5392 count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) {
5393 if (need <= max_free &&
5394 count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
5395 /*
5396 * We can create the space by defragment. Since only the
5397 * name/value will be moved, the xe shouldn't be changed
5398 * in xs.
5399 */
5400 ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
5401 xs->bucket);
5402 if (ret) {
5403 mlog_errno(ret);
5404 goto out;
5405 }
5406 5576
5407 xh_free_start = le16_to_cpu(xh->xh_free_start); 5577out:
5408 free = xh_free_start - header_size 5578 mlog_exit(ret);
5409 - OCFS2_XATTR_HEADER_GAP; 5579 return ret;
5410 if (xh_free_start % blocksize < need) 5580}
5411 free -= xh_free_start % blocksize;
5412 5581
5413 if (free >= need) 5582static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
5414 goto xattr_set; 5583 struct ocfs2_xattr_info *xi,
5584 struct ocfs2_xattr_search *xs,
5585 struct ocfs2_xattr_set_ctxt *ctxt)
5586{
5587 int ret;
5415 5588
5416 mlog(0, "Can't get enough space for xattr insert by " 5589 mlog_entry("Set xattr %s in xattr index block\n", xi->xi_name);
5417 "defragment. Need %u bytes, but we have %d, so "
5418 "allocate new bucket for it.\n", need, free);
5419 }
5420 5590
5421 /* 5591 ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
5422 * We have to add new buckets or clusters and one 5592 if (!ret)
5423 * allocation should leave us enough space for insert. 5593 goto out;
5424 */ 5594 if (ret != -ENOSPC) {
5425 BUG_ON(allocation); 5595 mlog_errno(ret);
5596 goto out;
5597 }
5426 5598
5427 /* 5599 /* Ack, need more space. Let's try to get another bucket! */
5428 * We do not allow for overlapping ranges between buckets. And
5429 * the maximum number of collisions we will allow for then is
5430 * one bucket's worth, so check it here whether we need to
5431 * add a new bucket for the insert.
5432 */
5433 ret = ocfs2_check_xattr_bucket_collision(inode,
5434 xs->bucket,
5435 xi->name);
5436 if (ret) {
5437 mlog_errno(ret);
5438 goto out;
5439 }
5440 5600
5441 ret = ocfs2_add_new_xattr_bucket(inode, 5601 /*
5442 xs->xattr_bh, 5602 * We do not allow for overlapping ranges between buckets. And
5603 * the maximum number of collisions we will allow for then is
5604 * one bucket's worth, so check it here whether we need to
5605 * add a new bucket for the insert.
5606 */
5607 ret = ocfs2_check_xattr_bucket_collision(inode,
5443 xs->bucket, 5608 xs->bucket,
5444 ctxt); 5609 xi->xi_name);
5445 if (ret) { 5610 if (ret) {
5446 mlog_errno(ret); 5611 mlog_errno(ret);
5447 goto out; 5612 goto out;
5448 } 5613 }
5449 5614
5450 /* 5615 ret = ocfs2_add_new_xattr_bucket(inode,
5451 * ocfs2_add_new_xattr_bucket() will have updated 5616 xs->xattr_bh,
5452 * xs->bucket if it moved, but it will not have updated 5617 xs->bucket,
5453 * any of the other search fields. Thus, we drop it and 5618 ctxt);
5454 * re-search. Everything should be cached, so it'll be 5619 if (ret) {
5455 * quick. 5620 mlog_errno(ret);
5456 */ 5621 goto out;
5457 ocfs2_xattr_bucket_relse(xs->bucket);
5458 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
5459 xi->name_index,
5460 xi->name, xs);
5461 if (ret && ret != -ENODATA)
5462 goto out;
5463 xs->not_found = ret;
5464 allocation = 1;
5465 goto try_again;
5466 } 5622 }
5467 5623
5468xattr_set: 5624 /*
5469 ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt); 5625 * ocfs2_add_new_xattr_bucket() will have updated
5626 * xs->bucket if it moved, but it will not have updated
5627 * any of the other search fields. Thus, we drop it and
5628 * re-search. Everything should be cached, so it'll be
5629 * quick.
5630 */
5631 ocfs2_xattr_bucket_relse(xs->bucket);
5632 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
5633 xi->xi_name_index,
5634 xi->xi_name, xs);
5635 if (ret && ret != -ENODATA)
5636 goto out;
5637 xs->not_found = ret;
5638
5639 /* Ok, we have a new bucket, let's try again */
5640 ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
5641 if (ret && (ret != -ENOSPC))
5642 mlog_errno(ret);
5643
5470out: 5644out:
5471 mlog_exit(ret); 5645 mlog_exit(ret);
5472 return ret; 5646 return ret;
@@ -5678,7 +5852,7 @@ static int ocfs2_prepare_refcount_xattr(struct inode *inode,
5678 * refcount tree, and make the original extent become 3. So we will need 5852 * refcount tree, and make the original extent become 3. So we will need
5679 * 2 * cluster more extent recs at most. 5853 * 2 * cluster more extent recs at most.
5680 */ 5854 */
5681 if (!xi->value || xi->value_len <= OCFS2_XATTR_INLINE_SIZE) { 5855 if (!xi->xi_value || xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) {
5682 5856
5683 ret = ocfs2_refcounted_xattr_delete_need(inode, 5857 ret = ocfs2_refcounted_xattr_delete_need(inode,
5684 &(*ref_tree)->rf_ci, 5858 &(*ref_tree)->rf_ci,
@@ -6354,33 +6528,33 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6354 int indexed) 6528 int indexed)
6355{ 6529{
6356 int ret; 6530 int ret;
6357 handle_t *handle;
6358 struct ocfs2_alloc_context *meta_ac;
6359 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 6531 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6532 struct ocfs2_xattr_set_ctxt ctxt;
6360 6533
6361 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); 6534 memset(&ctxt, 0, sizeof(ctxt));
6535 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &ctxt.meta_ac);
6362 if (ret < 0) { 6536 if (ret < 0) {
6363 mlog_errno(ret); 6537 mlog_errno(ret);
6364 return ret; 6538 return ret;
6365 } 6539 }
6366 6540
6367 handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS); 6541 ctxt.handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
6368 if (IS_ERR(handle)) { 6542 if (IS_ERR(ctxt.handle)) {
6369 ret = PTR_ERR(handle); 6543 ret = PTR_ERR(ctxt.handle);
6370 mlog_errno(ret); 6544 mlog_errno(ret);
6371 goto out; 6545 goto out;
6372 } 6546 }
6373 6547
6374 mlog(0, "create new xattr block for inode %llu, index = %d\n", 6548 mlog(0, "create new xattr block for inode %llu, index = %d\n",
6375 (unsigned long long)fe_bh->b_blocknr, indexed); 6549 (unsigned long long)fe_bh->b_blocknr, indexed);
6376 ret = ocfs2_create_xattr_block(handle, inode, fe_bh, 6550 ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed,
6377 meta_ac, ret_bh, indexed); 6551 ret_bh);
6378 if (ret) 6552 if (ret)
6379 mlog_errno(ret); 6553 mlog_errno(ret);
6380 6554
6381 ocfs2_commit_trans(osb, handle); 6555 ocfs2_commit_trans(osb, ctxt.handle);
6382out: 6556out:
6383 ocfs2_free_alloc_context(meta_ac); 6557 ocfs2_free_alloc_context(ctxt.meta_ac);
6384 return ret; 6558 return ret;
6385} 6559}
6386 6560
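ocfs2_create_empty_xattr_block() now builds a throwaway ocfs2_xattr_set_ctxt rather than passing the handle and allocator separately. The struct lives in xattr.h; at this point in history it carries roughly the following (a reconstruction for orientation, not quoted from this diff):

struct ocfs2_xattr_set_ctxt {
	handle_t			*handle;
	struct ocfs2_alloc_context	*meta_ac;
	struct ocfs2_alloc_context	*data_ac;
	struct ocfs2_cached_dealloc_ctxt dealloc;
};

Bundling these lets helpers like ocfs2_create_xattr_block() take one argument whether called from the full set path (complete ctxt) or from here (handle plus meta_ac only).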
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index f3b7c1541f3a..75d9b5ba1d45 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -11,6 +11,7 @@
11#include <linux/parser.h> 11#include <linux/parser.h>
12#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
13#include <linux/vmalloc.h> 13#include <linux/vmalloc.h>
14#include <linux/writeback.h>
14#include <linux/crc-itu-t.h> 15#include <linux/crc-itu-t.h>
15#include "omfs.h" 16#include "omfs.h"
16 17
@@ -89,7 +90,7 @@ static void omfs_update_checksums(struct omfs_inode *oi)
89 oi->i_head.h_check_xor = xor; 90 oi->i_head.h_check_xor = xor;
90} 91}
91 92
92static int omfs_write_inode(struct inode *inode, int wait) 93static int __omfs_write_inode(struct inode *inode, int wait)
93{ 94{
94 struct omfs_inode *oi; 95 struct omfs_inode *oi;
95 struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); 96 struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
@@ -162,9 +163,14 @@ out:
162 return ret; 163 return ret;
163} 164}
164 165
166static int omfs_write_inode(struct inode *inode, struct writeback_control *wbc)
167{
168 return __omfs_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
169}
170
165int omfs_sync_inode(struct inode *inode) 171int omfs_sync_inode(struct inode *inode)
166{ 172{
167 return omfs_write_inode(inode, 1); 173 return __omfs_write_inode(inode, 1);
168} 174}
169 175
170/* 176/*
diff --git a/fs/open.c b/fs/open.c
index 040cef72bc00..e17f54454b50 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -8,7 +8,6 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/file.h> 9#include <linux/file.h>
10#include <linux/fdtable.h> 10#include <linux/fdtable.h>
11#include <linux/quotaops.h>
12#include <linux/fsnotify.h> 11#include <linux/fsnotify.h>
13#include <linux/module.h> 12#include <linux/module.h>
14#include <linux/slab.h> 13#include <linux/slab.h>
@@ -271,17 +270,15 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
271 * Make sure that there are no leases. get_write_access() protects 270 * Make sure that there are no leases. get_write_access() protects
272 * against the truncate racing with a lease-granting setlease(). 271 * against the truncate racing with a lease-granting setlease().
273 */ 272 */
274 error = break_lease(inode, FMODE_WRITE); 273 error = break_lease(inode, O_WRONLY);
275 if (error) 274 if (error)
276 goto put_write_and_out; 275 goto put_write_and_out;
277 276
278 error = locks_verify_truncate(inode, NULL, length); 277 error = locks_verify_truncate(inode, NULL, length);
279 if (!error) 278 if (!error)
280 error = security_path_truncate(&path, length, 0); 279 error = security_path_truncate(&path, length, 0);
281 if (!error) { 280 if (!error)
282 vfs_dq_init(inode);
283 error = do_truncate(path.dentry, length, 0, NULL); 281 error = do_truncate(path.dentry, length, 0, NULL);
284 }
285 282
286put_write_and_out: 283put_write_and_out:
287 put_write_access(inode); 284 put_write_access(inode);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 0028d2ef0662..90be97f1f5a8 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -31,14 +31,17 @@
31 */ 31 */
32#include <asm/unaligned.h> 32#include <asm/unaligned.h>
33 33
34#define SYS_IND(p) (get_unaligned(&p->sys_ind)) 34#define SYS_IND(p) get_unaligned(&p->sys_ind)
35#define NR_SECTS(p) ({ __le32 __a = get_unaligned(&p->nr_sects); \
36 le32_to_cpu(__a); \
37 })
38 35
39#define START_SECT(p) ({ __le32 __a = get_unaligned(&p->start_sect); \ 36static inline sector_t nr_sects(struct partition *p)
40 le32_to_cpu(__a); \ 37{
41 }) 38 return (sector_t)get_unaligned_le32(&p->nr_sects);
39}
40
41static inline sector_t start_sect(struct partition *p)
42{
43 return (sector_t)get_unaligned_le32(&p->start_sect);
44}
42 45
43static inline int is_extended_partition(struct partition *p) 46static inline int is_extended_partition(struct partition *p)
44{ 47{
@@ -104,13 +107,13 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
104 107
105static void 108static void
106parse_extended(struct parsed_partitions *state, struct block_device *bdev, 109parse_extended(struct parsed_partitions *state, struct block_device *bdev,
107 u32 first_sector, u32 first_size) 110 sector_t first_sector, sector_t first_size)
108{ 111{
109 struct partition *p; 112 struct partition *p;
110 Sector sect; 113 Sector sect;
111 unsigned char *data; 114 unsigned char *data;
112 u32 this_sector, this_size; 115 sector_t this_sector, this_size;
113 int sector_size = bdev_logical_block_size(bdev) / 512; 116 sector_t sector_size = bdev_logical_block_size(bdev) / 512;
114 int loopct = 0; /* number of links followed 117 int loopct = 0; /* number of links followed
115 without finding a data partition */ 118 without finding a data partition */
116 int i; 119 int i;
@@ -145,14 +148,14 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
145 * First process the data partition(s) 148 * First process the data partition(s)
146 */ 149 */
147 for (i=0; i<4; i++, p++) { 150 for (i=0; i<4; i++, p++) {
148 u32 offs, size, next; 151 sector_t offs, size, next;
149 if (!NR_SECTS(p) || is_extended_partition(p)) 152 if (!nr_sects(p) || is_extended_partition(p))
150 continue; 153 continue;
151 154
152 /* Check the 3rd and 4th entries - 155 /* Check the 3rd and 4th entries -
153 these sometimes contain random garbage */ 156 these sometimes contain random garbage */
154 offs = START_SECT(p)*sector_size; 157 offs = start_sect(p)*sector_size;
155 size = NR_SECTS(p)*sector_size; 158 size = nr_sects(p)*sector_size;
156 next = this_sector + offs; 159 next = this_sector + offs;
157 if (i >= 2) { 160 if (i >= 2) {
158 if (offs + size > this_size) 161 if (offs + size > this_size)
@@ -179,13 +182,13 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
179 */ 182 */
180 p -= 4; 183 p -= 4;
181 for (i=0; i<4; i++, p++) 184 for (i=0; i<4; i++, p++)
182 if (NR_SECTS(p) && is_extended_partition(p)) 185 if (nr_sects(p) && is_extended_partition(p))
183 break; 186 break;
184 if (i == 4) 187 if (i == 4)
185 goto done; /* nothing left to do */ 188 goto done; /* nothing left to do */
186 189
187 this_sector = first_sector + START_SECT(p) * sector_size; 190 this_sector = first_sector + start_sect(p) * sector_size;
188 this_size = NR_SECTS(p) * sector_size; 191 this_size = nr_sects(p) * sector_size;
189 put_dev_sector(sect); 192 put_dev_sector(sect);
190 } 193 }
191done: 194done:
@@ -197,7 +200,7 @@ done:
197 200
198static void 201static void
199parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, 202parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
200 u32 offset, u32 size, int origin) 203 sector_t offset, sector_t size, int origin)
201{ 204{
202#ifdef CONFIG_SOLARIS_X86_PARTITION 205#ifdef CONFIG_SOLARIS_X86_PARTITION
203 Sector sect; 206 Sector sect;
@@ -244,7 +247,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
244 */ 247 */
245static void 248static void
246parse_bsd(struct parsed_partitions *state, struct block_device *bdev, 249parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
247 u32 offset, u32 size, int origin, char *flavour, 250 sector_t offset, sector_t size, int origin, char *flavour,
248 int max_partitions) 251 int max_partitions)
249{ 252{
250 Sector sect; 253 Sector sect;
@@ -263,7 +266,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
263 if (le16_to_cpu(l->d_npartitions) < max_partitions) 266 if (le16_to_cpu(l->d_npartitions) < max_partitions)
264 max_partitions = le16_to_cpu(l->d_npartitions); 267 max_partitions = le16_to_cpu(l->d_npartitions);
265 for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) { 268 for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
266 u32 bsd_start, bsd_size; 269 sector_t bsd_start, bsd_size;
267 270
268 if (state->next == state->limit) 271 if (state->next == state->limit)
269 break; 272 break;
@@ -290,7 +293,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
290 293
291static void 294static void
292parse_freebsd(struct parsed_partitions *state, struct block_device *bdev, 295parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
293 u32 offset, u32 size, int origin) 296 sector_t offset, sector_t size, int origin)
294{ 297{
295#ifdef CONFIG_BSD_DISKLABEL 298#ifdef CONFIG_BSD_DISKLABEL
296 parse_bsd(state, bdev, offset, size, origin, 299 parse_bsd(state, bdev, offset, size, origin,
@@ -300,7 +303,7 @@ parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
300 303
301static void 304static void
302parse_netbsd(struct parsed_partitions *state, struct block_device *bdev, 305parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
303 u32 offset, u32 size, int origin) 306 sector_t offset, sector_t size, int origin)
304{ 307{
305#ifdef CONFIG_BSD_DISKLABEL 308#ifdef CONFIG_BSD_DISKLABEL
306 parse_bsd(state, bdev, offset, size, origin, 309 parse_bsd(state, bdev, offset, size, origin,
@@ -310,7 +313,7 @@ parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
310 313
311static void 314static void
312parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, 315parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
313 u32 offset, u32 size, int origin) 316 sector_t offset, sector_t size, int origin)
314{ 317{
315#ifdef CONFIG_BSD_DISKLABEL 318#ifdef CONFIG_BSD_DISKLABEL
316 parse_bsd(state, bdev, offset, size, origin, 319 parse_bsd(state, bdev, offset, size, origin,
@@ -324,7 +327,7 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
324 */ 327 */
325static void 328static void
326parse_unixware(struct parsed_partitions *state, struct block_device *bdev, 329parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
327 u32 offset, u32 size, int origin) 330 sector_t offset, sector_t size, int origin)
328{ 331{
329#ifdef CONFIG_UNIXWARE_DISKLABEL 332#ifdef CONFIG_UNIXWARE_DISKLABEL
330 Sector sect; 333 Sector sect;
@@ -348,7 +351,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
348 351
349 if (p->s_label != UNIXWARE_FS_UNUSED) 352 if (p->s_label != UNIXWARE_FS_UNUSED)
350 put_partition(state, state->next++, 353 put_partition(state, state->next++,
351 START_SECT(p), NR_SECTS(p)); 354 le32_to_cpu(p->start_sect),
355 le32_to_cpu(p->nr_sects));
352 p++; 356 p++;
353 } 357 }
354 put_dev_sector(sect); 358 put_dev_sector(sect);
@@ -363,7 +367,7 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
363 */ 367 */
364static void 368static void
365parse_minix(struct parsed_partitions *state, struct block_device *bdev, 369parse_minix(struct parsed_partitions *state, struct block_device *bdev,
366 u32 offset, u32 size, int origin) 370 sector_t offset, sector_t size, int origin)
367{ 371{
368#ifdef CONFIG_MINIX_SUBPARTITION 372#ifdef CONFIG_MINIX_SUBPARTITION
369 Sector sect; 373 Sector sect;
@@ -390,7 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
390 /* add each partition in use */ 394 /* add each partition in use */
391 if (SYS_IND(p) == MINIX_PARTITION) 395 if (SYS_IND(p) == MINIX_PARTITION)
392 put_partition(state, state->next++, 396 put_partition(state, state->next++,
393 START_SECT(p), NR_SECTS(p)); 397 start_sect(p), nr_sects(p));
394 } 398 }
395 printk(" >\n"); 399 printk(" >\n");
396 } 400 }
@@ -401,7 +405,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
401static struct { 405static struct {
402 unsigned char id; 406 unsigned char id;
403 void (*parse)(struct parsed_partitions *, struct block_device *, 407 void (*parse)(struct parsed_partitions *, struct block_device *,
404 u32, u32, int); 408 sector_t, sector_t, int);
405} subtypes[] = { 409} subtypes[] = {
406 {FREEBSD_PARTITION, parse_freebsd}, 410 {FREEBSD_PARTITION, parse_freebsd},
407 {NETBSD_PARTITION, parse_netbsd}, 411 {NETBSD_PARTITION, parse_netbsd},
@@ -415,7 +419,7 @@ static struct {
415 419
416int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) 420int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
417{ 421{
418 int sector_size = bdev_logical_block_size(bdev) / 512; 422 sector_t sector_size = bdev_logical_block_size(bdev) / 512;
419 Sector sect; 423 Sector sect;
420 unsigned char *data; 424 unsigned char *data;
421 struct partition *p; 425 struct partition *p;
@@ -483,14 +487,21 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
483 487
484 state->next = 5; 488 state->next = 5;
485 for (slot = 1 ; slot <= 4 ; slot++, p++) { 489 for (slot = 1 ; slot <= 4 ; slot++, p++) {
486 u32 start = START_SECT(p)*sector_size; 490 sector_t start = start_sect(p)*sector_size;
487 u32 size = NR_SECTS(p)*sector_size; 491 sector_t size = nr_sects(p)*sector_size;
488 if (!size) 492 if (!size)
489 continue; 493 continue;
490 if (is_extended_partition(p)) { 494 if (is_extended_partition(p)) {
491 /* prevent someone doing mkfs or mkswap on an 495 /*
492 extended partition, but leave room for LILO */ 496 * prevent someone doing mkfs or mkswap on an
493 put_partition(state, slot, start, size == 1 ? 1 : 2); 497 * extended partition, but leave room for LILO
498 * FIXME: this uses one logical sector for > 512b
499 * sector, although it may not be enough/proper.
500 */
501 sector_t n = 2;
502 n = min(size, max(sector_size, n));
503 put_partition(state, slot, start, n);
504
494 printk(" <"); 505 printk(" <");
495 parse_extended(state, bdev, start, size); 506 parse_extended(state, bdev, start, size);
496 printk(" >"); 507 printk(" >");
@@ -513,7 +524,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
513 unsigned char id = SYS_IND(p); 524 unsigned char id = SYS_IND(p);
514 int n; 525 int n;
515 526
516 if (!NR_SECTS(p)) 527 if (!nr_sects(p))
517 continue; 528 continue;
518 529
519 for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++) 530 for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
@@ -521,8 +532,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
521 532
522 if (!subtypes[n].parse) 533 if (!subtypes[n].parse)
523 continue; 534 continue;
524 subtypes[n].parse(state, bdev, START_SECT(p)*sector_size, 535 subtypes[n].parse(state, bdev, start_sect(p)*sector_size,
525 NR_SECTS(p)*sector_size, slot); 536 nr_sects(p)*sector_size, slot);
526 } 537 }
527 put_dev_sector(sect); 538 put_dev_sector(sect);
528 return 1; 539 return 1;
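
The msdos hunks above swap the old START_SECT()/NR_SECTS() macros for lower-case helpers that widen to sector_t before any sector_size multiplication, so partitions beyond the 32-bit sector boundary no longer truncate. The helpers themselves fall outside these hunks; a minimal compilable sketch, assuming the on-disk fields remain unaligned little-endian u32 values (sector_t and get_unaligned_le32() are stubbed here):

#include <stdint.h>

typedef uint64_t sector_t;                /* stand-in for the kernel type */

struct partition {
	/* ... boot indicator, CHS fields, system id ... */
	uint32_t start_sect;              /* little-endian, may be unaligned */
	uint32_t nr_sects;
} __attribute__((packed));

/* byte-wise load: safe for the unaligned on-disk layout */
static inline uint32_t get_unaligned_le32(const void *p)
{
	const uint8_t *b = p;
	return (uint32_t)b[0] | (uint32_t)b[1] << 8 |
	       (uint32_t)b[2] << 16 | (uint32_t)b[3] << 24;
}

/* Widen to sector_t before callers multiply by sector_size, so the
 * arithmetic in msdos_partition() cannot overflow 32 bits. */
static inline sector_t start_sect(const struct partition *p)
{
	return (sector_t)get_unaligned_le32(&p->start_sect);
}

static inline sector_t nr_sects(const struct partition *p)
{
	return (sector_t)get_unaligned_le32(&p->nr_sects);
}

The new placeholder sizing, n = min(size, max(sector_size, n)) with n starting at 2, keeps the extended-partition stub at two 512-byte units (room for LILO) or one logical sector on large-sector disks, and never larger than the partition itself.
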
diff --git a/fs/pnode.c b/fs/pnode.c
index 8d5f392ec3d3..5cc564a83149 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -86,7 +86,7 @@ static int do_make_slave(struct vfsmount *mnt)
86 86
87 /* 87 /*
88 * slave 'mnt' to a peer mount that has the 88 * slave 'mnt' to a peer mount that has the
89 * same root dentry. If none is available than 89 * same root dentry. If none is available then
90 * slave it to anything that is available. 90 * slave it to anything that is available.
91 */ 91 */
92 while ((peer_mnt = next_peer(peer_mnt)) != mnt && 92 while ((peer_mnt = next_peer(peer_mnt)) != mnt &&
@@ -147,6 +147,11 @@ void change_mnt_propagation(struct vfsmount *mnt, int type)
147 * get the next mount in the propagation tree. 147 * get the next mount in the propagation tree.
148 * @m: the mount seen last 148 * @m: the mount seen last
149 * @origin: the original mount from where the tree walk initiated 149 * @origin: the original mount from where the tree walk initiated
150 *
151 * Note that peer groups form contiguous segments of slave lists.
152 * We rely on that in get_source() to be able to find out if
153 * vfsmount found while iterating with propagation_next() is
154 * a peer of one we'd found earlier.
150 */ 155 */
151static struct vfsmount *propagation_next(struct vfsmount *m, 156static struct vfsmount *propagation_next(struct vfsmount *m,
152 struct vfsmount *origin) 157 struct vfsmount *origin)
@@ -186,10 +191,6 @@ static struct vfsmount *get_source(struct vfsmount *dest,
186{ 191{
187 struct vfsmount *p_last_src = NULL; 192 struct vfsmount *p_last_src = NULL;
188 struct vfsmount *p_last_dest = NULL; 193 struct vfsmount *p_last_dest = NULL;
189 *type = CL_PROPAGATION;
190
191 if (IS_MNT_SHARED(dest))
192 *type |= CL_MAKE_SHARED;
193 194
194 while (last_dest != dest->mnt_master) { 195 while (last_dest != dest->mnt_master) {
195 p_last_dest = last_dest; 196 p_last_dest = last_dest;
@@ -202,13 +203,18 @@ static struct vfsmount *get_source(struct vfsmount *dest,
202 do { 203 do {
203 p_last_dest = next_peer(p_last_dest); 204 p_last_dest = next_peer(p_last_dest);
204 } while (IS_MNT_NEW(p_last_dest)); 205 } while (IS_MNT_NEW(p_last_dest));
206 /* is that a peer of the earlier? */
207 if (dest == p_last_dest) {
208 *type = CL_MAKE_SHARED;
209 return p_last_src;
210 }
205 } 211 }
206 212 /* slave of the earlier, then */
207 if (dest != p_last_dest) { 213 *type = CL_SLAVE;
208 *type |= CL_SLAVE; 214 /* beginning of peer group among the slaves? */
209 return last_src; 215 if (IS_MNT_SHARED(dest))
210 } else 216 *type |= CL_MAKE_SHARED;
211 return p_last_src; 217 return last_src;
212} 218}
213 219
214/* 220/*
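
The rewritten get_source() now commits to a classification up front: a peer of an earlier mount gets exactly CL_MAKE_SHARED (and the peer's source, p_last_src), while everything else is a slave, CL_SLAVE, with CL_MAKE_SHARED OR-ed in only when the slave opens a new peer group among the slaves. A runnable toy of that decision, where is_peer stands in for the dest == p_last_dest test and dest_shared for IS_MNT_SHARED(dest):

#include <stdio.h>

#define CL_SLAVE        0x02
#define CL_MAKE_SHARED  0x08

static unsigned classify_clone(int is_peer, int dest_shared)
{
	if (is_peer)
		return CL_MAKE_SHARED;    /* peer of an earlier mount */
	/* slave of the earlier mount, possibly starting a peer group */
	return CL_SLAVE | (dest_shared ? CL_MAKE_SHARED : 0);
}

int main(void)
{
	printf("peer:         %#x\n", classify_clone(1, 0));
	printf("plain slave:  %#x\n", classify_clone(0, 0));
	printf("shared slave: %#x\n", classify_clone(0, 1));
	return 0;
}

This is what makes the catch-all CL_PROPAGATION flag (dropped from pnode.h below) unnecessary: the type is now always one of the explicit outcomes.
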
diff --git a/fs/pnode.h b/fs/pnode.h
index 958665d662af..1ea4ae1efcd3 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -21,12 +21,11 @@
21#define CL_SLAVE 0x02 21#define CL_SLAVE 0x02
22#define CL_COPY_ALL 0x04 22#define CL_COPY_ALL 0x04
23#define CL_MAKE_SHARED 0x08 23#define CL_MAKE_SHARED 0x08
24#define CL_PROPAGATION 0x10 24#define CL_PRIVATE 0x10
25#define CL_PRIVATE 0x20
26 25
27static inline void set_mnt_shared(struct vfsmount *mnt) 26static inline void set_mnt_shared(struct vfsmount *mnt)
28{ 27{
29 mnt->mnt_flags &= ~MNT_PNODE_MASK; 28 mnt->mnt_flags &= ~MNT_SHARED_MASK;
30 mnt->mnt_flags |= MNT_SHARED; 29 mnt->mnt_flags |= MNT_SHARED;
31} 30}
32 31
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 18e20feee251..aa8637b81028 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -273,7 +273,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
273 rcu_read_lock(); /* FIXME: is this correct? */ 273 rcu_read_lock(); /* FIXME: is this correct? */
274 qsize = atomic_read(&__task_cred(p)->user->sigpending); 274 qsize = atomic_read(&__task_cred(p)->user->sigpending);
275 rcu_read_unlock(); 275 rcu_read_unlock();
276 qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; 276 qlim = task_rlimit(p, RLIMIT_SIGPENDING);
277 unlock_task_sighand(p, &flags); 277 unlock_task_sighand(p, &flags);
278 } 278 }
279 279
@@ -420,7 +420,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
420 cutime = sig->cutime; 420 cutime = sig->cutime;
421 cstime = sig->cstime; 421 cstime = sig->cstime;
422 cgtime = sig->cgtime; 422 cgtime = sig->cgtime;
423 rsslim = sig->rlim[RLIMIT_RSS].rlim_cur; 423 rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
424 424
425 /* add up live thread stats at the group level */ 425 /* add up live thread stats at the group level */
426 if (whole) { 426 if (whole) {
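
Both proc changes above route rlimit reads through a single untorn load: task_rlimit() is presumably a thin wrapper that does ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur), and the do_task_stat() hunk applies the same ACCESS_ONCE() directly. A userspace rendition of the idiom, assuming the usual volatile-cast definition:

#include <stdio.h>

/* Userspace stand-in for the kernel's ACCESS_ONCE(): force exactly one
 * load through a volatile-qualified lvalue so the compiler cannot
 * re-read (and tear) the value under a concurrent writer. */
#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

struct rlim { unsigned long rlim_cur, rlim_max; };

static unsigned long read_cur_once(struct rlim *r)
{
	return ACCESS_ONCE(r->rlim_cur);  /* one stable snapshot */
}

int main(void)
{
	struct rlim r = { 8192, 16384 };
	printf("rlim_cur snapshot: %lu\n", read_cur_once(&r));
	return 0;
}
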
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 623e2ffb5d2b..b1f6e62773d3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -442,12 +442,13 @@ static const struct file_operations proc_lstats_operations = {
442unsigned long badness(struct task_struct *p, unsigned long uptime); 442unsigned long badness(struct task_struct *p, unsigned long uptime);
443static int proc_oom_score(struct task_struct *task, char *buffer) 443static int proc_oom_score(struct task_struct *task, char *buffer)
444{ 444{
445 unsigned long points; 445 unsigned long points = 0;
446 struct timespec uptime; 446 struct timespec uptime;
447 447
448 do_posix_clock_monotonic_gettime(&uptime); 448 do_posix_clock_monotonic_gettime(&uptime);
449 read_lock(&tasklist_lock); 449 read_lock(&tasklist_lock);
450 points = badness(task->group_leader, uptime.tv_sec); 450 if (pid_alive(task))
451 points = badness(task, uptime.tv_sec);
451 read_unlock(&tasklist_lock); 452 read_unlock(&tasklist_lock);
452 return sprintf(buffer, "%lu\n", points); 453 return sprintf(buffer, "%lu\n", points);
453} 454}
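
The proc_oom_score() fix initializes points and computes badness() only for a task that is still alive (and for the task itself rather than its group leader), so a racing exit reports 0 instead of returning an uninitialized value. The shape of the fix, as a runnable toy:

#include <stdio.h>

struct task { int alive; };

static unsigned long badness(const struct task *t)
{
	(void)t;
	return 42;                        /* stand-in score */
}

static unsigned long oom_score(const struct task *t)
{
	unsigned long points = 0;         /* safe default */

	if (t->alive)                     /* the pid_alive() guard */
		points = badness(t);
	return points;
}

int main(void)
{
	struct task dead = { 0 }, live = { 1 };
	printf("dead: %lu live: %lu\n", oom_score(&dead), oom_score(&live));
	return 0;
}
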
@@ -647,17 +648,11 @@ static int mounts_release(struct inode *inode, struct file *file)
647static unsigned mounts_poll(struct file *file, poll_table *wait) 648static unsigned mounts_poll(struct file *file, poll_table *wait)
648{ 649{
649 struct proc_mounts *p = file->private_data; 650 struct proc_mounts *p = file->private_data;
650 struct mnt_namespace *ns = p->ns;
651 unsigned res = POLLIN | POLLRDNORM; 651 unsigned res = POLLIN | POLLRDNORM;
652 652
653 poll_wait(file, &ns->poll, wait); 653 poll_wait(file, &p->ns->poll, wait);
654 654 if (mnt_had_events(p))
655 spin_lock(&vfsmount_lock);
656 if (p->event != ns->event) {
657 p->event = ns->event;
658 res |= POLLERR | POLLPRI; 655 res |= POLLERR | POLLPRI;
659 }
660 spin_unlock(&vfsmount_lock);
661 656
662 return res; 657 return res;
663} 658}
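
mounts_poll() now delegates the event-counter comparison to mnt_had_events(); judging by the removed lines, the helper presumably lives in fs/namespace.c and is little more than the old body behind vfsmount_lock, something like:

/* presumed helper; reconstructed from the lines removed above */
int mnt_had_events(struct proc_mounts *p)
{
	struct mnt_namespace *ns = p->ns;
	int res = 0;

	spin_lock(&vfsmount_lock);
	if (p->event != ns->event) {
		p->event = ns->event;     /* consume the change */
		res = 1;
	}
	spin_unlock(&vfsmount_lock);

	return res;
}

The poll path itself stays lock-free; only the counter check pays for the spinlock.
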
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 480cb1065eec..08f4d71dacd7 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -291,19 +291,17 @@ static const struct inode_operations proc_file_inode_operations = {
291 * returns the struct proc_dir_entry for "/proc/tty/driver", and 291 * returns the struct proc_dir_entry for "/proc/tty/driver", and
292 * returns "serial" in residual. 292 * returns "serial" in residual.
293 */ 293 */
294static int xlate_proc_name(const char *name, 294static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
295 struct proc_dir_entry **ret, const char **residual) 295 const char **residual)
296{ 296{
297 const char *cp = name, *next; 297 const char *cp = name, *next;
298 struct proc_dir_entry *de; 298 struct proc_dir_entry *de;
299 int len; 299 int len;
300 int rtn = 0;
301 300
302 de = *ret; 301 de = *ret;
303 if (!de) 302 if (!de)
304 de = &proc_root; 303 de = &proc_root;
305 304
306 spin_lock(&proc_subdir_lock);
307 while (1) { 305 while (1) {
308 next = strchr(cp, '/'); 306 next = strchr(cp, '/');
309 if (!next) 307 if (!next)
@@ -315,16 +313,25 @@ static int xlate_proc_name(const char *name,
315 break; 313 break;
316 } 314 }
317 if (!de) { 315 if (!de) {
318 rtn = -ENOENT; 316 WARN(1, "name '%s'\n", name);
319 goto out; 317 return -ENOENT;
320 } 318 }
321 cp += len + 1; 319 cp += len + 1;
322 } 320 }
323 *residual = cp; 321 *residual = cp;
324 *ret = de; 322 *ret = de;
325out: 323 return 0;
324}
325
326static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
327 const char **residual)
328{
329 int rv;
330
331 spin_lock(&proc_subdir_lock);
332 rv = __xlate_proc_name(name, ret, residual);
326 spin_unlock(&proc_subdir_lock); 333 spin_unlock(&proc_subdir_lock);
327 return rtn; 334 return rv;
328} 335}
329 336
330static DEFINE_IDA(proc_inum_ida); 337static DEFINE_IDA(proc_inum_ida);
@@ -662,6 +669,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
662 } 669 }
663 return ent; 670 return ent;
664} 671}
672EXPORT_SYMBOL(proc_symlink);
665 673
666struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, 674struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
667 struct proc_dir_entry *parent) 675 struct proc_dir_entry *parent)
@@ -700,6 +708,7 @@ struct proc_dir_entry *proc_mkdir(const char *name,
700{ 708{
701 return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent); 709 return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent);
702} 710}
711EXPORT_SYMBOL(proc_mkdir);
703 712
704struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, 713struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
705 struct proc_dir_entry *parent) 714 struct proc_dir_entry *parent)
@@ -728,6 +737,7 @@ struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
728 } 737 }
729 return ent; 738 return ent;
730} 739}
740EXPORT_SYMBOL(create_proc_entry);
731 741
732struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, 742struct proc_dir_entry *proc_create_data(const char *name, mode_t mode,
733 struct proc_dir_entry *parent, 743 struct proc_dir_entry *parent,
@@ -762,6 +772,7 @@ out_free:
762out: 772out:
763 return NULL; 773 return NULL;
764} 774}
775EXPORT_SYMBOL(proc_create_data);
765 776
766static void free_proc_entry(struct proc_dir_entry *de) 777static void free_proc_entry(struct proc_dir_entry *de)
767{ 778{
@@ -793,11 +804,13 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
793 const char *fn = name; 804 const char *fn = name;
794 int len; 805 int len;
795 806
796 if (xlate_proc_name(name, &parent, &fn) != 0) 807 spin_lock(&proc_subdir_lock);
808 if (__xlate_proc_name(name, &parent, &fn) != 0) {
809 spin_unlock(&proc_subdir_lock);
797 return; 810 return;
811 }
798 len = strlen(fn); 812 len = strlen(fn);
799 813
800 spin_lock(&proc_subdir_lock);
801 for (p = &parent->subdir; *p; p=&(*p)->next ) { 814 for (p = &parent->subdir; *p; p=&(*p)->next ) {
802 if (proc_match(len, fn, *p)) { 815 if (proc_match(len, fn, *p)) {
803 de = *p; 816 de = *p;
@@ -807,8 +820,10 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
807 } 820 }
808 } 821 }
809 spin_unlock(&proc_subdir_lock); 822 spin_unlock(&proc_subdir_lock);
810 if (!de) 823 if (!de) {
824 WARN(1, "name '%s'\n", name);
811 return; 825 return;
826 }
812 827
813 spin_lock(&de->pde_unload_lock); 828 spin_lock(&de->pde_unload_lock);
814 /* 829 /*
@@ -853,3 +868,4 @@ continue_removing:
853 de->parent->name, de->name, de->subdir->name); 868 de->parent->name, de->name, de->subdir->name);
854 pde_put(de); 869 pde_put(de);
855} 870}
871EXPORT_SYMBOL(remove_proc_entry);
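
The generic.c change is a standard lock-split: __xlate_proc_name() does the walk and knows nothing about locking, xlate_proc_name() wraps it in proc_subdir_lock for simple callers, and remove_proc_entry() takes the lock itself so the name translation and the subdir unlink happen in one critical section. A runnable miniature of the pattern:

#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t subdir_lock = PTHREAD_MUTEX_INITIALIZER;
static int table[4] = { 10, 20, 30, 40 };

/* __helper: caller must hold subdir_lock */
static int __lookup(int idx, int *out)
{
	if (idx < 0 || idx >= 4)
		return -1;                /* stand-in for -ENOENT */
	*out = table[idx];
	return 0;
}

/* plain wrapper: lock around a single lookup */
static int lookup(int idx, int *out)
{
	int rv;

	pthread_mutex_lock(&subdir_lock);
	rv = __lookup(idx, out);
	pthread_mutex_unlock(&subdir_lock);
	return rv;
}

/* compound caller: hold the lock across lookup *and* update, the way
 * remove_proc_entry() now holds proc_subdir_lock across both steps */
static int lookup_and_clear(int idx)
{
	int v, rv;

	pthread_mutex_lock(&subdir_lock);
	rv = __lookup(idx, &v);
	if (rv == 0)
		table[idx] = 0;
	pthread_mutex_unlock(&subdir_lock);
	return rv;
}

int main(void)
{
	int v = 0;

	lookup(2, &v);
	printf("lookup(2) -> %d\n", v);
	lookup_and_clear(2);
	lookup(2, &v);
	printf("after clear -> %d\n", v);
	return 0;
}
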
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a44a7897fd4d..b442dac8f5f9 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -490,7 +490,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
490 } 490 }
491 read_unlock(&kclist_lock); 491 read_unlock(&kclist_lock);
492 492
493 if (m == NULL) { 493 if (&m->list == &kclist_head) {
494 if (clear_user(buffer, tsz)) 494 if (clear_user(buffer, tsz))
495 return -EFAULT; 495 return -EFAULT;
496 } else if (is_vmalloc_or_module_addr((void *)start)) { 496 } else if (is_vmalloc_or_module_addr((void *)start)) {
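
The kcore fix is about list_for_each_entry() cursor semantics: when the walk exhausts the list, the cursor ends up as container_of() of the list head, a non-NULL (and invalid) pointer, so testing m == NULL can never fire. Comparing &m->list against the head is the reliable exhaustion test. A self-contained demonstration:

#include <stdio.h>
#include <stddef.h>

/* Minimal circular list in the style of <linux/list.h>. */
struct list_head { struct list_head *next, *prev; };
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct kc { int id; struct list_head list; };

int main(void)
{
	struct list_head head;
	struct kc a = { .id = 1 }, b = { .id = 2 };
	struct kc *m;

	/* head -> a -> b -> head */
	head.next = &a.list; head.prev = &b.list;
	a.list.next = &b.list; a.list.prev = &head;
	b.list.next = &head; b.list.prev = &a.list;

	/* expansion of list_for_each_entry(): the cursor is recomputed
	 * from the node pointer even when it reaches the head */
	for (m = container_of(head.next, struct kc, list);
	     &m->list != &head;
	     m = container_of(m->list.next, struct kc, list))
		if (m->id == 3)
			break;            /* never matches */

	/* after an exhausted walk, m is container_of(&head): bogus but
	 * non-NULL, hence the fix compares &m->list with the head */
	printf("m == NULL? %d   cursor at head? %d\n",
	       m == NULL, &m->list == &head);
	return 0;
}
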
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index 7ca78346d3f0..cfe90a48a6e8 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -12,37 +12,37 @@
12#include <linux/poll.h> 12#include <linux/poll.h>
13#include <linux/proc_fs.h> 13#include <linux/proc_fs.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/syslog.h>
15 16
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17#include <asm/io.h> 18#include <asm/io.h>
18 19
19extern wait_queue_head_t log_wait; 20extern wait_queue_head_t log_wait;
20 21
21extern int do_syslog(int type, char __user *bug, int count);
22
23static int kmsg_open(struct inode * inode, struct file * file) 22static int kmsg_open(struct inode * inode, struct file * file)
24{ 23{
25 return do_syslog(1,NULL,0); 24 return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE);
26} 25}
27 26
28static int kmsg_release(struct inode * inode, struct file * file) 27static int kmsg_release(struct inode * inode, struct file * file)
29{ 28{
30 (void) do_syslog(0,NULL,0); 29 (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE);
31 return 0; 30 return 0;
32} 31}
33 32
34static ssize_t kmsg_read(struct file *file, char __user *buf, 33static ssize_t kmsg_read(struct file *file, char __user *buf,
35 size_t count, loff_t *ppos) 34 size_t count, loff_t *ppos)
36{ 35{
37 if ((file->f_flags & O_NONBLOCK) && !do_syslog(9, NULL, 0)) 36 if ((file->f_flags & O_NONBLOCK) &&
37 !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
38 return -EAGAIN; 38 return -EAGAIN;
39 return do_syslog(2, buf, count); 39 return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE);
40} 40}
41 41
42static unsigned int kmsg_poll(struct file *file, poll_table *wait) 42static unsigned int kmsg_poll(struct file *file, poll_table *wait)
43{ 43{
44 poll_wait(file, &log_wait, wait); 44 poll_wait(file, &log_wait, wait);
45 if (do_syslog(9, NULL, 0)) 45 if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
46 return POLLIN | POLLRDNORM; 46 return POLLIN | POLLRDNORM;
47 return 0; 47 return 0;
48} 48}
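
The kmsg conversion replaces do_syslog()'s magic numbers with named actions and adds an origin argument. The action values below are read straight off the old call sites; the value of SYSLOG_FROM_FILE is an assumption about <linux/syslog.h>:

#define SYSLOG_ACTION_CLOSE        0    /* was do_syslog(0, ...) */
#define SYSLOG_ACTION_OPEN         1    /* was do_syslog(1, ...) */
#define SYSLOG_ACTION_READ         2    /* was do_syslog(2, ...) */
#define SYSLOG_ACTION_SIZE_UNREAD  9    /* was do_syslog(9, ...) */

#define SYSLOG_FROM_FILE           1    /* assumed: caller is /proc/kmsg,
                                           not the syslog(2) syscall */

The extra argument lets the implementation distinguish reads arriving via /proc/kmsg from the raw syscall when applying permission checks.
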
diff --git a/fs/proc/root.c b/fs/proc/root.c
index b080b791d9e3..757c069f2a65 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -220,9 +220,3 @@ void pid_ns_release_proc(struct pid_namespace *ns)
220{ 220{
221 mntput(ns->proc_mnt); 221 mntput(ns->proc_mnt);
222} 222}
223
224EXPORT_SYMBOL(proc_symlink);
225EXPORT_SYMBOL(proc_mkdir);
226EXPORT_SYMBOL(create_proc_entry);
227EXPORT_SYMBOL(proc_create_data);
228EXPORT_SYMBOL(remove_proc_entry);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f277c4a111cb..183f8ff5f400 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -16,7 +16,7 @@
16 16
17void task_mem(struct seq_file *m, struct mm_struct *mm) 17void task_mem(struct seq_file *m, struct mm_struct *mm)
18{ 18{
19 unsigned long data, text, lib; 19 unsigned long data, text, lib, swap;
20 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; 20 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
21 21
22 /* 22 /*
@@ -36,6 +36,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
36 data = mm->total_vm - mm->shared_vm - mm->stack_vm; 36 data = mm->total_vm - mm->shared_vm - mm->stack_vm;
37 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; 37 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
38 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; 38 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
39 swap = get_mm_counter(mm, MM_SWAPENTS);
39 seq_printf(m, 40 seq_printf(m,
40 "VmPeak:\t%8lu kB\n" 41 "VmPeak:\t%8lu kB\n"
41 "VmSize:\t%8lu kB\n" 42 "VmSize:\t%8lu kB\n"
@@ -46,7 +47,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
46 "VmStk:\t%8lu kB\n" 47 "VmStk:\t%8lu kB\n"
47 "VmExe:\t%8lu kB\n" 48 "VmExe:\t%8lu kB\n"
48 "VmLib:\t%8lu kB\n" 49 "VmLib:\t%8lu kB\n"
49 "VmPTE:\t%8lu kB\n", 50 "VmPTE:\t%8lu kB\n"
51 "VmSwap:\t%8lu kB\n",
50 hiwater_vm << (PAGE_SHIFT-10), 52 hiwater_vm << (PAGE_SHIFT-10),
51 (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), 53 (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
52 mm->locked_vm << (PAGE_SHIFT-10), 54 mm->locked_vm << (PAGE_SHIFT-10),
@@ -54,7 +56,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
54 total_rss << (PAGE_SHIFT-10), 56 total_rss << (PAGE_SHIFT-10),
55 data << (PAGE_SHIFT-10), 57 data << (PAGE_SHIFT-10),
56 mm->stack_vm << (PAGE_SHIFT-10), text, lib, 58 mm->stack_vm << (PAGE_SHIFT-10), text, lib,
57 (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); 59 (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
60 swap << (PAGE_SHIFT-10));
58} 61}
59 62
60unsigned long task_vsize(struct mm_struct *mm) 63unsigned long task_vsize(struct mm_struct *mm)
@@ -65,11 +68,11 @@ unsigned long task_vsize(struct mm_struct *mm)
65int task_statm(struct mm_struct *mm, int *shared, int *text, 68int task_statm(struct mm_struct *mm, int *shared, int *text,
66 int *data, int *resident) 69 int *data, int *resident)
67{ 70{
68 *shared = get_mm_counter(mm, file_rss); 71 *shared = get_mm_counter(mm, MM_FILEPAGES);
69 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) 72 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
70 >> PAGE_SHIFT; 73 >> PAGE_SHIFT;
71 *data = mm->total_vm - mm->shared_vm; 74 *data = mm->total_vm - mm->shared_vm;
72 *resident = *shared + get_mm_counter(mm, anon_rss); 75 *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
73 return mm->total_vm; 76 return mm->total_vm;
74} 77}
75 78
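
task_mem() gains a VmSwap line fed by the new MM_SWAPENTS counter; like the other fields it converts a page count to kilobytes with a shift by PAGE_SHIFT - 10 (pages are 1 << PAGE_SHIFT bytes, a kilobyte is 1 << 10). A worked example, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12                     /* assume 4 KiB pages */

int main(void)
{
	unsigned long swap_pages = 300;   /* hypothetical MM_SWAPENTS value */

	/* 300 pages * 4096 B / 1024 = 1200 kB, i.e. 300 << 2 */
	printf("VmSwap:\t%8lu kB\n", swap_pages << (PAGE_SHIFT - 10));
	return 0;
}
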
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index ebf3440d28ca..277575ddc05c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -201,7 +201,8 @@ static const char *qnx4_checkroot(struct super_block *sb)
201 rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE); 201 rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE);
202 if (rootdir->di_fname != NULL) { 202 if (rootdir->di_fname != NULL) {
203 QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname)); 203 QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
204 if (!strncmp(rootdir->di_fname, QNX4_BMNAME, sizeof QNX4_BMNAME)) { 204 if (!strcmp(rootdir->di_fname,
205 QNX4_BMNAME)) {
205 found = 1; 206 found = 1;
206 qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL ); 207 qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL );
207 if (!qnx4_sb(sb)->BitMap) { 208 if (!qnx4_sb(sb)->BitMap) {
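
The qnx4 hunk simplifies a bounded compare: strncmp() with n equal to sizeof of a string literal runs through the terminating NUL, which for NUL-terminated inputs behaves exactly like strcmp(). A quick check, assuming QNX4_BMNAME is the usual ".bitmap":

#include <stdio.h>
#include <string.h>

#define QNX4_BMNAME ".bitmap"             /* assumed value */

int main(void)
{
	const char *fname = ".bitmap";

	/* sizeof QNX4_BMNAME == 8 covers the NUL, so both return 0 */
	printf("strncmp: %d\n", strncmp(fname, QNX4_BMNAME, sizeof QNX4_BMNAME));
	printf("strcmp:  %d\n", strcmp(fname, QNX4_BMNAME));
	return 0;
}
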
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index efc02ebb8c70..dad7fb247ddc 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -59,3 +59,8 @@ config QUOTACTL
59 bool 59 bool
60 depends on XFS_QUOTA || QUOTA 60 depends on XFS_QUOTA || QUOTA
61 default y 61 default y
62
63config QUOTACTL_COMPAT
64 bool
65 depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT
66 default y
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
index 68d4f6dc0578..5f9e9e276af0 100644
--- a/fs/quota/Makefile
+++ b/fs/quota/Makefile
@@ -3,3 +3,5 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o
3obj-$(CONFIG_QFMT_V2) += quota_v2.o 3obj-$(CONFIG_QFMT_V2) += quota_v2.o
4obj-$(CONFIG_QUOTA_TREE) += quota_tree.o 4obj-$(CONFIG_QUOTA_TREE) += quota_tree.o
5obj-$(CONFIG_QUOTACTL) += quota.o 5obj-$(CONFIG_QUOTACTL) += quota.o
6obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o
7obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o
diff --git a/fs/quota/compat.c b/fs/quota/compat.c
new file mode 100644
index 000000000000..fb1892fe3e56
--- /dev/null
+++ b/fs/quota/compat.c
@@ -0,0 +1,118 @@
1
2#include <linux/syscalls.h>
3#include <linux/compat.h>
4#include <linux/quotaops.h>
5
6/*
7 * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
8 * and is necessary due to alignment problems.
9 */
10struct compat_if_dqblk {
11 compat_u64 dqb_bhardlimit;
12 compat_u64 dqb_bsoftlimit;
13 compat_u64 dqb_curspace;
14 compat_u64 dqb_ihardlimit;
15 compat_u64 dqb_isoftlimit;
16 compat_u64 dqb_curinodes;
17 compat_u64 dqb_btime;
18 compat_u64 dqb_itime;
19 compat_uint_t dqb_valid;
20};
21
22/* XFS structures */
23struct compat_fs_qfilestat {
24 compat_u64 dqb_bhardlimit;
25 compat_u64 qfs_nblks;
26 compat_uint_t qfs_nextents;
27};
28
29struct compat_fs_quota_stat {
30 __s8 qs_version;
31 __u16 qs_flags;
32 __s8 qs_pad;
33 struct compat_fs_qfilestat qs_uquota;
34 struct compat_fs_qfilestat qs_gquota;
35 compat_uint_t qs_incoredqs;
36 compat_int_t qs_btimelimit;
37 compat_int_t qs_itimelimit;
38 compat_int_t qs_rtbtimelimit;
39 __u16 qs_bwarnlimit;
40 __u16 qs_iwarnlimit;
41};
42
43asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
44 qid_t id, void __user *addr)
45{
46 unsigned int cmds;
47 struct if_dqblk __user *dqblk;
48 struct compat_if_dqblk __user *compat_dqblk;
49 struct fs_quota_stat __user *fsqstat;
50 struct compat_fs_quota_stat __user *compat_fsqstat;
51 compat_uint_t data;
52 u16 xdata;
53 long ret;
54
55 cmds = cmd >> SUBCMDSHIFT;
56
57 switch (cmds) {
58 case Q_GETQUOTA:
59 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
60 compat_dqblk = addr;
61 ret = sys_quotactl(cmd, special, id, dqblk);
62 if (ret)
63 break;
64 if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
65 get_user(data, &dqblk->dqb_valid) ||
66 put_user(data, &compat_dqblk->dqb_valid))
67 ret = -EFAULT;
68 break;
69 case Q_SETQUOTA:
70 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
71 compat_dqblk = addr;
72 ret = -EFAULT;
73 if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
74 get_user(data, &compat_dqblk->dqb_valid) ||
75 put_user(data, &dqblk->dqb_valid))
76 break;
77 ret = sys_quotactl(cmd, special, id, dqblk);
78 break;
79 case Q_XGETQSTAT:
80 fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
81 compat_fsqstat = addr;
82 ret = sys_quotactl(cmd, special, id, fsqstat);
83 if (ret)
84 break;
85 ret = -EFAULT;
86 /* Copying qs_version, qs_flags, qs_pad */
87 if (copy_in_user(compat_fsqstat, fsqstat,
88 offsetof(struct compat_fs_quota_stat, qs_uquota)))
89 break;
90 /* Copying qs_uquota */
91 if (copy_in_user(&compat_fsqstat->qs_uquota,
92 &fsqstat->qs_uquota,
93 sizeof(compat_fsqstat->qs_uquota)) ||
94 get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
95 put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
96 break;
97 /* Copying qs_gquota */
98 if (copy_in_user(&compat_fsqstat->qs_gquota,
99 &fsqstat->qs_gquota,
100 sizeof(compat_fsqstat->qs_gquota)) ||
101 get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
102 put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
103 break;
104 /* Copying the rest */
105 if (copy_in_user(&compat_fsqstat->qs_incoredqs,
106 &fsqstat->qs_incoredqs,
107 sizeof(struct compat_fs_quota_stat) -
108 offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
109 get_user(xdata, &fsqstat->qs_iwarnlimit) ||
110 put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
111 break;
112 ret = 0;
113 break;
114 default:
115 ret = sys_quotactl(cmd, special, id, addr);
116 }
117 return ret;
118}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 3fc62b097bed..e0b870f4749f 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -100,9 +100,13 @@
100 * 100 *
101 * Any operation working on dquots via inode pointers must hold dqptr_sem. If 101 * Any operation working on dquots via inode pointers must hold dqptr_sem. If
102 * operation is just reading pointers from inode (or not using them at all) the 102 * operation is just reading pointers from inode (or not using them at all) the
103 * read lock is enough. If pointers are altered function must hold write lock 103 * read lock is enough. If pointers are altered function must hold write lock.
104 * (these locking rules also apply for S_NOQUOTA flag in the inode - note that 104 * Special care needs to be taken about S_NOQUOTA inode flag (marking that
105 * for altering the flag i_mutex is also needed). 105 * inode is a quota file). Functions adding pointers from inode to dquots have
106 * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they
107 * have to do all pointer modifications before dropping dqptr_sem. This makes
108 * sure they cannot race with quotaon which first sets S_NOQUOTA flag and
109 * then drops all pointers to dquots from an inode.
106 * 110 *
107 * Each dquot has its dq_lock mutex. Locked dquots might not be referenced 111 * Each dquot has its dq_lock mutex. Locked dquots might not be referenced
108 * from inodes (dquot_alloc_space() and such don't check the dq_lock). 112 * from inodes (dquot_alloc_space() and such don't check the dq_lock).
@@ -225,6 +229,9 @@ static struct hlist_head *dquot_hash;
225struct dqstats dqstats; 229struct dqstats dqstats;
226EXPORT_SYMBOL(dqstats); 230EXPORT_SYMBOL(dqstats);
227 231
232static qsize_t inode_get_rsv_space(struct inode *inode);
233static void __dquot_initialize(struct inode *inode, int type);
234
228static inline unsigned int 235static inline unsigned int
229hashfn(const struct super_block *sb, unsigned int id, int type) 236hashfn(const struct super_block *sb, unsigned int id, int type)
230{ 237{
@@ -564,7 +571,7 @@ out:
564} 571}
565EXPORT_SYMBOL(dquot_scan_active); 572EXPORT_SYMBOL(dquot_scan_active);
566 573
567int vfs_quota_sync(struct super_block *sb, int type) 574int vfs_quota_sync(struct super_block *sb, int type, int wait)
568{ 575{
569 struct list_head *dirty; 576 struct list_head *dirty;
570 struct dquot *dquot; 577 struct dquot *dquot;
@@ -609,6 +616,33 @@ int vfs_quota_sync(struct super_block *sb, int type)
609 spin_unlock(&dq_list_lock); 616 spin_unlock(&dq_list_lock);
610 mutex_unlock(&dqopt->dqonoff_mutex); 617 mutex_unlock(&dqopt->dqonoff_mutex);
611 618
619 if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE))
620 return 0;
621
622 /* This is not very clever (and fast) but currently I don't know about
623 * any other simple way of getting quota data to disk and we must get
624 * them there for userspace to be visible... */
625 if (sb->s_op->sync_fs)
626 sb->s_op->sync_fs(sb, 1);
627 sync_blockdev(sb->s_bdev);
628
629 /*
630 * Now when everything is written we can discard the pagecache so
631 * that userspace sees the changes.
632 */
633 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
634 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
635 if (type != -1 && cnt != type)
636 continue;
637 if (!sb_has_quota_active(sb, cnt))
638 continue;
639 mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex,
640 I_MUTEX_QUOTA);
641 truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
642 mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex);
643 }
644 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
645
612 return 0; 646 return 0;
613} 647}
614EXPORT_SYMBOL(vfs_quota_sync); 648EXPORT_SYMBOL(vfs_quota_sync);
@@ -840,11 +874,14 @@ static int dqinit_needed(struct inode *inode, int type)
840static void add_dquot_ref(struct super_block *sb, int type) 874static void add_dquot_ref(struct super_block *sb, int type)
841{ 875{
842 struct inode *inode, *old_inode = NULL; 876 struct inode *inode, *old_inode = NULL;
877 int reserved = 0;
843 878
844 spin_lock(&inode_lock); 879 spin_lock(&inode_lock);
845 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 880 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
846 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) 881 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
847 continue; 882 continue;
883 if (unlikely(inode_get_rsv_space(inode) > 0))
884 reserved = 1;
848 if (!atomic_read(&inode->i_writecount)) 885 if (!atomic_read(&inode->i_writecount))
849 continue; 886 continue;
850 if (!dqinit_needed(inode, type)) 887 if (!dqinit_needed(inode, type))
@@ -854,7 +891,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
854 spin_unlock(&inode_lock); 891 spin_unlock(&inode_lock);
855 892
856 iput(old_inode); 893 iput(old_inode);
857 sb->dq_op->initialize(inode, type); 894 __dquot_initialize(inode, type);
858 /* We hold a reference to 'inode' so it couldn't have been 895 /* We hold a reference to 'inode' so it couldn't have been
859 * removed from s_inodes list while we dropped the inode_lock. 896 * removed from s_inodes list while we dropped the inode_lock.
860 * We cannot iput the inode now as we can be holding the last 897 * We cannot iput the inode now as we can be holding the last
@@ -865,6 +902,12 @@ static void add_dquot_ref(struct super_block *sb, int type)
865 } 902 }
866 spin_unlock(&inode_lock); 903 spin_unlock(&inode_lock);
867 iput(old_inode); 904 iput(old_inode);
905
906 if (reserved) {
907 printk(KERN_WARNING "VFS (%s): Writes happened before quota"
908 " was turned on thus quota information is probably "
909 "inconsistent. Please run quotacheck(8).\n", sb->s_id);
910 }
868} 911}
869 912
870/* 913/*
@@ -978,10 +1021,12 @@ static inline void dquot_resv_space(struct dquot *dquot, qsize_t number)
978/* 1021/*
979 * Claim reserved quota space 1022 * Claim reserved quota space
980 */ 1023 */
981static void dquot_claim_reserved_space(struct dquot *dquot, 1024static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number)
982 qsize_t number)
983{ 1025{
984 WARN_ON(dquot->dq_dqb.dqb_rsvspace < number); 1026 if (dquot->dq_dqb.dqb_rsvspace < number) {
1027 WARN_ON_ONCE(1);
1028 number = dquot->dq_dqb.dqb_rsvspace;
1029 }
985 dquot->dq_dqb.dqb_curspace += number; 1030 dquot->dq_dqb.dqb_curspace += number;
986 dquot->dq_dqb.dqb_rsvspace -= number; 1031 dquot->dq_dqb.dqb_rsvspace -= number;
987} 1032}
@@ -989,7 +1034,12 @@ static void dquot_claim_reserved_space(struct dquot *dquot,
989static inline 1034static inline
990void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) 1035void dquot_free_reserved_space(struct dquot *dquot, qsize_t number)
991{ 1036{
992 dquot->dq_dqb.dqb_rsvspace -= number; 1037 if (dquot->dq_dqb.dqb_rsvspace >= number)
1038 dquot->dq_dqb.dqb_rsvspace -= number;
1039 else {
1040 WARN_ON_ONCE(1);
1041 dquot->dq_dqb.dqb_rsvspace = 0;
1042 }
993} 1043}
994 1044
995static void dquot_decr_inodes(struct dquot *dquot, qsize_t number) 1045static void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
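
Both reserved-space helpers above trade a bare WARN_ON() for warn-and-clamp: an inconsistent caller now yields a one-shot warning and a saturated counter instead of wrapping an unsigned qsize_t below zero. The pattern in a runnable toy, with a userspace stand-in for WARN_ON_ONCE():

#include <stdio.h>

static int warned;
#define WARN_ON_ONCE(cond) \
	do { if ((cond) && !warned++) fprintf(stderr, "warn: %s\n", #cond); } while (0)

static unsigned long long rsvspace = 100;

static void claim_reserved(unsigned long long number)
{
	if (rsvspace < number) {
		WARN_ON_ONCE(1);
		number = rsvspace;        /* clamp instead of underflowing */
	}
	rsvspace -= number;
}

int main(void)
{
	claim_reserved(150);              /* would wrap without the clamp */
	printf("rsvspace = %llu\n", rsvspace);
	return 0;
}
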
@@ -1131,13 +1181,13 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1131 *warntype = QUOTA_NL_NOWARN; 1181 *warntype = QUOTA_NL_NOWARN;
1132 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || 1182 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
1133 test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1183 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1134 return QUOTA_OK; 1184 return 0;
1135 1185
1136 if (dquot->dq_dqb.dqb_ihardlimit && 1186 if (dquot->dq_dqb.dqb_ihardlimit &&
1137 newinodes > dquot->dq_dqb.dqb_ihardlimit && 1187 newinodes > dquot->dq_dqb.dqb_ihardlimit &&
1138 !ignore_hardlimit(dquot)) { 1188 !ignore_hardlimit(dquot)) {
1139 *warntype = QUOTA_NL_IHARDWARN; 1189 *warntype = QUOTA_NL_IHARDWARN;
1140 return NO_QUOTA; 1190 return -EDQUOT;
1141 } 1191 }
1142 1192
1143 if (dquot->dq_dqb.dqb_isoftlimit && 1193 if (dquot->dq_dqb.dqb_isoftlimit &&
@@ -1146,7 +1196,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1146 get_seconds() >= dquot->dq_dqb.dqb_itime && 1196 get_seconds() >= dquot->dq_dqb.dqb_itime &&
1147 !ignore_hardlimit(dquot)) { 1197 !ignore_hardlimit(dquot)) {
1148 *warntype = QUOTA_NL_ISOFTLONGWARN; 1198 *warntype = QUOTA_NL_ISOFTLONGWARN;
1149 return NO_QUOTA; 1199 return -EDQUOT;
1150 } 1200 }
1151 1201
1152 if (dquot->dq_dqb.dqb_isoftlimit && 1202 if (dquot->dq_dqb.dqb_isoftlimit &&
@@ -1157,7 +1207,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1157 sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; 1207 sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace;
1158 } 1208 }
1159 1209
1160 return QUOTA_OK; 1210 return 0;
1161} 1211}
1162 1212
1163/* needs dq_data_lock */ 1213/* needs dq_data_lock */
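
From here on, the dquot.c hunks retire the QUOTA_OK/NO_QUOTA return convention in favour of plain errno values: 0 for success, -EDQUOT when a limit blocks the operation. That lets callers use ordinary if (ret) checks and propagate the error unchanged, as the check_bdq() caller in __dquot_alloc_space() below does. The convention in miniature:

#include <stdio.h>
#include <errno.h>

/* hypothetical limit check in the new style: 0 or -EDQUOT */
static int check_limit(unsigned long used, unsigned long hard)
{
	if (hard && used > hard)
		return -EDQUOT;
	return 0;
}

int main(void)
{
	int ret = check_limit(1100, 1000);

	if (ret)                          /* ordinary errno handling */
		printf("over quota: %d\n", ret);
	printf("within quota: %d\n", check_limit(900, 1000));
	return 0;
}
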
@@ -1169,7 +1219,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1169 *warntype = QUOTA_NL_NOWARN; 1219 *warntype = QUOTA_NL_NOWARN;
1170 if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) || 1220 if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) ||
1171 test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1221 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1172 return QUOTA_OK; 1222 return 0;
1173 1223
1174 tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace 1224 tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace
1175 + space; 1225 + space;
@@ -1179,7 +1229,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1179 !ignore_hardlimit(dquot)) { 1229 !ignore_hardlimit(dquot)) {
1180 if (!prealloc) 1230 if (!prealloc)
1181 *warntype = QUOTA_NL_BHARDWARN; 1231 *warntype = QUOTA_NL_BHARDWARN;
1182 return NO_QUOTA; 1232 return -EDQUOT;
1183 } 1233 }
1184 1234
1185 if (dquot->dq_dqb.dqb_bsoftlimit && 1235 if (dquot->dq_dqb.dqb_bsoftlimit &&
@@ -1189,7 +1239,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1189 !ignore_hardlimit(dquot)) { 1239 !ignore_hardlimit(dquot)) {
1190 if (!prealloc) 1240 if (!prealloc)
1191 *warntype = QUOTA_NL_BSOFTLONGWARN; 1241 *warntype = QUOTA_NL_BSOFTLONGWARN;
1192 return NO_QUOTA; 1242 return -EDQUOT;
1193 } 1243 }
1194 1244
1195 if (dquot->dq_dqb.dqb_bsoftlimit && 1245 if (dquot->dq_dqb.dqb_bsoftlimit &&
@@ -1205,10 +1255,10 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1205 * We don't allow preallocation to exceed softlimit so exceeding will 1255 * We don't allow preallocation to exceed softlimit so exceeding will
1206 * be always printed 1256 * be always printed
1207 */ 1257 */
1208 return NO_QUOTA; 1258 return -EDQUOT;
1209 } 1259 }
1210 1260
1211 return QUOTA_OK; 1261 return 0;
1212} 1262}
1213 1263
1214static int info_idq_free(struct dquot *dquot, qsize_t inodes) 1264static int info_idq_free(struct dquot *dquot, qsize_t inodes)
@@ -1242,25 +1292,32 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
1242 return QUOTA_NL_BHARDBELOW; 1292 return QUOTA_NL_BHARDBELOW;
1243 return QUOTA_NL_NOWARN; 1293 return QUOTA_NL_NOWARN;
1244} 1294}
1295
1245/* 1296/*
1246 * Initialize quota pointers in inode 1297 * Initialize quota pointers in inode
1247 * We do things in a bit complicated way but by that we avoid calling 1298 *
1248 * dqget() and thus filesystem callbacks under dqptr_sem. 1299 * We do things in a bit complicated way but by that we avoid calling
1300 * dqget() and thus filesystem callbacks under dqptr_sem.
1301 *
1302 * It is better to call this function outside of any transaction as it
1303 * might need a lot of space in journal for dquot structure allocation.
1249 */ 1304 */
1250int dquot_initialize(struct inode *inode, int type) 1305static void __dquot_initialize(struct inode *inode, int type)
1251{ 1306{
1252 unsigned int id = 0; 1307 unsigned int id = 0;
1253 int cnt, ret = 0; 1308 int cnt;
1254 struct dquot *got[MAXQUOTAS] = { NULL, NULL }; 1309 struct dquot *got[MAXQUOTAS];
1255 struct super_block *sb = inode->i_sb; 1310 struct super_block *sb = inode->i_sb;
1311 qsize_t rsv;
1256 1312
1257 /* First test before acquiring mutex - solves deadlocks when we 1313 /* First test before acquiring mutex - solves deadlocks when we
1258 * re-enter the quota code and are already holding the mutex */ 1314 * re-enter the quota code and are already holding the mutex */
1259 if (IS_NOQUOTA(inode)) 1315 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
1260 return 0; 1316 return;
1261 1317
1262 /* First get references to structures we might need. */ 1318 /* First get references to structures we might need. */
1263 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1319 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1320 got[cnt] = NULL;
1264 if (type != -1 && cnt != type) 1321 if (type != -1 && cnt != type)
1265 continue; 1322 continue;
1266 switch (cnt) { 1323 switch (cnt) {
@@ -1275,7 +1332,6 @@ int dquot_initialize(struct inode *inode, int type)
1275 } 1332 }
1276 1333
1277 down_write(&sb_dqopt(sb)->dqptr_sem); 1334 down_write(&sb_dqopt(sb)->dqptr_sem);
1278 /* Having dqptr_sem we know NOQUOTA flags can't be altered... */
1279 if (IS_NOQUOTA(inode)) 1335 if (IS_NOQUOTA(inode))
1280 goto out_err; 1336 goto out_err;
1281 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1337 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1287,20 +1343,31 @@ int dquot_initialize(struct inode *inode, int type)
1287 if (!inode->i_dquot[cnt]) { 1343 if (!inode->i_dquot[cnt]) {
1288 inode->i_dquot[cnt] = got[cnt]; 1344 inode->i_dquot[cnt] = got[cnt];
1289 got[cnt] = NULL; 1345 got[cnt] = NULL;
1346 /*
1347 * Make quota reservation system happy if someone
1348 * did a write before quota was turned on
1349 */
1350 rsv = inode_get_rsv_space(inode);
1351 if (unlikely(rsv))
1352 dquot_resv_space(inode->i_dquot[cnt], rsv);
1290 } 1353 }
1291 } 1354 }
1292out_err: 1355out_err:
1293 up_write(&sb_dqopt(sb)->dqptr_sem); 1356 up_write(&sb_dqopt(sb)->dqptr_sem);
1294 /* Drop unused references */ 1357 /* Drop unused references */
1295 dqput_all(got); 1358 dqput_all(got);
1296 return ret; 1359}
1360
1361void dquot_initialize(struct inode *inode)
1362{
1363 __dquot_initialize(inode, -1);
1297} 1364}
1298EXPORT_SYMBOL(dquot_initialize); 1365EXPORT_SYMBOL(dquot_initialize);
1299 1366
1300/* 1367/*
1301 * Release all quotas referenced by inode 1368 * Release all quotas referenced by inode
1302 */ 1369 */
1303int dquot_drop(struct inode *inode) 1370static void __dquot_drop(struct inode *inode)
1304{ 1371{
1305 int cnt; 1372 int cnt;
1306 struct dquot *put[MAXQUOTAS]; 1373 struct dquot *put[MAXQUOTAS];
@@ -1312,32 +1379,31 @@ int dquot_drop(struct inode *inode)
1312 } 1379 }
1313 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1380 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1314 dqput_all(put); 1381 dqput_all(put);
1315 return 0;
1316} 1382}
1317EXPORT_SYMBOL(dquot_drop);
1318 1383
1319/* Wrapper to remove references to quota structures from inode */ 1384void dquot_drop(struct inode *inode)
1320void vfs_dq_drop(struct inode *inode) 1385{
1321{ 1386 int cnt;
1322 /* Here we can get arbitrary inode from clear_inode() so we have 1387
1323 * to be careful. OTOH we don't need locking as quota operations 1388 if (IS_NOQUOTA(inode))
1324 * are allowed to change only at mount time */ 1389 return;
1325 if (!IS_NOQUOTA(inode) && inode->i_sb && inode->i_sb->dq_op 1390
1326 && inode->i_sb->dq_op->drop) { 1391 /*
1327 int cnt; 1392 * Test before calling to rule out calls from proc and such
1328 /* Test before calling to rule out calls from proc and such 1393 * where we are not allowed to block. Note that this is
1329 * where we are not allowed to block. Note that this is 1394 * actually reliable test even without the lock - the caller
1330 * actually reliable test even without the lock - the caller 1395 * must assure that nobody can come after the DQUOT_DROP and
1331 * must assure that nobody can come after the DQUOT_DROP and 1396 * add quota pointers back anyway.
1332 * add quota pointers back anyway */ 1397 */
1333 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1398 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1334 if (inode->i_dquot[cnt]) 1399 if (inode->i_dquot[cnt])
1335 break; 1400 break;
1336 if (cnt < MAXQUOTAS) 1401 }
1337 inode->i_sb->dq_op->drop(inode); 1402
1338 } 1403 if (cnt < MAXQUOTAS)
1339} 1404 __dquot_drop(inode);
1340EXPORT_SYMBOL(vfs_dq_drop); 1405}
1406EXPORT_SYMBOL(dquot_drop);
1341 1407
1342/* 1408/*
1343 * inode_reserved_space is managed internally by quota, and protected by 1409 * inode_reserved_space is managed internally by quota, and protected by
@@ -1351,28 +1417,30 @@ static qsize_t *inode_reserved_space(struct inode * inode)
1351 return inode->i_sb->dq_op->get_reserved_space(inode); 1417 return inode->i_sb->dq_op->get_reserved_space(inode);
1352} 1418}
1353 1419
1354static void inode_add_rsv_space(struct inode *inode, qsize_t number) 1420void inode_add_rsv_space(struct inode *inode, qsize_t number)
1355{ 1421{
1356 spin_lock(&inode->i_lock); 1422 spin_lock(&inode->i_lock);
1357 *inode_reserved_space(inode) += number; 1423 *inode_reserved_space(inode) += number;
1358 spin_unlock(&inode->i_lock); 1424 spin_unlock(&inode->i_lock);
1359} 1425}
1426EXPORT_SYMBOL(inode_add_rsv_space);
1360 1427
1361 1428void inode_claim_rsv_space(struct inode *inode, qsize_t number)
1362static void inode_claim_rsv_space(struct inode *inode, qsize_t number)
1363{ 1429{
1364 spin_lock(&inode->i_lock); 1430 spin_lock(&inode->i_lock);
1365 *inode_reserved_space(inode) -= number; 1431 *inode_reserved_space(inode) -= number;
1366 __inode_add_bytes(inode, number); 1432 __inode_add_bytes(inode, number);
1367 spin_unlock(&inode->i_lock); 1433 spin_unlock(&inode->i_lock);
1368} 1434}
1435EXPORT_SYMBOL(inode_claim_rsv_space);
1369 1436
1370static void inode_sub_rsv_space(struct inode *inode, qsize_t number) 1437void inode_sub_rsv_space(struct inode *inode, qsize_t number)
1371{ 1438{
1372 spin_lock(&inode->i_lock); 1439 spin_lock(&inode->i_lock);
1373 *inode_reserved_space(inode) -= number; 1440 *inode_reserved_space(inode) -= number;
1374 spin_unlock(&inode->i_lock); 1441 spin_unlock(&inode->i_lock);
1375} 1442}
1443EXPORT_SYMBOL(inode_sub_rsv_space);
1376 1444
1377static qsize_t inode_get_rsv_space(struct inode *inode) 1445static qsize_t inode_get_rsv_space(struct inode *inode)
1378{ 1446{
@@ -1404,38 +1472,34 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
1404} 1472}
1405 1473
1406/* 1474/*
1407 * Following four functions update i_blocks+i_bytes fields and 1475 * This functions updates i_blocks+i_bytes fields and quota information
1408 * quota information (together with appropriate checks) 1476 * (together with appropriate checks).
1409 * NOTE: We absolutely rely on the fact that caller dirties 1477 *
1410 * the inode (usually macros in quotaops.h care about this) and 1478 * NOTE: We absolutely rely on the fact that caller dirties the inode
1411 * holds a handle for the current transaction so that dquot write and 1479 * (usually helpers in quotaops.h care about this) and holds a handle for
1412 * inode write go into the same transaction. 1480 * the current transaction so that dquot write and inode write go into the
1481 * same transaction.
1413 */ 1482 */
1414 1483
1415/* 1484/*
1416 * This operation can block, but only after everything is updated 1485 * This operation can block, but only after everything is updated
1417 */ 1486 */
1418int __dquot_alloc_space(struct inode *inode, qsize_t number, 1487int __dquot_alloc_space(struct inode *inode, qsize_t number,
1419 int warn, int reserve) 1488 int warn, int reserve)
1420{ 1489{
1421 int cnt, ret = QUOTA_OK; 1490 int cnt, ret = 0;
1422 char warntype[MAXQUOTAS]; 1491 char warntype[MAXQUOTAS];
1423 1492
1424 /* 1493 /*
1425 * First test before acquiring mutex - solves deadlocks when we 1494 * First test before acquiring mutex - solves deadlocks when we
1426 * re-enter the quota code and are already holding the mutex 1495 * re-enter the quota code and are already holding the mutex
1427 */ 1496 */
1428 if (IS_NOQUOTA(inode)) { 1497 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
1429 inode_incr_space(inode, number, reserve); 1498 inode_incr_space(inode, number, reserve);
1430 goto out; 1499 goto out;
1431 } 1500 }
1432 1501
1433 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1502 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1434 if (IS_NOQUOTA(inode)) {
1435 inode_incr_space(inode, number, reserve);
1436 goto out_unlock;
1437 }
1438
1439 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1503 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1440 warntype[cnt] = QUOTA_NL_NOWARN; 1504 warntype[cnt] = QUOTA_NL_NOWARN;
1441 1505
@@ -1443,9 +1507,9 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
1443 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1507 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1444 if (!inode->i_dquot[cnt]) 1508 if (!inode->i_dquot[cnt])
1445 continue; 1509 continue;
1446 if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) 1510 ret = check_bdq(inode->i_dquot[cnt], number, !warn,
1447 == NO_QUOTA) { 1511 warntype+cnt);
1448 ret = NO_QUOTA; 1512 if (ret) {
1449 spin_unlock(&dq_data_lock); 1513 spin_unlock(&dq_data_lock);
1450 goto out_flush_warn; 1514 goto out_flush_warn;
1451 } 1515 }
@@ -1466,61 +1530,45 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
1466 mark_all_dquot_dirty(inode->i_dquot); 1530 mark_all_dquot_dirty(inode->i_dquot);
1467out_flush_warn: 1531out_flush_warn:
1468 flush_warnings(inode->i_dquot, warntype); 1532 flush_warnings(inode->i_dquot, warntype);
1469out_unlock:
1470 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1533 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1471out: 1534out:
1472 return ret; 1535 return ret;
1473} 1536}
1474 1537EXPORT_SYMBOL(__dquot_alloc_space);
1475int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
1476{
1477 return __dquot_alloc_space(inode, number, warn, 0);
1478}
1479EXPORT_SYMBOL(dquot_alloc_space);
1480
1481int dquot_reserve_space(struct inode *inode, qsize_t number, int warn)
1482{
1483 return __dquot_alloc_space(inode, number, warn, 1);
1484}
1485EXPORT_SYMBOL(dquot_reserve_space);
1486 1538
1487/* 1539/*
1488 * This operation can block, but only after everything is updated 1540 * This operation can block, but only after everything is updated
1489 */ 1541 */
1490int dquot_alloc_inode(const struct inode *inode, qsize_t number) 1542int dquot_alloc_inode(const struct inode *inode)
1491{ 1543{
1492 int cnt, ret = NO_QUOTA; 1544 int cnt, ret = 0;
1493 char warntype[MAXQUOTAS]; 1545 char warntype[MAXQUOTAS];
1494 1546
1495 /* First test before acquiring mutex - solves deadlocks when we 1547 /* First test before acquiring mutex - solves deadlocks when we
1496 * re-enter the quota code and are already holding the mutex */ 1548 * re-enter the quota code and are already holding the mutex */
1497 if (IS_NOQUOTA(inode)) 1549 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
1498 return QUOTA_OK; 1550 return 0;
1499 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1551 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1500 warntype[cnt] = QUOTA_NL_NOWARN; 1552 warntype[cnt] = QUOTA_NL_NOWARN;
1501 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1553 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1502 if (IS_NOQUOTA(inode)) {
1503 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1504 return QUOTA_OK;
1505 }
1506 spin_lock(&dq_data_lock); 1554 spin_lock(&dq_data_lock);
1507 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1555 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1508 if (!inode->i_dquot[cnt]) 1556 if (!inode->i_dquot[cnt])
1509 continue; 1557 continue;
1510 if (check_idq(inode->i_dquot[cnt], number, warntype+cnt) 1558 ret = check_idq(inode->i_dquot[cnt], 1, warntype + cnt);
1511 == NO_QUOTA) 1559 if (ret)
1512 goto warn_put_all; 1560 goto warn_put_all;
1513 } 1561 }
1514 1562
1515 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1563 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1516 if (!inode->i_dquot[cnt]) 1564 if (!inode->i_dquot[cnt])
1517 continue; 1565 continue;
1518 dquot_incr_inodes(inode->i_dquot[cnt], number); 1566 dquot_incr_inodes(inode->i_dquot[cnt], 1);
1519 } 1567 }
1520 ret = QUOTA_OK; 1568
1521warn_put_all: 1569warn_put_all:
1522 spin_unlock(&dq_data_lock); 1570 spin_unlock(&dq_data_lock);
1523 if (ret == QUOTA_OK) 1571 if (ret == 0)
1524 mark_all_dquot_dirty(inode->i_dquot); 1572 mark_all_dquot_dirty(inode->i_dquot);
1525 flush_warnings(inode->i_dquot, warntype); 1573 flush_warnings(inode->i_dquot, warntype);
1526 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1574 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
@@ -1528,23 +1576,19 @@ warn_put_all:
1528} 1576}
1529EXPORT_SYMBOL(dquot_alloc_inode); 1577EXPORT_SYMBOL(dquot_alloc_inode);
1530 1578
1531int dquot_claim_space(struct inode *inode, qsize_t number) 1579/*
1580 * Convert in-memory reserved quotas to real consumed quotas
1581 */
1582int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
1532{ 1583{
1533 int cnt; 1584 int cnt;
1534 int ret = QUOTA_OK;
1535 1585
1536 if (IS_NOQUOTA(inode)) { 1586 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
1537 inode_claim_rsv_space(inode, number); 1587 inode_claim_rsv_space(inode, number);
1538 goto out; 1588 return 0;
1539 } 1589 }
1540 1590
1541 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1591 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1542 if (IS_NOQUOTA(inode)) {
1543 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1544 inode_claim_rsv_space(inode, number);
1545 goto out;
1546 }
1547
1548 spin_lock(&dq_data_lock); 1592 spin_lock(&dq_data_lock);
1549 /* Claim reserved quotas to allocated quotas */ 1593 /* Claim reserved quotas to allocated quotas */
1550 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1594 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1557,33 +1601,26 @@ int dquot_claim_space(struct inode *inode, qsize_t number)
1557 spin_unlock(&dq_data_lock); 1601 spin_unlock(&dq_data_lock);
1558 mark_all_dquot_dirty(inode->i_dquot); 1602 mark_all_dquot_dirty(inode->i_dquot);
1559 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1603 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1560out: 1604 return 0;
1561 return ret;
1562} 1605}
1563EXPORT_SYMBOL(dquot_claim_space); 1606EXPORT_SYMBOL(dquot_claim_space_nodirty);
1564 1607
1565/* 1608/*
1566 * This operation can block, but only after everything is updated 1609 * This operation can block, but only after everything is updated
1567 */ 1610 */
1568int __dquot_free_space(struct inode *inode, qsize_t number, int reserve) 1611void __dquot_free_space(struct inode *inode, qsize_t number, int reserve)
1569{ 1612{
1570 unsigned int cnt; 1613 unsigned int cnt;
1571 char warntype[MAXQUOTAS]; 1614 char warntype[MAXQUOTAS];
1572 1615
1573 /* First test before acquiring mutex - solves deadlocks when we 1616 /* First test before acquiring mutex - solves deadlocks when we
1574 * re-enter the quota code and are already holding the mutex */ 1617 * re-enter the quota code and are already holding the mutex */
1575 if (IS_NOQUOTA(inode)) { 1618 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
1576out_sub:
1577 inode_decr_space(inode, number, reserve); 1619 inode_decr_space(inode, number, reserve);
1578 return QUOTA_OK; 1620 return;
1579 } 1621 }
1580 1622
1581 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1623 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1582 /* Now recheck reliably when holding dqptr_sem */
1583 if (IS_NOQUOTA(inode)) {
1584 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1585 goto out_sub;
1586 }
1587 spin_lock(&dq_data_lock); 1624 spin_lock(&dq_data_lock);
1588 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1625 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1589 if (!inode->i_dquot[cnt]) 1626 if (!inode->i_dquot[cnt])
@@ -1603,56 +1640,34 @@ out_sub:
1603out_unlock: 1640out_unlock:
1604 flush_warnings(inode->i_dquot, warntype); 1641 flush_warnings(inode->i_dquot, warntype);
1605 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1642 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1606 return QUOTA_OK;
1607}
1608
1609int dquot_free_space(struct inode *inode, qsize_t number)
1610{
1611 return __dquot_free_space(inode, number, 0);
1612} 1643}
1613EXPORT_SYMBOL(dquot_free_space); 1644EXPORT_SYMBOL(__dquot_free_space);
1614
1615/*
1616 * Release reserved quota space
1617 */
1618void dquot_release_reserved_space(struct inode *inode, qsize_t number)
1619{
1620 __dquot_free_space(inode, number, 1);
1621
1622}
1623EXPORT_SYMBOL(dquot_release_reserved_space);
1624 1645
1625/* 1646/*
1626 * This operation can block, but only after everything is updated 1647 * This operation can block, but only after everything is updated
1627 */ 1648 */
1628int dquot_free_inode(const struct inode *inode, qsize_t number) 1649void dquot_free_inode(const struct inode *inode)
1629{ 1650{
1630 unsigned int cnt; 1651 unsigned int cnt;
1631 char warntype[MAXQUOTAS]; 1652 char warntype[MAXQUOTAS];
1632 1653
1633 /* First test before acquiring mutex - solves deadlocks when we 1654 /* First test before acquiring mutex - solves deadlocks when we
1634 * re-enter the quota code and are already holding the mutex */ 1655 * re-enter the quota code and are already holding the mutex */
1635 if (IS_NOQUOTA(inode)) 1656 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
1636 return QUOTA_OK; 1657 return;
1637 1658
1638 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1659 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1639 /* Now recheck reliably when holding dqptr_sem */
1640 if (IS_NOQUOTA(inode)) {
1641 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1642 return QUOTA_OK;
1643 }
1644 spin_lock(&dq_data_lock); 1660 spin_lock(&dq_data_lock);
1645 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1661 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1646 if (!inode->i_dquot[cnt]) 1662 if (!inode->i_dquot[cnt])
1647 continue; 1663 continue;
1648 warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number); 1664 warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1);
1649 dquot_decr_inodes(inode->i_dquot[cnt], number); 1665 dquot_decr_inodes(inode->i_dquot[cnt], 1);
1650 } 1666 }
1651 spin_unlock(&dq_data_lock); 1667 spin_unlock(&dq_data_lock);
1652 mark_all_dquot_dirty(inode->i_dquot); 1668 mark_all_dquot_dirty(inode->i_dquot);
1653 flush_warnings(inode->i_dquot, warntype); 1669 flush_warnings(inode->i_dquot, warntype);
1654 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1670 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1655 return QUOTA_OK;
1656} 1671}
1657EXPORT_SYMBOL(dquot_free_inode); 1672EXPORT_SYMBOL(dquot_free_inode);
1658 1673
@@ -1662,37 +1677,31 @@ EXPORT_SYMBOL(dquot_free_inode);
1662 * This operation can block, but only after everything is updated 1677 * This operation can block, but only after everything is updated
1663 * A transaction must be started when entering this function. 1678 * A transaction must be started when entering this function.
1664 */ 1679 */
1665int dquot_transfer(struct inode *inode, struct iattr *iattr) 1680static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask)
1666{ 1681{
1667 qsize_t space, cur_space; 1682 qsize_t space, cur_space;
1668 qsize_t rsv_space = 0; 1683 qsize_t rsv_space = 0;
1669 struct dquot *transfer_from[MAXQUOTAS]; 1684 struct dquot *transfer_from[MAXQUOTAS];
1670 struct dquot *transfer_to[MAXQUOTAS]; 1685 struct dquot *transfer_to[MAXQUOTAS];
1671 int cnt, ret = QUOTA_OK; 1686 int cnt, ret = 0;
1672 int chuid = iattr->ia_valid & ATTR_UID && inode->i_uid != iattr->ia_uid,
1673 chgid = iattr->ia_valid & ATTR_GID && inode->i_gid != iattr->ia_gid;
1674 char warntype_to[MAXQUOTAS]; 1687 char warntype_to[MAXQUOTAS];
1675 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; 1688 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
1676 1689
1677 /* First test before acquiring mutex - solves deadlocks when we 1690 /* First test before acquiring mutex - solves deadlocks when we
1678 * re-enter the quota code and are already holding the mutex */ 1691 * re-enter the quota code and are already holding the mutex */
1679 if (IS_NOQUOTA(inode)) 1692 if (IS_NOQUOTA(inode))
1680 return QUOTA_OK; 1693 return 0;
1681 /* Initialize the arrays */ 1694 /* Initialize the arrays */
1682 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1695 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1683 transfer_from[cnt] = NULL; 1696 transfer_from[cnt] = NULL;
1684 transfer_to[cnt] = NULL; 1697 transfer_to[cnt] = NULL;
1685 warntype_to[cnt] = QUOTA_NL_NOWARN; 1698 warntype_to[cnt] = QUOTA_NL_NOWARN;
1686 } 1699 }
1687 if (chuid) 1700 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1688 transfer_to[USRQUOTA] = dqget(inode->i_sb, iattr->ia_uid, 1701 if (mask & (1 << cnt))
1689 USRQUOTA); 1702 transfer_to[cnt] = dqget(inode->i_sb, chid[cnt], cnt);
1690 if (chgid) 1703 }
1691 transfer_to[GRPQUOTA] = dqget(inode->i_sb, iattr->ia_gid,
1692 GRPQUOTA);
1693
1694 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1704 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1695 /* Now recheck reliably when holding dqptr_sem */
1696 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ 1705 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
1697 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1706 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1698 goto put_all; 1707 goto put_all;
@@ -1706,9 +1715,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1706 if (!transfer_to[cnt]) 1715 if (!transfer_to[cnt])
1707 continue; 1716 continue;
1708 transfer_from[cnt] = inode->i_dquot[cnt]; 1717 transfer_from[cnt] = inode->i_dquot[cnt];
1709 if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) == 1718 ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt);
1710 NO_QUOTA || check_bdq(transfer_to[cnt], space, 0, 1719 if (ret)
1711 warntype_to + cnt) == NO_QUOTA) 1720 goto over_quota;
1721 ret = check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt);
1722 if (ret)
1712 goto over_quota; 1723 goto over_quota;
1713 } 1724 }
1714 1725
@@ -1762,22 +1773,32 @@ over_quota:
1762 /* Clear dquot pointers we don't want to dqput() */ 1773 /* Clear dquot pointers we don't want to dqput() */
1763 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1774 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1764 transfer_from[cnt] = NULL; 1775 transfer_from[cnt] = NULL;
1765 ret = NO_QUOTA;
1766 goto warn_put_all; 1776 goto warn_put_all;
1767} 1777}
1768EXPORT_SYMBOL(dquot_transfer);
1769 1778
1770/* Wrapper for transferring ownership of an inode */ 1779/* Wrapper for transferring ownership of an inode for uid/gid only
1771int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) 1780 * Called from FSXXX_setattr()
1781 */
1782int dquot_transfer(struct inode *inode, struct iattr *iattr)
1772{ 1783{
1784 qid_t chid[MAXQUOTAS];
1785 unsigned long mask = 0;
1786
1787 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) {
1788 mask |= 1 << USRQUOTA;
1789 chid[USRQUOTA] = iattr->ia_uid;
1790 }
1791 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) {
1792 mask |= 1 << GRPQUOTA;
1793 chid[GRPQUOTA] = iattr->ia_gid;
1794 }
1773 if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { 1795 if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) {
1774 vfs_dq_init(inode); 1796 dquot_initialize(inode);
1775 if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) 1797 return __dquot_transfer(inode, chid, mask);
1776 return 1;
1777 } 1798 }
1778 return 0; 1799 return 0;
1779} 1800}
1780EXPORT_SYMBOL(vfs_dq_transfer); 1801EXPORT_SYMBOL(dquot_transfer);
1781 1802
1782/* 1803/*
1783 * Write info of quota file to disk 1804 * Write info of quota file to disk
@@ -1798,13 +1819,6 @@ EXPORT_SYMBOL(dquot_commit_info);
1798 * Definitions of diskquota operations. 1819 * Definitions of diskquota operations.
1799 */ 1820 */
1800const struct dquot_operations dquot_operations = { 1821const struct dquot_operations dquot_operations = {
1801 .initialize = dquot_initialize,
1802 .drop = dquot_drop,
1803 .alloc_space = dquot_alloc_space,
1804 .alloc_inode = dquot_alloc_inode,
1805 .free_space = dquot_free_space,
1806 .free_inode = dquot_free_inode,
1807 .transfer = dquot_transfer,
1808 .write_dquot = dquot_commit, 1822 .write_dquot = dquot_commit,
1809 .acquire_dquot = dquot_acquire, 1823 .acquire_dquot = dquot_acquire,
1810 .release_dquot = dquot_release, 1824 .release_dquot = dquot_release,
@@ -1815,6 +1829,20 @@ const struct dquot_operations dquot_operations = {
1815}; 1829};
1816 1830
1817/* 1831/*
1832 * Generic helper for ->open on filesystems supporting disk quotas.
1833 */
1834int dquot_file_open(struct inode *inode, struct file *file)
1835{
1836 int error;
1837
1838 error = generic_file_open(inode, file);
1839 if (!error && (file->f_mode & FMODE_WRITE))
1840 dquot_initialize(inode);
1841 return error;
1842}
1843EXPORT_SYMBOL(dquot_file_open);
1844
1845/*
1818 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) 1846 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
1819 */ 1847 */
1820int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags) 1848int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
@@ -1993,11 +2021,13 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
1993 } 2021 }
1994 2022
1995 if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { 2023 if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
1996 /* As we bypass the pagecache we must now flush the inode so 2024 /* As we bypass the pagecache we must now flush all the
1997 * that we see all the changes from userspace... */ 2025 * dirty data and invalidate caches so that kernel sees
1998 write_inode_now(inode, 1); 2026 * changes from userspace. It is not enough to just flush
1999 /* And now flush the block cache so that kernel sees the 2027 * the quota file since if blocksize < pagesize, invalidation
2000 * changes */ 2028 * of the cache could fail because of other unrelated dirty
2029 * data */
2030 sync_filesystem(sb);
2001 invalidate_bdev(sb->s_bdev); 2031 invalidate_bdev(sb->s_bdev);
2002 } 2032 }
2003 mutex_lock(&dqopt->dqonoff_mutex); 2033 mutex_lock(&dqopt->dqonoff_mutex);
@@ -2010,14 +2040,16 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
2010 /* We don't want quota and atime on quota files (deadlocks 2040 /* We don't want quota and atime on quota files (deadlocks
2011 * possible) Also nobody should write to the file - we use 2041 * possible) Also nobody should write to the file - we use
2012 * special IO operations which ignore the immutable bit. */ 2042 * special IO operations which ignore the immutable bit. */
2013 down_write(&dqopt->dqptr_sem);
2014 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); 2043 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2015 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | 2044 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
2016 S_NOQUOTA); 2045 S_NOQUOTA);
2017 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; 2046 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
2018 mutex_unlock(&inode->i_mutex); 2047 mutex_unlock(&inode->i_mutex);
2019 up_write(&dqopt->dqptr_sem); 2048 /*
2020 sb->dq_op->drop(inode); 2049 * When S_NOQUOTA is set, remove dquot references as no more
2050 * references can be added
2051 */
2052 __dquot_drop(inode);
2021 } 2053 }
2022 2054
2023 error = -EIO; 2055 error = -EIO;
@@ -2053,14 +2085,12 @@ out_file_init:
2053 iput(inode); 2085 iput(inode);
2054out_lock: 2086out_lock:
2055 if (oldflags != -1) { 2087 if (oldflags != -1) {
2056 down_write(&dqopt->dqptr_sem);
2057 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); 2088 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2058 /* Set the flags back (in the case of accidental quotaon() 2089 /* Set the flags back (in the case of accidental quotaon()
2059 * on a wrong file we don't want to mess up the flags) */ 2090 * on a wrong file we don't want to mess up the flags) */
2060 inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); 2091 inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
2061 inode->i_flags |= oldflags; 2092 inode->i_flags |= oldflags;
2062 mutex_unlock(&inode->i_mutex); 2093 mutex_unlock(&inode->i_mutex);
2063 up_write(&dqopt->dqptr_sem);
2064 } 2094 }
2065 mutex_unlock(&dqopt->dqonoff_mutex); 2095 mutex_unlock(&dqopt->dqonoff_mutex);
2066out_fmt: 2096
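
The dquot.c hunks above change the generic helpers from the old QUOTA_OK / NO_QUOTA convention to plain 0 / -errno returns (the free helpers become void), fold the uid/gid selection into dquot_transfer() itself, and add the dquot_file_open() ->open helper that the reiserfs hunk further down wires in. A sketch of how a filesystem's setattr path looks under the new convention, for illustration only: myfs_setattr() is a made-up stand-in, not code from this patch, while dquot_initialize(), dquot_transfer() and inode_setattr() are the real helpers of this era.

#include <linux/fs.h>
#include <linux/quotaops.h>

static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	if (attr->ia_valid & (ATTR_UID | ATTR_GID)) {
		/* attach dquots before any accounting happens */
		dquot_initialize(inode);
		error = dquot_transfer(inode, attr);	/* 0 or e.g. -EDQUOT */
		if (error)
			return error;
	}
	return inode_setattr(inode, attr);
}
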
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
new file mode 100644
index 000000000000..2663ed90fb03
--- /dev/null
+++ b/fs/quota/netlink.c
@@ -0,0 +1,95 @@
1
2#include <linux/cred.h>
3#include <linux/init.h>
4#include <linux/module.h>
5#include <linux/kernel.h>
6#include <linux/quotaops.h>
7#include <linux/sched.h>
8#include <net/netlink.h>
9#include <net/genetlink.h>
10
11/* Netlink family structure for quota */
12static struct genl_family quota_genl_family = {
13 .id = GENL_ID_GENERATE,
14 .hdrsize = 0,
15 .name = "VFS_DQUOT",
16 .version = 1,
17 .maxattr = QUOTA_NL_A_MAX,
18};
19
20/**
21 * quota_send_warning - Send warning to userspace about exceeded quota
 22 * @type: The quota type: USRQUOTA, GRPQUOTA,...
23 * @id: The user or group id of the quota that was exceeded
24 * @dev: The device on which the fs is mounted (sb->s_dev)
25 * @warntype: The type of the warning: QUOTA_NL_...
26 *
27 * This can be used by filesystems (including those which don't use
28 * dquot) to send a message to userspace relating to quota limits.
29 *
30 */
31
32void quota_send_warning(short type, unsigned int id, dev_t dev,
33 const char warntype)
34{
35 static atomic_t seq;
36 struct sk_buff *skb;
37 void *msg_head;
38 int ret;
39 int msg_size = 4 * nla_total_size(sizeof(u32)) +
40 2 * nla_total_size(sizeof(u64));
41
42 /* We have to allocate using GFP_NOFS as we are called from a
43 * filesystem performing write and thus further recursion into
44 * the fs to free some data could cause deadlocks. */
45 skb = genlmsg_new(msg_size, GFP_NOFS);
46 if (!skb) {
47 printk(KERN_ERR
48 "VFS: Not enough memory to send quota warning.\n");
49 return;
50 }
51 msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
52 &quota_genl_family, 0, QUOTA_NL_C_WARNING);
53 if (!msg_head) {
54 printk(KERN_ERR
55 "VFS: Cannot store netlink header in quota warning.\n");
56 goto err_out;
57 }
58 ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type);
59 if (ret)
60 goto attr_err_out;
61 ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id);
62 if (ret)
63 goto attr_err_out;
64 ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
65 if (ret)
66 goto attr_err_out;
67 ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev));
68 if (ret)
69 goto attr_err_out;
70 ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
71 if (ret)
72 goto attr_err_out;
73 ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
74 if (ret)
75 goto attr_err_out;
76 genlmsg_end(skb, msg_head);
77
78 genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
79 return;
80attr_err_out:
81 printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
82err_out:
83 kfree_skb(skb);
84}
85EXPORT_SYMBOL(quota_send_warning);
86
87static int __init quota_init(void)
88{
89 if (genl_register_family(&quota_genl_family) != 0)
90 printk(KERN_ERR
91 "VFS: Failed to create quota netlink interface.\n");
92 return 0;
93};
94
95module_init(quota_init);
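
quota_send_warning() is exported precisely so that filesystems doing their own accounting (rather than using dquot) can still raise the standard netlink warning, as the docstring above notes. An illustrative caller follows; myfs_over_block_softlimit() is a made-up placeholder for fs-private accounting, while USRQUOTA and QUOTA_NL_BSOFTWARN are the standard constants from <linux/quota.h>.

#include <linux/quota.h>
#include <linux/quotaops.h>

static void myfs_check_limits(struct inode *inode)
{
	/* placeholder predicate -- stands in for fs-private accounting */
	if (myfs_over_block_softlimit(inode))
		quota_send_warning(USRQUOTA, inode->i_uid,
				   inode->i_sb->s_dev, QUOTA_NL_BSOFTWARN);
}
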
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index ee91e2756950..95388f9b7356 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -10,7 +10,6 @@
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <asm/current.h> 11#include <asm/current.h>
12#include <asm/uaccess.h> 12#include <asm/uaccess.h>
13#include <linux/compat.h>
14#include <linux/kernel.h> 13#include <linux/kernel.h>
15#include <linux/security.h> 14#include <linux/security.h>
16#include <linux/syscalls.h> 15#include <linux/syscalls.h>
@@ -18,220 +17,205 @@
18#include <linux/capability.h> 17#include <linux/capability.h>
19#include <linux/quotaops.h> 18#include <linux/quotaops.h>
20#include <linux/types.h> 19#include <linux/types.h>
21#include <net/netlink.h> 20#include <linux/writeback.h>
22#include <net/genetlink.h>
23 21
24/* Check validity of generic quotactl commands */ 22static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
25static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, 23 qid_t id)
26 qid_t id)
27{ 24{
28 if (type >= MAXQUOTAS)
29 return -EINVAL;
30 if (!sb && cmd != Q_SYNC)
31 return -ENODEV;
32 /* Is operation supported? */
33 if (sb && !sb->s_qcop)
34 return -ENOSYS;
35
36 switch (cmd) { 25 switch (cmd) {
 37 case Q_GETFMT: 26 /* these commands do not require any special privileges */
38 break; 27 case Q_GETFMT:
39 case Q_QUOTAON: 28 case Q_SYNC:
40 if (!sb->s_qcop->quota_on) 29 case Q_GETINFO:
41 return -ENOSYS; 30 case Q_XGETQSTAT:
42 break; 31 case Q_XQUOTASYNC:
43 case Q_QUOTAOFF: 32 break;
44 if (!sb->s_qcop->quota_off) 33 /* allow to query information for dquots we "own" */
45 return -ENOSYS; 34 case Q_GETQUOTA:
46 break; 35 case Q_XGETQUOTA:
47 case Q_SETINFO: 36 if ((type == USRQUOTA && current_euid() == id) ||
48 if (!sb->s_qcop->set_info) 37 (type == GRPQUOTA && in_egroup_p(id)))
49 return -ENOSYS;
50 break;
51 case Q_GETINFO:
52 if (!sb->s_qcop->get_info)
53 return -ENOSYS;
54 break;
55 case Q_SETQUOTA:
56 if (!sb->s_qcop->set_dqblk)
57 return -ENOSYS;
58 break;
59 case Q_GETQUOTA:
60 if (!sb->s_qcop->get_dqblk)
61 return -ENOSYS;
62 break;
63 case Q_SYNC:
64 if (sb && !sb->s_qcop->quota_sync)
65 return -ENOSYS;
66 break; 38 break;
67 default: 39 /*FALLTHROUGH*/
68 return -EINVAL; 40 default:
41 if (!capable(CAP_SYS_ADMIN))
42 return -EPERM;
69 } 43 }
70 44
71 /* Is quota turned on for commands which need it? */ 45 return security_quotactl(cmd, type, id, sb);
72 switch (cmd) { 46}
73 case Q_GETFMT:
74 case Q_GETINFO:
75 case Q_SETINFO:
76 case Q_SETQUOTA:
77 case Q_GETQUOTA:
78 /* This is just an informative test so we are satisfied
79 * without the lock */
80 if (!sb_has_quota_active(sb, type))
81 return -ESRCH;
82 }
83 47
84 /* Check privileges */ 48static int quota_sync_all(int type)
85 if (cmd == Q_GETQUOTA) { 49{
86 if (((type == USRQUOTA && current_euid() != id) || 50 struct super_block *sb;
87 (type == GRPQUOTA && !in_egroup_p(id))) && 51 int ret;
88 !capable(CAP_SYS_ADMIN)) 52
89 return -EPERM; 53 if (type >= MAXQUOTAS)
54 return -EINVAL;
55 ret = security_quotactl(Q_SYNC, type, 0, NULL);
56 if (ret)
57 return ret;
58
59 spin_lock(&sb_lock);
60restart:
61 list_for_each_entry(sb, &super_blocks, s_list) {
62 if (!sb->s_qcop || !sb->s_qcop->quota_sync)
63 continue;
64
65 sb->s_count++;
66 spin_unlock(&sb_lock);
67 down_read(&sb->s_umount);
68 if (sb->s_root)
69 sb->s_qcop->quota_sync(sb, type, 1);
70 up_read(&sb->s_umount);
71 spin_lock(&sb_lock);
72 if (__put_super_and_need_restart(sb))
73 goto restart;
90 } 74 }
91 else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO) 75 spin_unlock(&sb_lock);
92 if (!capable(CAP_SYS_ADMIN))
93 return -EPERM;
94 76
95 return 0; 77 return 0;
96} 78}
97 79
98/* Check validity of XFS Quota Manager commands */ 80static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
99static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, 81 void __user *addr)
100 qid_t id)
101{ 82{
102 if (type >= XQM_MAXQUOTAS) 83 char *pathname;
103 return -EINVAL; 84 int ret = -ENOSYS;
104 if (!sb) 85
105 return -ENODEV; 86 pathname = getname(addr);
106 if (!sb->s_qcop) 87 if (IS_ERR(pathname))
107 return -ENOSYS; 88 return PTR_ERR(pathname);
89 if (sb->s_qcop->quota_on)
90 ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0);
91 putname(pathname);
92 return ret;
93}
108 94
109 switch (cmd) { 95static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
110 case Q_XQUOTAON: 96{
111 case Q_XQUOTAOFF: 97 __u32 fmt;
112 case Q_XQUOTARM:
113 if (!sb->s_qcop->set_xstate)
114 return -ENOSYS;
115 break;
116 case Q_XGETQSTAT:
117 if (!sb->s_qcop->get_xstate)
118 return -ENOSYS;
119 break;
120 case Q_XSETQLIM:
121 if (!sb->s_qcop->set_xquota)
122 return -ENOSYS;
123 break;
124 case Q_XGETQUOTA:
125 if (!sb->s_qcop->get_xquota)
126 return -ENOSYS;
127 break;
128 case Q_XQUOTASYNC:
129 if (!sb->s_qcop->quota_sync)
130 return -ENOSYS;
131 break;
132 default:
133 return -EINVAL;
134 }
135 98
136 /* Check privileges */ 99 down_read(&sb_dqopt(sb)->dqptr_sem);
137 if (cmd == Q_XGETQUOTA) { 100 if (!sb_has_quota_active(sb, type)) {
138 if (((type == XQM_USRQUOTA && current_euid() != id) || 101 up_read(&sb_dqopt(sb)->dqptr_sem);
139 (type == XQM_GRPQUOTA && !in_egroup_p(id))) && 102 return -ESRCH;
140 !capable(CAP_SYS_ADMIN))
141 return -EPERM;
142 } else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) {
143 if (!capable(CAP_SYS_ADMIN))
144 return -EPERM;
145 } 103 }
104 fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id;
105 up_read(&sb_dqopt(sb)->dqptr_sem);
106 if (copy_to_user(addr, &fmt, sizeof(fmt)))
107 return -EFAULT;
108 return 0;
109}
146 110
111static int quota_getinfo(struct super_block *sb, int type, void __user *addr)
112{
113 struct if_dqinfo info;
114 int ret;
115
116 if (!sb_has_quota_active(sb, type))
117 return -ESRCH;
118 if (!sb->s_qcop->get_info)
119 return -ENOSYS;
120 ret = sb->s_qcop->get_info(sb, type, &info);
121 if (!ret && copy_to_user(addr, &info, sizeof(info)))
122 return -EFAULT;
123 return ret;
124}
125
126static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
127{
128 struct if_dqinfo info;
129
130 if (copy_from_user(&info, addr, sizeof(info)))
131 return -EFAULT;
132 if (!sb_has_quota_active(sb, type))
133 return -ESRCH;
134 if (!sb->s_qcop->set_info)
135 return -ENOSYS;
136 return sb->s_qcop->set_info(sb, type, &info);
137}
138
139static int quota_getquota(struct super_block *sb, int type, qid_t id,
140 void __user *addr)
141{
142 struct if_dqblk idq;
143 int ret;
144
145 if (!sb_has_quota_active(sb, type))
146 return -ESRCH;
147 if (!sb->s_qcop->get_dqblk)
148 return -ENOSYS;
149 ret = sb->s_qcop->get_dqblk(sb, type, id, &idq);
150 if (ret)
151 return ret;
152 if (copy_to_user(addr, &idq, sizeof(idq)))
153 return -EFAULT;
147 return 0; 154 return 0;
148} 155}
149 156
150static int check_quotactl_valid(struct super_block *sb, int type, int cmd, 157static int quota_setquota(struct super_block *sb, int type, qid_t id,
151 qid_t id) 158 void __user *addr)
152{ 159{
153 int error; 160 struct if_dqblk idq;
154 161
155 if (XQM_COMMAND(cmd)) 162 if (copy_from_user(&idq, addr, sizeof(idq)))
156 error = xqm_quotactl_valid(sb, type, cmd, id); 163 return -EFAULT;
157 else 164 if (!sb_has_quota_active(sb, type))
158 error = generic_quotactl_valid(sb, type, cmd, id); 165 return -ESRCH;
159 if (!error) 166 if (!sb->s_qcop->set_dqblk)
160 error = security_quotactl(cmd, type, id, sb); 167 return -ENOSYS;
161 return error; 168 return sb->s_qcop->set_dqblk(sb, type, id, &idq);
162} 169}
163 170
164#ifdef CONFIG_QUOTA 171static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr)
165void sync_quota_sb(struct super_block *sb, int type)
166{ 172{
167 int cnt; 173 __u32 flags;
168 174
169 if (!sb->s_qcop->quota_sync) 175 if (copy_from_user(&flags, addr, sizeof(flags)))
170 return; 176 return -EFAULT;
177 if (!sb->s_qcop->set_xstate)
178 return -ENOSYS;
179 return sb->s_qcop->set_xstate(sb, flags, cmd);
180}
171 181
172 sb->s_qcop->quota_sync(sb, type); 182static int quota_getxstate(struct super_block *sb, void __user *addr)
183{
184 struct fs_quota_stat fqs;
185 int ret;
173 186
174 if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE) 187 if (!sb->s_qcop->get_xstate)
175 return; 188 return -ENOSYS;
176 /* This is not very clever (and fast) but currently I don't know about 189 ret = sb->s_qcop->get_xstate(sb, &fqs);
177 * any other simple way of getting quota data to disk and we must get 190 if (!ret && copy_to_user(addr, &fqs, sizeof(fqs)))
178 * them there for userspace to be visible... */ 191 return -EFAULT;
179 if (sb->s_op->sync_fs) 192 return ret;
180 sb->s_op->sync_fs(sb, 1); 193}
181 sync_blockdev(sb->s_bdev);
182 194
183 /* 195static int quota_setxquota(struct super_block *sb, int type, qid_t id,
184 * Now when everything is written we can discard the pagecache so 196 void __user *addr)
185 * that userspace sees the changes. 197{
186 */ 198 struct fs_disk_quota fdq;
187 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 199
188 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 200 if (copy_from_user(&fdq, addr, sizeof(fdq)))
189 if (type != -1 && cnt != type) 201 return -EFAULT;
190 continue; 202 if (!sb->s_qcop->set_xquota)
191 if (!sb_has_quota_active(sb, cnt)) 203 return -ENOSYS;
192 continue; 204 return sb->s_qcop->set_xquota(sb, type, id, &fdq);
193 mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex,
194 I_MUTEX_QUOTA);
195 truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
196 mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex);
197 }
198 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
199} 205}
200#endif
201 206
202static void sync_dquots(int type) 207static int quota_getxquota(struct super_block *sb, int type, qid_t id,
208 void __user *addr)
203{ 209{
204 struct super_block *sb; 210 struct fs_disk_quota fdq;
205 int cnt; 211 int ret;
206 212
207 spin_lock(&sb_lock); 213 if (!sb->s_qcop->get_xquota)
208restart: 214 return -ENOSYS;
209 list_for_each_entry(sb, &super_blocks, s_list) { 215 ret = sb->s_qcop->get_xquota(sb, type, id, &fdq);
210 /* This test just improves performance so it needn't be 216 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq)))
211 * reliable... */ 217 return -EFAULT;
212 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 218 return ret;
213 if (type != -1 && type != cnt)
214 continue;
215 if (!sb_has_quota_active(sb, cnt))
216 continue;
217 if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
218 list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
219 continue;
220 break;
221 }
222 if (cnt == MAXQUOTAS)
223 continue;
224 sb->s_count++;
225 spin_unlock(&sb_lock);
226 down_read(&sb->s_umount);
227 if (sb->s_root)
228 sync_quota_sb(sb, type);
229 up_read(&sb->s_umount);
230 spin_lock(&sb_lock);
231 if (__put_super_and_need_restart(sb))
232 goto restart;
233 }
234 spin_unlock(&sb_lock);
235} 219}
236 220
237/* Copy parameters and call proper function */ 221/* Copy parameters and call proper function */
@@ -240,117 +224,55 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
240{ 224{
241 int ret; 225 int ret;
242 226
227 if (type >= (XQM_COMMAND(cmd) ? XQM_MAXQUOTAS : MAXQUOTAS))
228 return -EINVAL;
229 if (!sb->s_qcop)
230 return -ENOSYS;
231
232 ret = check_quotactl_permission(sb, type, cmd, id);
233 if (ret < 0)
234 return ret;
235
243 switch (cmd) { 236 switch (cmd) {
244 case Q_QUOTAON: { 237 case Q_QUOTAON:
245 char *pathname; 238 return quota_quotaon(sb, type, cmd, id, addr);
246 239 case Q_QUOTAOFF:
247 pathname = getname(addr); 240 if (!sb->s_qcop->quota_off)
248 if (IS_ERR(pathname)) 241 return -ENOSYS;
249 return PTR_ERR(pathname); 242 return sb->s_qcop->quota_off(sb, type, 0);
250 ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); 243 case Q_GETFMT:
251 putname(pathname); 244 return quota_getfmt(sb, type, addr);
252 return ret; 245 case Q_GETINFO:
253 } 246 return quota_getinfo(sb, type, addr);
254 case Q_QUOTAOFF: 247 case Q_SETINFO:
255 return sb->s_qcop->quota_off(sb, type, 0); 248 return quota_setinfo(sb, type, addr);
256 249 case Q_GETQUOTA:
257 case Q_GETFMT: { 250 return quota_getquota(sb, type, id, addr);
258 __u32 fmt; 251 case Q_SETQUOTA:
259 252 return quota_setquota(sb, type, id, addr);
260 down_read(&sb_dqopt(sb)->dqptr_sem); 253 case Q_SYNC:
261 if (!sb_has_quota_active(sb, type)) { 254 if (!sb->s_qcop->quota_sync)
262 up_read(&sb_dqopt(sb)->dqptr_sem); 255 return -ENOSYS;
263 return -ESRCH; 256 return sb->s_qcop->quota_sync(sb, type, 1);
264 } 257 case Q_XQUOTAON:
265 fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; 258 case Q_XQUOTAOFF:
266 up_read(&sb_dqopt(sb)->dqptr_sem); 259 case Q_XQUOTARM:
267 if (copy_to_user(addr, &fmt, sizeof(fmt))) 260 return quota_setxstate(sb, cmd, addr);
268 return -EFAULT; 261 case Q_XGETQSTAT:
269 return 0; 262 return quota_getxstate(sb, addr);
270 } 263 case Q_XSETQLIM:
271 case Q_GETINFO: { 264 return quota_setxquota(sb, type, id, addr);
272 struct if_dqinfo info; 265 case Q_XGETQUOTA:
273 266 return quota_getxquota(sb, type, id, addr);
274 ret = sb->s_qcop->get_info(sb, type, &info); 267 case Q_XQUOTASYNC:
275 if (ret) 268 /* caller already holds s_umount */
276 return ret; 269 if (sb->s_flags & MS_RDONLY)
277 if (copy_to_user(addr, &info, sizeof(info))) 270 return -EROFS;
278 return -EFAULT; 271 writeback_inodes_sb(sb);
279 return 0; 272 return 0;
280 } 273 default:
281 case Q_SETINFO: { 274 return -EINVAL;
282 struct if_dqinfo info;
283
284 if (copy_from_user(&info, addr, sizeof(info)))
285 return -EFAULT;
286 return sb->s_qcop->set_info(sb, type, &info);
287 }
288 case Q_GETQUOTA: {
289 struct if_dqblk idq;
290
291 ret = sb->s_qcop->get_dqblk(sb, type, id, &idq);
292 if (ret)
293 return ret;
294 if (copy_to_user(addr, &idq, sizeof(idq)))
295 return -EFAULT;
296 return 0;
297 }
298 case Q_SETQUOTA: {
299 struct if_dqblk idq;
300
301 if (copy_from_user(&idq, addr, sizeof(idq)))
302 return -EFAULT;
303 return sb->s_qcop->set_dqblk(sb, type, id, &idq);
304 }
305 case Q_SYNC:
306 if (sb)
307 sync_quota_sb(sb, type);
308 else
309 sync_dquots(type);
310 return 0;
311
312 case Q_XQUOTAON:
313 case Q_XQUOTAOFF:
314 case Q_XQUOTARM: {
315 __u32 flags;
316
317 if (copy_from_user(&flags, addr, sizeof(flags)))
318 return -EFAULT;
319 return sb->s_qcop->set_xstate(sb, flags, cmd);
320 }
321 case Q_XGETQSTAT: {
322 struct fs_quota_stat fqs;
323
324 if ((ret = sb->s_qcop->get_xstate(sb, &fqs)))
325 return ret;
326 if (copy_to_user(addr, &fqs, sizeof(fqs)))
327 return -EFAULT;
328 return 0;
329 }
330 case Q_XSETQLIM: {
331 struct fs_disk_quota fdq;
332
333 if (copy_from_user(&fdq, addr, sizeof(fdq)))
334 return -EFAULT;
335 return sb->s_qcop->set_xquota(sb, type, id, &fdq);
336 }
337 case Q_XGETQUOTA: {
338 struct fs_disk_quota fdq;
339
340 ret = sb->s_qcop->get_xquota(sb, type, id, &fdq);
341 if (ret)
342 return ret;
343 if (copy_to_user(addr, &fdq, sizeof(fdq)))
344 return -EFAULT;
345 return 0;
346 }
347 case Q_XQUOTASYNC:
348 return sb->s_qcop->quota_sync(sb, type);
349 /* We never reach here unless validity check is broken */
350 default:
351 BUG();
352 } 275 }
353 return 0;
354} 276}
355 277
356/* 278/*
@@ -397,224 +319,23 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
397 cmds = cmd >> SUBCMDSHIFT; 319 cmds = cmd >> SUBCMDSHIFT;
398 type = cmd & SUBCMDMASK; 320 type = cmd & SUBCMDMASK;
399 321
400 if (cmds != Q_SYNC || special) { 322 /*
401 sb = quotactl_block(special); 323 * As a special case Q_SYNC can be called without a specific device.
402 if (IS_ERR(sb)) 324 * It will iterate all superblocks that have quota enabled and call
403 return PTR_ERR(sb); 325 * the sync action on each of them.
326 */
327 if (!special) {
328 if (cmds == Q_SYNC)
329 return quota_sync_all(type);
330 return -ENODEV;
404 } 331 }
405 332
406 ret = check_quotactl_valid(sb, type, cmds, id); 333 sb = quotactl_block(special);
407 if (ret >= 0) 334 if (IS_ERR(sb))
408 ret = do_quotactl(sb, type, cmds, id, addr); 335 return PTR_ERR(sb);
409 if (sb)
410 drop_super(sb);
411 336
412 return ret; 337 ret = do_quotactl(sb, type, cmds, id, addr);
413}
414
415#if defined(CONFIG_COMPAT_FOR_U64_ALIGNMENT)
416/*
417 * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
418 * and is necessary due to alignment problems.
419 */
420struct compat_if_dqblk {
421 compat_u64 dqb_bhardlimit;
422 compat_u64 dqb_bsoftlimit;
423 compat_u64 dqb_curspace;
424 compat_u64 dqb_ihardlimit;
425 compat_u64 dqb_isoftlimit;
426 compat_u64 dqb_curinodes;
427 compat_u64 dqb_btime;
428 compat_u64 dqb_itime;
429 compat_uint_t dqb_valid;
430};
431
432/* XFS structures */
433struct compat_fs_qfilestat {
434 compat_u64 dqb_bhardlimit;
435 compat_u64 qfs_nblks;
436 compat_uint_t qfs_nextents;
437};
438
439struct compat_fs_quota_stat {
440 __s8 qs_version;
441 __u16 qs_flags;
442 __s8 qs_pad;
443 struct compat_fs_qfilestat qs_uquota;
444 struct compat_fs_qfilestat qs_gquota;
445 compat_uint_t qs_incoredqs;
446 compat_int_t qs_btimelimit;
447 compat_int_t qs_itimelimit;
448 compat_int_t qs_rtbtimelimit;
449 __u16 qs_bwarnlimit;
450 __u16 qs_iwarnlimit;
451};
452
453asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
454 qid_t id, void __user *addr)
455{
456 unsigned int cmds;
457 struct if_dqblk __user *dqblk;
458 struct compat_if_dqblk __user *compat_dqblk;
459 struct fs_quota_stat __user *fsqstat;
460 struct compat_fs_quota_stat __user *compat_fsqstat;
461 compat_uint_t data;
462 u16 xdata;
463 long ret;
464 338
465 cmds = cmd >> SUBCMDSHIFT; 339 drop_super(sb);
466
467 switch (cmds) {
468 case Q_GETQUOTA:
469 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
470 compat_dqblk = addr;
471 ret = sys_quotactl(cmd, special, id, dqblk);
472 if (ret)
473 break;
474 if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
475 get_user(data, &dqblk->dqb_valid) ||
476 put_user(data, &compat_dqblk->dqb_valid))
477 ret = -EFAULT;
478 break;
479 case Q_SETQUOTA:
480 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
481 compat_dqblk = addr;
482 ret = -EFAULT;
483 if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
484 get_user(data, &compat_dqblk->dqb_valid) ||
485 put_user(data, &dqblk->dqb_valid))
486 break;
487 ret = sys_quotactl(cmd, special, id, dqblk);
488 break;
489 case Q_XGETQSTAT:
490 fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
491 compat_fsqstat = addr;
492 ret = sys_quotactl(cmd, special, id, fsqstat);
493 if (ret)
494 break;
495 ret = -EFAULT;
496 /* Copying qs_version, qs_flags, qs_pad */
497 if (copy_in_user(compat_fsqstat, fsqstat,
498 offsetof(struct compat_fs_quota_stat, qs_uquota)))
499 break;
500 /* Copying qs_uquota */
501 if (copy_in_user(&compat_fsqstat->qs_uquota,
502 &fsqstat->qs_uquota,
503 sizeof(compat_fsqstat->qs_uquota)) ||
504 get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
505 put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
506 break;
507 /* Copying qs_gquota */
508 if (copy_in_user(&compat_fsqstat->qs_gquota,
509 &fsqstat->qs_gquota,
510 sizeof(compat_fsqstat->qs_gquota)) ||
511 get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
512 put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
513 break;
514 /* Copying the rest */
515 if (copy_in_user(&compat_fsqstat->qs_incoredqs,
516 &fsqstat->qs_incoredqs,
517 sizeof(struct compat_fs_quota_stat) -
518 offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
519 get_user(xdata, &fsqstat->qs_iwarnlimit) ||
520 put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
521 break;
522 ret = 0;
523 break;
524 default:
525 ret = sys_quotactl(cmd, special, id, addr);
526 }
527 return ret; 340 return ret;
528} 341}
529#endif
530
531
532#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
533
534/* Netlink family structure for quota */
535static struct genl_family quota_genl_family = {
536 .id = GENL_ID_GENERATE,
537 .hdrsize = 0,
538 .name = "VFS_DQUOT",
539 .version = 1,
540 .maxattr = QUOTA_NL_A_MAX,
541};
542
543/**
544 * quota_send_warning - Send warning to userspace about exceeded quota
 545 * @type: The quota type: USRQUOTA, GRPQUOTA,...
546 * @id: The user or group id of the quota that was exceeded
547 * @dev: The device on which the fs is mounted (sb->s_dev)
548 * @warntype: The type of the warning: QUOTA_NL_...
549 *
550 * This can be used by filesystems (including those which don't use
551 * dquot) to send a message to userspace relating to quota limits.
552 *
553 */
554
555void quota_send_warning(short type, unsigned int id, dev_t dev,
556 const char warntype)
557{
558 static atomic_t seq;
559 struct sk_buff *skb;
560 void *msg_head;
561 int ret;
562 int msg_size = 4 * nla_total_size(sizeof(u32)) +
563 2 * nla_total_size(sizeof(u64));
564
565 /* We have to allocate using GFP_NOFS as we are called from a
566 * filesystem performing write and thus further recursion into
567 * the fs to free some data could cause deadlocks. */
568 skb = genlmsg_new(msg_size, GFP_NOFS);
569 if (!skb) {
570 printk(KERN_ERR
571 "VFS: Not enough memory to send quota warning.\n");
572 return;
573 }
574 msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
575 &quota_genl_family, 0, QUOTA_NL_C_WARNING);
576 if (!msg_head) {
577 printk(KERN_ERR
578 "VFS: Cannot store netlink header in quota warning.\n");
579 goto err_out;
580 }
581 ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type);
582 if (ret)
583 goto attr_err_out;
584 ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id);
585 if (ret)
586 goto attr_err_out;
587 ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
588 if (ret)
589 goto attr_err_out;
590 ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev));
591 if (ret)
592 goto attr_err_out;
593 ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
594 if (ret)
595 goto attr_err_out;
596 ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
597 if (ret)
598 goto attr_err_out;
599 genlmsg_end(skb, msg_head);
600
601 genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
602 return;
603attr_err_out:
604 printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
605err_out:
606 kfree_skb(skb);
607}
608EXPORT_SYMBOL(quota_send_warning);
609
610static int __init quota_init(void)
611{
612 if (genl_register_family(&quota_genl_family) != 0)
613 printk(KERN_ERR
614 "VFS: Failed to create quota netlink interface.\n");
615 return 0;
616};
617
618module_init(quota_init);
619#endif
620
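
The quotactl() userspace ABI is untouched by this restructuring: do_quotactl() now only dispatches to small per-command helpers after one common permission check, and Q_SYNC with a NULL device walks all superblocks via quota_sync_all(). For reference, a minimal userspace query still looks like this, per quotactl(2); the /dev/sda1 path is only an example and error handling is trimmed.

#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/quota.h>

int main(void)
{
	struct dqblk dq;

	if (quotactl(QCMD(Q_GETQUOTA, USRQUOTA), "/dev/sda1",
		     getuid(), (caddr_t)&dq) == 0)
		printf("space used: %llu KiB\n",
		       (unsigned long long)dq.dqb_curspace / 1024);
	return 0;
}
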
diff --git a/fs/read_write.c b/fs/read_write.c
index b7f4a1f94d48..113386d6fd2d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -258,6 +258,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
258 init_sync_kiocb(&kiocb, filp); 258 init_sync_kiocb(&kiocb, filp);
259 kiocb.ki_pos = *ppos; 259 kiocb.ki_pos = *ppos;
260 kiocb.ki_left = len; 260 kiocb.ki_left = len;
261 kiocb.ki_nbytes = len;
261 262
262 for (;;) { 263 for (;;) {
263 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); 264 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
@@ -313,6 +314,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
313 init_sync_kiocb(&kiocb, filp); 314 init_sync_kiocb(&kiocb, filp);
314 kiocb.ki_pos = *ppos; 315 kiocb.ki_pos = *ppos;
315 kiocb.ki_left = len; 316 kiocb.ki_left = len;
317 kiocb.ki_nbytes = len;
316 318
317 for (;;) { 319 for (;;) {
318 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); 320 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 65c872761177..483442e66ed6 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -169,7 +169,7 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
169 return 0; // No free blocks in this bitmap 169 return 0; // No free blocks in this bitmap
170 } 170 }
171 171
172 /* search for a first zero bit -- beggining of a window */ 172 /* search for a first zero bit -- beginning of a window */
173 *beg = reiserfs_find_next_zero_le_bit 173 *beg = reiserfs_find_next_zero_le_bit
174 ((unsigned long *)(bh->b_data), boundary, *beg); 174 ((unsigned long *)(bh->b_data), boundary, *beg);
175 175
@@ -425,7 +425,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
425 425
426 journal_mark_dirty(th, s, sbh); 426 journal_mark_dirty(th, s, sbh);
427 if (for_unformatted) 427 if (for_unformatted)
428 vfs_dq_free_block_nodirty(inode, 1); 428 dquot_free_block_nodirty(inode, 1);
429} 429}
430 430
431void reiserfs_free_block(struct reiserfs_transaction_handle *th, 431void reiserfs_free_block(struct reiserfs_transaction_handle *th,
@@ -1049,7 +1049,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1049 amount_needed, hint->inode->i_uid); 1049 amount_needed, hint->inode->i_uid);
1050#endif 1050#endif
1051 quota_ret = 1051 quota_ret =
1052 vfs_dq_alloc_block_nodirty(hint->inode, amount_needed); 1052 dquot_alloc_block_nodirty(hint->inode, amount_needed);
1053 if (quota_ret) /* Quota exceeded? */ 1053 if (quota_ret) /* Quota exceeded? */
1054 return QUOTA_EXCEEDED; 1054 return QUOTA_EXCEEDED;
1055 if (hint->preallocate && hint->prealloc_size) { 1055 if (hint->preallocate && hint->prealloc_size) {
@@ -1058,7 +1058,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1058 "reiserquota: allocating (prealloc) %d blocks id=%u", 1058 "reiserquota: allocating (prealloc) %d blocks id=%u",
1059 hint->prealloc_size, hint->inode->i_uid); 1059 hint->prealloc_size, hint->inode->i_uid);
1060#endif 1060#endif
1061 quota_ret = vfs_dq_prealloc_block_nodirty(hint->inode, 1061 quota_ret = dquot_prealloc_block_nodirty(hint->inode,
1062 hint->prealloc_size); 1062 hint->prealloc_size);
1063 if (quota_ret) 1063 if (quota_ret)
1064 hint->preallocate = hint->prealloc_size = 0; 1064 hint->preallocate = hint->prealloc_size = 0;
@@ -1092,7 +1092,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1092 hint->inode->i_uid); 1092 hint->inode->i_uid);
1093#endif 1093#endif
1094 /* Free not allocated blocks */ 1094 /* Free not allocated blocks */
1095 vfs_dq_free_block_nodirty(hint->inode, 1095 dquot_free_block_nodirty(hint->inode,
1096 amount_needed + hint->prealloc_size - 1096 amount_needed + hint->prealloc_size -
1097 nr_allocated); 1097 nr_allocated);
1098 } 1098 }
@@ -1125,7 +1125,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1125 REISERFS_I(hint->inode)->i_prealloc_count, 1125 REISERFS_I(hint->inode)->i_prealloc_count,
1126 hint->inode->i_uid); 1126 hint->inode->i_uid);
1127#endif 1127#endif
1128 vfs_dq_free_block_nodirty(hint->inode, amount_needed + 1128 dquot_free_block_nodirty(hint->inode, amount_needed +
1129 hint->prealloc_size - nr_allocated - 1129 hint->prealloc_size - nr_allocated -
1130 REISERFS_I(hint->inode)-> 1130 REISERFS_I(hint->inode)->
1131 i_prealloc_count); 1131 i_prealloc_count);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index da2dba082e2d..1d9c12714c5c 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -289,7 +289,7 @@ const struct file_operations reiserfs_file_operations = {
289 .compat_ioctl = reiserfs_compat_ioctl, 289 .compat_ioctl = reiserfs_compat_ioctl,
290#endif 290#endif
291 .mmap = reiserfs_file_mmap, 291 .mmap = reiserfs_file_mmap,
292 .open = generic_file_open, 292 .open = dquot_file_open,
293 .release = reiserfs_file_release, 293 .release = reiserfs_file_release,
294 .fsync = reiserfs_sync_file, 294 .fsync = reiserfs_sync_file,
295 .aio_read = generic_file_aio_read, 295 .aio_read = generic_file_aio_read,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 2df0f5c7c60b..d1da94b82d8f 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -34,6 +34,9 @@ void reiserfs_delete_inode(struct inode *inode)
34 int depth; 34 int depth;
35 int err; 35 int err;
36 36
37 if (!is_bad_inode(inode))
38 dquot_initialize(inode);
39
37 truncate_inode_pages(&inode->i_data, 0); 40 truncate_inode_pages(&inode->i_data, 0);
38 41
39 depth = reiserfs_write_lock_once(inode->i_sb); 42 depth = reiserfs_write_lock_once(inode->i_sb);
@@ -54,7 +57,7 @@ void reiserfs_delete_inode(struct inode *inode)
54 * after delete_object so that quota updates go into the same transaction as 57 * after delete_object so that quota updates go into the same transaction as
55 * stat data deletion */ 58 * stat data deletion */
56 if (!err) 59 if (!err)
57 vfs_dq_free_inode(inode); 60 dquot_free_inode(inode);
58 61
59 if (journal_end(&th, inode->i_sb, jbegin_count)) 62 if (journal_end(&th, inode->i_sb, jbegin_count))
60 goto out; 63 goto out;
@@ -1615,7 +1618,7 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
1615** to properly mark inodes for datasync and such, but only actually 1618** to properly mark inodes for datasync and such, but only actually
1616** does something when called for a synchronous update. 1619** does something when called for a synchronous update.
1617*/ 1620*/
1618int reiserfs_write_inode(struct inode *inode, int do_sync) 1621int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1619{ 1622{
1620 struct reiserfs_transaction_handle th; 1623 struct reiserfs_transaction_handle th;
1621 int jbegin_count = 1; 1624 int jbegin_count = 1;
@@ -1627,7 +1630,7 @@ int reiserfs_write_inode(struct inode *inode, int do_sync)
1627 ** inode needs to reach disk for safety, and they can safely be 1630 ** inode needs to reach disk for safety, and they can safely be
1628 ** ignored because the altered inode has already been logged. 1631 ** ignored because the altered inode has already been logged.
1629 */ 1632 */
1630 if (do_sync && !(current->flags & PF_MEMALLOC)) { 1633 if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) {
1631 reiserfs_write_lock(inode->i_sb); 1634 reiserfs_write_lock(inode->i_sb);
1632 if (!journal_begin(&th, inode->i_sb, jbegin_count)) { 1635 if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
1633 reiserfs_update_sd(&th, inode); 1636 reiserfs_update_sd(&th, inode);
@@ -1765,10 +1768,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1765 1768
1766 BUG_ON(!th->t_trans_id); 1769 BUG_ON(!th->t_trans_id);
1767 1770
1768 if (vfs_dq_alloc_inode(inode)) { 1771 dquot_initialize(inode);
1769 err = -EDQUOT; 1772 err = dquot_alloc_inode(inode);
1773 if (err)
1770 goto out_end_trans; 1774 goto out_end_trans;
1771 }
1772 if (!dir->i_nlink) { 1775 if (!dir->i_nlink) {
1773 err = -EPERM; 1776 err = -EPERM;
1774 goto out_bad_inode; 1777 goto out_bad_inode;
@@ -1959,12 +1962,12 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1959 INODE_PKEY(inode)->k_objectid = 0; 1962 INODE_PKEY(inode)->k_objectid = 0;
1960 1963
1961 /* Quota change must be inside a transaction for journaling */ 1964 /* Quota change must be inside a transaction for journaling */
1962 vfs_dq_free_inode(inode); 1965 dquot_free_inode(inode);
1963 1966
1964 out_end_trans: 1967 out_end_trans:
1965 journal_end(th, th->t_super, th->t_blocks_allocated); 1968 journal_end(th, th->t_super, th->t_blocks_allocated);
1966 /* Drop can be outside and it needs more credits so it's better to have it outside */ 1969 /* Drop can be outside and it needs more credits so it's better to have it outside */
1967 vfs_dq_drop(inode); 1970 dquot_drop(inode);
1968 inode->i_flags |= S_NOQUOTA; 1971 inode->i_flags |= S_NOQUOTA;
1969 make_bad_inode(inode); 1972 make_bad_inode(inode);
1970 1973
@@ -3073,6 +3076,8 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3073 3076
3074 depth = reiserfs_write_lock_once(inode->i_sb); 3077 depth = reiserfs_write_lock_once(inode->i_sb);
3075 if (attr->ia_valid & ATTR_SIZE) { 3078 if (attr->ia_valid & ATTR_SIZE) {
3079 dquot_initialize(inode);
3080
3076 /* version 2 items will be caught by the s_maxbytes check 3081 /* version 2 items will be caught by the s_maxbytes check
3077 ** done for us in vmtruncate 3082 ** done for us in vmtruncate
3078 */ 3083 */
@@ -3134,8 +3139,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3134 jbegin_count); 3139 jbegin_count);
3135 if (error) 3140 if (error)
3136 goto out; 3141 goto out;
3137 error = 3142 error = dquot_transfer(inode, attr);
3138 vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
3139 if (error) { 3143 if (error) {
3140 journal_end(&th, inode->i_sb, 3144 journal_end(&th, inode->i_sb,
3141 jbegin_count); 3145 jbegin_count);
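
Besides the quota conversion, the inode.c hunk also tracks the VFS change to the ->write_inode prototype: the do_sync flag is replaced by a struct writeback_control, and implementations test wbc->sync_mode instead. As a sketch of the new shape (myfs_write_inode() and myfs_sync_stat_data() are illustrative placeholders, not reiserfs code):

#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/writeback.h>

static int myfs_write_inode(struct inode *inode,
			    struct writeback_control *wbc)
{
	/* flush only for synchronous writeback, and never when called
	 * from the allocator's reclaim path */
	if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC))
		return myfs_sync_stat_data(inode);	/* placeholder */
	return 0;
}
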
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index ba98546fabbd..f3de5e8a2ae8 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2217,6 +2217,15 @@ static int journal_read_transaction(struct super_block *sb,
2217 brelse(d_bh); 2217 brelse(d_bh);
2218 return 1; 2218 return 1;
2219 } 2219 }
2220
2221 if (bdev_read_only(sb->s_bdev)) {
2222 reiserfs_warning(sb, "clm-2076",
2223 "device is readonly, unable to replay log");
2224 brelse(c_bh);
2225 brelse(d_bh);
2226 return -EROFS;
2227 }
2228
2220 trans_id = get_desc_trans_id(desc); 2229 trans_id = get_desc_trans_id(desc);
2221 /* now we know we've got a good transaction, and it was inside the valid time ranges */ 2230 /* now we know we've got a good transaction, and it was inside the valid time ranges */
2222 log_blocks = kmalloc(get_desc_trans_len(desc) * 2231 log_blocks = kmalloc(get_desc_trans_len(desc) *
@@ -2459,12 +2468,6 @@ static int journal_read(struct super_block *sb)
2459 goto start_log_replay; 2468 goto start_log_replay;
2460 } 2469 }
2461 2470
2462 if (continue_replay && bdev_read_only(sb->s_bdev)) {
2463 reiserfs_warning(sb, "clm-2076",
2464 "device is readonly, unable to replay log");
2465 return -1;
2466 }
2467
2468 /* ok, there are transactions that need to be replayed. start with the first log block, find 2471 /* ok, there are transactions that need to be replayed. start with the first log block, find
2469 ** all the valid transactions, and pick out the oldest. 2472 ** all the valid transactions, and pick out the oldest.
2470 */ 2473 */
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 9d4dcf0b07cb..96e4cbbfaa18 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -546,7 +546,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
546*/ 546*/
547static int drop_new_inode(struct inode *inode) 547static int drop_new_inode(struct inode *inode)
548{ 548{
549 vfs_dq_drop(inode); 549 dquot_drop(inode);
550 make_bad_inode(inode); 550 make_bad_inode(inode);
551 inode->i_flags |= S_NOQUOTA; 551 inode->i_flags |= S_NOQUOTA;
552 iput(inode); 552 iput(inode);
@@ -554,7 +554,7 @@ static int drop_new_inode(struct inode *inode)
554} 554}
555 555
556/* utility function that does setup for reiserfs_new_inode. 556/* utility function that does setup for reiserfs_new_inode.
557** vfs_dq_init needs lots of credits so it's better to have it 557** dquot_initialize needs lots of credits so it's better to have it
558** outside of a transaction, so we had to pull some bits of 558** outside of a transaction, so we had to pull some bits of
559** reiserfs_new_inode out into this func. 559** reiserfs_new_inode out into this func.
560*/ 560*/
@@ -577,7 +577,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
577 } else { 577 } else {
578 inode->i_gid = current_fsgid(); 578 inode->i_gid = current_fsgid();
579 } 579 }
580 vfs_dq_init(inode); 580 dquot_initialize(inode);
581 return 0; 581 return 0;
582} 582}
583 583
@@ -594,6 +594,8 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
594 struct reiserfs_transaction_handle th; 594 struct reiserfs_transaction_handle th;
595 struct reiserfs_security_handle security; 595 struct reiserfs_security_handle security;
596 596
597 dquot_initialize(dir);
598
597 if (!(inode = new_inode(dir->i_sb))) { 599 if (!(inode = new_inode(dir->i_sb))) {
598 return -ENOMEM; 600 return -ENOMEM;
599 } 601 }
@@ -666,6 +668,8 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
666 if (!new_valid_dev(rdev)) 668 if (!new_valid_dev(rdev))
667 return -EINVAL; 669 return -EINVAL;
668 670
671 dquot_initialize(dir);
672
669 if (!(inode = new_inode(dir->i_sb))) { 673 if (!(inode = new_inode(dir->i_sb))) {
670 return -ENOMEM; 674 return -ENOMEM;
671 } 675 }
@@ -739,6 +743,8 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
739 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + 743 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
740 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); 744 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
741 745
746 dquot_initialize(dir);
747
742#ifdef DISPLACE_NEW_PACKING_LOCALITIES 748#ifdef DISPLACE_NEW_PACKING_LOCALITIES
743 /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ 749 /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */
744 REISERFS_I(dir)->new_packing_locality = 1; 750 REISERFS_I(dir)->new_packing_locality = 1;
@@ -842,6 +848,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
842 JOURNAL_PER_BALANCE_CNT * 2 + 2 + 848 JOURNAL_PER_BALANCE_CNT * 2 + 2 +
843 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); 849 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
844 850
851 dquot_initialize(dir);
852
845 reiserfs_write_lock(dir->i_sb); 853 reiserfs_write_lock(dir->i_sb);
846 retval = journal_begin(&th, dir->i_sb, jbegin_count); 854 retval = journal_begin(&th, dir->i_sb, jbegin_count);
847 if (retval) 855 if (retval)
@@ -923,6 +931,8 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
923 unsigned long savelink; 931 unsigned long savelink;
924 int depth; 932 int depth;
925 933
934 dquot_initialize(dir);
935
926 inode = dentry->d_inode; 936 inode = dentry->d_inode;
927 937
928 /* in this transaction we can be doing at max two balancings and update 938 /* in this transaction we can be doing at max two balancings and update
@@ -1024,6 +1034,8 @@ static int reiserfs_symlink(struct inode *parent_dir,
1024 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + 1034 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) +
1025 REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb)); 1035 REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb));
1026 1036
1037 dquot_initialize(parent_dir);
1038
1027 if (!(inode = new_inode(parent_dir->i_sb))) { 1039 if (!(inode = new_inode(parent_dir->i_sb))) {
1028 return -ENOMEM; 1040 return -ENOMEM;
1029 } 1041 }
@@ -1111,6 +1123,8 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
1111 JOURNAL_PER_BALANCE_CNT * 3 + 1123 JOURNAL_PER_BALANCE_CNT * 3 +
1112 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); 1124 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
1113 1125
1126 dquot_initialize(dir);
1127
1114 reiserfs_write_lock(dir->i_sb); 1128 reiserfs_write_lock(dir->i_sb);
1115 if (inode->i_nlink >= REISERFS_LINK_MAX) { 1129 if (inode->i_nlink >= REISERFS_LINK_MAX) {
1116 //FIXME: sd_nlink is 32 bit for new files 1130 //FIXME: sd_nlink is 32 bit for new files
@@ -1235,6 +1249,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1235 JOURNAL_PER_BALANCE_CNT * 3 + 5 + 1249 JOURNAL_PER_BALANCE_CNT * 3 + 5 +
1236 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); 1250 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb);
1237 1251
1252 dquot_initialize(old_dir);
1253 dquot_initialize(new_dir);
1254
1238 old_inode = old_dentry->d_inode; 1255 old_inode = old_dentry->d_inode;
1239 new_dentry_inode = new_dentry->d_inode; 1256 new_dentry_inode = new_dentry->d_inode;
1240 1257
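
The namei.c hunks all apply one pattern: every directory-modifying operation calls dquot_initialize() on the directory (and, for rename, on both directories) before the transaction starts, so the block and inode accounting inside it finds the dquots already attached. Schematically, with myfs_* names as placeholders rather than reiserfs code:

#include <linux/fs.h>
#include <linux/quotaops.h>

static int myfs_unlink(struct inode *dir, struct dentry *dentry)
{
	int err;

	dquot_initialize(dir);		/* replaces the old vfs_dq_init() */

	err = myfs_begin_transaction(dir->i_sb);	/* placeholder */
	if (err)
		return err;
	err = myfs_remove_entry(dir, dentry);		/* placeholder */
	myfs_end_transaction(dir->i_sb);		/* placeholder */
	return err;
}
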
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 5fa7118f04e1..313d39d639eb 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -1299,7 +1299,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
1299 "reiserquota delete_item(): freeing %u, id=%u type=%c", 1299 "reiserquota delete_item(): freeing %u, id=%u type=%c",
1300 quota_cut_bytes, inode->i_uid, head2type(&s_ih)); 1300 quota_cut_bytes, inode->i_uid, head2type(&s_ih));
1301#endif 1301#endif
1302 vfs_dq_free_space_nodirty(inode, quota_cut_bytes); 1302 dquot_free_space_nodirty(inode, quota_cut_bytes);
1303 1303
1304 /* Return deleted body length */ 1304 /* Return deleted body length */
1305 return ret_value; 1305 return ret_value;
@@ -1383,7 +1383,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
1383 quota_cut_bytes, inode->i_uid, 1383 quota_cut_bytes, inode->i_uid,
1384 key2type(key)); 1384 key2type(key));
1385#endif 1385#endif
1386 vfs_dq_free_space_nodirty(inode, 1386 dquot_free_space_nodirty(inode,
1387 quota_cut_bytes); 1387 quota_cut_bytes);
1388 } 1388 }
1389 break; 1389 break;
@@ -1733,7 +1733,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
1733 "reiserquota cut_from_item(): freeing %u id=%u type=%c", 1733 "reiserquota cut_from_item(): freeing %u id=%u type=%c",
1734 quota_cut_bytes, inode->i_uid, '?'); 1734 quota_cut_bytes, inode->i_uid, '?');
1735#endif 1735#endif
1736 vfs_dq_free_space_nodirty(inode, quota_cut_bytes); 1736 dquot_free_space_nodirty(inode, quota_cut_bytes);
1737 return ret_value; 1737 return ret_value;
1738} 1738}
1739 1739
@@ -1968,9 +1968,10 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
1968 key2type(&(key->on_disk_key))); 1968 key2type(&(key->on_disk_key)));
1969#endif 1969#endif
1970 1970
1971 if (vfs_dq_alloc_space_nodirty(inode, pasted_size)) { 1971 retval = dquot_alloc_space_nodirty(inode, pasted_size);
1972 if (retval) {
1972 pathrelse(search_path); 1973 pathrelse(search_path);
1973 return -EDQUOT; 1974 return retval;
1974 } 1975 }
1975 init_tb_struct(th, &s_paste_balance, th->t_super, search_path, 1976 init_tb_struct(th, &s_paste_balance, th->t_super, search_path,
1976 pasted_size); 1977 pasted_size);
@@ -2024,7 +2025,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
2024 pasted_size, inode->i_uid, 2025 pasted_size, inode->i_uid,
2025 key2type(&(key->on_disk_key))); 2026 key2type(&(key->on_disk_key)));
2026#endif 2027#endif
2027 vfs_dq_free_space_nodirty(inode, pasted_size); 2028 dquot_free_space_nodirty(inode, pasted_size);
2028 return retval; 2029 return retval;
2029} 2030}
2030 2031
@@ -2062,9 +2063,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
2062#endif 2063#endif
2063 /* We can't dirty inode here. It would be immediately written but 2064 /* We can't dirty inode here. It would be immediately written but
2064 * appropriate stat item isn't inserted yet... */ 2065 * appropriate stat item isn't inserted yet... */
2065 if (vfs_dq_alloc_space_nodirty(inode, quota_bytes)) { 2066 retval = dquot_alloc_space_nodirty(inode, quota_bytes);
2067 if (retval) {
2066 pathrelse(path); 2068 pathrelse(path);
2067 return -EDQUOT; 2069 return retval;
2068 } 2070 }
2069 } 2071 }
2070 init_tb_struct(th, &s_ins_balance, th->t_super, path, 2072 init_tb_struct(th, &s_ins_balance, th->t_super, path,
@@ -2113,6 +2115,6 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
2113 quota_bytes, inode->i_uid, head2type(ih)); 2115 quota_bytes, inode->i_uid, head2type(ih));
2114#endif 2116#endif
2115 if (inode) 2117 if (inode)
2116 vfs_dq_free_space_nodirty(inode, quota_bytes); 2118 dquot_free_space_nodirty(inode, quota_bytes);
2117 return retval; 2119 return retval;
2118} 2120}
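
Every stree.c hunk above follows the same conversion: the boolean-style vfs_dq_* helpers are replaced by dquot_* calls that return an errno, so callers stop hard-coding -EDQUOT and instead propagate whatever the quota layer reported. A minimal sketch of the before/after calling convention, condensed from reiserfs_paste_into_item():

	/* before: any failure collapsed into one hard-coded error */
	if (vfs_dq_alloc_space_nodirty(inode, pasted_size)) {
		pathrelse(search_path);
		return -EDQUOT;
	}

	/* after: the quota layer's errno (-EDQUOT, -ENOSPC, ...) is passed up */
	retval = dquot_alloc_space_nodirty(inode, pasted_size);
	if (retval) {
		pathrelse(search_path);
		return retval;
	}
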
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index b4a7dd03bdb9..04bf5d791bda 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -246,7 +246,7 @@ static int finish_unfinished(struct super_block *s)
246 retval = remove_save_link_only(s, &save_link_key, 0); 246 retval = remove_save_link_only(s, &save_link_key, 0);
247 continue; 247 continue;
248 } 248 }
249 vfs_dq_init(inode); 249 dquot_initialize(inode);
250 250
251 if (truncate && S_ISDIR(inode->i_mode)) { 251 if (truncate && S_ISDIR(inode->i_mode)) {
252 /* We got a truncate request for a dir which is impossible. 252 /* We got a truncate request for a dir which is impossible.
@@ -578,6 +578,11 @@ out:
578 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 578 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
579} 579}
580 580
581static void reiserfs_clear_inode(struct inode *inode)
582{
583 dquot_drop(inode);
584}
585
581#ifdef CONFIG_QUOTA 586#ifdef CONFIG_QUOTA
582static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, 587static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
583 size_t, loff_t); 588 size_t, loff_t);
@@ -590,6 +595,7 @@ static const struct super_operations reiserfs_sops = {
590 .destroy_inode = reiserfs_destroy_inode, 595 .destroy_inode = reiserfs_destroy_inode,
591 .write_inode = reiserfs_write_inode, 596 .write_inode = reiserfs_write_inode,
592 .dirty_inode = reiserfs_dirty_inode, 597 .dirty_inode = reiserfs_dirty_inode,
598 .clear_inode = reiserfs_clear_inode,
593 .delete_inode = reiserfs_delete_inode, 599 .delete_inode = reiserfs_delete_inode,
594 .put_super = reiserfs_put_super, 600 .put_super = reiserfs_put_super,
595 .write_super = reiserfs_write_super, 601 .write_super = reiserfs_write_super,
@@ -616,13 +622,6 @@ static int reiserfs_write_info(struct super_block *, int);
616static int reiserfs_quota_on(struct super_block *, int, int, char *, int); 622static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
617 623
618static const struct dquot_operations reiserfs_quota_operations = { 624static const struct dquot_operations reiserfs_quota_operations = {
619 .initialize = dquot_initialize,
620 .drop = dquot_drop,
621 .alloc_space = dquot_alloc_space,
622 .alloc_inode = dquot_alloc_inode,
623 .free_space = dquot_free_space,
624 .free_inode = dquot_free_inode,
625 .transfer = dquot_transfer,
626 .write_dquot = reiserfs_write_dquot, 625 .write_dquot = reiserfs_write_dquot,
627 .acquire_dquot = reiserfs_acquire_dquot, 626 .acquire_dquot = reiserfs_acquire_dquot,
628 .release_dquot = reiserfs_release_dquot, 627 .release_dquot = reiserfs_release_dquot,
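
The super.c side of the conversion has two parts: a new ->clear_inode hook so quota references are dropped on the VFS teardown path rather than via a .drop entry in dquot_operations, and the removal of the generic entries from that table now that filesystem code calls dquot_initialize(), dquot_alloc_space() and friends directly. A condensed sketch of the resulting shape, keeping only the journalled, reiserfs-specific hooks:

static void reiserfs_clear_inode(struct inode *inode)
{
	dquot_drop(inode);	/* detach quota pointers before the inode goes away */
}

static const struct dquot_operations reiserfs_quota_operations = {
	/* generic initialize, drop, alloc, free and transfer entries removed */
	.write_dquot	= reiserfs_write_dquot,
	.acquire_dquot	= reiserfs_acquire_dquot,
	.release_dquot	= reiserfs_release_dquot,
	/* ... remaining reiserfs hooks unchanged ... */
};
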
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 81f09fab8ae4..37d034ca7d99 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -61,7 +61,6 @@
61static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) 61static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
62{ 62{
63 BUG_ON(!mutex_is_locked(&dir->i_mutex)); 63 BUG_ON(!mutex_is_locked(&dir->i_mutex));
64 vfs_dq_init(dir);
65 return dir->i_op->create(dir, dentry, mode, NULL); 64 return dir->i_op->create(dir, dentry, mode, NULL);
66} 65}
67#endif 66#endif
@@ -69,7 +68,6 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
69static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode) 68static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode)
70{ 69{
71 BUG_ON(!mutex_is_locked(&dir->i_mutex)); 70 BUG_ON(!mutex_is_locked(&dir->i_mutex));
72 vfs_dq_init(dir);
73 return dir->i_op->mkdir(dir, dentry, mode); 71 return dir->i_op->mkdir(dir, dentry, mode);
74} 72}
75 73
@@ -81,7 +79,6 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry)
81{ 79{
82 int error; 80 int error;
83 BUG_ON(!mutex_is_locked(&dir->i_mutex)); 81 BUG_ON(!mutex_is_locked(&dir->i_mutex));
84 vfs_dq_init(dir);
85 82
86 reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, 83 reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
87 I_MUTEX_CHILD, dir->i_sb); 84 I_MUTEX_CHILD, dir->i_sb);
@@ -97,7 +94,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
97{ 94{
98 int error; 95 int error;
99 BUG_ON(!mutex_is_locked(&dir->i_mutex)); 96 BUG_ON(!mutex_is_locked(&dir->i_mutex));
100 vfs_dq_init(dir);
101 97
102 reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, 98 reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
103 I_MUTEX_CHILD, dir->i_sb); 99 I_MUTEX_CHILD, dir->i_sb);
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index d8b5bfcbdd30..de1fcffd906b 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -76,7 +76,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
76 return error; 76 return error;
77 } 77 }
78 78
79 if (sec->length) { 79 if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
80 blocks = reiserfs_xattr_jcreate_nblocks(inode) + 80 blocks = reiserfs_xattr_jcreate_nblocks(inode) +
81 reiserfs_xattr_nblocks(inode, sec->length); 81 reiserfs_xattr_nblocks(inode, sec->length);
82 /* We don't want to count the directories twice if we have 82 /* We don't want to count the directories twice if we have
diff --git a/fs/select.c b/fs/select.c
index fd38ce2e32e3..500a669f7790 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -691,6 +691,23 @@ SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
691} 691}
692#endif /* HAVE_SET_RESTORE_SIGMASK */ 692#endif /* HAVE_SET_RESTORE_SIGMASK */
693 693
694#ifdef __ARCH_WANT_SYS_OLD_SELECT
695struct sel_arg_struct {
696 unsigned long n;
697 fd_set __user *inp, *outp, *exp;
698 struct timeval __user *tvp;
699};
700
701SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
702{
703 struct sel_arg_struct a;
704
705 if (copy_from_user(&a, arg, sizeof(a)))
706 return -EFAULT;
707 return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
708}
709#endif
710
694struct poll_list { 711struct poll_list {
695 struct poll_list *next; 712 struct poll_list *next;
696 int len; 713 int len;
@@ -821,7 +838,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
821 struct poll_list *walk = head; 838 struct poll_list *walk = head;
822 unsigned long todo = nfds; 839 unsigned long todo = nfds;
823 840
824 if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur) 841 if (nfds > rlimit(RLIMIT_NOFILE))
825 return -EINVAL; 842 return -EINVAL;
826 843
827 len = min_t(unsigned int, nfds, N_STACK_PPS); 844 len = min_t(unsigned int, nfds, N_STACK_PPS);
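
fs/select.c picks up two independent changes: a one-argument sys_old_select() entry point, compiled only where an architecture defines __ARCH_WANT_SYS_OLD_SELECT, and the open-coded current->signal->rlim dereference in do_sys_poll() replaced by the rlimit() accessor. Assuming rlimit() is the small wrapper introduced elsewhere in this merge, it reads roughly:

/* sketch of the assumed accessor; the real definition lives in <linux/sched.h> */
static inline unsigned long rlimit(unsigned int limit)
{
	return task_rlimit(current, limit);	/* current's rlim[limit].rlim_cur */
}

so "nfds > rlimit(RLIMIT_NOFILE)" is the same bounds check as before, with the struct layout hidden behind one helper.
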
diff --git a/fs/seq_file.c b/fs/seq_file.c
index eae7d9dbf3ff..e1f437be6c3c 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -674,7 +674,6 @@ struct list_head *seq_list_start(struct list_head *head, loff_t pos)
674 674
675 return NULL; 675 return NULL;
676} 676}
677
678EXPORT_SYMBOL(seq_list_start); 677EXPORT_SYMBOL(seq_list_start);
679 678
680struct list_head *seq_list_start_head(struct list_head *head, loff_t pos) 679struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
@@ -684,7 +683,6 @@ struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
684 683
685 return seq_list_start(head, pos - 1); 684 return seq_list_start(head, pos - 1);
686} 685}
687
688EXPORT_SYMBOL(seq_list_start_head); 686EXPORT_SYMBOL(seq_list_start_head);
689 687
690struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos) 688struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
@@ -695,5 +693,131 @@ struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
695 ++*ppos; 693 ++*ppos;
696 return lh == head ? NULL : lh; 694 return lh == head ? NULL : lh;
697} 695}
698
699EXPORT_SYMBOL(seq_list_next); 696EXPORT_SYMBOL(seq_list_next);
697
698/**
699 * seq_hlist_start - start an iteration of a hlist
700 * @head: the head of the hlist
701 * @pos: the start position of the sequence
702 *
703 * Called at seq_file->op->start().
704 */
705struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos)
706{
707 struct hlist_node *node;
708
709 hlist_for_each(node, head)
710 if (pos-- == 0)
711 return node;
712 return NULL;
713}
714EXPORT_SYMBOL(seq_hlist_start);
715
716/**
717 * seq_hlist_start_head - start an iteration of a hlist
718 * @head: the head of the hlist
719 * @pos: the start position of the sequence
720 *
721 * Called at seq_file->op->start(). Call this function if you want to
722 * print a header at the top of the output.
723 */
724struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos)
725{
726 if (!pos)
727 return SEQ_START_TOKEN;
728
729 return seq_hlist_start(head, pos - 1);
730}
731EXPORT_SYMBOL(seq_hlist_start_head);
732
733/**
734 * seq_hlist_next - move to the next position of the hlist
735 * @v: the current iterator
736 * @head: the head of the hlist
737 * @ppos: the current position
738 *
739 * Called at seq_file->op->next().
740 */
741struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head,
742 loff_t *ppos)
743{
744 struct hlist_node *node = v;
745
746 ++*ppos;
747 if (v == SEQ_START_TOKEN)
748 return head->first;
749 else
750 return node->next;
751}
752EXPORT_SYMBOL(seq_hlist_next);
753
754/**
755 * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU
756 * @head: the head of the hlist
757 * @pos: the start position of the sequence
758 *
759 * Called at seq_file->op->start().
760 *
761 * This list-traversal primitive may safely run concurrently with
762 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
763 * as long as the traversal is guarded by rcu_read_lock().
764 */
765struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head,
766 loff_t pos)
767{
768 struct hlist_node *node;
769
770 __hlist_for_each_rcu(node, head)
771 if (pos-- == 0)
772 return node;
773 return NULL;
774}
775EXPORT_SYMBOL(seq_hlist_start_rcu);
776
777/**
778 * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU
779 * @head: the head of the hlist
780 * @pos: the start position of the sequence
781 *
782 * Called at seq_file->op->start(). Call this function if you want to
783 * print a header at the top of the output.
784 *
785 * This list-traversal primitive may safely run concurrently with
786 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
787 * as long as the traversal is guarded by rcu_read_lock().
788 */
789struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head,
790 loff_t pos)
791{
792 if (!pos)
793 return SEQ_START_TOKEN;
794
795 return seq_hlist_start_rcu(head, pos - 1);
796}
797EXPORT_SYMBOL(seq_hlist_start_head_rcu);
798
799/**
800 * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU
801 * @v: the current iterator
802 * @head: the head of the hlist
803 * @ppos: the current position
804 *
805 * Called at seq_file->op->next().
806 *
807 * This list-traversal primitive may safely run concurrently with
808 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
809 * as long as the traversal is guarded by rcu_read_lock().
810 */
811struct hlist_node *seq_hlist_next_rcu(void *v,
812 struct hlist_head *head,
813 loff_t *ppos)
814{
815 struct hlist_node *node = v;
816
817 ++*ppos;
818 if (v == SEQ_START_TOKEN)
819 return rcu_dereference(head->first);
820 else
821 return rcu_dereference(node->next);
822}
823EXPORT_SYMBOL(seq_hlist_next_rcu);
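
The seq_hlist_* helpers added here mirror the existing seq_list_* API for hlist-based tables, with _rcu variants for iterations done under rcu_read_lock(). A minimal, hypothetical seq_file user built on the plain variants (struct demo, demo_head and demo_lock are illustrative assumptions, not kernel symbols):

#include <linux/list.h>
#include <linux/seq_file.h>
#include <linux/spinlock.h>

struct demo {
	int value;
	struct hlist_node node;
};

static HLIST_HEAD(demo_head);
static DEFINE_SPINLOCK(demo_lock);

static void *demo_start(struct seq_file *m, loff_t *pos)
{
	spin_lock(&demo_lock);
	return seq_hlist_start(&demo_head, *pos);	/* node at *pos, or NULL */
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	return seq_hlist_next(v, &demo_head, pos);	/* also advances *pos */
}

static void demo_stop(struct seq_file *m, void *v)
{
	spin_unlock(&demo_lock);
}

static int demo_show(struct seq_file *m, void *v)
{
	struct demo *d = hlist_entry(v, struct demo, node);

	seq_printf(m, "%d\n", d->value);
	return 0;
}

static const struct seq_operations demo_seq_ops = {
	.start	= demo_start,
	.next	= demo_next,
	.stop	= demo_stop,
	.show	= demo_show,
};

Swapping in seq_hlist_start_rcu()/seq_hlist_next_rcu() and replacing the spinlock with rcu_read_lock()/rcu_read_unlock() gives the lockless-reader form.
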
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 70e3244fa30f..df8a19ef870d 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -4,4 +4,4 @@
4 4
5obj-$(CONFIG_SQUASHFS) += squashfs.o 5obj-$(CONFIG_SQUASHFS) += squashfs.o
6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o 6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o 7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 2a7960310349..1cb0d81b164b 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -29,15 +29,14 @@
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/vfs.h> 30#include <linux/vfs.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/mutex.h>
33#include <linux/string.h> 32#include <linux/string.h>
34#include <linux/buffer_head.h> 33#include <linux/buffer_head.h>
35#include <linux/zlib.h>
36 34
37#include "squashfs_fs.h" 35#include "squashfs_fs.h"
38#include "squashfs_fs_sb.h" 36#include "squashfs_fs_sb.h"
39#include "squashfs_fs_i.h" 37#include "squashfs_fs_i.h"
40#include "squashfs.h" 38#include "squashfs.h"
39#include "decompressor.h"
41 40
42/* 41/*
43 * Read the metadata block length, this is stored in the first two 42 * Read the metadata block length, this is stored in the first two
@@ -153,72 +152,10 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
153 } 152 }
154 153
155 if (compressed) { 154 if (compressed) {
156 int zlib_err = 0, zlib_init = 0; 155 length = squashfs_decompress(msblk, buffer, bh, b, offset,
157 156 length, srclength, pages);
158 /* 157 if (length < 0)
159 * Uncompress block. 158 goto read_failure;
160 */
161
162 mutex_lock(&msblk->read_data_mutex);
163
164 msblk->stream.avail_out = 0;
165 msblk->stream.avail_in = 0;
166
167 bytes = length;
168 do {
169 if (msblk->stream.avail_in == 0 && k < b) {
170 avail = min(bytes, msblk->devblksize - offset);
171 bytes -= avail;
172 wait_on_buffer(bh[k]);
173 if (!buffer_uptodate(bh[k]))
174 goto release_mutex;
175
176 if (avail == 0) {
177 offset = 0;
178 put_bh(bh[k++]);
179 continue;
180 }
181
182 msblk->stream.next_in = bh[k]->b_data + offset;
183 msblk->stream.avail_in = avail;
184 offset = 0;
185 }
186
187 if (msblk->stream.avail_out == 0 && page < pages) {
188 msblk->stream.next_out = buffer[page++];
189 msblk->stream.avail_out = PAGE_CACHE_SIZE;
190 }
191
192 if (!zlib_init) {
193 zlib_err = zlib_inflateInit(&msblk->stream);
194 if (zlib_err != Z_OK) {
195 ERROR("zlib_inflateInit returned"
196 " unexpected result 0x%x,"
197 " srclength %d\n", zlib_err,
198 srclength);
199 goto release_mutex;
200 }
201 zlib_init = 1;
202 }
203
204 zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH);
205
206 if (msblk->stream.avail_in == 0 && k < b)
207 put_bh(bh[k++]);
208 } while (zlib_err == Z_OK);
209
210 if (zlib_err != Z_STREAM_END) {
211 ERROR("zlib_inflate error, data probably corrupt\n");
212 goto release_mutex;
213 }
214
215 zlib_err = zlib_inflateEnd(&msblk->stream);
216 if (zlib_err != Z_OK) {
217 ERROR("zlib_inflate error, data probably corrupt\n");
218 goto release_mutex;
219 }
220 length = msblk->stream.total_out;
221 mutex_unlock(&msblk->read_data_mutex);
222 } else { 159 } else {
223 /* 160 /*
224 * Block is uncompressed. 161 * Block is uncompressed.
@@ -255,9 +192,6 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
255 kfree(bh); 192 kfree(bh);
256 return length; 193 return length;
257 194
258release_mutex:
259 mutex_unlock(&msblk->read_data_mutex);
260
261block_release: 195block_release:
262 for (; k < b; k++) 196 for (; k < b; k++)
263 put_bh(bh[k]); 197 put_bh(bh[k]);
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 40c98fa6b5d6..57314bee9059 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -51,7 +51,6 @@
51#include <linux/sched.h> 51#include <linux/sched.h>
52#include <linux/spinlock.h> 52#include <linux/spinlock.h>
53#include <linux/wait.h> 53#include <linux/wait.h>
54#include <linux/zlib.h>
55#include <linux/pagemap.h> 54#include <linux/pagemap.h>
56 55
57#include "squashfs_fs.h" 56#include "squashfs_fs.h"
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
new file mode 100644
index 000000000000..157478da6ac9
--- /dev/null
+++ b/fs/squashfs/decompressor.c
@@ -0,0 +1,68 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * decompressor.c
22 */
23
24#include <linux/types.h>
25#include <linux/mutex.h>
26#include <linux/buffer_head.h>
27
28#include "squashfs_fs.h"
29#include "squashfs_fs_sb.h"
30#include "squashfs_fs_i.h"
31#include "decompressor.h"
32#include "squashfs.h"
33
34/*
35 * This file (and decompressor.h) implements a decompressor framework for
36 * Squashfs, allowing multiple decompressors to be easily supported
37 */
38
39static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
40 NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
41};
42
43static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = {
44 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
45};
46
47static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
48 NULL, NULL, NULL, 0, "unknown", 0
49};
50
51static const struct squashfs_decompressor *decompressor[] = {
52 &squashfs_zlib_comp_ops,
53 &squashfs_lzma_unsupported_comp_ops,
54 &squashfs_lzo_unsupported_comp_ops,
55 &squashfs_unknown_comp_ops
56};
57
58
59const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
60{
61 int i;
62
63 for (i = 0; decompressor[i]->id; i++)
64 if (id == decompressor[i]->id)
65 break;
66
67 return decompressor[i];
68}
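
Note the lookup's termination contract: the decompressor[] table ends with squashfs_unknown_comp_ops, whose id is 0, so the scan always stops and squashfs_lookup_decompressor() never returns NULL. Unrecognised ids are instead rejected by the caller through ->supported, as the super.c hunk further down does:

	const struct squashfs_decompressor *ops;

	ops = squashfs_lookup_decompressor(id);
	if (!ops->supported) {	/* lzma, lzo and unknown ids all land here */
		ERROR("Filesystem uses \"%s\" compression. This is not "
			"supported\n", ops->name);
		return NULL;
	}
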
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
new file mode 100644
index 000000000000..7425f80783f6
--- /dev/null
+++ b/fs/squashfs/decompressor.h
@@ -0,0 +1,55 @@
1#ifndef DECOMPRESSOR_H
2#define DECOMPRESSOR_H
3/*
4 * Squashfs - a compressed read only filesystem for Linux
5 *
6 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
7 * Phillip Lougher <phillip@lougher.demon.co.uk>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version 2,
12 * or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 *
23 * decompressor.h
24 */
25
26struct squashfs_decompressor {
27 void *(*init)(struct squashfs_sb_info *);
28 void (*free)(void *);
29 int (*decompress)(struct squashfs_sb_info *, void **,
30 struct buffer_head **, int, int, int, int, int);
31 int id;
32 char *name;
33 int supported;
34};
35
36static inline void *squashfs_decompressor_init(struct squashfs_sb_info *msblk)
37{
38 return msblk->decompressor->init(msblk);
39}
40
41static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk,
42 void *s)
43{
44 if (msblk->decompressor)
45 msblk->decompressor->free(s);
46}
47
48static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
49 void **buffer, struct buffer_head **bh, int b, int offset, int length,
50 int srclength, int pages)
51{
52 return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
53 length, srclength, pages);
54}
55#endif
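
Adding an algorithm later means providing one more struct squashfs_decompressor and listing it in decompressor.c ahead of the unknown sentinel. A hypothetical skeleton, where all the dummy_* names are illustrative assumptions and not part of this patch:

#include <linux/slab.h>
#include <linux/buffer_head.h>

#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
#include "decompressor.h"

static void *dummy_init(struct squashfs_sb_info *msblk)
{
	/* per-mount scratch state; the returned pointer becomes msblk->stream */
	return kzalloc(64, GFP_KERNEL);
}

static void dummy_free(void *strm)
{
	kfree(strm);
}

static int dummy_decompress(struct squashfs_sb_info *msblk, void **buffer,
	struct buffer_head **bh, int b, int offset, int length,
	int srclength, int pages)
{
	int k;

	/* a real implementation consumes bh[0..b), fills buffer[0..pages)
	 * and returns the number of bytes produced; like zlib_wrapper.c,
	 * the error path must still drop the buffer_head references */
	for (k = 0; k < b; k++)
		put_bh(bh[k]);
	return -EIO;
}

const struct squashfs_decompressor squashfs_dummy_comp_ops = {
	.init		= dummy_init,
	.free		= dummy_free,
	.decompress	= dummy_decompress,
	.id		= 4,	/* hypothetical on-disk id */
	.name		= "dummy",
	.supported	= 1,
};
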
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 566b0eaed868..12b933ac6585 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -30,7 +30,6 @@
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/zlib.h>
34 33
35#include "squashfs_fs.h" 34#include "squashfs_fs.h"
36#include "squashfs_fs_sb.h" 35#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 2b1b8fe5e037..7f93d5a9ee05 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -39,7 +39,6 @@
39#include <linux/vfs.h> 39#include <linux/vfs.h>
40#include <linux/dcache.h> 40#include <linux/dcache.h>
41#include <linux/exportfs.h> 41#include <linux/exportfs.h>
42#include <linux/zlib.h>
43#include <linux/slab.h> 42#include <linux/slab.h>
44 43
45#include "squashfs_fs.h" 44#include "squashfs_fs.h"
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 717767d831df..a25c5060bdcb 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -47,7 +47,6 @@
47#include <linux/string.h> 47#include <linux/string.h>
48#include <linux/pagemap.h> 48#include <linux/pagemap.h>
49#include <linux/mutex.h> 49#include <linux/mutex.h>
50#include <linux/zlib.h>
51 50
52#include "squashfs_fs.h" 51#include "squashfs_fs.h"
53#include "squashfs_fs_sb.h" 52#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index b5a2c15bbbc7..7c90bbd6879d 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -36,7 +36,6 @@
36#include <linux/fs.h> 36#include <linux/fs.h>
37#include <linux/vfs.h> 37#include <linux/vfs.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/zlib.h>
40 39
41#include "squashfs_fs.h" 40#include "squashfs_fs.h"
42#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index 3795b837ba28..b7f64bcd2b70 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -34,7 +34,6 @@
34#include <linux/fs.h> 34#include <linux/fs.h>
35#include <linux/vfs.h> 35#include <linux/vfs.h>
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/zlib.h>
38 37
39#include "squashfs_fs.h" 38#include "squashfs_fs.h"
40#include "squashfs_fs_sb.h" 39#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 9101dbde39ec..49daaf669e41 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,7 +40,6 @@
40 40
41#include <linux/fs.h> 41#include <linux/fs.h>
42#include <linux/vfs.h> 42#include <linux/vfs.h>
43#include <linux/zlib.h>
44 43
45#include "squashfs_fs.h" 44#include "squashfs_fs.h"
46#include "squashfs_fs_sb.h" 45#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 9e398653b22b..5266bd8ad932 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,7 +57,6 @@
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/string.h> 58#include <linux/string.h>
59#include <linux/dcache.h> 59#include <linux/dcache.h>
60#include <linux/zlib.h>
61 60
62#include "squashfs_fs.h" 61#include "squashfs_fs.h"
63#include "squashfs_fs_sb.h" 62#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 0e9feb6adf7e..fe2587af5512 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -51,6 +51,9 @@ extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *,
51 u64, int); 51 u64, int);
52extern int squashfs_read_table(struct super_block *, void *, u64, int); 52extern int squashfs_read_table(struct super_block *, void *, u64, int);
53 53
54/* decompressor.c */
55extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
56
54/* export.c */ 57/* export.c */
55extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, 58extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
56 unsigned int); 59 unsigned int);
@@ -71,7 +74,7 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
71extern int squashfs_read_inode(struct inode *, long long); 74extern int squashfs_read_inode(struct inode *, long long);
72 75
73/* 76/*
74 * Inodes and files operations 77 * Inodes, files and decompressor operations
75 */ 78 */
76 79
77/* dir.c */ 80/* dir.c */
@@ -88,3 +91,6 @@ extern const struct inode_operations squashfs_dir_inode_ops;
88 91
89/* symlink.c */ 92/* symlink.c */
90extern const struct address_space_operations squashfs_symlink_aops; 93extern const struct address_space_operations squashfs_symlink_aops;
94
95/* zlib_wrapper.c */
96extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 283daafc568e..79024245ea00 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -183,8 +183,6 @@
183#define SQUASHFS_MAX_FILE_SIZE (1LL << \ 183#define SQUASHFS_MAX_FILE_SIZE (1LL << \
184 (SQUASHFS_MAX_FILE_SIZE_LOG - 2)) 184 (SQUASHFS_MAX_FILE_SIZE_LOG - 2))
185 185
186#define SQUASHFS_MARKER_BYTE 0xff
187
188/* meta index cache */ 186/* meta index cache */
189#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int)) 187#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
190#define SQUASHFS_META_ENTRIES 127 188#define SQUASHFS_META_ENTRIES 127
@@ -211,7 +209,9 @@ struct meta_index {
211/* 209/*
212 * definitions for structures on disk 210 * definitions for structures on disk
213 */ 211 */
214#define ZLIB_COMPRESSION 1 212#define ZLIB_COMPRESSION 1
213#define LZMA_COMPRESSION 2
214#define LZO_COMPRESSION 3
215 215
216struct squashfs_super_block { 216struct squashfs_super_block {
217 __le32 s_magic; 217 __le32 s_magic;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index c8c65614dd1c..2e77dc547e25 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -52,25 +52,25 @@ struct squashfs_cache_entry {
52}; 52};
53 53
54struct squashfs_sb_info { 54struct squashfs_sb_info {
55 int devblksize; 55 const struct squashfs_decompressor *decompressor;
56 int devblksize_log2; 56 int devblksize;
57 struct squashfs_cache *block_cache; 57 int devblksize_log2;
58 struct squashfs_cache *fragment_cache; 58 struct squashfs_cache *block_cache;
59 struct squashfs_cache *read_page; 59 struct squashfs_cache *fragment_cache;
60 int next_meta_index; 60 struct squashfs_cache *read_page;
61 __le64 *id_table; 61 int next_meta_index;
62 __le64 *fragment_index; 62 __le64 *id_table;
63 unsigned int *fragment_index_2; 63 __le64 *fragment_index;
64 struct mutex read_data_mutex; 64 struct mutex read_data_mutex;
65 struct mutex meta_index_mutex; 65 struct mutex meta_index_mutex;
66 struct meta_index *meta_index; 66 struct meta_index *meta_index;
67 z_stream stream; 67 void *stream;
68 __le64 *inode_lookup_table; 68 __le64 *inode_lookup_table;
69 u64 inode_table; 69 u64 inode_table;
70 u64 directory_table; 70 u64 directory_table;
71 unsigned int block_size; 71 unsigned int block_size;
72 unsigned short block_log; 72 unsigned short block_log;
73 long long bytes_used; 73 long long bytes_used;
74 unsigned int inodes; 74 unsigned int inodes;
75}; 75};
76#endif 76#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 6c197ef53add..3550aec2f655 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -35,34 +35,41 @@
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/module.h> 37#include <linux/module.h>
38#include <linux/zlib.h>
39#include <linux/magic.h> 38#include <linux/magic.h>
40 39
41#include "squashfs_fs.h" 40#include "squashfs_fs.h"
42#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
43#include "squashfs_fs_i.h" 42#include "squashfs_fs_i.h"
44#include "squashfs.h" 43#include "squashfs.h"
44#include "decompressor.h"
45 45
46static struct file_system_type squashfs_fs_type; 46static struct file_system_type squashfs_fs_type;
47static const struct super_operations squashfs_super_ops; 47static const struct super_operations squashfs_super_ops;
48 48
49static int supported_squashfs_filesystem(short major, short minor, short comp) 49static const struct squashfs_decompressor *supported_squashfs_filesystem(short
50 major, short minor, short id)
50{ 51{
52 const struct squashfs_decompressor *decompressor;
53
51 if (major < SQUASHFS_MAJOR) { 54 if (major < SQUASHFS_MAJOR) {
52 ERROR("Major/Minor mismatch, older Squashfs %d.%d " 55 ERROR("Major/Minor mismatch, older Squashfs %d.%d "
53 "filesystems are unsupported\n", major, minor); 56 "filesystems are unsupported\n", major, minor);
54 return -EINVAL; 57 return NULL;
55 } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) { 58 } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) {
56 ERROR("Major/Minor mismatch, trying to mount newer " 59 ERROR("Major/Minor mismatch, trying to mount newer "
57 "%d.%d filesystem\n", major, minor); 60 "%d.%d filesystem\n", major, minor);
58 ERROR("Please update your kernel\n"); 61 ERROR("Please update your kernel\n");
59 return -EINVAL; 62 return NULL;
60 } 63 }
61 64
62 if (comp != ZLIB_COMPRESSION) 65 decompressor = squashfs_lookup_decompressor(id);
63 return -EINVAL; 66 if (!decompressor->supported) {
67 ERROR("Filesystem uses \"%s\" compression. This is not "
68 "supported\n", decompressor->name);
69 return NULL;
70 }
64 71
65 return 0; 72 return decompressor;
66} 73}
67 74
68 75
@@ -87,13 +94,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
87 } 94 }
88 msblk = sb->s_fs_info; 95 msblk = sb->s_fs_info;
89 96
90 msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(),
91 GFP_KERNEL);
92 if (msblk->stream.workspace == NULL) {
93 ERROR("Failed to allocate zlib workspace\n");
94 goto failure;
95 }
96
97 sblk = kzalloc(sizeof(*sblk), GFP_KERNEL); 97 sblk = kzalloc(sizeof(*sblk), GFP_KERNEL);
98 if (sblk == NULL) { 98 if (sblk == NULL) {
99 ERROR("Failed to allocate squashfs_super_block\n"); 99 ERROR("Failed to allocate squashfs_super_block\n");
@@ -120,25 +120,25 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
120 goto failed_mount; 120 goto failed_mount;
121 } 121 }
122 122
123 err = -EINVAL;
124
123 /* Check it is a SQUASHFS superblock */ 125 /* Check it is a SQUASHFS superblock */
124 sb->s_magic = le32_to_cpu(sblk->s_magic); 126 sb->s_magic = le32_to_cpu(sblk->s_magic);
125 if (sb->s_magic != SQUASHFS_MAGIC) { 127 if (sb->s_magic != SQUASHFS_MAGIC) {
126 if (!silent) 128 if (!silent)
127 ERROR("Can't find a SQUASHFS superblock on %s\n", 129 ERROR("Can't find a SQUASHFS superblock on %s\n",
128 bdevname(sb->s_bdev, b)); 130 bdevname(sb->s_bdev, b));
129 err = -EINVAL;
130 goto failed_mount; 131 goto failed_mount;
131 } 132 }
132 133
133 /* Check the MAJOR & MINOR versions and compression type */ 134 /* Check the MAJOR & MINOR versions and lookup compression type */
134 err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major), 135 msblk->decompressor = supported_squashfs_filesystem(
136 le16_to_cpu(sblk->s_major),
135 le16_to_cpu(sblk->s_minor), 137 le16_to_cpu(sblk->s_minor),
136 le16_to_cpu(sblk->compression)); 138 le16_to_cpu(sblk->compression));
137 if (err < 0) 139 if (msblk->decompressor == NULL)
138 goto failed_mount; 140 goto failed_mount;
139 141
140 err = -EINVAL;
141
142 /* 142 /*
143 * Check if there's xattrs in the filesystem. These are not 143 * Check if there's xattrs in the filesystem. These are not
144 * supported in this version, so warn that they will be ignored. 144 * supported in this version, so warn that they will be ignored.
@@ -205,6 +205,10 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
205 205
206 err = -ENOMEM; 206 err = -ENOMEM;
207 207
208 msblk->stream = squashfs_decompressor_init(msblk);
209 if (msblk->stream == NULL)
210 goto failed_mount;
211
208 msblk->block_cache = squashfs_cache_init("metadata", 212 msblk->block_cache = squashfs_cache_init("metadata",
209 SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); 213 SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
210 if (msblk->block_cache == NULL) 214 if (msblk->block_cache == NULL)
@@ -292,17 +296,16 @@ failed_mount:
292 squashfs_cache_delete(msblk->block_cache); 296 squashfs_cache_delete(msblk->block_cache);
293 squashfs_cache_delete(msblk->fragment_cache); 297 squashfs_cache_delete(msblk->fragment_cache);
294 squashfs_cache_delete(msblk->read_page); 298 squashfs_cache_delete(msblk->read_page);
299 squashfs_decompressor_free(msblk, msblk->stream);
295 kfree(msblk->inode_lookup_table); 300 kfree(msblk->inode_lookup_table);
296 kfree(msblk->fragment_index); 301 kfree(msblk->fragment_index);
297 kfree(msblk->id_table); 302 kfree(msblk->id_table);
298 kfree(msblk->stream.workspace);
299 kfree(sb->s_fs_info); 303 kfree(sb->s_fs_info);
300 sb->s_fs_info = NULL; 304 sb->s_fs_info = NULL;
301 kfree(sblk); 305 kfree(sblk);
302 return err; 306 return err;
303 307
304failure: 308failure:
305 kfree(msblk->stream.workspace);
306 kfree(sb->s_fs_info); 309 kfree(sb->s_fs_info);
307 sb->s_fs_info = NULL; 310 sb->s_fs_info = NULL;
308 return -ENOMEM; 311 return -ENOMEM;
@@ -346,10 +349,10 @@ static void squashfs_put_super(struct super_block *sb)
346 squashfs_cache_delete(sbi->block_cache); 349 squashfs_cache_delete(sbi->block_cache);
347 squashfs_cache_delete(sbi->fragment_cache); 350 squashfs_cache_delete(sbi->fragment_cache);
348 squashfs_cache_delete(sbi->read_page); 351 squashfs_cache_delete(sbi->read_page);
352 squashfs_decompressor_free(sbi, sbi->stream);
349 kfree(sbi->id_table); 353 kfree(sbi->id_table);
350 kfree(sbi->fragment_index); 354 kfree(sbi->fragment_index);
351 kfree(sbi->meta_index); 355 kfree(sbi->meta_index);
352 kfree(sbi->stream.workspace);
353 kfree(sb->s_fs_info); 356 kfree(sb->s_fs_info);
354 sb->s_fs_info = NULL; 357 sb->s_fs_info = NULL;
355 } 358 }
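
With the stream now opaque, its lifetime is explicit in super.c: the decompressor is chosen while validating the superblock, the per-mount stream is allocated afterwards with squashfs_decompressor_init(), and both the failed_mount path and squashfs_put_super() release it through squashfs_decompressor_free(). That helper checks msblk->decompressor first, so it is safe even when the mount fails before a decompressor was selected. Condensed from the hunks above:

	msblk->decompressor = supported_squashfs_filesystem(major, minor, id);
	if (msblk->decompressor == NULL)
		goto failed_mount;

	msblk->stream = squashfs_decompressor_init(msblk);	/* e.g. zlib_init() */
	if (msblk->stream == NULL)
		goto failed_mount;

	/* teardown, shared by failed_mount and put_super: */
	squashfs_decompressor_free(msblk, msblk->stream);
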
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 83d87880aac8..e80be2022a7f 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -36,7 +36,6 @@
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/string.h> 37#include <linux/string.h>
38#include <linux/pagemap.h> 38#include <linux/pagemap.h>
39#include <linux/zlib.h>
40 39
41#include "squashfs_fs.h" 40#include "squashfs_fs.h"
42#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
new file mode 100644
index 000000000000..4dd70e04333b
--- /dev/null
+++ b/fs/squashfs/zlib_wrapper.c
@@ -0,0 +1,150 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * zlib_wrapper.c
22 */
23
24
25#include <linux/mutex.h>
26#include <linux/buffer_head.h>
27#include <linux/zlib.h>
28
29#include "squashfs_fs.h"
30#include "squashfs_fs_sb.h"
31#include "squashfs_fs_i.h"
32#include "squashfs.h"
33#include "decompressor.h"
34
35static void *zlib_init(struct squashfs_sb_info *dummy)
36{
37 z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
38 if (stream == NULL)
39 goto failed;
40 stream->workspace = kmalloc(zlib_inflate_workspacesize(),
41 GFP_KERNEL);
42 if (stream->workspace == NULL)
43 goto failed;
44
45 return stream;
46
47failed:
48 ERROR("Failed to allocate zlib workspace\n");
49 kfree(stream);
50 return NULL;
51}
52
53
54static void zlib_free(void *strm)
55{
56 z_stream *stream = strm;
57
58 if (stream)
59 kfree(stream->workspace);
60 kfree(stream);
61}
62
63
64static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
65 struct buffer_head **bh, int b, int offset, int length, int srclength,
66 int pages)
67{
68 int zlib_err = 0, zlib_init = 0;
69 int avail, bytes, k = 0, page = 0;
70 z_stream *stream = msblk->stream;
71
72 mutex_lock(&msblk->read_data_mutex);
73
74 stream->avail_out = 0;
75 stream->avail_in = 0;
76
77 bytes = length;
78 do {
79 if (stream->avail_in == 0 && k < b) {
80 avail = min(bytes, msblk->devblksize - offset);
81 bytes -= avail;
82 wait_on_buffer(bh[k]);
83 if (!buffer_uptodate(bh[k]))
84 goto release_mutex;
85
86 if (avail == 0) {
87 offset = 0;
88 put_bh(bh[k++]);
89 continue;
90 }
91
92 stream->next_in = bh[k]->b_data + offset;
93 stream->avail_in = avail;
94 offset = 0;
95 }
96
97 if (stream->avail_out == 0 && page < pages) {
98 stream->next_out = buffer[page++];
99 stream->avail_out = PAGE_CACHE_SIZE;
100 }
101
102 if (!zlib_init) {
103 zlib_err = zlib_inflateInit(stream);
104 if (zlib_err != Z_OK) {
105 ERROR("zlib_inflateInit returned unexpected "
106 "result 0x%x, srclength %d\n",
107 zlib_err, srclength);
108 goto release_mutex;
109 }
110 zlib_init = 1;
111 }
112
113 zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH);
114
115 if (stream->avail_in == 0 && k < b)
116 put_bh(bh[k++]);
117 } while (zlib_err == Z_OK);
118
119 if (zlib_err != Z_STREAM_END) {
120 ERROR("zlib_inflate error, data probably corrupt\n");
121 goto release_mutex;
122 }
123
124 zlib_err = zlib_inflateEnd(stream);
125 if (zlib_err != Z_OK) {
126 ERROR("zlib_inflate error, data probably corrupt\n");
127 goto release_mutex;
128 }
129
130 mutex_unlock(&msblk->read_data_mutex);
131 return stream->total_out;
132
133release_mutex:
134 mutex_unlock(&msblk->read_data_mutex);
135
136 for (; k < b; k++)
137 put_bh(bh[k]);
138
139 return -EIO;
140}
141
142const struct squashfs_decompressor squashfs_zlib_comp_ops = {
143 .init = zlib_init,
144 .free = zlib_free,
145 .decompress = zlib_uncompress,
146 .id = ZLIB_COMPRESSION,
147 .name = "zlib",
148 .supported = 1
149};
150
diff --git a/fs/super.c b/fs/super.c
index aff046b0fe78..f35ac6022109 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -568,7 +568,7 @@ out:
568int do_remount_sb(struct super_block *sb, int flags, void *data, int force) 568int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
569{ 569{
570 int retval; 570 int retval;
571 int remount_rw; 571 int remount_rw, remount_ro;
572 572
573 if (sb->s_frozen != SB_UNFROZEN) 573 if (sb->s_frozen != SB_UNFROZEN)
574 return -EBUSY; 574 return -EBUSY;
@@ -583,9 +583,12 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
583 shrink_dcache_sb(sb); 583 shrink_dcache_sb(sb);
584 sync_filesystem(sb); 584 sync_filesystem(sb);
585 585
586 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
587 remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
588
586 /* If we are remounting RDONLY and current sb is read/write, 589 /* If we are remounting RDONLY and current sb is read/write,
587 make sure there are no rw files opened */ 590 make sure there are no rw files opened */
588 if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) { 591 if (remount_ro) {
589 if (force) 592 if (force)
590 mark_files_ro(sb); 593 mark_files_ro(sb);
591 else if (!fs_may_remount_ro(sb)) 594 else if (!fs_may_remount_ro(sb))
@@ -594,7 +597,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
594 if (retval < 0 && retval != -ENOSYS) 597 if (retval < 0 && retval != -ENOSYS)
595 return -EBUSY; 598 return -EBUSY;
596 } 599 }
597 remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
598 600
599 if (sb->s_op->remount_fs) { 601 if (sb->s_op->remount_fs) {
600 retval = sb->s_op->remount_fs(sb, &flags, data); 602 retval = sb->s_op->remount_fs(sb, &flags, data);
@@ -604,6 +606,16 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
604 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); 606 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
605 if (remount_rw) 607 if (remount_rw)
606 vfs_dq_quota_on_remount(sb); 608 vfs_dq_quota_on_remount(sb);
609 /*
610 * Some filesystems modify their metadata via some other path than the
611 * bdev buffer cache (eg. use a private mapping, or directories in
612 * pagecache, etc). Also file data modifications go via their own
613 * mappings. So If we try to mount readonly then copy the filesystem
614 * from bdev, we could get stale data, so invalidate it to give a best
615 * effort at coherency.
616 */
617 if (remount_ro && sb->s_bdev)
618 invalidate_bdev(sb->s_bdev);
607 return 0; 619 return 0;
608} 620}
609 621
@@ -925,6 +937,9 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
925 if (!mnt) 937 if (!mnt)
926 goto out; 938 goto out;
927 939
940 if (flags & MS_KERNMOUNT)
941 mnt->mnt_flags = MNT_INTERNAL;
942
928 if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) { 943 if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
929 secdata = alloc_secdata(); 944 secdata = alloc_secdata();
930 if (!secdata) 945 if (!secdata)
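
do_remount_sb() now computes both transitions before acting on either, which makes the new invalidation rule easy to state: only a read-write to read-only transition on a block-backed superblock flushes the bdev page cache. The two predicates and the new rule, restated with comments:

	/* rw -> ro: MS_RDONLY requested and the sb is currently writable */
	remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
	/* ro -> rw: MS_RDONLY cleared and the sb is currently read-only */
	remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);

	/* ... the actual remount work happens here ... */

	if (remount_ro && sb->s_bdev)
		invalidate_bdev(sb->s_bdev);	/* drop possibly-stale cached copies */

The vfs_kern_mount() hunk is independent: mounts created with MS_KERNMOUNT are tagged MNT_INTERNAL so kernel-internal mounts can be told apart from user-visible ones.
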
diff --git a/fs/sync.c b/fs/sync.c
index 418727a2a239..f557d71cb097 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -34,14 +34,14 @@ static int __sync_filesystem(struct super_block *sb, int wait)
34 if (!sb->s_bdi) 34 if (!sb->s_bdi)
35 return 0; 35 return 0;
36 36
37 /* Avoid doing twice syncing and cache pruning for quota sync */ 37 if (sb->s_qcop && sb->s_qcop->quota_sync)
38 if (!wait) { 38 sb->s_qcop->quota_sync(sb, -1, wait);
39 writeout_quota_sb(sb, -1); 39
40 writeback_inodes_sb(sb); 40 if (wait)
41 } else {
42 sync_quota_sb(sb, -1);
43 sync_inodes_sb(sb); 41 sync_inodes_sb(sb);
44 } 42 else
43 writeback_inodes_sb(sb);
44
45 if (sb->s_op->sync_fs) 45 if (sb->s_op->sync_fs)
46 sb->s_op->sync_fs(sb, wait); 46 sb->s_op->sync_fs(sb, wait);
47 return __sync_blockdev(sb->s_bdev, wait); 47 return __sync_blockdev(sb->s_bdev, wait);
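
The rewritten __sync_filesystem() drops the writeout_quota_sb()/sync_quota_sb() special cases in favour of a single quota_sync call whose wait behaviour follows the caller's, then runs the usual inode, sync_fs and blockdev passes. The resulting flow, annotated:

static int __sync_filesystem(struct super_block *sb, int wait)
{
	if (!sb->s_bdi)		/* no backing device, nothing to write out */
		return 0;

	/* one quota entry point; 'wait' selects sync vs. async behaviour */
	if (sb->s_qcop && sb->s_qcop->quota_sync)
		sb->s_qcop->quota_sync(sb, -1, wait);

	if (wait)
		sync_inodes_sb(sb);		/* write inodes and wait on them */
	else
		writeback_inodes_sb(sb);	/* just kick off writeback */

	if (sb->s_op->sync_fs)
		sb->s_op->sync_fs(sb, wait);
	return __sync_blockdev(sb->s_bdev, wait);
}
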
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index a0a500af24a1..e9d293593e52 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -54,14 +54,14 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
54 int rc; 54 int rc;
55 55
56 /* need attr_sd for attr, its parent for kobj */ 56 /* need attr_sd for attr, its parent for kobj */
57 if (!sysfs_get_active_two(attr_sd)) 57 if (!sysfs_get_active(attr_sd))
58 return -ENODEV; 58 return -ENODEV;
59 59
60 rc = -EIO; 60 rc = -EIO;
61 if (attr->read) 61 if (attr->read)
62 rc = attr->read(kobj, attr, buffer, off, count); 62 rc = attr->read(kobj, attr, buffer, off, count);
63 63
64 sysfs_put_active_two(attr_sd); 64 sysfs_put_active(attr_sd);
65 65
66 return rc; 66 return rc;
67} 67}
@@ -125,14 +125,14 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
125 int rc; 125 int rc;
126 126
127 /* need attr_sd for attr, its parent for kobj */ 127 /* need attr_sd for attr, its parent for kobj */
128 if (!sysfs_get_active_two(attr_sd)) 128 if (!sysfs_get_active(attr_sd))
129 return -ENODEV; 129 return -ENODEV;
130 130
131 rc = -EIO; 131 rc = -EIO;
132 if (attr->write) 132 if (attr->write)
133 rc = attr->write(kobj, attr, buffer, offset, count); 133 rc = attr->write(kobj, attr, buffer, offset, count);
134 134
135 sysfs_put_active_two(attr_sd); 135 sysfs_put_active(attr_sd);
136 136
137 return rc; 137 return rc;
138} 138}
@@ -184,12 +184,12 @@ static void bin_vma_open(struct vm_area_struct *vma)
184 if (!bb->vm_ops || !bb->vm_ops->open) 184 if (!bb->vm_ops || !bb->vm_ops->open)
185 return; 185 return;
186 186
187 if (!sysfs_get_active_two(attr_sd)) 187 if (!sysfs_get_active(attr_sd))
188 return; 188 return;
189 189
190 bb->vm_ops->open(vma); 190 bb->vm_ops->open(vma);
191 191
192 sysfs_put_active_two(attr_sd); 192 sysfs_put_active(attr_sd);
193} 193}
194 194
195static void bin_vma_close(struct vm_area_struct *vma) 195static void bin_vma_close(struct vm_area_struct *vma)
@@ -201,12 +201,12 @@ static void bin_vma_close(struct vm_area_struct *vma)
201 if (!bb->vm_ops || !bb->vm_ops->close) 201 if (!bb->vm_ops || !bb->vm_ops->close)
202 return; 202 return;
203 203
204 if (!sysfs_get_active_two(attr_sd)) 204 if (!sysfs_get_active(attr_sd))
205 return; 205 return;
206 206
207 bb->vm_ops->close(vma); 207 bb->vm_ops->close(vma);
208 208
209 sysfs_put_active_two(attr_sd); 209 sysfs_put_active(attr_sd);
210} 210}
211 211
212static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 212static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -219,12 +219,12 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
219 if (!bb->vm_ops || !bb->vm_ops->fault) 219 if (!bb->vm_ops || !bb->vm_ops->fault)
220 return VM_FAULT_SIGBUS; 220 return VM_FAULT_SIGBUS;
221 221
222 if (!sysfs_get_active_two(attr_sd)) 222 if (!sysfs_get_active(attr_sd))
223 return VM_FAULT_SIGBUS; 223 return VM_FAULT_SIGBUS;
224 224
225 ret = bb->vm_ops->fault(vma, vmf); 225 ret = bb->vm_ops->fault(vma, vmf);
226 226
227 sysfs_put_active_two(attr_sd); 227 sysfs_put_active(attr_sd);
228 return ret; 228 return ret;
229} 229}
230 230
@@ -241,12 +241,12 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
241 if (!bb->vm_ops->page_mkwrite) 241 if (!bb->vm_ops->page_mkwrite)
242 return 0; 242 return 0;
243 243
244 if (!sysfs_get_active_two(attr_sd)) 244 if (!sysfs_get_active(attr_sd))
245 return VM_FAULT_SIGBUS; 245 return VM_FAULT_SIGBUS;
246 246
247 ret = bb->vm_ops->page_mkwrite(vma, vmf); 247 ret = bb->vm_ops->page_mkwrite(vma, vmf);
248 248
249 sysfs_put_active_two(attr_sd); 249 sysfs_put_active(attr_sd);
250 return ret; 250 return ret;
251} 251}
252 252
@@ -261,12 +261,12 @@ static int bin_access(struct vm_area_struct *vma, unsigned long addr,
261 if (!bb->vm_ops || !bb->vm_ops->access) 261 if (!bb->vm_ops || !bb->vm_ops->access)
262 return -EINVAL; 262 return -EINVAL;
263 263
264 if (!sysfs_get_active_two(attr_sd)) 264 if (!sysfs_get_active(attr_sd))
265 return -EINVAL; 265 return -EINVAL;
266 266
267 ret = bb->vm_ops->access(vma, addr, buf, len, write); 267 ret = bb->vm_ops->access(vma, addr, buf, len, write);
268 268
269 sysfs_put_active_two(attr_sd); 269 sysfs_put_active(attr_sd);
270 return ret; 270 return ret;
271} 271}
272 272
@@ -281,12 +281,12 @@ static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
281 if (!bb->vm_ops || !bb->vm_ops->set_policy) 281 if (!bb->vm_ops || !bb->vm_ops->set_policy)
282 return 0; 282 return 0;
283 283
284 if (!sysfs_get_active_two(attr_sd)) 284 if (!sysfs_get_active(attr_sd))
285 return -EINVAL; 285 return -EINVAL;
286 286
287 ret = bb->vm_ops->set_policy(vma, new); 287 ret = bb->vm_ops->set_policy(vma, new);
288 288
289 sysfs_put_active_two(attr_sd); 289 sysfs_put_active(attr_sd);
290 return ret; 290 return ret;
291} 291}
292 292
@@ -301,12 +301,12 @@ static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
301 if (!bb->vm_ops || !bb->vm_ops->get_policy) 301 if (!bb->vm_ops || !bb->vm_ops->get_policy)
302 return vma->vm_policy; 302 return vma->vm_policy;
303 303
304 if (!sysfs_get_active_two(attr_sd)) 304 if (!sysfs_get_active(attr_sd))
305 return vma->vm_policy; 305 return vma->vm_policy;
306 306
307 pol = bb->vm_ops->get_policy(vma, addr); 307 pol = bb->vm_ops->get_policy(vma, addr);
308 308
309 sysfs_put_active_two(attr_sd); 309 sysfs_put_active(attr_sd);
310 return pol; 310 return pol;
311} 311}
312 312
@@ -321,12 +321,12 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
321 if (!bb->vm_ops || !bb->vm_ops->migrate) 321 if (!bb->vm_ops || !bb->vm_ops->migrate)
322 return 0; 322 return 0;
323 323
324 if (!sysfs_get_active_two(attr_sd)) 324 if (!sysfs_get_active(attr_sd))
325 return 0; 325 return 0;
326 326
327 ret = bb->vm_ops->migrate(vma, from, to, flags); 327 ret = bb->vm_ops->migrate(vma, from, to, flags);
328 328
329 sysfs_put_active_two(attr_sd); 329 sysfs_put_active(attr_sd);
330 return ret; 330 return ret;
331} 331}
332#endif 332#endif
@@ -356,7 +356,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
356 356
357 /* need attr_sd for attr, its parent for kobj */ 357 /* need attr_sd for attr, its parent for kobj */
358 rc = -ENODEV; 358 rc = -ENODEV;
359 if (!sysfs_get_active_two(attr_sd)) 359 if (!sysfs_get_active(attr_sd))
360 goto out_unlock; 360 goto out_unlock;
361 361
362 rc = -EINVAL; 362 rc = -EINVAL;
@@ -384,7 +384,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
384 bb->vm_ops = vma->vm_ops; 384 bb->vm_ops = vma->vm_ops;
385 vma->vm_ops = &bin_vm_ops; 385 vma->vm_ops = &bin_vm_ops;
386out_put: 386out_put:
387 sysfs_put_active_two(attr_sd); 387 sysfs_put_active(attr_sd);
388out_unlock: 388out_unlock:
389 mutex_unlock(&bb->mutex); 389 mutex_unlock(&bb->mutex);
390 390
@@ -399,7 +399,7 @@ static int open(struct inode * inode, struct file * file)
399 int error; 399 int error;
400 400
401 /* binary file operations requires both @sd and its parent */ 401 /* binary file operations requires both @sd and its parent */
402 if (!sysfs_get_active_two(attr_sd)) 402 if (!sysfs_get_active(attr_sd))
403 return -ENODEV; 403 return -ENODEV;
404 404
405 error = -EACCES; 405 error = -EACCES;
@@ -426,11 +426,11 @@ static int open(struct inode * inode, struct file * file)
426 mutex_unlock(&sysfs_bin_lock); 426 mutex_unlock(&sysfs_bin_lock);
427 427
428 /* open succeeded, put active references */ 428 /* open succeeded, put active references */
429 sysfs_put_active_two(attr_sd); 429 sysfs_put_active(attr_sd);
430 return 0; 430 return 0;
431 431
432 err_out: 432 err_out:
433 sysfs_put_active_two(attr_sd); 433 sysfs_put_active(attr_sd);
434 kfree(bb); 434 kfree(bb);
435 return error; 435 return error;
436} 436}
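
Every sysfs_get_active_two()/sysfs_put_active_two() pair in bin.c collapses to the single-node form; the dir.c changes that follow are what make this safe, since sysfs_deactivate() becomes a no-op for node types without SYSFS_ACTIVE_REF and directories therefore no longer need a separately pinned parent. The resulting calling pattern, shared by all of these callbacks:

	/* pin only the attribute node; its s_parent pointer stays valid
	 * for the kobj lookup without a second active reference */
	if (!sysfs_get_active(attr_sd))
		return -ENODEV;		/* attribute is being removed */

	rc = -EIO;
	if (attr->read)
		rc = attr->read(kobj, attr, buffer, off, count);

	sysfs_put_active(attr_sd);
	return rc;
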
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 699f371b9f12..590717861c7a 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -93,7 +93,7 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
93 * RETURNS: 93 * RETURNS:
94 * Pointer to @sd on success, NULL on failure. 94 * Pointer to @sd on success, NULL on failure.
95 */ 95 */
96static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) 96struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
97{ 97{
98 if (unlikely(!sd)) 98 if (unlikely(!sd))
99 return NULL; 99 return NULL;
@@ -124,7 +124,7 @@ static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
124 * Put an active reference to @sd. This function is noop if @sd 124 * Put an active reference to @sd. This function is noop if @sd
125 * is NULL. 125 * is NULL.
126 */ 126 */
127static void sysfs_put_active(struct sysfs_dirent *sd) 127void sysfs_put_active(struct sysfs_dirent *sd)
128{ 128{
129 struct completion *cmpl; 129 struct completion *cmpl;
130 int v; 130 int v;
@@ -145,45 +145,6 @@ static void sysfs_put_active(struct sysfs_dirent *sd)
145} 145}
146 146
147/** 147/**
148 * sysfs_get_active_two - get active references to sysfs_dirent and parent
149 * @sd: sysfs_dirent of interest
150 *
151 * Get active reference to @sd and its parent. Parent's active
152 * reference is grabbed first. This function is noop if @sd is
153 * NULL.
154 *
155 * RETURNS:
156 * Pointer to @sd on success, NULL on failure.
157 */
158struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd)
159{
160 if (sd) {
161 if (sd->s_parent && unlikely(!sysfs_get_active(sd->s_parent)))
162 return NULL;
163 if (unlikely(!sysfs_get_active(sd))) {
164 sysfs_put_active(sd->s_parent);
165 return NULL;
166 }
167 }
168 return sd;
169}
170
171/**
172 * sysfs_put_active_two - put active references to sysfs_dirent and parent
173 * @sd: sysfs_dirent of interest
174 *
175 * Put active references to @sd and its parent. This function is
176 * noop if @sd is NULL.
177 */
178void sysfs_put_active_two(struct sysfs_dirent *sd)
179{
180 if (sd) {
181 sysfs_put_active(sd);
182 sysfs_put_active(sd->s_parent);
183 }
184}
185
186/**
187 * sysfs_deactivate - deactivate sysfs_dirent 148 * sysfs_deactivate - deactivate sysfs_dirent
188 * @sd: sysfs_dirent to deactivate 149 * @sd: sysfs_dirent to deactivate
189 * 150 *
@@ -195,6 +156,10 @@ static void sysfs_deactivate(struct sysfs_dirent *sd)
195 int v; 156 int v;
196 157
197 BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED)); 158 BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED));
159
160 if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
161 return;
162
198 sd->s_sibling = (void *)&wait; 163 sd->s_sibling = (void *)&wait;
199 164
200 rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); 165 rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
@@ -354,7 +319,6 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
354 319
355 atomic_set(&sd->s_count, 1); 320 atomic_set(&sd->s_count, 1);
356 atomic_set(&sd->s_active, 0); 321 atomic_set(&sd->s_active, 0);
357 sysfs_dirent_init_lockdep(sd);
358 322
359 sd->s_name = name; 323 sd->s_name = name;
360 sd->s_mode = mode; 324 sd->s_mode = mode;
@@ -681,7 +645,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
681 } 645 }
682 646
683 /* attach dentry and inode */ 647 /* attach dentry and inode */
684 inode = sysfs_get_inode(sd); 648 inode = sysfs_get_inode(dir->i_sb, sd);
685 if (!inode) { 649 if (!inode) {
686 ret = ERR_PTR(-ENOMEM); 650 ret = ERR_PTR(-ENOMEM);
687 goto out_unlock; 651 goto out_unlock;
@@ -837,11 +801,46 @@ static inline unsigned char dt_type(struct sysfs_dirent *sd)
837 return (sd->s_mode >> 12) & 15; 801 return (sd->s_mode >> 12) & 15;
838} 802}
839 803
804static int sysfs_dir_release(struct inode *inode, struct file *filp)
805{
806 sysfs_put(filp->private_data);
807 return 0;
808}
809
810static struct sysfs_dirent *sysfs_dir_pos(struct sysfs_dirent *parent_sd,
811 ino_t ino, struct sysfs_dirent *pos)
812{
813 if (pos) {
814 int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
815 pos->s_parent == parent_sd &&
816 ino == pos->s_ino;
817 sysfs_put(pos);
818 if (valid)
819 return pos;
820 }
821 pos = NULL;
822 if ((ino > 1) && (ino < INT_MAX)) {
823 pos = parent_sd->s_dir.children;
824 while (pos && (ino > pos->s_ino))
825 pos = pos->s_sibling;
826 }
827 return pos;
828}
829
830static struct sysfs_dirent *sysfs_dir_next_pos(struct sysfs_dirent *parent_sd,
831 ino_t ino, struct sysfs_dirent *pos)
832{
833 pos = sysfs_dir_pos(parent_sd, ino, pos);
834 if (pos)
835 pos = pos->s_sibling;
836 return pos;
837}
838
840static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) 839static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
841{ 840{
842 struct dentry *dentry = filp->f_path.dentry; 841 struct dentry *dentry = filp->f_path.dentry;
843 struct sysfs_dirent * parent_sd = dentry->d_fsdata; 842 struct sysfs_dirent * parent_sd = dentry->d_fsdata;
844 struct sysfs_dirent *pos; 843 struct sysfs_dirent *pos = filp->private_data;
845 ino_t ino; 844 ino_t ino;
846 845
847 if (filp->f_pos == 0) { 846 if (filp->f_pos == 0) {
@@ -857,29 +856,31 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
857 if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0) 856 if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0)
858 filp->f_pos++; 857 filp->f_pos++;
859 } 858 }
860 if ((filp->f_pos > 1) && (filp->f_pos < INT_MAX)) { 859 mutex_lock(&sysfs_mutex);
861 mutex_lock(&sysfs_mutex); 860 for (pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos);
862 861 pos;
863 /* Skip the dentries we have already reported */ 862 pos = sysfs_dir_next_pos(parent_sd, filp->f_pos, pos)) {
864 pos = parent_sd->s_dir.children; 863 const char * name;
865 while (pos && (filp->f_pos > pos->s_ino)) 864 unsigned int type;
866 pos = pos->s_sibling; 865 int len, ret;
867 866
868 for ( ; pos; pos = pos->s_sibling) { 867 name = pos->s_name;
869 const char * name; 868 len = strlen(name);
870 int len; 869 ino = pos->s_ino;
871 870 type = dt_type(pos);
872 name = pos->s_name; 871 filp->f_pos = ino;
873 len = strlen(name); 872 filp->private_data = sysfs_get(pos);
874 filp->f_pos = ino = pos->s_ino;
875 873
876 if (filldir(dirent, name, len, filp->f_pos, ino,
877 dt_type(pos)) < 0)
878 break;
879 }
880 if (!pos)
881 filp->f_pos = INT_MAX;
882 mutex_unlock(&sysfs_mutex); 874 mutex_unlock(&sysfs_mutex);
875 ret = filldir(dirent, name, len, filp->f_pos, ino, type);
876 mutex_lock(&sysfs_mutex);
877 if (ret < 0)
878 break;
879 }
880 mutex_unlock(&sysfs_mutex);
881 if ((filp->f_pos > 1) && !pos) { /* EOF */
882 filp->f_pos = INT_MAX;
883 filp->private_data = NULL;
883 } 884 }
884 return 0; 885 return 0;
885} 886}
@@ -888,5 +889,6 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
888const struct file_operations sysfs_dir_operations = { 889const struct file_operations sysfs_dir_operations = {
889 .read = generic_read_dir, 890 .read = generic_read_dir,
890 .readdir = sysfs_readdir, 891 .readdir = sysfs_readdir,
892 .release = sysfs_dir_release,
891 .llseek = generic_file_llseek, 893 .llseek = generic_file_llseek,
892}; 894};
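
filldir() copies entries to user space and can block, so the rewritten sysfs_readdir() above no longer holds sysfs_mutex across the callback. Doing that safely needs a cursor that survives the unlock, which is what the pinned-dirent protocol provides. A hedged, condensed sketch of the loop (not a verbatim excerpt):

/*
 * The entry being reported is pinned with sysfs_get() and stashed in
 * filp->private_data, so sysfs_mutex can be dropped around filldir();
 * on the next pass sysfs_dir_pos() reuses the pinned entry only if it
 * was neither removed nor reparented, and otherwise rescans the
 * sibling list by inode number.
 */
mutex_lock(&sysfs_mutex);
for (pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos);
     pos;
     pos = sysfs_dir_next_pos(parent_sd, filp->f_pos, pos)) {
        const char *name = pos->s_name;
        int len = strlen(name);
        ino_t ino = pos->s_ino;
        unsigned int type = dt_type(pos);
        int ret;

        filp->f_pos = ino;
        filp->private_data = sysfs_get(pos);    /* pin across the unlock */

        mutex_unlock(&sysfs_mutex);
        ret = filldir(dirent, name, len, filp->f_pos, ino, type);
        mutex_lock(&sysfs_mutex);
        if (ret < 0)
                break;
}
mutex_unlock(&sysfs_mutex);
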
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index dc30d9e31683..e222b2582746 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -53,7 +53,7 @@ struct sysfs_buffer {
53 size_t count; 53 size_t count;
54 loff_t pos; 54 loff_t pos;
55 char * page; 55 char * page;
56 struct sysfs_ops * ops; 56 const struct sysfs_ops * ops;
57 struct mutex mutex; 57 struct mutex mutex;
58 int needs_read_fill; 58 int needs_read_fill;
59 int event; 59 int event;
@@ -75,7 +75,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
75{ 75{
76 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 76 struct sysfs_dirent *attr_sd = dentry->d_fsdata;
77 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 77 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
78 struct sysfs_ops * ops = buffer->ops; 78 const struct sysfs_ops * ops = buffer->ops;
79 int ret = 0; 79 int ret = 0;
80 ssize_t count; 80 ssize_t count;
81 81
@@ -85,13 +85,13 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
85 return -ENOMEM; 85 return -ENOMEM;
86 86
87 /* need attr_sd for attr and ops, its parent for kobj */ 87 /* need attr_sd for attr and ops, its parent for kobj */
88 if (!sysfs_get_active_two(attr_sd)) 88 if (!sysfs_get_active(attr_sd))
89 return -ENODEV; 89 return -ENODEV;
90 90
91 buffer->event = atomic_read(&attr_sd->s_attr.open->event); 91 buffer->event = atomic_read(&attr_sd->s_attr.open->event);
92 count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page); 92 count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page);
93 93
94 sysfs_put_active_two(attr_sd); 94 sysfs_put_active(attr_sd);
95 95
96 /* 96 /*
97 * The code works fine with PAGE_SIZE return but it's likely to 97 * The code works fine with PAGE_SIZE return but it's likely to
@@ -199,16 +199,16 @@ flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t
199{ 199{
200 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 200 struct sysfs_dirent *attr_sd = dentry->d_fsdata;
201 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 201 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
202 struct sysfs_ops * ops = buffer->ops; 202 const struct sysfs_ops * ops = buffer->ops;
203 int rc; 203 int rc;
204 204
205 /* need attr_sd for attr and ops, its parent for kobj */ 205 /* need attr_sd for attr and ops, its parent for kobj */
206 if (!sysfs_get_active_two(attr_sd)) 206 if (!sysfs_get_active(attr_sd))
207 return -ENODEV; 207 return -ENODEV;
208 208
209 rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count); 209 rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count);
210 210
211 sysfs_put_active_two(attr_sd); 211 sysfs_put_active(attr_sd);
212 212
213 return rc; 213 return rc;
214} 214}
@@ -335,7 +335,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
335 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 335 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
336 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 336 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
337 struct sysfs_buffer *buffer; 337 struct sysfs_buffer *buffer;
338 struct sysfs_ops *ops; 338 const struct sysfs_ops *ops;
339 int error = -EACCES; 339 int error = -EACCES;
340 char *p; 340 char *p;
341 341
@@ -344,7 +344,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
344 memmove(last_sysfs_file, p, strlen(p) + 1); 344 memmove(last_sysfs_file, p, strlen(p) + 1);
345 345
346 /* need attr_sd for attr and ops, its parent for kobj */ 346 /* need attr_sd for attr and ops, its parent for kobj */
347 if (!sysfs_get_active_two(attr_sd)) 347 if (!sysfs_get_active(attr_sd))
348 return -ENODEV; 348 return -ENODEV;
349 349
350 /* every kobject with an attribute needs a ktype assigned */ 350 /* every kobject with an attribute needs a ktype assigned */
@@ -393,13 +393,13 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
393 goto err_free; 393 goto err_free;
394 394
395 /* open succeeded, put active references */ 395 /* open succeeded, put active references */
396 sysfs_put_active_two(attr_sd); 396 sysfs_put_active(attr_sd);
397 return 0; 397 return 0;
398 398
399 err_free: 399 err_free:
400 kfree(buffer); 400 kfree(buffer);
401 err_out: 401 err_out:
402 sysfs_put_active_two(attr_sd); 402 sysfs_put_active(attr_sd);
403 return error; 403 return error;
404} 404}
405 405
@@ -437,12 +437,12 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
437 struct sysfs_open_dirent *od = attr_sd->s_attr.open; 437 struct sysfs_open_dirent *od = attr_sd->s_attr.open;
438 438
439 /* need parent for the kobj, grab both */ 439 /* need parent for the kobj, grab both */
440 if (!sysfs_get_active_two(attr_sd)) 440 if (!sysfs_get_active(attr_sd))
441 goto trigger; 441 goto trigger;
442 442
443 poll_wait(filp, &od->poll, wait); 443 poll_wait(filp, &od->poll, wait);
444 444
445 sysfs_put_active_two(attr_sd); 445 sysfs_put_active(attr_sd);
446 446
447 if (buffer->event != atomic_read(&od->event)) 447 if (buffer->event != atomic_read(&od->event))
448 goto trigger; 448 goto trigger;
@@ -509,6 +509,7 @@ int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
509 if (!sd) 509 if (!sd)
510 return -ENOMEM; 510 return -ENOMEM;
511 sd->s_attr.attr = (void *)attr; 511 sd->s_attr.attr = (void *)attr;
512 sysfs_dirent_init_lockdep(sd);
512 513
513 sysfs_addrm_start(&acxt, dir_sd); 514 sysfs_addrm_start(&acxt, dir_sd);
514 rc = sysfs_add_one(&acxt, sd); 515 rc = sysfs_add_one(&acxt, sd);
@@ -542,6 +543,18 @@ int sysfs_create_file(struct kobject * kobj, const struct attribute * attr)
542 543
543} 544}
544 545
546int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr)
547{
548 int err = 0;
549 int i;
550
551 for (i = 0; ptr[i] && !err; i++)
552 err = sysfs_create_file(kobj, ptr[i]);
553 if (err)
554 while (--i >= 0)
555 sysfs_remove_file(kobj, ptr[i]);
556 return err;
557}
545 558
546/** 559/**
547 * sysfs_add_file_to_group - add an attribute file to a pre-existing group. 560 * sysfs_add_file_to_group - add an attribute file to a pre-existing group.
@@ -614,6 +627,12 @@ void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
614 sysfs_hash_and_remove(kobj->sd, attr->name); 627 sysfs_hash_and_remove(kobj->sd, attr->name);
615} 628}
616 629
630void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
631{
632 int i;
633 for (i = 0; ptr[i]; i++)
634 sysfs_remove_file(kobj, ptr[i]);
635}
617 636
618/** 637/**
619 * sysfs_remove_file_from_group - remove an attribute file from a group. 638 * sysfs_remove_file_from_group - remove an attribute file from a group.
@@ -732,3 +751,5 @@ EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
732 751
733EXPORT_SYMBOL_GPL(sysfs_create_file); 752EXPORT_SYMBOL_GPL(sysfs_create_file);
734EXPORT_SYMBOL_GPL(sysfs_remove_file); 753EXPORT_SYMBOL_GPL(sysfs_remove_file);
754EXPORT_SYMBOL_GPL(sysfs_remove_files);
755EXPORT_SYMBOL_GPL(sysfs_create_files);
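
The new sysfs_create_files() walks a NULL-terminated array and unwinds the files it already created when a later one fails, giving callers all-or-nothing semantics; sysfs_remove_files() is its teardown twin. A hedged usage sketch (attribute names are illustrative; show/store are assumed to come from the kobject's ktype as usual):

#include <linux/sysfs.h>

static struct attribute demo_foo_attr = {
        .name = "foo",
        .mode = 0444,
};

static struct attribute demo_bar_attr = {
        .name = "bar",
        .mode = 0644,
};

static const struct attribute *demo_attrs[] = {
        &demo_foo_attr,
        &demo_bar_attr,
        NULL,                   /* terminator that stops both loops */
};

static int demo_add_attrs(struct kobject *kobj)
{
        /* creates "foo" then "bar"; if "bar" fails, "foo" is removed
         * again and the error is returned */
        return sysfs_create_files(kobj, demo_attrs);
}

static void demo_del_attrs(struct kobject *kobj)
{
        sysfs_remove_files(kobj, demo_attrs);
}
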
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 6a06a1d1ea7b..082daaecac1b 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -111,20 +111,20 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
111 if (!sd) 111 if (!sd)
112 return -EINVAL; 112 return -EINVAL;
113 113
114 mutex_lock(&sysfs_mutex);
114 error = inode_change_ok(inode, iattr); 115 error = inode_change_ok(inode, iattr);
115 if (error) 116 if (error)
116 return error; 117 goto out;
117 118
118 iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ 119 iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */
119 120
120 error = inode_setattr(inode, iattr); 121 error = inode_setattr(inode, iattr);
121 if (error) 122 if (error)
122 return error; 123 goto out;
123 124
124 mutex_lock(&sysfs_mutex);
125 error = sysfs_sd_setattr(sd, iattr); 125 error = sysfs_sd_setattr(sd, iattr);
126out:
126 mutex_unlock(&sysfs_mutex); 127 mutex_unlock(&sysfs_mutex);
127
128 return error; 128 return error;
129} 129}
130 130
@@ -283,6 +283,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
283 283
284/** 284/**
285 * sysfs_get_inode - get inode for sysfs_dirent 285 * sysfs_get_inode - get inode for sysfs_dirent
286 * @sb: super block
286 * @sd: sysfs_dirent to allocate inode for 287 * @sd: sysfs_dirent to allocate inode for
287 * 288 *
288 * Get inode for @sd. If such inode doesn't exist, a new inode 289 * Get inode for @sd. If such inode doesn't exist, a new inode
@@ -295,11 +296,11 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
295 * RETURNS: 296 * RETURNS:
296 * Pointer to allocated inode on success, NULL on failure. 297 * Pointer to allocated inode on success, NULL on failure.
297 */ 298 */
298struct inode * sysfs_get_inode(struct sysfs_dirent *sd) 299struct inode * sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
299{ 300{
300 struct inode *inode; 301 struct inode *inode;
301 302
302 inode = iget_locked(sysfs_sb, sd->s_ino); 303 inode = iget_locked(sb, sd->s_ino);
303 if (inode && (inode->i_state & I_NEW)) 304 if (inode && (inode->i_state & I_NEW))
304 sysfs_init_inode(sd, inode); 305 sysfs_init_inode(sd, inode);
305 306
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 49749955ccaf..0cb10884a2fc 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -23,7 +23,6 @@
23 23
24 24
25static struct vfsmount *sysfs_mount; 25static struct vfsmount *sysfs_mount;
26struct super_block * sysfs_sb = NULL;
27struct kmem_cache *sysfs_dir_cachep; 26struct kmem_cache *sysfs_dir_cachep;
28 27
29static const struct super_operations sysfs_ops = { 28static const struct super_operations sysfs_ops = {
@@ -50,11 +49,10 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
50 sb->s_magic = SYSFS_MAGIC; 49 sb->s_magic = SYSFS_MAGIC;
51 sb->s_op = &sysfs_ops; 50 sb->s_op = &sysfs_ops;
52 sb->s_time_gran = 1; 51 sb->s_time_gran = 1;
53 sysfs_sb = sb;
54 52
55 /* get root inode, initialize and unlock it */ 53 /* get root inode, initialize and unlock it */
56 mutex_lock(&sysfs_mutex); 54 mutex_lock(&sysfs_mutex);
57 inode = sysfs_get_inode(&sysfs_root); 55 inode = sysfs_get_inode(sb, &sysfs_root);
58 mutex_unlock(&sysfs_mutex); 56 mutex_unlock(&sysfs_mutex);
59 if (!inode) { 57 if (!inode) {
60 pr_debug("sysfs: could not get root inode\n"); 58 pr_debug("sysfs: could not get root inode\n");
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index c5eff49fa41b..1b9a3a1e8a17 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -123,6 +123,44 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
123 sysfs_hash_and_remove(parent_sd, name); 123 sysfs_hash_and_remove(parent_sd, name);
124} 124}
125 125
126/**
127 * sysfs_rename_link - rename symlink in object's directory.
128 * @kobj: object we're acting for.
129 * @targ: object we're pointing to.
130 * @old: previous name of the symlink.
131 * @new: new name of the symlink.
132 *
133 * A helper function for the common rename symlink idiom.
134 */
135int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
136 const char *old, const char *new)
137{
138 struct sysfs_dirent *parent_sd, *sd = NULL;
139 int result;
140
141 if (!kobj)
142 parent_sd = &sysfs_root;
143 else
144 parent_sd = kobj->sd;
145
146 result = -ENOENT;
147 sd = sysfs_get_dirent(parent_sd, old);
148 if (!sd)
149 goto out;
150
151 result = -EINVAL;
152 if (sysfs_type(sd) != SYSFS_KOBJ_LINK)
153 goto out;
154 if (sd->s_symlink.target_sd->s_dir.kobj != targ)
155 goto out;
156
157 result = sysfs_rename(sd, parent_sd, new);
158
159out:
160 sysfs_put(sd);
161 return result;
162}
163
126static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, 164static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
127 struct sysfs_dirent *target_sd, char *path) 165 struct sysfs_dirent *target_sd, char *path)
128{ 166{
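
sysfs_rename_link() packages the remove-then-recreate idiom callers previously open-coded, and it refuses to act unless the existing entry really is a symlink pointing at the expected target. A hedged usage sketch (the kobjects and names are illustrative):

/* rename the "old_name" symlink under @parent, which must currently
 * point at @target, to "new_name" */
static int demo_rename_link(struct kobject *parent, struct kobject *target)
{
        return sysfs_rename_link(parent, target, "old_name", "new_name");
}
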
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index cdd9377a6e06..30f5a44fb5d3 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -66,8 +66,8 @@ struct sysfs_dirent {
66 }; 66 };
67 67
68 unsigned int s_flags; 68 unsigned int s_flags;
69 unsigned short s_mode;
69 ino_t s_ino; 70 ino_t s_ino;
70 umode_t s_mode;
71 struct sysfs_inode_attrs *s_iattr; 71 struct sysfs_inode_attrs *s_iattr;
72}; 72};
73 73
@@ -79,6 +79,7 @@ struct sysfs_dirent {
79#define SYSFS_KOBJ_BIN_ATTR 0x0004 79#define SYSFS_KOBJ_BIN_ATTR 0x0004
80#define SYSFS_KOBJ_LINK 0x0008 80#define SYSFS_KOBJ_LINK 0x0008
81#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) 81#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
82#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
82 83
83#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK 84#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK
84#define SYSFS_FLAG_REMOVED 0x0200 85#define SYSFS_FLAG_REMOVED 0x0200
@@ -91,9 +92,12 @@ static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
91#ifdef CONFIG_DEBUG_LOCK_ALLOC 92#ifdef CONFIG_DEBUG_LOCK_ALLOC
92#define sysfs_dirent_init_lockdep(sd) \ 93#define sysfs_dirent_init_lockdep(sd) \
93do { \ 94do { \
94 static struct lock_class_key __key; \ 95 struct attribute *attr = sd->s_attr.attr; \
96 struct lock_class_key *key = attr->key; \
97 if (!key) \
98 key = &attr->skey; \
95 \ 99 \
96 lockdep_init_map(&sd->dep_map, "s_active", &__key, 0); \ 100 lockdep_init_map(&sd->dep_map, "s_active", key, 0); \
97} while(0) 101} while(0)
98#else 102#else
99#define sysfs_dirent_init_lockdep(sd) do {} while(0) 103#define sysfs_dirent_init_lockdep(sd) do {} while(0)
@@ -111,7 +115,6 @@ struct sysfs_addrm_cxt {
111 * mount.c 115 * mount.c
112 */ 116 */
113extern struct sysfs_dirent sysfs_root; 117extern struct sysfs_dirent sysfs_root;
114extern struct super_block *sysfs_sb;
115extern struct kmem_cache *sysfs_dir_cachep; 118extern struct kmem_cache *sysfs_dir_cachep;
116 119
117/* 120/*
@@ -124,8 +127,8 @@ extern const struct file_operations sysfs_dir_operations;
124extern const struct inode_operations sysfs_dir_inode_operations; 127extern const struct inode_operations sysfs_dir_inode_operations;
125 128
126struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd); 129struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
127struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd); 130struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
128void sysfs_put_active_two(struct sysfs_dirent *sd); 131void sysfs_put_active(struct sysfs_dirent *sd);
129void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, 132void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
130 struct sysfs_dirent *parent_sd); 133 struct sysfs_dirent *parent_sd);
131int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); 134int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
@@ -168,7 +171,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
168/* 171/*
169 * inode.c 172 * inode.c
170 */ 173 */
171struct inode *sysfs_get_inode(struct sysfs_dirent *sd); 174struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
172void sysfs_delete_inode(struct inode *inode); 175void sysfs_delete_inode(struct inode *inode);
173int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); 176int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
174int sysfs_permission(struct inode *inode, int mask); 177int sysfs_permission(struct inode *inode, int mask);
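
With this change the lockdep class for s_active no longer comes from a static key minted inside sysfs; it comes from the attribute itself, using attr->key when the declaration site supplied one and falling back to the embedded attr->skey for dynamically allocated attributes. A hedged sketch of the declaration-site half, assuming a sysfs_attr_init()-style helper in <linux/sysfs.h> is what populates attr->key:

static struct attribute demo_attr = {
        .name = "demo",
        .mode = 0444,
};

static void demo_setup(void)
{
        /* assumed helper: binds a static lock_class_key to
         * demo_attr.key, giving each attribute declaration its own
         * lockdep class */
        sysfs_attr_init(&demo_attr);
}
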
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 9824743832a7..4573734d723d 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -26,6 +26,7 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/vfs.h> 28#include <linux/vfs.h>
29#include <linux/writeback.h>
29#include <linux/namei.h> 30#include <linux/namei.h>
30#include <asm/byteorder.h> 31#include <asm/byteorder.h>
31#include "sysv.h" 32#include "sysv.h"
@@ -246,7 +247,7 @@ bad_inode:
246 return ERR_PTR(-EIO); 247 return ERR_PTR(-EIO);
247} 248}
248 249
249int sysv_write_inode(struct inode *inode, int wait) 250static int __sysv_write_inode(struct inode *inode, int wait)
250{ 251{
251 struct super_block * sb = inode->i_sb; 252 struct super_block * sb = inode->i_sb;
252 struct sysv_sb_info * sbi = SYSV_SB(sb); 253 struct sysv_sb_info * sbi = SYSV_SB(sb);
@@ -296,9 +297,14 @@ int sysv_write_inode(struct inode *inode, int wait)
296 return 0; 297 return 0;
297} 298}
298 299
300int sysv_write_inode(struct inode *inode, struct writeback_control *wbc)
301{
302 return __sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
303}
304
299int sysv_sync_inode(struct inode *inode) 305int sysv_sync_inode(struct inode *inode)
300{ 306{
301 return sysv_write_inode(inode, 1); 307 return __sysv_write_inode(inode, 1);
302} 308}
303 309
304static void sysv_delete_inode(struct inode *inode) 310static void sysv_delete_inode(struct inode *inode)
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 53786eb5cf60..94cb9b4d76c2 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -142,7 +142,7 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping,
142 142
143/* inode.c */ 143/* inode.c */
144extern struct inode *sysv_iget(struct super_block *, unsigned int); 144extern struct inode *sysv_iget(struct super_block *, unsigned int);
145extern int sysv_write_inode(struct inode *, int); 145extern int sysv_write_inode(struct inode *, struct writeback_control *wbc);
146extern int sysv_sync_inode(struct inode *); 146extern int sysv_sync_inode(struct inode *);
147extern void sysv_set_inode(struct inode *, dev_t); 147extern void sysv_set_inode(struct inode *, dev_t);
148extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *); 148extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *);
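
The sysv hunks above show the conversion pattern that recurs through the rest of this diff (ubifs, udf and ufs below): ->write_inode() now takes a struct writeback_control instead of an int wait flag, and the old worker stays static with the flag derived from the wbc. A hedged generic sketch (the demo_* names are illustrative):

#include <linux/fs.h>
#include <linux/writeback.h>

/* the pre-existing worker; @wait requests synchronous writeout */
static int __demo_write_inode(struct inode *inode, int wait)
{
        /* flush the on-disk inode here, blocking if @wait is set */
        return 0;
}

/* new-style ->write_inode() entry point */
static int demo_write_inode(struct inode *inode,
                            struct writeback_control *wbc)
{
        return __demo_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
}
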
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 552fb0111fff..401e503d44a1 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1120,7 +1120,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
1120 if (release) 1120 if (release)
1121 ubifs_release_budget(c, &ino_req); 1121 ubifs_release_budget(c, &ino_req);
1122 if (IS_SYNC(old_inode)) 1122 if (IS_SYNC(old_inode))
1123 err = old_inode->i_sb->s_op->write_inode(old_inode, 1); 1123 err = old_inode->i_sb->s_op->write_inode(old_inode, NULL);
1124 return err; 1124 return err;
1125 1125
1126out_cancel: 1126out_cancel:
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 16a6444330ec..e26c02ab6cd5 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1011,7 +1011,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
1011 /* Is the page fully inside @i_size? */ 1011 /* Is the page fully inside @i_size? */
1012 if (page->index < end_index) { 1012 if (page->index < end_index) {
1013 if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) { 1013 if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) {
1014 err = inode->i_sb->s_op->write_inode(inode, 1); 1014 err = inode->i_sb->s_op->write_inode(inode, NULL);
1015 if (err) 1015 if (err)
1016 goto out_unlock; 1016 goto out_unlock;
1017 /* 1017 /*
@@ -1039,7 +1039,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
1039 kunmap_atomic(kaddr, KM_USER0); 1039 kunmap_atomic(kaddr, KM_USER0);
1040 1040
1041 if (i_size > synced_i_size) { 1041 if (i_size > synced_i_size) {
1042 err = inode->i_sb->s_op->write_inode(inode, 1); 1042 err = inode->i_sb->s_op->write_inode(inode, NULL);
1043 if (err) 1043 if (err)
1044 goto out_unlock; 1044 goto out_unlock;
1045 } 1045 }
@@ -1242,7 +1242,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
1242 if (release) 1242 if (release)
1243 ubifs_release_budget(c, &req); 1243 ubifs_release_budget(c, &req);
1244 if (IS_SYNC(inode)) 1244 if (IS_SYNC(inode))
1245 err = inode->i_sb->s_op->write_inode(inode, 1); 1245 err = inode->i_sb->s_op->write_inode(inode, NULL);
1246 return err; 1246 return err;
1247 1247
1248out: 1248out:
@@ -1316,7 +1316,7 @@ int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1316 * the inode unless this is a 'datasync()' call. 1316 * the inode unless this is a 'datasync()' call.
1317 */ 1317 */
1318 if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { 1318 if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
1319 err = inode->i_sb->s_op->write_inode(inode, 1); 1319 err = inode->i_sb->s_op->write_inode(inode, NULL);
1320 if (err) 1320 if (err)
1321 return err; 1321 return err;
1322 } 1322 }
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 43f9d19a6f33..4d2f2157dd3f 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -283,7 +283,7 @@ static void ubifs_destroy_inode(struct inode *inode)
283/* 283/*
284 * Note, Linux write-back code calls this without 'i_mutex'. 284 * Note, Linux write-back code calls this without 'i_mutex'.
285 */ 285 */
286static int ubifs_write_inode(struct inode *inode, int wait) 286static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
287{ 287{
288 int err = 0; 288 int err = 0;
289 struct ubifs_info *c = inode->i_sb->s_fs_info; 289 struct ubifs_info *c = inode->i_sb->s_fs_info;
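
UBIFS's internal writepage, fsync, setattr and rename paths call ->write_inode() directly and, as the hunks above show, now pass a NULL wbc. That is safe only on the assumption that ubifs_write_inode() never dereferences its wbc argument; a conversion that did consult the sync mode would have to guard the pointer, roughly like this (hedged sketch; demo_flush_inode() is a hypothetical worker):

static int demo_write_inode(struct inode *inode,
                            struct writeback_control *wbc)
{
        /* treat a NULL wbc from internal synchronous callers as
         * WB_SYNC_ALL */
        int wait = !wbc || wbc->sync_mode == WB_SYNC_ALL;

        return demo_flush_inode(inode, wait);
}
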
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 82372e332f08..19626e2491c4 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -31,55 +31,8 @@
31#define udf_clear_bit(nr, addr) ext2_clear_bit(nr, addr) 31#define udf_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
32#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr) 32#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr)
33#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr) 33#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr)
34#define udf_find_first_one_bit(addr, size) find_first_one_bit(addr, size)
35#define udf_find_next_one_bit(addr, size, offset) \ 34#define udf_find_next_one_bit(addr, size, offset) \
36 find_next_one_bit(addr, size, offset) 35 ext2_find_next_bit(addr, size, offset)
37
38#define leBPL_to_cpup(x) leNUM_to_cpup(BITS_PER_LONG, x)
39#define leNUM_to_cpup(x, y) xleNUM_to_cpup(x, y)
40#define xleNUM_to_cpup(x, y) (le ## x ## _to_cpup(y))
41#define uintBPL_t uint(BITS_PER_LONG)
42#define uint(x) xuint(x)
43#define xuint(x) __le ## x
44
45static inline int find_next_one_bit(void *addr, int size, int offset)
46{
47 uintBPL_t *p = ((uintBPL_t *) addr) + (offset / BITS_PER_LONG);
48 int result = offset & ~(BITS_PER_LONG - 1);
49 unsigned long tmp;
50
51 if (offset >= size)
52 return size;
53 size -= result;
54 offset &= (BITS_PER_LONG - 1);
55 if (offset) {
56 tmp = leBPL_to_cpup(p++);
57 tmp &= ~0UL << offset;
58 if (size < BITS_PER_LONG)
59 goto found_first;
60 if (tmp)
61 goto found_middle;
62 size -= BITS_PER_LONG;
63 result += BITS_PER_LONG;
64 }
65 while (size & ~(BITS_PER_LONG - 1)) {
66 tmp = leBPL_to_cpup(p++);
67 if (tmp)
68 goto found_middle;
69 result += BITS_PER_LONG;
70 size -= BITS_PER_LONG;
71 }
72 if (!size)
73 return result;
74 tmp = leBPL_to_cpup(p);
75found_first:
76 tmp &= ~0UL >> (BITS_PER_LONG - size);
77found_middle:
78 return result + ffz(~tmp);
79}
80
81#define find_first_one_bit(addr, size)\
82 find_next_one_bit((addr), (size), 0)
83 36
84static int read_block_bitmap(struct super_block *sb, 37static int read_block_bitmap(struct super_block *sb,
85 struct udf_bitmap *bitmap, unsigned int block, 38 struct udf_bitmap *bitmap, unsigned int block,
@@ -208,7 +161,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
208 ((char *)bh->b_data)[(bit + i) >> 3]); 161 ((char *)bh->b_data)[(bit + i) >> 3]);
209 } else { 162 } else {
210 if (inode) 163 if (inode)
211 vfs_dq_free_block(inode, 1); 164 dquot_free_block(inode, 1);
212 udf_add_free_space(sb, sbi->s_partition, 1); 165 udf_add_free_space(sb, sbi->s_partition, 1);
213 } 166 }
214 } 167 }
@@ -260,11 +213,11 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
260 while (bit < (sb->s_blocksize << 3) && block_count > 0) { 213 while (bit < (sb->s_blocksize << 3) && block_count > 0) {
261 if (!udf_test_bit(bit, bh->b_data)) 214 if (!udf_test_bit(bit, bh->b_data))
262 goto out; 215 goto out;
263 else if (vfs_dq_prealloc_block(inode, 1)) 216 else if (dquot_prealloc_block(inode, 1))
264 goto out; 217 goto out;
265 else if (!udf_clear_bit(bit, bh->b_data)) { 218 else if (!udf_clear_bit(bit, bh->b_data)) {
266 udf_debug("bit already cleared for block %d\n", bit); 219 udf_debug("bit already cleared for block %d\n", bit);
267 vfs_dq_free_block(inode, 1); 220 dquot_free_block(inode, 1);
268 goto out; 221 goto out;
269 } 222 }
270 block_count--; 223 block_count--;
@@ -390,10 +343,14 @@ got_block:
390 /* 343 /*
391 * Check quota for allocation of this block. 344 * Check quota for allocation of this block.
392 */ 345 */
393 if (inode && vfs_dq_alloc_block(inode, 1)) { 346 if (inode) {
394 mutex_unlock(&sbi->s_alloc_mutex); 347 int ret = dquot_alloc_block(inode, 1);
395 *err = -EDQUOT; 348
396 return 0; 349 if (ret) {
350 mutex_unlock(&sbi->s_alloc_mutex);
351 *err = ret;
352 return 0;
353 }
397 } 354 }
398 355
399 newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) - 356 newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) -
@@ -449,7 +406,7 @@ static void udf_table_free_blocks(struct super_block *sb,
449 /* We do this up front - There are some error conditions that 406 /* We do this up front - There are some error conditions that
450 could occur, but.. oh well */ 407 could occur, but.. oh well */
451 if (inode) 408 if (inode)
452 vfs_dq_free_block(inode, count); 409 dquot_free_block(inode, count);
453 udf_add_free_space(sb, sbi->s_partition, count); 410 udf_add_free_space(sb, sbi->s_partition, count);
454 411
455 start = bloc->logicalBlockNum + offset; 412 start = bloc->logicalBlockNum + offset;
@@ -547,7 +504,7 @@ static void udf_table_free_blocks(struct super_block *sb,
547 } 504 }
548 505
549 if (epos.offset + (2 * adsize) > sb->s_blocksize) { 506 if (epos.offset + (2 * adsize) > sb->s_blocksize) {
550 char *sptr, *dptr; 507 unsigned char *sptr, *dptr;
551 int loffset; 508 int loffset;
552 509
553 brelse(oepos.bh); 510 brelse(oepos.bh);
@@ -694,7 +651,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
694 epos.offset -= adsize; 651 epos.offset -= adsize;
695 652
696 alloc_count = (elen >> sb->s_blocksize_bits); 653 alloc_count = (elen >> sb->s_blocksize_bits);
697 if (inode && vfs_dq_prealloc_block(inode, 654 if (inode && dquot_prealloc_block(inode,
698 alloc_count > block_count ? block_count : alloc_count)) 655 alloc_count > block_count ? block_count : alloc_count))
699 alloc_count = 0; 656 alloc_count = 0;
700 else if (alloc_count > block_count) { 657 else if (alloc_count > block_count) {
@@ -797,12 +754,13 @@ static int udf_table_new_block(struct super_block *sb,
797 newblock = goal_eloc.logicalBlockNum; 754 newblock = goal_eloc.logicalBlockNum;
798 goal_eloc.logicalBlockNum++; 755 goal_eloc.logicalBlockNum++;
799 goal_elen -= sb->s_blocksize; 756 goal_elen -= sb->s_blocksize;
800 757 if (inode) {
801 if (inode && vfs_dq_alloc_block(inode, 1)) { 758 *err = dquot_alloc_block(inode, 1);
802 brelse(goal_epos.bh); 759 if (*err) {
803 mutex_unlock(&sbi->s_alloc_mutex); 760 brelse(goal_epos.bh);
804 *err = -EDQUOT; 761 mutex_unlock(&sbi->s_alloc_mutex);
805 return 0; 762 return 0;
763 }
806 } 764 }
807 765
808 if (goal_elen) 766 if (goal_elen)
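
The quota changes in this file all follow one pattern, repeated in udf/ialloc.c and the ufs hunks below: the old vfs_dq_* helpers reported failure as a bare boolean and callers hardcoded -EDQUOT, while the new dquot_* helpers return an errno that is propagated unchanged. Side by side, as in the hunks above:

/* before: boolean failure, error code invented by the caller */
if (inode && vfs_dq_alloc_block(inode, 1)) {
        *err = -EDQUOT;
        return 0;
}

/* after: the quota core chooses the errno (-EDQUOT, -EIO, ...) */
if (inode) {
        int ret = dquot_alloc_block(inode, 1);

        if (ret) {
                *err = ret;
                return 0;
        }
}
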
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 61d9a76a3a69..f0f2a436251e 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -45,8 +45,8 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
45 int block, iblock; 45 int block, iblock;
46 loff_t nf_pos = (filp->f_pos - 1) << 2; 46 loff_t nf_pos = (filp->f_pos - 1) << 2;
47 int flen; 47 int flen;
48 char *fname = NULL; 48 unsigned char *fname = NULL;
49 char *nameptr; 49 unsigned char *nameptr;
50 uint16_t liu; 50 uint16_t liu;
51 uint8_t lfi; 51 uint8_t lfi;
52 loff_t size = udf_ext0_offset(dir) + dir->i_size; 52 loff_t size = udf_ext0_offset(dir) + dir->i_size;
diff --git a/fs/udf/file.c b/fs/udf/file.c
index f311d509b6a3..1eb06774ed90 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -34,6 +34,7 @@
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/smp_lock.h> 35#include <linux/smp_lock.h>
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/quotaops.h>
37#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
38#include <linux/aio.h> 39#include <linux/aio.h>
39 40
@@ -207,7 +208,7 @@ const struct file_operations udf_file_operations = {
207 .read = do_sync_read, 208 .read = do_sync_read,
208 .aio_read = generic_file_aio_read, 209 .aio_read = generic_file_aio_read,
209 .ioctl = udf_ioctl, 210 .ioctl = udf_ioctl,
210 .open = generic_file_open, 211 .open = dquot_file_open,
211 .mmap = generic_file_mmap, 212 .mmap = generic_file_mmap,
212 .write = do_sync_write, 213 .write = do_sync_write,
213 .aio_write = udf_file_aio_write, 214 .aio_write = udf_file_aio_write,
@@ -217,6 +218,29 @@ const struct file_operations udf_file_operations = {
217 .llseek = generic_file_llseek, 218 .llseek = generic_file_llseek,
218}; 219};
219 220
221static int udf_setattr(struct dentry *dentry, struct iattr *iattr)
222{
223 struct inode *inode = dentry->d_inode;
224 int error;
225
226 error = inode_change_ok(inode, iattr);
227 if (error)
228 return error;
229
230 if (iattr->ia_valid & ATTR_SIZE)
231 dquot_initialize(inode);
232
233 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
234 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
235 error = dquot_transfer(inode, iattr);
236 if (error)
237 return error;
238 }
239
240 return inode_setattr(inode, iattr);
241}
242
220const struct inode_operations udf_file_inode_operations = { 243const struct inode_operations udf_file_inode_operations = {
221 .truncate = udf_truncate, 244 .truncate = udf_truncate,
245 .setattr = udf_setattr,
222}; 246};
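
Two quota hooks land in udf/file.c: .open becomes dquot_file_open, which ensures quota tracking is initialized before the first charge against a writable file, and the new udf_setattr() moves the inode's quota usage to the new owner on chown/chgrp via dquot_transfer(), initializing quota up front for truncates. The open helper roughly wraps generic_file_open (hedged paraphrase, not a verbatim excerpt):

static int demo_quota_open(struct inode *inode, struct file *file)
{
        int error = generic_file_open(inode, file);

        if (!error && (file->f_mode & FMODE_WRITE))
                dquot_initialize(inode);
        return error;
}
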
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index c10fa39f97e2..fb68c9cd0c3e 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -36,8 +36,8 @@ void udf_free_inode(struct inode *inode)
36 * Note: we must free any quota before locking the superblock, 36 * Note: we must free any quota before locking the superblock,
37 * as writing the quota to disk may need the lock as well. 37 * as writing the quota to disk may need the lock as well.
38 */ 38 */
39 vfs_dq_free_inode(inode); 39 dquot_free_inode(inode);
40 vfs_dq_drop(inode); 40 dquot_drop(inode);
41 41
42 clear_inode(inode); 42 clear_inode(inode);
43 43
@@ -61,7 +61,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
61 struct super_block *sb = dir->i_sb; 61 struct super_block *sb = dir->i_sb;
62 struct udf_sb_info *sbi = UDF_SB(sb); 62 struct udf_sb_info *sbi = UDF_SB(sb);
63 struct inode *inode; 63 struct inode *inode;
64 int block; 64 int block, ret;
65 uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; 65 uint32_t start = UDF_I(dir)->i_location.logicalBlockNum;
66 struct udf_inode_info *iinfo; 66 struct udf_inode_info *iinfo;
67 struct udf_inode_info *dinfo = UDF_I(dir); 67 struct udf_inode_info *dinfo = UDF_I(dir);
@@ -153,12 +153,14 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
153 insert_inode_hash(inode); 153 insert_inode_hash(inode);
154 mark_inode_dirty(inode); 154 mark_inode_dirty(inode);
155 155
156 if (vfs_dq_alloc_inode(inode)) { 156 dquot_initialize(inode);
157 vfs_dq_drop(inode); 157 ret = dquot_alloc_inode(inode);
158 if (ret) {
159 dquot_drop(inode);
158 inode->i_flags |= S_NOQUOTA; 160 inode->i_flags |= S_NOQUOTA;
159 inode->i_nlink = 0; 161 inode->i_nlink = 0;
160 iput(inode); 162 iput(inode);
161 *err = -EDQUOT; 163 *err = ret;
162 return NULL; 164 return NULL;
163 } 165 }
164 166
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index f90231eb2916..bb863fe579ac 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -36,6 +36,7 @@
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/quotaops.h>
39#include <linux/slab.h> 40#include <linux/slab.h>
40#include <linux/crc-itu-t.h> 41#include <linux/crc-itu-t.h>
41 42
@@ -70,6 +71,9 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
70 71
71void udf_delete_inode(struct inode *inode) 72void udf_delete_inode(struct inode *inode)
72{ 73{
74 if (!is_bad_inode(inode))
75 dquot_initialize(inode);
76
73 truncate_inode_pages(&inode->i_data, 0); 77 truncate_inode_pages(&inode->i_data, 0);
74 78
75 if (is_bad_inode(inode)) 79 if (is_bad_inode(inode))
@@ -102,12 +106,14 @@ void udf_clear_inode(struct inode *inode)
102 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && 106 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
103 inode->i_size != iinfo->i_lenExtents) { 107 inode->i_size != iinfo->i_lenExtents) {
104 printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has " 108 printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has "
105 "inode size %llu different from extent lenght %llu. " 109 "inode size %llu different from extent length %llu. "
106 "Filesystem need not be standards compliant.\n", 110 "Filesystem need not be standards compliant.\n",
107 inode->i_sb->s_id, inode->i_ino, inode->i_mode, 111 inode->i_sb->s_id, inode->i_ino, inode->i_mode,
108 (unsigned long long)inode->i_size, 112 (unsigned long long)inode->i_size,
109 (unsigned long long)iinfo->i_lenExtents); 113 (unsigned long long)iinfo->i_lenExtents);
110 } 114 }
115
116 dquot_drop(inode);
111 kfree(iinfo->i_ext.i_data); 117 kfree(iinfo->i_ext.i_data);
112 iinfo->i_ext.i_data = NULL; 118 iinfo->i_ext.i_data = NULL;
113} 119}
@@ -1373,12 +1379,12 @@ static mode_t udf_convert_permissions(struct fileEntry *fe)
1373 return mode; 1379 return mode;
1374} 1380}
1375 1381
1376int udf_write_inode(struct inode *inode, int sync) 1382int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
1377{ 1383{
1378 int ret; 1384 int ret;
1379 1385
1380 lock_kernel(); 1386 lock_kernel();
1381 ret = udf_update_inode(inode, sync); 1387 ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1382 unlock_kernel(); 1388 unlock_kernel();
1383 1389
1384 return ret; 1390 return ret;
@@ -1402,20 +1408,19 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1402 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 1408 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
1403 struct udf_inode_info *iinfo = UDF_I(inode); 1409 struct udf_inode_info *iinfo = UDF_I(inode);
1404 1410
1405 bh = udf_tread(inode->i_sb, 1411 bh = udf_tgetblk(inode->i_sb,
1406 udf_get_lb_pblock(inode->i_sb, 1412 udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0));
1407 &iinfo->i_location, 0));
1408 if (!bh) { 1413 if (!bh) {
1409 udf_debug("bread failure\n"); 1414 udf_debug("getblk failure\n");
1410 return -EIO; 1415 return -ENOMEM;
1411 } 1416 }
1412 1417
1413 memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); 1418 lock_buffer(bh);
1414 1419 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1415 fe = (struct fileEntry *)bh->b_data; 1420 fe = (struct fileEntry *)bh->b_data;
1416 efe = (struct extendedFileEntry *)bh->b_data; 1421 efe = (struct extendedFileEntry *)bh->b_data;
1417 1422
1418 if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) { 1423 if (iinfo->i_use) {
1419 struct unallocSpaceEntry *use = 1424 struct unallocSpaceEntry *use =
1420 (struct unallocSpaceEntry *)bh->b_data; 1425 (struct unallocSpaceEntry *)bh->b_data;
1421 1426
@@ -1423,20 +1428,18 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1423 memcpy(bh->b_data + sizeof(struct unallocSpaceEntry), 1428 memcpy(bh->b_data + sizeof(struct unallocSpaceEntry),
1424 iinfo->i_ext.i_data, inode->i_sb->s_blocksize - 1429 iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
1425 sizeof(struct unallocSpaceEntry)); 1430 sizeof(struct unallocSpaceEntry));
1431 use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE);
1432 use->descTag.tagLocation =
1433 cpu_to_le32(iinfo->i_location.logicalBlockNum);
1426 crclen = sizeof(struct unallocSpaceEntry) + 1434 crclen = sizeof(struct unallocSpaceEntry) +
1427 iinfo->i_lenAlloc - sizeof(struct tag); 1435 iinfo->i_lenAlloc - sizeof(struct tag);
1428 use->descTag.tagLocation = cpu_to_le32(
1429 iinfo->i_location.
1430 logicalBlockNum);
1431 use->descTag.descCRCLength = cpu_to_le16(crclen); 1436 use->descTag.descCRCLength = cpu_to_le16(crclen);
1432 use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use + 1437 use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
1433 sizeof(struct tag), 1438 sizeof(struct tag),
1434 crclen)); 1439 crclen));
1435 use->descTag.tagChecksum = udf_tag_checksum(&use->descTag); 1440 use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
1436 1441
1437 mark_buffer_dirty(bh); 1442 goto out;
1438 brelse(bh);
1439 return err;
1440 } 1443 }
1441 1444
1442 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) 1445 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET))
@@ -1591,18 +1594,21 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1591 fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number); 1594 fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number);
1592 fe->descTag.tagLocation = cpu_to_le32( 1595 fe->descTag.tagLocation = cpu_to_le32(
1593 iinfo->i_location.logicalBlockNum); 1596 iinfo->i_location.logicalBlockNum);
1594 crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - 1597 crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - sizeof(struct tag);
1595 sizeof(struct tag);
1596 fe->descTag.descCRCLength = cpu_to_le16(crclen); 1598 fe->descTag.descCRCLength = cpu_to_le16(crclen);
1597 fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag), 1599 fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag),
1598 crclen)); 1600 crclen));
1599 fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); 1601 fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
1600 1602
1603out:
1604 set_buffer_uptodate(bh);
1605 unlock_buffer(bh);
1606
1601 /* write the data blocks */ 1607 /* write the data blocks */
1602 mark_buffer_dirty(bh); 1608 mark_buffer_dirty(bh);
1603 if (do_sync) { 1609 if (do_sync) {
1604 sync_dirty_buffer(bh); 1610 sync_dirty_buffer(bh);
1605 if (buffer_req(bh) && !buffer_uptodate(bh)) { 1611 if (buffer_write_io_error(bh)) {
1606 printk(KERN_WARNING "IO error syncing udf inode " 1612 printk(KERN_WARNING "IO error syncing udf inode "
1607 "[%s:%08lx]\n", inode->i_sb->s_id, 1613 "[%s:%08lx]\n", inode->i_sb->s_id,
1608 inode->i_ino); 1614 inode->i_ino);
@@ -1672,7 +1678,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1672 return -1; 1678 return -1;
1673 1679
1674 if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) { 1680 if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) {
1675 char *sptr, *dptr; 1681 unsigned char *sptr, *dptr;
1676 struct buffer_head *nbh; 1682 struct buffer_head *nbh;
1677 int err, loffset; 1683 int err, loffset;
1678 struct kernel_lb_addr obloc = epos->block; 1684 struct kernel_lb_addr obloc = epos->block;
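
The udf_update_inode() rewrite above avoids a useless disk read: the inode block is rebuilt from scratch, so udf_tread() (which reads the block in) gives way to udf_tgetblk() plus a memset under the buffer lock, and failure to get the buffer is now -ENOMEM rather than -EIO. The same pattern in generic buffer-layer terms (hedged sketch; the udf_t* wrappers map onto these calls):

#include <linux/buffer_head.h>

static int demo_rewrite_block(struct super_block *sb, sector_t blocknr)
{
        struct buffer_head *bh = sb_getblk(sb, blocknr); /* no read I/O */

        if (!bh)
                return -ENOMEM;

        lock_buffer(bh);
        memset(bh->b_data, 0, sb->s_blocksize);
        /* ... rebuild the on-disk descriptor in bh->b_data ... */
        set_buffer_uptodate(bh);        /* contents are now authoritative */
        unlock_buffer(bh);
        mark_buffer_dirty(bh);
        brelse(bh);
        return 0;
}
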
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index cd2115060fdc..db423ab078b1 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -34,8 +34,8 @@
34#include <linux/crc-itu-t.h> 34#include <linux/crc-itu-t.h>
35#include <linux/exportfs.h> 35#include <linux/exportfs.h>
36 36
37static inline int udf_match(int len1, const char *name1, int len2, 37static inline int udf_match(int len1, const unsigned char *name1, int len2,
38 const char *name2) 38 const unsigned char *name2)
39{ 39{
40 if (len1 != len2) 40 if (len1 != len2)
41 return 0; 41 return 0;
@@ -142,15 +142,15 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
142} 142}
143 143
144static struct fileIdentDesc *udf_find_entry(struct inode *dir, 144static struct fileIdentDesc *udf_find_entry(struct inode *dir,
145 struct qstr *child, 145 const struct qstr *child,
146 struct udf_fileident_bh *fibh, 146 struct udf_fileident_bh *fibh,
147 struct fileIdentDesc *cfi) 147 struct fileIdentDesc *cfi)
148{ 148{
149 struct fileIdentDesc *fi = NULL; 149 struct fileIdentDesc *fi = NULL;
150 loff_t f_pos; 150 loff_t f_pos;
151 int block, flen; 151 int block, flen;
152 char *fname = NULL; 152 unsigned char *fname = NULL;
153 char *nameptr; 153 unsigned char *nameptr;
154 uint8_t lfi; 154 uint8_t lfi;
155 uint16_t liu; 155 uint16_t liu;
156 loff_t size; 156 loff_t size;
@@ -308,7 +308,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
308{ 308{
309 struct super_block *sb = dir->i_sb; 309 struct super_block *sb = dir->i_sb;
310 struct fileIdentDesc *fi = NULL; 310 struct fileIdentDesc *fi = NULL;
311 char *name = NULL; 311 unsigned char *name = NULL;
312 int namelen; 312 int namelen;
313 loff_t f_pos; 313 loff_t f_pos;
314 loff_t size = udf_ext0_offset(dir) + dir->i_size; 314 loff_t size = udf_ext0_offset(dir) + dir->i_size;
@@ -563,6 +563,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
563 int err; 563 int err;
564 struct udf_inode_info *iinfo; 564 struct udf_inode_info *iinfo;
565 565
566 dquot_initialize(dir);
567
566 lock_kernel(); 568 lock_kernel();
567 inode = udf_new_inode(dir, mode, &err); 569 inode = udf_new_inode(dir, mode, &err);
568 if (!inode) { 570 if (!inode) {
@@ -616,6 +618,8 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
616 if (!old_valid_dev(rdev)) 618 if (!old_valid_dev(rdev))
617 return -EINVAL; 619 return -EINVAL;
618 620
621 dquot_initialize(dir);
622
619 lock_kernel(); 623 lock_kernel();
620 err = -EIO; 624 err = -EIO;
621 inode = udf_new_inode(dir, mode, &err); 625 inode = udf_new_inode(dir, mode, &err);
@@ -662,6 +666,8 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
662 struct udf_inode_info *dinfo = UDF_I(dir); 666 struct udf_inode_info *dinfo = UDF_I(dir);
663 struct udf_inode_info *iinfo; 667 struct udf_inode_info *iinfo;
664 668
669 dquot_initialize(dir);
670
665 lock_kernel(); 671 lock_kernel();
666 err = -EMLINK; 672 err = -EMLINK;
667 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 673 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
@@ -799,6 +805,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
799 struct fileIdentDesc *fi, cfi; 805 struct fileIdentDesc *fi, cfi;
800 struct kernel_lb_addr tloc; 806 struct kernel_lb_addr tloc;
801 807
808 dquot_initialize(dir);
809
802 retval = -ENOENT; 810 retval = -ENOENT;
803 lock_kernel(); 811 lock_kernel();
804 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 812 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -845,6 +853,8 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
845 struct fileIdentDesc cfi; 853 struct fileIdentDesc cfi;
846 struct kernel_lb_addr tloc; 854 struct kernel_lb_addr tloc;
847 855
856 dquot_initialize(dir);
857
848 retval = -ENOENT; 858 retval = -ENOENT;
849 lock_kernel(); 859 lock_kernel();
850 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 860 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -885,20 +895,22 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
885{ 895{
886 struct inode *inode; 896 struct inode *inode;
887 struct pathComponent *pc; 897 struct pathComponent *pc;
888 char *compstart; 898 const char *compstart;
889 struct udf_fileident_bh fibh; 899 struct udf_fileident_bh fibh;
890 struct extent_position epos = {}; 900 struct extent_position epos = {};
891 int eoffset, elen = 0; 901 int eoffset, elen = 0;
892 struct fileIdentDesc *fi; 902 struct fileIdentDesc *fi;
893 struct fileIdentDesc cfi; 903 struct fileIdentDesc cfi;
894 char *ea; 904 uint8_t *ea;
895 int err; 905 int err;
896 int block; 906 int block;
897 char *name = NULL; 907 unsigned char *name = NULL;
898 int namelen; 908 int namelen;
899 struct buffer_head *bh; 909 struct buffer_head *bh;
900 struct udf_inode_info *iinfo; 910 struct udf_inode_info *iinfo;
901 911
912 dquot_initialize(dir);
913
902 lock_kernel(); 914 lock_kernel();
903 inode = udf_new_inode(dir, S_IFLNK, &err); 915 inode = udf_new_inode(dir, S_IFLNK, &err);
904 if (!inode) 916 if (!inode)
@@ -970,7 +982,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
970 982
971 pc = (struct pathComponent *)(ea + elen); 983 pc = (struct pathComponent *)(ea + elen);
972 984
973 compstart = (char *)symname; 985 compstart = symname;
974 986
975 do { 987 do {
976 symname++; 988 symname++;
@@ -1069,6 +1081,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1069 int err; 1081 int err;
1070 struct buffer_head *bh; 1082 struct buffer_head *bh;
1071 1083
1084 dquot_initialize(dir);
1085
1072 lock_kernel(); 1086 lock_kernel();
1073 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1087 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
1074 unlock_kernel(); 1088 unlock_kernel();
@@ -1131,6 +1145,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1131 struct kernel_lb_addr tloc; 1145 struct kernel_lb_addr tloc;
1132 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1146 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1133 1147
1148 dquot_initialize(old_dir);
1149 dquot_initialize(new_dir);
1150
1134 lock_kernel(); 1151 lock_kernel();
1135 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1152 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1136 if (ofi) { 1153 if (ofi) {
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index c3265e1385d4..852e91845688 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -32,12 +32,12 @@
32#include <linux/buffer_head.h> 32#include <linux/buffer_head.h>
33#include "udf_i.h" 33#include "udf_i.h"
34 34
35static void udf_pc_to_char(struct super_block *sb, char *from, int fromlen, 35static void udf_pc_to_char(struct super_block *sb, unsigned char *from,
36 char *to) 36 int fromlen, unsigned char *to)
37{ 37{
38 struct pathComponent *pc; 38 struct pathComponent *pc;
39 int elen = 0; 39 int elen = 0;
40 char *p = to; 40 unsigned char *p = to;
41 41
42 while (elen < fromlen) { 42 while (elen < fromlen) {
43 pc = (struct pathComponent *)(from + elen); 43 pc = (struct pathComponent *)(from + elen);
@@ -75,9 +75,9 @@ static int udf_symlink_filler(struct file *file, struct page *page)
75{ 75{
76 struct inode *inode = page->mapping->host; 76 struct inode *inode = page->mapping->host;
77 struct buffer_head *bh = NULL; 77 struct buffer_head *bh = NULL;
78 char *symlink; 78 unsigned char *symlink;
79 int err = -EIO; 79 int err = -EIO;
80 char *p = kmap(page); 80 unsigned char *p = kmap(page);
81 struct udf_inode_info *iinfo; 81 struct udf_inode_info *iinfo;
82 82
83 lock_kernel(); 83 lock_kernel();
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 8d46f4294ee7..4223ac855da9 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -142,7 +142,7 @@ extern void udf_truncate(struct inode *);
142extern void udf_read_inode(struct inode *); 142extern void udf_read_inode(struct inode *);
143extern void udf_delete_inode(struct inode *); 143extern void udf_delete_inode(struct inode *);
144extern void udf_clear_inode(struct inode *); 144extern void udf_clear_inode(struct inode *);
145extern int udf_write_inode(struct inode *, int); 145extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
146extern long udf_block_map(struct inode *, sector_t); 146extern long udf_block_map(struct inode *, sector_t);
147extern int udf_extend_file(struct inode *, struct extent_position *, 147extern int udf_extend_file(struct inode *, struct extent_position *,
148 struct kernel_long_ad *, sector_t); 148 struct kernel_long_ad *, sector_t);
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 54c16ec95dff..5cfa4d85ccf2 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -85,7 +85,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
85 "bit already cleared for fragment %u", i); 85 "bit already cleared for fragment %u", i);
86 } 86 }
87 87
88 vfs_dq_free_block(inode, count); 88 dquot_free_block(inode, count);
89 89
90 90
91 fs32_add(sb, &ucg->cg_cs.cs_nffree, count); 91 fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
@@ -195,7 +195,7 @@ do_more:
195 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 195 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
196 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 196 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
197 ufs_clusteracct (sb, ucpi, blkno, 1); 197 ufs_clusteracct (sb, ucpi, blkno, 1);
198 vfs_dq_free_block(inode, uspi->s_fpb); 198 dquot_free_block(inode, uspi->s_fpb);
199 199
200 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); 200 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
201 uspi->cs_total.cs_nbfree++; 201 uspi->cs_total.cs_nbfree++;
@@ -511,6 +511,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
511 struct ufs_cg_private_info * ucpi; 511 struct ufs_cg_private_info * ucpi;
512 struct ufs_cylinder_group * ucg; 512 struct ufs_cylinder_group * ucg;
513 unsigned cgno, fragno, fragoff, count, fragsize, i; 513 unsigned cgno, fragno, fragoff, count, fragsize, i;
514 int ret;
514 515
515 UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", 516 UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n",
516 (unsigned long long)fragment, oldcount, newcount); 517 (unsigned long long)fragment, oldcount, newcount);
@@ -556,8 +557,9 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
556 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); 557 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
557 for (i = oldcount; i < newcount; i++) 558 for (i = oldcount; i < newcount; i++)
558 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); 559 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
559 if (vfs_dq_alloc_block(inode, count)) { 560 ret = dquot_alloc_block(inode, count);
560 *err = -EDQUOT; 561 if (ret) {
562 *err = ret;
561 return 0; 563 return 0;
562 } 564 }
563 565
@@ -596,6 +598,7 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
596 struct ufs_cylinder_group * ucg; 598 struct ufs_cylinder_group * ucg;
597 unsigned oldcg, i, j, k, allocsize; 599 unsigned oldcg, i, j, k, allocsize;
598 u64 result; 600 u64 result;
601 int ret;
599 602
600 UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", 603 UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n",
601 inode->i_ino, cgno, (unsigned long long)goal, count); 604 inode->i_ino, cgno, (unsigned long long)goal, count);
@@ -664,7 +667,7 @@ cg_found:
664 for (i = count; i < uspi->s_fpb; i++) 667 for (i = count; i < uspi->s_fpb; i++)
665 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); 668 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
666 i = uspi->s_fpb - count; 669 i = uspi->s_fpb - count;
667 vfs_dq_free_block(inode, i); 670 dquot_free_block(inode, i);
668 671
669 fs32_add(sb, &ucg->cg_cs.cs_nffree, i); 672 fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
670 uspi->cs_total.cs_nffree += i; 673 uspi->cs_total.cs_nffree += i;
@@ -676,8 +679,9 @@ cg_found:
676 result = ufs_bitmap_search (sb, ucpi, goal, allocsize); 679 result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
677 if (result == INVBLOCK) 680 if (result == INVBLOCK)
678 return 0; 681 return 0;
679 if (vfs_dq_alloc_block(inode, count)) { 682 ret = dquot_alloc_block(inode, count);
680 *err = -EDQUOT; 683 if (ret) {
684 *err = ret;
681 return 0; 685 return 0;
682 } 686 }
683 for (i = 0; i < count; i++) 687 for (i = 0; i < count; i++)
@@ -714,6 +718,7 @@ static u64 ufs_alloccg_block(struct inode *inode,
714 struct ufs_super_block_first * usb1; 718 struct ufs_super_block_first * usb1;
715 struct ufs_cylinder_group * ucg; 719 struct ufs_cylinder_group * ucg;
716 u64 result, blkno; 720 u64 result, blkno;
721 int ret;
717 722
718 UFSD("ENTER, goal %llu\n", (unsigned long long)goal); 723 UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
719 724
@@ -747,8 +752,9 @@ gotit:
747 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 752 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
748 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 753 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
749 ufs_clusteracct (sb, ucpi, blkno, -1); 754 ufs_clusteracct (sb, ucpi, blkno, -1);
750 if (vfs_dq_alloc_block(inode, uspi->s_fpb)) { 755 ret = dquot_alloc_block(inode, uspi->s_fpb);
751 *err = -EDQUOT; 756 if (ret) {
757 *err = ret;
752 return INVBLOCK; 758 return INVBLOCK;
753 } 759 }
754 760
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 22af68f8b682..317a0d444f6b 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -31,7 +31,7 @@
31 * len <= UFS_MAXNAMLEN and de != NULL are guaranteed by caller. 31 * len <= UFS_MAXNAMLEN and de != NULL are guaranteed by caller.
32 */ 32 */
33static inline int ufs_match(struct super_block *sb, int len, 33static inline int ufs_match(struct super_block *sb, int len,
34 const char * const name, struct ufs_dir_entry * de) 34 const unsigned char *name, struct ufs_dir_entry *de)
35{ 35{
36 if (len != ufs_get_de_namlen(sb, de)) 36 if (len != ufs_get_de_namlen(sb, de))
37 return 0; 37 return 0;
@@ -70,7 +70,7 @@ static inline unsigned long ufs_dir_pages(struct inode *inode)
70 return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; 70 return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
71} 71}
72 72
73ino_t ufs_inode_by_name(struct inode *dir, struct qstr *qstr) 73ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr)
74{ 74{
75 ino_t res = 0; 75 ino_t res = 0;
76 struct ufs_dir_entry *de; 76 struct ufs_dir_entry *de;
@@ -249,11 +249,11 @@ struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p)
249 * (as a parameter - res_dir). Page is returned mapped and unlocked. 249 * (as a parameter - res_dir). Page is returned mapped and unlocked.
250 * Entry is guaranteed to be valid. 250 * Entry is guaranteed to be valid.
251 */ 251 */
252struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct qstr *qstr, 252struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr,
253 struct page **res_page) 253 struct page **res_page)
254{ 254{
255 struct super_block *sb = dir->i_sb; 255 struct super_block *sb = dir->i_sb;
256 const char *name = qstr->name; 256 const unsigned char *name = qstr->name;
257 int namelen = qstr->len; 257 int namelen = qstr->len;
258 unsigned reclen = UFS_DIR_REC_LEN(namelen); 258 unsigned reclen = UFS_DIR_REC_LEN(namelen);
259 unsigned long start, n; 259 unsigned long start, n;
@@ -313,7 +313,7 @@ found:
313int ufs_add_link(struct dentry *dentry, struct inode *inode) 313int ufs_add_link(struct dentry *dentry, struct inode *inode)
314{ 314{
315 struct inode *dir = dentry->d_parent->d_inode; 315 struct inode *dir = dentry->d_parent->d_inode;
316 const char *name = dentry->d_name.name; 316 const unsigned char *name = dentry->d_name.name;
317 int namelen = dentry->d_name.len; 317 int namelen = dentry->d_name.len;
318 struct super_block *sb = dir->i_sb; 318 struct super_block *sb = dir->i_sb;
319 unsigned reclen = UFS_DIR_REC_LEN(namelen); 319 unsigned reclen = UFS_DIR_REC_LEN(namelen);
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 73655c61240a..a8962cecde5b 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,6 +24,7 @@
24 */ 24 */
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/quotaops.h>
27 28
28#include "ufs_fs.h" 29#include "ufs_fs.h"
29#include "ufs.h" 30#include "ufs.h"
@@ -40,7 +41,7 @@ const struct file_operations ufs_file_operations = {
40 .write = do_sync_write, 41 .write = do_sync_write,
41 .aio_write = generic_file_aio_write, 42 .aio_write = generic_file_aio_write,
42 .mmap = generic_file_mmap, 43 .mmap = generic_file_mmap,
43 .open = generic_file_open, 44 .open = dquot_file_open,
44 .fsync = simple_fsync, 45 .fsync = simple_fsync,
45 .splice_read = generic_file_splice_read, 46 .splice_read = generic_file_splice_read,
46}; 47};
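Swapping generic_file_open() for dquot_file_open() ensures the inode's quota structures are attached the first time the file is opened for writing; read-only opens behave as before. Roughly (a sketch of the expected behaviour, not the exact library source):

	int dquot_file_open(struct inode *inode, struct file *file)
	{
		int error = generic_file_open(inode, file);

		if (!error && (file->f_mode & FMODE_WRITE))
			dquot_initialize(inode);	/* attach dquots for writers */
		return error;
	}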
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 3527c00fef0d..230ecf608026 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -95,8 +95,8 @@ void ufs_free_inode (struct inode * inode)
95 95
96 is_directory = S_ISDIR(inode->i_mode); 96 is_directory = S_ISDIR(inode->i_mode);
97 97
98 vfs_dq_free_inode(inode); 98 dquot_free_inode(inode);
99 vfs_dq_drop(inode); 99 dquot_drop(inode);
100 100
101 clear_inode (inode); 101 clear_inode (inode);
102 102
@@ -355,9 +355,10 @@ cg_found:
355 355
356 unlock_super (sb); 356 unlock_super (sb);
357 357
358 if (vfs_dq_alloc_inode(inode)) { 358 dquot_initialize(inode);
359 vfs_dq_drop(inode); 359 err = dquot_alloc_inode(inode);
360 err = -EDQUOT; 360 if (err) {
361 dquot_drop(inode);
361 goto fail_without_unlock; 362 goto fail_without_unlock;
362 } 363 }
363 364
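The ordering in this hunk matters: dquot_initialize() attaches the inode's dquot structures, dquot_alloc_inode() then charges the inode quota against them, and dquot_drop() detaches again if the charge fails. Condensed:

	dquot_initialize(inode);		/* attach inode->i_dquot[] */
	err = dquot_alloc_inode(inode);		/* charge one inode against quota */
	if (err)
		dquot_drop(inode);		/* undo the attach on failure */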
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 7cf33379fd46..80b68c3702d1 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -36,6 +36,8 @@
36#include <linux/mm.h> 36#include <linux/mm.h>
37#include <linux/smp_lock.h> 37#include <linux/smp_lock.h>
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/writeback.h>
40#include <linux/quotaops.h>
39 41
40#include "ufs_fs.h" 42#include "ufs_fs.h"
41#include "ufs.h" 43#include "ufs.h"
@@ -890,11 +892,11 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
890 return 0; 892 return 0;
891} 893}
892 894
893int ufs_write_inode (struct inode * inode, int wait) 895int ufs_write_inode(struct inode *inode, struct writeback_control *wbc)
894{ 896{
895 int ret; 897 int ret;
896 lock_kernel(); 898 lock_kernel();
897 ret = ufs_update_inode (inode, wait); 899 ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
898 unlock_kernel(); 900 unlock_kernel();
899 return ret; 901 return ret;
900} 902}
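->write_inode now takes the writeback_control instead of a bare wait flag, so the synchronous/asynchronous decision is derived from wbc->sync_mode, as the hunk above does. The adaptation pattern in isolation (example_update_inode is a hypothetical helper standing in for ufs_update_inode):

	#include <linux/writeback.h>

	static int example_write_inode(struct inode *inode,
				       struct writeback_control *wbc)
	{
		/* WB_SYNC_ALL means the caller requires a synchronous write */
		int wait = (wbc->sync_mode == WB_SYNC_ALL);

		return example_update_inode(inode, wait);
	}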
@@ -908,6 +910,9 @@ void ufs_delete_inode (struct inode * inode)
908{ 910{
909 loff_t old_i_size; 911 loff_t old_i_size;
910 912
913 if (!is_bad_inode(inode))
914 dquot_initialize(inode);
915
911 truncate_inode_pages(&inode->i_data, 0); 916 truncate_inode_pages(&inode->i_data, 0);
912 if (is_bad_inode(inode)) 917 if (is_bad_inode(inode))
913 goto no_delete; 918 goto no_delete;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 4c26d9e8bc94..118556243e7a 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -30,6 +30,7 @@
30#include <linux/time.h> 30#include <linux/time.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/smp_lock.h> 32#include <linux/smp_lock.h>
33#include <linux/quotaops.h>
33 34
34#include "ufs_fs.h" 35#include "ufs_fs.h"
35#include "ufs.h" 36#include "ufs.h"
@@ -84,6 +85,9 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
84 int err; 85 int err;
85 86
86 UFSD("BEGIN\n"); 87 UFSD("BEGIN\n");
88
89 dquot_initialize(dir);
90
87 inode = ufs_new_inode(dir, mode); 91 inode = ufs_new_inode(dir, mode);
88 err = PTR_ERR(inode); 92 err = PTR_ERR(inode);
89 93
@@ -107,6 +111,9 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
107 111
108 if (!old_valid_dev(rdev)) 112 if (!old_valid_dev(rdev))
109 return -EINVAL; 113 return -EINVAL;
114
115 dquot_initialize(dir);
116
110 inode = ufs_new_inode(dir, mode); 117 inode = ufs_new_inode(dir, mode);
111 err = PTR_ERR(inode); 118 err = PTR_ERR(inode);
112 if (!IS_ERR(inode)) { 119 if (!IS_ERR(inode)) {
@@ -131,6 +138,8 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
131 if (l > sb->s_blocksize) 138 if (l > sb->s_blocksize)
132 goto out_notlocked; 139 goto out_notlocked;
133 140
141 dquot_initialize(dir);
142
134 lock_kernel(); 143 lock_kernel();
135 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); 144 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
136 err = PTR_ERR(inode); 145 err = PTR_ERR(inode);
@@ -176,6 +185,8 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
176 return -EMLINK; 185 return -EMLINK;
177 } 186 }
178 187
188 dquot_initialize(dir);
189
179 inode->i_ctime = CURRENT_TIME_SEC; 190 inode->i_ctime = CURRENT_TIME_SEC;
180 inode_inc_link_count(inode); 191 inode_inc_link_count(inode);
181 atomic_inc(&inode->i_count); 192 atomic_inc(&inode->i_count);
@@ -193,6 +204,8 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
193 if (dir->i_nlink >= UFS_LINK_MAX) 204 if (dir->i_nlink >= UFS_LINK_MAX)
194 goto out; 205 goto out;
195 206
207 dquot_initialize(dir);
208
196 lock_kernel(); 209 lock_kernel();
197 inode_inc_link_count(dir); 210 inode_inc_link_count(dir);
198 211
@@ -237,6 +250,8 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
237 struct page *page; 250 struct page *page;
238 int err = -ENOENT; 251 int err = -ENOENT;
239 252
253 dquot_initialize(dir);
254
240 de = ufs_find_entry(dir, &dentry->d_name, &page); 255 de = ufs_find_entry(dir, &dentry->d_name, &page);
241 if (!de) 256 if (!de)
242 goto out; 257 goto out;
@@ -281,6 +296,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
281 struct ufs_dir_entry *old_de; 296 struct ufs_dir_entry *old_de;
282 int err = -ENOENT; 297 int err = -ENOENT;
283 298
299 dquot_initialize(old_dir);
300 dquot_initialize(new_dir);
301
284 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); 302 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
285 if (!old_de) 303 if (!old_de)
286 goto out; 304 goto out;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 143c20bfb04b..14743d935a93 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1016,6 +1016,9 @@ magic_found:
1016 case UFS_FSSTABLE: 1016 case UFS_FSSTABLE:
1017 UFSD("fs is stable\n"); 1017 UFSD("fs is stable\n");
1018 break; 1018 break;
1019 case UFS_FSLOG:
1020 UFSD("fs is logging fs\n");
1021 break;
1019 case UFS_FSOSF1: 1022 case UFS_FSOSF1:
1020 UFSD("fs is DEC OSF/1\n"); 1023 UFSD("fs is DEC OSF/1\n");
1021 break; 1024 break;
@@ -1432,6 +1435,11 @@ static void destroy_inodecache(void)
1432 kmem_cache_destroy(ufs_inode_cachep); 1435 kmem_cache_destroy(ufs_inode_cachep);
1433} 1436}
1434 1437
1438static void ufs_clear_inode(struct inode *inode)
1439{
1440 dquot_drop(inode);
1441}
1442
1435#ifdef CONFIG_QUOTA 1443#ifdef CONFIG_QUOTA
1436static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t); 1444static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t);
1437static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t); 1445static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t);
@@ -1442,6 +1450,7 @@ static const struct super_operations ufs_super_ops = {
1442 .destroy_inode = ufs_destroy_inode, 1450 .destroy_inode = ufs_destroy_inode,
1443 .write_inode = ufs_write_inode, 1451 .write_inode = ufs_write_inode,
1444 .delete_inode = ufs_delete_inode, 1452 .delete_inode = ufs_delete_inode,
1453 .clear_inode = ufs_clear_inode,
1445 .put_super = ufs_put_super, 1454 .put_super = ufs_put_super,
1446 .write_super = ufs_write_super, 1455 .write_super = ufs_write_super,
1447 .sync_fs = ufs_sync_fs, 1456 .sync_fs = ufs_sync_fs,
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 41dd431ce228..d3b6270cb377 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -44,6 +44,7 @@
44#include <linux/buffer_head.h> 44#include <linux/buffer_head.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/quotaops.h>
47 48
48#include "ufs_fs.h" 49#include "ufs_fs.h"
49#include "ufs.h" 50#include "ufs.h"
@@ -517,9 +518,18 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr)
517 if (error) 518 if (error)
518 return error; 519 return error;
519 520
521 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
522 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
523 error = dquot_transfer(inode, attr);
524 if (error)
525 return error;
526 }
520 if (ia_valid & ATTR_SIZE && 527 if (ia_valid & ATTR_SIZE &&
521 attr->ia_size != i_size_read(inode)) { 528 attr->ia_size != i_size_read(inode)) {
522 loff_t old_i_size = inode->i_size; 529 loff_t old_i_size = inode->i_size;
530
531 dquot_initialize(inode);
532
523 error = vmtruncate(inode, attr->ia_size); 533 error = vmtruncate(inode, attr->ia_size);
524 if (error) 534 if (error)
525 return error; 535 return error;
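Quota-wise, the setattr path now does two explicit things: transfer the charged usage between dquots when the owner or group changes, and attach dquots before a truncate frees blocks. A condensed sketch of the whole flow under those assumptions (inode_change_ok/inode_setattr as used by other simple filesystems of this era):

	static int example_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int error = inode_change_ok(inode, attr);

		if (error)
			return error;

		/* Ownership change: move charged usage to the new owner/group. */
		if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
		    (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
			error = dquot_transfer(inode, attr);
			if (error)
				return error;
		}

		/* Truncation frees blocks, so dquots must be attached first. */
		if (attr->ia_valid & ATTR_SIZE &&
		    attr->ia_size != i_size_read(inode)) {
			dquot_initialize(inode);
			error = vmtruncate(inode, attr->ia_size);
			if (error)
				return error;
		}
		return inode_setattr(inode, attr);
	}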
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 0b4c39bc0d9e..43f9f5d5670e 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -86,9 +86,9 @@ extern void ufs_put_cylinder (struct super_block *, unsigned);
86/* dir.c */ 86/* dir.c */
87extern const struct inode_operations ufs_dir_inode_operations; 87extern const struct inode_operations ufs_dir_inode_operations;
88extern int ufs_add_link (struct dentry *, struct inode *); 88extern int ufs_add_link (struct dentry *, struct inode *);
89extern ino_t ufs_inode_by_name(struct inode *, struct qstr *); 89extern ino_t ufs_inode_by_name(struct inode *, const struct qstr *);
90extern int ufs_make_empty(struct inode *, struct inode *); 90extern int ufs_make_empty(struct inode *, struct inode *);
91extern struct ufs_dir_entry *ufs_find_entry(struct inode *, struct qstr *, struct page **); 91extern struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, struct page **);
92extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *); 92extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *);
93extern int ufs_empty_dir (struct inode *); 93extern int ufs_empty_dir (struct inode *);
94extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **); 94extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **);
@@ -106,7 +106,7 @@ extern struct inode * ufs_new_inode (struct inode *, int);
106 106
107/* inode.c */ 107/* inode.c */
108extern struct inode *ufs_iget(struct super_block *, unsigned long); 108extern struct inode *ufs_iget(struct super_block *, unsigned long);
109extern int ufs_write_inode (struct inode *, int); 109extern int ufs_write_inode (struct inode *, struct writeback_control *);
110extern int ufs_sync_inode (struct inode *); 110extern int ufs_sync_inode (struct inode *);
111extern void ufs_delete_inode (struct inode *); 111extern void ufs_delete_inode (struct inode *);
112extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *); 112extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 54bde1895a80..6943ec677c0b 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -138,6 +138,7 @@ typedef __u16 __bitwise __fs16;
138 138
139#define UFS_USEEFT ((__u16)65535) 139#define UFS_USEEFT ((__u16)65535)
140 140
141/* fs_clean values */
141#define UFS_FSOK 0x7c269d38 142#define UFS_FSOK 0x7c269d38
142#define UFS_FSACTIVE ((__s8)0x00) 143#define UFS_FSACTIVE ((__s8)0x00)
143#define UFS_FSCLEAN ((__s8)0x01) 144#define UFS_FSCLEAN ((__s8)0x01)
@@ -145,6 +146,11 @@ typedef __u16 __bitwise __fs16;
145#define UFS_FSOSF1 ((__s8)0x03) /* is this correct for DEC OSF/1? */ 146#define UFS_FSOSF1 ((__s8)0x03) /* is this correct for DEC OSF/1? */
146#define UFS_FSBAD ((__s8)0xff) 147#define UFS_FSBAD ((__s8)0xff)
147 148
149/* Solaris-specific fs_clean values */
150#define UFS_FSSUSPEND ((__s8)0xfe) /* temporarily suspended */
151#define UFS_FSLOG ((__s8)0xfd) /* logging fs */
152#define UFS_FSFIX ((__s8)0xfc) /* being repaired while mounted */
153
148/* From here to next blank line, s_flags for ufs_sb_info */ 154/* From here to next blank line, s_flags for ufs_sb_info */
149/* directory entry encoding */ 155/* directory entry encoding */
150#define UFS_DE_MASK 0x00000010 /* mask for the following */ 156#define UFS_DE_MASK 0x00000010 /* mask for the following */
@@ -227,11 +233,16 @@ typedef __u16 __bitwise __fs16;
227 */ 233 */
228#define ufs_cbtocylno(bno) \ 234#define ufs_cbtocylno(bno) \
229 ((bno) * uspi->s_nspf / uspi->s_spc) 235 ((bno) * uspi->s_nspf / uspi->s_spc)
230#define ufs_cbtorpos(bno) \ 236#define ufs_cbtorpos(bno) \
237 ((UFS_SB(sb)->s_flags & UFS_CG_SUN) ? \
238 (((((bno) * uspi->s_nspf % uspi->s_spc) % \
239 uspi->s_nsect) * \
240 uspi->s_nrpos) / uspi->s_nsect) \
241 : \
231 ((((bno) * uspi->s_nspf % uspi->s_spc / uspi->s_nsect \ 242 ((((bno) * uspi->s_nspf % uspi->s_spc / uspi->s_nsect \
232 * uspi->s_trackskew + (bno) * uspi->s_nspf % uspi->s_spc \ 243 * uspi->s_trackskew + (bno) * uspi->s_nspf % uspi->s_spc \
233 % uspi->s_nsect * uspi->s_interleave) % uspi->s_nsect \ 244 % uspi->s_nsect * uspi->s_interleave) % uspi->s_nsect \
234 * uspi->s_nrpos) / uspi->s_npsect) 245 * uspi->s_nrpos) / uspi->s_npsect))
235 246
236/* 247/*
237 * The following macros optimize certain frequently calculated 248 * The following macros optimize certain frequently calculated
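The UFS_CG_SUN branch added to ufs_cbtorpos() drops the trackskew/interleave terms: Sun layouts derive the rotational position purely from the sector offset within the cylinder. A standalone version with illustrative geometry values (made up for the example, not from any real superblock):

	/* Sun-style rotational position, mirroring the UFS_CG_SUN branch. */
	static unsigned int sun_cbtorpos(unsigned int bno, unsigned int nspf,
					 unsigned int spc, unsigned int nsect,
					 unsigned int nrpos)
	{
		return (((bno * nspf % spc) % nsect) * nrpos) / nsect;
	}

	/*
	 * Example: nspf = 2, spc = 640, nsect = 32, nrpos = 8, bno = 100:
	 *   100 * 2 = 200 sectors; 200 % 640 = 200 (offset within cylinder);
	 *   200 % 32 = 8 (offset within track); 8 * 8 / 32 = 2 -> rotational slot 2.
	 */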
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 5c5a366aa332..b4769e40e8bc 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -105,7 +105,6 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
105 xfs_globals.o \ 105 xfs_globals.o \
106 xfs_ioctl.o \ 106 xfs_ioctl.o \
107 xfs_iops.o \ 107 xfs_iops.o \
108 xfs_lrw.o \
109 xfs_super.o \ 108 xfs_super.o \
110 xfs_sync.o \ 109 xfs_sync.o \
111 xfs_xattr.o) 110 xfs_xattr.o)
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 66abe36c1213..99628508cb11 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -39,6 +39,7 @@
39#include "xfs_iomap.h" 39#include "xfs_iomap.h"
40#include "xfs_vnodeops.h" 40#include "xfs_vnodeops.h"
41#include "xfs_trace.h" 41#include "xfs_trace.h"
42#include "xfs_bmap.h"
42#include <linux/mpage.h> 43#include <linux/mpage.h>
43#include <linux/pagevec.h> 44#include <linux/pagevec.h>
44#include <linux/writeback.h> 45#include <linux/writeback.h>
@@ -163,14 +164,17 @@ xfs_ioend_new_eof(
163} 164}
164 165
165/* 166/*
166 * Update on-disk file size now that data has been written to disk. 167 * Update on-disk file size now that data has been written to disk. The
167 * The current in-memory file size is i_size. If a write is beyond 168 * current in-memory file size is i_size. If a write is beyond eof i_new_size
168 * eof i_new_size will be the intended file size until i_size is 169 * will be the intended file size until i_size is updated. If this write does
169 * updated. If this write does not extend all the way to the valid 170 * not extend all the way to the valid file size then restrict this update to
170 * file size then restrict this update to the end of the write. 171 * the end of the write.
172 *
173 * This function does not block as blocking on the inode lock in IO completion
 174 * can lead to IO completion order dependency deadlocks. If it can't get the
175 * inode ilock it will return EAGAIN. Callers must handle this.
171 */ 176 */
172 177STATIC int
173STATIC void
174xfs_setfilesize( 178xfs_setfilesize(
175 xfs_ioend_t *ioend) 179 xfs_ioend_t *ioend)
176{ 180{
@@ -181,16 +185,40 @@ xfs_setfilesize(
181 ASSERT(ioend->io_type != IOMAP_READ); 185 ASSERT(ioend->io_type != IOMAP_READ);
182 186
183 if (unlikely(ioend->io_error)) 187 if (unlikely(ioend->io_error))
184 return; 188 return 0;
189
190 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
191 return EAGAIN;
185 192
186 xfs_ilock(ip, XFS_ILOCK_EXCL);
187 isize = xfs_ioend_new_eof(ioend); 193 isize = xfs_ioend_new_eof(ioend);
188 if (isize) { 194 if (isize) {
189 ip->i_d.di_size = isize; 195 ip->i_d.di_size = isize;
190 xfs_mark_inode_dirty_sync(ip); 196 xfs_mark_inode_dirty(ip);
191 } 197 }
192 198
193 xfs_iunlock(ip, XFS_ILOCK_EXCL); 199 xfs_iunlock(ip, XFS_ILOCK_EXCL);
200 return 0;
201}
202
203/*
204 * Schedule IO completion handling on a xfsdatad if this was
205 * the final hold on this ioend. If we are asked to wait,
206 * flush the workqueue.
207 */
208STATIC void
209xfs_finish_ioend(
210 xfs_ioend_t *ioend,
211 int wait)
212{
213 if (atomic_dec_and_test(&ioend->io_remaining)) {
214 struct workqueue_struct *wq;
215
216 wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
217 xfsconvertd_workqueue : xfsdatad_workqueue;
218 queue_work(wq, &ioend->io_work);
219 if (wait)
220 flush_workqueue(wq);
221 }
194} 222}
195 223
196/* 224/*
@@ -198,11 +226,11 @@ xfs_setfilesize(
198 */ 226 */
199STATIC void 227STATIC void
200xfs_end_io( 228xfs_end_io(
201 struct work_struct *work) 229 struct work_struct *work)
202{ 230{
203 xfs_ioend_t *ioend = 231 xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work);
204 container_of(work, xfs_ioend_t, io_work); 232 struct xfs_inode *ip = XFS_I(ioend->io_inode);
205 struct xfs_inode *ip = XFS_I(ioend->io_inode); 233 int error = 0;
206 234
207 /* 235 /*
208 * For unwritten extents we need to issue transactions to convert a 236 * For unwritten extents we need to issue transactions to convert a
@@ -210,7 +238,6 @@ xfs_end_io(
210 */ 238 */
211 if (ioend->io_type == IOMAP_UNWRITTEN && 239 if (ioend->io_type == IOMAP_UNWRITTEN &&
212 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { 240 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
213 int error;
214 241
215 error = xfs_iomap_write_unwritten(ip, ioend->io_offset, 242 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
216 ioend->io_size); 243 ioend->io_size);
@@ -222,30 +249,23 @@ xfs_end_io(
222 * We might have to update the on-disk file size after extending 249 * We might have to update the on-disk file size after extending
223 * writes. 250 * writes.
224 */ 251 */
225 if (ioend->io_type != IOMAP_READ) 252 if (ioend->io_type != IOMAP_READ) {
226 xfs_setfilesize(ioend); 253 error = xfs_setfilesize(ioend);
227 xfs_destroy_ioend(ioend); 254 ASSERT(!error || error == EAGAIN);
228}
229
230/*
231 * Schedule IO completion handling on a xfsdatad if this was
232 * the final hold on this ioend. If we are asked to wait,
233 * flush the workqueue.
234 */
235STATIC void
236xfs_finish_ioend(
237 xfs_ioend_t *ioend,
238 int wait)
239{
240 if (atomic_dec_and_test(&ioend->io_remaining)) {
241 struct workqueue_struct *wq;
242
243 wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
244 xfsconvertd_workqueue : xfsdatad_workqueue;
245 queue_work(wq, &ioend->io_work);
246 if (wait)
247 flush_workqueue(wq);
248 } 255 }
256
257 /*
258 * If we didn't complete processing of the ioend, requeue it to the
259 * tail of the workqueue for another attempt later. Otherwise destroy
260 * it.
261 */
262 if (error == EAGAIN) {
263 atomic_inc(&ioend->io_remaining);
264 xfs_finish_ioend(ioend, 0);
265 /* ensure we don't spin on blocked ioends */
266 delay(1);
267 } else
268 xfs_destroy_ioend(ioend);
249} 269}
250 270
251/* 271/*
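xfs_finish_ioend() moves above xfs_end_io() because the completion handler now calls it: when xfs_setfilesize() fails its trylock and returns EAGAIN, the ioend takes a fresh hold and goes back onto its workqueue instead of blocking the worker thread. The retry shape in isolation, with comments added (control flow as in the hunk above, surrounding code elided):

	if (error == EAGAIN) {
		atomic_inc(&ioend->io_remaining);	/* new hold for the requeue */
		xfs_finish_ioend(ioend, 0);		/* requeue to the workqueue tail */
		delay(1);				/* don't busy-spin on a blocked ioend */
	} else {
		xfs_destroy_ioend(ioend);		/* fully processed */
	}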
@@ -341,7 +361,7 @@ xfs_submit_ioend_bio(
341 * but don't update the inode size until I/O completion. 361 * but don't update the inode size until I/O completion.
342 */ 362 */
343 if (xfs_ioend_new_eof(ioend)) 363 if (xfs_ioend_new_eof(ioend))
344 xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode)); 364 xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
345 365
346 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 366 submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
347 WRITE_SYNC_PLUG : WRITE, bio); 367 WRITE_SYNC_PLUG : WRITE, bio);
@@ -874,6 +894,125 @@ xfs_cluster_write(
874 } 894 }
875} 895}
876 896
897STATIC void
898xfs_vm_invalidatepage(
899 struct page *page,
900 unsigned long offset)
901{
902 trace_xfs_invalidatepage(page->mapping->host, page, offset);
903 block_invalidatepage(page, offset);
904}
905
906/*
907 * If the page has delalloc buffers on it, we need to punch them out before we
908 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
909 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
910 * is done on that same region - the delalloc extent is returned when none is
911 * supposed to be there.
912 *
913 * We prevent this by truncating away the delalloc regions on the page before
914 * invalidating it. Because they are delalloc, we can do this without needing a
915 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
916 * truncation without a transaction as there is no space left for block
 917 * reservation (typically why we see an ENOSPC in writeback).
918 *
919 * This is not a performance critical path, so for now just do the punching a
920 * buffer head at a time.
921 */
922STATIC void
923xfs_aops_discard_page(
924 struct page *page)
925{
926 struct inode *inode = page->mapping->host;
927 struct xfs_inode *ip = XFS_I(inode);
928 struct buffer_head *bh, *head;
929 loff_t offset = page_offset(page);
930 ssize_t len = 1 << inode->i_blkbits;
931
932 if (!xfs_is_delayed_page(page, IOMAP_DELAY))
933 goto out_invalidate;
934
935 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
936 goto out_invalidate;
937
938 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
939 "page discard on page %p, inode 0x%llx, offset %llu.",
940 page, ip->i_ino, offset);
941
942 xfs_ilock(ip, XFS_ILOCK_EXCL);
943 bh = head = page_buffers(page);
944 do {
945 int done;
946 xfs_fileoff_t offset_fsb;
947 xfs_bmbt_irec_t imap;
948 int nimaps = 1;
949 int error;
950 xfs_fsblock_t firstblock;
951 xfs_bmap_free_t flist;
952
953 if (!buffer_delay(bh))
954 goto next_buffer;
955
956 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
957
958 /*
959 * Map the range first and check that it is a delalloc extent
960 * before trying to unmap the range. Otherwise we will be
961 * trying to remove a real extent (which requires a
962 * transaction) or a hole, which is probably a bad idea...
963 */
964 error = xfs_bmapi(NULL, ip, offset_fsb, 1,
965 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
966 &nimaps, NULL, NULL);
967
968 if (error) {
969 /* something screwed, just bail */
970 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
971 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
972 "page discard failed delalloc mapping lookup.");
973 }
974 break;
975 }
976 if (!nimaps) {
977 /* nothing there */
978 goto next_buffer;
979 }
980 if (imap.br_startblock != DELAYSTARTBLOCK) {
981 /* been converted, ignore */
982 goto next_buffer;
983 }
984 WARN_ON(imap.br_blockcount == 0);
985
986 /*
987 * Note: while we initialise the firstblock/flist pair, they
988 * should never be used because blocks should never be
 989 * allocated or freed for a delalloc extent, and hence we don't
 990 * need to cancel or finish them after the xfs_bunmapi() call.
991 */
992 xfs_bmap_init(&flist, &firstblock);
993 error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
994 &flist, NULL, &done);
995
996 ASSERT(!flist.xbf_count && !flist.xbf_first);
997 if (error) {
998 /* something screwed, just bail */
999 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1000 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
1001 "page discard unable to remove delalloc mapping.");
1002 }
1003 break;
1004 }
1005next_buffer:
1006 offset += len;
1007
1008 } while ((bh = bh->b_this_page) != head);
1009
1010 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1011out_invalidate:
1012 xfs_vm_invalidatepage(page, 0);
1013 return;
1014}
1015
877/* 1016/*
878 * Calling this without startio set means we are being asked to make a dirty 1017 * Calling this without startio set means we are being asked to make a dirty
 879 * page ready for freeing its buffers. When called with startio set then 1018 * page ready for freeing its buffers. When called with startio set then
@@ -1125,7 +1264,7 @@ error:
1125 */ 1264 */
1126 if (err != -EAGAIN) { 1265 if (err != -EAGAIN) {
1127 if (!unmapped) 1266 if (!unmapped)
1128 block_invalidatepage(page, 0); 1267 xfs_aops_discard_page(page);
1129 ClearPageUptodate(page); 1268 ClearPageUptodate(page);
1130 } 1269 }
1131 return err; 1270 return err;
@@ -1535,15 +1674,6 @@ xfs_vm_readpages(
1535 return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); 1674 return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
1536} 1675}
1537 1676
1538STATIC void
1539xfs_vm_invalidatepage(
1540 struct page *page,
1541 unsigned long offset)
1542{
1543 trace_xfs_invalidatepage(page->mapping->host, page, offset);
1544 block_invalidatepage(page, offset);
1545}
1546
1547const struct address_space_operations xfs_address_space_operations = { 1677const struct address_space_operations xfs_address_space_operations = {
1548 .readpage = xfs_vm_readpage, 1678 .readpage = xfs_vm_readpage,
1549 .readpages = xfs_vm_readpages, 1679 .readpages = xfs_vm_readpages,
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 6f76ba85f193..bd111b7e1daa 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -168,75 +168,6 @@ test_page_region(
168} 168}
169 169
170/* 170/*
171 * Mapping of multi-page buffers into contiguous virtual space
172 */
173
174typedef struct a_list {
175 void *vm_addr;
176 struct a_list *next;
177} a_list_t;
178
179static a_list_t *as_free_head;
180static int as_list_len;
181static DEFINE_SPINLOCK(as_lock);
182
183/*
184 * Try to batch vunmaps because they are costly.
185 */
186STATIC void
187free_address(
188 void *addr)
189{
190 a_list_t *aentry;
191
192#ifdef CONFIG_XEN
193 /*
194 * Xen needs to be able to make sure it can get an exclusive
195 * RO mapping of pages it wants to turn into a pagetable. If
196 * a newly allocated page is also still being vmap()ed by xfs,
197 * it will cause pagetable construction to fail. This is a
198 * quick workaround to always eagerly unmap pages so that Xen
199 * is happy.
200 */
201 vunmap(addr);
202 return;
203#endif
204
205 aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
206 if (likely(aentry)) {
207 spin_lock(&as_lock);
208 aentry->next = as_free_head;
209 aentry->vm_addr = addr;
210 as_free_head = aentry;
211 as_list_len++;
212 spin_unlock(&as_lock);
213 } else {
214 vunmap(addr);
215 }
216}
217
218STATIC void
219purge_addresses(void)
220{
221 a_list_t *aentry, *old;
222
223 if (as_free_head == NULL)
224 return;
225
226 spin_lock(&as_lock);
227 aentry = as_free_head;
228 as_free_head = NULL;
229 as_list_len = 0;
230 spin_unlock(&as_lock);
231
232 while ((old = aentry) != NULL) {
233 vunmap(aentry->vm_addr);
234 aentry = aentry->next;
235 kfree(old);
236 }
237}
238
239/*
240 * Internal xfs_buf_t object manipulation 171 * Internal xfs_buf_t object manipulation
241 */ 172 */
242 173
@@ -337,7 +268,8 @@ xfs_buf_free(
337 uint i; 268 uint i;
338 269
339 if (xfs_buf_is_vmapped(bp)) 270 if (xfs_buf_is_vmapped(bp))
340 free_address(bp->b_addr - bp->b_offset); 271 vm_unmap_ram(bp->b_addr - bp->b_offset,
272 bp->b_page_count);
341 273
342 for (i = 0; i < bp->b_page_count; i++) { 274 for (i = 0; i < bp->b_page_count; i++) {
343 struct page *page = bp->b_pages[i]; 275 struct page *page = bp->b_pages[i];
@@ -457,10 +389,8 @@ _xfs_buf_map_pages(
457 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 389 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
458 bp->b_flags |= XBF_MAPPED; 390 bp->b_flags |= XBF_MAPPED;
459 } else if (flags & XBF_MAPPED) { 391 } else if (flags & XBF_MAPPED) {
460 if (as_list_len > 64) 392 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
461 purge_addresses(); 393 -1, PAGE_KERNEL);
462 bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
463 VM_MAP, PAGE_KERNEL);
464 if (unlikely(bp->b_addr == NULL)) 394 if (unlikely(bp->b_addr == NULL))
465 return -ENOMEM; 395 return -ENOMEM;
466 bp->b_addr += bp->b_offset; 396 bp->b_addr += bp->b_offset;
@@ -1955,9 +1885,6 @@ xfsbufd(
1955 xfs_buf_iostrategy(bp); 1885 xfs_buf_iostrategy(bp);
1956 count++; 1886 count++;
1957 } 1887 }
1958
1959 if (as_list_len > 0)
1960 purge_addresses();
1961 if (count) 1888 if (count)
1962 blk_run_address_space(target->bt_mapping); 1889 blk_run_address_space(target->bt_mapping);
1963 1890
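vm_map_ram()/vm_unmap_ram() batch their TLB flushes internally, which is what makes the hand-rolled free_address()/purge_addresses() machinery removed above redundant. Minimal usage, as in _xfs_buf_map_pages()/xfs_buf_free() (error handling abbreviated):

	#include <linux/vmalloc.h>

	/* Map 'count' pages contiguously; -1 means no NUMA node preference. */
	void *addr = vm_map_ram(pages, count, -1, PAGE_KERNEL);
	if (!addr)
		return -ENOMEM;

	/* ... use the mapping ... */

	vm_unmap_ram(addr, count);	/* unmapping is deferred and batched */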
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 87b8cbd23d4b..846b75aeb2ab 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -29,6 +29,7 @@
29#include "xfs_vnodeops.h" 29#include "xfs_vnodeops.h"
30#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
31#include "xfs_inode.h" 31#include "xfs_inode.h"
32#include "xfs_inode_item.h"
32 33
33/* 34/*
34 * Note that we only accept fileids which are long enough rather than allow 35 * Note that we only accept fileids which are long enough rather than allow
@@ -215,9 +216,28 @@ xfs_fs_get_parent(
215 return d_obtain_alias(VFS_I(cip)); 216 return d_obtain_alias(VFS_I(cip));
216} 217}
217 218
219STATIC int
220xfs_fs_nfs_commit_metadata(
221 struct inode *inode)
222{
223 struct xfs_inode *ip = XFS_I(inode);
224 struct xfs_mount *mp = ip->i_mount;
225 int error = 0;
226
227 xfs_ilock(ip, XFS_ILOCK_SHARED);
228 if (xfs_ipincount(ip)) {
229 error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn,
230 XFS_LOG_SYNC, NULL);
231 }
232 xfs_iunlock(ip, XFS_ILOCK_SHARED);
233
234 return error;
235}
236
218const struct export_operations xfs_export_operations = { 237const struct export_operations xfs_export_operations = {
219 .encode_fh = xfs_fs_encode_fh, 238 .encode_fh = xfs_fs_encode_fh,
220 .fh_to_dentry = xfs_fs_fh_to_dentry, 239 .fh_to_dentry = xfs_fs_fh_to_dentry,
221 .fh_to_parent = xfs_fs_fh_to_parent, 240 .fh_to_parent = xfs_fs_fh_to_parent,
222 .get_parent = xfs_fs_get_parent, 241 .get_parent = xfs_fs_get_parent,
242 .commit_metadata = xfs_fs_nfs_commit_metadata,
223}; 243};
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index e4caeb28ce2e..42dd3bcfba6b 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -16,6 +16,7 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h"
19#include "xfs_bit.h" 20#include "xfs_bit.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
21#include "xfs_inum.h" 22#include "xfs_inum.h"
@@ -34,52 +35,279 @@
34#include "xfs_dir2_sf.h" 35#include "xfs_dir2_sf.h"
35#include "xfs_dinode.h" 36#include "xfs_dinode.h"
36#include "xfs_inode.h" 37#include "xfs_inode.h"
38#include "xfs_inode_item.h"
39#include "xfs_bmap.h"
37#include "xfs_error.h" 40#include "xfs_error.h"
38#include "xfs_rw.h" 41#include "xfs_rw.h"
39#include "xfs_vnodeops.h" 42#include "xfs_vnodeops.h"
40#include "xfs_da_btree.h" 43#include "xfs_da_btree.h"
41#include "xfs_ioctl.h" 44#include "xfs_ioctl.h"
45#include "xfs_trace.h"
42 46
43#include <linux/dcache.h> 47#include <linux/dcache.h>
44 48
45static const struct vm_operations_struct xfs_file_vm_ops; 49static const struct vm_operations_struct xfs_file_vm_ops;
46 50
47STATIC ssize_t 51/*
48xfs_file_aio_read( 52 * xfs_iozero
49 struct kiocb *iocb, 53 *
 50 const struct iovec *iov, 54 * xfs_iozero clears the specified range of the buffer supplied,
51 unsigned long nr_segs, 55 * and marks all the affected blocks as valid and modified. If
52 loff_t pos) 56 * an affected block is not allocated, it will be allocated. If
57 * an affected block is not completely overwritten, and is not
58 * valid before the operation, it will be read from disk before
59 * being partially zeroed.
60 */
61STATIC int
62xfs_iozero(
63 struct xfs_inode *ip, /* inode */
64 loff_t pos, /* offset in file */
65 size_t count) /* size of data to zero */
53{ 66{
54 struct file *file = iocb->ki_filp; 67 struct page *page;
55 int ioflags = 0; 68 struct address_space *mapping;
69 int status;
56 70
57 BUG_ON(iocb->ki_pos != pos); 71 mapping = VFS_I(ip)->i_mapping;
58 if (unlikely(file->f_flags & O_DIRECT)) 72 do {
59 ioflags |= IO_ISDIRECT; 73 unsigned offset, bytes;
60 if (file->f_mode & FMODE_NOCMTIME) 74 void *fsdata;
61 ioflags |= IO_INVIS; 75
62 return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov, 76 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
63 nr_segs, &iocb->ki_pos, ioflags); 77 bytes = PAGE_CACHE_SIZE - offset;
78 if (bytes > count)
79 bytes = count;
80
81 status = pagecache_write_begin(NULL, mapping, pos, bytes,
82 AOP_FLAG_UNINTERRUPTIBLE,
83 &page, &fsdata);
84 if (status)
85 break;
86
87 zero_user(page, offset, bytes);
88
89 status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
90 page, fsdata);
91 WARN_ON(status <= 0); /* can't return less than zero! */
92 pos += bytes;
93 count -= bytes;
94 status = 0;
95 } while (count);
96
97 return (-status);
98}
99
100STATIC int
101xfs_file_fsync(
102 struct file *file,
103 struct dentry *dentry,
104 int datasync)
105{
106 struct xfs_inode *ip = XFS_I(dentry->d_inode);
107 struct xfs_trans *tp;
108 int error = 0;
109 int log_flushed = 0;
110
111 xfs_itrace_entry(ip);
112
113 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
114 return -XFS_ERROR(EIO);
115
116 xfs_iflags_clear(ip, XFS_ITRUNCATED);
117
118 /*
119 * We always need to make sure that the required inode state is safe on
120 * disk. The inode might be clean but we still might need to force the
121 * log because of committed transactions that haven't hit the disk yet.
122 * Likewise, there could be unflushed non-transactional changes to the
123 * inode core that have to go to disk and this requires us to issue
124 * a synchronous transaction to capture these changes correctly.
125 *
126 * This code relies on the assumption that if the i_update_core field
127 * of the inode is clear and the inode is unpinned then it is clean
128 * and no action is required.
129 */
130 xfs_ilock(ip, XFS_ILOCK_SHARED);
131
132 /*
133 * First check if the VFS inode is marked dirty. All the dirtying
134 * of non-transactional updates no goes through mark_inode_dirty*,
135 * which allows us to distinguish beteeen pure timestamp updates
136 * and i_size updates which need to be caught for fdatasync.
137 * After that also theck for the dirty state in the XFS inode, which
138 * might gets cleared when the inode gets written out via the AIL
139 * or xfs_iflush_cluster.
140 */
141 if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) ||
142 ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
143 ip->i_update_core) {
144 /*
145 * Kick off a transaction to log the inode core to get the
146 * updates. The sync transaction will also force the log.
147 */
148 xfs_iunlock(ip, XFS_ILOCK_SHARED);
149 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
150 error = xfs_trans_reserve(tp, 0,
151 XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
152 if (error) {
153 xfs_trans_cancel(tp, 0);
154 return -error;
155 }
156 xfs_ilock(ip, XFS_ILOCK_EXCL);
157
158 /*
159 * Note - it's possible that we might have pushed ourselves out
160 * of the way during trans_reserve which would flush the inode.
161 * But there's no guarantee that the inode buffer has actually
162 * gone out yet (it's delwri). Plus the buffer could be pinned
163 * anyway if it's part of an inode in another recent
164 * transaction. So we play it safe and fire off the
165 * transaction anyway.
166 */
167 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
168 xfs_trans_ihold(tp, ip);
169 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
170 xfs_trans_set_sync(tp);
171 error = _xfs_trans_commit(tp, 0, &log_flushed);
172
173 xfs_iunlock(ip, XFS_ILOCK_EXCL);
174 } else {
175 /*
176 * Timestamps/size haven't changed since last inode flush or
177 * inode transaction commit. That means either nothing got
178 * written or a transaction committed which caught the updates.
179 * If the latter happened and the transaction hasn't hit the
 180 * disk yet, the inode will still be pinned. If it is,
181 * force the log.
182 */
183 if (xfs_ipincount(ip)) {
184 error = _xfs_log_force_lsn(ip->i_mount,
185 ip->i_itemp->ili_last_lsn,
186 XFS_LOG_SYNC, &log_flushed);
187 }
188 xfs_iunlock(ip, XFS_ILOCK_SHARED);
189 }
190
191 if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
192 /*
193 * If the log write didn't issue an ordered tag we need
194 * to flush the disk cache for the data device now.
195 */
196 if (!log_flushed)
197 xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
198
199 /*
200 * If this inode is on the RT dev we need to flush that
201 * cache as well.
202 */
203 if (XFS_IS_REALTIME_INODE(ip))
204 xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
205 }
206
207 return -error;
64} 208}
65 209
66STATIC ssize_t 210STATIC ssize_t
67xfs_file_aio_write( 211xfs_file_aio_read(
68 struct kiocb *iocb, 212 struct kiocb *iocb,
69 const struct iovec *iov, 213 const struct iovec *iovp,
70 unsigned long nr_segs, 214 unsigned long nr_segs,
71 loff_t pos) 215 loff_t pos)
72{ 216{
73 struct file *file = iocb->ki_filp; 217 struct file *file = iocb->ki_filp;
218 struct inode *inode = file->f_mapping->host;
219 struct xfs_inode *ip = XFS_I(inode);
220 struct xfs_mount *mp = ip->i_mount;
221 size_t size = 0;
222 ssize_t ret = 0;
74 int ioflags = 0; 223 int ioflags = 0;
224 xfs_fsize_t n;
225 unsigned long seg;
226
227 XFS_STATS_INC(xs_read_calls);
75 228
76 BUG_ON(iocb->ki_pos != pos); 229 BUG_ON(iocb->ki_pos != pos);
230
77 if (unlikely(file->f_flags & O_DIRECT)) 231 if (unlikely(file->f_flags & O_DIRECT))
78 ioflags |= IO_ISDIRECT; 232 ioflags |= IO_ISDIRECT;
79 if (file->f_mode & FMODE_NOCMTIME) 233 if (file->f_mode & FMODE_NOCMTIME)
80 ioflags |= IO_INVIS; 234 ioflags |= IO_INVIS;
81 return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs, 235
82 &iocb->ki_pos, ioflags); 236 /* START copy & waste from filemap.c */
237 for (seg = 0; seg < nr_segs; seg++) {
238 const struct iovec *iv = &iovp[seg];
239
240 /*
241 * If any segment has a negative length, or the cumulative
242 * length ever wraps negative then return -EINVAL.
243 */
244 size += iv->iov_len;
245 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
246 return XFS_ERROR(-EINVAL);
247 }
248 /* END copy & waste from filemap.c */
249
250 if (unlikely(ioflags & IO_ISDIRECT)) {
251 xfs_buftarg_t *target =
252 XFS_IS_REALTIME_INODE(ip) ?
253 mp->m_rtdev_targp : mp->m_ddev_targp;
254 if ((iocb->ki_pos & target->bt_smask) ||
255 (size & target->bt_smask)) {
256 if (iocb->ki_pos == ip->i_size)
257 return 0;
258 return -XFS_ERROR(EINVAL);
259 }
260 }
261
262 n = XFS_MAXIOFFSET(mp) - iocb->ki_pos;
263 if (n <= 0 || size == 0)
264 return 0;
265
266 if (n < size)
267 size = n;
268
269 if (XFS_FORCED_SHUTDOWN(mp))
270 return -EIO;
271
272 if (unlikely(ioflags & IO_ISDIRECT))
273 mutex_lock(&inode->i_mutex);
274 xfs_ilock(ip, XFS_IOLOCK_SHARED);
275
276 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
277 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
278 int iolock = XFS_IOLOCK_SHARED;
279
280 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size,
281 dmflags, &iolock);
282 if (ret) {
283 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
284 if (unlikely(ioflags & IO_ISDIRECT))
285 mutex_unlock(&inode->i_mutex);
286 return ret;
287 }
288 }
289
290 if (unlikely(ioflags & IO_ISDIRECT)) {
291 if (inode->i_mapping->nrpages) {
292 ret = -xfs_flushinval_pages(ip,
293 (iocb->ki_pos & PAGE_CACHE_MASK),
294 -1, FI_REMAPF_LOCKED);
295 }
296 mutex_unlock(&inode->i_mutex);
297 if (ret) {
298 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
299 return ret;
300 }
301 }
302
303 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
304
305 ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
306 if (ret > 0)
307 XFS_STATS_ADD(xs_read_bytes, ret);
308
309 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
310 return ret;
83} 311}
84 312
85STATIC ssize_t 313STATIC ssize_t
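Both this direct-I/O read check and the write check later in the file reject I/O that is not aligned to the target device's sector mask (bt_smask, i.e. sector size minus one). The test in isolation, with an assumed 512-byte sector size:

	/* smask = sector_size - 1; both offset and length must be aligned. */
	static int dio_aligned(loff_t pos, size_t len, unsigned long smask)
	{
		return !(pos & smask) && !(len & smask);
	}

	/*
	 * Example, smask = 511: pos 4096 with len 8192 passes;
	 * pos 4100 (or len 100) fails and the request gets -EINVAL.
	 */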
@@ -87,16 +315,44 @@ xfs_file_splice_read(
87 struct file *infilp, 315 struct file *infilp,
88 loff_t *ppos, 316 loff_t *ppos,
89 struct pipe_inode_info *pipe, 317 struct pipe_inode_info *pipe,
90 size_t len, 318 size_t count,
91 unsigned int flags) 319 unsigned int flags)
92{ 320{
321 struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
322 struct xfs_mount *mp = ip->i_mount;
93 int ioflags = 0; 323 int ioflags = 0;
324 ssize_t ret;
325
326 XFS_STATS_INC(xs_read_calls);
94 327
95 if (infilp->f_mode & FMODE_NOCMTIME) 328 if (infilp->f_mode & FMODE_NOCMTIME)
96 ioflags |= IO_INVIS; 329 ioflags |= IO_INVIS;
97 330
98 return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), 331 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
99 infilp, ppos, pipe, len, flags, ioflags); 332 return -EIO;
333
334 xfs_ilock(ip, XFS_IOLOCK_SHARED);
335
336 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
337 int iolock = XFS_IOLOCK_SHARED;
338 int error;
339
340 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
341 FILP_DELAY_FLAG(infilp), &iolock);
342 if (error) {
343 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
344 return -error;
345 }
346 }
347
348 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
349
350 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
351 if (ret > 0)
352 XFS_STATS_ADD(xs_read_bytes, ret);
353
354 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
355 return ret;
100} 356}
101 357
102STATIC ssize_t 358STATIC ssize_t
@@ -104,16 +360,538 @@ xfs_file_splice_write(
104 struct pipe_inode_info *pipe, 360 struct pipe_inode_info *pipe,
105 struct file *outfilp, 361 struct file *outfilp,
106 loff_t *ppos, 362 loff_t *ppos,
107 size_t len, 363 size_t count,
108 unsigned int flags) 364 unsigned int flags)
109{ 365{
366 struct inode *inode = outfilp->f_mapping->host;
367 struct xfs_inode *ip = XFS_I(inode);
368 struct xfs_mount *mp = ip->i_mount;
369 xfs_fsize_t isize, new_size;
110 int ioflags = 0; 370 int ioflags = 0;
371 ssize_t ret;
372
373 XFS_STATS_INC(xs_write_calls);
111 374
112 if (outfilp->f_mode & FMODE_NOCMTIME) 375 if (outfilp->f_mode & FMODE_NOCMTIME)
113 ioflags |= IO_INVIS; 376 ioflags |= IO_INVIS;
114 377
115 return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), 378 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
116 pipe, outfilp, ppos, len, flags, ioflags); 379 return -EIO;
380
381 xfs_ilock(ip, XFS_IOLOCK_EXCL);
382
383 if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
384 int iolock = XFS_IOLOCK_EXCL;
385 int error;
386
387 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
388 FILP_DELAY_FLAG(outfilp), &iolock);
389 if (error) {
390 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
391 return -error;
392 }
393 }
394
395 new_size = *ppos + count;
396
397 xfs_ilock(ip, XFS_ILOCK_EXCL);
398 if (new_size > ip->i_size)
399 ip->i_new_size = new_size;
400 xfs_iunlock(ip, XFS_ILOCK_EXCL);
401
402 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
403
404 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
405 if (ret > 0)
406 XFS_STATS_ADD(xs_write_bytes, ret);
407
408 isize = i_size_read(inode);
409 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
410 *ppos = isize;
411
412 if (*ppos > ip->i_size) {
413 xfs_ilock(ip, XFS_ILOCK_EXCL);
414 if (*ppos > ip->i_size)
415 ip->i_size = *ppos;
416 xfs_iunlock(ip, XFS_ILOCK_EXCL);
417 }
418
419 if (ip->i_new_size) {
420 xfs_ilock(ip, XFS_ILOCK_EXCL);
421 ip->i_new_size = 0;
422 if (ip->i_d.di_size > ip->i_size)
423 ip->i_d.di_size = ip->i_size;
424 xfs_iunlock(ip, XFS_ILOCK_EXCL);
425 }
426 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
427 return ret;
428}
429
430/*
431 * This routine is called to handle zeroing any space in the last
432 * block of the file that is beyond the EOF. We do this since the
433 * size is being increased without writing anything to that block
434 * and we don't want anyone to read the garbage on the disk.
435 */
436STATIC int /* error (positive) */
437xfs_zero_last_block(
438 xfs_inode_t *ip,
439 xfs_fsize_t offset,
440 xfs_fsize_t isize)
441{
442 xfs_fileoff_t last_fsb;
443 xfs_mount_t *mp = ip->i_mount;
444 int nimaps;
445 int zero_offset;
446 int zero_len;
447 int error = 0;
448 xfs_bmbt_irec_t imap;
449
450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
451
452 zero_offset = XFS_B_FSB_OFFSET(mp, isize);
453 if (zero_offset == 0) {
454 /*
455 * There are no extra bytes in the last block on disk to
456 * zero, so return.
457 */
458 return 0;
459 }
460
461 last_fsb = XFS_B_TO_FSBT(mp, isize);
462 nimaps = 1;
463 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
464 &nimaps, NULL, NULL);
465 if (error) {
466 return error;
467 }
468 ASSERT(nimaps > 0);
469 /*
470 * If the block underlying isize is just a hole, then there
471 * is nothing to zero.
472 */
473 if (imap.br_startblock == HOLESTARTBLOCK) {
474 return 0;
475 }
476 /*
477 * Zero the part of the last block beyond the EOF, and write it
478 * out sync. We need to drop the ilock while we do this so we
479 * don't deadlock when the buffer cache calls back to us.
480 */
481 xfs_iunlock(ip, XFS_ILOCK_EXCL);
482
483 zero_len = mp->m_sb.sb_blocksize - zero_offset;
484 if (isize + zero_len > offset)
485 zero_len = offset - isize;
486 error = xfs_iozero(ip, isize, zero_len);
487
488 xfs_ilock(ip, XFS_ILOCK_EXCL);
489 ASSERT(error >= 0);
490 return error;
491}
492
493/*
494 * Zero any on disk space between the current EOF and the new,
495 * larger EOF. This handles the normal case of zeroing the remainder
496 * of the last block in the file and the unusual case of zeroing blocks
497 * out beyond the size of the file. This second case only happens
498 * with fixed size extents and when the system crashes before the inode
499 * size was updated but after blocks were allocated. If fill is set,
500 * then any holes in the range are filled and zeroed. If not, the holes
501 * are left alone as holes.
502 */
503
504int /* error (positive) */
505xfs_zero_eof(
506 xfs_inode_t *ip,
507 xfs_off_t offset, /* starting I/O offset */
508 xfs_fsize_t isize) /* current inode size */
509{
510 xfs_mount_t *mp = ip->i_mount;
511 xfs_fileoff_t start_zero_fsb;
512 xfs_fileoff_t end_zero_fsb;
513 xfs_fileoff_t zero_count_fsb;
514 xfs_fileoff_t last_fsb;
515 xfs_fileoff_t zero_off;
516 xfs_fsize_t zero_len;
517 int nimaps;
518 int error = 0;
519 xfs_bmbt_irec_t imap;
520
521 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
522 ASSERT(offset > isize);
523
524 /*
525 * First handle zeroing the block on which isize resides.
526 * We only zero a part of that block so it is handled specially.
527 */
528 error = xfs_zero_last_block(ip, offset, isize);
529 if (error) {
530 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
531 return error;
532 }
533
534 /*
535 * Calculate the range between the new size and the old
536 * where blocks needing to be zeroed may exist. To get the
537 * block where the last byte in the file currently resides,
538 * we need to subtract one from the size and truncate back
539 * to a block boundary. We subtract 1 in case the size is
540 * exactly on a block boundary.
541 */
542 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
543 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
544 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
545 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
546 if (last_fsb == end_zero_fsb) {
547 /*
548 * The size was only incremented on its last block.
549 * We took care of that above, so just return.
550 */
551 return 0;
552 }
553
554 ASSERT(start_zero_fsb <= end_zero_fsb);
555 while (start_zero_fsb <= end_zero_fsb) {
556 nimaps = 1;
557 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
558 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
559 0, NULL, 0, &imap, &nimaps, NULL, NULL);
560 if (error) {
561 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
562 return error;
563 }
564 ASSERT(nimaps > 0);
565
566 if (imap.br_state == XFS_EXT_UNWRITTEN ||
567 imap.br_startblock == HOLESTARTBLOCK) {
568 /*
569 * This loop handles initializing pages that were
570 * partially initialized by the code below this
571 * loop. It basically zeroes the part of the page
572 * that sits on a hole and sets the page as P_HOLE
573 * and calls remapf if it is a mapped file.
574 */
575 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
576 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
577 continue;
578 }
579
580 /*
581 * There are blocks we need to zero.
582 * Drop the inode lock while we're doing the I/O.
583 * We'll still have the iolock to protect us.
584 */
585 xfs_iunlock(ip, XFS_ILOCK_EXCL);
586
587 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
588 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
589
590 if ((zero_off + zero_len) > offset)
591 zero_len = offset - zero_off;
592
593 error = xfs_iozero(ip, zero_off, zero_len);
594 if (error) {
595 goto out_lock;
596 }
597
598 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
599 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
600
601 xfs_ilock(ip, XFS_ILOCK_EXCL);
602 }
603
604 return 0;
605
606out_lock:
607 xfs_ilock(ip, XFS_ILOCK_EXCL);
608 ASSERT(error >= 0);
609 return error;
610}
611
612STATIC ssize_t
613xfs_file_aio_write(
614 struct kiocb *iocb,
615 const struct iovec *iovp,
616 unsigned long nr_segs,
617 loff_t pos)
618{
619 struct file *file = iocb->ki_filp;
620 struct address_space *mapping = file->f_mapping;
621 struct inode *inode = mapping->host;
622 struct xfs_inode *ip = XFS_I(inode);
623 struct xfs_mount *mp = ip->i_mount;
624 ssize_t ret = 0, error = 0;
625 int ioflags = 0;
626 xfs_fsize_t isize, new_size;
627 int iolock;
628 int eventsent = 0;
629 size_t ocount = 0, count;
630 int need_i_mutex;
631
632 XFS_STATS_INC(xs_write_calls);
633
634 BUG_ON(iocb->ki_pos != pos);
635
636 if (unlikely(file->f_flags & O_DIRECT))
637 ioflags |= IO_ISDIRECT;
638 if (file->f_mode & FMODE_NOCMTIME)
639 ioflags |= IO_INVIS;
640
641 error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
642 if (error)
643 return error;
644
645 count = ocount;
646 if (count == 0)
647 return 0;
648
649 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
650
651 if (XFS_FORCED_SHUTDOWN(mp))
652 return -EIO;
653
654relock:
655 if (ioflags & IO_ISDIRECT) {
656 iolock = XFS_IOLOCK_SHARED;
657 need_i_mutex = 0;
658 } else {
659 iolock = XFS_IOLOCK_EXCL;
660 need_i_mutex = 1;
661 mutex_lock(&inode->i_mutex);
662 }
663
664 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
665
666start:
667 error = -generic_write_checks(file, &pos, &count,
668 S_ISBLK(inode->i_mode));
669 if (error) {
670 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
671 goto out_unlock_mutex;
672 }
673
674 if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) &&
675 !(ioflags & IO_INVIS) && !eventsent)) {
676 int dmflags = FILP_DELAY_FLAG(file);
677
678 if (need_i_mutex)
679 dmflags |= DM_FLAGS_IMUX;
680
681 xfs_iunlock(ip, XFS_ILOCK_EXCL);
682 error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip,
683 pos, count, dmflags, &iolock);
684 if (error) {
685 goto out_unlock_internal;
686 }
687 xfs_ilock(ip, XFS_ILOCK_EXCL);
688 eventsent = 1;
689
690 /*
691 * The iolock was dropped and reacquired in XFS_SEND_DATA
692 * so we have to recheck the size when appending.
693 * We will only "goto start;" once, since having sent the
694 * event prevents another call to XFS_SEND_DATA, which is
695 * what allows the size to change in the first place.
696 */
697 if ((file->f_flags & O_APPEND) && pos != ip->i_size)
698 goto start;
699 }
700
701 if (ioflags & IO_ISDIRECT) {
702 xfs_buftarg_t *target =
703 XFS_IS_REALTIME_INODE(ip) ?
704 mp->m_rtdev_targp : mp->m_ddev_targp;
705
706 if ((pos & target->bt_smask) || (count & target->bt_smask)) {
707 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
708 return XFS_ERROR(-EINVAL);
709 }
710
711 if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) {
712 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
713 iolock = XFS_IOLOCK_EXCL;
714 need_i_mutex = 1;
715 mutex_lock(&inode->i_mutex);
716 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
717 goto start;
718 }
719 }
720
721 new_size = pos + count;
722 if (new_size > ip->i_size)
723 ip->i_new_size = new_size;
724
725 if (likely(!(ioflags & IO_INVIS)))
726 file_update_time(file);
727
728 /*
729 * If the offset is beyond the size of the file, we have a couple
730 * of things to do. First, if there is already space allocated
731 * we need to either create holes or zero the disk or ...
732 *
733 * If there is a page where the previous size lands, we need
734 * to zero it out up to the new size.
735 */
736
737 if (pos > ip->i_size) {
738 error = xfs_zero_eof(ip, pos, ip->i_size);
739 if (error) {
740 xfs_iunlock(ip, XFS_ILOCK_EXCL);
741 goto out_unlock_internal;
742 }
743 }
744 xfs_iunlock(ip, XFS_ILOCK_EXCL);
745
746 /*
747 * If we're writing the file then make sure to clear the
748 * setuid and setgid bits if the process is not being run
749 * by root. This keeps people from modifying setuid and
750 * setgid binaries.
751 */
752 error = -file_remove_suid(file);
753 if (unlikely(error))
754 goto out_unlock_internal;
755
756 /* We can write back this queue in page reclaim */
757 current->backing_dev_info = mapping->backing_dev_info;
758
759 if ((ioflags & IO_ISDIRECT)) {
760 if (mapping->nrpages) {
761 WARN_ON(need_i_mutex == 0);
762 error = xfs_flushinval_pages(ip,
763 (pos & PAGE_CACHE_MASK),
764 -1, FI_REMAPF_LOCKED);
765 if (error)
766 goto out_unlock_internal;
767 }
768
769 if (need_i_mutex) {
770 /* demote the lock now the cached pages are gone */
771 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
772 mutex_unlock(&inode->i_mutex);
773
774 iolock = XFS_IOLOCK_SHARED;
775 need_i_mutex = 0;
776 }
777
778 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
779 ret = generic_file_direct_write(iocb, iovp,
780 &nr_segs, pos, &iocb->ki_pos, count, ocount);
781
782 /*
783 * direct-io write to a hole: fall through to buffered I/O
784 * for completing the rest of the request.
785 */
786 if (ret >= 0 && ret != count) {
787 XFS_STATS_ADD(xs_write_bytes, ret);
788
789 pos += ret;
790 count -= ret;
791
792 ioflags &= ~IO_ISDIRECT;
793 xfs_iunlock(ip, iolock);
794 goto relock;
795 }
796 } else {
797 int enospc = 0;
798 ssize_t ret2 = 0;
799
800write_retry:
801 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
802 ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
803 pos, &iocb->ki_pos, count, ret);
804 /*
 805 * if we just got an ENOSPC, flush the inode now that we
 806 * aren't holding any page locks, and retry *once*
807 */
808 if (ret2 == -ENOSPC && !enospc) {
809 error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
810 if (error)
811 goto out_unlock_internal;
812 enospc = 1;
813 goto write_retry;
814 }
815 ret = ret2;
816 }
817
818 current->backing_dev_info = NULL;
819
820 isize = i_size_read(inode);
821 if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize))
822 iocb->ki_pos = isize;
823
824 if (iocb->ki_pos > ip->i_size) {
825 xfs_ilock(ip, XFS_ILOCK_EXCL);
826 if (iocb->ki_pos > ip->i_size)
827 ip->i_size = iocb->ki_pos;
828 xfs_iunlock(ip, XFS_ILOCK_EXCL);
829 }
830
831 if (ret == -ENOSPC &&
832 DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
833 xfs_iunlock(ip, iolock);
834 if (need_i_mutex)
835 mutex_unlock(&inode->i_mutex);
836 error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip,
837 DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL,
838 0, 0, 0); /* Delay flag intentionally unused */
839 if (need_i_mutex)
840 mutex_lock(&inode->i_mutex);
841 xfs_ilock(ip, iolock);
842 if (error)
843 goto out_unlock_internal;
844 goto start;
845 }
846
847 error = -ret;
848 if (ret <= 0)
849 goto out_unlock_internal;
850
851 XFS_STATS_ADD(xs_write_bytes, ret);
852
853 /* Handle various SYNC-type writes */
854 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
855 loff_t end = pos + ret - 1;
856 int error2;
857
858 xfs_iunlock(ip, iolock);
859 if (need_i_mutex)
860 mutex_unlock(&inode->i_mutex);
861
862 error2 = filemap_write_and_wait_range(mapping, pos, end);
863 if (!error)
864 error = error2;
865 if (need_i_mutex)
866 mutex_lock(&inode->i_mutex);
867 xfs_ilock(ip, iolock);
868
869 error2 = -xfs_file_fsync(file, file->f_path.dentry,
870 (file->f_flags & __O_SYNC) ? 0 : 1);
871 if (!error)
872 error = error2;
873 }
874
875 out_unlock_internal:
876 if (ip->i_new_size) {
877 xfs_ilock(ip, XFS_ILOCK_EXCL);
878 ip->i_new_size = 0;
879 /*
880 * If this was a direct or synchronous I/O that failed (such
881 * as ENOSPC) then part of the I/O may have been written to
 882 * disk before the error occurred. In this case the on-disk
883 * file size may have been adjusted beyond the in-memory file
884 * size and now needs to be truncated back.
885 */
886 if (ip->i_d.di_size > ip->i_size)
887 ip->i_d.di_size = ip->i_size;
888 xfs_iunlock(ip, XFS_ILOCK_EXCL);
889 }
890 xfs_iunlock(ip, iolock);
891 out_unlock_mutex:
892 if (need_i_mutex)
893 mutex_unlock(&inode->i_mutex);
894 return -error;
117} 895}
118 896
119STATIC int 897STATIC int
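The write path above brackets every write with the i_new_size protocol: publish the intended size before issuing I/O, so EOF zeroing and xfs_setfilesize() can see it, then on the way out clear it and trim the on-disk size back if a failed direct or synchronous write pushed di_size past the in-memory size. The two bookends condensed (locking elided; the real code holds XFS_ILOCK_EXCL around every update):

	static void publish_new_size(struct xfs_inode *ip, loff_t pos, size_t count)
	{
		if (pos + count > ip->i_size)
			ip->i_new_size = pos + count;	/* intended EOF, pre-I/O */
	}

	static void retire_new_size(struct xfs_inode *ip)
	{
		ip->i_new_size = 0;
		if (ip->i_d.di_size > ip->i_size)	/* failed sync/direct overshoot */
			ip->i_d.di_size = ip->i_size;
	}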
@@ -160,28 +938,6 @@ xfs_file_release(
160 return -xfs_release(XFS_I(inode)); 938 return -xfs_release(XFS_I(inode));
161} 939}
162 940
163/*
164 * We ignore the datasync flag here because a datasync is effectively
165 * identical to an fsync. That is, datasync implies that we need to write
166 * only the metadata needed to be able to access the data that is written
167 * if we crash after the call completes. Hence if we are writing beyond
168 * EOF we have to log the inode size change as well, which makes it a
169 * full fsync. If we don't write beyond EOF, the inode core will be
170 * clean in memory and so we don't need to log the inode, just like
171 * fsync.
172 */
173STATIC int
174xfs_file_fsync(
175 struct file *file,
176 struct dentry *dentry,
177 int datasync)
178{
179 struct xfs_inode *ip = XFS_I(dentry->d_inode);
180
181 xfs_iflags_clear(ip, XFS_ITRUNCATED);
182 return -xfs_fsync(ip);
183}
184
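
The comment deleted above records why XFS treats a datasync that extends the
file as a full fsync: the size change dirties the inode core, so the inode
has to be logged either way. From user space the choice looks like this
(plain POSIX sketch, nothing XFS-specific):

#include <unistd.h>

/*
 * fdatasync() may skip pure-metadata updates such as timestamps, but a
 * write that grows the file makes the new size part of the data's
 * metadata, so the filesystem must persist the inode anyway - which is
 * why XFS can treat the two calls identically here.
 */
static int sync_file(int fd, int data_only)
{
	return data_only ? fdatasync(fd) : fsync(fd);
}
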
185STATIC int 941STATIC int
186xfs_file_readdir( 942xfs_file_readdir(
187 struct file *filp, 943 struct file *filp,
@@ -203,9 +959,9 @@ xfs_file_readdir(
203 * 959 *
204 * Try to give it an estimate that's good enough, maybe at some 960 * Try to give it an estimate that's good enough, maybe at some
205 * point we can change the ->readdir prototype to include the 961 * point we can change the ->readdir prototype to include the
206 * buffer size. 962 * buffer size. For now we use the current glibc buffer size.
207 */ 963 */
208 bufsize = (size_t)min_t(loff_t, PAGE_SIZE, ip->i_d.di_size); 964 bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
209 965
210 error = xfs_readdir(ip, dirent, bufsize, 966 error = xfs_readdir(ip, dirent, bufsize,
211 (xfs_off_t *)&filp->f_pos, filldir); 967 (xfs_off_t *)&filp->f_pos, filldir);
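
The readdir hunk above caps the getdents buffer at a fixed 32768 bytes (the
size glibc currently passes) instead of PAGE_SIZE, still clamped to the
directory size. The clamp itself is a simple min across types; an equivalent
userspace computation (illustrative, not the kernel's min_t macro):

#include <stddef.h>
#include <stdint.h>

static size_t readdir_bufsize(int64_t dir_size)
{
	const int64_t cap = 32768;	/* matches the current glibc buffer */

	return (size_t)(dir_size < cap ? dir_size : cap);
}
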
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index e8566bbf0f00..61a99608731e 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -91,6 +91,16 @@ xfs_mark_inode_dirty_sync(
91 mark_inode_dirty_sync(inode); 91 mark_inode_dirty_sync(inode);
92} 92}
93 93
94void
95xfs_mark_inode_dirty(
96 xfs_inode_t *ip)
97{
98 struct inode *inode = VFS_I(ip);
99
100 if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
101 mark_inode_dirty(inode);
102}
103
94/* 104/*
95 * Change the requested timestamp in the given inode. 105 * Change the requested timestamp in the given inode.
96 * We don't lock across timestamp updates, and we don't log them but 106 * We don't lock across timestamp updates, and we don't log them but
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 5af0c81ca1ae..facfb323a706 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -88,7 +88,6 @@
88#include <xfs_super.h> 88#include <xfs_super.h>
89#include <xfs_globals.h> 89#include <xfs_globals.h>
90#include <xfs_fs_subr.h> 90#include <xfs_fs_subr.h>
91#include <xfs_lrw.h>
92#include <xfs_buf.h> 91#include <xfs_buf.h>
93 92
94/* 93/*
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
deleted file mode 100644
index eac6f80d786d..000000000000
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ /dev/null
@@ -1,796 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_bit.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h"
30#include "xfs_mount.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h"
38#include "xfs_bmap.h"
39#include "xfs_btree.h"
40#include "xfs_ialloc.h"
41#include "xfs_rtalloc.h"
42#include "xfs_error.h"
43#include "xfs_itable.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h"
46#include "xfs_inode_item.h"
47#include "xfs_buf_item.h"
48#include "xfs_utils.h"
49#include "xfs_iomap.h"
50#include "xfs_vnodeops.h"
51#include "xfs_trace.h"
52
53#include <linux/capability.h>
54#include <linux/writeback.h>
55
56
57/*
58 * xfs_iozero
59 *
60 * xfs_iozero clears the specified range of the buffer supplied,
61 * and marks all the affected blocks as valid and modified. If
62 * an affected block is not allocated, it will be allocated. If
63 * an affected block is not completely overwritten, and is not
64 * valid before the operation, it will be read from disk before
65 * being partially zeroed.
66 */
67STATIC int
68xfs_iozero(
69 struct xfs_inode *ip, /* inode */
70 loff_t pos, /* offset in file */
71 size_t count) /* size of data to zero */
72{
73 struct page *page;
74 struct address_space *mapping;
75 int status;
76
77 mapping = VFS_I(ip)->i_mapping;
78 do {
79 unsigned offset, bytes;
80 void *fsdata;
81
82 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
83 bytes = PAGE_CACHE_SIZE - offset;
84 if (bytes > count)
85 bytes = count;
86
87 status = pagecache_write_begin(NULL, mapping, pos, bytes,
88 AOP_FLAG_UNINTERRUPTIBLE,
89 &page, &fsdata);
90 if (status)
91 break;
92
93 zero_user(page, offset, bytes);
94
95 status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
96 page, fsdata);
97 WARN_ON(status <= 0); /* can't return less than zero! */
98 pos += bytes;
99 count -= bytes;
100 status = 0;
101 } while (count);
102
103 return (-status);
104}
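
xfs_iozero above zeroes an arbitrary byte range through the page cache one
page at a time, reading in any partially overwritten block first. A rough
userspace analogue with pwrite(2) of a zero-filled chunk; unlike the kernel
code it simply writes zeroes (allocating blocks rather than using the
pagecache write_begin/write_end hooks), so treat it as a sketch of the
chunking only:

#include <sys/types.h>
#include <unistd.h>

#define CHUNK 4096

static int iozero(int fd, off_t pos, size_t count)
{
	static const char zeros[CHUNK];		/* zero-initialised */

	while (count) {
		/* split the first chunk at the in-page offset */
		size_t off = (size_t)pos & (CHUNK - 1);
		size_t bytes = CHUNK - off;
		ssize_t written;

		if (bytes > count)
			bytes = count;
		written = pwrite(fd, zeros, bytes, pos);
		if (written < 0)
			return -1;
		pos += written;
		count -= (size_t)written;
	}
	return 0;
}
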
105
106ssize_t /* bytes read, or (-) error */
107xfs_read(
108 xfs_inode_t *ip,
109 struct kiocb *iocb,
110 const struct iovec *iovp,
111 unsigned int segs,
112 loff_t *offset,
113 int ioflags)
114{
115 struct file *file = iocb->ki_filp;
116 struct inode *inode = file->f_mapping->host;
117 xfs_mount_t *mp = ip->i_mount;
118 size_t size = 0;
119 ssize_t ret = 0;
120 xfs_fsize_t n;
121 unsigned long seg;
122
123
124 XFS_STATS_INC(xs_read_calls);
125
126 /* START copy & waste from filemap.c */
127 for (seg = 0; seg < segs; seg++) {
128 const struct iovec *iv = &iovp[seg];
129
130 /*
131 * If any segment has a negative length, or the cumulative
132 * length ever wraps negative then return -EINVAL.
133 */
134 size += iv->iov_len;
135 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
136 return XFS_ERROR(-EINVAL);
137 }
138 /* END copy & waste from filemap.c */
139
140 if (unlikely(ioflags & IO_ISDIRECT)) {
141 xfs_buftarg_t *target =
142 XFS_IS_REALTIME_INODE(ip) ?
143 mp->m_rtdev_targp : mp->m_ddev_targp;
144 if ((*offset & target->bt_smask) ||
145 (size & target->bt_smask)) {
146 if (*offset == ip->i_size) {
147 return (0);
148 }
149 return -XFS_ERROR(EINVAL);
150 }
151 }
152
153 n = XFS_MAXIOFFSET(mp) - *offset;
154 if ((n <= 0) || (size == 0))
155 return 0;
156
157 if (n < size)
158 size = n;
159
160 if (XFS_FORCED_SHUTDOWN(mp))
161 return -EIO;
162
163 if (unlikely(ioflags & IO_ISDIRECT))
164 mutex_lock(&inode->i_mutex);
165 xfs_ilock(ip, XFS_IOLOCK_SHARED);
166
167 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
168 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
169 int iolock = XFS_IOLOCK_SHARED;
170
171 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size,
172 dmflags, &iolock);
173 if (ret) {
174 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
175 if (unlikely(ioflags & IO_ISDIRECT))
176 mutex_unlock(&inode->i_mutex);
177 return ret;
178 }
179 }
180
181 if (unlikely(ioflags & IO_ISDIRECT)) {
182 if (inode->i_mapping->nrpages)
183 ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
184 -1, FI_REMAPF_LOCKED);
185 mutex_unlock(&inode->i_mutex);
186 if (ret) {
187 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
188 return ret;
189 }
190 }
191
192 trace_xfs_file_read(ip, size, *offset, ioflags);
193
194 iocb->ki_pos = *offset;
195 ret = generic_file_aio_read(iocb, iovp, segs, *offset);
196 if (ret > 0)
197 XFS_STATS_ADD(xs_read_bytes, ret);
198
199 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
200 return ret;
201}
202
203ssize_t
204xfs_splice_read(
205 xfs_inode_t *ip,
206 struct file *infilp,
207 loff_t *ppos,
208 struct pipe_inode_info *pipe,
209 size_t count,
210 int flags,
211 int ioflags)
212{
213 xfs_mount_t *mp = ip->i_mount;
214 ssize_t ret;
215
216 XFS_STATS_INC(xs_read_calls);
217 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
218 return -EIO;
219
220 xfs_ilock(ip, XFS_IOLOCK_SHARED);
221
222 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
223 int iolock = XFS_IOLOCK_SHARED;
224 int error;
225
226 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
227 FILP_DELAY_FLAG(infilp), &iolock);
228 if (error) {
229 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
230 return -error;
231 }
232 }
233
234 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
235
236 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
237 if (ret > 0)
238 XFS_STATS_ADD(xs_read_bytes, ret);
239
240 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
241 return ret;
242}
243
244ssize_t
245xfs_splice_write(
246 xfs_inode_t *ip,
247 struct pipe_inode_info *pipe,
248 struct file *outfilp,
249 loff_t *ppos,
250 size_t count,
251 int flags,
252 int ioflags)
253{
254 xfs_mount_t *mp = ip->i_mount;
255 ssize_t ret;
256 struct inode *inode = outfilp->f_mapping->host;
257 xfs_fsize_t isize, new_size;
258
259 XFS_STATS_INC(xs_write_calls);
260 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
261 return -EIO;
262
263 xfs_ilock(ip, XFS_IOLOCK_EXCL);
264
265 if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
266 int iolock = XFS_IOLOCK_EXCL;
267 int error;
268
269 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
270 FILP_DELAY_FLAG(outfilp), &iolock);
271 if (error) {
272 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
273 return -error;
274 }
275 }
276
277 new_size = *ppos + count;
278
279 xfs_ilock(ip, XFS_ILOCK_EXCL);
280 if (new_size > ip->i_size)
281 ip->i_new_size = new_size;
282 xfs_iunlock(ip, XFS_ILOCK_EXCL);
283
284 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
285
286 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
287 if (ret > 0)
288 XFS_STATS_ADD(xs_write_bytes, ret);
289
290 isize = i_size_read(inode);
291 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
292 *ppos = isize;
293
294 if (*ppos > ip->i_size) {
295 xfs_ilock(ip, XFS_ILOCK_EXCL);
296 if (*ppos > ip->i_size)
297 ip->i_size = *ppos;
298 xfs_iunlock(ip, XFS_ILOCK_EXCL);
299 }
300
301 if (ip->i_new_size) {
302 xfs_ilock(ip, XFS_ILOCK_EXCL);
303 ip->i_new_size = 0;
304 if (ip->i_d.di_size > ip->i_size)
305 ip->i_d.di_size = ip->i_size;
306 xfs_iunlock(ip, XFS_ILOCK_EXCL);
307 }
308 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
309 return ret;
310}
311
312/*
313 * This routine is called to handle zeroing any space in the last
314 * block of the file that is beyond the EOF. We do this since the
315 * size is being increased without writing anything to that block
316 * and we don't want anyone to read the garbage on the disk.
317 */
318STATIC int /* error (positive) */
319xfs_zero_last_block(
320 xfs_inode_t *ip,
321 xfs_fsize_t offset,
322 xfs_fsize_t isize)
323{
324 xfs_fileoff_t last_fsb;
325 xfs_mount_t *mp = ip->i_mount;
326 int nimaps;
327 int zero_offset;
328 int zero_len;
329 int error = 0;
330 xfs_bmbt_irec_t imap;
331
332 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
333
334 zero_offset = XFS_B_FSB_OFFSET(mp, isize);
335 if (zero_offset == 0) {
336 /*
337 * There are no extra bytes in the last block on disk to
338 * zero, so return.
339 */
340 return 0;
341 }
342
343 last_fsb = XFS_B_TO_FSBT(mp, isize);
344 nimaps = 1;
345 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
346 &nimaps, NULL, NULL);
347 if (error) {
348 return error;
349 }
350 ASSERT(nimaps > 0);
351 /*
352 * If the block underlying isize is just a hole, then there
353 * is nothing to zero.
354 */
355 if (imap.br_startblock == HOLESTARTBLOCK) {
356 return 0;
357 }
358 /*
359 * Zero the part of the last block beyond the EOF, and write it
360 * out sync. We need to drop the ilock while we do this so we
361 * don't deadlock when the buffer cache calls back to us.
362 */
363 xfs_iunlock(ip, XFS_ILOCK_EXCL);
364
365 zero_len = mp->m_sb.sb_blocksize - zero_offset;
366 if (isize + zero_len > offset)
367 zero_len = offset - isize;
368 error = xfs_iozero(ip, isize, zero_len);
369
370 xfs_ilock(ip, XFS_ILOCK_EXCL);
371 ASSERT(error >= 0);
372 return error;
373}
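
The zero_offset/zero_len arithmetic above touches only the tail of the block
containing the old EOF, and never past the new size. That computation,
distilled (block size assumed to be a power of two, as XFS guarantees; a
sketch, not kernel code):

#include <stdint.h>

struct zero_range { uint64_t start, len; };

static struct zero_range tail_to_zero(uint64_t isize, uint64_t offset,
				      uint64_t blocksize)
{
	struct zero_range r = { 0, 0 };
	uint64_t in_block = isize & (blocksize - 1);

	if (in_block == 0)		/* old EOF is block aligned: no tail */
		return r;
	r.start = isize;
	r.len = blocksize - in_block;
	if (isize + r.len > offset)	/* don't zero past the new size */
		r.len = offset - isize;
	return r;
}
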
374
375/*
376 * Zero any on disk space between the current EOF and the new,
377 * larger EOF. This handles the normal case of zeroing the remainder
378 * of the last block in the file and the unusual case of zeroing blocks
379 * out beyond the size of the file. This second case only happens
380 * with fixed size extents and when the system crashes before the inode
381 * size was updated but after blocks were allocated. If fill is set,
382 * then any holes in the range are filled and zeroed. If not, the holes
383 * are left alone as holes.
384 */
385
386int /* error (positive) */
387xfs_zero_eof(
388 xfs_inode_t *ip,
389 xfs_off_t offset, /* starting I/O offset */
390 xfs_fsize_t isize) /* current inode size */
391{
392 xfs_mount_t *mp = ip->i_mount;
393 xfs_fileoff_t start_zero_fsb;
394 xfs_fileoff_t end_zero_fsb;
395 xfs_fileoff_t zero_count_fsb;
396 xfs_fileoff_t last_fsb;
397 xfs_fileoff_t zero_off;
398 xfs_fsize_t zero_len;
399 int nimaps;
400 int error = 0;
401 xfs_bmbt_irec_t imap;
402
403 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
404 ASSERT(offset > isize);
405
406 /*
407 * First handle zeroing the block on which isize resides.
408 * We only zero a part of that block so it is handled specially.
409 */
410 error = xfs_zero_last_block(ip, offset, isize);
411 if (error) {
412 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
413 return error;
414 }
415
416 /*
417 * Calculate the range between the new size and the old
418 * where blocks needing to be zeroed may exist. To get the
419 * block where the last byte in the file currently resides,
420 * we need to subtract one from the size and truncate back
421 * to a block boundary. We subtract 1 in case the size is
422 * exactly on a block boundary.
423 */
424 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
425 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
426 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
427 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
428 if (last_fsb == end_zero_fsb) {
429 /*
430 * The size was only incremented on its last block.
431 * We took care of that above, so just return.
432 */
433 return 0;
434 }
435
436 ASSERT(start_zero_fsb <= end_zero_fsb);
437 while (start_zero_fsb <= end_zero_fsb) {
438 nimaps = 1;
439 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
440 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
441 0, NULL, 0, &imap, &nimaps, NULL, NULL);
442 if (error) {
443 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
444 return error;
445 }
446 ASSERT(nimaps > 0);
447
448 if (imap.br_state == XFS_EXT_UNWRITTEN ||
449 imap.br_startblock == HOLESTARTBLOCK) {
450 /*
451 * This loop handles initializing pages that were
452 * partially initialized by the code below this
453 * loop. It basically zeroes the part of the page
454 * that sits on a hole and sets the page as P_HOLE
455 * and calls remapf if it is a mapped file.
456 */
457 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
458 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
459 continue;
460 }
461
462 /*
463 * There are blocks we need to zero.
464 * Drop the inode lock while we're doing the I/O.
465 * We'll still have the iolock to protect us.
466 */
467 xfs_iunlock(ip, XFS_ILOCK_EXCL);
468
469 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
470 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
471
472 if ((zero_off + zero_len) > offset)
473 zero_len = offset - zero_off;
474
475 error = xfs_iozero(ip, zero_off, zero_len);
476 if (error) {
477 goto out_lock;
478 }
479
480 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
481 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
482
483 xfs_ilock(ip, XFS_ILOCK_EXCL);
484 }
485
486 return 0;
487
488out_lock:
489 xfs_ilock(ip, XFS_ILOCK_EXCL);
490 ASSERT(error >= 0);
491 return error;
492}
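
xfs_zero_eof above walks every block between the old and new EOF via
xfs_bmapi, zeroing mapped blocks and skipping holes and unwritten extents,
which already read back as zeroes. The shape of that walk, with the extent
lookup and the zeroing I/O stubbed out as callbacks (a schematic sketch, not
the bmapi interface):

#include <stdint.h>

enum ext_state { EXT_HOLE, EXT_UNWRITTEN, EXT_MAPPED };

struct ext { uint64_t start, count; enum ext_state state; };

static int zero_between(uint64_t start_blk, uint64_t end_blk,
			void (*lookup)(uint64_t blk, struct ext *e),
			int (*zero)(uint64_t blk, uint64_t count))
{
	while (start_blk <= end_blk) {
		struct ext e;

		lookup(start_blk, &e);
		if (e.state == EXT_MAPPED && zero(e.start, e.count) != 0)
			return -1;	/* zeroing failed */
		/* holes and unwritten extents already read as zeroes */
		start_blk = e.start + e.count;
	}
	return 0;
}
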
493
494ssize_t /* bytes written, or (-) error */
495xfs_write(
496 struct xfs_inode *xip,
497 struct kiocb *iocb,
498 const struct iovec *iovp,
499 unsigned int nsegs,
500 loff_t *offset,
501 int ioflags)
502{
503 struct file *file = iocb->ki_filp;
504 struct address_space *mapping = file->f_mapping;
505 struct inode *inode = mapping->host;
506 unsigned long segs = nsegs;
507 xfs_mount_t *mp;
508 ssize_t ret = 0, error = 0;
509 xfs_fsize_t isize, new_size;
510 int iolock;
511 int eventsent = 0;
512 size_t ocount = 0, count;
513 loff_t pos;
514 int need_i_mutex;
515
516 XFS_STATS_INC(xs_write_calls);
517
518 error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ);
519 if (error)
520 return error;
521
522 count = ocount;
523 pos = *offset;
524
525 if (count == 0)
526 return 0;
527
528 mp = xip->i_mount;
529
530 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
531
532 if (XFS_FORCED_SHUTDOWN(mp))
533 return -EIO;
534
535relock:
536 if (ioflags & IO_ISDIRECT) {
537 iolock = XFS_IOLOCK_SHARED;
538 need_i_mutex = 0;
539 } else {
540 iolock = XFS_IOLOCK_EXCL;
541 need_i_mutex = 1;
542 mutex_lock(&inode->i_mutex);
543 }
544
545 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
546
547start:
548 error = -generic_write_checks(file, &pos, &count,
549 S_ISBLK(inode->i_mode));
550 if (error) {
551 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
552 goto out_unlock_mutex;
553 }
554
555 if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) &&
556 !(ioflags & IO_INVIS) && !eventsent)) {
557 int dmflags = FILP_DELAY_FLAG(file);
558
559 if (need_i_mutex)
560 dmflags |= DM_FLAGS_IMUX;
561
562 xfs_iunlock(xip, XFS_ILOCK_EXCL);
563 error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip,
564 pos, count, dmflags, &iolock);
565 if (error) {
566 goto out_unlock_internal;
567 }
568 xfs_ilock(xip, XFS_ILOCK_EXCL);
569 eventsent = 1;
570
571 /*
572 * The iolock was dropped and reacquired in XFS_SEND_DATA
573 * so we have to recheck the size when appending.
574 * We will only "goto start;" once, since having sent the
575 * event prevents another call to XFS_SEND_DATA, which is
576 * what allows the size to change in the first place.
577 */
578 if ((file->f_flags & O_APPEND) && pos != xip->i_size)
579 goto start;
580 }
581
582 if (ioflags & IO_ISDIRECT) {
583 xfs_buftarg_t *target =
584 XFS_IS_REALTIME_INODE(xip) ?
585 mp->m_rtdev_targp : mp->m_ddev_targp;
586
587 if ((pos & target->bt_smask) || (count & target->bt_smask)) {
588 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
589 return XFS_ERROR(-EINVAL);
590 }
591
592 if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) {
593 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
594 iolock = XFS_IOLOCK_EXCL;
595 need_i_mutex = 1;
596 mutex_lock(&inode->i_mutex);
597 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
598 goto start;
599 }
600 }
601
602 new_size = pos + count;
603 if (new_size > xip->i_size)
604 xip->i_new_size = new_size;
605
606 if (likely(!(ioflags & IO_INVIS)))
607 file_update_time(file);
608
609 /*
610 * If the offset is beyond the size of the file, we have a couple
611 * of things to do. First, if there is already space allocated
612 * we need to either create holes or zero the disk or ...
613 *
614 * If there is a page where the previous size lands, we need
615 * to zero it out up to the new size.
616 */
617
618 if (pos > xip->i_size) {
619 error = xfs_zero_eof(xip, pos, xip->i_size);
620 if (error) {
621 xfs_iunlock(xip, XFS_ILOCK_EXCL);
622 goto out_unlock_internal;
623 }
624 }
625 xfs_iunlock(xip, XFS_ILOCK_EXCL);
626
627 /*
628 * If we're writing the file then make sure to clear the
629 * setuid and setgid bits if the process is not being run
630 * by root. This keeps people from modifying setuid and
631 * setgid binaries.
632 */
633 error = -file_remove_suid(file);
634 if (unlikely(error))
635 goto out_unlock_internal;
636
637 /* We can write back this queue in page reclaim */
638 current->backing_dev_info = mapping->backing_dev_info;
639
640 if ((ioflags & IO_ISDIRECT)) {
641 if (mapping->nrpages) {
642 WARN_ON(need_i_mutex == 0);
643 error = xfs_flushinval_pages(xip,
644 (pos & PAGE_CACHE_MASK),
645 -1, FI_REMAPF_LOCKED);
646 if (error)
647 goto out_unlock_internal;
648 }
649
650 if (need_i_mutex) {
651 /* demote the lock now the cached pages are gone */
652 xfs_ilock_demote(xip, XFS_IOLOCK_EXCL);
653 mutex_unlock(&inode->i_mutex);
654
655 iolock = XFS_IOLOCK_SHARED;
656 need_i_mutex = 0;
657 }
658
659 trace_xfs_file_direct_write(xip, count, *offset, ioflags);
660 ret = generic_file_direct_write(iocb, iovp,
661 &segs, pos, offset, count, ocount);
662
663 /*
664 * direct-io write to a hole: fall through to buffered I/O
665 * for completing the rest of the request.
666 */
667 if (ret >= 0 && ret != count) {
668 XFS_STATS_ADD(xs_write_bytes, ret);
669
670 pos += ret;
671 count -= ret;
672
673 ioflags &= ~IO_ISDIRECT;
674 xfs_iunlock(xip, iolock);
675 goto relock;
676 }
677 } else {
678 int enospc = 0;
679 ssize_t ret2 = 0;
680
681write_retry:
682 trace_xfs_file_buffered_write(xip, count, *offset, ioflags);
683 ret2 = generic_file_buffered_write(iocb, iovp, segs,
684 pos, offset, count, ret);
685 /*
686	 * If we just got an ENOSPC, flush the inode now that we
687	 * aren't holding any page locks and retry *once*.
688 */
689 if (ret2 == -ENOSPC && !enospc) {
690 error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE);
691 if (error)
692 goto out_unlock_internal;
693 enospc = 1;
694 goto write_retry;
695 }
696 ret = ret2;
697 }
698
699 current->backing_dev_info = NULL;
700
701 isize = i_size_read(inode);
702 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
703 *offset = isize;
704
705 if (*offset > xip->i_size) {
706 xfs_ilock(xip, XFS_ILOCK_EXCL);
707 if (*offset > xip->i_size)
708 xip->i_size = *offset;
709 xfs_iunlock(xip, XFS_ILOCK_EXCL);
710 }
711
712 if (ret == -ENOSPC &&
713 DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
714 xfs_iunlock(xip, iolock);
715 if (need_i_mutex)
716 mutex_unlock(&inode->i_mutex);
717 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip,
718 DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL,
719 0, 0, 0); /* Delay flag intentionally unused */
720 if (need_i_mutex)
721 mutex_lock(&inode->i_mutex);
722 xfs_ilock(xip, iolock);
723 if (error)
724 goto out_unlock_internal;
725 goto start;
726 }
727
728 error = -ret;
729 if (ret <= 0)
730 goto out_unlock_internal;
731
732 XFS_STATS_ADD(xs_write_bytes, ret);
733
734 /* Handle various SYNC-type writes */
735 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
736 loff_t end = pos + ret - 1;
737 int error2;
738
739 xfs_iunlock(xip, iolock);
740 if (need_i_mutex)
741 mutex_unlock(&inode->i_mutex);
742
743 error2 = filemap_write_and_wait_range(mapping, pos, end);
744 if (!error)
745 error = error2;
746 if (need_i_mutex)
747 mutex_lock(&inode->i_mutex);
748 xfs_ilock(xip, iolock);
749
750 error2 = xfs_fsync(xip);
751 if (!error)
752 error = error2;
753 }
754
755 out_unlock_internal:
756 if (xip->i_new_size) {
757 xfs_ilock(xip, XFS_ILOCK_EXCL);
758 xip->i_new_size = 0;
759 /*
760 * If this was a direct or synchronous I/O that failed (such
761 * as ENOSPC) then part of the I/O may have been written to
762	 * disk before the error occurred. In this case the on-disk
763 * file size may have been adjusted beyond the in-memory file
764 * size and now needs to be truncated back.
765 */
766 if (xip->i_d.di_size > xip->i_size)
767 xip->i_d.di_size = xip->i_size;
768 xfs_iunlock(xip, XFS_ILOCK_EXCL);
769 }
770 xfs_iunlock(xip, iolock);
771 out_unlock_mutex:
772 if (need_i_mutex)
773 mutex_unlock(&inode->i_mutex);
774 return -error;
775}
776
777/*
778 * If the underlying (data/log/rt) device is readonly, there are some
779 * operations that cannot proceed.
780 */
781int
782xfs_dev_is_read_only(
783 xfs_mount_t *mp,
784 char *message)
785{
786 if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
787 xfs_readonly_buftarg(mp->m_logdev_targp) ||
788 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
789 cmn_err(CE_NOTE,
790 "XFS: %s required on read-only device.", message);
791 cmn_err(CE_NOTE,
792 "XFS: write access unavailable, cannot proceed.");
793 return EROFS;
794 }
795 return 0;
796}
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
deleted file mode 100644
index 342ae8c0d011..000000000000
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ /dev/null
@@ -1,29 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_LRW_H__
19#define __XFS_LRW_H__
20
21struct xfs_mount;
22struct xfs_inode;
23struct xfs_buf;
24
25extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
26
27extern int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
28
29#endif /* __XFS_LRW_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 3d4a0c84d634..1947514ce1ad 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -44,20 +44,6 @@ xfs_quota_type(int type)
44} 44}
45 45
46STATIC int 46STATIC int
47xfs_fs_quota_sync(
48 struct super_block *sb,
49 int type)
50{
51 struct xfs_mount *mp = XFS_M(sb);
52
53 if (sb->s_flags & MS_RDONLY)
54 return -EROFS;
55 if (!XFS_IS_QUOTA_RUNNING(mp))
56 return -ENOSYS;
57 return -xfs_sync_data(mp, 0);
58}
59
60STATIC int
61xfs_fs_get_xstate( 47xfs_fs_get_xstate(
62 struct super_block *sb, 48 struct super_block *sb,
63 struct fs_quota_stat *fqs) 49 struct fs_quota_stat *fqs)
@@ -82,8 +68,6 @@ xfs_fs_set_xstate(
82 return -EROFS; 68 return -EROFS;
83 if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) 69 if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
84 return -ENOSYS; 70 return -ENOSYS;
85 if (!capable(CAP_SYS_ADMIN))
86 return -EPERM;
87 71
88 if (uflags & XFS_QUOTA_UDQ_ACCT) 72 if (uflags & XFS_QUOTA_UDQ_ACCT)
89 flags |= XFS_UQUOTA_ACCT; 73 flags |= XFS_UQUOTA_ACCT;
@@ -144,14 +128,11 @@ xfs_fs_set_xquota(
144 return -ENOSYS; 128 return -ENOSYS;
145 if (!XFS_IS_QUOTA_ON(mp)) 129 if (!XFS_IS_QUOTA_ON(mp))
146 return -ESRCH; 130 return -ESRCH;
147 if (!capable(CAP_SYS_ADMIN))
148 return -EPERM;
149 131
150 return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); 132 return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
151} 133}
152 134
153const struct quotactl_ops xfs_quotactl_operations = { 135const struct quotactl_ops xfs_quotactl_operations = {
154 .quota_sync = xfs_fs_quota_sync,
155 .get_xstate = xfs_fs_get_xstate, 136 .get_xstate = xfs_fs_get_xstate,
156 .set_xstate = xfs_fs_set_xstate, 137 .set_xstate = xfs_fs_set_xstate,
157 .get_xquota = xfs_fs_get_xquota, 138 .get_xquota = xfs_fs_get_xquota,
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 25ea2408118f..71345a370d9f 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1063,7 +1063,7 @@ xfs_log_inode(
1063STATIC int 1063STATIC int
1064xfs_fs_write_inode( 1064xfs_fs_write_inode(
1065 struct inode *inode, 1065 struct inode *inode,
1066 int sync) 1066 struct writeback_control *wbc)
1067{ 1067{
1068 struct xfs_inode *ip = XFS_I(inode); 1068 struct xfs_inode *ip = XFS_I(inode);
1069 struct xfs_mount *mp = ip->i_mount; 1069 struct xfs_mount *mp = ip->i_mount;
@@ -1074,11 +1074,7 @@ xfs_fs_write_inode(
1074 if (XFS_FORCED_SHUTDOWN(mp)) 1074 if (XFS_FORCED_SHUTDOWN(mp))
1075 return XFS_ERROR(EIO); 1075 return XFS_ERROR(EIO);
1076 1076
1077 if (sync) { 1077 if (wbc->sync_mode == WB_SYNC_ALL) {
1078 error = xfs_wait_on_pages(ip, 0, -1);
1079 if (error)
1080 goto out;
1081
1082 /* 1078 /*
1083 * Make sure the inode has hit stable storage. By using the 1079 * Make sure the inode has hit stable storage. By using the
1084 * log and the fsync transactions we reduce the IOs we have 1080 * log and the fsync transactions we reduce the IOs we have
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a9f6d20aff41..05cd85317f6f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -607,7 +607,8 @@ xfssyncd(
607 set_freezable(); 607 set_freezable();
608 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); 608 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
609 for (;;) { 609 for (;;) {
610 timeleft = schedule_timeout_interruptible(timeleft); 610 if (list_empty(&mp->m_sync_list))
611 timeleft = schedule_timeout_interruptible(timeleft);
611 /* swsusp */ 612 /* swsusp */
612 try_to_freeze(); 613 try_to_freeze();
613 if (kthread_should_stop() && list_empty(&mp->m_sync_list)) 614 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
@@ -627,8 +628,7 @@ xfssyncd(
627 list_add_tail(&mp->m_sync_work.w_list, 628 list_add_tail(&mp->m_sync_work.w_list,
628 &mp->m_sync_list); 629 &mp->m_sync_list);
629 } 630 }
630 list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list) 631 list_splice_init(&mp->m_sync_list, &tmp);
631 list_move(&work->w_list, &tmp);
632 spin_unlock(&mp->m_sync_lock); 632 spin_unlock(&mp->m_sync_lock);
633 633
634 list_for_each_entry_safe(work, n, &tmp, w_list) { 634 list_for_each_entry_safe(work, n, &tmp, w_list) {
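
The xfssyncd hunk above replaces a per-item list_move() loop with a single
list_splice_init(), detaching the whole pending list while m_sync_lock is
held and walking the batch privately afterwards. A minimal pthread sketch of
that drain pattern, using a singly linked list for brevity (illustrative
names, not the kernel list API):

#include <pthread.h>
#include <stddef.h>

struct work { struct work *next; void (*fn)(struct work *); };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct work *pending;	/* shared; protected by lock */

static void drain_work(void)
{
	struct work *batch, *w;

	pthread_mutex_lock(&lock);
	batch = pending;	/* splice: steal the whole list ... */
	pending = NULL;		/* ... leaving the shared head empty */
	pthread_mutex_unlock(&lock);

	while ((w = batch) != NULL) {	/* process with the lock dropped */
		batch = w->next;
		w->fn(w);
	}
}
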
@@ -688,12 +688,12 @@ xfs_inode_set_reclaim_tag(
688 struct xfs_perag *pag; 688 struct xfs_perag *pag;
689 689
690 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 690 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
691 read_lock(&pag->pag_ici_lock); 691 write_lock(&pag->pag_ici_lock);
692 spin_lock(&ip->i_flags_lock); 692 spin_lock(&ip->i_flags_lock);
693 __xfs_inode_set_reclaim_tag(pag, ip); 693 __xfs_inode_set_reclaim_tag(pag, ip);
694 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 694 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
695 spin_unlock(&ip->i_flags_lock); 695 spin_unlock(&ip->i_flags_lock);
696 read_unlock(&pag->pag_ici_lock); 696 write_unlock(&pag->pag_ici_lock);
697 xfs_perag_put(pag); 697 xfs_perag_put(pag);
698} 698}
699 699
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 856eb3c8d605..5a107601e969 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -52,22 +52,6 @@
52#include "quota/xfs_dquot.h" 52#include "quota/xfs_dquot.h"
53 53
54/* 54/*
55 * Format fsblock number into a static buffer & return it.
56 */
57STATIC char *xfs_fmtfsblock(xfs_fsblock_t bno)
58{
59 static char rval[50];
60
61 if (bno == NULLFSBLOCK)
62 sprintf(rval, "NULLFSBLOCK");
63 else if (isnullstartblock(bno))
64 sprintf(rval, "NULLSTARTBLOCK(%lld)", startblockval(bno));
65 else
66 sprintf(rval, "%lld", (xfs_dfsbno_t)bno);
67 return rval;
68}
69
70/*
71 * We include this last to have the helpers above available for the trace 55 * We include this last to have the helpers above available for the trace
72 * event implementations. 56 * event implementations.
73 */ 57 */
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index a4574dcf5065..fcaa62f0799e 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -197,13 +197,13 @@ TRACE_EVENT(xfs_iext_insert,
197 __entry->caller_ip = caller_ip; 197 __entry->caller_ip = caller_ip;
198 ), 198 ),
199 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " 199 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
200 "offset %lld block %s count %lld flag %d caller %pf", 200 "offset %lld block %lld count %lld flag %d caller %pf",
201 MAJOR(__entry->dev), MINOR(__entry->dev), 201 MAJOR(__entry->dev), MINOR(__entry->dev),
202 __entry->ino, 202 __entry->ino,
203 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), 203 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
204 (long)__entry->idx, 204 (long)__entry->idx,
205 __entry->startoff, 205 __entry->startoff,
206 xfs_fmtfsblock(__entry->startblock), 206 (__int64_t)__entry->startblock,
207 __entry->blockcount, 207 __entry->blockcount,
208 __entry->state, 208 __entry->state,
209 (char *)__entry->caller_ip) 209 (char *)__entry->caller_ip)
@@ -241,13 +241,13 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
241 __entry->caller_ip = caller_ip; 241 __entry->caller_ip = caller_ip;
242 ), 242 ),
243 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " 243 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
244 "offset %lld block %s count %lld flag %d caller %pf", 244 "offset %lld block %lld count %lld flag %d caller %pf",
245 MAJOR(__entry->dev), MINOR(__entry->dev), 245 MAJOR(__entry->dev), MINOR(__entry->dev),
246 __entry->ino, 246 __entry->ino,
247 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), 247 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
248 (long)__entry->idx, 248 (long)__entry->idx,
249 __entry->startoff, 249 __entry->startoff,
250 xfs_fmtfsblock(__entry->startblock), 250 (__int64_t)__entry->startblock,
251 __entry->blockcount, 251 __entry->blockcount,
252 __entry->state, 252 __entry->state,
253 (char *)__entry->caller_ip) 253 (char *)__entry->caller_ip)
@@ -593,7 +593,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
593 TP_ARGS(dqp), 593 TP_ARGS(dqp),
594 TP_STRUCT__entry( 594 TP_STRUCT__entry(
595 __field(dev_t, dev) 595 __field(dev_t, dev)
596 __field(__be32, id) 596 __field(u32, id)
597 __field(unsigned, flags) 597 __field(unsigned, flags)
598 __field(unsigned, nrefs) 598 __field(unsigned, nrefs)
599 __field(unsigned long long, res_bcount) 599 __field(unsigned long long, res_bcount)
@@ -606,7 +606,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
606 ), \ 606 ), \
607 TP_fast_assign( 607 TP_fast_assign(
608 __entry->dev = dqp->q_mount->m_super->s_dev; 608 __entry->dev = dqp->q_mount->m_super->s_dev;
609 __entry->id = dqp->q_core.d_id; 609 __entry->id = be32_to_cpu(dqp->q_core.d_id);
610 __entry->flags = dqp->dq_flags; 610 __entry->flags = dqp->dq_flags;
611 __entry->nrefs = dqp->q_nrefs; 611 __entry->nrefs = dqp->q_nrefs;
612 __entry->res_bcount = dqp->q_res_bcount; 612 __entry->res_bcount = dqp->q_res_bcount;
@@ -622,10 +622,10 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
622 be64_to_cpu(dqp->q_core.d_ino_softlimit); 622 be64_to_cpu(dqp->q_core.d_ino_softlimit);
623 ), 623 ),
624 TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " 624 TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx "
625 "bcnt 0x%llx [hard 0x%llx | soft 0x%llx] " 625 "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx "
626 "icnt 0x%llx [hard 0x%llx | soft 0x%llx]", 626 "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]",
627 MAJOR(__entry->dev), MINOR(__entry->dev), 627 MAJOR(__entry->dev), MINOR(__entry->dev),
628 be32_to_cpu(__entry->id), 628 __entry->id,
629 __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), 629 __print_flags(__entry->flags, "|", XFS_DQ_FLAGS),
630 __entry->nrefs, 630 __entry->nrefs,
631 __entry->res_bcount, 631 __entry->res_bcount,
@@ -881,7 +881,7 @@ TRACE_EVENT(name, \
881 ), \ 881 ), \
882 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ 882 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
883 "offset 0x%llx count %zd flags %s " \ 883 "offset 0x%llx count %zd flags %s " \
884 "startoff 0x%llx startblock %s blockcount 0x%llx", \ 884 "startoff 0x%llx startblock %lld blockcount 0x%llx", \
885 MAJOR(__entry->dev), MINOR(__entry->dev), \ 885 MAJOR(__entry->dev), MINOR(__entry->dev), \
886 __entry->ino, \ 886 __entry->ino, \
887 __entry->size, \ 887 __entry->size, \
@@ -890,7 +890,7 @@ TRACE_EVENT(name, \
890 __entry->count, \ 890 __entry->count, \
891 __print_flags(__entry->flags, "|", BMAPI_FLAGS), \ 891 __print_flags(__entry->flags, "|", BMAPI_FLAGS), \
892 __entry->startoff, \ 892 __entry->startoff, \
893 xfs_fmtfsblock(__entry->startblock), \ 893 (__int64_t)__entry->startblock, \
894 __entry->blockcount) \ 894 __entry->blockcount) \
895) 895)
896DEFINE_IOMAP_EVENT(xfs_iomap_enter); 896DEFINE_IOMAP_EVENT(xfs_iomap_enter);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 1869fb973819..5c11e4d17010 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2550,22 +2550,134 @@ xfs_bmap_rtalloc(
2550} 2550}
2551 2551
2552STATIC int 2552STATIC int
2553xfs_bmap_btalloc_nullfb(
2554 struct xfs_bmalloca *ap,
2555 struct xfs_alloc_arg *args,
2556 xfs_extlen_t *blen)
2557{
2558 struct xfs_mount *mp = ap->ip->i_mount;
2559 struct xfs_perag *pag;
2560 xfs_agnumber_t ag, startag;
2561 int notinit = 0;
2562 int error;
2563
2564 if (ap->userdata && xfs_inode_is_filestream(ap->ip))
2565 args->type = XFS_ALLOCTYPE_NEAR_BNO;
2566 else
2567 args->type = XFS_ALLOCTYPE_START_BNO;
2568 args->total = ap->total;
2569
2570 /*
2571 * Search for an allocation group with a single extent large enough
2572 * for the request. If one isn't found, then adjust the minimum
2573 * allocation size to the largest space found.
2574 */
2575 startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
2576 if (startag == NULLAGNUMBER)
2577 startag = ag = 0;
2578
2579 pag = xfs_perag_get(mp, ag);
2580 while (*blen < ap->alen) {
2581 if (!pag->pagf_init) {
2582 error = xfs_alloc_pagf_init(mp, args->tp, ag,
2583 XFS_ALLOC_FLAG_TRYLOCK);
2584 if (error) {
2585 xfs_perag_put(pag);
2586 return error;
2587 }
2588 }
2589
2590 /*
2591 * See xfs_alloc_fix_freelist...
2592 */
2593 if (pag->pagf_init) {
2594 xfs_extlen_t longest;
2595 longest = xfs_alloc_longest_free_extent(mp, pag);
2596 if (*blen < longest)
2597 *blen = longest;
2598 } else
2599 notinit = 1;
2600
2601 if (xfs_inode_is_filestream(ap->ip)) {
2602 if (*blen >= ap->alen)
2603 break;
2604
2605 if (ap->userdata) {
2606 /*
2607 * If startag is an invalid AG, we've
2608 * come here once before and
2609 * xfs_filestream_new_ag picked the
2610 * best currently available.
2611 *
2612 * Don't continue looping, since we
2613 * could loop forever.
2614 */
2615 if (startag == NULLAGNUMBER)
2616 break;
2617
2618 error = xfs_filestream_new_ag(ap, &ag);
2619 xfs_perag_put(pag);
2620 if (error)
2621 return error;
2622
2623 /* loop again to set 'blen' */
2624 startag = NULLAGNUMBER;
2625 pag = xfs_perag_get(mp, ag);
2626 continue;
2627 }
2628 }
2629 if (++ag == mp->m_sb.sb_agcount)
2630 ag = 0;
2631 if (ag == startag)
2632 break;
2633 xfs_perag_put(pag);
2634 pag = xfs_perag_get(mp, ag);
2635 }
2636 xfs_perag_put(pag);
2637
2638 /*
2639 * Since the above loop did a BUF_TRYLOCK, it is
2640 * possible that there is space for this request.
2641 */
2642 if (notinit || *blen < ap->minlen)
2643 args->minlen = ap->minlen;
2644 /*
2645 * If the best seen length is less than the request
2646 * length, use the best as the minimum.
2647 */
2648 else if (*blen < ap->alen)
2649 args->minlen = *blen;
2650 /*
2651 * Otherwise we've seen an extent as big as alen,
2652 * use that as the minimum.
2653 */
2654 else
2655 args->minlen = ap->alen;
2656
2657 /*
2658 * set the failure fallback case to look in the selected
2659 * AG as the stream may have moved.
2660 */
2661 if (xfs_inode_is_filestream(ap->ip))
2662 ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
2663
2664 return 0;
2665}
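
The helper factored out above scans every allocation group at most once,
starting at the AG derived from the hint block and wrapping around at
sb_agcount, remembering the longest free extent seen. The traversal skeleton
with the per-AG work reduced to a callback (a sketch of the loop shape only;
the filestream early exits are omitted):

#include <stdint.h>

static uint64_t scan_ags(uint32_t startag, uint32_t agcount,
			 uint64_t (*longest_free)(uint32_t ag))
{
	uint32_t ag = startag;
	uint64_t best = 0;

	do {
		uint64_t longest = longest_free(ag);

		if (longest > best)
			best = longest;
		if (++ag == agcount)	/* wrap to AG 0 ... */
			ag = 0;
	} while (ag != startag);	/* ... and stop where we started */

	return best;
}
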
2666
2667STATIC int
2553xfs_bmap_btalloc( 2668xfs_bmap_btalloc(
2554 xfs_bmalloca_t *ap) /* bmap alloc argument struct */ 2669 xfs_bmalloca_t *ap) /* bmap alloc argument struct */
2555{ 2670{
2556 xfs_mount_t *mp; /* mount point structure */ 2671 xfs_mount_t *mp; /* mount point structure */
2557 xfs_alloctype_t atype = 0; /* type for allocation routines */ 2672 xfs_alloctype_t atype = 0; /* type for allocation routines */
2558 xfs_extlen_t align; /* minimum allocation alignment */ 2673 xfs_extlen_t align; /* minimum allocation alignment */
2559 xfs_agnumber_t ag;
2560 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ 2674 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
2561 xfs_agnumber_t startag; 2675 xfs_agnumber_t ag;
2562 xfs_alloc_arg_t args; 2676 xfs_alloc_arg_t args;
2563 xfs_extlen_t blen; 2677 xfs_extlen_t blen;
2564 xfs_extlen_t nextminlen = 0; 2678 xfs_extlen_t nextminlen = 0;
2565 xfs_perag_t *pag;
2566 int nullfb; /* true if ap->firstblock isn't set */ 2679 int nullfb; /* true if ap->firstblock isn't set */
2567 int isaligned; 2680 int isaligned;
2568 int notinit;
2569 int tryagain; 2681 int tryagain;
2570 int error; 2682 int error;
2571 2683
@@ -2612,103 +2724,9 @@ xfs_bmap_btalloc(
2612 args.firstblock = ap->firstblock; 2724 args.firstblock = ap->firstblock;
2613 blen = 0; 2725 blen = 0;
2614 if (nullfb) { 2726 if (nullfb) {
2615 if (ap->userdata && xfs_inode_is_filestream(ap->ip)) 2727 error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
2616 args.type = XFS_ALLOCTYPE_NEAR_BNO; 2728 if (error)
2617 else 2729 return error;
2618 args.type = XFS_ALLOCTYPE_START_BNO;
2619 args.total = ap->total;
2620
2621 /*
2622 * Search for an allocation group with a single extent
2623 * large enough for the request.
2624 *
2625 * If one isn't found, then adjust the minimum allocation
2626 * size to the largest space found.
2627 */
2628 startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno);
2629 if (startag == NULLAGNUMBER)
2630 startag = ag = 0;
2631 notinit = 0;
2632 pag = xfs_perag_get(mp, ag);
2633 while (blen < ap->alen) {
2634 if (!pag->pagf_init &&
2635 (error = xfs_alloc_pagf_init(mp, args.tp,
2636 ag, XFS_ALLOC_FLAG_TRYLOCK))) {
2637 xfs_perag_put(pag);
2638 return error;
2639 }
2640 /*
2641 * See xfs_alloc_fix_freelist...
2642 */
2643 if (pag->pagf_init) {
2644 xfs_extlen_t longest;
2645 longest = xfs_alloc_longest_free_extent(mp, pag);
2646 if (blen < longest)
2647 blen = longest;
2648 } else
2649 notinit = 1;
2650
2651 if (xfs_inode_is_filestream(ap->ip)) {
2652 if (blen >= ap->alen)
2653 break;
2654
2655 if (ap->userdata) {
2656 /*
2657 * If startag is an invalid AG, we've
2658 * come here once before and
2659 * xfs_filestream_new_ag picked the
2660 * best currently available.
2661 *
2662 * Don't continue looping, since we
2663 * could loop forever.
2664 */
2665 if (startag == NULLAGNUMBER)
2666 break;
2667
2668 error = xfs_filestream_new_ag(ap, &ag);
2669 xfs_perag_put(pag);
2670 if (error)
2671 return error;
2672
2673 /* loop again to set 'blen' */
2674 startag = NULLAGNUMBER;
2675 pag = xfs_perag_get(mp, ag);
2676 continue;
2677 }
2678 }
2679 if (++ag == mp->m_sb.sb_agcount)
2680 ag = 0;
2681 if (ag == startag)
2682 break;
2683 xfs_perag_put(pag);
2684 pag = xfs_perag_get(mp, ag);
2685 }
2686 xfs_perag_put(pag);
2687 /*
2688 * Since the above loop did a BUF_TRYLOCK, it is
2689 * possible that there is space for this request.
2690 */
2691 if (notinit || blen < ap->minlen)
2692 args.minlen = ap->minlen;
2693 /*
2694 * If the best seen length is less than the request
2695 * length, use the best as the minimum.
2696 */
2697 else if (blen < ap->alen)
2698 args.minlen = blen;
2699 /*
2700 * Otherwise we've seen an extent as big as alen,
2701 * use that as the minimum.
2702 */
2703 else
2704 args.minlen = ap->alen;
2705
2706 /*
2707 * set the failure fallback case to look in the selected
2708 * AG as the stream may have moved.
2709 */
2710 if (xfs_inode_is_filestream(ap->ip))
2711 ap->rval = args.fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
2712 } else if (ap->low) { 2730 } else if (ap->low) {
2713 if (xfs_inode_is_filestream(ap->ip)) 2731 if (xfs_inode_is_filestream(ap->ip))
2714 args.type = XFS_ALLOCTYPE_FIRST_AG; 2732 args.type = XFS_ALLOCTYPE_FIRST_AG;
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index f52ac276277e..7cf7220e7d5f 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -292,7 +292,8 @@ typedef struct xfs_bstat {
292 __s32 bs_extents; /* number of extents */ 292 __s32 bs_extents; /* number of extents */
293 __u32 bs_gen; /* generation count */ 293 __u32 bs_gen; /* generation count */
294 __u16 bs_projid; /* project id */ 294 __u16 bs_projid; /* project id */
295 unsigned char bs_pad[14]; /* pad space, unused */ 295 __u16 bs_forkoff; /* inode fork offset in bytes */
296 unsigned char bs_pad[12]; /* pad space, unused */
296 __u32 bs_dmevmask; /* DMIG event mask */ 297 __u32 bs_dmevmask; /* DMIG event mask */
297 __u16 bs_dmstate; /* DMIG state info */ 298 __u16 bs_dmstate; /* DMIG state info */
298 __u16 bs_aextents; /* attribute number of extents */ 299 __u16 bs_aextents; /* attribute number of extents */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index e281eb4a1c49..6845db90818f 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -190,13 +190,12 @@ xfs_iget_cache_hit(
190 trace_xfs_iget_reclaim(ip); 190 trace_xfs_iget_reclaim(ip);
191 191
192 /* 192 /*
193 * We need to set XFS_INEW atomically with clearing the 193 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
194 * reclaimable tag so that we do have an indicator of the 194 * from stomping over us while we recycle the inode. We can't
195 * inode still being initialized. 195 * clear the radix tree reclaimable tag yet as it requires
196 * pag_ici_lock to be held exclusive.
196 */ 197 */
197 ip->i_flags |= XFS_INEW; 198 ip->i_flags |= XFS_IRECLAIM;
198 ip->i_flags &= ~XFS_IRECLAIMABLE;
199 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
200 199
201 spin_unlock(&ip->i_flags_lock); 200 spin_unlock(&ip->i_flags_lock);
202 read_unlock(&pag->pag_ici_lock); 201 read_unlock(&pag->pag_ici_lock);
@@ -216,7 +215,15 @@ xfs_iget_cache_hit(
216 trace_xfs_iget_reclaim(ip); 215 trace_xfs_iget_reclaim(ip);
217 goto out_error; 216 goto out_error;
218 } 217 }
218
219 write_lock(&pag->pag_ici_lock);
220 spin_lock(&ip->i_flags_lock);
221 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
222 ip->i_flags |= XFS_INEW;
223 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
219 inode->i_state = I_NEW; 224 inode->i_state = I_NEW;
225 spin_unlock(&ip->i_flags_lock);
226 write_unlock(&pag->pag_ici_lock);
220 } else { 227 } else {
221 /* If the VFS inode is being torn down, pause and try again. */ 228 /* If the VFS inode is being torn down, pause and try again. */
222 if (!igrab(inode)) { 229 if (!igrab(inode)) {
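
The iget hunks above split inode recycling into two phases: XFS_IRECLAIM is
first set under the per-inode flags spinlock so the reclaim worker backs
off, and only later, with pag_ici_lock held exclusively, are the flags and
the radix-tree reclaim tag flipped, because clearing the tag requires the
write lock. A compressed pthread sketch of that claim-then-finish shape (the
rwlock stands in for pag_ici_lock; all names illustrative):

#include <pthread.h>

#define F_RECLAIMABLE	0x1
#define F_RECLAIM	0x2
#define F_NEW		0x4

struct node {
	pthread_mutex_t flags_lock;
	unsigned flags;
};

static pthread_rwlock_t tree_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Phase 1: under the per-node lock only, claim the node so the
 * reclaim worker backs off; the tree tag is left alone for now. */
static void claim_for_recycle(struct node *n)
{
	pthread_mutex_lock(&n->flags_lock);
	n->flags |= F_RECLAIM;
	pthread_mutex_unlock(&n->flags_lock);
}

/* Phase 2: with the tree lock held exclusively, flip the flags and
 * clear the (elided) reclaim tag atomically with respect to lookups. */
static void finish_recycle(struct node *n)
{
	pthread_rwlock_wrlock(&tree_lock);
	pthread_mutex_lock(&n->flags_lock);
	n->flags &= ~(F_RECLAIMABLE | F_RECLAIM);
	n->flags |= F_NEW;
	/* __clear_reclaim_tag(n) would go here */
	pthread_mutex_unlock(&n->flags_lock);
	pthread_rwlock_unlock(&tree_lock);
}
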
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index fa31360046d4..0ffd56447045 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2439,75 +2439,31 @@ xfs_idestroy_fork(
2439} 2439}
2440 2440
2441/* 2441/*
2442 * Increment the pin count of the given buffer. 2442 * This is called to unpin an inode. The caller must have the inode locked
2443 * This value is protected by ipinlock spinlock in the mount structure. 2443 * in at least shared mode so that the buffer cannot be subsequently pinned
2444 * once someone is waiting for it to be unpinned.
2444 */ 2445 */
2445void 2446static void
2446xfs_ipin( 2447xfs_iunpin_nowait(
2447 xfs_inode_t *ip) 2448 struct xfs_inode *ip)
2448{
2449 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2450
2451 atomic_inc(&ip->i_pincount);
2452}
2453
2454/*
2455 * Decrement the pin count of the given inode, and wake up
2456 * anyone in xfs_iwait_unpin() if the count goes to 0. The
2457 * inode must have been previously pinned with a call to xfs_ipin().
2458 */
2459void
2460xfs_iunpin(
2461 xfs_inode_t *ip)
2462{
2463 ASSERT(atomic_read(&ip->i_pincount) > 0);
2464
2465 if (atomic_dec_and_test(&ip->i_pincount))
2466 wake_up(&ip->i_ipin_wait);
2467}
2468
2469/*
2470 * This is called to unpin an inode. It can be directed to wait or to return
2471 * immediately without waiting for the inode to be unpinned. The caller must
2472 * have the inode locked in at least shared mode so that the buffer cannot be
2473 * subsequently pinned once someone is waiting for it to be unpinned.
2474 */
2475STATIC void
2476__xfs_iunpin_wait(
2477 xfs_inode_t *ip,
2478 int wait)
2479{ 2449{
2480 xfs_inode_log_item_t *iip = ip->i_itemp;
2481
2482 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2483 if (atomic_read(&ip->i_pincount) == 0)
2484 return;
2485 2451
2486 /* Give the log a push to start the unpinning I/O */ 2452 /* Give the log a push to start the unpinning I/O */
2487 if (iip && iip->ili_last_lsn) 2453 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2488 xfs_log_force_lsn(ip->i_mount, iip->ili_last_lsn, 0);
2489 else
2490 xfs_log_force(ip->i_mount, 0);
2491 2454
2492 if (wait)
2493 wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
2494} 2455}
2495 2456
2496void 2457void
2497xfs_iunpin_wait( 2458xfs_iunpin_wait(
2498 xfs_inode_t *ip) 2459 struct xfs_inode *ip)
2499{ 2460{
2500 __xfs_iunpin_wait(ip, 1); 2461 if (xfs_ipincount(ip)) {
2501} 2462 xfs_iunpin_nowait(ip);
2502 2463 wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
2503static inline void 2464 }
2504xfs_iunpin_nowait(
2505 xfs_inode_t *ip)
2506{
2507 __xfs_iunpin_wait(ip, 0);
2508} 2465}
2509 2466
2510
2511/* 2467/*
2512 * xfs_iextents_copy() 2468 * xfs_iextents_copy()
2513 * 2469 *
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 6c912b027596..9965e40a4615 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -471,8 +471,6 @@ int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
471int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); 471int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
472 472
473void xfs_iext_realloc(xfs_inode_t *, int, int); 473void xfs_iext_realloc(xfs_inode_t *, int, int);
474void xfs_ipin(xfs_inode_t *);
475void xfs_iunpin(xfs_inode_t *);
476void xfs_iunpin_wait(xfs_inode_t *); 474void xfs_iunpin_wait(xfs_inode_t *);
477int xfs_iflush(xfs_inode_t *, uint); 475int xfs_iflush(xfs_inode_t *, uint);
478void xfs_ichgtime(xfs_inode_t *, int); 476void xfs_ichgtime(xfs_inode_t *, int);
@@ -480,6 +478,7 @@ void xfs_lock_inodes(xfs_inode_t **, int, uint);
480void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 478void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
481 479
482void xfs_synchronize_times(xfs_inode_t *); 480void xfs_synchronize_times(xfs_inode_t *);
481void xfs_mark_inode_dirty(xfs_inode_t *);
483void xfs_mark_inode_dirty_sync(xfs_inode_t *); 482void xfs_mark_inode_dirty_sync(xfs_inode_t *);
484 483
485#define IHOLD(ip) \ 484#define IHOLD(ip) \
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index d4dc063111f8..7bfea8540159 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -535,23 +535,23 @@ xfs_inode_item_format(
535 535
536/* 536/*
537 * This is called to pin the inode associated with the inode log 537 * This is called to pin the inode associated with the inode log
538 * item in memory so it cannot be written out. Do this by calling 538 * item in memory so it cannot be written out.
539 * xfs_ipin() to bump the pin count in the inode while holding the
540 * inode pin lock.
541 */ 539 */
542STATIC void 540STATIC void
543xfs_inode_item_pin( 541xfs_inode_item_pin(
544 xfs_inode_log_item_t *iip) 542 xfs_inode_log_item_t *iip)
545{ 543{
546 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); 544 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
547 xfs_ipin(iip->ili_inode); 545
546 atomic_inc(&iip->ili_inode->i_pincount);
548} 547}
549 548
550 549
551/* 550/*
552 * This is called to unpin the inode associated with the inode log 551 * This is called to unpin the inode associated with the inode log
553 * item which was previously pinned with a call to xfs_inode_item_pin(). 552 * item which was previously pinned with a call to xfs_inode_item_pin().
554 * Just call xfs_iunpin() on the inode to do this. 553 *
554 * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
555 */ 555 */
556/* ARGSUSED */ 556/* ARGSUSED */
557STATIC void 557STATIC void
@@ -559,7 +559,11 @@ xfs_inode_item_unpin(
559 xfs_inode_log_item_t *iip, 559 xfs_inode_log_item_t *iip,
560 int stale) 560 int stale)
561{ 561{
562 xfs_iunpin(iip->ili_inode); 562 struct xfs_inode *ip = iip->ili_inode;
563
564 ASSERT(atomic_read(&ip->i_pincount) > 0);
565 if (atomic_dec_and_test(&ip->i_pincount))
566 wake_up(&ip->i_ipin_wait);
563} 567}
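
The rewritten unpin above open-codes the old xfs_iunpin():
atomic_dec_and_test() plus a wake_up(), so that xfs_iunpin_wait() sleepers
run only once the pin count truly reaches zero. A userspace sketch of the
same pin/unpin/wait contract with a pthread mutex and condvar (illustrative
names; the kernel uses a lock-free counter plus wait_event):

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t unpinned = PTHREAD_COND_INITIALIZER;
static int pincount;		/* protected by lock */

static void pin(void)
{
	pthread_mutex_lock(&lock);
	pincount++;
	pthread_mutex_unlock(&lock);
}

/* Drop a pin; wake waiters only on the transition to zero, the
 * analogue of atomic_dec_and_test() + wake_up() above. */
static void unpin(void)
{
	pthread_mutex_lock(&lock);
	if (--pincount == 0)
		pthread_cond_broadcast(&unpinned);
	pthread_mutex_unlock(&lock);
}

static void unpin_wait(void)
{
	pthread_mutex_lock(&lock);
	while (pincount != 0)
		pthread_cond_wait(&unpinned, &lock);
	pthread_mutex_unlock(&lock);
}
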
564 568
565/* ARGSUSED */ 569/* ARGSUSED */
@@ -568,7 +572,7 @@ xfs_inode_item_unpin_remove(
568 xfs_inode_log_item_t *iip, 572 xfs_inode_log_item_t *iip,
569 xfs_trans_t *tp) 573 xfs_trans_t *tp)
570{ 574{
571 xfs_iunpin(iip->ili_inode); 575 xfs_inode_item_unpin(iip, 0);
572} 576}
573 577
574/* 578/*
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 3af02314c605..b1b801e4a28e 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -106,6 +106,7 @@ xfs_bulkstat_one_iget(
106 buf->bs_dmevmask = dic->di_dmevmask; 106 buf->bs_dmevmask = dic->di_dmevmask;
107 buf->bs_dmstate = dic->di_dmstate; 107 buf->bs_dmstate = dic->di_dmstate;
108 buf->bs_aextents = dic->di_anextents; 108 buf->bs_aextents = dic->di_anextents;
109 buf->bs_forkoff = XFS_IFORK_BOFF(ip);
109 110
110 switch (dic->di_format) { 111 switch (dic->di_format) {
111 case XFS_DINODE_FMT_DEV: 112 case XFS_DINODE_FMT_DEV:
@@ -176,6 +177,7 @@ xfs_bulkstat_one_dinode(
176 buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask); 177 buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask);
177 buf->bs_dmstate = be16_to_cpu(dic->di_dmstate); 178 buf->bs_dmstate = be16_to_cpu(dic->di_dmstate);
178 buf->bs_aextents = be16_to_cpu(dic->di_anextents); 179 buf->bs_aextents = be16_to_cpu(dic->di_anextents);
180 buf->bs_forkoff = XFS_DFORK_BOFF(dic);
179 181
180 switch (dic->di_format) { 182 switch (dic->di_format) {
181 case XFS_DINODE_FMT_DEV: 183 case XFS_DINODE_FMT_DEV:
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 4f16be4b6ee5..e8fba92d7cd9 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -60,7 +60,7 @@ STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
60STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 60STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
61STATIC void xlog_dealloc_log(xlog_t *log); 61STATIC void xlog_dealloc_log(xlog_t *log);
62STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[], 62STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
63 int nentries, xfs_log_ticket_t tic, 63 int nentries, struct xlog_ticket *tic,
64 xfs_lsn_t *start_lsn, 64 xfs_lsn_t *start_lsn,
65 xlog_in_core_t **commit_iclog, 65 xlog_in_core_t **commit_iclog,
66 uint flags); 66 uint flags);
@@ -243,14 +243,14 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
243 * out when the next write occurs. 243 * out when the next write occurs.
244 */ 244 */
245xfs_lsn_t 245xfs_lsn_t
246xfs_log_done(xfs_mount_t *mp, 246xfs_log_done(
247 xfs_log_ticket_t xtic, 247 struct xfs_mount *mp,
248 void **iclog, 248 struct xlog_ticket *ticket,
249 uint flags) 249 struct xlog_in_core **iclog,
250 uint flags)
250{ 251{
251 xlog_t *log = mp->m_log; 252 struct log *log = mp->m_log;
252 xlog_ticket_t *ticket = (xfs_log_ticket_t) xtic; 253 xfs_lsn_t lsn = 0;
253 xfs_lsn_t lsn = 0;
254 254
255 if (XLOG_FORCED_SHUTDOWN(log) || 255 if (XLOG_FORCED_SHUTDOWN(log) ||
256 /* 256 /*
@@ -258,8 +258,7 @@ xfs_log_done(xfs_mount_t *mp,
258 * If we get an error, just continue and give back the log ticket. 258 * If we get an error, just continue and give back the log ticket.
259 */ 259 */
260 (((ticket->t_flags & XLOG_TIC_INITED) == 0) && 260 (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
261 (xlog_commit_record(mp, ticket, 261 (xlog_commit_record(mp, ticket, iclog, &lsn)))) {
262 (xlog_in_core_t **)iclog, &lsn)))) {
263 lsn = (xfs_lsn_t) -1; 262 lsn = (xfs_lsn_t) -1;
264 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { 263 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
265 flags |= XFS_LOG_REL_PERM_RESERV; 264 flags |= XFS_LOG_REL_PERM_RESERV;
@@ -289,7 +288,7 @@ xfs_log_done(xfs_mount_t *mp,
289 } 288 }
290 289
291 return lsn; 290 return lsn;
292} /* xfs_log_done */ 291}
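
The log hunks above and below replace the void * ticket and iclog handles,
and the casts at every call site, with properly typed struct pointers. The
gain in miniature is compile-time checking (hypothetical names, not the
xlog API):

struct ticket { int id; };

/* before: opaque handle, cast at each use - any pointer type compiles */
static int done_old(void *handle)
{
	return ((struct ticket *)handle)->id;
}

/* after: typed parameter - a mismatched pointer is a compile error */
static int done_new(struct ticket *t)
{
	return t->id;
}
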
293 292
294/* 293/*
295 * Attaches a new iclog I/O completion callback routine during 294 * Attaches a new iclog I/O completion callback routine during
@@ -298,11 +297,11 @@ xfs_log_done(xfs_mount_t *mp,
298 * executing the callback at an appropriate time. 297 * executing the callback at an appropriate time.
299 */ 298 */
300int 299int
301xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ 300xfs_log_notify(
302 void *iclog_hndl, /* iclog to hang callback off */ 301 struct xfs_mount *mp,
303 xfs_log_callback_t *cb) 302 struct xlog_in_core *iclog,
303 xfs_log_callback_t *cb)
304{ 304{
305 xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl;
306 int abortflg; 305 int abortflg;
307 306
308 spin_lock(&iclog->ic_callback_lock); 307 spin_lock(&iclog->ic_callback_lock);
@@ -316,16 +315,14 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */
 	}
 	spin_unlock(&iclog->ic_callback_lock);
 	return abortflg;
-}	/* xfs_log_notify */
+}
 
 int
-xfs_log_release_iclog(xfs_mount_t *mp,
-		      void	  *iclog_hndl)
+xfs_log_release_iclog(
+	struct xfs_mount	*mp,
+	struct xlog_in_core	*iclog)
 {
-	xlog_t		*log = mp->m_log;
-	xlog_in_core_t	*iclog = (xlog_in_core_t *)iclog_hndl;
-
-	if (xlog_state_release_iclog(log, iclog)) {
+	if (xlog_state_release_iclog(mp->m_log, iclog)) {
 		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
 		return EIO;
 	}
@@ -344,17 +341,18 @@ xfs_log_release_iclog(xfs_mount_t *mp,
  * reservation, we prevent over allocation problems.
  */
 int
-xfs_log_reserve(xfs_mount_t	 *mp,
-		int		 unit_bytes,
-		int		 cnt,
-		xfs_log_ticket_t *ticket,
-		__uint8_t	 client,
-		uint		 flags,
-		uint		 t_type)
+xfs_log_reserve(
+	struct xfs_mount	*mp,
+	int			unit_bytes,
+	int			cnt,
+	struct xlog_ticket	**ticket,
+	__uint8_t		client,
+	uint			flags,
+	uint			t_type)
 {
-	xlog_t		*log = mp->m_log;
-	xlog_ticket_t	*internal_ticket;
-	int		retval = 0;
+	struct log		*log = mp->m_log;
+	struct xlog_ticket	*internal_ticket;
+	int			retval = 0;
 
 	ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
 	ASSERT((flags & XFS_LOG_NOSLEEP) == 0);
@@ -367,7 +365,7 @@ xfs_log_reserve(xfs_mount_t *mp,
 
 	if (*ticket != NULL) {
 		ASSERT(flags & XFS_LOG_PERM_RESERV);
-		internal_ticket = (xlog_ticket_t *)*ticket;
+		internal_ticket = *ticket;
 
 		trace_xfs_log_reserve(log, internal_ticket);
 
@@ -519,7 +517,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 	xlog_in_core_t	 *first_iclog;
 #endif
 	xfs_log_iovec_t  reg[1];
-	xfs_log_ticket_t tic = NULL;
+	xlog_ticket_t	*tic = NULL;
 	xfs_lsn_t	 lsn;
 	int		 error;
 
@@ -656,24 +654,24 @@ xfs_log_unmount(xfs_mount_t *mp)
  * transaction occur with one call to xfs_log_write().
  */
 int
-xfs_log_write(xfs_mount_t *	mp,
-	      xfs_log_iovec_t	reg[],
-	      int		nentries,
-	      xfs_log_ticket_t	tic,
-	      xfs_lsn_t		*start_lsn)
+xfs_log_write(
+	struct xfs_mount	*mp,
+	struct xfs_log_iovec	reg[],
+	int			nentries,
+	struct xlog_ticket	*tic,
+	xfs_lsn_t		*start_lsn)
 {
-	int	error;
-	xlog_t *log = mp->m_log;
+	struct log		*log = mp->m_log;
+	int			error;
 
 	if (XLOG_FORCED_SHUTDOWN(log))
 		return XFS_ERROR(EIO);
 
-	if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) {
+	error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0);
+	if (error)
 		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
-	}
 	return error;
-}	/* xfs_log_write */
-
+}
 
 void
 xfs_log_move_tail(xfs_mount_t	*mp,
@@ -1642,16 +1640,16 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
  * bytes have been written out.
  */
 STATIC int
-xlog_write(xfs_mount_t *	mp,
-	   xfs_log_iovec_t	reg[],
-	   int			nentries,
-	   xfs_log_ticket_t	tic,
-	   xfs_lsn_t		*start_lsn,
-	   xlog_in_core_t	**commit_iclog,
-	   uint			flags)
+xlog_write(
+	struct xfs_mount	*mp,
+	struct xfs_log_iovec	reg[],
+	int			nentries,
+	struct xlog_ticket	*ticket,
+	xfs_lsn_t		*start_lsn,
+	struct xlog_in_core	**commit_iclog,
+	uint			flags)
 {
 	xlog_t		*log = mp->m_log;
-	xlog_ticket_t	*ticket = (xlog_ticket_t *)tic;
 	xlog_in_core_t	*iclog = NULL;	/* ptr to current in-core log */
 	xlog_op_header_t *logop_head;	/* ptr to log operation header */
 	__psint_t	 ptr;		/* copy address into data region */
@@ -1765,7 +1763,7 @@ xlog_write(xfs_mount_t * mp,
 	default:
 		xfs_fs_cmn_err(CE_WARN, mp,
 		    "Bad XFS transaction clientid 0x%x in ticket 0x%p",
-		    logop_head->oh_clientid, tic);
+		    logop_head->oh_clientid, ticket);
 		return XFS_ERROR(EIO);
 	}
 
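The xfs_log.c changes above swap the untyped xfs_log_ticket_t handle (a void *) for struct xlog_ticket *, so mismatched arguments that the old casts silently accepted now fail to compile. A minimal userspace sketch of the same pattern; every name below is illustrative, not XFS code:

#include <stdio.h>

struct ticket;                           /* opaque: layout never exposed */

/* Before: an untyped handle accepts anything, and the cast hides it. */
typedef void *ticket_handle_t;

static void log_done_untyped(ticket_handle_t h)
{
	struct ticket *t = h;            /* unchecked conversion */
	(void)t;
}

/* After: a typed pointer, so a wrong argument is a compile error. */
static void log_done_typed(struct ticket *t)
{
	(void)t;
}

int main(void)
{
	int not_a_ticket = 42;

	log_done_untyped(&not_a_ticket); /* compiles, breaks at runtime */
	log_done_typed(NULL);            /* fine: a valid ticket pointer */
	/* log_done_typed(&not_a_ticket); -- rejected by the compiler */
	printf("typed handles turn runtime bugs into compile errors\n");
	return 0;
}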
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 7074be9d13e9..97a24c7795a4 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -110,8 +110,6 @@ typedef struct xfs_log_iovec {
 	uint		i_type;		/* type of region */
 } xfs_log_iovec_t;
 
-typedef void* xfs_log_ticket_t;
-
 /*
  * Structure used to pass callback function and the function's argument
  * to the log manager.
@@ -126,10 +124,12 @@ typedef struct xfs_log_callback {
 #ifdef __KERNEL__
 /* Log manager interfaces */
 struct xfs_mount;
+struct xlog_in_core;
 struct xlog_ticket;
+
 xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
-		       xfs_log_ticket_t	ticket,
-		       void		**iclog,
+		       struct xlog_ticket *ticket,
+		       struct xlog_in_core **iclog,
 		       uint		flags);
 int	  _xfs_log_force(struct xfs_mount *mp,
 			 uint		flags,
@@ -151,21 +151,21 @@ int xfs_log_mount_finish(struct xfs_mount *mp);
 void	  xfs_log_move_tail(struct xfs_mount	*mp,
 			    xfs_lsn_t		tail_lsn);
 int	  xfs_log_notify(struct xfs_mount	*mp,
-			 void			*iclog,
+			 struct xlog_in_core	*iclog,
 			 xfs_log_callback_t	*callback_entry);
 int	  xfs_log_release_iclog(struct xfs_mount *mp,
-			 void			*iclog_hndl);
+			 struct xlog_in_core	*iclog);
 int	  xfs_log_reserve(struct xfs_mount *mp,
 			  int		   length,
 			  int		   count,
-			  xfs_log_ticket_t *ticket,
+			  struct xlog_ticket **ticket,
 			  __uint8_t	   clientid,
 			  uint		   flags,
 			  uint		   t_type);
 int	  xfs_log_write(struct xfs_mount *mp,
 			xfs_log_iovec_t  region[],
 			int		 nentries,
-			xfs_log_ticket_t ticket,
+			struct xlog_ticket *ticket,
 			xfs_lsn_t	 *start_lsn);
 int	  xfs_log_unmount_write(struct xfs_mount *mp);
 void      xfs_log_unmount(struct xfs_mount *mp);
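The header needs only the forward declarations struct xlog_in_core; and struct xlog_ticket; to use typed pointers in its prototypes: an incomplete type is enough to declare a pointer, while the layout stays private to the implementation. A self-contained sketch of the idiom, with hypothetical names:

#include <stdlib.h>

/* "Header" half: callers only ever see the incomplete type. */
struct ticket;
static struct ticket *ticket_alloc(int unit_bytes);
static void ticket_free(struct ticket *t);

/* "Implementation" half: the one place the layout is visible. */
struct ticket {
	int	t_unit_bytes;
	int	t_flags;
};

static struct ticket *ticket_alloc(int unit_bytes)
{
	struct ticket *t = calloc(1, sizeof(*t));

	if (t)
		t->t_unit_bytes = unit_bytes;
	return t;
}

static void ticket_free(struct ticket *t)
{
	free(t);
}

int main(void)
{
	struct ticket *t = ticket_alloc(4096);

	ticket_free(t);
	return 0;
}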
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 6afaaeb2950a..e79b56b4bca6 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1097,13 +1097,15 @@ xfs_default_resblks(xfs_mount_t *mp)
 	__uint64_t resblks;
 
 	/*
-	 * We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
-	 * This may drive us straight to ENOSPC on mount, but that implies
-	 * we were already there on the last unmount. Warn if this occurs.
+	 * We default to 5% or 8192 fsbs of space reserved, whichever is
+	 * smaller. This is intended to cover concurrent allocation
+	 * transactions when we initially hit enospc. These each require a 4
+	 * block reservation. Hence by default we cover roughly 2000 concurrent
+	 * allocation reservations.
 	 */
 	resblks = mp->m_sb.sb_dblocks;
 	do_div(resblks, 20);
-	resblks = min_t(__uint64_t, resblks, 1024);
+	resblks = min_t(__uint64_t, resblks, 8192);
 	return resblks;
 }
 
@@ -1417,6 +1419,9 @@ xfs_mountfs(
 	 * when at ENOSPC. This is needed for operations like create with
 	 * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
 	 * are not allowed to use this reserved space.
+	 *
+	 * This may drive us straight to ENOSPC on mount, but that implies
+	 * we were already there on the last unmount. Warn if this occurs.
 	 */
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
 		resblks = xfs_default_resblks(mp);
@@ -1725,26 +1730,30 @@ xfs_mod_incore_sb_unlocked(
 				lcounter += rem;
 			}
 		} else {				/* Taking blocks away */
-
 			lcounter += delta;
+			if (lcounter >= 0) {
+				mp->m_sb.sb_fdblocks = lcounter +
+							XFS_ALLOC_SET_ASIDE(mp);
+				return 0;
+			}
 
 			/*
-			 * If were out of blocks, use any available reserved blocks if
-			 * were allowed to.
+			 * We are out of blocks, use any available reserved
+			 * blocks if were allowed to.
 			 */
+			if (!rsvd)
+				return XFS_ERROR(ENOSPC);
 
-			if (lcounter < 0) {
-				if (rsvd) {
-					lcounter = (long long)mp->m_resblks_avail + delta;
-					if (lcounter < 0) {
-						return XFS_ERROR(ENOSPC);
-					}
-					mp->m_resblks_avail = lcounter;
-					return 0;
-				} else {	/* not reserved */
-					return XFS_ERROR(ENOSPC);
-				}
+			lcounter = (long long)mp->m_resblks_avail + delta;
+			if (lcounter >= 0) {
+				mp->m_resblks_avail = lcounter;
+				return 0;
 			}
+			printk_once(KERN_WARNING
+				"Filesystem \"%s\": reserve blocks depleted! "
+				"Consider increasing reserve pool size.",
+				mp->m_fsname);
+			return XFS_ERROR(ENOSPC);
 		}
 
 		mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
@@ -2052,6 +2061,26 @@ xfs_mount_log_sb(
 	return error;
 }
 
+/*
+ * If the underlying (data/log/rt) device is readonly, there are some
+ * operations that cannot proceed.
+ */
+int
+xfs_dev_is_read_only(
+	struct xfs_mount	*mp,
+	char			*message)
+{
+	if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
+	    xfs_readonly_buftarg(mp->m_logdev_targp) ||
+	    (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
+		cmn_err(CE_NOTE,
+			"XFS: %s required on read-only device.", message);
+		cmn_err(CE_NOTE,
+			"XFS: write access unavailable, cannot proceed.");
+		return EROFS;
+	}
+	return 0;
+}
 
 #ifdef HAVE_PERCPU_SB
 /*
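The xfs_mod_incore_sb_unlocked() rework handles the common case first with an early return, then falls back to the reserve pool (which absorbs the whole delta), and warns once via printk_once() when the pool is depleted; xfs_default_resblks() now sizes that pool at 5% of the data blocks, capped at 8192 filesystem blocks instead of 1024. A compact userspace sketch of the same control flow, with made-up pool sizes:

#include <stdio.h>
#include <stdbool.h>

/* Illustrative pools, not the kernel's xfs_mount fields. */
static long long free_blocks    = 100;
static long long reserve_blocks = 8;

/*
 * Take blocks (delta is negative).  Common case first with an early
 * return; otherwise fall back to the reserve pool, which absorbs the
 * whole request, and warn exactly once when it runs dry.
 */
static int take_blocks(long long delta, bool rsvd)
{
	static bool warned;
	long long lcounter = free_blocks + delta;

	if (lcounter >= 0) {			/* common case */
		free_blocks = lcounter;
		return 0;
	}

	if (!rsvd)				/* reserve pool off-limits */
		return -1;

	lcounter = reserve_blocks + delta;	/* whole delta from reserve */
	if (lcounter >= 0) {
		reserve_blocks = lcounter;
		return 0;
	}

	if (!warned) {				/* printk_once() analogue */
		warned = true;
		fprintf(stderr, "reserve blocks depleted!\n");
	}
	return -1;
}

int main(void)
{
	printf("%d\n", take_blocks(-100, true));	/* 0: from free pool */
	printf("%d\n", take_blocks(-8, true));		/* 0: from reserve */
	printf("%d\n", take_blocks(-1, true));		/* -1: depleted */
	return 0;
}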
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 70504fcf14cd..4fa0bc7b983e 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -245,7 +245,7 @@ typedef struct xfs_mount {
 	struct xfs_qmops	*m_qm_ops;	/* vector of XQM ops */
 	atomic_t		m_active_trans;	/* number trans frozen */
 #ifdef HAVE_PERCPU_SB
-	xfs_icsb_cnts_t	*m_sb_cnts;	/* per-cpu superblock counters */
+	xfs_icsb_cnts_t __percpu *m_sb_cnts;	/* per-cpu superblock counters */
 	unsigned long	m_icsb_counters; /* disabled per-cpu counters */
 	struct notifier_block	m_icsb_notifier; /* hotplug cpu notifier */
 	struct mutex		m_icsb_mutex;	/* balancer sync lock */
@@ -436,6 +436,8 @@ extern void xfs_freesb(xfs_mount_t *);
 extern int	xfs_fs_writable(xfs_mount_t *);
 extern int	xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
 
+extern int	xfs_dev_is_read_only(struct xfs_mount *, char *);
+
 extern int	xfs_dmops_get(struct xfs_mount *);
 extern void	xfs_dmops_put(struct xfs_mount *);
 
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index be942d4e3324..f73e358bae8d 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -796,7 +796,7 @@ _xfs_trans_commit(
 	int			sync;
 #define	XFS_TRANS_LOGVEC_COUNT	16
 	xfs_log_iovec_t		log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
-	void			*commit_iclog;
+	struct xlog_in_core	*commit_iclog;
 	int			shutdown;
 
 	commit_lsn = -1;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c93e3a102857..79c8bab9dfff 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -910,7 +910,7 @@ typedef struct xfs_trans {
 	unsigned int		t_blk_res_used;	/* # of resvd blocks used */
 	unsigned int		t_rtx_res;	/* # of rt extents resvd */
 	unsigned int		t_rtx_res_used;	/* # of resvd rt extents used */
-	xfs_log_ticket_t	t_ticket;	/* log mgr ticket */
+	struct xlog_ticket	*t_ticket;	/* log mgr ticket */
 	xfs_lsn_t		t_lsn;		/* log seq num of start of
 						 * transaction. */
 	xfs_lsn_t		t_commit_lsn;	/* log seq num of end of
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 5ffd544434eb..fb586360d1c9 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -46,6 +46,65 @@ STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *,
 STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *,
 		xfs_daddr_t, int);
 
+/*
+ * Add the locked buffer to the transaction.
+ *
+ * The buffer must be locked, and it cannot be associated with any
+ * transaction.
+ *
+ * If the buffer does not yet have a buf log item associated with it,
+ * then allocate one for it. Then add the buf item to the transaction.
+ */
+STATIC void
+_xfs_trans_bjoin(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp,
+	int			reset_recur)
+{
+	struct xfs_buf_log_item	*bip;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
+
+	/*
+	 * The xfs_buf_log_item pointer is stored in b_fsprivate.  If
+	 * it doesn't have one yet, then allocate one and initialize it.
+	 * The checks to see if one is there are in xfs_buf_item_init().
+	 */
+	xfs_buf_item_init(bp, tp->t_mountp);
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+	if (reset_recur)
+		bip->bli_recur = 0;
+
+	/*
+	 * Take a reference for this transaction on the buf item.
+	 */
+	atomic_inc(&bip->bli_refcount);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
+
+	/*
+	 * Initialize b_fsprivate2 so we can find it with incore_match()
+	 * in xfs_trans_get_buf() and friends above.
+	 */
+	XFS_BUF_SET_FSPRIVATE2(bp, tp);
+
+}
+
+void
+xfs_trans_bjoin(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp)
+{
+	_xfs_trans_bjoin(tp, bp, 0);
+	trace_xfs_trans_bjoin(bp->b_fspriv);
+}
 
 /*
  * Get and lock the buffer for the caller if it is not already
@@ -132,40 +191,8 @@ xfs_trans_get_buf(xfs_trans_t *tp,
 
 	ASSERT(!XFS_BUF_GETERROR(bp));
 
-	/*
-	 * The xfs_buf_log_item pointer is stored in b_fsprivate.  If
-	 * it doesn't have one yet, then allocate one and initialize it.
-	 * The checks to see if one is there are in xfs_buf_item_init().
-	 */
-	xfs_buf_item_init(bp, tp->t_mountp);
-
-	/*
-	 * Set the recursion count for the buffer within this transaction
-	 * to 0.
-	 */
-	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
-	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
-	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
-	bip->bli_recur = 0;
-
-	/*
-	 * Take a reference for this transaction on the buf item.
-	 */
-	atomic_inc(&bip->bli_refcount);
-
-	/*
-	 * Get a log_item_desc to point at the new item.
-	 */
-	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
-
-	/*
-	 * Initialize b_fsprivate2 so we can find it with incore_match()
-	 * above.
-	 */
-	XFS_BUF_SET_FSPRIVATE2(bp, tp);
-
-	trace_xfs_trans_get_buf(bip);
+	_xfs_trans_bjoin(tp, bp, 1);
+	trace_xfs_trans_get_buf(bp->b_fspriv);
 	return (bp);
 }
 
@@ -210,44 +237,11 @@ xfs_trans_getsb(xfs_trans_t *tp,
 	}
 
 	bp = xfs_getsb(mp, flags);
-	if (bp == NULL) {
+	if (bp == NULL)
 		return NULL;
-	}
-
-	/*
-	 * The xfs_buf_log_item pointer is stored in b_fsprivate.  If
-	 * it doesn't have one yet, then allocate one and initialize it.
-	 * The checks to see if one is there are in xfs_buf_item_init().
-	 */
-	xfs_buf_item_init(bp, mp);
-
-	/*
-	 * Set the recursion count for the buffer within this transaction
-	 * to 0.
-	 */
-	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
-	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
-	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
-	bip->bli_recur = 0;
-
-	/*
-	 * Take a reference for this transaction on the buf item.
-	 */
-	atomic_inc(&bip->bli_refcount);
-
-	/*
-	 * Get a log_item_desc to point at the new item.
-	 */
-	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
-
-	/*
-	 * Initialize b_fsprivate2 so we can find it with incore_match()
-	 * above.
-	 */
-	XFS_BUF_SET_FSPRIVATE2(bp, tp);
 
-	trace_xfs_trans_getsb(bip);
+	_xfs_trans_bjoin(tp, bp, 1);
+	trace_xfs_trans_getsb(bp->b_fspriv);
 	return (bp);
 }
 
@@ -425,40 +419,9 @@ xfs_trans_read_buf(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		goto shutdown_abort;
 
-	/*
-	 * The xfs_buf_log_item pointer is stored in b_fsprivate.  If
-	 * it doesn't have one yet, then allocate one and initialize it.
-	 * The checks to see if one is there are in xfs_buf_item_init().
-	 */
-	xfs_buf_item_init(bp, tp->t_mountp);
+	_xfs_trans_bjoin(tp, bp, 1);
+	trace_xfs_trans_read_buf(bp->b_fspriv);
 
-	/*
-	 * Set the recursion count for the buffer within this transaction
-	 * to 0.
-	 */
-	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
-	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
-	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
-	bip->bli_recur = 0;
-
-	/*
-	 * Take a reference for this transaction on the buf item.
-	 */
-	atomic_inc(&bip->bli_refcount);
-
-	/*
-	 * Get a log_item_desc to point at the new item.
-	 */
-	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
-
-	/*
-	 * Initialize b_fsprivate2 so we can find it with incore_match()
-	 * above.
-	 */
-	XFS_BUF_SET_FSPRIVATE2(bp, tp);
-
-	trace_xfs_trans_read_buf(bip);
 	*bpp = bp;
 	return 0;
 
@@ -623,53 +586,6 @@ xfs_trans_brelse(xfs_trans_t *tp,
 }
 
 /*
- * Add the locked buffer to the transaction.
- * The buffer must be locked, and it cannot be associated with any
- * transaction.
- *
- * If the buffer does not yet have a buf log item associated with it,
- * then allocate one for it. Then add the buf item to the transaction.
- */
-void
-xfs_trans_bjoin(xfs_trans_t	*tp,
-		xfs_buf_t	*bp)
-{
-	xfs_buf_log_item_t	*bip;
-
-	ASSERT(XFS_BUF_ISBUSY(bp));
-	ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
-
-	/*
-	 * The xfs_buf_log_item pointer is stored in b_fsprivate.  If
-	 * it doesn't have one yet, then allocate one and initialize it.
-	 * The checks to see if one is there are in xfs_buf_item_init().
-	 */
-	xfs_buf_item_init(bp, tp->t_mountp);
-	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
-	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
-	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
-
-	/*
-	 * Take a reference for this transaction on the buf item.
-	 */
-	atomic_inc(&bip->bli_refcount);
-
-	/*
-	 * Get a log_item_desc to point at the new item.
-	 */
-	(void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
-
-	/*
-	 * Initialize b_fsprivate2 so we can find it with incore_match()
-	 * in xfs_trans_get_buf() and friends above.
-	 */
-	XFS_BUF_SET_FSPRIVATE2(bp, tp);
-
-	trace_xfs_trans_bjoin(bip);
-}
-
-/*
  * Mark the buffer as not needing to be unlocked when the buf item's
  * IOP_UNLOCK() routine is called.  The buffer must already be locked
  * and associated with the given transaction.
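The xfs_trans_buf.c change is a pure deduplication: four near-identical attach-buffer-to-transaction sequences collapse into _xfs_trans_bjoin(), with the reset_recur flag as the only behavioral difference between call sites (fresh lookups reset the recursion count, an explicit bjoin keeps it). A toy sketch of the helper-extraction pattern, names illustrative:

#include <stdio.h>

struct xact { const char *name; };
struct buffer { int recur; struct xact *owner; };

/*
 * Shared tail of all four former copies; reset_recur is the only
 * thing that differed between them.
 */
static void join_buffer(struct xact *tp, struct buffer *bp, int reset_recur)
{
	if (reset_recur)
		bp->recur = 0;		/* fresh lookup: depth starts over */
	bp->owner = tp;			/* buffer now belongs to tp */
}

static void get_buffer(struct xact *tp, struct buffer *bp)
{
	join_buffer(tp, bp, 1);		/* like xfs_trans_get_buf() */
}

static void bjoin_buffer(struct xact *tp, struct buffer *bp)
{
	join_buffer(tp, bp, 0);		/* like xfs_trans_bjoin() */
}

int main(void)
{
	struct xact tp = { "tx1" };
	struct buffer bp = { 3, NULL };

	bjoin_buffer(&tp, &bp);
	printf("bjoin keeps recur=%d\n", bp.recur);	/* 3 */
	get_buffer(&tp, &bp);
	printf("get resets recur=%d\n", bp.recur);	/* 0 */
	return 0;
}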
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index ddd2c5d1b854..9d376be0ea38 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -584,113 +584,6 @@ xfs_readlink(
 }
 
 /*
- * xfs_fsync
- *
- * This is called to sync the inode and its data out to disk.  We need to hold
- * the I/O lock while flushing the data, and the inode lock while flushing the
- * inode.  The inode lock CANNOT be held while flushing the data, so acquire
- * after we're done with that.
- */
-int
-xfs_fsync(
-	xfs_inode_t	*ip)
-{
-	xfs_trans_t	*tp;
-	int		error = 0;
-	int		log_flushed = 0;
-
-	xfs_itrace_entry(ip);
-
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-		return XFS_ERROR(EIO);
-
-	/*
-	 * We always need to make sure that the required inode state is safe on
-	 * disk.  The inode might be clean but we still might need to force the
-	 * log because of committed transactions that haven't hit the disk yet.
-	 * Likewise, there could be unflushed non-transactional changes to the
-	 * inode core that have to go to disk and this requires us to issue
-	 * a synchronous transaction to capture these changes correctly.
-	 *
-	 * This code relies on the assumption that if the update_* fields
-	 * of the inode are clear and the inode is unpinned then it is clean
-	 * and no action is required.
-	 */
-	xfs_ilock(ip, XFS_ILOCK_SHARED);
-
-	if (!ip->i_update_core) {
-		/*
-		 * Timestamps/size haven't changed since last inode flush or
-		 * inode transaction commit.  That means either nothing got
-		 * written or a transaction committed which caught the updates.
-		 * If the latter happened and the transaction hasn't hit the
-		 * disk yet, the inode will be still be pinned.  If it is,
-		 * force the log.
-		 */
-		xfs_iunlock(ip, XFS_ILOCK_SHARED);
-		if (xfs_ipincount(ip)) {
-			if (ip->i_itemp->ili_last_lsn) {
-				error = _xfs_log_force_lsn(ip->i_mount,
-						ip->i_itemp->ili_last_lsn,
-						XFS_LOG_SYNC, &log_flushed);
-			} else {
-				error = _xfs_log_force(ip->i_mount,
-						XFS_LOG_SYNC, &log_flushed);
-			}
-		}
-	} else {
-		/*
-		 * Kick off a transaction to log the inode core to get the
-		 * updates.  The sync transaction will also force the log.
-		 */
-		xfs_iunlock(ip, XFS_ILOCK_SHARED);
-		tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
-		error = xfs_trans_reserve(tp, 0,
-				XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
-		if (error) {
-			xfs_trans_cancel(tp, 0);
-			return error;
-		}
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-		/*
-		 * Note - it's possible that we might have pushed ourselves out
-		 * of the way during trans_reserve which would flush the inode.
-		 * But there's no guarantee that the inode buffer has actually
-		 * gone out yet (it's delwri).  Plus the buffer could be pinned
-		 * anyway if it's part of an inode in another recent
-		 * transaction.  So we play it safe and fire off the
-		 * transaction anyway.
-		 */
-		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
-		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-		xfs_trans_set_sync(tp);
-		error = _xfs_trans_commit(tp, 0, &log_flushed);
-
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	}
-
-	if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
-		/*
-		 * If the log write didn't issue an ordered tag we need
-		 * to flush the disk cache for the data device now.
-		 */
-		if (!log_flushed)
-			xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
-
-		/*
-		 * If this inode is on the RT dev we need to flush that
-		 * cache as well.
-		 */
-		if (XFS_IS_REALTIME_INODE(ip))
-			xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
-	}
-
-	return error;
-}
-
-/*
  * Flags for xfs_free_eofblocks
  */
 #define XFS_FREE_EOF_TRYLOCK	(1<<0)
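The removed xfs_fsync() (its job presumably moves to the VFS-facing file code) made a three-way decision: with a clean inode core, force the log only if the inode is still pinned; with a dirty core, commit a synchronous transaction; afterwards, flush the device write cache if the log write did not already do so. A hedged skeleton of that decision with stand-in helpers, none of them XFS interfaces:

#include <stdio.h>
#include <stdbool.h>

/* Stand-in state; in XFS these come from the inode and mount. */
static bool core_dirty  = false;	/* timestamps/size changed?   */
static bool pinned      = true;		/* commit not yet on disk?    */
static bool log_flushed = false;	/* did we force the log?      */

static void force_log(void)               { log_flushed = true; }
static void commit_sync_transaction(void) { log_flushed = true; }
static void flush_write_cache(void)       { puts("cache flush"); }

static void fsync_inode(void)
{
	if (!core_dirty) {
		/*
		 * Clean core: nothing to log, but a prior commit may
		 * still be in flight, so force the log if pinned.
		 */
		if (pinned)
			force_log();
	} else {
		/*
		 * Dirty core: a synchronous commit logs the inode and
		 * forces the log as a side effect.
		 */
		commit_sync_transaction();
	}

	/* If the log write didn't flush the device cache, do it now. */
	if (!log_flushed)
		flush_write_cache();
}

int main(void)
{
	fsync_inode();
	printf("log_flushed=%d\n", log_flushed);
	return 0;
}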
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 774f40729ca1..d8dfa8d0dadd 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -21,7 +21,6 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
 #define XFS_ATTR_NOACL		0x08	/* Don't call xfs_acl_chmod */
 
 int xfs_readlink(struct xfs_inode *ip, char *link);
-int xfs_fsync(struct xfs_inode *ip);
 int xfs_release(struct xfs_inode *ip);
 int xfs_inactive(struct xfs_inode *ip);
 int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
@@ -50,18 +49,6 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
 int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
 int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
 		int flags, struct attrlist_cursor_kern *cursor);
-ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb,
-		const struct iovec *iovp, unsigned int segs,
-		loff_t *offset, int ioflags);
-ssize_t xfs_splice_read(struct xfs_inode *ip, struct file *infilp,
-		loff_t *ppos, struct pipe_inode_info *pipe, size_t count,
-		int flags, int ioflags);
-ssize_t xfs_splice_write(struct xfs_inode *ip,
-		struct pipe_inode_info *pipe, struct file *outfilp,
-		loff_t *ppos, size_t count, int flags, int ioflags);
-ssize_t xfs_write(struct xfs_inode *xip, struct kiocb *iocb,
-		const struct iovec *iovp, unsigned int nsegs,
-		loff_t *offset, int ioflags);
 int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
 		int flags, struct xfs_iomap *iomapp, int *niomaps);
 void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first,
@@ -72,4 +59,6 @@ int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
 		xfs_off_t last, uint64_t flags, int fiopt);
 int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
 
+int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
+
 #endif /* _XFS_VNODEOPS_H */